{-# LANGUAGE Trustworthy #-} {-# LANGUAGE CPP, NoImplicitPrelude, StandaloneDeriving #-} {-# OPTIONS_HADDOCK not-home #-} ----------------------------------------------------------------------------- -- | -- Module : GHC.Unicode -- Copyright : (c) The University of Glasgow, 2003 -- License : see libraries/base/LICENSE -- -- Maintainer : cvs-ghc@haskell.org -- Stability : internal -- Portability : non-portable (GHC extensions) -- -- Implementations for the character predicates (isLower, isUpper, etc.) -- and the conversions (toUpper, toLower). The implementation uses -- libunicode on Unix systems if that is available. -- ----------------------------------------------------------------------------- module GHC.Unicode ( GeneralCategory (..), generalCategory, isAscii, isLatin1, isControl, isAsciiUpper, isAsciiLower, isPrint, isSpace, isUpper, isLower, isAlpha, isDigit, isOctDigit, isHexDigit, isAlphaNum, isPunctuation, isSymbol, toUpper, toLower, toTitle, wgencat ) where import GHC.Base import GHC.Char (chr) import GHC.Real import GHC.Enum ( Enum (..), Bounded (..) ) import GHC.Arr ( Ix (..) ) import GHC.Num -- Data.Char.chr already imports this and we need to define a Show instance -- for GeneralCategory import GHC.Show ( Show ) #include "HsBaseConfig.h" -- | Unicode General Categories (column 2 of the UnicodeData table) in -- the order they are listed in the Unicode standard (the Unicode -- Character Database, in particular). -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> :t OtherLetter -- OtherLetter :: GeneralCategory -- -- 'Eq' instance: -- -- >>> UppercaseLetter == UppercaseLetter -- True -- >>> UppercaseLetter == LowercaseLetter -- False -- -- 'Ord' instance: -- -- >>> NonSpacingMark <= MathSymbol -- True -- -- 'Enum' instance: -- -- >>> enumFromTo ModifierLetter SpacingCombiningMark -- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark] -- -- 'Text.Read.Read' instance: -- -- >>> read "DashPunctuation" :: GeneralCategory -- DashPunctuation -- >>> read "17" :: GeneralCategory -- *** Exception: Prelude.read: no parse -- -- 'Show' instance: -- -- >>> show EnclosingMark -- "EnclosingMark" -- -- 'Bounded' instance: -- -- >>> minBound :: GeneralCategory -- UppercaseLetter -- >>> maxBound :: GeneralCategory -- NotAssigned -- -- 'Ix' instance: -- -- >>> import Data.Ix ( index ) -- >>> index (OtherLetter,Control) FinalQuote -- 12 -- >>> index (OtherLetter,Control) Format -- *** Exception: Error in array index -- data GeneralCategory = UppercaseLetter -- ^ Lu: Letter, Uppercase | LowercaseLetter -- ^ Ll: Letter, Lowercase | TitlecaseLetter -- ^ Lt: Letter, Titlecase | ModifierLetter -- ^ Lm: Letter, Modifier | OtherLetter -- ^ Lo: Letter, Other | NonSpacingMark -- ^ Mn: Mark, Non-Spacing | SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining | EnclosingMark -- ^ Me: Mark, Enclosing | DecimalNumber -- ^ Nd: Number, Decimal | LetterNumber -- ^ Nl: Number, Letter | OtherNumber -- ^ No: Number, Other | ConnectorPunctuation -- ^ Pc: Punctuation, Connector | DashPunctuation -- ^ Pd: Punctuation, Dash | OpenPunctuation -- ^ Ps: Punctuation, Open | ClosePunctuation -- ^ Pe: Punctuation, Close | InitialQuote -- ^ Pi: Punctuation, Initial quote | FinalQuote -- ^ Pf: Punctuation, Final quote | OtherPunctuation -- ^ Po: Punctuation, Other | MathSymbol -- ^ Sm: Symbol, Math | CurrencySymbol -- ^ Sc: Symbol, Currency | ModifierSymbol -- ^ Sk: Symbol, Modifier | OtherSymbol -- ^ So: Symbol, Other | Space -- ^ Zs: Separator, Space | LineSeparator -- ^ Zl: Separator, Line | ParagraphSeparator -- ^ Zp: Separator, Paragraph | Control -- ^ Cc: Other, Control | Format -- ^ Cf: Other, Format | Surrogate -- ^ Cs: Other, Surrogate | PrivateUse -- ^ Co: Other, Private Use | NotAssigned -- ^ Cn: Other, Not Assigned deriving ( Show -- ^ @since 2.01 , Eq -- ^ @since 2.01 , Ord -- ^ @since 2.01 , Enum -- ^ @since 2.01 , Bounded -- ^ @since 2.01 , Ix -- ^ @since 2.01 ) -- | The Unicode general category of the character. This relies on the -- 'Enum' instance of 'GeneralCategory', which must remain in the -- same order as the categories are presented in the Unicode -- standard. -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> generalCategory 'a' -- LowercaseLetter -- >>> generalCategory 'A' -- UppercaseLetter -- >>> generalCategory '0' -- DecimalNumber -- >>> generalCategory '%' -- OtherPunctuation -- >>> generalCategory '♥' -- OtherSymbol -- >>> generalCategory '\31' -- Control -- >>> generalCategory ' ' -- Space -- generalCategory :: Char -> GeneralCategory generalCategory :: Char -> GeneralCategory generalCategory c :: Char c = Int -> GeneralCategory forall a. Enum a => Int -> a toEnum (Int -> GeneralCategory) -> Int -> GeneralCategory forall a b. (a -> b) -> a -> b $ Int -> Int forall a b. (Integral a, Num b) => a -> b fromIntegral (Int -> Int) -> Int -> Int forall a b. (a -> b) -> a -> b $ Int -> Int wgencat (Int -> Int) -> Int -> Int forall a b. (a -> b) -> a -> b $ Int -> Int forall a b. (Integral a, Num b) => a -> b fromIntegral (Int -> Int) -> Int -> Int forall a b. (a -> b) -> a -> b $ Char -> Int ord Char c -- | Selects the first 128 characters of the Unicode character set, -- corresponding to the ASCII character set. isAscii :: Char -> Bool isAscii :: Char -> Bool isAscii c :: Char c = Char c Char -> Char -> Bool forall a. Ord a => a -> a -> Bool < '\x80' -- | Selects the first 256 characters of the Unicode character set, -- corresponding to the ISO 8859-1 (Latin-1) character set. isLatin1 :: Char -> Bool isLatin1 :: Char -> Bool isLatin1 c :: Char c = Char c Char -> Char -> Bool forall a. Ord a => a -> a -> Bool <= '\xff' -- | Selects ASCII lower-case letters, -- i.e. characters satisfying both 'isAscii' and 'isLower'. isAsciiLower :: Char -> Bool isAsciiLower :: Char -> Bool isAsciiLower c :: Char c = Char c Char -> Char -> Bool forall a. Ord a => a -> a -> Bool >= 'a' Bool -> Bool -> Bool && Char c Char -> Char -> Bool forall a. Ord a => a -> a -> Bool <= 'z' -- | Selects ASCII upper-case letters, -- i.e. characters satisfying both 'isAscii' and 'isUpper'. isAsciiUpper :: Char -> Bool isAsciiUpper :: Char -> Bool isAsciiUpper c :: Char c = Char c Char -> Char -> Bool forall a. Ord a => a -> a -> Bool >= 'A' Bool -> Bool -> Bool && Char c Char -> Char -> Bool forall a. Ord a => a -> a -> Bool <= 'Z' -- | Selects control characters, which are the non-printing characters of -- the Latin-1 subset of Unicode. isControl :: Char -> Bool -- | Selects printable Unicode characters -- (letters, numbers, marks, punctuation, symbols and spaces). isPrint :: Char -> Bool -- | Returns 'True' for any Unicode space character, and the control -- characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v@. isSpace :: Char -> Bool -- isSpace includes non-breaking space -- The magic 0x377 isn't really that magical. As of 2014, all the codepoints -- at or below 0x377 have been assigned, so we shouldn't have to worry about -- any new spaces appearing below there. It would probably be best to -- use branchless ||, but currently the eqLit transformation will undo that, -- so we'll do it like this until there's a way around that. isSpace :: Char -> Bool isSpace c :: Char c | Word uc Word -> Word -> Bool forall a. Ord a => a -> a -> Bool <= 0x377 = Word uc Word -> Word -> Bool forall a. Eq a => a -> a -> Bool == 32 Bool -> Bool -> Bool || Word uc Word -> Word -> Word forall a. Num a => a -> a -> a - 0x9 Word -> Word -> Bool forall a. Ord a => a -> a -> Bool <= 4 Bool -> Bool -> Bool || Word uc Word -> Word -> Bool forall a. Eq a => a -> a -> Bool == 0xa0 | Bool otherwise = Int -> Int iswspace (Char -> Int ord Char c) Int -> Int -> Bool forall a. Eq a => a -> a -> Bool /= 0 where uc :: Word uc = Int -> Word forall a b. (Integral a, Num b) => a -> b fromIntegral (Char -> Int ord Char c) :: Word -- | Selects upper-case or title-case alphabetic Unicode characters (letters). -- Title case is used by a small number of letter ligatures like the -- single-character form of /Lj/. isUpper :: Char -> Bool -- | Selects lower-case alphabetic Unicode characters (letters). isLower :: Char -> Bool -- | Selects alphabetic Unicode characters (lower-case, upper-case and -- title-case letters, plus letters of caseless scripts and modifiers letters). -- This function is equivalent to 'Data.Char.isLetter'. isAlpha :: Char -> Bool -- | Selects alphabetic or numeric Unicode characters. -- -- Note that numeric digits outside the ASCII range, as well as numeric -- characters which aren't digits, are selected by this function but not by -- 'isDigit'. Such characters may be part of identifiers but are not used by -- the printer and reader to represent numbers. isAlphaNum :: Char -> Bool -- | Selects ASCII digits, i.e. @\'0\'@..@\'9\'@. isDigit :: Char -> Bool isDigit :: Char -> Bool isDigit c :: Char c = (Int -> Word forall a b. (Integral a, Num b) => a -> b fromIntegral (Char -> Int ord Char c Int -> Int -> Int forall a. Num a => a -> a -> a - Char -> Int ord '0') :: Word) Word -> Word -> Bool forall a. Ord a => a -> a -> Bool <= 9 -- We use an addition and an unsigned comparison instead of two signed -- comparisons because it's usually faster and puts less strain on branch -- prediction. It likely also enables some CSE when combined with functions -- that follow up with an actual conversion. -- | Selects ASCII octal digits, i.e. @\'0\'@..@\'7\'@. isOctDigit :: Char -> Bool isOctDigit :: Char -> Bool isOctDigit c :: Char c = (Int -> Word forall a b. (Integral a, Num b) => a -> b fromIntegral (Char -> Int ord Char c Int -> Int -> Int forall a. Num a => a -> a -> a - Char -> Int ord '0') :: Word) Word -> Word -> Bool forall a. Ord a => a -> a -> Bool <= 7 -- | Selects ASCII hexadecimal digits, -- i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@. isHexDigit :: Char -> Bool isHexDigit :: Char -> Bool isHexDigit c :: Char c = Char -> Bool isDigit Char c Bool -> Bool -> Bool || (Int -> Word forall a b. (Integral a, Num b) => a -> b fromIntegral (Char -> Int ord Char c Int -> Int -> Int forall a. Num a => a -> a -> a - Char -> Int ord 'A')::Word) Word -> Word -> Bool forall a. Ord a => a -> a -> Bool <= 5 Bool -> Bool -> Bool || (Int -> Word forall a b. (Integral a, Num b) => a -> b fromIntegral (Char -> Int ord Char c Int -> Int -> Int forall a. Num a => a -> a -> a - Char -> Int ord 'a')::Word) Word -> Word -> Bool forall a. Ord a => a -> a -> Bool <= 5 -- | Selects Unicode punctuation characters, including various kinds -- of connectors, brackets and quotes. -- -- This function returns 'True' if its argument has one of the -- following 'GeneralCategory's, or 'False' otherwise: -- -- * 'ConnectorPunctuation' -- * 'DashPunctuation' -- * 'OpenPunctuation' -- * 'ClosePunctuation' -- * 'InitialQuote' -- * 'FinalQuote' -- * 'OtherPunctuation' -- -- These classes are defined in the -- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, -- part of the Unicode standard. The same document defines what is -- and is not a \"Punctuation\". -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> isPunctuation 'a' -- False -- >>> isPunctuation '7' -- False -- >>> isPunctuation '♥' -- False -- >>> isPunctuation '"' -- True -- >>> isPunctuation '?' -- True -- >>> isPunctuation '—' -- True -- isPunctuation :: Char -> Bool isPunctuation :: Char -> Bool isPunctuation c :: Char c = case Char -> GeneralCategory generalCategory Char c of ConnectorPunctuation -> Bool True DashPunctuation -> Bool True OpenPunctuation -> Bool True ClosePunctuation -> Bool True InitialQuote -> Bool True FinalQuote -> Bool True OtherPunctuation -> Bool True _ -> Bool False -- | Selects Unicode symbol characters, including mathematical and -- currency symbols. -- -- This function returns 'True' if its argument has one of the -- following 'GeneralCategory's, or 'False' otherwise: -- -- * 'MathSymbol' -- * 'CurrencySymbol' -- * 'ModifierSymbol' -- * 'OtherSymbol' -- -- These classes are defined in the -- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, -- part of the Unicode standard. The same document defines what is -- and is not a \"Symbol\". -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> isSymbol 'a' -- False -- >>> isSymbol '6' -- False -- >>> isSymbol '=' -- True -- -- The definition of \"math symbol\" may be a little -- counter-intuitive depending on one's background: -- -- >>> isSymbol '+' -- True -- >>> isSymbol '-' -- False -- isSymbol :: Char -> Bool isSymbol :: Char -> Bool isSymbol c :: Char c = case Char -> GeneralCategory generalCategory Char c of MathSymbol -> Bool True CurrencySymbol -> Bool True ModifierSymbol -> Bool True OtherSymbol -> Bool True _ -> Bool False -- | Convert a letter to the corresponding upper-case letter, if any. -- Any other character is returned unchanged. toUpper :: Char -> Char -- | Convert a letter to the corresponding lower-case letter, if any. -- Any other character is returned unchanged. toLower :: Char -> Char -- | Convert a letter to the corresponding title-case or upper-case -- letter, if any. (Title case differs from upper case only for a small -- number of ligature letters.) -- Any other character is returned unchanged. toTitle :: Char -> Char -- ----------------------------------------------------------------------------- -- Implementation with the supplied auto-generated Unicode character properties -- table -- Regardless of the O/S and Library, use the functions contained in WCsubst.c isAlpha :: Char -> Bool isAlpha c :: Char c = Int -> Int iswalpha (Char -> Int ord Char c) Int -> Int -> Bool forall a. Eq a => a -> a -> Bool /= 0 isAlphaNum :: Char -> Bool isAlphaNum c :: Char c = Int -> Int iswalnum (Char -> Int ord Char c) Int -> Int -> Bool forall a. Eq a => a -> a -> Bool /= 0 isControl :: Char -> Bool isControl c :: Char c = Int -> Int iswcntrl (Char -> Int ord Char c) Int -> Int -> Bool forall a. Eq a => a -> a -> Bool /= 0 isPrint :: Char -> Bool isPrint c :: Char c = Int -> Int iswprint (Char -> Int ord Char c) Int -> Int -> Bool forall a. Eq a => a -> a -> Bool /= 0 isUpper :: Char -> Bool isUpper c :: Char c = Int -> Int iswupper (Char -> Int ord Char c) Int -> Int -> Bool forall a. Eq a => a -> a -> Bool /= 0 isLower :: Char -> Bool isLower c :: Char c = Int -> Int iswlower (Char -> Int ord Char c) Int -> Int -> Bool forall a. Eq a => a -> a -> Bool /= 0 toLower :: Char -> Char toLower c :: Char c = Int -> Char chr (Int -> Int towlower (Char -> Int ord Char c)) toUpper :: Char -> Char toUpper c :: Char c = Int -> Char chr (Int -> Int towupper (Char -> Int ord Char c)) toTitle :: Char -> Char toTitle c :: Char c = Int -> Char chr (Int -> Int towtitle (Char -> Int ord Char c)) foreign import ccall unsafe "u_iswalpha" iswalpha :: Int -> Int foreign import ccall unsafe "u_iswalnum" iswalnum :: Int -> Int foreign import ccall unsafe "u_iswcntrl" iswcntrl :: Int -> Int foreign import ccall unsafe "u_iswspace" iswspace :: Int -> Int foreign import ccall unsafe "u_iswprint" iswprint :: Int -> Int foreign import ccall unsafe "u_iswlower" iswlower :: Int -> Int foreign import ccall unsafe "u_iswupper" iswupper :: Int -> Int foreign import ccall unsafe "u_towlower" towlower :: Int -> Int foreign import ccall unsafe "u_towupper" towupper :: Int -> Int foreign import ccall unsafe "u_towtitle" towtitle :: Int -> Int foreign import ccall unsafe "u_gencat" wgencat :: Int -> Int