{-# LANGUAGE DeriveDataTypeable #-}
module Data.CharSet.Unicode.Category
(
Category(..)
, categories
, lookupCategory
, lookupCategoryCharSet
, modifierLetter, otherLetter, letter
, lowercaseLetter, uppercaseLetter, titlecaseLetter, letterAnd
, nonSpacingMark, spacingCombiningMark, enclosingMark, mark
, space, lineSeparator, paragraphSeparator, separator
, mathSymbol, currencySymbol, modifierSymbol, otherSymbol, symbol
, decimalNumber, letterNumber, otherNumber, number
, dashPunctuation, openPunctuation, closePunctuation, initialQuote
, finalQuote, connectorPunctuation, otherPunctuation, punctuation
, control, format, privateUse, surrogate, notAssigned, other
) where
import Data.Char
import Data.CharSet
import Data.Data
import Data.HashMap.Lazy (HashMap)
import qualified Data.HashMap.Lazy as HashMap
data Category = Category
{ categoryName :: String
, categoryAbbreviation :: String
, categoryCharSet :: CharSet
, categoryDescription :: String
} deriving (Show, Data, Typeable)
categories :: [Category]
categories =
[ Category "Letter" "L" letter "any kind of letter from any language."
, Category "Lowercase_Letter" "Ll" lowercaseLetter "a lowercase letter that has an uppercase variant"
, Category "Uppercase_Letter" "Lu" uppercaseLetter "an uppercase letter that has a lowercase variant"
, Category "Titlecase_Letter" "Lt" titlecaseLetter "a letter that appears at the start of a word when only the first letter of the word is capitalized"
, Category "Letter&" "L&" letterAnd "a letter that exists in lowercase and uppercase variants (combination of Ll, Lu and Lt)"
, Category "Modifier_Letter" "Lm" modifierLetter "a special character that is used like a letter"
, Category "Other_Letter" "Lo" otherLetter "a letter or ideograph that does not have lowercase and uppercase variants"
, Category "Mark" "M" mark "a character intended to be combined with another character (e.g. accents, umlauts, enclosing boxes, etc.)"
, Category "Non_Spacing_Mark" "Mn" nonSpacingMark "a character intended to be combined with another character without taking up extra space (e.g. accents, umlauts, etc.)"
, Category "Spacing_Combining_Mark" "Mc" spacingCombiningMark "a character intended to be combined with another character that takes up extra space (vowel signs in many Eastern languages)"
, Category "Enclosing_Mark" "Me" enclosingMark "a character that encloses the character is is combined with (circle, square, keycap, etc.)"
, Category "Separator" "Z" separator "any kind of whitespace or invisible separator"
, Category "Space_Separator" "Zs" space "a whitespace character that is invisible, but does take up space"
, Category "Line_Separator" "Zl" lineSeparator "line separator character U+2028"
, Category "Paragraph_Separator" "Zp" paragraphSeparator "paragraph separator character U+2029"
, Category "Symbol" "S" symbol "math symbols, currency signs, dingbats, box-drawing characters, etc."
, Category "Math_Symbol" "Sm" mathSymbol "any mathematical symbol"
, Category "Currency_Symbol" "Sc" currencySymbol "any currency sign"
, Category "Modifier_Symbol" "Sk" modifierSymbol "a combining character (mark) as a full character on its own"
, Category "Other_Symbol" "So" otherSymbol "various symbols that are not math symbols, currency signs, or combining characters"
, Category "Number" "N" number "any kind of numeric character in any script"
, Category "Decimal_Digit_Number" "Nd" decimalNumber "a digit zero through nine in any script except ideographic scripts"
, Category "Letter_Number" "Nl" letterNumber "a number that looks like a letter, such as a Roman numeral"
, Category "Other_Number" "No" otherNumber "a superscript or subscript digit, or a number that is not a digit 0..9 (excluding numbers from ideographic scripts)"
, Category "Punctuation" "P" punctuation "any kind of punctuation character"
, Category "Dash_Punctuation" "Pd" dashPunctuation "any kind of hyphen or dash"
, Category "Open_Punctuation" "Ps" openPunctuation "any kind of opening bracket"
, Category "Close_Punctuation" "Pe" closePunctuation "any kind of closing bracket"
, Category "Initial_Punctuation" "Pi" initialQuote "any kind of opening quote"
, Category "Final_Punctuation" "Pf" finalQuote "any kind of closing quote"
, Category "Connector_Punctuation" "Pc" connectorPunctuation "a punctuation character such as an underscore that connects words"
, Category "Other_Punctuation" "Po" otherPunctuation "any kind of punctuation character that is not a dash, bracket, quote or connector"
, Category "Other" "C" other "invisible control characters and unused code points"
, Category "Control" "Cc" control "an ASCII 0x00..0x1F or Latin-1 0x80..0x9F control character"
, Category "Format" "Cf" format "invisible formatting indicator"
, Category "Private_Use" "Co" privateUse "any code point reserved for private use"
, Category "Surrogate" "Cs" surrogate "one half of a surrogate pair in UTF-16 encoding"
, Category "Unassigned" "Cn" notAssigned "any code point to which no character has been assigned.properties" ]
lookupTable :: HashMap String Category
lookupTable = HashMap.fromList
[ (canonicalize x, category)
| category@(Category l s _ _) <- categories
, x <- [l,s]
]
lookupCategory :: String -> Maybe Category
lookupCategory s = HashMap.lookup (canonicalize s) lookupTable
lookupCategoryCharSet :: String -> Maybe CharSet
lookupCategoryCharSet = fmap categoryCharSet . lookupCategory
canonicalize :: String -> String
canonicalize s = case Prelude.map toLower s of
'i' : 's' : xs -> go xs
xs -> go xs
where
go ('-':xs) = go xs
go ('_':xs) = go xs
go (' ':xs) = go xs
go (x:xs) = x : go xs
go [] = []
cat :: GeneralCategory -> CharSet
cat category = build ((category ==) . generalCategory)
lowercaseLetter, uppercaseLetter, titlecaseLetter, letterAnd, modifierLetter, otherLetter, letter :: CharSet
lowercaseLetter = cat LowercaseLetter
uppercaseLetter = cat UppercaseLetter
titlecaseLetter = cat TitlecaseLetter
letterAnd = lowercaseLetter
`union` uppercaseLetter
`union` titlecaseLetter
modifierLetter = cat ModifierLetter
otherLetter = cat OtherLetter
letter
= letterAnd
`union` modifierLetter
`union` otherLetter
nonSpacingMark, spacingCombiningMark, enclosingMark, mark :: CharSet
nonSpacingMark = cat NonSpacingMark
spacingCombiningMark = cat SpacingCombiningMark
enclosingMark = cat EnclosingMark
mark
= nonSpacingMark
`union` spacingCombiningMark
`union` enclosingMark
space, lineSeparator, paragraphSeparator, separator :: CharSet
space = cat Space
lineSeparator = cat LineSeparator
paragraphSeparator = cat ParagraphSeparator
separator
= space
`union` lineSeparator
`union` paragraphSeparator
mathSymbol, currencySymbol, modifierSymbol, otherSymbol, symbol :: CharSet
mathSymbol = cat MathSymbol
currencySymbol = cat CurrencySymbol
modifierSymbol = cat ModifierSymbol
otherSymbol = cat OtherSymbol
symbol
= mathSymbol
`union` currencySymbol
`union` modifierSymbol
`union` otherSymbol
decimalNumber, letterNumber, otherNumber, number :: CharSet
decimalNumber = cat DecimalNumber
letterNumber = cat LetterNumber
otherNumber = cat OtherNumber
number
= decimalNumber
`union` letterNumber
`union` otherNumber
dashPunctuation, openPunctuation, closePunctuation, initialQuote,
finalQuote, connectorPunctuation, otherPunctuation, punctuation :: CharSet
dashPunctuation = cat DashPunctuation
openPunctuation = cat OpenPunctuation
closePunctuation = cat ClosePunctuation
initialQuote = cat InitialQuote
finalQuote = cat FinalQuote
connectorPunctuation = cat ConnectorPunctuation
otherPunctuation = cat OtherPunctuation
punctuation
= dashPunctuation
`union` openPunctuation
`union` closePunctuation
`union` initialQuote
`union` finalQuote
`union` connectorPunctuation
`union` otherPunctuation
control, format, privateUse, surrogate, notAssigned, other :: CharSet
control = cat Control
format = cat Format
privateUse = cat PrivateUse
surrogate = cat Surrogate
notAssigned = cat NotAssigned
other = control
`union` format
`union` privateUse
`union` surrogate
`union` notAssigned