Copyright | (c) Dong Han 2017-2020 |
---|---|
License | BSD |
Maintainer | winterland1989@gmail.com |
Stability | experimental |
Portability | non-portable |
Safe Haskell | None |
Language | Haskell2010 |
A Text
wrap a Bytes
which will be interpreted using UTF-8 encoding. User should always use validate
/ validateMaybe
to construct a Text
(instead of using the constructor directly or coercing), otherwise illegal UTF-8 encoded codepoints will cause undefined behaviours.
This library also provides simple unicode processing based on utf8rewind,
see normalize
, caseFold
(currently using unicode 13 databases).
Synopsis
- data Text
- getUTF8Bytes :: Text -> Bytes
- validate :: HasCallStack => Bytes -> Text
- validateASCII :: HasCallStack => Bytes -> Text
- validateMaybe :: Bytes -> Maybe Text
- validateASCIIMaybe :: Bytes -> Maybe Text
- data TextException
- index :: HasCallStack => Text -> Int -> Char
- indexMaybe :: Text -> Int -> Maybe Char
- indexR :: HasCallStack => Text -> Int -> Char
- indexMaybeR :: Text -> Int -> Maybe Char
- empty :: Text
- singleton :: Char -> Text
- copy :: Text -> Text
- replicate :: Int -> Char -> Text
- cycleN :: Int -> Text -> Text
- pack :: String -> Text
- packN :: Int -> String -> Text
- packR :: String -> Text
- packRN :: Int -> String -> Text
- unpack :: Text -> String
- unpackR :: Text -> String
- fromVector :: PrimVector Char -> Text
- toVector :: Text -> PrimVector Char
- class Print a where
- toUTF8BuilderP :: Int -> a -> Builder ()
- toText :: Print a => a -> Text
- toString :: Print a => a -> String
- toUTF8Builder :: Print a => a -> Builder ()
- toUTF8Bytes :: Print a => a -> Bytes
- null :: Text -> Bool
- length :: Text -> Int
- append :: Text -> Text -> Text
- map' :: (Char -> Char) -> Text -> Text
- imap' :: (Int -> Char -> Char) -> Text -> Text
- foldl' :: (b -> Char -> b) -> b -> Text -> b
- ifoldl' :: (b -> Int -> Char -> b) -> b -> Text -> b
- foldr' :: (Char -> b -> b) -> b -> Text -> b
- ifoldr' :: (Int -> Char -> b -> b) -> b -> Text -> b
- concat :: [Text] -> Text
- concatMap :: (Char -> Text) -> Text -> Text
- count :: Char -> Text -> Int
- all :: (Char -> Bool) -> Text -> Bool
- any :: (Char -> Bool) -> Text -> Bool
- displayWidth :: Text -> Int
- displayWidthChar :: Char -> Int
- cons :: Char -> Text -> Text
- snoc :: Text -> Char -> Text
- uncons :: Text -> Maybe (Char, Text)
- unsnoc :: Text -> Maybe (Text, Char)
- headMaybe :: Text -> Maybe Char
- tailMayEmpty :: Text -> Text
- lastMaybe :: Text -> Maybe Char
- initMayEmpty :: Text -> Text
- head :: Text -> Char
- tail :: Text -> Text
- last :: Text -> Char
- init :: Text -> Text
- inits :: Text -> [Text]
- tails :: Text -> [Text]
- take :: Int -> Text -> Text
- drop :: Int -> Text -> Text
- takeR :: Int -> Text -> Text
- dropR :: Int -> Text -> Text
- slice :: Int -> Int -> Text -> Text
- splitAt :: Int -> Text -> (Text, Text)
- takeWhile :: (Char -> Bool) -> Text -> Text
- takeWhileR :: (Char -> Bool) -> Text -> Text
- dropWhile :: (Char -> Bool) -> Text -> Text
- dropWhileR :: (Char -> Bool) -> Text -> Text
- dropAround :: (Char -> Bool) -> Text -> Text
- break :: (Char -> Bool) -> Text -> (Text, Text)
- span :: (Char -> Bool) -> Text -> (Text, Text)
- breakR :: (Char -> Bool) -> Text -> (Text, Text)
- spanR :: (Char -> Bool) -> Text -> (Text, Text)
- breakOn :: Text -> Text -> (Text, Text)
- breakOnAll :: Text -> Text -> [(Text, Text)]
- group :: Text -> [Text]
- groupBy :: (Char -> Char -> Bool) -> Text -> [Text]
- stripPrefix :: Text -> Text -> Maybe Text
- stripSuffix :: Text -> Text -> Maybe Text
- split :: Char -> Text -> [Text]
- splitWith :: (Char -> Bool) -> Text -> [Text]
- splitOn :: Text -> Text -> [Text]
- isPrefixOf :: Text -> Text -> Bool
- isSuffixOf :: Text -> Text -> Bool
- isInfixOf :: Text -> Text -> Bool
- commonPrefix :: Text -> Text -> (Text, Text, Text)
- words :: Text -> [Text]
- lines :: Text -> [Text]
- unwords :: [Text] -> Text
- unlines :: [Text] -> Text
- padLeft :: Int -> Char -> Text -> Text
- padRight :: Int -> Char -> Text -> Text
- reverse :: Text -> Text
- intersperse :: Char -> Text -> Text
- intercalate :: Text -> [Text] -> Text
- intercalateElem :: Char -> [Text] -> Text
- transpose :: [Text] -> [Text]
- elem :: Char -> Text -> Bool
- notElem :: Char -> Text -> Bool
- find :: (Char -> Bool) -> Text -> (Int, Maybe Char)
- findR :: (Char -> Bool) -> Text -> (Int, Maybe Char)
- filter :: (Char -> Bool) -> Text -> Text
- partition :: (Char -> Bool) -> Text -> (Text, Text)
- data NormalizationResult
- data NormalizeMode
- isNormalized :: Text -> NormalizationResult
- isNormalizedTo :: NormalizeMode -> Text -> NormalizationResult
- normalize :: Text -> Text
- normalizeTo :: NormalizeMode -> Text -> Text
- envLocale :: IO Locale
- caseFold :: Text -> Text
- caseFoldWith :: Locale -> Text -> Text
- toLower :: Text -> Text
- toLowerWith :: Locale -> Text -> Text
- toUpper :: Text -> Text
- toUpperWith :: Locale -> Text -> Text
- toTitle :: Text -> Text
- toTitleWith :: Locale -> Text -> Text
- isCategory :: Category -> Text -> Bool
- spanCategory :: Category -> Text -> (Text, Text)
- collate :: Collator -> Text -> Text -> Ordering
- data Collator
- collatorFor :: Lang -> Collator
- collator :: QuasiQuoter
- setUpperBeforeLower :: Bool -> Collator -> Collator
- setFrenchAccents :: Bool -> Collator -> Collator
- setNormalization :: Bool -> Collator -> Collator
- setVariableWeighting :: VariableWeighting -> Collator -> Collator
- collatorLang :: Collator -> Maybe Lang
- rootCollator :: Collator
- renderSortKey :: SortKey -> String
- data VariableWeighting
- data CollatorOptions = CollatorOptions {}
- newtype SortKey = SortKey [Word16]
- data Collator
- tailorings :: [(Lang, Collation)]
- parseLang :: Text -> Either String Lang
- renderLang :: Lang -> Text
- lookupLang :: Lang -> [(Lang, a)] -> Maybe (Lang, a)
- data Lang = Lang {
- langLanguage :: Text
- langScript :: Maybe Text
- langRegion :: Maybe Text
- langVariants :: [Text]
- langExtensions :: [(Text, [(Text, Text)])]
- langPrivateUse :: [Text]
- type Locale = CSize
- pattern LocaleDefault :: Locale
- pattern LocaleLithuanian :: Locale
- pattern LocaleTurkishAndAzeriLatin :: Locale
- type Category = CSize
- pattern CategoryLetterUppercase :: Category
- pattern CategoryLetterLowercase :: Category
- pattern CategoryLetterTitlecase :: Category
- pattern CategoryLetterOther :: Category
- pattern CategoryLetter :: Category
- pattern CategoryCaseMapped :: Category
- pattern CategoryMarkNonSpacing :: Category
- pattern CategoryMarkSpacing :: Category
- pattern CategoryMarkEnclosing :: Category
- pattern CategoryMark :: Category
- pattern CategoryNumberDecimal :: Category
- pattern CategoryNumberLetter :: Category
- pattern CategoryNumberOther :: Category
- pattern CategoryNumber :: Category
- pattern CategoryPunctuationConnector :: Category
- pattern CategoryPunctuationDash :: Category
- pattern CategoryPunctuationOpen :: Category
- pattern CategoryPunctuationClose :: Category
- pattern CategoryPunctuationInitial :: Category
- pattern CategoryPunctuationFinal :: Category
- pattern CategoryPunctuationOther :: Category
- pattern CategoryPunctuation :: Category
- pattern CategorySymbolMath :: Category
- pattern CategorySymbolCurrency :: Category
- pattern CategorySymbolModifier :: Category
- pattern CategorySymbolOther :: Category
- pattern CategorySymbol :: Category
- pattern CategorySeparatorSpace :: Category
- pattern CategorySeparatorLine :: Category
- pattern CategorySeparatorParagraph :: Category
- pattern CategorySeparator :: Category
- pattern CategoryControl :: Category
- pattern CategoryFormat :: Category
- pattern CategorySurrogate :: Category
- pattern CategoryPrivateUse :: Category
- pattern CategoryUnassigned :: Category
- pattern CategoryCompatibility :: Category
- pattern CategoryIgnoreGraphemeCluster :: Category
- pattern CategoryIscntrl :: Category
- pattern CategoryIsprint :: Category
- pattern CategoryIsspace :: Category
- pattern CategoryIsblank :: Category
- pattern CategoryIsgraph :: Category
- pattern CategoryIspunct :: Category
- pattern CategoryIsalnum :: Category
- pattern CategoryIsalpha :: Category
- pattern CategoryIsupper :: Category
- pattern CategoryIslower :: Category
- pattern CategoryIsdigit :: Category
- pattern CategoryIsxdigit :: Category
Text type
Instances
IsList Text Source # | |
Eq Text Source # | |
Ord Text Source # | |
Read Text Source # | Accepted syntax and escaping rules are same with |
Show Text Source # | The escaping rules is different from |
IsString Text Source # | |
Defined in Z.Data.Text.Base fromString :: String -> Text # | |
Semigroup Text Source # | |
Monoid Text Source # | |
Arbitrary Text Source # | |
CoArbitrary Text Source # | |
Defined in Z.Data.Text.Base coarbitrary :: Text -> Gen b -> Gen b # | |
FoldCase Text Source # | case fold with default locale. |
Defined in Z.Data.Text.Base | |
NFData Text Source # | |
Defined in Z.Data.Text.Base | |
Hashable Text Source # | |
Defined in Z.Data.Text.Base | |
Print Text Source # | The escaping rules is same with |
Defined in Z.Data.Text.Print | |
JSON Text Source # | |
JSON a => JSON (Map Text a) Source # | |
JSON a => JSON (HashMap Text a) Source # | default instance prefer later key |
JSON a => JSON (FlatMap Text a) Source # | default instance prefer later key |
type Item Text Source # | |
Defined in Z.Data.Text.Base |
validate :: HasCallStack => Bytes -> Text Source #
O(n) Validate a sequence of bytes is UTF-8 encoded.
Throw InvalidUTF8Exception
in case of invalid codepoint.
validateASCII :: HasCallStack => Bytes -> Text Source #
O(n) Validate a sequence of bytes is all ascii char byte(<128).
Throw InvalidASCIIException
in case of invalid byte, It's not always faster
than validate
, use it only if you want to validate ASCII char sequences.
validateMaybe :: Bytes -> Maybe Text Source #
O(n) Validate a sequence of bytes is UTF-8 encoded.
Return Nothing
in case of invalid codepoint.
validateASCIIMaybe :: Bytes -> Maybe Text Source #
O(n) Validate a sequence of bytes is all ascii char byte(<128).
Return Nothing
in case of invalid byte.
data TextException Source #
InvalidUTF8Exception CallStack | |
InvalidASCIIException CallStack | |
IndexOutOfTextRange Int CallStack | first payload is invalid char index |
EmptyText CallStack |
Instances
Show TextException Source # | |
Defined in Z.Data.Text.Base showsPrec :: Int -> TextException -> ShowS # show :: TextException -> String # showList :: [TextException] -> ShowS # | |
Exception TextException Source # | |
Defined in Z.Data.Text.Base |
index :: HasCallStack => Text -> Int -> Char Source #
O(n) Get the nth codepoint from Text
, throw IndexOutOfTextRange
when out of bound.
indexR :: HasCallStack => Text -> Int -> Char Source #
O(n) Get the nth codepoint from Text
counting from the end,
throw IndexOutOfTextRange
when out of bound.
indexMaybeR :: Text -> Int -> Maybe Char Source #
O(n) Get the nth codepoint from Text
counting from the end.
Basic creating
Conversion between list
pack :: String -> Text Source #
O(n) Convert a string into a text.
Alias for packN
defaultInitSize
, will be rewritten to a memcpy if possible.
packN :: Int -> String -> Text Source #
O(n) Convert a list into a text with an approximate size(in bytes, not codepoints).
If the encoded bytes length is larger than the size given, we simply double the buffer size and continue building.
This function is a good consumer in the sense of build/foldr fusion.
packRN :: Int -> String -> Text Source #
O(n) packN
in reverse order.
This function is a good consumer in the sense of build/foldr fusion.
unpack :: Text -> String Source #
O(n) Convert text to a char list.
Unpacking is done lazily, i.e. we will retain a reference to the array until all elements are consumed.
This function is a good producer in the sense of build/foldr fusion.
unpackR :: Text -> String Source #
O(n) Convert text to a list in reverse order.
This function is a good producer in the sense of build/foldr fusion.
Conversion between codepoint vector
fromVector :: PrimVector Char -> Text Source #
O(n) convert from a char vector.
Print class
A class similar to Show
, serving the purpose that quickly convert a data type to a Text
value.
You can use newtype or generic deriving to implement instance of this class quickly:
{-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DerivingStrategies #-} import GHC.Generics newtype FooInt = FooInt Int deriving (Generic) deriving anyclass Print > toText (FooInt 3) > "FooInt 3" newtype FooInt = FooInt Int deriving (Generic) deriving newtype Print > toText (FooInt 3) > "3"
Nothing
toUTF8BuilderP :: Int -> a -> Builder () Source #
Instances
Basic interface
append :: Text -> Text -> Text Source #
O(m+n)
There's no need to guard against empty text because we guard it for you, so appending an empty text is a no-op.
map' :: (Char -> Char) -> Text -> Text Source #
O(n) map
f
t
is the Text
obtained by applying f
to
each char of t
. Performs replacement on invalid scalar values.
ifoldr' :: (Int -> Char -> b -> b) -> b -> Text -> b Source #
Strict right to left fold with index
NOTE: the index is counting from 0, not backwards
concat :: [Text] -> Text Source #
O(n) Concatenate a list of text.
Note: concat
have to force the entire list to filter out empty text and calculate
the length for allocation.
concatMap :: (Char -> Text) -> Text -> Text Source #
Map a function over a text and concatenate the results
Special folds
all :: (Char -> Bool) -> Text -> Bool Source #
O(n) Applied to a predicate and text, all
determines
if all chars of the text satisfy the predicate.
any :: (Char -> Bool) -> Text -> Bool Source #
O(n) Applied to a predicate and a text, any
determines
if any chars of the text satisfy the predicate.
Text display width
displayWidth :: Text -> Int Source #
Get the display width of a piece of text.
You shouldn't pass texts with control characters(<0x20, \DEL), which are counted with -1 width.
>>> displayWidth "你好世界!"
10
>>> displayWidth "hello world!"
12
displayWidthChar :: Char -> Int Source #
Get the display width of a Char
.
You shouldn't pass texts with control characters(<0x20, \DEL), which are counted with -1 width.
Slice manipulation
cons :: Char -> Text -> Text Source #
O(n) cons
is analogous to (:) for lists, but of different
complexity, as it requires making a copy.
uncons :: Text -> Maybe (Char, Text) Source #
O(1) Extract the head and tail of a text, return Nothing
if it is empty.
unsnoc :: Text -> Maybe (Text, Char) Source #
O(1) Extract the init and last of a text, return Nothing
if text is empty.
tailMayEmpty :: Text -> Text Source #
O(1) Extract the chars after the head of a text.
NOTE: tailMayEmpty
return empty text in the case of an empty text.
initMayEmpty :: Text -> Text Source #
O(1) Extract the chars before the last one.
NOTE: initMayEmpty
return empty text in the case of an empty text.
O(1) Extract the first char of a text.
Throw EmptyText
if text is empty.
O(1) Extract the chars after the head of a text.
Throw EmptyText
if text is empty.
O(1) Extract the chars before the last one.
Throw EmptyText
if text is empty.
slice :: Int -> Int -> Text -> Text Source #
O(1) Extract a sub-range text with the given start index and length.
This function is a total function just like take/drop: an index/length that exceeds the range will be ignored, e.g.
slice 1 3 "hello" == "ell" slice -1 -1 "hello" == "" slice -2 2 "hello" == "" slice 2 10 "hello" == "llo"
This holds for all x y: slice x y vs == drop x . take (x+y) vs
takeWhile :: (Char -> Bool) -> Text -> Text Source #
O(n) Applied to a predicate p
and a text t
,
returns the longest prefix (possibly empty) of t
of elements that
satisfy p
.
takeWhileR :: (Char -> Bool) -> Text -> Text Source #
O(n) Applied to a predicate p
and a text t
,
returns the longest suffix (possibly empty) of t
of elements that
satisfy p
.
dropWhile :: (Char -> Bool) -> Text -> Text Source #
O(n) Applied to a predicate p
and a text vs
,
returns the suffix (possibly empty) remaining after takeWhile
p vs
.
dropWhileR :: (Char -> Bool) -> Text -> Text Source #
O(n) Applied to a predicate p
and a text vs
,
returns the prefix (possibly empty) remaining before takeWhileR
p vs
.
break :: (Char -> Bool) -> Text -> (Text, Text) Source #
O(n) Split the text into the longest prefix of elements that do not satisfy the predicate and the rest without copying.
span :: (Char -> Bool) -> Text -> (Text, Text) Source #
O(n) Split the text into the longest prefix of elements that satisfy the predicate and the rest without copying.
breakOn :: Text -> Text -> (Text, Text) Source #
Break a text on a subtext, returning a pair of the part of the text prior to the match, and the rest of the text, e.g.
breakOn "wor" "hello, world" = ("hello, ", "world")
O(n+m) Find all non-overlapping instances of needle in haystack. Each element of the returned list consists of a pair:
- The entire string prior to the kth match (i.e. the prefix)
- The kth match, followed by the remainder of the string
Examples:
breakOnAll "::" "" ==> []
breakOnAll "/" "a/b/c/" ==> [("a", "/b/c/"), ("a/b", "/c/"), ("a/b/c", "/")]
The result list is lazy, search is performed when you force the list.
group :: Text -> [Text] Source #
The group function takes a text and returns a list of texts such that the concatenation of the result is equal to the argument. Moreover, each sublist in the result contains only equal elements. For example,
group "Mississippi" = ["M","i","ss","i","ss","i","pp","i"]
It is a special case of groupBy
, which allows the programmer to supply their own equality test.
stripPrefix :: Text -> Text -> Maybe Text Source #
O(n) The stripPrefix
function takes two texts and returns Just
the remainder of the second iff the first is its prefix, and otherwise
Nothing
.
stripSuffix :: Text -> Text -> Maybe Text Source #
O(n) The stripSuffix
function takes two texts and returns Just the remainder of the second iff the first is its suffix, and otherwise Nothing.
split :: Char -> Text -> [Text] Source #
O(n) Break a text into pieces separated by the delimiter element consuming the delimiter. I.e.
split '\n' "a\nb\nd\ne" == ["a","b","d","e"] split 'a' "aXaXaXa" == ["","X","X","X",""] split 'x' "x" == ["",""]
and
intercalate [c] . split c == id split == splitWith . (==)
NOTE: this function behaves differently from bytestring's, see #56.
splitWith :: (Char -> Bool) -> Text -> [Text] Source #
O(n) Splits a text into components delimited by separators, where the predicate returns True for a separator char. The resulting components do not contain the separators. Two adjacent separators result in an empty component in the output. eg.
splitWith (=='a') "aabbaca" == ["","","bb","c",""] splitWith (=='a') [] == [""]
splitOn :: Text -> Text -> [Text] Source #
O(m+n) Break haystack into pieces separated by needle.
Note: An empty needle will essentially split haystack element by element.
Examples:
>>>
splitOn "\r\n" "a\r\nb\r\nd\r\ne"
["a","b","d","e"]
>>>
splitOn "aaa" "aaaXaaaXaaaXaaa"
["","X","X","X",""]
>>>
splitOn "x" "x"
["",""]
and
intercalate s . splitOn s == id
splitOn (singleton c) == split c
isPrefixOf :: Text -> Text -> Bool Source #
The isPrefixOf
function returns True
if the first argument is a prefix of the second.
isSuffixOf :: Text -> Text -> Bool Source #
O(n) The isSuffixOf
function takes two text and returns True
if the first is a suffix of the second.
isInfixOf :: Text -> Text -> Bool Source #
Check whether one text is a subtext of another.
needle `isInfixOf` haystack === null haystack || indices needle haystack /= []
commonPrefix :: Text -> Text -> (Text, Text, Text) Source #
O(n) Find the longest non-empty common prefix of two strings and return it, along with the suffixes of each string at which they no longer match. e.g.
>>>
commonPrefix "foobar" "fooquux"
("foo","bar","quux")
>>>
commonPrefix "veeble" "fetzer"
("","veeble","fetzer")
words :: Text -> [Text] Source #
O(n) Breaks a Text
up into a list of words, delimited by unicode space.
unlines :: [Text] -> Text Source #
O(n) Joins lines with ascii \n
.
NOTE: This function is different from Prelude's unlines
, it DOES NOT add a trailing \n
.
padLeft :: Int -> Char -> Text -> Text Source #
Add padding to the left so that the whole text's length is at least n.
padRight :: Int -> Char -> Text -> Text Source #
Add padding to the right so that the whole text's length is at least n.
Transform
intersperse :: Char -> Text -> Text Source #
O(n) The intersperse
function takes a character and places it
between the characters of a Text
. Performs replacement on invalid scalar values.
intercalate :: Text -> [Text] -> Text Source #
O(n) The intercalate
function takes a Text
and a list of
Text
s and concatenates the list after interspersing the first
argument between each element of the list.
transpose :: [Text] -> [Text] Source #
The transpose
function transposes the rows and columns of its
text argument.
Search
searching by equality
element-wise search
O(n) find the first char matching the predicate in a text from left to right, if there isn't one, return the text length.
O(n) find the first char matching the predicate in a text from right to left.
filter :: (Char -> Bool) -> Text -> Text Source #
O(n) filter
, applied to a predicate and a text,
returns a text containing those chars that satisfy the
predicate.
partition :: (Char -> Bool) -> Text -> (Text, Text) Source #
O(n) The partition
function takes a predicate, a text, returns
a pair of text with codepoints which do and do not satisfy the
predicate, respectively; i.e.,
partition p txt == (filter p txt, filter (not . p) txt)
Unicode processing
normalization
data NormalizationResult Source #
Instances
data NormalizeMode Source #
These are the Unicode Normalization Forms:
Form | Description ---------------------------- | --------------------------------------------- Normalization Form D (NFD) | Canonical decomposition Normalization Form C (NFC) | Canonical decomposition, followed by canonical composition Normalization Form KD (NFKD) | Compatibility decomposition Normalization Form KC (NFKC) | Compatibility decomposition, followed by canonical composition
Instances
isNormalized :: Text -> NormalizationResult Source #
Check if a string is stable in the NFC (Normalization Form C).
isNormalizedTo :: NormalizeMode -> Text -> NormalizationResult Source #
Check if a string is stable in the specified Unicode Normalization Form.
This function can be used as a preprocessing step, before attempting to normalize a string. Normalization is a very expensive process, it is often cheaper to first determine if the string is unstable in the requested normalization form.
The result of the check will be YES if the string is stable and MAYBE or NO if it is unstable. If the result is MAYBE, the string does not necessarily have to be normalized.
For more information, please review <http://www.unicode.org/reports/tr15/ Unicode Standard Annex #15 - Unicode Normalization Forms>.
normalizeTo :: NormalizeMode -> Text -> Text Source #
Normalize a string to the specified Unicode Normalization Form.
The Unicode standard defines two standards for equivalence between characters: canonical and compatibility equivalence. Canonically equivalent characters and sequence represent the same abstract character and must be rendered with the same appearance and behavior. Compatibility equivalent characters have a weaker equivalence and may be rendered differently.
Unicode Normalization Forms are formally defined standards that can be used to test whether any two strings of characters are equivalent to each other. This equivalence may be canonical or compatibility.
The algorithm puts all combining marks into a specified order and uses the rules for decomposition and composition to transform the string into one of four Unicode Normalization Forms. A binary comparison can then be used to determine equivalence.
Case conversion
caseFold :: Text -> Text Source #
Remove case distinction from UTF-8 encoded text with default locale.
caseFoldWith :: Locale -> Text -> Text Source #
Remove case distinction from UTF-8 encoded text.
Case folding is the process of eliminating differences between code points concerning case mapping. It is most commonly used for comparing strings in a case-insensitive manner. Conversion is fully compliant with the Unicode 7.0 standard.
Although similar to lowercasing text, there are significant differences. For one, case folding does _not_ take locale into account when converting. In some cases, case folding can be up to 20% faster than lowercasing the same text, but the result cannot be treated as correct lowercased text.
Only two locale-specific exceptions are made when case folding text. In Turkish, U+0049 LATIN CAPITAL LETTER I maps to U+0131 LATIN SMALL LETTER DOTLESS I and U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE maps to U+0069 LATIN SMALL LETTER I.
Although most code points can be case folded without changing length, there are notable exceptions. For example, U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE) maps to "U+0069 U+0307" (LATIN SMALL LETTER I and COMBINING DOT ABOVE) when converted to lowercase.
Only a handful of scripts make a distinction between upper- and lowercase. In addition to modern scripts, such as Latin, Greek, Armenian and Cyrillic, a few historic or archaic scripts have case. The vast majority of scripts do not have case distinctions.
toLowerWith :: Locale -> Text -> Text Source #
Convert UTF-8 encoded text to lowercase.
This function allows conversion of UTF-8 encoded strings to lowercase without first changing the encoding to UTF-32. Conversion is fully compliant with the Unicode 7.0 standard.
Although most code points can be converted to lowercase without changing length, there are notable exceptions. For example, U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE) maps to "U+0069 U+0307" (LATIN SMALL LETTER I and COMBINING DOT ABOVE) when converted to lowercase.
Only a handful of scripts make a distinction between upper- and lowercase. In addition to modern scripts, such as Latin, Greek, Armenian and Cyrillic, a few historic or archaic scripts have case. The vast majority of scripts do not have case distinctions.
Case mapping is not reversible. That is, toUpper(toLower(x)) != toLower(toUpper(x))
.
Certain code points (or combinations of code points) apply rules based on the locale. For more information about these exceptional code points, please refer to the Unicode standard: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
toUpperWith :: Locale -> Text -> Text Source #
Convert UTF-8 encoded text to uppercase.
Conversion is fully compliant with the Unicode 7.0 standard.
Although most code points can be converted without changing length, there are notable exceptions. For example, U+00DF (LATIN SMALL LETTER SHARP S) maps to "U+0053 U+0053" (LATIN CAPITAL LETTER S and LATIN CAPITAL LETTER S) when converted to uppercase.
Only a handful of scripts make a distinction between upper and lowercase. In addition to modern scripts, such as Latin, Greek, Armenian and Cyrillic, a few historic or archaic scripts have case. The vast majority of scripts do not have case distinctions.
Case mapping is not reversible. That is, toUpper(toLower(x)) != toLower(toUpper(x))
.
Certain code points (or combinations of code points) apply rules based on the locale. For more information about these exceptional code points, please refer to the Unicode standard: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
toTitleWith :: Locale -> Text -> Text Source #
Convert UTF-8 encoded text to titlecase.
This function allows conversion of UTF-8 encoded strings to titlecase. Conversion is fully compliant with the Unicode 7.0 standard.
Titlecase requires a bit more explanation than uppercase and lowercase, because it is not a common text transformation. Titlecase uses uppercase for the first letter of each word and lowercase for the rest. Words are defined as "collections of code points with general category Lu, Ll, Lt, Lm or Lo according to the Unicode database".
Effectively, any type of punctuation can break up a word, even if this is not grammatically valid. This happens because the titlecasing algorithm does not and cannot take grammar rules into account.
Text | Titlecase -------------------------------------|------------------------------------- The running man | The Running Man NATO Alliance | Nato Alliance You're amazing at building libraries | You'Re Amazing At Building Libraries
Although most code points can be converted to titlecase without changing length, there are notable exceptions. For example, U+00DF (LATIN SMALL LETTER SHARP S) maps to "U+0053 U+0073" (LATIN CAPITAL LETTER S and LATIN SMALL LETTER S) when converted to titlecase.
Certain code points (or combinations of code points) apply rules based on the locale. For more information about these exceptional code points, please refer to the Unicode standard: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
Unicode category
isCategory :: Category -> Text -> Bool Source #
Check if the input string conforms to the category specified by the flags.
This function can be used to check if the code points in a string are part of a category. Valid flags are members of the "list of categories". The category for a code point is defined as part of the entry in UnicodeData.txt, the data file for the Unicode code point database.
By default, the function will treat grapheme clusters as a single code point. This means that the following string:
Code point | Canonical combining class | General category | Name ---------- | ------------------------- | --------------------- | ---------------------- U+0045 | 0 | Lu (Uppercase letter) | LATIN CAPITAL LETTER E U+0300 | 230 | Mn (Non-spacing mark) | COMBINING GRAVE ACCENT
Will match with CategoryLetterUppercase
in its entirety, because
the COMBINING GRAVE ACCENT is treated as part of the grapheme cluster. This
is useful when e.g. creating a text parser, because you do not have to
normalize the text first.
If this is undesired behavior, specify the CategoryIgnoreGraphemeCluster
flag.
In order to maintain backwards compatibility with POSIX functions
like isdigit
and isspace
, compatibility flags have been provided. Note,
however, that the result is only guaranteed to be correct for code points
in the Basic Latin range, between U+0000 and U+007F. Combining a
compatibility flag with a regular category flag will result in undefined
behavior.
spanCategory :: Category -> Text -> (Text, Text) Source #
Try to match as many code points with the matching category flags as possible and return the prefix and suffix.
Collate
collate :: Collator -> Text -> Text -> Ordering Source #
Compare two Text
s with Unicode Collation Algorithm
Instances
IsString Collator | |
Defined in Text.Collate.Collator fromString :: String -> Collator # |
Re-exports
collatorFor :: Lang -> Collator #
Returns a collator based on a BCP 47 language tag.
If no exact match is found, we try to find the best match
(falling back to the root collation if nothing else succeeds).
If something other than the default collation for a language
is desired, the co
keyword of the unicode extensions can be
used (e.g. es-u-co-trad
for traditional Spanish).
Other unicode extensions affect the collator options:
- The kb keyword has the same effect as setFrenchAccents (e.g. fr-FR-u-kb-true).
- The ka keyword has the same effect as setVariableWeighting (e.g. fr-FR-u-kb-ka-shifted or en-u-ka-noignore).
- The kf keyword has the same effect as setUpperBeforeLower (e.g. fr-u-kf-upper or fr-u-kf-lower).
- The kk keyword has the same effect as setNormalization (e.g. fr-u-kk-false).
collator :: QuasiQuoter #
Create a collator at compile time based on a BCP 47 language
tag: e.g., [collator|es-u-co-trad|]
. Requires the QuasiQuotes
extension.
setUpperBeforeLower :: Bool -> Collator -> Collator #
Most collations default to sorting lowercase letters before
uppercase (exceptions: mt
, da
, cu
). To select the opposite
behavior, use setUpperBeforeLower True
.
setFrenchAccents :: Bool -> Collator -> Collator #
setFrenchAccents True
causes secondary weights to be scanned
in reverse order, so we get the sorting
cote côte coté côté
instead of cote coté côte côté
.
The default is usually False
, except for fr-CA
where it is True
.
setNormalization :: Bool -> Collator -> Collator #
The Unicode Collation Algorithm expects input to be normalized
into its canonical decomposition (NFD). By default, collators perform
this normalization. If your input is already normalized, you can increase
performance by disabling this step: setNormalization False
.
setVariableWeighting :: VariableWeighting -> Collator -> Collator #
Set method for handling variable elements (punctuation and spaces): see http://www.unicode.org/reports/tr10/, Tables 11 and 12.
collatorLang :: Collator -> Maybe Lang #
Lang
used for tailoring. Because of fallback rules, this may be somewhat
different from the Lang
passed to collatorFor
. This Lang
won't contain unicode extensions used to set options, but
it will specify the collation if a non-default collation is being used.
Default collator based on DUCET table (allkeys.txt
).
renderSortKey :: SortKey -> String #
Render sort key in the manner used in the CLDR collation test data: the character '|' is used to separate the levels of the key and corresponds to a 0 in the actual sort key.
data VariableWeighting #
VariableWeighting
affects how punctuation is treated.
See http://www.unicode.org/reports/tr10/#Variable_Weighting.
NonIgnorable | Don't ignore punctuation (Deluge < deluge-) |
Blanked | Completely ignore punctuation (Deluge = deluge-) |
Shifted | Consider punctuation at lower priority (de-luge < delu-ge < deluge < deluge- < Deluge) |
ShiftTrimmed | Variant of Shifted (deluge < de-luge < delu-ge) |
Instances
Eq VariableWeighting | |
Defined in Text.Collate.Collator (==) :: VariableWeighting -> VariableWeighting -> Bool # (/=) :: VariableWeighting -> VariableWeighting -> Bool # | |
Ord VariableWeighting | |
Defined in Text.Collate.Collator compare :: VariableWeighting -> VariableWeighting -> Ordering # (<) :: VariableWeighting -> VariableWeighting -> Bool # (<=) :: VariableWeighting -> VariableWeighting -> Bool # (>) :: VariableWeighting -> VariableWeighting -> Bool # (>=) :: VariableWeighting -> VariableWeighting -> Bool # max :: VariableWeighting -> VariableWeighting -> VariableWeighting # min :: VariableWeighting -> VariableWeighting -> VariableWeighting # | |
Show VariableWeighting | |
Defined in Text.Collate.Collator showsPrec :: Int -> VariableWeighting -> ShowS # show :: VariableWeighting -> String # showList :: [VariableWeighting] -> ShowS # |
data CollatorOptions #
CollatorOptions | |
|
Instances
Eq CollatorOptions | |
Defined in Text.Collate.Collator (==) :: CollatorOptions -> CollatorOptions -> Bool # (/=) :: CollatorOptions -> CollatorOptions -> Bool # | |
Ord CollatorOptions | |
Defined in Text.Collate.Collator compare :: CollatorOptions -> CollatorOptions -> Ordering # (<) :: CollatorOptions -> CollatorOptions -> Bool # (<=) :: CollatorOptions -> CollatorOptions -> Bool # (>) :: CollatorOptions -> CollatorOptions -> Bool # (>=) :: CollatorOptions -> CollatorOptions -> Bool # max :: CollatorOptions -> CollatorOptions -> CollatorOptions # min :: CollatorOptions -> CollatorOptions -> CollatorOptions # | |
Show CollatorOptions | |
Defined in Text.Collate.Collator showsPrec :: Int -> CollatorOptions -> ShowS # show :: CollatorOptions -> String # showList :: [CollatorOptions] -> ShowS # |
Instances
IsString Collator | |
Defined in Text.Collate.Collator fromString :: String -> Collator # |
tailorings :: [(Lang, Collation)] #
An association list matching Lang
s with tailored Collation
s.
renderLang :: Lang -> Text #
Render a Lang
in BCP 47 form.
lookupLang :: Lang -> [(Lang, a)] -> Maybe (Lang, a) #
Find best match for a Lang
in an association list.
Represents a BCP 47 language tag (https://tools.ietf.org/html/bcp47).
Lang | |
|
Constants
Locale
pattern LocaleDefault :: Locale Source #
pattern LocaleLithuanian :: Locale Source #
pattern LocaleTurkishAndAzeriLatin :: Locale Source #
Category
type Category = CSize Source #
Unicode categories.
See isCategory. Categories can be combined with bitwise or.
pattern CategoryLetterUppercase :: Category Source #
pattern CategoryLetterLowercase :: Category Source #
pattern CategoryLetterTitlecase :: Category Source #
pattern CategoryLetterOther :: Category Source #
pattern CategoryLetter :: Category Source #
pattern CategoryCaseMapped :: Category Source #
pattern CategoryMarkNonSpacing :: Category Source #
pattern CategoryMarkSpacing :: Category Source #
pattern CategoryMarkEnclosing :: Category Source #
pattern CategoryMark :: Category Source #
pattern CategoryNumberDecimal :: Category Source #
pattern CategoryNumberLetter :: Category Source #
pattern CategoryNumberOther :: Category Source #
pattern CategoryNumber :: Category Source #
pattern CategoryPunctuationConnector :: Category Source #
pattern CategoryPunctuationDash :: Category Source #
pattern CategoryPunctuationOpen :: Category Source #
pattern CategoryPunctuationClose :: Category Source #
pattern CategoryPunctuationInitial :: Category Source #
pattern CategoryPunctuationFinal :: Category Source #
pattern CategoryPunctuationOther :: Category Source #
pattern CategoryPunctuation :: Category Source #
pattern CategorySymbolMath :: Category Source #
pattern CategorySymbolCurrency :: Category Source #
pattern CategorySymbolModifier :: Category Source #
pattern CategorySymbolOther :: Category Source #
pattern CategorySymbol :: Category Source #
pattern CategorySeparatorSpace :: Category Source #
pattern CategorySeparatorLine :: Category Source #
pattern CategorySeparatorParagraph :: Category Source #
pattern CategorySeparator :: Category Source #
pattern CategoryControl :: Category Source #
pattern CategoryFormat :: Category Source #
pattern CategorySurrogate :: Category Source #
pattern CategoryPrivateUse :: Category Source #
pattern CategoryUnassigned :: Category Source #
pattern CategoryCompatibility :: Category Source #
pattern CategoryIgnoreGraphemeCluster :: Category Source #
pattern CategoryIscntrl :: Category Source #
pattern CategoryIsprint :: Category Source #
pattern CategoryIsspace :: Category Source #
pattern CategoryIsblank :: Category Source #
pattern CategoryIsgraph :: Category Source #
pattern CategoryIspunct :: Category Source #
pattern CategoryIsalnum :: Category Source #
pattern CategoryIsalpha :: Category Source #
pattern CategoryIsupper :: Category Source #
pattern CategoryIslower :: Category Source #
pattern CategoryIsdigit :: Category Source #
pattern CategoryIsxdigit :: Category Source #