{-# LANGUAGE CPP #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
#if EMBED
{-# LANGUAGE TemplateHaskell #-}
#endif
-- | The 'Language' enumeration, one ready-made 'Hyphenator' per language, and
-- the functions used to select or load them.
module Text.Hyphenation.Language
(
Language(..)
, languageHyphenator
, afrikaans, armenian, assamese, basque, bengali, bulgarian, catalan, chinese
, coptic, croatian, czech, danish, dutch, english_US, english_GB, esperanto
, estonian, ethiopic, finnish, french, friulan, galician, georgian, german_1901, german_1996
, german_Swiss, greek_Ancient, greek_Mono, greek_Poly, gujarati, hindi, hungarian
, icelandic, indonesian, interlingua, irish, italian, kannada, kurmanji, latin, latin_Classic
, latvian, lithuanian, malayalam, marathi, mongolian, norwegian_Bokmal
, norwegian_Nynorsk, occitan, oriya, panjabi, piedmontese, polish, portuguese, romanian, romansh
, russian, sanskrit, serbian_Cyrillic, serbocroatian_Cyrillic
, serbocroatian_Latin, slovak, slovenian, spanish, swedish, tamil
, telugu, thai, turkish, turkmen, ukrainian, uppersorbian, welsh
, loadHyphenator
, languageAffix
) where
import Codec.Compression.GZip
#if __GLASGOW_HASKELL__ < 710
import Data.Functor ((<$>))
#endif
import qualified Data.IntMap as IM
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Text.Hyphenation.ByteStringLazyCompat as Lazy
import Text.Hyphenation.Hyphenator
import Text.Hyphenation.Pattern
import Text.Hyphenation.Exception
import System.IO.Unsafe
#if !EMBED
import Paths_hyphenation
#else
import Data.FileEmbed
import qualified Data.ByteString.Char8 as Strict
-- | Every file of the @data@ directory paired with its path, captured at
-- compile time by 'embedDir'. Only compiled when @EMBED@ is set; in that
-- configuration 'loadHyphenator' looks its data files up in this table.
hyphenatorFiles :: [(FilePath, Strict.ByteString)]
hyphenatorFiles = $(embedDir "data")
#endif
-- | Turn one line of a character table into lookup entries.
--
-- The first character of the line is the target; every following character
-- yields an entry keyed by its code point and mapped to that target. An
-- empty line, or a line holding only the target, yields no entries.
chrLine :: String -> [(Int, Char)]
chrLine line = case line of
  [] -> []
  target : variants -> [(fromEnum variant, target) | variant <- variants]
-- | Load the hyphenation data of a language and build its 'Hyphenator'.
--
-- Three gzip-compressed UTF-8 files are consulted, all named
-- @hyph-AFFIX.KIND.txt.gz@, where @AFFIX@ comes from 'languageAffix' and
-- @KIND@ is one of:
--
-- * @hyp@: handed to 'parseExceptions'
-- * @pat@: handed to 'parsePatterns'
-- * @chr@: a character table, expanded line by line with 'chrLine'
--
-- The two default minimums of the result come from 'languageMins'.
loadHyphenator :: Language -> IO Hyphenator
loadHyphenator language = do
  hyp <- readHyphenationData (dataFileName "hyp")
  pat <- readHyphenationData (dataFileName "pat")
  chr <- readHyphenationData (dataFileName "chr")
  -- Characters that have no entry in the table are passed through unchanged.
  let chrMap = IM.fromList (Prelude.lines chr >>= chrLine)
      tryLookup x = IM.findWithDefault x (fromEnum x) chrMap
      (defaultLeftMin, defaultRightMin) = languageMins language
  return $ Hyphenator tryLookup (parsePatterns pat) (parseExceptions hyp) defaultLeftMin defaultRightMin
  where
    dataFileName kind = "hyph-" ++ languageAffix language ++ "." ++ kind ++ ".txt.gz"

-- | Fetch one data file by name and return its decompressed, decoded text.
--
-- Without @EMBED@ the file is located with 'getDataFileName' and read from
-- disk. With @EMBED@ it is looked up in 'hyphenatorFiles'; a name missing
-- from that table is reported as an 'IOError' that mentions the file, rather
-- than as an anonymous pattern-match failure raised later in pure code.
readHyphenationData :: FilePath -> IO String
#if !EMBED
readHyphenationData name = unzipUtf8 <$> (getDataFileName name >>= Lazy.readFile)
#else
readHyphenationData name = case lookup name hyphenatorFiles of
  Just bytes -> return (unzipUtf8 (Lazy.fromStrict bytes))
  Nothing -> ioError (userError ("loadHyphenator: embedded data file not found: " ++ name))
#endif
-- | Gunzip a lazy 'ByteString' and decode the result as UTF-8 text.
--
-- A byte that cannot be decoded is not dropped: the error handler replaces
-- it with the character whose code point equals the byte value.
unzipUtf8 :: ByteString -> String
unzipUtf8 compressed = T.unpack decoded
  where
    rawBytes = Lazy.toStrict (decompress compressed)
    decoded = T.decodeUtf8With byteAsChar rawBytes
    -- The handler ignores the error description and only converts the byte.
    byteAsChar _description = fmap (toEnum . fromEnum)
-- | The languages for which hyphenation data is provided.
--
-- The order of the constructors is significant: the derived 'Ord', 'Enum'
-- and 'Bounded' instances follow it, so 'minBound' is 'Afrikaans' and
-- 'maxBound' is 'Welsh'. The affixes quoted below are the ones returned by
-- 'languageAffix'.
data Language
  = Afrikaans
  | Armenian
  | Assamese
  | Basque
  | Bengali
  | Bulgarian
  | Catalan
  | Chinese -- ^ affix @zh-latn-pinyin@
  | Coptic
  | Croatian
  | Czech
  | Danish
  | Dutch
  | English_US -- ^ affix @en-us@
  | English_GB -- ^ affix @en-gb@
  | Esperanto
  | Estonian
  | Ethiopic -- ^ affix @mul-ethi@
  | Finnish
  | French
  | Friulan
  | Galician
  | Georgian
  | German_1901 -- ^ affix @de-1901@
  | German_1996 -- ^ affix @de-1996@
  | German_Swiss -- ^ affix @de-ch-1901@
  | Greek_Ancient -- ^ affix @grc@
  | Greek_Mono -- ^ affix @el-monoton@
  | Greek_Poly -- ^ affix @el-polyton@
  | Gujarati
  | Hindi
  | Hungarian
  | Icelandic
  | Indonesian
  | Interlingua
  | Irish
  | Italian
  | Kannada
  | Kurmanji
  | Latin -- ^ affix @la@
  | Latin_Classic -- ^ affix @la-x-classic@
  | Latvian
  | Lithuanian
  | Malayalam
  | Marathi
  | Mongolian -- ^ affix @mn-cyrl@
  | Norwegian_Bokmal -- ^ affix @nb@
  | Norwegian_Nynorsk -- ^ affix @nn@
  | Occitan
  | Oriya
  | Panjabi
  | Piedmontese
  | Polish
  | Portuguese
  | Romanian
  | Romansh
  | Russian
  | Sanskrit
  | Serbian_Cyrillic -- ^ affix @sr-cyrl@
  | Serbocroatian_Cyrillic -- ^ affix @sh-cyrl@
  | Serbocroatian_Latin -- ^ affix @sh-latn@
  | Slovak
  | Slovenian
  | Spanish
  | Swedish
  | Tamil
  | Telugu
  | Thai
  | Turkish
  | Turkmen
  | Ukrainian
  | Uppersorbian
  | Welsh
  deriving (Eq, Ord, Show, Bounded, Enum)
-- | The affix that identifies a language in the names of its data files.
--
-- 'loadHyphenator' splices this string into file names of the form
-- @hyph-AFFIX.KIND.txt.gz@.
languageAffix :: Language -> String
languageAffix Afrikaans = "af"
languageAffix Armenian = "hy"
languageAffix Assamese = "as"
languageAffix Basque = "eu"
languageAffix Bengali = "bn"
languageAffix Bulgarian = "bg"
languageAffix Catalan = "ca"
languageAffix Chinese = "zh-latn-pinyin"
languageAffix Coptic = "cop"
languageAffix Croatian = "hr"
languageAffix Czech = "cs"
languageAffix Danish = "da"
languageAffix Dutch = "nl"
languageAffix English_US = "en-us"
languageAffix English_GB = "en-gb"
languageAffix Esperanto = "eo"
languageAffix Estonian = "et"
languageAffix Ethiopic = "mul-ethi"
languageAffix Finnish = "fi"
languageAffix French = "fr"
languageAffix Friulan = "fur"
languageAffix Galician = "gl"
languageAffix Georgian = "ka"
languageAffix German_1901 = "de-1901"
languageAffix German_1996 = "de-1996"
languageAffix German_Swiss = "de-ch-1901"
languageAffix Greek_Ancient = "grc"
languageAffix Greek_Mono = "el-monoton"
languageAffix Greek_Poly = "el-polyton"
languageAffix Gujarati = "gu"
languageAffix Hindi = "hi"
languageAffix Hungarian = "hu"
languageAffix Icelandic = "is"
languageAffix Indonesian = "id"
languageAffix Interlingua = "ia"
languageAffix Irish = "ga"
languageAffix Italian = "it"
languageAffix Kannada = "kn"
languageAffix Kurmanji = "kmr"
languageAffix Latin = "la"
languageAffix Latin_Classic = "la-x-classic"
languageAffix Latvian = "lv"
languageAffix Lithuanian = "lt"
languageAffix Malayalam = "ml"
languageAffix Marathi = "mr"
languageAffix Mongolian = "mn-cyrl"
languageAffix Norwegian_Bokmal = "nb"
languageAffix Norwegian_Nynorsk = "nn"
languageAffix Occitan = "oc"
languageAffix Oriya = "or"
languageAffix Panjabi = "pa"
languageAffix Piedmontese = "pms"
languageAffix Polish = "pl"
languageAffix Portuguese = "pt"
languageAffix Romanian = "ro"
languageAffix Romansh = "rm"
languageAffix Russian = "ru"
languageAffix Sanskrit = "sa"
languageAffix Serbian_Cyrillic = "sr-cyrl"
languageAffix Serbocroatian_Cyrillic = "sh-cyrl"
languageAffix Serbocroatian_Latin = "sh-latn"
languageAffix Slovak = "sk"
languageAffix Slovenian = "sl"
languageAffix Spanish = "es"
languageAffix Swedish = "sv"
languageAffix Tamil = "ta"
languageAffix Telugu = "te"
languageAffix Thai = "th"
languageAffix Turkish = "tr"
languageAffix Turkmen = "tk"
languageAffix Ukrainian = "uk"
languageAffix Uppersorbian = "hsb"
languageAffix Welsh = "cy"
-- | The default @(left, right)@ minimums of a language.
--
-- 'loadHyphenator' passes the two components to the 'Hyphenator'
-- constructor as its default left and right minimum, in that order.
languageMins :: Language -> (Int, Int)
languageMins Afrikaans = (1, 2)
languageMins Armenian = (1, 2)
languageMins Assamese = (1, 1)
languageMins Basque = (2, 2)
languageMins Bengali = (1, 1)
languageMins Bulgarian = (2, 2)
languageMins Catalan = (2, 2)
languageMins Chinese = (1, 1)
languageMins Coptic = (1, 1)
languageMins Croatian = (2, 2)
languageMins Czech = (2, 3)
languageMins Danish = (2, 2)
languageMins Dutch = (2, 2)
languageMins English_US = (2, 3)
languageMins English_GB = (2, 3)
languageMins Esperanto = (2, 2)
languageMins Estonian = (2, 3)
languageMins Ethiopic = (1, 1)
languageMins Finnish = (2, 2)
languageMins French = (2, 3)
languageMins Friulan = (2, 2)
languageMins Galician = (2, 2)
languageMins Georgian = (1, 2)
languageMins German_1901 = (2, 2)
languageMins German_1996 = (2, 2)
languageMins German_Swiss = (2, 2)
languageMins Greek_Ancient = (1, 1)
languageMins Greek_Mono = (1, 1)
languageMins Greek_Poly = (1, 1)
languageMins Gujarati = (1, 1)
languageMins Hindi = (1, 1)
languageMins Hungarian = (2, 2)
languageMins Icelandic = (2, 2)
languageMins Indonesian = (2, 2)
languageMins Interlingua = (2, 2)
languageMins Irish = (2, 3)
languageMins Italian = (2, 2)
languageMins Kannada = (1, 1)
languageMins Kurmanji = (2, 2)
languageMins Latin = (2, 2)
languageMins Latin_Classic = (2, 2)
languageMins Latvian = (2, 2)
languageMins Lithuanian = (2, 2)
languageMins Malayalam = (1, 1)
languageMins Marathi = (1, 1)
languageMins Mongolian = (2, 2)
languageMins Norwegian_Bokmal = (2, 2)
languageMins Norwegian_Nynorsk = (2, 2)
languageMins Occitan = (2, 2)
languageMins Oriya = (1, 1)
languageMins Panjabi = (1, 1)
languageMins Piedmontese = (2, 2)
languageMins Polish = (2, 2)
languageMins Portuguese = (2, 3)
languageMins Romanian = (2, 2)
languageMins Romansh = (2, 2)
languageMins Russian = (2, 2)
languageMins Sanskrit = (1, 3)
languageMins Serbian_Cyrillic = (2, 2)
languageMins Serbocroatian_Cyrillic = (2, 2)
languageMins Serbocroatian_Latin = (2, 2)
languageMins Slovak = (2, 3)
languageMins Slovenian = (2, 2)
languageMins Spanish = (2, 2)
languageMins Swedish = (2, 2)
languageMins Tamil = (1, 1)
languageMins Telugu = (1, 1)
languageMins Thai = (2, 3)
languageMins Turkish = (2, 2)
languageMins Turkmen = (2, 2)
languageMins Ukrainian = (2, 2)
languageMins Uppersorbian = (2, 2)
languageMins Welsh = (2, 3)
-- Every hyphenator below is a top-level constant obtained by running
-- 'loadHyphenator' through 'unsafePerformIO'. Each one carries a NOINLINE
-- pragma, as the documentation of 'unsafePerformIO' requires: if GHC inlined
-- a definition, its data files could be read and parsed again at each use
-- site instead of once per program run.

-- | Hyphenator loaded for 'English_US'.
english_US :: Hyphenator

-- | Hyphenator loaded for 'English_GB'.
english_GB :: Hyphenator

-- | Hyphenator loaded for 'French'.
french :: Hyphenator

-- | Hyphenator loaded for 'Icelandic'.
icelandic :: Hyphenator

afrikaans, armenian, assamese, basque, bengali, bulgarian, catalan, chinese,
  coptic, croatian, czech, danish, dutch, esperanto,
  estonian, ethiopic, finnish, friulan, galician, georgian, german_1901, german_1996,
  german_Swiss, greek_Ancient, greek_Mono, greek_Poly, gujarati, hindi, hungarian,
  indonesian, interlingua, irish, italian, kannada, kurmanji, latin, latin_Classic,
  latvian, lithuanian, malayalam, marathi, mongolian, norwegian_Bokmal,
  norwegian_Nynorsk, occitan, oriya, panjabi, piedmontese, polish, portuguese, romanian,
  romansh, russian, sanskrit, serbian_Cyrillic, serbocroatian_Cyrillic,
  serbocroatian_Latin, slovak, slovenian, spanish, swedish, tamil,
  telugu, thai, turkish, turkmen, ukrainian, uppersorbian, welsh :: Hyphenator

{-# NOINLINE afrikaans #-}
afrikaans = unsafePerformIO (loadHyphenator Afrikaans)
{-# NOINLINE armenian #-}
armenian = unsafePerformIO (loadHyphenator Armenian)
{-# NOINLINE assamese #-}
assamese = unsafePerformIO (loadHyphenator Assamese)
{-# NOINLINE basque #-}
basque = unsafePerformIO (loadHyphenator Basque)
{-# NOINLINE bengali #-}
bengali = unsafePerformIO (loadHyphenator Bengali)
{-# NOINLINE bulgarian #-}
bulgarian = unsafePerformIO (loadHyphenator Bulgarian)
{-# NOINLINE catalan #-}
catalan = unsafePerformIO (loadHyphenator Catalan)
{-# NOINLINE chinese #-}
chinese = unsafePerformIO (loadHyphenator Chinese)
{-# NOINLINE coptic #-}
coptic = unsafePerformIO (loadHyphenator Coptic)
{-# NOINLINE croatian #-}
croatian = unsafePerformIO (loadHyphenator Croatian)
{-# NOINLINE czech #-}
czech = unsafePerformIO (loadHyphenator Czech)
{-# NOINLINE danish #-}
danish = unsafePerformIO (loadHyphenator Danish)
{-# NOINLINE dutch #-}
dutch = unsafePerformIO (loadHyphenator Dutch)
{-# NOINLINE english_US #-}
english_US = unsafePerformIO (loadHyphenator English_US)
{-# NOINLINE english_GB #-}
english_GB = unsafePerformIO (loadHyphenator English_GB)
{-# NOINLINE esperanto #-}
esperanto = unsafePerformIO (loadHyphenator Esperanto)
{-# NOINLINE estonian #-}
estonian = unsafePerformIO (loadHyphenator Estonian)
{-# NOINLINE ethiopic #-}
ethiopic = unsafePerformIO (loadHyphenator Ethiopic)
{-# NOINLINE finnish #-}
finnish = unsafePerformIO (loadHyphenator Finnish)
{-# NOINLINE french #-}
french = unsafePerformIO (loadHyphenator French)
{-# NOINLINE friulan #-}
friulan = unsafePerformIO (loadHyphenator Friulan)
{-# NOINLINE galician #-}
galician = unsafePerformIO (loadHyphenator Galician)
{-# NOINLINE georgian #-}
georgian = unsafePerformIO (loadHyphenator Georgian)
{-# NOINLINE german_1901 #-}
german_1901 = unsafePerformIO (loadHyphenator German_1901)
{-# NOINLINE german_1996 #-}
german_1996 = unsafePerformIO (loadHyphenator German_1996)
{-# NOINLINE german_Swiss #-}
german_Swiss = unsafePerformIO (loadHyphenator German_Swiss)
{-# NOINLINE greek_Ancient #-}
greek_Ancient = unsafePerformIO (loadHyphenator Greek_Ancient)
{-# NOINLINE greek_Mono #-}
greek_Mono = unsafePerformIO (loadHyphenator Greek_Mono)
{-# NOINLINE greek_Poly #-}
greek_Poly = unsafePerformIO (loadHyphenator Greek_Poly)
{-# NOINLINE gujarati #-}
gujarati = unsafePerformIO (loadHyphenator Gujarati)
{-# NOINLINE hindi #-}
hindi = unsafePerformIO (loadHyphenator Hindi)
{-# NOINLINE hungarian #-}
hungarian = unsafePerformIO (loadHyphenator Hungarian)
{-# NOINLINE icelandic #-}
icelandic = unsafePerformIO (loadHyphenator Icelandic)
{-# NOINLINE indonesian #-}
indonesian = unsafePerformIO (loadHyphenator Indonesian)
{-# NOINLINE interlingua #-}
interlingua = unsafePerformIO (loadHyphenator Interlingua)
{-# NOINLINE irish #-}
irish = unsafePerformIO (loadHyphenator Irish)
{-# NOINLINE italian #-}
italian = unsafePerformIO (loadHyphenator Italian)
{-# NOINLINE kannada #-}
kannada = unsafePerformIO (loadHyphenator Kannada)
{-# NOINLINE kurmanji #-}
kurmanji = unsafePerformIO (loadHyphenator Kurmanji)
{-# NOINLINE latin #-}
latin = unsafePerformIO (loadHyphenator Latin)
{-# NOINLINE latin_Classic #-}
latin_Classic = unsafePerformIO (loadHyphenator Latin_Classic)
{-# NOINLINE latvian #-}
latvian = unsafePerformIO (loadHyphenator Latvian)
{-# NOINLINE lithuanian #-}
lithuanian = unsafePerformIO (loadHyphenator Lithuanian)
{-# NOINLINE malayalam #-}
malayalam = unsafePerformIO (loadHyphenator Malayalam)
{-# NOINLINE marathi #-}
marathi = unsafePerformIO (loadHyphenator Marathi)
{-# NOINLINE mongolian #-}
mongolian = unsafePerformIO (loadHyphenator Mongolian)
{-# NOINLINE norwegian_Bokmal #-}
norwegian_Bokmal = unsafePerformIO (loadHyphenator Norwegian_Bokmal)
{-# NOINLINE norwegian_Nynorsk #-}
norwegian_Nynorsk = unsafePerformIO (loadHyphenator Norwegian_Nynorsk)
{-# NOINLINE occitan #-}
occitan = unsafePerformIO (loadHyphenator Occitan)
{-# NOINLINE oriya #-}
oriya = unsafePerformIO (loadHyphenator Oriya)
{-# NOINLINE panjabi #-}
panjabi = unsafePerformIO (loadHyphenator Panjabi)
{-# NOINLINE piedmontese #-}
piedmontese = unsafePerformIO (loadHyphenator Piedmontese)
{-# NOINLINE polish #-}
polish = unsafePerformIO (loadHyphenator Polish)
{-# NOINLINE portuguese #-}
portuguese = unsafePerformIO (loadHyphenator Portuguese)
{-# NOINLINE romanian #-}
romanian = unsafePerformIO (loadHyphenator Romanian)
{-# NOINLINE romansh #-}
romansh = unsafePerformIO (loadHyphenator Romansh)
{-# NOINLINE russian #-}
russian = unsafePerformIO (loadHyphenator Russian)
{-# NOINLINE sanskrit #-}
sanskrit = unsafePerformIO (loadHyphenator Sanskrit)
{-# NOINLINE serbian_Cyrillic #-}
serbian_Cyrillic = unsafePerformIO (loadHyphenator Serbian_Cyrillic)
{-# NOINLINE serbocroatian_Cyrillic #-}
serbocroatian_Cyrillic = unsafePerformIO (loadHyphenator Serbocroatian_Cyrillic)
{-# NOINLINE serbocroatian_Latin #-}
serbocroatian_Latin = unsafePerformIO (loadHyphenator Serbocroatian_Latin)
{-# NOINLINE slovak #-}
slovak = unsafePerformIO (loadHyphenator Slovak)
{-# NOINLINE slovenian #-}
slovenian = unsafePerformIO (loadHyphenator Slovenian)
{-# NOINLINE spanish #-}
spanish = unsafePerformIO (loadHyphenator Spanish)
{-# NOINLINE swedish #-}
swedish = unsafePerformIO (loadHyphenator Swedish)
{-# NOINLINE tamil #-}
tamil = unsafePerformIO (loadHyphenator Tamil)
{-# NOINLINE telugu #-}
telugu = unsafePerformIO (loadHyphenator Telugu)
{-# NOINLINE thai #-}
thai = unsafePerformIO (loadHyphenator Thai)
{-# NOINLINE turkish #-}
turkish = unsafePerformIO (loadHyphenator Turkish)
{-# NOINLINE turkmen #-}
turkmen = unsafePerformIO (loadHyphenator Turkmen)
{-# NOINLINE ukrainian #-}
ukrainian = unsafePerformIO (loadHyphenator Ukrainian)
{-# NOINLINE uppersorbian #-}
uppersorbian = unsafePerformIO (loadHyphenator Uppersorbian)
{-# NOINLINE welsh #-}
welsh = unsafePerformIO (loadHyphenator Welsh)
-- | Select the ready-made hyphenator of a language.
--
-- Each result is the top-level constant of this module named after the
-- language; no data is loaded by this function itself.
languageHyphenator :: Language -> Hyphenator
languageHyphenator Afrikaans = afrikaans
languageHyphenator Armenian = armenian
languageHyphenator Assamese = assamese
languageHyphenator Basque = basque
languageHyphenator Bengali = bengali
languageHyphenator Bulgarian = bulgarian
languageHyphenator Catalan = catalan
languageHyphenator Chinese = chinese
languageHyphenator Coptic = coptic
languageHyphenator Croatian = croatian
languageHyphenator Czech = czech
languageHyphenator Danish = danish
languageHyphenator Dutch = dutch
languageHyphenator English_US = english_US
languageHyphenator English_GB = english_GB
languageHyphenator Esperanto = esperanto
languageHyphenator Estonian = estonian
languageHyphenator Ethiopic = ethiopic
languageHyphenator Finnish = finnish
languageHyphenator French = french
languageHyphenator Friulan = friulan
languageHyphenator Galician = galician
languageHyphenator Georgian = georgian
languageHyphenator German_1901 = german_1901
languageHyphenator German_1996 = german_1996
languageHyphenator German_Swiss = german_Swiss
languageHyphenator Greek_Ancient = greek_Ancient
languageHyphenator Greek_Mono = greek_Mono
languageHyphenator Greek_Poly = greek_Poly
languageHyphenator Gujarati = gujarati
languageHyphenator Hindi = hindi
languageHyphenator Hungarian = hungarian
languageHyphenator Icelandic = icelandic
languageHyphenator Indonesian = indonesian
languageHyphenator Interlingua = interlingua
languageHyphenator Irish = irish
languageHyphenator Italian = italian
languageHyphenator Kannada = kannada
languageHyphenator Kurmanji = kurmanji
languageHyphenator Latin = latin
languageHyphenator Latin_Classic = latin_Classic
languageHyphenator Latvian = latvian
languageHyphenator Lithuanian = lithuanian
languageHyphenator Malayalam = malayalam
languageHyphenator Marathi = marathi
languageHyphenator Mongolian = mongolian
languageHyphenator Norwegian_Bokmal = norwegian_Bokmal
languageHyphenator Norwegian_Nynorsk = norwegian_Nynorsk
languageHyphenator Occitan = occitan
languageHyphenator Oriya = oriya
languageHyphenator Panjabi = panjabi
languageHyphenator Piedmontese = piedmontese
languageHyphenator Polish = polish
languageHyphenator Portuguese = portuguese
languageHyphenator Romanian = romanian
languageHyphenator Romansh = romansh
languageHyphenator Russian = russian
languageHyphenator Sanskrit = sanskrit
languageHyphenator Serbian_Cyrillic = serbian_Cyrillic
languageHyphenator Serbocroatian_Cyrillic = serbocroatian_Cyrillic
languageHyphenator Serbocroatian_Latin = serbocroatian_Latin
languageHyphenator Slovak = slovak
languageHyphenator Slovenian = slovenian
languageHyphenator Spanish = spanish
languageHyphenator Swedish = swedish
languageHyphenator Tamil = tamil
languageHyphenator Telugu = telugu
languageHyphenator Thai = thai
languageHyphenator Turkish = turkish
languageHyphenator Turkmen = turkmen
languageHyphenator Ukrainian = ukrainian
languageHyphenator Uppersorbian = uppersorbian
languageHyphenator Welsh = welsh