{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE LambdaCase #-}
module Text.Pronounce.ParseDict
( CMUdict
, UsesBin
, initDict
, stdDict
, parseDict
, parseLine
) where
import Paths_pronounce
import System.FilePath
import Text.ParserCombinators.ReadP
import Data.Char
import Data.Text.Encoding
import Data.Binary (decodeFile)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Map as Map
type CMUdict = Map.Map T.Text [T.Text]
type UsesBin = Bool
initDict :: Maybe FilePath -> UsesBin -> IO CMUdict
initDict path = \case
True ->
case path of
Just p ->
return . Map.mapKeys decodeUtf8 . fmap (map decodeUtf8) =<< decodeFile p
Nothing ->
return . Map.mapKeys decodeUtf8 . fmap (map decodeUtf8) =<< decodeFile =<< getDataFileName "cmubin"
False ->
case path of
Just p ->
return . parseDict =<< T.readFile p
Nothing ->
return . parseDict =<< T.readFile =<< getDataFileName "cmuutf"
stdDict :: IO CMUdict
stdDict = initDict Nothing True
parseDict :: T.Text -> CMUdict
parseDict = Map.fromListWith (++) . map packAndParse . filter ((/= ';') . T.head) . T.lines
where packAndParse = (\(a,b) -> (T.pack a, [T.pack b])) . fst . head . readP_to_S parseLine . T.unpack
parseLine :: ReadP (String, String)
parseLine = (,) <$> (many get) <* (paren <++ string "") <* string " "
<*> (munch . const $ True)
paren :: ReadP String
paren = char '(' *> munch isDigit <* char ')'