module Glider.NLP.Tokenizer
( Token(..)
, foldCase
, getWords
, tokenize
, wordParser
, numberParser
, punctuationParser
, symbolParser
, spaceParser
, allParser
) where
import Prelude hiding (null, takeWhile, dropWhile, head, tail)
import Data.Text
import Data.Char
import qualified Data.List as List
-- | A lexical unit produced by the tokenizer.
data Token
  = Word Text
    -- ^ A letter followed by any number of alphanumeric characters.
  | Number Text
    -- ^ A run of one or more digits.
  | Punctuation Char
    -- ^ A single punctuation character.
  | Symbol Char
    -- ^ A single symbol character.
  | Whitespace
    -- ^ A run of one or more whitespace characters, collapsed to one token.
  | Unknown Char
    -- ^ A single character that none of the other categories matched.
  deriving (Eq, Show)
-- | Split a text into a flat list of tokens.
--
-- 'allParser' is applied repeatedly to whatever input is left over. Each
-- step that yields exactly one result contributes one token; the list ends
-- as soon as 'allParser' yields anything else.
tokenize :: Text -> [Token]
tokenize = List.unfoldr step
  where
    step remaining = case allParser remaining of
      [(tok, rest)] -> Just (tok, rest)
      _             -> Nothing
-- | Keep only the 'Word' tokens and return their text, preserving order.
-- Every other kind of token is dropped.
getWords :: [Token] -> [Text]
getWords tokens = [w | Word w <- tokens]
-- | Apply 'toCaseFold' to every text in the list, keeping order and length.
foldCase :: [Text] -> [Text]
foldCase texts = [toCaseFold t | t <- texts]
-- | A parser takes the input text and returns a list of
-- @(token, remaining input)@ pairs. The parsers defined in this module
-- return an empty list when they do not match and a single pair when they do.
type Parser = Text -> [(Token, Text)]
-- | Recognise a word at the start of the input.
--
-- A word starts with a letter ('isLetter') and extends over every following
-- alphanumeric character ('isAlphaNum'), so digits may appear after the
-- first character.
--
-- Returns @[]@ for empty input or when the first character is not a letter;
-- otherwise a single @('Word' w, rest)@ pair.
wordParser :: Parser
wordParser xs = case uncons xs of
  Just (c, _) | isLetter c -> [(Word word, rest)]
  _                        -> []
  where
    -- Fully qualified because the Prelude also exports a list 'span'.
    -- One pass yields both the word and the remaining input.
    (word, rest) = Data.Text.span isAlphaNum xs
-- | Recognise a run of digits ('isDigit') at the start of the input.
--
-- Returns @[]@ for empty input or when the first character is not a digit;
-- otherwise a single @('Number' digits, rest)@ pair.
numberParser :: Parser
numberParser xs = case uncons xs of
  Just (c, _) | isDigit c -> [(Number digits, rest)]
  _                       -> []
  where
    -- Fully qualified because the Prelude also exports a list 'span'.
    -- One pass yields both the digits and the remaining input.
    (digits, rest) = Data.Text.span isDigit xs
-- | Recognise a single punctuation character ('isPunctuation') at the start
-- of the input.
--
-- Returns @[]@ for empty input or when the first character is not
-- punctuation; otherwise a single @('Punctuation' c, rest)@ pair.
punctuationParser :: Parser
punctuationParser xs = case uncons xs of
  Just (c, rest) | isPunctuation c -> [(Punctuation c, rest)]
  _                                -> []
-- | Recognise a single symbol character ('isSymbol') at the start of the
-- input.
--
-- Returns @[]@ for empty input or when the first character is not a symbol;
-- otherwise a single @('Symbol' c, rest)@ pair.
symbolParser :: Parser
symbolParser xs = case uncons xs of
  Just (c, rest) | isSymbol c -> [(Symbol c, rest)]
  _                           -> []
-- | Recognise a run of whitespace ('isSpace') at the start of the input and
-- collapse it into one 'Whitespace' token.
--
-- Returns @[]@ for empty input or when the first character is not
-- whitespace; otherwise a single @('Whitespace', rest)@ pair in which @rest@
-- has all leading whitespace removed.
spaceParser :: Parser
spaceParser xs = case uncons xs of
  -- The first character is already known to be whitespace, so only the
  -- tail still needs stripping.
  Just (c, rest) | isSpace c -> [(Whitespace, dropWhile isSpace rest)]
  _                          -> []
-- | Consume exactly one character, whatever it is, and wrap it in 'Unknown'.
--
-- Returns @[]@ only for empty input; otherwise a single
-- @('Unknown' c, rest)@ pair.
charParser :: Parser
charParser xs = case uncons xs of
  Just (c, rest) -> [(Unknown c, rest)]
  Nothing        -> []
allParser :: Parser
allParser xs = case wordParser xs of
[(v, out)] -> [(v, out)]
_ -> case numberParser xs of
[(v, out)] -> [(v, out)]
_ -> case punctuationParser xs of
[(v, out)] -> [(v, out)]
_ -> case symbolParser xs of
[(v, out)] -> [(v, out)]
_ -> case spaceParser xs of
[(v, out)] -> [(v, out)]
_ -> charParser xs