{-# LANGUAGE OverloadedStrings, BangPatterns, FlexibleInstances #-}
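
-- | A monad for writing pure tokenizers in an imperative-looking style: walk
-- over the input character by character, then 'emit' or 'discard' everything
-- visited since the last token boundary. Tokenizers are run with
-- 'runTokenizer' or 'runTokenizerCS', pure tokenizing functions can be
-- spliced in with 'embed', and 'convert' / 'convertWith' move a tokenizer
-- between text types.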
module Control.Monad.Tokenizer (
    Tokenizer,
    runTokenizer,
    runTokenizerCS,
    untilEOT,
    peek,
    isEOT,
    lookAhead,
    walk,
    walkBack,
    pop,
    walkWhile,
    walkFold,
    emit,
    discard,
    restore,
    embed,
    embed_,
    discardAndEmbed,
    convert,
    convertWith,
    Tokenizable(..)
  ) where

import Control.Monad
import Data.Char
import Data.Monoid
import Data.String
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import Control.Monad.Tokenizer.Class
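
-- | Tokenizer monad. Use 'runTokenizer' or 'runTokenizerCS' to run it.
--
-- Internally a step is a function over the state @(whole, count, tail)@:
-- @whole@ is the input as of the last token boundary, @count@ the number of
-- characters walked since then, and @tail@ the remaining input (always
-- @tdrop count whole@). Alongside the result and the new state, each step
-- returns the emitted tokens as a difference list @[t] -> [t]@, so '>>='
-- can append token output in O(1).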
newtype Tokenizer t a = Tokenizer { runTokenizer' :: (t, Int, t) -> (a,[t] -> [t],t,Int,t) }

instance Functor (Tokenizer t) where
  fmap = liftM

instance Applicative (Tokenizer t) where
  pure = return
  (<*>) = ap

instance Monad (Tokenizer t) where
  return a = Tokenizer $ \(whole,count,tail) -> (a,id,whole,count,tail)
  m >>= f  = Tokenizer $ \(whole,count,tail) ->
    let (a1,o1,w1,!c1,t1) = runTokenizer' m (whole,count,tail)
        (a2,o2,w2,!c2,t2) = runTokenizer' (f a1) (w1,c1,t1)
    in (a2,o1.o2,w2,c2,t2)

instance Tokenizable t => MonadTokenizer (Tokenizer t) where
  -- True iff the remaining input starts with the given characters.
  lookAhead chars =
    Tokenizer $ \(whole,count,tail) ->
      let h = unpack $ ttake (length chars) tail
      in (h == chars, id, whole, count, tail)
    where unpack t | tnull t   = []
                   | otherwise = thead t : unpack (ttail t)
  -- Advance by one character; a no-op at the end of the text.
  walk = Tokenizer $ \(whole,count,tail) ->
    if tnull tail
      then ((),id,whole,count,tail)
      else ((),id,whole,count+1,ttail tail)
  -- Step one character back, but never beyond the last token boundary.
  walkBack = Tokenizer $ \(whole,count,_) ->
    if count > 0
      then ((),id,whole,count-1,tdrop (count-1) whole)
      else ((),id,whole,0,whole)
  -- Drop all walking since the last token boundary and start over from it.
  restore = Tokenizer $ \(whole,_,_) -> ((),id,whole,0,whole)
  -- Look at the next character without consuming it; '\0' at end of text.
  peek = Tokenizer $ \(whole,count,tail) -> (th tail,id,whole,count,tail)
    where th t | tnull t   = '\0'
               | otherwise = thead t
  -- Emit everything walked since the last boundary as a single token.
  emit = Tokenizer $ \(whole,count,tail) -> ((),(ttake count whole:),tail,0,tail)
  -- Forget everything walked since the last boundary without emitting it.
  discard = Tokenizer $ \(whole,count,tail) -> ((),id,tail,0,tail)
  -- Has the end of the text been reached?
  isEOT = Tokenizer $ \(whole, count, tail) -> (tnull tail, id, whole, count, tail)
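
-- A small illustrative sketch, not part of this module: a tokenizer for
-- whitespace-separated words built from the primitives above. It assumes
-- 'untilEOT' repeats its argument until 'isEOT' holds and 'walkWhile' walks
-- while the predicate is true of the next character (both come from
-- Control.Monad.Tokenizer.Class); the @/= '\0'@ guard keeps the loop safe
-- because 'peek' returns @'\0'@ at the end of the text.
--
-- > wordTokenizer :: Tokenizable t => Tokenizer t ()
-- > wordTokenizer = untilEOT $ do
-- >   c <- peek
-- >   if isSpace c
-- >     then walk >> discard
-- >     else walkWhile (\x -> not (isSpace x) && x /= '\0') >> emit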

-- | Embed a pure tokenizer into the monad. The function receives the pair
-- (characters walked since the last token boundary, remaining input) and
-- returns a result, the tokens to emit, and the new remaining input.
embed :: Tokenizable t => ((t,t) -> (a,[t],t)) -> Tokenizer t a
embed f = Tokenizer $ \(whole,count,tail) ->
  let (a, rs, rem) = f (ttake count whole, tail)
  in (a, (rs++), rem, 0, rem)

-- | Like 'embed', but without a result.
embed_ :: Tokenizable t => ((t,t) -> ([t],t)) -> Tokenizer t ()
embed_ f = embed f'
  where f' (v,u) = let (rs,rem) = f (v,u) in ((),rs,rem)

-- | Discard the characters walked so far, then embed a pure tokenizer that
-- only sees the remaining input.
discardAndEmbed :: Tokenizable t => (t -> (a,[t],t)) -> Tokenizer t a
discardAndEmbed f = discard >> embed (f . snd)
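
-- An illustrative sketch of embedding a pure step (hypothetical helper, not
-- part of this module): split everything that remains into lines in one go,
-- assuming the Tokenizable instance for strict Text from the Class module.
--
-- > emitRemainingLines :: Tokenizer T.Text ()
-- > emitRemainingLines = discardAndEmbed $ \rest -> ((), T.lines rest, T.empty)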

-- | Convert a tokenizer from one text type to another, going through String
-- as an intermediate representation.
convert :: (Tokenizable t, IsString t, Tokenizable s, IsString s) =>
           Tokenizer s a -> Tokenizer t a
convert = convertWith fromString unpack . convertWith unpack fromString
  where unpack t | tnull t   = []
                 | otherwise = thead t : unpack (ttail t)

-- | Convert a tokenizer to another text type, given a conversion from its
-- original type to the new one (applied to the emitted tokens and state) and
-- one going back (applied to the input).
convertWith :: (s -> t) -> (t -> s) -> Tokenizer s a -> Tokenizer t a
convertWith fw bw (Tokenizer runTok) = Tokenizer $ \(whole,count,tail) ->
  let (a, o, w, c, t) = runTok (bw whole, count, bw tail)
      o' = map fw $ o []
  in (a, (o'++), fw w, c, fw t)
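
-- For example, a tokenizer written over String can be reused on strict Text
-- by supplying the two conversions explicitly ('onText' is a hypothetical
-- wrapper, not part of this module):
--
-- > onText :: Tokenizer String a -> Tokenizer T.Text a
-- > onText = convertWith T.pack T.unpack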

-- | Split the input into tokens using the given tokenizer. The input is
-- passed through 'tlower' first; use 'runTokenizerCS' to keep the original
-- case.
runTokenizer :: Tokenizable t => Tokenizer t () -> t -> [t]
runTokenizer m input =
  let input' = tlower input
  in case runTokenizer' m (input',0,input') of
       (_, tokens, _, _, _) -> tokens []

-- | Split the input into tokens using the given tokenizer, case-sensitive
-- version: the input is used as given.
runTokenizerCS :: Tokenizable t => Tokenizer t () -> t -> [t]
runTokenizerCS m input =
  case runTokenizer' m (input,0,input) of
    (_, tokens, _, _, _) -> tokens []
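
-- End-to-end sketch (illustrative, not checked by the build): with the
-- 'wordTokenizer' example above and a String instance of 'Tokenizable' whose
-- 'tlower' lower-cases the input,
--
-- >   runTokenizerCS wordTokenizer "Hello  World"  ==  ["Hello","World"]
-- >   runTokenizer   wordTokenizer "Hello  World"  ==  ["hello","world"]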