{-# LANGUAGE OverloadedStrings, BangPatterns, FlexibleInstances #-} -- | A monad for writing pure tokenizers in an imperative-looking way. -- -- Main idea: You 'walk' through the input string like a turtle, and everytime -- you find a token boundary, you call 'emit'. If some specific kinds of tokens -- should be suppressed, you can 'discard' them instead (or filter afterwards). -- -- This module supports strict text, lazy text, and strings, though the package -- also provides support for ASCII bytestrings in separate modules. -- -- Example for a simple tokenizer, that splits words by whitespace and discards stop symbols: -- -- > tokenizeWords :: T.Text -> [T.Text] -- > tokenizeWords = runTokenizer $ untilEOT $ do -- > c <- pop -- > if isStopSym c -- > then discard -- > else if c `elem` ("  \t\r\n" :: [Char]) -- > then discard -- > else do -- > walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c)) -- > emit module Control.Monad.Tokenizer ( -- * Monad Tokenizer, runTokenizer, runTokenizerCS, untilEOT, -- * Tests peek, isEOT, lookAhead, -- * Movement walk, walkBack, pop, walkWhile, walkFold, -- * Transactions emit, discard, restore, -- * Text types Tokenizable(..) ) where import Control.Monad import Data.Char import Data.Monoid import qualified Data.Text as T import qualified Data.Text.Lazy as LT -- | Text types that can be split by the Tokenizer monad. In this module, -- instances are provided for String, strict Text, and lazy Text. -- If you are dealing with ASCII ByteStrings, you can find instances in -- the modules "Control.Monad.Tokenizer.Char8.Strict" and -- "Control.Monad.Tokenizer.Char8.Lazy" class Tokenizable t where tnull :: t -> Bool thead :: t -> Char ttail :: t -> t ttake :: Int -> t -> t tdrop :: Int -> t -> t tlower :: t -> t instance Tokenizable T.Text where tnull = T.null thead = T.head ttail = T.tail ttake = T.take tdrop = T.drop tlower = T.toLower instance Tokenizable LT.Text where tnull = LT.null thead = LT.head ttail = LT.tail ttake = LT.take . fromIntegral tdrop = LT.drop . fromIntegral tlower = LT.toLower instance Tokenizable [Char] where tnull = null thead = head ttail = tail ttake = take tdrop = drop tlower = map toLower -- | Tokenizer monad. Use runTokenizer or runTokenizerCS to run it newtype Tokenizer t a = Tokenizer { runTokenizer' :: (t, Int, t) -> (a,[t] -> [t],t,Int,t) } --- type explanation: (whole text since last emission, chars passed, remaining text) --- -> (result, difference list of tokens, whole text, chars passed, remaining text) instance Functor (Tokenizer t) where fmap = liftM instance Applicative (Tokenizer t) where pure = return (<*>) = ap instance Monad (Tokenizer t) where return a = Tokenizer $ \(whole,count,tail) -> (a,id,whole,count,tail) m >>= f = Tokenizer $ \(whole,count,tail) -> let (a1,o1,w1,!c1,t1) = runTokenizer' m (whole,count,tail) (a2,o2,w2,!c2,t2) = runTokenizer' (f a1) (w1,c1,t1) in (a2,o1.o2,w2,c2,t2) -- | Check if the next input chars agree with the given string lookAhead :: Tokenizable t => [Char] -> Tokenizer t Bool lookAhead chars = Tokenizer $ \(whole,count,tail) -> let h = unpack $ ttake (length chars) tail in (h == chars, id, whole, count, tail) where unpack t | tnull t = [] | otherwise = thead t : unpack (ttail t) -- | Proceed to the next character walk :: Tokenizable t => Tokenizer t () walk = Tokenizer $ \(whole,count,tail) -> if tnull tail then ((),id,whole,count,tail) else ((),id,whole,count+1,ttail tail) -- | Walk back to the previous character, unless it was discarded/emitted. walkBack :: Tokenizable t => Tokenizer t () walkBack = Tokenizer $ \(whole,count,_) -> if count > 0 then ((),id,whole,count-1,tdrop (count-1) whole) else ((),id,whole,0,whole) -- | Restore the state after the last emit/discard. restore :: Tokenizer t () restore = Tokenizer $ \(whole,_,_) -> ((),id,whole,0,whole) -- | Peek the current character peek :: Tokenizable t => Tokenizer t Char peek = Tokenizer $ \(whole,count,tail) -> (th tail,id,whole,count,tail) where th t | tnull t = '\0' | otherwise = thead t -- | Peek the current character and proceed pop :: Tokenizable t => Tokenizer t Char pop = peek <* walk -- | Break at the current position and emit the scanned token emit :: Tokenizable t => Tokenizer t () emit = Tokenizer $ \(whole,count,tail) -> ((),(ttake count whole:),tail,0,tail) -- | Break at the current position and discard the scanned token discard :: Tokenizer t () discard = Tokenizer $ \(whole,count,tail) -> ((),id,tail,0,tail) -- | Have I reached the end of the input text? isEOT :: Tokenizable t => Tokenizer t Bool isEOT = Tokenizer $ \(whole, count, tail) -> (tnull tail, id, whole, count, tail) -- | Proceed as long as a given function succeeds walkWhile :: Tokenizable t => (Char -> Bool) -> Tokenizer t () walkWhile f = do c <- peek when (c /= '\0' && f c) $ walk >> walkWhile f -- | Proceed as long as a given fold returns Just (generalization of walkWhile) walkFold :: Tokenizable t => a -> (Char -> a -> Maybe a) -> Tokenizer t () walkFold s0 f = do c <- peek unless (c == '\0') $ case f c s0 of Nothing -> return () Just s -> walk >> walkFold s f -- | Repeat a given tokenizer as long as the end of text is not reached untilEOT :: Tokenizable t => Tokenizer t () -> Tokenizer t () untilEOT f = do eot <- isEOT unless eot $ f >> untilEOT f -- | Split a string into tokens using the given tokenizer runTokenizer :: Tokenizable t => Tokenizer t () -> t -> [t] runTokenizer m input = let input' = tlower input in case runTokenizer' m (input',0,input') of (_, tokens, _, _, _) -> tokens [] -- | Split a string into tokens using the given tokenizer, case sensitive version runTokenizerCS :: Tokenizable t => Tokenizer t () -> t -> [t] runTokenizerCS m input = case runTokenizer' m (input,0,input) of (_, tokens, _, _, _) -> tokens []