{-# LANGUAGE OverloadedStrings, BangPatterns, FlexibleInstances #-}

-- | A monad for writing pure tokenizers in an imperative-looking way.
--
-- Main idea: You 'walk' through the input string like a turtle, and everytime
-- you find a token boundary, you call 'emit'. If some specific kinds of tokens
-- should be suppressed, you can 'discard' them instead (or filter afterwards).
--
-- This module supports strict text, lazy text, and strings, though the package
-- also provides support for ASCII bytestrings in separate modules.
--
-- Example for a simple tokenizer, that splits words by whitespace and discards stop symbols:
--
-- > tokenizeWords :: T.Text -> [T.Text]
-- > tokenizeWords = runTokenizer $ untilEOT $ do
-- >   c <- pop
-- >   if isStopSym c
-- >     then discard
-- >     else if c `elem` ("  \t\r\n" :: [Char])
-- >          then discard
-- >          else do
-- >            walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c))
-- >            emit

module Control.Monad.Tokenizer (
  -- * Monad
  Tokenizer,
  runTokenizer,
  runTokenizerCS,
  untilEOT,
  -- * Tests
  peek,
  isEOT,
  lookAhead,
  -- * Movement
  walk,
  walkBack,
  pop,
  walkWhile,
  walkFold,
  -- * Transactions
  emit,
  discard,
  restore,
  -- * Text types
  Tokenizable(..)
  ) where

import Control.Monad
import Data.Char
import Data.Monoid
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT

-- | Text types that can be split by the Tokenizer monad. In this module,
-- instances are provided for String, strict Text, and lazy Text.
-- If you are dealing with ASCII ByteStrings, you can find instances in
-- the modules "Control.Monad.Tokenizer.Char8.Strict" and
-- "Control.Monad.Tokenizer.Char8.Lazy"
class Tokenizable t where
  tnull :: t -> Bool
  thead :: t -> Char
  ttail :: t -> t
  ttake :: Int -> t -> t
  tdrop :: Int -> t -> t
  tlower :: t -> t

instance Tokenizable T.Text where
  tnull = T.null
  thead = T.head
  ttail = T.tail
  ttake = T.take
  tdrop = T.drop
  tlower = T.toLower

instance Tokenizable LT.Text where
  tnull = LT.null
  thead = LT.head
  ttail = LT.tail
  ttake = LT.take . fromIntegral
  tdrop = LT.drop . fromIntegral
  tlower = LT.toLower

instance Tokenizable [Char] where
  tnull = null
  thead = head
  ttail = tail
  ttake = take
  tdrop = drop
  tlower = map toLower
    
-- | Tokenizer monad. Use runTokenizer or runTokenizerCS to run it
newtype Tokenizer t a = Tokenizer { runTokenizer' :: (t, Int, t) -> (a,[t] -> [t],t,Int,t) }
--- type explanation: (whole text since last emission, chars passed, remaining text)
---     -> (result, difference list of tokens, whole text, chars passed, remaining text)

instance Functor (Tokenizer t) where
    fmap = liftM

instance Applicative (Tokenizer t) where
    pure = return
    (<*>) = ap

instance Monad (Tokenizer t) where
    return a = Tokenizer $ \(whole,count,tail) -> (a,id,whole,count,tail)
    m >>= f = Tokenizer $ \(whole,count,tail) ->
              let (a1,o1,w1,!c1,t1) = runTokenizer' m (whole,count,tail)
                  (a2,o2,w2,!c2,t2) = runTokenizer' (f a1) (w1,c1,t1)
              in (a2,o1.o2,w2,c2,t2)

-- | Check if the next input chars agree with the given string
lookAhead :: Tokenizable t => [Char] -> Tokenizer t Bool
lookAhead chars =
    Tokenizer $ \(whole,count,tail) ->
        let h = unpack $ ttake (length chars) tail
        in (h == chars, id, whole, count, tail)
  where unpack t | tnull t = []
                 | otherwise = thead t : unpack (ttail t)
                 
-- | Proceed to the next character
walk :: Tokenizable t => Tokenizer t ()
walk = Tokenizer $ \(whole,count,tail) ->
       if tnull tail
         then ((),id,whole,count,tail)
         else ((),id,whole,count+1,ttail tail)

-- | Walk back to the previous character, unless it was discarded/emitted.
walkBack :: Tokenizable t => Tokenizer t ()
walkBack = Tokenizer $ \(whole,count,_) ->
                         if count > 0
                         then ((),id,whole,count-1,tdrop (count-1) whole)
                         else ((),id,whole,0,whole)

-- | Restore the state after the last emit/discard.
restore :: Tokenizer t ()
restore = Tokenizer $ \(whole,_,_) -> ((),id,whole,0,whole)

-- | Peek the current character
peek :: Tokenizable t => Tokenizer t Char
peek = Tokenizer $ \(whole,count,tail) -> (th tail,id,whole,count,tail)
    where th t | tnull t = '\0'
               | otherwise = thead t

-- | Peek the current character and proceed
pop :: Tokenizable t => Tokenizer t Char
pop = peek <* walk
                             
-- | Break at the current position and emit the scanned token
emit :: Tokenizable t => Tokenizer t ()
emit = Tokenizer $ \(whole,count,tail) -> ((),(ttake count whole:),tail,0,tail)

-- | Break at the current position and discard the scanned token
discard :: Tokenizer t ()
discard = Tokenizer $ \(whole,count,tail) -> ((),id,tail,0,tail)

-- | Have I reached the end of the input text?
isEOT :: Tokenizable t => Tokenizer t Bool
isEOT = Tokenizer $ \(whole, count, tail) -> (tnull tail, id, whole, count, tail)

-- | Proceed as long as a given function succeeds
walkWhile :: Tokenizable t => (Char -> Bool) -> Tokenizer t ()
walkWhile f = do
  c <- peek
  when (c /= '\0' && f c) $
       walk >> walkWhile f

-- | Proceed as long as a given fold returns Just (generalization of walkWhile)
walkFold :: Tokenizable t => a -> (Char -> a -> Maybe a) -> Tokenizer t ()
walkFold s0 f = do
  c <- peek
  unless (c == '\0') $ case f c s0 of
      Nothing -> return ()
      Just s -> walk >> walkFold s f

-- | Repeat a given tokenizer as long as the end of text is not reached
untilEOT :: Tokenizable t => Tokenizer t () -> Tokenizer t ()
untilEOT f = do
  eot <- isEOT
  unless eot $ f >> untilEOT f

-- | Split a string into tokens using the given tokenizer
runTokenizer :: Tokenizable t => Tokenizer t () -> t -> [t]
runTokenizer m input =
    let input' = tlower input
    in case runTokenizer' m (input',0,input') of
        (_, tokens, _, _, _) -> tokens []

-- | Split a string into tokens using the given tokenizer, case sensitive version
runTokenizerCS :: Tokenizable t => Tokenizer t () -> t -> [t]
runTokenizerCS m input =
    case runTokenizer' m (input,0,input) of
        (_, tokens, _, _, _) -> tokens []