{-# LANGUAGE TemplateHaskellQuotes #-}
module Sasha.TTH (
    -- * SaTTH, staged Sasha the lexer.
    SaTTH,
    satth,
    -- * ERE specification
    ERE,
    empty,
    eps,
    char,
    charRange,
    charSet,
    utf8Char,
    anyChar,
    anyUtf8Char,
    appends,
    unions,
    intersections,
    star,
    plus,
    string,
    utf8String,
    complement,
    satisfy,
    digit,
) where

import Language.Haskell.TH (Code, CodeQ, Exp, Q)

import Control.Monad               (forM)
import Data.List                   (sortOn)
import Data.Map                    (Map)
import Data.Maybe                  (isJust, listToMaybe)
import Data.Ord                    (Down (..))
import Data.Word                   (Word8)
import Data.Word8Set               (Word8Set)
import Language.Haskell.TTH.LetRec (letrecE)

import qualified Data.ByteString     as BS
import qualified Data.Map.Strict     as Map
import qualified Data.Word8Set       as W8S
import qualified Language.Haskell.TH as TH

import Sasha.Internal.ERE
import Sasha.Internal.Word8Set (memberCode)

-- | Lexer grammar specification: regular expression and result builder function
-- which takes a prefix (the matching part) and a suffix (the rest of input).
type SaTTH r = [(ERE, Code Q BS.ByteString -> Code Q BS.ByteString -> Code Q r)]

-- | Scan for a single token.
satth
    :: forall r. Code Q r           -- ^ no match value
    -> SaTTH r                      -- ^ scanner rules definitions
    -> Code Q (BS.ByteString -> r)  -- ^ scanner code
satth noMatch rules = [|| \bs -> $$(satth' noMatch rules [|| bs ||]) bs ||]

-- | Generate a scanner code.
satth' :: forall r. Code Q r -> SaTTH r -> Code Q BS.ByteString -> Code Q (BS.ByteString -> r)
satth' noMatch grammar0 input0 = letrecE
    (\_ -> "state")
    trans
    start
  where
    grammar0' :: SaTTH' r
    grammar0' =
        [ S i f ere
        | (i, (ere, f)) <- zip [0..] grammar0
        ]

    start :: Monad m => (SaTTH' r -> m (Code Q (R r))) -> m (Code Q (BS.ByteString -> r))
    start rec = do
        startCode <- rec grammar0'
        -- we assume that none of the tokens accepts an empty string,
        -- so we start without specifying last match.
        return [|| \input -> $$startCode $$noMatch (0 :: Int) input ||]

    trans :: Monad m => (SaTTH' r -> m (Code Q (R r))) -> SaTTH' r -> m (Code Q (R r))
    trans _rec grammar
        | emptySashaTTH grammar
        = return [|| \ !acc _ _ -> acc ||]

    trans  rec grammar = do
        -- if the input is not empty?
        let grammarM1 :: Map (SaTTH' r) Word8Set
            grammarM1 = Map.fromListWith W8S.union
                [ (derivativeSaTTH c grammar, W8S.singleton c)
                | c <- [ minBound .. maxBound ]
                ]

            -- non-empty map
            grammarM :: [(Word8Set, SaTTH' r, M r)]
            grammarM =
                [ (c, grammar', makeM input0 grammar')
                | (grammar', c) <- Map.toList grammarM1
                ]

        -- next states
        nexts0 <- forM grammarM $ \(ws, grammar', modify) -> do
            if emptySashaTTH grammar' then return (ws, NextEmpty, modify)
            else if epsSashaTTH grammar' then return (ws, NextEps, modify)
            else do
                next <- rec grammar'
                return (ws, Next next, modify)

        -- sort next states
        let nexts :: [(Word8Set, Next (Code Q (R r)), M r)]
            nexts = sortOn (\(ws, _, _) -> meas ws) nexts0

        -- transition case
        let caseAnalysis
                :: Code Q r
                -> Code Q Int
                -> Code Q Word8
                -> Code Q BS.ByteString
                -> Code Q r
            caseAnalysis acc pos c input' = caseTTH [|| () ||]
                [ (memberCode c ws, body)

                | (ws, mnext, modify) <- nexts
                , let body = case mnext of
                        NextEmpty -> acc
                        NextEps   -> modify acc [|| $$pos + 1 ||]
                        Next next -> [|| let !pos' = $$pos + 1 in $$next $$(modify acc [|| pos' ||]) pos' $$input' ||]
                ]

        let debugWarns :: Q ()
            debugWarns = return ()

        -- Note: acc should stay lazy
        return $ TH.bindCode_ debugWarns [|| \ acc !_pos !input -> case BS.uncons input of
            Nothing           -> acc
            Just (c, _input') -> $$(caseAnalysis [|| acc ||] [|| _pos ||] [|| c ||] [|| _input' ||])
            ||]

-------------------------------------------------------------------------------
-- Sorting transitions
-------------------------------------------------------------------------------

data Meas
    = MeasLite Word8Set
    | MeasCont !(Down Int) !Word8Set
    | MeasSize !Int !Word8Set
  deriving (Eq, Ord)

meas :: Word8Set -> Meas
meas ws
    | W8S.size ws < 2         = MeasLite ws
    | isJust (W8S.isRange ws) = MeasCont (Down (W8S.size ws)) ws
    | otherwise               = MeasSize (W8S.size ws) ws

-------------------------------------------------------------------------------
-- Aliases
-------------------------------------------------------------------------------

-- | Inner scanner function.
--
-- * previous match
-- * position
-- * input
--
type R r = r -> Int -> BS.ByteString -> r

-- | Last accept modifier.
type M r = Code Q r -> CodeQ Int -> CodeQ r

makeM :: forall r. Code Q BS.ByteString -> SaTTH' r -> M r
makeM input0 grammar acc pos = case acc' of
    Nothing -> acc
    Just f  -> [|| case BS.splitAt $$pos $$input0 of (_pfx, _sfx) -> $$(f [|| _pfx ||] [|| _sfx ||]) ||]
  where
    acc' :: Maybe (Code Q BS.ByteString -> Code Q BS.ByteString -> Code Q r)
    acc' = listToMaybe
        [ f
        | S _ f ere <- grammar
        , nullable ere
        ]

data Next a
    = NextEmpty
    | NextEps
    | Next a

-------------------------------------------------------------------------------
-- TTH extras
-------------------------------------------------------------------------------

caseTTH :: Code Q a -> [(Code Q Bool, CodeQ r)] -> Code Q r
caseTTH c guards = TH.unsafeCodeCoerce $ TH.caseE (TH.unTypeCode c)
    [ TH.match TH.wildP (TH.guardedB (go guards))  [] ]
  where
    go :: [(Code Q Bool, Code Q r)] -> [Q (TH.Guard, Exp)]
    go []          = []
    go [(_,b)]     = [TH.normalGE [| otherwise |] (TH.unTypeCode b)]
    go ((g,b):gbs) = TH.normalGE (TH.unTypeCode g) (TH.unTypeCode b) : go gbs

-------------------------------------------------------------------------------
-- State
-------------------------------------------------------------------------------

-- | We give each tag an integer, so we can order them.
data S r = S !Int !(Code Q BS.ByteString -> Code Q BS.ByteString -> Code Q r) !ERE

instance Show (S tag) where
    show (S i _ ere) = show (i, ere)

instance Eq (S tag) where
    S i _ ere == S i' _ ere' = (i, ere) == (i', ere')

instance Ord (S tag) where
    compare (S i _ ere) (S i' _ ere') = compare (i, ere) (i', ere')

type SaTTH' tag = [S tag]

-------------------------------------------------------------------------------
-- Derivative
-------------------------------------------------------------------------------

derivativeSaTTH :: Word8 -> SaTTH' tag -> SaTTH' tag
derivativeSaTTH c ts =
    [ S i code ere''
    | S i code ere <- ts
    , let ere' = derivative c ere
    , let ere'' = simplifyERE ere'
    , not (equivalent empty ere'')
    ]

simplifyERE :: ERE -> ERE
simplifyERE ere
    | equivalent ere eps = eps
    | otherwise          = ere

-- does it make sense to look further?
emptySashaTTH :: SaTTH' tag -> Bool
emptySashaTTH = null

epsSashaTTH :: SaTTH' tag -> Bool
epsSashaTTH grammar = and [ equivalent ere eps | S _ _ ere <- grammar ]