{-|
Module      : Hlex
Description : Lexer creation tools
Copyright   : (c) Sebastian Tee, 2023
License     : MIT

Tools needed to create a 'Lexer' from a lexical 'Grammar'.
-}
module Hlex
     ( -- * Example
       -- $example

       -- * Types
       Grammar
     , GrammarRule(..)
     , Lexer
       -- ** Exceptions
     , LexException(..)
       -- * Functions
     , hlex
     ) where

import Text.Regex.TDFA ((=~))
import Data.Maybe (maybeToList)

-- | Failure reported by a 'Lexer' when it encounters an error while lexing a string.
data LexException
  = -- | Exception thrown when a substring cannot be matched.
    UnmatchedException
      Int -- ^ The line number where the substring that couldn't be lexed is located.
      Int -- ^ The column where the substring that couldn't be lexed is located.
      String -- ^ The substring that couldn't be lexed.
  | -- | Exception thrown when a match is found on the 'Error' 'GrammarRule'.
    MatchedException
      Int -- ^ The line number where the matched string is located.
      Int -- ^ The column where the matched string is located.
      String -- ^ The matched string.
      String -- ^ Error message.
  deriving (Eq, Read, Show)

-- | A single rule of a 'Grammar'.
--
-- Each rule pairs a __POSIX regular expression__ with what should happen to
-- the text it matches: turn it into a token, skip it, or report an error.
data GrammarRule token
  = -- | Skips over any matches.
    Skip
      String -- ^ Regular expression.
  | -- | Takes a function that converts the matched string to a token.
    Tokenize
      String -- ^ Regular expression.
      (String -> token) -- ^ Function that converts the matched string into a token.
  | -- | Converts any regular expression matches to a given token.
    JustToken
      String -- ^ Regular expression.
      token -- ^ Given token.
  | -- | Returns an error with a message when a match occurs.
    Error
      String -- ^ Regular expression.
      String -- ^ Error message.

-- | Lexical grammar made up of 'GrammarRule's.
--
-- The __order is important__. The 'Lexer' applies the rules in the order
-- listed, so a rule earlier in the list is tried before the rules after it.
type Grammar token = [GrammarRule token]

-- | Converts a string into a list of tokens.
--
-- On success the tokens are returned in @Right@. If the string does not
-- follow the Lexer's 'Grammar', a 'LexException' is returned in @Left@.
type Lexer token = String -> Either LexException [token]

-- | Builds a 'Lexer' from the given 'Grammar'.
--
-- Lexing starts at row 1, column 1; positions carried by a 'LexException'
-- are counted from there.
hlex :: Grammar token -> Lexer token
hlex grammar = hlex' firstRow firstCol grammar
  where
    firstRow = 1
    firstCol = 1

-- | Worker behind 'hlex' that also tracks where the text being lexed starts.
--
-- The first two arguments are the row and column of the first character of
-- the input. The head rule of the grammar is matched against the input; the
-- text before the match is lexed with the remaining rules only, and the text
-- after the match is lexed with the full rule list again. When the matched
-- text is empty the rule is skipped and the remaining rules are tried on the
-- same input.
hlex' :: Int -> Int -> Grammar token -> Lexer token
-- Nothing left to lex.
hlex' _ _ _ [] = Right []
-- Input remains but there are no rules left to try on it.
hlex' row col [] leftover = Left (UnmatchedException row col leftover)
hlex' row col rules@(rule:laterRules) input
  | null hit = hlex' row col laterRules input
  | otherwise =
      case rule of
        Error _ message -> Left (MatchedException hitRow hitCol hit message)
        Skip _ -> continueWith Nothing
        Tokenize _ toToken -> continueWith (Just (toToken hit))
        JustToken _ tok -> continueWith (Just tok)
  where
    (prefix, hit, suffix) = input =~ getRegex rule :: (String, String, String)
    -- Position of the matched text, i.e. the position reached after the prefix.
    (hitRow, hitCol) = getLastCharPos row col prefix
    -- Position at which the text following the match starts.
    (suffixRow, suffixCol) = getLastCharPos row col (prefix ++ hit)
    -- Lexes both sides of the match and places the emitted token (if any)
    -- between their results.
    continueWith emitted = do
      leading <- hlex' row col laterRules prefix
      trailing <- hlex' suffixRow suffixCol rules suffix
      Right (leading ++ maybeToList emitted ++ trailing)

-- | Computes the position reached after consuming a piece of text.
--
-- Given the row and column at which the text starts, returns the row and
-- column of the character immediately following it. Every newline character
-- in the text moves to the next row and resets the column to 1; every other
-- character after the last newline advances the column by one.
--
-- Newlines are counted directly instead of going through 'lines': 'lines'
-- returns an empty list for the empty string (so taking 'last' of it fails)
-- and drops a trailing newline (so text ending in a newline would not
-- advance the row).
getLastCharPos :: Int -> Int -> String -> (Int, Int)
getLastCharPos startRow startCol x = (startRow + addRow, addCol + if addRow == 0 then startCol else 1)
  where
    -- Number of rows to move down: one per newline character.
    addRow = length (filter (== '\n') x)
    -- Number of characters after the last newline (or in the whole text when
    -- it contains no newline).
    addCol = length (takeWhile (/= '\n') (reverse x))

-- | Returns the regular expression stored in a 'GrammarRule', whichever
-- constructor it was built with.
getRegex :: GrammarRule token -> String
getRegex rule = case rule of
  Skip re -> re
  Tokenize re _ -> re
  JustToken re _ -> re
  Error re _ -> re

{- $example
Here is an example module for a simple language.

@
  module ExampleLang
       ( MyToken(..) -- Export the language's tokens and the lexer
       , myLexer
       ) where

  import Hlex

  data MyToken = Ident String -- String identifier token
               | Str String   -- String literal token
               | Number Float -- Number token and numeric value
               | Assign       -- Assignment operator token
               deriving(Show)

  myGrammar :: Grammar MyToken
  myGrammar = [ Error "\"[^\"]*\n" "Can't have a new line in a string"        -- Return Exception when a new line occurs in a string
              , Tokenize "\"[^\"]*\"" $ Str . init . tail                     -- Encode string and strip the containing quotes
              , JustToken "=" Assign                                       -- "=" Operator becomes the assign token
              , Tokenize "[a-zA-Z]+" (\match -> Ident match)                -- Identifier token with string
              , Tokenize "[0-9]+(\\.[0-9]+)?" (\match -> Number (read match)) -- Number token with the parsed numeric value stored as a Float
              , Skip "[ \\n\\r\\t]+"                                          -- Skip whitespace
              ]

  myLexer :: Lexer MyToken
  myLexer = hlex myGrammar -- hlex turns a Grammar into a Lexer
@

Here is the lexer being used on a simple program.

>>> myLexer "x = 1.2"
Right [Ident "x",Assign,Number 1.2]

Here is the lexer being used on a program with a syntax error.

>>> myLexer "x = \"a\nb\""
Left (MatchedException 1 5 "\"a\n" "Can't have a new line in a string")

The lexer returns an 'Either'. @Right@ means the lexer successfully lexed the program into a list of @MyToken@s.
@Left@ means lexing failed, and the value it holds is a 'LexException'.
-}