{-# OPTIONS_GHC -fno-warn-unused-do-bind #-}
module Database.Design.Ampersand.Input.ADL1.Lexer
    ( keywords
    , operators
    , symbols
    , lexer
    -- LexerMessage
    , LexerError(..)
    , LexerErrorInfo(..)
    , LexerWarning(..)
    , LexerWarningInfo(..)
    , keepOneTabWarning
    , showLexerErrorInfo
    , showLexerWarningInfo
    -- LexerToken
    , Token(..)
    , Lexeme(..)
    , lexemeText
    , initPos
    , FilePos(..)
) where

import Database.Design.Ampersand.Input.ADL1.FilePos(updatePos)
import Database.Design.Ampersand.Input.ADL1.LexerToken
import Database.Design.Ampersand.Input.ADL1.LexerMonad
import Database.Design.Ampersand.Input.ADL1.LexerMessage
import Data.Char hiding(isSymbol)
import Data.Set (member, fromList)
import Database.Design.Ampersand.Basics (fatalMsg)
import Database.Design.Ampersand.Misc
import Data.Time.Calendar
import Data.Time.Clock
import Numeric

fatal :: Int -> String -> a
fatal = fatalMsg "Lexer"

-- | Retrieves a list of keywords accepted by the ampersand language
keywords :: [String] -- ^ The keywords
keywords      = [ "INCLUDE"
                , "CONTEXT", "ENDCONTEXT", "EXTENDS", "THEMES"
                , "META"
                , "PATTERN", "ENDPATTERN"
                , "PROCESS", "ENDPROCESS"
                , "INTERFACE", "CLASS", "FOR", "BOX", "ROWS", "TABS", "COLS", "INITIAL", "SQLPLUG", "PHPPLUG"
                , "REPRESENT", "TYPE", "LINKTO"
                , "POPULATION", "CONTAINS"
                , "UNI", "INJ", "SUR", "TOT", "SYM", "ASY", "TRN", "RFX", "IRF", "AUT", "PROP", "ALWAYS"
                , "RULE", "MESSAGE", "VIOLATION", "SRC", "TGT", "TEST"
                , "RELATION", "MEANING", "CONCEPT", "IDENT"
                , "VIEW", "ENDVIEW", "DEFAULT", "TXT", "PRIMHTML", "TEMPLATE"
                , "IMPORT", "SPEC", "ISA", "IS", "I", "V"
                , "CLASSIFY"
                , "PRAGMA", "PURPOSE", "IN", "REF", "ENGLISH", "DUTCH"
                , "REST", "HTML", "LATEX", "MARKDOWN"
                , "ONE"
                , "BYPLUG"
                , "ROLE", "SERVICE", "EDITS", "MAINTAINS"
                -- Keywords for TType:
                , "ALPHANUMERIC", "BIGALPHANUMERIC", "HUGEALPHANUMERIC", "PASSWORD"
                , "BINARY", "BIGBINARY", "HUGEBINARY"
                , "DATE", "DATETIME", "BOOLEAN", "INTEGER", "FLOAT", "AUTOINCREMENT"
                -- Keywords for values of atoms:
                , "TRUE", "FALSE" --for booleans
                ]

-- | Retrieves a list of operators accepted by the ampersand language
operators :: [String] -- ^ The operators
operators = [ "|-", "-", "->", "<-", "=", "~", "+", "*", ";", "!", "#",
              "::", ":", "\\/", "/\\", "\\", "/", "<>" , "..", "."]

-- | Retrieves the list of symbols accepted by the ampersand language
symbols :: String -- ^ The list of symbol characters / [Char]
symbols = "()[],{}<>"

--TODO: Options should be one item, not a list
-- | Runs the lexer
lexer :: [Options]  -- ^ The command line options
      -> FilePath   -- ^ The file name, used for error messages
      -> String     -- ^ The content of the file
      -> Either LexerError ([Token], [LexerWarning]) -- ^ Either an error or a list of tokens and warnings
lexer opt file input = case runLexerMonad opt file (mainLexer (initPos file) input) of
                               Left err -> Left err
                               Right (ts, ws) -> Right (ts, ws)

-----------------------------------------------------------
-- Help functions
-----------------------------------------------------------

skipLine :: String -> String
skipLine = dropWhile (/= '\n')

takeLine :: String -> String
takeLine = takeWhile (/= '\n')

-----------------------------------------------------------
-- Lexer definition
-----------------------------------------------------------

type Lexer = FilePos -> String  -> LexerMonad [Token]

mainLexer :: Lexer

-----------------------------------------------------------
-- Removing unnecessary text artifacts (comment, spaces,...)
-----------------------------------------------------------

mainLexer _ [] =  return []

mainLexer p ('-':'-':s) = mainLexer p (skipLine s) --TODO: Test if we should increase line number and reset the column number

mainLexer p (c:s) | isSpace c = let (spc,next) = span isSpace s
                                in  mainLexer (foldl updatePos p (c:spc)) next

mainLexer p ('-':'+':s)  = returnToken lx p mainLexer p rest
                where lx   = LexExpl $ dropWhile isSpace (takeLine s)
                      rest = skipLine s

mainLexer p ('{':'-':s) = lexNest mainLexer (addPos 2 p) s
mainLexer p ('{':'+':s) = lexExpl mainLexer (addPos 2 p) s
mainLexer p ('"':ss) =
    let (s,swidth,rest) = scanString ss
    in if null rest || head rest /= '"'
                              then lexerError (NonTerminatedString s) p
                              else returnToken (LexString s) p mainLexer (addPos (swidth+2) p) (tail rest)

{- In Ampersand, atoms may be promoted to singleton relations by single-quoting them. For this purpose, we treat
   single quotes exactly as the double quote for strings. That substitutes the scanner code for character literals. -}
mainLexer p ('\'':ss)
     = let (s,swidth,rest) = scanSingletonInExpression ss
       in if null rest || head rest /= '\''
             then lexerError UnterminatedAtom p
             else returnToken (LexSingleton s) p mainLexer (addPos (swidth+2) p) (tail rest)

-----------------------------------------------------------
-- looking for keywords - operators - special chars
-----------------------------------------------------------

-- Special case for < since it's the beginning of operators but also a symbol when alone
mainLexer p ('<':d:s) = if isOperator ['<',d]
                        then returnToken (LexOperator ['<',d]) p mainLexer (addPos 2 p) s
                        else returnToken (LexSymbol    '<')    p mainLexer (addPos 1 p) (d:s)

mainLexer p cs@(c:s)
     | isIdStart c || isUpper c
         = let (name', p', s')    = scanIdent (addPos 1 p) s
               name               = c:name'
               tokt   | iskw name = LexKeyword name
                      | otherwise = if isIdStart c
                                    then LexVarId name
                                    else LexConId name
           in returnToken tokt p mainLexer p' s'
     | isOperatorBegin c
         = let (name, s') = getOp cs
           in returnToken (LexOperator name) p mainLexer (foldl updatePos p name) s'
     | isSymbol c = returnToken (LexSymbol c) p mainLexer (addPos 1 p) s
     | isDigit c
         = case  getDateTime cs of
            Just (Right (tk,_,width,s')) -> returnToken tk p mainLexer (addPos width p) s'
            Just (Left msg) -> lexerError msg p
            Nothing 
              -> case getDate cs of
                  Just (tk,_,width,s') -> returnToken tk p mainLexer (addPos width p) s'
                  Nothing -> let (tk,_,width,s') = getNumber cs
                             in  returnToken tk p mainLexer (addPos width p) s'
       -- Ignore unexpected characters in the beginning of the file because of the UTF-8 BOM marker.
       -- TODO: Find out the right way of handling the BOM marker.
     | beginFile p = do { lexerWarning UtfChar p; mainLexer p s }
     | otherwise  = lexerError (UnexpectedChar c) p
    where beginFile (FilePos _ 1 1) = True
          beginFile _ = False

-----------------------------------------------------------
-----------------------------------------------------------
-- Supporting functions for MainLexer
-----------------------------------------------------------
-----------------------------------------------------------

-----------------------------------------------------------
-- Check on keywords - operators - special chars
-----------------------------------------------------------

locatein :: Ord a => [a] -> a -> Bool
locatein es e = member e (fromList es)

iskw :: String -> Bool
iskw = locatein keywords

isSymbol :: Char -> Bool
isSymbol = locatein symbols

isOperator :: String -> Bool
isOperator  = locatein operators

isOperatorBegin :: Char -> Bool
isOperatorBegin  = locatein (map head operators)

isIdStart :: Char -> Bool
isIdStart c = isLower c || c == '_'

isIdChar :: Char -> Bool
isIdChar c =  isAlphaNum c || c == '_'

-- Finds the longest prefix of cs occurring in keywordsops
getOp :: String -> (String, String)
getOp cs = findOper operators cs ""
    where findOper :: [String] -> String -> String -> (String,String)
          findOper []  _     _  = ("", cs)
          findOper _   []    op = (op,[])
          findOper ops (c:rest) op =
              if null found then (op, c:rest)
              else findOper found rest (op ++ [c])
              where found = [s' | o:s'<-ops, c==o]

-- scan ident receives a file position and the resting contents, returning the scanned identifier, the file location and the resting contents.
scanIdent :: FilePos -> String -> (String, FilePos, String)
scanIdent p s = let (name,rest) = span isIdChar s
                in (name,addPos (length name) p,rest)


-----------------------------------------------------------
-- String clean-up functions / comments
-----------------------------------------------------------

lexNest :: Lexer -> Lexer
lexNest c p ('-':'}':s) = c (addPos 2 p) s
lexNest c p ('{':'-':s) = lexNest (lexNest c) (addPos 2 p) s
lexNest c p (x:s)       = lexNest c (updatePos p x) s
lexNest _ p []          = lexerError UnterminatedComment p

--TODO: Also accept {+ ... +} as delimiters
lexExpl :: Lexer -> Lexer
lexExpl = lexExpl' ""
 where lexExpl' str _ p ('-':'}':s) = returnToken (LexExpl str) p mainLexer (addPos 2 p)  s
       lexExpl' str c p ('{':'-':s) = lexNest (lexExpl' str c) (addPos 2 p) s
       lexExpl' str c p ('-':'-':s) = lexExpl' str c p (dropWhile (/= '\n') s)
       lexExpl' str c p (x:s)       = lexExpl' (str++[x]) c (updatePos p x) s
       lexExpl' _   _ p []          = lexerError UnterminatedPurpose p

-----------------------------------------------------------
-- iso 8601 date / time
-----------------------------------------------------------
-- Returns tuple with the parsed lexeme, the UTCTime, the amount of read characters and the rest of the text
getDateTime :: String -> Maybe (Either LexerErrorInfo (Lexeme, UTCTime, Int, String) )
getDateTime cs = 
  case getDate cs of
   Nothing -> Nothing
   Just (_,day,ld,rd) -> 
      case getTime rd of
        Nothing -> case rd of
                    'T':_ -> Just . Left $ ProblematicISO8601DateTime
                    _     -> Nothing
        Just (timeOfDay, tzoneOffset,lt,rt) -> 
            let ucttime = addUTCTime tzoneOffset (UTCTime day timeOfDay)
            in Just . Right $ 
                    ( LexDateTime ucttime
                    , ucttime
                    , ld + lt
                    , rt
                    )
getTime :: String -> Maybe (DiffTime, NominalDiffTime, Int, String)
getTime cs =
  case cs of
   'T':h1:h2:':':m1:m2:rest 
    -> if (all isDigit [h1,h2,m1,m2])
       then let (_,Left hours,_,_) = getNumber [h1,h2]
                (_,Left minutes,_,_) = getNumber [m1,m2]
                (seconds,ls,rs) = getSeconds rest
            in case getTZD rs of
                 Nothing -> Nothing
                 Just (offset,lo,ro)
                   -> if hours < 24 && minutes < 60 && seconds < 60
                      then Just (fromRational . toRational $
                                   ( fromIntegral hours*60
                                    +fromIntegral minutes
                                   )*60+ seconds
                                ,offset
                                ,1+5+ls+lo
                                ,ro)
                      else Nothing
       else Nothing
   _ -> Nothing
   
getSeconds :: String -> (Float,Int,String)
getSeconds cs =
 case cs of
  (':':s1:s2:rest) ->  
     if all isDigit [s1,s2]
     then let (fraction,lf,rf) = getFraction (s1:s2:rest)
          in (fraction,1+lf,rf)
     else (0,0,cs)
  _ -> (0,0,cs)
getFraction :: String -> (Float,Int,String)
getFraction cs = 
  case readFloat cs of
   [(a,str)] -> (a, length cs - length str, str) --TODO: Make more efficient.
   _ -> (0,0,cs)
   


getTZD :: String -> Maybe (NominalDiffTime, Int,String)
getTZD cs = 
 case cs of
  'Z':rest -> Just (0,1,rest)
  '+':h1:h2:':':m1:m2:rest -> mkOffset [h1,h2] [m1,m2] rest (+)
  '-':h1:h2:':':m1:m2:rest -> mkOffset [h1,h2] [m1,m2] rest (-)
  _ -> Nothing
  where
   mkOffset :: String -> String -> String -> (Int -> Int -> Int) -> Maybe (NominalDiffTime, Int,String)
   mkOffset hs ms rest op = 
    let (_,Left hours  ,_,_) = getNumber hs
        (_,Left minutes,_,_) = getNumber ms
        total = hours*60+minutes
    in if hours <= 24 && minutes < 60
       then Just (fromRational . toRational $ 0 `op` total
                 ,6,rest) 
       else Nothing 
getDate :: String -> Maybe (Lexeme, Day, Int, String)
getDate cs =
  case cs of
   y1:y2:y3:y4:'-':m1:m2:'-':d1:d2:rest ->
      if all isDigit [y1,y2,y3,y4,m1,m2,d1,d2] 
      then case fromGregorianValid (toInteger year) month day of
             Nothing -> Nothing 
             Just d -> Just (LexDate d, d, 10, rest)
      else Nothing
     where (_,Left year ,_,_) = getNumber [y1,y2,y3,y4]
           (_,Left month,_,_) = getNumber [m1,m2]
           (_,Left day  ,_,_) = getNumber [d1,d2]
   _  -> Nothing
     
-----------------------------------------------------------
-- Numbers
-----------------------------------------------------------
-- Returns tuple with the parsed lexeme, the integer, the amount of read characters and the rest of the text
getNumber :: String -> (Lexeme, (Either Int Double), Int, String)
getNumber str =
  case readDec str of
    [(_,('.':_))] -> case readFloat str of
                           [(flt,rest)] -> (LexFloat flt, Right flt, length str - length rest,rest)
                           _            -> fatal 342 "Unexpected: can read decimal, but not float???"
    [(dec,rest)]  -> (LexDecimal dec , Left dec, length str - length rest,rest)
    _  -> fatal 343 $ "No number to read!\n  " ++ take 40 str                    
--getNumber :: String -> (Lexeme, (Either Int Double), Int, String)
--getNumber [] = fatal 294 "getNumber"
--getNumber cs@(c:s)
--  | c /= '0'         = num10
--  | null s           = const0
--  | hs `elem` "xX"   = num16
--  | hs `elem` "oO"   = num8
--  | otherwise        = num10
--  where (hs:ts) = s
--        const0 = (LexDecimal 0, Left 0, 1, s)
--        num10 :: (Lexeme, (Either Int Double), Int, String)
--        num10  = let (n, rs) = span isDigit cs
--                     (isWhole,readed,rest) = 
--                        case rs of
--                          '.':cs' -> let (n',rs') = span isDigit cs'
--                                         wholeNumberString = n++"."++n'
--                                     in (all (=='0') n',wholeNumberString,rs')
--                          _       -> (True,n, rs)
--                     nrInt = read n
--                 in if isWhole
--                    then (LexDecimal nrInt , Left nrInt, length readed,rest)
--                    else let x = fst.head.readFloat $ readed
--                         in (LexFloat x, Right x, length readed,rest)
--        num16   = readIntNum isHexaDigit  16 LexHex
--        num8    = readIntNum isOctDigit 8  LexOctal
--        readIntNum :: (Char -> Bool) -> Int -> (Int -> Lexeme) -> (Lexeme, Either Int a , Int, String)
--        readIntNum p base lx
--          = let (n, rs) = span p ts
--            in  if null n
--                then const0
--                else let nr = readn base n
--                     in (lx nr, Left nr, 2 + length n, rs)

-----------------------------------------------------------
-- characters / strings
-----------------------------------------------------------
scanString :: String -> (String, Int, String)
scanString = scanUpto False ['"']

scanSingletonInExpression :: String -> (String, Int, String)
scanSingletonInExpression = scanUpto True ['\'']

-- | scan to some given character. The end char is scanned away too
scanUpto :: Bool    -- Special case for Ampersand Atomvalues? (if so, both singlequote and doublequote must be escaped)
         -> [Char]  -- non-empty list of ending characters
         -> String 
         -> (String, Int, String)
scanUpto isAtomScan echrs s = 
 case s of
   xs       -> let (ch,cw,cr) = getchar isAtomScan echrs xs
                   (str,w,r)  = scanUpto isAtomScan echrs cr
               in maybe ("",0,xs) (\c -> (c:str,cw+w,r)) ch

getchar :: Bool    -- Special case for Ampersand Atomvalues? (if so, both singlequote and doublequote must be escaped)
        -> [Char]  -- non-empty list of ending characters
        -> String  -- string to get the character from
        -> (Maybe Char, Int, String)
getchar isAtomScan echrs s =
  case s of
   []          -> (Nothing,0,[])
   ('\n':_ )   -> (Nothing,0,s)
   ('\t':_ )   -> (Nothing,0,s)
   ('\\':'&':xs) -> let (str,w,r) = getchar isAtomScan echrs xs -- Special case is required because an escaped & is equal to empty string in Haskell
                    in (str,w+2,r)
   ('\\':xs)   -> let (c,l,r) = getEscChar xs
                  in (c,l+1,r)
   (x:xs)      
    | x `elem` echrs   -> (Nothing,0,s)
    | isAtomScan && x `elem`[doubleQuote, singleQuote]  -> (Nothing,0,s) 
   --- | isAtomScan && ec == singleQuote && x == doubleQuote -> (Nothing,0,s) 
    | otherwise -> (Just x,1,xs)
  where
    (doubleQuote,singleQuote) = ('\"','\'')
getEscChar :: String -> (Maybe Char, Int, String)
getEscChar [] = (Nothing,0,[])
getEscChar s@(x:xs) | isDigit x = case readDec s of
                                    [(val,rest)]
                                      | val >= 0 && val <= 255 -> (Just (chr val),length s - length rest, rest)
                                      | otherwise -> (Nothing, 1, rest)
                                    _  -> fatal 432 $ "Impossible! first char is a digit.. "++take 40 s
                    | x `elem` ['\"','\''] = (Just x,2,xs)
                    | otherwise = case x `lookup` cntrChars of
                                 Nothing -> (Nothing,0,s)
                                 Just c  -> (Just c,1,xs)
  where cntrChars = [('a','\a'),('b','\b'),('f','\f'),('n','\n'),('r','\r'),('t','\t')
                    ,('v','\v'),('\\','\\')]

-----------------------------------------------------------
-- Token creation function
-----------------------------------------------------------

returnToken :: Lexeme -> FilePos -> Lexer -> Lexer
returnToken lx pos continue posi input = do
    let token = Tok lx pos
    tokens <- continue posi input
    return (token:tokens)