module Helium.Parser.Lexer
( lexer, strategiesLexer
, Token, Lexeme(..)
, lexemeLength
, module Helium.Parser.LexerMessage
) where
import Helium.Main.Args
import Helium.Parser.LexerMonad
import Helium.Parser.LexerMessage
import Helium.Parser.LexerToken
import Text.ParserCombinators.Parsec.Pos
import Helium.Utils.Utils(internalError, hole)
import Control.Monad(when, liftM)
import Data.Char(ord)
import Data.List(isPrefixOf)
lexer :: [Option] -> String -> [Char] -> Either LexerError ([Token], [LexerWarning])
lexer opts fileName input = runLexerMonad opts fileName (mainLexer input)
strategiesLexer :: [Option] -> String -> [Char] -> Either LexerError ([Token], [LexerWarning])
strategiesLexer opts fileName input =
case lexer opts fileName input of
Left err -> Left err
Right (tokens, warnings) -> Right (reserveStrategyNames tokens, warnings)
type Lexer = [Char] -> LexerMonad [Token]
mainLexer :: Lexer
mainLexer a =
do useTutor <- elem UseTutor `liftM` getOpts
mainLexer' useTutor a
mainLexer' :: Bool -> Lexer
mainLexer' _ [] = do
checkBracketsAtEOF
pos <- getPos
return [(incSourceLine (setSourceColumn pos 0) 1, LexEOF)]
mainLexer' _ ('-':'-':cs)
| not (nextCharSatisfy isSymbol rest) = do
incPos (2 + length minuses)
lexOneLineComment rest
where
(minuses, rest) = span (== '-') cs
mainLexer' useTutor ('{':'-':'#':' ':'M':'U':'S':'T':'U':'S':'E':' ':'#':'-':'}':cs) | useTutor =
returnToken LexMustUse 15 mainLexer cs
mainLexer' useTutor ('{':'-':'#':' ':'F':'C':cs) | useTutor = do
pos <- getPos
lexCaseFeedbackComment "" pos cs
mainLexer' useTutor ('{':'-':'#':' ':'F':cs) | useTutor = do
pos <- getPos
incPos 5
lexFeedbackComment "" pos cs
mainLexer' _ ('{':'-':cs) = do
pos <- getPos
incPos 2
lexMultiLineComment [pos] 0 cs
mainLexer' _ input@('\'':_) =
lexChar input
mainLexer' _ input@('"':_) =
lexString input
mainLexer' _ ('.':c:cs)
| myIsDigit c = do
pos <- getPos
lexerWarning (LooksLikeFloatNoDigits (takeWhile myIsDigit (c:cs))) pos
returnToken (LexVarSym ".") 1 mainLexer (c:cs)
mainLexer' useTutor input@(c:cs)
| myIsLower c || c == '_' =
lexName isLetter LexVar LexKeyword keywords input
| myIsSpace c = do
when (c == '\t') $ do
pos <- getPos
lexerWarning TabCharacter pos
nextPos c
mainLexer cs
| myIsUpper c =
lexName isLetter LexCon (internalError "Lexer" "mainLexer'" "constructor") [] input
| c == ':' =
lexName isSymbol LexConSym LexResConSym reservedConSyms input
| isSymbol c =
lexName isSymbol LexVarSym LexResVarSym (if useTutor then hole : reservedVarSyms else reservedVarSyms) input
| c `elem` "([{" = do
openBracket c
returnToken (LexSpecial c) 1 mainLexer cs
| c `elem` ")]}" = do
closeBracket c
returnToken (LexSpecial c) 1 mainLexer cs
| c `elem` specialsWithoutBrackets =
returnToken (LexSpecial c) 1 mainLexer cs
| myIsDigit c =
lexIntFloat input
| otherwise = do
pos <- getPos
lexerError (UnexpectedChar c) pos
lexName :: (Char -> Bool) -> (String -> Lexeme) ->
(String -> Lexeme) -> [String] -> Lexer
lexName predicate normal reserved reserveds cs = do
let (name@(first:_), rest) = span predicate cs
lexeme = if name `elem` reserveds
then reserved name
else normal name
when ((isSymbol first || first == ':') && name `contains` "--") $ do
pos <- getPos
lexerWarning CommentOperator pos
returnToken lexeme (length name) mainLexer rest
contains :: Eq a => [a] -> [a] -> Bool
[] `contains` _ = False
xs@(_:rest) `contains` ys = ys `isPrefixOf` xs || rest `contains` ys
lexIntFloat :: Lexer
lexIntFloat input = do
_ <- getPos
let (digits, rest) = span myIsDigit input
case rest of
('.':rest2@(next:_))
| myIsDigit next -> do
let (fraction, rest3) = span myIsDigit rest2
prefix = digits ++ "." ++ fraction
lexMaybeExponent prefix LexFloat rest3
| next /= '.' -> do
pos <- getPos
lexerWarning (LooksLikeFloatNoFraction digits) pos
lexMaybeExponent digits LexInt rest
_ ->
lexMaybeExponent digits LexInt rest
lexMaybeExponent :: String -> (String -> Lexeme) -> Lexer
lexMaybeExponent prefix lexemeFun input =
case input of
('e':'+':rest) ->
lexExponent (prefix ++ "e+") rest
('E':'+':rest) ->
lexExponent (prefix ++ "E+") rest
('e':'-':rest) ->
lexExponent (prefix ++ "e-") rest
('E':'-':rest) ->
lexExponent (prefix ++ "E-") rest
('e':rest) ->
lexExponent (prefix ++ "e") rest
('E':rest) ->
lexExponent (prefix ++ "E") rest
_ ->
returnToken (lexemeFun prefix) (length prefix) mainLexer input
lexExponent :: String -> Lexer
lexExponent prefix input = do
startPos <- getPos
let posAtExponent = addPos (length prefix) startPos
(exponentDigits, rest) = span myIsDigit input
if null exponentDigits then
lexerError MissingExponentDigits posAtExponent
else do
let text = prefix ++ exponentDigits
returnToken (LexFloat text) (length text) mainLexer rest
lexChar :: Lexer
lexChar input = do
pos <- getPos
case input of
'\'':'\\':c:'\'':cs ->
if c `elem` escapeChars then
returnToken (LexChar ['\\',c]) 4 mainLexer cs
else
lexerError IllegalEscapeInChar pos
'\'':'\'':_ ->
lexerError EmptyChar pos
'\'':c:'\'':cs ->
if ord c >= 32 && ord c <= 126 then
returnToken (LexChar [c]) 3 mainLexer cs
else
lexerError IllegalCharInChar pos
['\''] ->
lexerError EOFInChar pos
('\'':cs) ->
let (ds, es) = span (/= '\'') cs
ws = words ds
in if not (null es) && head es == '\'' && length ws == 1 && isName (head ws) then
lexerError (NonTerminatedChar (Just (head ws))) pos
else
lexerError (NonTerminatedChar Nothing) pos
_ -> internalError "Lexer" "lexChar" "unexpected characters"
lexString :: Lexer
lexString ('"':cs) =
lexStringChar "" cs
lexString _ =
internalError "Lexer" "lexString" "should start with \""
lexStringChar :: String -> Lexer
lexStringChar revSoFar input = do
startPos <- getPos
let curPos = addPos (length revSoFar + 1) startPos
case input of
[] ->
lexerError EOFInString startPos
'\\':c:cs ->
if c `elem` escapeChars then
lexStringChar (c:'\\':revSoFar) cs
else
lexerError IllegalEscapeInString curPos
'"':cs ->
returnToken (LexString (reverse revSoFar))
(length revSoFar + 2) mainLexer cs
'\n':_ ->
lexerError NewLineInString startPos
c:cs ->
if ord c >= 32 && ord c <= 126 then
lexStringChar (c:revSoFar) cs
else
lexerError IllegalCharInString curPos
nextCharSatisfy :: (Char -> Bool) -> String -> Bool
nextCharSatisfy _ [] = False
nextCharSatisfy p (c:_) = p c
returnToken :: Lexeme -> Int -> Lexer -> Lexer
returnToken lexeme width continue input = do
pos <- getPos
incPos width
let token = (pos, lexeme)
tokens <- continue input
return (token:tokens)
lexOneLineComment :: Lexer
lexOneLineComment input =
case input of
[] -> mainLexer []
('\n':cs) -> do
nextPos '\n'
mainLexer cs
(c:cs) -> do
nextPos c
lexOneLineComment cs
lexMultiLineComment :: [SourcePos] -> Int -> Lexer
lexMultiLineComment starts level input =
case input of
'-':'}':cs
| level == 0 -> do
incPos 2
mainLexer cs
| otherwise -> do
incPos 2
lexMultiLineComment (tail starts) (level 1) cs
'{':'-':cs -> do
pos <- getPos
lexerWarning (NestedComment (head starts)) pos
incPos 2
lexMultiLineComment (pos:starts) (level+1) cs
c:cs -> do
nextPos c
lexMultiLineComment starts level cs
[] ->
lexerError UnterminatedComment (head starts)
lexFeedbackComment :: String -> SourcePos -> Lexer
lexFeedbackComment feedback start input =
case input of
'#':'-':'}':cs ->
returnToken (LexFeedback (reverse feedback))
(length feedback + 6) mainLexer cs
c:cs -> do
nextPos c
lexFeedbackComment (c:feedback) start cs
[] ->
lexerError UnterminatedComment start
lexCaseFeedbackComment :: String -> SourcePos -> Lexer
lexCaseFeedbackComment feedback start input =
case input of
'#':'-':'}':cs ->
returnToken (LexCaseFeedback (reverse feedback)) 0 mainLexer cs
c:cs ->
lexCaseFeedbackComment (c:feedback) start cs
[] ->
lexerError UnterminatedComment start
isSymbol :: Char -> Bool
isSymbol = (`elem` symbols)
isLetter :: Char -> Bool
isLetter '\'' = True
isLetter '_' = True
isLetter c = myIsAlphaNum c
myIsLower :: Char -> Bool
myIsLower c = c >= 'a' && c <= 'z'
myIsUpper :: Char -> Bool
myIsUpper c = c >= 'A' && c <= 'Z'
myIsDigit :: Char -> Bool
myIsDigit c = c >= '0' && c <= '9'
myIsAlphaNum :: Char -> Bool
myIsAlphaNum c = myIsLower c || myIsUpper c || myIsDigit c
myIsSpace :: Char -> Bool
myIsSpace c = c == ' ' || c == '\n' || c == '\t' || c == '\r'
isName :: String -> Bool
isName [] = False
isName (hd:tl) = (myIsUpper hd || myIsLower hd) && all isLetter tl
escapeChars :: String
escapeChars = "\\nabfnrtv\"'"
symbols :: String
symbols = "!#$%&*+./<=>?@^|-~:\\"
keywords :: [String]
keywords =
[ "let", "in", "do", "where", "case", "of", "if"
, "then", "else", "data", "type", "module", "import", "hiding"
, "infix", "infixl", "infixr", "_", "deriving"
, "class", "instance", "default", "newtype"
]
reservedConSyms :: [String]
reservedConSyms =
[ "::" ]
reservedVarSyms :: [String]
reservedVarSyms =
[ "=>", "->", "<-", "..", "-", "-.", "@", "=", "\\", "|", "~" ]
specialsWithoutBrackets :: String
specialsWithoutBrackets =
",`;"
reserveStrategyNames :: [Token] -> [Token]
reserveStrategyNames =
map (\token@(pos, lexeme) -> case lexeme of
LexVar s | s `elem` strategiesKeywords -> (pos, LexKeyword s)
LexVarSym s | s == "==" -> (pos, LexResVarSym s)
LexConSym s | s == ":" -> (pos, LexResConSym s)
_ -> token
)
strategiesKeywords :: [String]
strategiesKeywords = [ "phase", "constraints", "siblings" ]