-- | -- Module : Text.Megaparsec.Lexer -- Copyright : © 2015–2017 Megaparsec contributors -- © 2007 Paolo Martini -- © 1999–2001 Daan Leijen -- License : FreeBSD -- -- Maintainer : Mark Karpov <markkarpov92@gmail.com> -- Stability : experimental -- Portability : non-portable -- -- High-level parsers to help you write your lexer. The module doesn't -- impose how you should write your parser, but certain approaches may be -- more elegant than others. Especially important theme is parsing of white -- space, comments, and indentation. -- -- This module is intended to be imported qualified: -- -- > import qualified Text.Megaparsec.Lexer as L {-# LANGUAGE CPP #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE MultiWayIf #-} {-# LANGUAGE TypeFamilies #-} module Text.Megaparsec.Lexer ( -- * White space space , lexeme , symbol , symbol' , skipLineComment , skipBlockComment , skipBlockCommentNested -- * Indentation , indentLevel , incorrectIndent , indentGuard , nonIndented , IndentOpt (..) , indentBlock , lineFold -- * Character and string literals , charLiteral -- * Numbers , integer , decimal , hexadecimal , octal , scientific , float , number , signed ) where import Control.Applicative import Control.Monad (void) import Data.Char (readLitChar) import Data.List.NonEmpty (NonEmpty (..)) import Data.Maybe (listToMaybe, fromMaybe, isJust) import Data.Scientific (Scientific, toRealFloat) import qualified Data.Set as E import Text.Megaparsec.Combinator import Text.Megaparsec.Error import Text.Megaparsec.Pos import Text.Megaparsec.Prim import qualified Text.Megaparsec.Char as C ---------------------------------------------------------------------------- -- White space -- | @space spaceChar lineComment blockComment@ produces parser that can -- parse white space in general. It's expected that you create such a parser -- once and pass it to other functions in this module as needed (when you -- see @spaceConsumer@ in documentation, usually it means that something -- like 'space' is expected there). -- -- @spaceChar@ is used to parse trivial space characters. You can use -- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as -- your own parser (if you don't want to automatically consume newlines, for -- example). -- -- @lineComment@ is used to parse line comments. You can use -- 'skipLineComment' if you don't need anything special. -- -- @blockComment@ is used to parse block (multi-line) comments. You can use -- 'skipBlockComment' if you don't need anything special. -- -- Parsing of white space is an important part of any parser. We propose a -- convention where every lexeme parser assumes no spaces before the lexeme -- and consumes all spaces after the lexeme; this is what the 'lexeme' -- combinator does, and so it's enough to wrap every lexeme parser with -- 'lexeme' to achieve this. Note that you'll need to call 'space' manually -- to consume any white space before the first lexeme (i.e. at the beginning -- of the file). space :: MonadParsec e s m => m () -- ^ A parser for a space character (e.g. @'void' 'C.spaceChar'@) -> m () -- ^ A parser for a line comment (e.g. 'skipLineComment') -> m () -- ^ A parser for a block comment (e.g. 'skipBlockComment') -> m () space ch line block = hidden . skipMany $ choice [ch, line, block] -- | This is a wrapper for lexemes. Typical usage is to supply the first -- argument (parser that consumes white space, probably defined via 'space') -- and use the resulting function to wrap parsers for every lexeme. -- -- > lexeme = L.lexeme spaceConsumer -- > integer = lexeme L.integer lexeme :: MonadParsec e s m => m () -- ^ How to consume white space after lexeme -> m a -- ^ How to parse actual lexeme -> m a lexeme spc p = p <* spc -- | This is a helper to parse symbols, i.e. verbatim strings. You pass the -- first argument (parser that consumes white space, probably defined via -- 'space') and then you can use the resulting function to parse strings: -- -- > symbol = L.symbol spaceConsumer -- > -- > parens = between (symbol "(") (symbol ")") -- > braces = between (symbol "{") (symbol "}") -- > angles = between (symbol "<") (symbol ">") -- > brackets = between (symbol "[") (symbol "]") -- > semicolon = symbol ";" -- > comma = symbol "," -- > colon = symbol ":" -- > dot = symbol "." symbol :: (MonadParsec e s m, Token s ~ Char) => m () -- ^ How to consume white space after lexeme -> String -- ^ String to parse -> m String symbol spc = lexeme spc . C.string -- | Case-insensitive version of 'symbol'. This may be helpful if you're -- working with case-insensitive languages. symbol' :: (MonadParsec e s m, Token s ~ Char) => m () -- ^ How to consume white space after lexeme -> String -- ^ String to parse (case-insensitive) -> m String symbol' spc = lexeme spc . C.string' -- | Given comment prefix this function returns a parser that skips line -- comments. Note that it stops just before the newline character but -- doesn't consume the newline. Newline is either supposed to be consumed by -- 'space' parser or picked up manually. skipLineComment :: (MonadParsec e s m, Token s ~ Char) => String -- ^ Line comment prefix -> m () skipLineComment prefix = p >> void (manyTill C.anyChar n) where p = C.string prefix n = lookAhead (void C.newline) <|> eof -- | @skipBlockComment start end@ skips non-nested block comment starting -- with @start@ and ending with @end@. skipBlockComment :: (MonadParsec e s m, Token s ~ Char) => String -- ^ Start of block comment -> String -- ^ End of block comment -> m () skipBlockComment start end = p >> void (manyTill C.anyChar n) where p = C.string start n = C.string end -- | @skipBlockCommentNested start end@ skips possibly nested block comment -- starting with @start@ and ending with @end@. -- -- @since 5.0.0 skipBlockCommentNested :: (MonadParsec e s m, Token s ~ Char) => String -- ^ Start of block comment -> String -- ^ End of block comment -> m () skipBlockCommentNested start end = p >> void (manyTill e n) where e = skipBlockCommentNested start end <|> void C.anyChar p = C.string start n = C.string end ---------------------------------------------------------------------------- -- Indentation -- | Return the current indentation level. -- -- The function is a simple shortcut defined as: -- -- > indentLevel = sourceColumn <$> getPosition -- -- @since 4.3.0 indentLevel :: MonadParsec e s m => m Pos indentLevel = sourceColumn <$> getPosition -- | Fail reporting incorrect indentation error. The error has attached -- information: -- -- * Desired ordering between reference level and actual level -- * Reference indentation level -- * Actual indentation level -- -- @since 5.0.0 incorrectIndent :: MonadParsec e s m => Ordering -- ^ Desired ordering between reference level and actual level -> Pos -- ^ Reference indentation level -> Pos -- ^ Actual indentation level -> m a incorrectIndent ord ref actual = failure E.empty E.empty (E.singleton x) where x = representIndentation ord ref actual -- | @indentGuard spaceConsumer ord ref@ first consumes all white space -- (indentation) with @spaceConsumer@ parser, then it checks the column -- position. Ordering between current indentation level and the reference -- indentation level @ref@ should be @ord@, otherwise the parser fails. On -- success the current column position is returned. -- -- When you want to parse a block of indentation, first run this parser with -- arguments like @indentGuard spaceConsumer GT (unsafePos 1)@—this will -- make sure you have some indentation. Use returned value to check -- indentation on every subsequent line according to syntax of your -- language. indentGuard :: MonadParsec e s m => m () -- ^ How to consume indentation (white space) -> Ordering -- ^ Desired ordering between reference level and actual level -> Pos -- ^ Reference indentation level -> m Pos -- ^ Current column (indentation level) indentGuard sc ord ref = do sc actual <- indentLevel if compare actual ref == ord then return actual else incorrectIndent ord ref actual -- | Parse a non-indented construction. This ensures that there is no -- indentation before actual data. Useful, for example, as a wrapper for -- top-level function definitions. -- -- @since 4.3.0 nonIndented :: MonadParsec e s m => m () -- ^ How to consume indentation (white space) -> m a -- ^ How to parse actual data -> m a nonIndented sc p = indentGuard sc EQ (unsafePos 1) *> p -- | The data type represents available behaviors for parsing of indented -- tokens. This is used in 'indentBlock', which see. -- -- @since 4.3.0 data IndentOpt m a b = IndentNone a -- ^ Parse no indented tokens, just return the value | IndentMany (Maybe Pos) ([b] -> m a) (m b) -- ^ Parse many indented tokens (possibly zero), use given indentation -- level (if 'Nothing', use level of the first indented token); the -- second argument tells how to get final result, and third argument -- describes how to parse an indented token | IndentSome (Maybe Pos) ([b] -> m a) (m b) -- ^ Just like 'IndentMany', but requires at least one indented token to -- be present -- | Parse a “reference” token and a number of other tokens that have -- greater (but the same) level of indentation than that of “reference” -- token. Reference token can influence parsing, see 'IndentOpt' for more -- information. -- -- Tokens /must not/ consume newlines after them. On the other hand, the -- first argument of this function /must/ consume newlines among other white -- space characters. -- -- @since 4.3.0 indentBlock :: (MonadParsec e s m, Token s ~ Char) => m () -- ^ How to consume indentation (white space) -> m (IndentOpt m a b) -- ^ How to parse “reference” token -> m a indentBlock sc r = do sc ref <- indentLevel a <- r case a of IndentNone x -> sc *> return x IndentMany indent f p -> do mlvl <- (optional . try) (C.eol *> indentGuard sc GT ref) done <- isJust <$> optional eof case (mlvl, done) of (Just lvl, False) -> indentedItems ref (fromMaybe lvl indent) sc p >>= f _ -> sc *> f [] IndentSome indent f p -> do lvl <- C.eol *> indentGuard sc GT ref x <- p xs <- indentedItems ref (fromMaybe lvl indent) sc p f (x:xs) -- | Grab indented items. This is a helper for 'indentBlock', it's not a -- part of the public API. indentedItems :: MonadParsec e s m => Pos -- ^ Reference indentation level -> Pos -- ^ Level of the first indented item ('lookAhead'ed) -> m () -- ^ How to consume indentation (white space) -> m b -- ^ How to parse indented tokens -> m [b] indentedItems ref lvl sc p = go where go = do sc pos <- indentLevel done <- isJust <$> optional eof if done then return [] else if | pos <= ref -> return [] | pos == lvl -> (:) <$> p <*> go | otherwise -> incorrectIndent EQ lvl pos -- | Create a parser that supports line-folding. The first argument is used -- to consume white space between components of line fold, thus it /must/ -- consume newlines in order to work properly. The second argument is a -- callback that receives a custom space-consuming parser as argument. This -- parser should be used after separate components of line fold that can be -- put on different lines. -- -- An example should clarify the usage pattern: -- -- > sc = L.space (void spaceChar) empty empty -- > -- > myFold = L.lineFold sc $ \sc' -> do -- > L.symbol sc' "foo" -- > L.symbol sc' "bar" -- > L.symbol sc "baz" -- for the last symbol we use normal space consumer -- -- @since 5.0.0 lineFold :: MonadParsec e s m => m () -- ^ How to consume indentation (white space) -> (m () -> m a) -- ^ Callback that uses provided space-consumer -> m a lineFold sc action = sc >> indentLevel >>= action . void . indentGuard sc GT ---------------------------------------------------------------------------- -- Character and string literals -- | The lexeme parser parses a single literal character without quotes. The -- purpose of this parser is to help with parsing of conventional escape -- sequences. It's your responsibility to take care of character literal -- syntax in your language (by surrounding it with single quotes or -- similar). -- -- The literal character is parsed according to the grammar rules defined in -- the Haskell report. -- -- Note that you can use this parser as a building block to parse various -- string literals: -- -- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"') -- -- If you want to write @stringLiteral@ that adheres to the Haskell report -- though, you'll need to take care of the @\\&@ combination which is not a -- character, but can be used to separate characters (as in @\"\\291\\&4\"@ -- which is two characters long): -- -- > stringLiteral = catMaybes <$> (char '"' >> manyTill ch (char '"')) -- > where ch = (Just <$> L.charLiteral) <|> (Nothing <$ string "\\&") charLiteral :: (MonadParsec e s m, Token s ~ Char) => m Char charLiteral = label "literal character" $ do -- The @~@ is needed to avoid requiring a MonadFail constraint, -- and we do know that r will be non-empty if count' succeeds. ~r@(x:_) <- lookAhead $ count' 1 8 C.anyChar case listToMaybe (readLitChar r) of Just (c, r') -> count (length r - length r') C.anyChar >> return c Nothing -> unexpected (Tokens (x:|[])) ---------------------------------------------------------------------------- -- Numbers -- | Parse an integer without sign in decimal representation (according to -- the format of integer literals described in the Haskell report). -- -- If you need to parse signed integers, see 'signed' combinator. integer :: (MonadParsec e s m, Token s ~ Char) => m Integer integer = decimal <?> "integer" -- | The same as 'integer', but 'integer' is 'label'ed with “integer” label, -- while this parser is labeled with “decimal integer”. decimal :: (MonadParsec e s m, Token s ~ Char) => m Integer decimal = nump "" C.digitChar <?> "decimal integer" -- | Parse an integer in hexadecimal representation. Representation of -- hexadecimal number is expected to be according to the Haskell report -- except for the fact that this parser doesn't parse “0x” or “0X” prefix. -- It is a responsibility of the programmer to parse correct prefix before -- parsing the number itself. -- -- For example you can make it conform to Haskell report like this: -- -- > hexadecimal = char '0' >> char' 'x' >> L.hexadecimal hexadecimal :: (MonadParsec e s m, Token s ~ Char) => m Integer hexadecimal = nump "0x" C.hexDigitChar <?> "hexadecimal integer" -- | Parse an integer in octal representation. Representation of octal -- number is expected to be according to the Haskell report except for the -- fact that this parser doesn't parse “0o” or “0O” prefix. It is a -- responsibility of the programmer to parse correct prefix before parsing -- the number itself. octal :: (MonadParsec e s m, Token s ~ Char) => m Integer octal = nump "0o" C.octDigitChar <?> "octal integer" -- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then -- prepends @prefix@ to returned value and tries to interpret the result as -- an integer according to Haskell syntax. nump :: MonadParsec e s m => String -> m Char -> m Integer nump prefix baseDigit = read . (prefix ++) <$> some baseDigit -- | Parse a floating point value as a 'Scientific' number. 'Scientific' is -- great for parsing of arbitrary precision numbers coming from an untrusted -- source. See documentation in "Data.Scientific" for more information. -- Representation of the floating point value is expected to be according to -- the Haskell report. -- -- This function does not parse sign, if you need to parse signed numbers, -- see 'signed'. -- -- @since 5.0.0 scientific :: (MonadParsec e s m, Token s ~ Char) => m Scientific scientific = label "floating point number" (read <$> f) where f = (++) <$> some C.digitChar <*> (fraction <|> fExp) -- | Parse a floating point number without sign. This is a simple shortcut -- defined as: -- -- > float = toRealFloat <$> scientific float :: (MonadParsec e s m, Token s ~ Char) => m Double float = toRealFloat <$> scientific -- | This is a helper for 'float' parser. It parses fractional part of -- floating point number, that is, dot and everything after it. fraction :: (MonadParsec e s m, Token s ~ Char) => m String fraction = do void (C.char '.') d <- some C.digitChar e <- option "" fExp return ('.' : d ++ e) -- | This helper parses exponent of floating point numbers. fExp :: (MonadParsec e s m, Token s ~ Char) => m String fExp = do expChar <- C.char' 'e' signStr <- option "" (pure <$> choice (C.char <$> "+-")) d <- some C.digitChar return (expChar : signStr ++ d) -- | Parse a number: either integer or floating point. The parser can handle -- overlapping grammars graciously. Use functions like -- 'Data.Scientific.floatingOrInteger' from "Data.Scientific" to test and -- extract integer or real values. number :: (MonadParsec e s m, Token s ~ Char) => m Scientific number = label "number" (read <$> f) where f = (++) <$> some C.digitChar <*> option "" (fraction <|> fExp) -- | @signed space p@ parser parses an optional sign, then if there is a -- sign it will consume optional white space (using @space@ parser), then it -- runs parser @p@ which should return a number. Sign of the number is -- changed according to previously parsed sign. -- -- For example, to parse signed integer you can write: -- -- > lexeme = L.lexeme spaceConsumer -- > integer = lexeme L.integer -- > signedInteger = L.signed spaceConsumer integer signed :: (MonadParsec e s m, Token s ~ Char, Num a) => m () -> m a -> m a signed spc p = ($) <$> option id (lexeme spc sign) <*> p -- | Parse a sign and return either 'id' or 'negate' according to parsed -- sign. sign :: (MonadParsec e s m, Token s ~ Char, Num a) => m (a -> a) sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate)