tlex: A lexer generator

Versions [RSS]	0.1.0.0, 0.2.0.0, 0.3.0.0, 0.4.0.0, 0.4.0.1, 0.5.0.0
Change log	CHANGELOG.md
Dependencies	base (>=4.12.0 && <5), containers (>=0.6.0 && <1), enummapset (>=0.7.1 && <1), tlex-core (>=0.4.0 && <0.5) [details]
License	(Apache-2.0 OR MPL-2.0)
Copyright	(c) 2020 Mizunashi Mana
Author	Mizunashi Mana
Maintainer	contact@mizunashi.work
Category	Parsing
Home page	https://github.com/mizunashi-mana/tlex
Bug tracker	https://github.com/mizunashi-mana/tlex/issues
Source repo	head: git clone https://github.com/mizunashi-mana/tlex.git
Uploaded	by mizunashi_mana at 2023-11-02T06:59:26Z
Distributions	NixOS:0.5.0.0
Reverse Dependencies	3 direct, 0 indirect [details]
Downloads	614 total (4 in the last 30 days)
Rating	(no votes yet) [estimated by Bayesian average]
Your Rating	λ λ λ
Status	Docs available [build log] Last success reported on 2023-11-02 [all 1 reports]

Readme for tlex-0.5.0.0

[back to package description]

Tlex: A Generator for Lexical Analysers

Installation

Add the following dependencies to your package's .cabal file:

build-depends:
    base,
    bytestring,
    tlex,          -- main
    tlex-encoding, -- for utf8 parsing
    tlex-th,       -- for outputting the lexer with Template Haskell
    charset,
    template-haskell,

Usage

Setup:

import qualified Data.CharSet                        as CharSet
import qualified Data.Word                           as Word
import qualified Language.Haskell.TH                 as TH
import qualified Language.Lexer.Tlex                 as Tlex
import qualified Language.Lexer.Tlex.Plugin.Encoding as TlexEnc
import qualified Language.Lexer.Tlex.Plugin.TH       as TlexTH


-- | Type of lexer states. It is @()@ here, so there is exactly one state value.
type LexerState = ()
-- | Semantic action attached to a rule: turns a list of code units into a 'Token'.
type LexerAction = [LexerCodeUnit] -> Token
-- | Unit of input the scanner works on: a single 8-bit word.
type LexerCodeUnit = Word.Word8

-- | 'TlexTH.THScannerBuilder' specialised to this example's state, code unit
-- and action types.
type ScannerBuilder = TlexTH.THScannerBuilder LexerState LexerCodeUnit LexerAction
-- | 'Tlex.Pattern' specialised to this example's code unit type.
type Pattern = Tlex.Pattern LexerCodeUnit

-- | Register a lexer rule for the single lexer state @()@.
--
-- The first argument is the pattern to match; the second is a typed
-- Template Haskell quotation of the action to attach to that pattern.
rule :: Pattern -> TH.Q (TH.TExp LexerAction) -> ScannerBuilder ()
rule pat action = TlexTH.thLexRule [()] pat action

Setup charSetP:

-- | Pattern matching any character of the given set, built with the
-- 'TlexEnc.charSetPUtf8' encoder.
charSetP :: CharSet.CharSet -> Pattern
charSetP = TlexEnc.charSetP TlexEnc.charSetPUtf8

-- | Pattern matching exactly the given character, built with the
-- 'TlexEnc.charSetPUtf8' encoder.
chP :: Char -> Pattern
chP = TlexEnc.chP TlexEnc.charSetPUtf8

Write lexer rules:

-- | Build the scanner described by 'lexerRules' and emit it as Template
-- Haskell declarations, ready to be spliced at the top level of a module.
buildLexer :: TH.Q [TH.Dec]
buildLexer =
    TlexTH.buildTHScannerWithReify lexerRules >>= TlexTH.outputScanner

-- | Tokens built by the lexer actions. Every constructor carries the list of
-- code units handed to the action.
data Token
    = TokWhiteSpace [LexerCodeUnit]
    -- ^ Built by the rule for one or more white-space characters.
    | TokSmallAlpha [LexerCodeUnit]
    -- ^ Built by the rule for a character in @\'a\'..\'z\'@.
    | TokLargeAlpha [LexerCodeUnit]
    -- ^ Built by the rule for a character in @\'A\'..\'Z\'@.
    | TokDigit [LexerCodeUnit]
    -- ^ Built by the rule for a character in @\'0\'..\'9\'@.

-- | All rules of the example lexer, registered in the order listed:
-- white space, lower-case letter, upper-case letter, digit.
lexerRules :: ScannerBuilder ()
lexerRules = sequence_
    [ rule (Tlex.someP whitecharP) [||TokWhiteSpace||]
    , rule (rangeP 'a' 'z') [||TokSmallAlpha||]
    , rule (rangeP 'A' 'Z') [||TokLargeAlpha||]
    , rule (rangeP '0' '9') [||TokDigit||]
    ]
  where
    -- Pattern for a single character in the inclusive range @lo..hi@.
    rangeP lo hi = charSetP (CharSet.range lo hi)

-- | Pattern matching a single white-space character: space, tab, line feed
-- or carriage return.
--
-- Every alternative must itself be a 'Pattern', so each character is
-- wrapped with 'chP'; a bare 'Char' in this list does not typecheck.
whitecharP :: Pattern
whitecharP = Tlex.orP
    [ chP ' '
    , chP '\t'
    , chP '\n'
    , chP '\r'
    ]

Build lexer:

-- Top-level Template Haskell splice: runs 'Lexer.Rules.buildLexer' and inserts
-- the declarations it returns into this module.
-- NOTE(review): presumably this is what defines the @tlexScan@ used below — confirm.
$(Lexer.Rules.buildLexer)

-- | Computation that reads from a fixed 'ByteString' while threading an 'Int'
-- through: given the whole input and the current 'Int', it yields a result
-- and the next 'Int'. The 'TlexContext' instance uses that 'Int' as the byte
-- offset into the input.
newtype InputByteString a = InputByteString
    { unInputByteString :: ByteString -> Int -> (a, Int)
    }
    -- The Functor/Applicative/Monad instances are borrowed from the
    -- representationally equal reader-over-state stack (needs DerivingVia).
    deriving (Functor, Applicative, Monad)
        via (ReaderT ByteString (State Int))

-- | Run an 'InputByteString' computation against the given input, starting
-- with the threaded 'Int' at 0. Returns the result paired with the final 'Int'.
runInputByteString :: InputByteString a -> ByteString -> (a, Int)
runInputByteString action bytes = unInputByteString action bytes 0

-- | Lets the generated scanner read bytes from an 'InputByteString',
-- using the threaded 'Int' as the current byte offset.
instance TlexContext Int Word8 InputByteString where
    -- Yield the byte at the current offset and step past it. The offset must
    -- advance on a successful read, otherwise the scanner would be handed the
    -- same byte forever. At end of input the offset is left unchanged.
    tlexGetInputPart = InputByteString $ \bs i -> case bs `indexMaybe` i of
        Nothing -> (Nothing, i)
        Just w  -> (Just w, i + 1)
    -- The mark is simply the current offset; the input itself is not needed.
    tlexGetMark = InputByteString $ \_ i -> (i, i)

-- | Split the input into the lexemes recognised by the generated scanner.
--
-- Returns @Just lexemes@ (in input order) once the scanner reports end of
-- input, or @Nothing@ as soon as the scanner does not accept the remaining
-- input. To obtain 'Token's instead of raw lexemes, apply the accepted action
-- to @ByteString.unpack lexeme@.
lexByteString :: ByteString.ByteString -> Maybe [ByteString.ByteString]
lexByteString s0 = go s0 id where
    -- @acc@ is a difference list: it prepends everything lexed so far to the
    -- list it is given, so appending one more lexeme needs no list traversal.
    go s acc = case runInputByteString (tlexScan ()) s of
        (TlexEndOfInput, _)      -> Just $ acc []
        (TlexNotAccepted, _)     -> Nothing
        (TlexAccepted n _act, _) ->
            -- Each scan starts at offset 0 of @s@, so the reported mark @n@
            -- is the length of the accepted lexeme within @s@.
            let (lexeme, rest) = ByteString.splitAt n s
            in go rest $ \ls -> acc (lexeme : ls)

Examples

Small language: https://github.com/mizunashi-mana/tlex/tree/master/example/small-lang
Haskell2010: https://github.com/mizunashi-mana/tlex/tree/master/example/haskell2010