{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeSynonymInstances #-}
module Text.XML.Lexer where
import Common
import Text.XML.Types
import Utils
import Data.Char (isAsciiLower, isAsciiUpper, isDigit, toLower)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Short as TS
import Numeric (readHex)
class XmlSource s where
uncons :: s -> Maybe (Char,s)
instance XmlSource String where
uncons (c:s) = Just (c,s)
uncons "" = Nothing
instance XmlSource T.Text where
uncons = T.uncons
instance XmlSource TL.Text where
uncons = TL.uncons
data Scanner s = Scanner (Maybe (Char,s)) (s -> Maybe (Char,s))
customScanner :: (s -> Maybe (Char,s)) -> s -> Scanner s
customScanner next s = Scanner (next s) next
instance XmlSource (Scanner s) where
uncons (Scanner this next) = do
(c,s1) <- this
return (c, Scanner (next s1) next)
type LChar = (Pos,Char)
type LString = [LChar]
data Token = TokStart !Pos QName [Attr] Bool
| TokEnd !Pos QName
| TokCRef ShortText
| TokText CData
| TokError !Pos String
| TokXmlDecl XmlDeclaration
| TokComment Comment
| TokPI !Pos PI
| TokDTD Text
deriving (Show,Data,Typeable,Generic)
instance NFData Token
eofErr :: [Token]
eofErr = [TokError (-1) "Premature EOF"]
scanXML :: XmlSource source => source -> [Token]
scanXML = tokens0 . eolNorm . go 0
where
go !n src = case uncons src of
Just (c,src') -> (n,c) : go (n+1) src'
Nothing -> []
eolNorm :: LString -> LString
eolNorm [] = []
eolNorm ((_,'\xD'):c@(_,'\xA'):cs) = c : eolNorm cs
eolNorm ((n,'\xD'):cs) = (n,'\xA') : eolNorm cs
eolNorm (c:cs) = c : eolNorm cs
tokens0 :: LString -> [Token]
tokens0 cs = tokens' cs
tokens' :: LString -> [Token]
tokens' ((_,'<') : (_,'!') : cs) = special cs
tokens' ((n,'<') : (_,'?') : cs) = procins n cs
tokens' ((_,'<') : cs) = tag cs
tokens' [] = []
tokens' cs@((n,_):_) = let (as,bs) = breakn ('<' ==) cs
in foldr cvt (tokens' bs) (decode_text as)
where
cvt (TxtBit x) cont
| T.all isChar dat = TokText CData { cdVerbatim = CDataText, cdData = dat } : cont
| otherwise = [TokError n "invalid code-point in text content"]
where
dat = T.pack x
cvt (CRefBit x) cont = case cref_to_char x of
Just c
| isChar c -> TokText CData { cdVerbatim = CDataText, cdData = T.singleton c } : cont
| otherwise -> [TokError n "invalid character reference"]
Nothing -> TokCRef (fromString x) : cont
procins :: Pos -> LString -> [Token]
procins n0 = go ""
where
go acc ((_,'?') : (_,'>') : ds) = mkPI (reverse acc) (tokens' ds)
go acc ((_,c) : ds) = go (c:acc) ds
go _ [] = eofErr
mkPI :: String -> [Token] -> [Token]
mkPI s0 ts
| tgt == "xml" = mkXMLDecl s' ts
| map toLower (TS.unpack tgt) == "xml" = [TokError (n0+2) "Invalid PI name"]
| otherwise = TokPI n0 (PI tgt payload) : ts
where
(tgt0,s') = break isS s0
tgt = TS.fromString tgt0
payload = T.pack (dropWhile isS s')
mkXMLDecl s0 ts
| n0 > 0 = [TokError n0 "XML declaration allowed only at the start of the document"]
| otherwise = go1 (simpleTokenize s0)
where
go1 ("":"version":"=":ver:rest)
| Just "1.0" <- unbrack ver = go2 rest
go1 _ = [TokError n0 "Unsupported or missing 'version' in XML declaration"]
go2 ("":"encoding":"=":enc:rest)
| Just enc' <- unbrack enc, isEnc enc' = go3 (Just $ TS.pack enc') rest
| otherwise = [TokError n0 "Bad 'encoding' value in XML declaration"]
go2 rest = go3 Nothing rest
go3 enc ("":"standalone":"=":sd:rest)
| Just sd' <- unbrack sd, Just sd'' <- isBoo sd' = go4 enc (Just sd'') rest
| otherwise = [TokError n0 "Bad 'standalone' value in XML declaration"]
go3 enc rest = go4 enc Nothing rest
go4 enc sd [] = TokXmlDecl (XmlDeclaration enc sd) : ts
go4 enc sd [""] = TokXmlDecl (XmlDeclaration enc sd) : ts
go4 _ _ _ = [TokError n0 "unexpected or malformed attribute in XML declaration"]
isEnc [] = False
isEnc (c:cs) = (isAsciiLower c || isAsciiUpper c) &&
all (\c' -> isAsciiLower c' || isAsciiUpper c' || isDigit c' || c' `elem` ['.','_','-']) cs
isBoo "yes" = Just True
isBoo "no" = Just False
isBoo _ = Nothing
unbrack ('\'':xs) | Just (s,'\'') <- unsnoc xs = Just s
unbrack ('"':xs) | Just (s,'"') <- unsnoc xs = Just s
unbrack _ = Nothing
special :: LString -> [Token]
special ((_,'-') : (_,'-') : cs) = go "" cs
where
go acc ((n,'-') : (_,'-') : (_,x) : ds)
| x == '>' = TokComment (Comment $ T.pack (reverse acc)) : tokens' ds
| otherwise = [TokError n "double hyphen within comment"]
go acc ((_,c) : ds) = go (c:acc) ds
go _ [] = eofErr
special ((n,'[') : (_,'C') : (_,'D') : (_,'A') : (_,'T') : (_,'A') : (_,'[') : cs) =
let (xs,ts) = cdata cs
dat = T.pack xs
in if T.all isChar dat then TokText CData { cdVerbatim = CDataVerbatim, cdData = dat } : tokens' ts
else [TokError (n-2) "invalid code-point in CDATA block"]
where
cdata ((_,']') : (_,']') : (_,'>') : ds) = ([],ds)
cdata ((_,d) : ds) = let (xs,ys) = cdata ds in (d:xs,ys)
cdata [] = ([],[])
special ((_,'D') : (_,'O') : (_,'C') : (_,'T') : (_,'Y') : (_,'P') : (_,'E') : cs) =
let (xs,ts) = munch "" 0 cs in TokDTD (T.pack (reverse xs)) : tokens' ts
where
munch acc nesting ((_,'>') : ds)
| nesting == (0::Int) = (acc,ds)
| otherwise = munch ('>':acc) (nesting-1) ds
munch acc nesting ((_,'<') : ds) = munch ('<':acc) (nesting+1) ds
munch acc n ((_,x) : ds) = munch (x:acc) n ds
munch acc _ [] = (acc,[])
special ((n,_):_) = [TokError (n-1) "invalid element name"]
special [] = eofErr
qualName :: LString -> (QName,LString)
qualName xs = (QName { qURI = Nothing
, qPrefix = fmap fromString q
, qLName = LName (fromString n)
}, bs)
where
(as,bs) = breakn endName xs
(q,n) = case break (':'==) as of
(q1,_:n1) -> (Just q1, n1)
_ -> (Nothing, as)
endName x = isS x || x == '=' || x == '>' || x == '/'
tag :: LString -> [Token]
tag ((p,'/') : cs)
| isValidQName n
= TokEnd p n : case dropSpace ds of
(_,'>') : es -> tokens' es
(p',_) : _ -> [TokError p' "expected '>'"]
[] -> eofErr
| otherwise = [TokError p "invalid element name"]
where
(n,ds) = qualName (dropSpace cs)
tag [] = eofErr
tag cs@((pos,_):_)
| not (isValidQName n) = [TokError pos "invalid element name"]
| not (all (isValidQName . attrKey) as) = [TokError pos "invalid attribute name"]
| not (all (T.all isChar . attrVal) as) = [TokError pos "invalid attribute value"]
| otherwise = TokStart pos n as b : ts
where
(n,ds) = qualName cs
(as,b,ts) = attribs (dropSpace ds)
attribs :: LString -> ([Attr], Bool, [Token])
attribs cs = case cs of
(_,'>') : ds -> ([], False, tokens' ds)
(_,'/') : ds -> ([], True, case ds of
(_,'>') : es -> tokens' es
(pos,_) : _ -> [TokError pos "expected '>'"]
[] -> eofErr)
(_,'?') : (_,'>') : ds -> ([], True, tokens' ds)
[] -> ([],False,eofErr)
_ -> let (a,cs1) = attrib cs
(as,b,ts) = attribs cs1
in (a:as,b,ts)
attrib :: LString -> (Attr,LString)
attrib cs = ((Attr ks (fromString $ decode_attr vs)),dropSpace cs2)
where
(vs,cs2) = attr_val (dropSpace cs1)
(ks,cs1) = qualName cs
attr_val :: LString -> (String,LString)
attr_val ((_,'=') : cs0) = string (dropSpace cs0)
where
string :: LString -> (String,LString)
string ((_,'"') : cs) = break' ('"' ==) cs
string ((_,'\'') : cs) = break' ('\'' ==) cs
string cs = ("\0",cs)
attr_val cs = ("\0",cs)
dropSpace :: LString -> LString
dropSpace = dropWhile (isS . snd)
break' :: (a -> Bool) -> [(b,a)] -> ([a],[(b,a)])
break' p xs = let (as,bs) = breakn p xs
in (as, case bs of
[] -> []
_ : cs -> cs)
breakn :: (a -> Bool) -> [(b,a)] -> ([a],[(b,a)])
breakn p l = (map snd as,bs) where (as,bs) = break (p . snd) l
decode_attr :: String -> String
decode_attr cs = concatMap cvt (decode_text cs)
where
cvt (TxtBit x) = norm x
cvt (CRefBit x)
| Just c <- cref_to_char x = [c]
| otherwise = "\0"
norm [] = []
norm ('\x9':xs) = '\x20' : norm xs
norm ('\xA':xs) = '\x20' : norm xs
norm (x:xs) = x : norm xs
data Txt = TxtBit String | CRefBit String deriving Show
decode_text :: [Char] -> [Txt]
decode_text xs@('&' : cs) = case break (';' ==) cs of
(as,_:bs) -> CRefBit as : decode_text bs
_ -> [TxtBit xs]
decode_text [] = []
decode_text cs = let (as,bs) = break ('&' ==) cs
in TxtBit as : decode_text bs
cref_to_char :: [Char] -> Maybe Char
cref_to_char cs = case cs of
'#' : ds -> num_esc ds
"lt" -> Just '<'
"gt" -> Just '>'
"amp" -> Just '&'
"apos" -> Just '\''
"quot" -> Just '"'
_ -> Nothing
num_esc :: String -> Maybe Char
num_esc cs = case cs of
'x' : ds -> check (readHex ds)
_ -> check (reads cs)
where check [(n,"")] = cvt_char n
check _ = Nothing
cvt_char :: Int -> Maybe Char
cvt_char x
| fromEnum (minBound :: Char) <= x && x <= fromEnum (maxBound::Char)
= Just (toEnum x)
| otherwise = Nothing
simpleTokenize :: String -> [String]
simpleTokenize [] = []
simpleTokenize (c:cs)
| isSorEQ c = let (sep,rest) = span isSorEQ (c:cs)
in (if ('=' `elem` sep) then "=" else "") : simpleTokenize rest
| c == '\'' = case break (== '\'') cs of
(_,"") -> [c:cs]
(str,_:rest) -> (c:str++"'") : simpleTokenize rest
| c == '"' = case break (== '"') cs of
(_,"") -> [c:cs]
(str,_:rest) -> (c:str++"\"") : simpleTokenize rest
| otherwise = let (t,rest) = break isSorEQ (c:cs)
in t : simpleTokenize rest
where
isSorEQ x = isS x || x == '='
isValidQName :: QName -> Bool
isValidQName (QName { qPrefix = Just pfx, qLName = LName ln }) = isNCName (TS.unpack pfx) && isNCName (TS.unpack ln)
isValidQName (QName { qPrefix = Nothing, qLName = LName ln }) = isNCName (TS.unpack ln)