{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Safe #-}
{-# LANGUAGE TypeSynonymInstances #-}
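-- | Lexer for XML documents: turns an 'XmlSource' into a flat stream of
-- position-annotated 'Token's, leaving namespace resolution and tree
-- construction to later stages.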
module Text.XML.Lexer where
import Common
import Text.XML.Types
import Utils
import Data.Char (isAsciiLower, isAsciiUpper, isDigit, toLower)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Short as TS
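-- | The null namespace: an empty 'URI', assigned by 'qualName' to names whose
-- namespace has not been resolved yet.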
nullNs :: URI
nullNs = URI mempty
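-- | Character sources the lexer can consume one 'Char' at a time.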
class XmlSource s where
  uncons :: s -> Maybe (Char,s)

instance XmlSource String where
  uncons (c:s) = Just (c,s)
  uncons ""    = Nothing

instance XmlSource T.Text where
  uncons = T.uncons

instance XmlSource TL.Text where
  uncons = TL.uncons
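-- | Wraps a user-supplied @uncons@-style function so that any type can serve
-- as an 'XmlSource'; construct it with 'customScanner'.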
data Scanner s = Scanner (Maybe (Char,s)) (s -> Maybe (Char,s))
customScanner :: (s -> Maybe (Char,s)) -> s -> Scanner s
customScanner next s = Scanner (next s) next
instance XmlSource (Scanner s) where
  uncons (Scanner this next) = do
    (c,s1) <- this
    return (c, Scanner (next s1) next)
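-- | A 'Char' paired with its position ('Pos') in the input stream.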
type LChar = (Pos,Char)
type LString = [LChar]
type LexCont = LString -> [Token]
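-- | Lexical tokens produced by the scanner: start/end tags, character data
-- (plain or CDATA), unresolved entity references, comments, processing
-- instructions, the XML declaration, a raw DOCTYPE payload, and lexical
-- errors carrying a position and a message.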
data Token = TokStart !Pos QName [Attr] Bool
           | TokEnd !Pos QName
           | TokCRef ShortText
           | TokText CData
           | TokError !Pos String
           | TokXmlDecl XmlDeclaration
           | TokComment Comment
           | TokPI !Pos PI
           | TokDTD Text
           deriving (Show,Data,Typeable,Generic)
instance NFData Token
eofErr :: [Token]
eofErr = [TokError (-1) "Premature EOF"]
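-- | Lex a complete XML document into a 'Token' stream: the input is paired
-- with character offsets, line endings are normalised via 'eolNorm', and
-- scanning starts in content mode ('tokens2'). Lexical problems terminate the
-- stream with a 'TokError'.
--
-- A rough usage sketch (the rendering of the tokens depends on the 'Show'
-- instances in "Text.XML.Types"):
--
-- > scanXML ("<a x='1'>hi</a>" :: String)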
scanXML :: XmlSource source => source -> [Token]
scanXML = tokens2 . eolNorm . go 0
  where
    go !n src = case uncons src of
        Just (c,src') -> (n,c) : go (n+1) src'
        Nothing       -> []
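-- | Variant of 'scanXML' that starts in prolog mode ('tokens1'), which
-- rejects non-whitespace character data before the root element.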
scanXML' :: XmlSource source => source -> [Token]
scanXML' = tokens1 . eolNorm . go 0
  where
    go !n src = case uncons src of
        Just (c,src') -> (n,c) : go (n+1) src'
        Nothing       -> []
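-- | End-of-line normalisation (XML 1.0, section 2.11): a CR LF pair and a
-- lone CR are both replaced by a single LF.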
eolNorm :: LString -> LString
eolNorm [] = []
eolNorm ((_,'\xD'):c@(_,'\xA'):cs) = c : eolNorm cs
eolNorm ((n,'\xD'):cs) = (n,'\xA') : eolNorm cs
eolNorm (c:cs) = c : eolNorm cs
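-- | Prolog scanning mode: markup is dispatched as usual, but bare character
-- data must be whitespace (see 'sData'); the first start tag switches to
-- content mode ('tokens2').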
tokens1 :: LexCont
tokens1 [] = []
tokens1 ((_,'<') : (_,'!') : cs) = special tokens1 cs
tokens1 ((n,'<') : (_,'?') : cs) = procins tokens1 n cs
tokens1 ((_,'<') : cs) = tag tokens2 cs
tokens1 cs@(_:_) = sData tokens1 cs
sData :: LexCont -> LString -> [Token]
sData lcont [] = lcont []
sData lcont cs@((n,_):_)
  = case breakn (not . isS) cs of
      ("",_:_) -> [TokError n "unexpected (non-misc) content nodes before root element"]
      (as,bs)  -> TokText CData { cdVerbatim = CDataText, cdData = T.pack as } : lcont bs
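-- | Content scanning mode: markup is dispatched to the specialised lexers and
-- everything else is treated as character data ('charData').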
tokens2 :: LexCont
tokens2 [] = []
tokens2 ((_,'<') : (_,'!') : cs) = special tokens2 cs
tokens2 ((n,'<') : (_,'?') : cs) = procins tokens2 n cs
tokens2 ((_,'<') : cs) = tag tokens2 cs
tokens2 cs@(_:_) = charData tokens2 cs
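-- | Lex character data up to the next @<@, decoding character and entity
-- references and rejecting a literal @]]>@ sequence as well as invalid code
-- points.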
charData :: LexCont -> LString -> [Token]
charData lcont [] = lcont []
charData lcont cs@((n,_):_)
  = let (as,bs) = breakn ('<' ==) cs
    in foldr cvt (lcont bs) (decode_text as)
  where
    cvt :: Txt -> ([Token] -> [Token])
    cvt (TxtBit x) cont
      | T.isInfixOf "]]>" dat = [TokError n "invalid literal ']]>' sequence in text content"]
      | T.all isChar dat      = TokText CData { cdVerbatim = CDataText, cdData = dat } : cont
      | otherwise             = [TokError n "invalid code-point in text content"]
      where
        dat = T.pack x
    cvt (CRefBit _ False) _   = [TokError n "invalid character reference"]
    cvt (CRefBit x True) cont = case cref_to_char x of
        Just c
          | isChar c  -> TokText CData { cdVerbatim = CDataText, cdData = T.singleton c } : cont
          | otherwise -> [TokError n "invalid character reference"]
        Nothing -> TokCRef (fromString x) : cont
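-- | Lex a processing instruction (after @<?@ has been consumed); a target of
-- @xml@ is handed to the XML-declaration parser instead.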
procins :: LexCont -> Pos -> LString -> [Token]
procins lcont n0 = go ""
  where
    go acc ((_,'?') : (_,'>') : ds) = mkPI (reverse acc) (lcont ds)
    go acc ((_,c) : ds)             = go (c:acc) ds
    go _   []                       = eofErr

    mkPI :: String -> [Token] -> [Token]
    mkPI s0 ts
      | tgt == "xml" = mkXMLDecl s' ts
      | map toLower tgt0 == "xml" || not (isNCName tgt0) = [TokError (n0+2) "Invalid PI name"]
      | not (T.all isChar payload) = [TokError (n0+2) "invalid code-point in PI data"]
      | otherwise = TokPI n0 (PI tgt payload) : ts
      where
        (tgt0,s') = break isS s0
        tgt       = TS.fromString tgt0
        payload   = T.pack (dropWhile isS s')

    mkXMLDecl s0 ts
      | n0 > 0    = [TokError n0 "XML declaration allowed only at the start of the document"]
      | otherwise = go1 (simpleTokenize s0)
      where
        go1 ("":"version":"=":ver:rest)
          | Just "1.0" <- unbrack ver = go2 rest
        go1 _ = [TokError n0 "Unsupported or missing 'version' in XML declaration"]

        go2 ("":"encoding":"=":enc:rest)
          | Just enc' <- unbrack enc, isEnc enc' = go3 (Just $ TS.pack enc') rest
          | otherwise = [TokError n0 "Bad 'encoding' value in XML declaration"]
        go2 rest = go3 Nothing rest

        go3 enc ("":"standalone":"=":sd:rest)
          | Just sd' <- unbrack sd, Just sd'' <- isBoo sd' = go4 enc (Just sd'') rest
          | otherwise = [TokError n0 "Bad 'standalone' value in XML declaration"]
        go3 enc rest = go4 enc Nothing rest

        go4 enc sd []   = TokXmlDecl (XmlDeclaration enc sd) : ts
        go4 enc sd [""] = TokXmlDecl (XmlDeclaration enc sd) : ts
        go4 _   _  _    = [TokError n0 "unexpected or malformed attribute in XML declaration"]

        isEnc []     = False
        isEnc (c:cs) = (isAsciiLower c || isAsciiUpper c) &&
                       all (\c' -> isAsciiLower c' || isAsciiUpper c' || isDigit c' || c' `elem` ['.','_','-']) cs

        isBoo "yes" = Just True
        isBoo "no"  = Just False
        isBoo _     = Nothing

        unbrack ('\'':xs) | Just (s,'\'') <- unsnoc xs = Just s
        unbrack ('"':xs)  | Just (s,'"')  <- unsnoc xs = Just s
        unbrack _ = Nothing
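-- | Lex the @<!@ constructs: comments, CDATA sections and (nesting-aware)
-- DOCTYPE declarations, whose raw text is kept in a 'TokDTD'.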
special :: LexCont -> LString -> [Token]
special lcont ((n0,'-') : (_,'-') : cs) = go "" cs
  where
    go acc ((n,'-') : (_,'-') : (_,x) : ds)
      | x == '>' =
          let dat = T.pack (reverse acc)
          in if T.all isChar dat
               then TokComment (Comment dat) : lcont ds
               else [TokError (n0-2) "invalid code-point in comment"]
      | otherwise = [TokError n "double hyphen within comment"]
    go acc ((_,c) : ds) = go (c:acc) ds
    go _ [] = eofErr

special lcont ((n,'[') : (_,'C') : (_,'D') : (_,'A') : (_,'T') : (_,'A') : (_,'[') : cs) =
    let (xs,ts) = cdata cs
        dat     = T.pack xs
    in if T.all isChar dat
         then TokText CData { cdVerbatim = CDataVerbatim, cdData = dat } : lcont ts
         else [TokError (n-2) "invalid code-point in CDATA block"]
  where
    cdata ((_,']') : (_,']') : (_,'>') : ds) = ([],ds)
    cdata ((_,d) : ds) = let (xs,ys) = cdata ds in (d:xs,ys)
    cdata []           = ([],[])

special lcont ((_,'D') : (_,'O') : (_,'C') : (_,'T') : (_,'Y') : (_,'P') : (_,'E') : cs) =
    let (xs,ts) = munch "" 0 cs in TokDTD (T.pack (reverse xs)) : lcont ts
  where
    munch acc nesting ((_,'>') : ds)
      | nesting == (0::Int) = (acc,ds)
      | otherwise           = munch ('>':acc) (nesting-1) ds
    munch acc nesting ((_,'<') : ds) = munch ('<':acc) (nesting+1) ds
    munch acc n ((_,x) : ds)         = munch (x:acc) n ds
    munch acc _ []                   = (acc,[])
special _ ((n,_):_) = [TokError (n-1) "invalid element name"]
special _ [] = eofErr
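-- | Split a (possibly prefixed) name off the input; the namespace URI is left
-- as 'nullNs' for later resolution.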
qualName :: LString -> (QName,LString)
qualName xs = (QName { qURI    = nullNs
                     , qPrefix = fmap fromString q
                     , qLName  = LName (fromString n)
                     }, bs)
  where
    (as,bs) = breakn endName xs

    (q,n) = case break (':'==) as of
              (q1,_:n1) -> (Just q1, n1)
              _         -> (Nothing, as)

    endName x = isS x || x == '=' || x == '>' || x == '/'
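-- | Lex a start, empty-element or end tag (after the leading @<@), validating
-- element and attribute names as well as attribute values.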
tag :: LexCont -> LString -> [Token]
tag lcont ((p,'/') : cs)
  | isValidQName n
      = TokEnd p n : case dropSpace ds of
          (_,'>') : es -> lcont es
          (p',_)  : _  -> [TokError p' "expected '>'"]
          []           -> eofErr
  | otherwise = [TokError p "invalid element name"]
  where
    (n,ds) = qualName cs
tag _ [] = eofErr
tag lcont cs@((pos,_):_)
  | not (isValidQName n)                  = [TokError pos "invalid element name"]
  | not (all (isValidQName . attrKey) as) = [TokError pos "invalid attribute name"]
  | not (all (T.all isChar . attrVal) as) = [TokError pos "invalid attribute value"]
  | otherwise                             = TokStart pos n as b : ts
  where
    (n,ds)    = qualName cs
    (as,b,ts) = attribs lcont ds
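-- | Lex the attribute list of a tag; the 'Bool' result is 'True' for a
-- self-closing (empty-element) tag.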
attribs :: LexCont -> LString -> ([Attr], Bool, [Token])
attribs lcont = go
  where
    go cs = case (isS' cs, dropSpace cs) of
      (_,(_,'>') : ds) -> ([], False, lcont ds)
      (_,(_,'/') : ds) -> ([], True, case ds of
          (_,'>') : es -> lcont es
          (pos,_) : _  -> [TokError pos "expected '>'"]
          []           -> eofErr)
      (_,(_,'?') : (_,'>') : ds) -> ([], True, lcont ds)
      (_,[]) -> ([], False, eofErr)
      (True,cs') ->
          let (a,cs1)   = attrib cs'
              (as,b,ts) = go cs1
          in (a:as, b, ts)
      (False,(pos,_):_) -> ([], False, [TokError pos "expected whitespace"])

    isS' ((_,c):_) = isS c
    isS' []        = False
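-- | Lex a single @name='value'@ attribute; a missing value (no @=@ or no
-- opening quote) is represented by a NUL sentinel, which later fails the
-- character validation in 'tag'.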
attrib :: LString -> (Attr,LString)
attrib cs = (Attr ks (fromString (decode_attr vs)), cs2)
  where
    (vs,cs2) = attr_val (dropSpace cs1)
    (ks,cs1) = qualName cs
attr_val :: LString -> (String,LString)
attr_val ((_,'=') : cs0) = string (dropSpace cs0)
  where
    string :: LString -> (String,LString)
    string ((_,'"') : cs)  = break' ('"' ==) cs
    string ((_,'\'') : cs) = break' ('\'' ==) cs
    string cs              = ("\0",cs)
attr_val cs = ("\0",cs)
dropSpace :: LString -> LString
dropSpace = dropWhile (isS . snd)
break' :: (a -> Bool) -> [(b,a)] -> ([a],[(b,a)])
break' p xs =
    let (as,bs) = breakn p xs
    in (as, case bs of
              []     -> []
              _ : cs -> cs)
breakn :: (a -> Bool) -> [(b,a)] -> ([a],[(b,a)])
breakn p l = (map snd as,bs) where (as,bs) = break (p . snd) l
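-- | Attribute-value normalisation: tab and newline become spaces, and invalid
-- references as well as a literal @<@ are marked with a NUL sentinel so the
-- value is rejected by the validation in 'tag'.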
decode_attr :: String -> String
decode_attr cs = concatMap cvt (decode_text cs)
  where
    cvt (TxtBit x)        = norm x
    cvt (CRefBit _ False) = "\0"
    cvt (CRefBit x True)
      | Just c <- cref_to_char x = [c]
      | otherwise                = "\0"

    norm []         = []
    norm ('\x9':xs) = '\x20' : norm xs
    norm ('\xA':xs) = '\x20' : norm xs
    norm ('<':xs)   = '<' : '\0' : norm xs
    norm (x:xs)     = x : norm xs
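-- | A fragment of decoded text: either a literal run or the body of a
-- reference, with the flag recording whether the closing @;@ was present.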
data Txt = TxtBit String
         | CRefBit String Bool
decode_text :: [Char] -> [Txt]
decode_text ('&' : cs) = case break (';' ==) cs of
    (as,_:bs) -> CRefBit as True : decode_text bs
    (as,"")   -> [CRefBit as False]
decode_text [] = []
decode_text cs =
    let (as,bs) = break ('&' ==) cs
    in TxtBit as : decode_text bs
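-- | Resolve the body of a reference: numeric references and the five
-- predefined entities yield a 'Char', a well-formed but unknown entity name
-- yields 'Nothing' (and surfaces as a 'TokCRef'), and anything malformed maps
-- to the NUL sentinel.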
cref_to_char :: [Char] -> Maybe Char
cref_to_char cs = case cs of
    '#' : ds -> maybe (Just '\0') Just (num_esc ds)
    "lt"     -> Just '<'
    "gt"     -> Just '>'
    "amp"    -> Just '&'
    "apos"   -> Just '\''
    "quot"   -> Just '"'
    ""       -> Just '\0'
    (x:xs) | isNameStartChar x, all isNameChar xs -> Nothing
           | otherwise                            -> Just '\0'
num_esc :: String -> Maybe Char
num_esc cs = case cs of
    'x' : ds -> decodeCharRefHex ds
    _        -> decodeCharRefDec cs
cvt_char :: Int -> Maybe Char
cvt_char x
  | fromEnum (minBound :: Char) <= x && x <= fromEnum (maxBound :: Char)
      = Just (toEnum x)
  | otherwise = Nothing
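-- | Crude tokenizer for the pseudo-attributes of an XML declaration: runs of
-- whitespace become @""@, @=@ is its own token, and quoted values are kept
-- with their surrounding quotes (stripped later by @unbrack@).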
simpleTokenize :: String -> [String]
simpleTokenize [] = []
simpleTokenize (c:cs)
  | isSorEQ c = case span isS (c:cs) of
      (_,'=':rest) -> "=" : simpleTokenize (dropWhile isS rest)
      (_:_,rest)   -> ""  : simpleTokenize rest
      ([],_)       -> error "impossible"
  | c == '\'' = case break (== '\'') cs of
      (_,"")       -> [c:cs]
      (str,_:rest) -> (c:str++"'") : simpleTokenize rest
  | c == '"'  = case break (== '"') cs of
      (_,"")       -> [c:cs]
      (str,_:rest) -> (c:str++"\"") : simpleTokenize rest
  | otherwise =
      let (t,rest) = break isSorEQ (c:cs)
      in t : simpleTokenize rest
  where
    isSorEQ x = isS x || x == '='
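-- | A 'QName' is valid when its prefix (if any) and local part are both
-- NCNames.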
isValidQName :: QName -> Bool
isValidQName (QName { qPrefix = Just pfx, qLName = LName ln })
  = isNCName (TS.unpack pfx) && isNCName (TS.unpack ln)
isValidQName (QName { qPrefix = Nothing, qLName = LName ln })
  = isNCName (TS.unpack ln)