module Text.XML.HaXml.Lex
(
xmlLex
, xmlReLex
, reLexEntityValue
, Token
, TokenT(..)
, Special(..)
, Section(..)
) where
import Data.Char
import Text.XML.HaXml.Posn
data Where = InTag String | NotInTag
deriving (Eq)
type Token = (Posn, TokenT)
data TokenT =
TokCommentOpen
| TokCommentClose
| TokPIOpen
| TokPIClose
| TokSectionOpen
| TokSectionClose
| TokSection Section
| TokSpecialOpen
| TokSpecial Special
| TokEndOpen
| TokEndClose
| TokAnyOpen
| TokAnyClose
| TokSqOpen
| TokSqClose
| TokEqual
| TokQuery
| TokStar
| TokPlus
| TokAmp
| TokSemi
| TokHash
| TokBraOpen
| TokBraClose
| TokPipe
| TokPercent
| TokComma
| TokQuote
| TokName String
| TokFreeText String
| TokNull
| TokError String
deriving (Eq)
data Special =
DOCTYPEx
| ELEMENTx
| ATTLISTx
| ENTITYx
| NOTATIONx
deriving (Eq,Show)
data Section =
CDATAx
| INCLUDEx
| IGNOREx
deriving (Eq,Show)
instance Show TokenT where
showsPrec _p TokCommentOpen = showString "<!--"
showsPrec _p TokCommentClose = showString "-->"
showsPrec _p TokPIOpen = showString "<?"
showsPrec _p TokPIClose = showString "?>"
showsPrec _p TokSectionOpen = showString "<!["
showsPrec _p TokSectionClose = showString "]]>"
showsPrec p (TokSection s) = showsPrec p s
showsPrec _p TokSpecialOpen = showString "<!"
showsPrec p (TokSpecial s) = showsPrec p s
showsPrec _p TokEndOpen = showString "</"
showsPrec _p TokEndClose = showString "/>"
showsPrec _p TokAnyOpen = showString "<"
showsPrec _p TokAnyClose = showString ">"
showsPrec _p TokSqOpen = showString "["
showsPrec _p TokSqClose = showString "]"
showsPrec _p TokEqual = showString "="
showsPrec _p TokQuery = showString "?"
showsPrec _p TokStar = showString "*"
showsPrec _p TokPlus = showString "+"
showsPrec _p TokAmp = showString "&"
showsPrec _p TokSemi = showString ";"
showsPrec _p TokHash = showString "#"
showsPrec _p TokBraOpen = showString "("
showsPrec _p TokBraClose = showString ")"
showsPrec _p TokPipe = showString "|"
showsPrec _p TokPercent = showString "%"
showsPrec _p TokComma = showString ","
showsPrec _p TokQuote = showString "' or \""
showsPrec _p (TokName s) = showString s
showsPrec _p (TokFreeText s) = showString s
showsPrec _p TokNull = showString "(null)"
showsPrec _p (TokError s) = showString s
emit :: TokenT -> Posn -> Token
emit tok p = forcep p `seq` (p,tok)
lexerror :: String -> Posn -> [Token]
lexerror s p = [(p, TokError ("Lexical error:\n "++s))]
skip :: Int -> Posn -> String -> (Posn->String->[Token]) -> [Token]
skip n p s k = k (addcol n p) (drop n s)
blank :: ([Where]->Posn->String->[Token]) -> [Where]-> Posn-> String-> [Token]
blank _ (InTag t:_) p [] = lexerror ("unexpected EOF within "++t) p
blank _ _ _ [] = []
blank k w p (' ': s) = blank k w (addcol 1 p) s
blank k w p ('\t':s) = blank k w (tab p) s
blank k w p ('\n':s) = blank k w (newline p) s
blank k w p ('\r':s) = blank k w p s
blank k w p ('\xa0': s) = blank k w (addcol 1 p) s
blank k w p s = k w p s
prefixes :: String -> String -> Bool
[] `prefixes` _ = True
(x:xs) `prefixes` (y:ys) = x==y && xs `prefixes` ys
(_:_) `prefixes` [] = False
textUntil, textOrRefUntil
:: [Char] -> TokenT -> [Char] -> Posn -> Posn -> [Char]
-> (Posn->String->[Token]) -> [Token]
textUntil close _tok _acc pos p [] _k =
lexerror ("unexpected EOF while looking for closing token "++close
++"\n to match the opening token in "++show pos) p
textUntil close tok acc pos p (s:ss) k
| close `prefixes` (s:ss) = emit (TokFreeText (reverse acc)) pos:
emit tok p:
skip (length close1) (addcol 1 p) ss k
| tok==TokSemi && length acc >= 8
= emit (TokFreeText "amp") pos:
emit tok pos:
k (addcol 1 pos) (reverse acc++s:ss)
| isSpace s = textUntil close tok (s:acc) pos (white s p) ss k
| otherwise = textUntil close tok (s:acc) pos (addcol 1 p) ss k
textOrRefUntil close _tok _acc pos p [] _k =
lexerror ("unexpected EOF while looking for closing token "++close
++"\n to match the opening token in "++show pos) p
textOrRefUntil close tok acc pos p (s:ss) k
| close `prefixes` (s:ss) = emit (TokFreeText (reverse acc)) pos:
emit tok p:
skip (length close1) (addcol 1 p) ss k
| s=='&' = (if not (null acc)
then (emit (TokFreeText (reverse acc)) pos:)
else id)
(emit TokAmp p:
textUntil ";" TokSemi "" p (addcol 1 p) ss
(\p' i-> textOrRefUntil close tok "" p p' i k))
| isSpace s = textOrRefUntil close tok (s:acc) pos (white s p) ss k
| otherwise = textOrRefUntil close tok (s:acc) pos (addcol 1 p) ss k
xmlLex :: String -> String -> [Token]
xmlLex filename = xmlAny [] (posInNewCxt filename Nothing)
xmlReLex :: Posn -> String -> [Token]
xmlReLex p s
| "INCLUDE" `prefixes` s = emit (TokSection INCLUDEx) p: k 7
| "IGNORE" `prefixes` s = emit (TokSection IGNOREx) p: k 6
| otherwise = blank xmlAny [] p s
where
k n = skip n p s (blank xmlAny [])
reLexEntityValue :: (String->Maybe String) -> Posn -> String -> [Token]
reLexEntityValue lookup p s =
textOrRefUntil "%" TokNull [] p p (expand s++"%") (xmlAny [])
where
expand [] = []
expand ('%':xs) = let (sym,rest) = break (==';') xs in
case lookup sym of
Just val -> expand val ++ expand (tail rest)
Nothing -> "%"++sym++";"++ expand (tail rest)
expand (x:xs) = x: expand xs
xmlPI, xmlPIEnd, xmlComment, xmlAny, xmlTag, xmlSection, xmlSpecial
:: [Where] -> Posn -> String -> [Token]
xmlPI w p s = xmlName p s "name of processor in <? ?>" (blank xmlPIEnd w)
xmlPIEnd w p s = textUntil "?>" TokPIClose "" p p s (blank xmlAny (tail w))
xmlComment w p s = textUntil "-->" TokCommentClose "" p p s (blank xmlAny w)
xmlAny (InTag t:_) p [] = lexerror ("unexpected EOF within "++t) p
xmlAny _ _ [] = []
xmlAny w p s@('<':ss)
| "?" `prefixes` ss = emit TokPIOpen p:
skip 2 p s (xmlPI (InTag "<?...?>":w))
| "!--" `prefixes` ss = emit TokCommentOpen p: skip 4 p s (xmlComment w)
| "![" `prefixes` ss = emit TokSectionOpen p: skip 3 p s (xmlSection w)
| "!" `prefixes` ss = emit TokSpecialOpen p:
skip 2 p s (xmlSpecial (InTag "<!...>":w))
| "/" `prefixes` ss = emit TokEndOpen p:
skip 2 p s (xmlTag (InTag "</...>":tail w))
| otherwise = emit TokAnyOpen p:
skip 1 p s (xmlTag (InTag "<...>":NotInTag:w))
xmlAny (_:_:w) p s@('/':ss)
| ">" `prefixes` ss = emit TokEndClose p: skip 2 p s (xmlAny w)
xmlAny w p ('&':ss) = emit TokAmp p: textUntil ";" TokSemi "" p
(addcol 1 p) ss (xmlAny w)
xmlAny w@(NotInTag:_) p s = xmlContent "" w p p s
xmlAny w p ('>':ss) = emit TokAnyClose p: xmlAny (tail w) (addcol 1 p) ss
xmlAny w p ('[':ss) = emit TokSqOpen p:
blank xmlAny (InTag "[...]":w) (addcol 1 p) ss
xmlAny w p (']':ss)
| "]>" `prefixes` ss =
emit TokSectionClose p: skip 3 p (']':ss) (xmlAny (tail w))
| otherwise = emit TokSqClose p: blank xmlAny (tail w) (addcol 1 p) ss
xmlAny w p ('(':ss) = emit TokBraOpen p:
blank xmlAny (InTag "(...)":w) (addcol 1 p) ss
xmlAny w p (')':ss) = emit TokBraClose p: blank xmlAny (tail w) (addcol 1 p) ss
xmlAny w p ('=':ss) = emit TokEqual p: blank xmlAny w (addcol 1 p) ss
xmlAny w p ('*':ss) = emit TokStar p: blank xmlAny w (addcol 1 p) ss
xmlAny w p ('+':ss) = emit TokPlus p: blank xmlAny w (addcol 1 p) ss
xmlAny w p ('?':ss) = emit TokQuery p: blank xmlAny w (addcol 1 p) ss
xmlAny w p ('|':ss) = emit TokPipe p: blank xmlAny w (addcol 1 p) ss
xmlAny w p ('%':ss) = emit TokPercent p: blank xmlAny w (addcol 1 p) ss
xmlAny w p (';':ss) = emit TokSemi p: blank xmlAny w (addcol 1 p) ss
xmlAny w p (',':ss) = emit TokComma p: blank xmlAny w (addcol 1 p) ss
xmlAny w p ('#':ss) = emit TokHash p: blank xmlAny w (addcol 1 p) ss
xmlAny w p ('"':ss) = emit TokQuote p: textOrRefUntil "\"" TokQuote "" p1
p1 ss (xmlAny w)
where p1 = addcol 1 p
xmlAny w p ('\'':ss) = emit TokQuote p: textOrRefUntil "'" TokQuote "" p1
p1 ss (xmlAny w)
where p1 = addcol 1 p
xmlAny w p s
| isSpace (head s) = blank xmlAny w p s
| isAlphaNum (head s) || (head s)`elem`":_"
= xmlName p s "some kind of name" (blank xmlAny w)
| otherwise = lexerror ("unrecognised token: "++take 4 s) p
xmlTag w p s = xmlName p s "tagname for element in < >" (blank xmlAny w)
xmlSection = blank xmlSection0
where
xmlSection0 w p s
| "CDATA[" `prefixes` s = emit (TokSection CDATAx) p: accum w p s 6
| "INCLUDE" `prefixes` s = emit (TokSection INCLUDEx) p: k w p s 7
| "IGNORE" `prefixes` s = emit (TokSection IGNOREx) p: k w p s 6
| "%" `prefixes` s = emit TokPercent p: k w p s 1
| otherwise = lexerror ("expected CDATA, IGNORE, or INCLUDE, but got "
++take 7 s) p
accum w p s n =
let p0 = addcol n p in
textUntil "]]>" TokSectionClose "" p0 p0 (drop n s) (blank xmlAny w)
k w p s n =
skip n p s (xmlAny (w))
xmlSpecial w p s
| "DOCTYPE" `prefixes` s = emit (TokSpecial DOCTYPEx) p: k 7
| "ELEMENT" `prefixes` s = emit (TokSpecial ELEMENTx) p: k 7
| "ATTLIST" `prefixes` s = emit (TokSpecial ATTLISTx) p: k 7
| "ENTITY" `prefixes` s = emit (TokSpecial ENTITYx) p: k 6
| "NOTATION" `prefixes` s = emit (TokSpecial NOTATIONx) p: k 8
| otherwise = lexerror
("expected DOCTYPE, ELEMENT, ENTITY, ATTLIST, or NOTATION,"
++" but got "++take 7 s) p
where k n = skip n p s (blank xmlAny w)
xmlName :: Posn -> [Char] -> [Char] -> (Posn->[Char]->[Token]) -> [Token]
xmlName p (s:ss) cxt k
| isAlphaNum s || s==':' || s=='_' = gatherName (s:[]) p (addcol 1 p) ss k
| otherwise = lexerror ("expected a "++cxt++", but got char "++show s) p
where
gatherName acc pos p [] k =
emit (TokName (reverse acc)) pos: k p []
gatherName acc pos p (s:ss) k
| isAlphaNum s || s `elem` ".-_:"
= gatherName (s:acc) pos (addcol 1 p) ss k
| otherwise = emit (TokName (reverse acc)) pos: k p (s:ss)
xmlName p [] cxt _ = lexerror ("expected a "++cxt++", but got end of input") p
xmlContent :: [Char] -> [Where] -> Posn -> Posn -> [Char] -> [Token]
xmlContent acc _w _pos p [] = if all isSpace acc then []
else lexerror "unexpected EOF between tags" p
xmlContent acc w pos p (s:ss)
| elem s "<&" =
emit (TokFreeText (reverse acc)) pos: xmlAny w p (s:ss)
| isSpace s = xmlContent (s:acc) w pos (white s p) ss
| otherwise = xmlContent (s:acc) w pos (addcol 1 p) ss