module Text.XML.HXT.Parser.XmlParsec
( charData
, charData'
, comment
, pI
, cDSect
, document
, document'
, prolog
, xMLDecl
, xMLDecl'
, versionInfo
, misc
, doctypedecl
, markupdecl
, sDDecl
, element
, content
, contentWithTextDecl
, textDecl
, encodingDecl
, xread
, parseXmlContent
, parseXmlDocEncodingSpec
, parseXmlDocument
, parseXmlDTDPart
, parseXmlEncodingSpec
, parseXmlEntityEncodingSpec
, parseXmlEntityValueAsAttrValue
, parseXmlEntityValueAsContent
, parseXmlPart
, parseXmlText
, parseNMToken
, parseName
, removeEncodingSpec
)
where
import Control.Applicative ( (<$>) )
import Text.ParserCombinators.Parsec ( runParser
, (<?>), (<|>)
, char
, string
, eof
, between
, many
, many1
, notFollowedBy
, option
, try
, unexpected
, getPosition
, getInput
, sourceName
)
import Text.XML.HXT.DOM.ShowXml ( xshow
)
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.DOM.XmlNode ( mkElement'
, mkAttr'
, mkRoot'
, mkDTDElem'
, mkText'
, mkCmt'
, mkCdata'
, mkError'
, mkPi'
, isText
, isRoot
, getText
, getChildren
, getAttrl
, getAttrName
, changeAttrl
, mergeAttrl
)
import Text.XML.HXT.Parser.XmlCharParser ( xmlChar
, XParser
, SimpleXParser
, XPState
, withNormNewline
, withoutNormNewline
)
import qualified Text.XML.HXT.Parser.XmlTokenParser as XT
import qualified Text.XML.HXT.Parser.XmlDTDTokenParser as XD
import Control.FlatSeq
import Data.Char (toLower)
import Data.Maybe
charData :: XParser s XmlTrees
charData
= many (charData' <|> XT.referenceT)
charData' :: XParser s XmlTree
charData'
= do
t <- XT.allBut1 many1 (\ c -> not (c `elem` "<&")) "]]>"
return (mkText' t)
comment :: XParser s XmlTree
comment
= comment'' $ XT.checkString "<!--"
comment' :: XParser s XmlTree
comment'
= comment'' (string "--" >> return ())
comment'' :: XParser s () -> XParser s XmlTree
comment'' op
= ( do
c <- between op (string ("-->")) (XT.allBut many "--")
return (mkCmt' c)
) <?> "comment"
pI :: XParser s XmlTree
pI = pI'' $ XT.checkString "<?"
pI' :: XParser s XmlTree
pI' = pI'' (char '?' >> return ())
pI'' :: XParser s () -> XParser s XmlTree
pI'' op
= between op (string "?>")
( do
n <- pITarget
p <- option "" (XT.sPace
>>
XT.allBut many "?>"
)
return (mkPi' (mkName n) [mkAttr' (mkName a_value) [mkText' p]])
) <?> "processing instruction"
where
pITarget :: XParser s String
pITarget = ( do
n <- XT.name
if map toLower n == t_xml
then unexpected n
else return n
)
cDSect :: XParser s XmlTree
cDSect
= cDSect'' $ XT.checkString "<![CDATA["
cDSect' :: XParser s XmlTree
cDSect'
= cDSect'' (string "[CDATA[" >> return ())
cDSect'' :: XParser s () -> XParser s XmlTree
cDSect'' op
= do
t <- between op (string "]]>") (XT.allBut many "]]>")
return (mkCdata' t)
<?> "CDATA section"
document :: XParser s XmlTree
document
= do
pos <- getPosition
dl <- document'
return (mkRoot' [ mkAttr' (mkName a_source) [mkText' (sourceName pos)]
, mkAttr' (mkName a_status) [mkText' (show c_ok)]
] dl
)
document' :: XParser s XmlTrees
document'
= do
pl <- prolog
el <- element
ml <- many misc
eof
return (pl ++ [el] ++ ml)
prolog :: XParser s XmlTrees
prolog
= do
xml <- option [] xMLDecl'
misc1 <- many misc
dtdPart <- option [] doctypedecl
misc2 <- many misc
return (xml ++ misc1 ++ dtdPart ++ misc2)
xMLDecl :: XParser s XmlTrees
xMLDecl
= between (try $ string "<?xml") (string "?>")
( do
vi <- versionInfo
ed <- option [] encodingDecl
sd <- option [] sDDecl
XT.skipS0
return (vi ++ ed ++ sd)
)
<?> "xml declaration"
xMLDecl' :: XParser s XmlTrees
xMLDecl'
= do
al <- xMLDecl
return [mkPi' (mkName t_xml) al]
xMLDecl'' :: XParser s XmlTree
xMLDecl''
= do
al <- option [] (try xMLDecl)
return (mkRoot' al [])
versionInfo :: XParser s XmlTrees
versionInfo
= ( do
try ( XT.skipS
>>
XT.keyword a_version
>>
return ()
)
XT.eq
vi <- XT.quoted XT.versionNum
return [mkAttr' (mkName a_version) [mkText' vi]]
)
<?> "version info (with quoted version number)"
misc :: XParser s XmlTree
misc
= comment
<|>
pI
<|>
( ( do
ws <- XT.sPace
return (mkText' ws)
) <?> ""
)
doctypedecl :: XParser s XmlTrees
doctypedecl
= between (try $ string "<!DOCTYPE") (char '>')
( do
XT.skipS
n <- XT.name
exId <- option [] ( try ( do
XT.skipS
externalID
)
)
XT.skipS0
markup <- option []
( do
m <- between (char '[' ) (char ']') markupOrDeclSep
XT.skipS0
return m
)
return [mkDTDElem' DOCTYPE ((a_name, n) : exId) markup]
)
markupOrDeclSep :: XParser s XmlTrees
markupOrDeclSep
= ( do
ll <- many ( markupdecl
<|>
declSep
<|>
XT.mkList conditionalSect
)
return (concat ll)
)
declSep :: XParser s XmlTrees
declSep
= XT.mkList XT.peReferenceT
<|>
( do
XT.skipS
return []
)
markupdecl :: XParser s XmlTrees
markupdecl
= XT.mkList
( pI
<|>
comment
<|>
XD.dtdDeclTokenizer
)
sDDecl :: XParser s XmlTrees
sDDecl
= do
try ( XT.skipS
>>
XT.keyword a_standalone
>>
return ()
)
XT.eq
sd <- XT.quoted (XT.keywords [v_yes, v_no])
return [mkAttr' (mkName a_standalone) [mkText' sd]]
element :: XParser s XmlTree
element
= char '<'
>>
element'
element' :: XParser s XmlTree
element'
= ( do
e <- elementStart
rwnf e `seq` elementRest e
) <?> "element"
elementStart :: XParser s (QName, XmlTrees)
elementStart
= do
n <- XT.name
al <- attrList
XT.skipS0
return (mkName n, al)
where
attrList
= option [] ( do
XT.skipS
attrList'
)
attrList'
= option [] ( do
a1 <- attribute
al <- attrList
let n = fromJust . getAttrName $ a1
if n `elem` map (fromJust . getAttrName) al
then unexpected
( "attribute name " ++
show (qualifiedName n) ++
" occurs twice in attribute list"
)
else return (a1 : al)
)
elementRest :: (QName, XmlTrees) -> XParser s XmlTree
elementRest (n, al)
= ( do
XT.checkString "/>"
return $ mkElement' n al []
)
<|>
( do
XT.gt
c <- content
eTag n
return $ mkElement' n al c
)
<?> "proper attribute list followed by \"/>\" or \">\""
eTag :: QName -> XParser s ()
eTag n'
= do
XT.checkString "</" <?> ""
n <- XT.name
XT.skipS0
XT.gt
if n == qualifiedName n'
then return ()
else unexpected ("illegal end tag </" ++ n ++ "> found, </" ++ qualifiedName n' ++ "> expected")
attribute :: XParser s XmlTree
attribute
= do
n <- XT.name
XT.eq
v <- XT.attrValueT
return $ mkAttr' (mkName n) v
content :: XParser s XmlTrees
content
= XT.mergeTextNodes <$>
many
( ( do
try ( XT.lt
>>
notFollowedBy (char '/')
>>
return ()
)
markup
)
<|>
charData'
<|>
XT.referenceT
)
where
markup
= element'
<|>
pI'
<|>
( char '!'
>>
( comment'
<|>
cDSect'
)
)
contentWithTextDecl :: XParser s XmlTrees
contentWithTextDecl
= option [] textDecl
>>
content
conditionalSect :: XParser s XmlTree
conditionalSect
= do
XT.checkString "<!["
cs <- many XD.dtdToken
_ <- char '['
sect <- condSectCont
return (mkDTDElem' CONDSECT [(a_value, sect)] cs)
where
condSectCont :: XParser s String
condSectCont
= ( XT.checkString "]]>"
>>
return ""
)
<|>
( do
XT.checkString "<!["
cs1 <- condSectCont
cs2 <- condSectCont
return ("<![" ++ cs1 ++ "]]>" ++ cs2)
)
<|>
( do
c <- xmlChar
cs <- condSectCont
return (c : cs)
)
externalID :: XParser s Attributes
externalID
= ( do
_ <- XT.keyword k_system
XT.skipS
lit <- XT.systemLiteral
return [(k_system, lit)]
)
<|>
( do
_ <- XT.keyword k_public
XT.skipS
pl <- XT.pubidLiteral
XT.skipS
sl <- XT.systemLiteral
return [ (k_system, sl)
, (k_public, pl) ]
)
<?> "SYSTEM or PUBLIC declaration"
textDecl :: XParser s XmlTrees
textDecl
= between (try $ string "<?xml") (string "?>")
( do
vi <- option [] versionInfo
ed <- encodingDecl
XT.skipS0
return (vi ++ ed)
)
<?> "text declaration"
textDecl'' :: XParser s XmlTree
textDecl''
= do
al <- option [] (try textDecl)
return (mkRoot' al [])
encodingDecl :: XParser s XmlTrees
encodingDecl
= do
try ( XT.skipS
>>
XT.keyword a_encoding
>>
return ()
)
XT.eq
ed <- XT.quoted XT.encName
return [mkAttr' (mkName a_encoding) [mkText' ed]]
xread :: String -> XmlTrees
xread str
= parseXmlFromString parser (withNormNewline ()) loc str
where
loc = "string: " ++ show (if length str > 40 then take 40 str ++ "..." else str)
parser = do
res <- content
eof
return res
parseXmlContent :: XmlTree -> XmlTrees
parseXmlContent
= xread . xshow . (:[])
parseXmlText :: SimpleXParser XmlTrees -> XPState () -> String -> XmlTree -> XmlTrees
parseXmlText p s0 loc = parseXmlFromString p s0 loc . xshow . (:[])
parseXmlDocument :: String -> String -> XmlTrees
parseXmlDocument = parseXmlFromString document' (withNormNewline ())
parseXmlFromString :: SimpleXParser XmlTrees -> XPState () -> String -> String -> XmlTrees
parseXmlFromString parser s0 loc
= either ((:[]) . mkError' c_err . (++ "\n") . show) id
. runParser parser s0 loc
removeEncodingSpec :: XmlTree -> XmlTrees
removeEncodingSpec t
| isText t
= ( either ((:[]) . mkError' c_err . (++ "\n") . show) ((:[]) . mkText')
. runParser parser (withNormNewline ()) "remove encoding spec"
. fromMaybe ""
. getText
) t
| otherwise
= [t]
where
parser :: XParser s String
parser = option [] textDecl
>>
getInput
parseXmlPart :: SimpleXParser XmlTrees -> String -> String -> XmlTree -> XmlTrees
parseXmlPart parser expected context t
= parseXmlText
( do
res <- parser
eof <?> expected
return res
) (withoutNormNewline ()) context
$ t
parseXmlDTDPart :: String -> XmlTree -> XmlTrees
parseXmlDTDPart
= parseXmlPart markupOrDeclSep "markup declaration"
parseXmlEntityValueAsContent :: String -> XmlTree -> XmlTrees
parseXmlEntityValueAsContent
= parseXmlPart content "general entity value"
parseXmlEntityValueAsAttrValue :: String -> XmlTree -> XmlTrees
parseXmlEntityValueAsAttrValue
= parseXmlPart (XT.attrValueT' "<&") "attribute value"
parseNMToken :: String -> XmlTree -> XmlTrees
parseNMToken
= parseXmlPart (many1 XT.nmtokenT) "nmtoken"
parseName :: String -> XmlTree -> XmlTrees
parseName
= parseXmlPart (many1 XT.nameT) "name"
parseXmlEncodingSpec :: SimpleXParser XmlTree -> XmlTree -> XmlTrees
parseXmlEncodingSpec encDecl x
= (:[]) .
( if isRoot x
then parseEncSpec
else id
) $ x
where
parseEncSpec r
= case ( runParser encDecl (withNormNewline ()) source
. xshow
. getChildren
$ r
) of
Right t
-> changeAttrl (mergeAttrl . fromMaybe [] . getAttrl $ t) r
Left _
-> r
where
source = xshow
. concat
. map getChildren
. filter ((== a_source)
. maybe "" qualifiedName . getAttrName)
. fromMaybe []
. getAttrl $ r
parseXmlEntityEncodingSpec :: XmlTree -> XmlTrees
parseXmlEntityEncodingSpec = parseXmlEncodingSpec textDecl''
parseXmlDocEncodingSpec :: XmlTree -> XmlTrees
parseXmlDocEncodingSpec = parseXmlEncodingSpec xMLDecl''