{-# LANGUAGE CPP #-}
module Text.XML.HXT.Parser.XmlParsec
( charData
, charData'
, comment
, pI
, cDSect
, document
, document'
, prolog
, xMLDecl
, xMLDecl'
, versionInfo
, misc
, doctypedecl
, markupdecl
, sDDecl
, element
, content
, contentWithTextDecl
, textDecl
, encodingDecl
, xread
, xreadDoc
, parseXmlContent
, parseXmlDocEncodingSpec
, parseXmlDocument
, parseXmlDTDPart
, parseXmlEncodingSpec
, parseXmlEntityEncodingSpec
, parseXmlEntityValueAsAttrValue
, parseXmlEntityValueAsContent
, parseXmlPart
, parseXmlText
, parseNMToken
, parseName
, removeEncodingSpec
)
where
#if MIN_VERSION_base(4,8,2)
#else
import Control.Applicative ((<$>))
#endif
import Text.ParserCombinators.Parsec (between, char, eof,
getInput, getPosition,
many, many1,
notFollowedBy, option,
runParser, sourceName,
string, try, unexpected,
(<?>), (<|>))
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.DOM.ShowXml (xshow)
import Text.XML.HXT.DOM.XmlNode (changeAttrl,
getAttrName, getAttrl,
getChildren, getText,
isRoot, isText,
mergeAttrl, mkAttr',
mkCdata', mkCmt',
mkDTDElem', mkElement',
mkError', mkPi',
mkRoot', mkText')
import Text.XML.HXT.Parser.XmlCharParser (SimpleXParser, XPState,
XParser,
withNormNewline,
withoutNormNewline,
xmlChar)
import qualified Text.XML.HXT.Parser.XmlDTDTokenParser as XD
import qualified Text.XML.HXT.Parser.XmlTokenParser as XT
import Control.FlatSeq
import Data.Char (toLower)
import Data.Maybe
charData :: XParser s XmlTrees
charData
= many (charData' <|> XT.referenceT)
charData' :: XParser s XmlTree
charData'
= do
t <- XT.allBut1 many1 (\ c -> not (c `elem` "<&")) "]]>"
return (mkText' t)
comment :: XParser s XmlTree
comment
= comment'' $ XT.checkString "<!--"
comment' :: XParser s XmlTree
comment'
= comment'' (string "--" >> return ())
comment'' :: XParser s () -> XParser s XmlTree
comment'' op
= ( do
c <- between op (string ("-->")) (XT.allBut many "--")
return (mkCmt' c)
) <?> "comment"
pI :: XParser s XmlTree
pI = pI'' $ XT.checkString "<?"
pI' :: XParser s XmlTree
pI' = pI'' (char '?' >> return ())
pI'' :: XParser s () -> XParser s XmlTree
pI'' op
= between op (string "?>")
( do
n <- pITarget
p <- option "" (XT.sPace
>>
XT.allBut many "?>"
)
return (mkPi' (mkName n) [mkAttr' (mkName a_value) [mkText' p]])
) <?> "processing instruction"
where
pITarget :: XParser s String
pITarget = ( do
n <- XT.name
if map toLower n == t_xml
then unexpected n
else return n
)
cDSect :: XParser s XmlTree
cDSect
= cDSect'' $ XT.checkString "<![CDATA["
cDSect' :: XParser s XmlTree
cDSect'
= cDSect'' (string "[CDATA[" >> return ())
cDSect'' :: XParser s () -> XParser s XmlTree
cDSect'' op
= do
t <- between op (string "]]>") (XT.allBut many "]]>")
return (mkCdata' t)
<?> "CDATA section"
document :: XParser s XmlTree
document
= do
pos <- getPosition
dl <- document'
return (mkRoot' [ mkAttr' (mkName a_source) [mkText' (sourceName pos)]
, mkAttr' (mkName a_status) [mkText' (show c_ok)]
] dl
)
document' :: XParser s XmlTrees
document'
= do
pl <- prolog
el <- element
ml <- many misc
eof
return (pl ++ [el] ++ ml)
prolog :: XParser s XmlTrees
prolog
= do
xml <- option [] xMLDecl'
misc1 <- many misc
dtdPart <- option [] doctypedecl
misc2 <- many misc
return (xml ++ misc1 ++ dtdPart ++ misc2)
xMLDecl :: XParser s XmlTrees
xMLDecl
= between (try $ string "<?xml") (string "?>")
( do
vi <- versionInfo
ed <- option [] encodingDecl
sd <- option [] sDDecl
XT.skipS0
return (vi ++ ed ++ sd)
)
<?> "xml declaration"
xMLDecl' :: XParser s XmlTrees
xMLDecl'
= do
al <- xMLDecl
return [mkPi' (mkName t_xml) al]
xMLDecl'' :: XParser s XmlTree
xMLDecl''
= do
al <- option [] (try xMLDecl)
return (mkRoot' al [])
versionInfo :: XParser s XmlTrees
versionInfo
= ( do
try ( XT.skipS
>>
XT.keyword a_version
>>
return ()
)
XT.eq
vi <- XT.quoted XT.versionNum
return [mkAttr' (mkName a_version) [mkText' vi]]
)
<?> "version info (with quoted version number)"
misc :: XParser s XmlTree
misc
= comment
<|>
pI
<|>
( ( do
ws <- XT.sPace
return (mkText' ws)
) <?> ""
)
doctypedecl :: XParser s XmlTrees
doctypedecl
= between (try $ string "<!DOCTYPE") (char '>')
( do
XT.skipS
n <- XT.name
exId <- option [] ( try ( do
XT.skipS
externalID
)
)
XT.skipS0
markup <- option []
( do
m <- between (char '[' ) (char ']') markupOrDeclSep
XT.skipS0
return m
)
return [mkDTDElem' DOCTYPE ((a_name, n) : exId) markup]
)
markupOrDeclSep :: XParser s XmlTrees
markupOrDeclSep
= ( do
ll <- many ( markupdecl
<|>
declSep
<|>
XT.mkList conditionalSect
)
return (concat ll)
)
declSep :: XParser s XmlTrees
declSep
= XT.mkList XT.peReferenceT
<|>
( do
XT.skipS
return []
)
markupdecl :: XParser s XmlTrees
markupdecl
= XT.mkList
( pI
<|>
comment
<|>
XD.dtdDeclTokenizer
)
sDDecl :: XParser s XmlTrees
sDDecl
= do
try ( XT.skipS
>>
XT.keyword a_standalone
>>
return ()
)
XT.eq
sd <- XT.quoted (XT.keywords [v_yes, v_no])
return [mkAttr' (mkName a_standalone) [mkText' sd]]
element :: XParser s XmlTree
element
= char '<'
>>
element'
element' :: XParser s XmlTree
element'
= ( do
e <- elementStart
rwnf e `seq` elementRest e
) <?> "element"
elementStart :: XParser s (QName, XmlTrees)
elementStart
= do
n <- XT.name
al <- attrList
XT.skipS0
return (mkName n, al)
where
attrList
= option [] ( do
XT.skipS
attrList'
)
attrList'
= option [] ( do
a1 <- attribute
al <- attrList
let n = fromJust . getAttrName $ a1
if n `elem` map (fromJust . getAttrName) al
then unexpected
( "attribute name " ++
show (qualifiedName n) ++
" occurs twice in attribute list"
)
else return (a1 : al)
)
elementRest :: (QName, XmlTrees) -> XParser s XmlTree
elementRest (n, al)
= ( do
XT.checkString "/>"
return $ mkElement' n al []
)
<|>
( do
XT.gt
c <- content
eTag n
return $ mkElement' n al c
)
<?> "proper attribute list followed by \"/>\" or \">\""
eTag :: QName -> XParser s ()
eTag n'
= do
XT.checkString "</" <?> ""
n <- XT.name
XT.skipS0
XT.gt
if n == qualifiedName n'
then return ()
else unexpected ("illegal end tag </" ++ n ++ "> found, </" ++ qualifiedName n' ++ "> expected")
attribute :: XParser s XmlTree
attribute
= do
n <- XT.name
XT.eq
v <- XT.attrValueT
return $ mkAttr' (mkName n) v
content :: XParser s XmlTrees
content
= XT.mergeTextNodes <$>
many
( ( do
try ( XT.lt
>>
notFollowedBy (char '/')
>>
return ()
)
markup
)
<|>
charData'
<|>
XT.referenceT
)
where
markup
= element'
<|>
pI'
<|>
( char '!'
>>
( comment'
<|>
cDSect'
)
)
contentWithTextDecl :: XParser s XmlTrees
contentWithTextDecl
= option [] textDecl
>>
content
conditionalSect :: XParser s XmlTree
conditionalSect
= do
XT.checkString "<!["
cs <- many XD.dtdToken
_ <- char '['
sect <- condSectCont
return (mkDTDElem' CONDSECT [(a_value, sect)] cs)
where
condSectCont :: XParser s String
condSectCont
= ( XT.checkString "]]>"
>>
return ""
)
<|>
( do
XT.checkString "<!["
cs1 <- condSectCont
cs2 <- condSectCont
return ("<![" ++ cs1 ++ "]]>" ++ cs2)
)
<|>
( do
c <- xmlChar
cs <- condSectCont
return (c : cs)
)
externalID :: XParser s Attributes
externalID
= ( do
_ <- XT.keyword k_system
XT.skipS
lit <- XT.systemLiteral
return [(k_system, lit)]
)
<|>
( do
_ <- XT.keyword k_public
XT.skipS
pl <- XT.pubidLiteral
XT.skipS
sl <- XT.systemLiteral
return [ (k_system, sl)
, (k_public, pl) ]
)
<?> "SYSTEM or PUBLIC declaration"
textDecl :: XParser s XmlTrees
textDecl
= between (try $ string "<?xml") (string "?>")
( do
vi <- option [] versionInfo
ed <- encodingDecl
XT.skipS0
return (vi ++ ed)
)
<?> "text declaration"
textDecl'' :: XParser s XmlTree
textDecl''
= do
al <- option [] (try textDecl)
return (mkRoot' al [])
encodingDecl :: XParser s XmlTrees
encodingDecl
= do
try ( XT.skipS
>>
XT.keyword a_encoding
>>
return ()
)
XT.eq
ed <- XT.quoted XT.encName
return [mkAttr' (mkName a_encoding) [mkText' ed]]
xread :: String -> XmlTrees
xread = xread' content
xreadDoc :: String -> XmlTrees
xreadDoc = xread' document'
xread' :: XParser () XmlTrees -> String -> XmlTrees
xread' content' str
= parseXmlFromString parser (withNormNewline ()) loc str
where
loc = "string: " ++ show (if length str > 40 then take 40 str ++ "..." else str)
parser = do
res <- content'
eof
return res
parseXmlContent :: XmlTree -> XmlTrees
parseXmlContent
= xread . xshow . (:[])
parseXmlText :: SimpleXParser XmlTrees -> XPState () -> String -> XmlTree -> XmlTrees
parseXmlText p s0 loc = parseXmlFromString p s0 loc . xshow . (:[])
parseXmlDocument :: String -> String -> XmlTrees
parseXmlDocument = parseXmlFromString document' (withNormNewline ())
parseXmlFromString :: SimpleXParser XmlTrees -> XPState () -> String -> String -> XmlTrees
parseXmlFromString parser s0 loc
= either ((:[]) . mkError' c_err . (++ "\n") . show) id
. runParser parser s0 loc
removeEncodingSpec :: XmlTree -> XmlTrees
removeEncodingSpec t
| isText t
= ( either ((:[]) . mkError' c_err . (++ "\n") . show) ((:[]) . mkText')
. runParser parser (withNormNewline ()) "remove encoding spec"
. fromMaybe ""
. getText
) t
| otherwise
= [t]
where
parser :: XParser s String
parser = option [] textDecl
>>
getInput
parseXmlPart :: SimpleXParser XmlTrees -> String -> String -> XmlTree -> XmlTrees
parseXmlPart parser expected context t
= parseXmlText
( do
res <- parser
eof <?> expected
return res
) (withoutNormNewline ()) context
$ t
parseXmlDTDPart :: String -> XmlTree -> XmlTrees
parseXmlDTDPart
= parseXmlPart markupOrDeclSep "markup declaration"
parseXmlEntityValueAsContent :: String -> XmlTree -> XmlTrees
parseXmlEntityValueAsContent
= parseXmlPart content "general entity value"
parseXmlEntityValueAsAttrValue :: String -> XmlTree -> XmlTrees
parseXmlEntityValueAsAttrValue
= parseXmlPart (XT.attrValueT' "<&") "attribute value"
parseNMToken :: String -> XmlTree -> XmlTrees
parseNMToken
= parseXmlPart (many1 XT.nmtokenT) "nmtoken"
parseName :: String -> XmlTree -> XmlTrees
parseName
= parseXmlPart (many1 XT.nameT) "name"
parseXmlEncodingSpec :: SimpleXParser XmlTree -> XmlTree -> XmlTrees
parseXmlEncodingSpec encDecl x
= (:[]) .
( if isRoot x
then parseEncSpec
else id
) $ x
where
parseEncSpec r
= case ( runParser encDecl (withNormNewline ()) source
. xshow
. getChildren
$ r
) of
Right t
-> changeAttrl (mergeAttrl . fromMaybe [] . getAttrl $ t) r
Left _
-> r
where
source = xshow
. concat
. map getChildren
. filter ((== a_source)
. maybe "" qualifiedName . getAttrName)
. fromMaybe []
. getAttrl $ r
parseXmlEntityEncodingSpec :: XmlTree -> XmlTrees
parseXmlEntityEncodingSpec = parseXmlEncodingSpec textDecl''
parseXmlDocEncodingSpec :: XmlTree -> XmlTrees
parseXmlDocEncodingSpec = parseXmlEncodingSpec xMLDecl''