module Text.XML.HXT.Arrow.ProcessDocument
( parseXmlDocument
, parseXmlDocumentWithExpat
, parseHtmlDocument
, validateDocument
, propagateAndValidateNamespaces
, andValidateNamespaces
, getDocumentContents
)
where
import Control.Arrow
import Control.Arrow.ArrowList
import Control.Arrow.ArrowIf
import Control.Arrow.ArrowTree
import Control.Arrow.ListArrow ( fromLA )
import Control.Arrow.NTreeEdit
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.Arrow.XmlArrow
import Text.XML.HXT.Arrow.XmlState
import Text.XML.HXT.Arrow.XmlState.TypeDefs
import Text.XML.HXT.Arrow.ParserInterface ( parseXmlDoc
, parseHtmlDoc
)
import Text.XML.HXT.Arrow.Edit ( transfAllCharRef
, substAllXHTMLEntityRefs
)
import Text.XML.HXT.Arrow.GeneralEntitySubstitution
( processGeneralEntities
)
import Text.XML.HXT.Arrow.DTDProcessing ( processDTD
)
import Text.XML.HXT.Arrow.DocumentInput ( getXmlContents
)
import Text.XML.HXT.Arrow.Namespace ( propagateNamespaces
, validateNamespaces
)
import Text.XML.HXT.DTDValidation.Validation ( validate
, getDTDSubset
, generalEntitiesDefined
, transform
)
parseXmlDocument :: Bool -> Bool -> Bool -> Bool -> IOStateArrow s XmlTree XmlTree
parseXmlDocument validateD substDTD substHTML validateRX
= ( replaceChildren ( ( getAttrValue a_source
&&&
xshow getChildren
)
>>>
parseXmlDoc
>>>
filterErrorMsg
)
>>>
setDocumentStatusFromSystemState "parse XML document"
>>>
( ifA (fromLA getDTDSubset)
( processDTDandEntities
>>>
( if validate'
then validateDocument
else this
)
)
( if validate'
then traceMsg 2 "checkUndefinedEntityRefs: looking for undefined entity refs"
>>>
perform checkUndefinedEntityRefs
>>>
traceMsg 2 "checkUndefinedEntityRefs: looking for undefined entity refs done"
>>>
setDocumentStatusFromSystemState "decoding document"
else this
)
)
)
`when` documentStatusOk
where
validate'
= validateD && not validateRX
processDTDandEntities
= ( if validateD || substDTD
then processDTD
else this
)
>>>
( if substDTD
then ( processGeneralEntities
`when`
fromLA generalEntitiesDefined
)
else if substHTML
then substAllXHTMLEntityRefs
else this
)
>>>
transfAllCharRef
checkUndefinedEntityRefs :: IOStateArrow s XmlTree XmlTree
checkUndefinedEntityRefs
= deep isEntityRef
>>>
getEntityRef
>>>
arr (\ en -> "general entity reference \"&" ++ en ++ ";\" is undefined")
>>>
mkError c_err
>>>
filterErrorMsg
parseXmlDocumentWithExpat :: IOStateArrow s XmlTree XmlTree
parseXmlDocumentWithExpat
= ( withoutUserState $< getSysVar theExpatParser
)
`when` documentStatusOk
parseHtmlDocument :: IOStateArrow s XmlTree XmlTree
parseHtmlDocument
= ( perform ( getAttrValue a_source
>>>
traceValue 1 (("parseHtmlDoc: parse HTML document " ++) . show)
)
>>>
( parseHtml $< getSysVar (theTagSoup .&&&. theExpat) )
>>>
( removeWarnings $< getSysVar (theWarnings .&&&. theTagSoup) )
>>>
setDocumentStatusFromSystemState "parse HTML document"
>>>
traceTree
>>>
traceSource
>>>
perform ( getAttrValue a_source
>>>
traceValue 1 (\ src -> "parse HTML document " ++ show src ++ " finished")
)
)
`when` documentStatusOk
where
parseHtml (withTagSoup', withExpat')
| withExpat' = withoutUserState $< getSysVar theExpatParser
| withTagSoup' = withoutUserState $< getSysVar theTagSoupParser
| otherwise = traceMsg 1 ("parse document with parsec HTML parser")
>>>
replaceChildren
( ( getAttrValue a_source
&&&
xshow getChildren
)
>>>
parseHtmlDoc
)
removeWarnings (warnings, withTagSoup')
| warnings = processTopDownWithAttrl
filterErrorMsg
| withTagSoup' = this
| otherwise = fromLA $
editNTreeA [isError :-> none]
validateDocument :: IOStateArrow s XmlTree XmlTree
validateDocument
= ( traceMsg 1 "validating document"
>>>
perform ( validateDoc
>>>
filterErrorMsg
)
>>>
setDocumentStatusFromSystemState "document validation"
>>>
traceMsg 1 "document validated, transforming doc with respect to DTD"
>>>
transformDoc
>>>
traceMsg 1 "document transformed"
>>>
traceSource
>>>
traceTree
)
`when`
documentStatusOk
propagateAndValidateNamespaces :: IOStateArrow s XmlTree XmlTree
propagateAndValidateNamespaces
= ( traceMsg 1 "propagating namespaces"
>>>
propagateNamespaces
>>>
traceDoc "propagating namespaces done"
>>>
andValidateNamespaces
)
`when`
documentStatusOk
andValidateNamespaces :: IOStateArrow s XmlTree XmlTree
andValidateNamespaces
= ( traceMsg 1 "validating namespaces"
>>>
( setDocumentStatusFromSystemState "namespace propagation"
`when`
( validateNamespaces >>> perform filterErrorMsg )
)
>>>
traceMsg 1 "namespace validation finished"
)
`when`
documentStatusOk
getDocumentContents :: String -> IOStateArrow s b XmlTree
getDocumentContents src
= root [] []
>>>
addAttr a_source src
>>>
traceMsg 1 ("readDocument: start processing document " ++ show src)
>>>
getXmlContents
validateDoc :: ArrowList a => a XmlTree XmlTree
validateDoc = fromLA ( validate
`when`
getDTDSubset
)
transformDoc :: ArrowList a => a XmlTree XmlTree
transformDoc = fromLA transform