{-# LANGUAGE CPP #-}
module Text.XML.HXT.Parser.HtmlParsec
( parseHtmlText
, parseHtmlDocument
, parseHtmlContent
, isEmptyHtmlTag
, isInnerHtmlTagOf
, closesHtmlTag
, emptyHtmlTags
)
where
#if MIN_VERSION_base(4,8,2)
#else
import Control.Applicative ((<$>))
#endif
import Data.Char ( toLower
, toUpper
)
import Data.Char.Properties.XMLCharProps ( isXmlChar
)
import Data.Maybe ( fromMaybe
, fromJust
)
import qualified Data.Map as M
import Text.ParserCombinators.Parsec ( SourcePos
, anyChar
, between
, eof
, getPosition
, many
, many1
, noneOf
, option
, runParser
, satisfy
, string
, try
, (<|>)
)
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.DOM.XmlNode ( mkText'
, mkError'
, mkCdata'
, mkCmt'
, mkCharRef'
, mkElement'
, mkAttr'
, mkDTDElem'
, mkPi'
, isEntityRef
, getEntityRef
)
import Text.XML.HXT.Parser.XmlTokenParser ( allBut
, amp
, dq
, eq
, gt
, lt
, name
, pubidLiteral
, skipS
, skipS0
, sPace
, sq
, systemLiteral
, checkString
, singleCharsT
, referenceT
, mergeTextNodes
)
import Text.XML.HXT.Parser.XmlParsec ( misc
, parseXmlText
, xMLDecl'
)
import Text.XML.HXT.Parser.XmlCharParser ( xmlChar
, SimpleXParser
, withNormNewline
)
import Text.XML.HXT.Parser.XhtmlEntities ( xhtmlEntities
)
parseHtmlText :: String -> XmlTree -> XmlTrees
parseHtmlText loc t = parseXmlText htmlDocument (withNormNewline ()) loc $ t
parseHtmlFromString :: SimpleXParser XmlTrees -> String -> String -> XmlTrees
parseHtmlFromString parser loc
= either ((:[]) . mkError' c_err . (++ "\n") . show) id . runParser parser (withNormNewline ()) loc
parseHtmlDocument :: String -> String -> XmlTrees
parseHtmlDocument = parseHtmlFromString htmlDocument
parseHtmlContent :: String -> XmlTrees
parseHtmlContent = parseHtmlFromString htmlContent "string"
type Context = (XmlTreeFl, OpenTags)
type XmlTreeFl = XmlTrees -> XmlTrees
type OpenTags = [(String, XmlTrees, XmlTreeFl)]
htmlDocument :: SimpleXParser XmlTrees
htmlDocument
= do
pl <- htmlProlog
el <- htmlContent
eof
return (pl ++ el)
htmlProlog :: SimpleXParser XmlTrees
htmlProlog
= do
xml <- option []
( try xMLDecl'
<|>
( do
pos <- getPosition
checkString "<?"
return $ [mkError' c_warn (show pos ++ " wrong XML declaration")]
)
)
misc1 <- many misc
dtdPart <- option []
( try doctypedecl
<|>
( do
pos <- getPosition
upperCaseString "<!DOCTYPE"
return $ [mkError' c_warn (show pos ++ " HTML DOCTYPE declaration ignored")]
)
)
return (xml ++ misc1 ++ dtdPart)
doctypedecl :: SimpleXParser XmlTrees
doctypedecl
= between (upperCaseString "<!DOCTYPE") gt
( do
skipS
n <- name
exId <- ( do
skipS
option [] externalID
)
skipS0
return [mkDTDElem' DOCTYPE ((a_name, n) : exId) []]
)
externalID :: SimpleXParser Attributes
externalID
= do
upperCaseString k_public
skipS
pl <- pubidLiteral
sl <- option "" $ try ( do
skipS
systemLiteral
)
return $ (k_public, pl) : if null sl then [] else [(k_system, sl)]
htmlContent :: SimpleXParser XmlTrees
htmlContent
= mergeTextNodes <$> htmlContent'
htmlContent' :: SimpleXParser XmlTrees
htmlContent'
= option []
( do
context <- hContent (id, [])
pos <- getPosition
return $ closeTags pos context
)
where
closeTags _pos (body, [])
= body []
closeTags pos' (body, ((tn, al, body1) : restOpen))
= closeTags pos'
( addHtmlWarn (show pos' ++ ": no closing tag found for \"<" ++ tn ++ " ...>\"")
.
addHtmlTag tn al body
$
(body1, restOpen)
)
hElement :: Context -> SimpleXParser Context
hElement context
= ( do
t <- hSimpleData
return (addHtmlElem t context)
)
<|>
hCloseTag context
<|>
hOpenTag context
<|>
( do
pos <- getPosition
c <- xmlChar
return ( addHtmlWarn (show pos ++ " markup char " ++ show c ++ " not allowed in this context")
.
addHtmlElem (mkText' [c])
$
context
)
)
<|>
( do
pos <- getPosition
c <- anyChar
return ( addHtmlWarn ( show pos
++ " illegal data in input or illegal XML char "
++ show c
++ " found and ignored, possibly wrong encoding scheme used")
$
context
)
)
hSimpleData :: SimpleXParser XmlTree
hSimpleData
= charData''
<|>
hReference'
<|>
hComment
<|>
hpI
<|>
hcDSect
where
charData''
= do
t <- many1 (satisfy (\ x -> isXmlChar x && not (x == '<' || x == '&')))
return (mkText' t)
hCloseTag :: Context -> SimpleXParser Context
hCloseTag context
= do
checkString "</"
n <- lowerCaseName
skipS0
pos <- getPosition
checkSymbol gt ("closing > in tag \"</" ++ n ++ "\" expected") (closeTag pos n context)
hOpenTag :: Context -> SimpleXParser Context
hOpenTag context
= ( do
e <- hOpenTagStart
hOpenTagRest e context
)
hOpenTagStart :: SimpleXParser ((SourcePos, String), XmlTrees)
hOpenTagStart
= do
np <- try ( do
lt
pos <- getPosition
n <- lowerCaseName
return (pos, n)
)
skipS0
as <- hAttrList
return (np, as)
hOpenTagRest :: ((SourcePos, String), XmlTrees) -> Context -> SimpleXParser Context
hOpenTagRest ((pos, tn), al) context
= ( do
checkString "/>"
return (addHtmlTag tn al id context)
)
<|>
( do
context1 <- checkSymbol gt ("closing > in tag \"<" ++ tn ++ "...\" expected") context
return ( let context2 = closePrevTag pos tn context1
in
( if isEmptyHtmlTag tn
then addHtmlTag tn al id
else openTag tn al
) context2
)
)
hAttrList :: SimpleXParser XmlTrees
hAttrList
= many (try hAttribute)
where
hAttribute
= do
n <- lowerCaseName
v <- hAttrValue
skipS0
return $ mkAttr' (mkName n) v
hAttrValue :: SimpleXParser XmlTrees
hAttrValue
= option []
( eq >> hAttrValue' )
hAttrValue' :: SimpleXParser XmlTrees
hAttrValue'
= try ( between dq dq (hAttrValue'' "&\"") )
<|>
try ( between sq sq (hAttrValue'' "&\'") )
<|>
( do
cs <- many (noneOf " \r\t\n>\"\'")
return [mkText' cs]
)
hAttrValue'' :: String -> SimpleXParser XmlTrees
hAttrValue'' notAllowed
= many ( hReference' <|> singleCharsT notAllowed)
hReference' :: SimpleXParser XmlTree
hReference'
= try hReferenceT
<|>
( do
amp
return (mkText' "&")
)
hReferenceT :: SimpleXParser XmlTree
hReferenceT
= do
r <- referenceT
return ( if isEntityRef r
then substRef r
else r
)
where
substRef r
= case (lookup en xhtmlEntities) of
Just i -> mkCharRef' i
Nothing -> r
where
en = fromJust . getEntityRef $ r
hContent :: Context -> SimpleXParser Context
hContent context
= option context
( hElement context
>>=
hContent
)
hComment :: SimpleXParser XmlTree
hComment
= do
checkString "<!--"
pos <- getPosition
c <- allBut many "-->"
closeCmt pos c
where
closeCmt pos c
= ( do
checkString "-->"
return (mkCmt' c)
)
<|>
( return $
mkError' c_warn (show pos ++ " no closing comment sequence \"-->\" found")
)
hpI :: SimpleXParser XmlTree
hpI = checkString "<?"
>>
( try ( do
n <- name
p <- sPace >> allBut many "?>"
string "?>" >>
return (mkPi' (mkName n) [mkAttr' (mkName a_value) [mkText' p]])
)
<|>
( do
pos <- getPosition
return $
mkError' c_warn (show pos ++ " illegal PI found")
)
)
hcDSect :: SimpleXParser XmlTree
hcDSect
= do
checkString "<![CDATA["
pos <- getPosition
t <- allBut many "]]>"
closeCD pos t
where
closeCD pos t
= ( do
checkString "]]>"
return (mkCdata' t)
)
<|>
( return $
mkError' c_warn (show pos ++ " no closing CDATA sequence \"]]>\" found")
)
checkSymbol :: SimpleXParser () -> String -> Context -> SimpleXParser Context
checkSymbol p msg context
= ( p
>>
return context
)
<|>
( do
pos <- getPosition
return $ addHtmlWarn (show pos ++ " " ++ msg) context
)
lowerCaseName :: SimpleXParser String
lowerCaseName
= do
n <- name
return (map toLower n)
upperCaseString :: String -> SimpleXParser ()
upperCaseString s
= try (sequence (map (\ c -> satisfy (( == c) . toUpper)) s)) >> return ()
addHtmlTag :: String -> XmlTrees -> XmlTreeFl -> Context -> Context
addHtmlTag tn al body context
= e `seq`
addHtmlElem e context
where
e = mkElement' (mkName tn) al (body [])
addHtmlWarn :: String -> Context -> Context
addHtmlWarn msg
= addHtmlElem (mkError' c_warn msg)
addHtmlElem :: XmlTree -> Context -> Context
addHtmlElem elem' (body, openTags)
= (body . (elem' :), openTags)
openTag :: String -> XmlTrees -> Context -> Context
openTag tn al (body, openTags)
= (id, (tn, al, body) : openTags)
closeTag :: SourcePos -> String -> Context -> Context
closeTag pos n context
| n `elem` (map ( \ (n1, _, _) -> n1) $ snd context)
= closeTag' n context
| otherwise
= addHtmlWarn (show pos ++ " no opening tag found for </" ++ n ++ ">")
.
addHtmlTag n [] id
$
context
where
closeTag' n' (body', (n1, al1, body1) : restOpen)
= close context1
where
context1
= addHtmlTag n1 al1 body' (body1, restOpen)
close
| n' == n1
= id
| n1 `isInnerHtmlTagOf` n'
= closeTag pos n'
| otherwise
= addHtmlWarn (show pos ++ " no closing tag found for \"<" ++ n1 ++ " ...>\"")
.
closeTag' n'
closeTag' _ _
= error "illegal argument for closeTag'"
closePrevTag :: SourcePos -> String -> Context -> Context
closePrevTag _pos _n context@(_body, [])
= context
closePrevTag pos n context@(body, (n1, al1, body1) : restOpen)
| n `closesHtmlTag` n1
= closePrevTag pos n
( addHtmlWarn (show pos ++ " tag \"<" ++ n1 ++ " ...>\" implicitly closed by opening tag \"<" ++ n ++ " ...>\"")
.
addHtmlTag n1 al1 body
$
(body1, restOpen)
)
| otherwise
= context
isEmptyHtmlTag :: String -> Bool
isEmptyHtmlTag n
= n `elem`
emptyHtmlTags
emptyHtmlTags :: [String]
emptyHtmlTags
= [ "area"
, "base"
, "br"
, "col"
, "frame"
, "hr"
, "img"
, "input"
, "link"
, "meta"
, "param"
]
{-# INLINE emptyHtmlTags #-}
isInnerHtmlTagOf :: String -> String -> Bool
n `isInnerHtmlTagOf` tn
= n `elem`
( fromMaybe [] . lookup tn
$ [ ("body", ["p"])
, ("caption", ["p"])
, ("dd", ["p"])
, ("div", ["p"])
, ("dl", ["dt","dd"])
, ("dt", ["p"])
, ("li", ["p"])
, ("map", ["p"])
, ("object", ["p"])
, ("ol", ["li"])
, ("table", ["th","tr","td","thead","tfoot","tbody"])
, ("tbody", ["th","tr","td"])
, ("td", ["p"])
, ("tfoot", ["th","tr","td"])
, ("th", ["p"])
, ("thead", ["th","tr","td"])
, ("tr", ["th","td"])
, ("ul", ["li"])
]
)
closesHtmlTag :: String -> String -> Bool
closesHtmlTag t t2
= fromMaybe False . fmap ($ t) . M.lookup t2 $ closedByTable
{-# INLINE closesHtmlTag #-}
closedByTable :: M.Map String (String -> Bool)
closedByTable
= M.fromList $
[ ("a", (== "a"))
, ("li", (== "li" ))
, ("th", (`elem` ["th", "td", "tr"] ))
, ("td", (`elem` ["th", "td", "tr"] ))
, ("tr", (== "tr"))
, ("dt", (`elem` ["dt", "dd"] ))
, ("dd", (`elem` ["dt", "dd"] ))
, ("p", (`elem` ["hr"
, "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] ))
, ("colgroup", (`elem` ["colgroup", "thead", "tfoot", "tbody"] ))
, ("form", (`elem` ["form"] ))
, ("label", (`elem` ["label"] ))
, ("map", (`elem` ["map"] ))
, ("option", const True)
, ("script", const True)
, ("style", const True)
, ("textarea", const True)
, ("title", const True)
, ("select", ( /= "option"))
, ("thead", (`elem` ["tfoot","tbody"] ))
, ("tbody", (== "tbody" ))
, ("tfoot", (== "tbody" ))
, ("h1", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] ))
, ("h2", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] ))
, ("h3", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] ))
, ("h4", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] ))
, ("h5", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] ))
, ("h6", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] ))
]