module Text.XML.HaXml.Html.Parse
( htmlParse
, htmlParse'
) where
import Prelude hiding (either,maybe,sequence)
import qualified Prelude (either)
import Data.Maybe hiding (maybe)
import Data.Char (toLower, isDigit, isHexDigit)
import Numeric (readDec,readHex)
import Control.Monad
import Text.XML.HaXml.Types
import Text.XML.HaXml.Namespaces
import Text.XML.HaXml.Lex
import Text.XML.HaXml.Posn
import Text.ParserCombinators.Poly.Plain
#if defined(DEBUG)
# if ( defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ > 502 ) || \
( defined(__NHC__) && __NHC__ > 114 ) || defined(__HUGS__)
import Debug.Trace(trace)
# elif defined(__GLASGOW_HASKELL__)
import IOExts(trace)
# elif defined(__NHC__) || defined(__HBC__)
import NonStdTrace
# endif
debug :: Monad m => String -> m ()
debug s = trace s (return ())
#else
debug :: Monad m => String -> m ()
debug _ = return ()
#endif
htmlParse :: String -> String -> Document Posn
htmlParse file = Prelude.either error id . htmlParse' file
htmlParse' :: String -> String -> Either String (Document Posn)
htmlParse' file = Prelude.either Left (Right . simplify) . fst
. runParser document . xmlLex file
simplify :: Document i -> Document i
simplify (Document p st (Elem n avs cs) ms) =
Document p st (Elem n avs (deepfilter simp cs)) ms
where
simp (CElem (Elem (N "null") [] []) _) = False
simp (CElem (Elem t _ []) _)
| localName t `elem` ["font","p","i","b","em","tt","big","small"]
= False
simp _ = True
deepfilter f =
filter f . map (\c-> case c of
CElem (Elem t avs cs) i
-> CElem (Elem t avs (deepfilter f cs)) i
_ -> c)
selfclosingtags :: [String]
selfclosingtags = ["img","hr","br","meta","col","link","base"
,"param","area","frame","input"]
closeInnerTags :: [(String,[String])]
closeInnerTags =
[ ("ul", ["li"])
, ("ol", ["li"])
, ("dl", ["dt","dd"])
, ("tr", ["th","td"])
, ("div", ["p"])
, ("thead", ["th","tr","td"])
, ("tfoot", ["th","tr","td"])
, ("tbody", ["th","tr","td"])
, ("table", ["th","tr","td","thead","tfoot","tbody"])
, ("caption", ["p"])
, ("th", ["p"])
, ("td", ["p"])
, ("li", ["p"])
, ("dt", ["p"])
, ("dd", ["p"])
, ("object", ["p"])
, ("map", ["p"])
, ("body", ["p"])
]
closes :: Name -> Name -> Bool
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"td" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"dd" `closes` t | t `elem` ["dt","dd"] = True
"form" `closes` "form" = True
"label" `closes` "label" = True
_ `closes` "option" = True
"thead" `closes` t | t `elem` ["colgroup"] = True
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
"colgroup" `closes` "colgroup" = True
t `closes` "p"
| t `elem` ["p","h1","h2","h3","h4","h5","h6"
,"hr","div","ul","dl","ol","table"] = True
_ `closes` _ = False
type HParser a = Parser (Posn,TokenT) a
tok :: TokenT -> HParser TokenT
tok t = do (p,t') <- next
case t' of TokError _ -> report failBad (show t) p t'
_ | t'==t -> return t
| otherwise -> report fail (show t) p t'
qname :: HParser QName
qname = fmap N name
name :: HParser Name
name = do (p,tok) <- next
case tok of
TokName s -> return s
TokError _ -> report failBad "a name" p tok
_ -> report fail "a name" p tok
string, freetext :: HParser String
string = do (p,t) <- next
case t of TokName s -> return s
_ -> report fail "text" p t
freetext = do (p,t) <- next
case t of TokFreeText s -> return s
_ -> report fail "text" p t
maybe :: HParser a -> HParser (Maybe a)
maybe p =
( p >>= return . Just) `onFail`
( return Nothing)
either :: HParser a -> HParser b -> HParser (Either a b)
either p q =
( p >>= return . Left) `onFail`
( q >>= return . Right)
word :: String -> HParser ()
word s = do { x <- next
; case x of
(_p,TokName n) | s==n -> return ()
(_p,TokFreeText n) | s==n -> return ()
( p,t@(TokError _)) -> report failBad (show s) p t
( p,t) -> report fail (show s) p t
}
posn :: HParser Posn
posn = do { x@(p,_) <- next
; reparse [x]
; return p
} `onFail` return noPos
nmtoken :: HParser NmToken
nmtoken = (string `onFail` freetext)
failP, failBadP :: String -> HParser a
failP msg = do { p <- posn; fail (msg++"\n at "++show p) }
failBadP msg = do { p <- posn; failBad (msg++"\n at "++show p) }
report :: (String->HParser a) -> String -> Posn -> TokenT -> HParser a
report fail expect p t = fail ("Expected "++show expect++" but found "++show t
++"\n at "++show p)
adjustErrP :: HParser a -> (String->String) -> HParser a
p `adjustErrP` f = p `onFail` do pn <- posn
(p `adjustErr` f) `adjustErr` (++show pn)
document :: HParser (Document Posn)
document = do
p <- prolog `adjustErr` ("unrecognisable XML prolog\n"++)
ht <- many1 (element "HTML document")
ms <- many misc
return (Document p emptyST (case map snd ht of
[e] -> e
es -> Elem (N "html") [] (map mkCElem es))
ms)
where mkCElem e = CElem e noPos
comment :: HParser Comment
comment = do
bracket (tok TokCommentOpen) (tok TokCommentClose) freetext
processinginstruction :: HParser ProcessingInstruction
processinginstruction = do
tok TokPIOpen
commit $ do
n <- string `onFail` failP "processing instruction has no target"
f <- freetext
(tok TokPIClose `onFail` tok TokAnyClose) `onFail` failP "missing ?> or >"
return (n, f)
cdsect :: HParser CDSect
cdsect = do
tok TokSectionOpen
bracket (tok (TokSection CDATAx)) (commit $ tok TokSectionClose) chardata
prolog :: HParser Prolog
prolog = do
x <- maybe xmldecl
m1 <- many misc
dtd <- maybe doctypedecl
m2 <- many misc
return (Prolog x m1 dtd m2)
xmldecl :: HParser XMLDecl
xmldecl = do
tok TokPIOpen
(word "xml" `onFail` word "XML")
p <- posn
s <- freetext
tok TokPIClose `onFail` failBadP "missing ?> in <?xml ...?>"
(Prelude.either failP return . fst . runParser aux . xmlReLex p) s
where
aux = do
v <- versioninfo `onFail` failP "missing XML version info"
e <- maybe encodingdecl
s <- maybe sddecl
return (XMLDecl v e s)
versioninfo :: HParser VersionInfo
versioninfo = do
(word "version" `onFail` word "VERSION")
tok TokEqual
bracket (tok TokQuote) (commit $ tok TokQuote) freetext
misc :: HParser Misc
misc =
oneOf' [ ("<!--comment-->", comment >>= return . Comment)
, ("<?PI?>", processinginstruction >>= return . PI)
]
doctypedecl :: HParser DocTypeDecl
doctypedecl = do
tok TokSpecialOpen
tok (TokSpecial DOCTYPEx)
commit $ do
n <- qname
eid <- maybe externalid
tok TokAnyClose `onFail` failP "missing > in DOCTYPE decl"
return (DTD n eid [])
sddecl :: HParser SDDecl
sddecl = do
(word "standalone" `onFail` word "STANDALONE")
commit $ do
tok TokEqual `onFail` failP "missing = in 'standalone' decl"
bracket (tok TokQuote) (commit $ tok TokQuote)
( (word "yes" >> return True) `onFail`
(word "no" >> return False) `onFail`
failP "'standalone' decl requires 'yes' or 'no' value" )
type Stack = [(Name,[Attribute])]
element :: Name -> HParser (Stack,Element Posn)
element ctx =
do
tok TokAnyOpen
(ElemTag (N e) avs) <- elemtag
( if e `closes` ctx then
( do debug ("/")
unparse ([TokEndOpen, TokName ctx, TokAnyClose,
TokAnyOpen, TokName e] ++ reformatAttrs avs)
return ([], Elem (N "null") [] []))
else if e `elem` selfclosingtags then
( do tok TokEndClose
debug (e++"[+]")
return ([], Elem (N e) avs [])) `onFail`
( do tok TokAnyClose
debug (e++"[+]")
return ([], Elem (N e) avs []))
else
(( do tok TokEndClose
debug (e++"[]")
return ([], Elem (N e) avs [])) `onFail`
( do tok TokAnyClose `onFail` failP "missing > or /> in element tag"
debug (e++"[")
zz <- manyFinally (content e)
(tok TokEndOpen)
(N n) <- qname
commit (tok TokAnyClose)
debug "]"
let (ss,cs) = unzip zz
let s = if null ss then [] else last ss
( if e == (map toLower n :: Name) then
do unparse (reformatTags (closeInner e s))
debug "^"
return ([], Elem (N e) avs cs)
else
do unparse [TokEndOpen, TokName n, TokAnyClose]
debug "-"
return (((e,avs):s), Elem (N e) avs cs))
) `onFail` failP ("failed to repair non-matching tags in context: "++ctx)))
closeInner :: Name -> [(Name,[Attribute])] -> [(Name,[Attribute])]
closeInner c ts =
case lookup c closeInnerTags of
(Just these) -> filter ((`notElem` these).fst) ts
Nothing -> ts
unparse :: [TokenT] -> Parser (Posn, TokenT) ()
unparse ts = do p <- posn
reparse (zip (repeat p) ts)
reformatAttrs :: [(QName, AttValue)] -> [TokenT]
reformatAttrs avs = concatMap f0 avs
where f0 (a, v@(AttValue _)) = [ TokName (printableName a), TokEqual
, TokQuote, TokFreeText (show v), TokQuote ]
reformatTags :: [(String, [(QName, AttValue)])] -> [TokenT]
reformatTags ts = concatMap f0 ts
where f0 (t,avs) = [TokAnyOpen, TokName t]++reformatAttrs avs++[TokAnyClose]
content :: Name -> HParser (Stack,Content Posn)
content ctx = do { p <- posn ; content' p }
where content' p = oneOf'
[ ( "element", element ctx >>= \(s,e)-> return (s, CElem e p))
, ( "chardata", chardata >>= \s-> return ([], CString False s p))
, ( "reference", reference >>= \r-> return ([], CRef r p))
, ( "cdsect", cdsect >>= \c-> return ([], CString True c p))
, ( "misc", misc >>= \m-> return ([], CMisc m p))
] `adjustErrP` ("when looking for a content item,\n"++)
elemtag :: HParser ElemTag
elemtag = do
(N n) <- qname `adjustErrBad` ("malformed element tag\n"++)
as <- many attribute
return (ElemTag (N $ map toLower n) as)
attribute :: HParser Attribute
attribute = do
(N n) <- qname
v <- (do tok TokEqual
attvalue) `onFail`
(return (AttValue [Left "TRUE"]))
return (N $ map toLower n, v)
reference :: HParser Reference
reference = do
bracket (tok TokAmp) (tok TokSemi) (freetext >>= val)
where
val ('#':'x':i) | all isHexDigit i
= return . RefChar . fst . head . readHex $ i
val ('#':i) | all isDigit i
= return . RefChar . fst . head . readDec $ i
val ent = return . RefEntity $ ent
externalid :: HParser ExternalID
externalid =
( do word "SYSTEM"
s <- systemliteral
return (SYSTEM s)) `onFail`
( do word "PUBLIC"
p <- pubidliteral
s <- (systemliteral `onFail` return (SystemLiteral ""))
return (PUBLIC p s))
encodingdecl :: HParser EncodingDecl
encodingdecl = do
(word "encoding" `onFail` word "ENCODING")
tok TokEqual `onFail` failBadP "expected = in 'encoding' decl"
f <- bracket (tok TokQuote) (commit $ tok TokQuote) freetext
return (EncodingDecl f)
attvalue :: HParser AttValue
attvalue =
( do avs <- bracket (tok TokQuote) (commit $ tok TokQuote)
(many (either freetext reference))
return (AttValue avs) ) `onFail`
( do v <- nmtoken
s <- (tok TokPercent >> return "%") `onFail` return ""
return (AttValue [Left (v++s)]) ) `onFail`
( do s <- oneOf [ tok TokPlus >> return "+"
, tok TokHash >> return "#"
]
v <- nmtoken
return (AttValue [Left (s++v)]) ) `onFail`
failP "Badly formatted attribute value"
systemliteral :: HParser SystemLiteral
systemliteral = do
s <- bracket (tok TokQuote) (commit $ tok TokQuote) freetext
return (SystemLiteral s)
pubidliteral :: HParser PubidLiteral
pubidliteral = do
s <- bracket (tok TokQuote) (commit $ tok TokQuote) freetext
return (PubidLiteral s)
chardata :: HParser CharData
chardata = freetext