module HtmlParser2 (parseHtml,parseHtmlText,parseHtmlBlocks) where
import Html
import HtmlTags
import HtmlLex
import ParsOps3
import HtmlParsOps
import Utils2(isSpace')
-- | 'parseHtml' is an HTML parser. It normally returns @Right html@.
-- The parser is built using error-correcting parsing combinators, so it should
-- not fail, but if it does it returns @Left errormessage@.
--
-- 'parseHtml' parses /complete/ HTML documents. The structure of the
-- 'Html' value follows the structure described in the HTML 4.0 standard.
parseHtml :: String -> Either ([String],String) Html
parseHtml = parse' htmlDocument
parseHtmlText :: String -> Either ([String],String) Html
parseHtmlText = parse' optText
-- ^ Parse a text level HTML fragment
parseHtmlBlocks :: String -> Either ([String],String) Html
parseHtmlBlocks = parse' blocks
-- ^ Parse a block level HTML fragment
parse' part s =
case parseToEof part (htmlLex s) of
Right html -> Right html
Left (ts, es) -> Left (map sh1 es, showHtmlLex ts)
htmlDocument =
two <$> doctype <*> htmlDoc <* whitespace
-- one <$> htmlDoc
where
doctype = space *> optional defaultDoctype (comment <* whitespace)
where defaultDoctype = garbage doctypehtml40
htmlDoc = impliedCtx HTML headAndBody
doctypehtml40 =
"!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\""
headAndBody =
-- one <$> headPart
(:) <$> headPart <* whitespace <*> framesetAndBody
headPart = whitespace *> impliedCtx HEAD (many headElement)
where
headElement =
title
<|> headCmd
<|> script
<|> style
<|> badHeaderStuff
where
title = ctx1 TITLE plaintext
headCmd = cmd [ISINDEX, LINK, BASE,META] --"NEXTID"
script = anythingButCtx SCRIPT
style = anythingButCtx STYLE
badHeaderStuff = whitespace1
framesetAndBody = (++) <$> optFramesetPart <*> noFramesPart
optFramesetPart = optional [] (one <$> frameset)
where
frameset = ctx1' FRAMESET frames
frames = whitespace *> some frame
frame = (cmd1 FRAME <|> frameset) <* whitespace
noFramesPart = whitespace *> (one <$> (noframes <|> bodyPart))
where
noframes = ctx1' NOFRAMES (one <$> bodyPart)
bodyPart = whitespace *> impliedCtx BODY blocks <* whitespace
blocks = trimctx <$> many block
block = {-bclean <$>-} block'
block' = block'' <|> (p0 . trim) <$> text
block'' =
ctx [UL,OL,DIR,MENU] list
<|> ctx1 DL dlist
<|> ctx' [H1 .. H6] optText
<|> ctx nestedBlockTags blocks
<|> ctx1 TABLE table
<|> ctx1 FORM blocks -- form
<|> ctx1 PRE optPreText
<|> ctx1' ADDRESS optText --
is allowed too!
<|> cmd1 HR
<|> cmd1 ISINDEX -- only allowed in head according to standard
<|> ctx1' P optText
where
nestedBlockTags = [BLOCKQUOTE,DIV,CENTER,MAIN,ARTICLE,ASIDE,DETAILS,
FIGCAPTION,FIGURE,FOOTER,HEADER,HGROUP,NAV,SECTION]
list = whitespace *> (trimctx <$> many listElement)
where listElement =
(ctx1' LI blocks <* whitespace)
<|> badCtx LI block -- allows text outside
...
dlist = trimctx <$> many dlistElement
where dlistElement =
ctx1' DT blocks -- std: only text allowed
<|> ctx1' DD blocks
<|> badCtx DD block -- allows white space
table = whitespace *> many (tableElement <* whitespace)
where tableElement =
ctx1 CAPTION optText
<|> ctx [THEAD,TFOOT,TBODY] (many tr)
<|> tr
<|> formElement
tr = ctx1' TR row
<|> badCtx' TR row'
row = whitespace *> many (cell <* whitespace)
row' = some cell
cell = ctx' [TH,TD] blocks
-- <|> badCtx TD block -- allows white space
-- form = many formElement
-- elements can contain block elements in HTML5...
-- https://www.w3.org/TR/html5/textlevel-semantics.html#the-a-element
flow = many (block'' <|> textElement)
optText = optional [] text
optPreText = optional [] preText
--text = text' chars
preText = text --text' prechars
--text' chars = txt
text = trimctx <$> some textElement
textElement = plain <|> special <|> chars <|> comment
where
txt0 = optional [] text
-- Note: no direct or indirect references to text or preText below!
plain = ctx' textLevelTags txt0
special =
cmd [BR,IMG,BASEFONT]
<|> ctx1 A flow
<|> ctx [APPLET,FUPPLET,OBJECT] applet
<|> ctx [AUDIO,VIDEO,PICTURE] media
<|> ctx [IFRAME,BUTTON,LABEL,METER] txt0
<|> ctx1 MAP map
<|> formElement
<|> ctx1 SVG whitespace
<|> anythingButCtx SCRIPT
<|> anythingButCtx STYLE
where
applet = many appletElement
where appletElement = cmd1 PARAM <|> textElement
media = many mediaElement
where mediaElement = cmd1 SOURCE <|> textElement
map = many mapElement
where mapElement = cmd1 AREA <|> whitespace1
formElement =
cmd1 INPUT
<|> ctx1 SELECT slist
<|> ctx1 TEXTAREA plaintext
-- Form elements are only allowed inside forms, but can
-- occur in nested elements, so using a separate form
-- parser (like for tables) is no good.
where
slist = many slistElement
where slistElement = ctx1' OPTION plaintext
---}
plaintext = many chars
-- Text level markup:
logicalTags = [EM,STRONG,DFN,CODE,SAMP,KBD,VAR,CITE,Q,SPAN,ABBR,ACRONYM,DEL,INS,
TIME,MARK,BDI]
physicalTags = [TT,I,B,U,STRIKE,BIG,SMALL,SUB,SUP]
-- All text level tags for which we allow the end tag to be missing:
textLevelTags = FONT:NOBR:logicalTags++physicalTags
one x = [x]
two x y = [x,y]
--impliedCtx' t p = ctx [t] p
p0 = HtmlContext (P,implicit)
-- Used for handling bad html:
badCtx t = fmap (HtmlContext (t,noAttrs) . trimctx . one)
badCtx' t = fmap (HtmlContext (t,noAttrs) . trimctx)
{-
bclean item =
case item of
HtmlContext t@(n,_) html | n/=PRE -> HtmlContext t (trim html)
_ -> item
-}
--trim = id
--trimctx = id
--{-
trim [] = []
trim (HtmlChars s:html) =
case dropWhile isSpace' s of
"" -> trim html
s -> HtmlChars s:trim html
trim (HtmlContext t []:html) | okToTrim t = trim html
trim html =trimctx html
trimctx [] = []
trimctx (HtmlContext ta []:html) | okToTrim ta = trimctx html
trimctx (item:html) = item:trimctx html
okToTrim t = not (isTarget t || keep t)
where
isTarget (t,attrs) = t==A && hasAttr "NAME" attrs || hasAttr "ID" attrs
keep (t,_) = t `elem` [TEXTAREA,TD,TH]
--}