module Text.HTML.Tagchup.Parser.Tag where
import Text.HTML.Tagchup.Parser.Combinator
(allowFail, withDefault,
voidChar, dropSpaces, getPos,
many, many0toN, many1toN,
many1Satisfy, readUntil,
satisfy, voidString,
emit, modifyEmission, )
import qualified Text.HTML.Tagchup.Parser.Combinator as Parser
import qualified Text.HTML.Tagchup.Parser.Status as Status
import qualified Text.HTML.Tagchup.Parser.Stream as Stream
import qualified Text.HTML.Tagchup.PositionTag as PosTag
import qualified Text.HTML.Tagchup.Tag as Tag
import qualified Text.XML.Basic.Position as Position
import qualified Text.HTML.Basic.Character as HTMLChar
import qualified Text.XML.Basic.ProcessingInstruction as PI
import qualified Text.XML.Basic.Attribute as Attr
import qualified Text.XML.Basic.Name as Name
import qualified Text.XML.Basic.Tag as TagName
import qualified Text.HTML.Tagchup.Character as Chr
import Text.HTML.Tagchup.Character (fromChar, )
import qualified Text.HTML.Basic.Entity as HTMLEntity
import qualified Control.Monad.Exception.Synchronous as Exc
import Control.Monad.Trans.Writer (runWriterT, )
import Control.Monad.Trans.State (StateT(..), )
import Control.Monad (mplus, msum, when, liftM, )
import Data.Monoid (Monoid, mempty, mconcat, )
import qualified Data.Map as Map
import Data.Tuple.HT (mapSnd, )
import Data.Char (isAlphaNum, chr, ord, )
import Data.Maybe (maybeToList, )
type Warning = (Position.T, String)
type Parser source a = Parser.Full source Warning a
type ParserEmit source a = Parser.Emitting source Warning a
parsePosTagMergeWarnings ::
(Stream.C source, StringType sink,
Name.Attribute name, Name.Tag name) =>
StateT (Status.T source) Maybe [PosTag.T name sink]
parsePosTagMergeWarnings =
liftM (\((ot,ct),warns) ->
ot :
map (\(pos,warn) -> PosTag.cons pos $ Tag.Warning warn) warns ++
maybeToList ct) $
runWriterT parsePosTag
parsePosTag ::
(Stream.C source, StringType sink,
Name.Attribute name, Name.Tag name) =>
Parser source (PosTag.T name sink, Maybe (PosTag.T name sink))
parsePosTag = do
let omitClose :: Monad m => m t -> m (t, Maybe t)
omitClose = liftM (\t -> (t, Nothing))
pos <- getPos
mplus
(do voidChar '<'
allowFail $ withDefault
(msum $
omitClose (parseSpecialTag pos) :
omitClose (parseProcessingTag pos) :
omitClose (parseCloseTag pos) :
parseOpenTag pos :
[])
(do emitWarning pos "A '<', that is not part of a tag. Encode it as < please."
omitClose (returnTag pos (Tag.Text $ stringFromChar '<'))))
(omitClose (parseText pos))
parseOpenTag ::
(Stream.C source, StringType sink,
Name.Attribute name, Name.Tag name) =>
Position.T ->
Parser source
(PosTag.T name sink, Maybe (PosTag.T name sink))
parseOpenTag pos =
do name <- parseName
allowFail $
do dropSpaces
tag <- returningTag pos (Tag.Open name) $
modifyEmission (restrictWarnings 10) $ many parseAttribute
liftM ((,) tag) $ withDefault
(do closePos <- getPos
voidString "/>"
allowFail $ liftM Just $ returnTag closePos (Tag.Close name))
(do junkPos <- getPos
readUntilTerm
(\ junk ->
emitWarningWhen
(not $ null junk)
junkPos ("Junk in opening tag: \"" ++ junk ++ "\""))
("Unterminated open tag \"" ++ Name.toString name ++ "\"") ">"
return Nothing)
parseCloseTag ::
(Stream.C source, Name.Tag name) =>
Position.T -> Parser source (PosTag.T name sink)
parseCloseTag pos =
do voidChar '/'
name <- parseName
allowFail $
do tag <- returnTag pos (Tag.Close name)
dropSpaces
junkPos <- getPos
readUntilTerm
(\ junk ->
emitWarningWhen
(not $ null junk)
junkPos ("Junk in closing tag: \"" ++ junk ++"\""))
("Unterminated closing tag \"" ++ Name.toString name ++"\"") ">"
return tag
parseSpecialTag ::
(Stream.C source, Name.Tag name) =>
Position.T -> Parser source (PosTag.T name sink)
parseSpecialTag pos =
do voidChar '!'
msum $
(do voidString "--"
allowFail $ readUntilTerm
(\ cmt -> returnTag pos (Tag.Comment cmt))
"Unterminated comment" "-->") :
(do voidString TagName.cdataString
allowFail $ readUntilTerm
(\ cdata -> returnTag pos (Tag.cdata cdata))
"Unterminated cdata" "]]>") :
(do name <- parseName
allowFail $
do dropSpaces
readUntilTerm
(\ info -> returnTag pos (Tag.Special name info))
("Unterminated special tag \"" ++ Name.toString name ++ "\"") ">") :
[]
parseProcessingTag ::
(Stream.C source, StringType sink,
Name.Attribute name, Name.Tag name) =>
Position.T -> Parser source (PosTag.T name sink)
parseProcessingTag pos =
do voidChar '?'
name <- parseName
allowFail $
do dropSpaces
returningTag pos (Tag.Processing name) $
if Name.matchAny ["xml", "xml-stylesheet"] name
then
do attrs <- many parseAttribute
junkPos <- getPos
readUntilTerm
(\ junk ->
emitWarningWhen (not $ null junk) junkPos
("Junk in processing info tag: \"" ++ junk ++ "\""))
("Unterminated processing info tag \"" ++ Name.toString name ++ "\"") "?>"
return $ PI.Known attrs
else readUntilTerm (return . PI.Unknown)
"Unterminated processing instruction" "?>"
parseText ::
(Stream.C source, StringType sink) =>
Position.T -> Parser source (PosTag.T name sink)
parseText pos =
returningTag pos Tag.Text (parseCharAsString (const True))
parseAttribute ::
(Stream.C source, StringType sink, Name.Attribute name) =>
Parser source (Attr.T name sink)
parseAttribute =
parseName >>= \name -> allowFail $
do dropSpaces
value <-
withDefault
(voidString "=" >> allowFail (dropSpaces >> parseValue))
(return mempty)
dropSpaces
return $ Attr.Cons name value
parseName ::
(Stream.C source, Name.C pname) =>
Parser source pname
parseName =
liftM Name.fromString $
many1Satisfy (\c -> isAlphaNum c || c `elem` "_-.:")
parseValue ::
(Stream.C source, StringType sink) =>
ParserEmit source sink
parseValue =
(msum $
parseQuoted "Unterminated doubly quoted value string" '"' :
parseQuoted "Unterminated singly quoted value string" '\'' :
[])
`withDefault`
parseUnquotedValueAsString
parseUnquotedValueChar ::
(Stream.C source) =>
ParserEmit source String
parseUnquotedValueChar =
let parseValueChar =
do pos <- getPos
str <- parseUnicodeChar (not . flip elem " >\"\'")
let wrong = filter (not . isValidValueChar) str
allowFail $
emitWarningWhen (not (null wrong)) pos $
"Illegal characters in unquoted value: " ++ wrong
return str
in liftM concat $ many parseValueChar
parseUnquotedValueHTMLChar ::
(Stream.C source) =>
ParserEmit source [HTMLChar.T]
parseUnquotedValueHTMLChar =
let parseValueChar =
do pos <- getPos
hc <- parseHTMLChar (not . flip elem " >\"\'")
allowFail $ mapM_ (checkUnquotedChar pos) hc
return hc
in liftM concat $ many parseValueChar
checkUnquotedChar :: Position.T -> HTMLChar.T -> ParserEmit source ()
checkUnquotedChar pos x =
case x of
HTMLChar.Unicode c ->
emitWarningWhen (not (isValidValueChar c)) pos $
"Illegal characters in unquoted value: '" ++ c : "'"
_ -> return ()
isValidValueChar :: Char -> Bool
isValidValueChar c = isAlphaNum c || c `elem` "_-:."
parseQuoted ::
(Stream.C source, StringType sink) =>
String -> Char -> Parser source sink
parseQuoted termMsg quote =
voidChar quote >>
(allowFail $
do str <- parseString (quote/=)
withDefault
(voidChar quote)
(do termPos <- getPos
emitWarning termPos termMsg)
return str)
readUntilTerm ::
(Stream.C source) =>
(String -> ParserEmit source a) -> String -> String -> ParserEmit source a
readUntilTerm generateTag termWarning termPat =
do ~(termFound,str) <- readUntil termPat
result <- generateTag str
termPos <- getPos
emitWarningWhen (not termFound) termPos termWarning
return result
class Chr.C char => CharType char where
parseChar :: (Stream.C source) => (Char -> Bool) -> Parser source [char]
parseUnquotedValue :: (Stream.C source) => ParserEmit source [char]
instance CharType Char where
parseChar = parseUnicodeChar
parseUnquotedValue = parseUnquotedValueChar
instance CharType HTMLChar.T where
parseChar = parseHTMLChar
parseUnquotedValue = parseUnquotedValueHTMLChar
class Monoid sink => StringType sink where
stringFromChar :: Char -> sink
parseCharAsString ::
(Stream.C source) =>
(Char -> Bool) -> Parser source sink
parseUnquotedValueAsString ::
(Stream.C source) =>
ParserEmit source sink
instance CharType char => StringType [char] where
stringFromChar c = [fromChar c]
parseCharAsString = parseChar
parseUnquotedValueAsString = parseUnquotedValue
parseString ::
(Stream.C source, StringType sink) =>
(Char -> Bool) -> ParserEmit source sink
parseString p = liftM mconcat $ many (parseCharAsString p)
parseUnicodeChar ::
(Stream.C source) =>
(Char -> Bool) -> Parser source String
parseUnicodeChar p =
do pos <- getPos
x <- parseHTMLChar p
allowFail $ liftM concat $
mapM (htmlCharToString pos) x
htmlCharToString ::
Position.T -> HTMLChar.T -> ParserEmit source String
htmlCharToString pos x =
let returnChar c = return $ c:[]
in case x of
HTMLChar.Unicode c -> returnChar c
HTMLChar.CharRef num -> returnChar (chr num)
HTMLChar.EntityRef name ->
maybe
(let refName = '&':name++";"
in emitWarning pos ("Unknown HTML entity " ++ refName) >>
return refName)
returnChar
(Map.lookup name HTMLEntity.mapNameToChar)
parseHTMLChar ::
(Stream.C source) =>
(Char -> Bool) -> Parser source [HTMLChar.T]
parseHTMLChar p =
do pos <- getPos
c <- satisfy p
allowFail $
if c=='&'
then
withDefault
(do ent <-
mplus
(do voidChar '#'
digits <- allowFail $ many0toN 10 (satisfy isAlphaNum)
Exc.switch
(\e ->
allowFail (emitWarning pos ("Error in numeric entity: " ++ e)) >>
return (map HTMLChar.fromUnicode ('&':'#':digits)))
(return . (:[]) . HTMLChar.CharRef . ord)
(HTMLEntity.numberToChar digits))
(liftM ((:[]) . HTMLChar.EntityRef) $
many1toN 10 (satisfy isAlphaNum))
voidChar ';'
return ent)
(emitWarning pos "Non-terminated entity reference" >>
return [HTMLChar.Unicode '&'])
else return [HTMLChar.Unicode c]
restrictWarnings :: Int -> [Warning] -> [Warning]
restrictWarnings n =
uncurry (++) .
mapSnd
(\rest ->
case rest of
(pos, _) : _ ->
[(pos, "further warnings suppressed")]
_ -> []) .
splitAt n
emitWarningWhen :: Bool -> Position.T -> String -> ParserEmit source ()
emitWarningWhen cond pos msg =
when cond $ emitWarning pos msg
emitWarning :: Position.T -> String -> ParserEmit source ()
emitWarning = curry emit
returnTag ::
Position.T ->
Tag.T name sink ->
ParserEmit source (PosTag.T name sink)
returnTag p t = return (PosTag.cons p t)
returningTag ::
(Monad m) =>
Position.T ->
(a -> Tag.T name sink) ->
m a ->
m (PosTag.T name sink)
returningTag pos f =
liftM (PosTag.cons pos . f)