{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.Pandoc.Readers.HTML.Parsing
( TagOmission (..)
, pInTags
, pInTags'
, pInTag
, pInTagWithAttribs
, pAny
, pCloses
, pSatisfy
, pBlank
, matchTagClose
, matchTagOpen
, isSpace
, maybeFromAttrib
, toAttr
, toStringAttr
)
where
import Control.Monad (void, mzero, mplus)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Text.HTML.TagSoup
( Attribute, Tag (..), isTagPosition, isTagOpen, isTagClose, (~==) )
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.Definition (Attr)
import Text.Pandoc.Parsing
( (<|>), eof, getPosition, lookAhead, manyTill, newPos, option, optional
, skipMany, setPosition, token, try)
import Text.Pandoc.Readers.HTML.TagCategories
import Text.Pandoc.Readers.HTML.Types
import Text.Pandoc.Shared (tshow)
import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
import qualified Data.Set as Set
import qualified Data.Text as T
data TagOmission
= TagsRequired
| ClosingTagOptional
| TagsOmittable
deriving (TagOmission -> TagOmission -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: TagOmission -> TagOmission -> Bool
$c/= :: TagOmission -> TagOmission -> Bool
== :: TagOmission -> TagOmission -> Bool
$c== :: TagOmission -> TagOmission -> Bool
Eq)
pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
pInTags :: forall (m :: * -> *) a.
(PandocMonad m, Monoid a) =>
Text -> TagParser m a -> TagParser m a
pInTags Text
tagtype TagParser m a
parser = forall (m :: * -> *) a.
(PandocMonad m, Monoid a) =>
Text -> (Tag Text -> Bool) -> TagParser m a -> TagParser m a
pInTags' Text
tagtype (forall a b. a -> b -> a
const Bool
True) TagParser m a
parser
pInTags' :: (PandocMonad m, Monoid a)
=> Text
-> (Tag Text -> Bool)
-> TagParser m a
-> TagParser m a
pInTags' :: forall (m :: * -> *) a.
(PandocMonad m, Monoid a) =>
Text -> (Tag Text -> Bool) -> TagParser m a -> TagParser m a
pInTags' Text
tagtype Tag Text -> Bool
tagtest TagParser m a
parser = forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
try forall a b. (a -> b) -> a -> b
$ do
forall (m :: * -> *).
PandocMonad m =>
(Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy forall a b. (a -> b) -> a -> b
$ \Tag Text
t -> Text -> [(Text, Text)] -> Tag Text -> Bool
matchTagOpen Text
tagtype [] Tag Text
t Bool -> Bool -> Bool
&& Tag Text -> Bool
tagtest Tag Text
t
forall a. Monoid a => [a] -> a
mconcat forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall s (m :: * -> *) t u a end.
Stream s m t =>
ParsecT s u m a -> ParsecT s u m end -> ParsecT s u m [a]
manyTill TagParser m a
parser (forall (m :: * -> *). PandocMonad m => Text -> TagParser m ()
pCloses Text
tagtype forall s u (m :: * -> *) a.
ParsecT s u m a -> ParsecT s u m a -> ParsecT s u m a
<|> forall s (m :: * -> *) t u.
(Stream s m t, Show t) =>
ParsecT s u m ()
eof)
pInTag :: PandocMonad m
=> TagOmission
-> Text
-> TagParser m a
-> TagParser m a
pInTag :: forall (m :: * -> *) a.
PandocMonad m =>
TagOmission -> Text -> TagParser m a -> TagParser m a
pInTag TagOmission
tagOmission Text
tagtype = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall a b. (a, b) -> b
snd forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (m :: * -> *) a.
PandocMonad m =>
TagOmission
-> Text -> TagParser m a -> TagParser m ([(Text, Text)], a)
pInTagWithAttribs TagOmission
tagOmission Text
tagtype
pInTagWithAttribs :: PandocMonad m
=> TagOmission
-> Text
-> TagParser m a
-> TagParser m ([Attribute Text], a)
pInTagWithAttribs :: forall (m :: * -> *) a.
PandocMonad m =>
TagOmission
-> Text -> TagParser m a -> TagParser m ([(Text, Text)], a)
pInTagWithAttribs TagOmission
tagOmission Text
tagtype TagParser m a
p = forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
try forall a b. (a -> b) -> a -> b
$ do
let openingOptional :: Bool
openingOptional = TagOmission
tagOmission forall a. Eq a => a -> a -> Bool
== TagOmission
TagsOmittable
let closingOptional :: Bool
closingOptional = TagOmission
tagOmission forall a. Eq a => a -> a -> Bool
/= TagOmission
TagsRequired
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m ()
skipMany forall (m :: * -> *). PandocMonad m => TagParser m ()
pBlank
[(Text, Text)]
attribs <- (if Bool
openingOptional then forall s (m :: * -> *) t a u.
Stream s m t =>
a -> ParsecT s u m a -> ParsecT s u m a
option [] else forall a. a -> a
id)
(forall {str}. Tag str -> [Attribute str]
getAttribs forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (m :: * -> *).
PandocMonad m =>
(Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy (Text -> [(Text, Text)] -> Tag Text -> Bool
matchTagOpen Text
tagtype []))
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m ()
skipMany forall (m :: * -> *). PandocMonad m => TagParser m ()
pBlank
a
x <- TagParser m a
p
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m ()
skipMany forall (m :: * -> *). PandocMonad m => TagParser m ()
pBlank
(if Bool
closingOptional then forall s (m :: * -> *) t u a.
Stream s m t =>
ParsecT s u m a -> ParsecT s u m ()
optional else forall (f :: * -> *) a. Functor f => f a -> f ()
void) forall a b. (a -> b) -> a -> b
$
forall (m :: * -> *).
PandocMonad m =>
(Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy (Text -> Tag Text -> Bool
matchTagClose Text
tagtype)
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m ()
skipMany forall (m :: * -> *). PandocMonad m => TagParser m ()
pBlank
forall (m :: * -> *) a. Monad m => a -> m a
return ([(Text, Text)]
attribs, a
x)
where
getAttribs :: Tag str -> [Attribute str]
getAttribs = \case
TagOpen str
_ [Attribute str]
attribs -> [Attribute str]
attribs
Tag str
_ -> []
pCloses :: PandocMonad m => Text -> TagParser m ()
pCloses :: forall (m :: * -> *). PandocMonad m => Text -> TagParser m ()
pCloses Text
tagtype = forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
try forall a b. (a -> b) -> a -> b
$ do
Tag Text
t <- forall s (m :: * -> *) t u a.
Stream s m t =>
ParsecT s u m a -> ParsecT s u m a
lookAhead forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *).
PandocMonad m =>
(Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy forall a b. (a -> b) -> a -> b
$ \Tag Text
tag -> forall str. Tag str -> Bool
isTagClose Tag Text
tag Bool -> Bool -> Bool
|| forall str. Tag str -> Bool
isTagOpen Tag Text
tag
case Tag Text
t of
(TagClose Text
t') | Text
t' forall a. Eq a => a -> a -> Bool
== Text
tagtype -> forall (f :: * -> *) a. Functor f => f a -> f ()
void forall (m :: * -> *). PandocMonad m => TagParser m (Tag Text)
pAny
(TagOpen Text
t' [(Text, Text)]
_) | Text
t' Text -> Text -> Bool
`closes` Text
tagtype -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
(TagClose Text
"ul") | Text
tagtype forall a. Eq a => a -> a -> Bool
== Text
"li" -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
(TagClose Text
"ol") | Text
tagtype forall a. Eq a => a -> a -> Bool
== Text
"li" -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
(TagClose Text
"dl") | Text
tagtype forall a. Eq a => a -> a -> Bool
== Text
"dd" -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
(TagClose Text
"table") | Text
tagtype forall a. Eq a => a -> a -> Bool
== Text
"td" -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
(TagClose Text
"table") | Text
tagtype forall a. Eq a => a -> a -> Bool
== Text
"th" -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
(TagClose Text
"table") | Text
tagtype forall a. Eq a => a -> a -> Bool
== Text
"tr" -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
(TagClose Text
"td") | Text
tagtype forall a. Ord a => a -> Set a -> Bool
`Set.member` Set Text
blockHtmlTags -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
(TagClose Text
"th") | Text
tagtype forall a. Ord a => a -> Set a -> Bool
`Set.member` Set Text
blockHtmlTags -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
(TagClose Text
t') | Text
tagtype forall a. Eq a => a -> a -> Bool
== Text
"p" Bool -> Bool -> Bool
&& Text
t' forall a. Ord a => a -> Set a -> Bool
`Set.member` Set Text
blockHtmlTags
-> forall (m :: * -> *) a. Monad m => a -> m a
return ()
Tag Text
_ -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
pBlank :: PandocMonad m => TagParser m ()
pBlank :: forall (m :: * -> *). PandocMonad m => TagParser m ()
pBlank = forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *).
PandocMonad m =>
(Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy Tag Text -> Bool
isBlank
where
isBlank :: Tag Text -> Bool
isBlank (TagText Text
t) = (Char -> Bool) -> Text -> Bool
T.all Char -> Bool
isSpace Text
t
isBlank (TagComment Text
_) = Bool
True
isBlank Tag Text
_ = Bool
False
pLocation :: PandocMonad m => TagParser m ()
pLocation :: forall (m :: * -> *). PandocMonad m => TagParser m ()
pLocation = do
(TagPosition Row
r Row
c) <- forall (m :: * -> *).
PandocMonad m =>
(Tag Text -> Bool) -> TagParser m (Tag Text)
pSat forall str. Tag str -> Bool
isTagPosition
forall (m :: * -> *) s u. Monad m => SourcePos -> ParsecT s u m ()
setPosition forall a b. (a -> b) -> a -> b
$ SourceName -> Row -> Row -> SourcePos
newPos SourceName
"input" Row
r Row
c
pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSat :: forall (m :: * -> *).
PandocMonad m =>
(Tag Text -> Bool) -> TagParser m (Tag Text)
pSat Tag Text -> Bool
f = do
SourcePos
pos <- forall (m :: * -> *) s u. Monad m => ParsecT s u m SourcePos
getPosition
forall s (m :: * -> *) t a st.
Stream s m t =>
(t -> Text)
-> (t -> SourcePos) -> (t -> Maybe a) -> ParsecT s st m a
token forall a. Show a => a -> Text
tshow (forall a b. a -> b -> a
const SourcePos
pos) (\Tag Text
x -> if Tag Text -> Bool
f Tag Text
x then forall a. a -> Maybe a
Just Tag Text
x else forall a. Maybe a
Nothing)
pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy :: forall (m :: * -> *).
PandocMonad m =>
(Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy Tag Text -> Bool
f = forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
try forall a b. (a -> b) -> a -> b
$ forall s (m :: * -> *) t u a.
Stream s m t =>
ParsecT s u m a -> ParsecT s u m ()
optional forall (m :: * -> *). PandocMonad m => TagParser m ()
pLocation forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *).
PandocMonad m =>
(Tag Text -> Bool) -> TagParser m (Tag Text)
pSat Tag Text -> Bool
f
matchTagClose :: Text -> (Tag Text -> Bool)
matchTagClose :: Text -> Tag Text -> Bool
matchTagClose Text
t = (forall str t. (StringLike str, TagRep t) => Tag str -> t -> Bool
~== forall str. str -> Tag str
TagClose Text
t)
matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
matchTagOpen :: Text -> [(Text, Text)] -> Tag Text -> Bool
matchTagOpen Text
t [(Text, Text)]
as = (forall str t. (StringLike str, TagRep t) => Tag str -> t -> Bool
~== forall str. str -> [Attribute str] -> Tag str
TagOpen Text
t [(Text, Text)]
as)
pAny :: PandocMonad m => TagParser m (Tag Text)
pAny :: forall (m :: * -> *). PandocMonad m => TagParser m (Tag Text)
pAny = forall (m :: * -> *).
PandocMonad m =>
(Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy (forall a b. a -> b -> a
const Bool
True)
isSpace :: Char -> Bool
isSpace :: Char -> Bool
isSpace Char
' ' = Bool
True
isSpace Char
'\t' = Bool
True
isSpace Char
'\n' = Bool
True
isSpace Char
'\r' = Bool
True
isSpace Char
_ = Bool
False
closes :: Text -> Text -> Bool
Text
_ closes :: Text -> Text -> Bool
`closes` Text
"body" = Bool
False
Text
_ `closes` Text
"html" = Bool
False
Text
"body" `closes` Text
"head" = Bool
True
Text
"a" `closes` Text
"a" = Bool
True
Text
"li" `closes` Text
"li" = Bool
True
Text
"th" `closes` Text
t | Text
t forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"th",Text
"td"] = Bool
True
Text
"td" `closes` Text
t | Text
t forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"th",Text
"td"] = Bool
True
Text
"tr" `closes` Text
t | Text
t forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"th",Text
"td",Text
"tr",Text
"colgroup"] = Bool
True
Text
"dd" `closes` Text
t | Text
t forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"dt", Text
"dd"] = Bool
True
Text
"dt" `closes` Text
t | Text
t forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"dt",Text
"dd"] = Bool
True
Text
"rt" `closes` Text
t | Text
t forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"rb", Text
"rt", Text
"rtc"] = Bool
True
Text
"col" `closes` Text
"col" = Bool
True
Text
"colgroup" `closes` Text
"col" = Bool
True
Text
"optgroup" `closes` Text
"optgroup" = Bool
True
Text
"optgroup" `closes` Text
"option" = Bool
True
Text
"option" `closes` Text
"option" = Bool
True
Text
x `closes` Text
"p" | Text
x forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"address", Text
"article", Text
"aside", Text
"blockquote",
Text
"dir", Text
"div", Text
"dl", Text
"fieldset", Text
"footer", Text
"form", Text
"h1", Text
"h2", Text
"h3", Text
"h4",
Text
"h5", Text
"h6", Text
"header", Text
"hr", Text
"main", Text
"menu", Text
"nav", Text
"ol", Text
"p", Text
"pre", Text
"section",
Text
"table", Text
"ul"] = Bool
True
Text
_ `closes` Text
"meta" = Bool
True
Text
"form" `closes` Text
"form" = Bool
True
Text
"label" `closes` Text
"label" = Bool
True
Text
"map" `closes` Text
"map" = Bool
True
Text
"object" `closes` Text
"object" = Bool
True
Text
_ `closes` Text
t | Text
t forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"option",Text
"style",Text
"script",Text
"textarea",Text
"title"] = Bool
True
Text
t `closes` Text
"select" | Text
t forall a. Eq a => a -> a -> Bool
/= Text
"option" = Bool
True
Text
"thead" `closes` Text
"colgroup" = Bool
True
Text
"tfoot" `closes` Text
t | Text
t forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"thead",Text
"colgroup"] = Bool
True
Text
"tbody" `closes` Text
t | Text
t forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"tbody",Text
"tfoot",Text
"thead",Text
"colgroup"] = Bool
True
Text
t `closes` Text
t2 |
Text
t forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"h1",Text
"h2",Text
"h3",Text
"h4",Text
"h5",Text
"h6",Text
"dl",Text
"ol",Text
"ul",Text
"table",Text
"div",Text
"main",Text
"p"] Bool -> Bool -> Bool
&&
Text
t2 forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text
"h1",Text
"h2",Text
"h3",Text
"h4",Text
"h5",Text
"h6",Text
"p" ] = Bool
True
Text
t1 `closes` Text
t2 |
Text
t1 forall a. Ord a => a -> Set a -> Bool
`Set.member` Set Text
blockTags Bool -> Bool -> Bool
&&
Text
t2 forall a. Ord a => a -> Set a -> Bool
`Set.notMember` Set Text
blockTags Bool -> Bool -> Bool
&&
Text
t2 forall a. Ord a => a -> Set a -> Bool
`Set.notMember` Set Text
eitherBlockOrInline = Bool
True
Text
_ `closes` Text
_ = Bool
False
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
toStringAttr = forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
go []
where
go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
go (Text
"xml:lang",Text
y) [(Text, Text)]
ats = (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
go (Text
"lang",Text
y) [(Text, Text)]
ats
go (Text
x,Text
y) [(Text, Text)]
ats
| forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (\(Text
x',Text
_) -> Text
x forall a. Eq a => a -> a -> Bool
== Text
x') [(Text, Text)]
ats = [(Text, Text)]
ats
| Bool
otherwise =
case Text -> Text -> Maybe Text
T.stripPrefix Text
"data-" Text
x of
Just Text
x' | Text
x' forall a. Ord a => a -> Set a -> Bool
`Set.notMember` (Set Text
html5Attributes forall a. Semigroup a => a -> a -> a
<>
Set Text
html4Attributes forall a. Semigroup a => a -> a -> a
<> Set Text
rdfaAttributes)
-> (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
go (Text
x',Text
y) [(Text, Text)]
ats
Maybe Text
_ -> (Text
x,Text
y)forall a. a -> [a] -> [a]
:[(Text, Text)]
ats
maybeFromAttrib :: Text -> Tag Text -> Maybe Text
maybeFromAttrib :: Text -> Tag Text -> Maybe Text
maybeFromAttrib Text
name (TagOpen Text
_ [(Text, Text)]
attrs) = forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
name [(Text, Text)]
attrs
maybeFromAttrib Text
_ Tag Text
_ = forall a. Maybe a
Nothing
mkAttr :: [(Text, Text)] -> Attr
mkAttr :: [(Text, Text)] -> Attr
mkAttr [(Text, Text)]
attr = (Text
attribsId, [Text]
attribsClasses, [(Text, Text)]
attribsKV)
where attribsId :: Text
attribsId = forall a. a -> Maybe a -> a
fromMaybe Text
"" forall a b. (a -> b) -> a -> b
$ forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"id" [(Text, Text)]
attr forall (m :: * -> *) a. MonadPlus m => m a -> m a -> m a
`mplus` forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"name" [(Text, Text)]
attr
attribsClasses :: [Text]
attribsClasses = Text -> [Text]
T.words (forall a. a -> Maybe a -> a
fromMaybe Text
"" forall a b. (a -> b) -> a -> b
$ forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"class" [(Text, Text)]
attr) forall a. Semigroup a => a -> a -> a
<> [Text]
epubTypes
attribsKV :: [(Text, Text)]
attribsKV = forall a. (a -> Bool) -> [a] -> [a]
filter (\(Text
k,Text
_) -> Text
k forall a. Eq a => a -> a -> Bool
/= Text
"class" Bool -> Bool -> Bool
&& Text
k forall a. Eq a => a -> a -> Bool
/= Text
"id" Bool -> Bool -> Bool
&& Text
k forall a. Eq a => a -> a -> Bool
/= Text
"name")
[(Text, Text)]
attr
epubTypes :: [Text]
epubTypes = Text -> [Text]
T.words forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a -> a
fromMaybe Text
"" forall a b. (a -> b) -> a -> b
$ forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"epub:type" [(Text, Text)]
attr
toAttr :: [(Text, Text)] -> Attr
toAttr :: [(Text, Text)] -> Attr
toAttr = [(Text, Text)] -> Attr
mkAttr forall b c a. (b -> c) -> (a -> b) -> a -> c
. [(Text, Text)] -> [(Text, Text)]
toStringAttr