Safe Haskell | Safe-Inferred |
---|---|
Language | GHC2021 |
A Markup parser and printer for strict ByteStrings, focused on optimising performance. Markup is a representation of data such as HTML, SVG or XML, but the parsing does not always conform to the relevant standards.
Synopsis
- newtype Markup = Markup {}
- data Standard
- markup :: Standard -> ByteString -> Warn Markup
- markup_ :: Standard -> ByteString -> Markup
- data RenderStyle
- markdown :: RenderStyle -> Standard -> Markup -> Warn ByteString
- markdown_ :: RenderStyle -> Standard -> Markup -> ByteString
- normalize :: Markup -> Markup
- normContent :: Markup -> Markup
- wellFormed :: Standard -> Markup -> [MarkupWarning]
- isWellFormed :: Standard -> Markup -> Bool
- data MarkupWarning
- type Warn a = These [MarkupWarning] a
- warnError :: Warn a -> a
- warnEither :: Warn a -> Either [MarkupWarning] a
- warnMaybe :: Warn a -> Maybe a
- type Element = Tree Token
- element :: NameTag -> [Attr] -> Markup -> Markup
- element_ :: NameTag -> [Attr] -> Markup
- emptyElem :: NameTag -> [Attr] -> Markup
- elementc :: NameTag -> [Attr] -> ByteString -> Markup
- content :: ByteString -> Markup
- contentRaw :: ByteString -> Markup
- type NameTag = ByteString
- selfClosers :: [NameTag]
- doctypeHtml :: Markup
- doctypeXml :: Markup
- type AttrName = ByteString
- type AttrValue = ByteString
- data Attr = Attr {}
- addAttrs :: [Attr] -> Token -> Maybe Token
- attrsP :: Standard -> Parser a [Attr]
- nameP :: Standard -> Parser e ByteString
- data OpenTagType
- data Token
- = OpenTag !OpenTagType !NameTag ![Attr]
- | EndTag !NameTag
- | Content !ByteString
- | Comment !ByteString
- | Decl !ByteString ![Attr]
- | Doctype !ByteString
- tokenize :: Standard -> ByteString -> Warn [Token]
- tokenize_ :: Standard -> ByteString -> [Token]
- tokenP :: Standard -> Parser e Token
- detokenize :: Standard -> Token -> ByteString
- gather :: Standard -> [Token] -> Warn Markup
- gather_ :: Standard -> [Token] -> Markup
- degather :: Standard -> Markup -> Warn [Token]
- degather_ :: Standard -> Markup -> [Token]
- xmlVersionInfoP :: Parser e ByteString
- xmlEncodingDeclP :: Parser e ByteString
- xmlStandaloneP :: Parser e ByteString
- xmlVersionNumP :: Parser e ByteString
- xmlEncNameP :: Parser e ByteString
- xmlYesNoP :: Parser e ByteString
- utf8ToStr :: ByteString -> String
- strToUtf8 :: String -> ByteString
- escapeChar :: Char -> ByteString
- escape :: ByteString -> ByteString
- data Tree a = Node {}
Usage
import MarkupParse
import Data.ByteString qualified as B

bs <- B.readFile "other/line.svg"
m = markup_ Xml bs
markup_ . markdown_ is an isomorphic round trip from Markup to ByteString to Markup:

- This is subject to the Markup being wellFormed.
- The reverse round trip, markdown_ . markup_ (from ByteString to Markup to ByteString), is not isomorphic, as parsing forgets whitespace within tags, comments and declarations.
- The underscored versions of the main functions throw an exception on any warnings encountered along the way.
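As a concrete check of this property, a sketch (assuming an Eq instance for Markup and that the m parsed in the usage block above is well-formed; the results are not verified doctest output):

isWellFormed Xml m
-- True
markup_ Xml (markdown_ Compact Xml m) == m
-- True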
At a lower level, a round trip pipeline might look something like:
tokenize Html >=>
gather Html >=>
(normalize >>> pure) >=>
degather Html >=>
fmap (detokenize Html) >>> pure

tokenize converts a ByteString to a Token list, gather arranges the tokens into a Markup tree, normalize concatenates content and normalizes attributes, degather turns the markup tree back into a token list and, finally, detokenize turns each token back into a bytestring.
Along the way, the Kleisli fishies (>=>) and forward composition (>>>) accumulate any warnings via the These monad instance, which is wrapped into the type synonym Warn.
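Assembled into a single function, a minimal sketch of such a pipeline might look like the following (the roundTrip name and the final mconcat step are illustrative, not part of the library):

import Control.Arrow ((>>>))
import Control.Monad ((>=>))
import Data.ByteString (ByteString)
import MarkupParse

-- ByteString -> Warn ByteString, accumulating warnings at every stage.
roundTrip :: Standard -> ByteString -> Warn ByteString
roundTrip std =
  tokenize std                           -- ByteString   -> Warn [Token]
    >=> gather std                       -- [Token]      -> Warn Markup
    >=> (normalize >>> pure)             -- Markup       -> Warn Markup
    >=> degather std                     -- Markup       -> Warn [Token]
    >=> (fmap (detokenize std) >>> pure) -- [Token]      -> Warn [ByteString]
    >=> (mconcat >>> pure)               -- [ByteString] -> Warn ByteString

warnError (roundTrip Html bs) then either returns the rendered bytestring or throws on any accumulated warnings.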
Markup
newtype Markup Source #
A list of Elements, or Tree Tokens.
>>>
markup Html "<foo class=\"bar\">baz</foo>"
That (Markup {elements = [Node {rootLabel = OpenTag StartTag "foo" [Attr {attrName = "class", attrValue = "bar"}], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]})
data Standard Source #
From a parsing point of view, Html and Xml (and Svg) are close enough that they share a lot of parsing logic, so that parsing and printing need only minor tweaks between standards.
The Xml parsing logic is based on the XML productions found in https://www.w3.org/TR/xml/.
The Html parsing is based on a reading of html-parse, but ignores the various '\x00' to '\xfffd' and eof directives that form part of the HTML standards.
markup :: Standard -> ByteString -> Warn Markup Source #
Convert bytestrings to Markup
>>>
markup Html "<foo><br></foo><baz"
These [MarkupParser (ParserLeftover "<baz")] (Markup {elements = [Node {rootLabel = OpenTag StartTag "foo" [], subForest = [Node {rootLabel = OpenTag StartTag "br" [], subForest = []}]}]})
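Because the result is a These value, a caller can keep whatever was parsed while reporting the warnings separately. A minimal sketch, using a hypothetical helper name markupLenient (not part of the library):

import Data.ByteString (ByteString)
import Data.These (These (..))
import MarkupParse
import System.IO (hPrint, stderr)

-- Hypothetical helper: report warnings on stderr and keep whatever was
-- parsed, falling back to empty Markup if nothing was.
markupLenient :: Standard -> ByteString -> IO Markup
markupLenient std bs = case markup std bs of
  This ws    -> mapM_ (hPrint stderr) ws >> pure (Markup [])
  That m     -> pure m
  These ws m -> mapM_ (hPrint stderr) ws >> pure m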
data RenderStyle Source #
Indented 0 puts newlines in between the tags.
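For comparison, a sketch of the other styles (outputs inferred from the constructor descriptions rather than copied from doctests):

markdown_ Compact Html (markup_ Html "<foo><br></foo>")
-- "<foo><br></foo>"        -- Compact: everything on one line
markdown_ (Indented 0) Html (markup_ Html "<foo><br></foo>")
-- "<foo>\n<br>\n</foo>"    -- newlines between tags, zero indentation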
Instances
Generic RenderStyle Source # | |
Defined in MarkupParse type Rep RenderStyle :: Type -> Type # from :: RenderStyle -> Rep RenderStyle x # to :: Rep RenderStyle x -> RenderStyle # | |
Show RenderStyle Source # | |
Defined in MarkupParse showsPrec :: Int -> RenderStyle -> ShowS # show :: RenderStyle -> String # showList :: [RenderStyle] -> ShowS # | |
Eq RenderStyle Source # | |
Defined in MarkupParse (==) :: RenderStyle -> RenderStyle -> Bool # (/=) :: RenderStyle -> RenderStyle -> Bool # | |
type Rep RenderStyle Source # | |
Defined in MarkupParse type Rep RenderStyle = D1 ('MetaData "RenderStyle" "MarkupParse" "markup-parse-0.1.1-BdYhv7KNpuBDovL7j9hJZ4" 'False) (C1 ('MetaCons "Compact" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "Indented" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Int))) |
markdown :: RenderStyle -> Standard -> Markup -> Warn ByteString Source #
Convert Markup
to bytestrings
>>>
markdown (Indented 4) Html (markup_ Html [i|<foo><br></foo>|])
That "<foo>\n    <br>\n</foo>"
markdown_ :: RenderStyle -> Standard -> Markup -> ByteString Source #
Convert Markup
to ByteString
and error on warnings.
>>>
B.putStr $ markdown_ (Indented 4) Html (markup_ Html [i|<foo><br></foo>|])
<foo>
    <br>
</foo>
normalize :: Markup -> Markup Source #
Concatenate sequential content and normalize attributes: class values are merged into a single space-separated value, and duplicate attributes are removed (the last value wins).
>>>
B.putStr $ warnError $ markdown Compact Xml $ normalize (markup_ Xml [i|<foo class="a" class="b" bar="first" bar="last"/>|])
<foo bar="last" class="a b"/>
normContent :: Markup -> Markup Source #
Normalise Content in Markup, concatenating adjacent Content, and removing mempty Content.
>>>
normContent $ content "a" <> content "" <> content "b"
Markup {elements = [Node {rootLabel = Content "ab", subForest = []}]}
wellFormed :: Standard -> Markup -> [MarkupWarning] Source #
Check for well-formedness and return warnings encountered.
>>>
wellFormed Html $ Markup [Node (Comment "") [], Node (EndTag "foo") [], Node (OpenTag EmptyElemTag "foo" []) [Node (Content "bar") []], Node (OpenTag EmptyElemTag "foo" []) []]
[EmptyContent,EndTagInTree,LeafWithChildren,BadEmptyElemTag]
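isWellFormed (see the synopsis) is the Bool form of the same check; a sketch:

isWellFormed Html (markup_ Html "<foo>bar</foo>")
-- True
isWellFormed Html (Markup [Node (EndTag "foo") []])
-- False    -- an EndTag should never appear in a tree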
Warnings
data MarkupWarning Source #
markup-parse generally tries to continue on parse errors, returning whatever could still be parsed together with any warnings.
BadEmptyElemTag | A tag ending with "/>" that is not an element of selfClosers. |
SelfCloserWithChildren | A tag ending with "/>" that has children. Cannot happen in the parsing phase. |
LeafWithChildren | Only a StartTag can have children. |
TagMismatch NameTag NameTag | An EndTag with a different name to the currently open StartTag. |
UnmatchedEndTag | An EndTag with no corresponding StartTag. |
UnclosedTag | A StartTag with no corresponding EndTag. |
EndTagInTree | An EndTag should never appear in Markup. |
EmptyContent | Empty Content, Comment, Decl or Doctype |
BadDecl | Badly formed declaration |
MarkupParser ParserWarning |
Instances
type Warn a = These [MarkupWarning] a Source #
A type synonym for the common return type of many functions. A common computation pipeline takes advantage of the These Monad instance, e.g.
markup s bs = bs & (tokenize s >=> gather s)
warnError :: Warn a -> a Source #
Convert any warnings to an error
>>>
warnError $ (tokenize Html) "<foo"
*** Exception: MarkupParser (ParserLeftover "<foo") ...
warnEither :: Warn a -> Either [MarkupWarning] a Source #
Returns Left on any warnings
>>>
warnEither $ (tokenize Html) "<foo><baz"
Left [MarkupParser (ParserLeftover "<baz")]
warnMaybe :: Warn a -> Maybe a Source #
Returns results, if any, ignoring warnings.
>>>
warnMaybe $ (tokenize Html) "<foo><baz"
Just [OpenTag StartTag "foo" []]
Element
element :: NameTag -> [Attr] -> Markup -> Markup Source #
Create Markup
from a name tag and attributes that wraps some other markup.
>>>
element "div" [] (element_ "br" [])
Markup {elements = [Node {rootLabel = OpenTag StartTag "div" [], subForest = [Node {rootLabel = OpenTag StartTag "br" [], subForest = []}]}]}
element_ :: NameTag -> [Attr] -> Markup Source #
Create Markup
from a name tag and attributes that doesn't wrap some other markup. The OpenTagType
used is StartTag
. Use emptyElem
if you want to create EmptyElemTag
based markup.
>>>
(element_ "br" [])
Markup {elements = [Node {rootLabel = OpenTag StartTag "br" [], subForest = []}]}
emptyElem :: NameTag -> [Attr] -> Markup Source #
Create Markup
from a name tag and attributes using EmptyElemTag
, that doesn't wrap some other markup. No checks are made on whether this creates well-formed markup.
>>>
emptyElem "br" []
Markup {elements = [Node {rootLabel = OpenTag EmptyElemTag "br" [], subForest = []}]}
content :: ByteString -> Markup Source #
Create Markup content from a bytestring, escaping the usual characters.
contentRaw :: ByteString -> Markup Source #
Create a Markup element from a bytestring, not escaping the usual characters.
>>>
contentRaw "<content>"
Markup {elements = [Node {rootLabel = Content "<content>", subForest = []}]}
>>>
markup_ Html $ markdown_ Compact Html $ contentRaw "<content>"
*** Exception: UnclosedTag ...
Token components
type NameTag = ByteString Source #
Name of token
selfClosers :: [NameTag] Source #
Html tags that self-close
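For example (a sketch), the usual HTML void elements are members:

"br" `elem` selfClosers
-- True
"div" `elem` selfClosers
-- False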
doctypeHtml :: Markup Source #
Standard Html Doctype
>>>
markdown_ Compact Html doctypeHtml
"<!DOCTYPE html>"
doctypeXml :: Markup Source #
Standard Xml Doctype
>>>
markdown_ Compact Xml doctypeXml
"<?xml version=\"1.0\" encoding=\"utf-8\"?><!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">"
type AttrName = ByteString Source #
Name of an attribute.
type AttrValue = ByteString Source #
Value of an attribute. "" is equivalent to true with respect to boolean attributes.
data Attr Source #
An attribute of a tag.
When parsing, boolean attributes, which are not required to have a value in HTML, are given the value "", which is fine, but the empty value will then be rendered explicitly:
>>>
detokenize Html <$> tokenize_ Html [i|<input checked>|]
["<input checked=\"\">"]
Instances
Generic Attr Source # | |
Show Attr Source # | |
NFData Attr Source # | |
Defined in MarkupParse | |
Eq Attr Source # | |
Ord Attr Source # | |
ToExpr Attr Source # | |
Defined in MarkupParse | |
type Rep Attr Source # | |
Defined in MarkupParse type Rep Attr = D1 ('MetaData "Attr" "MarkupParse" "markup-parse-0.1.1-BdYhv7KNpuBDovL7j9hJZ4" 'False) (C1 ('MetaCons "Attr" 'PrefixI 'True) (S1 ('MetaSel ('Just "attrName") 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 AttrName) :*: S1 ('MetaSel ('Just "attrValue") 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 AttrValue))) |
addAttrs :: [Attr] -> Token -> Maybe Token Source #
Append attributes to an existing Token attribute list. Returns Nothing for tokens that do not have attributes.
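A sketch of both cases (results shown as comments, following the documented behaviour):

addAttrs [Attr "id" "a"] (OpenTag StartTag "foo" [])
-- Just (OpenTag StartTag "foo" [Attr {attrName = "id", attrValue = "a"}])
addAttrs [Attr "id" "a"] (Content "bar")
-- Nothing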
nameP :: Standard -> Parser e ByteString Source #
Parse a tag name. Each standard is slightly different.
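A sketch of running it directly with flatparse's runParser (as in the Token examples below); the exact leftover bytes are an assumption:

runParser (nameP Html) "div class=\"a\">"
-- OK "div" " class=\"a\">"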
Tokens
data OpenTagType Source #
Whether an opening tag is a start tag or an empty element tag.
Instances
data Token Source #
A Markup token. The term is borrowed from the HTML standards but is used across Html and Xml in this library.
Note that the Token type is used in two slightly different contexts:

- As an intermediary representation of markup between ByteString and Markup.
- As the primitives of Markup Elements.

Specifically, an EndTag will occur in a list of tokens, but not as a primitive in Markup. It may turn out to be better to have two different types for these two uses, and future iterations of this library may head in this direction.
>>>
runParser_ (many (tokenP Html)) [i|<foo>content</foo>|]
[OpenTag StartTag "foo" [],Content "content",EndTag "foo"]
>>>
runParser_ (tokenP Xml) [i|<foo/>|]
OpenTag EmptyElemTag "foo" []
>>>
runParser_ (tokenP Html) "<!-- Comment -->"
Comment " Comment "
>>>
runParser_ (tokenP Xml) [i|<?xml version="1.0" encoding="UTF-8"?>|]
Decl "xml" [Attr {attrName = "version", attrValue = " version=\"1.0\""},Attr {attrName = "encoding", attrValue = "UTF-8"}]
>>>
runParser_ (tokenP Html) "<!DOCTYPE html>"
Doctype "DOCTYPE html"
>>>
runParser_ (tokenP Xml) "<!DOCTYPE foo [ declarations ]>"
Doctype "DOCTYPE foo [ declarations ]"
>>>
runParser (tokenP Html) [i|<foo a="a" b="b" c=c check>|]
OK (OpenTag StartTag "foo" [Attr {attrName = "a", attrValue = "a"},Attr {attrName = "b", attrValue = "b"},Attr {attrName = "c", attrValue = "c"},Attr {attrName = "check", attrValue = ""}]) ""
>>>
runParser (tokenP Xml) [i|<foo a="a" b="b" c=c check>|]
Fail
OpenTag !OpenTagType !NameTag ![Attr] | A tag. https://developer.mozilla.org/en-US/docs/Glossary/Tag |
EndTag !NameTag | A closing tag. |
Content !ByteString | The content between tags. |
Comment !ByteString | Contents of a comment. |
Decl !ByteString ![Attr] | Contents of a declaration |
Doctype !ByteString | Contents of a doctype declaration. |
Instances
tokenize :: Standard -> ByteString -> Warn [Token] Source #
Parse a bytestring into tokens
>>>
tokenize Html [i|<foo>content</foo>|]
That [OpenTag StartTag "foo" [],Content "content",EndTag "foo"]
tokenP :: Standard -> Parser e Token Source #
A flatparse Token
parser.
>>>
runParser (tokenP Html) "<foo>content</foo>"
OK (OpenTag StartTag "foo" []) "content</foo>"
detokenize :: Standard -> Token -> ByteString Source #
Bytestring representation of a Token.
>>>
detokenize Html (OpenTag StartTag "foo" [])
"<foo>"
gather :: Standard -> [Token] -> Warn Markup Source #
Gather together token trees from a token list, placing child elements in nodes and removing EndTags.
>>>
gather Html =<< tokenize Html "<foo class=\"bar\">baz</foo>"
That (Markup {elements = [Node {rootLabel = OpenTag StartTag "foo" [Attr {attrName = "class", attrValue = "bar"}], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]})
degather :: Standard -> Markup -> Warn [Token] Source #
Convert a markup into a token list, adding end tags.
>>>
degather Html =<< markup Html "<foo class=\"bar\">baz</foo>"
That [OpenTag StartTag "foo" [Attr {attrName = "class", attrValue = "bar"}],Content "baz",EndTag "foo"]
XML specific Parsers
xmlVersionInfoP :: Parser e ByteString Source #
xml production [24]
xmlEncodingDeclP :: Parser e ByteString Source #
xml production [80]
xmlStandaloneP :: Parser e ByteString Source #
xml production [32]
xmlVersionNumP :: Parser e ByteString Source #
xml production [26]
xmlEncNameP :: Parser e ByteString Source #
xml production [81]
xmlYesNoP :: Parser e ByteString Source #
xml yes/no
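A sketch of running one of these parsers directly; whether the leading space and quotes are part of the captured bytes is an assumption based on the Decl example above:

runParser xmlVersionInfoP " version=\"1.0\""
-- OK " version=\"1.0\"" ""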
bytestring support
utf8ToStr :: ByteString -> String #
Convert a UTF-8 encoded ByteString to a String.
strToUtf8 :: String -> ByteString #
Convert a String to a UTF-8 encoded ByteString.
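For example, a sketch of the byte-level behaviour:

strToUtf8 "café"
-- "caf\195\169"    -- 'é' becomes the two UTF-8 bytes 0xC3 0xA9
utf8ToStr (strToUtf8 "café")
-- "caf\233"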
escapeChar :: Char -> ByteString Source #
Escape a single character.
escape :: ByteString -> ByteString Source #
Escape the following predefined character entity references:

escapeChar '<' = "&lt;"
escapeChar '>' = "&gt;"
escapeChar '&' = "&amp;"
escapeChar '\'' = "&apos;"
escapeChar '"' = "&quot;"
No attempt is made to meet the HTML Standards
>>>
escape [i|<foo class="a" bar='b'>|]
"&lt;foo class=&quot;a&quot; bar=&apos;b&apos;&gt;"
Tree support
Non-empty, possibly infinite, multi-way trees; also known as rose trees.
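Since an Element is just a Tree Token, markup can also be assembled directly from Nodes (as in the wellFormed example above); a sketch:

Markup [Node (OpenTag StartTag "p" []) [Node (Content "hello") []]]
-- the same markup as: element "p" [] (content "hello")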
Instances
MonadFix Tree | Since: containers-0.5.11 |
MonadZip Tree | |
Foldable Tree | |
Defined in Data.Tree fold :: Monoid m => Tree m -> m # foldMap :: Monoid m => (a -> m) -> Tree a -> m # foldMap' :: Monoid m => (a -> m) -> Tree a -> m # foldr :: (a -> b -> b) -> b -> Tree a -> b # foldr' :: (a -> b -> b) -> b -> Tree a -> b # foldl :: (b -> a -> b) -> b -> Tree a -> b # foldl' :: (b -> a -> b) -> b -> Tree a -> b # foldr1 :: (a -> a -> a) -> Tree a -> a # foldl1 :: (a -> a -> a) -> Tree a -> a # elem :: Eq a => a -> Tree a -> Bool # maximum :: Ord a => Tree a -> a # | |
Eq1 Tree | Since: containers-0.5.9 |
Ord1 Tree | Since: containers-0.5.9 |
Read1 Tree | Since: containers-0.5.9 |
Show1 Tree | Since: containers-0.5.9 |
Traversable Tree | |
Applicative Tree | |
Functor Tree | |
Monad Tree | |
Hashable1 Tree | Since: hashable-1.3.4.0 |
Defined in Data.Hashable.Class | |
Generic1 Tree | |
Data a => Data (Tree a) | |
Defined in Data.Tree gfoldl :: (forall d b. Data d => c (d -> b) -> d -> c b) -> (forall g. g -> c g) -> Tree a -> c (Tree a) # gunfold :: (forall b r. Data b => c (b -> r) -> c r) -> (forall r. r -> c r) -> Constr -> c (Tree a) # toConstr :: Tree a -> Constr # dataTypeOf :: Tree a -> DataType # dataCast1 :: Typeable t => (forall d. Data d => c (t d)) -> Maybe (c (Tree a)) # dataCast2 :: Typeable t => (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c (Tree a)) # gmapT :: (forall b. Data b => b -> b) -> Tree a -> Tree a # gmapQl :: (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Tree a -> r # gmapQr :: forall r r'. (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Tree a -> r # gmapQ :: (forall d. Data d => d -> u) -> Tree a -> [u] # gmapQi :: Int -> (forall d. Data d => d -> u) -> Tree a -> u # gmapM :: Monad m => (forall d. Data d => d -> m d) -> Tree a -> m (Tree a) # gmapMp :: MonadPlus m => (forall d. Data d => d -> m d) -> Tree a -> m (Tree a) # gmapMo :: MonadPlus m => (forall d. Data d => d -> m d) -> Tree a -> m (Tree a) # | |
Generic (Tree a) | |
Read a => Read (Tree a) | |
Show a => Show (Tree a) | |
NFData a => NFData (Tree a) | |
Eq a => Eq (Tree a) | |
Ord a => Ord (Tree a) | Since: containers-0.6.5 |
Hashable v => Hashable (Tree v) | Since: hashable-1.3.4.0 |
Defined in Data.Hashable.Class | |
ToExpr a => ToExpr (Tree a) | |
Defined in Data.TreeDiff.Class | |
type Rep1 Tree | Since: containers-0.5.8 |
Defined in Data.Tree type Rep1 Tree = D1 ('MetaData "Tree" "Data.Tree" "containers-0.6.5.1" 'False) (C1 ('MetaCons "Node" 'PrefixI 'True) (S1 ('MetaSel ('Just "rootLabel") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) Par1 :*: S1 ('MetaSel ('Just "subForest") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) ([] :.: Rec1 Tree))) | |
type Rep (Tree a) | Since: containers-0.5.8 |
Defined in Data.Tree type Rep (Tree a) = D1 ('MetaData "Tree" "Data.Tree" "containers-0.6.5.1" 'False) (C1 ('MetaCons "Node" 'PrefixI 'True) (S1 ('MetaSel ('Just "rootLabel") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 a) :*: S1 ('MetaSel ('Just "subForest") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 [Tree a]))) |