{-# LANGUAGE LambdaCase #-}
module Xmlbf.XmlHtml
( fromXmlHtmlNode
, fromRawXml
, fromRawHtml
) where
import qualified Data.ByteString as B
import qualified Data.HashMap.Strict as HM
import qualified Text.XmlHtml as XmlHtml
import qualified Xmlbf
--------------------------------------------------------------------------------
-- XmlHtml support
-- | Convert a 'XmlHtml.Node' from "Text.XmlHtml" into an 'Node' from "Xmlbf",
-- if possible.
fromXmlHtmlNode
:: XmlHtml.Node -- ^ A 'XmlHtml.Node' from "Text.XmlHtml".
-> Either String Xmlbf.Node -- ^ A 'Xmlbf.Node' from "Xmlbf", if possible.
fromXmlHtmlNode = \case
XmlHtml.Comment _ -> Left "Comments not supported"
XmlHtml.TextNode t -> Xmlbf.text' t
XmlHtml.Element t as cs -> do
cs' <- traverse fromXmlHtmlNode cs
Xmlbf.element' t (HM.fromList as) cs'
-- | Parses a given UTF8-encoded raw XML fragment into @a@, using the @xmlhtml@
-- Haskell library, so all of @xmlhtml@'s parsing quirks apply.
--
-- You can provide the output of this function as input to "Xmlbf"'s
-- 'Xmlbf.parse'.
--
-- The given XML can contain more zero or more text or element nodes.
--
-- Comments are discarded from the resulting nodes and their children.
--
-- Surrounding whitespace is not stripped.
fromRawXml
:: B.ByteString -- ^ Raw XML fragment.
-> Either String [Xmlbf.Node] -- ^ 'Xmlbf.Node's from "Xmlbf"
fromRawXml = \bs -> case XmlHtml.parseXML "xmlbf-xmlhtml-input.xml" bs of
Left e -> Left ("Malformed XML: " ++ e)
Right d -> traverse fromXmlHtmlNode (XmlHtml.docContent d)
-- | Like 'fromRawXml', but parses using @xmlhtml@'s quirks HTML mode.
fromRawHtml
:: B.ByteString -- ^ Raw HTML fragment.
-> Either String [Xmlbf.Node] -- ^ 'Xmlbf.Node's from "Xmlbf"
fromRawHtml = \bs -> case XmlHtml.parseHTML "xmlbf-xmlhtml-input.html" bs of
Left e -> Left ("Malformed HTML: " ++ e)
Right d -> traverse fromXmlHtmlNode (XmlHtml.docContent d)