{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeFamilies #-}
-- | High-level XML parsers, built on top of "Data.XML.Parser.Mid":
--
-- - entity references are expanded
-- - CDATAs are abstracted away
-- - comments are ignored
-- - whitespace between tokens is ignored
-- - duplicate attributes are ignored
--
-- All documentation examples assume the following setup:
--
-- > :set -XOverloadedStrings
-- > import Data.Attoparsec.ByteString
module Data.XML.Parser.High
( module Data.XML.Parser.High.AttrParser
, module Data.XML.Parser.High.NameParser
, Prolog(..)
, Token(..)
, TokenParser()
, ContentParser()
, noContent
, withContent
, anyContent
, runTokenParser
, prolog
, instruction
, textContent
, textContent'
, tag
, tag'
, anyTag
, anyToken
, anyToken'
) where
import Control.Applicative
import Control.Arrow
import Control.Monad.Compat
import Control.Monad.Fail.Compat
import Data.Function
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Maybe
import Data.String
import Data.Text (Text)
import qualified Data.Text as Text
import Data.XML.Parser.High.AttrParser
import Data.XML.Parser.High.NameParser
import Data.XML.Parser.Low
import qualified Data.XML.Parser.Mid as Mid
import Data.XML.Parser.Mid.Attribute
import Prelude ()
import Prelude.Compat
import Text.Parser.Char
import Text.Parser.Combinators
import Text.ParserCombinators.ReadP (readP_to_S)
-- $setup
-- >>> :set -XOverloadedStrings
-- >>> import Data.Attoparsec.ByteString
-- | XML document prolog.
data Prolog = Prolog
{ prologXmlDeclaration :: Maybe Mid.XMLDeclaration
, prologInstructions :: [Mid.Instruction]
, prologDoctype :: Maybe Mid.Doctype
} deriving (Eq, Ord, Read, Show)
data Token
= TokenProlog Prolog
| TokenInstruction Mid.Instruction
| TokenTag QName (Map QName Text) [Token]
| TokenTextContent Text
deriving (Eq, Ord, Read, Show)
-- | A parser that consumes whole 'Token's.
newtype TokenParser m a = TokenParser { runTokenParser :: m a }
deriving instance Functor m => Functor (TokenParser m)
deriving instance Applicative m => Applicative (TokenParser m)
deriving instance Alternative m => Alternative (TokenParser m)
deriving instance Monad m => Monad (TokenParser m)
instance (Parsing m, Monad m) => MonadFail (TokenParser m) where
fail = TokenParser . unexpected
-- | How to parse tag content.
data ContentParser m a
= NoContent (m a)
| AnyContent ([Token] -> m a)
| WithContent (TokenParser m a)
deriving instance Functor m => Functor (ContentParser m)
-- | Assert that content is not empty, and parse it using given token parser.
--
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) "body"
-- Right "body"
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) ""
-- Left ...
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) ""
-- Left ...
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent $ pure ())) ""
-- Left ...
withContent :: TokenParser m a -> ContentParser m a
withContent = WithContent
-- | Assert that content is empty.
--
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr noContent) "body"
-- Left ...
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr noContent) ""
-- Right ()
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr noContent) ""
-- Right ()
noContent :: Applicative m => ContentParser m ()
noContent = NoContent $ pure ()
-- | Accept any content, including empty content.
--
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr anyContent) "body"
-- Right ()
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr anyContent) ""
-- Right ()
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr anyContent) ""
-- Right ()
anyContent :: CharParsing m => Monad m => ContentParser m ()
anyContent = AnyContent $ const $ pure ()
-- | Parse a processing instruction.
--
--
--
-- >>> parseOnly (runTokenParser instruction) ""
-- Right (Instruction "php" "echo 'Hello World!'; ")
-- >>> parseOnly (runTokenParser instruction) " "
-- Right (Instruction "php" "echo 'Hello World!'; ")
instruction :: CharParsing m => Monad m => TokenParser m Mid.Instruction
instruction = TokenParser $ do
skipCommentsWhitespace
Mid.runTokenParser Mid.tokenInstruction
-- |
--
-- >>> parseOnly (runTokenParser prolog) ""
-- Right (Prolog {prologXmlDeclaration = Just (XMLDeclaration "1.0" ...), prologInstructions = [], prologDoctype = Just (Doctype "greeting" ...)})
-- >>> parseOnly (runTokenParser prolog) " "
-- Right (Prolog {prologXmlDeclaration = Just (XMLDeclaration "1.0" ...), prologInstructions = [], prologDoctype = Just (Doctype "greeting" ...)})
prolog :: CharParsing m => Monad m => TokenParser m Prolog
prolog = TokenParser $ do
xmlDeclaration <- optional $ Mid.runTokenParser Mid.tokenXmlDeclaration
skipCommentsWhitespace
instructions <- runTokenParser $ many instruction
doctype <- optional $ do
skipCommentsWhitespace
Mid.runTokenParser Mid.tokenDoctype
when (isNothing xmlDeclaration && null instructions && isNothing doctype)
$ unexpected "Expected XML prolog"
return $ Prolog xmlDeclaration instructions doctype
-- | Parse textual content of a tag, including CDATA.
textContent :: CharParsing m => Monad m => EntityDecoder -> TokenParser m Text
textContent entityDecoder = TokenParser $ mconcat <$> do
skipComments
(textualData <|> Mid.runTokenParser Mid.tokenCdata) `sepBy1` Mid.runTokenParser Mid.tokenComment
where textualData = expandContents entityDecoder =<< Mid.runTokenParser Mid.tokenData
-- | Same as @textContent (decodePredefinedEntities <> decodeHtmlEntities)@, provided for convenience.
--
-- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) "bodyinnerbody]]>"
-- Right "bodyinnerbody"
textContent' :: CharParsing m => Monad m => TokenParser m Text
textContent' = textContent decodeStandardEntities
normalizeAttributes :: EntityDecoder -> [Attribute] -> Map QName Text
normalizeAttributes entityDecoder attributes = Map.fromList $ do
Attribute name contents <- attributes
value <- maybeToList $ expandContents entityDecoder contents
return (name, value)
-- | Generic tag parser.
tag :: CharParsing m => Monad m
=> EntityDecoder -- ^ How to expand entity references
-> NameParser a -- ^ How to parse tag name
-> (a -> AttrParser b) -- ^ How to parse tag attributes
-> (b -> ContentParser m c) -- ^ How to parse tag content
-> TokenParser m c
tag entityDecoder parseName parseAttributes parseContent = parseStartToEnd <|> parseEmptyElement where
parseStartToEnd = TokenParser $ do
skipCommentsWhitespace
Mid.StartTag name attributes <- Mid.runTokenParser Mid.tokenStartTag
a <- processName name
b <- processAttributes a attributes
c <- case parseContent b of
NoContent f -> f
AnyContent f -> f =<< runTokenParser (many $ anyToken entityDecoder)
WithContent parser -> runTokenParser parser
skipCommentsWhitespace
Mid.runTokenParser $ do
name' <- Mid.tokenEndTag
when (name /= name') $ fail "Invalid end tag name"
return c
parseEmptyElement = TokenParser $ do
skipCommentsWhitespace
Mid.EmptyElementTag name attributes <- Mid.runTokenParser Mid.tokenEmptyElementTag
a <- processName name
b <- processAttributes a attributes
case parseContent b of
NoContent f -> f
AnyContent f -> f mempty
WithContent parser -> unexpected "Expected non-empty tag"
processName name = runNameParser parseName name
& either unexpected return
processAttributes state attributes = runAttrParser (parseAttributes state) (normalizeAttributes entityDecoder attributes)
& either unexpected return
-- | Simplified version of 'tag':
--
-- - no state forwarding between name, attributes and content parsers
-- - uses @decodePredefinedEntities <> decodeHtmlEntities@ to expand entity references
tag' :: CharParsing m => Monad m
=> NameParser a -- ^ How to parse tag name
-> AttrParser b -- ^ How to parse tag attributes
-> ContentParser m c -- ^ How to parse tag content
-> TokenParser m c
tag' parseName parseAttributes parseBody = tag decodeStandardEntities parseName (const parseAttributes) (const parseBody)
-- | Parse a tag with any name, any attributes and any content.
--
-- >>> parseOnly (runTokenParser anyTag) "body"
-- Right ()
-- >>> parseOnly (runTokenParser anyTag) ""
-- Right ()
-- >>> parseOnly (runTokenParser anyTag) ""
-- Right ()
-- >>> parseOnly (runTokenParser anyTag) "body"
-- Right ()
anyTag :: CharParsing m => Monad m => TokenParser m ()
anyTag = tag' anyName anyAttr anyContent
-- | Parse any 'Token'.
anyToken :: CharParsing m => Monad m => EntityDecoder -> TokenParser m Token
anyToken entityDecoder = (TokenProlog <$> prolog)
<|> (TokenInstruction <$> instruction)
<|> tokenTag
<|> (TokenTextContent <$> textContent entityDecoder)
where tokenTag = tag entityDecoder anyName (\name -> (name,) <$> forwardAttrs) $ \(name, attributes) ->
TokenTag name attributes <$> AnyContent pure
forwardAttrs = AttrParser Right
-- | Same as @anyToken (decodePredefinedEntities <> decodeHtmlEntities)@, provided for convenience.
anyToken' :: CharParsing m => Monad m => TokenParser m Token
anyToken' = anyToken decodeStandardEntities
-- * Private functions
skipComments :: CharParsing m => Monad m => m ()
skipComments = void $ many $ Mid.runTokenParser Mid.tokenComment
skipCommentsWhitespace :: CharParsing m => Monad m => m ()
skipCommentsWhitespace = void $ many $ void (Mid.runTokenParser Mid.tokenComment) <|> void tokenWhitespace
decodeStandardEntities :: EntityDecoder
decodeStandardEntities = decodePredefinedEntities <> decodeHtmlEntities