{-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE TypeFamilies #-} -- | High-level XML parsers, built on top of "Data.XML.Parser.Mid": -- -- - entity references are expanded -- - CDATAs are abstracted away -- - comments are ignored -- - whitespace between tokens is ignored -- - duplicate attributes are ignored -- -- All documentation examples assume the following setup: -- -- > :set -XOverloadedStrings -- > import Data.Attoparsec.ByteString module Data.XML.Parser.High ( module Data.XML.Parser.High.AttrParser , module Data.XML.Parser.High.NameParser , Prolog(..) , Token(..) , TokenParser() , ContentParser() , noContent , withContent , anyContent , runTokenParser , prolog , instruction , textContent , textContent' , tag , tag' , anyTag , anyToken , anyToken' ) where import Control.Applicative import Control.Arrow import Control.Monad.Compat import Control.Monad.Fail.Compat import Data.Function import Data.Map (Map) import qualified Data.Map as Map import Data.Maybe import Data.String import Data.Text (Text) import qualified Data.Text as Text import Data.XML.Parser.High.AttrParser import Data.XML.Parser.High.NameParser import Data.XML.Parser.Low import qualified Data.XML.Parser.Mid as Mid import Data.XML.Parser.Mid.Attribute import Prelude () import Prelude.Compat import Text.Parser.Char import Text.Parser.Combinators import Text.ParserCombinators.ReadP (readP_to_S) -- $setup -- >>> :set -XOverloadedStrings -- >>> import Data.Attoparsec.ByteString -- | XML document prolog. data Prolog = Prolog { prologXmlDeclaration :: Maybe Mid.XMLDeclaration , prologInstructions :: [Mid.Instruction] , prologDoctype :: Maybe Mid.Doctype } deriving (Eq, Ord, Read, Show) data Token = TokenProlog Prolog | TokenInstruction Mid.Instruction | TokenTag QName (Map QName Text) [Token] | TokenTextContent Text deriving (Eq, Ord, Read, Show) -- | A parser that consumes whole 'Token's. newtype TokenParser m a = TokenParser { runTokenParser :: m a } deriving instance Functor m => Functor (TokenParser m) deriving instance Applicative m => Applicative (TokenParser m) deriving instance Alternative m => Alternative (TokenParser m) deriving instance Monad m => Monad (TokenParser m) instance (Parsing m, Monad m) => MonadFail (TokenParser m) where fail = TokenParser . unexpected -- | How to parse tag content. data ContentParser m a = NoContent (m a) | AnyContent ([Token] -> m a) | WithContent (TokenParser m a) deriving instance Functor m => Functor (ContentParser m) -- | Assert that content is not empty, and parse it using given token parser. -- -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) "body" -- Right "body" -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) "" -- Left ... -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) "" -- Left ... -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent $ pure ())) "" -- Left ... withContent :: TokenParser m a -> ContentParser m a withContent = WithContent -- | Assert that content is empty. -- -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr noContent) "body" -- Left ... -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr noContent) "" -- Right () -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr noContent) "" -- Right () noContent :: Applicative m => ContentParser m () noContent = NoContent $ pure () -- | Accept any content, including empty content. -- -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr anyContent) "body" -- Right () -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr anyContent) "" -- Right () -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr anyContent) "" -- Right () anyContent :: CharParsing m => Monad m => ContentParser m () anyContent = AnyContent $ const $ pure () -- | Parse a processing instruction. -- -- -- -- >>> parseOnly (runTokenParser instruction) "" -- Right (Instruction "php" "echo 'Hello World!'; ") -- >>> parseOnly (runTokenParser instruction) " " -- Right (Instruction "php" "echo 'Hello World!'; ") instruction :: CharParsing m => Monad m => TokenParser m Mid.Instruction instruction = TokenParser $ do skipCommentsWhitespace Mid.runTokenParser Mid.tokenInstruction -- | -- -- >>> parseOnly (runTokenParser prolog) "" -- Right (Prolog {prologXmlDeclaration = Just (XMLDeclaration "1.0" ...), prologInstructions = [], prologDoctype = Just (Doctype "greeting" ...)}) -- >>> parseOnly (runTokenParser prolog) " " -- Right (Prolog {prologXmlDeclaration = Just (XMLDeclaration "1.0" ...), prologInstructions = [], prologDoctype = Just (Doctype "greeting" ...)}) prolog :: CharParsing m => Monad m => TokenParser m Prolog prolog = TokenParser $ do xmlDeclaration <- optional $ Mid.runTokenParser Mid.tokenXmlDeclaration skipCommentsWhitespace instructions <- runTokenParser $ many instruction doctype <- optional $ do skipCommentsWhitespace Mid.runTokenParser Mid.tokenDoctype when (isNothing xmlDeclaration && null instructions && isNothing doctype) $ unexpected "Expected XML prolog" return $ Prolog xmlDeclaration instructions doctype -- | Parse textual content of a tag, including CDATA. textContent :: CharParsing m => Monad m => EntityDecoder -> TokenParser m Text textContent entityDecoder = TokenParser $ mconcat <$> do skipComments (textualData <|> Mid.runTokenParser Mid.tokenCdata) `sepBy1` Mid.runTokenParser Mid.tokenComment where textualData = expandContents entityDecoder =<< Mid.runTokenParser Mid.tokenData -- | Same as @textContent (decodePredefinedEntities <> decodeHtmlEntities)@, provided for convenience. -- -- >>> parseOnly (runTokenParser $ tag' anyName anyAttr (withContent textContent')) "bodyinnerbody]]>" -- Right "bodyinnerbody" textContent' :: CharParsing m => Monad m => TokenParser m Text textContent' = textContent decodeStandardEntities normalizeAttributes :: EntityDecoder -> [Attribute] -> Map QName Text normalizeAttributes entityDecoder attributes = Map.fromList $ do Attribute name contents <- attributes value <- maybeToList $ expandContents entityDecoder contents return (name, value) -- | Generic tag parser. tag :: CharParsing m => Monad m => EntityDecoder -- ^ How to expand entity references -> NameParser a -- ^ How to parse tag name -> (a -> AttrParser b) -- ^ How to parse tag attributes -> (b -> ContentParser m c) -- ^ How to parse tag content -> TokenParser m c tag entityDecoder parseName parseAttributes parseContent = parseStartToEnd <|> parseEmptyElement where parseStartToEnd = TokenParser $ do skipCommentsWhitespace Mid.StartTag name attributes <- Mid.runTokenParser Mid.tokenStartTag a <- processName name b <- processAttributes a attributes c <- case parseContent b of NoContent f -> f AnyContent f -> f =<< runTokenParser (many $ anyToken entityDecoder) WithContent parser -> runTokenParser parser skipCommentsWhitespace Mid.runTokenParser $ do name' <- Mid.tokenEndTag when (name /= name') $ fail "Invalid end tag name" return c parseEmptyElement = TokenParser $ do skipCommentsWhitespace Mid.EmptyElementTag name attributes <- Mid.runTokenParser Mid.tokenEmptyElementTag a <- processName name b <- processAttributes a attributes case parseContent b of NoContent f -> f AnyContent f -> f mempty WithContent parser -> unexpected "Expected non-empty tag" processName name = runNameParser parseName name & either unexpected return processAttributes state attributes = runAttrParser (parseAttributes state) (normalizeAttributes entityDecoder attributes) & either unexpected return -- | Simplified version of 'tag': -- -- - no state forwarding between name, attributes and content parsers -- - uses @decodePredefinedEntities <> decodeHtmlEntities@ to expand entity references tag' :: CharParsing m => Monad m => NameParser a -- ^ How to parse tag name -> AttrParser b -- ^ How to parse tag attributes -> ContentParser m c -- ^ How to parse tag content -> TokenParser m c tag' parseName parseAttributes parseBody = tag decodeStandardEntities parseName (const parseAttributes) (const parseBody) -- | Parse a tag with any name, any attributes and any content. -- -- >>> parseOnly (runTokenParser anyTag) "body" -- Right () -- >>> parseOnly (runTokenParser anyTag) "" -- Right () -- >>> parseOnly (runTokenParser anyTag) "" -- Right () -- >>> parseOnly (runTokenParser anyTag) "body" -- Right () anyTag :: CharParsing m => Monad m => TokenParser m () anyTag = tag' anyName anyAttr anyContent -- | Parse any 'Token'. anyToken :: CharParsing m => Monad m => EntityDecoder -> TokenParser m Token anyToken entityDecoder = (TokenProlog <$> prolog) <|> (TokenInstruction <$> instruction) <|> tokenTag <|> (TokenTextContent <$> textContent entityDecoder) where tokenTag = tag entityDecoder anyName (\name -> (name,) <$> forwardAttrs) $ \(name, attributes) -> TokenTag name attributes <$> AnyContent pure forwardAttrs = AttrParser Right -- | Same as @anyToken (decodePredefinedEntities <> decodeHtmlEntities)@, provided for convenience. anyToken' :: CharParsing m => Monad m => TokenParser m Token anyToken' = anyToken decodeStandardEntities -- * Private functions skipComments :: CharParsing m => Monad m => m () skipComments = void $ many $ Mid.runTokenParser Mid.tokenComment skipCommentsWhitespace :: CharParsing m => Monad m => m () skipCommentsWhitespace = void $ many $ void (Mid.runTokenParser Mid.tokenComment) <|> void tokenWhitespace decodeStandardEntities :: EntityDecoder decodeStandardEntities = decodePredefinedEntities <> decodeHtmlEntities