module Text.XML.Expat.SAX (
Encoding(..),
XMLParseError(..),
XMLParseLocation(..),
ParseOptions(..),
SAXEvent(..),
parse,
parseG,
parseLocations,
parseLocationsG,
parseLocationsThrowing,
parseThrowing,
defaultParseOptions,
XMLParseException(..),
GenericXMLString(..)
) where
import Control.Concurrent.MVar
import Control.Exception as Exc
import Text.XML.Expat.Internal.IO
import Data.Bits
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Internal as I
import Data.Int
import Data.ByteString.Internal (c2w, w2c, c_strlen)
import qualified Data.Monoid as M
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Codec.Binary.UTF8.String as U8
import Data.List.Class (List(..), ListItem(..), cons, fromList, mapL)
import Data.Typeable
import Data.Word
import Control.Applicative
import Control.DeepSeq
import Control.Monad
import System.IO.Unsafe
import Foreign.C
import Foreign.ForeignPtr
import Foreign.Marshal.Array
import Foreign.Ptr
import Foreign.Storable
data ParseOptions tag text = ParseOptions
{ overrideEncoding :: Maybe Encoding
, entityDecoder :: Maybe (tag -> Maybe text)
}
defaultParseOptions :: ParseOptions tag text
defaultParseOptions = ParseOptions Nothing Nothing
class (M.Monoid s, Eq s) => GenericXMLString s where
gxNullString :: s -> Bool
gxToString :: s -> String
gxFromString :: String -> s
gxFromChar :: Char -> s
gxHead :: s -> Char
gxTail :: s -> s
gxBreakOn :: Char -> s -> (s, s)
gxFromByteString :: B.ByteString -> s
gxToByteString :: s -> B.ByteString
instance GenericXMLString String where
gxNullString = null
gxToString = id
gxFromString = id
gxFromChar c = [c]
gxHead = head
gxTail = tail
gxBreakOn c = break (==c)
gxFromByteString = U8.decode . B.unpack
gxToByteString = B.pack . map c2w . U8.encodeString
instance GenericXMLString B.ByteString where
gxNullString = B.null
gxToString = U8.decodeString . map w2c . B.unpack
gxFromString = B.pack . map c2w . U8.encodeString
gxFromChar = B.singleton . c2w
gxHead = w2c . B.head
gxTail = B.tail
gxBreakOn c = B.break (== c2w c)
gxFromByteString = id
gxToByteString = id
instance GenericXMLString T.Text where
gxNullString = T.null
gxToString = T.unpack
gxFromString = T.pack
gxFromChar = T.singleton
gxHead = T.head
gxTail = T.tail
#if MIN_VERSION_text(0,11,0)
gxBreakOn c = T.break (==c)
#elif MIN_VERSION_text(0,10,0)
gxBreakOn c t = (T.takeWhile (/=c) t, T.dropWhile (/=c) t)
#else
gxBreakOn c = T.breakBy (==c)
#endif
gxFromByteString = TE.decodeUtf8
gxToByteString = TE.encodeUtf8
data SAXEvent tag text =
XMLDeclaration text (Maybe text) (Maybe Bool) |
StartElement tag [(tag, text)] |
EndElement tag |
CharacterData text |
StartCData |
EndCData |
ProcessingInstruction text text |
Comment text |
FailDocument XMLParseError
deriving (Eq, Show)
instance (NFData tag, NFData text) => NFData (SAXEvent tag text) where
rnf (XMLDeclaration ver mEnc mSD) = rnf ver `seq` rnf mEnc `seq` rnf mSD
rnf (StartElement tag atts) = rnf tag `seq` rnf atts
rnf (EndElement tag) = rnf tag
rnf (CharacterData text) = rnf text
rnf StartCData = ()
rnf EndCData = ()
rnf (ProcessingInstruction target text) = rnf target `seq` rnf text
rnf (Comment text) = rnf text
rnf (FailDocument err) = rnf err
parseG :: forall tag text l . (GenericXMLString tag, GenericXMLString text, List l) =>
ParseOptions tag text
-> l ByteString
-> l (SAXEvent tag text)
parseG opts inputBlocks = mapL (return . fst) $ parseImpl opts inputBlocks False noExtra failureA
where noExtra _ offset = return ((), offset)
failureA _ = return ()
parseLocationsG :: forall tag text l . (GenericXMLString tag, GenericXMLString text, List l) =>
ParseOptions tag text
-> l ByteString
-> l (SAXEvent tag text, XMLParseLocation)
parseLocationsG opts inputBlocks = parseImpl opts inputBlocks True fetchLocation id
where
fetchLocation pBuf offset = do
[a, b, c, d] <- peekArray 4 (pBuf `plusPtr` offset :: Ptr Int64)
return (XMLParseLocation a b c d, offset + 32)
parseImpl :: forall a tag text l . (GenericXMLString tag, GenericXMLString text, List l) =>
ParseOptions tag text
-> l ByteString
-> Bool
-> (Ptr Word8 -> Int -> IO (a, Int))
-> (IO XMLParseLocation -> IO a)
-> l (SAXEvent tag text, a)
parseImpl opts inputBlocks addLocations extra failureA = runParser inputBlocks parse cacheRef
where
(parse, getLocation, cacheRef) = unsafePerformIO $ do
(parse, getLocation) <- hexpatNewParser
(overrideEncoding opts)
((\decode -> fmap gxToByteString . decode . gxFromByteString) <$> entityDecoder opts)
addLocations
cacheRef <- newMVar Nothing
return (parse, getLocation, cacheRef)
runParser iblks parse cacheRef = joinL $ do
li <- runList iblks
return $ unsafePerformIO $ do
mCached <- takeMVar cacheRef
case mCached of
Just l -> do
putMVar cacheRef mCached
return l
Nothing -> do
(saxen, rema) <- case li of
Nil -> do
(buf, len, mError) <- parse B.empty True
saxen <- parseBuf buf len extra
rema <- handleFailure mError mzero
return (saxen, rema)
Cons blk t -> do
(buf, len, mError) <- parse blk False
saxen <- parseBuf buf len extra
cacheRef' <- newMVar Nothing
rema <- handleFailure mError (runParser t parse cacheRef')
return (saxen, rema)
let l = fromList saxen `mplus` rema
putMVar cacheRef (Just l)
return l
where
handleFailure (Just err) _ = do a <- failureA getLocation
return $ (FailDocument err, a) `cons` mzero
handleFailure Nothing l = return l
parseBuf :: (GenericXMLString tag, GenericXMLString text) =>
ForeignPtr Word8 -> CInt -> (Ptr Word8 -> Int -> IO (a, Int)) -> IO [(SAXEvent tag text, a)]
parseBuf buf _ processExtra = withForeignPtr buf $ \pBuf -> doit [] pBuf 0
where
roundUp32 offset = (offset + 3) .&. complement 3
doit acc pBuf offset0 = offset0 `seq` do
typ <- peek (pBuf `plusPtr` offset0 :: Ptr Word32)
(a, offset) <- processExtra pBuf (offset0 + 4)
case typ of
0 -> return (reverse acc)
1 -> do
nAtts <- peek (pBuf `plusPtr` offset :: Ptr Word32)
let pName = pBuf `plusPtr` (offset + 4)
lName <- fromIntegral <$> c_strlen pName
let name = gxFromByteString $ I.fromForeignPtr buf (offset + 4) lName
(atts, offset') <- foldM (\(atts, offset) _ -> do
let pAtt = pBuf `plusPtr` offset
lAtt <- fromIntegral <$> c_strlen pAtt
let att = gxFromByteString $ I.fromForeignPtr buf offset lAtt
offset' = offset + lAtt + 1
pValue = pBuf `plusPtr` offset'
lValue <- fromIntegral <$> c_strlen pValue
let value = gxFromByteString $ I.fromForeignPtr buf offset' lValue
return ((att, value):atts, offset' + lValue + 1)
) ([], offset + 4 + lName + 1) [1,3..nAtts]
doit ((StartElement name (reverse atts), a) : acc) pBuf (roundUp32 offset')
2 -> do
let pName = pBuf `plusPtr` offset
lName <- fromIntegral <$> c_strlen pName
let name = gxFromByteString $ I.fromForeignPtr buf offset lName
offset' = offset + lName + 1
doit ((EndElement name, a) : acc) pBuf (roundUp32 offset')
3 -> do
len <- fromIntegral <$> peek (pBuf `plusPtr` offset :: Ptr Word32)
let text = gxFromByteString $ I.fromForeignPtr buf (offset + 4) len
offset' = offset + 4 + len
doit ((CharacterData text, a) : acc) pBuf (roundUp32 offset')
4 -> do
let pEnc = pBuf `plusPtr` offset
lEnc <- fromIntegral <$> c_strlen pEnc
let enc = gxFromByteString $ I.fromForeignPtr buf offset lEnc
offset' = offset + lEnc + 1
pVer = pBuf `plusPtr` offset'
pVerFirst <- peek (castPtr pVer :: Ptr Word8)
(mVer, offset'') <- case pVerFirst of
0 -> return (Nothing, offset' + 1)
1 -> do
lVer <- fromIntegral <$> c_strlen (pVer `plusPtr` 1)
return (Just $ gxFromByteString $ I.fromForeignPtr buf (offset' + 1) lVer, offset' + 1 + lVer + 1)
_ -> error "hexpat: bad data from C land"
cSta <- peek (pBuf `plusPtr` offset'' :: Ptr Int8)
let sta = if cSta < 0 then Nothing else
if cSta == 0 then Just False else
Just True
doit ((XMLDeclaration enc mVer sta, a) : acc) pBuf (roundUp32 (offset'' + 1))
5 -> doit ((StartCData, a) : acc) pBuf offset
6 -> doit ((EndCData, a) : acc) pBuf offset
7 -> do
let pTarget = pBuf `plusPtr` offset
lTarget <- fromIntegral <$> c_strlen pTarget
let target = gxFromByteString $ I.fromForeignPtr buf offset lTarget
offset' = offset + lTarget + 1
pData = pBuf `plusPtr` offset'
lData <- fromIntegral <$> c_strlen pData
let dat = gxFromByteString $ I.fromForeignPtr buf offset' lData
doit ((ProcessingInstruction target dat, a) : acc) pBuf (roundUp32 (offset' + lData + 1))
8 -> do
let pText = pBuf `plusPtr` offset
lText <- fromIntegral <$> c_strlen pText
let text = gxFromByteString $ I.fromForeignPtr buf offset lText
doit ((Comment text, a) : acc) pBuf (roundUp32 (offset + lText + 1))
_ -> error "hexpat: bad data from C land"
parse :: (GenericXMLString tag, GenericXMLString text) =>
ParseOptions tag text
-> L.ByteString
-> [SAXEvent tag text]
parse opts input = parseG opts (L.toChunks input)
data XMLParseException = XMLParseException XMLParseError
deriving (Eq, Show, Typeable)
instance Exception XMLParseException where
parseLocations :: (GenericXMLString tag, GenericXMLString text) =>
ParseOptions tag text
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseLocations opts input = parseLocationsG opts (L.toChunks input)
parseThrowing :: (GenericXMLString tag, GenericXMLString text) =>
ParseOptions tag text
-> L.ByteString
-> [SAXEvent tag text]
parseThrowing opts bs = map freakOut $ parse opts bs
where
freakOut (FailDocument err) = Exc.throw $ XMLParseException err
freakOut other = other
parseLocationsThrowing :: (GenericXMLString tag, GenericXMLString text) =>
ParseOptions tag text
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseLocationsThrowing opts bs = map freakOut $ parseLocations opts bs
where
freakOut (FailDocument err, _) = Exc.throw $ XMLParseException err
freakOut other = other