module Data.Warc
( Warc(..)
, Record(..)
, parseWarc
, iterRecords
, produceRecords
, encodeRecord
, module Data.Warc.Header
) where
import Data.Char (ord)
import Pipes hiding (each)
import qualified Pipes.ByteString as PBS
import Control.Lens
import qualified Pipes.Attoparsec as PA
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy.Builder as BB
import Data.ByteString (ByteString)
import Control.Monad (join)
import Control.Monad.Trans.Free
import Control.Monad.Trans.State.Strict
import Data.Warc.Header
data Record m r = Record { recHeader :: RecordHeader
, recContent :: Producer BS.ByteString m r
}
instance Monad m => Functor (Record m) where
fmap f (Record hdr r) = Record hdr (fmap f r)
type Warc m a = FreeT (Record m) m (Producer BS.ByteString m a)
parseWarc :: (Functor m, Monad m)
=> Producer ByteString m a
-> Warc m a
parseWarc = loop
where
loop upstream = FreeT $ do
(hdr, rest) <- runStateT (PA.parse header) upstream
go hdr rest
go mhdr rest
| Nothing <- mhdr = return $ Pure rest
| Just (Left err) <- mhdr = error $ show err
| Just (Right hdr) <- mhdr
, Just (Right len) <- lookupField hdr contentLength = do
let produceBody = fmap consumeWhitespace . view (PBS.splitAt len)
consumeWhitespace = PBS.dropWhile isEOL
isEOL c = c == ord8 '\r' || c == ord8 '\n'
ord8 = fromIntegral . ord
return $ Free $ Record hdr $ fmap loop $ produceBody rest
iterRecords :: forall m a. Monad m
=> (forall b. Record m b -> m b)
-> Warc m a
-> m (Producer BS.ByteString m a)
iterRecords f warc = iterT iter warc
where
iter :: Record m (m (Producer BS.ByteString m a))
-> m (Producer BS.ByteString m a)
iter r = join $ f r
produceRecords :: forall m o a. Monad m
=> (forall b. RecordHeader -> Producer BS.ByteString m b
-> Producer o m b)
-> Warc m a
-> Producer o m (Producer BS.ByteString m a)
produceRecords f warc = iterTM iter warc
where
iter :: Record m (Producer o m (Producer BS.ByteString m a))
-> Producer o m (Producer BS.ByteString m a)
iter (Record hdr body) = join $ f hdr body
encodeRecord :: Monad m => Record m a -> Producer BS.ByteString m a
encodeRecord (Record hdr content) = do
PBS.fromLazy $ BB.toLazyByteString $ encodeHeader hdr
content