{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE RankNTypes #-}
module Data.Warc.Header
(
header
, encodeHeader
, Version(..)
, warc0_16
, RecordHeader(..)
, WarcType(..)
, RecordId(..)
, TruncationReason(..)
, Digest(..)
, Uri(..)
, Field(..)
, FieldName(..)
, field
, lookupField
, addField
, mapField
, rawField
, warcRecordId
, contentLength
, warcDate
, warcType
, contentType
, warcConcurrentTo
, warcBlockDigest
, warcPayloadDigest
, warcIpAddress
, warcRefersTo
, warcTargetUri
, warcTruncated
, warcWarcinfoID
, warcFilename
, warcProfile
, warcSegmentNumber
, warcSegmentTotalLength
, recWarcVersion, recHeaders
) where
import Control.Applicative
import Control.Monad (void, guard)
import Data.Maybe (catMaybes)
import Data.Monoid ((<>))
import Data.Time.Clock
import Data.Time.Format
import Data.Char (ord)
import Data.String (IsString)
import Data.Attoparsec.ByteString.Char8 as A
import qualified Data.Attoparsec.ByteString.Lazy as AL
import qualified Data.HashMap.Strict as HM
import Data.Hashable (Hashable(..))
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy as BSL
import qualified Data.ByteString.Lazy.Builder as BB
import Control.Lens
withName :: String -> Parser a -> Parser a
withName name parser = parser <?> name
data Version = Version {versionMajor, versionMinor :: !Int}
deriving (Show, Read, Eq, Ord)
warc0_16 :: Version
warc0_16 = Version 0 16
version :: Parser Version
version = withName "version" $ do
"WARC/"
major <- decimal
char '.'
minor <- decimal
return (Version major minor)
newtype FieldName = FieldName {getFieldName :: Text}
deriving (Show, Read, IsString)
instance Hashable FieldName where
hashWithSalt salt (FieldName t) = hashWithSalt salt (T.toCaseFold t)
instance Eq FieldName where
FieldName a == FieldName b = T.toCaseFold a == T.toCaseFold b
instance Ord FieldName where
FieldName a `compare` FieldName b = T.toCaseFold a `compare` T.toCaseFold b
separators :: String
separators = "()<>@,;:\\\"/[]?={}"
crlf :: Parser ()
crlf = void $ string "\r\n"
token :: Parser ByteString
token = takeTill (inClass $ separators++" \t\n\r")
utf8Token :: Parser Text
utf8Token = TE.decodeUtf8 <$> token
ord' = fromIntegral . ord
text :: Parser Text
text = do
let content :: TL.Text -> Parser TL.Text
content accum = do
satisfy (isHorizontalSpace . ord')
c <- takeTill (isEndOfLine . ord')
endOfLine
continuation (accum <> TL.fromStrict (TE.decodeUtf8 c))
continuation :: TL.Text -> Parser TL.Text
continuation accum = content accum <|> return accum
firstLine <- takeTill (isEndOfLine . ord')
endOfLine
TL.toStrict <$> continuation (TL.fromStrict $ TE.decodeUtf8 firstLine)
quotedString :: Parser Text
quotedString = do
char '"'
c <- TE.decodeUtf8 <$> takeTill (== '"')
char '"'
return c
data WarcType = WarcInfo
| Response
| Resource
| Request
| Metadata
| Revisit
| Conversion
| Continuation
| FutureType !Text
deriving (Show, Read, Ord, Eq)
parseWarcType :: Parser WarcType
parseWarcType = choice
[ "warcinfo" *> pure WarcInfo
, "response" *> pure Response
, "resource" *> pure Resource
, "request" *> pure Request
, "metadata" *> pure Metadata
, "revisit" *> pure Revisit
, "conversion" *> pure Conversion
, "continuation" *> pure Continuation
, FutureType <$> utf8Token
]
encodeText :: T.Text -> BB.Builder
encodeText = BB.byteString . TE.encodeUtf8
encodeWarcType :: WarcType -> BB.Builder
encodeWarcType WarcInfo = "warcinfo"
encodeWarcType Response = "response"
encodeWarcType Resource = "resource"
encodeWarcType Request = "request"
encodeWarcType Metadata = "metadata"
encodeWarcType Revisit = "revisit"
encodeWarcType Conversion = "conversion"
encodeWarcType Continuation = "continuation"
encodeWarcType (FutureType t) = encodeText t
newtype Uri = Uri ByteString
deriving (Show, Read, Eq, Ord)
uri :: Parser Uri
uri = do
char '<'
s <- takeTill (== '>')
char '>'
return $ Uri s
laxUri :: Parser Uri
laxUri = Uri <$> takeTill (isEndOfLine . ord')
encodeUri :: Uri -> BB.Builder
encodeUri (Uri b) = BB.char7 '<' <> BB.byteString b <> BB.char7 '>'
newtype RecordId = RecordId Uri
deriving (Show, Read, Eq, Ord)
recordId :: Parser RecordId
recordId = RecordId <$> uri
encodeRecordId :: RecordId -> BB.Builder
encodeRecordId (RecordId r) = encodeUri r
data TruncationReason = TruncLength
| TruncTime
| TruncDisconnect
| TruncUnspecified
| TruncOther !Text
deriving (Show, Read, Ord, Eq)
truncationReason :: Parser TruncationReason
truncationReason = choice
[ "length" *> pure TruncLength
, "time" *> pure TruncTime
, "disconnect" *> pure TruncDisconnect
, "unspecified" *> pure TruncUnspecified
, TruncOther <$> utf8Token
]
encodeTruncationReason :: TruncationReason -> BB.Builder
encodeTruncationReason TruncLength = "length"
encodeTruncationReason TruncTime = "time"
encodeTruncationReason TruncDisconnect = "disconnect"
encodeTruncationReason TruncUnspecified = "unspecified"
encodeTruncationReason (TruncOther o) = encodeText o
data Digest = Digest { digestAlgorithm, digestHash :: !ByteString }
deriving (Show, Read, Eq, Ord)
digest :: Parser Digest
digest = do
algo <- token <* char ':'
hash <- token
return $ Digest algo hash
encodeDigest :: Digest -> BB.Builder
encodeDigest (Digest algo hash) =
BB.byteString algo <> ":" <> BB.byteString hash
date :: Parser UTCTime
date = do
s <- takeTill isSpace
parseTimeM False defaultTimeLocale dateFormat (BS.unpack s)
encodeDate :: UTCTime -> BB.Builder
encodeDate = BB.string7 . formatTime defaultTimeLocale dateFormat
dateFormat = iso8601DateFormat (Just "%H:%M:%SZ")
warcField :: Parser (FieldName, BSL.ByteString)
warcField = withName "field" $ do
peekChar' >>= guard . not . isSpace
fieldName <- FieldName . TE.decodeUtf8 <$> A.takeTill (== ':')
char ':'
skipSpace
v0 <- takeLine
endOfLine
let continuation :: BB.Builder -> Parser BB.Builder
continuation v = do
c <- peekChar
case c of
Just c' | isHorizontalSpace (fromIntegral $ ord c') -> do
v' <- takeLine
endOfLine
continuation (v <> BB.byteString v')
_ -> return v
v1 <- continuation (BB.byteString v0)
return (fieldName, BB.toLazyByteString v1)
takeLine :: Parser BS.ByteString
takeLine = A.takeTill (isEndOfLine . ord')
data RecordHeader = RecordHeader { _recWarcVersion :: Version
, _recHeaders :: HM.HashMap FieldName BSL.ByteString
}
deriving (Show)
makeLenses ''RecordHeader
field :: Field a -> Traversal' RecordHeader a
field fld = recHeaders . ix (fieldName fld) . parsedField fld
parsedField :: Field a -> Prism' BSL.ByteString a
parsedField fld = prism' to from
where
from bs = case AL.parse (decode fld) bs of
AL.Fail _ _ _ -> Nothing
AL.Done _ x -> Just x
to = BB.toLazyByteString . encode fld
addField :: Field a -> a -> RecordHeader -> RecordHeader
addField fld v =
recHeaders . at (fieldName fld) .~ Just (BB.toLazyByteString $ encode fld v)
header :: Parser RecordHeader
header = withName "header" $ do
skipSpace
ver <- version <* endOfLine
fields <- fmap HM.fromList <$> withName "fields" $ many $ warcField
endOfLine
return $ RecordHeader ver fields
encodeHeader :: RecordHeader -> BB.Builder
encodeHeader (RecordHeader (Version maj min) flds) =
"WARC/"<>BB.intDec maj<>"."<>BB.intDec min <> "\r\n"
<> foldMap field (HM.toList flds)
<> "\r\n"
where field :: (FieldName, BSL.ByteString) -> BB.Builder
field (FieldName fname, value) =
TE.encodeUtf8Builder fname <> ": " <> BB.lazyByteString value <> "\r\n"
lookupField :: RecordHeader -> Field a -> Maybe (Either String a)
lookupField (RecordHeader {_recHeaders=headers}) fld
| Just v <- HM.lookup (fieldName fld) headers
= case AL.parse (decode fld) v of
AL.Fail _ _ err -> Just $ Left err
AL.Done _ x -> Just $ Right x
| otherwise
= Nothing
data Field a = Field { fieldName :: FieldName
, encode :: a -> BB.Builder
, decode :: Parser a
}
mapField :: (a -> b) -> (b -> a) -> Field a -> Field b
mapField f g (Field fieldName encode decode) =
Field fieldName (encode . g) (f <$> decode)
warcRecordId :: Field RecordId
warcRecordId = Field "WARC-Record-ID" encodeRecordId recordId
contentLength :: Field Integer
contentLength = Field "Content-Length" BB.integerDec decimal
warcDate :: Field UTCTime
warcDate = Field "WARC-Date" encodeDate date
warcType :: Field WarcType
warcType = Field "WARC-Type" encodeWarcType parseWarcType
contentType :: Field BS.ByteString
contentType = Field "Content-Type" BB.byteString (takeTill (isEndOfLine . ord'))
warcConcurrentTo :: Field RecordId
warcConcurrentTo = Field "WARC-Concurrent-To" encodeRecordId recordId
warcBlockDigest :: Field Digest
warcBlockDigest = Field "WARC-Block-Digest" encodeDigest digest
warcPayloadDigest :: Field Digest
warcPayloadDigest = Field "WARC-Payload-Digest" encodeDigest digest
warcIpAddress :: Field BS.ByteString
warcIpAddress = Field "WARC-IP-Address" BB.byteString (takeTill (isEndOfLine . ord'))
warcRefersTo :: Field Uri
warcRefersTo = Field "WARC-Refers-To" encodeUri uri
warcTargetUri :: Field Uri
warcTargetUri = Field "WARC-Target-URI" encodeUri laxUri
warcTruncated :: Field TruncationReason
warcTruncated = Field "WARC-Truncated" encodeTruncationReason truncationReason
warcWarcinfoID :: Field RecordId
warcWarcinfoID = Field "WARC-Warcinfo-ID" encodeRecordId recordId
warcFilename :: Field T.Text
warcFilename = Field "WARC-Filename"(quoted . encodeText) (text <|> quotedString)
warcProfile :: Field Uri
warcProfile = Field "WARC-Profile" encodeUri uri
warcSegmentNumber :: Field Integer
warcSegmentNumber = Field "WARC-Segment-Number" BB.integerDec decimal
warcSegmentTotalLength :: Field Integer
warcSegmentTotalLength = Field "WARC-Segment-Total-Length" BB.integerDec decimal
rawField :: FieldName -> Field BSL.ByteString
rawField fname = Field fname BB.lazyByteString takeLazyByteString
quoted x = q <> x <> q
where q = BB.char7 '"'