module Data.Warc.Header
(
header
, encodeHeader
, RecordHeader(..)
, Version(..)
, WarcType(..)
, RecordId(..)
, TruncationReason(..)
, Digest(..)
, Uri(..)
, Field(..)
, _WarcRecordId
, _ContentLength
, _WarcDate
, _WarcType
, _ContentType
, _WarcConcurrentTo
, _WarcBlockDigest
, _WarcPayloadDigest
, _WarcIpAddress
, _WarcRefersTo
, _WarcTargetUri
, _WarcTruncated
, _WarcWarcinfoId
, _WarcFilename
, _WarcProfile
, _WarcIdentifiedPayloadType
, _WarcSegmentNumber
, _WarcSegmentOriginId
, _WarcSegmentTotalLength
, recWarcVersion, recHeaders
) where
import Control.Applicative
import Control.Monad (void)
import Data.Maybe (catMaybes)
import Data.Monoid ((<>))
import Data.Time.Clock
import Data.Time.Format
import Data.Char (ord)
import Data.Attoparsec.ByteString.Char8
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy.Builder as BB
import Control.Lens
withName :: String -> Parser a -> Parser a
withName name parser = parser <?> name
data Version = Version {versionMajor, versionMinor :: !Int}
deriving (Show, Read, Eq, Ord)
version :: Parser Version
version = withName "version" $ do
"WARC/"
major <- decimal
char '.'
minor <- decimal
return (Version major minor)
newtype FieldName = FieldName {getFieldName :: Text}
deriving (Show, Read)
instance Eq FieldName where
FieldName a == FieldName b = T.toCaseFold a == T.toCaseFold b
instance Ord FieldName where
FieldName a `compare` FieldName b = T.toCaseFold a `compare` T.toCaseFold b
separators :: String
separators = "()<>@,;:\\\"/[]?={}"
crlf :: Parser ()
crlf = void $ string "\r\n"
token :: Parser ByteString
token = takeTill (inClass $ separators++" \t\n\r")
utf8Token :: Parser Text
utf8Token = TE.decodeUtf8 <$> token
fieldName :: Parser FieldName
fieldName = FieldName . TE.decodeUtf8 <$> token
ord' = fromIntegral . ord
text :: Parser Text
text = do
let content :: TL.Text -> Parser TL.Text
content accum = do
satisfy (isHorizontalSpace . ord')
c <- takeTill (isEndOfLine . ord')
continuation (accum <> TL.fromStrict (TE.decodeUtf8 c))
continuation :: TL.Text -> Parser TL.Text
continuation accum = content accum <|> return accum
firstLine <- takeTill (isEndOfLine . ord')
TL.toStrict <$> continuation (TL.fromStrict $ TE.decodeUtf8 firstLine)
quotedString :: Parser Text
quotedString = do
char '"'
c <- TE.decodeUtf8 <$> takeTill (== '"')
char '"'
return c
field :: Parser name -> Parser a -> Parser a
field name content = do
try name
char ':'
skipSpace
content <* endOfLine
data WarcType = WarcInfo
| Response
| Resource
| Request
| Metadata
| Revisit
| Conversion
| Continuation
| FutureType !Text
deriving (Show, Read, Ord, Eq)
warcType :: Parser WarcType
warcType = choice
[ "warcinfo" *> pure WarcInfo
, "response" *> pure Response
, "resource" *> pure Resource
, "request" *> pure Request
, "metadata" *> pure Metadata
, "revisit" *> pure Revisit
, "conversion" *> pure Conversion
, "continuation" *> pure Continuation
, FutureType <$> utf8Token
]
encodeText :: T.Text -> BB.Builder
encodeText = BB.byteString . TE.encodeUtf8
encodeWarcType :: WarcType -> BB.Builder
encodeWarcType WarcInfo = "warcinfo"
encodeWarcType Response = "response"
encodeWarcType Resource = "resource"
encodeWarcType Request = "request"
encodeWarcType Metadata = "metadata"
encodeWarcType Revisit = "revisit"
encodeWarcType Conversion = "conversion"
encodeWarcType Continuation = "continuation"
encodeWarcType (FutureType t) = encodeText t
newtype Uri = Uri ByteString
deriving (Show, Read, Eq, Ord)
uri :: Parser Uri
uri = do
char '<'
s <- takeTill (== '>')
char '>'
return $ Uri s
laxUri :: Parser Uri
laxUri = Uri <$> takeTill (isEndOfLine . ord')
encodeUri :: Uri -> BB.Builder
encodeUri (Uri b) = BB.char7 '<' <> BB.byteString b <> BB.char7 '>'
newtype RecordId = RecordId Uri
deriving (Show, Read, Eq, Ord)
recordId :: Parser RecordId
recordId = RecordId <$> uri
encodeRecordId :: RecordId -> BB.Builder
encodeRecordId (RecordId r) = encodeUri r
data TruncationReason = TruncLength
| TruncTime
| TruncDisconnect
| TruncUnspecified
| TruncOther !Text
deriving (Show, Read, Ord, Eq)
truncationReason :: Parser TruncationReason
truncationReason = choice
[ "length" *> pure TruncLength
, "time" *> pure TruncTime
, "disconnect" *> pure TruncDisconnect
, "unspecified" *> pure TruncUnspecified
, TruncOther <$> utf8Token
]
encodeTruncationReason :: TruncationReason -> BB.Builder
encodeTruncationReason TruncLength = "length"
encodeTruncationReason TruncTime = "time"
encodeTruncationReason TruncDisconnect = "disconnect"
encodeTruncationReason TruncUnspecified = "unspecified"
encodeTruncationReason (TruncOther o) = encodeText o
data Digest = Digest { digestAlgorithm, digestHash :: !ByteString }
deriving (Show, Read, Eq, Ord)
digest :: Parser Digest
digest = do
algo <- token <* char ':'
hash <- token
return $ Digest algo hash
encodeDigest :: Digest -> BB.Builder
encodeDigest (Digest algo hash) =
BB.byteString algo <> ":" <> BB.byteString hash
data Field = WarcRecordId !RecordId
| ContentLength !Integer
| WarcDate !UTCTime
| WarcType !WarcType
| ContentType !ByteString
| WarcConcurrentTo !RecordId
| WarcBlockDigest !Digest
| WarcPayloadDigest !Digest
| WarcIpAddress !ByteString
| WarcRefersTo !Uri
| WarcTargetUri !Uri
| WarcTruncated !TruncationReason
| WarcWarcinfoId !RecordId
| WarcFilename !Text
| WarcProfile !Uri
| WarcIdentifiedPayloadType !ByteString
| WarcSegmentNumber !Integer
| WarcSegmentOriginId !ByteString
| WarcSegmentTotalLength !Integer
deriving (Show, Read)
makePrisms ''Field
date :: Parser UTCTime
date = do
s <- takeTill isSpace
parseTimeM False defaultTimeLocale dateFormat (BS.unpack s)
encodeDate :: UTCTime -> BB.Builder
encodeDate = BB.string7 . formatTime defaultTimeLocale dateFormat
dateFormat = iso8601DateFormat (Just "%H:%M:%SZ")
warcField :: Parser Field
warcField = choice
[ field "WARC-Record-ID" (WarcRecordId <$> recordId)
, field "Content-Length" (ContentLength <$> decimal)
, field "WARC-Date" (WarcDate <$> date)
, field "WARC-Type" (WarcType <$> warcType)
, field "Content-Type" (ContentType <$> takeTill (isEndOfLine . ord'))
, field "WARC-Concurrent-To" (WarcConcurrentTo <$> recordId)
, field "WARC-Block-Digest" (WarcBlockDigest <$> digest)
, field "WARC-Payload-Digest" (WarcPayloadDigest <$> digest)
, field "WARC-IP-Address" (WarcIpAddress <$> takeTill (isEndOfLine . ord'))
, field "WARC-Refers-To" (WarcRefersTo <$> uri)
, field "WARC-Target-URI" (WarcTargetUri <$> laxUri)
, field "WARC-Truncated" (WarcTruncated <$> truncationReason)
, field "WARC-Warcinfo-ID" (WarcWarcinfoId <$> recordId)
, field "WARC-Filename" (WarcFilename <$> (text <|> quotedString))
, field "WARC-Profile" (WarcProfile <$> uri)
, field "WARC-Segment-Number" (WarcSegmentNumber <$> decimal)
, field "WARC-Segment-Total-Length" (WarcSegmentTotalLength <$> decimal)
]
data RecordHeader = RecordHeader { _recWarcVersion :: Version
, _recHeaders :: [Field]
}
deriving (Show)
makeLenses ''RecordHeader
header :: Parser RecordHeader
header = withName "header" $ do
skipSpace
ver <- version <* endOfLine
let unknownField = field token (takeTill (isEndOfLine . ord') *> return Nothing)
fields <- withName "fields" $ many $ (Just <$> warcField) <|> unknownField
endOfLine
return $ RecordHeader ver (catMaybes fields)
encodeHeader :: RecordHeader -> BB.Builder
encodeHeader (RecordHeader (Version maj min) flds) =
"WARC/"<>BB.intDec maj<>"."<>BB.intDec min <> "\n"
<> foldMap encodeField flds
<> BB.char7 '\n'
encodeField :: Field -> BB.Builder
encodeField fld =
case fld of
WarcRecordId r -> field "WARC-Record-ID" (encodeRecordId r)
ContentLength len -> field "Content-Length" (BB.integerDec len)
WarcDate t -> field "WARC-Date" (encodeDate t)
WarcType t -> field "WARC-Type" (encodeWarcType t)
ContentType t -> field "Content-Type" (BB.byteString t)
WarcConcurrentTo r -> field "WARC-Concurrent-To" (encodeRecordId r)
WarcBlockDigest d -> field "WARC-Block-Digest" (encodeDigest d)
WarcPayloadDigest d -> field "WARC-Payload-Digest" (encodeDigest d)
WarcIpAddress addr -> field "WARC-IP-Address" (BB.byteString addr)
WarcRefersTo uri -> field "WARC-Refers-To" (encodeUri uri)
WarcTargetUri uri -> field "WARC-Target-URI" (encodeUri uri)
WarcTruncated t -> field "WARC-Truncated" (encodeTruncationReason t)
WarcWarcinfoId r -> field "WARC-Warcinfo-ID" (encodeRecordId r)
WarcFilename n -> field "WARC-Filename" (quoted $ encodeText n)
WarcProfile uri -> field "WARC-Profile" (encodeUri uri)
WarcSegmentNumber n -> field "WARC-Segment-Number" (BB.integerDec n)
WarcSegmentTotalLength len -> field "WARC-Segment-Total-Length" (BB.integerDec len)
where
field :: BB.Builder -> BB.Builder -> BB.Builder
field name val = name <> ": " <> val <> BB.char7 '\n'
quoted x = q <> x <> q
where q = BB.char7 '"'