{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}

module Data.Warc.Header
    ( -- * Parsing
      header
      -- * Encoding
    , encodeHeader
      -- * Types
    , RecordHeader(..)
    , Version(..)
    , WarcType(..)
    , RecordId(..)
    , TruncationReason(..)
    , Digest(..)
    , Uri(..)
      -- * Header field types
    , Field(..)
      -- ** Prisms
    , _WarcRecordId
    , _ContentLength
    , _WarcDate
    , _WarcType
    , _ContentType
    , _WarcConcurrentTo
    , _WarcBlockDigest
    , _WarcPayloadDigest
    , _WarcIpAddress
    , _WarcRefersTo
    , _WarcTargetUri
    , _WarcTruncated
    , _WarcWarcinfoId
    , _WarcFilename
    , _WarcProfile
    , _WarcIdentifiedPayloadType
    , _WarcSegmentNumber
    , _WarcSegmentOriginId
    , _WarcSegmentTotalLength
      -- * Lenses
    , recWarcVersion, recHeaders
    ) where

import Control.Applicative
import Control.Monad (void)
import Data.Maybe (catMaybes)
import Data.Monoid ((<>))
import Data.Time.Clock
import Data.Time.Format
import Data.Char (ord)

import Data.Attoparsec.ByteString.Char8
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy.Builder as BB

import Control.Lens

withName :: String -> Parser a -> Parser a
withName name parser = parser <?> name

data Version = Version {versionMajor, versionMinor :: !Int}
             deriving (Show, Read, Eq, Ord)

version :: Parser Version
version = withName "version" $ do
    "WARC/"
    major <- decimal
    char '.'
    minor <- decimal
    return (Version major minor)

newtype FieldName = FieldName {getFieldName :: Text}
                  deriving (Show, Read)

instance Eq FieldName where
    FieldName a == FieldName b = T.toCaseFold a == T.toCaseFold b

instance Ord FieldName where
    FieldName a `compare` FieldName b = T.toCaseFold a `compare` T.toCaseFold b

separators :: String
separators = "()<>@,;:\\\"/[]?={}"

crlf :: Parser ()
crlf = void $ string "\r\n"

token :: Parser ByteString
token = takeTill (inClass $ separators++" \t\n\r")

utf8Token :: Parser Text
utf8Token = TE.decodeUtf8 <$> token

fieldName :: Parser FieldName
fieldName = FieldName . TE.decodeUtf8 <$> token

ord' = fromIntegral . ord

text :: Parser Text
text = do
    let content :: TL.Text -> Parser TL.Text
        content accum = do
            satisfy (isHorizontalSpace . ord')
            c <- takeTill (isEndOfLine . ord')
            continuation (accum <> TL.fromStrict (TE.decodeUtf8 c))
        continuation :: TL.Text -> Parser TL.Text
        continuation accum = content accum <|> return accum
    firstLine <- takeTill (isEndOfLine . ord')
    TL.toStrict <$> continuation (TL.fromStrict $ TE.decodeUtf8 firstLine)

quotedString :: Parser Text
quotedString = do
    char '"'
    c <- TE.decodeUtf8 <$> takeTill (== '"')
    char '"'
    return c

field :: Parser name -> Parser a -> Parser a
field name content = do
    try name
    char ':'
    skipSpace
    content <* endOfLine

data WarcType = WarcInfo
              | Response
              | Resource
              | Request
              | Metadata
              | Revisit
              | Conversion
              | Continuation
              | FutureType !Text
              deriving (Show, Read, Ord, Eq)

warcType :: Parser WarcType
warcType = choice
     [ "warcinfo"     *> pure WarcInfo
     , "response"     *> pure Response
     , "resource"     *> pure Resource
     , "request"      *> pure Request
     , "metadata"     *> pure Metadata
     , "revisit"      *> pure Revisit
     , "conversion"   *> pure Conversion
     , "continuation" *> pure Continuation
     , FutureType <$> utf8Token
     ]

encodeText :: T.Text -> BB.Builder
encodeText = BB.byteString . TE.encodeUtf8

encodeWarcType :: WarcType -> BB.Builder
encodeWarcType WarcInfo       = "warcinfo"
encodeWarcType Response       = "response"
encodeWarcType Resource       = "resource"
encodeWarcType Request        = "request"
encodeWarcType Metadata       = "metadata"
encodeWarcType Revisit        = "revisit"
encodeWarcType Conversion     = "conversion"
encodeWarcType Continuation   = "continuation"
encodeWarcType (FutureType t) = encodeText t

newtype Uri = Uri ByteString
            deriving (Show, Read, Eq, Ord)

uri :: Parser Uri
uri = do
    char '<'
    s <- takeTill (== '>')
    char '>'
    return $ Uri s

laxUri :: Parser Uri
laxUri = Uri <$> takeTill (isEndOfLine . ord')

encodeUri :: Uri -> BB.Builder
encodeUri (Uri b) = BB.char7 '<' <> BB.byteString b <> BB.char7 '>'

newtype RecordId = RecordId Uri
                 deriving (Show, Read, Eq, Ord)

recordId :: Parser RecordId
recordId = RecordId <$> uri

encodeRecordId :: RecordId -> BB.Builder
encodeRecordId (RecordId r) = encodeUri r

data TruncationReason = TruncLength
                      | TruncTime
                      | TruncDisconnect
                      | TruncUnspecified
                      | TruncOther !Text
                      deriving (Show, Read, Ord, Eq)

truncationReason :: Parser TruncationReason
truncationReason = choice
    [ "length" *> pure TruncLength
    , "time"   *> pure TruncTime
    , "disconnect" *> pure TruncDisconnect
    , "unspecified" *> pure TruncUnspecified
    , TruncOther <$> utf8Token
    ]

encodeTruncationReason :: TruncationReason -> BB.Builder
encodeTruncationReason TruncLength      = "length"
encodeTruncationReason TruncTime        = "time"
encodeTruncationReason TruncDisconnect  = "disconnect"
encodeTruncationReason TruncUnspecified = "unspecified"
encodeTruncationReason (TruncOther o)   = encodeText o

data Digest = Digest { digestAlgorithm, digestHash :: !ByteString }
            deriving (Show, Read, Eq, Ord)

digest :: Parser Digest
digest = do
    algo <- token <* char ':'
    hash <- token
    return $ Digest algo hash

encodeDigest :: Digest -> BB.Builder
encodeDigest (Digest algo hash) =
    BB.byteString algo <> ":" <> BB.byteString hash

data Field = WarcRecordId !RecordId
           | ContentLength !Integer
           | WarcDate !UTCTime
           | WarcType !WarcType
           | ContentType !ByteString
           | WarcConcurrentTo !RecordId
           | WarcBlockDigest !Digest
           | WarcPayloadDigest !Digest
           | WarcIpAddress !ByteString
           | WarcRefersTo !Uri
           | WarcTargetUri !Uri
           | WarcTruncated !TruncationReason
           | WarcWarcinfoId !RecordId
           | WarcFilename !Text
           | WarcProfile !Uri
           | WarcIdentifiedPayloadType !ByteString
           | WarcSegmentNumber !Integer
           | WarcSegmentOriginId !ByteString
           | WarcSegmentTotalLength !Integer
           deriving (Show, Read)

makePrisms ''Field

date :: Parser UTCTime
date = do
    s <- takeTill isSpace
    parseTimeM False defaultTimeLocale dateFormat (BS.unpack s)

encodeDate :: UTCTime -> BB.Builder
encodeDate = BB.string7 . formatTime defaultTimeLocale dateFormat

dateFormat = iso8601DateFormat (Just "%H:%M:%SZ")

warcField :: Parser Field
warcField = choice
    [ field "WARC-Record-ID" (WarcRecordId <$> recordId)
    , field "Content-Length" (ContentLength <$> decimal)
    , field "WARC-Date" (WarcDate <$> date)
    , field "WARC-Type" (WarcType <$> warcType)
    , field "Content-Type" (ContentType <$> takeTill (isEndOfLine . ord'))
    , field "WARC-Concurrent-To" (WarcConcurrentTo <$> recordId)
    , field "WARC-Block-Digest" (WarcBlockDigest <$> digest)
    , field "WARC-Payload-Digest" (WarcPayloadDigest <$> digest)
    , field "WARC-IP-Address" (WarcIpAddress <$> takeTill (isEndOfLine . ord'))
    , field "WARC-Refers-To" (WarcRefersTo <$> uri)
    , field "WARC-Target-URI" (WarcTargetUri <$> laxUri)
    , field "WARC-Truncated" (WarcTruncated <$> truncationReason)
    , field "WARC-Warcinfo-ID" (WarcWarcinfoId <$> recordId)
    , field "WARC-Filename" (WarcFilename <$> (text <|> quotedString))
    , field "WARC-Profile" (WarcProfile <$> uri)
    -- , field "WARC-Identified-Payload-Type" (WarcIdentifiedPayloadType <$> mediaType)
    , field "WARC-Segment-Number" (WarcSegmentNumber <$> decimal)
    --, field "WARC-Segment-Origin-ID" (WarcSegmentOriginId <$> msgId)
    , field "WARC-Segment-Total-Length" (WarcSegmentTotalLength <$> decimal)
    ]

data RecordHeader = RecordHeader { _recWarcVersion :: Version
                                 , _recHeaders     :: [Field]
                                 }
                  deriving (Show)

makeLenses ''RecordHeader

-- | A WARC header
header :: Parser RecordHeader
header = withName "header" $ do
    skipSpace
    ver <- version <* endOfLine
    let unknownField = field token (takeTill (isEndOfLine . ord') *> return Nothing)
    fields <- withName "fields" $ many $ (Just <$> warcField) <|> unknownField
    endOfLine
    return $ RecordHeader ver (catMaybes fields)

encodeHeader :: RecordHeader -> BB.Builder
encodeHeader (RecordHeader (Version maj min) flds) =
       "WARC/"<>BB.intDec maj<>"."<>BB.intDec min <> "\n"
    <> foldMap encodeField flds
    <> BB.char7 '\n'

encodeField :: Field -> BB.Builder
encodeField fld =
    case fld of
        WarcRecordId r                -> field "WARC-Record-ID" (encodeRecordId r)
        ContentLength len             -> field "Content-Length" (BB.integerDec len)
        WarcDate t                    -> field "WARC-Date" (encodeDate t)
        WarcType t                    -> field "WARC-Type" (encodeWarcType t)
        ContentType t                 -> field "Content-Type" (BB.byteString t)
        WarcConcurrentTo r            -> field "WARC-Concurrent-To" (encodeRecordId r)
        WarcBlockDigest d             -> field "WARC-Block-Digest" (encodeDigest d)
        WarcPayloadDigest d           -> field "WARC-Payload-Digest" (encodeDigest d)
        WarcIpAddress addr            -> field "WARC-IP-Address" (BB.byteString addr)
        WarcRefersTo uri              -> field "WARC-Refers-To" (encodeUri uri)
        WarcTargetUri uri             -> field "WARC-Target-URI" (encodeUri uri)
        WarcTruncated t               -> field "WARC-Truncated" (encodeTruncationReason t)
        WarcWarcinfoId r              -> field "WARC-Warcinfo-ID" (encodeRecordId r)
        WarcFilename n                -> field "WARC-Filename" (quoted $ encodeText n)
        WarcProfile uri               -> field "WARC-Profile" (encodeUri uri)
        WarcSegmentNumber n           -> field "WARC-Segment-Number" (BB.integerDec n)
        WarcSegmentTotalLength len    -> field "WARC-Segment-Total-Length" (BB.integerDec len)
  where
    field :: BB.Builder -> BB.Builder -> BB.Builder
    field name val = name <> ": " <> val <> BB.char7 '\n'

    quoted x = q <> x <> q
      where q = BB.char7 '"'