{-# LANGUAGE BangPatterns, CPP #-}

-- | A CSV parser. The parser defined here is RFC 4180 compliant, with
-- the following extensions:
--
--  * Empty lines are ignored.
--
--  * Non-escaped fields may contain any characters except
--    double-quotes, commas, carriage returns, and newlines.
--
--  * Escaped fields may contain any characters (but double-quotes
--    need to be escaped).
--
-- The functions in this module can be used to implement e.g. a
-- resumable parser that is fed input incrementally.
module Data.Csv.Parser
    ( DecodeOptions(..)
    , defaultDecodeOptions
    , csv
    , csvWithHeader
    , header
    , record
    , name
    , field
    ) where

import Data.ByteString.Builder (byteString, toLazyByteString, charUtf8)
import Control.Applicative (optional)
import Data.Attoparsec.ByteString.Char8 (char, endOfInput)
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.Lazy as AL
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.Vector as V
import Data.Word (Word8)

import Data.Csv.Types
import Data.Csv.Util ((<$!>), blankLine, endOfLine, liftM2', cr, newline, doubleQuote, toStrict)

-- | Options that controls how data is decoded. These options can be
-- used to e.g. decode tab-separated data instead of comma-separated
-- data.
--
-- To avoid having your program stop compiling when new fields are
-- added to 'DecodeOptions', create option records by overriding
-- values in 'defaultDecodeOptions'. Example:
--
-- > myOptions = defaultDecodeOptions {
-- >       decDelimiter = fromIntegral (ord '\t')
-- >     }
data DecodeOptions = DecodeOptions
    { -- | Field delimiter.
      DecodeOptions -> Word8
decDelimiter  :: {-# UNPACK #-} !Word8
    } deriving (DecodeOptions -> DecodeOptions -> Bool
(DecodeOptions -> DecodeOptions -> Bool)
-> (DecodeOptions -> DecodeOptions -> Bool) -> Eq DecodeOptions
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: DecodeOptions -> DecodeOptions -> Bool
== :: DecodeOptions -> DecodeOptions -> Bool
$c/= :: DecodeOptions -> DecodeOptions -> Bool
/= :: DecodeOptions -> DecodeOptions -> Bool
Eq, Int -> DecodeOptions -> ShowS
[DecodeOptions] -> ShowS
DecodeOptions -> String
(Int -> DecodeOptions -> ShowS)
-> (DecodeOptions -> String)
-> ([DecodeOptions] -> ShowS)
-> Show DecodeOptions
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> DecodeOptions -> ShowS
showsPrec :: Int -> DecodeOptions -> ShowS
$cshow :: DecodeOptions -> String
show :: DecodeOptions -> String
$cshowList :: [DecodeOptions] -> ShowS
showList :: [DecodeOptions] -> ShowS
Show)

-- | Decoding options for parsing CSV files.
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions = DecodeOptions
    { decDelimiter :: Word8
decDelimiter = Word8
44  -- comma
    }

-- | Parse a CSV file that does not include a header.
csv :: DecodeOptions -> AL.Parser Csv
csv :: DecodeOptions -> Parser Csv
csv !DecodeOptions
opts = do
    vals <- Parser Record -> Parser [Record]
forall a. Parser a -> Parser [a]
sepByEndOfLine1' (Word8 -> Parser Record
record (DecodeOptions -> Word8
decDelimiter DecodeOptions
opts))
    _ <- optional endOfLine
    endOfInput
    let nonEmpty = [Record] -> [Record]
removeBlankLines [Record]
vals
    return $! V.fromList nonEmpty
{-# INLINE csv #-}

-- | Specialized version of 'sepBy1'' which is faster due to not
-- accepting an arbitrary separator.
sepByDelim1' :: AL.Parser a
             -> Word8  -- ^ Field delimiter
             -> AL.Parser [a]
sepByDelim1' :: forall a. Parser a -> Word8 -> Parser [a]
sepByDelim1' Parser a
p !Word8
delim = (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) Parser a
p Parser ByteString [a]
loop
  where
    loop :: Parser ByteString [a]
loop = do
        mb <- Parser (Maybe Word8)
A.peekWord8
        case mb of
            Just Word8
b | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
delim -> (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) (Parser Word8
A.anyWord8 Parser Word8 -> Parser a -> Parser a
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser a
p) Parser ByteString [a]
loop
            Maybe Word8
_                   -> [a] -> Parser ByteString [a]
forall a. a -> Parser ByteString a
forall (f :: * -> *) a. Applicative f => a -> f a
pure []
{-# INLINE sepByDelim1' #-}

-- | Specialized version of 'sepBy1'' which is faster due to not
-- accepting an arbitrary separator.
sepByEndOfLine1' :: AL.Parser a
                 -> AL.Parser [a]
sepByEndOfLine1' :: forall a. Parser a -> Parser [a]
sepByEndOfLine1' Parser a
p = (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) Parser a
p Parser ByteString [a]
loop
  where
    loop :: Parser ByteString [a]
loop = do
        mb <- Parser (Maybe Word8)
A.peekWord8
        case mb of
            Just Word8
b | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
cr ->
                (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) (Parser Word8
A.anyWord8 Parser Word8 -> Parser Word8 -> Parser Word8
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Word8 -> Parser Word8
A.word8 Word8
newline Parser Word8 -> Parser a -> Parser a
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser a
p) Parser ByteString [a]
loop
                   | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
newline ->
                (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) (Parser Word8
A.anyWord8 Parser Word8 -> Parser a -> Parser a
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser a
p) Parser ByteString [a]
loop
            Maybe Word8
_ -> [a] -> Parser ByteString [a]
forall a. a -> Parser ByteString a
forall (f :: * -> *) a. Applicative f => a -> f a
pure []
{-# INLINE sepByEndOfLine1' #-}

-- | Parse a CSV file that includes a header.
csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
csvWithHeader :: DecodeOptions -> Parser (Record, Vector NamedRecord)
csvWithHeader !DecodeOptions
opts = do
    !hdr <- Word8 -> Parser Record
header (DecodeOptions -> Word8
decDelimiter DecodeOptions
opts)
    vals <- map (toNamedRecord hdr) . removeBlankLines <$>
            sepByEndOfLine1' (record (decDelimiter opts))
    _ <- optional endOfLine
    endOfInput
    let !v = [NamedRecord] -> Vector NamedRecord
forall a. [a] -> Vector a
V.fromList [NamedRecord]
vals
    return (hdr, v)

-- | Parse a header, including the terminating line separator.
header :: Word8  -- ^ Field delimiter
       -> AL.Parser Header
header :: Word8 -> Parser Record
header !Word8
delim = [ByteString] -> Record
forall a. [a] -> Vector a
V.fromList ([ByteString] -> Record)
-> Parser ByteString [ByteString] -> Parser Record
forall (m :: * -> *) a b. Monad m => (a -> b) -> m a -> m b
<$!> Word8 -> Parser ByteString
name Word8
delim Parser ByteString -> Word8 -> Parser ByteString [ByteString]
forall a. Parser a -> Word8 -> Parser [a]
`sepByDelim1'` Word8
delim Parser Record -> Parser ByteString () -> Parser Record
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ByteString ()
endOfLine

-- | Parse a header name. Header names have the same format as regular
-- 'field's.
name :: Word8 -> AL.Parser Name
name :: Word8 -> Parser ByteString
name !Word8
delim = Word8 -> Parser ByteString
field Word8
delim

removeBlankLines :: [Record] -> [Record]
removeBlankLines :: [Record] -> [Record]
removeBlankLines = (Record -> Bool) -> [Record] -> [Record]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Record -> Bool) -> Record -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Record -> Bool
blankLine)

-- | Parse a record, not including the terminating line separator. The
-- terminating line separate is not included as the last record in a
-- CSV file is allowed to not have a terminating line separator. You
-- most likely want to use the 'endOfLine' parser in combination with
-- this parser.
record :: Word8  -- ^ Field delimiter
       -> AL.Parser Record
record :: Word8 -> Parser Record
record !Word8
delim = [ByteString] -> Record
forall a. [a] -> Vector a
V.fromList ([ByteString] -> Record)
-> Parser ByteString [ByteString] -> Parser Record
forall (m :: * -> *) a b. Monad m => (a -> b) -> m a -> m b
<$!> Word8 -> Parser ByteString
field Word8
delim Parser ByteString -> Word8 -> Parser ByteString [ByteString]
forall a. Parser a -> Word8 -> Parser [a]
`sepByDelim1'` Word8
delim
{-# INLINE record #-}

-- | Parse a field. The field may be in either the escaped or
-- non-escaped format. The return value is unescaped.
field :: Word8 -> AL.Parser Field
field :: Word8 -> Parser ByteString
field !Word8
delim = do
    mb <- Parser (Maybe Word8)
A.peekWord8
    -- We purposely don't use <|> as we want to commit to the first
    -- choice if we see a double quote.
    case mb of
        Just Word8
b | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
doubleQuote -> Parser ByteString
escapedField
        Maybe Word8
_                         -> Word8 -> Parser ByteString
unescapedField Word8
delim
{-# INLINE field #-}

escapedField :: AL.Parser S.ByteString
escapedField :: Parser ByteString
escapedField = do
    _ <- Parser Char
dquote
    -- The scan state is 'True' if the previous character was a double quote.
    s' <- A.scan False $ \Bool
s Word8
c -> if Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
doubleQuote
                                 then Bool -> Maybe Bool
forall a. a -> Maybe a
Just (Bool -> Bool
not Bool
s)
                                 else if Bool
s then Maybe Bool
forall a. Maybe a
Nothing
                                      else Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
False
    -- We need to drop a trailing double quote left by scan.
    if S.null s'
      then fail "trailing double quote"
      else let s = HasCallStack => ByteString -> ByteString
ByteString -> ByteString
S.init ByteString
s'
           in if doubleQuote `S.elem` s
              then case Z.parse unescape s of
                     Right ByteString
r  -> ByteString -> Parser ByteString
forall a. a -> Parser ByteString a
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
r
                     Left String
err -> String -> Parser ByteString
forall a. String -> Parser ByteString a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
err
              else return s

unescapedField :: Word8 -> AL.Parser S.ByteString
unescapedField :: Word8 -> Parser ByteString
unescapedField !Word8
delim = (Word8 -> Bool) -> Parser ByteString
A.takeWhile (\ Word8
c -> Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
doubleQuote Bool -> Bool -> Bool
&&
                                            Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
newline Bool -> Bool -> Bool
&&
                                            Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
delim Bool -> Bool -> Bool
&&
                                            Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
cr)

dquote :: AL.Parser Char
dquote :: Parser Char
dquote = Char -> Parser Char
char Char
'"'

unescape :: Z.Parser S.ByteString
unescape :: Parser ByteString
unescape = (LazyByteString -> ByteString
toStrict (LazyByteString -> ByteString)
-> (Builder -> LazyByteString) -> Builder -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> LazyByteString
toLazyByteString) (Builder -> ByteString)
-> ZeptoT Identity Builder -> Parser ByteString
forall (m :: * -> *) a b. Monad m => (a -> b) -> m a -> m b
<$!> Builder -> ZeptoT Identity Builder
forall {m :: * -> *}. Monad m => Builder -> ZeptoT m Builder
go Builder
forall a. Monoid a => a
mempty where
  go :: Builder -> ZeptoT m Builder
go Builder
acc = do
    h <- (Word8 -> Bool) -> ZeptoT m ByteString
forall (m :: * -> *).
Monad m =>
(Word8 -> Bool) -> ZeptoT m ByteString
Z.takeWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
doubleQuote)
    let rest = do
          start <- Int -> ZeptoT m ByteString
forall (m :: * -> *). Monad m => Int -> ZeptoT m ByteString
Z.take Int
2
          if (S.unsafeHead start == doubleQuote &&
              S.unsafeIndex start 1 == doubleQuote)
              then go (acc `mappend` byteString h `mappend` charUtf8 '"')
              else fail "invalid CSV escape sequence"
    done <- Z.atEnd
    if done
      then return (acc `mappend` byteString h)
      else rest