{-# LANGUAGE BangPatterns, CPP #-}
module Data.Csv.Parser
( DecodeOptions(..)
, defaultDecodeOptions
, csv
, csvWithHeader
, header
, record
, name
, field
) where
import Data.ByteString.Builder (byteString, toLazyByteString, charUtf8)
import Control.Applicative (optional)
import Data.Attoparsec.ByteString.Char8 (char, endOfInput)
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.Lazy as AL
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.Vector as V
import Data.Word (Word8)
import Data.Csv.Types
import Data.Csv.Util ((<$!>), blankLine, endOfLine, liftM2', cr, newline, doubleQuote, toStrict)
data DecodeOptions = DecodeOptions
{
DecodeOptions -> Word8
decDelimiter :: {-# UNPACK #-} !Word8
} deriving (DecodeOptions -> DecodeOptions -> Bool
(DecodeOptions -> DecodeOptions -> Bool)
-> (DecodeOptions -> DecodeOptions -> Bool) -> Eq DecodeOptions
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: DecodeOptions -> DecodeOptions -> Bool
== :: DecodeOptions -> DecodeOptions -> Bool
$c/= :: DecodeOptions -> DecodeOptions -> Bool
/= :: DecodeOptions -> DecodeOptions -> Bool
Eq, Int -> DecodeOptions -> ShowS
[DecodeOptions] -> ShowS
DecodeOptions -> String
(Int -> DecodeOptions -> ShowS)
-> (DecodeOptions -> String)
-> ([DecodeOptions] -> ShowS)
-> Show DecodeOptions
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> DecodeOptions -> ShowS
showsPrec :: Int -> DecodeOptions -> ShowS
$cshow :: DecodeOptions -> String
show :: DecodeOptions -> String
$cshowList :: [DecodeOptions] -> ShowS
showList :: [DecodeOptions] -> ShowS
Show)
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions = DecodeOptions
{ decDelimiter :: Word8
decDelimiter = Word8
44
}
csv :: DecodeOptions -> AL.Parser Csv
csv :: DecodeOptions -> Parser Csv
csv !DecodeOptions
opts = do
vals <- Parser Record -> Parser [Record]
forall a. Parser a -> Parser [a]
sepByEndOfLine1' (Word8 -> Parser Record
record (DecodeOptions -> Word8
decDelimiter DecodeOptions
opts))
_ <- optional endOfLine
endOfInput
let nonEmpty = [Record] -> [Record]
removeBlankLines [Record]
vals
return $! V.fromList nonEmpty
{-# INLINE csv #-}
sepByDelim1' :: AL.Parser a
-> Word8
-> AL.Parser [a]
sepByDelim1' :: forall a. Parser a -> Word8 -> Parser [a]
sepByDelim1' Parser a
p !Word8
delim = (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) Parser a
p Parser ByteString [a]
loop
where
loop :: Parser ByteString [a]
loop = do
mb <- Parser (Maybe Word8)
A.peekWord8
case mb of
Just Word8
b | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
delim -> (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) (Parser Word8
A.anyWord8 Parser Word8 -> Parser a -> Parser a
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser a
p) Parser ByteString [a]
loop
Maybe Word8
_ -> [a] -> Parser ByteString [a]
forall a. a -> Parser ByteString a
forall (f :: * -> *) a. Applicative f => a -> f a
pure []
{-# INLINE sepByDelim1' #-}
sepByEndOfLine1' :: AL.Parser a
-> AL.Parser [a]
sepByEndOfLine1' :: forall a. Parser a -> Parser [a]
sepByEndOfLine1' Parser a
p = (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) Parser a
p Parser ByteString [a]
loop
where
loop :: Parser ByteString [a]
loop = do
mb <- Parser (Maybe Word8)
A.peekWord8
case mb of
Just Word8
b | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
cr ->
(a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) (Parser Word8
A.anyWord8 Parser Word8 -> Parser Word8 -> Parser Word8
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Word8 -> Parser Word8
A.word8 Word8
newline Parser Word8 -> Parser a -> Parser a
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser a
p) Parser ByteString [a]
loop
| Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
newline ->
(a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) (Parser Word8
A.anyWord8 Parser Word8 -> Parser a -> Parser a
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser a
p) Parser ByteString [a]
loop
Maybe Word8
_ -> [a] -> Parser ByteString [a]
forall a. a -> Parser ByteString a
forall (f :: * -> *) a. Applicative f => a -> f a
pure []
{-# INLINE sepByEndOfLine1' #-}
csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
!DecodeOptions
opts = do
!hdr <- Word8 -> Parser Record
header (DecodeOptions -> Word8
decDelimiter DecodeOptions
opts)
vals <- map (toNamedRecord hdr) . removeBlankLines <$>
sepByEndOfLine1' (record (decDelimiter opts))
_ <- optional endOfLine
endOfInput
let !v = [NamedRecord] -> Vector NamedRecord
forall a. [a] -> Vector a
V.fromList [NamedRecord]
vals
return (hdr, v)
header :: Word8
-> AL.Parser Header
!Word8
delim = [ByteString] -> Record
forall a. [a] -> Vector a
V.fromList ([ByteString] -> Record)
-> Parser ByteString [ByteString] -> Parser Record
forall (m :: * -> *) a b. Monad m => (a -> b) -> m a -> m b
<$!> Word8 -> Parser ByteString
name Word8
delim Parser ByteString -> Word8 -> Parser ByteString [ByteString]
forall a. Parser a -> Word8 -> Parser [a]
`sepByDelim1'` Word8
delim Parser Record -> Parser ByteString () -> Parser Record
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ByteString ()
endOfLine
name :: Word8 -> AL.Parser Name
name :: Word8 -> Parser ByteString
name !Word8
delim = Word8 -> Parser ByteString
field Word8
delim
removeBlankLines :: [Record] -> [Record]
removeBlankLines :: [Record] -> [Record]
removeBlankLines = (Record -> Bool) -> [Record] -> [Record]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Record -> Bool) -> Record -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Record -> Bool
blankLine)
record :: Word8
-> AL.Parser Record
record :: Word8 -> Parser Record
record !Word8
delim = [ByteString] -> Record
forall a. [a] -> Vector a
V.fromList ([ByteString] -> Record)
-> Parser ByteString [ByteString] -> Parser Record
forall (m :: * -> *) a b. Monad m => (a -> b) -> m a -> m b
<$!> Word8 -> Parser ByteString
field Word8
delim Parser ByteString -> Word8 -> Parser ByteString [ByteString]
forall a. Parser a -> Word8 -> Parser [a]
`sepByDelim1'` Word8
delim
{-# INLINE record #-}
field :: Word8 -> AL.Parser Field
field :: Word8 -> Parser ByteString
field !Word8
delim = do
mb <- Parser (Maybe Word8)
A.peekWord8
case mb of
Just Word8
b | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
doubleQuote -> Parser ByteString
escapedField
Maybe Word8
_ -> Word8 -> Parser ByteString
unescapedField Word8
delim
{-# INLINE field #-}
escapedField :: AL.Parser S.ByteString
escapedField :: Parser ByteString
escapedField = do
_ <- Parser Char
dquote
s' <- A.scan False $ \Bool
s Word8
c -> if Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
doubleQuote
then Bool -> Maybe Bool
forall a. a -> Maybe a
Just (Bool -> Bool
not Bool
s)
else if Bool
s then Maybe Bool
forall a. Maybe a
Nothing
else Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
False
if S.null s'
then fail "trailing double quote"
else let s = HasCallStack => ByteString -> ByteString
ByteString -> ByteString
S.init ByteString
s'
in if doubleQuote `S.elem` s
then case Z.parse unescape s of
Right ByteString
r -> ByteString -> Parser ByteString
forall a. a -> Parser ByteString a
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
r
Left String
err -> String -> Parser ByteString
forall a. String -> Parser ByteString a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
err
else return s
unescapedField :: Word8 -> AL.Parser S.ByteString
unescapedField :: Word8 -> Parser ByteString
unescapedField !Word8
delim = (Word8 -> Bool) -> Parser ByteString
A.takeWhile (\ Word8
c -> Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
doubleQuote Bool -> Bool -> Bool
&&
Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
newline Bool -> Bool -> Bool
&&
Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
delim Bool -> Bool -> Bool
&&
Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
cr)
dquote :: AL.Parser Char
dquote :: Parser Char
dquote = Char -> Parser Char
char Char
'"'
unescape :: Z.Parser S.ByteString
unescape :: Parser ByteString
unescape = (LazyByteString -> ByteString
toStrict (LazyByteString -> ByteString)
-> (Builder -> LazyByteString) -> Builder -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> LazyByteString
toLazyByteString) (Builder -> ByteString)
-> ZeptoT Identity Builder -> Parser ByteString
forall (m :: * -> *) a b. Monad m => (a -> b) -> m a -> m b
<$!> Builder -> ZeptoT Identity Builder
forall {m :: * -> *}. Monad m => Builder -> ZeptoT m Builder
go Builder
forall a. Monoid a => a
mempty where
go :: Builder -> ZeptoT m Builder
go Builder
acc = do
h <- (Word8 -> Bool) -> ZeptoT m ByteString
forall (m :: * -> *).
Monad m =>
(Word8 -> Bool) -> ZeptoT m ByteString
Z.takeWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
doubleQuote)
let rest = do
start <- Int -> ZeptoT m ByteString
forall (m :: * -> *). Monad m => Int -> ZeptoT m ByteString
Z.take Int
2
if (S.unsafeHead start == doubleQuote &&
S.unsafeIndex start 1 == doubleQuote)
then go (acc `mappend` byteString h `mappend` charUtf8 '"')
else fail "invalid CSV escape sequence"
done <- Z.atEnd
if done
then return (acc `mappend` byteString h)
else rest