module Frames.CSV where
#if __GLASGOW_HASKELL__ < 710
import Control.Applicative ((<$>), pure, (<*>))
import Data.Foldable (foldMap)
import Data.Traversable (sequenceA)
import Data.Monoid (Monoid)
#endif
import Control.Arrow (first, second)
import Control.Monad (MonadPlus(..), when, void)
import Control.Monad.IO.Class
import Data.Char (isAlpha, isAlphaNum, toLower, toUpper)
import Data.Maybe (isNothing, fromMaybe)
import Data.Monoid ((<>))
import Data.Proxy
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.Vinyl (RElem, Rec)
import Data.Vinyl.TypeLevel (RIndex)
import Frames.Col
import Frames.ColumnTypeable
import Frames.ColumnUniverse
import Frames.Rec
import Frames.RecF
import Frames.RecLens
import Language.Haskell.TH
import Language.Haskell.TH.Syntax
import qualified Pipes as P
import System.IO (Handle, hIsEOF, openFile, IOMode(..), withFile)
type Separator = T.Text
type QuoteChar = Char
data QuotingMode
= NoQuoting
| RFC4180Quoting QuoteChar
deriving (Eq, Show)
data ParserOptions = ParserOptions { headerOverride :: Maybe [T.Text]
, columnSeparator :: Separator
, quotingMode :: QuotingMode }
deriving (Eq, Show)
instance Lift QuotingMode where
lift NoQuoting = [|NoQuoting|]
lift (RFC4180Quoting char) = [|RFC4180Quoting $(litE . charL $ char)|]
instance Lift ParserOptions where
lift (ParserOptions Nothing sep quoting) = [|ParserOptions Nothing $sep' $quoting'|]
where sep' = [|T.pack $(stringE $ T.unpack sep)|]
quoting' = lift quoting
lift (ParserOptions (Just hs) sep quoting) = [|ParserOptions (Just $hs') $sep' $quoting'|]
where sep' = [|T.pack $(stringE $ T.unpack sep)|]
hs' = [|map T.pack $(listE $ map (stringE . T.unpack) hs)|]
quoting' = lift quoting
defaultParser :: ParserOptions
defaultParser = ParserOptions Nothing defaultSep (RFC4180Quoting '\"')
defaultSep :: Separator
defaultSep = T.pack ","
tokenizeRow :: ParserOptions -> T.Text -> [T.Text]
tokenizeRow options =
handleQuoting . T.splitOn sep
where sep = columnSeparator options
quoting = quotingMode options
handleQuoting = case quoting of
NoQuoting -> id
RFC4180Quoting quote -> reassembleRFC4180QuotedParts sep quote
reassembleRFC4180QuotedParts :: Separator -> QuoteChar -> [T.Text] -> [T.Text]
reassembleRFC4180QuotedParts sep quoteChar = finish . foldr f ([], Nothing)
where f :: T.Text -> ([T.Text], Maybe T.Text) -> ([T.Text], Maybe T.Text)
f part (rest, Just accum)
| prefixQuoted part = let token = unescape (T.drop 1 part) <> sep <> accum
in (token : rest, Nothing)
| otherwise = (rest, Just (unescape part <> sep <> accum))
f part (rest, Nothing)
| prefixQuoted part &&
suffixQuoted part = ((unescape . T.drop 1 . T.dropEnd 1 $ part) : rest, Nothing)
| suffixQuoted part = (rest, Just (unescape . T.dropEnd 1 $ part))
| otherwise = (T.strip part : rest, Nothing)
prefixQuoted t =
quoteText `T.isPrefixOf` t &&
(T.length t (T.length . T.dropWhile (== quoteChar) $ t)) `mod` 2 == 1
suffixQuoted t =
quoteText `T.isSuffixOf` t &&
(T.length t (T.length . T.dropWhileEnd (== quoteChar) $ t)) `mod` 2 == 1
quoteText = T.singleton quoteChar
unescape :: T.Text -> T.Text
unescape = T.replace (quoteText <> quoteText) quoteText
finish :: ([T.Text], Maybe T.Text) -> [T.Text]
finish (rest, Just dangling) = dangling : rest
finish (rest, Nothing ) = rest
prefixInference :: (ColumnTypeable a, Monoid a)
=> ParserOptions -> Handle -> IO [a]
prefixInference opts h = T.hGetLine h >>= go prefixSize . inferCols
where prefixSize = 1000 :: Int
inferCols = map inferType . tokenizeRow opts
go 0 ts = return ts
go !n ts =
hIsEOF h >>= \case
True -> return ts
False -> T.hGetLine h >>= go (n 1) . zipWith (<>) ts . inferCols
readColHeaders :: (ColumnTypeable a, Monoid a)
=> ParserOptions -> FilePath -> IO [(T.Text, a)]
readColHeaders opts f = withFile f ReadMode $ \h ->
zip <$> maybe (tokenizeRow opts <$> T.hGetLine h)
pure
(headerOverride opts)
<*> prefixInference opts h
class ReadRec (rs :: [*]) where
readRec :: [T.Text] -> Rec Maybe rs
instance ReadRec '[] where
readRec _ = Nil
instance (Parseable t, ReadRec ts) => ReadRec (s :-> t ': ts) where
readRec [] = frameCons Nothing (readRec [])
readRec (h:t) = frameCons (parse' h) (readRec t)
readRow :: ReadRec rs => ParserOptions -> T.Text -> Rec Maybe rs
readRow = (readRec .) . tokenizeRow
readTableMaybeOpt :: (MonadIO m, ReadRec rs)
=> ParserOptions -> FilePath -> P.Producer (Rec Maybe rs) m ()
readTableMaybeOpt opts csvFile =
do h <- liftIO $ do
h <- openFile csvFile ReadMode
when (isNothing $ headerOverride opts) (void $ T.hGetLine h)
return h
let go = liftIO (hIsEOF h) >>= \case
True -> return ()
False -> liftIO (readRow opts <$> T.hGetLine h) >>= P.yield >> go
go
readTableMaybe :: (MonadIO m, ReadRec rs)
=> FilePath -> P.Producer (Rec Maybe rs) m ()
readTableMaybe = readTableMaybeOpt defaultParser
readTableOpt' :: forall m rs.
(MonadPlus m, MonadIO m, ReadRec rs)
=> ParserOptions -> FilePath -> m (Record rs)
readTableOpt' opts csvFile =
do h <- liftIO $ do
h <- openFile csvFile ReadMode
when (isNothing $ headerOverride opts) (void $ T.hGetLine h)
return h
let go = liftIO (hIsEOF h) >>= \case
True -> mzero
False -> let r = recMaybe . readRow opts <$> T.hGetLine h
in liftIO r >>= maybe go (flip mplus go . return)
go
readTable' :: forall m rs. (MonadPlus m, MonadIO m, ReadRec rs)
=> FilePath -> m (Record rs)
readTable' = readTableOpt' defaultParser
readTableOpt :: forall m rs.
(MonadIO m, ReadRec rs)
=> ParserOptions -> FilePath -> P.Producer (Record rs) m ()
readTableOpt opts csvFile = readTableMaybeOpt opts csvFile P.>-> go
where go = P.await >>= maybe go (\x -> P.yield x >> go) . recMaybe
readTable :: forall m rs. (MonadIO m, ReadRec rs)
=> FilePath -> P.Producer (Record rs) m ()
readTable = readTableOpt defaultParser
recDec :: [(T.Text, Q Type)] -> Q Type
recDec = appT [t|Record|] . go
where go [] = return PromotedNilT
go ((n,t):cs) =
[t|($(litT $ strTyLit (T.unpack n)) :-> $(t)) ': $(go cs) |]
sanitizeTypeName :: T.Text -> T.Text
sanitizeTypeName = unreserved . fixupStart
. T.concat . T.split (not . valid) . toTitle'
where valid c = isAlphaNum c || c == '\'' || c == '_'
toTitle' = foldMap (onHead toUpper) . T.split (not . isAlphaNum)
onHead f = maybe mempty (uncurry T.cons) . fmap (first f) . T.uncons
unreserved t
| t `elem` ["Type"] = "Col" <> t
| otherwise = t
fixupStart t = case T.uncons t of
Nothing -> "Col"
Just (c,_) | isAlpha c -> t
| otherwise -> "Col" <> t
mkColTDec :: TypeQ -> Name -> DecQ
mkColTDec colTypeQ colTName = tySynD colTName [] colTypeQ
mkColPDec :: Name -> TypeQ -> T.Text -> DecsQ
mkColPDec colTName colTy colPName = sequenceA [tySig, val, tySig', val']
where nm = mkName $ T.unpack colPName
nm' = mkName $ T.unpack colPName <> "'"
tySig = sigD nm [t|forall f rs. (Functor f,
RElem $(conT colTName) rs (RIndex $(conT colTName) rs))
=> ($colTy -> f $colTy)
-> Record rs
-> f (Record rs)
|]
tySig' = sigD nm' [t|forall f g rs. (Functor f, Functor g,
RElem $(conT colTName) rs (RIndex $(conT colTName) rs))
=> (g $(conT colTName) -> f (g $(conT colTName)))
-> Rec g rs
-> f (Rec g rs)
|]
val = valD (varP nm)
(normalB [e|rlens (Proxy :: Proxy $(conT colTName))|])
[]
val' = valD (varP nm')
(normalB [e|rlens' (Proxy :: Proxy $(conT colTName))|])
[]
lowerHead :: T.Text -> Maybe T.Text
lowerHead = fmap aux . T.uncons
where aux (c,t) = T.cons (toLower c) t
colDec :: ColumnTypeable a => T.Text -> T.Text -> a -> DecsQ
colDec prefix colName colTy = (:) <$> mkColTDec colTypeQ colTName'
<*> mkColPDec colTName' colTyQ colPName
where colTName = sanitizeTypeName (prefix <> colName)
colPName = fromMaybe "colDec impossible" (lowerHead colTName)
colTName' = mkName $ T.unpack colTName
colTyQ = colType colTy
colTypeQ = [t|$(litT . strTyLit $ T.unpack colName) :-> $colTyQ|]
declareColumn :: T.Text -> Name -> DecsQ
declareColumn colName colTy = (:) <$> mkColTDec colTypeQ colTName'
<*> mkColPDec colTName' colTyQ colPName
where colTName = sanitizeTypeName colName
colPName = maybe "colDec impossible"
(\(c,t) -> T.cons (toLower c) t)
(T.uncons colTName)
colTName' = mkName $ T.unpack colTName
colTyQ = return (ConT colTy)
colTypeQ = [t|$(litT . strTyLit $ T.unpack colName) :-> $colTyQ|]
data RowGen a = RowGen { columnNames :: [String]
, tablePrefix :: String
, separator :: Separator
, rowTypeName :: String
, columnUniverse :: Proxy a
}
colQ :: Name -> Q Exp
colQ n = [e| (Proxy :: Proxy (ColumnUniverse $(conT n))) |]
rowGen :: RowGen Columns
rowGen = RowGen [] "" defaultSep "Row" Proxy
tableType :: String -> FilePath -> DecsQ
tableType n = tableType' rowGen { rowTypeName = n }
tableTypes :: String -> FilePath -> DecsQ
tableTypes n = tableTypes' rowGen { rowTypeName = n }
tableType' :: forall a. (ColumnTypeable a, Monoid a)
=> RowGen a -> FilePath -> DecsQ
tableType' (RowGen {..}) csvFile =
pure . TySynD (mkName rowTypeName) [] <$>
(runIO (readColHeaders opts csvFile) >>= recDec')
where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
tableTypesText' :: forall a. (ColumnTypeable a, Monoid a)
=> RowGen a -> FilePath -> DecsQ
tableTypesText' (RowGen {..}) csvFile =
do colNames <- runIO $ withFile csvFile ReadMode $ \h ->
maybe (tokenizeRow opts <$> T.hGetLine h)
pure
(headerOverride opts)
let headers = zip colNames (repeat (inferType " "))
recTy <- tySynD (mkName rowTypeName) [] (recDec' headers)
let optsName = case rowTypeName of
[] -> error "Row type name shouldn't be empty"
h:t -> mkName $ toLower h : t ++ "Parser"
optsTy <- sigD optsName [t|ParserOptions|]
optsDec <- valD (varP optsName) (normalB $ lift opts) []
colDecs <- concat <$> mapM (uncurry $ colDec (T.pack tablePrefix)) headers
return (recTy : optsTy : optsDec : colDecs)
where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
tableTypes' :: forall a. (ColumnTypeable a, Monoid a)
=> RowGen a -> FilePath -> DecsQ
tableTypes' (RowGen {..}) csvFile =
do headers <- runIO $ readColHeaders opts csvFile
recTy <- tySynD (mkName rowTypeName) [] (recDec' headers)
let optsName = case rowTypeName of
[] -> error "Row type name shouldn't be empty"
h:t -> mkName $ toLower h : t ++ "Parser"
optsTy <- sigD optsName [t|ParserOptions|]
optsDec <- valD (varP optsName) (normalB $ lift opts) []
colDecs <- concat <$> mapM (uncurry mkColDecs) headers
return (recTy : optsTy : optsDec : colDecs)
where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
mkColDecs colNm colTy = do
let safeName = tablePrefix ++ (T.unpack . sanitizeTypeName $ colNm)
mColNm <- lookupTypeName safeName
case mColNm of
Just _ -> pure []
Nothing -> colDec (T.pack tablePrefix) colNm colTy