module ProjectM36.Relation.Parse.CSV where
import ProjectM36.Base
import ProjectM36.Error
import ProjectM36.Relation
import ProjectM36.AtomType
import qualified ProjectM36.Attribute as A
import Data.Csv.Parser
import qualified Data.Vector as V
import Data.Char (ord, isUpper, isSpace)
import qualified Data.ByteString.Lazy as BS
import Data.Text.Encoding (decodeUtf8)
import qualified Data.Set as S
import qualified Data.HashMap.Lazy as HM
import qualified Data.List as L
import qualified Data.Text as T
import qualified Data.Attoparsec.ByteString.Lazy as APBL
import qualified Data.Attoparsec.Text as APT
import Control.Arrow
import Text.Read hiding (parens)
import Control.Applicative
import Data.Either
data CsvImportError = CsvParseError String |
AttributeMappingError RelationalError |
HeaderAttributeMismatchError (S.Set AttributeName)
deriving (Show)
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = DecodeOptions {decDelimiter = fromIntegral (ord ',')}
csvAsRelation :: Attributes -> TypeConstructorMapping -> BS.ByteString -> Either CsvImportError Relation
csvAsRelation attrs tConsMap inString = case APBL.parse (csvWithHeader csvDecodeOptions) inString of
APBL.Fail _ _ err -> Left (CsvParseError err)
APBL.Done _ (headerRaw,vecMapsRaw) -> do
let strHeader = V.map decodeUtf8 headerRaw
strMapRecords = V.map convertMap vecMapsRaw
convertMap hmap = HM.fromList $ L.map (decodeUtf8 *** (T.unpack . decodeUtf8)) (HM.toList hmap)
attrNames = V.map A.attributeName attrs
attrNameSet = S.fromList (V.toList attrNames)
headerSet = S.fromList (V.toList strHeader)
parseAtom attrName aType textIn = case APT.parseOnly (parseCSVAtomP attrName tConsMap aType <* APT.endOfInput) textIn of
Left err -> Left (ParseError (T.pack err))
Right eAtom -> eAtom
makeTupleList :: HM.HashMap AttributeName String -> [Either CsvImportError Atom]
makeTupleList tupMap = V.toList $ V.map (\attr ->
either (Left . AttributeMappingError) Right $
parseAtom (A.attributeName attr) (A.atomType attr) (T.pack $ tupMap HM.! A.attributeName attr)) attrs
if attrNameSet == headerSet then do
tupleList <- mapM sequence $ V.toList (V.map makeTupleList strMapRecords)
case mkRelationFromList attrs tupleList of
Left err -> Left (AttributeMappingError err)
Right rel -> Right rel
else
Left $ HeaderAttributeMismatchError (S.difference attrNameSet headerSet)
parseCSVAtomP :: AttributeName -> TypeConstructorMapping -> AtomType -> APT.Parser (Either RelationalError Atom)
parseCSVAtomP _ _ IntegerAtomType = (Right . IntegerAtom) <$> APT.decimal
parseCSVAtomP _ _ IntAtomType = Right . IntAtom <$> APT.decimal
parseCSVAtomP _ _ DoubleAtomType = Right . DoubleAtom <$> APT.double
parseCSVAtomP _ _ TextAtomType = do
s <- quotedString <|> takeToEndOfData
pure (Right (TextAtom s))
parseCSVAtomP _ _ DayAtomType = do
dString <- T.unpack <$> takeToEndOfData
case readMaybe dString of
Nothing -> fail ("invalid Day string: " ++ dString)
Just date -> pure (Right (DayAtom date))
parseCSVAtomP _ _ DateTimeAtomType = do
dString <- T.unpack <$> takeToEndOfData
case readMaybe dString of
Nothing -> fail ("invalid Date string: " ++ dString)
Just date -> pure (Right (DateTimeAtom date))
parseCSVAtomP _ _ ByteStringAtomType = do
bsString <- T.unpack <$> takeToEndOfData
case readMaybe bsString of
Nothing -> fail ("invalid ByteString string: " ++ bsString)
Just bs -> pure (Right (ByteStringAtom bs))
parseCSVAtomP _ _ BoolAtomType = do
bString <- T.unpack <$> takeToEndOfData
case readMaybe bString of
Nothing -> fail ("invalid BoolAtom string: " ++ bString)
Just b -> pure (Right (BoolAtom b))
parseCSVAtomP attrName tConsMap (IntervalAtomType iType) = do
begin <- (APT.char '[' >> pure False) <|> (APT.char '(' >> pure True)
eBeginv <- parseCSVAtomP attrName tConsMap iType
case eBeginv of
Left err -> pure (Left err)
Right beginv -> do
_ <- APT.char ','
eEndv <- parseCSVAtomP attrName tConsMap iType
case eEndv of
Left err -> pure (Left err)
Right endv -> do
end <- (APT.char ']' >> pure False) <|> (APT.char ')' >> pure True)
pure (Right (IntervalAtom beginv endv begin end))
parseCSVAtomP attrName tConsMap typ@(ConstructedAtomType _ tvmap) = do
dConsName <- capitalizedIdentifier
APT.skipSpace
case findDataConstructor dConsName tConsMap of
Nothing -> pure (Left (NoSuchDataConstructorError dConsName))
Just (_, dConsDef) ->
case resolvedAtomTypesForDataConstructorDefArgs tConsMap tvmap dConsDef of
Left err -> pure (Left err)
Right argAtomTypes -> do
atomArgs <- mapM (\argTyp -> let parseNextAtom = parseCSVAtomP attrName tConsMap argTyp <* APT.skipSpace in
case argTyp of
ConstructedAtomType _ _ ->
parens parseNextAtom <|>
parseNextAtom
_ -> parseNextAtom
) argAtomTypes
case lefts atomArgs of
[] -> pure (Right (ConstructedAtom dConsName typ (rights atomArgs)))
errs -> pure (Left (someErrors errs))
parseCSVAtomP attrName _ (RelationAtomType _) = pure (Left (RelationValuedAttributesNotSupportedError [attrName]))
parseCSVAtomP _ _ (TypeVariableType x) = pure (Left (TypeConstructorTypeVarMissing x))
capitalizedIdentifier :: APT.Parser T.Text
capitalizedIdentifier = do
fletter <- APT.satisfy isUpper APT.<?> "capitalized data constructor letter"
rest <- APT.takeWhile (\c -> not (isSpace c || c == ')')) APT.<?> "data constructor name"
pure (fletter `T.cons` rest)
takeToEndOfData :: APT.Parser T.Text
takeToEndOfData = APT.takeWhile (APT.notInClass ",)]")
parens :: APT.Parser a -> APT.Parser a
parens p = do
APT.skip (== '(')
APT.skipSpace
v <- p
APT.skipSpace
APT.skip (== ')')
pure v
quotedString :: APT.Parser T.Text
quotedString = do
let escapeMap = [('"','"'), ('n', '\n'), ('r', '\r')]
APT.skip (== '"')
(_, s) <- APT.runScanner [] (\prevl nextChar -> case prevl of
[] -> Just [nextChar]
chars | last chars == '\\' ->
case lookup nextChar escapeMap of
Nothing -> Just (chars ++ [nextChar])
Just escapeVal -> Just (init chars ++ [escapeVal])
| nextChar == '"' -> Nothing
| otherwise -> Just (chars ++ [nextChar]))
APT.skip (== '"')
pure (T.pack s)