-- Copyright © 2011 Bart Massey
-- [This program is licensed under the "MIT License"]
-- Please see the file COPYING in the source
-- distribution of this software for license terms.

{-# LANGUAGE DeriveDataTypeable #-}

-- | This modules provides conversion routines to and from
-- various \"something-separated value\" (SSV) formats.  In
-- particular, it converts the infamous \"comma separated
-- value\" (CSV) format.
module Text.SSV (
  -- * SSV format descriptions 
  -- | These records define a fairly flexible, if entirely
  -- kludgy, domain-specific language for describing
  -- \"something-separated value\" formats. An attempt is made
  -- in the reader and formatter to allow for fairly
  -- arbitrary combinations of features in a sane
  -- way. However, your mileage may undoubtedly vary; CSV is
  -- the only tested configuration.
  SSVFormat(..), 
  SSVFormatQuote(..),
  -- * SSV read, show and IO routines
  readSSV, 
  showSSV, 
  hPutSSV, 
  writeSSVFile, 
  -- * CSV read, show and IO routines 
  -- | CSV is a special case here. Partly this is by virtue
  -- of being the most common format.  CSV also needs a
  -- little bit of \"special\" help with input line endings
  -- to conform to RFC 4180.
  readCSV, 
  showCSV, 
  hPutCSV,
  writeCSVFile,
  -- * Newline conversions
  toNL,
  fromNL,
  -- * Exceptions
  SSVReadException(..), 
  SSVShowException(..),
  -- * Predefined formats
  csvFormat, 
  pwfFormat )
where

import Control.Exception
import Data.Char
import Data.List
import Data.Maybe
import qualified Data.Set as Set
import Data.Typeable
import System.IO

-- | Formatting information for quoted strings for a
-- particular SSV variant.
data SSVFormatQuote = SSVFormatQuote {
  ssvFormatQuoteEscape :: Maybe Char,
  ssvFormatQuoteLeft :: Char,
  ssvFormatQuoteRight :: Char
}

-- | Formatting information for a particular SSV variant.
data SSVFormat = SSVFormat {
  ssvFormatName :: String,
  ssvFormatTerminator :: Char, -- ^ End of row.
  ssvFormatSeparator :: Char, -- ^ Field separator.
  ssvFormatEscape :: Maybe Char, -- ^ Escape character outside of quotes.
  ssvFormatStripWhite :: Bool, -- ^ Strip "extraneous" whitespace next to separators on input.
  ssvFormatQuote :: Maybe SSVFormatQuote, -- ^ Quote format.
  ssvFormatWhiteChars :: String } -- ^ Characters regarded as whitespace.

-- | 'SSVFormat' for CSV data. Closely follows RFC 4180.
csvFormat :: SSVFormat
csvFormat = SSVFormat {
  ssvFormatName = "CSV",
  ssvFormatTerminator = '\n',
  ssvFormatSeparator = ',',
  ssvFormatEscape = Nothing,
  ssvFormatStripWhite = True,
  ssvFormatQuote = Just $ SSVFormatQuote {
    ssvFormatQuoteEscape = Just '"',
    ssvFormatQuoteLeft = '"',
    ssvFormatQuoteRight = '"' },
  ssvFormatWhiteChars = " \t" }

-- | 'SSVFormat' for UNIX \"password file\" data, i.e. colon-separated
-- fields with no escape convention.
pwfFormat :: SSVFormat
pwfFormat = SSVFormat {
  ssvFormatName = "Colon-separated values",
  ssvFormatTerminator = '\n',
  ssvFormatSeparator = ':',
  ssvFormatEscape = Nothing,
  ssvFormatStripWhite = False,
  ssvFormatQuote = Nothing,
  ssvFormatWhiteChars = "" }

-- | Indicates format name, line and column and gives an error message.
data SSVReadException = SSVReadException String (Int, Int) String
     		      | SSVEOFException String String
                        deriving Typeable

-- | Indicates format name and failed field and gives an
-- error message.  This should probably just be an 'error',
-- as the calling program is really responsible for passing
-- something formattable to the show routines.
data SSVShowException = SSVShowException String String String
                        deriving Typeable

instance Show SSVReadException where
  show (SSVReadException fmt (line, col) msg) =
    fmt ++ ":" ++ show line ++ ":" ++ show col ++ ": " ++ 
      "read error: " ++ msg
  show (SSVEOFException fmt msg) =
    fmt ++ ": read error at end of file: " ++ msg


instance Show SSVShowException where
  show (SSVShowException fmt s msg) =
    fmt ++ ": field " ++ show s ++ ": show error: " ++ msg

instance Exception SSVReadException

instance Exception SSVShowException

throwRE :: SSVFormat -> (Int, Int) -> String -> a
throwRE fmt pos msg =
  throw $ SSVReadException (ssvFormatName fmt) pos msg

throwSE :: SSVFormat -> String -> String -> a
throwSE fmt s msg =
  throw $ SSVShowException (ssvFormatName fmt) s msg

-- | Convert CR / LF sequences on input to LF (NL). Also convert
-- other CRs to LF. This is probably the right way to handle CSV
-- data.
toNL :: String -> String
toNL =
  foldr clean1 []
  where
    clean1 :: Char -> String -> String
    clean1 '\r' cs@('\n' : _) = cs
    clean1 '\r' cs = '\n' : cs
    clean1 c cs = c : cs

-- | Convert LF (NL) sequences on input to CR LF. Leaves
-- | other CRs alone.
fromNL :: String -> String
fromNL =
  foldr dirty1 []
  where
    dirty1 :: Char -> String -> String
    dirty1 '\n' cs = '\r' : '\n' : cs
    dirty1 c cs = c : cs

-- | Read using an arbitrary 'SSVFormat'. The input is not
-- cleaned with 'toNL'; if you want this, do it yourself.
-- The standard SSV formats 'csvFormat' and 'pwfFormat' are
-- provided.
readSSV :: SSVFormat -> String -> [[String]]
readSSV fmt = 
  nextsw (1, 1)
  where
    -- State for initialization and fallback from end of field.
    nextsw p cs
      | ssvFormatStripWhite fmt = nextSW p cs
      | otherwise = nextSX p cs
    -- A bunch of abbreviations for concision.
    rs = ssvFormatTerminator fmt
    fs = ssvFormatSeparator fmt
    efmt = ssvFormatEscape fmt
    e = isJust efmt
    ec = fromJust efmt
    qfmt = ssvFormatQuote fmt
    q = isJust qfmt
    lq = ssvFormatQuoteLeft $ fromJust qfmt
    rq = ssvFormatQuoteRight $ fromJust qfmt
    qesc = ssvFormatQuoteEscape $ fromJust qfmt
    qe = isJust qesc
    eq = fromJust qesc
    -- Increment the position in the input various ways.
    incp (line, _) '\n' = (line + 1, 1)
    incp (line, col) '\t' = (line, tcol) 
      where tcol = col + 8 - ((col + 7) `mod` 8)
    incp (line, _) '\r' = (line, 1)
    incp (line, col) _ = (line, col + 1)
    -- The actual state machine for the labeler.
    -- reading a whitespace char
    nextSW p (' ' : cs)    = nextSW (incp p ' ') cs
    nextSW p ('\t' : cs)   = nextSW (incp p '\t') cs
    nextSW p (c : cs)
      | c == rs            = mkCRS $ nextsw (incp p c) cs
      | c == fs            = mkCFS $ nextsw (incp p c) cs
      | e && c == ec       = nextSE (incp p c) cs
      | q && c == lq       = nextSQ (incp p c) cs
      | otherwise          = mkCX c $ nextSX (incp p c) cs
    nextSW _ []            = []
    -- reading a generic char
    nextSX p (c : cs)
      | c == rs            = mkCRS $ nextsw (incp p c) cs
      | c == fs            = mkCFS $ nextsw (incp p c) cs
      | e && c == ec       = nextSE (incp p c) cs
      | q && c == lq       = throwRE fmt p "illegal quote"
      | otherwise          = mkCX c $ nextSX (incp p c) cs
    nextSX _ []            = []
    -- reading a quoted char
    nextSQ p (c : cs) 
      | c == rs            = mkCX c $ nextSQ (incp p c) cs
      | q && qe && c == eq = nextSZ (incp p c) cs
      | q && c == rq       = nextSD (incp p c) cs
      | otherwise          = mkCX c $ nextSQ (incp p c) cs
    nextSQ _ []            = throw $ SSVEOFException 
                               (ssvFormatName fmt) "unclosed quote"
    -- reading an escaped char
    nextSE p (c : cs)      = mkCX c $ nextSX (incp p c) cs
    nextSE _ []            = []
    -- reading a quoted-escaped char    
    nextSZ p (' ' : cs)    = nextSD (incp p ' ') cs
    nextSZ p ('\t' : cs)   = nextSD (incp p '\t') cs
    nextSZ p (c : cs)
      | c == rs            = mkCRS $ nextsw (incp p c) cs
      | c == fs            = mkCFS $ nextsw (incp p c) cs
      | q && qe && c == eq = mkCX c $ nextSQ (incp p c) cs
      | q && c == rq       = mkCX c $ nextSQ (incp p c) cs
      | q && c == lq       = mkCX c $ nextSQ (incp p c) cs
      | otherwise          = throwRE fmt p "illegal escape"
    nextSZ _ []            = []
    -- reading a post-quote char
    nextSD p (' ' : cs)    = nextSD (incp p ' ') cs
    nextSD p ('\t' : cs)   = nextSD (incp p '\t') cs
    nextSD p (c : cs)
      | c == fs            = mkCFS $ nextsw (incp p c) cs
      | c == rs            = mkCRS $ nextsw (incp p c) cs
      | otherwise          = throwRE fmt p "junk after quoted field"
    nextSD _ []            = []
    -- The collector functions for building up the list.
    -- character    
    mkCX x [] = [[[x]]]
    mkCX x ([]:rss) = [[x]]:rss
    mkCX x ((w:wss):rss) = ((x:w):wss):rss
    -- field separator
    mkCFS [] = [["",""]]   -- no newline at end of file
    mkCFS (r:rss) = ("":r):rss
    -- record separator
    mkCRS rss = [""]:rss

-- | Convert a 'String' representing a CSV file into a
-- properly-parsed list of rows, each a list of 'String'
-- fields. Adheres to the spirit and (mostly) to the letter
-- of RFC 4180, which defines the `text/csv` MIME type.
-- 
-- 'toNL' is used on the input string to clean up the
-- various line endings that might appear. Note that this
-- may result in irreversible, undesired manglings of CRs
-- and LFs.
-- 
-- Fields are expected to be separated by commas. Per RFC
-- 4180, fields may be double-quoted: only whitespace, which
-- is discarded, may appear outside the double-quotes of a
-- quoted field. For unquoted fields, whitespace to the left
-- of the field is discarded, but whitespace to the right is
-- retained; this is convenient for the parser, and probably
-- corresponds to the typical intent of CSV authors. Whitespace
-- on both sides of a quoted field is discarded. If a
-- double-quoted fields contains two double-quotes in a row,
-- these are treated as an escaped encoding of a single
-- double-quote.
-- 
-- The final line of the input may end with a line terminator,
-- which will be ignored, or without one.
readCSV :: String -> [[String]]
readCSV = readSSV csvFormat . toNL

-- | Show using an arbitrary 'SSVFormat'.  The standard SSV
-- formats 'csvFormat' and 'pwfFormat' are provided. Some
-- effort is made to \"intelligently\" quote the fields; in
-- the worst case an 'SSVShowException' will be thrown to
-- indicate that a field had characters that could not be
-- quoted. Spaces or tabs in input fields only causes quoting
-- if they are adjacent to a separator, and then only if
-- 'ssvFormatStripWhite' is 'True'.
showSSV :: SSVFormat -> [[String]] -> String
showSSV fmt = 
  concatMap showRow
  where
    showRow = 
      (++ "\n") . intercalate [ssvFormatSeparator fmt] . map showField
      where
        -- Quote the field as needed.
        showField s
          | any needsQuoteChar s || endIsWhite s =
            case ssvFormatQuote fmt of
              Just qfmt ->
                if isJust (ssvFormatQuoteEscape qfmt) ||
                   not (elem (ssvFormatQuoteRight qfmt) s)
                then quote qfmt s
                else case ssvFormatEscape fmt of
                     Just ch -> escape ch s
                     Nothing -> throwSE fmt s "unquotable character in field"
              Nothing -> 
                case ssvFormatEscape fmt of
                  Just ch -> escape ch s
                  Nothing -> throwSE fmt s "unquotable character in field"
          | otherwise = s
            where
              needsQuoteChar c
                  | Set.member c quotableChars = True
                  | isPrint c = False
                  | otherwise = True
                  where
                    -- Set of characters that require a field to be quoted.
                    -- XXX This maybe could be kludgier, but I don't know how.
                    quotableChars =
                        Set.fromList $ concat $ catMaybes [
                          Just [ssvFormatTerminator fmt],
                          Just [ssvFormatSeparator fmt],
                          fmap (:[]) $ ssvFormatEscape fmt,
                          fmap ((:[]) . ssvFormatQuoteLeft) $
                            ssvFormatQuote fmt ]
              endIsWhite _ | not (ssvFormatStripWhite fmt) = False
              endIsWhite "" = False
              endIsWhite s' =
                  let firstChar = head s'
                      lastChar = last s'
                  in
                  firstChar `elem` ssvFormatWhiteChars fmt ||
                  lastChar `elem` ssvFormatWhiteChars fmt
              quote qfmt s' = [ssvFormatQuoteLeft qfmt] ++
                              qescape qfmt s' ++
                              [ssvFormatQuoteRight qfmt]
              escape esc s' =
                foldr escape1 "" s'
                where
                  escape1 c cs
                    | needsQuoteChar c = esc : c : cs
                    | otherwise = c : cs
              qescape qfmt s' =
                case ssvFormatQuoteEscape qfmt of
                  Just qesc -> foldr (qescape1 qesc) "" s'
                  Nothing -> s'
                  where
                    qescape1 qesc c cs
                      | c == qesc || c == ssvFormatQuoteRight qfmt =
                        qesc : c : cs
                      | otherwise =
                        c : cs

-- | Convert a list of rows, each a list of 'String' fields,
-- to a single 'String' CSV representation. Adheres to the
-- spirit and (mostly) to the letter of RFC 4180, which
-- defines the `text/csv` MIME type.
-- 
-- Newline will be used as the end-of-line character, and no
-- discardable whitespace will appear in fields. Fields that
-- need to be quoted because they contain a special
-- character or line terminator will be quoted; all other
-- fields will be left unquoted. The final row of CSV will
-- end with a newline.
showCSV :: [[String]] -> String
showCSV = showSSV csvFormat

-- | Put a representation of the given SSV input out on a
-- file handle using the given 'SSVFormat'. Uses CRLF as the
-- line terminator character, as recommended by RFC 4180 for
-- CSV.  Otherwise, this function behaves as writing the
-- output of 'showSSV' to the 'Handle'; if you want native
-- line terminators, this latter method works for that.
hPutSSV :: SSVFormat -> Handle -> [[String]] -> IO ()
hPutSSV fmt h csv = do
  hSetEncoding h utf8
  let nlm = NewlineMode { inputNL = nativeNewline, outputNL = CRLF }
  hSetNewlineMode h nlm
  hPutStr h $ showSSV fmt csv

-- | Perform 'hPutSSV' with 'csvFormat'.
hPutCSV :: Handle -> [[String]] -> IO ()
hPutCSV = hPutSSV csvFormat

-- | Write an SSV representation of the given input into a
-- new file located at the given path, using the given
-- 'SSVFormat'. As with 'hPutCSV', CRLF will be used as the
-- line terminator.
writeSSVFile :: SSVFormat -> String -> [[String]] -> IO ()
writeSSVFile fmt path csv = do
  h <- openFile path WriteMode
  hPutSSV fmt h csv
  hClose h

-- | Perform 'writeSSVFile' with 'csvFormat'.
writeCSVFile :: String -> [[String]] -> IO ()
writeCSVFile = writeSSVFile csvFormat