{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE LambdaCase #-}

-- | Parses NCBI BLAST+ tabular output

module Biobase.BLAST.Import (blastCmdJSON2FromFile,
                             parseJSONBlast,
                             blastFromFile,
                             parseTabularBlasts,
                             parseTabularHTTPBlasts,
                             blastHTTPFromFile
                            ) where

import Prelude hiding (takeWhile)
import Data.Attoparsec.ByteString.Char8 hiding (isSpace)
import qualified Data.Attoparsec.ByteString.Lazy as L
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.Builder as S
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Vector as V
import System.Directory
import Data.Char
import Control.Monad
import Debug.Trace
import Text.Printf
import Biobase.BLAST.Types
import Data.Aeson as A

-- | reads and parses tabular Blast result from provided filePath
blastCmdJSON2FromFile :: String -> IO (Either String BlastCmdJSON2)
blastCmdJSON2FromFile filePath = do
  printf "# reading blast JSON2 input from file %s\n" filePath
  blastFileExists <- doesFileExist filePath
  if blastFileExists
     then do
       bs <- B.readFile filePath
       let json = parseJSONBlastCmd bs
       return json
     else fail "# JSON2 blast file \"%s\" does not exist\n" filePath

parseJSONBlastCmd :: B.ByteString -> Either String BlastCmdJSON2
parseJSONBlastCmd bs = A.eitherDecode bs :: Either String BlastCmdJSON2

parseJSONBlast :: B.ByteString -> Either String BlastJSON2
parseJSONBlast bs = A.eitherDecode bs :: Either String BlastJSON2

-- | reads and parses tabular Blast result from provided filePath
blastFromFile :: String -> IO [BlastTabularResult]
blastFromFile filePath = do
  printf "# reading tabular blast input from file %s\n" filePath
  blastFileExists <- doesFileExist filePath
  if blastFileExists
     then parseTabularBlasts <$> B.readFile filePath
     else fail "# tabular blast file \"%s\" does not exist\n" filePath

-- | reads and parses tabular HTTP Blast result from provided filePath
blastHTTPFromFile :: String -> IO [BlastTabularResult]
blastHTTPFromFile filePath = do
  printf "# reading tabular blast input from file %s\n" filePath
  blastFileExists <- doesFileExist filePath
  if blastFileExists
      then parseTabularHTTPBlasts <$> B.readFile filePath
      else fail "# tabular blast file \"%s\" does not exist\n" filePath

-- | Read a lazy bytestring and stream out a lsit of @BlastTabularResult@'s.
-- In case, there is a parse error "late" in the file, we might have
-- already streamed out some (or many!) of these results.

parseTabularBlasts :: B.ByteString -> [BlastTabularResult]
parseTabularBlasts = go
  where go xs = case L.parse genParseTabularBlast xs of
          L.Fail remainingInput ctxts err  -> error $ "parseTabularBlasts failed! " ++ err ++ " ctxt: " ++ show ctxts ++ " head of remaining input: " ++ B.unpack (B.take 1000 remainingInput)
          L.Done remainingInput btr
            | B.null remainingInput  -> [btr]
            | otherwise              -> btr : go remainingInput

parseTabularHTTPBlasts :: B.ByteString -> [BlastTabularResult]
parseTabularHTTPBlasts = go
  where go xs = case L.parse genParseTabularHTTPBlast xs of
          L.Fail remainingInput ctxts err  -> error $ "parseTabularHTTPBlasts failed! " ++ err ++ " ctxt: " ++ show ctxts ++ " head of remaining input: " ++ B.unpack (B.take 1000 remainingInput)
          L.Done remainingInput btr
            | B.null remainingInput  -> [btr]
            | otherwise              -> btr : go remainingInput

genParseBlastProgram :: Parser BlastProgram
genParseBlastProgram = do
  choice [string "# BLAST",string "# blast"]
  (toLower <$> anyChar) >>= return . \case
    'x' -> BlastX
    'p' -> BlastP
    'n' -> BlastN

genParseTabularBlast :: Parser BlastTabularResult
genParseTabularBlast = do
  _blastProgram <- genParseBlastProgram <?> "Program"
  many1 (notChar '\n')
  endOfLine
  string "# Query: " <?> "Query"
  _blastQueryId <- takeWhile (not . isSpace) <* manyTill anyChar endOfLine <?> "QueryId"
  string "# Database: " <?> "Database"
  _blastDatabase <- many1 (notChar '\n') <?> "Db"
  string "\n# " <?> "header linebreak"
  --fields line
  skipMany (try genParseFieldLine) <?> "Fields"
  _blastHitNumber <- decimal  <?> "Hit number"
  string " hits found\n" <?> "hits found"
  _tabularHit <- count  _blastHitNumber (try genParseBlastTabularHit)  <?> "Tabular hit"
  skipMany endOfLine
  return $ BlastTabularResult _blastProgram (toLB _blastQueryId) (B.pack _blastDatabase) _blastHitNumber (V.fromList _tabularHit)

genParseTabularHTTPBlast :: Parser BlastTabularResult
genParseTabularHTTPBlast = do
  _blastProgram <- genParseBlastProgram <?> "Program"
  --many1 (notChar '\n')
  endOfLine
  string "# Iteration: " <?> "Iteration" -----
  _ <- takeWhile (not . isSpace) <* manyTill anyChar endOfLine <?> "IterationNumber" -----
  string "# Query: " <?> "Query"
  _blastQueryId <- takeWhile (not . isSpace) <* manyTill anyChar endOfLine <?> "QueryId"
  string "# RID: " <?> "RID" -----
  _ <- takeWhile (not . isSpace) <* manyTill anyChar endOfLine <?> "RID" -----
  string "# Database: " <?> "Database"
  _blastDatabase <- many1 (notChar '\n') <?> "Db"
  string "\n# " <?> "header linebreak"
  --fields line
  skipMany (try genParseFieldLine) <?> "Fields"
  _blastHitNumber <- decimal  <?> "Hit number"
  string " hits found\n" <?> "hits found"
  _tabularHit <- count  _blastHitNumber (try genParseBlastHTTPTabularHit)  <?> "Tabular hit"
  skipMany endOfLine
  return $ BlastTabularResult _blastProgram (toLB _blastQueryId) (B.pack _blastDatabase) _blastHitNumber (V.fromList _tabularHit)

genParseFieldLine :: Parser ()
genParseFieldLine = do
  string "Fields:"
  skipMany (notChar '\n')
  string "\n# "
  return ()

genParseBlastTabularHit :: Parser BlastTabularHit
genParseBlastTabularHit = do
  _queryId <- takeWhile1 ((/=9) . ord) <?> "hit qid"
  char '\t'
  _subjectId <- takeWhile1 ((/=9) . ord) <?> "hit sid"
  char '\t'
  _seqIdentity <- double <?> "hit seqid"
  char '\t'
  _alignmentLength <- decimal  <?> "hit sid"
  char '\t'
  _misMatches <- decimal <?> "hit mmatch"
  char '\t'
  _gapOpenScore <- decimal <?> "hit gopen"
  char '\t'
  _queryStart <- decimal <?> "hit qstart"
  char '\t'
  _queryEnd <- decimal  <?> "hit qend"
  char '\t'
  _hitSeqStart <- decimal  <?> "hit sstart"
  char '\t'
  _hitSeqEnd <- decimal <?> "hit send"
  char '\t'
  _eValue <- double <?> "hit eval"
  char '\t'
  _bitScore <- double <?> "hit bs"
  char '\t'
  _subjectFrame <- decimal <?> "hit sF"
  char '\t'
  _querySeq <- takeWhile1 ((/=9) . ord) <?> "hit qseq" -- 9 == '\t'
  char '\t'
  _subjectSeq <- takeWhile1 ((/=10) . ord) <?> "hit subSeq" -- 10 == '\n'
  char '\n'
  return $ BlastTabularHit (B.fromStrict _queryId) (B.fromStrict _subjectId) _seqIdentity _alignmentLength _misMatches _gapOpenScore _queryStart _queryEnd _hitSeqStart _hitSeqEnd _eValue _bitScore _subjectFrame (B.fromStrict _querySeq) (B.fromStrict _subjectSeq)

-- specific for Tabular Blast from NCBI HTTP requests
genParseBlastHTTPTabularHit :: Parser BlastTabularHit
genParseBlastHTTPTabularHit = do
    _queryId <- takeWhile1 ((/=9) . ord) <?> "hit qid"
    char '\t'
    _subjectId <- takeWhile1 ((/=9) . ord) <?> "hit sid"
    char '\t'
    _ <- takeWhile1 ((/=9) . ord) <?> "redundant id1"
    char '\t'
    _ <- takeWhile1 ((/=9) . ord) <?> "redundant id2"
    char '\t'
    _seqIdentity <- double <?> "hit seqid"
    char '\t'
    _alignmentLength <- decimal  <?> "hit sid"
    char '\t'
    _misMatches <- decimal <?> "hit mmatch"
    char '\t'
    _gapOpenScore <- decimal <?> "hit gopen"
    char '\t'
    _queryStart <- decimal <?> "hit qstart"
    char '\t'
    _queryEnd <- decimal  <?> "hit qend"
    char '\t'
    _hitSeqStart <- decimal  <?> "hit sstart"
    char '\t'
    _hitSeqEnd <- decimal <?> "hit send"
    char '\t'
    _eValue <- double <?> "hit eval"
    char '\t'
    _bitScore <- double <?> "hit bs"
    char '\n'
    return $ BlastTabularHit (B.fromStrict _queryId) (B.fromStrict _subjectId) _seqIdentity _alignmentLength _misMatches _gapOpenScore _queryStart _queryEnd _hitSeqStart _hitSeqEnd _eValue _bitScore 0 B.empty B.empty

-- Blast evalues can be reported as e.g. .051e-22 not supported by double parsing
readEvalue :: C.ByteString -> Double
readEvalue eValBs
  | (head stringEval) == '.' = read ('0':(stringEval)) :: Double
  | otherwise = read stringEval :: Double
  where stringEval = C.unpack eValBs

--IUPAC amino acid with gap
--aminoacidLetters :: Char -> Bool
aminoacidLetters = inClass "ARNDCQEGHILMFPSTWYVBZX-"

--IUPAC nucleic acid characters with gap
--nucleotideLetters :: Char -> Bool
nucleotideLetters = inClass "AGTCURYSWKMBDHVN-."

--IUPAC nucleic acid characters with gap
--bioLetters :: Char -> Bool
bioLetters = inClass "ABCDEFGHIJKLMNOPQRSTUVWXYZ.-"


toLB :: C.ByteString -> B.ByteString
toLB = S.toLazyByteString . S.byteString