-- | This module contains parsing functions for Infernal programs

module Biobase.RNAlien.InfernalParser (
                           module Biobase.RNAlien.Types,
                           readCMSearch,
                           readCMSearches,
                           parseCMSearch,
                           parseCMSearches,
                           parseCMstat,
                           readCMstat
                           )
where

import Text.ParserCombinators.Parsec
import Biobase.RNAlien.Types
import qualified Data.ByteString.Char8 as B
import qualified Control.Exception.Base as CE

-- | parse from input filePath              
parseCMSearch :: String -> Either ParseError CMsearch
parseCMSearch = parse genParserCMSearch "parseCMsearch"

-- | parse from input filePath              
parseCMSearches :: String -> Either ParseError CMsearch
parseCMSearches = parse genParserCMSearches "parseCMsearch"

-- | parse from input filePath                      
readCMSearch :: String -> IO (Either ParseError CMsearch)
readCMSearch filePath = do
  parsedFile <- parseFromFile genParserCMSearch filePath
  CE.evaluate parsedFile

-- | parse from input filePath                      
readCMSearches :: String -> IO (Either ParseError CMsearch)
readCMSearches filePath = do
  parsedFile <- parseFromFile genParserCMSearches filePath
  CE.evaluate parsedFile

genParserCMSearches :: GenParser Char st CMsearch
genParserCMSearches = do
  string "# cmsearch :: search CM(s) against a sequence database"
  newline
  string "# INFERNAL "
  many1 (noneOf "\n")
  newline
  string "# Copyright (C) 201"
  many1 (noneOf "\n")
  newline
  string "# Freely distributed under the GNU General Public License (GPLv3)."
  newline
  string "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
  newline
  string "# query CM file:"
  many1 space
  queryCMfile' <- many1 (noneOf "\n")
  newline
  string "# target sequence database:"
  many1 space
  targetSequenceDatabase' <- many1 (noneOf "\n")
  newline
  optional (try (genParserCMsearchHeaderField "# CM configuration"))
  optional (try (genParserCMsearchHeaderField "# database size is set to"))
  optional (try (genParserCMsearchHeaderField "# truncated sequence detection"))
  string "# number of worker threads:"
  many1 space
  numberOfWorkerThreads' <- many1 (noneOf "\n")
  newline
  string "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
  newline
  optional newline
  cmSearchesHits <- many1 (try genParserMultipleCMSearch)
  optional (string "[ok]\n")
  eof
  return $ CMsearch queryCMfile' targetSequenceDatabase' numberOfWorkerThreads' (concat cmSearchesHits)

genParserCMSearch :: GenParser Char st CMsearch
genParserCMSearch = do
  string "# cmsearch :: search CM(s) against a sequence database"
  newline
  string "# INFERNAL "
  many1 (noneOf "\n")
  newline
  string "# Copyright (C) 201"
  many1 (noneOf "\n")
  newline
  string "# Freely distributed under the GNU General Public License (GPLv3)."
  newline
  string "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
  newline
  string "# query CM file:"
  many1 space
  queryCMfile' <- many1 (noneOf "\n")
  newline
  string "# target sequence database:"
  many1 space
  targetSequenceDatabase' <- many1 (noneOf "\n")
  newline
  optional (try (genParserCMsearchHeaderField "# CM configuration"))
  optional (try (genParserCMsearchHeaderField "# database size is set to"))
  optional (try (genParserCMsearchHeaderField "# truncated sequence detection"))
  string "# number of worker threads:"
  many1 space
  numberOfWorkerThreads' <- many1 (noneOf "\n")
  newline
  string "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
  newline
  optional newline
  string "Query:"
  many1 (noneOf "\n")
  newline
  optional (try (genParserCMsearchHeaderField "Accession"))
  optional (try (genParserCMsearchHeaderField "Description"))
  string "Hit scores:"
  newline
  choice  [try (string " rank"), try (string "  rank") , try (string "   rank"), try (string "    rank"),try (string "     rank"),try (string "      rank")]
  many1 space
  string "E-value"
  many1 space
  string "score"
  many1 space
  string "bias"
  many1 space
  string "sequence"
  many1 space
  string "start"
  many1 space
  string "end"
  many1 space
  string "mdl"
  many1 space
  string "trunc"
  many1 space
  string "gc"
  many1 space
  string "description"
  newline
  string " -"
  many1 (try (oneOf " -"))
  newline
  optional (try (string " ------ inclusion threshold ------"))
  many newline
  hitScores' <- many (try genParserCMsearchHit) --`endBy` (try (string "Hit alignments:"))
  optional (try genParserCMsearchEmptyHit)
  -- this is followed by hit alignments and internal cmsearch statistics which are not parsed
  many anyChar
  eof
  return $ CMsearch queryCMfile' targetSequenceDatabase' numberOfWorkerThreads' hitScores'

-- | Parsing function for CMSearches with multiple querymodels in one modelfile, e.g. clans
genParserMultipleCMSearch :: GenParser Char st [CMsearchHit]
genParserMultipleCMSearch = do
  --optional newline
  --optional string "//"
  string "Query:"
  many1 (noneOf "\n")
  newline
  optional (try (genParserCMsearchHeaderField "Accession"))
  optional (try (genParserCMsearchHeaderField "Description"))
  string "Hit scores:"
  newline
  choice  [try (string " rank"), try (string "  rank") , try (string "   rank"), try (string "    rank"),try (string "     rank"),try (string "      rank")]
  many1 space
  string "E-value"
  many1 space
  string "score"
  many1 space
  string "bias"
  many1 space
  string "sequence"
  many1 space
  string "start"
  many1 space
  string "end"
  many1 space
  string "mdl"
  many1 space
  string "trunc"
  many1 space
  string "gc"
  many1 space
  string "description"
  newline
  string " -"
  many1 (try (oneOf " -"))
  newline
  optional (try (string " ------ inclusion threshold ------"))
  many newline
  hitScores' <- many (try genParserCMsearchHit) --`endBy` (try (string "Hit alignments:"))
  optional (try genParserCMsearchEmptyHit)
  -- this is followed by hit alignments and internal cmsearch statistics which are not parsed
  --many anyChar
  manyTill anyChar (try (string "//\n"))
  return hitScores'

genParserCMsearchHeaderField :: String -> GenParser Char st String
genParserCMsearchHeaderField fieldname = do
  string (fieldname ++ ":")
  many1 space
  many1 (noneOf "\n")
  newline
  return []

genParserCMsearchEmptyHit :: GenParser Char st [CMsearchHit]
genParserCMsearchEmptyHit = do
  string "   [No hits detected that satisfy reporting thresholds]"
  newline
  optional (try newline)
  return []

genParserCMsearchHit :: GenParser Char st CMsearchHit
genParserCMsearchHit = do
  many1 space
  string "("
  hitRank' <- many1 digit
  string ")"
  many1 space
  hitSignificant' <- choice [char '!', char '?']
  many1 space
  hitEValue' <- many1 (oneOf "0123456789.e-")
  many1 space
  hitScore'  <- many1 (oneOf "0123456789.e-")
  many1 space
  hitBias' <- many1 (oneOf "0123456789.e-")
  many1 space
  hitSequenceHeader' <- many1 (noneOf " ")
  many1 space
  hitStart' <- many1 digit
  many1 space
  hitEnd' <- many1 digit
  many1 space
  hitStrand' <- choice [char '+', char '-', char '.']
  many1 space
  hitModel' <- many1 letter
  many1 space
  hitTruncation' <- many1 (choice [alphaNum, char '\''])
  many1 space
  hitGCcontent' <- many1 (oneOf "0123456789.e-")
  many1 space
  hitDescription' <- many1 (noneOf "\n")
  newline
  optional (try (string " ------ inclusion threshold ------"))
  optional (try newline)
  return $ CMsearchHit (readInt hitRank') hitSignificant' (readDouble hitEValue') (readDouble hitScore') (readDouble hitBias') (B.pack hitSequenceHeader') (readInt hitStart') (readInt hitEnd') hitStrand' (B.pack hitModel') (B.pack hitTruncation') (readDouble hitGCcontent') (B.pack hitDescription')

-- | parse from input filePath              
parseCMstat :: String -> Either ParseError CMstat
parseCMstat = parse genParserCMstat "parseCMstat"

-- | parse from input filePath                      
readCMstat :: String -> IO (Either ParseError CMstat)
readCMstat filePath = do
  parsedFile <- parseFromFile genParserCMstat filePath
  CE.evaluate parsedFile

genParserCMstat :: GenParser Char st CMstat
genParserCMstat = do
  string "# cmstat :: display summary statistics for CMs"
  newline
  string "# INFERNAL "
  many1 (noneOf "\n")
  newline
  string "# Copyright (C) 201"
  many1 (noneOf "\n")
  newline
  string "# Freely distributed under the GNU General Public License (GPLv3)."
  newline
  string "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
  newline
  char '#'
  many1 (char ' ')
  string "rel entropy"
  newline
  char '#'
  many1 (char ' ')
  many1 (char '-')
  newline
  char '#'
  many1 space
  string "idx"
  many1 space
  string "name"
  many1 space
  string "accession"
  many1 space
  string "nseq"
  many1 space
  string "eff_nseq"
  many1 space
  string "clen"
  many1 space
  string "W"
  many1 space
  string "bps"
  many1 space
  string "bifs"
  many1 space
  string "model"
  many1 space
  string "cm"
  many1 space
  string "hmm"
  newline
  string "#"
  many1 (try (oneOf " -"))
  newline
  many1 space
  _statIndex <- many1 digit
  many1 space
  _statName <- many1 letter
  many1 space
  _statAccession <- many1 (noneOf " ")
  many1 space
  _statSequenceNumber <- many1 digit
  many1 space
  _statEffectiveSequences <- many1 (oneOf "0123456789.e-")
  many1 space
  _statConsensusLength <- many digit
  many1 space
  _statW <- many1 digit
  many1 space
  _statBasepaires <- many1 digit
  many1 space
  _statBifurcations <- many1 digit
  many1 space
  _statModel <- many1 letter
  many1 space
  _relativeEntropyCM <- many1 (oneOf "0123456789.e-")
  many1 space
  _relativeEntropyHMM <- many1 (oneOf "0123456789.e-")
  newline
  char '#'
  newline
  eof
  return $ CMstat (readInt _statIndex) _statName _statAccession (readInt _statSequenceNumber) (readDouble _statEffectiveSequences) (readInt _statConsensusLength) (readInt _statW) (readInt _statBasepaires) (readInt _statBifurcations) _statModel (readDouble _relativeEntropyCM) (readDouble _relativeEntropyHMM)
--   
readInt :: String -> Int
readInt = read

readDouble :: String -> Double
readDouble = read