{-# LANGUAGE BangPatterns, OverloadedStrings #-}
module TransformFastaList ( convertToAminoAcidsFastaSequence
, replaceChars
, fillInSequence
, insertSequence
, changeField
, changeAllFields
, getRegionSequence
, trimFasta
, removeUnknownNucs
) where
import Data.List
import Data.Char
import qualified Data.Sequence as Seq
import qualified Data.Foldable as F
import qualified Data.Text as T
import Data.Fasta.Text
import Text.Regex.TDFA
import Text.Regex.TDFA.Text
import Types
import Utility
convertToAminoAcidsFastaSequence :: CodonTable
-> FastaSequence
-> FastaSequence
convertToAminoAcidsFastaSequence table = fromEither . customTranslate table 1
fillInSequence :: Field -> Start -> Char -> FastaSequence -> FastaSequence
fillInSequence f s c fs = fs { fastaSeq = newFastaSeq }
where
newFastaSeq = (\xs -> if T.singleton c `T.isInfixOf` xs then "" else xs)
$ first `mappend` replaceChars c old new
new = (T.splitOn "|" . fastaHeader $ fs) !! (f - 1)
(first, old) = T.splitAt (s - 1) . fastaSeq $ fs
insertSequence :: Start -> FastaSequence -> FastaSequence -> FastaSequence
insertSequence s ref fs = fs { fastaSeq = newFastaSeq }
where
newFastaSeq = before `mappend` ins `mappend` after
ins = fastaSeq fs
(before, after) = T.splitAt (s - 1) . fastaSeq $ ref
changeField :: Maybe Field -> T.Text -> FastaSequence -> FastaSequence
changeField Nothing _ fs = fs
changeField (Just field) regex fs = fs { fastaHeader = newFastaHeader }
where
newFastaHeader = T.intercalate "|"
. F.toList
. Seq.update (field - 1) newField
$ splitField
newField = (Seq.index splitField (field - 1)) =~ regex :: T.Text
splitField = Seq.fromList . T.splitOn "|" . fastaHeader $ fs
changeAllFields :: FastaSequence -> [(Maybe Int, T.Text)] -> FastaSequence
changeAllFields = foldl' (\fs (!x, !y) -> changeField x y fs)
getRegion :: Maybe Start -> Maybe Stop -> T.Text -> T.Text
getRegion Nothing Nothing = id
getRegion Nothing (Just stop) = T.take stop
getRegion (Just start) Nothing = T.drop start
getRegion (Just start) (Just stop) = T.take (stop - start) . T.drop start
getRegionSequence :: Maybe Start -> Maybe Stop -> FastaSequence -> FastaSequence
getRegionSequence start0 stop fs = fs { fastaSeq = newFastaSeq }
where
newFastaSeq = getRegion start stop . fastaSeq $ fs
start = fmap (flip (-) 1) start0
trim :: GeneticUnit -> Maybe FrameType -> Maybe Frame -> T.Text -> T.Text
trim Nucleotide Nothing Nothing x = T.dropEnd
(T.length x `mod` 3)
x
trim _ _ Nothing x = x
trim _ _ (Just 0) x = x
trim Nucleotide (Just InFrame) (Just frame) x = T.drop (3 - frame) x
trim Nucleotide (Just OutFrame) (Just frame) x = T.dropEnd frame x
trim AminoAcid (Just InFrame) (Just _) x = T.drop 1 x
trim AminoAcid (Just OutFrame) (Just _) x = T.dropEnd 1 x
trimFasta :: GeneticUnit
-> Maybe Frame
-> Maybe Frame
-> FastaSequence
-> FastaSequence
trimFasta Nucleotide Nothing Nothing fs =
fs { fastaSeq = trim Nucleotide Nothing Nothing . fastaSeq $ fs }
trimFasta genU inF outF fs = fs { fastaSeq = trim genU (Just InFrame) inF
. trim genU (Just OutFrame) outF
. fastaSeq $ fs
}
removeUnknownNucs :: FastaSequence -> FastaSequence
removeUnknownNucs fs = fs { fastaSeq = T.map changeNuc . fastaSeq $ fs }
where
changeNuc x
| toUpper x `elem` ("ATCGN.-" :: String) = x
| otherwise = '-'