{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE Arrows #-}
-- Example invocation: dist/build/Accessions2TaxIds/Accessions2TaxIds -i /scratch/egg/AccessionList > ~egg/TaxidOut
module Main where
import System.Console.CmdArgs
import Text.ParserCombinators.Parsec
import Data.List
import Data.Either.Unwrap
import Control.Concurrent
import Bio.EntrezHTTP
import Text.XML.HXT.Core
-- | Command line options of the program, filled in by cmdargs from 'options'.
data Options = Options
  { -- Path of the file whose lines 'main' reads as accessions.
    inputFilePath :: String
  }
  deriving (Show, Data, Typeable)
-- | cmdargs description of the command line interface.
--
-- Declares the @-i@ flag that fills 'inputFilePath', plus the program summary,
-- the program-level help text and the standard verbosity flags.
options :: Options
options =
  Options
    { inputFilePath =
        def
          &= name "i"
          -- The file holds accessions (they are resolved to taxonomy ids by
          -- 'main'), so the help text must not call them taxids.
          &= help "Path to input file containing accessions separated by linebreaks."
    }
    &= summary "Accessions2TaxIds"
    &= help "Florian Eggenhofer - 2015"
    &= verbosity
-- | Program entry point.
--
-- Reads the file named by the @-i@ option, treats every non-empty line as an
-- accession, resolves the accessions to GI numbers with 'retrieveGIsEntrez',
-- resolves those to taxonomy ids with 'retrieveTaxIdsEntrez', and prints one
-- taxonomy id per line on stdout.
main :: IO ()
main = do
  Options{..} <- cmdArgs options
  accessionsFile <- readFile inputFilePath
  -- Empty lines are not accessions; dropping them keeps empty entries out of
  -- the comma-separated id list built by 'retrieveGIEntrez'.
  let accessions = filter (not . null) (lines accessionsFile)
  gis <- retrieveGIsEntrez accessions
  taxIds <- retrieveTaxIdsEntrez gis
  -- 'mapM_' runs the prints for their effect only instead of collecting a
  -- list of unit values.
  mapM_ print taxIds
-- | Query Entrez for the given ids via 'retrieveTaxIdEntrez' and collect every
-- taxonomy id that 'readEntrezTaxId' extracts from the returned responses.
retrieveTaxIdsEntrez :: [Int] -> IO [Int]
retrieveTaxIdsEntrez geneIds =
  fmap
    (concatMap readEntrezTaxId)
    (retrieveElementsEntrez geneIds retrieveTaxIdEntrez)
-- | Resolve accessions to GI numbers.
--
-- Sends the accessions to Entrez through 'retrieveElementsEntrez' /
-- 'retrieveGIEntrez', parses every response with 'parseGIfromEntrez' and
-- returns the concatenated GI numbers.
--
-- Raises an 'IOError' (via 'userError') carrying the Parsec error message when
-- a response cannot be parsed, instead of failing inside a partial
-- @fromRight@ with no indication of what went wrong.
retrieveGIsEntrez :: [String] -> IO [Int]
retrieveGIsEntrez accessions = do
  gisOutput <- retrieveElementsEntrez accessions retrieveGIEntrez
  parsedGis <- mapM (either parseFailure return . parseGIfromEntrez) gisOutput
  return (concat parsedGis)
  where
    -- Turn a parse error into an IO exception that names this function.
    parseFailure err =
      ioError
        ( userError
            ("retrieveGIsEntrez: could not parse Entrez esearch response: " ++ show err)
        )
-- | Run one Entrez @esearch@ request against the @nucleotide@ database for
-- the given accessions and return the raw response.
--
-- The accessions are joined with commas into the @term@ parameter and
-- @retmax@ is fixed at 400.
retrieveGIEntrez :: [String] -> IO String
retrieveGIEntrez accessions = do
  -- 'threadDelay' takes microseconds: wait 10 seconds before each request.
  threadDelay 10000000
  entrezHTTP (EntrezHTTPQuery (Just "esearch") (Just "nucleotide") searchQuery)
  where
    searchQuery = "retmax=400&term=" ++ intercalate "," accessions
-- | Run 'genParserGIfromEntrez' over a response string.
--
-- Returns the parsed GI numbers, or the Parsec error; the source name used in
-- error messages is @genParserGIfromEntrez@.
parseGIfromEntrez :: String -> Either ParseError [Int]
parseGIfromEntrez = parse genParserGIfromEntrez "genParserGIfromEntrez"
-- | Parser for a whole response: a fixed header containing three digit runs,
-- followed by one or more 'parseGI' entries, whose values are returned.
--
-- NOTE(review): the empty @string ""@ literals look like markup delimiters
-- that were lost from this copy of the file. As written, the first
-- @many1 digit@ consumes every adjacent digit, so the following
-- @many1 digit@ cannot match. Confirm the literals against the upstream
-- source before relying on this parser.
genParserGIfromEntrez :: GenParser Char st [Int]
genParserGIfromEntrez = do
  -- Header: the results of these matches are not needed, only their success.
  _ <- string "\n\n"
  _ <- string ""
  _ <- many1 digit
  _ <- string ""
  _ <- many1 digit
  _ <- string ""
  _ <- many1 digit
  _ <- string "\n"
  -- 'try' lets a failing entry backtrack so that 'many1' can stop cleanly.
  entries <- many1 (Text.ParserCombinators.Parsec.try parseGI)
  _ <- string ""
  return entries
-- | Parser for a single GI entry: a run of digits terminated by a newline,
-- converted to an 'Int' with 'readInt'.
--
-- NOTE(review): the leading @string ""@ looks like a delimiter literal that
-- was lost from this copy of the file. Confirm against the upstream source.
parseGI :: GenParser Char st Int
parseGI = do
  _ <- string ""
  digits <- many1 digit
  _ <- string "\n"
  return (readInt digits)
-- | Run one Entrez @esummary@ request against the @nucleotide@ database for
-- the given ids and return the raw response.
--
-- The ids are rendered with 'show' and joined with commas into the @id@
-- parameter.
retrieveTaxIdEntrez :: [Int] -> IO String
retrieveTaxIdEntrez geneIds = do
  -- 'threadDelay' takes microseconds: wait 10 seconds before each request.
  threadDelay 10000000
  entrezHTTP (EntrezHTTPQuery (Just "esummary") (Just "nucleotide") summaryQuery)
  where
    summaryQuery = "id=" ++ intercalate "," (map show geneIds)
-- | Read a response string with 'xreadDoc' and return every taxonomy id that
-- 'getEntrezTaxId' produces from it.
readEntrezTaxId :: String -> [Int]
readEntrezTaxId = runLA (xreadDoc >>> getEntrezTaxId)
-- | Arrow that extracts taxonomy ids from a tree.
--
-- Below the children of the input it looks for @DocSum@ elements ('atTag'),
-- inside each of them for elements whose @Name@ attribute is @TaxId@
-- ('atName'), and converts the text of their children to 'Int'.
--
-- The conversion uses 'read', so text that is not an integer raises an error
-- when the result is evaluated.
getEntrezTaxId :: ArrowXml a => a XmlTree Int
getEntrezTaxId =
  getChildren
    >>> atTag "DocSum"
    >>> atName "TaxId"
    >>> getChildren
    >>> getText
    >>> arr read
-- | Convert a string to an 'Int' with 'read'.
--
-- Partial: input that is not a valid 'Int' literal raises an error.
readInt :: String -> Int
readInt digits = read digits
-- | Select the element nodes with the given tag name, located with 'deep'.
atTag :: ArrowXml a => String -> a XmlTree XmlTree
atTag tag = deep namedElement
  where
    -- An element node whose name is the requested tag.
    namedElement = isElem >>> hasName tag
-- | Select the element nodes whose @Name@ attribute equals the given value,
-- located with 'deep'.
atName :: ArrowXml a => String -> a XmlTree XmlTree
atName elementId = deep elementWithName
  where
    -- An element node carrying @Name="<elementId>"@.
    elementWithName = isElem >>> hasAttrValue "Name" (== elementId)