comparison retrieve_fasta_from_NCBI.py @ 0:4b34f2b5c14e draft

Uploaded
author drosofff
date Mon, 13 Apr 2015 18:17:08 -0400
children c1d17d173128

#!/usr/bin/env python
# -*- coding: utf-8 -*-
3 """
4 From a taxonomy ID retrieves all the nucleotide sequences
5 It returns a multiFASTA nuc/prot file
6
7 Entrez Database UID common name E-utility Database Name
8 Nucleotide GI number nuccore
9 Protein GI number protein
10
11 Retrieve strategy:
12
13 esearch to get total number of UIDs (count)
14 esearch to get UIDs in batches
15 loop untile end of UIDs list:
16 epost to put a batch of UIDs in the history server
17 efetch to retrieve info from previous post
18
19 retmax of efetch is 1/10 of declared value from NCBI
20
21 queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request)
22
23
24 python get_fasta_from_taxon.py -i 1638 -o test.out -d protein
25 python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs
26 """
27
import logging
import optparse
import re
import time
import urllib
import urllib2


class Eutils:

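    # Thin client for the NCBI Entrez E-utilities REST endpoints
    # (esearch, epost, efetch); see
    # http://www.ncbi.nlm.nih.gov/books/NBK25499/ for the API reference.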
    def __init__(self, options, logger):
        self.logger = logger
        self.base = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.query_string = options.query_string
        self.dbname = options.dbname
        if options.outname:
            self.outname = options.outname
        else:
            self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
        self.ids = []
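        # NCBI caps esearch XML output at 100,000 UIDs per request, while
        # efetch batches are kept at 1,000, one tenth of the 10,000-record
        # maximum NCBI declares for efetch (see the module docstring).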
        self.retmax_esearch = 100000
        self.retmax_efetch = 1000
        self.count = 0
        self.webenv = ""
        self.query_key = ""

    def retrieve(self):
        """Run the esearch -> epost -> efetch pipeline and write the multiFASTA output."""
        self.get_count_value()
        self.get_uids_list()
        self.get_sequences()

    def get_count_value(self):
        """
        Retrieve only the Count, i.e. the total number of UIDs matching
        the query.

        From http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch:
        retmax is the total number of UIDs from the retrieved set to be shown
        in the XML output (default=20). By default, ESearch only includes the
        first 20 UIDs retrieved in the XML output. If usehistory is set to
        'y', the remainder of the retrieved set will be stored on the History
        server.
        """
        self.logger.info("retrieving data from %s" % self.base)
        self.logger.info("for Query: %s and database: %s" %
                         (self.query_string, self.dbname))
        querylog = self.esearch(self.dbname, self.query_string, '', '', "count")
        self.logger.debug("Query response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</Count>' in line:
                self.count = int(line[line.find('<Count>') + len('<Count>'):line.find('</Count>')])
        self.logger.info("Found %d UIDs" % self.count)

    def get_uids_list(self):
        """
        Increasing retmax allows more of the retrieved UIDs to be included in
        the XML output, up to a maximum of 100,000 records.
        From http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        """
        retmax = self.retmax_esearch
        if (self.count > retmax):
            num_batches = (self.count + retmax - 1) // retmax  # ceiling division
        else:
            num_batches = 1
        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
        self.logger.info("Number of batches for esearch action: %d " % num_batches)
        for n in range(num_batches):
            querylog = self.esearch(self.dbname, self.query_string, n * retmax, retmax, '')
            for line in querylog:
                if '<Id>' in line and '</Id>' in line:
                    uid = line[line.find('<Id>') + len('<Id>'):line.find('</Id>')]
                    self.ids.append(uid)
        self.logger.info("Retrieved %d UIDs" % len(self.ids))

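    # The single esearch endpoint serves both calls above: with
    # rettype="count" it returns only the <Count> tag, and with
    # retstart/retmax set it pages through the matching UIDs. Passing data
    # to urllib2.Request makes the request a POST, which keeps long query
    # strings out of the URL.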
    def esearch(self, db, term, retstart, retmax, rettype):
        url = self.base + "esearch.fcgi"
        self.logger.debug("url: %s" % url)
        values = {'db': db,
                  'term': term,
                  'rettype': rettype,
                  'retstart': retstart,
                  'retmax': retmax}
        data = urllib.urlencode(values)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        querylog = response.readlines()
        time.sleep(1)
        return querylog

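    # epost uploads one batch of UIDs to the Entrez History server; the
    # response carries a <QueryKey> and a <WebEnv> token that the next
    # efetch call uses to address the posted batch without resending
    # every UID.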
    def epost(self, db, ids):
        url = self.base + "epost.fcgi"
        self.logger.debug("url_epost: %s" % url)
        values = {'db': db,
                  'id': ids}
        data = urllib.urlencode(values)
        # self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        querylog = response.readlines()
        self.logger.debug("query response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</QueryKey>' in line:
                self.query_key = str(line[line.find('<QueryKey>') + len('<QueryKey>'):line.find('</QueryKey>')])
            if '</WebEnv>' in line:
                self.webenv = str(line[line.find('<WebEnv>') + len('<WebEnv>'):line.find('</WebEnv>')])
        self.logger.debug("*** epost action ***")
        self.logger.debug("query_key: %s" % self.query_key)
        self.logger.debug("webenv: %s" % self.webenv)
        time.sleep(1)

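    # efetch retrieves the records addressed by the query_key/WebEnv pair as
    # plain-text FASTA, then passes them through sanitiser() before returning.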
    def efetch(self, db, query_key, webenv):
        url = self.base + "efetch.fcgi"
        self.logger.debug("url_efetch: %s" % url)
        values = {'db': db,
                  'query_key': query_key,
                  'webenv': webenv,
                  'rettype': "fasta",
                  'retmode': "text"}
        data = urllib.urlencode(values)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        fasta = response.read()
        if self.dbname != "pubmed":
            assert fasta.startswith(">"), fasta
        fasta = self.sanitiser(self.dbname, fasta)
        time.sleep(1)
        return fasta

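    # sanitiser() cleans each FASTA record before it is written out: empty
    # records are skipped, nuccore records whose first sequence line exceeds
    # 40% ambiguous nucleotides are dropped (likely interrupted downloads),
    # and headers are trimmed and normalised so that downstream tools such
    # as makeblastdb accept them.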
    def sanitiser(self, db, fastaseq):
        if db not in ("nuccore", "protein"):
            return fastaseq
        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")  # runs of 49+ residue letters
        sane_seqlist = []
        seqlist = fastaseq.split("\n\n")
        for seq in seqlist[:-1]:
            fastalines = seq.split("\n")
            if len(fastalines) < 2:
                self.logger.info("Empty sequence for %s" % ("|".join(fastalines[0].split("|")[:4])))
                self.logger.info("%s download is skipped" % ("|".join(fastalines[0].split("|")[:4])))
                continue
            if db == "nuccore":
                badnuc = 0
                for nucleotide in fastalines[1]:  # sample the first sequence line only
                    if nucleotide not in "ATGC":
                        badnuc += 1
                if float(badnuc) / len(fastalines[1]) > 0.4:
                    self.logger.info("%s ambiguous nucleotides in %s or download interrupted at this offset | %s" %
                                     (float(badnuc) / len(fastalines[1]),
                                      "|".join(fastalines[0].split("|")[:4]),
                                      fastalines[1]))
                    self.logger.info("%s download is skipped" % ("|".join(fastalines[0].split("|")[:4])))
                    continue
                fastalines[0] = fastalines[0].replace(" ", "_")[:100]  # remove spaces and trim the header to 100 chars
                cleanseq = "\n".join(fastalines)
                sane_seqlist.append(cleanseq)
            elif db == "protein":
                fastalines[0] = fastalines[0][0:100]
                fastalines[0] = fastalines[0].replace(" ", "_")
                fastalines[0] = fastalines[0].replace("[", "_")
                fastalines[0] = fastalines[0].replace("]", "_")
                fastalines[0] = fastalines[0].replace("=", "_")
                fastalines[0] = fastalines[0].rstrip("_")  # because blast makeblastdb doesn't like a trailing "_"
                fastalines[0] = regex.sub("_", fastalines[0])  # collapse long residue runs in the header
                cleanseq = "\n".join(fastalines)
                sane_seqlist.append(cleanseq)
        # sane_seqlist[-1] = sane_seqlist[-1] + "\n"  # remove to have sequence blocks not separated by two \n
        return "\n".join(sane_seqlist)

    def get_sequences(self):
        """
        Total number of records from the input set to be retrieved, up to a
        maximum of 10,000. Optionally, for a large set the value of retstart
        can be iterated while holding retmax constant, thereby downloading
        the entire set in batches of size retmax.

        From http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
        """
        batch_size = self.retmax_efetch
        count = self.count
        uids_list = self.ids
        self.logger.info("Batch size for efetch action: %d" % batch_size)
        self.logger.info("Number of batches for efetch action: %d" %
                         ((count + batch_size - 1) // batch_size))
        with open(self.outname, 'w') as out:
            for start in range(0, count, batch_size):
                end = min(count, start + batch_size)
                batch = uids_list[start:end]
                self.epost(self.dbname, ",".join(batch))
                self.logger.info("retrieving batch %d" % ((start // batch_size) + 1))
                mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
                out.write(mfasta + '\n')


LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


def __main__():
    """ main function """
    parser = optparse.OptionParser(description='Retrieve data from NCBI')
    parser.add_option('-i', dest='query_string', help='NCBI Query String')
    parser.add_option('-o', dest='outname', help='output file name')
    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
                      help='logging level (default: INFO)')
    parser.add_option('-d', dest='dbname', help='database type')
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error('Wrong number of arguments')

    log_level = getattr(logging, options.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if options.logfile:
        kwargs['filename'] = options.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('data_from_NCBI')

    E = Eutils(options, logger)
    E.retrieve()


if __name__ == "__main__":
    __main__()
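
# Example invocation (illustrative; "txid1638[Organism]" is a hypothetical
# Entrez query string restricting the search to a taxon):
#   python retrieve_fasta_from_NCBI.py -i "txid1638[Organism]" -o out.fasta -d nuccore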