Mercurial > repos > drosofff > fetch_fasta_from_ncbi
comparison retrieve_fasta_from_NCBI.py @ 2:e9df554f7725 draft
Uploaded
author | drosofff |
---|---|
date | Wed, 20 May 2015 10:07:32 -0400 |
parents | c1d17d173128 |
children | 4ff395248db4 |
comparison
equal
deleted
inserted
replaced
1:c1d17d173128 | 2:e9df554f7725 |
---|---|
22 | 22 |
23 | 23 |
24 python get_fasta_from_taxon.py -i 1638 -o test.out -d protein | 24 python get_fasta_from_taxon.py -i 1638 -o test.out -d protein |
25 python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs | 25 python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs |
26 """ | 26 """ |
27 | 27 import sys |
28 import logging | 28 import logging |
29 import optparse | 29 import optparse |
30 import time | 30 import time |
31 import urllib | 31 import urllib |
32 import urllib2 | 32 import urllib2 |
120 'id': ids} | 120 'id': ids} |
121 data = urllib.urlencode(values) | 121 data = urllib.urlencode(values) |
122 req = urllib2.Request(url, data) | 122 req = urllib2.Request(url, data) |
123 #self.logger.debug("data: %s" % str(data)) | 123 #self.logger.debug("data: %s" % str(data)) |
124 req = urllib2.Request(url, data) | 124 req = urllib2.Request(url, data) |
125 response = urllib2.urlopen(req) | 125 serverResponse = False |
| | 126 while not serverResponse: |
| | 127 try: |
| | 128 response = urllib2.urlopen(req) |
| | 129 serverResponse = True |
| | 130 except: # catch *all* exceptions |
| | 131 e = sys.exc_info()[0] |
| | 132 self.logger.info( "Catched Error: %s" % e ) |
| | 133 self.logger.info( "Retrying in 10 sec") |
| | 134 time.sleep(10) |
| | 135 # except urllib2.HTTPError as e: |
| | 136 # serverResponse = False |
| | 137 # self.logger.info("epost error:%s, %s" % (e.code, e.read() ) ) |
126 querylog = response.readlines() | 138 querylog = response.readlines() |
127 self.logger.debug("query response:") | 139 self.logger.debug("query response:") |
128 for line in querylog: | 140 for line in querylog: |
129 self.logger.debug(line.rstrip()) | 141 self.logger.debug(line.rstrip()) |
130 if '</QueryKey>' in line: | 142 if '</QueryKey>' in line: |
146 'retmode': "text"} | 158 'retmode': "text"} |
147 data = urllib.urlencode(values) | 159 data = urllib.urlencode(values) |
148 req = urllib2.Request(url, data) | 160 req = urllib2.Request(url, data) |
149 self.logger.debug("data: %s" % str(data)) | 161 self.logger.debug("data: %s" % str(data)) |
150 req = urllib2.Request(url, data) | 162 req = urllib2.Request(url, data) |
151 response = urllib2.urlopen(req) | 163 serverResponse = False |
| | 164 while not serverResponse: |
| | 165 try: |
| | 166 response = urllib2.urlopen(req) |
| | 167 serverResponse = True |
| | 168 except urllib2.HTTPError as e: |
| | 169 serverResponse = False |
| | 170 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) ) |
152 fasta = response.read() | 171 fasta = response.read() |
153 if "Resource temporarily unavailable" in fasta: | 172 if "Resource temporarily unavailable" in fasta: |
154 return '' # to reiterate the failed download | 173 return '' # to reiterate the failed download |
155 if self.dbname != "pubmed": | 174 if self.dbname != "pubmed": |
156 assert fasta.startswith(">"), fasta | 175 assert fasta.startswith(">"), fasta |
189 fastalines[0] = fastalines[0].replace("=", "_") | 208 fastalines[0] = fastalines[0].replace("=", "_") |
190 fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn't like it | 209 fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn't like it |
191 fastalines[0] = re.sub(regex, "_", fastalines[0]) | 210 fastalines[0] = re.sub(regex, "_", fastalines[0]) |
192 cleanseq = "\n".join(fastalines) | 211 cleanseq = "\n".join(fastalines) |
193 sane_seqlist.append(cleanseq) | 212 sane_seqlist.append(cleanseq) |
194 # sane_seqlist[-1] = sane_seqlist[-1] + "\n" # remove to have sequence blocks not separated by two \n | 213 self.logger.info("clean sequences appended: %d" % (len(sane_seqlist) ) ) |
195 return "\n".join(sane_seqlist) | 214 return "\n".join(sane_seqlist) |
196 | 215 |
197 def get_sequences(self): | 216 def get_sequences(self): |
198 """ | 217 """ |
199 Total number of records from the input set to be retrieved, up to a maximum | 218 Total number of records from the input set to be retrieved, up to a maximum |