comparison retrieve_fasta_from_NCBI.py @ 2:e9df554f7725 draft

Uploaded
author drosofff
date Wed, 20 May 2015 10:07:32 -0400
parents c1d17d173128
children 4ff395248db4
comparison
equal deleted inserted replaced
1:c1d17d173128 2:e9df554f7725
22 22
23 23
24 python get_fasta_from_taxon.py -i 1638 -o test.out -d protein 24 python get_fasta_from_taxon.py -i 1638 -o test.out -d protein
25 python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs 25 python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs
26 """ 26 """
27 27 import sys
28 import logging 28 import logging
29 import optparse 29 import optparse
30 import time 30 import time
31 import urllib 31 import urllib
32 import urllib2 32 import urllib2
120 'id': ids} 120 'id': ids}
121 data = urllib.urlencode(values) 121 data = urllib.urlencode(values)
122 req = urllib2.Request(url, data) 122 req = urllib2.Request(url, data)
123 #self.logger.debug("data: %s" % str(data)) 123 #self.logger.debug("data: %s" % str(data))
124 req = urllib2.Request(url, data) 124 req = urllib2.Request(url, data)
125 response = urllib2.urlopen(req) 125 serverResponse = False
126 while not serverResponse:
127 try:
128 response = urllib2.urlopen(req)
129 serverResponse = True
130 except: # catch *all* exceptions
131 e = sys.exc_info()[0]
132 self.logger.info( "Catched Error: %s" % e )
133 self.logger.info( "Retrying in 10 sec")
134 time.sleep(10)
135 # except urllib2.HTTPError as e:
136 # serverResponse = False
137 # self.logger.info("epost error:%s, %s" % (e.code, e.read() ) )
126 querylog = response.readlines() 138 querylog = response.readlines()
127 self.logger.debug("query response:") 139 self.logger.debug("query response:")
128 for line in querylog: 140 for line in querylog:
129 self.logger.debug(line.rstrip()) 141 self.logger.debug(line.rstrip())
130 if '</QueryKey>' in line: 142 if '</QueryKey>' in line:
146 'retmode': "text"} 158 'retmode': "text"}
147 data = urllib.urlencode(values) 159 data = urllib.urlencode(values)
148 req = urllib2.Request(url, data) 160 req = urllib2.Request(url, data)
149 self.logger.debug("data: %s" % str(data)) 161 self.logger.debug("data: %s" % str(data))
150 req = urllib2.Request(url, data) 162 req = urllib2.Request(url, data)
151 response = urllib2.urlopen(req) 163 serverResponse = False
164 while not serverResponse:
165 try:
166 response = urllib2.urlopen(req)
167 serverResponse = True
168 except urllib2.HTTPError as e:
169 serverResponse = False
170 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) )
152 fasta = response.read() 171 fasta = response.read()
153 if "Resource temporarily unavailable" in fasta: 172 if "Resource temporarily unavailable" in fasta:
154 return '' # to reiterate the failed download 173 return '' # to reiterate the failed download
155 if self.dbname != "pubmed": 174 if self.dbname != "pubmed":
156 assert fasta.startswith(">"), fasta 175 assert fasta.startswith(">"), fasta
189 fastalines[0] = fastalines[0].replace("=", "_") 208 fastalines[0] = fastalines[0].replace("=", "_")
190 fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn't like it 209 fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn't like it
191 fastalines[0] = re.sub(regex, "_", fastalines[0]) 210 fastalines[0] = re.sub(regex, "_", fastalines[0])
192 cleanseq = "\n".join(fastalines) 211 cleanseq = "\n".join(fastalines)
193 sane_seqlist.append(cleanseq) 212 sane_seqlist.append(cleanseq)
194 # sane_seqlist[-1] = sane_seqlist[-1] + "\n" # remove to have sequence blocks not separated by two \n 213 self.logger.info("clean sequences appended: %d" % (len(sane_seqlist) ) )
195 return "\n".join(sane_seqlist) 214 return "\n".join(sane_seqlist)
196 215
197 def get_sequences(self): 216 def get_sequences(self):
198 """ 217 """
199 Total number of records from the input set to be retrieved, up to a maximum 218 Total number of records from the input set to be retrieved, up to a maximum