Mercurial > repos > drosofff > fetch_fasta_from_ncbi
comparison retrieve_fasta_from_NCBI.py @ 2:e9df554f7725 draft
Uploaded
| author | drosofff |
|---|---|
| date | Wed, 20 May 2015 10:07:32 -0400 |
| parents | c1d17d173128 |
| children | 4ff395248db4 |
comparison
equal
deleted
inserted
replaced
| 1:c1d17d173128 | 2:e9df554f7725 |
|---|---|
| 22 | 22 |
| 23 | 23 |
| 24 python get_fasta_from_taxon.py -i 1638 -o test.out -d protein | 24 python get_fasta_from_taxon.py -i 1638 -o test.out -d protein |
| 25 python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs | 25 python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs |
| 26 """ | 26 """ |
| 27 | 27 import sys |
| 28 import logging | 28 import logging |
| 29 import optparse | 29 import optparse |
| 30 import time | 30 import time |
| 31 import urllib | 31 import urllib |
| 32 import urllib2 | 32 import urllib2 |
| 120 'id': ids} | 120 'id': ids} |
| 121 data = urllib.urlencode(values) | 121 data = urllib.urlencode(values) |
| 122 req = urllib2.Request(url, data) | 122 req = urllib2.Request(url, data) |
| 123 #self.logger.debug("data: %s" % str(data)) | 123 #self.logger.debug("data: %s" % str(data)) |
| 124 req = urllib2.Request(url, data) | 124 req = urllib2.Request(url, data) |
| 125 response = urllib2.urlopen(req) | 125 serverResponse = False |
| | 126 while not serverResponse: |
| | 127 try: |
| | 128 response = urllib2.urlopen(req) |
| | 129 serverResponse = True |
| | 130 except: # catch *all* exceptions |
| | 131 e = sys.exc_info()[0] |
| | 132 self.logger.info( "Catched Error: %s" % e ) |
| | 133 self.logger.info( "Retrying in 10 sec") |
| | 134 time.sleep(10) |
| | 135 # except urllib2.HTTPError as e: |
| | 136 # serverResponse = False |
| | 137 # self.logger.info("epost error:%s, %s" % (e.code, e.read() ) ) |
| 126 querylog = response.readlines() | 138 querylog = response.readlines() |
| 127 self.logger.debug("query response:") | 139 self.logger.debug("query response:") |
| 128 for line in querylog: | 140 for line in querylog: |
| 129 self.logger.debug(line.rstrip()) | 141 self.logger.debug(line.rstrip()) |
| 130 if '</QueryKey>' in line: | 142 if '</QueryKey>' in line: |
| 146 'retmode': "text"} | 158 'retmode': "text"} |
| 147 data = urllib.urlencode(values) | 159 data = urllib.urlencode(values) |
| 148 req = urllib2.Request(url, data) | 160 req = urllib2.Request(url, data) |
| 149 self.logger.debug("data: %s" % str(data)) | 161 self.logger.debug("data: %s" % str(data)) |
| 150 req = urllib2.Request(url, data) | 162 req = urllib2.Request(url, data) |
| 151 response = urllib2.urlopen(req) | 163 serverResponse = False |
| | 164 while not serverResponse: |
| | 165 try: |
| | 166 response = urllib2.urlopen(req) |
| | 167 serverResponse = True |
| | 168 except urllib2.HTTPError as e: |
| | 169 serverResponse = False |
| | 170 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) ) |
| 152 fasta = response.read() | 171 fasta = response.read() |
| 153 if "Resource temporarily unavailable" in fasta: | 172 if "Resource temporarily unavailable" in fasta: |
| 154 return '' # to reiterate the failed download | 173 return '' # to reiterate the failed download |
| 155 if self.dbname != "pubmed": | 174 if self.dbname != "pubmed": |
| 156 assert fasta.startswith(">"), fasta | 175 assert fasta.startswith(">"), fasta |
| 189 fastalines[0] = fastalines[0].replace("=", "_") | 208 fastalines[0] = fastalines[0].replace("=", "_") |
| 190 fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn't like it | 209 fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn't like it |
| 191 fastalines[0] = re.sub(regex, "_", fastalines[0]) | 210 fastalines[0] = re.sub(regex, "_", fastalines[0]) |
| 192 cleanseq = "\n".join(fastalines) | 211 cleanseq = "\n".join(fastalines) |
| 193 sane_seqlist.append(cleanseq) | 212 sane_seqlist.append(cleanseq) |
| 194 # sane_seqlist[-1] = sane_seqlist[-1] + "\n" # remove to have sequence blocks not separated by two \n | 213 self.logger.info("clean sequences appended: %d" % (len(sane_seqlist) ) ) |
| 195 return "\n".join(sane_seqlist) | 214 return "\n".join(sane_seqlist) |
| 196 | 215 |
| 197 def get_sequences(self): | 216 def get_sequences(self): |
| 198 """ | 217 """ |
| 199 Total number of records from the input set to be retrieved, up to a maximum | 218 Total number of records from the input set to be retrieved, up to a maximum |
