Mercurial > repos > drosofff > fetch_fasta_from_ncbi
comparison retrieve_fasta_from_NCBI.py @ 5:4ff395248db4 draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author | drosofff |
---|---|
date | Sat, 30 May 2015 17:57:09 -0400 |
parents | e9df554f7725 |
children | cd7de2d6c716 |
comparison
equal
deleted
inserted
replaced
4:aa61d63b7e31 | 5:4ff395248db4 |
---|---|
166 response = urllib2.urlopen(req) | 166 response = urllib2.urlopen(req) |
167 serverResponse = True | 167 serverResponse = True |
168 except urllib2.HTTPError as e: | 168 except urllib2.HTTPError as e: |
169 serverResponse = False | 169 serverResponse = False |
170 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) ) | 170 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) ) |
171 fasta = response.read() | 171 try: |
172 fasta = response.read() | |
173 except httplib.IncompleteRead as e: | |
174 fasta = e.partial | |
172 if "Resource temporarily unavailable" in fasta: | 175 if "Resource temporarily unavailable" in fasta: |
173 return '' # to reiterate the failed download | 176 return '' # to reiterate the failed download |
174 if self.dbname != "pubmed": | 177 if self.dbname != "pubmed": |
175 assert fasta.startswith(">"), fasta | 178 assert fasta.startswith(">"), fasta |
176 fasta = self.sanitiser(self.dbname, fasta) # | 179 fasta = self.sanitiser(self.dbname, fasta) # |
177 time.sleep(1) | 180 time.sleep(1) |
178 return fasta | 181 return fasta |
179 | 182 |
180 def sanitiser(self, db, fastaseq): | 183 def sanitiser(self, db, fastaseq): |
181 if db not in "nuccore protein" : return fastaseq | 184 if db not in "nuccore protein" : return fastaseq |
182 regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}") | 185 regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}") |
183 sane_seqlist = [] | 186 sane_seqlist = [] |
184 seqlist = fastaseq.split("\n\n") | 187 seqlist = fastaseq.split("\n\n") |
185 for seq in seqlist[:-1]: | 188 for seq in seqlist[:-1]: |
186 fastalines = seq.split("\n") | 189 fastalines = seq.split("\n") |
187 if len(fastalines) < 2: | 190 if len(fastalines) < 2: |
188 self.logger.info("Empty sequence for %s" % ("|".join(fastalines[0].split("|")[:4]) ) ) | 191 self.logger.info("Empty sequence for %s" % ("|".join(fastalines[0].split("|")[:4]) ) ) |
189 self.logger.info("%s download is skipped" % ("|".join(fastalines[0].split("|")[:4]) ) ) | 192 self.logger.info("%s download is skipped" % ("|".join(fastalines[0].split("|")[:4]) ) ) |
190 continue | 193 continue |
191 if db == "nuccore": | 194 if db == "nuccore": |
192 badnuc = 0 | 195 badnuc = 0 |
193 for nucleotide in fastalines[1]: | 196 for nucleotide in fastalines[1]: |
194 if nucleotide not in "ATGC": | 197 if nucleotide not in "ATGC": |
195 badnuc += 1 | 198 badnuc += 1 |
196 if float(badnuc)/len(fastalines[1]) > 0.4: | 199 if float(badnuc)/len(fastalines[1]) > 0.4: |
197 self.logger.info("%s ambiguous nucleotides in %s or download interrupted at this offset | %s" % ( float(badnuc)/len(fastalines[1]), "|".join(fastalines[0].split("|")[:4]), fastalines[1]) ) | 200 self.logger.info("%s ambiguous nucleotides in %s or download interrupted at this offset | %s" % ( float(badnuc)/len(fastalines[1]), "|".join(fastalines[0].split("|")[:4]), fastalines[1]) ) |
198 self.logger.info("%s download is skipped" % (fastalines[0].split("|")[:4]) ) | 201 self.logger.info("%s download is skipped" % (fastalines[0].split("|")[:4]) ) |
199 continue | 202 continue |
200 fastalines[0] = fastalines[0].replace(" ","_")[:100] # remove spaces and trim the header to 100 chars | 203 fastalines[0] = fastalines[0].replace(" ","_")[:100] # remove spaces and trim the header to 100 chars |
201 cleanseq = "\n".join(fastalines) | 204 cleanseq = "\n".join(fastalines) |
202 sane_seqlist.append(cleanseq) | 205 sane_seqlist.append(cleanseq) |
203 elif db == "protein": | 206 elif db == "protein": |
204 fastalines[0] = fastalines[0][0:100] | 207 fastalines[0] = fastalines[0][0:100] |
205 fastalines[0] = fastalines[0].replace(" ", "_") | 208 fastalines[0] = fastalines[0].replace(" ", "_") |
206 fastalines[0] = fastalines[0].replace("[", "_") | 209 fastalines[0] = fastalines[0].replace("[", "_") |
207 fastalines[0] = fastalines[0].replace("]", "_") | 210 fastalines[0] = fastalines[0].replace("]", "_") |
208 fastalines[0] = fastalines[0].replace("=", "_") | 211 fastalines[0] = fastalines[0].replace("=", "_") |
209 fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn't like it | 212 fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn't like it |
210 fastalines[0] = re.sub(regex, "_", fastalines[0]) | 213 fastalines[0] = re.sub(regex, "_", fastalines[0]) |
211 cleanseq = "\n".join(fastalines) | 214 cleanseq = "\n".join(fastalines) |
212 sane_seqlist.append(cleanseq) | 215 sane_seqlist.append(cleanseq) |
213 self.logger.info("clean sequences appended: %d" % (len(sane_seqlist) ) ) | 216 self.logger.info("clean sequences appended: %d" % (len(sane_seqlist) ) ) |
214 return "\n".join(sane_seqlist) | 217 return "\n".join(sane_seqlist) |
215 | 218 |
216 def get_sequences(self): | 219 def get_sequences(self): |
217 """ | 220 """ |
218 Total number of records from the input set to be retrieved, up to a maximum | 221 Total number of records from the input set to be retrieved, up to a maximum |
219 of 10,000. Optionally, for a large set the value of retstart can be iterated | 222 of 10,000. Optionally, for a large set the value of retstart can be iterated |