comparison retrieve_fasta_from_NCBI.py @ 5:4ff395248db4 draft

planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author drosofff
date Sat, 30 May 2015 17:57:09 -0400
parents e9df554f7725
children cd7de2d6c716
comparison
equal deleted inserted replaced
4:aa61d63b7e31 5:4ff395248db4
166 response = urllib2.urlopen(req) 166 response = urllib2.urlopen(req)
167 serverResponse = True 167 serverResponse = True
168 except urllib2.HTTPError as e: 168 except urllib2.HTTPError as e:
169 serverResponse = False 169 serverResponse = False
170 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) ) 170 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) )
171 fasta = response.read() 171 try:
172 fasta = response.read()
173 except httplib.IncompleteRead as e:
174 fasta = e.partial
172 if "Resource temporarily unavailable" in fasta: 175 if "Resource temporarily unavailable" in fasta:
173 return '' # to reiterate the failed download 176 return '' # to reiterate the failed download
174 if self.dbname != "pubmed": 177 if self.dbname != "pubmed":
175 assert fasta.startswith(">"), fasta 178 assert fasta.startswith(">"), fasta
176 fasta = self.sanitiser(self.dbname, fasta) # 179 fasta = self.sanitiser(self.dbname, fasta) #
177 time.sleep(1) 180 time.sleep(1)
178 return fasta 181 return fasta
179 182
180 def sanitiser(self, db, fastaseq): 183 def sanitiser(self, db, fastaseq):
181 if db not in "nuccore protein" : return fastaseq 184 if db not in "nuccore protein" : return fastaseq
182 regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}") 185 regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
183 sane_seqlist = [] 186 sane_seqlist = []
184 seqlist = fastaseq.split("\n\n") 187 seqlist = fastaseq.split("\n\n")
185 for seq in seqlist[:-1]: 188 for seq in seqlist[:-1]:
186 fastalines = seq.split("\n") 189 fastalines = seq.split("\n")
187 if len(fastalines) < 2: 190 if len(fastalines) < 2:
188 self.logger.info("Empty sequence for %s" % ("|".join(fastalines[0].split("|")[:4]) ) ) 191 self.logger.info("Empty sequence for %s" % ("|".join(fastalines[0].split("|")[:4]) ) )
189 self.logger.info("%s download is skipped" % ("|".join(fastalines[0].split("|")[:4]) ) ) 192 self.logger.info("%s download is skipped" % ("|".join(fastalines[0].split("|")[:4]) ) )
190 continue 193 continue
191 if db == "nuccore": 194 if db == "nuccore":
192 badnuc = 0 195 badnuc = 0
193 for nucleotide in fastalines[1]: 196 for nucleotide in fastalines[1]:
194 if nucleotide not in "ATGC": 197 if nucleotide not in "ATGC":
195 badnuc += 1 198 badnuc += 1
196 if float(badnuc)/len(fastalines[1]) > 0.4: 199 if float(badnuc)/len(fastalines[1]) > 0.4:
197 self.logger.info("%s ambiguous nucleotides in %s or download interrupted at this offset | %s" % ( float(badnuc)/len(fastalines[1]), "|".join(fastalines[0].split("|")[:4]), fastalines[1]) ) 200 self.logger.info("%s ambiguous nucleotides in %s or download interrupted at this offset | %s" % ( float(badnuc)/len(fastalines[1]), "|".join(fastalines[0].split("|")[:4]), fastalines[1]) )
198 self.logger.info("%s download is skipped" % (fastalines[0].split("|")[:4]) ) 201 self.logger.info("%s download is skipped" % (fastalines[0].split("|")[:4]) )
199 continue 202 continue
200 fastalines[0] = fastalines[0].replace(" ","_")[:100] # remove spaces and trim the header to 100 chars 203 fastalines[0] = fastalines[0].replace(" ","_")[:100] # remove spaces and trim the header to 100 chars
201 cleanseq = "\n".join(fastalines) 204 cleanseq = "\n".join(fastalines)
202 sane_seqlist.append(cleanseq) 205 sane_seqlist.append(cleanseq)
203 elif db == "protein": 206 elif db == "protein":
204 fastalines[0] = fastalines[0][0:100] 207 fastalines[0] = fastalines[0][0:100]
205 fastalines[0] = fastalines[0].replace(" ", "_") 208 fastalines[0] = fastalines[0].replace(" ", "_")
206 fastalines[0] = fastalines[0].replace("[", "_") 209 fastalines[0] = fastalines[0].replace("[", "_")
207 fastalines[0] = fastalines[0].replace("]", "_") 210 fastalines[0] = fastalines[0].replace("]", "_")
208 fastalines[0] = fastalines[0].replace("=", "_") 211 fastalines[0] = fastalines[0].replace("=", "_")
209 fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn't like it 212 fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn't like it
210 fastalines[0] = re.sub(regex, "_", fastalines[0]) 213 fastalines[0] = re.sub(regex, "_", fastalines[0])
211 cleanseq = "\n".join(fastalines) 214 cleanseq = "\n".join(fastalines)
212 sane_seqlist.append(cleanseq) 215 sane_seqlist.append(cleanseq)
213 self.logger.info("clean sequences appended: %d" % (len(sane_seqlist) ) ) 216 self.logger.info("clean sequences appended: %d" % (len(sane_seqlist) ) )
214 return "\n".join(sane_seqlist) 217 return "\n".join(sane_seqlist)
215 218
216 def get_sequences(self): 219 def get_sequences(self):
217 """ 220 """
218 Total number of records from the input set to be retrieved, up to a maximum 221 Total number of records from the input set to be retrieved, up to a maximum
219 of 10,000. Optionally, for a large set the value of retstart can be iterated 222 of 10,000. Optionally, for a large set the value of retstart can be iterated