Mercurial > repos > p.lucas > taxid_genusexpand_taxid2acc
comparison TAXID_genusexpand_taxid2acc.py @ 12:19e175a84d0e draft
Uploaded updated python script
author | p.lucas |
---|---|
date | Thu, 21 Sep 2023 15:19:55 +0000 |
parents | 3d116861e380 |
children | 932ba9e04f3a |
comparison
equal
deleted
inserted
replaced
11:3d116861e380 | 12:19e175a84d0e |
---|---|
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level) | 8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level) |
9 ### | 9 ### |
10 | 10 |
11 ### Libraries to import: | 11 ### Libraries to import: |
12 import argparse, os, sys, csv, warnings, re, itertools, operator | 12 import argparse, os, sys, csv, warnings, re, itertools, operator |
13 #from subprocess import Popen,PIPE | |
14 import subprocess | |
13 from os import path | 15 from os import path |
14 import ncbi_genome_download as ngd | 16 import ncbi_genome_download as ngd |
15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) | 17 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) |
16 from ete3 import NCBITaxa | 18 from ete3 import NCBITaxa |
17 | 19 |
18 # to be able to report line number in error messages | 20 # to be able to report line number in error messages |
19 import inspect | 21 import inspect |
20 frame = inspect.currentframe() | 22 def lineno(): |
23 """Returns the current line number in our program.""" | |
24 return str(inspect.currentframe().f_back.f_lineno) | |
25 | |
21 | 26 |
22 # debug | 27 # debug |
23 b_test_load_taxids = False # ok 2023 03 07 | 28 test_dir = 'test_TAXID_genusexpand_taxid2acc/' |
24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07 | 29 b_test_load_taxids = False # ok 2023 08 25 |
30 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 08 25 | |
25 | 31 |
26 prog_tag = '[' + os.path.basename(__file__) + ']' | 32 prog_tag = '[' + os.path.basename(__file__) + ']' |
27 | 33 |
28 # boolean to know if we dowload ncbi taxonomy file in current env | 34 # boolean to know if we dowload ncbi taxonomy file in current env |
29 b_load_ncbi_tax_f = False | 35 b_load_ncbi_tax_f = False |
34 accnrlist = [] | 40 accnrlist = [] |
35 | 41 |
36 # order = -4 | 42 # order = -4 |
37 # family or clade = -3 | 43 # family or clade = -3 |
38 # subtribe or genus = -2 | 44 # subtribe or genus = -2 |
39 curr_index_in_lineage = -2 | 45 curr_index_in_lineage = -1 |
40 min_index_in_lineage = -4 | 46 min_index_in_lineage = -4 |
41 | 47 |
42 # boolean to know if we download ncbi taxonomy file in current env | 48 # boolean to know if we download ncbi taxonomy file in current env |
43 b_load_ncbi_tax_f = False | 49 b_load_ncbi_tax_f = False |
44 b_test_all = False | 50 b_test_all = False |
45 b_test = False | 51 b_test = False |
46 b_acc_in_f = False | 52 b_acc_in_f = False |
47 b_acc_out_f = False | 53 b_acc_out_f = False |
48 | 54 |
49 b_verbose = False | 55 b_verbose = False |
56 | |
57 # variables for ncbi-genome-download | |
58 ncbigenomedownload_section = 'refseq' # genbank | |
59 organisms_to_search_in = 'vertebrate_other,vertebrate_mammalian,plant,invertebrate' | |
60 assembly_levels = 'complete,chromosome' | |
50 | 61 |
51 # rank = '' # rank level retained by user | 62 # rank = '' # rank level retained by user |
52 # rank_num = index of rank retained by user | 63 # rank_num = index of rank retained by user |
53 | 64 |
54 # # set to check that provided rank exist to be sure to be able to use it | 65 # # set to check that provided rank exist to be sure to be able to use it |
133 ' '.join(['./TAXID_genusexpand_taxid2acc.py', | 144 ' '.join(['./TAXID_genusexpand_taxid2acc.py', |
134 '-i taxid_accnr_list.tsv', | 145 '-i taxid_accnr_list.tsv', |
135 '-o accnr_out_list.txt']),"\n\n" ])) | 146 '-o accnr_out_list.txt']),"\n\n" ])) |
136 parser.print_help() | 147 parser.print_help() |
137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + | 148 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + |
138 " arguments, exit line "+str(frame.f_lineno)) | 149 " arguments, exit line "+lineno()) |
139 sys.exit(0) | 150 sys.exit(0) |
140 | 151 |
141 # print('args:', args) | 152 # print('args:', args) |
142 # if(not b_test): | 153 # if(not b_test): |
143 if args.ncbi_tax_f is not None: | 154 if args.ncbi_tax_f is not None: |
144 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f) | 155 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f) |
145 else: | 156 else: |
146 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" | 157 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" |
147 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") | 158 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") |
148 if args.taxid_acc_in_f is not None: | 159 if args.taxid_acc_in_f is not None: |
149 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f) | 160 taxid_acc_in_f = os.path.abspath(args.taxid_acc_f) |
150 b_acc_in_f = True | 161 b_acc_in_f = True |
151 elif(not b_test): | 162 elif(not b_test): |
152 sys.exit("[Error] You must provide taxid_acc_in_f") | 163 sys.exit("[Error] You must provide taxid_acc_in_f") |
153 if args.acc_out_f is not None: | 164 if args.acc_out_f is not None: |
154 acc_out_f = os.path.abspath(args.acc_out_f) | 165 acc_out_f = os.path.abspath(args.acc_out_f) |
172 | 183 |
173 # -------------------------------------------------------------------------- | 184 # -------------------------------------------------------------------------- |
174 # to sort uniq, for a list, only need to add list conversion | 185 # to sort uniq, for a list, only need to add list conversion |
175 # -------------------------------------------------------------------------- | 186 # -------------------------------------------------------------------------- |
176 mapper= map # Python ≥ 3 | 187 mapper= map # Python ≥ 3 |
177 def sort_uniq(sequence): | 188 def sort_uniq(sequence: list): |
178 return mapper( | 189 return mapper( |
179 operator.itemgetter(0), | 190 operator.itemgetter(0), |
180 itertools.groupby(sorted(sequence))) | 191 itertools.groupby(sorted(sequence))) |
181 # -------------------------------------------------------------------------- | 192 # -------------------------------------------------------------------------- |
182 | 193 |
183 # -------------------------------------------------------------------------- | 194 # -------------------------------------------------------------------------- |
184 # load taxid acc list, return taxidlist | 195 # Procedure: load taxid acc list, return taxidlist |
185 # -------------------------------------------------------------------------- | 196 # -------------------------------------------------------------------------- |
186 def load_taxids(taxid_acc_tabular_f): | 197 def load_taxids(taxid_acc_tabular_f: str): |
187 | 198 |
188 if not path.exists(taxid_acc_tabular_f): | 199 if not path.exists(taxid_acc_tabular_f): |
189 sys.exit("Error " + taxid_acc_tabular_f + | 200 sys.exit("Error " + taxid_acc_tabular_f + |
190 " file does not exist, line "+ str(sys._getframe().f_lineno) ) | 201 " file does not exist, line "+ lineno() ) |
191 | 202 |
192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " | 203 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort -u " |
193 | 204 |
194 for line in os.popen(cmd).readlines(): | 205 for line in os.popen(cmd).readlines(): |
195 if line.rstrip() != "": | 206 if line.rstrip() != "": |
196 k, v = line.rstrip().split() | 207 k, v = line.rstrip().split() |
197 taxidlist.append(k) | 208 taxidlist.append(k) |
198 accnrlist.append(v) | 209 accnrlist.append(v) |
199 | 210 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") |
200 return taxidlist | 211 |
201 # -------------------------------------------------------------------------- | 212 # -------------------------------------------------------------------------- |
202 | 213 |
203 # test load_taxids function | 214 # test load_taxids function |
204 # display taxidlist, then exit | 215 # display taxidlist, then exit |
205 if b_test_load_taxids: | 216 if b_test_load_taxids: |
206 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | 217 taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv' |
207 print("START b_test_load_taxids") | 218 print("START b_test_load_taxids") |
208 print("loading "+taxid_acc_tabular_f+" file") | 219 print("loading "+taxid_acc_tabular_f+" file") |
209 taxidlist = load_taxids(taxid_acc_tabular_f) | 220 load_taxids(taxid_acc_tabular_f) |
210 for i in range(len(taxidlist)): | 221 for i in range(len(taxidlist)): |
211 print(f"{taxidlist[i]}\t{accnrlist[i]}") | 222 print(f"{taxidlist[i]}\t{accnrlist[i]}") |
212 print("END b_test_load_taxids") | 223 print("END b_test_load_taxids") |
213 if not b_test_add_host_chr_taxids_accnr_from_ori_list: | 224 if not b_test_add_host_chr_taxids_accnr_from_ori_list: |
214 sys.exit() | 225 sys.exit() |
231 # function to retain the most recent acc nr for host complete genome found: | 242 # function to retain the most recent acc nr for host complete genome found: |
232 # - return acc nr of most recent complete genome | 243 # - return acc nr of most recent complete genome |
233 # - print accnr species and name retained | 244 # - print accnr species and name retained |
234 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp | 245 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp |
235 # -------------------------------------------------------------------------- | 246 # -------------------------------------------------------------------------- |
236 def retain_1accnr(accnrlisttmp, speciestmp, nametmp): | 247 def retain_1accnr(accnrlisttmp: list, speciestmp: list, nametmp: list) -> str: |
237 | 248 |
238 max_accnr_version = 0 | 249 max_accnr_version = 0 |
239 curr_accnr_version = 0 | 250 curr_accnr_version = 0 |
240 max_accnr_nr = 0 | 251 max_accnr_nr = 0 |
241 curr_accnr_nr = 0 | 252 curr_accnr_nr = 0 |
268 | 279 |
269 return kept_accn | 280 return kept_accn |
270 # -------------------------------------------------------------------------- | 281 # -------------------------------------------------------------------------- |
271 | 282 |
272 # -------------------------------------------------------------------------- | 283 # -------------------------------------------------------------------------- |
273 # procedure to find complete genome closely related to current taxid | 284 # Function to find complete genome closely related to current taxid |
274 # goes upper in taxonomy if nothing found until order | 285 # goes upper in taxonomy if nothing found until order |
275 # -------------------------------------------------------------------------- | 286 # -------------------------------------------------------------------------- |
276 def ngd_upper_lineage(curr_index_in_lineage, | 287 def ngd_upper_lineage(curr_index_in_lineage: int, |
277 lineage, | 288 lineage: list, |
278 ncbi, | 289 ncbi: NCBITaxa, |
279 accnrlisttmp, # current working list | 290 accnrlisttmp: list, # current working list |
280 accnrlist, # final list, if something added (or min index reached), recursivity stop | 291 accnrlist: list, # final list, if something added (or min index reached), recursivity stop |
281 speciestmp, | 292 speciestmp: list, |
282 nametmp | 293 nametmp: list |
283 ): | 294 ) -> str: |
284 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") | 295 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") |
285 | 296 |
286 # deduce up rank, search complet genome/chr in | 297 # deduce up rank, search complet genome/chr in |
287 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species | 298 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species |
288 rank = ncbi.get_rank([lineage[curr_index_in_lineage]]) | 299 rank = ncbi.get_rank([lineage[curr_index_in_lineage]]) |
291 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid, | 302 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid, |
292 intermediate_nodes=False, | 303 intermediate_nodes=False, |
293 collapse_subspecies=False, | 304 collapse_subspecies=False, |
294 return_tree=False | 305 return_tree=False |
295 ) | 306 ) |
307 | |
296 # int conversion to strings | 308 # int conversion to strings |
297 leaves_taxids = list(map(str, leaves_taxids)) | 309 leaves_taxids = list(map(str, leaves_taxids)) |
298 leaves_taxids_list = ','.join(leaves_taxids) | 310 leaves_taxids_list = ','.join(leaves_taxids) |
299 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | 311 |
300 # print(f"{prog_tag} cmd:{cmd}") | 312 if b_verbose: |
313 print(f"{prog_tag} leaves_taxids for taxid {upper_taxid}:{leaves_taxids_list}") | |
314 cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {leaves_taxids_list} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1" | |
315 if b_verbose: | |
316 print(f"{prog_tag} cmd:{cmd}") | |
301 | 317 |
302 # specific to retain_1accn to avoid lists are crashed by other ngd call | 318 # specific to retain_1accn to avoid lists are crashed by other ngd call |
303 accnrlisttmp_r = [] | 319 accnrlisttmp_r = [] |
304 speciestmp_r = [] | 320 speciestmp_r = [] |
305 nametmp_r = [] | 321 nametmp_r = [] |
322 | |
306 for line in os.popen(cmd).readlines(): | 323 for line in os.popen(cmd).readlines(): |
307 | 324 # print(f"line 314:{line.rstrip()}") |
308 if not re.match("^Considering", line): | 325 if not re.match("^Considering", line): |
309 # print(f"line:{line.rstrip()}") | 326 # print(f"line 316:{line.rstrip()}") |
310 if re.match("^(?:ERROR|Error): No downloads", line): | 327 if re.match("^(?:ERROR|Error): No downloads", line): |
311 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") | 328 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") |
312 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted | 329 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted |
313 curr_index_in_lineage = curr_index_in_lineage - 1 | 330 curr_index_in_lineage = curr_index_in_lineage - 1 |
314 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | 331 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}") |
315 return ngd_upper_lineage(curr_index_in_lineage, | 332 return ngd_upper_lineage(curr_index_in_lineage, |
316 lineage, | 333 lineage, |
317 ncbi, | 334 ncbi, |
318 accnrlisttmp, # current working list | 335 accnrlisttmp, # current working list |
319 accnrlist, # final list, if something added (or min index reached), recursivity stop | 336 accnrlist, # final list, if something added (or min index reached), recursivity stop |
320 speciestmp, | 337 speciestmp, |
321 nametmp | 338 nametmp |
322 ) | 339 ) |
323 else: | 340 else: |
324 acc_nr, species, name = line.rstrip().split("\t") | 341 # print(f"line 331:{line}") |
342 try: | |
343 acc_nr, species, name = line.rstrip().split("\t") | |
344 except ValueError as ve: | |
345 sys.exit(f"ValueError {ve}: for split of line '{line}'") | |
325 accnrlisttmp_r.append(acc_nr) | 346 accnrlisttmp_r.append(acc_nr) |
326 speciestmp_r.append(species) | 347 speciestmp_r.append(species) |
327 nametmp_r.append(name) | 348 nametmp_r.append(name) |
328 if b_verbose: | 349 if b_verbose: |
329 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | 350 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") |
337 | 358 |
338 # -------------------------------------------------------------------------- | 359 # -------------------------------------------------------------------------- |
339 # read taxids, deduce complete genomes available in genblank, provides in output file | 360 # read taxids, deduce complete genomes available in genblank, provides in output file |
340 # the acc number in addition to those already listed | 361 # the acc number in addition to those already listed |
341 # -------------------------------------------------------------------------- | 362 # -------------------------------------------------------------------------- |
342 def add_host_chr_taxids_accnr_from_ori_list(taxidlist, | 363 def add_host_chr_taxids_accnr_from_ori_list(taxidlist: list, |
343 accnrlist, | 364 accnrlist: list, |
344 acc_out_f): | 365 acc_out_f: str): |
345 | 366 |
346 # store all accnr found for complete genome of current taxid (or from same family/order) | 367 # store all accnr found for complete genome of current taxid (or from same family/order) |
347 # the aim is to keep only the most recent/complete | 368 # the aim is to keep only the most recent/complete |
348 accnrlisttmp = [] | 369 accnrlisttmp = [] |
349 speciestmp = [] | 370 speciestmp = [] |
353 taxids_list=','.join(taxidlist) | 374 taxids_list=','.join(taxidlist) |
354 | 375 |
355 # # ------------------------------------------ | 376 # # ------------------------------------------ |
356 # # ncbi-genome-download as a library | 377 # # ncbi-genome-download as a library |
357 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' | 378 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' |
358 # ngd.download(section='genbank', | 379 # ngd.download(section=ncbigenomedownload_section, |
359 # taxids=taxids_list, | 380 # taxids=taxids_list, |
360 # assembly_levels='chromosome', | 381 # assembly_levels=assembly_levels, |
361 # flat_output=True, | 382 # flat_output=True, |
362 # output=ngd_out_f, | 383 # output=ngd_out_f, |
363 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate', | 384 # groups=organisms_to_search_in, |
364 # dry_run=True | 385 # dry_run=True |
365 # ) | 386 # ) |
366 | 387 |
367 | 388 |
368 # cmd = "cut -f 1,2 "+ngd_out_f | 389 # cmd = "cut -f 1,2 "+ngd_out_f |
369 | 390 |
388 ncbi.update_taxonomy_database() | 409 ncbi.update_taxonomy_database() |
389 except: | 410 except: |
390 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") | 411 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") |
391 | 412 |
392 for taxid_u in taxidlist: | 413 for taxid_u in taxidlist: |
393 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | 414 print(f"{prog_tag} treating global taxid:{taxid_u}") |
415 cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {taxid_u} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1" | |
394 for line in os.popen(cmd).readlines(): | 416 for line in os.popen(cmd).readlines(): |
417 if b_verbose: | |
418 print(f"{prog_tag} cmd:{cmd} ran, read output") | |
395 # ERROR: No downloads matched your filter. Please check your options. | 419 # ERROR: No downloads matched your filter. Please check your options. |
396 if re.match("^(?:ERROR|Error): No downloads", line): | 420 if re.match("^(?:ERROR|Error): No downloads", line): |
397 # get complete lineage: accept ONLY leave taxid? (species) | 421 # get complete lineage: accept ONLY leave taxid? (species) |
398 lineage = ncbi.get_lineage(int(taxid_u)) | 422 lineage = ncbi.get_lineage(int(taxid_u)) |
399 name = ncbi.translate_to_names(lineage) | 423 name = ncbi.translate_to_names(lineage) |
400 if b_verbose: | 424 if b_verbose: |
401 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") | 425 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") |
402 | 426 |
403 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome | 427 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome |
404 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | 428 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}") |
405 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, | 429 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, |
406 lineage, | 430 lineage, |
407 ncbi, | 431 ncbi, |
408 accnrlisttmp, # current working list | 432 accnrlisttmp, # current working list |
409 accnrlist, # final list, if something added (or min index reached), recursivity stop | 433 accnrlist, # final list, if something added (or min index reached), recursivity stop |
410 speciestmp, | 434 speciestmp, |
411 nametmp | 435 nametmp |
412 ) | 436 ) |
413 accnrlist.append( new_acc_nr ) | 437 if new_acc_nr is None: |
438 print(f"No acc_nr found after going up in taxonomy, line {lineno()}") | |
439 else: | |
440 accnrlist.append( new_acc_nr ) | |
441 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") | |
414 | 442 |
415 # initialize for next search | 443 # initialize for next search |
416 accnrlisttmp = [] | 444 accnrlisttmp = [] |
417 speciestmp = [] | 445 speciestmp = [] |
418 nametmp = [] | 446 nametmp = [] |
419 | 447 |
420 elif not re.match("^Considering", line): | 448 elif not re.match("^Considering", line): |
421 # print(f"line:{line.rstrip()}") | 449 # print(f"line:{line.rstrip()}") |
422 acc_nr, species, name = line.rstrip().split("\t") | 450 acc_nr, species, name = line.rstrip().split("\t") |
423 accnrlisttmp.append(acc_nr) | 451 accnrlisttmp.append(acc_nr) |
452 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") | |
453 | |
424 speciestmp.append(species) | 454 speciestmp.append(species) |
425 nametmp.append(name) | 455 nametmp.append(name) |
426 if b_verbose: | 456 if b_verbose: |
427 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | 457 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") |
428 | 458 |
429 # retain only the most recent complete genome for current treated taxid | 459 # retain only the most recent complete genome for current treated taxid |
430 if len(accnrlisttmp): | 460 if len(accnrlisttmp): |
431 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) | 461 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) |
462 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") | |
432 | 463 |
433 # remove redundant accnr | 464 # remove redundant accnr |
465 print(f"accnrlist to sort:{accnrlist}") | |
434 accnrlist = list(sort_uniq(accnrlist)) | 466 accnrlist = list(sort_uniq(accnrlist)) |
435 with open(acc_out_f, "w") as record_file: | 467 with open(acc_out_f, "w") as record_file: |
436 for accnr in accnrlist: | 468 for accnr in accnrlist: |
437 record_file.write("%s\n" % (accnr)) | 469 record_file.write("%s\n" % (accnr)) |
438 # ------------------------------------------ | 470 # ------------------------------------------ |
440 print(f"{prog_tag} {acc_out_f} file created") | 472 print(f"{prog_tag} {acc_out_f} file created") |
441 | 473 |
442 # -------------------------------------------------------------------------- | 474 # -------------------------------------------------------------------------- |
443 # test | 475 # test |
444 if b_test_add_host_chr_taxids_accnr_from_ori_list: | 476 if b_test_add_host_chr_taxids_accnr_from_ori_list: |
445 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | 477 taxid_acc_in_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv' |
446 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv' | 478 acc_out_f = test_dir + 'megablast_out_f_taxid_acc_hostexpanded.tsv' |
447 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv' | |
448 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") | 479 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") |
449 print(f"{prog_tag} loading {taxid_acc_tabular_f} file") | 480 print(f"{prog_tag} loading {taxid_acc_in_f} file") |
450 taxidlist = load_taxids(taxid_acc_in_f) | 481 load_taxids(taxid_acc_in_f) |
482 for i in range(len(taxidlist)): | |
483 print(f"{taxidlist[i]}\t{accnrlist[i]}") | |
451 print(f"{prog_tag} end loading") | 484 print(f"{prog_tag} end loading") |
452 | 485 |
453 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | 486 add_host_chr_taxids_accnr_from_ori_list(taxidlist, |
454 accnrlist, | 487 accnrlist, |
455 acc_out_f) | 488 acc_out_f) |
461 # MAIN | 494 # MAIN |
462 # -------------------------------------------------------------------------- | 495 # -------------------------------------------------------------------------- |
463 ##### MAIN | 496 ##### MAIN |
464 def __main__(): | 497 def __main__(): |
465 # load taxid_acc file | 498 # load taxid_acc file |
466 load_taxids(taxid_acc_in_f) | 499 load_taxids(taxid_acc_tabular_f) |
467 # check in ncbi taxonomy which acc number are in and out of given taxid | 500 # check in ncbi taxonomy which acc number are in and out of given taxid |
468 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | 501 add_host_chr_taxids_accnr_from_ori_list(taxidlist, |
469 accnrlist, | 502 accnrlist, |
470 acc_out_f) | 503 acc_out_f) |
471 # -------------------------------------------------------------------------- | 504 # -------------------------------------------------------------------------- |