comparison TAXID_genusexpand_taxid2acc.py @ 12:19e175a84d0e draft

Uploaded updated python script
author p.lucas
date Thu, 21 Sep 2023 15:19:55 +0000
parents 3d116861e380
children 932ba9e04f3a
comparison
equal deleted inserted replaced
11:3d116861e380 12:19e175a84d0e
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level) 8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)
9 ### 9 ###
10 10
11 ### Libraries to import: 11 ### Libraries to import:
12 import argparse, os, sys, csv, warnings, re, itertools, operator 12 import argparse, os, sys, csv, warnings, re, itertools, operator
13 #from subprocess import Popen,PIPE
14 import subprocess
13 from os import path 15 from os import path
14 import ncbi_genome_download as ngd 16 import ncbi_genome_download as ngd
15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) 17 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
16 from ete3 import NCBITaxa 18 from ete3 import NCBITaxa
17 19
18 # to be able to report line number in error messages 20 # to be able to report line number in error messages
19 import inspect 21 import inspect
20 frame = inspect.currentframe() 22 def lineno():
23 """Returns the current line number in our program."""
24 return str(inspect.currentframe().f_back.f_lineno)
25
21 26
22 # debug 27 # debug
23 b_test_load_taxids = False # ok 2023 03 07 28 test_dir = 'test_TAXID_genusexpand_taxid2acc/'
24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07 29 b_test_load_taxids = False # ok 2023 08 25
30 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 08 25
25 31
26 prog_tag = '[' + os.path.basename(__file__) + ']' 32 prog_tag = '[' + os.path.basename(__file__) + ']'
27 33
28 # boolean to know if we dowload ncbi taxonomy file in current env 34 # boolean to know if we dowload ncbi taxonomy file in current env
29 b_load_ncbi_tax_f = False 35 b_load_ncbi_tax_f = False
34 accnrlist = [] 40 accnrlist = []
35 41
36 # order = -4 42 # order = -4
37 # family or clade = -3 43 # family or clade = -3
38 # subtribe or genus = -2 44 # subtribe or genus = -2
39 curr_index_in_lineage = -2 45 curr_index_in_lineage = -1
40 min_index_in_lineage = -4 46 min_index_in_lineage = -4
41 47
42 # boolean to know if we download ncbi taxonomy file in current env 48 # boolean to know if we download ncbi taxonomy file in current env
43 b_load_ncbi_tax_f = False 49 b_load_ncbi_tax_f = False
44 b_test_all = False 50 b_test_all = False
45 b_test = False 51 b_test = False
46 b_acc_in_f = False 52 b_acc_in_f = False
47 b_acc_out_f = False 53 b_acc_out_f = False
48 54
49 b_verbose = False 55 b_verbose = False
56
57 # variables for ncbi-genome-download
58 ncbigenomedownload_section = 'refseq' # genbank
59 organisms_to_search_in = 'vertebrate_other,vertebrate_mammalian,plant,invertebrate'
60 assembly_levels = 'complete,chromosome'
50 61
51 # rank = '' # rank level retained by user 62 # rank = '' # rank level retained by user
52 # rank_num = index of rank retained by user 63 # rank_num = index of rank retained by user
53 64
54 # # set to check that provided rank exist to be sure to be able to use it 65 # # set to check that provided rank exist to be sure to be able to use it
133 ' '.join(['./TAXID_genusexpand_taxid2acc.py', 144 ' '.join(['./TAXID_genusexpand_taxid2acc.py',
134 '-i taxid_accnr_list.tsv', 145 '-i taxid_accnr_list.tsv',
135 '-o accnr_out_list.txt']),"\n\n" ])) 146 '-o accnr_out_list.txt']),"\n\n" ]))
136 parser.print_help() 147 parser.print_help()
137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + 148 print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
138 " arguments, exit line "+str(frame.f_lineno)) 149 " arguments, exit line "+lineno())
139 sys.exit(0) 150 sys.exit(0)
140 151
141 # print('args:', args) 152 # print('args:', args)
142 # if(not b_test): 153 # if(not b_test):
143 if args.ncbi_tax_f is not None: 154 if args.ncbi_tax_f is not None:
144 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f) 155 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f)
145 else: 156 else:
146 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" 157 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
147 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") 158 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")
148 if args.taxid_acc_in_f is not None: 159 if args.taxid_acc_in_f is not None:
149 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f) 160 taxid_acc_in_f = os.path.abspath(args.taxid_acc_f)
150 b_acc_in_f = True 161 b_acc_in_f = True
151 elif(not b_test): 162 elif(not b_test):
152 sys.exit("[Error] You must provide taxid_acc_in_f") 163 sys.exit("[Error] You must provide taxid_acc_in_f")
153 if args.acc_out_f is not None: 164 if args.acc_out_f is not None:
154 acc_out_f = os.path.abspath(args.acc_out_f) 165 acc_out_f = os.path.abspath(args.acc_out_f)
172 183
173 # -------------------------------------------------------------------------- 184 # --------------------------------------------------------------------------
174 # to sort uniq, for a list, only need to add list conversion 185 # to sort uniq, for a list, only need to add list conversion
175 # -------------------------------------------------------------------------- 186 # --------------------------------------------------------------------------
176 mapper= map # Python ≥ 3 187 mapper= map # Python ≥ 3
177 def sort_uniq(sequence): 188 def sort_uniq(sequence: list):
178 return mapper( 189 return mapper(
179 operator.itemgetter(0), 190 operator.itemgetter(0),
180 itertools.groupby(sorted(sequence))) 191 itertools.groupby(sorted(sequence)))
181 # -------------------------------------------------------------------------- 192 # --------------------------------------------------------------------------
182 193
183 # -------------------------------------------------------------------------- 194 # --------------------------------------------------------------------------
184 # load taxid acc list, return taxidlist 195 # Procedure: load taxid acc list, return taxidlist
185 # -------------------------------------------------------------------------- 196 # --------------------------------------------------------------------------
186 def load_taxids(taxid_acc_tabular_f): 197 def load_taxids(taxid_acc_tabular_f: str):
187 198
188 if not path.exists(taxid_acc_tabular_f): 199 if not path.exists(taxid_acc_tabular_f):
189 sys.exit("Error " + taxid_acc_tabular_f + 200 sys.exit("Error " + taxid_acc_tabular_f +
190 " file does not exist, line "+ str(sys._getframe().f_lineno) ) 201 " file does not exist, line "+ lineno() )
191 202
192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " 203 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort -u "
193 204
194 for line in os.popen(cmd).readlines(): 205 for line in os.popen(cmd).readlines():
195 if line.rstrip() != "": 206 if line.rstrip() != "":
196 k, v = line.rstrip().split() 207 k, v = line.rstrip().split()
197 taxidlist.append(k) 208 taxidlist.append(k)
198 accnrlist.append(v) 209 accnrlist.append(v)
199 210 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
200 return taxidlist 211
201 # -------------------------------------------------------------------------- 212 # --------------------------------------------------------------------------
202 213
203 # test load_taxids function 214 # test load_taxids function
204 # display taxidlist, then exit 215 # display taxidlist, then exit
205 if b_test_load_taxids: 216 if b_test_load_taxids:
206 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' 217 taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
207 print("START b_test_load_taxids") 218 print("START b_test_load_taxids")
208 print("loading "+taxid_acc_tabular_f+" file") 219 print("loading "+taxid_acc_tabular_f+" file")
209 taxidlist = load_taxids(taxid_acc_tabular_f) 220 load_taxids(taxid_acc_tabular_f)
210 for i in range(len(taxidlist)): 221 for i in range(len(taxidlist)):
211 print(f"{taxidlist[i]}\t{accnrlist[i]}") 222 print(f"{taxidlist[i]}\t{accnrlist[i]}")
212 print("END b_test_load_taxids") 223 print("END b_test_load_taxids")
213 if not b_test_add_host_chr_taxids_accnr_from_ori_list: 224 if not b_test_add_host_chr_taxids_accnr_from_ori_list:
214 sys.exit() 225 sys.exit()
231 # function to retain the most recent acc nr for host complete genome found: 242 # function to retain the most recent acc nr for host complete genome found:
232 # - return acc nr of most recent complete genome 243 # - return acc nr of most recent complete genome
233 # - print accnr species and name retained 244 # - print accnr species and name retained
234 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp 245 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp
235 # -------------------------------------------------------------------------- 246 # --------------------------------------------------------------------------
236 def retain_1accnr(accnrlisttmp, speciestmp, nametmp): 247 def retain_1accnr(accnrlisttmp: list, speciestmp: list, nametmp: list) -> str:
237 248
238 max_accnr_version = 0 249 max_accnr_version = 0
239 curr_accnr_version = 0 250 curr_accnr_version = 0
240 max_accnr_nr = 0 251 max_accnr_nr = 0
241 curr_accnr_nr = 0 252 curr_accnr_nr = 0
268 279
269 return kept_accn 280 return kept_accn
270 # -------------------------------------------------------------------------- 281 # --------------------------------------------------------------------------
271 282
272 # -------------------------------------------------------------------------- 283 # --------------------------------------------------------------------------
273 # procedure to find complete genome closely related to current taxid 284 # Function to find complete genome closely related to current taxid
274 # goes upper in taxonomy if nothing found until order 285 # goes upper in taxonomy if nothing found until order
275 # -------------------------------------------------------------------------- 286 # --------------------------------------------------------------------------
276 def ngd_upper_lineage(curr_index_in_lineage, 287 def ngd_upper_lineage(curr_index_in_lineage: int,
277 lineage, 288 lineage: list,
278 ncbi, 289 ncbi: NCBITaxa,
279 accnrlisttmp, # current working list 290 accnrlisttmp: list, # current working list
280 accnrlist, # final list, if something added (or min index reached), recursivity stop 291 accnrlist: list, # final list, if something added (or min index reached), recursivity stop
281 speciestmp, 292 speciestmp: list,
282 nametmp 293 nametmp: list
283 ): 294 ) -> str:
284 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") 295 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}")
285 296
286 # deduce up rank, search complet genome/chr in 297 # deduce up rank, search complet genome/chr in
287 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species 298 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species
288 rank = ncbi.get_rank([lineage[curr_index_in_lineage]]) 299 rank = ncbi.get_rank([lineage[curr_index_in_lineage]])
291 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid, 302 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid,
292 intermediate_nodes=False, 303 intermediate_nodes=False,
293 collapse_subspecies=False, 304 collapse_subspecies=False,
294 return_tree=False 305 return_tree=False
295 ) 306 )
307
296 # int conversion to strings 308 # int conversion to strings
297 leaves_taxids = list(map(str, leaves_taxids)) 309 leaves_taxids = list(map(str, leaves_taxids))
298 leaves_taxids_list = ','.join(leaves_taxids) 310 leaves_taxids_list = ','.join(leaves_taxids)
299 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" 311
300 # print(f"{prog_tag} cmd:{cmd}") 312 if b_verbose:
313 print(f"{prog_tag} leaves_taxids for taxid {upper_taxid}:{leaves_taxids_list}")
314 cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {leaves_taxids_list} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1"
315 if b_verbose:
316 print(f"{prog_tag} cmd:{cmd}")
301 317
302 # specific to retain_1accn to avoid lists are crashed by other ngd call 318 # specific to retain_1accn to avoid lists are crashed by other ngd call
303 accnrlisttmp_r = [] 319 accnrlisttmp_r = []
304 speciestmp_r = [] 320 speciestmp_r = []
305 nametmp_r = [] 321 nametmp_r = []
322
306 for line in os.popen(cmd).readlines(): 323 for line in os.popen(cmd).readlines():
307 324 # print(f"line 314:{line.rstrip()}")
308 if not re.match("^Considering", line): 325 if not re.match("^Considering", line):
309 # print(f"line:{line.rstrip()}") 326 # print(f"line 316:{line.rstrip()}")
310 if re.match("^(?:ERROR|Error): No downloads", line): 327 if re.match("^(?:ERROR|Error): No downloads", line):
311 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") 328 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})")
312 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted 329 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted
313 curr_index_in_lineage = curr_index_in_lineage - 1 330 curr_index_in_lineage = curr_index_in_lineage - 1
314 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") 331 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}")
315 return ngd_upper_lineage(curr_index_in_lineage, 332 return ngd_upper_lineage(curr_index_in_lineage,
316 lineage, 333 lineage,
317 ncbi, 334 ncbi,
318 accnrlisttmp, # current working list 335 accnrlisttmp, # current working list
319 accnrlist, # final list, if something added (or min index reached), recursivity stop 336 accnrlist, # final list, if something added (or min index reached), recursivity stop
320 speciestmp, 337 speciestmp,
321 nametmp 338 nametmp
322 ) 339 )
323 else: 340 else:
324 acc_nr, species, name = line.rstrip().split("\t") 341 # print(f"line 331:{line}")
342 try:
343 acc_nr, species, name = line.rstrip().split("\t")
344 except ValueError as ve:
345 sys.exit(f"ValueError {ve}: for split of line '{line}'")
325 accnrlisttmp_r.append(acc_nr) 346 accnrlisttmp_r.append(acc_nr)
326 speciestmp_r.append(species) 347 speciestmp_r.append(species)
327 nametmp_r.append(name) 348 nametmp_r.append(name)
328 if b_verbose: 349 if b_verbose:
329 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") 350 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
337 358
338 # -------------------------------------------------------------------------- 359 # --------------------------------------------------------------------------
339 # read taxids, deduce complete genomes available in genblank, provides in output file 360 # read taxids, deduce complete genomes available in genblank, provides in output file
340 # the acc number in addition to those already listed 361 # the acc number in addition to those already listed
341 # -------------------------------------------------------------------------- 362 # --------------------------------------------------------------------------
342 def add_host_chr_taxids_accnr_from_ori_list(taxidlist, 363 def add_host_chr_taxids_accnr_from_ori_list(taxidlist: list,
343 accnrlist, 364 accnrlist: list,
344 acc_out_f): 365 acc_out_f: str):
345 366
346 # store all accnr found for complete genome of current taxid (or from same family/order) 367 # store all accnr found for complete genome of current taxid (or from same family/order)
347 # the aim is to keep only the most recent/complete 368 # the aim is to keep only the most recent/complete
348 accnrlisttmp = [] 369 accnrlisttmp = []
349 speciestmp = [] 370 speciestmp = []
353 taxids_list=','.join(taxidlist) 374 taxids_list=','.join(taxidlist)
354 375
355 # # ------------------------------------------ 376 # # ------------------------------------------
356 # # ncbi-genome-download as a library 377 # # ncbi-genome-download as a library
357 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' 378 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv'
358 # ngd.download(section='genbank', 379 # ngd.download(section=ncbigenomedownload_section,
359 # taxids=taxids_list, 380 # taxids=taxids_list,
360 # assembly_levels='chromosome', 381 # assembly_levels=assembly_levels,
361 # flat_output=True, 382 # flat_output=True,
362 # output=ngd_out_f, 383 # output=ngd_out_f,
363 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate', 384 # groups=organisms_to_search_in,
364 # dry_run=True 385 # dry_run=True
365 # ) 386 # )
366 387
367 388
368 # cmd = "cut -f 1,2 "+ngd_out_f 389 # cmd = "cut -f 1,2 "+ngd_out_f
369 390
388 ncbi.update_taxonomy_database() 409 ncbi.update_taxonomy_database()
389 except: 410 except:
390 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") 411 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
391 412
392 for taxid_u in taxidlist: 413 for taxid_u in taxidlist:
393 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" 414 print(f"{prog_tag} treating global taxid:{taxid_u}")
415 cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {taxid_u} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1"
394 for line in os.popen(cmd).readlines(): 416 for line in os.popen(cmd).readlines():
417 if b_verbose:
418 print(f"{prog_tag} cmd:{cmd} ran, read output")
395 # ERROR: No downloads matched your filter. Please check your options. 419 # ERROR: No downloads matched your filter. Please check your options.
396 if re.match("^(?:ERROR|Error): No downloads", line): 420 if re.match("^(?:ERROR|Error): No downloads", line):
397 # get complete lineage: accept ONLY leave taxid? (species) 421 # get complete lineage: accept ONLY leave taxid? (species)
398 lineage = ncbi.get_lineage(int(taxid_u)) 422 lineage = ncbi.get_lineage(int(taxid_u))
399 name = ncbi.translate_to_names(lineage) 423 name = ncbi.translate_to_names(lineage)
400 if b_verbose: 424 if b_verbose:
401 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") 425 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}")
402 426
403 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome 427 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome
404 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") 428 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}")
405 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, 429 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage,
406 lineage, 430 lineage,
407 ncbi, 431 ncbi,
408 accnrlisttmp, # current working list 432 accnrlisttmp, # current working list
409 accnrlist, # final list, if something added (or min index reached), recursivity stop 433 accnrlist, # final list, if something added (or min index reached), recursivity stop
410 speciestmp, 434 speciestmp,
411 nametmp 435 nametmp
412 ) 436 )
413 accnrlist.append( new_acc_nr ) 437 if new_acc_nr is None:
438 print(f"No acc_nr found after going up in taxonomy, line {lineno()}")
439 else:
440 accnrlist.append( new_acc_nr )
441 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
414 442
415 # initialize for next search 443 # initialize for next search
416 accnrlisttmp = [] 444 accnrlisttmp = []
417 speciestmp = [] 445 speciestmp = []
418 nametmp = [] 446 nametmp = []
419 447
420 elif not re.match("^Considering", line): 448 elif not re.match("^Considering", line):
421 # print(f"line:{line.rstrip()}") 449 # print(f"line:{line.rstrip()}")
422 acc_nr, species, name = line.rstrip().split("\t") 450 acc_nr, species, name = line.rstrip().split("\t")
423 accnrlisttmp.append(acc_nr) 451 accnrlisttmp.append(acc_nr)
452 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
453
424 speciestmp.append(species) 454 speciestmp.append(species)
425 nametmp.append(name) 455 nametmp.append(name)
426 if b_verbose: 456 if b_verbose:
427 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") 457 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
428 458
429 # retain only the most recent complete genome for current treated taxid 459 # retain only the most recent complete genome for current treated taxid
430 if len(accnrlisttmp): 460 if len(accnrlisttmp):
431 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) 461 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) )
462 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
432 463
433 # remove redundant accnr 464 # remove redundant accnr
465 print(f"accnrlist to sort:{accnrlist}")
434 accnrlist = list(sort_uniq(accnrlist)) 466 accnrlist = list(sort_uniq(accnrlist))
435 with open(acc_out_f, "w") as record_file: 467 with open(acc_out_f, "w") as record_file:
436 for accnr in accnrlist: 468 for accnr in accnrlist:
437 record_file.write("%s\n" % (accnr)) 469 record_file.write("%s\n" % (accnr))
438 # ------------------------------------------ 470 # ------------------------------------------
440 print(f"{prog_tag} {acc_out_f} file created") 472 print(f"{prog_tag} {acc_out_f} file created")
441 473
442 # -------------------------------------------------------------------------- 474 # --------------------------------------------------------------------------
443 # test 475 # test
444 if b_test_add_host_chr_taxids_accnr_from_ori_list: 476 if b_test_add_host_chr_taxids_accnr_from_ori_list:
445 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' 477 taxid_acc_in_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
446 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv' 478 acc_out_f = test_dir + 'megablast_out_f_taxid_acc_hostexpanded.tsv'
447 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv'
448 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") 479 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list")
449 print(f"{prog_tag} loading {taxid_acc_tabular_f} file") 480 print(f"{prog_tag} loading {taxid_acc_in_f} file")
450 taxidlist = load_taxids(taxid_acc_in_f) 481 load_taxids(taxid_acc_in_f)
482 for i in range(len(taxidlist)):
483 print(f"{taxidlist[i]}\t{accnrlist[i]}")
451 print(f"{prog_tag} end loading") 484 print(f"{prog_tag} end loading")
452 485
453 add_host_chr_taxids_accnr_from_ori_list(taxidlist, 486 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
454 accnrlist, 487 accnrlist,
455 acc_out_f) 488 acc_out_f)
461 # MAIN 494 # MAIN
462 # -------------------------------------------------------------------------- 495 # --------------------------------------------------------------------------
463 ##### MAIN 496 ##### MAIN
464 def __main__(): 497 def __main__():
465 # load taxid_acc file 498 # load taxid_acc file
466 load_taxids(taxid_acc_in_f) 499 load_taxids(taxid_acc_tabular_f)
467 # check in ncbi taxonomy which acc number are in and out of given taxid 500 # check in ncbi taxonomy which acc number are in and out of given taxid
468 add_host_chr_taxids_accnr_from_ori_list(taxidlist, 501 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
469 accnrlist, 502 accnrlist,
470 acc_out_f) 503 acc_out_f)
471 # -------------------------------------------------------------------------- 504 # --------------------------------------------------------------------------