Mercurial > repos > p.lucas > taxid_genusexpand_taxid2acc
annotate TAXID_genusexpand_taxid2acc.py @ 7:9238e54d6532 draft
Uploaded 13 03 23 Fix variable bug line 143
author | p.lucas |
---|---|
date | Mon, 13 Mar 2023 15:09:54 +0000 |
parents | 81acd8138218 |
children | 8c9418230a5a |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python3 |
2 # -*- coding: utf-8 -*- | |
3 ### | |
4 # USE PYTHON3 | |
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads) | |
6 # provide 2 files: | |
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level) | |
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level) | |
9 ### | |
10 | |
11 ### Libraries to import: | |
12 import argparse, os, sys, csv, warnings, re, itertools, operator | |
13 from os import path | |
14 import ncbi_genome_download as ngd | |
15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) | |
16 from ete3 import NCBITaxa | |
17 | |
18 # to be able to report line number in error messages | |
19 import inspect | |
20 frame = inspect.currentframe() | |
21 | |
22 # debug | |
23 b_test_load_taxids = False # ok 2023 03 07 | |
24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07 | |
25 | |
26 prog_tag = '[' + os.path.basename(__file__) + ']' | |
27 | |
28 # boolean to know if we dowload ncbi taxonomy file in current env | |
29 b_load_ncbi_tax_f = False | |
30 | |
31 # list of interesting taxid (fathers) | |
32 taxidlist_f = '' | |
33 taxidlist = [] | |
34 accnrlist = [] | |
35 | |
36 # order = -4 | |
37 # family or clade = -3 | |
38 # subtribe or genus = -2 | |
39 curr_index_in_lineage = -2 | |
40 min_index_in_lineage = -4 | |
41 | |
42 # boolean to know if we download ncbi taxonomy file in current env | |
43 b_load_ncbi_tax_f = False | |
44 b_test_all = False | |
45 b_test = False | |
46 b_acc_in_f = False | |
47 b_acc_out_f = False | |
48 | |
49 b_verbose = False | |
50 | |
51 # rank = '' # rank level retained by user | |
52 # rank_num = index of rank retained by user | |
53 | |
54 # # set to check that provided rank exist to be sure to be able to use it | |
55 # ranks = { | |
56 # 'superkingdom' => 0, | |
57 # # 'clade', # no, several clade, name is too generic | |
58 # 'kingdom' => 1, | |
59 # 'phylum' => 2, | |
60 # 'subphylum' => 3, | |
61 # 'superclass' => 4, | |
62 # 'class' => 5, | |
63 # 'superorder' => 6, | |
64 # 'order' => 7, | |
65 # 'suborder' => 8, | |
66 # 'infraorder' => 9, | |
67 # 'parvorder' => 10, | |
68 # 'superfamily' => 11, | |
69 # 'family' => 12, | |
70 # 'subfamily' => 13, | |
71 # 'genus' => 14, | |
72 # 'species' => 15, | |
73 # 'subspecies' => 16 | |
74 # } | |
75 | |
76 parser = argparse.ArgumentParser() | |
77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f', | |
78 help="taxid acc_number list in tsv (tabular separated at each line)", | |
79 metavar="FILE") | |
80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f', | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree", |
0 | 82 metavar="FILE") |
83 # parser.add_argument("-r", "--rank", dest='rank', | |
84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list", | |
85 # action="store_const") | |
86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f', | |
87 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree", | |
88 metavar="FILE") | |
89 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f', | |
90 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run", | |
91 action='store_true') | |
92 parser.add_argument("-z", "--test_all", dest='b_test_all', | |
93 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script", | |
94 action='store_true') | |
95 parser.add_argument("-v", "--verbose", dest='b_verbose', | |
96 help="[Optional] To have details on records when running", | |
97 action='store_true') | |
98 parser.set_defaults(b_load_ncbi_tax_f=False) | |
99 parser.set_defaults(b_test_all=False) | |
100 parser.set_defaults(b_verbose=False) | |
101 | |
102 # get absolute path in case of files | |
103 args = parser.parse_args() | |
104 | |
105 # ------------------------------------------- | |
106 # check arguments | |
107 b_test_all = args.b_test_all | |
108 | |
109 if b_test_all: | |
110 b_test_load_taxids = False | |
111 b_test_add_host_chr_taxids_accnr_from_ori_list = True | |
112 b_test = True | |
113 b_acc_in_f = True | |
114 b_acc_out_f = True | |
115 else: | |
116 b_test = (b_test_load_taxids or | |
117 b_test_add_host_chr_taxids_accnr_from_ori_list) | |
118 | |
119 if ((not b_test)and | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
120 ((len(sys.argv) < 2) or (len(sys.argv) > 5))): |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
121 print("\n".join([ |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
122 "Aim: find accession numbers of complete genomes related to provided taxids.", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
123 "If not found at species level, try at upper taxonomic level until order.", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
124 "Retains only 1 complete genome is several available:", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
125 " - the one with the highest version number, if not sufficient", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
126 " - the one with the highest accession number", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
127 "To use this scripts, run:", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
128 "conda activate TAXID_genusexpand_taxid2acc", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
129 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
130 " ", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
131 "Then you won't need --test_all --load_ncbi_tax_f options\n\n", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
132 "Then, as an example:\n\n", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
133 ' '.join(['./TAXID_genusexpand_taxid2acc.py', |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
134 '-i taxid_accnr_list.tsv', |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
135 '-o accnr_out_list.txt']),"\n\n" ])) |
0 | 136 parser.print_help() |
137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + | |
138 " arguments, exit line "+str(frame.f_lineno)) | |
139 sys.exit(0) | |
140 | |
141 # print('args:', args) | |
142 # if(not b_test): | |
143 if args.ncbi_tax_f is not None: | |
144 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f) | |
145 else: | |
146 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" | |
147 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") | |
148 if args.taxid_acc_in_f is not None: | |
7 | 149 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f) |
0 | 150 b_acc_in_f = True |
151 elif(not b_test): | |
152 sys.exit("[Error] You must provide taxid_acc_in_f") | |
153 if args.acc_out_f is not None: | |
154 acc_out_f = os.path.abspath(args.acc_out_f) | |
155 b_acc_out_f = True | |
156 elif(not b_test): | |
157 sys.exit("-acc_out_f <accnr_file>n must be provided\n") | |
158 # if args.rank is not None: | |
159 # rank = 'genus' | |
160 # else: | |
161 # rank = args.rank | |
162 | |
163 if args.b_verbose is not None: | |
164 b_verbose = int(args.b_verbose) | |
165 | |
166 if(not b_test): | |
167 if (not b_acc_in_f) and (not b_acc_out_f): | |
168 sys.exit(prog_tag + "[Error] You must provide either --acc_f <file> and -acc_out_f <file>") | |
169 | |
170 # # store index of the rank expected by user | |
171 # rank_num = ranks{ rank } | |
172 | |
173 # -------------------------------------------------------------------------- | |
174 # to sort uniq, for a list, only need to add list conversion | |
175 # -------------------------------------------------------------------------- | |
176 mapper= map # Python ≥ 3 | |
177 def sort_uniq(sequence): | |
178 return mapper( | |
179 operator.itemgetter(0), | |
180 itertools.groupby(sorted(sequence))) | |
181 # -------------------------------------------------------------------------- | |
182 | |
183 # -------------------------------------------------------------------------- | |
184 # load taxid acc list, return taxidlist | |
185 # -------------------------------------------------------------------------- | |
186 def load_taxids(taxid_acc_tabular_f): | |
187 | |
188 if not path.exists(taxid_acc_tabular_f): | |
189 sys.exit("Error " + taxid_acc_tabular_f + | |
190 " file does not exist, line "+ str(sys._getframe().f_lineno) ) | |
191 | |
192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " | |
193 | |
194 for line in os.popen(cmd).readlines(): | |
195 k, v = line.rstrip().split() | |
196 taxidlist.append(k) | |
197 accnrlist.append(v) | |
198 | |
199 return taxidlist | |
200 # -------------------------------------------------------------------------- | |
201 | |
202 # test load_taxids function | |
203 # display taxidlist, then exit | |
204 if b_test_load_taxids: | |
205 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | |
206 print("START b_test_load_taxids") | |
207 print("loading "+taxid_acc_tabular_f+" file") | |
208 taxidlist = load_taxids(taxid_acc_tabular_f) | |
209 for i in range(len(taxidlist)): | |
210 print(f"{taxidlist[i]}\t{accnrlist[i]}") | |
211 print("END b_test_load_taxids") | |
212 if not b_test_add_host_chr_taxids_accnr_from_ori_list: | |
213 sys.exit() | |
214 # -------------------------------------------------------------------------- | |
215 | |
216 # # -------------------------------------------------------------------------- | |
217 # # needs internet connexion, not possible | |
218 # # -------------------------------------------------------------------------- | |
219 # def get_leave_taxid_from_acc_nr(accnrlist): | |
220 | |
221 # # deduce a list of taxid from a list of accession numbers | |
222 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u" | |
223 # for line in os.popen(cmd).readlines(): | |
224 # taxidlist.append(line.rstrip()) | |
225 | |
226 # return taxidlist | |
227 # # -------------------------------------------------------------------------- | |
228 | |
229 # -------------------------------------------------------------------------- | |
230 # function to retain the most recent acc nr for host complete genome found: | |
231 # - return acc nr of most recent complete genome | |
232 # - print accnr species and name retained | |
233 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp | |
234 # -------------------------------------------------------------------------- | |
235 def retain_1accnr(accnrlisttmp, speciestmp, nametmp): | |
236 | |
237 max_accnr_version = 0 | |
238 curr_accnr_version = 0 | |
239 max_accnr_nr = 0 | |
240 curr_accnr_nr = 0 | |
241 kept_accnr_i = 0 | |
242 p = re.compile(".*?(\d+)\.(\d+)$") | |
243 | |
244 # print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}") | |
245 | |
246 for i in range(len(accnrlisttmp)): | |
247 m = p.match( accnrlisttmp[i] ) | |
248 if m: | |
249 # print('Match found: ', m.group(2)) | |
250 curr_accnr_version = int(m.group(2)) | |
251 accnr_nr = int(m.group(1)) | |
252 if curr_accnr_version > max_accnr_version: | |
253 max_accnr_version = curr_accnr_version | |
254 kept_accnr_i = i | |
255 # print(f"record kept_accnr_i:{kept_accnr_i}") | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
256 elif(( curr_accnr_version == max_accnr_version)and |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
257 (curr_accnr_nr > max_accnr_nr)): |
0 | 258 max_accnr_nr = curr_accnr_nr |
259 kept_accnr_i = i | |
260 # print(f"record kept_accnr_i:{kept_accnr_i}") | |
261 | |
262 else: | |
263 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[i]}") | |
264 | |
265 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}") | |
266 kept_accn = accnrlisttmp[kept_accnr_i] | |
267 | |
268 return kept_accn | |
269 # -------------------------------------------------------------------------- | |
270 | |
271 # -------------------------------------------------------------------------- | |
272 # procedure to find complete genome closely related to current taxid | |
273 # goes upper in taxonomy if nothing found until order | |
274 # -------------------------------------------------------------------------- | |
275 def ngd_upper_lineage(curr_index_in_lineage, | |
276 lineage, | |
277 ncbi, | |
278 accnrlisttmp, # current working list | |
279 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
280 speciestmp, | |
281 nametmp | |
282 ): | |
283 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") | |
284 | |
285 # deduce up rank, search complet genome/chr in | |
286 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species | |
287 rank = ncbi.get_rank([lineage[curr_index_in_lineage]]) | |
288 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]]) | |
289 print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}") | |
290 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid, | |
291 intermediate_nodes=False, | |
292 collapse_subspecies=False, | |
293 return_tree=False | |
294 ) | |
295 # int conversion to strings | |
296 leaves_taxids = list(map(str, leaves_taxids)) | |
297 leaves_taxids_list = ','.join(leaves_taxids) | |
298 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | |
299 # print(f"{prog_tag} cmd:{cmd}") | |
300 | |
301 # specific to retain_1accn to avoid lists are crashed by other ngd call | |
302 accnrlisttmp_r = [] | |
303 speciestmp_r = [] | |
304 nametmp_r = [] | |
305 for line in os.popen(cmd).readlines(): | |
306 | |
307 if not re.match("^Considering", line): | |
308 # print(f"line:{line.rstrip()}") | |
309 if re.match("^(?:ERROR|Error): No downloads", line): | |
310 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") | |
311 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted | |
312 curr_index_in_lineage = curr_index_in_lineage - 1 | |
313 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | |
314 return ngd_upper_lineage(curr_index_in_lineage, | |
315 lineage, | |
316 ncbi, | |
317 accnrlisttmp, # current working list | |
318 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
319 speciestmp, | |
320 nametmp | |
321 ) | |
322 else: | |
323 acc_nr, species, name = line.rstrip().split("\t") | |
324 accnrlisttmp_r.append(acc_nr) | |
325 speciestmp_r.append(species) | |
326 nametmp_r.append(name) | |
327 if b_verbose: | |
328 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | |
329 | |
330 # retain only the most recent complete genome for current treated taxid | |
331 return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r) | |
332 | |
333 # else: | |
334 # print(f"line matching Considering:{line}") | |
335 # -------------------------------------------------------------------------- | |
336 | |
337 # -------------------------------------------------------------------------- | |
338 # read taxids, deduce complete genomes available in genblank, provides in output file | |
339 # the acc number in addition to those already listed | |
340 # -------------------------------------------------------------------------- | |
341 def add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
342 accnrlist, | |
343 acc_out_f): | |
344 | |
345 # store all accnr found for complete genome of current taxid (or from same family/order) | |
346 # the aim is to keep only the most recent/complete | |
347 accnrlisttmp = [] | |
348 speciestmp = [] | |
349 nametmp = [] | |
350 | |
351 # get host complete genome when found using ncbi_genome_download | |
352 taxids_list=','.join(taxidlist) | |
353 | |
354 # # ------------------------------------------ | |
355 # # ncbi-genome-download as a library | |
356 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' | |
357 # ngd.download(section='genbank', | |
358 # taxids=taxids_list, | |
359 # assembly_levels='chromosome', | |
360 # flat_output=True, | |
361 # output=ngd_out_f, | |
362 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate', | |
363 # dry_run=True | |
364 # ) | |
365 | |
366 | |
367 # cmd = "cut -f 1,2 "+ngd_out_f | |
368 | |
369 # for line in os.popen(cmd).readlines(): | |
370 # acc_nr, species = line.rstrip().split() | |
371 # accnrlist.append(acc_nr) | |
372 | |
373 # if b_verbose: | |
374 # print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr}") | |
375 # # ------------------------------------------ | |
376 | |
377 # ------------------------------------------ | |
378 # ncbi-genome-download as executable script | |
379 # ------------------------------------------ | |
380 | |
381 # load NCBITaxa | |
382 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory) | |
383 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f) | |
384 ncbi = NCBITaxa(dbfile=ncbi_tax_f) | |
385 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f: | |
386 try: | |
387 ncbi.update_taxonomy_database() | |
388 except: | |
389 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") | |
390 | |
391 for taxid_u in taxidlist: | |
392 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | |
393 for line in os.popen(cmd).readlines(): | |
394 # ERROR: No downloads matched your filter. Please check your options. | |
395 if re.match("^(?:ERROR|Error): No downloads", line): | |
396 # get complete lineage: accept ONLY leave taxid? (species) | |
397 lineage = ncbi.get_lineage(int(taxid_u)) | |
398 name = ncbi.translate_to_names(lineage) | |
399 if b_verbose: | |
400 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") | |
401 | |
402 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome | |
403 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | |
404 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, | |
405 lineage, | |
406 ncbi, | |
407 accnrlisttmp, # current working list | |
408 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
409 speciestmp, | |
410 nametmp | |
411 ) | |
412 accnrlist.append( new_acc_nr ) | |
413 | |
414 # initialize for next search | |
415 accnrlisttmp = [] | |
416 speciestmp = [] | |
417 nametmp = [] | |
418 | |
419 elif not re.match("^Considering", line): | |
420 # print(f"line:{line.rstrip()}") | |
421 acc_nr, species, name = line.rstrip().split("\t") | |
422 accnrlisttmp.append(acc_nr) | |
423 speciestmp.append(species) | |
424 nametmp.append(name) | |
425 if b_verbose: | |
426 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | |
427 | |
428 # retain only the most recent complete genome for current treated taxid | |
429 if len(accnrlisttmp): | |
430 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) | |
431 | |
432 # remove redundant accnr | |
433 accnrlist = list(sort_uniq(accnrlist)) | |
434 with open(acc_out_f, "w") as record_file: | |
435 for accnr in accnrlist: | |
436 record_file.write("%s\n" % (accnr)) | |
437 # ------------------------------------------ | |
438 | |
439 print(f"{prog_tag} {acc_out_f} file created") | |
440 | |
441 # -------------------------------------------------------------------------- | |
442 # test | |
443 if b_test_add_host_chr_taxids_accnr_from_ori_list: | |
444 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | |
445 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv' | |
446 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv' | |
447 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") | |
448 print(f"{prog_tag} loading {taxid_acc_tabular_f} file") | |
449 taxidlist = load_taxids(taxid_acc_in_f) | |
450 print(f"{prog_tag} end loading") | |
451 | |
452 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
453 accnrlist, | |
454 acc_out_f) | |
455 print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list") | |
456 sys.exit() | |
457 # -------------------------------------------------------------------------- | |
458 | |
459 # -------------------------------------------------------------------------- | |
460 # MAIN | |
461 # -------------------------------------------------------------------------- | |
462 ##### MAIN | |
463 def __main__(): | |
464 # load taxid_acc file | |
465 load_taxids(taxid_acc_tabular_f) | |
466 # check in ncbi taxonomy which acc number are in and out of given taxid | |
467 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
468 accnrlist, | |
469 acc_out_f) | |
470 # -------------------------------------------------------------------------- | |
471 #### MAIN END | |
472 if __name__ == "__main__": __main__() | |
473 |