Mercurial > repos > p.lucas > taxid_genusexpand_taxid2acc
annotate TAXID_genusexpand_taxid2acc.py @ 9:7e75c71e3c8e draft
Uploaded 13 03 23 fin def load_taxids fix bug to avoid error with empty line
author | p.lucas |
---|---|
date | Mon, 13 Mar 2023 16:16:49 +0000 |
parents | 8c9418230a5a |
children | 5a4ac43fea68 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python3 |
2 # -*- coding: utf-8 -*- | |
3 ### | |
4 # USE PYTHON3 | |
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads) | |
6 # provide 2 files: | |
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level) | |
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level) | |
9 ### | |
10 | |
11 ### Libraries to import: | |
12 import argparse, os, sys, csv, warnings, re, itertools, operator | |
13 from os import path | |
14 import ncbi_genome_download as ngd | |
15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) | |
16 from ete3 import NCBITaxa | |
17 | |
18 # to be able to report line number in error messages | |
19 import inspect | |
20 frame = inspect.currentframe() | |
21 | |
22 # debug | |
23 b_test_load_taxids = False # ok 2023 03 07 | |
24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07 | |
25 | |
26 prog_tag = '[' + os.path.basename(__file__) + ']' | |
27 | |
28 # boolean to know if we dowload ncbi taxonomy file in current env | |
29 b_load_ncbi_tax_f = False | |
30 | |
31 # list of interesting taxid (fathers) | |
32 taxidlist_f = '' | |
33 taxidlist = [] | |
34 accnrlist = [] | |
35 | |
36 # order = -4 | |
37 # family or clade = -3 | |
38 # subtribe or genus = -2 | |
39 curr_index_in_lineage = -2 | |
40 min_index_in_lineage = -4 | |
41 | |
42 # boolean to know if we download ncbi taxonomy file in current env | |
43 b_load_ncbi_tax_f = False | |
44 b_test_all = False | |
45 b_test = False | |
46 b_acc_in_f = False | |
47 b_acc_out_f = False | |
48 | |
49 b_verbose = False | |
50 | |
51 # rank = '' # rank level retained by user | |
52 # rank_num = index of rank retained by user | |
53 | |
54 # # set to check that provided rank exist to be sure to be able to use it | |
55 # ranks = { | |
56 # 'superkingdom' => 0, | |
57 # # 'clade', # no, several clade, name is too generic | |
58 # 'kingdom' => 1, | |
59 # 'phylum' => 2, | |
60 # 'subphylum' => 3, | |
61 # 'superclass' => 4, | |
62 # 'class' => 5, | |
63 # 'superorder' => 6, | |
64 # 'order' => 7, | |
65 # 'suborder' => 8, | |
66 # 'infraorder' => 9, | |
67 # 'parvorder' => 10, | |
68 # 'superfamily' => 11, | |
69 # 'family' => 12, | |
70 # 'subfamily' => 13, | |
71 # 'genus' => 14, | |
72 # 'species' => 15, | |
73 # 'subspecies' => 16 | |
74 # } | |
75 | |
76 parser = argparse.ArgumentParser() | |
77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f', | |
78 help="taxid acc_number list in tsv (tabular separated at each line)", | |
79 metavar="FILE") | |
80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f', | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree", |
0 | 82 metavar="FILE") |
83 # parser.add_argument("-r", "--rank", dest='rank', | |
84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list", | |
85 # action="store_const") | |
86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f', | |
87 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree", | |
88 metavar="FILE") | |
89 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f', | |
90 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run", | |
91 action='store_true') | |
92 parser.add_argument("-z", "--test_all", dest='b_test_all', | |
93 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script", | |
94 action='store_true') | |
95 parser.add_argument("-v", "--verbose", dest='b_verbose', | |
96 help="[Optional] To have details on records when running", | |
97 action='store_true') | |
98 parser.set_defaults(b_load_ncbi_tax_f=False) | |
99 parser.set_defaults(b_test_all=False) | |
100 parser.set_defaults(b_verbose=False) | |
101 | |
102 # get absolute path in case of files | |
103 args = parser.parse_args() | |
104 | |
105 # ------------------------------------------- | |
106 # check arguments | |
107 b_test_all = args.b_test_all | |
108 | |
109 if b_test_all: | |
110 b_test_load_taxids = False | |
111 b_test_add_host_chr_taxids_accnr_from_ori_list = True | |
112 b_test = True | |
113 b_acc_in_f = True | |
114 b_acc_out_f = True | |
115 else: | |
116 b_test = (b_test_load_taxids or | |
117 b_test_add_host_chr_taxids_accnr_from_ori_list) | |
118 | |
119 if ((not b_test)and | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
120 ((len(sys.argv) < 2) or (len(sys.argv) > 5))): |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
121 print("\n".join([ |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
122 "Aim: find accession numbers of complete genomes related to provided taxids.", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
123 "If not found at species level, try at upper taxonomic level until order.", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
124 "Retains only 1 complete genome is several available:", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
125 " - the one with the highest version number, if not sufficient", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
126 " - the one with the highest accession number", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
127 "To use this scripts, run:", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
128 "conda activate TAXID_genusexpand_taxid2acc", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
129 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
130 " ", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
131 "Then you won't need --test_all --load_ncbi_tax_f options\n\n", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
132 "Then, as an example:\n\n", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
133 ' '.join(['./TAXID_genusexpand_taxid2acc.py', |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
134 '-i taxid_accnr_list.tsv', |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
135 '-o accnr_out_list.txt']),"\n\n" ])) |
0 | 136 parser.print_help() |
137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + | |
138 " arguments, exit line "+str(frame.f_lineno)) | |
139 sys.exit(0) | |
140 | |
141 # print('args:', args) | |
142 # if(not b_test): | |
143 if args.ncbi_tax_f is not None: | |
144 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f) | |
145 else: | |
146 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" | |
147 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") | |
148 if args.taxid_acc_in_f is not None: | |
7 | 149 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f) |
0 | 150 b_acc_in_f = True |
151 elif(not b_test): | |
152 sys.exit("[Error] You must provide taxid_acc_in_f") | |
153 if args.acc_out_f is not None: | |
154 acc_out_f = os.path.abspath(args.acc_out_f) | |
155 b_acc_out_f = True | |
156 elif(not b_test): | |
157 sys.exit("-acc_out_f <accnr_file>n must be provided\n") | |
158 # if args.rank is not None: | |
159 # rank = 'genus' | |
160 # else: | |
161 # rank = args.rank | |
162 | |
163 if args.b_verbose is not None: | |
164 b_verbose = int(args.b_verbose) | |
165 | |
166 if(not b_test): | |
167 if (not b_acc_in_f) and (not b_acc_out_f): | |
168 sys.exit(prog_tag + "[Error] You must provide either --acc_f <file> and -acc_out_f <file>") | |
169 | |
170 # # store index of the rank expected by user | |
171 # rank_num = ranks{ rank } | |
172 | |
173 # -------------------------------------------------------------------------- | |
174 # to sort uniq, for a list, only need to add list conversion | |
175 # -------------------------------------------------------------------------- | |
176 mapper= map # Python ≥ 3 | |
177 def sort_uniq(sequence): | |
178 return mapper( | |
179 operator.itemgetter(0), | |
180 itertools.groupby(sorted(sequence))) | |
181 # -------------------------------------------------------------------------- | |
182 | |
183 # -------------------------------------------------------------------------- | |
184 # load taxid acc list, return taxidlist | |
185 # -------------------------------------------------------------------------- | |
186 def load_taxids(taxid_acc_tabular_f): | |
187 | |
188 if not path.exists(taxid_acc_tabular_f): | |
189 sys.exit("Error " + taxid_acc_tabular_f + | |
190 " file does not exist, line "+ str(sys._getframe().f_lineno) ) | |
191 | |
192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " | |
193 | |
194 for line in os.popen(cmd).readlines(): | |
9
7e75c71e3c8e
Uploaded 13 03 23 fin def load_taxids fix bug to avoid error with empty line
p.lucas
parents:
8
diff
changeset
|
195 if line != "": |
7e75c71e3c8e
Uploaded 13 03 23 fin def load_taxids fix bug to avoid error with empty line
p.lucas
parents:
8
diff
changeset
|
196 continue |
0 | 197 k, v = line.rstrip().split() |
198 taxidlist.append(k) | |
199 accnrlist.append(v) | |
200 | |
201 return taxidlist | |
202 # -------------------------------------------------------------------------- | |
203 | |
204 # test load_taxids function | |
205 # display taxidlist, then exit | |
206 if b_test_load_taxids: | |
207 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | |
208 print("START b_test_load_taxids") | |
209 print("loading "+taxid_acc_tabular_f+" file") | |
210 taxidlist = load_taxids(taxid_acc_tabular_f) | |
211 for i in range(len(taxidlist)): | |
212 print(f"{taxidlist[i]}\t{accnrlist[i]}") | |
213 print("END b_test_load_taxids") | |
214 if not b_test_add_host_chr_taxids_accnr_from_ori_list: | |
215 sys.exit() | |
216 # -------------------------------------------------------------------------- | |
217 | |
218 # # -------------------------------------------------------------------------- | |
219 # # needs internet connexion, not possible | |
220 # # -------------------------------------------------------------------------- | |
221 # def get_leave_taxid_from_acc_nr(accnrlist): | |
222 | |
223 # # deduce a list of taxid from a list of accession numbers | |
224 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u" | |
225 # for line in os.popen(cmd).readlines(): | |
226 # taxidlist.append(line.rstrip()) | |
227 | |
228 # return taxidlist | |
229 # # -------------------------------------------------------------------------- | |
230 | |
231 # -------------------------------------------------------------------------- | |
232 # function to retain the most recent acc nr for host complete genome found: | |
233 # - return acc nr of most recent complete genome | |
234 # - print accnr species and name retained | |
235 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp | |
236 # -------------------------------------------------------------------------- | |
237 def retain_1accnr(accnrlisttmp, speciestmp, nametmp): | |
238 | |
239 max_accnr_version = 0 | |
240 curr_accnr_version = 0 | |
241 max_accnr_nr = 0 | |
242 curr_accnr_nr = 0 | |
243 kept_accnr_i = 0 | |
244 p = re.compile(".*?(\d+)\.(\d+)$") | |
245 | |
246 # print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}") | |
247 | |
248 for i in range(len(accnrlisttmp)): | |
249 m = p.match( accnrlisttmp[i] ) | |
250 if m: | |
251 # print('Match found: ', m.group(2)) | |
252 curr_accnr_version = int(m.group(2)) | |
253 accnr_nr = int(m.group(1)) | |
254 if curr_accnr_version > max_accnr_version: | |
255 max_accnr_version = curr_accnr_version | |
256 kept_accnr_i = i | |
257 # print(f"record kept_accnr_i:{kept_accnr_i}") | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
258 elif(( curr_accnr_version == max_accnr_version)and |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
259 (curr_accnr_nr > max_accnr_nr)): |
0 | 260 max_accnr_nr = curr_accnr_nr |
261 kept_accnr_i = i | |
262 # print(f"record kept_accnr_i:{kept_accnr_i}") | |
263 | |
264 else: | |
265 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[i]}") | |
266 | |
267 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}") | |
268 kept_accn = accnrlisttmp[kept_accnr_i] | |
269 | |
270 return kept_accn | |
271 # -------------------------------------------------------------------------- | |
272 | |
273 # -------------------------------------------------------------------------- | |
274 # procedure to find complete genome closely related to current taxid | |
275 # goes upper in taxonomy if nothing found until order | |
276 # -------------------------------------------------------------------------- | |
277 def ngd_upper_lineage(curr_index_in_lineage, | |
278 lineage, | |
279 ncbi, | |
280 accnrlisttmp, # current working list | |
281 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
282 speciestmp, | |
283 nametmp | |
284 ): | |
285 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") | |
286 | |
287 # deduce up rank, search complet genome/chr in | |
288 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species | |
289 rank = ncbi.get_rank([lineage[curr_index_in_lineage]]) | |
290 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]]) | |
291 print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}") | |
292 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid, | |
293 intermediate_nodes=False, | |
294 collapse_subspecies=False, | |
295 return_tree=False | |
296 ) | |
297 # int conversion to strings | |
298 leaves_taxids = list(map(str, leaves_taxids)) | |
299 leaves_taxids_list = ','.join(leaves_taxids) | |
300 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | |
301 # print(f"{prog_tag} cmd:{cmd}") | |
302 | |
303 # specific to retain_1accn to avoid lists are crashed by other ngd call | |
304 accnrlisttmp_r = [] | |
305 speciestmp_r = [] | |
306 nametmp_r = [] | |
307 for line in os.popen(cmd).readlines(): | |
308 | |
309 if not re.match("^Considering", line): | |
310 # print(f"line:{line.rstrip()}") | |
311 if re.match("^(?:ERROR|Error): No downloads", line): | |
312 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") | |
313 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted | |
314 curr_index_in_lineage = curr_index_in_lineage - 1 | |
315 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | |
316 return ngd_upper_lineage(curr_index_in_lineage, | |
317 lineage, | |
318 ncbi, | |
319 accnrlisttmp, # current working list | |
320 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
321 speciestmp, | |
322 nametmp | |
323 ) | |
324 else: | |
325 acc_nr, species, name = line.rstrip().split("\t") | |
326 accnrlisttmp_r.append(acc_nr) | |
327 speciestmp_r.append(species) | |
328 nametmp_r.append(name) | |
329 if b_verbose: | |
330 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | |
331 | |
332 # retain only the most recent complete genome for current treated taxid | |
333 return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r) | |
334 | |
335 # else: | |
336 # print(f"line matching Considering:{line}") | |
337 # -------------------------------------------------------------------------- | |
338 | |
339 # -------------------------------------------------------------------------- | |
340 # read taxids, deduce complete genomes available in genblank, provides in output file | |
341 # the acc number in addition to those already listed | |
342 # -------------------------------------------------------------------------- | |
343 def add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
344 accnrlist, | |
345 acc_out_f): | |
346 | |
347 # store all accnr found for complete genome of current taxid (or from same family/order) | |
348 # the aim is to keep only the most recent/complete | |
349 accnrlisttmp = [] | |
350 speciestmp = [] | |
351 nametmp = [] | |
352 | |
353 # get host complete genome when found using ncbi_genome_download | |
354 taxids_list=','.join(taxidlist) | |
355 | |
356 # # ------------------------------------------ | |
357 # # ncbi-genome-download as a library | |
358 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' | |
359 # ngd.download(section='genbank', | |
360 # taxids=taxids_list, | |
361 # assembly_levels='chromosome', | |
362 # flat_output=True, | |
363 # output=ngd_out_f, | |
364 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate', | |
365 # dry_run=True | |
366 # ) | |
367 | |
368 | |
369 # cmd = "cut -f 1,2 "+ngd_out_f | |
370 | |
371 # for line in os.popen(cmd).readlines(): | |
372 # acc_nr, species = line.rstrip().split() | |
373 # accnrlist.append(acc_nr) | |
374 | |
375 # if b_verbose: | |
376 # print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr}") | |
377 # # ------------------------------------------ | |
378 | |
379 # ------------------------------------------ | |
380 # ncbi-genome-download as executable script | |
381 # ------------------------------------------ | |
382 | |
383 # load NCBITaxa | |
384 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory) | |
385 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f) | |
386 ncbi = NCBITaxa(dbfile=ncbi_tax_f) | |
387 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f: | |
388 try: | |
389 ncbi.update_taxonomy_database() | |
390 except: | |
391 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") | |
392 | |
393 for taxid_u in taxidlist: | |
394 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | |
395 for line in os.popen(cmd).readlines(): | |
396 # ERROR: No downloads matched your filter. Please check your options. | |
397 if re.match("^(?:ERROR|Error): No downloads", line): | |
398 # get complete lineage: accept ONLY leave taxid? (species) | |
399 lineage = ncbi.get_lineage(int(taxid_u)) | |
400 name = ncbi.translate_to_names(lineage) | |
401 if b_verbose: | |
402 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") | |
403 | |
404 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome | |
405 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | |
406 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, | |
407 lineage, | |
408 ncbi, | |
409 accnrlisttmp, # current working list | |
410 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
411 speciestmp, | |
412 nametmp | |
413 ) | |
414 accnrlist.append( new_acc_nr ) | |
415 | |
416 # initialize for next search | |
417 accnrlisttmp = [] | |
418 speciestmp = [] | |
419 nametmp = [] | |
420 | |
421 elif not re.match("^Considering", line): | |
422 # print(f"line:{line.rstrip()}") | |
423 acc_nr, species, name = line.rstrip().split("\t") | |
424 accnrlisttmp.append(acc_nr) | |
425 speciestmp.append(species) | |
426 nametmp.append(name) | |
427 if b_verbose: | |
428 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | |
429 | |
430 # retain only the most recent complete genome for current treated taxid | |
431 if len(accnrlisttmp): | |
432 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) | |
433 | |
434 # remove redundant accnr | |
435 accnrlist = list(sort_uniq(accnrlist)) | |
436 with open(acc_out_f, "w") as record_file: | |
437 for accnr in accnrlist: | |
438 record_file.write("%s\n" % (accnr)) | |
439 # ------------------------------------------ | |
440 | |
441 print(f"{prog_tag} {acc_out_f} file created") | |
442 | |
443 # -------------------------------------------------------------------------- | |
444 # test | |
445 if b_test_add_host_chr_taxids_accnr_from_ori_list: | |
446 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | |
447 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv' | |
448 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv' | |
449 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") | |
450 print(f"{prog_tag} loading {taxid_acc_tabular_f} file") | |
451 taxidlist = load_taxids(taxid_acc_in_f) | |
452 print(f"{prog_tag} end loading") | |
453 | |
454 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
455 accnrlist, | |
456 acc_out_f) | |
457 print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list") | |
458 sys.exit() | |
459 # -------------------------------------------------------------------------- | |
460 | |
461 # -------------------------------------------------------------------------- | |
462 # MAIN | |
463 # -------------------------------------------------------------------------- | |
464 ##### MAIN | |
465 def __main__(): | |
466 # load taxid_acc file | |
8 | 467 load_taxids(taxid_acc_in_f) |
0 | 468 # check in ncbi taxonomy which acc number are in and out of given taxid |
469 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
470 accnrlist, | |
471 acc_out_f) | |
472 # -------------------------------------------------------------------------- | |
473 #### MAIN END | |
474 if __name__ == "__main__": __main__() | |
475 |