Mercurial > repos > p.lucas > taxid_genusexpand_taxid2acc
annotate TAXID_genusexpand_taxid2acc.py @ 10:5a4ac43fea68 draft
Uploaded 14 03 23 def load_taxids fix bug when empty line
author | p.lucas |
---|---|
date | Tue, 14 Mar 2023 08:07:14 +0000 |
parents | 7e75c71e3c8e |
children | 3d116861e380 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python3 |
2 # -*- coding: utf-8 -*- | |
3 ### | |
4 # USE PYTHON3 | |
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads) | |
6 # provide 2 files: | |
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level) | |
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level) | |
9 ### | |
10 | |
11 ### Libraries to import: | |
12 import argparse, os, sys, csv, warnings, re, itertools, operator | |
13 from os import path | |
14 import ncbi_genome_download as ngd | |
15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) | |
16 from ete3 import NCBITaxa | |
17 | |
18 # to be able to report line number in error messages | |
19 import inspect | |
20 frame = inspect.currentframe() | |
21 | |
22 # debug | |
23 b_test_load_taxids = False # ok 2023 03 07 | |
24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07 | |
25 | |
26 prog_tag = '[' + os.path.basename(__file__) + ']' | |
27 | |
28 # boolean to know if we dowload ncbi taxonomy file in current env | |
29 b_load_ncbi_tax_f = False | |
30 | |
31 # list of interesting taxid (fathers) | |
32 taxidlist_f = '' | |
33 taxidlist = [] | |
34 accnrlist = [] | |
35 | |
36 # order = -4 | |
37 # family or clade = -3 | |
38 # subtribe or genus = -2 | |
39 curr_index_in_lineage = -2 | |
40 min_index_in_lineage = -4 | |
41 | |
42 # boolean to know if we download ncbi taxonomy file in current env | |
43 b_load_ncbi_tax_f = False | |
44 b_test_all = False | |
45 b_test = False | |
46 b_acc_in_f = False | |
47 b_acc_out_f = False | |
48 | |
49 b_verbose = False | |
50 | |
51 # rank = '' # rank level retained by user | |
52 # rank_num = index of rank retained by user | |
53 | |
54 # # set to check that provided rank exist to be sure to be able to use it | |
55 # ranks = { | |
56 # 'superkingdom' => 0, | |
57 # # 'clade', # no, several clade, name is too generic | |
58 # 'kingdom' => 1, | |
59 # 'phylum' => 2, | |
60 # 'subphylum' => 3, | |
61 # 'superclass' => 4, | |
62 # 'class' => 5, | |
63 # 'superorder' => 6, | |
64 # 'order' => 7, | |
65 # 'suborder' => 8, | |
66 # 'infraorder' => 9, | |
67 # 'parvorder' => 10, | |
68 # 'superfamily' => 11, | |
69 # 'family' => 12, | |
70 # 'subfamily' => 13, | |
71 # 'genus' => 14, | |
72 # 'species' => 15, | |
73 # 'subspecies' => 16 | |
74 # } | |
75 | |
76 parser = argparse.ArgumentParser() | |
77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f', | |
78 help="taxid acc_number list in tsv (tabular separated at each line)", | |
79 metavar="FILE") | |
80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f', | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree", |
0 | 82 metavar="FILE") |
83 # parser.add_argument("-r", "--rank", dest='rank', | |
84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list", | |
85 # action="store_const") | |
86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f', | |
87 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree", | |
88 metavar="FILE") | |
89 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f', | |
90 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run", | |
91 action='store_true') | |
92 parser.add_argument("-z", "--test_all", dest='b_test_all', | |
93 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script", | |
94 action='store_true') | |
95 parser.add_argument("-v", "--verbose", dest='b_verbose', | |
96 help="[Optional] To have details on records when running", | |
97 action='store_true') | |
98 parser.set_defaults(b_load_ncbi_tax_f=False) | |
99 parser.set_defaults(b_test_all=False) | |
100 parser.set_defaults(b_verbose=False) | |
101 | |
102 # get absolute path in case of files | |
103 args = parser.parse_args() | |
104 | |
105 # ------------------------------------------- | |
106 # check arguments | |
107 b_test_all = args.b_test_all | |
108 | |
109 if b_test_all: | |
110 b_test_load_taxids = False | |
111 b_test_add_host_chr_taxids_accnr_from_ori_list = True | |
112 b_test = True | |
113 b_acc_in_f = True | |
114 b_acc_out_f = True | |
115 else: | |
116 b_test = (b_test_load_taxids or | |
117 b_test_add_host_chr_taxids_accnr_from_ori_list) | |
118 | |
119 if ((not b_test)and | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
120 ((len(sys.argv) < 2) or (len(sys.argv) > 5))): |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
121 print("\n".join([ |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
122 "Aim: find accession numbers of complete genomes related to provided taxids.", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
123 "If not found at species level, try at upper taxonomic level until order.", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
124 "Retains only 1 complete genome is several available:", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
125 " - the one with the highest version number, if not sufficient", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
126 " - the one with the highest accession number", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
127 "To use this scripts, run:", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
128 "conda activate TAXID_genusexpand_taxid2acc", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
129 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
130 " ", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
131 "Then you won't need --test_all --load_ncbi_tax_f options\n\n", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
132 "Then, as an example:\n\n", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
133 ' '.join(['./TAXID_genusexpand_taxid2acc.py', |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
134 '-i taxid_accnr_list.tsv', |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
135 '-o accnr_out_list.txt']),"\n\n" ])) |
0 | 136 parser.print_help() |
137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + | |
138 " arguments, exit line "+str(frame.f_lineno)) | |
139 sys.exit(0) | |
140 | |
141 # print('args:', args) | |
142 # if(not b_test): | |
143 if args.ncbi_tax_f is not None: | |
144 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f) | |
145 else: | |
146 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" | |
147 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") | |
148 if args.taxid_acc_in_f is not None: | |
7 | 149 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f) |
0 | 150 b_acc_in_f = True |
151 elif(not b_test): | |
152 sys.exit("[Error] You must provide taxid_acc_in_f") | |
153 if args.acc_out_f is not None: | |
154 acc_out_f = os.path.abspath(args.acc_out_f) | |
155 b_acc_out_f = True | |
156 elif(not b_test): | |
157 sys.exit("-acc_out_f <accnr_file>n must be provided\n") | |
158 # if args.rank is not None: | |
159 # rank = 'genus' | |
160 # else: | |
161 # rank = args.rank | |
162 | |
163 if args.b_verbose is not None: | |
164 b_verbose = int(args.b_verbose) | |
165 | |
166 if(not b_test): | |
167 if (not b_acc_in_f) and (not b_acc_out_f): | |
168 sys.exit(prog_tag + "[Error] You must provide either --acc_f <file> and -acc_out_f <file>") | |
169 | |
170 # # store index of the rank expected by user | |
171 # rank_num = ranks{ rank } | |
172 | |
173 # -------------------------------------------------------------------------- | |
174 # to sort uniq, for a list, only need to add list conversion | |
175 # -------------------------------------------------------------------------- | |
176 mapper= map # Python ≥ 3 | |
177 def sort_uniq(sequence): | |
178 return mapper( | |
179 operator.itemgetter(0), | |
180 itertools.groupby(sorted(sequence))) | |
181 # -------------------------------------------------------------------------- | |
182 | |
183 # -------------------------------------------------------------------------- | |
184 # load taxid acc list, return taxidlist | |
185 # -------------------------------------------------------------------------- | |
186 def load_taxids(taxid_acc_tabular_f): | |
187 | |
188 if not path.exists(taxid_acc_tabular_f): | |
189 sys.exit("Error " + taxid_acc_tabular_f + | |
190 " file does not exist, line "+ str(sys._getframe().f_lineno) ) | |
191 | |
192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " | |
193 | |
194 for line in os.popen(cmd).readlines(): | |
9
7e75c71e3c8e
Uploaded 13 03 23 fin def load_taxids fix bug to avoid error with empty line
p.lucas
parents:
8
diff
changeset
|
195 if line != "": |
10
5a4ac43fea68
Uploaded 14 03 23 def load_taxids fix bug when empty line
p.lucas
parents:
9
diff
changeset
|
196 k, v = line.rstrip().split() |
5a4ac43fea68
Uploaded 14 03 23 def load_taxids fix bug when empty line
p.lucas
parents:
9
diff
changeset
|
197 taxidlist.append(k) |
5a4ac43fea68
Uploaded 14 03 23 def load_taxids fix bug when empty line
p.lucas
parents:
9
diff
changeset
|
198 accnrlist.append(v) |
0 | 199 |
200 return taxidlist | |
201 # -------------------------------------------------------------------------- | |
202 | |
203 # test load_taxids function | |
204 # display taxidlist, then exit | |
205 if b_test_load_taxids: | |
206 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | |
207 print("START b_test_load_taxids") | |
208 print("loading "+taxid_acc_tabular_f+" file") | |
209 taxidlist = load_taxids(taxid_acc_tabular_f) | |
210 for i in range(len(taxidlist)): | |
211 print(f"{taxidlist[i]}\t{accnrlist[i]}") | |
212 print("END b_test_load_taxids") | |
213 if not b_test_add_host_chr_taxids_accnr_from_ori_list: | |
214 sys.exit() | |
215 # -------------------------------------------------------------------------- | |
216 | |
217 # # -------------------------------------------------------------------------- | |
218 # # needs internet connexion, not possible | |
219 # # -------------------------------------------------------------------------- | |
220 # def get_leave_taxid_from_acc_nr(accnrlist): | |
221 | |
222 # # deduce a list of taxid from a list of accession numbers | |
223 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u" | |
224 # for line in os.popen(cmd).readlines(): | |
225 # taxidlist.append(line.rstrip()) | |
226 | |
227 # return taxidlist | |
228 # # -------------------------------------------------------------------------- | |
229 | |
230 # -------------------------------------------------------------------------- | |
231 # function to retain the most recent acc nr for host complete genome found: | |
232 # - return acc nr of most recent complete genome | |
233 # - print accnr species and name retained | |
234 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp | |
235 # -------------------------------------------------------------------------- | |
236 def retain_1accnr(accnrlisttmp, speciestmp, nametmp): | |
237 | |
238 max_accnr_version = 0 | |
239 curr_accnr_version = 0 | |
240 max_accnr_nr = 0 | |
241 curr_accnr_nr = 0 | |
242 kept_accnr_i = 0 | |
243 p = re.compile(".*?(\d+)\.(\d+)$") | |
244 | |
245 # print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}") | |
246 | |
247 for i in range(len(accnrlisttmp)): | |
248 m = p.match( accnrlisttmp[i] ) | |
249 if m: | |
250 # print('Match found: ', m.group(2)) | |
251 curr_accnr_version = int(m.group(2)) | |
252 accnr_nr = int(m.group(1)) | |
253 if curr_accnr_version > max_accnr_version: | |
254 max_accnr_version = curr_accnr_version | |
255 kept_accnr_i = i | |
256 # print(f"record kept_accnr_i:{kept_accnr_i}") | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
257 elif(( curr_accnr_version == max_accnr_version)and |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
258 (curr_accnr_nr > max_accnr_nr)): |
0 | 259 max_accnr_nr = curr_accnr_nr |
260 kept_accnr_i = i | |
261 # print(f"record kept_accnr_i:{kept_accnr_i}") | |
262 | |
263 else: | |
264 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[i]}") | |
265 | |
266 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}") | |
267 kept_accn = accnrlisttmp[kept_accnr_i] | |
268 | |
269 return kept_accn | |
270 # -------------------------------------------------------------------------- | |
271 | |
272 # -------------------------------------------------------------------------- | |
273 # procedure to find complete genome closely related to current taxid | |
274 # goes upper in taxonomy if nothing found until order | |
275 # -------------------------------------------------------------------------- | |
276 def ngd_upper_lineage(curr_index_in_lineage, | |
277 lineage, | |
278 ncbi, | |
279 accnrlisttmp, # current working list | |
280 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
281 speciestmp, | |
282 nametmp | |
283 ): | |
284 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") | |
285 | |
286 # deduce up rank, search complet genome/chr in | |
287 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species | |
288 rank = ncbi.get_rank([lineage[curr_index_in_lineage]]) | |
289 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]]) | |
290 print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}") | |
291 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid, | |
292 intermediate_nodes=False, | |
293 collapse_subspecies=False, | |
294 return_tree=False | |
295 ) | |
296 # int conversion to strings | |
297 leaves_taxids = list(map(str, leaves_taxids)) | |
298 leaves_taxids_list = ','.join(leaves_taxids) | |
299 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | |
300 # print(f"{prog_tag} cmd:{cmd}") | |
301 | |
302 # specific to retain_1accn to avoid lists are crashed by other ngd call | |
303 accnrlisttmp_r = [] | |
304 speciestmp_r = [] | |
305 nametmp_r = [] | |
306 for line in os.popen(cmd).readlines(): | |
307 | |
308 if not re.match("^Considering", line): | |
309 # print(f"line:{line.rstrip()}") | |
310 if re.match("^(?:ERROR|Error): No downloads", line): | |
311 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") | |
312 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted | |
313 curr_index_in_lineage = curr_index_in_lineage - 1 | |
314 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | |
315 return ngd_upper_lineage(curr_index_in_lineage, | |
316 lineage, | |
317 ncbi, | |
318 accnrlisttmp, # current working list | |
319 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
320 speciestmp, | |
321 nametmp | |
322 ) | |
323 else: | |
324 acc_nr, species, name = line.rstrip().split("\t") | |
325 accnrlisttmp_r.append(acc_nr) | |
326 speciestmp_r.append(species) | |
327 nametmp_r.append(name) | |
328 if b_verbose: | |
329 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | |
330 | |
331 # retain only the most recent complete genome for current treated taxid | |
332 return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r) | |
333 | |
334 # else: | |
335 # print(f"line matching Considering:{line}") | |
336 # -------------------------------------------------------------------------- | |
337 | |
338 # -------------------------------------------------------------------------- | |
339 # read taxids, deduce complete genomes available in genblank, provides in output file | |
340 # the acc number in addition to those already listed | |
341 # -------------------------------------------------------------------------- | |
342 def add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
343 accnrlist, | |
344 acc_out_f): | |
345 | |
346 # store all accnr found for complete genome of current taxid (or from same family/order) | |
347 # the aim is to keep only the most recent/complete | |
348 accnrlisttmp = [] | |
349 speciestmp = [] | |
350 nametmp = [] | |
351 | |
352 # get host complete genome when found using ncbi_genome_download | |
353 taxids_list=','.join(taxidlist) | |
354 | |
355 # # ------------------------------------------ | |
356 # # ncbi-genome-download as a library | |
357 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' | |
358 # ngd.download(section='genbank', | |
359 # taxids=taxids_list, | |
360 # assembly_levels='chromosome', | |
361 # flat_output=True, | |
362 # output=ngd_out_f, | |
363 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate', | |
364 # dry_run=True | |
365 # ) | |
366 | |
367 | |
368 # cmd = "cut -f 1,2 "+ngd_out_f | |
369 | |
370 # for line in os.popen(cmd).readlines(): | |
371 # acc_nr, species = line.rstrip().split() | |
372 # accnrlist.append(acc_nr) | |
373 | |
374 # if b_verbose: | |
375 # print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr}") | |
376 # # ------------------------------------------ | |
377 | |
378 # ------------------------------------------ | |
379 # ncbi-genome-download as executable script | |
380 # ------------------------------------------ | |
381 | |
382 # load NCBITaxa | |
383 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory) | |
384 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f) | |
385 ncbi = NCBITaxa(dbfile=ncbi_tax_f) | |
386 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f: | |
387 try: | |
388 ncbi.update_taxonomy_database() | |
389 except: | |
390 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") | |
391 | |
392 for taxid_u in taxidlist: | |
393 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | |
394 for line in os.popen(cmd).readlines(): | |
395 # ERROR: No downloads matched your filter. Please check your options. | |
396 if re.match("^(?:ERROR|Error): No downloads", line): | |
397 # get complete lineage: accept ONLY leave taxid? (species) | |
398 lineage = ncbi.get_lineage(int(taxid_u)) | |
399 name = ncbi.translate_to_names(lineage) | |
400 if b_verbose: | |
401 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") | |
402 | |
403 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome | |
404 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | |
405 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, | |
406 lineage, | |
407 ncbi, | |
408 accnrlisttmp, # current working list | |
409 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
410 speciestmp, | |
411 nametmp | |
412 ) | |
413 accnrlist.append( new_acc_nr ) | |
414 | |
415 # initialize for next search | |
416 accnrlisttmp = [] | |
417 speciestmp = [] | |
418 nametmp = [] | |
419 | |
420 elif not re.match("^Considering", line): | |
421 # print(f"line:{line.rstrip()}") | |
422 acc_nr, species, name = line.rstrip().split("\t") | |
423 accnrlisttmp.append(acc_nr) | |
424 speciestmp.append(species) | |
425 nametmp.append(name) | |
426 if b_verbose: | |
427 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | |
428 | |
429 # retain only the most recent complete genome for current treated taxid | |
430 if len(accnrlisttmp): | |
431 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) | |
432 | |
433 # remove redundant accnr | |
434 accnrlist = list(sort_uniq(accnrlist)) | |
435 with open(acc_out_f, "w") as record_file: | |
436 for accnr in accnrlist: | |
437 record_file.write("%s\n" % (accnr)) | |
438 # ------------------------------------------ | |
439 | |
440 print(f"{prog_tag} {acc_out_f} file created") | |
441 | |
442 # -------------------------------------------------------------------------- | |
443 # test | |
444 if b_test_add_host_chr_taxids_accnr_from_ori_list: | |
445 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | |
446 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv' | |
447 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv' | |
448 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") | |
449 print(f"{prog_tag} loading {taxid_acc_tabular_f} file") | |
450 taxidlist = load_taxids(taxid_acc_in_f) | |
451 print(f"{prog_tag} end loading") | |
452 | |
453 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
454 accnrlist, | |
455 acc_out_f) | |
456 print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list") | |
457 sys.exit() | |
458 # -------------------------------------------------------------------------- | |
459 | |
460 # -------------------------------------------------------------------------- | |
461 # MAIN | |
462 # -------------------------------------------------------------------------- | |
463 ##### MAIN | |
464 def __main__(): | |
465 # load taxid_acc file | |
8 | 466 load_taxids(taxid_acc_in_f) |
0 | 467 # check in ncbi taxonomy which acc number are in and out of given taxid |
468 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
469 accnrlist, | |
470 acc_out_f) | |
471 # -------------------------------------------------------------------------- | |
472 #### MAIN END | |
473 if __name__ == "__main__": __main__() | |
474 |