Mercurial > repos > p.lucas > taxid_genusexpand_taxid2acc
annotate TAXID_genusexpand_taxid2acc.py @ 12:19e175a84d0e draft
Uploaded updated python script
author | p.lucas |
---|---|
date | Thu, 21 Sep 2023 15:19:55 +0000 |
parents | 3d116861e380 |
children | 932ba9e04f3a |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python3 |
2 # -*- coding: utf-8 -*- | |
3 ### | |
4 # USE PYTHON3 | |
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads) | |
6 # provide 2 files: | |
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level) | |
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level) | |
9 ### | |
10 | |
11 ### Libraries to import: | |
12 import argparse, os, sys, csv, warnings, re, itertools, operator | |
12 | 13 #from subprocess import Popen,PIPE |
14 import subprocess | |
0 | 15 from os import path |
16 import ncbi_genome_download as ngd | |
17 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) | |
18 from ete3 import NCBITaxa | |
19 | |
20 # to be able to report line number in error messages | |
21 import inspect | |
12 | 22 def lineno(): |
23 """Returns the current line number in our program.""" | |
24 return str(inspect.currentframe().f_back.f_lineno) | |
25 | |
0 | 26 |
27 # debug | |
12 | 28 test_dir = 'test_TAXID_genusexpand_taxid2acc/' |
29 b_test_load_taxids = False # ok 2023 08 25 | |
30 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 08 25 | |
0 | 31 |
32 prog_tag = '[' + os.path.basename(__file__) + ']' | |
33 | |
34 # boolean to know if we dowload ncbi taxonomy file in current env | |
35 b_load_ncbi_tax_f = False | |
36 | |
37 # list of interesting taxid (fathers) | |
38 taxidlist_f = '' | |
39 taxidlist = [] | |
40 accnrlist = [] | |
41 | |
42 # order = -4 | |
43 # family or clade = -3 | |
44 # subtribe or genus = -2 | |
12 | 45 curr_index_in_lineage = -1 |
0 | 46 min_index_in_lineage = -4 |
47 | |
48 # boolean to know if we download ncbi taxonomy file in current env | |
49 b_load_ncbi_tax_f = False | |
50 b_test_all = False | |
51 b_test = False | |
52 b_acc_in_f = False | |
53 b_acc_out_f = False | |
54 | |
55 b_verbose = False | |
56 | |
12 | 57 # variables for ncbi-genome-download |
58 ncbigenomedownload_section = 'refseq' # genbank | |
59 organisms_to_search_in = 'vertebrate_other,vertebrate_mammalian,plant,invertebrate' | |
60 assembly_levels = 'complete,chromosome' | |
61 | |
0 | 62 # rank = '' # rank level retained by user |
63 # rank_num = index of rank retained by user | |
64 | |
65 # # set to check that provided rank exist to be sure to be able to use it | |
66 # ranks = { | |
67 # 'superkingdom' => 0, | |
68 # # 'clade', # no, several clade, name is too generic | |
69 # 'kingdom' => 1, | |
70 # 'phylum' => 2, | |
71 # 'subphylum' => 3, | |
72 # 'superclass' => 4, | |
73 # 'class' => 5, | |
74 # 'superorder' => 6, | |
75 # 'order' => 7, | |
76 # 'suborder' => 8, | |
77 # 'infraorder' => 9, | |
78 # 'parvorder' => 10, | |
79 # 'superfamily' => 11, | |
80 # 'family' => 12, | |
81 # 'subfamily' => 13, | |
82 # 'genus' => 14, | |
83 # 'species' => 15, | |
84 # 'subspecies' => 16 | |
85 # } | |
86 | |
87 parser = argparse.ArgumentParser() | |
88 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f', | |
89 help="taxid acc_number list in tsv (tabular separated at each line)", | |
90 metavar="FILE") | |
91 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f', | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
92 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree", |
0 | 93 metavar="FILE") |
94 # parser.add_argument("-r", "--rank", dest='rank', | |
95 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list", | |
96 # action="store_const") | |
97 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f', | |
98 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree", | |
99 metavar="FILE") | |
100 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f', | |
101 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run", | |
102 action='store_true') | |
103 parser.add_argument("-z", "--test_all", dest='b_test_all', | |
104 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script", | |
105 action='store_true') | |
106 parser.add_argument("-v", "--verbose", dest='b_verbose', | |
107 help="[Optional] To have details on records when running", | |
108 action='store_true') | |
109 parser.set_defaults(b_load_ncbi_tax_f=False) | |
110 parser.set_defaults(b_test_all=False) | |
111 parser.set_defaults(b_verbose=False) | |
112 | |
113 # get absolute path in case of files | |
114 args = parser.parse_args() | |
115 | |
116 # ------------------------------------------- | |
117 # check arguments | |
118 b_test_all = args.b_test_all | |
119 | |
120 if b_test_all: | |
121 b_test_load_taxids = False | |
122 b_test_add_host_chr_taxids_accnr_from_ori_list = True | |
123 b_test = True | |
124 b_acc_in_f = True | |
125 b_acc_out_f = True | |
126 else: | |
127 b_test = (b_test_load_taxids or | |
128 b_test_add_host_chr_taxids_accnr_from_ori_list) | |
129 | |
130 if ((not b_test)and | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
131 ((len(sys.argv) < 2) or (len(sys.argv) > 5))): |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
132 print("\n".join([ |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
133 "Aim: find accession numbers of complete genomes related to provided taxids.", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
134 "If not found at species level, try at upper taxonomic level until order.", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
135 "Retains only 1 complete genome is several available:", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
136 " - the one with the highest version number, if not sufficient", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
137 " - the one with the highest accession number", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
138 "To use this scripts, run:", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
139 "conda activate TAXID_genusexpand_taxid2acc", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
140 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
141 " ", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
142 "Then you won't need --test_all --load_ncbi_tax_f options\n\n", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
143 "Then, as an example:\n\n", |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
144 ' '.join(['./TAXID_genusexpand_taxid2acc.py', |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
145 '-i taxid_accnr_list.tsv', |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
146 '-o accnr_out_list.txt']),"\n\n" ])) |
0 | 147 parser.print_help() |
148 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + | |
12 | 149 " arguments, exit line "+lineno()) |
0 | 150 sys.exit(0) |
151 | |
152 # print('args:', args) | |
153 # if(not b_test): | |
154 if args.ncbi_tax_f is not None: | |
155 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f) | |
156 else: | |
157 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" | |
158 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") | |
159 if args.taxid_acc_in_f is not None: | |
12 | 160 taxid_acc_in_f = os.path.abspath(args.taxid_acc_f) |
0 | 161 b_acc_in_f = True |
162 elif(not b_test): | |
163 sys.exit("[Error] You must provide taxid_acc_in_f") | |
164 if args.acc_out_f is not None: | |
165 acc_out_f = os.path.abspath(args.acc_out_f) | |
166 b_acc_out_f = True | |
167 elif(not b_test): | |
168 sys.exit("-acc_out_f <accnr_file>n must be provided\n") | |
169 # if args.rank is not None: | |
170 # rank = 'genus' | |
171 # else: | |
172 # rank = args.rank | |
173 | |
174 if args.b_verbose is not None: | |
175 b_verbose = int(args.b_verbose) | |
176 | |
177 if(not b_test): | |
178 if (not b_acc_in_f) and (not b_acc_out_f): | |
179 sys.exit(prog_tag + "[Error] You must provide either --acc_f <file> and -acc_out_f <file>") | |
180 | |
181 # # store index of the rank expected by user | |
182 # rank_num = ranks{ rank } | |
183 | |
184 # -------------------------------------------------------------------------- | |
185 # to sort uniq, for a list, only need to add list conversion | |
186 # -------------------------------------------------------------------------- | |
187 mapper= map # Python ≥ 3 | |
12 | 188 def sort_uniq(sequence: list): |
0 | 189 return mapper( |
190 operator.itemgetter(0), | |
191 itertools.groupby(sorted(sequence))) | |
192 # -------------------------------------------------------------------------- | |
193 | |
194 # -------------------------------------------------------------------------- | |
12 | 195 # Procedure: load taxid acc list, return taxidlist |
0 | 196 # -------------------------------------------------------------------------- |
12 | 197 def load_taxids(taxid_acc_tabular_f: str): |
0 | 198 |
199 if not path.exists(taxid_acc_tabular_f): | |
200 sys.exit("Error " + taxid_acc_tabular_f + | |
12 | 201 " file does not exist, line "+ lineno() ) |
0 | 202 |
12 | 203 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort -u " |
0 | 204 |
205 for line in os.popen(cmd).readlines(): | |
11
3d116861e380
Uploaded 14 03 23 def load_taxids fix bug chacking empty line
p.lucas
parents:
10
diff
changeset
|
206 if line.rstrip() != "": |
12 | 207 k, v = line.rstrip().split() |
208 taxidlist.append(k) | |
209 accnrlist.append(v) | |
210 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") | |
0 | 211 |
212 # -------------------------------------------------------------------------- | |
213 | |
214 # test load_taxids function | |
215 # display taxidlist, then exit | |
216 if b_test_load_taxids: | |
12 | 217 taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv' |
0 | 218 print("START b_test_load_taxids") |
219 print("loading "+taxid_acc_tabular_f+" file") | |
12 | 220 load_taxids(taxid_acc_tabular_f) |
0 | 221 for i in range(len(taxidlist)): |
222 print(f"{taxidlist[i]}\t{accnrlist[i]}") | |
223 print("END b_test_load_taxids") | |
224 if not b_test_add_host_chr_taxids_accnr_from_ori_list: | |
225 sys.exit() | |
226 # -------------------------------------------------------------------------- | |
227 | |
228 # # -------------------------------------------------------------------------- | |
229 # # needs internet connexion, not possible | |
230 # # -------------------------------------------------------------------------- | |
231 # def get_leave_taxid_from_acc_nr(accnrlist): | |
232 | |
233 # # deduce a list of taxid from a list of accession numbers | |
234 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u" | |
235 # for line in os.popen(cmd).readlines(): | |
236 # taxidlist.append(line.rstrip()) | |
237 | |
238 # return taxidlist | |
239 # # -------------------------------------------------------------------------- | |
240 | |
241 # -------------------------------------------------------------------------- | |
242 # function to retain the most recent acc nr for host complete genome found: | |
243 # - return acc nr of most recent complete genome | |
244 # - print accnr species and name retained | |
245 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp | |
246 # -------------------------------------------------------------------------- | |
12 | 247 def retain_1accnr(accnrlisttmp: list, speciestmp: list, nametmp: list) -> str: |
0 | 248 |
249 max_accnr_version = 0 | |
250 curr_accnr_version = 0 | |
251 max_accnr_nr = 0 | |
252 curr_accnr_nr = 0 | |
253 kept_accnr_i = 0 | |
254 p = re.compile(".*?(\d+)\.(\d+)$") | |
255 | |
256 # print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}") | |
257 | |
258 for i in range(len(accnrlisttmp)): | |
259 m = p.match( accnrlisttmp[i] ) | |
260 if m: | |
261 # print('Match found: ', m.group(2)) | |
262 curr_accnr_version = int(m.group(2)) | |
263 accnr_nr = int(m.group(1)) | |
264 if curr_accnr_version > max_accnr_version: | |
265 max_accnr_version = curr_accnr_version | |
266 kept_accnr_i = i | |
267 # print(f"record kept_accnr_i:{kept_accnr_i}") | |
6
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
268 elif(( curr_accnr_version == max_accnr_version)and |
81acd8138218
Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents:
0
diff
changeset
|
269 (curr_accnr_nr > max_accnr_nr)): |
0 | 270 max_accnr_nr = curr_accnr_nr |
271 kept_accnr_i = i | |
272 # print(f"record kept_accnr_i:{kept_accnr_i}") | |
273 | |
274 else: | |
275 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[i]}") | |
276 | |
277 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}") | |
278 kept_accn = accnrlisttmp[kept_accnr_i] | |
279 | |
280 return kept_accn | |
281 # -------------------------------------------------------------------------- | |
282 | |
283 # -------------------------------------------------------------------------- | |
12 | 284 # Function to find complete genome closely related to current taxid |
0 | 285 # goes upper in taxonomy if nothing found until order |
286 # -------------------------------------------------------------------------- | |
12 | 287 def ngd_upper_lineage(curr_index_in_lineage: int, |
288 lineage: list, | |
289 ncbi: NCBITaxa, | |
290 accnrlisttmp: list, # current working list | |
291 accnrlist: list, # final list, if something added (or min index reached), recursivity stop | |
292 speciestmp: list, | |
293 nametmp: list | |
294 ) -> str: | |
0 | 295 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") |
296 | |
297 # deduce up rank, search complet genome/chr in | |
298 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species | |
299 rank = ncbi.get_rank([lineage[curr_index_in_lineage]]) | |
300 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]]) | |
301 print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}") | |
302 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid, | |
303 intermediate_nodes=False, | |
304 collapse_subspecies=False, | |
305 return_tree=False | |
306 ) | |
12 | 307 |
0 | 308 # int conversion to strings |
309 leaves_taxids = list(map(str, leaves_taxids)) | |
12 | 310 leaves_taxids_list = ','.join(leaves_taxids) |
311 | |
312 if b_verbose: | |
313 print(f"{prog_tag} leaves_taxids for taxid {upper_taxid}:{leaves_taxids_list}") | |
314 cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {leaves_taxids_list} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1" | |
315 if b_verbose: | |
316 print(f"{prog_tag} cmd:{cmd}") | |
0 | 317 |
318 # specific to retain_1accn to avoid lists are crashed by other ngd call | |
319 accnrlisttmp_r = [] | |
320 speciestmp_r = [] | |
321 nametmp_r = [] | |
12 | 322 |
0 | 323 for line in os.popen(cmd).readlines(): |
12 | 324 # print(f"line 314:{line.rstrip()}") |
0 | 325 if not re.match("^Considering", line): |
12 | 326 # print(f"line 316:{line.rstrip()}") |
0 | 327 if re.match("^(?:ERROR|Error): No downloads", line): |
328 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") | |
329 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted | |
330 curr_index_in_lineage = curr_index_in_lineage - 1 | |
12 | 331 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}") |
0 | 332 return ngd_upper_lineage(curr_index_in_lineage, |
333 lineage, | |
334 ncbi, | |
335 accnrlisttmp, # current working list | |
336 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
337 speciestmp, | |
338 nametmp | |
339 ) | |
340 else: | |
12 | 341 # print(f"line 331:{line}") |
342 try: | |
343 acc_nr, species, name = line.rstrip().split("\t") | |
344 except ValueError as ve: | |
345 sys.exit(f"ValueError {ve}: for split of line '{line}'") | |
0 | 346 accnrlisttmp_r.append(acc_nr) |
347 speciestmp_r.append(species) | |
348 nametmp_r.append(name) | |
349 if b_verbose: | |
350 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | |
351 | |
352 # retain only the most recent complete genome for current treated taxid | |
353 return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r) | |
354 | |
355 # else: | |
356 # print(f"line matching Considering:{line}") | |
357 # -------------------------------------------------------------------------- | |
358 | |
359 # -------------------------------------------------------------------------- | |
360 # read taxids, deduce complete genomes available in genblank, provides in output file | |
361 # the acc number in addition to those already listed | |
362 # -------------------------------------------------------------------------- | |
12 | 363 def add_host_chr_taxids_accnr_from_ori_list(taxidlist: list, |
364 accnrlist: list, | |
365 acc_out_f: str): | |
0 | 366 |
367 # store all accnr found for complete genome of current taxid (or from same family/order) | |
368 # the aim is to keep only the most recent/complete | |
369 accnrlisttmp = [] | |
370 speciestmp = [] | |
371 nametmp = [] | |
372 | |
373 # get host complete genome when found using ncbi_genome_download | |
374 taxids_list=','.join(taxidlist) | |
375 | |
376 # # ------------------------------------------ | |
377 # # ncbi-genome-download as a library | |
378 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' | |
12 | 379 # ngd.download(section=ncbigenomedownload_section, |
0 | 380 # taxids=taxids_list, |
12 | 381 # assembly_levels=assembly_levels, |
0 | 382 # flat_output=True, |
383 # output=ngd_out_f, | |
12 | 384 # groups=organisms_to_search_in, |
385 # dry_run=True | |
0 | 386 # ) |
387 | |
388 | |
389 # cmd = "cut -f 1,2 "+ngd_out_f | |
390 | |
391 # for line in os.popen(cmd).readlines(): | |
392 # acc_nr, species = line.rstrip().split() | |
393 # accnrlist.append(acc_nr) | |
394 | |
395 # if b_verbose: | |
396 # print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr}") | |
397 # # ------------------------------------------ | |
398 | |
399 # ------------------------------------------ | |
400 # ncbi-genome-download as executable script | |
401 # ------------------------------------------ | |
402 | |
403 # load NCBITaxa | |
404 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory) | |
405 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f) | |
406 ncbi = NCBITaxa(dbfile=ncbi_tax_f) | |
407 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f: | |
408 try: | |
409 ncbi.update_taxonomy_database() | |
410 except: | |
411 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") | |
412 | |
413 for taxid_u in taxidlist: | |
12 | 414 print(f"{prog_tag} treating global taxid:{taxid_u}") |
415 cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {taxid_u} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1" | |
0 | 416 for line in os.popen(cmd).readlines(): |
12 | 417 if b_verbose: |
418 print(f"{prog_tag} cmd:{cmd} ran, read output") | |
0 | 419 # ERROR: No downloads matched your filter. Please check your options. |
420 if re.match("^(?:ERROR|Error): No downloads", line): | |
421 # get complete lineage: accept ONLY leave taxid? (species) | |
422 lineage = ncbi.get_lineage(int(taxid_u)) | |
423 name = ncbi.translate_to_names(lineage) | |
424 if b_verbose: | |
425 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") | |
426 | |
427 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome | |
12 | 428 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}") |
0 | 429 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, |
430 lineage, | |
431 ncbi, | |
432 accnrlisttmp, # current working list | |
433 accnrlist, # final list, if something added (or min index reached), recursivity stop | |
434 speciestmp, | |
435 nametmp | |
436 ) | |
12 | 437 if new_acc_nr is None: |
438 print(f"No acc_nr found after going up in taxonomy, line {lineno()}") | |
439 else: | |
440 accnrlist.append( new_acc_nr ) | |
441 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") | |
0 | 442 |
443 # initialize for next search | |
444 accnrlisttmp = [] | |
445 speciestmp = [] | |
446 nametmp = [] | |
447 | |
448 elif not re.match("^Considering", line): | |
449 # print(f"line:{line.rstrip()}") | |
450 acc_nr, species, name = line.rstrip().split("\t") | |
451 accnrlisttmp.append(acc_nr) | |
12 | 452 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") |
453 | |
0 | 454 speciestmp.append(species) |
455 nametmp.append(name) | |
456 if b_verbose: | |
457 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | |
458 | |
459 # retain only the most recent complete genome for current treated taxid | |
460 if len(accnrlisttmp): | |
461 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) | |
12 | 462 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") |
0 | 463 |
464 # remove redundant accnr | |
12 | 465 print(f"accnrlist to sort:{accnrlist}") |
0 | 466 accnrlist = list(sort_uniq(accnrlist)) |
467 with open(acc_out_f, "w") as record_file: | |
468 for accnr in accnrlist: | |
469 record_file.write("%s\n" % (accnr)) | |
470 # ------------------------------------------ | |
471 | |
472 print(f"{prog_tag} {acc_out_f} file created") | |
473 | |
474 # -------------------------------------------------------------------------- | |
475 # test | |
476 if b_test_add_host_chr_taxids_accnr_from_ori_list: | |
12 | 477 taxid_acc_in_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv' |
478 acc_out_f = test_dir + 'megablast_out_f_taxid_acc_hostexpanded.tsv' | |
0 | 479 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") |
12 | 480 print(f"{prog_tag} loading {taxid_acc_in_f} file") |
481 load_taxids(taxid_acc_in_f) | |
482 for i in range(len(taxidlist)): | |
483 print(f"{taxidlist[i]}\t{accnrlist[i]}") | |
0 | 484 print(f"{prog_tag} end loading") |
485 | |
486 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
487 accnrlist, | |
488 acc_out_f) | |
489 print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list") | |
490 sys.exit() | |
491 # -------------------------------------------------------------------------- | |
492 | |
493 # -------------------------------------------------------------------------- | |
494 # MAIN | |
495 # -------------------------------------------------------------------------- | |
496 ##### MAIN | |
497 def __main__(): | |
498 # load taxid_acc file | |
12 | 499 load_taxids(taxid_acc_tabular_f) |
0 | 500 # check in ncbi taxonomy which acc number are in and out of given taxid |
501 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | |
502 accnrlist, | |
503 acc_out_f) | |
504 # -------------------------------------------------------------------------- | |
505 #### MAIN END | |
506 if __name__ == "__main__": __main__() | |
507 |