comparison TAXID_genusexpand_taxid2acc.py @ 0:e7dd595fb0dd draft

Uploaded 09 03 23 first upload of script
author p.lucas
date Thu, 09 Mar 2023 16:50:29 +0000
parents
children 81acd8138218
comparison
equal deleted inserted replaced
-1:000000000000 0:e7dd595fb0dd
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 ###
4 # USE PYTHON3
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads)
6 # provide 2 files:
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level)
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)
9 ###
10
11 ### Libraries to import:
12 import argparse, os, sys, csv, warnings, re, itertools, operator
13 from os import path
14 import ncbi_genome_download as ngd
15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
16 from ete3 import NCBITaxa
17
18 # to be able to report line number in error messages
19 import inspect
20 frame = inspect.currentframe()
21
22 # debug
23 b_test_load_taxids = False # ok 2023 03 07
24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07
25
26 prog_tag = '[' + os.path.basename(__file__) + ']'
27
28 # boolean to know if we dowload ncbi taxonomy file in current env
29 b_load_ncbi_tax_f = False
30
31 # list of interesting taxid (fathers)
32 taxidlist_f = ''
33 taxidlist = []
34 accnrlist = []
35
36 # order = -4
37 # family or clade = -3
38 # subtribe or genus = -2
39 curr_index_in_lineage = -2
40 min_index_in_lineage = -4
41
42 # boolean to know if we download ncbi taxonomy file in current env
43 b_load_ncbi_tax_f = False
44 b_test_all = False
45 b_test = False
46 b_acc_in_f = False
47 b_acc_out_f = False
48
49 b_verbose = False
50
51 # rank = '' # rank level retained by user
52 # rank_num = index of rank retained by user
53
54 # # set to check that provided rank exist to be sure to be able to use it
55 # ranks = {
56 # 'superkingdom' => 0,
57 # # 'clade', # no, several clade, name is too generic
58 # 'kingdom' => 1,
59 # 'phylum' => 2,
60 # 'subphylum' => 3,
61 # 'superclass' => 4,
62 # 'class' => 5,
63 # 'superorder' => 6,
64 # 'order' => 7,
65 # 'suborder' => 8,
66 # 'infraorder' => 9,
67 # 'parvorder' => 10,
68 # 'superfamily' => 11,
69 # 'family' => 12,
70 # 'subfamily' => 13,
71 # 'genus' => 14,
72 # 'species' => 15,
73 # 'subspecies' => 16
74 # }
75
76 parser = argparse.ArgumentParser()
77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f',
78 help="taxid acc_number list in tsv (tabular separated at each line)",
79 metavar="FILE")
80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f',
81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers from krona_taxid_acc_f NOT found under taxid in ncbi taxonomy tree",
82 metavar="FILE")
83 # parser.add_argument("-r", "--rank", dest='rank',
84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list",
85 # action="store_const")
86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f',
87 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree",
88 metavar="FILE")
89 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f',
90 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run",
91 action='store_true')
92 parser.add_argument("-z", "--test_all", dest='b_test_all',
93 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script",
94 action='store_true')
95 parser.add_argument("-v", "--verbose", dest='b_verbose',
96 help="[Optional] To have details on records when running",
97 action='store_true')
98 parser.set_defaults(b_load_ncbi_tax_f=False)
99 parser.set_defaults(b_test_all=False)
100 parser.set_defaults(b_verbose=False)
101
102 # get absolute path in case of files
103 args = parser.parse_args()
104
105 # -------------------------------------------
106 # check arguments
107 b_test_all = args.b_test_all
108
109 if b_test_all:
110 b_test_load_taxids = False
111 b_test_add_host_chr_taxids_accnr_from_ori_list = True
112 b_test = True
113 b_acc_in_f = True
114 b_acc_out_f = True
115 else:
116 b_test = (b_test_load_taxids or
117 b_test_add_host_chr_taxids_accnr_from_ori_list)
118
119 if ((not b_test)and
120 ((len(sys.argv) < 1) or (len(sys.argv) > 5))):
121 print("\n".join(["To use this scripts, run:",
122 "conda activate TAXID_genusexpand_taxid2acc",
123 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f",
124 " ",
125 "Then you won't need --test_all --load_ncbi_tax_f options\n\n",
126 "Then, as an example:\n\n",
127 ' '.join(['./TAXID_genusexpand_taxid2acc.py',
128 '-i taxid_accnr_list.tsv',
129 '-o accnr_out_list.txt']),"\n\n" ]))
130 parser.print_help()
131 print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
132 " arguments, exit line "+str(frame.f_lineno))
133 sys.exit(0)
134
135 # print('args:', args)
136 # if(not b_test):
137 if args.ncbi_tax_f is not None:
138 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f)
139 else:
140 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
141 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")
142 if args.taxid_acc_in_f is not None:
143 taxid_acc_in_f = os.path.abspath(args.taxid_acc_f)
144 b_acc_in_f = True
145 elif(not b_test):
146 sys.exit("[Error] You must provide taxid_acc_in_f")
147 if args.acc_out_f is not None:
148 acc_out_f = os.path.abspath(args.acc_out_f)
149 b_acc_out_f = True
150 elif(not b_test):
151 sys.exit("-acc_out_f <accnr_file>n must be provided\n")
152 # if args.rank is not None:
153 # rank = 'genus'
154 # else:
155 # rank = args.rank
156
157 if args.b_verbose is not None:
158 b_verbose = int(args.b_verbose)
159
160 if(not b_test):
161 if (not b_acc_in_f) and (not b_acc_out_f):
162 sys.exit(prog_tag + "[Error] You must provide either --acc_f <file> and -acc_out_f <file>")
163
164 # # store index of the rank expected by user
165 # rank_num = ranks{ rank }
166
167 # --------------------------------------------------------------------------
168 # to sort uniq, for a list, only need to add list conversion
169 # --------------------------------------------------------------------------
170 mapper= map # Python ≥ 3
171 def sort_uniq(sequence):
172 return mapper(
173 operator.itemgetter(0),
174 itertools.groupby(sorted(sequence)))
175 # --------------------------------------------------------------------------
176
177 # --------------------------------------------------------------------------
178 # load taxid acc list, return taxidlist
179 # --------------------------------------------------------------------------
180 def load_taxids(taxid_acc_tabular_f):
181
182 if not path.exists(taxid_acc_tabular_f):
183 sys.exit("Error " + taxid_acc_tabular_f +
184 " file does not exist, line "+ str(sys._getframe().f_lineno) )
185
186 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq "
187 # cmd = "cut -f 1 "+taxid_acc_tabular_f+" | sort | uniq "
188
189 for line in os.popen(cmd).readlines():
190 k, v = line.rstrip().split()
191 taxidlist.append(k)
192 accnrlist.append(v)
193
194 return taxidlist
195 # --------------------------------------------------------------------------
196
197 # test load_taxids function
198 # display taxidlist, then exit
199 if b_test_load_taxids:
200 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
201 print("START b_test_load_taxids")
202 print("loading "+taxid_acc_tabular_f+" file")
203 taxidlist = load_taxids(taxid_acc_tabular_f)
204 for i in range(len(taxidlist)):
205 print(f"{taxidlist[i]}\t{accnrlist[i]}")
206 print("END b_test_load_taxids")
207 if not b_test_add_host_chr_taxids_accnr_from_ori_list:
208 sys.exit()
209 # --------------------------------------------------------------------------
210
211 # # --------------------------------------------------------------------------
212 # # needs internet connexion, not possible
213 # # --------------------------------------------------------------------------
214 # def get_leave_taxid_from_acc_nr(accnrlist):
215
216 # # deduce a list of taxid from a list of accession numbers
217 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u"
218 # for line in os.popen(cmd).readlines():
219 # taxidlist.append(line.rstrip())
220
221 # return taxidlist
222 # # --------------------------------------------------------------------------
223
224 # --------------------------------------------------------------------------
225 # function to retain the most recent acc nr for host complete genome found:
226 # - return acc nr of most recent complete genome
227 # - print accnr species and name retained
228 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp
229 # --------------------------------------------------------------------------
230 def retain_1accnr(accnrlisttmp, speciestmp, nametmp):
231
232 max_accnr_version = 0
233 curr_accnr_version = 0
234 max_accnr_nr = 0
235 curr_accnr_nr = 0
236 kept_accnr_i = 0
237 p = re.compile(".*?(\d+)\.(\d+)$")
238
239 # print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}")
240
241 for i in range(len(accnrlisttmp)):
242 m = p.match( accnrlisttmp[i] )
243 if m:
244 # print('Match found: ', m.group(2))
245 curr_accnr_version = int(m.group(2))
246 accnr_nr = int(m.group(1))
247 if curr_accnr_version > max_accnr_version:
248 max_accnr_version = curr_accnr_version
249 kept_accnr_i = i
250 # print(f"record kept_accnr_i:{kept_accnr_i}")
251 elif curr_accnr_nr > max_accnr_nr:
252 max_accnr_nr = curr_accnr_nr
253 kept_accnr_i = i
254 # print(f"record kept_accnr_i:{kept_accnr_i}")
255
256 else:
257 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[i]}")
258
259 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}")
260 kept_accn = accnrlisttmp[kept_accnr_i]
261
262 return kept_accn
263 # --------------------------------------------------------------------------
264
265 # --------------------------------------------------------------------------
266 # procedure to find complete genome closely related to current taxid
267 # goes upper in taxonomy if nothing found until order
268 # --------------------------------------------------------------------------
269 def ngd_upper_lineage(curr_index_in_lineage,
270 lineage,
271 ncbi,
272 accnrlisttmp, # current working list
273 accnrlist, # final list, if something added (or min index reached), recursivity stop
274 speciestmp,
275 nametmp
276 ):
277 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}")
278
279 # deduce up rank, search complet genome/chr in
280 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species
281 rank = ncbi.get_rank([lineage[curr_index_in_lineage]])
282 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]])
283 print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}")
284 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid,
285 intermediate_nodes=False,
286 collapse_subspecies=False,
287 return_tree=False
288 )
289 # int conversion to strings
290 leaves_taxids = list(map(str, leaves_taxids))
291 leaves_taxids_list = ','.join(leaves_taxids)
292 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
293 # print(f"{prog_tag} cmd:{cmd}")
294
295 # specific to retain_1accn to avoid lists are crashed by other ngd call
296 accnrlisttmp_r = []
297 speciestmp_r = []
298 nametmp_r = []
299 for line in os.popen(cmd).readlines():
300
301 if not re.match("^Considering", line):
302 # print(f"line:{line.rstrip()}")
303 if re.match("^(?:ERROR|Error): No downloads", line):
304 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})")
305 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted
306 curr_index_in_lineage = curr_index_in_lineage - 1
307 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
308 return ngd_upper_lineage(curr_index_in_lineage,
309 lineage,
310 ncbi,
311 accnrlisttmp, # current working list
312 accnrlist, # final list, if something added (or min index reached), recursivity stop
313 speciestmp,
314 nametmp
315 )
316 else:
317 acc_nr, species, name = line.rstrip().split("\t")
318 accnrlisttmp_r.append(acc_nr)
319 speciestmp_r.append(species)
320 nametmp_r.append(name)
321 if b_verbose:
322 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
323
324 # retain only the most recent complete genome for current treated taxid
325 return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r)
326
327 # else:
328 # print(f"line matching Considering:{line}")
329 # --------------------------------------------------------------------------
330
331 # --------------------------------------------------------------------------
332 # read taxids, deduce complete genomes available in genblank, provides in output file
333 # the acc number in addition to those already listed
334 # --------------------------------------------------------------------------
335 def add_host_chr_taxids_accnr_from_ori_list(taxidlist,
336 accnrlist,
337 acc_out_f):
338
339 # store all accnr found for complete genome of current taxid (or from same family/order)
340 # the aim is to keep only the most recent/complete
341 accnrlisttmp = []
342 speciestmp = []
343 nametmp = []
344
345 # get host complete genome when found using ncbi_genome_download
346 taxids_list=','.join(taxidlist)
347
348 # # ------------------------------------------
349 # # ncbi-genome-download as a library
350 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv'
351 # ngd.download(section='genbank',
352 # taxids=taxids_list,
353 # assembly_levels='chromosome',
354 # flat_output=True,
355 # output=ngd_out_f,
356 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate',
357 # dry_run=True
358 # )
359
360
361 # cmd = "cut -f 1,2 "+ngd_out_f
362
363 # for line in os.popen(cmd).readlines():
364 # acc_nr, species = line.rstrip().split()
365 # accnrlist.append(acc_nr)
366
367 # if b_verbose:
368 # print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr}")
369 # # ------------------------------------------
370
371 # ------------------------------------------
372 # ncbi-genome-download as executable script
373 # ------------------------------------------
374
375 # load NCBITaxa
376 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory)
377 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)
378 ncbi = NCBITaxa(dbfile=ncbi_tax_f)
379 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:
380 try:
381 ncbi.update_taxonomy_database()
382 except:
383 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
384
385 for taxid_u in taxidlist:
386 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
387 for line in os.popen(cmd).readlines():
388 # ERROR: No downloads matched your filter. Please check your options.
389 if re.match("^(?:ERROR|Error): No downloads", line):
390 # get complete lineage: accept ONLY leave taxid? (species)
391 lineage = ncbi.get_lineage(int(taxid_u))
392 name = ncbi.translate_to_names(lineage)
393 if b_verbose:
394 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}")
395
396 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome
397 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
398 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage,
399 lineage,
400 ncbi,
401 accnrlisttmp, # current working list
402 accnrlist, # final list, if something added (or min index reached), recursivity stop
403 speciestmp,
404 nametmp
405 )
406 accnrlist.append( new_acc_nr )
407
408 # initialize for next search
409 accnrlisttmp = []
410 speciestmp = []
411 nametmp = []
412
413 elif not re.match("^Considering", line):
414 # print(f"line:{line.rstrip()}")
415 acc_nr, species, name = line.rstrip().split("\t")
416 accnrlisttmp.append(acc_nr)
417 speciestmp.append(species)
418 nametmp.append(name)
419 if b_verbose:
420 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
421
422 # retain only the most recent complete genome for current treated taxid
423 if len(accnrlisttmp):
424 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) )
425
426 # remove redundant accnr
427 accnrlist = list(sort_uniq(accnrlist))
428 with open(acc_out_f, "w") as record_file:
429 for accnr in accnrlist:
430 record_file.write("%s\n" % (accnr))
431 # ------------------------------------------
432
433 print(f"{prog_tag} {acc_out_f} file created")
434
435 # --------------------------------------------------------------------------
436 # test
437 if b_test_add_host_chr_taxids_accnr_from_ori_list:
438 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
439 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv'
440 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv'
441 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list")
442 print(f"{prog_tag} loading {taxid_acc_tabular_f} file")
443 taxidlist = load_taxids(taxid_acc_in_f)
444 print(f"{prog_tag} end loading")
445
446 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
447 accnrlist,
448 acc_out_f)
449 print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list")
450 sys.exit()
451 # --------------------------------------------------------------------------
452
453 # --------------------------------------------------------------------------
454 # MAIN
455 # --------------------------------------------------------------------------
456 ##### MAIN
457 def __main__():
458 # load taxid_acc file
459 load_taxids(taxid_acc_tabular_f)
460 # check in ncbi taxonomy which acc number are in and out of given taxid
461 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
462 accnrlist,
463 acc_out_f)
464 # --------------------------------------------------------------------------
465 #### MAIN END
466 if __name__ == "__main__": __main__()
467