|
0
|
1 #!/usr/bin/env python3
|
|
|
2 # -*- coding: utf-8 -*-
|
|
|
3 ###
|
|
|
4 # USE PYTHON3
|
|
|
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads)
|
|
|
6 # provide 2 files:
|
|
|
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level)
|
|
|
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)
|
|
|
9 ###
|
|
|
10
|
|
|
11 ### Libraries to import:
|
|
|
12 import argparse, os, sys, csv, warnings, re, itertools, operator
|
|
|
13 #from subprocess import Popen,PIPE
|
|
|
14 import subprocess
|
|
|
15 from os import path
|
|
|
16 from natsort import natsorted
|
|
|
17 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
|
|
|
18 from ete3 import NCBITaxa
|
|
|
19
|
|
|
20 # to be able to report line number in error messages
|
|
|
21 import inspect
|
|
|
22 def lineno():
|
|
|
23 """Returns the current line number in our program."""
|
|
|
24 return str(inspect.currentframe().f_back.f_lineno)
|
|
|
25
|
|
|
26
|
|
|
27 # debug
|
|
|
28 test_dir = 'test_TAXID_genusexpand_taxid2acc_offline_searchinhostcompletegenomedb/'
|
|
|
29 b_test_load_taxids = False # ok 2023 09 11 with rm redundant taxid code
|
|
|
30 b_test_get_host_complete_genome_acc_nr_found = False # ok 2023 09 11
|
|
|
31
|
|
|
32 prog_tag = '[' + os.path.basename(__file__) + ']'
|
|
|
33
|
|
|
34 # boolean to know if we dowload ncbi taxonomy file in current env
|
|
|
35 b_load_ncbi_tax_f = False
|
|
|
36
|
|
|
37 # list of interesting taxid (fathers)
|
|
|
38 taxidlist_f = ''
|
|
|
39 taxidlist = []
|
|
|
40 accnrlist = []
|
|
|
41 taxidlisthosts = []
|
|
|
42 accnrlisthosts = []
|
|
|
43
|
|
|
44 # order = -4
|
|
|
45 # family or clade = -3
|
|
|
46 # subtribe or genus = -2
|
|
|
47 curr_index_in_lineage = -1
|
|
|
48 min_index_in_lineage = -4
|
|
|
49
|
|
|
50 # boolean to know if we download ncbi taxonomy file in current env
|
|
|
51 b_load_ncbi_tax_f = False
|
|
|
52 b_test_all = False
|
|
|
53 b_test = False
|
|
|
54 b_acc_in_f = False
|
|
|
55 b_acc_hostdb_in_f = False
|
|
|
56 b_acc_out_f = False
|
|
|
57
|
|
|
58 b_verbose = False
|
|
|
59
|
|
|
60 # variables for ncbi-genome-download
|
|
|
61 ncbigenomedownload_section = 'refseq' # genbank
|
|
|
62 organisms_to_search_in = 'vertebrate_other,vertebrate_mammalian,plant,invertebrate'
|
|
|
63 assembly_levels = 'complete,chromosome'
|
|
|
64
|
|
|
65 # rank = '' # rank level retained by user
|
|
|
66 # rank_num = index of rank retained by user
|
|
|
67
|
|
|
68 # # set to check that provided rank exist to be sure to be able to use it
|
|
|
69 # ranks = {
|
|
|
70 # 'superkingdom' => 0,
|
|
|
71 # # 'clade', # no, several clade, name is too generic
|
|
|
72 # 'kingdom' => 1,
|
|
|
73 # 'phylum' => 2,
|
|
|
74 # 'subphylum' => 3,
|
|
|
75 # 'superclass' => 4,
|
|
|
76 # 'class' => 5,
|
|
|
77 # 'superorder' => 6,
|
|
|
78 # 'order' => 7,
|
|
|
79 # 'suborder' => 8,
|
|
|
80 # 'infraorder' => 9,
|
|
|
81 # 'parvorder' => 10,
|
|
|
82 # 'superfamily' => 11,
|
|
|
83 # 'family' => 12,
|
|
|
84 # 'subfamily' => 13,
|
|
|
85 # 'genus' => 14,
|
|
|
86 # 'species' => 15,
|
|
|
87 # 'subspecies' => 16
|
|
|
88 # }
|
|
|
89
|
|
|
90 parser = argparse.ArgumentParser()
|
|
|
91 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f',
|
|
|
92 help="taxid acc_number list in tsv (tabular separated at each line)",
|
|
|
93 metavar="FILE")
|
|
|
94 parser.add_argument("-d", "--taxid_acc_hostdb_in_f", dest='taxid_acc_hostdb_in_f',
|
|
|
95 help="taxid acc_number list in tsv (tabular separated at each line) for HOSTS",
|
|
|
96 metavar="FILE")
|
|
|
97 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f',
|
|
|
98 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree",
|
|
|
99 metavar="FILE")
|
|
|
100 # parser.add_argument("-r", "--rank", dest='rank',
|
|
|
101 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list",
|
|
|
102 # action="store_const")
|
|
|
103 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f',
|
|
|
104 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree",
|
|
|
105 metavar="FILE")
|
|
|
106 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f',
|
|
|
107 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run",
|
|
|
108 action='store_true')
|
|
|
109 parser.add_argument("-z", "--test_all", dest='b_test_all',
|
|
|
110 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script",
|
|
|
111 action='store_true')
|
|
|
112 parser.add_argument("-v", "--verbose", dest='b_verbose',
|
|
|
113 help="[Optional] To have details on records when running",
|
|
|
114 action='store_true')
|
|
|
115 parser.set_defaults(b_load_ncbi_tax_f=False)
|
|
|
116 parser.set_defaults(b_test_all=False)
|
|
|
117 parser.set_defaults(b_verbose=False)
|
|
|
118
|
|
|
119 args = parser.parse_args()
|
|
|
120
|
|
|
121 # -------------------------------------------
|
|
|
122 # check arguments
|
|
|
123 b_test_all = args.b_test_all
|
|
|
124
|
|
|
125 if b_test_all:
|
|
|
126 b_test_load_taxids = False
|
|
|
127 b_test_get_host_complete_genome_acc_nr_found = True
|
|
|
128 b_test = True
|
|
|
129 b_acc_in_f = True
|
|
|
130 b_acc_hostdb_in_f = True
|
|
|
131 b_acc_out_f = True
|
|
|
132 else:
|
|
|
133 b_test = (b_test_load_taxids or
|
|
|
134 b_test_get_host_complete_genome_acc_nr_found)
|
|
|
135
|
|
|
136 if ((not b_test)and
|
|
|
137 ((len(sys.argv) < 2) or (len(sys.argv) > 7))):
|
|
|
138 print("\n".join([
|
|
|
139 "Aim: find accession numbers of complete genomes related to provided taxids.",
|
|
|
140 "If not found at species level, try at upper taxonomic level until order.",
|
|
|
141 "Retains only 1 complete genome is several available:",
|
|
|
142 " - the one with the highest version number, if not sufficient",
|
|
|
143 " - the one with the highest accession number",
|
|
|
144 "To use this scripts, run:",
|
|
|
145 "conda activate TAXID_genusexpand_taxid2acc",
|
|
|
146 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f",
|
|
|
147 " ",
|
|
|
148 "Then you won't need --test_all --load_ncbi_tax_f options\n\n",
|
|
|
149 "Then, as an example:\n\n",
|
|
|
150 ' '.join(['./TAXID_genusexpand_taxid2acc.py',
|
|
|
151 '-i taxid_accnr_list.tsv',
|
|
|
152 '-d taxid_accnr_hostdb_list.tsv',
|
|
|
153 '-o accnr_out_list.txt']),"\n\n" ]),
|
|
|
154 "For current hostcompletegenome db, taxid_accnr_hostdb_list.tsv file is ../taxid_lists/host_complete_genomes_taxid_accnr.tsv"
|
|
|
155 )
|
|
|
156 parser.print_help()
|
|
|
157 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + " arguments, exit line "+lineno())
|
|
|
158 sys.exit(0)
|
|
|
159
|
|
|
160 # print('args:', args)
|
|
|
161 # if(not b_test):
|
|
|
162 if args.ncbi_tax_f is not None:
|
|
|
163 # get absolute path in case of files
|
|
|
164 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f)
|
|
|
165 else:
|
|
|
166 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
|
|
|
167 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")
|
|
|
168 if args.taxid_acc_in_f is not None:
|
|
|
169 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f)
|
|
|
170 b_acc_in_f = True
|
|
|
171 elif(not b_test):
|
|
|
172 sys.exit("[Error] You must provide taxid_acc_in_f")
|
|
|
173 if args.taxid_acc_hostdb_in_f is not None:
|
|
|
174 taxid_acc_hostdb_in_f = os.path.abspath(args.taxid_acc_hostdb_in_f)
|
|
|
175 b_acc_hostdb_in_f = True
|
|
|
176 elif(not b_test):
|
|
|
177 sys.exit("[Error] You must provide taxid_acc_hostdb_in_f")
|
|
|
178 if args.acc_out_f is not None:
|
|
|
179 acc_out_f = os.path.abspath(args.acc_out_f)
|
|
|
180 b_acc_out_f = True
|
|
|
181 elif(not b_test):
|
|
|
182 sys.exit("-acc_out_f <accnr_file>n must be provided\n")
|
|
|
183 # if args.rank is not None:
|
|
|
184 # rank = 'genus'
|
|
|
185 # else:
|
|
|
186 # rank = args.rank
|
|
|
187
|
|
|
188 if args.b_verbose is not None:
|
|
|
189 b_verbose = int(args.b_verbose)
|
|
|
190
|
|
|
191 if(not b_test):
|
|
|
192 if (not b_acc_in_f) and (not b_acc_out_f) and (not b_acc_hostdb_in_f):
|
|
|
193 sys.exit(prog_tag + "[Error] You must provide either --taxid_acc_f <file> and --taxid_acc_hostdb_in_f <file> and -taxid_acc_out_f <file>")
|
|
|
194
|
|
|
195 # # store index of the rank expected by user
|
|
|
196 # rank_num = ranks{ rank }
|
|
|
197
|
|
|
198 # --------------------------------------------------------------------------
|
|
|
199 # to sort uniq, for a list, only need to add list conversion
|
|
|
200 # --------------------------------------------------------------------------
|
|
|
201 mapper= map # Python ≥ 3
|
|
|
202 def sort_uniq(sequence) -> list:
|
|
|
203 return mapper(
|
|
|
204 operator.itemgetter(0),
|
|
|
205 itertools.groupby(sorted(sequence)))
|
|
|
206 # --------------------------------------------------------------------------
|
|
|
207
|
|
|
208 # --------------------------------------------------------------------------
|
|
|
209 # Procedure load taxid acc list, return taxidlist
|
|
|
210 # --------------------------------------------------------------------------
|
|
|
211 def load_taxids(taxid_acc_tabular_f: str,
|
|
|
212 taxid_list_l: list,
|
|
|
213 accnr_list_l: list,
|
|
|
214 # Need to remove identical taxid in megablast res to avoid
|
|
|
215 # thousand times the same search
|
|
|
216 # Not needed for host_db_complet_genome_file
|
|
|
217 b_rm_duplicated_taxid: bool):
|
|
|
218
|
|
|
219 if not path.exists(taxid_acc_tabular_f):
|
|
|
220 sys.exit("Error " + taxid_acc_tabular_f +
|
|
|
221 " file does not exist, line "+ lineno() )
|
|
|
222
|
|
|
223 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort -u "
|
|
|
224
|
|
|
225 for line in os.popen(cmd).readlines():
|
|
|
226 if line.rstrip() != "":
|
|
|
227 k, v = line.rstrip().split()
|
|
|
228 taxid_list_l.append(int(k))
|
|
|
229 accnr_list_l.append(v)
|
|
|
230 # print(f"last item added to accnr_list_l:{accnr_list_l[-1]}, line {lineno()}")
|
|
|
231
|
|
|
232 # if asked, keep only non redundant taxids
|
|
|
233 if b_rm_duplicated_taxid:
|
|
|
234 # create set (uniq taxids)
|
|
|
235 taxid_list_uniq_set = set(taxid_list_l)
|
|
|
236 # convert to list
|
|
|
237 taxid_list_uniq = list(taxid_list_uniq_set)
|
|
|
238
|
|
|
239 if b_test_load_taxids or b_verbose:
|
|
|
240 print(f"{prog_tag} [load_taxids] number taxids:{len(taxid_list_l)}, uniq_taxids:{len(taxid_list_uniq)}")
|
|
|
241 accnr_list_uniq = []
|
|
|
242 for taxid_s in list(taxid_list_uniq):
|
|
|
243 # find index of uniq taxid in the original list of taxid
|
|
|
244 taxid_index = taxid_list_l.index(taxid_s)
|
|
|
245 # add deduced related first accession number
|
|
|
246 accnr_list_uniq.append( accnr_list_l[taxid_index] )
|
|
|
247 if b_test_load_taxids or b_verbose:
|
|
|
248 print(f"{prog_tag} [load_taxids] number accnrs:{len(accnr_list_l)}, uniq_accnrs:{len(accnr_list_uniq)}")
|
|
|
249 # replace original taxid accnr lists by the one with non redundant taxids
|
|
|
250 # this syntaxe force copy in original list (write over)
|
|
|
251 taxid_list_l[:] = taxid_list_uniq
|
|
|
252 accnr_list_l[:] = accnr_list_uniq
|
|
|
253 if b_test_load_taxids or b_test:
|
|
|
254 print(f"{prog_tag} [load_taxids] FINAL uniq_taxids:{len(taxid_list_l)} uniq_accnrs:{len(accnr_list_l)}")
|
|
|
255 # --------------------------------------------------------------------------
|
|
|
256
|
|
|
257 # test load_taxids function
|
|
|
258 # display taxidlist, then exit
|
|
|
259 if b_test_load_taxids:
|
|
|
260 taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_testrmredundanttaxids.tsv'
|
|
|
261 print("START b_test_load_taxids REDUND")
|
|
|
262 print("loading "+taxid_acc_tabular_f+" file")
|
|
|
263 taxid_list = []
|
|
|
264 accnr_list = []
|
|
|
265 load_taxids(taxid_acc_tabular_f,
|
|
|
266 taxid_list,
|
|
|
267 accnr_list,
|
|
|
268 True)
|
|
|
269 for i in range(len(taxid_list)):
|
|
|
270 print(f"{taxid_list[i]}\t{accnr_list[i]}")
|
|
|
271 print(f"{prog_tag} [load_taxids] FINAL taxid_list:{len(taxid_list)} accnr_list:{len(accnr_list)}")
|
|
|
272 print("END b_test_load_taxids REDUND")
|
|
|
273
|
|
|
274 taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
|
|
|
275 print("START b_test_load_taxids")
|
|
|
276 print("loading "+taxid_acc_tabular_f+" file")
|
|
|
277 taxid_list = []
|
|
|
278 accnr_list = []
|
|
|
279 load_taxids(taxid_acc_tabular_f,
|
|
|
280 taxid_list,
|
|
|
281 accnr_list,
|
|
|
282 False)
|
|
|
283 for i in range(len(taxid_list)):
|
|
|
284 print(f"{taxid_list[i]}\t{accnr_list[i]}")
|
|
|
285 print("END b_test_load_taxids")
|
|
|
286 if not b_test_get_host_complete_genome_acc_nr_found:
|
|
|
287 sys.exit()
|
|
|
288 # --------------------------------------------------------------------------
|
|
|
289
|
|
|
290 # # --------------------------------------------------------------------------
|
|
|
291 # # needs internet connexion, not possible
|
|
|
292 # # --------------------------------------------------------------------------
|
|
|
293 # def get_leave_taxid_from_acc_nr(accnrlist):
|
|
|
294
|
|
|
295 # # deduce a list of taxid from a list of accession numbers
|
|
|
296 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u"
|
|
|
297 # for line in os.popen(cmd).readlines():
|
|
|
298 # taxidlist.append(line.rstrip())
|
|
|
299
|
|
|
300 # return taxidlist
|
|
|
301 # # --------------------------------------------------------------------------
|
|
|
302
|
|
|
303 # --------------------------------------------------------------------------
|
|
|
304 # function to retain the most recent acc nr for host complete genome found:
|
|
|
305 # - return acc nr of most recent complete genome
|
|
|
306 # - print accnr species and name retained
|
|
|
307 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp
|
|
|
308 # --------------------------------------------------------------------------
|
|
|
309 def retain_1accnr(accnrlisttmp: list,
|
|
|
310 speciestmp: list,
|
|
|
311 nametmp: list) -> list:
|
|
|
312
|
|
|
313 max_accnr_version = 0
|
|
|
314 curr_accnr_version = 0
|
|
|
315 max_accnr_nr = 0
|
|
|
316 curr_accnr_nr = 0
|
|
|
317 kept_accnr_i = 0
|
|
|
318 p = re.compile("(.*?(\d+)\.(\d+))$")
|
|
|
319
|
|
|
320 print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}), line {lineno()}")
|
|
|
321
|
|
|
322 for iacc in range(len(accnrlisttmp)):
|
|
|
323 m = p.match( accnrlisttmp[iacc] )
|
|
|
324 if m:
|
|
|
325 curr_accnr = m.group(1)
|
|
|
326 curr_accnr_version = int(m.group(3))
|
|
|
327 accnr_nr = int(m.group(2))
|
|
|
328 # print(f"curr_accnr_version:{curr_accnr_version} accnr_nr:{accnr_nr}, line {lineno()}")
|
|
|
329 if curr_accnr_version > max_accnr_version:
|
|
|
330 max_accnr_version = curr_accnr_version
|
|
|
331 kept_accnr_i = iacc
|
|
|
332 # print(f"{prog_tag} record kept_accnr_i:{kept_accnr_i} for accnr:{curr_accnr}, line {lineno()}")
|
|
|
333 elif(( curr_accnr_version == max_accnr_version)and
|
|
|
334 (curr_accnr_nr > max_accnr_nr)):
|
|
|
335 max_accnr_nr = curr_accnr_nr
|
|
|
336 kept_accnr_i = iacc
|
|
|
337 # print(f"{prog_tag} record kept_accnr_i:{kept_accnr_i} for accnr:{curr_accnr}, line {lineno()}")
|
|
|
338 elif b_verbose:
|
|
|
339 print(f"{prog_tag} keep kept_accnr_i:{kept_accnr_i} for accnr:{curr_accnr}, line {lineno()}")
|
|
|
340
|
|
|
341 else:
|
|
|
342 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[iacc]}, line {lineno()}")
|
|
|
343 print(f"{prog_tag} kept_accnr_i:{kept_accnr_i}, line {lineno()}")
|
|
|
344 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}")
|
|
|
345 kept_accn = [ accnrlisttmp[kept_accnr_i] ]
|
|
|
346
|
|
|
347 return kept_accn
|
|
|
348 # --------------------------------------------------------------------------
|
|
|
349
|
|
|
350
|
|
|
351 # --------------------------------------------------------------------------
|
|
|
352 # Function to find complete genome closely related to current taxid
|
|
|
353 # goes upper in taxonomy if nothing found until order
|
|
|
354 # --------------------------------------------------------------------------
|
|
|
355 def ngd_upper_lineage(curr_index_in_lineage: int,
|
|
|
356 lineage: list,
|
|
|
357 ncbi: NCBITaxa
|
|
|
358 ) -> list:
|
|
|
359 print(f"{prog_tag} [ngd_upper_lineage] with curr_index_in_lineage:{curr_index_in_lineage}")
|
|
|
360
|
|
|
361 leaves_taxids_ints = []
|
|
|
362
|
|
|
363 # deduce up rank, search complet genome/chr in
|
|
|
364 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species
|
|
|
365 rank = ncbi.get_rank([lineage[curr_index_in_lineage]])
|
|
|
366 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]])
|
|
|
367 print(f"{prog_tag} [ngd_upper_lineage] test with taxid:{upper_taxid} corresponding to rank:{rank}")
|
|
|
368 leaves_taxids_ints = ncbi.get_descendant_taxa(upper_taxid,
|
|
|
369 intermediate_nodes=False,
|
|
|
370 collapse_subspecies=False,
|
|
|
371 return_tree=False
|
|
|
372 )
|
|
|
373
|
|
|
374 # int conversion to strings
|
|
|
375 leaves_taxids = list(map(str, leaves_taxids_ints))
|
|
|
376 leaves_taxids_list = ','.join(leaves_taxids)
|
|
|
377
|
|
|
378 if b_verbose:
|
|
|
379 print(f"{prog_tag} [ngd_upper_lineage] number leaves_taxids returned for taxid {upper_taxid}:{len(leaves_taxids)}")
|
|
|
380 return leaves_taxids_ints
|
|
|
381
|
|
|
382
|
|
|
383 # --------------------------------------------------------------------------
|
|
|
384
|
|
|
385 # ----------------------------------------------------------------------
|
|
|
386 # Function that Intersect 2 lists of taxids:
|
|
|
387 # - host_complete_genome_taxids
|
|
|
388 # (host_complete_genome_acc_numbers = acc numbers related to host_complete_genome_taxids: same index pos)
|
|
|
389 # - leaves_taxids (taxids where we search for host complete genome taxids)
|
|
|
390
|
|
|
391 # - record taxid/accnr found in both
|
|
|
392 # - get not_found_taxids: taxids not found in host complet genome taxids
|
|
|
393 # - get uppertaxid for not_found_taxids by running ngd_upper_lineage
|
|
|
394
|
|
|
395 # Return host_complete_genomes_acc_nr, the list of complete genome acc number found related to leaves_taxids
|
|
|
396 # ----------------------------------------------------------------------
|
|
|
397 def get_host_complete_genome_acc_nr_found( host_complete_genome_taxids: list,
|
|
|
398 host_complete_genome_acc_numbers: list,
|
|
|
399 leaves_taxids: list,
|
|
|
400 # for further taxonomy analysis
|
|
|
401 curr_index_in_lineage: int,
|
|
|
402 lineage: list,
|
|
|
403 ncbi: NCBITaxa) -> list:
|
|
|
404
|
|
|
405 # -------------------------------------------------------------------------
|
|
|
406 # part shared for all taxid searched (comparison to host taxids from db)
|
|
|
407
|
|
|
408 # initi local working var
|
|
|
409 host_complete_genomes_acc_nr = [] # result
|
|
|
410 host_complete_genomes_acc_nr_set = set() # result (set)
|
|
|
411
|
|
|
412 # get host complete genomes as a set
|
|
|
413 host_complete_genome_taxids_set = set() # knwown host complete genomes
|
|
|
414 for host_taxid in host_complete_genome_taxids:
|
|
|
415 host_complete_genome_taxids_set.add(host_taxid)
|
|
|
416 # -------------------------------------------------------------------------
|
|
|
417
|
|
|
418 print(f"{prog_tag} [get_host_complete_genome_acc_nr_found] curr_index_in_lineage:{curr_index_in_lineage} treating {len(leaves_taxids)} taxid(s):{leaves_taxids[0]}...")
|
|
|
419
|
|
|
420 # get result taxids as a set
|
|
|
421 leaves_taxids_set = set() # taxid found in results
|
|
|
422 for res_taxid in leaves_taxids:
|
|
|
423 leaves_taxids_set.add(res_taxid)
|
|
|
424
|
|
|
425 print(f"{prog_tag} We intersect:{len(host_complete_genome_taxids)} host complete genomes with")
|
|
|
426 #\n{host_complete_genome_taxids}\n
|
|
|
427 print(f"{prog_tag} :{len(leaves_taxids)} genomes in result, line {lineno()}")
|
|
|
428 #\n{leaves_taxids}")
|
|
|
429
|
|
|
430 # do intersection for taxids shared (host complete genomes found in results)
|
|
|
431 host_complete_genomes_taxids_intersect_leaves_set = leaves_taxids_set.intersection(host_complete_genome_taxids_set)
|
|
|
432 host_complete_genomes_taxids_intersect_leaves_list = list(host_complete_genomes_taxids_intersect_leaves_set)
|
|
|
433
|
|
|
434 if b_verbose:
|
|
|
435 print(f"{prog_tag} After intersec, we retain set :{len(host_complete_genomes_taxids_intersect_leaves_set)} taxid to get related acc nr, line {lineno()}")
|
|
|
436 #\n{host_complete_genomes_taxids_intersect_leaves_set}")
|
|
|
437 print(f"{prog_tag} After intersec, we retain list:{len(host_complete_genomes_taxids_intersect_leaves_list)} taxid to get related acc nr, line {lineno()}")
|
|
|
438 #\n{host_complete_genomes_taxids_intersect_leaves_list}")
|
|
|
439
|
|
|
440 # get taxid not found to go up in taxonomy, get leave taxids again and cross again with
|
|
|
441 # available taxid/accnr found in hot complete genome db
|
|
|
442 not_found_taxids_set = list(leaves_taxids_set.difference(host_complete_genomes_taxids_intersect_leaves_set))
|
|
|
443 print(f"{prog_tag} Number of not_found_taxids:{len(not_found_taxids_set)}")
|
|
|
444
|
|
|
445 # get index of retained taxids in original host_complete_genome_taxids to deduce related acc numbers
|
|
|
446 try:
|
|
|
447 # -----------------------------------------------------------
|
|
|
448 # search indices of elements of host_complete_genomes_taxids_list in host_complete_genome_taxids
|
|
|
449 host_complete_genomes_taxids_indexes = []
|
|
|
450 for elt in host_complete_genomes_taxids_intersect_leaves_list:
|
|
|
451 host_complete_genomes_taxids_indexes.append(host_complete_genome_taxids.index(elt))
|
|
|
452 # -----------------------------------------------------------
|
|
|
453 if len(host_complete_genomes_taxids_intersect_leaves_list) != len(host_complete_genomes_taxids_indexes):
|
|
|
454 sys.exit(f"{prog_tag} number of indexes retained != number of taxids retained: NOT NORMAL, line {lineno()}")
|
|
|
455 # print(f"{prog_tag} After intersec, nr indexes retained: {len(host_complete_genomes_taxids_indexes)}, line {lineno()}")
|
|
|
456
|
|
|
457 # get acc numbers, sorted
|
|
|
458 host_complete_genomes_acc_nr = natsorted( [ host_complete_genome_acc_numbers[i] for i in host_complete_genomes_taxids_indexes ] )
|
|
|
459 print(f"{prog_tag} After intersec, nr accnr retained: {len(host_complete_genomes_acc_nr)}, line {lineno()}")
|
|
|
460
|
|
|
461 if len(host_complete_genomes_acc_nr) == 0:
|
|
|
462 print(f"no acc nr found for provided leaves_taxids_ints, line {lineno()}")
|
|
|
463 elif len(host_complete_genomes_acc_nr) == 1:
|
|
|
464 accnr = host_complete_genomes_acc_nr[0]
|
|
|
465 taxid = host_complete_genomes_taxids_intersect_leaves_list[0]
|
|
|
466 name = list(ncbi.get_taxid_translator(host_complete_genomes_taxids_intersect_leaves_list).values())[0]
|
|
|
467 species = name
|
|
|
468 print(f"retained accnr:{accnr}\tspecies:{species}\tname:{name}, line {lineno()}")
|
|
|
469 return host_complete_genomes_acc_nr
|
|
|
470 elif len(host_complete_genomes_acc_nr) > 1:
|
|
|
471 accnrlisttmp = host_complete_genomes_acc_nr
|
|
|
472 nametmp = list(ncbi.get_taxid_translator(host_complete_genomes_taxids_intersect_leaves_list).values())
|
|
|
473 speciestmp = nametmp
|
|
|
474 print(f"{prog_tag} retain_1accnr in {accnrlisttmp}, line {lineno()}")
|
|
|
475 return retain_1accnr(accnrlisttmp, speciestmp, nametmp)
|
|
|
476 else:
|
|
|
477 sys.exit(f"{prog_tag} [Error] Case not treated len of found_complete_genomes:{len(host_complete_genomes_acc_nr)}, curr_index_in_lineage:{curr_index_in_lineage}, line {lineno()}")
|
|
|
478
|
|
|
479 except ValueError:
|
|
|
480 """
|
|
|
481 # specific to retain_1accn to avoid lists are crashed by other ngd call
|
|
|
482 accnrlisttmp_r = []
|
|
|
483 speciestmp_r = []
|
|
|
484 nametmp_r = []
|
|
|
485 """
|
|
|
486
|
|
|
487 # goes up in taxonomy to get a larger number of taxids to cross with host genome taxids
|
|
|
488 if( (len(host_complete_genomes_acc_nr) == 0) and (curr_index_in_lineage > min_index_in_lineage)):
|
|
|
489 curr_index_in_lineage = curr_index_in_lineage - 1
|
|
|
490 upper_taxid = lineage[curr_index_in_lineage]
|
|
|
491 rank = ncbi.get_rank(upper_taxid[curr_index_in_lineage])
|
|
|
492 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name}), line {lineno()}")
|
|
|
493 if b_verbose:
|
|
|
494 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}")
|
|
|
495
|
|
|
496 # get complete lineage: accept ONLY leave taxid? (species)
|
|
|
497 # name = ncbi.get_taxid_translator(upper_taxid)
|
|
|
498 if b_verbose:
|
|
|
499 print(f"taxid:{upper_taxid}\tlineage:{lineage}\tname:{name}")
|
|
|
500 leave_taxid_ints = ngd_upper_lineage( curr_index_in_lineage,
|
|
|
501 lineage,
|
|
|
502 ncbi
|
|
|
503 )
|
|
|
504 # we search if one of the accession numbers of complete genome is included in leaves_taxids_ints
|
|
|
505 if b_verbose:
|
|
|
506 print(f"{prog_tag} get_host_complete_genome_acc_nr_found(\nhost_complete_genome_taxids,\nhost_complete_genome_acc_numbers,\nleaves_taxids_ints) ran")
|
|
|
507
|
|
|
508 return get_host_complete_genome_acc_nr_found(
|
|
|
509 host_complete_genome_taxids,
|
|
|
510 host_complete_genome_acc_numbers,
|
|
|
511 leave_taxid_ints,
|
|
|
512 # for further taxonomy analysis
|
|
|
513 curr_index_in_lineage,
|
|
|
514 lineage,
|
|
|
515 ncbi)
|
|
|
516
|
|
|
517 # retain only the most recent complete genome for current treated taxid
|
|
|
518 # return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r)
|
|
|
519 elif len(host_complete_genomes_acc_nr) == 0:
|
|
|
520 print(f"no acc nr found for provided leaves_taxids_ints, line {lineno()}")
|
|
|
521 return ''
|
|
|
522 elif len(host_complete_genomes_acc_nr) == 1:
|
|
|
523 print(f"retained accnr:{accnr}\tspecies:{species}\tname:{name}, line {lineno()}")
|
|
|
524 return host_complete_genomes_acc_nr
|
|
|
525 elif len(host_complete_genomes_acc_nr) > 1:
|
|
|
526 accnrlisttmp = host_complete_genomes_acc_nr
|
|
|
527 speciestmp = list(ncbi.get_taxid_translator(host_complete_genomes_taxids_intersect_leaves_list).values())
|
|
|
528 nametmp = speciestmp
|
|
|
529 print(f"{prog_tag} retain_1accnr in {accnrlisttmp}, line {lineno()}")
|
|
|
530 return retain_1accnr(accnrlisttmp, speciestmp, nametmp)
|
|
|
531 else:
|
|
|
532 sys.exit(f"{prog_tag} [Error] Case not treated len of found_complete_genomes:{len(host_complete_genomes_acc_nr)}, curr_index_in_lineage:{curr_index_in_lineage}, line {lineno()}")
|
|
|
533
|
|
|
534
|
|
|
535 # goes up in taxonomy to get a larger number of taxids to cross with host genome taxids
|
|
|
536 if( (len(host_complete_genomes_acc_nr) == 0) and (curr_index_in_lineage > min_index_in_lineage)):
|
|
|
537 curr_index_in_lineage = curr_index_in_lineage - 1
|
|
|
538 upper_taxid_int = lineage[curr_index_in_lineage]
|
|
|
539 upper_taxid = str(upper_taxid_int)
|
|
|
540 rank = ncbi.get_rank([upper_taxid_int])
|
|
|
541 name = ncbi.get_taxid_translator([upper_taxid_int])
|
|
|
542 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name}), line {lineno()}")
|
|
|
543 if b_verbose:
|
|
|
544 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}")
|
|
|
545
|
|
|
546 # get complete lineage: accept ONLY leave taxid? (species)
|
|
|
547 if b_verbose:
|
|
|
548 print(f"taxid:{upper_taxid}\tlineage:{lineage}\tname:{name}")
|
|
|
549
|
|
|
550 # get taxids for current lineage up some lines earlier, we want to get leave taxids for this species/group/genus/family etc...
|
|
|
551 leave_taxid_ints = ngd_upper_lineage( curr_index_in_lineage,
|
|
|
552 lineage,
|
|
|
553 ncbi
|
|
|
554 )
|
|
|
555 print(f"{prog_tag} number of leave_taxid_ints obtained for rank:{rank}: {len(leave_taxid_ints)}, line {lineno()}")
|
|
|
556
|
|
|
557 # we search if one of the accession numbers of complete genome is included in leaves_taxids_ints
|
|
|
558 if b_verbose:
|
|
|
559 print(f"{prog_tag} get_host_complete_genome_acc_nr_found(\nhost_complete_genome_taxids,\nhost_complete_genome_acc_numbers,\nleaves_taxids_ints) ran")
|
|
|
560
|
|
|
561 return get_host_complete_genome_acc_nr_found(
|
|
|
562 host_complete_genome_taxids,
|
|
|
563 host_complete_genome_acc_numbers,
|
|
|
564 leave_taxid_ints,
|
|
|
565 # for further taxonomy analysis
|
|
|
566 curr_index_in_lineage,
|
|
|
567 lineage,
|
|
|
568 ncbi)
|
|
|
569
|
|
|
570
|
|
|
571 if b_test_get_host_complete_genome_acc_nr_found:
|
|
|
572 taxid_acc_in_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
|
|
|
573 taxid_acc_hostdb_in_f = test_dir + 'host_complete_genomes_taxid_accnr.tsv'
|
|
|
574 acc_out_f = test_dir + 'megablast_out_f_taxid_acc_hostexpanded.tsv'
|
|
|
575 # taxid_u = ['4520','4530','9606']
|
|
|
576 # we must obtain retained
|
|
|
577 # accnr:GCF_022539505.1 species:Lolium rigidum name:na
|
|
|
578 # accnr:GCF_000231095.2 species:Oryza brachyantha name:na
|
|
|
579 # accnr:GCF_000001405.40 species:Homo sapiens name:na
|
|
|
580 taxid_u = [126889,4520,4530,9187,9606]
|
|
|
581 # test case of empty list result, we expect a file with only a endofline
|
|
|
582 # taxid_u = [9187] # ok 2023/09/21
|
|
|
583
|
|
|
584 # taxid_u_str = sorted(map(str, taxid_u), key=int)
|
|
|
585
|
|
|
586 # taxid_u = [4530] # oriza
|
|
|
587
|
|
|
588 # load taxid_acc file
|
|
|
589 load_taxids(taxid_acc_in_f,
|
|
|
590 taxidlist,
|
|
|
591 accnrlist,
|
|
|
592 True) # remove redundant taxids
|
|
|
593
|
|
|
594 # load host db taxid acc
|
|
|
595 load_taxids(taxid_acc_hostdb_in_f,
|
|
|
596 taxidlisthosts,
|
|
|
597 accnrlisthosts,
|
|
|
598 False) # no need to remove redundant taxids
|
|
|
599
|
|
|
600 # load NCBITaxa
|
|
|
601 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory)
|
|
|
602 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)
|
|
|
603 ncbi = NCBITaxa(dbfile=ncbi_tax_f)
|
|
|
604 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:
|
|
|
605 try:
|
|
|
606 ncbi.update_taxonomy_database()
|
|
|
607 except:
|
|
|
608 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
|
|
|
609
|
|
|
610 print(f"{prog_tag} [TEST get_host_complete_genome_acc_nr_found] START")
|
|
|
611 # check in ncbi taxonomy which acc number are in and out of given taxid
|
|
|
612
|
|
|
613 accnrlist_res = [] # we do not want initial acc nr not corresponding to complete genomes
|
|
|
614 # = accnrlist # we initialize with acc nr already found in results because
|
|
|
615 # we expect to increase the number of res
|
|
|
616 for glob_taxid in taxid_u:
|
|
|
617
|
|
|
618 accnrlisttmptest = []
|
|
|
619
|
|
|
620 # important
|
|
|
621 curr_index_in_lineage = -1
|
|
|
622
|
|
|
623 # get complete lineage: accept ONLY leave taxid? (species)
|
|
|
624 lineage = ncbi.get_lineage(glob_taxid)
|
|
|
625 # get species and names
|
|
|
626 # accnrlisttmp = acc nr of found complete genomes
|
|
|
627 nametmp = list(ncbi.get_taxid_translator(lineage).values())
|
|
|
628
|
|
|
629 print(f"{prog_tag} We search for host complete genome related to taxid:{glob_taxid}\tname:{nametmp}")
|
|
|
630
|
|
|
631 try:
|
|
|
632 accnrlisttmptest = get_host_complete_genome_acc_nr_found(
|
|
|
633 taxidlisthosts,
|
|
|
634 accnrlisthosts,
|
|
|
635 [glob_taxid],
|
|
|
636 # for further taxonomy analysis
|
|
|
637 curr_index_in_lineage,
|
|
|
638 lineage,
|
|
|
639 ncbi)
|
|
|
640 accnrlist_res.extend(accnrlisttmptest)
|
|
|
641 print(f"{prog_tag} [TEST] add {len(accnrlisttmptest)} accnr to existing {len(accnrlist_res)} accnr")
|
|
|
642 print(f"{prog_tag} [TEST] accnrlisttmptest:{accnrlisttmptest} line {lineno()}")
|
|
|
643 except TypeError as nterr:
|
|
|
644 print(f"{prog_tag} [TEST] Nothing returned for global taxid:{glob_taxid}\tname:{nametmp}, err:{nterr}")
|
|
|
645
|
|
|
646 # remove redundant accnr
|
|
|
647 print(f"accnrlist_res to sort:{accnrlist_res}")
|
|
|
648 accnrlist_res = list(sort_uniq(accnrlist_res))
|
|
|
649 with open(acc_out_f, "w") as record_file:
|
|
|
650 for accnr in accnrlist_res:
|
|
|
651 record_file.write("%s\n" % (accnr))
|
|
|
652 # if empty list, we record a endofline only in the file to avoid a Galaxy error
|
|
|
653 if len(accnrlist_res) == 0:
|
|
|
654 with open(acc_out_f, "w") as record_file:
|
|
|
655 record_file.write("\n")
|
|
|
656
|
|
|
657 print(f"{prog_tag} {acc_out_f} file created")
|
|
|
658 print(f"{prog_tag} [TEST get_host_complete_genome_acc_nr_found] END")
|
|
|
659 sys.exit()
|
|
|
660 # ----------------------------------------------------------------------
|
|
|
661
|
|
|
662 # --------------------------------------------------------------------------
|
|
|
663 # MAIN
|
|
|
664 # --------------------------------------------------------------------------
|
|
|
665 ##### MAIN
|
|
|
666 def __main__():
|
|
|
667
|
|
|
668 # load taxid_acc file
|
|
|
669 load_taxids(taxid_acc_in_f,
|
|
|
670 taxidlist,
|
|
|
671 accnrlist,
|
|
|
672 True) # remove redundant taxids (avoid to search n times the same acc nr)
|
|
|
673
|
|
|
674 # load host db taxid acc
|
|
|
675 load_taxids(taxid_acc_hostdb_in_f,
|
|
|
676 taxidlisthosts,
|
|
|
677 accnrlisthosts,
|
|
|
678 False) # do not try to remove redundancy, there is no redundant taxid in db
|
|
|
679
|
|
|
680 # load NCBITaxa
|
|
|
681 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory)
|
|
|
682 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)
|
|
|
683 ncbi = NCBITaxa(dbfile=ncbi_tax_f)
|
|
|
684 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:
|
|
|
685 try:
|
|
|
686 ncbi.update_taxonomy_database()
|
|
|
687 except:
|
|
|
688 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
|
|
|
689
|
|
|
690 # global final results: accnrlist
|
|
|
691 accnrlist_res = []
|
|
|
692
|
|
|
693 # for each taxid found in res
|
|
|
694 for taxid_u in taxidlist:
|
|
|
695
|
|
|
696 # get complete lineage: accept ONLY leave taxid? (species)
|
|
|
697 lineage = ncbi.get_lineage(int(taxid_u))
|
|
|
698 # get species and names
|
|
|
699 # accnrlisttmp = acc nr of found complete genomes
|
|
|
700 nametmp = ncbi.translate_to_names(lineage)
|
|
|
701
|
|
|
702 taxid_u_l = [taxid_u]
|
|
|
703
|
|
|
704 """
|
|
|
705 speciestmp = ''
|
|
|
706 # get species name
|
|
|
707 for taxid in lineage.reversed:
|
|
|
708 if ncbi.get_rank(taxid) == 'species':
|
|
|
709 speciestmp = ncbi.get_taxid_translator(taxid)
|
|
|
710 break
|
|
|
711 """
|
|
|
712
|
|
|
713 if b_verbose:
|
|
|
714 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{nametmp}")
|
|
|
715
|
|
|
716 try:
|
|
|
717 # check in ncbi taxonomy which acc number are in and out of given taxid
|
|
|
718 accnrlisttmp = get_host_complete_genome_acc_nr_found(
|
|
|
719 taxidlisthosts,
|
|
|
720 accnrlisthosts,
|
|
|
721 taxid_u_l,
|
|
|
722 # for further taxonomy analysis
|
|
|
723 curr_index_in_lineage,
|
|
|
724 lineage,
|
|
|
725 ncbi)
|
|
|
726 accnrlist_res.extend(accnrlisttmp)
|
|
|
727 except TypeError as nterr:
|
|
|
728 print(f"{prog_tag} Nothing returned for global taxid:{taxid_u}")
|
|
|
729
|
|
|
730
|
|
|
731 # remove redundant accnr
|
|
|
732 print(f"accnrlist_res to sort:{accnrlist_res}")
|
|
|
733 accnrlist_res = list(sort_uniq(accnrlist_res))
|
|
|
734 with open(acc_out_f, "w") as record_file:
|
|
|
735 for accnr in accnrlist_res:
|
|
|
736 record_file.write("%s\n" % (accnr))
|
|
|
737
|
|
|
738 # if empty list, we record a endofline only in the file to avoid a Galaxy error
|
|
|
739 if len(accnrlist_res) == 0:
|
|
|
740 with open(acc_out_f, "w") as record_file:
|
|
|
741 record_file.write("\n")
|
|
|
742
|
|
|
743 # ------------------------------------------
|
|
|
744
|
|
|
745 print(f"{prog_tag} {acc_out_f} file created")
|
|
|
746
|
|
|
747 # --------------------------------------------------------------------------
|
|
|
748 #### MAIN END
|
|
|
749 if __name__ == "__main__": __main__()
|
|
|
750
|