annotate TAXID_genusexpand_taxid2acc_offline_searchinhostcompletegenomedb.py @ 3:c6fc401b61b2 draft default tip

Change version of natsort for conda
author p.lucas
date Thu, 28 Sep 2023 13:01:06 +0000
parents e5b355a33841
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
1 #!/usr/bin/env python3
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
2 # -*- coding: utf-8 -*-
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
3 ###
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
4 # USE PYTHON3
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
6 # provide 2 files:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
9 ###
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
10
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
11 ### Libraries to import:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
12 import argparse, os, sys, csv, warnings, re, itertools, operator
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
13 #from subprocess import Popen,PIPE
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
14 import subprocess
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
15 from os import path
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
16 from natsort import natsorted
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
17 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
18 from ete3 import NCBITaxa
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
19
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
20 # to be able to report line number in error messages
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
21 import inspect
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
22 def lineno():
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
23 """Returns the current line number in our program."""
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
24 return str(inspect.currentframe().f_back.f_lineno)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
25
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
26
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
27 # debug
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
28 test_dir = 'test_TAXID_genusexpand_taxid2acc_offline_searchinhostcompletegenomedb/'
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
29 b_test_load_taxids = False # ok 2023 09 11 with rm redundant taxid code
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
30 b_test_get_host_complete_genome_acc_nr_found = False # ok 2023 09 11
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
31
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
32 prog_tag = '[' + os.path.basename(__file__) + ']'
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
33
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
34 # boolean to know if we dowload ncbi taxonomy file in current env
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
35 b_load_ncbi_tax_f = False
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
36
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
37 # list of interesting taxid (fathers)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
38 taxidlist_f = ''
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
39 taxidlist = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
40 accnrlist = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
41 taxidlisthosts = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
42 accnrlisthosts = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
43
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
44 # order = -4
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
45 # family or clade = -3
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
46 # subtribe or genus = -2
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
47 curr_index_in_lineage = -1
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
48 min_index_in_lineage = -4
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
49
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
50 # boolean to know if we download ncbi taxonomy file in current env
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
51 b_load_ncbi_tax_f = False
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
52 b_test_all = False
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
53 b_test = False
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
54 b_acc_in_f = False
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
55 b_acc_hostdb_in_f = False
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
56 b_acc_out_f = False
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
57
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
58 b_verbose = False
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
59
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
60 # variables for ncbi-genome-download
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
61 ncbigenomedownload_section = 'refseq' # genbank
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
62 organisms_to_search_in = 'vertebrate_other,vertebrate_mammalian,plant,invertebrate'
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
63 assembly_levels = 'complete,chromosome'
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
64
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
65 # rank = '' # rank level retained by user
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
66 # rank_num = index of rank retained by user
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
67
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
68 # # set to check that provided rank exist to be sure to be able to use it
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
69 # ranks = {
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
70 # 'superkingdom' => 0,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
71 # # 'clade', # no, several clade, name is too generic
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
72 # 'kingdom' => 1,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
73 # 'phylum' => 2,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
74 # 'subphylum' => 3,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
75 # 'superclass' => 4,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
76 # 'class' => 5,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
77 # 'superorder' => 6,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
78 # 'order' => 7,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
79 # 'suborder' => 8,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
80 # 'infraorder' => 9,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
81 # 'parvorder' => 10,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
82 # 'superfamily' => 11,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
83 # 'family' => 12,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
84 # 'subfamily' => 13,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
85 # 'genus' => 14,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
86 # 'species' => 15,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
87 # 'subspecies' => 16
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
88 # }
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
89
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
90 parser = argparse.ArgumentParser()
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
91 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
92 help="taxid acc_number list in tsv (tabular separated at each line)",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
93 metavar="FILE")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
94 parser.add_argument("-d", "--taxid_acc_hostdb_in_f", dest='taxid_acc_hostdb_in_f',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
95 help="taxid acc_number list in tsv (tabular separated at each line) for HOSTS",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
96 metavar="FILE")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
97 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
98 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
99 metavar="FILE")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
100 # parser.add_argument("-r", "--rank", dest='rank',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
101 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
102 # action="store_const")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
103 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
104 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
105 metavar="FILE")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
106 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
107 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
108 action='store_true')
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
109 parser.add_argument("-z", "--test_all", dest='b_test_all',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
110 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
111 action='store_true')
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
112 parser.add_argument("-v", "--verbose", dest='b_verbose',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
113 help="[Optional] To have details on records when running",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
114 action='store_true')
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
115 parser.set_defaults(b_load_ncbi_tax_f=False)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
116 parser.set_defaults(b_test_all=False)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
117 parser.set_defaults(b_verbose=False)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
118
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
119 args = parser.parse_args()
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
120
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
121 # -------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
122 # check arguments
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
123 b_test_all = args.b_test_all
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
124
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
125 if b_test_all:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
126 b_test_load_taxids = False
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
127 b_test_get_host_complete_genome_acc_nr_found = True
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
128 b_test = True
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
129 b_acc_in_f = True
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
130 b_acc_hostdb_in_f = True
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
131 b_acc_out_f = True
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
132 else:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
133 b_test = (b_test_load_taxids or
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
134 b_test_get_host_complete_genome_acc_nr_found)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
135
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
136 if ((not b_test)and
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
137 ((len(sys.argv) < 2) or (len(sys.argv) > 7))):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
138 print("\n".join([
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
139 "Aim: find accession numbers of complete genomes related to provided taxids.",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
140 "If not found at species level, try at upper taxonomic level until order.",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
141 "Retains only 1 complete genome is several available:",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
142 " - the one with the highest version number, if not sufficient",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
143 " - the one with the highest accession number",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
144 "To use this scripts, run:",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
145 "conda activate TAXID_genusexpand_taxid2acc",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
146 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
147 " ",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
148 "Then you won't need --test_all --load_ncbi_tax_f options\n\n",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
149 "Then, as an example:\n\n",
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
150 ' '.join(['./TAXID_genusexpand_taxid2acc.py',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
151 '-i taxid_accnr_list.tsv',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
152 '-d taxid_accnr_hostdb_list.tsv',
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
153 '-o accnr_out_list.txt']),"\n\n" ]),
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
154 "For current hostcompletegenome db, taxid_accnr_hostdb_list.tsv file is ../taxid_lists/host_complete_genomes_taxid_accnr.tsv"
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
155 )
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
156 parser.print_help()
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
157 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + " arguments, exit line "+lineno())
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
158 sys.exit(0)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
159
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
160 # print('args:', args)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
161 # if(not b_test):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
162 if args.ncbi_tax_f is not None:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
163 # get absolute path in case of files
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
164 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
165 else:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
166 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
167 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
168 if args.taxid_acc_in_f is not None:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
169 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
170 b_acc_in_f = True
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
171 elif(not b_test):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
172 sys.exit("[Error] You must provide taxid_acc_in_f")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
173 if args.taxid_acc_hostdb_in_f is not None:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
174 taxid_acc_hostdb_in_f = os.path.abspath(args.taxid_acc_hostdb_in_f)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
175 b_acc_hostdb_in_f = True
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
176 elif(not b_test):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
177 sys.exit("[Error] You must provide taxid_acc_hostdb_in_f")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
178 if args.acc_out_f is not None:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
179 acc_out_f = os.path.abspath(args.acc_out_f)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
180 b_acc_out_f = True
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
181 elif(not b_test):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
182 sys.exit("-acc_out_f <accnr_file>n must be provided\n")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
183 # if args.rank is not None:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
184 # rank = 'genus'
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
185 # else:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
186 # rank = args.rank
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
187
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
188 if args.b_verbose is not None:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
189 b_verbose = int(args.b_verbose)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
190
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
191 if(not b_test):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
192 if (not b_acc_in_f) and (not b_acc_out_f) and (not b_acc_hostdb_in_f):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
193 sys.exit(prog_tag + "[Error] You must provide either --taxid_acc_f <file> and --taxid_acc_hostdb_in_f <file> and -taxid_acc_out_f <file>")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
194
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
195 # # store index of the rank expected by user
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
196 # rank_num = ranks{ rank }
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
197
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
198 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
199 # to sort uniq, for a list, only need to add list conversion
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
200 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
201 mapper= map # Python ≥ 3
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
202 def sort_uniq(sequence) -> list:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
203 return mapper(
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
204 operator.itemgetter(0),
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
205 itertools.groupby(sorted(sequence)))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
206 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
207
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
208 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
209 # Procedure load taxid acc list, return taxidlist
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
210 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
211 def load_taxids(taxid_acc_tabular_f: str,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
212 taxid_list_l: list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
213 accnr_list_l: list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
214 # Need to remove identical taxid in megablast res to avoid
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
215 # thousand times the same search
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
216 # Not needed for host_db_complet_genome_file
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
217 b_rm_duplicated_taxid: bool):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
218
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
219 if not path.exists(taxid_acc_tabular_f):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
220 sys.exit("Error " + taxid_acc_tabular_f +
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
221 " file does not exist, line "+ lineno() )
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
222
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
223 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort -u "
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
224
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
225 for line in os.popen(cmd).readlines():
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
226 if line.rstrip() != "":
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
227 k, v = line.rstrip().split()
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
228 taxid_list_l.append(int(k))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
229 accnr_list_l.append(v)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
230 # print(f"last item added to accnr_list_l:{accnr_list_l[-1]}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
231
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
232 # if asked, keep only non redundant taxids
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
233 if b_rm_duplicated_taxid:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
234 # create set (uniq taxids)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
235 taxid_list_uniq_set = set(taxid_list_l)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
236 # convert to list
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
237 taxid_list_uniq = list(taxid_list_uniq_set)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
238
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
239 if b_test_load_taxids or b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
240 print(f"{prog_tag} [load_taxids] number taxids:{len(taxid_list_l)}, uniq_taxids:{len(taxid_list_uniq)}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
241 accnr_list_uniq = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
242 for taxid_s in list(taxid_list_uniq):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
243 # find index of uniq taxid in the original list of taxid
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
244 taxid_index = taxid_list_l.index(taxid_s)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
245 # add deduced related first accession number
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
246 accnr_list_uniq.append( accnr_list_l[taxid_index] )
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
247 if b_test_load_taxids or b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
248 print(f"{prog_tag} [load_taxids] number accnrs:{len(accnr_list_l)}, uniq_accnrs:{len(accnr_list_uniq)}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
249 # replace original taxid accnr lists by the one with non redundant taxids
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
250 # this syntaxe force copy in original list (write over)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
251 taxid_list_l[:] = taxid_list_uniq
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
252 accnr_list_l[:] = accnr_list_uniq
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
253 if b_test_load_taxids or b_test:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
254 print(f"{prog_tag} [load_taxids] FINAL uniq_taxids:{len(taxid_list_l)} uniq_accnrs:{len(accnr_list_l)}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
255 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
256
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
257 # test load_taxids function
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
258 # display taxidlist, then exit
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
259 if b_test_load_taxids:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
260 taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_testrmredundanttaxids.tsv'
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
261 print("START b_test_load_taxids REDUND")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
262 print("loading "+taxid_acc_tabular_f+" file")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
263 taxid_list = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
264 accnr_list = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
265 load_taxids(taxid_acc_tabular_f,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
266 taxid_list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
267 accnr_list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
268 True)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
269 for i in range(len(taxid_list)):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
270 print(f"{taxid_list[i]}\t{accnr_list[i]}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
271 print(f"{prog_tag} [load_taxids] FINAL taxid_list:{len(taxid_list)} accnr_list:{len(accnr_list)}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
272 print("END b_test_load_taxids REDUND")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
273
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
274 taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
275 print("START b_test_load_taxids")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
276 print("loading "+taxid_acc_tabular_f+" file")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
277 taxid_list = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
278 accnr_list = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
279 load_taxids(taxid_acc_tabular_f,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
280 taxid_list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
281 accnr_list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
282 False)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
283 for i in range(len(taxid_list)):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
284 print(f"{taxid_list[i]}\t{accnr_list[i]}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
285 print("END b_test_load_taxids")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
286 if not b_test_get_host_complete_genome_acc_nr_found:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
287 sys.exit()
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
288 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
289
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
290 # # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
291 # # needs internet connexion, not possible
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
292 # # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
293 # def get_leave_taxid_from_acc_nr(accnrlist):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
294
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
295 # # deduce a list of taxid from a list of accession numbers
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
296 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u"
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
297 # for line in os.popen(cmd).readlines():
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
298 # taxidlist.append(line.rstrip())
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
299
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
300 # return taxidlist
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
301 # # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
302
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
303 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
304 # function to retain the most recent acc nr for host complete genome found:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
305 # - return acc nr of most recent complete genome
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
306 # - print accnr species and name retained
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
307 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
308 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
309 def retain_1accnr(accnrlisttmp: list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
310 speciestmp: list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
311 nametmp: list) -> list:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
312
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
313 max_accnr_version = 0
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
314 curr_accnr_version = 0
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
315 max_accnr_nr = 0
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
316 curr_accnr_nr = 0
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
317 kept_accnr_i = 0
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
318 p = re.compile("(.*?(\d+)\.(\d+))$")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
319
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
320 print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}), line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
321
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
322 for iacc in range(len(accnrlisttmp)):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
323 m = p.match( accnrlisttmp[iacc] )
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
324 if m:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
325 curr_accnr = m.group(1)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
326 curr_accnr_version = int(m.group(3))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
327 accnr_nr = int(m.group(2))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
328 # print(f"curr_accnr_version:{curr_accnr_version} accnr_nr:{accnr_nr}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
329 if curr_accnr_version > max_accnr_version:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
330 max_accnr_version = curr_accnr_version
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
331 kept_accnr_i = iacc
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
332 # print(f"{prog_tag} record kept_accnr_i:{kept_accnr_i} for accnr:{curr_accnr}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
333 elif(( curr_accnr_version == max_accnr_version)and
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
334 (curr_accnr_nr > max_accnr_nr)):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
335 max_accnr_nr = curr_accnr_nr
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
336 kept_accnr_i = iacc
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
337 # print(f"{prog_tag} record kept_accnr_i:{kept_accnr_i} for accnr:{curr_accnr}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
338 elif b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
339 print(f"{prog_tag} keep kept_accnr_i:{kept_accnr_i} for accnr:{curr_accnr}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
340
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
341 else:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
342 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[iacc]}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
343 print(f"{prog_tag} kept_accnr_i:{kept_accnr_i}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
344 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
345 kept_accn = [ accnrlisttmp[kept_accnr_i] ]
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
346
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
347 return kept_accn
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
348 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
349
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
350
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
351 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
352 # Function to find complete genome closely related to current taxid
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
353 # goes upper in taxonomy if nothing found until order
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
354 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
355 def ngd_upper_lineage(curr_index_in_lineage: int,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
356 lineage: list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
357 ncbi: NCBITaxa
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
358 ) -> list:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
359 print(f"{prog_tag} [ngd_upper_lineage] with curr_index_in_lineage:{curr_index_in_lineage}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
360
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
361 leaves_taxids_ints = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
362
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
363 # deduce up rank, search complet genome/chr in
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
364 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
365 rank = ncbi.get_rank([lineage[curr_index_in_lineage]])
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
366 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]])
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
367 print(f"{prog_tag} [ngd_upper_lineage] test with taxid:{upper_taxid} corresponding to rank:{rank}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
368 leaves_taxids_ints = ncbi.get_descendant_taxa(upper_taxid,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
369 intermediate_nodes=False,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
370 collapse_subspecies=False,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
371 return_tree=False
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
372 )
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
373
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
374 # int conversion to strings
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
375 leaves_taxids = list(map(str, leaves_taxids_ints))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
376 leaves_taxids_list = ','.join(leaves_taxids)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
377
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
378 if b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
379 print(f"{prog_tag} [ngd_upper_lineage] number leaves_taxids returned for taxid {upper_taxid}:{len(leaves_taxids)}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
380 return leaves_taxids_ints
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
381
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
382
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
383 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
384
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
385 # ----------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
386 # Function that Intersect 2 lists of taxids:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
387 # - host_complete_genome_taxids
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
388 # (host_complete_genome_acc_numbers = acc numbers related to host_complete_genome_taxids: same index pos)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
389 # - leaves_taxids (taxids where we search for host complete genome taxids)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
390
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
391 # - record taxid/accnr found in both
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
392 # - get not_found_taxids: taxids not found in host complet genome taxids
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
393 # - get uppertaxid for not_found_taxids by running ngd_upper_lineage
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
394
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
395 # Return host_complete_genomes_acc_nr, the list of complete genome acc number found related to leaves_taxids
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
396 # ----------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
397 def get_host_complete_genome_acc_nr_found( host_complete_genome_taxids: list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
398 host_complete_genome_acc_numbers: list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
399 leaves_taxids: list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
400 # for further taxonomy analysis
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
401 curr_index_in_lineage: int,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
402 lineage: list,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
403 ncbi: NCBITaxa) -> list:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
404
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
405 # -------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
406 # part shared for all taxid searched (comparison to host taxids from db)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
407
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
408 # initi local working var
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
409 host_complete_genomes_acc_nr = [] # result
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
410 host_complete_genomes_acc_nr_set = set() # result (set)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
411
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
412 # get host complete genomes as a set
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
413 host_complete_genome_taxids_set = set() # knwown host complete genomes
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
414 for host_taxid in host_complete_genome_taxids:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
415 host_complete_genome_taxids_set.add(host_taxid)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
416 # -------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
417
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
418 print(f"{prog_tag} [get_host_complete_genome_acc_nr_found] curr_index_in_lineage:{curr_index_in_lineage} treating {len(leaves_taxids)} taxid(s):{leaves_taxids[0]}...")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
419
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
420 # get result taxids as a set
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
421 leaves_taxids_set = set() # taxid found in results
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
422 for res_taxid in leaves_taxids:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
423 leaves_taxids_set.add(res_taxid)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
424
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
425 print(f"{prog_tag} We intersect:{len(host_complete_genome_taxids)} host complete genomes with")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
426 #\n{host_complete_genome_taxids}\n
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
427 print(f"{prog_tag} :{len(leaves_taxids)} genomes in result, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
428 #\n{leaves_taxids}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
429
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
430 # do intersection for taxids shared (host complete genomes found in results)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
431 host_complete_genomes_taxids_intersect_leaves_set = leaves_taxids_set.intersection(host_complete_genome_taxids_set)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
432 host_complete_genomes_taxids_intersect_leaves_list = list(host_complete_genomes_taxids_intersect_leaves_set)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
433
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
434 if b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
435 print(f"{prog_tag} After intersec, we retain set :{len(host_complete_genomes_taxids_intersect_leaves_set)} taxid to get related acc nr, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
436 #\n{host_complete_genomes_taxids_intersect_leaves_set}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
437 print(f"{prog_tag} After intersec, we retain list:{len(host_complete_genomes_taxids_intersect_leaves_list)} taxid to get related acc nr, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
438 #\n{host_complete_genomes_taxids_intersect_leaves_list}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
439
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
440 # get taxid not found to go up in taxonomy, get leave taxids again and cross again with
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
441 # available taxid/accnr found in hot complete genome db
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
442 not_found_taxids_set = list(leaves_taxids_set.difference(host_complete_genomes_taxids_intersect_leaves_set))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
443 print(f"{prog_tag} Number of not_found_taxids:{len(not_found_taxids_set)}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
444
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
445 # get index of retained taxids in original host_complete_genome_taxids to deduce related acc numbers
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
446 try:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
447 # -----------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
448 # search indices of elements of host_complete_genomes_taxids_list in host_complete_genome_taxids
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
449 host_complete_genomes_taxids_indexes = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
450 for elt in host_complete_genomes_taxids_intersect_leaves_list:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
451 host_complete_genomes_taxids_indexes.append(host_complete_genome_taxids.index(elt))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
452 # -----------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
453 if len(host_complete_genomes_taxids_intersect_leaves_list) != len(host_complete_genomes_taxids_indexes):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
454 sys.exit(f"{prog_tag} number of indexes retained != number of taxids retained: NOT NORMAL, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
455 # print(f"{prog_tag} After intersec, nr indexes retained: {len(host_complete_genomes_taxids_indexes)}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
456
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
457 # get acc numbers, sorted
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
458 host_complete_genomes_acc_nr = natsorted( [ host_complete_genome_acc_numbers[i] for i in host_complete_genomes_taxids_indexes ] )
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
459 print(f"{prog_tag} After intersec, nr accnr retained: {len(host_complete_genomes_acc_nr)}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
460
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
461 if len(host_complete_genomes_acc_nr) == 0:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
462 print(f"no acc nr found for provided leaves_taxids_ints, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
463 elif len(host_complete_genomes_acc_nr) == 1:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
464 accnr = host_complete_genomes_acc_nr[0]
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
465 taxid = host_complete_genomes_taxids_intersect_leaves_list[0]
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
466 name = list(ncbi.get_taxid_translator(host_complete_genomes_taxids_intersect_leaves_list).values())[0]
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
467 species = name
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
468 print(f"retained accnr:{accnr}\tspecies:{species}\tname:{name}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
469 return host_complete_genomes_acc_nr
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
470 elif len(host_complete_genomes_acc_nr) > 1:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
471 accnrlisttmp = host_complete_genomes_acc_nr
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
472 nametmp = list(ncbi.get_taxid_translator(host_complete_genomes_taxids_intersect_leaves_list).values())
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
473 speciestmp = nametmp
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
474 print(f"{prog_tag} retain_1accnr in {accnrlisttmp}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
475 return retain_1accnr(accnrlisttmp, speciestmp, nametmp)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
476 else:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
477 sys.exit(f"{prog_tag} [Error] Case not treated len of found_complete_genomes:{len(host_complete_genomes_acc_nr)}, curr_index_in_lineage:{curr_index_in_lineage}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
478
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
479 except ValueError:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
480 """
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
481 # specific to retain_1accn to avoid lists are crashed by other ngd call
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
482 accnrlisttmp_r = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
483 speciestmp_r = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
484 nametmp_r = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
485 """
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
486
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
487 # goes up in taxonomy to get a larger number of taxids to cross with host genome taxids
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
488 if( (len(host_complete_genomes_acc_nr) == 0) and (curr_index_in_lineage > min_index_in_lineage)):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
489 curr_index_in_lineage = curr_index_in_lineage - 1
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
490 upper_taxid = lineage[curr_index_in_lineage]
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
491 rank = ncbi.get_rank(upper_taxid[curr_index_in_lineage])
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
492 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name}), line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
493 if b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
494 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
495
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
496 # get complete lineage: accept ONLY leave taxid? (species)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
497 # name = ncbi.get_taxid_translator(upper_taxid)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
498 if b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
499 print(f"taxid:{upper_taxid}\tlineage:{lineage}\tname:{name}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
500 leave_taxid_ints = ngd_upper_lineage( curr_index_in_lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
501 lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
502 ncbi
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
503 )
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
504 # we search if one of the accession numbers of complete genome is included in leaves_taxids_ints
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
505 if b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
506 print(f"{prog_tag} get_host_complete_genome_acc_nr_found(\nhost_complete_genome_taxids,\nhost_complete_genome_acc_numbers,\nleaves_taxids_ints) ran")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
507
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
508 return get_host_complete_genome_acc_nr_found(
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
509 host_complete_genome_taxids,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
510 host_complete_genome_acc_numbers,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
511 leave_taxid_ints,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
512 # for further taxonomy analysis
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
513 curr_index_in_lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
514 lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
515 ncbi)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
516
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
517 # retain only the most recent complete genome for current treated taxid
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
518 # return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
519 elif len(host_complete_genomes_acc_nr) == 0:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
520 print(f"no acc nr found for provided leaves_taxids_ints, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
521 return ''
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
522 elif len(host_complete_genomes_acc_nr) == 1:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
523 print(f"retained accnr:{accnr}\tspecies:{species}\tname:{name}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
524 return host_complete_genomes_acc_nr
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
525 elif len(host_complete_genomes_acc_nr) > 1:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
526 accnrlisttmp = host_complete_genomes_acc_nr
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
527 speciestmp = list(ncbi.get_taxid_translator(host_complete_genomes_taxids_intersect_leaves_list).values())
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
528 nametmp = speciestmp
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
529 print(f"{prog_tag} retain_1accnr in {accnrlisttmp}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
530 return retain_1accnr(accnrlisttmp, speciestmp, nametmp)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
531 else:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
532 sys.exit(f"{prog_tag} [Error] Case not treated len of found_complete_genomes:{len(host_complete_genomes_acc_nr)}, curr_index_in_lineage:{curr_index_in_lineage}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
533
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
534
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
535 # goes up in taxonomy to get a larger number of taxids to cross with host genome taxids
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
536 if( (len(host_complete_genomes_acc_nr) == 0) and (curr_index_in_lineage > min_index_in_lineage)):
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
537 curr_index_in_lineage = curr_index_in_lineage - 1
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
538 upper_taxid_int = lineage[curr_index_in_lineage]
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
539 upper_taxid = str(upper_taxid_int)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
540 rank = ncbi.get_rank([upper_taxid_int])
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
541 name = ncbi.get_taxid_translator([upper_taxid_int])
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
542 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name}), line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
543 if b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
544 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
545
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
546 # get complete lineage: accept ONLY leave taxid? (species)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
547 if b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
548 print(f"taxid:{upper_taxid}\tlineage:{lineage}\tname:{name}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
549
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
550 # get taxids for current lineage up some lines earlier, we want to get leave taxids for this species/group/genus/family etc...
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
551 leave_taxid_ints = ngd_upper_lineage( curr_index_in_lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
552 lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
553 ncbi
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
554 )
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
555 print(f"{prog_tag} number of leave_taxid_ints obtained for rank:{rank}: {len(leave_taxid_ints)}, line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
556
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
557 # we search if one of the accession numbers of complete genome is included in leaves_taxids_ints
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
558 if b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
559 print(f"{prog_tag} get_host_complete_genome_acc_nr_found(\nhost_complete_genome_taxids,\nhost_complete_genome_acc_numbers,\nleaves_taxids_ints) ran")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
560
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
561 return get_host_complete_genome_acc_nr_found(
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
562 host_complete_genome_taxids,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
563 host_complete_genome_acc_numbers,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
564 leave_taxid_ints,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
565 # for further taxonomy analysis
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
566 curr_index_in_lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
567 lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
568 ncbi)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
569
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
570
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
571 if b_test_get_host_complete_genome_acc_nr_found:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
572 taxid_acc_in_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
573 taxid_acc_hostdb_in_f = test_dir + 'host_complete_genomes_taxid_accnr.tsv'
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
574 acc_out_f = test_dir + 'megablast_out_f_taxid_acc_hostexpanded.tsv'
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
575 # taxid_u = ['4520','4530','9606']
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
576 # we must obtain retained
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
577 # accnr:GCF_022539505.1 species:Lolium rigidum name:na
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
578 # accnr:GCF_000231095.2 species:Oryza brachyantha name:na
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
579 # accnr:GCF_000001405.40 species:Homo sapiens name:na
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
580 taxid_u = [126889,4520,4530,9187,9606]
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
581 # test case of empty list result, we expect a file with only a endofline
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
582 # taxid_u = [9187] # ok 2023/09/21
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
583
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
584 # taxid_u_str = sorted(map(str, taxid_u), key=int)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
585
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
586 # taxid_u = [4530] # oriza
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
587
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
588 # load taxid_acc file
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
589 load_taxids(taxid_acc_in_f,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
590 taxidlist,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
591 accnrlist,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
592 True) # remove redundant taxids
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
593
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
594 # load host db taxid acc
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
595 load_taxids(taxid_acc_hostdb_in_f,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
596 taxidlisthosts,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
597 accnrlisthosts,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
598 False) # no need to remove redundant taxids
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
599
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
600 # load NCBITaxa
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
601 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
602 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
603 ncbi = NCBITaxa(dbfile=ncbi_tax_f)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
604 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
605 try:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
606 ncbi.update_taxonomy_database()
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
607 except:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
608 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
609
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
610 print(f"{prog_tag} [TEST get_host_complete_genome_acc_nr_found] START")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
611 # check in ncbi taxonomy which acc number are in and out of given taxid
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
612
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
613 accnrlist_res = [] # we do not want initial acc nr not corresponding to complete genomes
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
614 # = accnrlist # we initialize with acc nr already found in results because
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
615 # we expect to increase the number of res
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
616 for glob_taxid in taxid_u:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
617
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
618 accnrlisttmptest = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
619
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
620 # important
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
621 curr_index_in_lineage = -1
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
622
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
623 # get complete lineage: accept ONLY leave taxid? (species)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
624 lineage = ncbi.get_lineage(glob_taxid)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
625 # get species and names
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
626 # accnrlisttmp = acc nr of found complete genomes
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
627 nametmp = list(ncbi.get_taxid_translator(lineage).values())
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
628
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
629 print(f"{prog_tag} We search for host complete genome related to taxid:{glob_taxid}\tname:{nametmp}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
630
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
631 try:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
632 accnrlisttmptest = get_host_complete_genome_acc_nr_found(
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
633 taxidlisthosts,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
634 accnrlisthosts,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
635 [glob_taxid],
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
636 # for further taxonomy analysis
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
637 curr_index_in_lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
638 lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
639 ncbi)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
640 accnrlist_res.extend(accnrlisttmptest)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
641 print(f"{prog_tag} [TEST] add {len(accnrlisttmptest)} accnr to existing {len(accnrlist_res)} accnr")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
642 print(f"{prog_tag} [TEST] accnrlisttmptest:{accnrlisttmptest} line {lineno()}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
643 except TypeError as nterr:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
644 print(f"{prog_tag} [TEST] Nothing returned for global taxid:{glob_taxid}\tname:{nametmp}, err:{nterr}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
645
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
646 # remove redundant accnr
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
647 print(f"accnrlist_res to sort:{accnrlist_res}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
648 accnrlist_res = list(sort_uniq(accnrlist_res))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
649 with open(acc_out_f, "w") as record_file:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
650 for accnr in accnrlist_res:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
651 record_file.write("%s\n" % (accnr))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
652 # if empty list, we record a endofline only in the file to avoid a Galaxy error
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
653 if len(accnrlist_res) == 0:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
654 with open(acc_out_f, "w") as record_file:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
655 record_file.write("\n")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
656
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
657 print(f"{prog_tag} {acc_out_f} file created")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
658 print(f"{prog_tag} [TEST get_host_complete_genome_acc_nr_found] END")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
659 sys.exit()
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
660 # ----------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
661
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
662 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
663 # MAIN
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
664 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
665 ##### MAIN
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
666 def __main__():
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
667
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
668 # load taxid_acc file
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
669 load_taxids(taxid_acc_in_f,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
670 taxidlist,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
671 accnrlist,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
672 True) # remove redundant taxids (avoid to search n times the same acc nr)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
673
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
674 # load host db taxid acc
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
675 load_taxids(taxid_acc_hostdb_in_f,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
676 taxidlisthosts,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
677 accnrlisthosts,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
678 False) # do not try to remove redundancy, there is no redundant taxid in db
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
679
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
680 # load NCBITaxa
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
681 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
682 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
683 ncbi = NCBITaxa(dbfile=ncbi_tax_f)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
684 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
685 try:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
686 ncbi.update_taxonomy_database()
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
687 except:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
688 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
689
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
690 # global final results: accnrlist
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
691 accnrlist_res = []
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
692
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
693 # for each taxid found in res
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
694 for taxid_u in taxidlist:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
695
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
696 # get complete lineage: accept ONLY leave taxid? (species)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
697 lineage = ncbi.get_lineage(int(taxid_u))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
698 # get species and names
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
699 # accnrlisttmp = acc nr of found complete genomes
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
700 nametmp = ncbi.translate_to_names(lineage)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
701
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
702 taxid_u_l = [taxid_u]
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
703
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
704 """
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
705 speciestmp = ''
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
706 # get species name
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
707 for taxid in lineage.reversed:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
708 if ncbi.get_rank(taxid) == 'species':
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
709 speciestmp = ncbi.get_taxid_translator(taxid)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
710 break
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
711 """
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
712
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
713 if b_verbose:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
714 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{nametmp}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
715
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
716 try:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
717 # check in ncbi taxonomy which acc number are in and out of given taxid
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
718 accnrlisttmp = get_host_complete_genome_acc_nr_found(
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
719 taxidlisthosts,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
720 accnrlisthosts,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
721 taxid_u_l,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
722 # for further taxonomy analysis
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
723 curr_index_in_lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
724 lineage,
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
725 ncbi)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
726 accnrlist_res.extend(accnrlisttmp)
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
727 except TypeError as nterr:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
728 print(f"{prog_tag} Nothing returned for global taxid:{taxid_u}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
729
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
730
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
731 # remove redundant accnr
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
732 print(f"accnrlist_res to sort:{accnrlist_res}")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
733 accnrlist_res = list(sort_uniq(accnrlist_res))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
734 with open(acc_out_f, "w") as record_file:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
735 for accnr in accnrlist_res:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
736 record_file.write("%s\n" % (accnr))
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
737
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
738 # if empty list, we record a endofline only in the file to avoid a Galaxy error
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
739 if len(accnrlist_res) == 0:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
740 with open(acc_out_f, "w") as record_file:
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
741 record_file.write("\n")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
742
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
743 # ------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
744
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
745 print(f"{prog_tag} {acc_out_f} file created")
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
746
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
747 # --------------------------------------------------------------------------
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
748 #### MAIN END
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
749 if __name__ == "__main__": __main__()
e5b355a33841 Uploaded python script v1
p.lucas
parents:
diff changeset
750