annotate TAXID_genusexpand_taxid2acc.py @ 12:19e175a84d0e draft

Uploaded updated python script
author p.lucas
date Thu, 21 Sep 2023 15:19:55 +0000
parents 3d116861e380
children 932ba9e04f3a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
1 #!/usr/bin/env python3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
2 # -*- coding: utf-8 -*-
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
3 ###
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
4 # USE PYTHON3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
6 # provide 2 files:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
9 ###
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
10
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
11 ### Libraries to import:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
12 import argparse, os, sys, csv, warnings, re, itertools, operator
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
13 #from subprocess import Popen,PIPE
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
14 import subprocess
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
15 from os import path
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
16 import ncbi_genome_download as ngd
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
17 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
18 from ete3 import NCBITaxa
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
19
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
20 # to be able to report line number in error messages
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
21 import inspect
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
def lineno():
    """Return, as a string, the line number from which this function was called."""
    # currentframe() is this function's own frame; its parent (f_back) is
    # the caller, whose f_lineno is the line currently being executed there.
    caller_frame = inspect.currentframe().f_back
    return str(caller_frame.f_lineno)
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
25
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
26
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
27 # debug
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
28 test_dir = 'test_TAXID_genusexpand_taxid2acc/'
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
29 b_test_load_taxids = False # ok 2023 08 25
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
30 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 08 25
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
31
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
32 prog_tag = '[' + os.path.basename(__file__) + ']'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
33
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
34 # boolean to know if we download ncbi taxonomy file in current env
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
35 b_load_ncbi_tax_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
36
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
37 # list of interesting taxid (fathers)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
38 taxidlist_f = ''
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
39 taxidlist = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
40 accnrlist = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
41
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
42 # order = -4
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
43 # family or clade = -3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
44 # subtribe or genus = -2
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
45 curr_index_in_lineage = -1
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
46 min_index_in_lineage = -4
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
47
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
48 # boolean to know if we download ncbi taxonomy file in current env
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
49 b_load_ncbi_tax_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
50 b_test_all = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
51 b_test = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
52 b_acc_in_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
53 b_acc_out_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
54
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
55 b_verbose = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
56
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
57 # variables for ncbi-genome-download
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
58 ncbigenomedownload_section = 'refseq' # genbank
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
59 organisms_to_search_in = 'vertebrate_other,vertebrate_mammalian,plant,invertebrate'
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
60 assembly_levels = 'complete,chromosome'
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
61
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
62 # rank = '' # rank level retained by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
63 # rank_num = index of rank retained by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
64
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
65 # # set to check that provided rank exist to be sure to be able to use it
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
66 # ranks = {
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
67 # 'superkingdom' => 0,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
68 # # 'clade', # no, several clade, name is too generic
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
69 # 'kingdom' => 1,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
70 # 'phylum' => 2,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
71 # 'subphylum' => 3,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
72 # 'superclass' => 4,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
73 # 'class' => 5,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
74 # 'superorder' => 6,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
75 # 'order' => 7,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
76 # 'suborder' => 8,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
77 # 'infraorder' => 9,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
78 # 'parvorder' => 10,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
79 # 'superfamily' => 11,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
80 # 'family' => 12,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
81 # 'subfamily' => 13,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
82 # 'genus' => 14,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
83 # 'species' => 15,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
84 # 'subspecies' => 16
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
85 # }
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
86
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
87 parser = argparse.ArgumentParser()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
88 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
89 help="taxid acc_number list in tsv (tabular separated at each line)",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
90 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
91 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f',
6
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
92 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree",
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
93 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
94 # parser.add_argument("-r", "--rank", dest='rank',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
95 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
96 # action="store_const")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
97 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
98 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
99 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
100 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
101 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
102 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
103 parser.add_argument("-z", "--test_all", dest='b_test_all',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
104 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
105 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
106 parser.add_argument("-v", "--verbose", dest='b_verbose',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
107 help="[Optional] To have details on records when running",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
108 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
109 parser.set_defaults(b_load_ncbi_tax_f=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
110 parser.set_defaults(b_test_all=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
111 parser.set_defaults(b_verbose=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
112
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
113 # get absolute path in case of files
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
114 args = parser.parse_args()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
115
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
116 # -------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
117 # check arguments
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
# Decide whether the script runs in test mode.
b_test_all = args.b_test_all

if b_test_all:
    # --test_all: enable the host-chromosome test and flag both the input
    # and the output accession files as available.
    b_test_load_taxids = False
    b_test_add_host_chr_taxids_accnr_from_ori_list = True
    b_test = True
    b_acc_in_f = True
    b_acc_out_f = True
else:
    # Otherwise test mode is driven only by the hard-coded debug flags.
    b_test = (b_test_load_taxids or
              b_test_add_host_chr_taxids_accnr_from_ori_list)

# Outside test mode, at least one command line argument is required.
# No upper bound is enforced on the argument count: argparse has already
# rejected unknown options, and a fixed maximum refused legitimate
# combinations such as "-i <file> -o <file> -v".
if (not b_test) and (len(sys.argv) < 2):
    print("\n".join([
        "Aim: find accession numbers of complete genomes related to provided taxids.",
        "If not found at species level, try at upper taxonomic level until order.",
        "Retains only 1 complete genome if several available:",
        " - the one with the highest version number, if not sufficient",
        " - the one with the highest accession number",
        "To use this scripts, run:",
        "conda activate TAXID_genusexpand_taxid2acc",
        "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f",
        " ",
        "Then you won't need --test_all --load_ncbi_tax_f options\n\n",
        "Then, as an example:\n\n",
        ' '.join(['./TAXID_genusexpand_taxid2acc.py',
                  '-i taxid_accnr_list.tsv',
                  '-o accnr_out_list.txt']), "\n\n"]))
    parser.print_help()
    print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
          " arguments, exit line "+lineno())
    # NOTE(review): exit status 0 is kept from the original even though an
    # error is reported; confirm whether callers rely on it.
    sys.exit(0)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
151
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
152 # print('args:', args)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
153 # if(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
# Resolve the file arguments to absolute paths and record which ones
# were supplied on the command line.
if args.ncbi_tax_f is not None:
    ncbi_tax_f = os.path.abspath(args.ncbi_tax_f)
else:
    # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
    ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")

if args.taxid_acc_in_f is not None:
    # The -i/--taxid_acc_in_f option is stored under dest 'taxid_acc_in_f';
    # reading any other attribute name raises AttributeError.
    taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f)
    b_acc_in_f = True
elif not b_test:
    sys.exit("[Error] You must provide taxid_acc_in_f")

if args.acc_out_f is not None:
    acc_out_f = os.path.abspath(args.acc_out_f)
    b_acc_out_f = True
elif not b_test:
    sys.exit("--acc_out_f <accnr_file> must be provided\n")
# if args.rank is not None:
#     rank = 'genus'
# else:
#     rank = args.rank

# b_verbose comes from a store_true option; it is kept as an int (0/1).
if args.b_verbose is not None:
    b_verbose = int(args.b_verbose)

# Final safety net outside test mode: neither file was provided.
if not b_test:
    if (not b_acc_in_f) and (not b_acc_out_f):
        sys.exit(prog_tag + "[Error] You must provide both --taxid_acc_in_f <file> and --acc_out_f <file>")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
180
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
181 # # store index of the rank expected by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
182 # rank_num = ranks{ rank }
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
183
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
184 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
185 # to sort uniq, for a list, only need to add list conversion
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
186 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
mapper = map  # Python ≥ 3


def sort_uniq(sequence: list):
    """Return a lazy iterator over the sorted, de-duplicated items of *sequence*.

    Wrap the result in ``list(...)`` to obtain a list.
    """
    ordered = sorted(sequence)
    # groupby on sorted input yields one (key, group) pair per distinct value;
    # only the key (element 0 of each pair) is kept.
    runs = itertools.groupby(ordered)
    take_key = operator.itemgetter(0)
    return mapper(take_key, runs)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
192 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
193
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
194 # --------------------------------------------------------------------------
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
195 # Procedure: load taxid acc list, fill the global taxidlist and accnrlist
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
196 # --------------------------------------------------------------------------
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
def load_taxids(taxid_acc_tabular_f: str):
    """Load the unique (taxid, accession number) pairs of a tabular file.

    Only the first two tab-separated columns of each line are used. The
    distinct pairs are sorted and appended to the module-level ``taxidlist``
    and ``accnrlist`` lists, so that the same index in both lists refers to
    the same input pair. Nothing is returned.

    Args:
        taxid_acc_tabular_f: path of the tab-separated input file.

    Raises:
        SystemExit: if the file does not exist.
        ValueError: if a non-empty line does not provide exactly two
            whitespace-separated fields in its first two columns.
    """
    if not path.exists(taxid_acc_tabular_f):
        sys.exit("Error " + taxid_acc_tabular_f +
                 " file does not exist, line "+ lineno() )

    # The file is parsed directly instead of going through a
    # "cut -f 1,2 <file> | sort -u" shell pipeline: the file name no longer
    # has to survive shell interpretation (spaces, metacharacters) and no
    # pipe is left unclosed.
    unique_pairs = set()
    # errors="replace": undecodable bytes in columns we do not use must not
    # abort the load.
    with open(taxid_acc_tabular_f, encoding="utf-8", errors="replace") as tsv_f:
        for line_nr, line in enumerate(tsv_f, start=1):
            first_two_cols = "\t".join(line.rstrip("\n").split("\t")[:2])
            fields = first_two_cols.split()
            if not fields:
                # empty / blank line
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{taxid_acc_tabular_f}, line {line_nr}: expected "
                    f"'taxid<TAB>accession', got {line.rstrip()!r}")
            unique_pairs.add(tuple(fields))

    for taxid, accnr in sorted(unique_pairs):
        taxidlist.append(taxid)
        accnrlist.append(accnr)
        # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
211
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
212 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
213
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
214 # test load_taxids function
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
215 # display taxidlist, then exit
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
# Self-test of load_taxids: load the bundled example file, print every
# taxid / accession pair, then stop unless the host-chromosome test must
# also run.
if b_test_load_taxids:
    taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
    print("START b_test_load_taxids")
    print("loading "+taxid_acc_tabular_f+" file")
    load_taxids(taxid_acc_tabular_f)
    for idx, taxid in enumerate(taxidlist):
        print(f"{taxid}\t{accnrlist[idx]}")
    print("END b_test_load_taxids")
    if not b_test_add_host_chr_taxids_accnr_from_ori_list:
        sys.exit()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
226 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
227
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
228 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
229 # # needs internet connexion, not possible
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
230 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
231 # def get_leave_taxid_from_acc_nr(accnrlist):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
232
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
233 # # deduce a list of taxid from a list of accession numbers
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
234 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
235 # for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
236 # taxidlist.append(line.rstrip())
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
237
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
238 # return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
239 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
240
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
241 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
242 # function to retain the most recent acc nr for host complete genome found:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
243 # - return acc nr of most recent complete genome
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
244 # - print accnr species and name retained
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
245 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
246 # --------------------------------------------------------------------------
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
def retain_1accnr(accnrlisttmp: list, speciestmp: list, nametmp: list) -> str:
    """Return the most recent accession number among the candidates.

    Every accession number must end with ``<digits>.<version>`` (for
    example ``GCF_000001405.40``).  The candidate with the highest version
    is retained; when several candidates share that version, the one with
    the highest numeric accession part is retained, and on a complete tie
    the first one listed wins.  The retained accession number is printed
    together with its species and name.

    Args:
        accnrlisttmp: candidate accession numbers.
        speciestmp: species of each candidate, same order as accnrlisttmp.
        nametmp: name of each candidate, same order as accnrlisttmp.

    Returns:
        The retained accession number.

    Raises:
        ValueError: if accnrlisttmp is empty.
        SystemExit: if a candidate has no ``<digits>.<version>`` suffix.
    """
    if not accnrlisttmp:
        raise ValueError("retain_1accnr needs at least one accession number")

    # group(1): numeric part of the accession, group(2): version
    accnr_pattern = re.compile(r".*?(\d+)\.(\d+)$")

    best_key = None
    kept_accnr_i = 0
    for i, accnr in enumerate(accnrlisttmp):
        m = accnr_pattern.match(accnr)
        if m is None:
            sys.exit(f"{prog_tag} No version found for accnr:{accnr}")

        # compare on (version, accession number): version first, then the
        # accession number as tie-break; strict '>' keeps the first on ties
        key = (int(m.group(2)), int(m.group(1)))
        if best_key is None or key > best_key:
            best_key = key
            kept_accnr_i = i

    kept_accn = accnrlisttmp[kept_accnr_i]
    print(f"retained accnr:{kept_accn}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}")

    return kept_accn
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
281 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
282
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
283 # --------------------------------------------------------------------------
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
284 # Function to find complete genome closely related to current taxid
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
285 # goes up in the taxonomy (until order) if nothing is found
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
286 # --------------------------------------------------------------------------
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
def ngd_upper_lineage(curr_index_in_lineage: int,
                      lineage: list,
                      ncbi: NCBITaxa,
                      accnrlisttmp: list, # current working list
                      accnrlist: list,    # final list, if something added (or min index reached), recursivity stop
                      speciestmp: list,
                      nametmp: list
                      ) -> str:
    """Search a complete genome for the taxon at lineage[curr_index_in_lineage].

    All leaf taxids below that taxon are handed to ``ncbi-genome-download``
    in ``--dry-run`` mode.  When the tool reports that nothing matches and
    curr_index_in_lineage is still above the module-level
    min_index_in_lineage, the search is repeated one index lower in the
    lineage.  Otherwise every accession listed by the tool is collected and
    the most recent one is selected with retain_1accnr().

    Besides its arguments the function reads the module-level settings
    prog_tag, b_verbose, ncbigenomedownload_section, assembly_levels,
    organisms_to_search_in and min_index_in_lineage.

    Args:
        curr_index_in_lineage: index in lineage of the taxon to search under.
        lineage: taxids of the lineage being walked.
        ncbi: taxonomy handle used for rank, name and descendant lookups.
        accnrlisttmp, accnrlist, speciestmp, nametmp: not used for the
            search itself, only forwarded unchanged to the recursive call.

    Returns:
        The retained accession number, or None when no accession was
        listed for any tested taxon.

    Raises:
        SystemExit: if an output line is not made of exactly three
            tab-separated fields.
    """
    print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}")

    # deduce upper rank, search complete genome/chr in it
    upper_taxid = str(lineage[curr_index_in_lineage])
    rank = ncbi.get_rank([lineage[curr_index_in_lineage]])
    taxon_name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]])
    print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}")
    leaves_taxids = ncbi.get_descendant_taxa(upper_taxid,
                                             intermediate_nodes=False,
                                             collapse_subspecies=False,
                                             return_tree=False
                                             )

    # int conversion to strings
    leaves_taxids_list = ','.join(map(str, leaves_taxids))

    if b_verbose:
        print(f"{prog_tag} leaves_taxids for taxid {upper_taxid}:{leaves_taxids_list}")
    # NOTE(review): the command is a shell string built from module settings;
    # make sure those settings never carry untrusted text.
    cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {leaves_taxids_list} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1"
    if b_verbose:
        print(f"{prog_tag} cmd:{cmd}")

    # lists local to this call, so that nested calls cannot overwrite them
    accnrlisttmp_r = []
    speciestmp_r = []
    nametmp_r = []

    # the with-block closes the pipe once the whole output has been read
    with os.popen(cmd) as ngd_output:
        ngd_lines = ngd_output.readlines()

    for line in ngd_lines:
        if re.match("^Considering", line):
            continue

        if re.match("^(?:ERROR|Error): No downloads", line):
            print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{taxon_name})")
            if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted
                curr_index_in_lineage = curr_index_in_lineage - 1
                print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}")
                return ngd_upper_lineage(curr_index_in_lineage,
                                         lineage,
                                         ncbi,
                                         accnrlisttmp,
                                         accnrlist,
                                         speciestmp,
                                         nametmp
                                         )
            continue

        try:
            acc_nr, species, name = line.rstrip().split("\t")
        except ValueError as ve:
            sys.exit(f"ValueError {ve}: for split of line '{line}'")
        accnrlisttmp_r.append(acc_nr)
        speciestmp_r.append(species)
        nametmp_r.append(name)
        if b_verbose:
            print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")

    if not accnrlisttmp_r:
        # nothing listed: let the caller decide what to do
        return None

    # retain only the most recent complete genome for current treated taxid
    return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
357 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
358
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
359 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
360 # read taxids, deduce complete genomes available in GenBank, provide in output file
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
361 # the acc number in addition to those already listed
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
362 # --------------------------------------------------------------------------
12
19e175a84d0e Uploaded updated python script
p.lucas
parents: 11
diff changeset
def add_host_chr_taxids_accnr_from_ori_list(taxidlist: list,
                                            accnrlist: list,
                                            acc_out_f: str):
    """Add one complete-genome accession per taxid and write the final list.

    For every taxid, ``ncbi-genome-download`` is run in ``--dry-run`` mode
    (as an executable, not as a library).  When it lists accessions, the
    most recent one is selected with retain_1accnr().  When it reports
    that nothing matches, ngd_upper_lineage() is used to look for a genome
    higher in the lineage of the taxid.  Each accession found is appended
    to accnrlist; the list is then passed through sort_uniq() and written
    to acc_out_f, one accession number per line.

    Besides its arguments the function reads the module-level settings
    prog_tag, b_verbose, ncbi_tax_f, b_load_ncbi_tax_f,
    ncbigenomedownload_section, assembly_levels, organisms_to_search_in
    and curr_index_in_lineage.

    Args:
        taxidlist: taxids to find a complete genome for.
        accnrlist: accession numbers already known; new ones are appended
            to this list in place.
        acc_out_f: path of the output file to write.
    """
    # load NCBITaxa
    ncbi = NCBITaxa()   # Install ete3 db in local user file (.ete_toolkit/ directory)
    print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)
    ncbi = NCBITaxa(dbfile=ncbi_tax_f)
    if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:
        try:
            ncbi.update_taxonomy_database()
        except Exception:
            # best effort: a failed update is reported, not fatal
            warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")

    for taxid_u in taxidlist:
        # candidates found for the current taxid only, so that one taxid
        # cannot influence the accession retained for another
        accnrlisttmp = []
        speciestmp = []
        nametmp = []

        print(f"{prog_tag} treating global taxid:{taxid_u}")
        # NOTE(review): the command is a shell string built from module
        # settings and taxids; make sure they never carry untrusted text.
        cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {taxid_u} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1"

        # the with-block closes the pipe once the whole output has been read
        with os.popen(cmd) as ngd_output:
            ngd_lines = ngd_output.readlines()
        if b_verbose:
            print(f"{prog_tag} cmd:{cmd} ran, read output")

        for line in ngd_lines:
            # ERROR: No downloads matched your filter. Please check your options.
            if re.match("^(?:ERROR|Error): No downloads", line):
                # get complete lineage: accept ONLY leave taxid? (species)
                lineage = ncbi.get_lineage(int(taxid_u))
                lineage_names = ncbi.translate_to_names(lineage)
                if b_verbose:
                    print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{lineage_names}")

                # same search but going up in the taxonomy, using leaf
                # taxids to find a closely related complete genome
                new_acc_nr = ngd_upper_lineage(curr_index_in_lineage,
                                               lineage,
                                               ncbi,
                                               accnrlisttmp, # current working list
                                               accnrlist,    # final list
                                               speciestmp,
                                               nametmp
                                               )
                if new_acc_nr is None:
                    print(f"No acc_nr found after going up in taxonomy, line {lineno()}")
                else:
                    accnrlist.append(new_acc_nr)

                # initialize for next search
                accnrlisttmp = []
                speciestmp = []
                nametmp = []

            elif not re.match("^Considering", line):
                acc_nr, species, name = line.rstrip().split("\t")
                accnrlisttmp.append(acc_nr)
                speciestmp.append(species)
                nametmp.append(name)
                if b_verbose:
                    print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")

        # retain only the most recent complete genome for current treated taxid
        if accnrlisttmp:
            accnrlist.append(retain_1accnr(accnrlisttmp, speciestmp, nametmp))

    # remove redundant accnr
    print(f"accnrlist to sort:{accnrlist}")
    accnrlist = list(sort_uniq(accnrlist))
    with open(acc_out_f, "w") as record_file:
        record_file.writelines(f"{accnr}\n" for accnr in accnrlist)

    print(f"{prog_tag} {acc_out_f} file created")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
473
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
474 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
475 # test
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
if b_test_add_host_chr_taxids_accnr_from_ori_list:
    # self-test: run the whole expansion on the files stored in test_dir,
    # then stop the program
    taxid_acc_in_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
    acc_out_f = test_dir + 'megablast_out_f_taxid_acc_hostexpanded.tsv'
    print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list")

    print(f"{prog_tag} loading {taxid_acc_in_f} file")
    load_taxids(taxid_acc_in_f)
    # show each loaded taxid next to the accession number at the same rank
    for row_idx, loaded_taxid in enumerate(taxidlist):
        print(f"{loaded_taxid}\t{accnrlist[row_idx]}")
    print(f"{prog_tag} end loading")

    add_host_chr_taxids_accnr_from_ori_list(
        taxidlist,
        accnrlist,
        acc_out_f,
    )
    print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list")
    sys.exit()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
491 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
492
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
493 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
494 # MAIN
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
495 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
496 ##### MAIN
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
def __main__():
    """Run the script: load the taxid/accession file, then expand and write."""
    # step 1: load taxid_acc file
    load_taxids(taxid_acc_tabular_f)

    # step 2: look for host complete genomes and write the accession file
    add_host_chr_taxids_accnr_from_ori_list(
        taxidlist,
        accnrlist,
        acc_out_f,
    )
# --------------------------------------------------------------------------
#### MAIN END
if __name__ == "__main__":
    __main__()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
507