annotate TAXID_genusexpand_taxid2acc.py @ 9:7e75c71e3c8e draft

Uploaded 13 03 23 fin def load_taxids fix bug to avoid error with empty line
author p.lucas
date Mon, 13 Mar 2023 16:16:49 +0000
parents 8c9418230a5a
children 5a4ac43fea68
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
1 #!/usr/bin/env python3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
2 # -*- coding: utf-8 -*-
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
3 ###
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
4 # USE PYTHON3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
6 # provide 2 files:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
9 ###
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
10
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
11 ### Libraries to import:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
12 import argparse, os, sys, csv, warnings, re, itertools, operator
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
13 from os import path
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
14 import ncbi_genome_download as ngd
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
16 from ete3 import NCBITaxa
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
17
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
18 # to be able to report line number in error messages
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
19 import inspect
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
20 frame = inspect.currentframe()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
21
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
22 # debug
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
23 b_test_load_taxids = False # ok 2023 03 07
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
25
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
26 prog_tag = '[' + os.path.basename(__file__) + ']'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
27
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
28 # boolean to know if we dowload ncbi taxonomy file in current env
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
29 b_load_ncbi_tax_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
30
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
31 # list of interesting taxid (fathers)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
32 taxidlist_f = ''
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
33 taxidlist = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
34 accnrlist = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
35
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
36 # order = -4
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
37 # family or clade = -3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
38 # subtribe or genus = -2
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
39 curr_index_in_lineage = -2
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
40 min_index_in_lineage = -4
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
41
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
42 # boolean to know if we download ncbi taxonomy file in current env
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
43 b_load_ncbi_tax_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
44 b_test_all = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
45 b_test = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
46 b_acc_in_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
47 b_acc_out_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
48
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
49 b_verbose = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
50
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
51 # rank = '' # rank level retained by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
52 # rank_num = index of rank retained by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
53
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
54 # # set to check that provided rank exist to be sure to be able to use it
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
55 # ranks = {
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
56 # 'superkingdom' => 0,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
57 # # 'clade', # no, several clade, name is too generic
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
58 # 'kingdom' => 1,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
59 # 'phylum' => 2,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
60 # 'subphylum' => 3,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
61 # 'superclass' => 4,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
62 # 'class' => 5,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
63 # 'superorder' => 6,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
64 # 'order' => 7,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
65 # 'suborder' => 8,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
66 # 'infraorder' => 9,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
67 # 'parvorder' => 10,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
68 # 'superfamily' => 11,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
69 # 'family' => 12,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
70 # 'subfamily' => 13,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
71 # 'genus' => 14,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
72 # 'species' => 15,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
73 # 'subspecies' => 16
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
74 # }
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
75
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
76 parser = argparse.ArgumentParser()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
78 help="taxid acc_number list in tsv (tabular separated at each line)",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
79 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f',
6
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree",
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
82 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
83 # parser.add_argument("-r", "--rank", dest='rank',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
85 # action="store_const")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
87 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
88 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
89 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
90 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
91 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
92 parser.add_argument("-z", "--test_all", dest='b_test_all',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
93 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
94 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
95 parser.add_argument("-v", "--verbose", dest='b_verbose',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
96 help="[Optional] To have details on records when running",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
97 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
98 parser.set_defaults(b_load_ncbi_tax_f=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
99 parser.set_defaults(b_test_all=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
100 parser.set_defaults(b_verbose=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
101
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
102 # get absolute path in case of files
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
103 args = parser.parse_args()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
104
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
105 # -------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
106 # check arguments
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
107 b_test_all = args.b_test_all
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
108
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
109 if b_test_all:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
110 b_test_load_taxids = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
111 b_test_add_host_chr_taxids_accnr_from_ori_list = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
112 b_test = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
113 b_acc_in_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
114 b_acc_out_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
115 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
116 b_test = (b_test_load_taxids or
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
117 b_test_add_host_chr_taxids_accnr_from_ori_list)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
118
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
119 if ((not b_test)and
6
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
120 ((len(sys.argv) < 2) or (len(sys.argv) > 5))):
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
121 print("\n".join([
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
122 "Aim: find accession numbers of complete genomes related to provided taxids.",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
123 "If not found at species level, try at upper taxonomic level until order.",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
124 "Retains only 1 complete genome is several available:",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
125 " - the one with the highest version number, if not sufficient",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
126 " - the one with the highest accession number",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
127 "To use this scripts, run:",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
128 "conda activate TAXID_genusexpand_taxid2acc",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
129 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
130 " ",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
131 "Then you won't need --test_all --load_ncbi_tax_f options\n\n",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
132 "Then, as an example:\n\n",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
133 ' '.join(['./TAXID_genusexpand_taxid2acc.py',
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
134 '-i taxid_accnr_list.tsv',
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
135 '-o accnr_out_list.txt']),"\n\n" ]))
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
136 parser.print_help()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
138 " arguments, exit line "+str(frame.f_lineno))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
139 sys.exit(0)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
140
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
141 # print('args:', args)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
142 # if(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
143 if args.ncbi_tax_f is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
144 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
145 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
146 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
147 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
148 if args.taxid_acc_in_f is not None:
7
9238e54d6532 Uploaded 13 03 23 Fix variable bug line 143
p.lucas
parents: 6
diff changeset
149 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f)
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
150 b_acc_in_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
151 elif(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
152 sys.exit("[Error] You must provide taxid_acc_in_f")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
153 if args.acc_out_f is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
154 acc_out_f = os.path.abspath(args.acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
155 b_acc_out_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
156 elif(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
157 sys.exit("-acc_out_f <accnr_file>n must be provided\n")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
158 # if args.rank is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
159 # rank = 'genus'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
160 # else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
161 # rank = args.rank
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
162
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
163 if args.b_verbose is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
164 b_verbose = int(args.b_verbose)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
165
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
166 if(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
167 if (not b_acc_in_f) and (not b_acc_out_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
168 sys.exit(prog_tag + "[Error] You must provide either --acc_f <file> and -acc_out_f <file>")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
169
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
170 # # store index of the rank expected by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
171 # rank_num = ranks{ rank }
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
172
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
173 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
174 # to sort uniq, for a list, only need to add list conversion
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
175 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
176 mapper= map # Python ≥ 3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
177 def sort_uniq(sequence):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
178 return mapper(
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
179 operator.itemgetter(0),
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
180 itertools.groupby(sorted(sequence)))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
181 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
182
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
183 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
184 # load taxid acc list, return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
185 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
186 def load_taxids(taxid_acc_tabular_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
187
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
188 if not path.exists(taxid_acc_tabular_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
189 sys.exit("Error " + taxid_acc_tabular_f +
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
190 " file does not exist, line "+ str(sys._getframe().f_lineno) )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
191
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq "
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
193
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
194 for line in os.popen(cmd).readlines():
9
7e75c71e3c8e Uploaded 13 03 23 fin def load_taxids fix bug to avoid error with empty line
p.lucas
parents: 8
diff changeset
195 if line != "":
7e75c71e3c8e Uploaded 13 03 23 fin def load_taxids fix bug to avoid error with empty line
p.lucas
parents: 8
diff changeset
196 continue
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
197 k, v = line.rstrip().split()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
198 taxidlist.append(k)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
199 accnrlist.append(v)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
200
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
201 return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
202 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
203
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
204 # test load_taxids function
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
205 # display taxidlist, then exit
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
206 if b_test_load_taxids:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
207 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
208 print("START b_test_load_taxids")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
209 print("loading "+taxid_acc_tabular_f+" file")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
210 taxidlist = load_taxids(taxid_acc_tabular_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
211 for i in range(len(taxidlist)):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
212 print(f"{taxidlist[i]}\t{accnrlist[i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
213 print("END b_test_load_taxids")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
214 if not b_test_add_host_chr_taxids_accnr_from_ori_list:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
215 sys.exit()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
216 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
217
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
218 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
219 # # needs internet connexion, not possible
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
220 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
221 # def get_leave_taxid_from_acc_nr(accnrlist):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
222
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
223 # # deduce a list of taxid from a list of accession numbers
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
224 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
225 # for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
226 # taxidlist.append(line.rstrip())
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
227
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
228 # return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
229 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
230
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
231 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
232 # function to retain the most recent acc nr for host complete genome found:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
233 # - return acc nr of most recent complete genome
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
234 # - print accnr species and name retained
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
235 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
236 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
237 def retain_1accnr(accnrlisttmp, speciestmp, nametmp):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
238
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
239 max_accnr_version = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
240 curr_accnr_version = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
241 max_accnr_nr = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
242 curr_accnr_nr = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
243 kept_accnr_i = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
244 p = re.compile(".*?(\d+)\.(\d+)$")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
245
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
246 # print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
247
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
248 for i in range(len(accnrlisttmp)):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
249 m = p.match( accnrlisttmp[i] )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
250 if m:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
251 # print('Match found: ', m.group(2))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
252 curr_accnr_version = int(m.group(2))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
253 accnr_nr = int(m.group(1))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
254 if curr_accnr_version > max_accnr_version:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
255 max_accnr_version = curr_accnr_version
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
256 kept_accnr_i = i
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
257 # print(f"record kept_accnr_i:{kept_accnr_i}")
6
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
258 elif(( curr_accnr_version == max_accnr_version)and
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
259 (curr_accnr_nr > max_accnr_nr)):
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
260 max_accnr_nr = curr_accnr_nr
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
261 kept_accnr_i = i
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
262 # print(f"record kept_accnr_i:{kept_accnr_i}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
263
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
264 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
265 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
266
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
267 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
268 kept_accn = accnrlisttmp[kept_accnr_i]
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
269
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
270 return kept_accn
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
271 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
272
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
273 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
274 # procedure to find complete genome closely related to current taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
275 # goes upper in taxonomy if nothing found until order
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
276 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
277 def ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
278 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
279 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
280 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
281 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
282 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
283 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
284 ):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
285 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
286
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
287 # deduce up rank, search complet genome/chr in
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
288 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
289 rank = ncbi.get_rank([lineage[curr_index_in_lineage]])
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
290 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]])
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
291 print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
292 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
293 intermediate_nodes=False,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
294 collapse_subspecies=False,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
295 return_tree=False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
296 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
297 # int conversion to strings
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
298 leaves_taxids = list(map(str, leaves_taxids))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
299 leaves_taxids_list = ','.join(leaves_taxids)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
300 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
301 # print(f"{prog_tag} cmd:{cmd}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
302
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
303 # specific to retain_1accn to avoid lists are crashed by other ngd call
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
304 accnrlisttmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
305 speciestmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
306 nametmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
307 for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
308
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
309 if not re.match("^Considering", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
310 # print(f"line:{line.rstrip()}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
311 if re.match("^(?:ERROR|Error): No downloads", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
312 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
313 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
314 curr_index_in_lineage = curr_index_in_lineage - 1
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
315 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
316 return ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
317 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
318 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
319 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
320 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
321 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
322 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
323 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
324 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
325 acc_nr, species, name = line.rstrip().split("\t")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
326 accnrlisttmp_r.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
327 speciestmp_r.append(species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
328 nametmp_r.append(name)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
329 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
330 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
331
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
332 # retain only the most recent complete genome for current treated taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
333 return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
334
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
335 # else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
336 # print(f"line matching Considering:{line}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
337 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
338
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
339 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
340 # read taxids, deduce complete genomes available in genblank, provides in output file
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
341 # the acc number in addition to those already listed
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
342 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
343 def add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
344 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
345 acc_out_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
346
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
347 # store all accnr found for complete genome of current taxid (or from same family/order)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
348 # the aim is to keep only the most recent/complete
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
349 accnrlisttmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
350 speciestmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
351 nametmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
352
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
353 # get host complete genome when found using ncbi_genome_download
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
354 taxids_list=','.join(taxidlist)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
355
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
356 # # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
357 # # ncbi-genome-download as a library
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
358 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
359 # ngd.download(section='genbank',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
360 # taxids=taxids_list,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
361 # assembly_levels='chromosome',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
362 # flat_output=True,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
363 # output=ngd_out_f,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
364 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
365 # dry_run=True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
366 # )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
367
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
368
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
369 # cmd = "cut -f 1,2 "+ngd_out_f
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
370
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
371 # for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
372 # acc_nr, species = line.rstrip().split()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
373 # accnrlist.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
374
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
375 # if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
376 # print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
377 # # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
378
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
379 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
380 # ncbi-genome-download as executable script
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
381 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
382
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
383 # load NCBITaxa
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
384 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
385 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
386 ncbi = NCBITaxa(dbfile=ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
387 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
388 try:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
389 ncbi.update_taxonomy_database()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
390 except:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
391 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
392
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
393 for taxid_u in taxidlist:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
394 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
395 for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
396 # ERROR: No downloads matched your filter. Please check your options.
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
397 if re.match("^(?:ERROR|Error): No downloads", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
398 # get complete lineage: accept ONLY leave taxid? (species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
399 lineage = ncbi.get_lineage(int(taxid_u))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
400 name = ncbi.translate_to_names(lineage)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
401 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
402 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
403
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
404 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
405 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
406 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
407 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
408 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
409 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
410 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
411 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
412 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
413 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
414 accnrlist.append( new_acc_nr )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
415
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
416 # initialize for next search
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
417 accnrlisttmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
418 speciestmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
419 nametmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
420
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
421 elif not re.match("^Considering", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
422 # print(f"line:{line.rstrip()}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
423 acc_nr, species, name = line.rstrip().split("\t")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
424 accnrlisttmp.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
425 speciestmp.append(species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
426 nametmp.append(name)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
427 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
428 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
429
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
430 # retain only the most recent complete genome for current treated taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
431 if len(accnrlisttmp):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
432 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
433
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
434 # remove redundant accnr
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
435 accnrlist = list(sort_uniq(accnrlist))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
436 with open(acc_out_f, "w") as record_file:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
437 for accnr in accnrlist:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
438 record_file.write("%s\n" % (accnr))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
439 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
440
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
441 print(f"{prog_tag} {acc_out_f} file created")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
442
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
443 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
444 # test
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
445 if b_test_add_host_chr_taxids_accnr_from_ori_list:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
446 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
447 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
448 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
449 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
450 print(f"{prog_tag} loading {taxid_acc_tabular_f} file")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
451 taxidlist = load_taxids(taxid_acc_in_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
452 print(f"{prog_tag} end loading")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
453
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
454 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
455 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
456 acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
457 print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
458 sys.exit()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
459 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
460
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
461 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
462 # MAIN
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
463 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
464 ##### MAIN
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
465 def __main__():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
466 # load taxid_acc file
8
8c9418230a5a Uploaded 13 03 23 Fix bug on variable name line 465
p.lucas
parents: 7
diff changeset
467 load_taxids(taxid_acc_in_f)
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
468 # check in ncbi taxonomy which acc number are in and out of given taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
469 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
470 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
471 acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
472 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
473 #### MAIN END
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
474 if __name__ == "__main__": __main__()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
475