annotate TAXID_genusexpand_taxid2acc.py @ 7:9238e54d6532 draft

Uploaded 13 03 23 Fix variable bug line 143
author p.lucas
date Mon, 13 Mar 2023 15:09:54 +0000
parents 81acd8138218
children 8c9418230a5a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
1 #!/usr/bin/env python3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
2 # -*- coding: utf-8 -*-
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
3 ###
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
4 # USE PYTHON3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
6 # provide 2 files:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
9 ###
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
10
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
11 ### Libraries to import:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
12 import argparse, os, sys, csv, warnings, re, itertools, operator
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
13 from os import path
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
14 import ncbi_genome_download as ngd
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
16 from ete3 import NCBITaxa
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
17
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
18 # to be able to report line number in error messages
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
19 import inspect
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
20 frame = inspect.currentframe()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
21
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
22 # debug
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
23 b_test_load_taxids = False # ok 2023 03 07
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
25
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
26 prog_tag = '[' + os.path.basename(__file__) + ']'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
27
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
28 # boolean to know if we dowload ncbi taxonomy file in current env
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
29 b_load_ncbi_tax_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
30
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
31 # list of interesting taxid (fathers)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
32 taxidlist_f = ''
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
33 taxidlist = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
34 accnrlist = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
35
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
36 # order = -4
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
37 # family or clade = -3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
38 # subtribe or genus = -2
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
39 curr_index_in_lineage = -2
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
40 min_index_in_lineage = -4
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
41
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
42 # boolean to know if we download ncbi taxonomy file in current env
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
43 b_load_ncbi_tax_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
44 b_test_all = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
45 b_test = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
46 b_acc_in_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
47 b_acc_out_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
48
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
49 b_verbose = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
50
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
51 # rank = '' # rank level retained by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
52 # rank_num = index of rank retained by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
53
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
54 # # set to check that provided rank exist to be sure to be able to use it
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
55 # ranks = {
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
56 # 'superkingdom' => 0,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
57 # # 'clade', # no, several clade, name is too generic
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
58 # 'kingdom' => 1,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
59 # 'phylum' => 2,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
60 # 'subphylum' => 3,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
61 # 'superclass' => 4,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
62 # 'class' => 5,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
63 # 'superorder' => 6,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
64 # 'order' => 7,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
65 # 'suborder' => 8,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
66 # 'infraorder' => 9,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
67 # 'parvorder' => 10,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
68 # 'superfamily' => 11,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
69 # 'family' => 12,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
70 # 'subfamily' => 13,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
71 # 'genus' => 14,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
72 # 'species' => 15,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
73 # 'subspecies' => 16
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
74 # }
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
75
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
76 parser = argparse.ArgumentParser()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
78 help="taxid acc_number list in tsv (tabular separated at each line)",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
79 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f',
6
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree",
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
82 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
83 # parser.add_argument("-r", "--rank", dest='rank',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
85 # action="store_const")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
87 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
88 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
89 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
90 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
91 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
92 parser.add_argument("-z", "--test_all", dest='b_test_all',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
93 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
94 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
95 parser.add_argument("-v", "--verbose", dest='b_verbose',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
96 help="[Optional] To have details on records when running",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
97 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
98 parser.set_defaults(b_load_ncbi_tax_f=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
99 parser.set_defaults(b_test_all=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
100 parser.set_defaults(b_verbose=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
101
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
102 # get absolute path in case of files
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
103 args = parser.parse_args()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
104
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
105 # -------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
106 # check arguments
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
107 b_test_all = args.b_test_all
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
108
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
109 if b_test_all:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
110 b_test_load_taxids = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
111 b_test_add_host_chr_taxids_accnr_from_ori_list = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
112 b_test = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
113 b_acc_in_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
114 b_acc_out_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
115 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
116 b_test = (b_test_load_taxids or
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
117 b_test_add_host_chr_taxids_accnr_from_ori_list)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
118
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
119 if ((not b_test)and
6
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
120 ((len(sys.argv) < 2) or (len(sys.argv) > 5))):
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
121 print("\n".join([
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
122 "Aim: find accession numbers of complete genomes related to provided taxids.",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
123 "If not found at species level, try at upper taxonomic level until order.",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
124 "Retains only 1 complete genome is several available:",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
125 " - the one with the highest version number, if not sufficient",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
126 " - the one with the highest accession number",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
127 "To use this scripts, run:",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
128 "conda activate TAXID_genusexpand_taxid2acc",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
129 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
130 " ",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
131 "Then you won't need --test_all --load_ncbi_tax_f options\n\n",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
132 "Then, as an example:\n\n",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
133 ' '.join(['./TAXID_genusexpand_taxid2acc.py',
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
134 '-i taxid_accnr_list.tsv',
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
135 '-o accnr_out_list.txt']),"\n\n" ]))
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
136 parser.print_help()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
138 " arguments, exit line "+str(frame.f_lineno))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
139 sys.exit(0)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
140
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
141 # print('args:', args)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
142 # if(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
143 if args.ncbi_tax_f is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
144 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
145 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
146 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
147 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
148 if args.taxid_acc_in_f is not None:
7
9238e54d6532 Uploaded 13 03 23 Fix variable bug line 143
p.lucas
parents: 6
diff changeset
149 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f)
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
150 b_acc_in_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
151 elif(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
152 sys.exit("[Error] You must provide taxid_acc_in_f")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
153 if args.acc_out_f is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
154 acc_out_f = os.path.abspath(args.acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
155 b_acc_out_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
156 elif(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
157 sys.exit("-acc_out_f <accnr_file>n must be provided\n")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
158 # if args.rank is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
159 # rank = 'genus'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
160 # else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
161 # rank = args.rank
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
162
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
163 if args.b_verbose is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
164 b_verbose = int(args.b_verbose)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
165
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
166 if(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
167 if (not b_acc_in_f) and (not b_acc_out_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
168 sys.exit(prog_tag + "[Error] You must provide either --acc_f <file> and -acc_out_f <file>")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
169
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
170 # # store index of the rank expected by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
171 # rank_num = ranks{ rank }
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
172
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
173 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
174 # to sort uniq, for a list, only need to add list conversion
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
175 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
176 mapper= map # Python ≥ 3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
177 def sort_uniq(sequence):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
178 return mapper(
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
179 operator.itemgetter(0),
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
180 itertools.groupby(sorted(sequence)))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
181 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
182
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
183 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
184 # load taxid acc list, return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
185 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
186 def load_taxids(taxid_acc_tabular_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
187
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
188 if not path.exists(taxid_acc_tabular_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
189 sys.exit("Error " + taxid_acc_tabular_f +
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
190 " file does not exist, line "+ str(sys._getframe().f_lineno) )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
191
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq "
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
193
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
194 for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
195 k, v = line.rstrip().split()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
196 taxidlist.append(k)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
197 accnrlist.append(v)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
198
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
199 return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
200 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
201
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
202 # test load_taxids function
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
203 # display taxidlist, then exit
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
204 if b_test_load_taxids:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
205 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
206 print("START b_test_load_taxids")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
207 print("loading "+taxid_acc_tabular_f+" file")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
208 taxidlist = load_taxids(taxid_acc_tabular_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
209 for i in range(len(taxidlist)):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
210 print(f"{taxidlist[i]}\t{accnrlist[i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
211 print("END b_test_load_taxids")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
212 if not b_test_add_host_chr_taxids_accnr_from_ori_list:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
213 sys.exit()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
214 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
215
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
216 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
217 # # needs internet connexion, not possible
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
218 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
219 # def get_leave_taxid_from_acc_nr(accnrlist):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
220
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
221 # # deduce a list of taxid from a list of accession numbers
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
222 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
223 # for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
224 # taxidlist.append(line.rstrip())
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
225
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
226 # return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
227 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
228
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
229 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
230 # function to retain the most recent acc nr for host complete genome found:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
231 # - return acc nr of most recent complete genome
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
232 # - print accnr species and name retained
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
233 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
234 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
235 def retain_1accnr(accnrlisttmp, speciestmp, nametmp):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
236
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
237 max_accnr_version = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
238 curr_accnr_version = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
239 max_accnr_nr = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
240 curr_accnr_nr = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
241 kept_accnr_i = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
242 p = re.compile(".*?(\d+)\.(\d+)$")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
243
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
244 # print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
245
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
246 for i in range(len(accnrlisttmp)):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
247 m = p.match( accnrlisttmp[i] )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
248 if m:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
249 # print('Match found: ', m.group(2))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
250 curr_accnr_version = int(m.group(2))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
251 accnr_nr = int(m.group(1))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
252 if curr_accnr_version > max_accnr_version:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
253 max_accnr_version = curr_accnr_version
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
254 kept_accnr_i = i
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
255 # print(f"record kept_accnr_i:{kept_accnr_i}")
6
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
256 elif(( curr_accnr_version == max_accnr_version)and
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
257 (curr_accnr_nr > max_accnr_nr)):
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
258 max_accnr_nr = curr_accnr_nr
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
259 kept_accnr_i = i
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
260 # print(f"record kept_accnr_i:{kept_accnr_i}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
261
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
262 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
263 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
264
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
265 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
266 kept_accn = accnrlisttmp[kept_accnr_i]
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
267
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
268 return kept_accn
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
269 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
270
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
271 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
272 # procedure to find complete genome closely related to current taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
273 # goes upper in taxonomy if nothing found until order
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
274 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
275 def ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
276 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
277 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
278 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
279 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
280 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
281 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
282 ):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
283 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
284
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
285 # deduce up rank, search complet genome/chr in
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
286 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
287 rank = ncbi.get_rank([lineage[curr_index_in_lineage]])
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
288 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]])
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
289 print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
290 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
291 intermediate_nodes=False,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
292 collapse_subspecies=False,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
293 return_tree=False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
294 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
295 # int conversion to strings
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
296 leaves_taxids = list(map(str, leaves_taxids))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
297 leaves_taxids_list = ','.join(leaves_taxids)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
298 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
299 # print(f"{prog_tag} cmd:{cmd}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
300
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
301 # specific to retain_1accn to avoid lists are crashed by other ngd call
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
302 accnrlisttmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
303 speciestmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
304 nametmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
305 for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
306
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
307 if not re.match("^Considering", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
308 # print(f"line:{line.rstrip()}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
309 if re.match("^(?:ERROR|Error): No downloads", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
310 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
311 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
312 curr_index_in_lineage = curr_index_in_lineage - 1
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
313 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
314 return ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
315 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
316 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
317 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
318 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
319 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
320 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
321 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
322 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
323 acc_nr, species, name = line.rstrip().split("\t")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
324 accnrlisttmp_r.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
325 speciestmp_r.append(species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
326 nametmp_r.append(name)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
327 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
328 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
329
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
330 # retain only the most recent complete genome for current treated taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
331 return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
332
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
333 # else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
334 # print(f"line matching Considering:{line}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
335 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
336
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
337 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
338 # read taxids, deduce complete genomes available in genblank, provides in output file
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
339 # the acc number in addition to those already listed
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
340 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
341 def add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
342 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
343 acc_out_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
344
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
345 # store all accnr found for complete genome of current taxid (or from same family/order)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
346 # the aim is to keep only the most recent/complete
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
347 accnrlisttmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
348 speciestmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
349 nametmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
350
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
351 # get host complete genome when found using ncbi_genome_download
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
352 taxids_list=','.join(taxidlist)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
353
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
354 # # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
355 # # ncbi-genome-download as a library
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
356 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
357 # ngd.download(section='genbank',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
358 # taxids=taxids_list,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
359 # assembly_levels='chromosome',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
360 # flat_output=True,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
361 # output=ngd_out_f,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
362 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
363 # dry_run=True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
364 # )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
365
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
366
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
367 # cmd = "cut -f 1,2 "+ngd_out_f
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
368
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
369 # for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
370 # acc_nr, species = line.rstrip().split()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
371 # accnrlist.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
372
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
373 # if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
374 # print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
375 # # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
376
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
377 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
378 # ncbi-genome-download as executable script
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
379 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
380
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
381 # load NCBITaxa
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
382 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
383 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
384 ncbi = NCBITaxa(dbfile=ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
385 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
386 try:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
387 ncbi.update_taxonomy_database()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
388 except:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
389 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
390
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
391 for taxid_u in taxidlist:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
392 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
393 for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
394 # ERROR: No downloads matched your filter. Please check your options.
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
395 if re.match("^(?:ERROR|Error): No downloads", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
396 # get complete lineage: accept ONLY leave taxid? (species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
397 lineage = ncbi.get_lineage(int(taxid_u))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
398 name = ncbi.translate_to_names(lineage)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
399 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
400 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
401
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
402 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
403 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
404 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
405 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
406 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
407 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
408 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
409 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
410 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
411 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
412 accnrlist.append( new_acc_nr )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
413
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
414 # initialize for next search
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
415 accnrlisttmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
416 speciestmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
417 nametmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
418
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
419 elif not re.match("^Considering", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
420 # print(f"line:{line.rstrip()}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
421 acc_nr, species, name = line.rstrip().split("\t")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
422 accnrlisttmp.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
423 speciestmp.append(species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
424 nametmp.append(name)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
425 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
426 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
427
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
428 # retain only the most recent complete genome for current treated taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
429 if len(accnrlisttmp):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
430 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
431
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
432 # remove redundant accnr
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
433 accnrlist = list(sort_uniq(accnrlist))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
434 with open(acc_out_f, "w") as record_file:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
435 for accnr in accnrlist:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
436 record_file.write("%s\n" % (accnr))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
437 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
438
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
439 print(f"{prog_tag} {acc_out_f} file created")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
440
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
441 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
442 # test
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
443 if b_test_add_host_chr_taxids_accnr_from_ori_list:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
444 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
445 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
446 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
447 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
448 print(f"{prog_tag} loading {taxid_acc_tabular_f} file")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
449 taxidlist = load_taxids(taxid_acc_in_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
450 print(f"{prog_tag} end loading")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
451
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
452 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
453 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
454 acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
455 print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
456 sys.exit()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
457 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
458
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
459 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
460 # MAIN
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
461 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
462 ##### MAIN
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
463 def __main__():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
464 # load taxid_acc file
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
465 load_taxids(taxid_acc_tabular_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
466 # check in ncbi taxonomy which acc number are in and out of given taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
467 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
468 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
469 acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
470 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
471 #### MAIN END
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
472 if __name__ == "__main__": __main__()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
473