annotate TAXID_genusexpand_taxid2acc.py @ 10:5a4ac43fea68 draft

Uploaded 14 03 23 def load_taxids fix bug when empty line
author p.lucas
date Tue, 14 Mar 2023 08:07:14 +0000
parents 7e75c71e3c8e
children 3d116861e380
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
1 #!/usr/bin/env python3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
2 # -*- coding: utf-8 -*-
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
3 ###
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
4 # USE PYTHON3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
6 # provide 2 files:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
9 ###
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
10
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
11 ### Libraries to import:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
12 import argparse, os, sys, csv, warnings, re, itertools, operator
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
13 from os import path
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
14 import ncbi_genome_download as ngd
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
16 from ete3 import NCBITaxa
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
17
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
18 # to be able to report line number in error messages
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
19 import inspect
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
20 frame = inspect.currentframe()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
21
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
22 # debug
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
23 b_test_load_taxids = False # ok 2023 03 07
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
25
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
26 prog_tag = '[' + os.path.basename(__file__) + ']'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
27
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
28 # boolean to know if we dowload ncbi taxonomy file in current env
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
29 b_load_ncbi_tax_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
30
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
31 # list of interesting taxid (fathers)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
32 taxidlist_f = ''
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
33 taxidlist = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
34 accnrlist = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
35
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
36 # order = -4
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
37 # family or clade = -3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
38 # subtribe or genus = -2
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
39 curr_index_in_lineage = -2
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
40 min_index_in_lineage = -4
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
41
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
42 # boolean to know if we download ncbi taxonomy file in current env
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
43 b_load_ncbi_tax_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
44 b_test_all = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
45 b_test = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
46 b_acc_in_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
47 b_acc_out_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
48
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
49 b_verbose = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
50
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
51 # rank = '' # rank level retained by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
52 # rank_num = index of rank retained by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
53
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
54 # # set to check that provided rank exist to be sure to be able to use it
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
55 # ranks = {
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
56 # 'superkingdom' => 0,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
57 # # 'clade', # no, several clade, name is too generic
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
58 # 'kingdom' => 1,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
59 # 'phylum' => 2,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
60 # 'subphylum' => 3,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
61 # 'superclass' => 4,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
62 # 'class' => 5,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
63 # 'superorder' => 6,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
64 # 'order' => 7,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
65 # 'suborder' => 8,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
66 # 'infraorder' => 9,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
67 # 'parvorder' => 10,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
68 # 'superfamily' => 11,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
69 # 'family' => 12,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
70 # 'subfamily' => 13,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
71 # 'genus' => 14,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
72 # 'species' => 15,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
73 # 'subspecies' => 16
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
74 # }
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
75
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
76 parser = argparse.ArgumentParser()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
78 help="taxid acc_number list in tsv (tabular separated at each line)",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
79 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f',
6
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree",
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
82 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
83 # parser.add_argument("-r", "--rank", dest='rank',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
85 # action="store_const")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
87 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
88 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
89 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
90 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
91 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
92 parser.add_argument("-z", "--test_all", dest='b_test_all',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
93 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
94 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
95 parser.add_argument("-v", "--verbose", dest='b_verbose',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
96 help="[Optional] To have details on records when running",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
97 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
98 parser.set_defaults(b_load_ncbi_tax_f=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
99 parser.set_defaults(b_test_all=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
100 parser.set_defaults(b_verbose=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
101
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
102 # get absolute path in case of files
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
103 args = parser.parse_args()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
104
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
105 # -------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
106 # check arguments
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
107 b_test_all = args.b_test_all
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
108
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
109 if b_test_all:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
110 b_test_load_taxids = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
111 b_test_add_host_chr_taxids_accnr_from_ori_list = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
112 b_test = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
113 b_acc_in_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
114 b_acc_out_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
115 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
116 b_test = (b_test_load_taxids or
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
117 b_test_add_host_chr_taxids_accnr_from_ori_list)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
118
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
119 if ((not b_test)and
6
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
120 ((len(sys.argv) < 2) or (len(sys.argv) > 5))):
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
121 print("\n".join([
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
122 "Aim: find accession numbers of complete genomes related to provided taxids.",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
123 "If not found at species level, try at upper taxonomic level until order.",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
124 "Retains only 1 complete genome is several available:",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
125 " - the one with the highest version number, if not sufficient",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
126 " - the one with the highest accession number",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
127 "To use this scripts, run:",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
128 "conda activate TAXID_genusexpand_taxid2acc",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
129 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
130 " ",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
131 "Then you won't need --test_all --load_ncbi_tax_f options\n\n",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
132 "Then, as an example:\n\n",
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
133 ' '.join(['./TAXID_genusexpand_taxid2acc.py',
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
134 '-i taxid_accnr_list.tsv',
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
135 '-o accnr_out_list.txt']),"\n\n" ]))
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
136 parser.print_help()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
138 " arguments, exit line "+str(frame.f_lineno))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
139 sys.exit(0)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
140
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
141 # print('args:', args)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
142 # if(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
143 if args.ncbi_tax_f is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
144 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
145 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
146 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
147 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
148 if args.taxid_acc_in_f is not None:
7
9238e54d6532 Uploaded 13 03 23 Fix variable bug line 143
p.lucas
parents: 6
diff changeset
149 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f)
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
150 b_acc_in_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
151 elif(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
152 sys.exit("[Error] You must provide taxid_acc_in_f")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
153 if args.acc_out_f is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
154 acc_out_f = os.path.abspath(args.acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
155 b_acc_out_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
156 elif(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
157 sys.exit("-acc_out_f <accnr_file>n must be provided\n")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
158 # if args.rank is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
159 # rank = 'genus'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
160 # else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
161 # rank = args.rank
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
162
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
163 if args.b_verbose is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
164 b_verbose = int(args.b_verbose)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
165
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
166 if(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
167 if (not b_acc_in_f) and (not b_acc_out_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
168 sys.exit(prog_tag + "[Error] You must provide either --acc_f <file> and -acc_out_f <file>")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
169
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
170 # # store index of the rank expected by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
171 # rank_num = ranks{ rank }
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
172
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
173 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
174 # to sort uniq, for a list, only need to add list conversion
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
175 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
176 mapper= map # Python ≥ 3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
177 def sort_uniq(sequence):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
178 return mapper(
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
179 operator.itemgetter(0),
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
180 itertools.groupby(sorted(sequence)))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
181 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
182
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
183 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
184 # load taxid acc list, return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
185 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
186 def load_taxids(taxid_acc_tabular_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
187
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
188 if not path.exists(taxid_acc_tabular_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
189 sys.exit("Error " + taxid_acc_tabular_f +
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
190 " file does not exist, line "+ str(sys._getframe().f_lineno) )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
191
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq "
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
193
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
194 for line in os.popen(cmd).readlines():
9
7e75c71e3c8e Uploaded 13 03 23 fin def load_taxids fix bug to avoid error with empty line
p.lucas
parents: 8
diff changeset
195 if line != "":
10
5a4ac43fea68 Uploaded 14 03 23 def load_taxids fix bug when empty line
p.lucas
parents: 9
diff changeset
196 k, v = line.rstrip().split()
5a4ac43fea68 Uploaded 14 03 23 def load_taxids fix bug when empty line
p.lucas
parents: 9
diff changeset
197 taxidlist.append(k)
5a4ac43fea68 Uploaded 14 03 23 def load_taxids fix bug when empty line
p.lucas
parents: 9
diff changeset
198 accnrlist.append(v)
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
199
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
200 return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
201 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
202
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
203 # test load_taxids function
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
204 # display taxidlist, then exit
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
205 if b_test_load_taxids:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
206 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
207 print("START b_test_load_taxids")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
208 print("loading "+taxid_acc_tabular_f+" file")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
209 taxidlist = load_taxids(taxid_acc_tabular_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
210 for i in range(len(taxidlist)):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
211 print(f"{taxidlist[i]}\t{accnrlist[i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
212 print("END b_test_load_taxids")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
213 if not b_test_add_host_chr_taxids_accnr_from_ori_list:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
214 sys.exit()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
215 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
216
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
217 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
218 # # needs internet connexion, not possible
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
219 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
220 # def get_leave_taxid_from_acc_nr(accnrlist):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
221
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
222 # # deduce a list of taxid from a list of accession numbers
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
223 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
224 # for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
225 # taxidlist.append(line.rstrip())
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
226
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
227 # return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
228 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
229
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
230 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
231 # function to retain the most recent acc nr for host complete genome found:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
232 # - return acc nr of most recent complete genome
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
233 # - print accnr species and name retained
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
234 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
235 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
236 def retain_1accnr(accnrlisttmp, speciestmp, nametmp):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
237
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
238 max_accnr_version = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
239 curr_accnr_version = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
240 max_accnr_nr = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
241 curr_accnr_nr = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
242 kept_accnr_i = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
243 p = re.compile(".*?(\d+)\.(\d+)$")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
244
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
245 # print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
246
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
247 for i in range(len(accnrlisttmp)):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
248 m = p.match( accnrlisttmp[i] )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
249 if m:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
250 # print('Match found: ', m.group(2))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
251 curr_accnr_version = int(m.group(2))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
252 accnr_nr = int(m.group(1))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
253 if curr_accnr_version > max_accnr_version:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
254 max_accnr_version = curr_accnr_version
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
255 kept_accnr_i = i
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
256 # print(f"record kept_accnr_i:{kept_accnr_i}")
6
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
257 elif(( curr_accnr_version == max_accnr_version)and
81acd8138218 Uploaded 13 03 23 Update to the last version of the script
p.lucas
parents: 0
diff changeset
258 (curr_accnr_nr > max_accnr_nr)):
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
259 max_accnr_nr = curr_accnr_nr
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
260 kept_accnr_i = i
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
261 # print(f"record kept_accnr_i:{kept_accnr_i}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
262
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
263 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
264 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
265
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
266 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
267 kept_accn = accnrlisttmp[kept_accnr_i]
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
268
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
269 return kept_accn
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
270 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
271
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
272 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
273 # procedure to find complete genome closely related to current taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
274 # goes upper in taxonomy if nothing found until order
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
275 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
276 def ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
277 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
278 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
279 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
280 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
281 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
282 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
283 ):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
284 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
285
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
286 # deduce up rank, search complet genome/chr in
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
287 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
288 rank = ncbi.get_rank([lineage[curr_index_in_lineage]])
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
289 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]])
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
290 print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
291 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
292 intermediate_nodes=False,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
293 collapse_subspecies=False,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
294 return_tree=False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
295 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
296 # int conversion to strings
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
297 leaves_taxids = list(map(str, leaves_taxids))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
298 leaves_taxids_list = ','.join(leaves_taxids)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
299 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
300 # print(f"{prog_tag} cmd:{cmd}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
301
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
302 # specific to retain_1accn to avoid lists are crashed by other ngd call
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
303 accnrlisttmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
304 speciestmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
305 nametmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
306 for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
307
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
308 if not re.match("^Considering", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
309 # print(f"line:{line.rstrip()}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
310 if re.match("^(?:ERROR|Error): No downloads", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
311 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
312 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
313 curr_index_in_lineage = curr_index_in_lineage - 1
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
314 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
315 return ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
316 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
317 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
318 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
319 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
320 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
321 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
322 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
323 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
324 acc_nr, species, name = line.rstrip().split("\t")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
325 accnrlisttmp_r.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
326 speciestmp_r.append(species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
327 nametmp_r.append(name)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
328 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
329 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
330
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
331 # retain only the most recent complete genome for current treated taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
332 return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
333
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
334 # else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
335 # print(f"line matching Considering:{line}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
336 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
337
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
338 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
339 # read taxids, deduce complete genomes available in genblank, provides in output file
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
340 # the acc number in addition to those already listed
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
341 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
342 def add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
343 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
344 acc_out_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
345
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
346 # store all accnr found for complete genome of current taxid (or from same family/order)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
347 # the aim is to keep only the most recent/complete
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
348 accnrlisttmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
349 speciestmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
350 nametmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
351
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
352 # get host complete genome when found using ncbi_genome_download
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
353 taxids_list=','.join(taxidlist)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
354
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
355 # # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
356 # # ncbi-genome-download as a library
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
357 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
358 # ngd.download(section='genbank',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
359 # taxids=taxids_list,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
360 # assembly_levels='chromosome',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
361 # flat_output=True,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
362 # output=ngd_out_f,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
363 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
364 # dry_run=True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
365 # )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
366
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
367
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
368 # cmd = "cut -f 1,2 "+ngd_out_f
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
369
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
370 # for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
371 # acc_nr, species = line.rstrip().split()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
372 # accnrlist.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
373
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
374 # if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
375 # print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
376 # # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
377
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
378 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
379 # ncbi-genome-download as executable script
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
380 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
381
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
382 # load NCBITaxa
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
383 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
384 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
385 ncbi = NCBITaxa(dbfile=ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
386 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
387 try:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
388 ncbi.update_taxonomy_database()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
389 except:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
390 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
391
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
392 for taxid_u in taxidlist:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
393 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
394 for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
395 # ERROR: No downloads matched your filter. Please check your options.
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
396 if re.match("^(?:ERROR|Error): No downloads", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
397 # get complete lineage: accept ONLY leave taxid? (species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
398 lineage = ncbi.get_lineage(int(taxid_u))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
399 name = ncbi.translate_to_names(lineage)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
400 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
401 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
402
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
403 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
404 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
405 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
406 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
407 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
408 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
409 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
410 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
411 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
412 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
413 accnrlist.append( new_acc_nr )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
414
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
415 # initialize for next search
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
416 accnrlisttmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
417 speciestmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
418 nametmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
419
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
420 elif not re.match("^Considering", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
421 # print(f"line:{line.rstrip()}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
422 acc_nr, species, name = line.rstrip().split("\t")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
423 accnrlisttmp.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
424 speciestmp.append(species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
425 nametmp.append(name)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
426 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
427 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
428
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
429 # retain only the most recent complete genome for current treated taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
430 if len(accnrlisttmp):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
431 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
432
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
433 # remove redundant accnr
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
434 accnrlist = list(sort_uniq(accnrlist))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
435 with open(acc_out_f, "w") as record_file:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
436 for accnr in accnrlist:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
437 record_file.write("%s\n" % (accnr))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
438 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
439
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
440 print(f"{prog_tag} {acc_out_f} file created")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
441
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
442 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
443 # test
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
444 if b_test_add_host_chr_taxids_accnr_from_ori_list:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
445 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
446 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
447 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
448 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
449 print(f"{prog_tag} loading {taxid_acc_tabular_f} file")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
450 taxidlist = load_taxids(taxid_acc_in_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
451 print(f"{prog_tag} end loading")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
452
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
453 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
454 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
455 acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
456 print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
457 sys.exit()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
458 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
459
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
460 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
461 # MAIN
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
462 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
463 ##### MAIN
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
464 def __main__():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
465 # load taxid_acc file
8
8c9418230a5a Uploaded 13 03 23 Fix bug on variable name line 465
p.lucas
parents: 7
diff changeset
466 load_taxids(taxid_acc_in_f)
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
467 # check in ncbi taxonomy which acc number are in and out of given taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
468 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
469 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
470 acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
471 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
472 #### MAIN END
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
473 if __name__ == "__main__": __main__()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
474