annotate TAXID_genusexpand_taxid2acc.py @ 0:e7dd595fb0dd draft

Uploaded 09 03 23 first upload of script
author p.lucas
date Thu, 09 Mar 2023 16:50:29 +0000
parents
children 81acd8138218
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
1 #!/usr/bin/env python3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
2 # -*- coding: utf-8 -*-
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
3 ###
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
4 # USE PYTHON3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
6 # provide 2 files:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
9 ###
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
10
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
11 ### Libraries to import:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
12 import argparse, os, sys, csv, warnings, re, itertools, operator
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
13 from os import path
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
14 import ncbi_genome_download as ngd
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
16 from ete3 import NCBITaxa
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
17
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
18 # to be able to report line number in error messages
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
19 import inspect
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
20 frame = inspect.currentframe()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
21
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
22 # debug
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
23 b_test_load_taxids = False # ok 2023 03 07
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
25
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
26 prog_tag = '[' + os.path.basename(__file__) + ']'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
27
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
28 # boolean to know if we dowload ncbi taxonomy file in current env
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
29 b_load_ncbi_tax_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
30
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
31 # list of interesting taxid (fathers)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
32 taxidlist_f = ''
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
33 taxidlist = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
34 accnrlist = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
35
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
36 # order = -4
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
37 # family or clade = -3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
38 # subtribe or genus = -2
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
39 curr_index_in_lineage = -2
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
40 min_index_in_lineage = -4
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
41
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
42 # boolean to know if we download ncbi taxonomy file in current env
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
43 b_load_ncbi_tax_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
44 b_test_all = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
45 b_test = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
46 b_acc_in_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
47 b_acc_out_f = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
48
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
49 b_verbose = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
50
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
51 # rank = '' # rank level retained by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
52 # rank_num = index of rank retained by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
53
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
54 # # set to check that provided rank exist to be sure to be able to use it
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
55 # ranks = {
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
56 # 'superkingdom' => 0,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
57 # # 'clade', # no, several clade, name is too generic
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
58 # 'kingdom' => 1,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
59 # 'phylum' => 2,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
60 # 'subphylum' => 3,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
61 # 'superclass' => 4,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
62 # 'class' => 5,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
63 # 'superorder' => 6,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
64 # 'order' => 7,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
65 # 'suborder' => 8,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
66 # 'infraorder' => 9,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
67 # 'parvorder' => 10,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
68 # 'superfamily' => 11,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
69 # 'family' => 12,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
70 # 'subfamily' => 13,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
71 # 'genus' => 14,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
72 # 'species' => 15,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
73 # 'subspecies' => 16
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
74 # }
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
75
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
76 parser = argparse.ArgumentParser()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
78 help="taxid acc_number list in tsv (tabular separated at each line)",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
79 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers from krona_taxid_acc_f NOT found under taxid in ncbi taxonomy tree",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
82 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
83 # parser.add_argument("-r", "--rank", dest='rank',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
85 # action="store_const")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
87 help="[Optional] ncbi tabular file with taxonomy organized to represent a tree",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
88 metavar="FILE")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
89 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
90 help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
91 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
92 parser.add_argument("-z", "--test_all", dest='b_test_all',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
93 help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
94 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
95 parser.add_argument("-v", "--verbose", dest='b_verbose',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
96 help="[Optional] To have details on records when running",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
97 action='store_true')
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
98 parser.set_defaults(b_load_ncbi_tax_f=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
99 parser.set_defaults(b_test_all=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
100 parser.set_defaults(b_verbose=False)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
101
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
102 # get absolute path in case of files
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
103 args = parser.parse_args()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
104
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
105 # -------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
106 # check arguments
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
107 b_test_all = args.b_test_all
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
108
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
109 if b_test_all:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
110 b_test_load_taxids = False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
111 b_test_add_host_chr_taxids_accnr_from_ori_list = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
112 b_test = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
113 b_acc_in_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
114 b_acc_out_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
115 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
116 b_test = (b_test_load_taxids or
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
117 b_test_add_host_chr_taxids_accnr_from_ori_list)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
118
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
119 if ((not b_test)and
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
120 ((len(sys.argv) < 1) or (len(sys.argv) > 5))):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
121 print("\n".join(["To use this scripts, run:",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
122 "conda activate TAXID_genusexpand_taxid2acc",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
123 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
124 " ",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
125 "Then you won't need --test_all --load_ncbi_tax_f options\n\n",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
126 "Then, as an example:\n\n",
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
127 ' '.join(['./TAXID_genusexpand_taxid2acc.py',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
128 '-i taxid_accnr_list.tsv',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
129 '-o accnr_out_list.txt']),"\n\n" ]))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
130 parser.print_help()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
131 print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
132 " arguments, exit line "+str(frame.f_lineno))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
133 sys.exit(0)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
134
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
135 # print('args:', args)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
136 # if(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
137 if args.ncbi_tax_f is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
138 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
139 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
140 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
141 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
142 if args.taxid_acc_in_f is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
143 taxid_acc_in_f = os.path.abspath(args.taxid_acc_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
144 b_acc_in_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
145 elif(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
146 sys.exit("[Error] You must provide taxid_acc_in_f")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
147 if args.acc_out_f is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
148 acc_out_f = os.path.abspath(args.acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
149 b_acc_out_f = True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
150 elif(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
151 sys.exit("-acc_out_f <accnr_file>n must be provided\n")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
152 # if args.rank is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
153 # rank = 'genus'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
154 # else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
155 # rank = args.rank
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
156
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
157 if args.b_verbose is not None:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
158 b_verbose = int(args.b_verbose)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
159
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
160 if(not b_test):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
161 if (not b_acc_in_f) and (not b_acc_out_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
162 sys.exit(prog_tag + "[Error] You must provide either --acc_f <file> and -acc_out_f <file>")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
163
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
164 # # store index of the rank expected by user
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
165 # rank_num = ranks{ rank }
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
166
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
167 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
168 # to sort uniq, for a list, only need to add list conversion
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
169 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
170 mapper= map # Python ≥ 3
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
171 def sort_uniq(sequence):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
172 return mapper(
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
173 operator.itemgetter(0),
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
174 itertools.groupby(sorted(sequence)))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
175 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
176
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
177 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
178 # load taxid acc list, return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
179 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
180 def load_taxids(taxid_acc_tabular_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
181
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
182 if not path.exists(taxid_acc_tabular_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
183 sys.exit("Error " + taxid_acc_tabular_f +
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
184 " file does not exist, line "+ str(sys._getframe().f_lineno) )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
185
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
186 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq "
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
187 # cmd = "cut -f 1 "+taxid_acc_tabular_f+" | sort | uniq "
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
188
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
189 for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
190 k, v = line.rstrip().split()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
191 taxidlist.append(k)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
192 accnrlist.append(v)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
193
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
194 return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
195 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
196
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
197 # test load_taxids function
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
198 # display taxidlist, then exit
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
199 if b_test_load_taxids:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
200 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
201 print("START b_test_load_taxids")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
202 print("loading "+taxid_acc_tabular_f+" file")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
203 taxidlist = load_taxids(taxid_acc_tabular_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
204 for i in range(len(taxidlist)):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
205 print(f"{taxidlist[i]}\t{accnrlist[i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
206 print("END b_test_load_taxids")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
207 if not b_test_add_host_chr_taxids_accnr_from_ori_list:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
208 sys.exit()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
209 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
210
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
211 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
212 # # needs internet connexion, not possible
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
213 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
214 # def get_leave_taxid_from_acc_nr(accnrlist):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
215
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
216 # # deduce a list of taxid from a list of accession numbers
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
217 # cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
218 # for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
219 # taxidlist.append(line.rstrip())
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
220
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
221 # return taxidlist
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
222 # # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
223
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
224 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
225 # function to retain the most recent acc nr for host complete genome found:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
226 # - return acc nr of most recent complete genome
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
227 # - print accnr species and name retained
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
228 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
229 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
230 def retain_1accnr(accnrlisttmp, speciestmp, nametmp):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
231
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
232 max_accnr_version = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
233 curr_accnr_version = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
234 max_accnr_nr = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
235 curr_accnr_nr = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
236 kept_accnr_i = 0
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
237 p = re.compile(".*?(\d+)\.(\d+)$")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
238
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
239 # print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
240
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
241 for i in range(len(accnrlisttmp)):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
242 m = p.match( accnrlisttmp[i] )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
243 if m:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
244 # print('Match found: ', m.group(2))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
245 curr_accnr_version = int(m.group(2))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
246 accnr_nr = int(m.group(1))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
247 if curr_accnr_version > max_accnr_version:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
248 max_accnr_version = curr_accnr_version
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
249 kept_accnr_i = i
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
250 # print(f"record kept_accnr_i:{kept_accnr_i}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
251 elif curr_accnr_nr > max_accnr_nr:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
252 max_accnr_nr = curr_accnr_nr
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
253 kept_accnr_i = i
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
254 # print(f"record kept_accnr_i:{kept_accnr_i}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
255
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
256 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
257 sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
258
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
259 print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
260 kept_accn = accnrlisttmp[kept_accnr_i]
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
261
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
262 return kept_accn
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
263 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
264
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
265 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
266 # procedure to find complete genome closely related to current taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
267 # goes upper in taxonomy if nothing found until order
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
268 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
269 def ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
270 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
271 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
272 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
273 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
274 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
275 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
276 ):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
277 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
278
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
279 # deduce up rank, search complet genome/chr in
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
280 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
281 rank = ncbi.get_rank([lineage[curr_index_in_lineage]])
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
282 name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]])
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
283 print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
284 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
285 intermediate_nodes=False,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
286 collapse_subspecies=False,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
287 return_tree=False
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
288 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
289 # int conversion to strings
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
290 leaves_taxids = list(map(str, leaves_taxids))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
291 leaves_taxids_list = ','.join(leaves_taxids)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
292 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
293 # print(f"{prog_tag} cmd:{cmd}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
294
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
295 # specific to retain_1accn to avoid lists are crashed by other ngd call
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
296 accnrlisttmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
297 speciestmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
298 nametmp_r = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
299 for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
300
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
301 if not re.match("^Considering", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
302 # print(f"line:{line.rstrip()}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
303 if re.match("^(?:ERROR|Error): No downloads", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
304 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
305 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
306 curr_index_in_lineage = curr_index_in_lineage - 1
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
307 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
308 return ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
309 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
310 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
311 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
312 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
313 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
314 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
315 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
316 else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
317 acc_nr, species, name = line.rstrip().split("\t")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
318 accnrlisttmp_r.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
319 speciestmp_r.append(species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
320 nametmp_r.append(name)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
321 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
322 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
323
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
324 # retain only the most recent complete genome for current treated taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
325 return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
326
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
327 # else:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
328 # print(f"line matching Considering:{line}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
329 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
330
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
331 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
332 # read taxids, deduce complete genomes available in genblank, provides in output file
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
333 # the acc number in addition to those already listed
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
334 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
335 def add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
336 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
337 acc_out_f):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
338
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
339 # store all accnr found for complete genome of current taxid (or from same family/order)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
340 # the aim is to keep only the most recent/complete
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
341 accnrlisttmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
342 speciestmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
343 nametmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
344
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
345 # get host complete genome when found using ncbi_genome_download
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
346 taxids_list=','.join(taxidlist)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
347
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
348 # # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
349 # # ncbi-genome-download as a library
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
350 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
351 # ngd.download(section='genbank',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
352 # taxids=taxids_list,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
353 # assembly_levels='chromosome',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
354 # flat_output=True,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
355 # output=ngd_out_f,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
356 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate',
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
357 # dry_run=True
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
358 # )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
359
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
360
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
361 # cmd = "cut -f 1,2 "+ngd_out_f
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
362
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
363 # for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
364 # acc_nr, species = line.rstrip().split()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
365 # accnrlist.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
366
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
367 # if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
368 # print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
369 # # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
370
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
371 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
372 # ncbi-genome-download as executable script
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
373 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
374
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
375 # load NCBITaxa
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
376 ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
377 print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
378 ncbi = NCBITaxa(dbfile=ncbi_tax_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
379 if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
380 try:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
381 ncbi.update_taxonomy_database()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
382 except:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
383 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
384
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
385 for taxid_u in taxidlist:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
386 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
387 for line in os.popen(cmd).readlines():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
388 # ERROR: No downloads matched your filter. Please check your options.
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
389 if re.match("^(?:ERROR|Error): No downloads", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
390 # get complete lineage: accept ONLY leave taxid? (species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
391 lineage = ncbi.get_lineage(int(taxid_u))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
392 name = ncbi.translate_to_names(lineage)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
393 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
394 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
395
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
396 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
397 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
398 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
399 lineage,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
400 ncbi,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
401 accnrlisttmp, # current working list
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
402 accnrlist, # final list, if something added (or min index reached), recursivity stop
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
403 speciestmp,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
404 nametmp
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
405 )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
406 accnrlist.append( new_acc_nr )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
407
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
408 # initialize for next search
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
409 accnrlisttmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
410 speciestmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
411 nametmp = []
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
412
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
413 elif not re.match("^Considering", line):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
414 # print(f"line:{line.rstrip()}")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
415 acc_nr, species, name = line.rstrip().split("\t")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
416 accnrlisttmp.append(acc_nr)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
417 speciestmp.append(species)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
418 nametmp.append(name)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
419 if b_verbose:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
420 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
421
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
422 # retain only the most recent complete genome for current treated taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
423 if len(accnrlisttmp):
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
424 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) )
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
425
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
426 # remove redundant accnr
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
427 accnrlist = list(sort_uniq(accnrlist))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
428 with open(acc_out_f, "w") as record_file:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
429 for accnr in accnrlist:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
430 record_file.write("%s\n" % (accnr))
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
431 # ------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
432
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
433 print(f"{prog_tag} {acc_out_f} file created")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
434
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
435 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
436 # test
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
437 if b_test_add_host_chr_taxids_accnr_from_ori_list:
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
438 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
439 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
440 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv'
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
441 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
442 print(f"{prog_tag} loading {taxid_acc_tabular_f} file")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
443 taxidlist = load_taxids(taxid_acc_in_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
444 print(f"{prog_tag} end loading")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
445
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
446 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
447 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
448 acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
449 print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list")
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
450 sys.exit()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
451 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
452
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
453 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
454 # MAIN
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
455 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
456 ##### MAIN
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
457 def __main__():
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
458 # load taxid_acc file
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
459 load_taxids(taxid_acc_tabular_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
460 # check in ncbi taxonomy which acc number are in and out of given taxid
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
461 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
462 accnrlist,
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
463 acc_out_f)
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
464 # --------------------------------------------------------------------------
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
465 #### MAIN END
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
466 if __name__ == "__main__": __main__()
e7dd595fb0dd Uploaded 09 03 23 first upload of script
p.lucas
parents:
diff changeset
467