comparison TAXID_genusexpand_taxid2acc.py @ 6:81acd8138218 draft

Uploaded 13 03 23 Update to the last version of the script
author p.lucas
date Mon, 13 Mar 2023 13:57:09 +0000
parents e7dd595fb0dd
children 9238e54d6532
comparison
equal deleted inserted replaced
5:b53e594a8042 6:81acd8138218
76 parser = argparse.ArgumentParser() 76 parser = argparse.ArgumentParser()
77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f', 77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f',
78 help="taxid acc_number list in tsv (tabular separated at each line)", 78 help="taxid acc_number list in tsv (tabular separated at each line)",
79 metavar="FILE") 79 metavar="FILE")
80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f', 80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f',
81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers from krona_taxid_acc_f NOT found under taxid in ncbi taxonomy tree", 81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree",
82 metavar="FILE") 82 metavar="FILE")
83 # parser.add_argument("-r", "--rank", dest='rank', 83 # parser.add_argument("-r", "--rank", dest='rank',
84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list", 84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list",
85 # action="store_const") 85 # action="store_const")
86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f', 86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f',
115 else: 115 else:
116 b_test = (b_test_load_taxids or 116 b_test = (b_test_load_taxids or
117 b_test_add_host_chr_taxids_accnr_from_ori_list) 117 b_test_add_host_chr_taxids_accnr_from_ori_list)
118 118
119 if ((not b_test)and 119 if ((not b_test)and
120 ((len(sys.argv) < 1) or (len(sys.argv) > 5))): 120 ((len(sys.argv) < 2) or (len(sys.argv) > 5))):
121 print("\n".join(["To use this scripts, run:", 121 print("\n".join([
122 "conda activate TAXID_genusexpand_taxid2acc", 122 "Aim: find accession numbers of complete genomes related to provided taxids.",
123 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f", 123 "If not found at species level, try at upper taxonomic level until order.",
124 " ", 124 "Retains only 1 complete genome is several available:",
125 "Then you won't need --test_all --load_ncbi_tax_f options\n\n", 125 " - the one with the highest version number, if not sufficient",
126 "Then, as an example:\n\n", 126 " - the one with the highest accession number",
127 ' '.join(['./TAXID_genusexpand_taxid2acc.py', 127 "To use this scripts, run:",
128 '-i taxid_accnr_list.tsv', 128 "conda activate TAXID_genusexpand_taxid2acc",
129 '-o accnr_out_list.txt']),"\n\n" ])) 129 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f",
130 " ",
131 "Then you won't need --test_all --load_ncbi_tax_f options\n\n",
132 "Then, as an example:\n\n",
133 ' '.join(['./TAXID_genusexpand_taxid2acc.py',
134 '-i taxid_accnr_list.tsv',
135 '-o accnr_out_list.txt']),"\n\n" ]))
130 parser.print_help() 136 parser.print_help()
131 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + 137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
132 " arguments, exit line "+str(frame.f_lineno)) 138 " arguments, exit line "+str(frame.f_lineno))
133 sys.exit(0) 139 sys.exit(0)
134 140
182 if not path.exists(taxid_acc_tabular_f): 188 if not path.exists(taxid_acc_tabular_f):
183 sys.exit("Error " + taxid_acc_tabular_f + 189 sys.exit("Error " + taxid_acc_tabular_f +
184 " file does not exist, line "+ str(sys._getframe().f_lineno) ) 190 " file does not exist, line "+ str(sys._getframe().f_lineno) )
185 191
186 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " 192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq "
187 # cmd = "cut -f 1 "+taxid_acc_tabular_f+" | sort | uniq "
188 193
189 for line in os.popen(cmd).readlines(): 194 for line in os.popen(cmd).readlines():
190 k, v = line.rstrip().split() 195 k, v = line.rstrip().split()
191 taxidlist.append(k) 196 taxidlist.append(k)
192 accnrlist.append(v) 197 accnrlist.append(v)
246 accnr_nr = int(m.group(1)) 251 accnr_nr = int(m.group(1))
247 if curr_accnr_version > max_accnr_version: 252 if curr_accnr_version > max_accnr_version:
248 max_accnr_version = curr_accnr_version 253 max_accnr_version = curr_accnr_version
249 kept_accnr_i = i 254 kept_accnr_i = i
250 # print(f"record kept_accnr_i:{kept_accnr_i}") 255 # print(f"record kept_accnr_i:{kept_accnr_i}")
251 elif curr_accnr_nr > max_accnr_nr: 256 elif(( curr_accnr_version == max_accnr_version)and
257 (curr_accnr_nr > max_accnr_nr)):
252 max_accnr_nr = curr_accnr_nr 258 max_accnr_nr = curr_accnr_nr
253 kept_accnr_i = i 259 kept_accnr_i = i
254 # print(f"record kept_accnr_i:{kept_accnr_i}") 260 # print(f"record kept_accnr_i:{kept_accnr_i}")
255 261
256 else: 262 else: