Mercurial > repos > p.lucas > taxid_genusexpand_taxid2acc
comparison TAXID_genusexpand_taxid2acc.py @ 6:81acd8138218 draft
Uploaded 13 03 23 Update to the last version of the script
| author | p.lucas |
|---|---|
| date | Mon, 13 Mar 2023 13:57:09 +0000 |
| parents | e7dd595fb0dd |
| children | 9238e54d6532 |
comparison
equal
deleted
inserted
replaced
| 5:b53e594a8042 | 6:81acd8138218 |
|---|---|
| 76 parser = argparse.ArgumentParser() | 76 parser = argparse.ArgumentParser() |
| 77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f', | 77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f', |
| 78 help="taxid acc_number list in tsv (tabular separated at each line)", | 78 help="taxid acc_number list in tsv (tabular separated at each line)", |
| 79 metavar="FILE") | 79 metavar="FILE") |
| 80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f', | 80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f', |
| 81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers from krona_taxid_acc_f NOT found under taxid in ncbi taxonomy tree", | 81 help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree", |
| 82 metavar="FILE") | 82 metavar="FILE") |
| 83 # parser.add_argument("-r", "--rank", dest='rank', | 83 # parser.add_argument("-r", "--rank", dest='rank', |
| 84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list", | 84 # help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list", |
| 85 # action="store_const") | 85 # action="store_const") |
| 86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f', | 86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f', |
| 115 else: | 115 else: |
| 116 b_test = (b_test_load_taxids or | 116 b_test = (b_test_load_taxids or |
| 117 b_test_add_host_chr_taxids_accnr_from_ori_list) | 117 b_test_add_host_chr_taxids_accnr_from_ori_list) |
| 118 | 118 |
| 119 if ((not b_test)and | 119 if ((not b_test)and |
| 120 ((len(sys.argv) < 1) or (len(sys.argv) > 5))): | 120 ((len(sys.argv) < 2) or (len(sys.argv) > 5))): |
| 121 print("\n".join(["To use this scripts, run:", | 121 print("\n".join([ |
| 122 "conda activate TAXID_genusexpand_taxid2acc", | 122 "Aim: find accession numbers of complete genomes related to provided taxids.", |
| 123 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f", | 123 "If not found at species level, try at upper taxonomic level until order.", |
| 124 " ", | 124 "Retains only 1 complete genome is several available:", |
| 125 "Then you won't need --test_all --load_ncbi_tax_f options\n\n", | 125 " - the one with the highest version number, if not sufficient", |
| 126 "Then, as an example:\n\n", | 126 " - the one with the highest accession number", |
| 127 ' '.join(['./TAXID_genusexpand_taxid2acc.py', | 127 "To use this scripts, run:", |
| 128 '-i taxid_accnr_list.tsv', | 128 "conda activate TAXID_genusexpand_taxid2acc", |
| 129 '-o accnr_out_list.txt']),"\n\n" ])) | 129 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f", |
| 130 " ", | |
| 131 "Then you won't need --test_all --load_ncbi_tax_f options\n\n", | |
| 132 "Then, as an example:\n\n", | |
| 133 ' '.join(['./TAXID_genusexpand_taxid2acc.py', | |
| 134 '-i taxid_accnr_list.tsv', | |
| 135 '-o accnr_out_list.txt']),"\n\n" ])) | |
| 130 parser.print_help() | 136 parser.print_help() |
| 131 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + | 137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + |
| 132 " arguments, exit line "+str(frame.f_lineno)) | 138 " arguments, exit line "+str(frame.f_lineno)) |
| 133 sys.exit(0) | 139 sys.exit(0) |
| 134 | 140 |
| 182 if not path.exists(taxid_acc_tabular_f): | 188 if not path.exists(taxid_acc_tabular_f): |
| 183 sys.exit("Error " + taxid_acc_tabular_f + | 189 sys.exit("Error " + taxid_acc_tabular_f + |
| 184 " file does not exist, line "+ str(sys._getframe().f_lineno) ) | 190 " file does not exist, line "+ str(sys._getframe().f_lineno) ) |
| 185 | 191 |
| 186 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " | 192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " |
| 187 # cmd = "cut -f 1 "+taxid_acc_tabular_f+" | sort | uniq " | |
| 188 | 193 |
| 189 for line in os.popen(cmd).readlines(): | 194 for line in os.popen(cmd).readlines(): |
| 190 k, v = line.rstrip().split() | 195 k, v = line.rstrip().split() |
| 191 taxidlist.append(k) | 196 taxidlist.append(k) |
| 192 accnrlist.append(v) | 197 accnrlist.append(v) |
| 246 accnr_nr = int(m.group(1)) | 251 accnr_nr = int(m.group(1)) |
| 247 if curr_accnr_version > max_accnr_version: | 252 if curr_accnr_version > max_accnr_version: |
| 248 max_accnr_version = curr_accnr_version | 253 max_accnr_version = curr_accnr_version |
| 249 kept_accnr_i = i | 254 kept_accnr_i = i |
| 250 # print(f"record kept_accnr_i:{kept_accnr_i}") | 255 # print(f"record kept_accnr_i:{kept_accnr_i}") |
| 251 elif curr_accnr_nr > max_accnr_nr: | 256 elif(( curr_accnr_version == max_accnr_version)and |
| 257 (curr_accnr_nr > max_accnr_nr)): | |
| 252 max_accnr_nr = curr_accnr_nr | 258 max_accnr_nr = curr_accnr_nr |
| 253 kept_accnr_i = i | 259 kept_accnr_i = i |
| 254 # print(f"record kept_accnr_i:{kept_accnr_i}") | 260 # print(f"record kept_accnr_i:{kept_accnr_i}") |
| 255 | 261 |
| 256 else: | 262 else: |
