Mercurial > repos > p.lucas > taxid_genusexpand_taxid2acc
changeset 12:19e175a84d0e draft
Uploaded updated python script
author | p.lucas |
---|---|
date | Thu, 21 Sep 2023 15:19:55 +0000 |
parents | 3d116861e380 |
children | d80e5d0898a3 |
files | TAXID_genusexpand_taxid2acc.py |
diffstat | 1 files changed, 83 insertions(+), 50 deletions(-) [+] |
line wrap: on
line diff
--- a/TAXID_genusexpand_taxid2acc.py Tue Mar 14 08:17:06 2023 +0000 +++ b/TAXID_genusexpand_taxid2acc.py Thu Sep 21 15:19:55 2023 +0000 @@ -10,6 +10,8 @@ ### Libraries to import: import argparse, os, sys, csv, warnings, re, itertools, operator +#from subprocess import Popen,PIPE +import subprocess from os import path import ncbi_genome_download as ngd # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) @@ -17,11 +19,15 @@ # to be able to report line number in error messages import inspect -frame = inspect.currentframe() +def lineno(): + """Returns the current line number in our program.""" + return str(inspect.currentframe().f_back.f_lineno) + # debug -b_test_load_taxids = False # ok 2023 03 07 -b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07 +test_dir = 'test_TAXID_genusexpand_taxid2acc/' +b_test_load_taxids = False # ok 2023 08 25 +b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 08 25 prog_tag = '[' + os.path.basename(__file__) + ']' @@ -36,7 +42,7 @@ # order = -4 # family or clade = -3 # subtribe or genus = -2 -curr_index_in_lineage = -2 +curr_index_in_lineage = -1 min_index_in_lineage = -4 # boolean to know if we download ncbi taxonomy file in current env @@ -48,6 +54,11 @@ b_verbose = False +# variables for ncbi-genome-download +ncbigenomedownload_section = 'refseq' # genbank +organisms_to_search_in = 'vertebrate_other,vertebrate_mammalian,plant,invertebrate' +assembly_levels = 'complete,chromosome' + # rank = '' # rank level retained by user # rank_num = index of rank retained by user @@ -135,7 +146,7 @@ '-o accnr_out_list.txt']),"\n\n" ])) parser.print_help() print(prog_tag + "[Error] we found "+str(len(sys.argv)) + - " arguments, exit line "+str(frame.f_lineno)) + " arguments, exit line "+lineno()) sys.exit(0) # print('args:', args) @@ -146,7 +157,7 @@ # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") if args.taxid_acc_in_f is not None: - taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f) + taxid_acc_in_f = os.path.abspath(args.taxid_acc_f) b_acc_in_f = True elif(not b_test): sys.exit("[Error] You must provide taxid_acc_in_f") @@ -174,39 +185,39 @@ # to sort uniq, for a list, only need to add list conversion # -------------------------------------------------------------------------- mapper= map # Python ≥ 3 -def sort_uniq(sequence): +def sort_uniq(sequence: list): return mapper( operator.itemgetter(0), itertools.groupby(sorted(sequence))) # -------------------------------------------------------------------------- # -------------------------------------------------------------------------- -# load taxid acc list, return taxidlist +# Procedure: load taxid acc list, return taxidlist # -------------------------------------------------------------------------- -def load_taxids(taxid_acc_tabular_f): +def load_taxids(taxid_acc_tabular_f: str): if not path.exists(taxid_acc_tabular_f): sys.exit("Error " + taxid_acc_tabular_f + - " file does not exist, line "+ str(sys._getframe().f_lineno) ) + " file does not exist, line "+ lineno() ) - cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " + cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort -u " for line in os.popen(cmd).readlines(): if line.rstrip() != "": - k, v = line.rstrip().split() - taxidlist.append(k) - accnrlist.append(v) + k, v = line.rstrip().split() + taxidlist.append(k) + accnrlist.append(v) + # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") - return taxidlist # -------------------------------------------------------------------------- # test load_taxids function # display taxidlist, then exit if b_test_load_taxids: - taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' + taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv' print("START b_test_load_taxids") print("loading "+taxid_acc_tabular_f+" file") - taxidlist = load_taxids(taxid_acc_tabular_f) + load_taxids(taxid_acc_tabular_f) for i in range(len(taxidlist)): print(f"{taxidlist[i]}\t{accnrlist[i]}") print("END b_test_load_taxids") @@ -233,7 +244,7 @@ # - print accnr species and name retained # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp # -------------------------------------------------------------------------- -def retain_1accnr(accnrlisttmp, speciestmp, nametmp): +def retain_1accnr(accnrlisttmp: list, speciestmp: list, nametmp: list) -> str: max_accnr_version = 0 curr_accnr_version = 0 @@ -270,17 +281,17 @@ # -------------------------------------------------------------------------- # -------------------------------------------------------------------------- -# procedure to find complete genome closely related to current taxid +# Function to find complete genome closely related to current taxid # goes upper in taxonomy if nothing found until order # -------------------------------------------------------------------------- -def ngd_upper_lineage(curr_index_in_lineage, - lineage, - ncbi, - accnrlisttmp, # current working list - accnrlist, # final list, if something added (or min index reached), recursivity stop - speciestmp, - nametmp - ): +def ngd_upper_lineage(curr_index_in_lineage: int, + lineage: list, + ncbi: NCBITaxa, + accnrlisttmp: list, # current working list + accnrlist: list, # final list, if something added (or min index reached), recursivity stop + speciestmp: list, + nametmp: list + ) -> str: print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") # deduce up rank, search complet genome/chr in @@ -293,25 +304,31 @@ collapse_subspecies=False, return_tree=False ) + # int conversion to strings leaves_taxids = list(map(str, leaves_taxids)) - leaves_taxids_list = ','.join(leaves_taxids) - cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" - # print(f"{prog_tag} cmd:{cmd}") + leaves_taxids_list = ','.join(leaves_taxids) + + if b_verbose: + print(f"{prog_tag} leaves_taxids for taxid {upper_taxid}:{leaves_taxids_list}") + cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {leaves_taxids_list} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1" + if b_verbose: + print(f"{prog_tag} cmd:{cmd}") # specific to retain_1accn to avoid lists are crashed by other ngd call accnrlisttmp_r = [] speciestmp_r = [] nametmp_r = [] + for line in os.popen(cmd).readlines(): - +# print(f"line 314:{line.rstrip()}") if not re.match("^Considering", line): - # print(f"line:{line.rstrip()}") +# print(f"line 316:{line.rstrip()}") if re.match("^(?:ERROR|Error): No downloads", line): print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted curr_index_in_lineage = curr_index_in_lineage - 1 - print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") + print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}") return ngd_upper_lineage(curr_index_in_lineage, lineage, ncbi, @@ -321,7 +338,11 @@ nametmp ) else: - acc_nr, species, name = line.rstrip().split("\t") +# print(f"line 331:{line}") + try: + acc_nr, species, name = line.rstrip().split("\t") + except ValueError as ve: + sys.exit(f"ValueError {ve}: for split of line '{line}'") accnrlisttmp_r.append(acc_nr) speciestmp_r.append(species) nametmp_r.append(name) @@ -339,9 +360,9 @@ # read taxids, deduce complete genomes available in genblank, provides in output file # the acc number in addition to those already listed # -------------------------------------------------------------------------- -def add_host_chr_taxids_accnr_from_ori_list(taxidlist, - accnrlist, - acc_out_f): +def add_host_chr_taxids_accnr_from_ori_list(taxidlist: list, + accnrlist: list, + acc_out_f: str): # store all accnr found for complete genome of current taxid (or from same family/order) # the aim is to keep only the most recent/complete @@ -355,13 +376,13 @@ # # ------------------------------------------ # # ncbi-genome-download as a library # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' - # ngd.download(section='genbank', + # ngd.download(section=ncbigenomedownload_section, # taxids=taxids_list, - # assembly_levels='chromosome', + # assembly_levels=assembly_levels, # flat_output=True, # output=ngd_out_f, - # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate', - # dry_run=True + # groups=organisms_to_search_in, + # dry_run=True # ) @@ -390,8 +411,11 @@ warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") for taxid_u in taxidlist: - cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" + print(f"{prog_tag} treating global taxid:{taxid_u}") + cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {taxid_u} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1" for line in os.popen(cmd).readlines(): + if b_verbose: + print(f"{prog_tag} cmd:{cmd} ran, read output") # ERROR: No downloads matched your filter. Please check your options. if re.match("^(?:ERROR|Error): No downloads", line): # get complete lineage: accept ONLY leave taxid? (species) @@ -401,7 +425,7 @@ print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome - # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") + # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}") new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, lineage, ncbi, @@ -410,7 +434,11 @@ speciestmp, nametmp ) - accnrlist.append( new_acc_nr ) + if new_acc_nr is None: + print(f"No acc_nr found after going up in taxonomy, line {lineno()}") + else: + accnrlist.append( new_acc_nr ) + # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") # initialize for next search accnrlisttmp = [] @@ -421,6 +449,8 @@ # print(f"line:{line.rstrip()}") acc_nr, species, name = line.rstrip().split("\t") accnrlisttmp.append(acc_nr) + # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") + speciestmp.append(species) nametmp.append(name) if b_verbose: @@ -429,8 +459,10 @@ # retain only the most recent complete genome for current treated taxid if len(accnrlisttmp): accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) + # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") # remove redundant accnr + print(f"accnrlist to sort:{accnrlist}") accnrlist = list(sort_uniq(accnrlist)) with open(acc_out_f, "w") as record_file: for accnr in accnrlist: @@ -442,12 +474,13 @@ # -------------------------------------------------------------------------- # test if b_test_add_host_chr_taxids_accnr_from_ori_list: - taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' - taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv' - acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv' + taxid_acc_in_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv' + acc_out_f = test_dir + 'megablast_out_f_taxid_acc_hostexpanded.tsv' print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") - print(f"{prog_tag} loading {taxid_acc_tabular_f} file") - taxidlist = load_taxids(taxid_acc_in_f) + print(f"{prog_tag} loading {taxid_acc_in_f} file") + load_taxids(taxid_acc_in_f) + for i in range(len(taxidlist)): + print(f"{taxidlist[i]}\t{accnrlist[i]}") print(f"{prog_tag} end loading") add_host_chr_taxids_accnr_from_ori_list(taxidlist, @@ -463,7 +496,7 @@ ##### MAIN def __main__(): # load taxid_acc file - load_taxids(taxid_acc_in_f) + load_taxids(taxid_acc_tabular_f) # check in ncbi taxonomy which acc number are in and out of given taxid add_host_chr_taxids_accnr_from_ori_list(taxidlist, accnrlist,