taxid_genusexpand_taxid2acc: TAXID_genusexpand

comparison TAXID_genusexpand_taxid2acc.py @ 12:19e175a84d0e draft

Uploaded updated python script

author	p.lucas
date	Thu, 21 Sep 2023 15:19:55 +0000
parents	3d116861e380
children	932ba9e04f3a

comparison

equal deleted inserted replaced

-:3d116861e380
+:19e175a84d0e
 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)
 ###
 ### Libraries to import:
 import argparse, os, sys, csv, warnings, re, itertools, operator
+#from subprocess import Popen,PIPE
+import subprocess
 from os import path
 import ncbi_genome_download as ngd
 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
 from ete3 import NCBITaxa
 # to be able to report line number in error messages
 import inspect
-frame = inspect.currentframe()
+def lineno():
+"""Returns the current line number in our program."""
+return str(inspect.currentframe().f_back.f_lineno)
 # debug
-b_test_load_taxids = False                            # ok 2023 03 07
+test_dir = 'test_TAXID_genusexpand_taxid2acc/'
-b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07
+b_test_load_taxids = False                             # ok 2023 08 25
+b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 08 25
 prog_tag = '[' + os.path.basename(__file__) + ']'
 # boolean to know if we dowload ncbi taxonomy file in current env
 b_load_ncbi_tax_f = False
 accnrlist = []
 # order = -4
 # family or clade = -3
 # subtribe or genus = -2
-curr_index_in_lineage = -2
+curr_index_in_lineage = -1
 min_index_in_lineage = -4
 # boolean to know if we download ncbi taxonomy file in current env
 b_load_ncbi_tax_f = False
 b_test_all        = False
 b_test            = False
 b_acc_in_f        = False
 b_acc_out_f       = False
 b_verbose         = False
+# variables for ncbi-genome-download
+ncbigenomedownload_section = 'refseq' # genbank
+organisms_to_search_in = 'vertebrate_other,vertebrate_mammalian,plant,invertebrate'
+assembly_levels = 'complete,chromosome'
 # rank = '' # rank level retained by user
 # rank_num = index of rank retained by user
 # # set to check that provided rank exist to be sure to be able to use it
 ' '.join(['./TAXID_genusexpand_taxid2acc.py',
 '-i taxid_accnr_list.tsv',
 '-o accnr_out_list.txt']),"\n\n" ]))
 parser.print_help()
 print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
-" arguments, exit line "+str(frame.f_lineno))
+" arguments, exit line "+lineno())
 sys.exit(0)
 # print('args:', args)
 # if(not b_test):
 if args.ncbi_tax_f is not None:
 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f)
 else:
 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")
 if args.taxid_acc_in_f is not None:
-taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f)
+taxid_acc_in_f = os.path.abspath(args.taxid_acc_f)
 b_acc_in_f = True
 elif(not b_test):
 sys.exit("[Error] You must provide taxid_acc_in_f")
 if args.acc_out_f is not None:
 acc_out_f = os.path.abspath(args.acc_out_f)
 # --------------------------------------------------------------------------
 # to sort uniq, for a list, only need to add list conversion
 # --------------------------------------------------------------------------
 mapper= map # Python ≥ 3
-def sort_uniq(sequence):
+def sort_uniq(sequence: list):
 return mapper(
 operator.itemgetter(0),
 itertools.groupby(sorted(sequence)))
 # --------------------------------------------------------------------------
 # --------------------------------------------------------------------------
-# load taxid acc list, return taxidlist
+# Procedure: load taxid acc list, return taxidlist
 # --------------------------------------------------------------------------
-def load_taxids(taxid_acc_tabular_f):
+def load_taxids(taxid_acc_tabular_f: str):
 if not path.exists(taxid_acc_tabular_f):
 sys.exit("Error " + taxid_acc_tabular_f +
-" file does not exist, line "+ str(sys._getframe().f_lineno) )
+" file does not exist, line "+ lineno() )
-cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq "
+cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort -u "
 for line in os.popen(cmd).readlines():
 if line.rstrip() != "":
 k, v = line.rstrip().split()
 taxidlist.append(k)
 accnrlist.append(v)
+# print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
-return taxidlist
 # --------------------------------------------------------------------------
 # test load_taxids function
 # display taxidlist, then exit
 if b_test_load_taxids:
-taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
+taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
 print("START b_test_load_taxids")
 print("loading "+taxid_acc_tabular_f+" file")
-taxidlist = load_taxids(taxid_acc_tabular_f)
+load_taxids(taxid_acc_tabular_f)
 for i in range(len(taxidlist)):
 print(f"{taxidlist[i]}\t{accnrlist[i]}")
 print("END b_test_load_taxids")
 if not b_test_add_host_chr_taxids_accnr_from_ori_list:
 sys.exit()
 # function to retain the most recent acc nr for host complete genome found:
 # - return acc nr of most recent complete genome
 # - print accnr species and name retained
 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp
 # --------------------------------------------------------------------------
-def retain_1accnr(accnrlisttmp, speciestmp, nametmp):
+def retain_1accnr(accnrlisttmp: list, speciestmp: list, nametmp: list) -> str:
 max_accnr_version  = 0
 curr_accnr_version = 0
 max_accnr_nr       = 0
 curr_accnr_nr      = 0
 return kept_accn
 # --------------------------------------------------------------------------
 # --------------------------------------------------------------------------
-# procedure to find complete genome closely related to current taxid
+# Function to find complete genome closely related to current taxid
 # goes upper in taxonomy if nothing found until order
 # --------------------------------------------------------------------------
-def ngd_upper_lineage(curr_index_in_lineage,
+def ngd_upper_lineage(curr_index_in_lineage: int,
-lineage,
+lineage: list,
-ncbi,
+ncbi: NCBITaxa,
-accnrlisttmp, # current working list
+accnrlisttmp: list, # current working list
-accnrlist,    # final list, if something added (or min index reached), recursivity stop
+accnrlist: list,    # final list, if something added (or min index reached), recursivity stop
-speciestmp,
+speciestmp: list,
-nametmp
+nametmp: list
-):
+) -> str:
 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}")
 # deduce up rank, search complet genome/chr in
 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species
 rank = ncbi.get_rank([lineage[curr_index_in_lineage]])
 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid,
 intermediate_nodes=False,
 collapse_subspecies=False,
 return_tree=False
 )
 # int conversion to strings
 leaves_taxids = list(map(str, leaves_taxids))
 leaves_taxids_list = ','.join(leaves_taxids)
-cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
-# print(f"{prog_tag} cmd:{cmd}")
+if b_verbose:
+print(f"{prog_tag} leaves_taxids for taxid {upper_taxid}:{leaves_taxids_list}")
+cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {leaves_taxids_list} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1"
+if b_verbose:
+print(f"{prog_tag} cmd:{cmd}")
 # specific to retain_1accn to avoid lists are crashed by other ngd call
 accnrlisttmp_r = []
 speciestmp_r   = []
 nametmp_r      = []
 for line in os.popen(cmd).readlines():
+#        print(f"line 314:{line.rstrip()}")
 if not re.match("^Considering", line):
-# print(f"line:{line.rstrip()}")
+#            print(f"line 316:{line.rstrip()}")
 if re.match("^(?:ERROR|Error): No downloads", line):
 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})")
 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted
 curr_index_in_lineage = curr_index_in_lineage - 1
-print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
+print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}")
 return ngd_upper_lineage(curr_index_in_lineage,
 lineage,
 ncbi,
 accnrlisttmp, # current working list
 accnrlist,    # final list, if something added (or min index reached), recursivity stop
 speciestmp,
 nametmp
 )
 else:
-acc_nr, species, name = line.rstrip().split("\t")
+#                print(f"line 331:{line}")
+try:
+acc_nr, species, name = line.rstrip().split("\t")
+except ValueError as ve:
+sys.exit(f"ValueError {ve}: for split of line '{line}'")
 accnrlisttmp_r.append(acc_nr)
 speciestmp_r.append(species)
 nametmp_r.append(name)
 if b_verbose:
 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
 # --------------------------------------------------------------------------
 # read taxids, deduce complete genomes available in genblank, provides in output file
 # the acc number in addition  to those already listed
 # --------------------------------------------------------------------------
-def add_host_chr_taxids_accnr_from_ori_list(taxidlist,
+def add_host_chr_taxids_accnr_from_ori_list(taxidlist: list,
-accnrlist,
+accnrlist: list,
-acc_out_f):
+acc_out_f: str):
 # store all accnr found for complete genome of current taxid (or from same family/order)
 # the aim is to keep only the most recent/complete
 accnrlisttmp = []
 speciestmp   = []
 taxids_list=','.join(taxidlist)
 # # ------------------------------------------
 # # ncbi-genome-download as a library
 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv'
-# ngd.download(section='genbank',
+# ngd.download(section=ncbigenomedownload_section,
 #              taxids=taxids_list,
-#              assembly_levels='chromosome',
+#              assembly_levels=assembly_levels,
 #              flat_output=True,
 #              output=ngd_out_f,
-#              groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate',
+#              groups=organisms_to_search_in,
 #              dry_run=True
 #              )
 # cmd = "cut -f 1,2 "+ngd_out_f
 ncbi.update_taxonomy_database()
 except:
 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
 for taxid_u in taxidlist:
-cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
+print(f"{prog_tag} treating global taxid:{taxid_u}")
+cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {taxid_u} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1"
 for line in os.popen(cmd).readlines():
+if b_verbose:
+print(f"{prog_tag} cmd:{cmd} ran, read output")
 # ERROR: No downloads matched your filter. Please check your options.
 if re.match("^(?:ERROR|Error): No downloads", line):
 # get complete lineage: accept ONLY leave taxid? (species)
 lineage = ncbi.get_lineage(int(taxid_u))
 name = ncbi.translate_to_names(lineage)
 if b_verbose:
 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}")
 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome
-# print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}")
+# print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}")
 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage,
 lineage,
 ncbi,
 accnrlisttmp, # current working list
 accnrlist, # final list, if something added (or min index reached), recursivity stop
 speciestmp,
 nametmp
 )
-accnrlist.append( new_acc_nr )
+if new_acc_nr is None:
+print(f"No acc_nr found after going up in taxonomy, line {lineno()}")
+else:
+accnrlist.append( new_acc_nr )
+# print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
 # initialize for next search
 accnrlisttmp = []
 speciestmp   = []
 nametmp      = []
 elif not re.match("^Considering", line):
 # print(f"line:{line.rstrip()}")
 acc_nr, species, name = line.rstrip().split("\t")
 accnrlisttmp.append(acc_nr)
+# print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
 speciestmp.append(species)
 nametmp.append(name)
 if b_verbose:
 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})")
 # retain only the most recent complete genome for current treated taxid
 if len(accnrlisttmp):
 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) )
+# print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
 # remove redundant accnr
+print(f"accnrlist to sort:{accnrlist}")
 accnrlist = list(sort_uniq(accnrlist))
 with open(acc_out_f, "w") as record_file:
 for accnr in accnrlist:
 record_file.write("%s\n" % (accnr))
 # ------------------------------------------
 print(f"{prog_tag} {acc_out_f} file created")
 # --------------------------------------------------------------------------
 # test
 if b_test_add_host_chr_taxids_accnr_from_ori_list:
-taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
+taxid_acc_in_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
-taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv'
+acc_out_f = test_dir + 'megablast_out_f_taxid_acc_hostexpanded.tsv'
-acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv'
 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list")
-print(f"{prog_tag} loading {taxid_acc_tabular_f} file")
+print(f"{prog_tag} loading {taxid_acc_in_f} file")
-taxidlist = load_taxids(taxid_acc_in_f)
+load_taxids(taxid_acc_in_f)
+for i in range(len(taxidlist)):
+print(f"{taxidlist[i]}\t{accnrlist[i]}")
 print(f"{prog_tag} end loading")
 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
 accnrlist,
 acc_out_f)
 # MAIN
 # --------------------------------------------------------------------------
 ##### MAIN
 def __main__():
 # load taxid_acc file
-load_taxids(taxid_acc_in_f)
+load_taxids(taxid_acc_tabular_f)
 # check in ncbi taxonomy which acc number are in and out of given taxid
 add_host_chr_taxids_accnr_from_ori_list(taxidlist,
 accnrlist,
 acc_out_f)
 # --------------------------------------------------------------------------

Mercurial > repos > p.lucas > taxid_genusexpand_taxid2acc

comparison TAXID_genusexpand_taxid2acc.py @ 12:19e175a84d0e draft