changeset 12:19e175a84d0e draft

Uploaded updated python script
author p.lucas
date Thu, 21 Sep 2023 15:19:55 +0000
parents 3d116861e380
children d80e5d0898a3
files TAXID_genusexpand_taxid2acc.py
diffstat 1 files changed, 83 insertions(+), 50 deletions(-) [+]
line wrap: on
line diff
--- a/TAXID_genusexpand_taxid2acc.py	Tue Mar 14 08:17:06 2023 +0000
+++ b/TAXID_genusexpand_taxid2acc.py	Thu Sep 21 15:19:55 2023 +0000
@@ -10,6 +10,8 @@
 
 ### Libraries to import:
 import argparse, os, sys, csv, warnings, re, itertools, operator
+#from subprocess import Popen,PIPE
+import subprocess
 from os import path
 import ncbi_genome_download as ngd
 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)
@@ -17,11 +19,15 @@
 
 # to be able to report line number in error messages
 import inspect
-frame = inspect.currentframe()
+def lineno():
+    """Returns the current line number in our program."""
+    return str(inspect.currentframe().f_back.f_lineno)
+
 
 # debug
-b_test_load_taxids = False                            # ok 2023 03 07
-b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07
+test_dir = 'test_TAXID_genusexpand_taxid2acc/'
+b_test_load_taxids = False                             # ok 2023 08 25
+b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 08 25
 
 prog_tag = '[' + os.path.basename(__file__) + ']'
 
@@ -36,7 +42,7 @@
 # order = -4
 # family or clade = -3
 # subtribe or genus = -2
-curr_index_in_lineage = -2
+curr_index_in_lineage = -1
 min_index_in_lineage = -4
 
 # boolean to know if we download ncbi taxonomy file in current env
@@ -48,6 +54,11 @@
 
 b_verbose         = False
 
+# variables for ncbi-genome-download
+ncbigenomedownload_section = 'refseq' # genbank
+organisms_to_search_in = 'vertebrate_other,vertebrate_mammalian,plant,invertebrate'
+assembly_levels = 'complete,chromosome'
+
 # rank = '' # rank level retained by user
 # rank_num = index of rank retained by user
 
@@ -135,7 +146,7 @@
                   '-o accnr_out_list.txt']),"\n\n" ]))
     parser.print_help()
     print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
-          " arguments, exit line "+str(frame.f_lineno))
+          " arguments, exit line "+lineno())
     sys.exit(0)
 
 # print('args:', args)
@@ -146,7 +157,7 @@
     # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite"
     ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite")
 if args.taxid_acc_in_f is not None:
-    taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f)
+    taxid_acc_in_f = os.path.abspath(args.taxid_acc_f)
     b_acc_in_f = True    
 elif(not b_test):
     sys.exit("[Error] You must provide taxid_acc_in_f")
@@ -174,39 +185,39 @@
 # to sort uniq, for a list, only need to add list conversion
 # --------------------------------------------------------------------------
 mapper= map # Python ≥ 3
-def sort_uniq(sequence):
+def sort_uniq(sequence: list):
     return mapper(
         operator.itemgetter(0),
         itertools.groupby(sorted(sequence)))
 # --------------------------------------------------------------------------
 
 # --------------------------------------------------------------------------
-# load taxid acc list, return taxidlist
+# Procedure: load taxid acc list, return taxidlist
 # --------------------------------------------------------------------------
-def load_taxids(taxid_acc_tabular_f):
+def load_taxids(taxid_acc_tabular_f: str):
 
     if not path.exists(taxid_acc_tabular_f):
         sys.exit("Error " + taxid_acc_tabular_f +
-                 " file does not exist, line "+ str(sys._getframe().f_lineno) )
+                 " file does not exist, line "+ lineno() )
 
-    cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq "
+    cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort -u "
 
     for line in os.popen(cmd).readlines():
         if line.rstrip() != "":
-          k, v = line.rstrip().split()
-          taxidlist.append(k)
-          accnrlist.append(v)
+            k, v = line.rstrip().split()
+            taxidlist.append(k)
+            accnrlist.append(v)
+            # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
 
-    return taxidlist
 # --------------------------------------------------------------------------
 
 # test load_taxids function
 # display taxidlist, then exit
 if b_test_load_taxids:
-    taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
+    taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
     print("START b_test_load_taxids")
     print("loading "+taxid_acc_tabular_f+" file")
-    taxidlist = load_taxids(taxid_acc_tabular_f)
+    load_taxids(taxid_acc_tabular_f)
     for i in range(len(taxidlist)):
         print(f"{taxidlist[i]}\t{accnrlist[i]}")
     print("END b_test_load_taxids")
@@ -233,7 +244,7 @@
 # - print accnr species and name retained
 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp 
 # --------------------------------------------------------------------------
-def retain_1accnr(accnrlisttmp, speciestmp, nametmp):
+def retain_1accnr(accnrlisttmp: list, speciestmp: list, nametmp: list) -> str:
 
     max_accnr_version  = 0
     curr_accnr_version = 0
@@ -270,17 +281,17 @@
 # --------------------------------------------------------------------------
 
 # --------------------------------------------------------------------------
-# procedure to find complete genome closely related to current taxid
+# Function to find complete genome closely related to current taxid
 # goes upper in taxonomy if nothing found until order
 # --------------------------------------------------------------------------
-def ngd_upper_lineage(curr_index_in_lineage,
-                      lineage,
-                      ncbi,                      
-                      accnrlisttmp, # current working list
-                      accnrlist,    # final list, if something added (or min index reached), recursivity stop
-                      speciestmp,
-                      nametmp
-                      ):
+def ngd_upper_lineage(curr_index_in_lineage: int,
+                      lineage: list,
+                      ncbi: NCBITaxa,                      
+                      accnrlisttmp: list, # current working list
+                      accnrlist: list,    # final list, if something added (or min index reached), recursivity stop
+                      speciestmp: list,
+                      nametmp: list
+                      ) -> str:
     print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}")
     
     # deduce up rank, search complet genome/chr in
@@ -293,25 +304,31 @@
                                              collapse_subspecies=False,
                                              return_tree=False
                                              )
+
     # int conversion to strings
     leaves_taxids = list(map(str, leaves_taxids))
-    leaves_taxids_list = ','.join(leaves_taxids) 
-    cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
-    # print(f"{prog_tag} cmd:{cmd}")
+    leaves_taxids_list = ','.join(leaves_taxids)
+
+    if b_verbose:
+        print(f"{prog_tag} leaves_taxids for taxid {upper_taxid}:{leaves_taxids_list}")    
+    cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {leaves_taxids_list} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1"
+    if b_verbose:
+        print(f"{prog_tag} cmd:{cmd}")
 
     # specific to retain_1accn to avoid lists are crashed by other ngd call
     accnrlisttmp_r = []
     speciestmp_r   = []
     nametmp_r      = []
+    
     for line in os.popen(cmd).readlines():
-
+#        print(f"line 314:{line.rstrip()}")
         if not re.match("^Considering", line):
-            # print(f"line:{line.rstrip()}")
+#            print(f"line 316:{line.rstrip()}")
             if re.match("^(?:ERROR|Error): No downloads", line):
                 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})")
                 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted
                     curr_index_in_lineage = curr_index_in_lineage - 1
-                    print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") 
+                    print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}") 
                     return ngd_upper_lineage(curr_index_in_lineage,
                                              lineage,
                                              ncbi,                                      
@@ -321,7 +338,11 @@
                                              nametmp                                     
                                              )
             else:
-                acc_nr, species, name = line.rstrip().split("\t")
+#                print(f"line 331:{line}")
+                try:
+                    acc_nr, species, name = line.rstrip().split("\t")
+                except ValueError as ve:
+                    sys.exit(f"ValueError {ve}: for split of line '{line}'")
                 accnrlisttmp_r.append(acc_nr)
                 speciestmp_r.append(species)
                 nametmp_r.append(name)                                  
@@ -339,9 +360,9 @@
 # read taxids, deduce complete genomes available in genblank, provides in output file
 # the acc number in addition  to those already listed
 # --------------------------------------------------------------------------
-def add_host_chr_taxids_accnr_from_ori_list(taxidlist,
-                                            accnrlist,
-                                            acc_out_f):
+def add_host_chr_taxids_accnr_from_ori_list(taxidlist: list,
+                                            accnrlist: list,
+                                            acc_out_f: str):
 
     # store all accnr found for complete genome of current taxid (or from same family/order)
     # the aim is to keep only the most recent/complete
@@ -355,13 +376,13 @@
     # # ------------------------------------------
     # # ncbi-genome-download as a library
     # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv'
-    # ngd.download(section='genbank',
+    # ngd.download(section=ncbigenomedownload_section,
     #              taxids=taxids_list,
-    #              assembly_levels='chromosome',
+    #              assembly_levels=assembly_levels,
     #              flat_output=True,
     #              output=ngd_out_f,
-    #              groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate',
-    #              dry_run=True                                                           
+    #              groups=organisms_to_search_in,
+    #              dry_run=True   
     #              )
 
 
@@ -390,8 +411,11 @@
             warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")
 
     for taxid_u in taxidlist:
-        cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1"
+        print(f"{prog_tag} treating global taxid:{taxid_u}")
+        cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {taxid_u} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1"
         for line in os.popen(cmd).readlines():
+            if b_verbose:
+                print(f"{prog_tag} cmd:{cmd} ran, read output")
             # ERROR: No downloads matched your filter. Please check your options.
             if re.match("^(?:ERROR|Error): No downloads", line):
                 # get complete lineage: accept ONLY leave taxid? (species)
@@ -401,7 +425,7 @@
                     print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}")
 
                 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome
-                # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") 
+                # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}") 
                 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage,
                                                lineage,
                                                ncbi,
@@ -410,7 +434,11 @@
                                                speciestmp,
                                                nametmp
                                                )
-                accnrlist.append( new_acc_nr )
+                if new_acc_nr is None:
+                    print(f"No acc_nr found after going up in taxonomy, line {lineno()}")
+                else:
+                    accnrlist.append( new_acc_nr )
+                    # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
                 
                 # initialize for next search
                 accnrlisttmp = []
@@ -421,6 +449,8 @@
                 # print(f"line:{line.rstrip()}")
                 acc_nr, species, name = line.rstrip().split("\t")
                 accnrlisttmp.append(acc_nr)
+                # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
+                            
                 speciestmp.append(species)
                 nametmp.append(name)                                  
                 if b_verbose:
@@ -429,8 +459,10 @@
         # retain only the most recent complete genome for current treated taxid
         if len(accnrlisttmp):
             accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) )
+            # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}")
 
     # remove redundant accnr
+    print(f"accnrlist to sort:{accnrlist}")
     accnrlist = list(sort_uniq(accnrlist))
     with open(acc_out_f, "w") as record_file:
         for accnr in accnrlist:
@@ -442,12 +474,13 @@
 # --------------------------------------------------------------------------
 # test
 if b_test_add_host_chr_taxids_accnr_from_ori_list:
-   taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv'
-   taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv'
-   acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv'
+   taxid_acc_in_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv'
+   acc_out_f = test_dir + 'megablast_out_f_taxid_acc_hostexpanded.tsv'
    print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list")
-   print(f"{prog_tag} loading {taxid_acc_tabular_f} file")
-   taxidlist = load_taxids(taxid_acc_in_f)
+   print(f"{prog_tag} loading {taxid_acc_in_f} file")
+   load_taxids(taxid_acc_in_f)
+   for i in range(len(taxidlist)):
+       print(f"{taxidlist[i]}\t{accnrlist[i]}")
    print(f"{prog_tag} end loading")   
    
    add_host_chr_taxids_accnr_from_ori_list(taxidlist,
@@ -463,7 +496,7 @@
 ##### MAIN
 def __main__():
     # load taxid_acc file
-    load_taxids(taxid_acc_in_f)
+    load_taxids(taxid_acc_tabular_f)
     # check in ncbi taxonomy which acc number are in and out of given taxid
     add_host_chr_taxids_accnr_from_ori_list(taxidlist,
                                             accnrlist,