Next changeset 1:f83dad3524f9 (2023-09-21) |
Commit message:
Uploaded python script v1 |
added:
TAXID_genusexpand_taxid2acc_offline_searchinhostcompletegenomedb.py |
b |
diff -r 000000000000 -r e5b355a33841 TAXID_genusexpand_taxid2acc_offline_searchinhostcompletegenomedb.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TAXID_genusexpand_taxid2acc_offline_searchinhostcompletegenomedb.py Thu Sep 21 15:22:59 2023 +0000 |
[ |
b'@@ -0,0 +1,750 @@\n+#!/usr/bin/env python3\n+# -*- coding: utf-8 -*-\n+###\n+# USE PYTHON3\n+# From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads)\n+# provide 2 files:\n+# - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level)\n+# - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level)\n+###\n+\n+###\xc2\xa0Libraries to import:\n+import argparse, os, sys, csv, warnings, re, itertools, operator\n+#from subprocess import Popen,PIPE\n+import subprocess\n+from os import path\n+from natsort import natsorted\n+# to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...)\n+from ete3 import NCBITaxa\n+\n+# to be able to report line number in error messages\n+import inspect\n+def lineno():\n+ """Returns the current line number in our program."""\n+ return str(inspect.currentframe().f_back.f_lineno)\n+\n+\n+# debug\n+test_dir = \'test_TAXID_genusexpand_taxid2acc_offline_searchinhostcompletegenomedb/\'\n+b_test_load_taxids = False # ok 2023 09 11 with rm redundant taxid code\n+b_test_get_host_complete_genome_acc_nr_found = False # ok 2023 09 11\n+\n+prog_tag = \'[\' + os.path.basename(__file__) + \']\'\n+\n+# boolean to know if we dowload ncbi taxonomy file in current env\n+b_load_ncbi_tax_f = False\n+\n+# list of interesting taxid (fathers)\n+taxidlist_f = \'\'\n+taxidlist = []\n+accnrlist = []\n+taxidlisthosts = []\n+accnrlisthosts = []\n+ \n+# order = -4\n+# family or clade = -3\n+# subtribe or genus = -2\n+curr_index_in_lineage = -1\n+min_index_in_lineage = -4\n+\n+# boolean to know if we download ncbi taxonomy file in current env\n+b_load_ncbi_tax_f = False\n+b_test_all = False\n+b_test = False\n+b_acc_in_f = False\n+b_acc_hostdb_in_f = False\n+b_acc_out_f = False\n+\n+b_verbose = False\n+\n+# variables for ncbi-genome-download\n+ncbigenomedownload_section = \'refseq\' # genbank\n+organisms_to_search_in = \'vertebrate_other,vertebrate_mammalian,plant,invertebrate\'\n+assembly_levels = \'complete,chromosome\'\n+\n+# rank = \'\' # rank level retained by user\n+# rank_num = index of rank retained by user\n+\n+# # set to check that provided rank exist to be sure to be able to use it\n+# ranks = {\n+# \'superkingdom\' => 0,\n+# # \'clade\', # no, several clade, name is too generic\n+# \'kingdom\' => 1,\n+# \'phylum\' => 2,\n+# \'subphylum\' => 3,\n+# \'superclass\' => 4,\n+# \'class\' => 5,\n+# \'superorder\' => 6,\n+# \'order\' => 7,\n+# \'suborder\' => 8,\n+# \'infraorder\' => 9,\n+# \'parvorder\' => 10,\n+# \'superfamily\' => 11,\n+# \'family\' => 12,\n+# \'subfamily\' => 13,\n+# \'genus\' => 14,\n+# \'species\' => 15,\n+# \'subspecies\' => 16\n+# }\n+\n+parser = argparse.ArgumentParser()\n+parser.add_argument("-i", "--taxid_acc_in_f", dest=\'taxid_acc_in_f\',\n+ help="taxid acc_number list in tsv (tabular separated at each line)",\n+ metavar="FILE")\n+parser.add_argument("-d", "--taxid_acc_hostdb_in_f", dest=\'taxid_acc_hostdb_in_f\',\n+ help="taxid acc_number list in tsv (tabular separated at each line) for HOSTS",\n+ metavar="FILE")\n+parser.add_argument("-o", "--acc_out_f", dest=\'acc_out_f\',\n+ help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers of COMPLETE GENOMES under taxid in ncbi taxonomy tree",\n+ metavar="FILE")\n+# parser.add_argument("-r", "--rank", dest=\'rank\',\n+# help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this \'rank\' (genus) taxid list",\n+# action="store_const")\n+parser.add_argument("-n", "--ncbi_tax_f", dest=\'nc'..b' \n+ # remove redundant accnr\n+ print(f"accnrlist_res to sort:{accnrlist_res}")\n+ accnrlist_res = list(sort_uniq(accnrlist_res))\n+ with open(acc_out_f, "w") as record_file:\n+ for accnr in accnrlist_res:\n+ record_file.write("%s\\n" % (accnr))\n+ # if empty list, we record a endofline only in the file to avoid a Galaxy error\n+ if len(accnrlist_res) == 0:\n+ with open(acc_out_f, "w") as record_file:\n+ record_file.write("\\n")\n+\n+ print(f"{prog_tag} {acc_out_f} file created")\n+ print(f"{prog_tag} [TEST get_host_complete_genome_acc_nr_found] END")\n+ sys.exit()\n+# ----------------------------------------------------------------------\n+\n+# --------------------------------------------------------------------------\n+# MAIN\n+# --------------------------------------------------------------------------\n+##### MAIN\n+def __main__():\n+ \n+ # load taxid_acc file\n+ load_taxids(taxid_acc_in_f,\n+ taxidlist,\n+ accnrlist,\n+ True) # remove redundant taxids (avoid to search n times the same acc nr)\n+\n+ # load host db taxid acc\n+ load_taxids(taxid_acc_hostdb_in_f,\n+ taxidlisthosts,\n+ accnrlisthosts,\n+ False) # do not try to remove redundancy, there is no redundant taxid in db\n+ \n+ # load NCBITaxa\n+ ncbi = NCBITaxa() # Install ete3 db in local user file (.ete_toolkit/ directory)\n+ print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f)\n+ ncbi = NCBITaxa(dbfile=ncbi_tax_f)\n+ if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f:\n+ try:\n+ ncbi.update_taxonomy_database()\n+ except:\n+ warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs")\n+\n+ # global final results: accnrlist\n+ accnrlist_res = []\n+ \n+ # for each taxid found in res\n+ for taxid_u in taxidlist:\n+\n+ # get complete lineage: accept ONLY leave taxid? (species)\n+ lineage = ncbi.get_lineage(int(taxid_u))\n+ # get species and names\n+ # accnrlisttmp = acc nr of found complete genomes\n+ nametmp = ncbi.translate_to_names(lineage)\n+ \n+ taxid_u_l = [taxid_u]\n+ \n+ """\n+ speciestmp = \'\'\n+ # get species name\n+ for taxid in lineage.reversed:\n+ if ncbi.get_rank(taxid) == \'species\':\n+ speciestmp = ncbi.get_taxid_translator(taxid)\n+ break\n+ """\n+ \n+ if b_verbose:\n+ print(f"taxid:{taxid_u}\\tlineage:{lineage}\\tname:{nametmp}")\n+\n+ try:\n+ # check in ncbi taxonomy which acc number are in and out of given taxid\n+ accnrlisttmp = get_host_complete_genome_acc_nr_found(\n+ taxidlisthosts,\n+ accnrlisthosts,\n+ taxid_u_l,\n+ # for further taxonomy analysis\n+ curr_index_in_lineage,\n+ lineage,\n+ ncbi) \n+ accnrlist_res.extend(accnrlisttmp)\n+ except TypeError as nterr:\n+ print(f"{prog_tag} Nothing returned for global taxid:{taxid_u}")\n+ \n+ \n+ # remove redundant accnr\n+ print(f"accnrlist_res to sort:{accnrlist_res}")\n+ accnrlist_res = list(sort_uniq(accnrlist_res))\n+ with open(acc_out_f, "w") as record_file:\n+ for accnr in accnrlist_res:\n+ record_file.write("%s\\n" % (accnr))\n+ \n+ # if empty list, we record a endofline only in the file to avoid a Galaxy error\n+ if len(accnrlist_res) == 0:\n+ with open(acc_out_f, "w") as record_file:\n+ record_file.write("\\n")\n+\n+ # ------------------------------------------\n+\n+ print(f"{prog_tag} {acc_out_f} file created")\n+ \n+ # --------------------------------------------------------------------------\n+#### MAIN END\n+if __name__ == "__main__": __main__()\n+ \n' |