Mercurial > repos > p.lucas > megablast_tab_get_taxid_acc
changeset 0:c5e214c34bfc draft
Uploaded
author | p.lucas |
---|---|
date | Fri, 04 Feb 2022 10:07:58 +0000 |
parents | |
children | 6fa61da8c4c9 |
files | MEGABLAST_TAB_get_taxid_acc.py |
diffstat | 1 files changed, 154 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MEGABLAST_TAB_get_taxid_acc.py Fri Feb 04 10:07:58 2022 +0000 @@ -0,0 +1,154 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +### +# From a megablast file of results (tabular 25 columns) and taxid(s) user is interested in, +# provide 1 file with 2 columns: taxids acc +### + +### Libraries to import: +# NOTE: Python 2.7 because needs krona env that MUST use 2.7 (last krona version load 2022 01 21) +# NOTE: to update krona tax in conda env, run: +# ktUpdateTaxonomy.sh +# ktUpdateTaxonomy.sh --accessions (this one NOT PROVIDED IN DOCUMENTATION) +import argparse, os, sys, csv, re, warnings +# NEEDS to use krona conda environnement if access ktGetTaxIDFromAcc +from os import path +# from optparse import OptionParser +from datetime import datetime #, timezone +import pytz +from natsort import natsorted + +# to be able to report line number in error messages +import inspect +frame = inspect.currentframe() + +# start_time = datetime.now(timezone.utc) +start_time = datetime.now(pytz.utc) + +# debug +b_test_creates_taxid_acc_f_from_megablast_res = False # ok 2022 01 21 + +prog_tag = '[' + os.path.basename(__file__) + ']' + +krona_taxid_acc_f = '' +krona_tab_dir = '' + +# taxid found under the taxid searched for +tax_in = [] + +# boolean to know if we dowload ncbi taxonomy file in current env +b_test_all = False +b_test = False + +# min_nr_reads_by_accnr = 1 + +parser = argparse.ArgumentParser() +parser.add_argument("-r", "--megablast_tabular_f", dest='megablast_f', + help="megablast results tabular file (25 colums), including accession numbers in col 2", + metavar="FILE") +parser.add_argument("-o", "--tax_acc_out_f", dest='tax_acc_out_f', + help="Output text file with accession numbers from krona_taxid_acc_f NOT found under taxid in ncbi taxonomy tree", + metavar="FILE") +#parser.add_argument("-m", "--min_number_off_reads_by_acc_nr", dest='min_nr_reads_by_accnr', +# help="[Optional] minimal number of reads matching an accession number to take it into account (default:1)", +# metavar="INT") +parser.add_argument("-z", "--test_all", dest='b_test_all', + help="[Optional] run all tests", + action='store_true') +parser.set_defaults(b_load_ncbi_tax_f=False) +parser.set_defaults(b_test_all=False) +parser.set_defaults(min_nr_reads_by_accnr=1) + +# get absolute path in case of files +args = parser.parse_args() + +# ------------------------------------------- +# check arguments +b_test_all = args.b_test_all + +if b_test_all: + b_test_creates_taxid_acc_f_from_megablast_res = True + b_test = True +else: + b_test = b_test_creates_taxid_acc_f_from_megablast_res + +if ((not b_test)and + ((len(sys.argv) < 5) or (len(sys.argv) > 5))): + print("\n".join(["To use this scripts, install first MEGABLAST_TAB_get_acc_under_taxid_in_out.yaml conda env. Then run:", + "conda activate MEGABLAST_TAB_get_taxid_acc", + "ktUpdateTaxonomy.sh", + "ktUpdateTaxonomy.sh --accessions", + "\n\n", + "Example: "+ ' '.join(['./MEGABLAST_TAB_get_taxid_acc.py', + '-r megablast_out_f_25clmn.tsv', + '-o megablast_out_f_taxid_acc.tsv']),"\n\n" ])) + parser.print_help() + print(prog_tag + "[Error] we found "+str(len(sys.argv)) + + " arguments, exit line "+str(frame.f_lineno)) + sys.exit(0) + +# print('args:', args) +if(not b_test): + if args.megablast_f is not None: + megablast_f = os.path.abspath(args.megablast_f) + else: + sys.exit("[Error] You must provide megablast_f") + if args.tax_acc_out_f is not None: + tax_acc_out_f = os.path.abspath(args.tax_acc_out_f) + # if min_nr_reads_by_accnr is not None: + # min_nr_reads_by_accnr = args.min_nr_reads_by_accnr + +# ------------------------------------------------------------------------ +# from a megablast tsv output file with 25 columns, return a file with only +# taxid acc +# output file name provided as parameter +# ------------------------------------------------------------------------ +krona_taxid_acc_f = '' +def creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f): + acc_col_nb_in_megablast_res = str(2) + krona_taxdb_f = os.path.expanduser("/db/krona/") # krona['taxdb'] # "/nfs/data/db/tax_krona/" + if not os.path.isfile(krona_taxdb_f + "all.accession2taxid.sorted"): + sys.exit(prog_tag + "[Error] missing "+krona_taxdb_f+" file, please run 'ktUpdateTaxonomy.sh --accessions' in your krona conda environment (and 'ktUpdateTaxonomy.sh' before if you have not done)") + + # conda: "../envs/krona.yaml" + cmd = ' '.join(["cut -f", acc_col_nb_in_megablast_res, megablast_f, + '| ktGetTaxIDFromAcc -tax ',krona_taxdb_f,' -p ', + # '| uniq ', # remove many redundant lines # DO NOT USE because need exact number of each acc nr + '> ',tax_acc_out_f]) # return lines:taxid acc + # print("cmd:"+cmd) + os.system(cmd) + print(prog_tag + ' ' + tax_acc_out_f + " file created") + +# test creates_taxid_acc_f_from_megablast_res function +# display created file and header +if b_test_creates_taxid_acc_f_from_megablast_res: + megablast_f = "megablast_out_f_25clmn.tsv" + print(prog_tag + " START b_test_creates_taxid_acc_f_from_megablast_res") + print(prog_tag + " loading "+megablast_f+" file") + tax_acc_out_f = 'megablast_out_f_taxid_acc.tsv' + creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f) + if os.path.isfile(tax_acc_out_f): + print(prog_tag + " " + tax_acc_out_f + " file created, start with:") + cmd = "head " + tax_acc_out_f + print(os.system(cmd)) + else: + sys.exit(prog_tag + "[Error] creates_taxid_acc_f_from_megablast_res has not created file "+tax_acc_out_f) + print("END b_test_creates_taxid_acc_f_from_megablast_res") + sys.exit("Exit program after test") + +# -------------------------------------------------------------------------- + +# -------------------------------------------------------------------------- +# MAIN +# -------------------------------------------------------------------------- +##### MAIN +def __main__(): + # creates taxid acc file from megablast result + creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f) + + stop_time = datetime.now(pytz.utc) + duration = stop_time - start_time + print("duration:",duration) +#### MAIN END +if __name__ == "__main__": __main__() +