annotate MEGABLAST_TAB_get_taxid_acc.py @ 9:9a0ce5d04cbc draft default tip

Fix erreur commande
author p.lucas
date Wed, 09 Feb 2022 16:33:40 +0000
parents b5b8f3cb240c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
1 #!/usr/bin/env python
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
2 # -*- coding: utf-8 -*-
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
3 ###
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
4 # From a megablast file of results (tabular 25 columns) and taxid(s) user is interested in,
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
5 # provide 1 file with 2 columns: taxids acc
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
6 ###
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
7
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
8 ### Libraries to import:
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
9 # NOTE: Python 2.7 because needs krona env that MUST use 2.7 (last krona version load 2022 01 21)
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
10 # NOTE: to update krona tax in conda env, run:
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
11 # ktUpdateTaxonomy.sh
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
12 # ktUpdateTaxonomy.sh --accessions (this one NOT PROVIDED IN DOCUMENTATION)
6
b5b8f3cb240c MAJ python script by FT
p.lucas
parents: 0
diff changeset
13 import argparse, os, sys, warnings
0
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
14 # NEEDS the krona conda environment to get access to ktGetTaxIDFromAcc
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
15 from os import path
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
16
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
17 # to be able to report line number in error messages
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
18 import inspect
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
19 frame = inspect.currentframe()
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
20
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
21 # debug
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
22 b_test_creates_taxid_acc_f_from_megablast_res = False # ok 2022 01 21
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
23
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
24 prog_tag = '[' + os.path.basename(__file__) + ']'
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
25
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
26 krona_taxid_acc_f = ''
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
27 krona_tab_dir = ''
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
28
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
29 # taxid found under the taxid searched for
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
30 tax_in = []
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
31
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
32 # boolean to know if we download ncbi taxonomy file in current env
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
33 b_test_all = False
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
34 b_test = False
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
35
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
36 # min_nr_reads_by_accnr = 1
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
37
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
# Command-line interface:
#   -r / --megablast_tabular_f : input megablast tabular results file
#   -o / --tax_acc_out_f       : output file (2 columns: taxid, accession)
#   -z / --test_all            : run the built-in self-test instead
parser = argparse.ArgumentParser()
parser.add_argument("-r", "--megablast_tabular_f", dest='megablast_f',
                    help="megablast results tabular file (25 columns), including accession numbers in col 2",
                    metavar="FILE")
# The help text used to describe the output of another tool; this script
# writes the 2-column taxid/accession table announced in the file header.
parser.add_argument("-o", "--tax_acc_out_f", dest='tax_acc_out_f',
                    help="Output tabular file with 2 columns: taxid and accession number of each megablast hit",
                    metavar="FILE")
# parser.add_argument("-m", "--min_number_off_reads_by_acc_nr", dest='min_nr_reads_by_accnr',
#                     help="[Optional] minimal number of reads matching an accession number to take it into account (default:1)",
#                     metavar="INT")
parser.add_argument("-z", "--test_all", dest='b_test_all',
                    help="[Optional] run all tests",
                    action='store_true')
# Defaults for attributes that have no command-line option (kept so that the
# parsed namespace always exposes them).
parser.set_defaults(b_load_ncbi_tax_f=False)
parser.set_defaults(b_test_all=False)
parser.set_defaults(min_nr_reads_by_accnr=1)
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
54
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
55 # get absolute path in case of files
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
56 args = parser.parse_args()
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
57
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
58 # -------------------------------------------
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
59 # check arguments
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
# Test mode: -z/--test_all forces the self-test on; otherwise the hard-coded
# debug switch defined at the top of the file decides.
b_test_all = args.b_test_all

if b_test_all:
    b_test_creates_taxid_acc_f_from_megablast_res = True
    b_test = True
else:
    b_test = b_test_creates_taxid_acc_f_from_megablast_res

if not b_test:
    # Validate the parsed options rather than counting sys.argv entries, so
    # that equivalent spellings such as --tax_acc_out_f=FILE are accepted and
    # a missing option can never leave megablast_f / tax_acc_out_f undefined.
    missing_opts = []
    if args.megablast_f is None:
        missing_opts.append("-r/--megablast_tabular_f")
    if args.tax_acc_out_f is None:
        missing_opts.append("-o/--tax_acc_out_f")

    if missing_opts:
        print("\n".join(["To use this scripts, install first MEGABLAST_TAB_get_acc_under_taxid_in_out.yaml conda env. Then run:",
                         "conda activate MEGABLAST_TAB_get_taxid_acc",
                         "ktUpdateTaxonomy.sh",
                         "ktUpdateTaxonomy.sh --accessions",
                         "\n\n",
                         "Example: " + ' '.join(['./MEGABLAST_TAB_get_taxid_acc.py',
                                                 '-r megablast_out_f_25clmn.tsv',
                                                 '-o megablast_out_f_taxid_acc.tsv']), "\n\n"]))
        parser.print_help()
        # A string argument makes sys.exit() write the message to stderr and
        # return a non-zero status, so callers can detect the failure.
        sys.exit(prog_tag + "[Error] missing mandatory argument(s): " +
                 ", ".join(missing_opts))

    # get absolute path of both files
    megablast_f = os.path.abspath(args.megablast_f)
    tax_acc_out_f = os.path.abspath(args.tax_acc_out_f)
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
91 # if min_nr_reads_by_accnr is not None:
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
92 # min_nr_reads_by_accnr = args.min_nr_reads_by_accnr
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
93
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
94 # ------------------------------------------------------------------------
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
95 # from a megablast tsv output file with 25 columns, return a file with only
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
96 # taxid acc
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
97 # output file name provided as parameter
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
98 # ------------------------------------------------------------------------
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
99 krona_taxid_acc_f = ''
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
def creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f):
    """Create the taxid/accession file of a megablast tabular result.

    Column 2 of ``megablast_f`` is extracted with ``cut`` and piped into
    ``ktGetTaxIDFromAcc -tax <krona db dir> -p``; the output of that command
    is written to ``tax_acc_out_f``. Lines are deliberately NOT deduplicated
    because the exact number of occurrences of each accession is needed.

    The two commands are started directly (no shell), so file names holding
    spaces or shell metacharacters are passed through unchanged.

    Args:
        megablast_f: path of the megablast tabular results file, accession
            numbers expected in column 2.
        tax_acc_out_f: path of the file to write (overwritten if present).

    Raises:
        SystemExit: if the krona accession database or the input file is
            missing, if a command cannot be started, or if one of the
            commands ends with a non-zero status.
    """
    # Local import: keeps this function usable without touching the
    # module-level import list.
    import subprocess

    acc_col_nb_in_megablast_res = str(2)
    # krona_taxdb_f = os.path.expanduser('~/miniconda3/envs/krona/opt/krona/taxonomy/')
    krona_taxdb_f = os.path.expanduser('/db/krona/')
    acc2taxid_f = os.path.join(krona_taxdb_f, 'all.accession2taxid.sorted')
    if not os.path.isfile(acc2taxid_f):
        sys.exit(prog_tag + "[Error] missing " + acc2taxid_f + " file, please run 'ktUpdateTaxonomy.sh --accessions' in your krona conda environment (and 'ktUpdateTaxonomy.sh' before if you have not done)")
    if not os.path.isfile(megablast_f):
        sys.exit(prog_tag + "[Error] megablast results file " + megablast_f + " not found")

    cut_cmd = ['cut', '-f', acc_col_nb_in_megablast_res, megablast_f]
    kt_cmd = ['ktGetTaxIDFromAcc', '-tax', krona_taxdb_f, '-p']
    # print("cmd:" + ' '.join(cut_cmd) + ' | ' + ' '.join(kt_cmd))
    try:
        with open(tax_acc_out_f, 'w') as out_h:
            cut_p = subprocess.Popen(cut_cmd, stdout=subprocess.PIPE)
            try:
                kt_p = subprocess.Popen(kt_cmd, stdin=cut_p.stdout,
                                        stdout=out_h)
            finally:
                # Drop the parent's copy of the pipe so that cut is stopped
                # (SIGPIPE) if ktGetTaxIDFromAcc is absent or exits early.
                cut_p.stdout.close()
            kt_status = kt_p.wait()
            cut_status = cut_p.wait()
    except (OSError, IOError) as err:
        sys.exit(prog_tag + "[Error] cannot create " + tax_acc_out_f +
                 ": " + str(err))

    if cut_status != 0 or kt_status != 0:
        sys.exit(prog_tag + "[Error] creation of " + tax_acc_out_f +
                 " failed (cut exit status: " + str(cut_status) +
                 ", ktGetTaxIDFromAcc exit status: " + str(kt_status) + ")")
    print(prog_tag + ' ' + tax_acc_out_f + " file created")
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
115
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
116 # test creates_taxid_acc_f_from_megablast_res function
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
117 # display created file and header
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
# Self-test of creates_taxid_acc_f_from_megablast_res: runs it on the example
# file expected in the current directory, shows the first lines of the result
# and stops the program.
if b_test_creates_taxid_acc_f_from_megablast_res:
    megablast_f = "megablast_out_f_25clmn.tsv"
    print(prog_tag + " START b_test_creates_taxid_acc_f_from_megablast_res")
    print(prog_tag + " loading "+megablast_f+" file")
    tax_acc_out_f = 'megablast_out_f_taxid_acc.tsv'
    creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f)
    if not os.path.isfile(tax_acc_out_f):
        sys.exit(prog_tag + "[Error] creates_taxid_acc_f_from_megablast_res has not created file "+tax_acc_out_f)
    print(prog_tag + " " + tax_acc_out_f + " file created, start with:")
    # Show the first 10 lines (what `head` displays) without spawning a shell
    # and without printing a command exit status after the file content.
    with open(tax_acc_out_f) as res_h:
        for line_nb, res_line in enumerate(res_h):
            if line_nb >= 10:
                break
            sys.stdout.write(res_line)
    print("END b_test_creates_taxid_acc_f_from_megablast_res")
    # The test succeeded: report it and leave with a success status.
    print("Exit program after test")
    sys.exit(0)
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
132
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
133 # --------------------------------------------------------------------------
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
134
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
135 # --------------------------------------------------------------------------
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
136 # MAIN
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
137 # --------------------------------------------------------------------------
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
138 ##### MAIN
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
def __main__():
    """Run the normal (non-test) workflow of the script.

    Reads the module-level ``megablast_f`` and ``tax_acc_out_f`` paths set
    while checking the command-line arguments and builds the taxid/accession
    file from the megablast results.
    """
    creates_taxid_acc_f_from_megablast_res(
        megablast_f,
        tax_acc_out_f,
    )


#### MAIN END
# Only run the workflow when the file is executed as a script.
if __name__ == "__main__":
    __main__()
c5e214c34bfc Uploaded
p.lucas
parents:
diff changeset
145