Mercurial > repos > dfornika > blast_report_basic
comparison blast_report.py @ 0:5dfd84907521 draft
planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/blast_report_basic commit bc359460bb66db7946cc68ccbd47cd479624c4a1-dirty
| author | dfornika |
|---|---|
| date | Tue, 03 Mar 2020 00:14:34 +0000 |
| parents | |
| children | 386a88793078 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:5dfd84907521 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 '''Report on BLAST results. | |
| 3 | |
| 4 python blast_report.py input_tab cheetah_tmpl output_html output_tab [-f [filter_pident]:[filterkw1,...,filterkwN]] [-b bin1_label=bin1_path[,...binN_label=binN_path]] | |
| 5 ''' | |
| 6 import argparse | |
| 7 import re | |
| 8 import sys | |
| 9 | |
| 10 from Cheetah.Template import Template | |
| 11 | |
| 12 | |
| 13 def stop_err( msg ): | |
| 14 sys.stderr.write("%s\n" % msg) | |
| 15 sys.exit(1) | |
| 16 | |
| 17 | |
| 18 class BLASTBin: | |
| 19 def __init__(self, label, file): | |
| 20 self.label = label | |
| 21 self.dict = {} | |
| 22 | |
| 23 file_in = open(file) | |
| 24 for line in file_in: | |
| 25 self.dict[line.rstrip().split('.')[0]] = '' | |
| 26 file_in.close() | |
| 27 | |
| 28 def __str__(self): | |
| 29 return "label: %s dict: %s" % (self.label, str(self.dict)) | |
| 30 | |
| 31 | |
| 32 class BLASTQuery: | |
| 33 def __init__(self, query_id): | |
| 34 self.query_id = query_id | |
| 35 self.matches = [] | |
| 36 self.match_accessions = {} | |
| 37 self.bins = {} #{bin(label):[match indexes]} | |
| 38 self.pident_filtered = 0 | |
| 39 self.kw_filtered = 0 | |
| 40 self.kw_filtered_breakdown = {} #{kw:count} | |
| 41 | |
| 42 def __str__(self): | |
| 43 return "query_id: %s len(matches): %s bins (labels only): %s pident_filtered: %s kw_filtered: %s kw_filtered_breakdown: %s" \ | |
| 44 % (self.query_id, | |
| 45 str(len(self.matches)), | |
| 46 str([bin.label for bin in bins]), | |
| 47 str(self.pident_filtered), | |
| 48 str(self.kw_filtered), | |
| 49 str(self.kw_filtered_breakdown)) | |
| 50 | |
| 51 | |
| 52 class BLASTMatch: | |
| 53 def __init__(self, subject_acc, subject_descr, score, p_cov, p_ident, subject_bins): | |
| 54 self.subject_acc = subject_acc | |
| 55 self.subject_descr = subject_descr | |
| 56 self.score = score | |
| 57 self.p_cov = p_cov | |
| 58 self.p_ident = p_ident | |
| 59 self.bins = subject_bins | |
| 60 | |
| 61 def __str__(self): | |
| 62 return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \ | |
| 63 % (self.subject_acc, | |
| 64 self.subject_descr, | |
| 65 str(self.score), | |
| 66 str(round(self.p_cov,2)), | |
| 67 str(round(self.p_ident, 2))) | |
| 68 | |
| 69 | |
| 70 | |
| 71 #PARSE OPTIONS AND ARGUMENTS | |
| 72 parser = argparse.ArgumentParser() | |
| 73 | |
| 74 parser.add_argument('-f', '--filter', | |
| 75 type='string', | |
| 76 dest='filter', | |
| 77 ) | |
| 78 parser.add_argument('-b', '--bins', | |
| 79 type='string', | |
| 80 dest='bins' | |
| 81 ) | |
| 82 parser.add_argument('-r', '--redundant', | |
| 83 dest='redundant', | |
| 84 default=False, | |
| 85 action='store_true' | |
| 86 ) | |
| 87 args = parser.parse_args() | |
| 88 | |
| 89 try: | |
| 90 input_tab, cheetah_tmpl, output_html, output_tab = args | |
| 91 except: | |
| 92 stop_err('you must supply the arguments input_tab, cheetah_tmpl and output_html.') | |
| 93 # print('input_tab: %s cheetah_tmpl: %s output_html: %s output_tab: %s' % (input_tab, cheetah_tmpl, output_html, output_tab)) | |
| 94 | |
| 95 | |
| 96 #BINS | |
| 97 bins=[] | |
| 98 if args.bins != None: | |
| 99 bins = list([BLASTBin(label_file.split('=')[0],label_file.split('=')[-1]) for label_file in args.bins.split(',')]) | |
| 100 print('database bins: %s' % str([bin.label for bin in bins])) | |
| 101 | |
| 102 #FILTERS | |
| 103 filter_pident = 0 | |
| 104 filter_kws = [] | |
| 105 if args.filter != None: | |
| 106 pident_kws = args.filter.split(':') | |
| 107 filter_pident = float(pident_kws[0]) | |
| 108 filter_kws = pident_kws[-1].split(',') | |
| 109 print('filter_pident: %s filter_kws: %s' % (str(filter_pident), str(filter_kws))) | |
| 110 | |
| 111 if args.redundant: | |
| 112 print('Throwing out redundant hits...') | |
| 113 | |
| 114 #RESULTS! | |
| 115 PIDENT_COL = 2 | |
| 116 DESCR_COL = 25 | |
| 117 SUBJ_ID_COL = 12 | |
| 118 SCORE_COL = 11 | |
| 119 PCOV_COL = 24 | |
| 120 queries = [] | |
| 121 current_query = '' | |
| 122 output_tab = open(output_tab, 'w') | |
| 123 | |
| 124 with open(input_tab) as input_tab: | |
| 125 for line in input_tab: | |
| 126 cols = line.split('\t') | |
| 127 if cols[0] != current_query: | |
| 128 current_query = cols[0] | |
| 129 queries.append(BLASTQuery(current_query)) | |
| 130 | |
| 131 try: | |
| 132 accs = cols[SUBJ_ID_COL].split('|')[1::2][1::2] | |
| 133 except IndexError as e: | |
| 134 stop_err("Problem with splitting:" + cols[SUBJ_ID_COL]) | |
| 135 | |
| 136 #hsp option: keep best (first) hit only for each query and accession id. | |
| 137 if args.redundant: | |
| 138 if accs[0] in queries[-1].match_accessions: | |
| 139 continue #don't save the result and skip to the next | |
| 140 else: | |
| 141 queries[-1].match_accessions[accs[0]] = '' | |
| 142 | |
| 143 | |
| 144 p_ident = float(cols[PIDENT_COL]) | |
| 145 #FILTER BY PIDENT | |
| 146 if p_ident < filter_pident: #if we are not filtering, filter_pident == 0 and this will never evaluate to True | |
| 147 queries[-1].pident_filtered += 1 | |
| 148 continue | |
| 149 | |
| 150 descrs = cols[DESCR_COL] | |
| 151 #FILTER BY KEY WORDS | |
| 152 filter_by_kw = False | |
| 153 for kw in filter_kws: | |
| 154 kw = kw.strip() #Fix by Damion D Nov 2013 | |
| 155 if kw != '' and re.search(kw, descrs, re.IGNORECASE): | |
| 156 filter_by_kw = True | |
| 157 try: | |
| 158 queries[-1].kw_filtered_breakdown[kw] += 1 | |
| 159 except: | |
| 160 queries[-1].kw_filtered_breakdown[kw] = 1 | |
| 161 if filter_by_kw: #if we are not filtering, for loop will not be entered and this will never be True | |
| 162 queries[-1].kw_filtered += 1 | |
| 163 continue | |
| 164 descr = descrs.split(';')[0] | |
| 165 | |
| 166 #ATTEMPT BIN | |
| 167 subj_bins = [] | |
| 168 for bin in bins: #if we are not binning, bins = [] so for loop not entered | |
| 169 for acc in accs: | |
| 170 if acc.split('.')[0] in bin.dict: | |
| 171 try: | |
| 172 queries[-1].bins[bin.label].append(len(queries[-1].matches)) | |
| 173 except: | |
| 174 queries[-1].bins[bin.label] = [len(queries[-1].matches)] | |
| 175 subj_bins.append(bin.label) | |
| 176 break #this result has been binned to this bin so break | |
| 177 acc = accs[0] | |
| 178 | |
| 179 score = int(float(cols[SCORE_COL])) | |
| 180 p_cov = float(cols[PCOV_COL]) | |
| 181 | |
| 182 #SAVE RESULT | |
| 183 queries[-1].matches.append( | |
| 184 BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins) | |
| 185 ) | |
| 186 output_tab.write(line) | |
| 187 input_tab.close() | |
| 188 output_tab.close() | |
| 189 | |
| 190 ''' | |
| 191 for query in queries: | |
| 192 print(query) | |
| 193 for match in query.matches: | |
| 194 print(' %s' % str(match)) | |
| 195 for bin in query.bins: | |
| 196 print(' bin: %s' % bin) | |
| 197 for x in query.bins[bin]: | |
| 198 print(' %s' % str(query.matches[x])) | |
| 199 ''' | |
| 200 | |
| 201 namespace = {'queries': queries} | |
| 202 html = Template(file=cheetah_tmpl, searchList=[namespace]) | |
| 203 out_html = open(output_html, 'w') | |
| 204 out_html.write(str(html)) | |
| 205 out_html.close() | |
| 206 | |
| 207 | |
| 208 if __name__ == '__main__': | |
| 209 main() |
