Mercurial > repos > dfornika > blast_report_basic
changeset 30:1d6a2561e05e draft
Uploaded
| author | dfornika | 
|---|---|
| date | Tue, 03 Mar 2020 10:55:12 +0000 | 
| parents | c3b7a0a72107 | 
| children | 11f622d60501 | 
| files | blast_report.py | 
| diffstat | 1 files changed, 41 insertions(+), 45 deletions(-) [+] | 
line wrap: on
 line diff
--- a/blast_report.py Tue Mar 03 10:11:18 2020 +0000 +++ b/blast_report.py Tue Mar 03 10:55:12 2020 +0000 @@ -2,19 +2,14 @@ from __future__ import print_function -'''Report on BLAST results. - -python blast_report.py input_tab cheetah_tmpl output_html output_tab [-i [min_identity]] [-f filterkw1,...,filterkwN]] [-b bin1_label bin1_path[,...binN_label binN_path]] -''' - import argparse import re import sys from Cheetah.Template import Template -from pprint import pprint + -def stop_err( msg ): +def stop_err(msg): sys.stderr.write("%s\n" % msg) sys.exit(1) @@ -23,12 +18,12 @@ def __init__(self, label, file): self.label = label self.dict = {} - + file_in = open(file) for line in file_in: self.dict[line.rstrip().split('.')[0]] = '' file_in.close() - + def __str__(self): return "label: %s dict: %s" % (self.label, str(self.dict)) @@ -38,13 +33,21 @@ self.query_id = query_id self.matches = [] self.match_accessions = {} - self.bins = {} #{bin(label):[match indexes]} + self.bins = {} # {bin(label):[match indexes]} self.pident_filtered = 0 self.kw_filtered = 0 - self.kw_filtered_breakdown = {} #{kw:count} - + self.kw_filtered_breakdown = {} # {kw:count} + def __str__(self): - return "query_id: %s len(matches): %s bins (labels only): %s pident_filtered: %s kw_filtered: %s kw_filtered_breakdown: %s" \ + format_string = "\t".join([ + "query_id: %s", + "len(matches): %s", + "bins (labels only): %s", + "pident_filtered: %s", + "kw_filtered: %s", + "kw_filtered_breakdown: %s" + ]) + return format_string \ % (self.query_id, str(len(self.matches)), str([bin.label for bin in bins]), @@ -61,17 +64,17 @@ self.p_cov = p_cov self.p_ident = p_ident self.bins = subject_bins - + def __str__(self): return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \ % (self.subject_acc, self.subject_descr, str(self.score), - str(round(self.p_cov,2)), + str(round(self.p_cov, 2)), str(round(self.p_ident, 2))) -#PARSE OPTIONS AND ARGUMENTS +# PARSE OPTIONS AND ARGUMENTS parser = argparse.ArgumentParser() parser.add_argument('-f', '--filter-keywords', @@ -97,20 +100,15 @@ args = parser.parse_args() -pprint(args.bins) - -print('input_tab: %s cheetah_tmpl: %s output_html: %s output_tab: %s' % (args.input_tab, args.cheetah_tmpl, args.output_html, args.output_tab)) - - -#BINS -bins=[] -if args.bins != None: +# BINS +bins = [] +if args.bins is not None: for bin in args.bins: bins.append(BLASTBin(bin[0], bin[1])) print('database bins: %s' % str([bin.label for bin in bins])) -#FILTERS +# FILTERS filter_pident = 0 filter_kws = [] if args.filter_keywords: @@ -129,7 +127,7 @@ queries = [] current_query = '' output_tab = open(args.output_tab, 'w') - + with open(args.input_tab) as input_tab: for line in input_tab: cols = line.split('\t') @@ -142,22 +140,21 @@ except IndexError as e: stop_err("Problem with splitting:" + cols[SUBJ_ID_COL]) - #hsp option: keep best (first) hit only for each query and accession id. + # keep best (first) hit only for each query and accession id. if args.discard_redundant: if accs[0] in queries[-1].match_accessions: - continue #don't save the result and skip to the next + continue # don't save the result and skip to the next else: queries[-1].match_accessions[accs[0]] = '' - p_ident = float(cols[PIDENT_COL]) - #FILTER BY PIDENT - if p_ident < filter_pident: #if we are not filtering, filter_pident == 0 and this will never evaluate to True + # FILTER BY PIDENT + if p_ident < filter_pident: # if we are not filtering, filter_pident == 0 and this will never evaluate to True queries[-1].pident_filtered += 1 continue - + descrs = cols[DESCR_COL] - #FILTER BY KEY WORDS + # FILTER BY KEY WORDS filter_by_kw = False for kw in filter_kws: kw = kw.strip() @@ -165,34 +162,34 @@ filter_by_kw = True try: queries[-1].kw_filtered_breakdown[kw] += 1 - except: + except Exception as e: queries[-1].kw_filtered_breakdown[kw] = 1 - if filter_by_kw: #if we are not filtering, for loop will not be entered and this will never be True + if filter_by_kw: # if we are not filtering, for loop will not be entered and this will never be True queries[-1].kw_filtered += 1 continue descr = descrs.split(';')[0] - - #ATTEMPT BIN + + # ATTEMPT BIN subj_bins = [] - for bin in bins: #if we are not binning, bins = [] so for loop not entered + for bin in bins: # if we are not binning, bins = [] so for loop not entered for acc in accs: if acc.split('.')[0] in bin.dict: try: queries[-1].bins[bin.label].append(len(queries[-1].matches)) - except: + except Exception as e: queries[-1].bins[bin.label] = [len(queries[-1].matches)] subj_bins.append(bin.label) - break #this result has been binned to this bin so break + break # this result has been binned to this bin so break acc = accs[0] - + score = int(float(cols[SCORE_COL])) p_cov = float(cols[PCOV_COL]) - - #SAVE RESULT + + # SAVE RESULT queries[-1].matches.append( BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins) ) - output_tab.write(line) + output_tab.write(line) input_tab.close() output_tab.close() @@ -212,4 +209,3 @@ out_html = open(args.output_html, 'w') out_html.write(str(html)) out_html.close() -
