comparison bin/ess.py @ 0:d67268158946 draft

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author bcclaywell
date Mon, 12 Oct 2015 17:43:33 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d67268158946
1 #!/usr/bin/env python
2
3 import csv
4 import numpy
5 from biopy import bayesianStats as bs
6 import argparse
7 import re
8
9
10 commented_regex = re.compile("^\s*\#.*")
11
12 doc_template = """
13 <html>
14 <body>
15 <h1>Effective Sample Size Results</h1>
16 <p>Recommendation: {status}</p>
17
18 <h2>Explanation</h2>
19 <p>
20 Ideally, you want all these values to be greater than or equal to 200.
21 If any of them aren't, it's probably a good idea to perform a Resume run with BEAST.
22 On average, running 3x as long will increase your ESS by 3.
23 This should help give you some sense of how much longer you should run BEAST.
24 </p>
25 <p>
26 Note that while ESS statistics are valuable for getting a rough sense of whether to run or not, it is
27 <em>always</em> recommended that you manually look at your logfile traces using
28 <a href="http://tree.bio.ed.ac.uk/software/tracer/">Tracer</a>, or another trace analysis software before
29 accepting your analysis as "final".
30 </p>
31
32 <h2>Detailed Results</h2>
33 <table>
34 <tr><th>Parameter</th><th>ESS</th></tr>
35 {table_content}
36 </table>
37 </body>
38 </html>
39 """
40
41 tr_template = "<tr><td>{parameter}</td><td {style}>{ess}</td></tr>"
42
43
44 def format_tr(result, threshold=200):
45 "Takes a result [param, ess] pair and makes the appropriate tr html"
46 param, ess = result
47 passes = ess >= threshold
48 style = "" if passes else 'style="color: red;"'
49 return tr_template.format(parameter=param, ess=ess, style=style)
50
51
52 def dict_reader_to_cols(dict_reader):
53 d = dict()
54 for row in dict_reader:
55 for k, v in row.iteritems():
56 try:
57 v = float(v)
58 try:
59 d[k].append(v)
60 except KeyError:
61 d[k] = [v]
62 except ValueError:
63 if k == "" and v == "":
64 # Don't worry about this, there is a tailing \t on each line
65 pass
66 else:
67 # Otherwise... something weird is happening
68 print "Could not cast to float for kv pair:", k, v
69 return d
70
71
72 def table_contents(results, threshold=200):
73 trs = [format_tr(result, threshold) for result in sorted(results)]
74 return "\n".join(trs)
75
76
77 def html_contents(results, threshold):
78 table_content = table_contents(results, threshold)
79 within_tol = [x[1] > threshold for x in results]
80 ess_good = all(within_tol)
81 status = "Good to go" if all(within_tol) else "Should run longer"
82 return doc_template.format(status=status, table_content=table_content)
83
84
85 def get_args():
86 parser = argparse.ArgumentParser()
87 parser.add_argument('logfile', type=argparse.FileType('r'))
88 parser.add_argument('ess_out', type=argparse.FileType('w'))
89 parser.add_argument('-t', '--threshold', type=float, default=200,
90 help="""For formatting html, what's the minimal ESS score to be considered high enough?
91 (default: 200""")
92 parser.add_argument('--html-out', action="store_true",
93 help="Default is CSV output")
94 return parser.parse_args()
95
96
97 def main(args):
98 non_comment_lines = (row for row in args.logfile if not commented_regex.match(row))
99 reader = csv.DictReader(non_comment_lines, delimiter="\t")
100 data_columns = dict_reader_to_cols(reader)
101 results = []
102 for colname, data in data_columns.iteritems():
103 if colname != "Sample":
104 results.append([colname, bs.effectiveSampleSize(data)])
105
106 if args.html_out:
107 html_content = html_contents(results, args.threshold)
108 args.ess_out.write(html_content)
109 else:
110 writer = csv.writer(args.ess_out)
111 writer.writerow(["statistic", "ess"])
112 for result in results:
113 writer.writerow(result)
114
115 args.logfile.close()
116 args.ess_out.close()
117
118
119 if __name__ == '__main__':
120 main(get_args())
121
122