comparison statistical_hypothesis_testing.py @ 0:22ed769665b6 draft default tip

Uploaded
author bgruening
date Sun, 01 Feb 2015 18:35:40 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:22ed769665b6
1 #!/usr/bin/env python
2
3 """
4
5 """
6 import sys
7 import argparse
8 from scipy import stats
9
def columns_to_values(args, line):
    """Extract groups of integer values from one tab-separated line.

    args: iterable of index lists; each inner list holds 1-based column
          numbers making up one sample (e.g. [[1, 2], [4]]).
    line: a single tab-separated row of the input file.

    Returns a list with one list of ints per index group.
    """
    # Split once: the columns are the same for every index group
    # (the original recomputed this split on every loop iteration).
    cols = line.split('\t')
    samples = []
    for column_group in args:
        # int() tolerates the trailing newline that the last column of an
        # unstripped line carries, so no explicit strip is needed here.
        samples.append([int(index) for index in (cols[c - 1] for c in column_group)])
    return samples
20
21
def main():
    """Apply the scipy.stats routine named by --test_id to each row of a
    tabular file and append the resulting statistic(s) to that row.

    Per-row samples are taken from 1-based column indices given via
    --sample_one_cols / --sample_two_cols, and (for multi-sample tests such
    as bartlett/kruskal) from the ';'-separated groups in --sample_cols.
    The augmented rows are written tab-separated to --outfile.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', required=True, help='Tabular file.')
    parser.add_argument('-o', '--outfile', required=True, help='Path to the output file.')
    parser.add_argument("--sample_one_cols", help="Comma separated list of 1-based column indices for sample one.")
    parser.add_argument("--sample_two_cols", help="Comma separated list of 1-based column indices for sample two.")
    parser.add_argument("--sample_cols", help="Column index groups for multi-sample tests; separate groups with ';' and columns with ','.")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument("--mwu_use_continuity", action="store_true", default=False,
                        help="Whether a continuity correction (1/2.) should be taken into account.")
    parser.add_argument("--equal_var", action="store_true", default=False,
                        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.")
    parser.add_argument("--reta", action="store_true", default=False,
                        help="Whether or not to return the internally computed a values.")
    parser.add_argument("--fisher", action="store_true", default=False,
                        help="if true then Fisher definition is used")
    parser.add_argument("--bias", action="store_true", default=False,
                        help="if false,then the calculations are corrected for statistical bias")
    parser.add_argument("--inclusive1", action="store_true", default=False,
                        help="if false,lower_limit will be ignored")
    parser.add_argument("--inclusive2", action="store_true", default=False,
                        help="if false,higher_limit will be ignored")
    parser.add_argument("--inclusive", action="store_true", default=False,
                        help="if false,limit will be ignored")
    parser.add_argument("--printextras", action="store_true", default=False,
                        help="If True, if there are extra points a warning is raised saying how many of those points there are")
    # BUG FIX: the original used default="False", a truthy *string*, so the
    # flag was effectively always on when absent.
    parser.add_argument("--initial_lexsort", action="store_true", default=False,
                        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.")
    parser.add_argument("--correction", action="store_true", default=False,
                        help="continuity correction ")
    parser.add_argument("--axis", type=int, default=0,
                        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)")
    parser.add_argument("--n", type=int, default=0,
                        help="the number of trials. This is ignored if x gives both the number of successes and failures")
    parser.add_argument("--b", type=int, default=0,
                        help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0,
                        help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0,
                        help="limits")
    parser.add_argument("--mf", type=float, default=2.0,
                        help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9,
                        help="higher_limit")
    parser.add_argument("--p", type=float, default=0.5,
                        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5")
    parser.add_argument("--alpha", type=float, default=0.9,
                        help="probability")
    parser.add_argument("--new", type=float, default=0.0,
                        help="Value to put in place of values in a outside of bounds")
    parser.add_argument("--proportiontocut", type=float, default=0.0,
                        help="Proportion (in range 0-1) of total data set to trim of each end.")
    parser.add_argument("--lambda_", type=float, default=1.0,
                        help="lambda_ gives the power in the Cressie-Read power divergence statistic")
    parser.add_argument("--imbda", type=float, default=0,
                        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.")
    parser.add_argument("--base", type=float, default=1.6,
                        help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()

    # Hoisted: every branch below compared test_id.strip() — strip it once.
    test_id = args.test_id.strip()
    mf = args.mf
    nf = args.nf
    imbda = args.imbda
    limits = (mf, nf)                         # (lower, upper) for trimmed stats
    inclusive_pair = (args.inclusive1, args.inclusive2)

    # Parse the requested column groups up front, once, not per data line.
    bartlett_samples = []
    if args.sample_cols is not None:
        for group in args.sample_cols.split(';'):
            bartlett_samples.append([int(c) for c in group.split(',')])
    sample_one_cols = args.sample_one_cols.split(',') if args.sample_one_cols is not None else []
    sample_two_cols = args.sample_two_cols.split(',') if args.sample_two_cols is not None else []
    # Several tests switch between one- and two-sample forms on this flag.
    have_two = bool(sample_two_cols)

    with open(args.infile) as handle, open(args.outfile, 'w') as outfile:
        for line in handle:
            cols = line.strip().split('\t')
            # Raw string samples for this row (1-based column indices).
            sample_one = [cols[int(i) - 1] for i in sample_one_cols]
            sample_two = [cols[int(i) - 1] for i in sample_two_cols]
            # Hoisted float conversion: the original rebuilt map(float, ...)
            # inside every branch. Empty samples convert to [] harmlessly.
            one = [float(v) for v in sample_one]
            two = [float(v) for v in sample_two]
            # Multi-sample groups (only parsed when --sample_cols was given).
            b_samples = columns_to_values(bartlett_samples, line) if bartlett_samples else None

            # --- one-sample descriptive statistics -------------------------
            if test_id == 'describe':
                size, min_max, mean, uv, bs, bk = stats.describe(one)
                cols.extend([size, min_max, mean, uv, bs, bk])
            elif test_id == 'mode':
                vals, counts = stats.mode(one)
                cols.extend([vals, counts])
            elif test_id == 'nanmean':
                cols.append(stats.nanmean(one))
            elif test_id == 'nanmedian':
                cols.append(stats.nanmedian(one))
            elif test_id == 'kurtosistest':
                z_value, p_value = stats.kurtosistest(one)
                cols.extend([z_value, p_value])
            elif test_id == 'variation':
                cols.append(stats.variation(one))
            elif test_id == 'itemfreq':
                freq = stats.itemfreq(one)
                for row in freq:
                    cols.append(','.join(map(str, row)))
            elif test_id == 'boxcox_llf':
                cols.append(stats.boxcox_llf(imbda, one))
            elif test_id == 'tiecorrect':
                cols.append(stats.tiecorrect(one))
            elif test_id == 'rankdata':
                cols.append(stats.rankdata(one, method=args.md))
            elif test_id == 'nanstd':
                cols.append(stats.nanstd(one, bias=args.bias))
            elif test_id == 'anderson':
                A2, critical, sig = stats.anderson(one, dist=args.dist)
                cols.append(A2)
                cols.extend(critical)
                cols.append(',')              # original output used a literal ',' separator
                cols.extend(sig)
            elif test_id == 'binom_test':
                cols.append(stats.binom_test(one, n=args.n, p=args.p))
            elif test_id == 'gmean':
                cols.append(stats.gmean(one, dtype=args.dtype))
            elif test_id == 'hmean':
                cols.append(stats.hmean(one, dtype=args.dtype))
            elif test_id == 'kurtosis':
                cols.append(stats.kurtosis(one, axis=args.axis, fisher=args.fisher, bias=args.bias))
            elif test_id == 'moment':
                cols.append(stats.moment(one, n=args.n))
            elif test_id == 'normaltest':
                k2, p_value = stats.normaltest(one)
                cols.extend([k2, p_value])
            elif test_id == 'skew':
                cols.append(stats.skew(one, bias=args.bias))
            elif test_id == 'skewtest':
                z_value, p_value = stats.skewtest(one)
                cols.extend([z_value, p_value])
            elif test_id == 'sem':
                cols.append(stats.sem(one, ddof=args.ddof))
            elif test_id == 'zscore':
                cols.extend(stats.zscore(one, ddof=args.ddof))
            elif test_id == 'signaltonoise':
                cols.append(stats.signaltonoise(one, ddof=args.ddof))
            elif test_id == 'percentileofscore':
                cols.append(stats.percentileofscore(one, score=args.score, kind=args.kind))
            elif test_id == 'bayes_mvs':
                c_mean, c_var, c_std = stats.bayes_mvs(one, alpha=args.alpha)
                cols.extend([c_mean, c_var, c_std])
            elif test_id == 'sigmaclip':
                c, c_low, c_up = stats.sigmaclip(one, low=args.m, high=args.n)
                cols.extend([c, c_low, c_up])
            elif test_id == 'kstest':
                d, p_value = stats.kstest(one, cdf=args.cdf, N=args.N,
                                          alternative=args.alternative, mode=args.mode)
                cols.extend([d, p_value])
            elif test_id == 'chi2_contingency':
                chi2, p, dof, ex = stats.chi2_contingency(one, correction=args.correction, lambda_=args.lambda_)
                cols.extend([chi2, p, dof, ex])
            # --- trimmed statistics ----------------------------------------
            # BUG FIX: the original tested "nf is 0"/"mf is 0"/"imbda is 0";
            # these are identity comparisons of a *float* against the int 0
            # and are always False, so the "no limits given" call paths were
            # unreachable. 0 means "not set" in this wrapper's convention.
            elif test_id == 'tmean':
                if nf == 0 and mf == 0:
                    cols.append(stats.tmean(one))
                else:
                    cols.append(stats.tmean(one, limits, inclusive_pair))
            elif test_id == 'tmin':
                if mf == 0:
                    cols.append(stats.tmin(one))
                else:
                    cols.append(stats.tmin(one, lowerlimit=mf, inclusive=args.inclusive))
            elif test_id == 'tmax':
                if nf == 0:
                    cols.append(stats.tmax(one))
                else:
                    cols.append(stats.tmax(one, upperlimit=nf, inclusive=args.inclusive))
            elif test_id == 'tvar':
                if nf == 0 and mf == 0:
                    cols.append(stats.tvar(one))
                else:
                    cols.append(stats.tvar(one, limits, inclusive_pair))
            elif test_id == 'tstd':
                if nf == 0 and mf == 0:
                    cols.append(stats.tstd(one))
                else:
                    cols.append(stats.tstd(one, limits, inclusive_pair))
            elif test_id == 'tsem':
                if nf == 0 and mf == 0:
                    cols.append(stats.tsem(one))
                else:
                    cols.append(stats.tsem(one, limits, inclusive_pair))
            elif test_id == 'scoreatpercentile':
                if nf == 0 and mf == 0:
                    s = stats.scoreatpercentile(one, two, interpolation_method=args.interpolation)
                else:
                    s = stats.scoreatpercentile(one, two, limits, interpolation_method=args.interpolation)
                cols.extend(s)
            # --- binning / histogram style routines ------------------------
            elif test_id == 'relfreq':
                if nf == 0 and mf == 0:
                    rel, low_range, binsize, extrapoints = stats.relfreq(one, args.b)
                else:
                    rel, low_range, binsize, extrapoints = stats.relfreq(one, args.b, limits)
                cols.extend(rel)
                cols.extend([low_range, binsize, extrapoints])
            elif test_id == 'binned_statistic':
                if nf == 0 and mf == 0:
                    st, b_edge, b_n = stats.binned_statistic(one, two, statistic=args.statistic, bins=args.b)
                else:
                    st, b_edge, b_n = stats.binned_statistic(one, two, statistic=args.statistic,
                                                             bins=args.b, range=limits)
                cols.extend([st, b_edge, b_n])
            elif test_id == 'threshold':
                if nf == 0 and mf == 0:
                    clipped = stats.threshold(one, newval=args.new)
                else:
                    clipped = stats.threshold(one, mf, nf, newval=args.new)
                cols.extend(clipped)
            elif test_id == 'trimboth':
                cols.extend(stats.trimboth(one, proportiontocut=args.proportiontocut))
            elif test_id == 'trim1':
                cols.extend(stats.trim1(one, proportiontocut=args.proportiontocut, tail=args.tail))
            elif test_id == 'histogram':
                if nf == 0 and mf == 0:
                    hist, low_range, binsize, extrapoints = stats.histogram(one, args.b)
                else:
                    hist, low_range, binsize, extrapoints = stats.histogram(one, args.b, limits)
                cols.extend([hist, low_range, binsize, extrapoints])
            elif test_id == 'cumfreq':
                if nf == 0 and mf == 0:
                    cum, low_range, binsize, extrapoints = stats.cumfreq(one, args.b)
                else:
                    cum, low_range, binsize, extrapoints = stats.cumfreq(one, args.b, limits)
                cols.extend([cum, low_range, binsize, extrapoints])
            elif test_id == 'boxcox_normmax':
                if nf == 0 and mf == 0:
                    cols.append(stats.boxcox_normmax(one))
                else:
                    cols.append(stats.boxcox_normmax(one, limits, method=args.method))
            elif test_id == 'boxcox':
                if imbda == 0:
                    # lambda not supplied: boxcox also returns the fitted
                    # lambda and a confidence interval.
                    box, ma, ci = stats.boxcox(one, alpha=args.alpha)
                    cols.extend([box, ma, ci])
                else:
                    cols.append(stats.boxcox(one, imbda, alpha=args.alpha))
            # --- two-sample tests ------------------------------------------
            elif test_id == 'histogram2':
                cols.extend(stats.histogram2(one, two))
            elif test_id == 'ranksums':
                z_statistic, p_value = stats.ranksums(one, two)
                cols.extend([z_statistic, p_value])
            elif test_id == 'ttest_1samp':
                t, prob = stats.ttest_1samp(one, two)
                cols.extend(t)
                cols.extend(prob)
            elif test_id == 'ansari':
                AB, p_value = stats.ansari(one, two)
                cols.extend([AB, p_value])
            elif test_id == 'linregress':
                slope, intercept, r_value, p_value, stderr = stats.linregress(one, two)
                cols.extend([slope, intercept, r_value, p_value, stderr])
            elif test_id == 'pearsonr':
                cor, p_value = stats.pearsonr(one, two)
                cols.extend([cor, p_value])
            elif test_id == 'pointbiserialr':
                r, p_value = stats.pointbiserialr(one, two)
                cols.extend([r, p_value])
            elif test_id == 'ks_2samp':
                d, p_value = stats.ks_2samp(one, two)
                cols.extend([d, p_value])
            elif test_id == 'mannwhitneyu':
                mw_stats_u, p_value = stats.mannwhitneyu(one, two, use_continuity=args.mwu_use_continuity)
                cols.extend([mw_stats_u, p_value])
            elif test_id == 'zmap':
                cols.extend(stats.zmap(one, two, ddof=args.ddof))
            elif test_id == 'ttest_ind':
                t_stat, p_value = stats.ttest_ind(one, two, equal_var=args.equal_var)
                cols.extend([t_stat, p_value])
            elif test_id == 'ttest_rel':
                t, prob = stats.ttest_rel(one, two, axis=args.axis)
                cols.extend([t, prob])
            elif test_id == 'mood':
                z, p_value = stats.mood(one, two, axis=args.axis)
                cols.extend([z, p_value])
            elif test_id == 'shapiro':
                W, p_value, a = stats.shapiro(one, two, args.reta)
                cols.extend([W, p_value])
                cols.extend(a)
            elif test_id == 'kendalltau':
                k, p_value = stats.kendalltau(one, two, initial_lexsort=args.initial_lexsort)
                cols.extend([k, p_value])
            elif test_id == 'entropy':
                cols.append(stats.entropy(one, two, base=args.base))
            # --- tests accepting an optional second sample -----------------
            elif test_id == 'spearmanr':
                if have_two:
                    rho, p_value = stats.spearmanr(one, two)
                else:
                    rho, p_value = stats.spearmanr(one)
                cols.extend([rho, p_value])
            elif test_id == 'wilcoxon':
                if have_two:
                    T, p_value = stats.wilcoxon(one, two, zero_method=args.zero_method,
                                                correction=args.correction)
                else:
                    T, p_value = stats.wilcoxon(one, zero_method=args.zero_method,
                                                correction=args.correction)
                cols.extend([T, p_value])
            elif test_id == 'chisquare':
                if have_two:
                    chisq, p_value = stats.chisquare(one, two, ddof=args.ddof)
                else:
                    chisq, p_value = stats.chisquare(one, ddof=args.ddof)
                cols.extend([chisq, p_value])
            elif test_id == 'power_divergence':
                if have_two:
                    stat, p_value = stats.power_divergence(one, two, ddof=args.ddof, lambda_=args.lambda_)
                else:
                    stat, p_value = stats.power_divergence(one, ddof=args.ddof, lambda_=args.lambda_)
                cols.extend([stat, p_value])
            elif test_id == 'theilslopes':
                if have_two:
                    mpe, met, lo, up = stats.theilslopes(one, two, alpha=args.alpha)
                else:
                    mpe, met, lo, up = stats.theilslopes(one, alpha=args.alpha)
                cols.extend([mpe, met, lo, up])
            elif test_id == 'combine_pvalues':
                if have_two:
                    stat, p_value = stats.combine_pvalues(one, method=args.med, weights=two)
                else:
                    stat, p_value = stats.combine_pvalues(one, method=args.med)
                cols.extend([stat, p_value])
            # --- multi-sample tests (fed from --sample_cols groups) --------
            elif test_id == 'obrientransform':
                for row in stats.obrientransform(*b_samples):
                    cols.append(','.join(map(str, row)))
            elif test_id == 'f_oneway':
                f_value, p_value = stats.f_oneway(*b_samples)
                cols.extend([f_value, p_value])
            elif test_id == 'kruskal':
                h, p_value = stats.kruskal(*b_samples)
                cols.extend([h, p_value])
            elif test_id == 'friedmanchisquare':
                fr, p_value = stats.friedmanchisquare(*b_samples)
                cols.extend([fr, p_value])
            elif test_id == 'fligner':
                xsq, p_value = stats.fligner(*b_samples, center=args.center,
                                             proportiontocut=args.proportiontocut)
                cols.extend([xsq, p_value])
            elif test_id == 'bartlett':
                T, p_value = stats.bartlett(*b_samples)
                cols.extend([T, p_value])
            elif test_id == 'levene':
                w, p_value = stats.levene(*b_samples, center=args.center,
                                          proportiontocut=args.proportiontocut)
                cols.extend([w, p_value])
            elif test_id == 'median_test':
                stat, p_value, med, table = stats.median_test(*b_samples, ties=args.ties,
                                                              correction=args.correction,
                                                              lambda_=args.lambda_)
                cols.extend([stat, p_value, med, table])
                for row in table:
                    cols.append(','.join(map(str, row)))
            outfile.write('%s\n' % '\t'.join(map(str, cols)))
513
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()