diff statistical_hypothesis_testing.py @ 0:22ed769665b6 draft default tip

Uploaded
author bgruening
date Sun, 01 Feb 2015 18:35:40 -0500
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/statistical_hypothesis_testing.py	Sun Feb 01 18:35:40 2015 -0500
@@ -0,0 +1,515 @@
+#!/usr/bin/env python
+
+"""
+
+"""
+import sys
+import argparse
+from scipy import stats
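+
+# NOTE: this script appears to target Python 2 -- the scipy.stats calls below
+# rely on map() returning a list, which it no longer does under Python 3.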
+
+def columns_to_values( args, line ):
+    # Turn each group of 1-based column indices into the list of values
+    # found at those positions in the current tab-separated line.
+    samples = []
+    cols = line.split('\t')
+    for sample_def in args:
+        sample_list = []
+        for row in sample_def:
+            sample_list.append( cols[row - 1] )
+        # The extracted values are data points, not indices, so parse them
+        # as floats (int would fail on decimal values).
+        samples.append( map(float, sample_list) )
+    return samples
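+# Example (hypothetical data): with args=[[1, 3]] and line "4\t7\t10",
+# columns_to_values() selects columns 1 and 3 and returns [[4.0, 10.0]].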
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--infile', required=True, help='Tabular file.')
+    parser.add_argument('-o', '--outfile', required=True, help='Path to the output file.')
+    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
+    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
+    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
+    parser.add_argument("--test_id", help="statistical test method")
+    parser.add_argument("--mwu_use_continuity", action="store_true", default = False,
+                    help="Whether a continuity correction (1/2.) should be taken into account.")
+    parser.add_argument("--equal_var", action="store_true", default = False,
+                    help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.")
+    parser.add_argument("--reta", action="store_true", default = False,
+                    help="Whether or not to return the internally computed a values.")
+    parser.add_argument("--fisher", action="store_true", default = False,
+                    help="if true then Fisher definition is used")
+    parser.add_argument("--bias", action="store_true", default = False,
+                    help="if false,then the calculations are corrected for statistical bias")
+    parser.add_argument("--inclusive1", action="store_true", default= False ,
+                    help="if false,lower_limit will be ignored")
+    parser.add_argument("--inclusive2", action="store_true", default = False,
+                    help="if false,higher_limit will be ignored")
+    parser.add_argument("--inclusive", action="store_true", default = False,
+                    help="if false,limit will be ignored")
+    parser.add_argument("--printextras", action="store_true", default = False,
+                    help="If True, if there are extra points a warning is raised saying how many of those points there are")
+    parser.add_argument("--initial_lexsort", action="store_true", default="False",
+                    help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.")
+    parser.add_argument("--correction", action="store_true", default = False,
+                    help="continuity correction ")
+    parser.add_argument("--axis", type=int, default=0,
+                    help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)")
+    parser.add_argument("--n", type=int, default=0,
+                    help="the number of trials. This is ignored if x gives both the number of successes and failures")
+    parser.add_argument("--b", type=int, default=0,
+                    help="The number of bins to use for the histogram")
+    parser.add_argument("--N", type=int, default=0,
+                    help="Score that is compared to the elements in a.")
+    parser.add_argument("--ddof", type=int, default=0,
+                    help="Degrees of freedom correction")
+    parser.add_argument("--score", type=int, default=0,
+                    help="Score that is compared to the elements in a.")
+    parser.add_argument("--m", type=float, default=0.0,
+                    help="limits")
+    parser.add_argument("--mf", type=float, default=2.0,
+                    help="lower limit")
+    parser.add_argument("--nf", type=float, default=99.9,
+                    help="higher_limit")
+    parser.add_argument("--p", type=float, default=0.5,
+                    help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5")
+    parser.add_argument("--alpha", type=float, default=0.9,
+                    help="probability")
+    parser.add_argument("--new", type=float, default=0.0,
+                    help="Value to put in place of values in a outside of bounds")
+    parser.add_argument("--proportiontocut", type=float, default=0.0,
+                    help="Proportion (in range 0-1) of total data set to trim of each end.")
+    parser.add_argument("--lambda_", type=float, default=1.0,
+                    help="lambda_ gives the power in the Cressie-Read power divergence statistic")
+    parser.add_argument("--imbda", type=float, default=0,
+                    help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.")
+    parser.add_argument("--base", type=float, default=1.6,
+                    help="The logarithmic base to use, defaults to e")
+    parser.add_argument("--dtype", help="dtype")
+    parser.add_argument("--med", help="med")
+    parser.add_argument("--cdf", help="cdf")
+    parser.add_argument("--zero_method", help="zero_method options")
+    parser.add_argument("--dist", help="dist options")
+    parser.add_argument("--ties", help="ties options")
+    parser.add_argument("--alternative", help="alternative options")
+    parser.add_argument("--mode", help="mode options")
+    parser.add_argument("--method", help="method options")
+    parser.add_argument("--md", help="md options")
+    parser.add_argument("--center", help="center options")
+    parser.add_argument("--kind", help="kind options")
+    parser.add_argument("--tail", help="tail options")
+    parser.add_argument("--interpolation", help="interpolation options")
+    parser.add_argument("--statistic", help="statistic options")
+
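+    # Column selection comes in three flavours (all indices are 1-based):
+    #   --sample_one_cols / --sample_two_cols for one- and two-sample tests,
+    #   --sample_cols for tests over many samples, with samples separated by
+    #   ';' and columns by ',' (e.g. '1,2,3;4,5,6' for bartlett or f_oneway).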
+    args = parser.parse_args()
+    infile = args.infile
+    outfile = open(args.outfile, 'w')
+    test_id = args.test_id
+    nf = args.nf
+    mf = args.mf
+    imbda = args.imbda
+    inclusive1 = args.inclusive1
+    inclusive2 = args.inclusive2
+    sample0 = 0
+    sample1 = 0
+    sample2 = 0
+    if args.sample_cols is not None:
+        sample0 = 1
+        bartlett_samples = []
+        for sample in args.sample_cols.split(';'):
+            bartlett_samples.append( map(int, sample.split(',')) )
+    if args.sample_one_cols is not None:
+        sample1 = 1
+        sample_one_cols = args.sample_one_cols.split(',')
+    if args.sample_two_cols is not None:
+        sample_two_cols = args.sample_two_cols.split(',')
+        sample2 = 1
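+    # Process the tabular input line by line; each branch below appends its
+    # statistic(s) as extra columns to the original line.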
+    for line in open( infile ):
+        sample_one = []
+        sample_two = []
+        cols = line.strip().split('\t')
+        if sample0 == 1:
+            b_samples = columns_to_values( bartlett_samples, line )
+        if sample1 == 1:
+            for index in sample_one_cols:
+                sample_one.append( cols[ int(index) -1 ] )
+        if sample2 == 1:
+            for index in sample_two_cols:
+                sample_two.append( cols[ int(index) -1 ] )
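+        # Dispatch on --test_id, starting with one-sample statistics.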
+        if test_id.strip() == 'describe':
+            size, min_max,mean,uv,bs,bk = stats.describe( map(float, sample_one) )
+            cols.append( size )
+            cols.append( min_max )
+            cols.append( mean )
+            cols.append( uv )
+            cols.append( bs )
+            cols.append( bk )
+        elif test_id.strip() == 'mode':
+            vals, counts = stats.mode( map(float, sample_one) )
+            cols.append( vals )
+            cols.append( counts )
+        elif test_id.strip() == 'nanmean':
+            m = stats.nanmean( map(float, sample_one))
+            cols.append( m )
+        elif test_id.strip() == 'nanmedian':
+            m = stats.nanmedian( map(float, sample_one))
+            cols.append( m )
+        elif test_id.strip() == 'kurtosistest':
+            z_value, p_value = stats.kurtosistest( map(float, sample_one) )
+            cols.append( z_value )
+            cols.append( p_value )
+        elif test_id.strip() == 'variation':
+            ra = stats.variation( map(float, sample_one))
+            cols.append( ra )
+        elif test_id.strip() == 'itemfreq':
+            freq = stats.itemfreq( map(float, sample_one))
+            for item in freq:
+                elements = ','.join( map(str, item) )
+                cols.append( elements )
+        elif test_id.strip() == 'boxcox_llf':
+            llf = stats.boxcox_llf( imbda, map(float, sample_one) )
+            cols.append( llf )
+        elif test_id.strip() == 'tiecorrect':
+            fa = stats.tiecorrect( map(float, sample_one) )
+            cols.append( fa )
+        elif test_id.strip() == 'rankdata':
+            r = stats.rankdata( map(float, sample_one),method=args.md )
+            cols.append( r )
+        elif test_id.strip() == 'nanstd':
+            s = stats.nanstd( map(float, sample_one),bias=args.bias )
+            cols.append( s )
+        elif test_id.strip() == 'anderson':
+            A2, critical, sig = stats.anderson( map(float, sample_one), dist=args.dist )
+            cols.append( A2 )
+            for critical_value in critical:
+                cols.append( critical_value )
+            cols.append( ',' )
+            for significance_level in sig:
+                cols.append( significance_level )
+        elif test_id.strip() == 'binom_test':
+            p_value = stats.binom_test( map(float, sample_one), n=args.n, p=args.p )
+            cols.append( p_value )
+        elif test_id.strip() == 'gmean':
+            gm = stats.gmean( map(float, sample_one), dtype=args.dtype )
+            cols.append( gm )
+        elif test_id.strip() == 'hmean':
+            hm = stats.hmean( map(float, sample_one), dtype=args.dtype )
+            cols.append( hm )
+        elif test_id.strip() == 'kurtosis':
+            k = stats.kurtosis( map(float, sample_one),axis=args.axis, fisher=args.fisher, bias=args.bias )
+            cols.append( k )
+        elif test_id.strip() == 'moment':
+            n_moment = stats.moment( map(float, sample_one),n=args.n )
+            cols.append( n_moment )
+        elif test_id.strip() == 'normaltest':
+            k2, p_value = stats.normaltest( map(float, sample_one) )
+            cols.append( k2 )
+            cols.append( p_value )
+        elif test_id.strip() == 'skew':
+            skewness = stats.skew( map(float, sample_one),bias=args.bias )
+            cols.append( skewness )
+        elif test_id.strip() == 'skewtest':
+            z_value, p_value = stats.skewtest( map(float, sample_one))
+            cols.append( z_value )
+            cols.append( p_value )
+        elif test_id.strip() == 'sem':
+            s = stats.sem( map(float, sample_one),ddof=args.ddof )
+            cols.append( s )
+        elif test_id.strip() == 'zscore':
+            z = stats.zscore( map(float, sample_one),ddof=args.ddof )
+            for z_value in z:
+                cols.append( z_value )
+        elif test_id.strip() == 'signaltonoise':
+            s2n = stats.signaltonoise( map(float, sample_one),ddof=args.ddof )
+            cols.append( s2n )
+        elif test_id.strip() == 'percentileofscore':
+            p = stats.percentileofscore( map(float, sample_one),score=args.score,kind=args.kind )
+            cols.append( p )
+        elif test_id.strip() == 'bayes_mvs':
+            c_mean, c_var,c_std = stats.bayes_mvs( map(float, sample_one),alpha=args.alpha )
+            cols.append( c_mean )
+            cols.append( c_var )
+            cols.append( c_std )
+        elif test_id.strip() == 'sigmaclip':
+            c, c_low,c_up = stats.sigmaclip( map(float, sample_one),low=args.m,high=args.n )
+            cols.append( c )
+            cols.append( c_low )
+            cols.append( c_up )
+        elif test_id.strip() == 'kstest':
+            d, p_value = stats.kstest(map(float, sample_one), cdf=args.cdf , N=args.N,alternative=args.alternative,mode=args.mode )
+            cols.append(d)
+            cols.append(p_value)
+        elif test_id.strip() == 'chi2_contingency':
+            chi2, p, dof, ex = stats.chi2_contingency( map(float, sample_one), correction=args.correction ,lambda_=args.lambda_)
+            cols.append( chi2 )
+            cols.append( p )
+            cols.append( dof )
+            cols.append( ex )
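+        # The trimmed statistics below honour the (mf, nf) limits; when both
+        # are passed as 0 the plain, unlimited call is used instead.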
+        elif test_id.strip() == 'tmean':
+            if nf == 0 and mf == 0:
+                mean = stats.tmean( map(float, sample_one) )
+            else:
+                mean = stats.tmean( map(float, sample_one), ( mf, nf ), ( inclusive1, inclusive2 ) )
+            cols.append( mean )
+        elif test_id.strip() == 'tmin':
+            if mf == 0:
+                min_value = stats.tmin( map(float, sample_one) )
+            else:
+                min_value = stats.tmin( map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive )
+            cols.append( min_value )
+        elif test_id.strip() == 'tmax':
+            if nf == 0:
+                max_value = stats.tmax( map(float, sample_one) )
+            else:
+                max_value = stats.tmax( map(float, sample_one), upperlimit=nf, inclusive=args.inclusive )
+            cols.append( max_value )
+        elif test_id.strip() == 'tvar':
+            if nf == 0 and mf == 0:
+                var = stats.tvar( map(float, sample_one) )
+            else:
+                var = stats.tvar( map(float, sample_one), ( mf, nf ), ( inclusive1, inclusive2 ) )
+            cols.append( var )
+        elif test_id.strip() == 'tstd':
+            if nf == 0 and mf == 0:
+                std = stats.tstd( map(float, sample_one) )
+            else:
+                std = stats.tstd( map(float, sample_one), ( mf, nf ), ( inclusive1, inclusive2 ) )
+            cols.append( std )
+        elif test_id.strip() == 'tsem':
+            if nf == 0 and mf == 0:
+                s = stats.tsem( map(float, sample_one) )
+            else:
+                s = stats.tsem( map(float, sample_one), ( mf, nf ), ( inclusive1, inclusive2 ) )
+            cols.append( s )
+        elif test_id.strip() == 'scoreatpercentile':
+            if nf == 0 and mf == 0:
+                s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation )
+            else:
+                s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), ( mf, nf ), interpolation_method=args.interpolation )
+            for score in s:
+                cols.append( score )
+        elif test_id.strip() == 'relfreq':
+            if nf == 0 and mf == 0:
+                rel, low_range, binsize, ex = stats.relfreq( map(float, sample_one), args.b )
+            else:
+                rel, low_range, binsize, ex = stats.relfreq( map(float, sample_one), args.b, ( mf, nf ) )
+            for rel_value in rel:
+                cols.append( rel_value )
+            cols.append( low_range )
+            cols.append( binsize )
+            cols.append( ex )
+        elif test_id.strip() == 'binned_statistic':
+            if nf == 0 and mf == 0:
+                st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b )
+            else:
+                st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b, range=( mf, nf ) )
+            cols.append( st )
+            cols.append( b_edge )
+            cols.append( b_n )
+        elif test_id.strip() == 'threshold':
+            if nf == 0 and mf == 0:
+                o = stats.threshold( map(float, sample_one), newval=args.new )
+            else:
+                o = stats.threshold( map(float, sample_one), mf, nf, newval=args.new )
+            for thresholded in o:
+                cols.append( thresholded )
+        elif test_id.strip() == 'trimboth':
+            o = stats.trimboth( map(float, sample_one), proportiontocut=args.proportiontocut )
+            for trimmed in o:
+                cols.append( trimmed )
+        elif test_id.strip() == 'trim1':
+            t1 = stats.trim1( map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail )
+            for trimmed in t1:
+                cols.append( trimmed )
+        elif test_id.strip() == 'histogram':
+            if nf == 0 and mf == 0:
+                hi, low_range, binsize, ex = stats.histogram( map(float, sample_one), args.b )
+            else:
+                hi, low_range, binsize, ex = stats.histogram( map(float, sample_one), args.b, ( mf, nf ) )
+            cols.append( hi )
+            cols.append( low_range )
+            cols.append( binsize )
+            cols.append( ex )
+        elif test_id.strip() == 'cumfreq':
+            if nf == 0 and mf == 0:
+                cum, low_range, binsize, ex = stats.cumfreq( map(float, sample_one), args.b )
+            else:
+                cum, low_range, binsize, ex = stats.cumfreq( map(float, sample_one), args.b, ( mf, nf ) )
+            cols.append( cum )
+            cols.append( low_range )
+            cols.append( binsize )
+            cols.append( ex )
+        elif test_id.strip() == 'boxcox_normmax':
+            if nf == 0 and mf == 0:
+                ma = stats.boxcox_normmax( map(float, sample_one) )
+            else:
+                ma = stats.boxcox_normmax( map(float, sample_one), ( mf, nf ), method=args.method )
+            cols.append( ma )
+        elif test_id.strip() == 'boxcox':
+            if imbda == 0:
+                box, ma, ci = stats.boxcox( map(float, sample_one), alpha=args.alpha )
+                cols.append( box )
+                cols.append( ma )
+                cols.append( ci )
+            else:
+                box = stats.boxcox( map(float, sample_one), imbda, alpha=args.alpha )
+                cols.append( box )
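+        # Two-sample tests and transforms.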
+        elif test_id.strip() == 'histogram2':
+            h2 = stats.histogram2( map(float, sample_one), map(float, sample_two) )
+            for hist_value in h2:
+                cols.append( hist_value )
+        elif test_id.strip() == 'ranksums':
+            z_statistic, p_value = stats.ranksums( map(float, sample_one), map(float, sample_two) )
+            cols.append(z_statistic)
+            cols.append(p_value)
+        elif test_id.strip() == 'ttest_1samp':
+            t, prob = stats.ttest_1samp( map(float, sample_one), map(float, sample_two) )
+            for t_value in t:
+                cols.append( t_value )
+            for prob_value in prob:
+                cols.append( prob_value )
+        elif test_id.strip() == 'ansari':
+            AB, p_value = stats.ansari( map(float, sample_one), map(float, sample_two) )
+            cols.append(AB)
+            cols.append(p_value)
+        elif test_id.strip() == 'linregress':
+            slope, intercept, r_value, p_value, stderr = stats.linregress( map(float, sample_one), map(float, sample_two) )
+            cols.append(slope)
+            cols.append(intercept)
+            cols.append(r_value)
+            cols.append(p_value)
+            cols.append(stderr)
+        elif test_id.strip() == 'pearsonr':
+            cor, p_value = stats.pearsonr( map(float, sample_one), map(float, sample_two) )
+            cols.append(cor)
+            cols.append(p_value)
+        elif test_id.strip() == 'pointbiserialr':
+            r, p_value = stats.pointbiserialr( map(float, sample_one), map(float, sample_two) )
+            cols.append(r)
+            cols.append(p_value)
+        elif test_id.strip() == 'ks_2samp':
+            d, p_value = stats.ks_2samp( map(float, sample_one), map(float, sample_two) )
+            cols.append(d)
+            cols.append(p_value)
+        elif test_id.strip() == 'mannwhitneyu':
+            mw_stats_u, p_value = stats.mannwhitneyu( map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity )
+            cols.append( mw_stats_u )
+            cols.append( p_value )
+        elif test_id.strip() == 'zmap':
+            z = stats.zmap( map(float, sample_one),map(float, sample_two),ddof=args.ddof )
+            for z_value in z:
+                cols.append( z_value )
+        elif test_id.strip() == 'ttest_ind':
+            mw_stats_u, p_value = stats.ttest_ind( map(float, sample_one), map(float, sample_two), equal_var=args.equal_var )
+            cols.append( mw_stats_u )
+            cols.append( p_value )
+        elif test_id.strip() == 'ttest_rel':
+            t, prob = stats.ttest_rel( map(float, sample_one), map(float, sample_two), axis=args.axis )
+            cols.append( t )
+            cols.append( prob )
+        elif test_id.strip() == 'mood':
+            z, p_value = stats.mood( map(float, sample_one), map(float, sample_two), axis=args.axis )
+            cols.append( z )
+            cols.append( p_value )
+        elif test_id.strip() == 'shapiro':
+            W, p_value, a = stats.shapiro( map(float, sample_one), map(float, sample_two), args.reta )
+            cols.append( W )
+            cols.append( p_value )
+            for a_value in a:
+                cols.append( a_value )
+        elif test_id.strip() == 'kendalltau':
+            k, p_value = stats.kendalltau( map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort )
+            cols.append(k)
+            cols.append(p_value)
+        elif test_id.strip() == 'entropy':
+            s = stats.entropy( map(float, sample_one), map(float, sample_two), base=args.base )
+            cols.append( s )
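+        # Tests that accept either one sample or a pair of samples.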
+        elif test_id.strip() == 'spearmanr':
+            if sample2 == 1:
+                rho, p_value = stats.spearmanr( map(float, sample_one), map(float, sample_two) )
+            else:
+                rho, p_value = stats.spearmanr( map(float, sample_one))
+            cols.append( rho )
+            cols.append( p_value )
+        elif test_id.strip() == 'wilcoxon':
+            if sample2 == 1:
+                T, p_value = stats.wilcoxon( map(float, sample_one), map(float, sample_two),zero_method=args.zero_method,correction=args.correction )
+            else:
+                T, p_value = stats.wilcoxon( map(float, sample_one),zero_method=args.zero_method,correction=args.correction )
+            cols.append( T )
+            cols.append( p_value )
+        elif test_id.strip() == 'chisquare':
+            if sample2 == 1:
+                rho, p_value = stats.chisquare( map(float, sample_one), map(float, sample_two),ddof=args.ddof )
+            else:
+                rho, p_value = stats.chisquare( map(float, sample_one),ddof=args.ddof)
+            cols.append( rho )
+            cols.append( p_value )
+        elif test_id.strip() == 'power_divergence':
+            if sample2 == 1:
+                stat, p_value = stats.power_divergence( map(float, sample_one), map(float, sample_two),ddof=args.ddof,lambda_=args.lambda_ )
+            else:
+                stat, p_value = stats.power_divergence( map(float, sample_one),ddof=args.ddof,lambda_=args.lambda_)
+            cols.append( stat )
+            cols.append( p_value )
+        elif test_id.strip() == 'theilslopes':
+            if sample2 == 1:
+                mpe, met, lo, up = stats.theilslopes( map(float, sample_one), map(float, sample_two),alpha=args.alpha )
+            else:
+                mpe, met, lo, up = stats.theilslopes( map(float, sample_one),alpha=args.alpha)
+            cols.append( mpe )
+            cols.append( met )
+            cols.append( lo )
+            cols.append( up )
+        elif test_id.strip() == 'combine_pvalues':
+            if sample2 == 1:
+                stat, p_value = stats.combine_pvalues( map(float, sample_one),method=args.med,weights=map(float, sample_two) )
+            else:
+                stat, p_value = stats.combine_pvalues( map(float, sample_one),method=args.med)
+            cols.append( stat )
+            cols.append( p_value )
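+        # Tests over an arbitrary number of samples, selected via --sample_cols.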
+        elif test_id.strip() == 'obrientransform':
+            ob = stats.obrientransform( *b_samples )
+            for ob_row in ob:
+                elements = ','.join( map(str, ob_row) )
+                cols.append( elements )
+        elif test_id.strip() == 'f_oneway':
+            f_value, p_value = stats.f_oneway( *b_samples )
+            cols.append(f_value)
+            cols.append(p_value)
+        elif test_id.strip() == 'kruskal':
+            h, p_value = stats.kruskal( *b_samples )
+            cols.append(h)
+            cols.append(p_value)
+        elif test_id.strip() == 'friedmanchisquare':
+            fr, p_value = stats.friedmanchisquare( *b_samples )
+            cols.append(fr)
+            cols.append(p_value)
+        elif test_id.strip() == 'fligner':
+            xsq, p_value = stats.fligner( center=args.center, proportiontocut=args.proportiontocut, *b_samples )
+            cols.append(xsq)
+            cols.append(p_value)
+        elif test_id.strip() == 'bartlett':
+            T, p_value = stats.bartlett( *b_samples )
+            cols.append(T)
+            cols.append(p_value)
+        elif test_id.strip() == 'levene':
+            w, p_value = stats.levene( center=args.center, proportiontocut=args.proportiontocut, *b_samples )
+            cols.append(w)
+            cols.append(p_value)
+        elif test_id.strip() == 'median_test':
+            stat, p_value, m, table = stats.median_test( ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples )
+            cols.append(stat)
+            cols.append(p_value)
+            cols.append(m)
+            cols.append(table)
+            for table_row in table:
+                elements = ','.join( map(str, table_row) )
+                cols.append( elements )
+        outfile.write( '%s\n' % '\t'.join( map(str, cols) ) )
+    outfile.close()
+
+if __name__ == '__main__':
+    main()
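+
+# Example invocation (hypothetical file layout; adjust the column indices):
+#   python statistical_hypothesis_testing.py -i input.tsv -o output.tsv \
+#       --test_id ttest_ind --sample_one_cols 1,2,3 --sample_two_cols 4,5,6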