Mercurial > repos > bgruening > upload_testing
comparison gsummary.py @ 80:c4a3a8999945 draft
Uploaded
| author | bernhardlutz |
|---|---|
| date | Mon, 20 Jan 2014 14:39:43 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 79:dc82017052ac | 80:c4a3a8999945 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import sys | |
| 4 import re | |
| 5 import tempfile | |
| 6 #from rpy import * | |
| 7 import rpy2.robjects as robjects | |
| 8 r = robjects.r | |
| 9 from rpy2.robjects.vectors import DataFrame | |
| 10 | |
| 11 assert sys.version_info[:2] >= ( 2, 4 ) | |
| 12 | |
def stop_err(msg):
    """Report a fatal error and terminate the script.

    Writes *msg* verbatim (no newline is appended) to standard error and
    then raises ``SystemExit`` with exit status 1, so that callers which
    inspect the return code also see the failure.
    """
    sys.stderr.write(msg)
    # A bare sys.exit() would report success (status 0) despite the error.
    sys.exit(1)
| 16 | |
def S3_METHODS(all="key"):
    """Return the whitelist of R terms a user expression may contain.

    When *all* equals ``"key"`` (the default) the result is a dict with
    two entries: ``'Math'`` lists the permitted function names and
    ``'Ops'`` lists the permitted operator / punctuation tokens.  For any
    other value of *all* the function returns ``None``.
    """
    Group_Math = ["abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
                  "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan", "cosh", "sinh", "tanh",
                  "acosh", "asinh", "atanh", "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
                  "cumsum", "cumprod", "cummax", "cummin", "c"]
    Group_Ops = ["+", "-", "*", "/", "^", "%%", "%/%", "&", "|", "!", "==", "!=", "<", "<=", ">=", ">", "(", ")", "~", ","]
    # Compare by value: identity ("is") only works when the interpreter
    # happens to intern both strings.
    if all == "key":
        return {'Math': Group_Math, 'Ops': Group_Ops}
    return None
| 25 | |
def _check_expression(expression):
    """Abort via stop_err() unless *expression* only uses whitelisted terms."""
    allowed = S3_METHODS()
    math_allowed = allowed['Math']
    ops_allowed = allowed['Ops']

    # Every alphabetic run must be a permitted function name.  The "c" of a
    # column reference such as "c3" passes because "c" is in the whitelist.
    for word in re.findall(r'[a-zA-Z]+', expression):
        if word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))

    # NOTE: adjacent operator characters are matched as a single run, so a
    # sequence such as ")*" is rejected unless separated by whitespace.
    symbols = set()
    for symbol in re.findall(r'[^a-z0-9\s]+', expression):
        if symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list of columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)


def _write_r_table(datafile, cols, tmp_file):
    """Copy the columns *cols* of *datafile* into *tmp_file* as a tab-separated table.

    A header row ("c1", "c2", ...) is written first.  Blank lines and lines
    starting with '#' are ignored.  A data line is skipped when any
    referenced column is missing or cannot be parsed by float().

    Returns a tuple ``(valid_lines, skipped_lines, first_invalid_line)``
    where *first_invalid_line* is the 1-based number of the first skipped
    line, or 0 if none was skipped.
    """
    tmp_file.write("%s\n" % "\t".join("c%d" % (col + 1) for col in cols))
    valid_lines = 0
    skipped_lines = 0
    first_invalid_line = 0
    with open(datafile) as handle:
        for line_number, line in enumerate(handle, 1):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            try:
                for col in cols:
                    float(fields[col])
            except (ValueError, IndexError):
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = line_number
                continue
            tmp_file.write("%s\n" % "\t".join(fields[col] for col in cols))
            valid_lines += 1
    tmp_file.flush()
    return valid_lines, skipped_lines, first_invalid_line


def main():
    """Compute summary statistics of a column expression using R.

    Command line: ``gsummary.py input_file output_file expression``.

    The expression is validated against the S3_METHODS() whitelist, the
    columns it references (``c1``, ``c2``, ...) are copied from the
    tab-separated input into a temporary table, and R evaluates the
    expression on that table.  The output file receives a '#'-prefixed
    heading line followed by one line with sum, mean, stdev and the
    0/25/50/75/100% quantiles.  Every failure is reported through
    stop_err(), which terminates the script.
    """
    try:
        datafile, outfile_name, expression = sys.argv[1:4]
    except ValueError:
        stop_err('Usage: python gsummary.py input_file output_file expression')

    _check_expression(expression)

    # Find all column references in the expression (c1 -> index 0, ...)
    column_names = re.findall(r'c[0-9]+', expression)
    cols = [int(name[1:]) - 1 for name in column_names]
    if -1 in cols:
        # "c0" would otherwise become index -1 and silently read the last column.
        stop_err("Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements.")

    # The exception raised for R-side errors moved between rpy2 releases.
    try:
        from rpy2.rinterface import RRuntimeError
    except ImportError:
        from rpy2.rinterface_lib.embedded import RRuntimeError

    tmp_file = tempfile.NamedTemporaryFile('w+')
    try:
        valid_lines, skipped_lines, first_invalid_line = _write_r_table(datafile, cols, tmp_file)
        if not valid_lines:
            # Nothing usable was written: empty input, only comments/blank
            # lines, or every data line failed the numeric check.
            stop_err("Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements.")

        # summary function and return labels
        summary_func = r("function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }")
        headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']

        r_data_frame = DataFrame.from_csvfile(tmp_file.name, header=True, sep="\t")

        with open(outfile_name, 'w') as outfile:
            # Expose each referenced column as an R variable of the same name.
            for name in column_names:
                r.assign(name, r["$"](r_data_frame, name))
            try:
                summary = summary_func(r(expression))
            except RRuntimeError as s:
                stop_err("Computation resulted in the following error: %s" % str(s))
            outfile.write("#%s\n" % "\t".join(headings))
            # NOTE(review): these two prints look like leftover debugging
            # output -- confirm before removing.
            print(summary)
            print(summary.r_repr())
            outfile.write("%s\n" % "\t".join(["%g" % (summary.rx2(k)[0]) for k in headings]))
    finally:
        # Closing the NamedTemporaryFile also removes it from disk.
        tmp_file.close()

    if skipped_lines:
        print("Skipped %d invalid lines beginning with line #%d. See tool tips for data requirements." % (skipped_lines, first_invalid_line))
| 116 | |
# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
