| 80 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 import sys | 
|  | 4 import re | 
|  | 5 import tempfile | 
|  | 6 #from rpy import * | 
|  | 7 import rpy2.robjects as robjects | 
|  | 8 r = robjects.r | 
|  | 9 from rpy2.robjects.vectors import DataFrame | 
|  | 10 | 
# Refuse to run on interpreters older than Python 2.4.
assert ( 2, 4 ) <= sys.version_info[:2]
|  | 12 | 
def stop_err( msg ):
    """Report a fatal error and terminate the script.

    ``msg`` is written to standard error exactly as given (no newline is
    appended), then the interpreter exits with status 1 so that callers
    inspecting the exit code can tell the run failed.

    Raises:
        SystemExit: always, with exit code 1.
    """
    sys.stderr.write( msg )
    # A bare sys.exit() would report success (status 0) for a failed run.
    sys.exit( 1 )
|  | 16 | 
def S3_METHODS( all="key" ):
    """Return the whitelists of terms and operators allowed in an expression.

    Args:
        all: selector; only the value ``"key"`` (the default) is recognized.
            (The name shadows the builtin ``all`` but is kept so existing
            keyword callers continue to work.)

    Returns:
        A dict with two entries when ``all == "key"``:
        ``'Math'`` -- list of permitted function names (plus ``"c"``), and
        ``'Ops'``  -- list of permitted operator/punctuation tokens.
        ``None`` for any other selector value.
    """
    Group_Math = [ "abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
        "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan", "cosh", "sinh", "tanh",
        "acosh", "asinh", "atanh", "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
        "cumsum", "cumprod", "cummax", "cummin", "c" ]
    Group_Ops = [ "+", "-", "*", "/", "^", "%%", "%/%", "&", "|", "!", "==", "!=", "<", "<=", ">=", ">", "(", ")", "~", "," ]
    # Compare by value: `is` tests object identity and only appeared to work
    # because CPython happens to intern short string literals.
    if all == "key":
        return { 'Math' : Group_Math, 'Ops' : Group_Ops }
    return None
|  | 25 | 
def _validate_expression( expression ):
    """Abort via stop_err() unless ``expression`` uses only whitelisted terms and operators."""
    allowed = S3_METHODS()
    math_allowed = allowed[ 'Math' ]
    ops_allowed = allowed[ 'Ops' ]

    # Every alphabetic run must be a permitted name ("c" covers column
    # references such as c1, whose digits are not part of the match).
    for word in re.findall( '[a-zA-Z]+', expression ):
        if word not in math_allowed:
            stop_err( "Invalid expression '%s': term '%s' is not recognized or allowed" %( expression, word ) )

    # Every run of other non-blank characters must be a permitted operator.
    symbols = set()
    for symbol in re.findall( r'[^a-z0-9\s]+', expression ):
        if symbol not in ops_allowed:
            stop_err( "Invalid expression '%s': operator '%s' is not recognized or allowed" % ( expression, symbol ) )
        symbols.add( symbol )
    if len( symbols ) == 1 and ',' in symbols:
        # User may have entered a comma-separated list of columns
        stop_err( "Invalid columns '%s': this tool requires a single column or expression" % expression )


def _write_r_table( datafile, tmp_file, cols ):
    """Copy the referenced columns of ``datafile`` into ``tmp_file`` as a tab-separated table for R.

    Blank lines and lines starting with '#' are ignored.  A line is skipped
    when any referenced column is missing or is not parseable as a float.

    Returns:
        ( valid_lines, skipped_lines, first_invalid_line ) where
        ``first_invalid_line`` is the 1-based number of the first skipped
        line, or 0 if no line was skipped.
    """
    # Header row: the column names exactly as they appear in the expression.
    tmp_file.write( "%s\n" % "\t".join( "c%s" % str( col + 1 ) for col in cols ) )
    valid_lines = 0
    skipped_lines = 0
    first_invalid_line = 0
    with open( datafile ) as infile:
        for i, line in enumerate( infile ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            fields = line.split( '\t' )
            try:
                for col in cols:
                    float( fields[ col ] )
            except ( ValueError, IndexError ):
                # Non-numeric value or too few columns on this line.
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = i + 1
                continue
            tmp_file.write( "%s\n" % "\t".join( fields[ col ] for col in cols ) )
            valid_lines += 1
    # Make sure R sees the complete table when it reads the file by name.
    tmp_file.flush()
    return valid_lines, skipped_lines, first_invalid_line


def main():
    """Compute summary statistics of a column expression over a tabular file.

    Command line: ``input_file output_file expression``.  ``expression`` may
    reference columns as c1, c2, ... combined with the functions and
    operators whitelisted by S3_METHODS().  The output file receives a
    '#'-prefixed heading line followed by one tab-separated line holding
    sum, mean, stdev and the 0/25/50/75/100% quantiles.

    Any failure is reported through stop_err(), which terminates the script.
    """
    # rpy2 2.x exposes the R error class in rpy2.rinterface; newer releases
    # define it in rpy2.rinterface_lib.embedded.
    try:
        from rpy2.rinterface import RRuntimeError
    except ImportError:
        from rpy2.rinterface_lib.embedded import RRuntimeError

    try:
        datafile, outfile_name, expression = sys.argv[1:4]
    except ValueError:
        # Fewer than three arguments were supplied.
        stop_err( 'Usage: python gsummary.py input_file ouput_file expression' )

    # Check for invalid expressions.  The expression is later evaluated by R,
    # so this whitelist is the only guard against arbitrary R code.
    _validate_expression( expression )

    # Find all column references in the expression (c1 -> index 0, ...)
    column_names = re.findall( 'c[0-9]+', expression )
    cols = [ int( name[1:] ) - 1 for name in column_names ]

    # Text mode so that str rows can be written on both Python 2 and 3.
    tmp_file = tempfile.NamedTemporaryFile( 'w+' )
    try:
        valid_lines, skipped_lines, first_invalid_line = _write_r_table( datafile, tmp_file, cols )

        # Counting usable rows (rather than comparing skipped lines with the
        # last line index) also catches empty files and files whose only
        # other lines are blank or comments.
        if not valid_lines:
            stop_err( "Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements." )

        # summary function and return labels
        summary_func = r( "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }" )
        headings = [ 'sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%' ]
        headings_str = "\t".join( headings )

        r_data_frame = DataFrame.from_csvfile( tmp_file.name, header=True, sep="\t" )

        with open( outfile_name, 'w' ) as outfile:
            # Expose each referenced column to R under its own name so the
            # expression can be evaluated verbatim.
            for name in column_names:
                r.assign( name, r[ "$" ]( r_data_frame, name ) )
            try:
                summary = summary_func( r( expression ) )
            except RRuntimeError as s:
                stop_err( "Computation resulted in the following error: %s" % str( s ) )
            outfile.write( "#%s\n" % headings_str )
            # NOTE(review): these two prints look like leftover debugging
            # output; kept so the tool's stdout is unchanged -- confirm.
            # (print() with a single argument behaves the same on Python 2.)
            print( summary )
            print( summary.r_repr() )
            outfile.write( "%s\n" % "\t".join( [ "%g" % ( summary.rx2( k )[0] ) for k in headings ] ) )

        if skipped_lines:
            print( "Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % ( skipped_lines, first_invalid_line ) )
    finally:
        # Closing the NamedTemporaryFile also removes it from disk.
        tmp_file.close()
|  | 116 | 
# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()