| 
80
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 #Greg Von Kuster
 | 
| 
 | 
     3 
 | 
| 
 | 
     4 import sys
 | 
| 
 | 
     5 #from rpy import *
 | 
| 
 | 
     6 import rpy2.robjects as robjects
 | 
| 
 | 
     7 from rpy2.robjects.packages import importr
 | 
| 
 | 
     8 r = robjects.r
 | 
| 
 | 
     9 grdevices = importr('grDevices')
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 assert sys.version_info[:2] >= ( 2, 4 )
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 def stop_err(msg):
 | 
| 
 | 
    15     sys.stderr.write(msg)
 | 
| 
 | 
    16     sys.exit()
 | 
| 
 | 
    17 
 | 
| 
 | 
    18 def main():
 | 
| 
 | 
    19 
 | 
| 
 | 
    20     # Handle input params
 | 
| 
 | 
    21     in_fname = sys.argv[1]
 | 
| 
 | 
    22     out_fname = sys.argv[2] 
 | 
| 
 | 
    23     try:
 | 
| 
 | 
    24         column = int( sys.argv[3] ) - 1
 | 
| 
 | 
    25     except:
 | 
| 
 | 
    26         stop_err( "Column not specified, your query does not contain a column of numerical data." )
 | 
| 
 | 
    27     title = sys.argv[4]
 | 
| 
 | 
    28     xlab = sys.argv[5]
 | 
| 
 | 
    29     breaks = int( sys.argv[6] )
 | 
| 
 | 
    30     if breaks == 0:
 | 
| 
 | 
    31         breaks = "Sturges"
 | 
| 
 | 
    32     if sys.argv[7] == "true":
 | 
| 
 | 
    33         density = True
 | 
| 
 | 
    34     else: density = False
 | 
| 
 | 
    35     if len( sys.argv ) >= 9 and sys.argv[8] == "true":
 | 
| 
 | 
    36         frequency = True
 | 
| 
 | 
    37     else: frequency = False
 | 
| 
 | 
    38 
 | 
| 
 | 
    39     matrix = []
 | 
| 
 | 
    40     skipped_lines = 0
 | 
| 
 | 
    41     first_invalid_line = 0
 | 
| 
 | 
    42     invalid_value = ''
 | 
| 
 | 
    43     i = 0
 | 
| 
 | 
    44     for i, line in enumerate( file( in_fname ) ):
 | 
| 
 | 
    45         valid = True
 | 
| 
 | 
    46         line = line.rstrip('\r\n')
 | 
| 
 | 
    47         # Skip comments
 | 
| 
 | 
    48         if line and not line.startswith( '#' ): 
 | 
| 
 | 
    49             # Extract values and convert to floats
 | 
| 
 | 
    50             row = []
 | 
| 
 | 
    51             try:
 | 
| 
 | 
    52                 fields = line.split( "\t" )
 | 
| 
 | 
    53                 val = fields[column]
 | 
| 
 | 
    54                 if val.lower() == "na":
 | 
| 
 | 
    55                     row.append( float( "nan" ) )
 | 
| 
 | 
    56             except:
 | 
| 
 | 
    57                 valid = False
 | 
| 
 | 
    58                 skipped_lines += 1
 | 
| 
 | 
    59                 if not first_invalid_line:
 | 
| 
 | 
    60                     first_invalid_line = i+1
 | 
| 
 | 
    61             else:
 | 
| 
 | 
    62                 try:
 | 
| 
 | 
    63                     row.append( float( val ) )
 | 
| 
 | 
    64                 except ValueError:
 | 
| 
 | 
    65                     valid = False
 | 
| 
 | 
    66                     skipped_lines += 1
 | 
| 
 | 
    67                     if not first_invalid_line:
 | 
| 
 | 
    68                         first_invalid_line = i+1
 | 
| 
 | 
    69                         invalid_value = fields[column]
 | 
| 
 | 
    70         else:
 | 
| 
 | 
    71             valid = False
 | 
| 
 | 
    72             skipped_lines += 1
 | 
| 
 | 
    73             if not first_invalid_line:
 | 
| 
 | 
    74                 first_invalid_line = i+1
 | 
| 
 | 
    75 
 | 
| 
 | 
    76         if valid:
 | 
| 
 | 
    77             matrix += row
 | 
| 
 | 
    78 
 | 
| 
 | 
    79     if skipped_lines < i:
 | 
| 
 | 
    80         try:
 | 
| 
 | 
    81             #a = r.array( matrix )
 | 
| 
 | 
    82             fv = robjects.FloatVector(matrix)
 | 
| 
 | 
    83             a = r['matrix'](fv, ncol=1,byrow=True)
 | 
| 
 | 
    84             #r.pdf( out_fname, 8, 8 )
 | 
| 
 | 
    85             r.pdf( out_fname, 8, 8 )
 | 
| 
 | 
    86             histogram = r.hist( a, probability=not frequency, main=title, xlab=xlab, breaks=breaks )
 | 
| 
 | 
    87             if density:
 | 
| 
 | 
    88                 density = r.density( a )
 | 
| 
 | 
    89                 if frequency:
 | 
| 
 | 
    90                     scale_factor = len( matrix ) * ( histogram['mids'][1] - histogram['mids'][0] ) #uniform bandwidth taken from first 2 midpoints
 | 
| 
 | 
    91                     density[ 'y' ] = map( lambda x: x * scale_factor, density[ 'y' ] )
 | 
| 
 | 
    92                 r.lines( density )
 | 
| 
 | 
    93             #r.dev_off()
 | 
| 
 | 
    94             grdevices.dev_off()   
 | 
| 
 | 
    95         except Exception, exc:
 | 
| 
 | 
    96             stop_err( "%s" %str( exc ) )
 | 
| 
 | 
    97     else:
 | 
| 
 | 
    98         if i == 0:
 | 
| 
 | 
    99             stop_err("Input dataset is empty.")
 | 
| 
 | 
   100         else:
 | 
| 
 | 
   101             stop_err( "All values in column %s are non-numeric." %sys.argv[3] )
 | 
| 
 | 
   102 
 | 
| 
 | 
   103     print "Histogram of column %s. " %sys.argv[3]
 | 
| 
 | 
   104     if skipped_lines > 0:
 | 
| 
 | 
   105         print "Skipped %d invalid lines starting with line #%d, '%s'." % ( skipped_lines, first_invalid_line, invalid_value )
 | 
| 
 | 
   106 
 | 
| 
 | 
   107     #r.quit( save="no" )
 | 
| 
 | 
   108     
 | 
| 
 | 
   109 if __name__ == "__main__":
 | 
| 
 | 
   110     main()
 |