| 80 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 import sys, string | 
|  | 4 import rpy2.robjects as robjects | 
|  | 5 import rpy2.rlike.container as rlc | 
|  | 6 from rpy2.robjects.packages import importr | 
|  | 7 r = robjects.r | 
|  | 8 grdevices = importr('grDevices') | 
|  | 9 #  from rpy import * | 
|  | 10 import numpy | 
|  | 11 | 
|  | 12 | 
|  | 13 def stop_err(msg): | 
|  | 14     sys.stderr.write(msg) | 
|  | 15     sys.exit() | 
|  | 16 | 
|  | 17 infile = sys.argv[1] | 
|  | 18 y_col = int(sys.argv[2])-1 | 
|  | 19 x_cols = sys.argv[3].split(',') | 
|  | 20 outfile = sys.argv[4] | 
|  | 21 outfile2 = sys.argv[5] | 
|  | 22 | 
|  | 23 print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) | 
|  | 24 fout = open(outfile,'w') | 
|  | 25 elems = [] | 
|  | 26 for i, line in enumerate( file ( infile )): | 
|  | 27     line = line.rstrip('\r\n') | 
|  | 28     if len( line )>0 and not line.startswith( '#' ): | 
|  | 29         elems = line.split( '\t' ) | 
|  | 30         break | 
|  | 31     if i == 30: | 
|  | 32         break # Hopefully we'll never get here... | 
|  | 33 | 
|  | 34 if len( elems )<1: | 
|  | 35     stop_err( "The data in your input dataset is either missing or not formatted properly." ) | 
|  | 36 | 
|  | 37 y_vals = [] | 
|  | 38 x_vals = [] | 
|  | 39 | 
|  | 40 for k,col in enumerate(x_cols): | 
|  | 41     x_cols[k] = int(col)-1 | 
|  | 42     # x_vals.append([]) | 
|  | 43 | 
|  | 44 NA = 'NA' | 
|  | 45 for ind,line in enumerate( file( infile )): | 
|  | 46     if line and not line.startswith( '#' ): | 
|  | 47         try: | 
|  | 48             fields = line.split("\t") | 
|  | 49             try: | 
|  | 50                 yval = float(fields[y_col]) | 
|  | 51             except: | 
|  | 52                 yval = r('NA') | 
|  | 53             y_vals.append(yval) | 
|  | 54             for k,col in enumerate(x_cols): | 
|  | 55                 try: | 
|  | 56                     xval = float(fields[col]) | 
|  | 57                 except: | 
|  | 58                     xval = r('NA') | 
|  | 59                 # x_vals[k].append(xval) | 
|  | 60                 x_vals.append(xval) | 
|  | 61         except: | 
|  | 62             pass | 
|  | 63 # x_vals1 = numpy.asarray(x_vals).transpose() | 
|  | 64 # dat= r.list(x=array(x_vals1), y=y_vals) | 
|  | 65 fv = robjects.FloatVector(x_vals) | 
|  | 66 m = r['matrix'](fv, ncol=len(x_cols),byrow=True) | 
|  | 67 # ensure order for generating formula | 
|  | 68 od = rlc.OrdDict([('y',robjects.FloatVector(y_vals)),('x',m)]) | 
|  | 69 dat = robjects.DataFrame(od) | 
|  | 70 # convert dat.names: ["y","x.1","x.2"] to formula string: 'y ~ x.1 + x.2' | 
|  | 71 formula = ' + '.join(dat.names).replace('+','~',1) | 
|  | 72 | 
|  | 73 #set_default_mode(NO_CONVERSION) | 
|  | 74 try: | 
|  | 75     #linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat)) | 
|  | 76     linear_model = r.lm(formula,  data =  r['na.exclude'](dat)) | 
|  | 77 except RException, rex: | 
|  | 78     stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.") | 
|  | 79 #set_default_mode(BASIC_CONVERSION) | 
|  | 80 | 
|  | 81 #coeffs=linear_model.as_py()['coefficients'] | 
|  | 82 #yintercept= coeffs['(Intercept)'] | 
|  | 83 coeffs=linear_model.rx2('coefficients') | 
|  | 84 yintercept= coeffs.rx2('(Intercept)')[0] | 
|  | 85 summary = r.summary(linear_model) | 
|  | 86 | 
|  | 87 #co = summary.get('coefficients', 'NA') | 
|  | 88 co = summary.rx2("coefficients") | 
|  | 89 | 
|  | 90 """ | 
|  | 91 if len(co) != len(x_vals)+1: | 
|  | 92     stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.") | 
|  | 93 """ | 
|  | 94 #print >>fout, "p-value (Y-intercept)\t%s" %(co[0][3]) | 
|  | 95 print >>fout, "p-value (Y-intercept)\t%s" %(co.rx(1,4)[0]) | 
|  | 96 | 
|  | 97 if len(x_vals) == 1:    #Simple linear  regression case with 1 predictor variable | 
|  | 98     try: | 
|  | 99         #slope = coeffs['x'] | 
|  | 100         slope = r.round(float(coeffs.rx2('x')[0]), digits=10) | 
|  | 101     except: | 
|  | 102         slope = 'NA' | 
|  | 103     try: | 
|  | 104         #pval = co[1][3] | 
|  | 105         pval = r.round(float(co.rx(2,4)[0]), digits=10) | 
|  | 106     except: | 
|  | 107         pval = 'NA' | 
|  | 108     print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope) | 
|  | 109     print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval) | 
|  | 110 else:    #Multiple regression case with >1 predictors | 
|  | 111     ind=1 | 
|  | 112     #while ind < len(coeffs.keys()): | 
|  | 113     while ind < len(coeffs.names): | 
|  | 114         # print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs['x'+str(ind)]) | 
|  | 115         print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs.rx2(coeffs.names[ind])[0]) | 
|  | 116         try: | 
|  | 117             #pval = co[ind][3] | 
|  | 118             pval = r.round(float(co.rx(ind+1,4)[0]), digits=10) | 
|  | 119         except: | 
|  | 120             pval = 'NA' | 
|  | 121         print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval) | 
|  | 122         ind+=1 | 
|  | 123 | 
|  | 124 rsq = summary.rx2('r.squared')[0] | 
|  | 125 adjrsq = summary.rx2('adj.r.squared')[0] | 
|  | 126 fstat = summary.rx2('fstatistic').rx2('value')[0] | 
|  | 127 sigma = summary.rx2('sigma')[0] | 
|  | 128 | 
|  | 129 try: | 
|  | 130     rsq = r.round(float(rsq), digits=5) | 
|  | 131     adjrsq = r.round(float(adjrsq), digits=5) | 
|  | 132     fval = r.round(fstat['value'], digits=5) | 
|  | 133     fstat['value'] = str(fval) | 
|  | 134     sigma = r.round(float(sigma), digits=10) | 
|  | 135 except: | 
|  | 136     pass | 
|  | 137 | 
|  | 138 print >>fout, "R-squared\t%s" %(rsq) | 
|  | 139 print >>fout, "Adjusted R-squared\t%s" %(adjrsq) | 
|  | 140 print >>fout, "F-statistic\t%s" %(fstat) | 
|  | 141 print >>fout, "Sigma\t%s" %(sigma) | 
|  | 142 | 
|  | 143 r.pdf( outfile2, 8, 8 ) | 
|  | 144 if len(x_vals) == 1:    #Simple linear  regression case with 1 predictor variable | 
|  | 145     sub_title =  "Slope = %s; Y-int = %s" %(slope,yintercept) | 
|  | 146     try: | 
|  | 147         r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression") | 
|  | 148         r.abline(a=yintercept, b=slope, col="red") | 
|  | 149     except: | 
|  | 150         pass | 
|  | 151 else: | 
|  | 152     r.pairs(dat, main="Scatterplot Matrix", col="blue") | 
|  | 153 try: | 
|  | 154     r.plot(linear_model) | 
|  | 155 except: | 
|  | 156     pass | 
|  | 157 #r.dev_off() | 
|  | 158 grdevices.dev_off() |