| 80 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 #from galaxy import eggs | 
|  | 4 | 
|  | 5 import sys, string | 
|  | 6 #from rpy import * | 
|  | 7 | 
|  | 8 import rpy2.robjects as robjects | 
|  | 9 import rpy2.rlike.container as rlc | 
|  | 10 r = robjects.r | 
|  | 11 import numpy | 
|  | 12 | 
|  | 13 #export PYTHONPATH=~/galaxy/lib/ | 
|  | 14 #running command python partialR_square.py reg_inp.tab 4 1,2,3 partialR_result.tabular | 
|  | 15 | 
|  | 16 def stop_err(msg): | 
|  | 17     sys.stderr.write(msg) | 
|  | 18     sys.exit() | 
|  | 19 | 
|  | 20 def sscombs(s): | 
|  | 21     if len(s) == 1: | 
|  | 22         return [s] | 
|  | 23     else: | 
|  | 24         ssc = sscombs(s[1:]) | 
|  | 25         return [s[0]] + [s[0]+comb for comb in ssc] + ssc | 
|  | 26 | 
|  | 27 | 
|  | 28 infile = sys.argv[1] | 
|  | 29 y_col = int(sys.argv[2])-1 | 
|  | 30 x_cols = sys.argv[3].split(',') | 
|  | 31 outfile = sys.argv[4] | 
|  | 32 | 
|  | 33 print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) | 
|  | 34 fout = open(outfile,'w') | 
|  | 35 | 
|  | 36 for i, line in enumerate( file ( infile )): | 
|  | 37     line = line.rstrip('\r\n') | 
|  | 38     if len( line )>0 and not line.startswith( '#' ): | 
|  | 39         elems = line.split( '\t' ) | 
|  | 40         break | 
|  | 41     if i == 30: | 
|  | 42         break # Hopefully we'll never get here... | 
|  | 43 | 
|  | 44 if len( elems )<1: | 
|  | 45     stop_err( "The data in your input dataset is either missing or not formatted properly." ) | 
|  | 46 | 
|  | 47 y_vals = [] | 
|  | 48 x_vals = [] | 
|  | 49 x_vector = [] | 
|  | 50 for k,col in enumerate(x_cols): | 
|  | 51     x_cols[k] = int(col)-1 | 
|  | 52     x_vals.append([]) | 
|  | 53     """ | 
|  | 54     try: | 
|  | 55         float( elems[x_cols[k]] ) | 
|  | 56     except: | 
|  | 57         try: | 
|  | 58             msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] ) | 
|  | 59         except: | 
|  | 60             msg = "This operation cannot be performed on non-numeric data." | 
|  | 61         stop_err( msg ) | 
|  | 62     """ | 
|  | 63 NA = 'NA' | 
|  | 64 for ind,line in enumerate( file( infile )): | 
|  | 65     if line and not line.startswith( '#' ): | 
|  | 66         try: | 
|  | 67             fields = line.split("\t") | 
|  | 68             try: | 
|  | 69                 yval = float(fields[y_col]) | 
|  | 70             except Exception, ey: | 
|  | 71                 yval = r('NA') | 
|  | 72                 #print >>sys.stderr, "ey = %s" %ey | 
|  | 73             y_vals.append(yval) | 
|  | 74             for k,col in enumerate(x_cols): | 
|  | 75                 try: | 
|  | 76                     xval = float(fields[col]) | 
|  | 77                 except Exception, ex: | 
|  | 78                     xval = r('NA') | 
|  | 79                     #print >>sys.stderr, "ex = %s" %ex | 
|  | 80                 x_vals[k].append(xval) | 
|  | 81                 x_vector.append(xval) | 
|  | 82         except: | 
|  | 83             pass | 
|  | 84 | 
|  | 85 #x_vals1 = numpy.asarray(x_vals).transpose() | 
|  | 86 #dat= r.list(x=array(x_vals1), y=y_vals) | 
|  | 87 | 
|  | 88 #set_default_mode(NO_CONVERSION) | 
|  | 89 #try: | 
|  | 90 #    full = r.lm(r("y ~ x"), data= r.na_exclude(dat))    #full model includes all the predictor variables specified by the user | 
|  | 91 #except RException, rex: | 
|  | 92 #    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.") | 
|  | 93 #set_default_mode(BASIC_CONVERSION) | 
|  | 94 | 
|  | 95 fv = robjects.FloatVector(x_vector) | 
|  | 96 m = r['matrix'](fv, ncol=len(x_cols),byrow=True) | 
|  | 97 # ensure order for generating formula | 
|  | 98 od = rlc.OrdDict([('y',robjects.FloatVector(y_vals)),('x',m)]) | 
|  | 99 dat = robjects.DataFrame(od) | 
|  | 100 # convert dat.names: ["y","x.1","x.2"] to formula string: 'y ~ x.1 + x.2' | 
|  | 101 formula = ' + '.join(dat.names).replace('+','~',1) | 
|  | 102 try: | 
|  | 103     full = r.lm(formula,  data =  r['na.exclude'](dat)) | 
|  | 104 except RException, rex: | 
|  | 105     stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.") | 
|  | 106 | 
|  | 107 | 
|  | 108 | 
|  | 109 summary = r.summary(full) | 
|  | 110 #fullr2 = summary.get('r.squared','NA') | 
|  | 111 fullr2 = summary.rx2('r.squared')[0] | 
|  | 112 | 
|  | 113 if fullr2 == 'NA': | 
|  | 114     stop_error("Error in linear regression") | 
|  | 115 | 
|  | 116 if len(x_vals) < 10: | 
|  | 117     s = "" | 
|  | 118     for ch in range(len(x_vals)): | 
|  | 119         s += str(ch) | 
|  | 120 else: | 
|  | 121     stop_err("This tool only works with less than 10 predictors.") | 
|  | 122 | 
|  | 123 print >>fout, "#Model\tR-sq\tpartial_R_Terms\tpartial_R_Value" | 
|  | 124 all_combos = sorted(sscombs(s), key=len) | 
|  | 125 all_combos.reverse() | 
|  | 126 for j,cols in enumerate(all_combos): | 
|  | 127     #if len(cols) == len(s):    #Same as the full model above | 
|  | 128     #    continue | 
|  | 129     if len(cols) == 1: | 
|  | 130         #x_vals1 = x_vals[int(cols)] | 
|  | 131         x_v = x_vals[int(cols)] | 
|  | 132     else: | 
|  | 133         x_v = [] | 
|  | 134         for col in cols: | 
|  | 135             #x_v.append(x_vals[int(col)]) | 
|  | 136             x_v.extend(x_vals[int(col)]) | 
|  | 137         #x_vals1 = numpy.asarray(x_v).transpose() | 
|  | 138     #dat= r.list(x=array(x_vals1), y=y_vals) | 
|  | 139     #set_default_mode(NO_CONVERSION) | 
|  | 140     #red = r.lm(r("y ~ x"), data= dat)    #Reduced model | 
|  | 141     #set_default_mode(BASIC_CONVERSION) | 
|  | 142     fv = robjects.FloatVector(x_v) | 
|  | 143     m = r['matrix'](fv, ncol=len(cols),byrow=False) | 
|  | 144     # ensure order for generating formula | 
|  | 145     od = rlc.OrdDict([('y',robjects.FloatVector(y_vals)),('x',m)]) | 
|  | 146     dat = robjects.DataFrame(od) | 
|  | 147     # convert dat.names: ["y","x.1","x.2"] to formula string: 'y ~ x.1 + x.2' | 
|  | 148     formula = ' + '.join(dat.names).replace('+','~',1) | 
|  | 149     try: | 
|  | 150         red = r.lm(formula,  data =  r['na.exclude'](dat)) | 
|  | 151     except RException, rex: | 
|  | 152         stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.") | 
|  | 153 | 
|  | 154 | 
|  | 155     summary = r.summary(red) | 
|  | 156     #redr2 = summary.get('r.squared','NA') | 
|  | 157     redr2 = summary.rx2('r.squared')[0] | 
|  | 158 | 
|  | 159     try: | 
|  | 160         partial_R = (float(fullr2)-float(redr2))/(1-float(redr2)) | 
|  | 161     except: | 
|  | 162         partial_R = 'NA' | 
|  | 163     col_str = "" | 
|  | 164     for col in cols: | 
|  | 165         col_str = col_str + str(int(x_cols[int(col)]) + 1) + " " | 
|  | 166     col_str.strip() | 
|  | 167     partial_R_col_str = "" | 
|  | 168     for col in s: | 
|  | 169         if col not in cols: | 
|  | 170             partial_R_col_str = partial_R_col_str + str(int(x_cols[int(col)]) + 1) + " " | 
|  | 171     partial_R_col_str.strip() | 
|  | 172     if len(cols) == len(s):    #full model | 
|  | 173         partial_R_col_str = "-" | 
|  | 174         partial_R = "-" | 
|  | 175     try: | 
|  | 176         redr2 = "%.4f" %(float(redr2)) | 
|  | 177     except: | 
|  | 178         pass | 
|  | 179     try: | 
|  | 180         partial_R = "%.4f" %(float(partial_R)) | 
|  | 181     except: | 
|  | 182         pass | 
|  | 183     print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,partial_R_col_str,partial_R) |