| 
80
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 #from galaxy import eggs
 | 
| 
 | 
     4 
 | 
| 
 | 
     5 import sys, string
 | 
| 
 | 
     6 #from rpy import *
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 import rpy2.robjects as robjects
 | 
| 
 | 
     9 import rpy2.rlike.container as rlc
 | 
| 
 | 
    10 r = robjects.r
 | 
| 
 | 
    11 import numpy
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 #export PYTHONPATH=~/galaxy/lib/
 | 
| 
 | 
    14 #running command python partialR_square.py reg_inp.tab 4 1,2,3 partialR_result.tabular
 | 
| 
 | 
    15 
 | 
| 
 | 
    16 def stop_err(msg):
 | 
| 
 | 
    17     sys.stderr.write(msg)
 | 
| 
 | 
    18     sys.exit()
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 def sscombs(s):
 | 
| 
 | 
    21     if len(s) == 1:
 | 
| 
 | 
    22         return [s]
 | 
| 
 | 
    23     else:
 | 
| 
 | 
    24         ssc = sscombs(s[1:])
 | 
| 
 | 
    25         return [s[0]] + [s[0]+comb for comb in ssc] + ssc
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 infile = sys.argv[1]
 | 
| 
 | 
    29 y_col = int(sys.argv[2])-1
 | 
| 
 | 
    30 x_cols = sys.argv[3].split(',')
 | 
| 
 | 
    31 outfile = sys.argv[4]
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
 | 
| 
 | 
    34 fout = open(outfile,'w')
 | 
| 
 | 
    35 
 | 
| 
 | 
    36 for i, line in enumerate( file ( infile )):
 | 
| 
 | 
    37     line = line.rstrip('\r\n')
 | 
| 
 | 
    38     if len( line )>0 and not line.startswith( '#' ):
 | 
| 
 | 
    39         elems = line.split( '\t' )
 | 
| 
 | 
    40         break 
 | 
| 
 | 
    41     if i == 30:
 | 
| 
 | 
    42         break # Hopefully we'll never get here...
 | 
| 
 | 
    43 
 | 
| 
 | 
    44 if len( elems )<1:
 | 
| 
 | 
    45     stop_err( "The data in your input dataset is either missing or not formatted properly." )
 | 
| 
 | 
    46 
 | 
| 
 | 
    47 y_vals = []
 | 
| 
 | 
    48 x_vals = []
 | 
| 
 | 
    49 x_vector = []
 | 
| 
 | 
    50 for k,col in enumerate(x_cols):
 | 
| 
 | 
    51     x_cols[k] = int(col)-1
 | 
| 
 | 
    52     x_vals.append([])    
 | 
| 
 | 
    53     """
 | 
| 
 | 
    54     try:
 | 
| 
 | 
    55         float( elems[x_cols[k]] )
 | 
| 
 | 
    56     except:
 | 
| 
 | 
    57         try:
 | 
| 
 | 
    58             msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] )
 | 
| 
 | 
    59         except:
 | 
| 
 | 
    60             msg = "This operation cannot be performed on non-numeric data."
 | 
| 
 | 
    61         stop_err( msg )
 | 
| 
 | 
    62     """
 | 
| 
 | 
    63 NA = 'NA'
 | 
| 
 | 
    64 for ind,line in enumerate( file( infile )):
 | 
| 
 | 
    65     if line and not line.startswith( '#' ):
 | 
| 
 | 
    66         try:
 | 
| 
 | 
    67             fields = line.split("\t")
 | 
| 
 | 
    68             try:
 | 
| 
 | 
    69                 yval = float(fields[y_col])
 | 
| 
 | 
    70             except Exception, ey:
 | 
| 
 | 
    71                 yval = r('NA')
 | 
| 
 | 
    72                 #print >>sys.stderr, "ey = %s" %ey
 | 
| 
 | 
    73             y_vals.append(yval)
 | 
| 
 | 
    74             for k,col in enumerate(x_cols):
 | 
| 
 | 
    75                 try:
 | 
| 
 | 
    76                     xval = float(fields[col])
 | 
| 
 | 
    77                 except Exception, ex:
 | 
| 
 | 
    78                     xval = r('NA')
 | 
| 
 | 
    79                     #print >>sys.stderr, "ex = %s" %ex
 | 
| 
 | 
    80                 x_vals[k].append(xval)
 | 
| 
 | 
    81                 x_vector.append(xval)
 | 
| 
 | 
    82         except:
 | 
| 
 | 
    83             pass
 | 
| 
 | 
    84 
 | 
| 
 | 
    85 #x_vals1 = numpy.asarray(x_vals).transpose()
 | 
| 
 | 
    86 #dat= r.list(x=array(x_vals1), y=y_vals)
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 #set_default_mode(NO_CONVERSION)
 | 
| 
 | 
    89 #try:
 | 
| 
 | 
    90 #    full = r.lm(r("y ~ x"), data= r.na_exclude(dat))    #full model includes all the predictor variables specified by the user
 | 
| 
 | 
    91 #except RException, rex:
 | 
| 
 | 
    92 #    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
 | 
| 
 | 
    93 #set_default_mode(BASIC_CONVERSION)
 | 
| 
 | 
    94 
 | 
| 
 | 
    95 fv = robjects.FloatVector(x_vector)
 | 
| 
 | 
    96 m = r['matrix'](fv, ncol=len(x_cols),byrow=True)
 | 
| 
 | 
    97 # ensure order for generating formula
 | 
| 
 | 
    98 od = rlc.OrdDict([('y',robjects.FloatVector(y_vals)),('x',m)])
 | 
| 
 | 
    99 dat = robjects.DataFrame(od)
 | 
| 
 | 
   100 # convert dat.names: ["y","x.1","x.2"] to formula string: 'y ~ x.1 + x.2'
 | 
| 
 | 
   101 formula = ' + '.join(dat.names).replace('+','~',1)
 | 
| 
 | 
   102 try:
 | 
| 
 | 
   103     full = r.lm(formula,  data =  r['na.exclude'](dat))
 | 
| 
 | 
   104 except RException, rex:
 | 
| 
 | 
   105     stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
 | 
| 
 | 
   106 
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 
 | 
| 
 | 
   109 summary = r.summary(full)
 | 
| 
 | 
   110 #fullr2 = summary.get('r.squared','NA')
 | 
| 
 | 
   111 fullr2 = summary.rx2('r.squared')[0]
 | 
| 
 | 
   112 
 | 
| 
 | 
   113 if fullr2 == 'NA':
 | 
| 
 | 
   114     stop_error("Error in linear regression")
 | 
| 
 | 
   115 
 | 
| 
 | 
   116 if len(x_vals) < 10:
 | 
| 
 | 
   117     s = ""
 | 
| 
 | 
   118     for ch in range(len(x_vals)):
 | 
| 
 | 
   119         s += str(ch)
 | 
| 
 | 
   120 else:
 | 
| 
 | 
   121     stop_err("This tool only works with less than 10 predictors.")
 | 
| 
 | 
   122 
 | 
| 
 | 
   123 print >>fout, "#Model\tR-sq\tpartial_R_Terms\tpartial_R_Value"
 | 
| 
 | 
   124 all_combos = sorted(sscombs(s), key=len)
 | 
| 
 | 
   125 all_combos.reverse()
 | 
| 
 | 
   126 for j,cols in enumerate(all_combos):
 | 
| 
 | 
   127     #if len(cols) == len(s):    #Same as the full model above
 | 
| 
 | 
   128     #    continue
 | 
| 
 | 
   129     if len(cols) == 1:
 | 
| 
 | 
   130         #x_vals1 = x_vals[int(cols)]
 | 
| 
 | 
   131         x_v = x_vals[int(cols)]        
 | 
| 
 | 
   132     else:
 | 
| 
 | 
   133         x_v = []
 | 
| 
 | 
   134         for col in cols:
 | 
| 
 | 
   135             #x_v.append(x_vals[int(col)])
 | 
| 
 | 
   136             x_v.extend(x_vals[int(col)])
 | 
| 
 | 
   137         #x_vals1 = numpy.asarray(x_v).transpose()
 | 
| 
 | 
   138     #dat= r.list(x=array(x_vals1), y=y_vals)
 | 
| 
 | 
   139     #set_default_mode(NO_CONVERSION)
 | 
| 
 | 
   140     #red = r.lm(r("y ~ x"), data= dat)    #Reduced model
 | 
| 
 | 
   141     #set_default_mode(BASIC_CONVERSION)
 | 
| 
 | 
   142     fv = robjects.FloatVector(x_v)
 | 
| 
 | 
   143     m = r['matrix'](fv, ncol=len(cols),byrow=False)
 | 
| 
 | 
   144     # ensure order for generating formula
 | 
| 
 | 
   145     od = rlc.OrdDict([('y',robjects.FloatVector(y_vals)),('x',m)])
 | 
| 
 | 
   146     dat = robjects.DataFrame(od)
 | 
| 
 | 
   147     # convert dat.names: ["y","x.1","x.2"] to formula string: 'y ~ x.1 + x.2'
 | 
| 
 | 
   148     formula = ' + '.join(dat.names).replace('+','~',1)
 | 
| 
 | 
   149     try:
 | 
| 
 | 
   150         red = r.lm(formula,  data =  r['na.exclude'](dat))
 | 
| 
 | 
   151     except RException, rex:
 | 
| 
 | 
   152         stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
 | 
| 
 | 
   153     
 | 
| 
 | 
   154 
 | 
| 
 | 
   155     summary = r.summary(red)
 | 
| 
 | 
   156     #redr2 = summary.get('r.squared','NA')
 | 
| 
 | 
   157     redr2 = summary.rx2('r.squared')[0]
 | 
| 
 | 
   158 
 | 
| 
 | 
   159     try:
 | 
| 
 | 
   160         partial_R = (float(fullr2)-float(redr2))/(1-float(redr2))
 | 
| 
 | 
   161     except:
 | 
| 
 | 
   162         partial_R = 'NA'
 | 
| 
 | 
   163     col_str = ""
 | 
| 
 | 
   164     for col in cols:
 | 
| 
 | 
   165         col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
 | 
| 
 | 
   166     col_str.strip()
 | 
| 
 | 
   167     partial_R_col_str = ""
 | 
| 
 | 
   168     for col in s:
 | 
| 
 | 
   169         if col not in cols:
 | 
| 
 | 
   170             partial_R_col_str = partial_R_col_str + str(int(x_cols[int(col)]) + 1) + " "
 | 
| 
 | 
   171     partial_R_col_str.strip()
 | 
| 
 | 
   172     if len(cols) == len(s):    #full model
 | 
| 
 | 
   173         partial_R_col_str = "-"
 | 
| 
 | 
   174         partial_R = "-"
 | 
| 
 | 
   175     try:
 | 
| 
 | 
   176         redr2 = "%.4f" %(float(redr2))
 | 
| 
 | 
   177     except:
 | 
| 
 | 
   178         pass
 | 
| 
 | 
   179     try:
 | 
| 
 | 
   180         partial_R = "%.4f" %(float(partial_R))
 | 
| 
 | 
   181     except:
 | 
| 
 | 
   182         pass
 | 
| 
 | 
   183     print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,partial_R_col_str,partial_R)
 |