| 
80
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 #from galaxy import eggs
 | 
| 
 | 
     4 import sys, string
 | 
| 
 | 
     5 #from rpy import *
 | 
| 
 | 
     6 import rpy2.robjects as robjects
 | 
| 
 | 
     7 import rpy2.rlike.container as rlc
 | 
| 
 | 
     8 import rpy2.rinterface as ri
 | 
| 
 | 
     9 r = robjects.r
 | 
| 
 | 
    10 import numpy
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 def stop_err(msg):
 | 
| 
 | 
    13     sys.stderr.write(msg)
 | 
| 
 | 
    14     sys.exit()
 | 
| 
 | 
    15 
 | 
| 
 | 
    16 infile = sys.argv[1]
 | 
| 
 | 
    17 y_col = int(sys.argv[2])-1
 | 
| 
 | 
    18 x_cols = sys.argv[3].split(',')
 | 
| 
 | 
    19 outfile = sys.argv[4]
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 
 | 
| 
 | 
    22 print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
 | 
| 
 | 
    23 fout = open(outfile,'w')
 | 
| 
 | 
    24 elems = []
 | 
| 
 | 
    25 for i, line in enumerate( file ( infile )):
 | 
| 
 | 
    26     line = line.rstrip('\r\n')
 | 
| 
 | 
    27     if len( line )>0 and not line.startswith( '#' ):
 | 
| 
 | 
    28         elems = line.split( '\t' )
 | 
| 
 | 
    29         break 
 | 
| 
 | 
    30     if i == 30:
 | 
| 
 | 
    31         break # Hopefully we'll never get here...
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 if len( elems )<1:
 | 
| 
 | 
    34     stop_err( "The data in your input dataset is either missing or not formatted properly." )
 | 
| 
 | 
    35 
 | 
| 
 | 
    36 y_vals = []
 | 
| 
 | 
    37 x_vals = []
 | 
| 
 | 
    38 x_vector = []
 | 
| 
 | 
    39 for k,col in enumerate(x_cols):
 | 
| 
 | 
    40     x_cols[k] = int(col)-1
 | 
| 
 | 
    41     x_vals.append([])
 | 
| 
 | 
    42 
 | 
| 
 | 
    43 NA = 'NA'
 | 
| 
 | 
    44 for ind,line in enumerate( file( infile )):
 | 
| 
 | 
    45     if line and not line.startswith( '#' ):
 | 
| 
 | 
    46         try:
 | 
| 
 | 
    47             fields = line.split("\t")
 | 
| 
 | 
    48             try:
 | 
| 
 | 
    49                 yval = float(fields[y_col])
 | 
| 
 | 
    50             except:
 | 
| 
 | 
    51                 yval = r('NA')
 | 
| 
 | 
    52             y_vals.append(yval)
 | 
| 
 | 
    53             for k,col in enumerate(x_cols):
 | 
| 
 | 
    54                 try:
 | 
| 
 | 
    55                     xval = float(fields[col])
 | 
| 
 | 
    56                 except:
 | 
| 
 | 
    57                     xval = r('NA')
 | 
| 
 | 
    58                 x_vals[k].append(xval)
 | 
| 
 | 
    59                 x_vector.append(xval)
 | 
| 
 | 
    60         except Exception, e:
 | 
| 
 | 
    61             print e
 | 
| 
 | 
    62             pass
 | 
| 
 | 
    63 
 | 
| 
 | 
    64 #x_vals1 = numpy.asarray(x_vals).transpose()
 | 
| 
 | 
    65 
 | 
| 
 | 
    66 check1=0
 | 
| 
 | 
    67 check0=0
 | 
| 
 | 
    68 for i in y_vals:
 | 
| 
 | 
    69     if i == 1:
 | 
| 
 | 
    70         check1=1
 | 
| 
 | 
    71     if i == 0:
 | 
| 
 | 
    72         check0=1
 | 
| 
 | 
    73 if check1==0 or check0==0:
 | 
| 
 | 
    74     sys.exit("Warning: logistic regression must have at least two classes")
 | 
| 
 | 
    75 
 | 
| 
 | 
    76 for i in y_vals:
 | 
| 
 | 
    77     if i not in [1,0,r('NA')]:
 | 
| 
 | 
    78         print >>fout, str(i)
 | 
| 
 | 
    79         sys.exit("Warning: the current version of this tool can run only with two classes and need to be labeled as 0 and 1.")
 | 
| 
 | 
    80     
 | 
| 
 | 
    81     
 | 
| 
 | 
    82 #dat= r.list(x=array(x_vals1), y=y_vals)
 | 
| 
 | 
    83 novif=0
 | 
| 
 | 
    84 #set_default_mode(NO_CONVERSION)
 | 
| 
 | 
    85 #try:
 | 
| 
 | 
    86 #    linear_model = r.glm(r("y ~ x"), data = r.na_exclude(dat),family="binomial")
 | 
| 
 | 
    87 #    #r('library(car)')
 | 
| 
 | 
    88 #    #r.assign('dat',dat)
 | 
| 
 | 
    89 #    #r.assign('ncols',len(x_cols))
 | 
| 
 | 
    90 #    #r.vif(r('glm(dat$y ~ ., data = na.exclude(data.frame(as.matrix(dat$x,ncol=ncols))->datx),family="binomial")')).as_py()
 | 
| 
 | 
    91 #   
 | 
| 
 | 
    92 #except RException, rex:
 | 
| 
 | 
    93 #    stop_err("Error performing logistic regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
 | 
| 
 | 
    94 
 | 
| 
 | 
    95 fv = robjects.FloatVector(x_vector)
 | 
| 
 | 
    96 m = r['matrix'](fv, ncol=len(x_cols),byrow=True)
 | 
| 
 | 
    97 # ensure order for generating formula
 | 
| 
 | 
    98 od = rlc.OrdDict([('y',robjects.FloatVector(y_vals)),('x',m)])
 | 
| 
 | 
    99 dat = robjects.DataFrame(od)
 | 
| 
 | 
   100 # convert dat.names: ["y","x.1","x.2"] to formula string: 'y ~ x.1 + x.2'
 | 
| 
 | 
   101 formula = ' + '.join(dat.names).replace('+','~',1)
 | 
| 
 | 
   102 print formula
 | 
| 
 | 
   103 try:
 | 
| 
 | 
   104     linear_model = r.glm(formula,  data =  r['na.exclude'](dat), family="binomial")
 | 
| 
 | 
   105 except Exception, rex:
 | 
| 
 | 
   106     stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 if len(x_cols)>1:
 | 
| 
 | 
   109     try:
 | 
| 
 | 
   110         r('library(car)')
 | 
| 
 | 
   111         r.assign('dat',dat)
 | 
| 
 | 
   112         r.assign('ncols',len(x_cols))
 | 
| 
 | 
   113         #vif=r.vif(r('glm(dat$y ~ ., data = na.exclude(data.frame(as.matrix(dat$x,ncol=ncols))->datx),family="binomial")'))
 | 
| 
 | 
   114         od2 = rlc.OrdDict([('datx', m)])
 | 
| 
 | 
   115         glm_data_frame = robjects.DataFrame(od2)
 | 
| 
 | 
   116         glm_result = r.glm("dat$y ~ .", data = r['na.exclude'](glm_data_frame),family="binomial")
 | 
| 
 | 
   117         print 'Have glm'
 | 
| 
 | 
   118         vif = r.vif(glm_result)
 | 
| 
 | 
   119     except Exception, rex:        
 | 
| 
 | 
   120         print rex
 | 
| 
 | 
   121 else:
 | 
| 
 | 
   122     novif=1
 | 
| 
 | 
   123     
 | 
| 
 | 
   124 #set_default_mode(BASIC_CONVERSION)
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 #coeffs=linear_model.as_py()['coefficients']
 | 
| 
 | 
   127 coeffs=linear_model.rx2('coefficients')
 | 
| 
 | 
   128 #null_deviance=linear_model.as_py()['null.deviance']
 | 
| 
 | 
   129 null_deviance=linear_model.rx2('null.deviance')[0]
 | 
| 
 | 
   130 #residual_deviance=linear_model.as_py()['deviance']
 | 
| 
 | 
   131 residual_deviance=linear_model.rx2('deviance')[0]
 | 
| 
 | 
   132 #yintercept= coeffs['(Intercept)']
 | 
| 
 | 
   133 yintercept= coeffs.rx2('(Intercept)')[0]
 | 
| 
 | 
   134 
 | 
| 
 | 
   135 summary = r.summary(linear_model)
 | 
| 
 | 
   136 #co = summary.get('coefficients', 'NA')
 | 
| 
 | 
   137 co = summary.rx2("coefficients")
 | 
| 
 | 
   138 print co
 | 
| 
 | 
   139 """
 | 
| 
 | 
   140 if len(co) != len(x_vals)+1:
 | 
| 
 | 
   141     stop_err("Stopped performing logistic regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.")
 | 
| 
 | 
   142 """
 | 
| 
 | 
   143 
 | 
| 
 | 
   144 try:
 | 
| 
 | 
   145     yintercept = r.round(float(yintercept), digits=10)[0]
 | 
| 
 | 
   146     #pvaly = r.round(float(co[0][3]), digits=10)
 | 
| 
 | 
   147     pvaly = r.round(float(co.rx(1,4)[0]), digits=10)[0]
 | 
| 
 | 
   148 except Exception, e:
 | 
| 
 | 
   149     print str(e)
 | 
| 
 | 
   150     pass
 | 
| 
 | 
   151 print >>fout, "response column\tc%d" %(y_col+1)
 | 
| 
 | 
   152 tempP=[]
 | 
| 
 | 
   153 for i in x_cols:
 | 
| 
 | 
   154     tempP.append('c'+str(i+1))
 | 
| 
 | 
   155 tempP=','.join(tempP)
 | 
| 
 | 
   156 print >>fout, "predictor column(s)\t%s" %(tempP)
 | 
| 
 | 
   157 print >>fout, "Y-intercept\t%s" %(yintercept)
 | 
| 
 | 
   158 print >>fout, "p-value (Y-intercept)\t%s" %(pvaly)
 | 
| 
 | 
   159 
 | 
| 
 | 
   160 print coeffs
 | 
| 
 | 
   161 if len(x_vals) == 1:    #Simple linear  regression case with 1 predictor variable
 | 
| 
 | 
   162     try:
 | 
| 
 | 
   163         #slope = r.round(float(coeffs['x']), digits=10)
 | 
| 
 | 
   164         raw_slope = coeffs.rx2('x')[0]
 | 
| 
 | 
   165         slope = r.round(float(raw_slope), digits=10)[0] 
 | 
| 
 | 
   166     except:
 | 
| 
 | 
   167         slope = 'NA'
 | 
| 
 | 
   168     try:
 | 
| 
 | 
   169         #pval = r.round(float(co[1][3]), digits=10)
 | 
| 
 | 
   170         pval = r.round(float(co.rx2(2,4)[0]), digits=10)[0]
 | 
| 
 | 
   171     except:
 | 
| 
 | 
   172         pval = 'NA'
 | 
| 
 | 
   173     print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope)
 | 
| 
 | 
   174     print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval)
 | 
| 
 | 
   175 else:    #Multiple regression case with >1 predictors
 | 
| 
 | 
   176     ind=1
 | 
| 
 | 
   177     #while ind < len(coeffs.keys()):
 | 
| 
 | 
   178     print len(coeffs.names)
 | 
| 
 | 
   179     while ind < len(coeffs.names):
 | 
| 
 | 
   180         try:
 | 
| 
 | 
   181             #slope = r.round(float(coeffs['x'+str(ind)]), digits=10)
 | 
| 
 | 
   182             raw_slope = coeffs.rx2('x.' + str(ind))[0]
 | 
| 
 | 
   183             slope = r.round(float(raw_slope), digits=10)[0]
 | 
| 
 | 
   184         except:
 | 
| 
 | 
   185             slope = 'NA'
 | 
| 
 | 
   186         print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,slope)
 | 
| 
 | 
   187 
 | 
| 
 | 
   188         try:
 | 
| 
 | 
   189             #pval = r.round(float(co[ind][3]), digits=10)
 | 
| 
 | 
   190             pval = r.round(float(co.rx2(ind+1, 4)[0]), digits=10)[0]
 | 
| 
 | 
   191         except:
 | 
| 
 | 
   192             pval = 'NA'
 | 
| 
 | 
   193         print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval)
 | 
| 
 | 
   194         ind+=1
 | 
| 
 | 
   195 
 | 
| 
 | 
   196 #rsq = summary.get('r.squared','NA')
 | 
| 
 | 
   197 rsq = summary.rx2('r.squared')
 | 
| 
 | 
   198 if rsq == ri.RNULLType():
 | 
| 
 | 
   199     rsq = 'NA'
 | 
| 
 | 
   200 else:
 | 
| 
 | 
   201     rsq = rsq[0]
 | 
| 
 | 
   202 
 | 
| 
 | 
   203 
 | 
| 
 | 
   204 try:
 | 
| 
 | 
   205     #rsq= r.round(float((null_deviance-residual_deviance)/null_deviance), digits=5)
 | 
| 
 | 
   206     rsq= r.round(float((null_deviance-residual_deviance)/null_deviance), digits=5)[0]
 | 
| 
 | 
   207     #null_deviance= r.round(float(null_deviance), digits=5)
 | 
| 
 | 
   208     null_deviance= r.round(float(null_deviance), digits=5)[0]
 | 
| 
 | 
   209     #residual_deviance= r.round(float(residual_deviance), digits=5)
 | 
| 
 | 
   210     residual_deviance= r.round(float(residual_deviance), digits=5)[0]
 | 
| 
 | 
   211     
 | 
| 
 | 
   212 except:
 | 
| 
 | 
   213     pass
 | 
| 
 | 
   214 
 | 
| 
 | 
   215 print >>fout, "Null deviance\t%s" %(null_deviance)
 | 
| 
 | 
   216 
 | 
| 
 | 
   217 print >>fout, "Residual deviance\t%s" %(residual_deviance)
 | 
| 
 | 
   218 print >>fout, "pseudo R-squared\t%s" %(rsq)
 | 
| 
 | 
   219 print >>fout, "\n"
 | 
| 
 | 
   220 print >>fout, 'vif'
 | 
| 
 | 
   221 
 | 
| 
 | 
   222 if novif==0:
 | 
| 
 | 
   223     #py_vif=vif.as_py()
 | 
| 
 | 
   224     count=0
 | 
| 
 | 
   225     for i in sorted(vif.names):
 | 
| 
 | 
   226         print >>fout,'c'+str(x_cols[count]+1) ,str(vif.rx2(i)[0])
 | 
| 
 | 
   227         count+=1
 | 
| 
 | 
   228 elif novif==1:
 | 
| 
 | 
   229     print >>fout, "vif can calculate only when model have more than 1 predictor"
 |