| 80 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 #from galaxy import eggs | 
|  | 4 import sys, string | 
|  | 5 #from rpy import * | 
|  | 6 import rpy2.robjects as robjects | 
|  | 7 import rpy2.rlike.container as rlc | 
|  | 8 import rpy2.rinterface as ri | 
|  | 9 r = robjects.r | 
|  | 10 import numpy | 
|  | 11 | 
|  | 12 def stop_err(msg): | 
|  | 13     sys.stderr.write(msg) | 
|  | 14     sys.exit() | 
|  | 15 | 
|  | 16 infile = sys.argv[1] | 
|  | 17 y_col = int(sys.argv[2])-1 | 
|  | 18 x_cols = sys.argv[3].split(',') | 
|  | 19 outfile = sys.argv[4] | 
|  | 20 | 
|  | 21 | 
|  | 22 print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) | 
|  | 23 fout = open(outfile,'w') | 
|  | 24 elems = [] | 
|  | 25 for i, line in enumerate( file ( infile )): | 
|  | 26     line = line.rstrip('\r\n') | 
|  | 27     if len( line )>0 and not line.startswith( '#' ): | 
|  | 28         elems = line.split( '\t' ) | 
|  | 29         break | 
|  | 30     if i == 30: | 
|  | 31         break # Hopefully we'll never get here... | 
|  | 32 | 
|  | 33 if len( elems )<1: | 
|  | 34     stop_err( "The data in your input dataset is either missing or not formatted properly." ) | 
|  | 35 | 
|  | 36 y_vals = [] | 
|  | 37 x_vals = [] | 
|  | 38 x_vector = [] | 
|  | 39 for k,col in enumerate(x_cols): | 
|  | 40     x_cols[k] = int(col)-1 | 
|  | 41     x_vals.append([]) | 
|  | 42 | 
|  | 43 NA = 'NA' | 
|  | 44 for ind,line in enumerate( file( infile )): | 
|  | 45     if line and not line.startswith( '#' ): | 
|  | 46         try: | 
|  | 47             fields = line.split("\t") | 
|  | 48             try: | 
|  | 49                 yval = float(fields[y_col]) | 
|  | 50             except: | 
|  | 51                 yval = r('NA') | 
|  | 52             y_vals.append(yval) | 
|  | 53             for k,col in enumerate(x_cols): | 
|  | 54                 try: | 
|  | 55                     xval = float(fields[col]) | 
|  | 56                 except: | 
|  | 57                     xval = r('NA') | 
|  | 58                 x_vals[k].append(xval) | 
|  | 59                 x_vector.append(xval) | 
|  | 60         except Exception, e: | 
|  | 61             print e | 
|  | 62             pass | 
|  | 63 | 
|  | 64 #x_vals1 = numpy.asarray(x_vals).transpose() | 
|  | 65 | 
|  | 66 check1=0 | 
|  | 67 check0=0 | 
|  | 68 for i in y_vals: | 
|  | 69     if i == 1: | 
|  | 70         check1=1 | 
|  | 71     if i == 0: | 
|  | 72         check0=1 | 
|  | 73 if check1==0 or check0==0: | 
|  | 74     sys.exit("Warning: logistic regression must have at least two classes") | 
|  | 75 | 
|  | 76 for i in y_vals: | 
|  | 77     if i not in [1,0,r('NA')]: | 
|  | 78         print >>fout, str(i) | 
|  | 79         sys.exit("Warning: the current version of this tool can run only with two classes and need to be labeled as 0 and 1.") | 
|  | 80 | 
|  | 81 | 
|  | 82 #dat= r.list(x=array(x_vals1), y=y_vals) | 
|  | 83 novif=0 | 
|  | 84 #set_default_mode(NO_CONVERSION) | 
|  | 85 #try: | 
|  | 86 #    linear_model = r.glm(r("y ~ x"), data = r.na_exclude(dat),family="binomial") | 
|  | 87 #    #r('library(car)') | 
|  | 88 #    #r.assign('dat',dat) | 
|  | 89 #    #r.assign('ncols',len(x_cols)) | 
|  | 90 #    #r.vif(r('glm(dat$y ~ ., data = na.exclude(data.frame(as.matrix(dat$x,ncol=ncols))->datx),family="binomial")')).as_py() | 
|  | 91 # | 
|  | 92 #except RException, rex: | 
|  | 93 #    stop_err("Error performing logistic regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.") | 
|  | 94 | 
|  | 95 fv = robjects.FloatVector(x_vector) | 
|  | 96 m = r['matrix'](fv, ncol=len(x_cols),byrow=True) | 
|  | 97 # ensure order for generating formula | 
|  | 98 od = rlc.OrdDict([('y',robjects.FloatVector(y_vals)),('x',m)]) | 
|  | 99 dat = robjects.DataFrame(od) | 
|  | 100 # convert dat.names: ["y","x.1","x.2"] to formula string: 'y ~ x.1 + x.2' | 
|  | 101 formula = ' + '.join(dat.names).replace('+','~',1) | 
|  | 102 print formula | 
|  | 103 try: | 
|  | 104     linear_model = r.glm(formula,  data =  r['na.exclude'](dat), family="binomial") | 
|  | 105 except Exception, rex: | 
|  | 106     stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.") | 
|  | 107 | 
|  | 108 if len(x_cols)>1: | 
|  | 109     try: | 
|  | 110         r('library(car)') | 
|  | 111         r.assign('dat',dat) | 
|  | 112         r.assign('ncols',len(x_cols)) | 
|  | 113         #vif=r.vif(r('glm(dat$y ~ ., data = na.exclude(data.frame(as.matrix(dat$x,ncol=ncols))->datx),family="binomial")')) | 
|  | 114         od2 = rlc.OrdDict([('datx', m)]) | 
|  | 115         glm_data_frame = robjects.DataFrame(od2) | 
|  | 116         glm_result = r.glm("dat$y ~ .", data = r['na.exclude'](glm_data_frame),family="binomial") | 
|  | 117         print 'Have glm' | 
|  | 118         vif = r.vif(glm_result) | 
|  | 119     except Exception, rex: | 
|  | 120         print rex | 
|  | 121 else: | 
|  | 122     novif=1 | 
|  | 123 | 
|  | 124 #set_default_mode(BASIC_CONVERSION) | 
|  | 125 | 
|  | 126 #coeffs=linear_model.as_py()['coefficients'] | 
|  | 127 coeffs=linear_model.rx2('coefficients') | 
|  | 128 #null_deviance=linear_model.as_py()['null.deviance'] | 
|  | 129 null_deviance=linear_model.rx2('null.deviance')[0] | 
|  | 130 #residual_deviance=linear_model.as_py()['deviance'] | 
|  | 131 residual_deviance=linear_model.rx2('deviance')[0] | 
|  | 132 #yintercept= coeffs['(Intercept)'] | 
|  | 133 yintercept= coeffs.rx2('(Intercept)')[0] | 
|  | 134 | 
|  | 135 summary = r.summary(linear_model) | 
|  | 136 #co = summary.get('coefficients', 'NA') | 
|  | 137 co = summary.rx2("coefficients") | 
|  | 138 print co | 
|  | 139 """ | 
|  | 140 if len(co) != len(x_vals)+1: | 
|  | 141     stop_err("Stopped performing logistic regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.") | 
|  | 142 """ | 
|  | 143 | 
|  | 144 try: | 
|  | 145     yintercept = r.round(float(yintercept), digits=10)[0] | 
|  | 146     #pvaly = r.round(float(co[0][3]), digits=10) | 
|  | 147     pvaly = r.round(float(co.rx(1,4)[0]), digits=10)[0] | 
|  | 148 except Exception, e: | 
|  | 149     print str(e) | 
|  | 150     pass | 
|  | 151 print >>fout, "response column\tc%d" %(y_col+1) | 
|  | 152 tempP=[] | 
|  | 153 for i in x_cols: | 
|  | 154     tempP.append('c'+str(i+1)) | 
|  | 155 tempP=','.join(tempP) | 
|  | 156 print >>fout, "predictor column(s)\t%s" %(tempP) | 
|  | 157 print >>fout, "Y-intercept\t%s" %(yintercept) | 
|  | 158 print >>fout, "p-value (Y-intercept)\t%s" %(pvaly) | 
|  | 159 | 
|  | 160 print coeffs | 
|  | 161 if len(x_vals) == 1:    #Simple linear  regression case with 1 predictor variable | 
|  | 162     try: | 
|  | 163         #slope = r.round(float(coeffs['x']), digits=10) | 
|  | 164         raw_slope = coeffs.rx2('x')[0] | 
|  | 165         slope = r.round(float(raw_slope), digits=10)[0] | 
|  | 166     except: | 
|  | 167         slope = 'NA' | 
|  | 168     try: | 
|  | 169         #pval = r.round(float(co[1][3]), digits=10) | 
|  | 170         pval = r.round(float(co.rx2(2,4)[0]), digits=10)[0] | 
|  | 171     except: | 
|  | 172         pval = 'NA' | 
|  | 173     print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope) | 
|  | 174     print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval) | 
|  | 175 else:    #Multiple regression case with >1 predictors | 
|  | 176     ind=1 | 
|  | 177     #while ind < len(coeffs.keys()): | 
|  | 178     print len(coeffs.names) | 
|  | 179     while ind < len(coeffs.names): | 
|  | 180         try: | 
|  | 181             #slope = r.round(float(coeffs['x'+str(ind)]), digits=10) | 
|  | 182             raw_slope = coeffs.rx2('x.' + str(ind))[0] | 
|  | 183             slope = r.round(float(raw_slope), digits=10)[0] | 
|  | 184         except: | 
|  | 185             slope = 'NA' | 
|  | 186         print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,slope) | 
|  | 187 | 
|  | 188         try: | 
|  | 189             #pval = r.round(float(co[ind][3]), digits=10) | 
|  | 190             pval = r.round(float(co.rx2(ind+1, 4)[0]), digits=10)[0] | 
|  | 191         except: | 
|  | 192             pval = 'NA' | 
|  | 193         print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval) | 
|  | 194         ind+=1 | 
|  | 195 | 
|  | 196 #rsq = summary.get('r.squared','NA') | 
|  | 197 rsq = summary.rx2('r.squared') | 
|  | 198 if rsq == ri.RNULLType(): | 
|  | 199     rsq = 'NA' | 
|  | 200 else: | 
|  | 201     rsq = rsq[0] | 
|  | 202 | 
|  | 203 | 
|  | 204 try: | 
|  | 205     #rsq= r.round(float((null_deviance-residual_deviance)/null_deviance), digits=5) | 
|  | 206     rsq= r.round(float((null_deviance-residual_deviance)/null_deviance), digits=5)[0] | 
|  | 207     #null_deviance= r.round(float(null_deviance), digits=5) | 
|  | 208     null_deviance= r.round(float(null_deviance), digits=5)[0] | 
|  | 209     #residual_deviance= r.round(float(residual_deviance), digits=5) | 
|  | 210     residual_deviance= r.round(float(residual_deviance), digits=5)[0] | 
|  | 211 | 
|  | 212 except: | 
|  | 213     pass | 
|  | 214 | 
|  | 215 print >>fout, "Null deviance\t%s" %(null_deviance) | 
|  | 216 | 
|  | 217 print >>fout, "Residual deviance\t%s" %(residual_deviance) | 
|  | 218 print >>fout, "pseudo R-squared\t%s" %(rsq) | 
|  | 219 print >>fout, "\n" | 
|  | 220 print >>fout, 'vif' | 
|  | 221 | 
|  | 222 if novif==0: | 
|  | 223     #py_vif=vif.as_py() | 
|  | 224     count=0 | 
|  | 225     for i in sorted(vif.names): | 
|  | 226         print >>fout,'c'+str(x_cols[count]+1) ,str(vif.rx2(i)[0]) | 
|  | 227         count+=1 | 
|  | 228 elif novif==1: | 
|  | 229     print >>fout, "vif can calculate only when model have more than 1 predictor" |