comparison linear_regression.py @ 90:b061185bcb83 draft

Uploaded
author bernhardlutz
date Thu, 23 Jan 2014 14:53:46 -0500
parents c4a3a8999945
children babf8ab95495
comparison
equal deleted inserted replaced
89:71e953a4191d 90:b061185bcb83
1 #!/usr/bin/env python
2
3 from galaxy import eggs
4 import sys, string
5 import rpy2.robjects as robjects
6 import rpy2.rlike.container as rlc
7 from rpy2.robjects.packages import importr
8 r = robjects.r
9 grdevices = importr('grDevices')
10 # from rpy import *
11 import numpy
12
13
14 def stop_err(msg):
15 sys.stderr.write(msg)
16 sys.exit()
17
18 infile = sys.argv[1]
19 y_col = int(sys.argv[2])-1
20 x_cols = sys.argv[3].split(',')
21 outfile = sys.argv[4]
22 outfile2 = sys.argv[5]
23
24 print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
25 fout = open(outfile,'w')
26 elems = []
27 for i, line in enumerate( file ( infile )):
28 line = line.rstrip('\r\n')
29 if len( line )>0 and not line.startswith( '#' ):
30 elems = line.split( '\t' )
31 break
32 if i == 30:
33 break # Hopefully we'll never get here...
34
35 if len( elems )<1:
36 stop_err( "The data in your input dataset is either missing or not formatted properly." )
37
38 y_vals = []
39 x_vals = []
40
41 for k,col in enumerate(x_cols):
42 x_cols[k] = int(col)-1
43 # x_vals.append([])
44
45 NA = 'NA'
46 for ind,line in enumerate( file( infile )):
47 if line and not line.startswith( '#' ):
48 try:
49 fields = line.split("\t")
50 try:
51 yval = float(fields[y_col])
52 except:
53 yval = r('NA')
54 y_vals.append(yval)
55 for k,col in enumerate(x_cols):
56 try:
57 xval = float(fields[col])
58 except:
59 xval = r('NA')
60 # x_vals[k].append(xval)
61 x_vals.append(xval)
62 except:
63 pass
64 # x_vals1 = numpy.asarray(x_vals).transpose()
65 # dat= r.list(x=array(x_vals1), y=y_vals)
66 fv = robjects.FloatVector(x_vals)
67 m = r['matrix'](fv, ncol=len(x_cols),byrow=True)
68 # ensure order for generating formula
69 od = rlc.OrdDict([('y',robjects.FloatVector(y_vals)),('x',m)])
70 dat = robjects.DataFrame(od)
71 # convert dat.names: ["y","x.1","x.2"] to formula string: 'y ~ x.1 + x.2'
72 formula = ' + '.join(dat.names).replace('+','~',1)
73
74 #set_default_mode(NO_CONVERSION)
75 try:
76 #linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat))
77 linear_model = r.lm(formula, data = r['na.exclude'](dat))
78 except RException, rex:
79 stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
80 #set_default_mode(BASIC_CONVERSION)
81
82 #coeffs=linear_model.as_py()['coefficients']
83 #yintercept= coeffs['(Intercept)']
84 coeffs=linear_model.rx2('coefficients')
85 yintercept= coeffs.rx2('(Intercept)')[0]
86 summary = r.summary(linear_model)
87
88 #co = summary.get('coefficients', 'NA')
89 co = summary.rx2("coefficients")
90
91 """
92 if len(co) != len(x_vals)+1:
93 stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.")
94 """
95 #print >>fout, "p-value (Y-intercept)\t%s" %(co[0][3])
96 print >>fout, "p-value (Y-intercept)\t%s" %(co.rx(1,4)[0])
97
98 if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
99 try:
100 #slope = coeffs['x']
101 slope = r.round(float(coeffs.rx2('x')[0]), digits=10)
102 except:
103 slope = 'NA'
104 try:
105 #pval = co[1][3]
106 pval = r.round(float(co.rx(2,4)[0]), digits=10)
107 except:
108 pval = 'NA'
109 print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope)
110 print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval)
111 else: #Multiple regression case with >1 predictors
112 ind=1
113 #while ind < len(coeffs.keys()):
114 while ind < len(coeffs.names):
115 # print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs['x'+str(ind)])
116 print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs.rx2(coeffs.names[ind])[0])
117 try:
118 #pval = co[ind][3]
119 pval = r.round(float(co.rx(ind+1,4)[0]), digits=10)
120 except:
121 pval = 'NA'
122 print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval)
123 ind+=1
124
125 rsq = summary.rx2('r.squared')[0]
126 adjrsq = summary.rx2('adj.r.squared')[0]
127 fstat = summary.rx2('fstatistic').rx2('value')[0]
128 sigma = summary.rx2('sigma')[0]
129
130 try:
131 rsq = r.round(float(rsq), digits=5)
132 adjrsq = r.round(float(adjrsq), digits=5)
133 fval = r.round(fstat['value'], digits=5)
134 fstat['value'] = str(fval)
135 sigma = r.round(float(sigma), digits=10)
136 except:
137 pass
138
139 print >>fout, "R-squared\t%s" %(rsq)
140 print >>fout, "Adjusted R-squared\t%s" %(adjrsq)
141 print >>fout, "F-statistic\t%s" %(fstat)
142 print >>fout, "Sigma\t%s" %(sigma)
143
144 r.pdf( outfile2, 8, 8 )
145 if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
146 sub_title = "Slope = %s; Y-int = %s" %(slope,yintercept)
147 try:
148 r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
149 r.abline(a=yintercept, b=slope, col="red")
150 except:
151 pass
152 else:
153 r.pairs(dat, main="Scatterplot Matrix", col="blue")
154 try:
155 r.plot(linear_model)
156 except:
157 pass
158 #r.dev_off()
159 grdevices.dev_off()