annotate linear_regression.py @ 90:b061185bcb83 draft

Uploaded
author bernhardlutz
date Thu, 23 Jan 2014 14:53:46 -0500
parents c4a3a8999945
children babf8ab95495
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
80
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
1 #!/usr/bin/env python
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
2
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
3 from galaxy import eggs
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
4 import sys, string
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
5 import rpy2.robjects as robjects
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
6 import rpy2.rlike.container as rlc
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
7 from rpy2.robjects.packages import importr
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
8 r = robjects.r
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
9 grdevices = importr('grDevices')
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
10 # from rpy import *
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
11 import numpy
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
12
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
13
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
14 def stop_err(msg):
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
15 sys.stderr.write(msg)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
16 sys.exit()
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
17
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
18 infile = sys.argv[1]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
19 y_col = int(sys.argv[2])-1
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
20 x_cols = sys.argv[3].split(',')
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
21 outfile = sys.argv[4]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
22 outfile2 = sys.argv[5]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
23
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
24 print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
25 fout = open(outfile,'w')
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
26 elems = []
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
27 for i, line in enumerate( file ( infile )):
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
28 line = line.rstrip('\r\n')
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
29 if len( line )>0 and not line.startswith( '#' ):
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
30 elems = line.split( '\t' )
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
31 break
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
32 if i == 30:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
33 break # Hopefully we'll never get here...
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
34
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
35 if len( elems )<1:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
36 stop_err( "The data in your input dataset is either missing or not formatted properly." )
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
37
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
38 y_vals = []
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
39 x_vals = []
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
40
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
41 for k,col in enumerate(x_cols):
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
42 x_cols[k] = int(col)-1
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
43 # x_vals.append([])
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
44
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
45 NA = 'NA'
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
46 for ind,line in enumerate( file( infile )):
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
47 if line and not line.startswith( '#' ):
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
48 try:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
49 fields = line.split("\t")
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
50 try:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
51 yval = float(fields[y_col])
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
52 except:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
53 yval = r('NA')
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
54 y_vals.append(yval)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
55 for k,col in enumerate(x_cols):
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
56 try:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
57 xval = float(fields[col])
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
58 except:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
59 xval = r('NA')
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
60 # x_vals[k].append(xval)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
61 x_vals.append(xval)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
62 except:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
63 pass
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
64 # x_vals1 = numpy.asarray(x_vals).transpose()
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
65 # dat= r.list(x=array(x_vals1), y=y_vals)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
66 fv = robjects.FloatVector(x_vals)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
67 m = r['matrix'](fv, ncol=len(x_cols),byrow=True)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
68 # ensure order for generating formula
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
69 od = rlc.OrdDict([('y',robjects.FloatVector(y_vals)),('x',m)])
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
70 dat = robjects.DataFrame(od)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
71 # convert dat.names: ["y","x.1","x.2"] to formula string: 'y ~ x.1 + x.2'
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
72 formula = ' + '.join(dat.names).replace('+','~',1)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
73
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
74 #set_default_mode(NO_CONVERSION)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
75 try:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
76 #linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat))
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
77 linear_model = r.lm(formula, data = r['na.exclude'](dat))
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
78 except RException, rex:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
79 stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
80 #set_default_mode(BASIC_CONVERSION)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
81
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
82 #coeffs=linear_model.as_py()['coefficients']
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
83 #yintercept= coeffs['(Intercept)']
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
84 coeffs=linear_model.rx2('coefficients')
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
85 yintercept= coeffs.rx2('(Intercept)')[0]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
86 summary = r.summary(linear_model)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
87
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
88 #co = summary.get('coefficients', 'NA')
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
89 co = summary.rx2("coefficients")
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
90
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
91 """
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
92 if len(co) != len(x_vals)+1:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
93 stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.")
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
94 """
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
95 #print >>fout, "p-value (Y-intercept)\t%s" %(co[0][3])
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
96 print >>fout, "p-value (Y-intercept)\t%s" %(co.rx(1,4)[0])
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
97
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
98 if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
99 try:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
100 #slope = coeffs['x']
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
101 slope = r.round(float(coeffs.rx2('x')[0]), digits=10)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
102 except:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
103 slope = 'NA'
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
104 try:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
105 #pval = co[1][3]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
106 pval = r.round(float(co.rx(2,4)[0]), digits=10)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
107 except:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
108 pval = 'NA'
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
109 print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
110 print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
111 else: #Multiple regression case with >1 predictors
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
112 ind=1
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
113 #while ind < len(coeffs.keys()):
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
114 while ind < len(coeffs.names):
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
115 # print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs['x'+str(ind)])
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
116 print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs.rx2(coeffs.names[ind])[0])
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
117 try:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
118 #pval = co[ind][3]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
119 pval = r.round(float(co.rx(ind+1,4)[0]), digits=10)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
120 except:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
121 pval = 'NA'
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
122 print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
123 ind+=1
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
124
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
125 rsq = summary.rx2('r.squared')[0]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
126 adjrsq = summary.rx2('adj.r.squared')[0]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
127 fstat = summary.rx2('fstatistic').rx2('value')[0]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
128 sigma = summary.rx2('sigma')[0]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
129
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
130 try:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
131 rsq = r.round(float(rsq), digits=5)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
132 adjrsq = r.round(float(adjrsq), digits=5)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
133 fval = r.round(fstat['value'], digits=5)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
134 fstat['value'] = str(fval)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
135 sigma = r.round(float(sigma), digits=10)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
136 except:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
137 pass
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
138
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
139 print >>fout, "R-squared\t%s" %(rsq)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
140 print >>fout, "Adjusted R-squared\t%s" %(adjrsq)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
141 print >>fout, "F-statistic\t%s" %(fstat)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
142 print >>fout, "Sigma\t%s" %(sigma)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
143
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
144 r.pdf( outfile2, 8, 8 )
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
145 if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
146 sub_title = "Slope = %s; Y-int = %s" %(slope,yintercept)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
147 try:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
148 r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
149 r.abline(a=yintercept, b=slope, col="red")
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
150 except:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
151 pass
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
152 else:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
153 r.pairs(dat, main="Scatterplot Matrix", col="blue")
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
154 try:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
155 r.plot(linear_model)
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
156 except:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
157 pass
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
158 #r.dev_off()
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
159 grdevices.dev_off()