comparison rgedgeR/rgGSEA.py @ 0:82e0af566160 draft

Uploaded
author fubar
date Wed, 12 Jun 2013 02:58:43 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:82e0af566160
1 """
2 April 2013
3 eeesh GSEA does NOT respect the mode flag!
4
5 Now realise that the creation of the input rank file for gsea needs to take the lowest p value for duplicate
6 feature names. To make Ish's life easier, remove duplicate gene ids from any gene set to stop GSEA from
7 barfing.
8
9 October 14 2012
10 Amazingly long time to figure out that GSEA fails with useless error message if any filename contains a dash "-"
11 eesh.
12
13 Added history .gmt source - requires passing a faked name to gsea
14 Wrapper for GSEA http://www.broadinstitute.org/gsea/index.jsp
15 Started Feb 22
16 Copyright 2012 Ross Lazarus
17 All rights reserved
18 Licensed under the LGPL
19
20 called eg as
21
22 #!/bin/sh
23 GALAXY_LIB="/data/extended/galaxy/lib"
24 if [ "$GALAXY_LIB" != "None" ]; then
25 if [ -n "$PYTHONPATH" ]; then
26 PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
27 else
28 PYTHONPATH="$GALAXY_LIB"
29 fi
30 export PYTHONPATH
31 fi
32
33 cd /data/extended/galaxy/database/job_working_directory/027/27311
34 python /data/extended/galaxy/tools/rgenetics/rgGSEA.py --input_tab "/data/extended/galaxy/database/files/033/dataset_33806.dat" --adjpvalcol "5" --signcol "2"
35 --idcol "1" --outhtml "/data/extended/galaxy/database/files/034/dataset_34455.dat" --input_name "actaearly-Controlearly-actalate-Controllate_topTable.xls"
36 --setMax "500" --setMin "15" --nPerm "1000" --plotTop "20"
37 --gsea_jar "/data/extended/galaxy/tool-data/shared/jars/gsea2-2.0.12.jar"
38 --output_dir "/data/extended/galaxy/database/job_working_directory/027/27311/dataset_34455_files" --mode "Max_probe"
39 --title " actaearly-Controlearly-actalate-Controllate_interpro_GSEA" --builtin_gmt "/data/genomes/gsea/3.1/IPR_DOMAIN.gmt"
40
41
42 """
43 import optparse
44 import tempfile
45 import os
46 import sys
47 import subprocess
48 import time
49 import shutil
50 import glob
51 import math
52 import re
53
54 KEEPSELECTION = False # detailed records for selection of multiple probes
55
56 def timenow():
57 """return current time as a string
58 """
59 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
60
61
62
63 def fix_subdir(adir,destdir):
64 """ Galaxy wants everything in the same files_dir
65 if os.path.exists(adir):
66 for (d,dirs,files) in os.path.walk(adir):
67 for f in files:
68 sauce = os.path.join(d,f)
69 shutil.copy(sauce,destdir)
70 """
71
72 def fixAffycrap(apath=''):
73 """class='richTable'>RUNNING ES</th><th class='richTable'>CORE ENRICHMENT</th><tr><td class='lessen'>1</td>
74 <td><a href='https://www.affymetrix.com/LinkServlet?probeset=LBR'>LBR</a></td><td></td><td></td><td>1113</td>
75 <td>0.194</td><td>-0.1065</td><td>No</td></tr><tr><td class='lessen'>2</td><td>
76 <a href='https://www.affymetrix.com/LinkServlet?probeset=GGPS1'>GGPS1</a></td><td></td><td></td><td>4309</td><td>0.014</td><td>-0.4328</td>
77 <td>No</td></tr>
78 """
79 html = []
80 try:
81 html = open(apath,'r').readlines()
82 except:
83 return html
84 for i,row in enumerate(html):
85 row = re.sub('https\:\/\/www.affymetrix.com\/LinkServlet\?probeset=',"http://www.genecards.org/index.php?path=/Search/keyword/",row)
86 html[i] = row
87 return html
88
89 cleanup = False
90 if os.path.exists(adir):
91 flist = os.listdir(adir) # get all files created
92 for f in flist:
93 apath = os.path.join(adir,f)
94 dest = os.path.join(destdir,f)
95 if not os.path.isdir(apath):
96 if os.path.splitext(f)[1].lower() == '.html':
97 html = fixAffycrap(apath)
98 fixed = open(apath,'w')
99 fixed.write('\n'.join(html))
100 fixed.write('\n')
101 fixed.close()
102 if not os.path.isfile(dest):
103 shutil.copy(apath,dest)
104 else:
105 fix_subdir(apath,destdir)
106 if cleanup:
107 try:
108 shutil.rmtree(path=adir,ignore_errors=True)
109 except:
110 pass
111
112
113
114 def getFileString(fpath, outpath):
115 """
116 format a nice file size string
117 """
118 size = ''
119 fp = os.path.join(outpath, fpath)
120 s = fpath
121 if os.path.isfile(fp):
122 n = float(os.path.getsize(fp))
123 if n > 2**20:
124 size = ' (%1.1f MB)' % (n/2**20)
125 elif n > 2**10:
126 size = ' (%1.1f KB)' % (n/2**10)
127 elif n > 0:
128 size = ' (%d B)' % (int(n))
129 s = '%s %s' % (fpath, size)
130 return s
131
132 class gsea_wrapper:
133 """
134 GSEA java desktop client has a CL interface. CL can be gleaned by clicking the 'command line' button after setting up an analysis
135 We don't want gsea to do the analysis but it can read .rnk files containing rows of identifiers and an evidence weight such as the signed t statistic from limma for differential expression
136 (vgalaxy)rlazarus@iaas1:~/public_html/idle_illumina_analysis$ cat gseaHumanREFSEQ.sh
137 #!/bin/bash
138 for RNK in `ls *.rnk`
139 do
140 DIRNAME=${RNK%.*}
141 echo $DIRNAME
142 qsub -cwd -b y java -Xmx4096m -cp /data/app/bin/gsea2-2.07.jar xtools.gsea.GseaPreranked -gmx ../msigdb.v3.0.symbols.gmt -collapse true -mode Max_probe -norm meandiv
143 -nperm 1000 -rnk $RNK -scoring_scheme weighted -rpt_label $RNK -chip ../RefSeq_human.chip -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp
144 -set_max 500 -set_min 15 -zip_report false -out gseaout/${DIRNAME} -gui false
145 done
146 """
147
148 def __init__(self,myName=None,opts=None):
149 """ setup cl for gsea
150 """
151 self.idcol = 0
152 self.signcol = 0
153 self.adjpvalcol = 0
154 self.progname=myName
155 self.opts = opts
156 remove_duplicates=True
157 if not os.path.isdir(opts.output_dir):
158 try:
159 os.makedirs(opts.output_dir)
160 except:
161 print >> sys.stderr,'##Error: GSEA wrapper unable to create or find output directory %s. Stopping' % (opts.output_dir)
162 sys.exit(1)
163 fakeGMT = re.sub('[^a-zA-Z0-9_]+', '', opts.input_name) # gives a more useful title for the GSEA report
164 fakeGMT = os.path.join(opts.output_dir,fakeGMT)
165 fakeGMT = os.path.abspath(fakeGMT)
166 fakeRanks = '%s.rnk' % fakeGMT
167 if not fakeGMT.endswith('.gmt'):
168 fakeGMT = '%s.gmt' % fakeGMT
169 if opts.builtin_gmt and opts.history_gmt:
170 newfile = open(fakeGMT,'w')
171 subprocess.call(['cat',opts.builtin_gmt,opts.history_gmt],stdout=newfile)
172 newfile.close()
173 elif opts.history_gmt:
174 subprocess.call(['cp',opts.history_gmt,fakeGMT])
175 else:
176 subprocess.call(['cp',opts.builtin_gmt,fakeGMT])
177 # remove dupes from each gene set
178 gmt = open(fakeGMT,'r').readlines()
179 gmt = [x for x in gmt if len(x.split('\t')) > 3]
180 ugmt = []
181 for i,row in enumerate(gmt):
182 rows = row.rstrip().split('\t')
183 gmtname = rows[0]
184 gmtcomment = rows[1]
185 glist = list(set(rows[2:]))
186 newgmt = [gmtname,gmtcomment]
187 newgmt += glist
188 ugmt.append('\t'.join(newgmt))
189 gmt = open(fakeGMT,'w')
190 gmt.write('\n'.join(ugmt))
191 gmt.write('\n')
192 gmt.close()
193 if opts.input_ranks:
194 infname = opts.input_ranks
195 rdat = open(opts.input_ranks,'r').readlines() # suck in and remove blank ids that cause gsea to barf rml april 10 2012
196 rdat = [x.rstrip().split('\t') for x in rdat[1:]] # ignore head
197 dat = [[x[0],x[1],x[1]] for x in rdat]
198 # fake same structure as input tabular file
199 try:
200 pvals = [float(x[1]) for x in dat]
201 signs = [float(x[1]) for x in dat]
202 except:
203 print >> sys.stderr, '## error converting floating point - cannot process this input'
204 sys.exit(99)
205 else: # read tabular
206 self.idcol = int(opts.idcol) - 1
207 self.signcol = int(opts.signcol) - 1
208 self.adjpvalcol = int(opts.adjpvalcol) - 1
209 maxcol = max(self.idcol,self.signcol,self.adjpvalcol)
210 infname = opts.input_tab
211 indat = open(opts.input_tab,'r').readlines()
212 dat = [x.rstrip().split('\t') for x in indat[1:]]
213 dat = [x for x in dat if len(x) > maxcol]
214 dat = [[x[self.idcol],x[self.adjpvalcol],x[self.signcol]] for x in dat] # reduce to rank form
215 pvals = [float(x[1]) for x in dat]
216 outofrange = [x for x in pvals if ((x < 0.0) or (x > 1.0))]
217 assert len(outofrange) == 0, '## p values outside 0-1 encountered - was that the right column for adjusted p value?'
218 signs = [float(x[2]) for x in dat]
219 outofrange = [i for i,x in enumerate(signs) if (not x) and (dat[i][self.signcol] <> '0')]
220 bad = [dat[x][2] for x in outofrange]
221 assert len(outofrange) == 0, '## null numeric values encountered for sign - was that the right column? %s' % bad
222 ids = [x[0] for x in dat]
223 res = []
224 self.comments = []
225 useme = []
226 for i,row in enumerate(dat):
227 if row[1].upper() != 'NA' and row[2].upper() != 'NA' and row[0] != '' :
228 useme.append(i)
229 lost = len(dat) - len(useme)
230 if lost <> 0:
231 newdat = [dat[x] for x in useme]
232 del dat
233 dat = newdat
234 print >> sys.stdout, '## %d lost - NA values or null id' % lost
235 if remove_duplicates:
236 uids = list(set(ids)) # complex procedure to get min pval for each unique id
237 if len(uids) <> len(ids): # dupes - deal with mode
238 print >> sys.stdout,'## Dealing with %d uids in %d ids' % (len(uids),len(ids))
239 ures = {}
240 for i,id in enumerate(ids):
241 p = pvals[i]
242 ures.setdefault(id,[])
243 ures[id].append((p,signs[i]))
244 for id in uids:
245 tlist = ures[id]
246 tp = [x[0] for x in tlist]
247 ts = [x[1] for x in tlist]
248 if len(tp) == 1:
249 p = tp[0]
250 sign = ts[0]
251 else:
252 if opts.mode == "Max_probe":
253 p = min(tp)
254 sign = ts[tp.index(p)]
255 else: # guess median - too bad if even count
256 tp.sort()
257 ltp = len(tp)
258 ind = ltp/2 # yes, this is wrong for evens but what if sign wobbles?
259 if ltp % 2 == 1: # odd
260 ind += 1 # take the median
261 p = tp[ind]
262 sign = ts[ind]
263 if KEEPSELECTION:
264 self.comments.append('## for id=%s, got tp=%s, ts=%s, chose p=%f, sign =%f'\
265 % (id,str(tp),str(ts),p,sign))
266 if opts.input_ranks: # must be a rank file
267 res.append((id,'%f' % p))
268 else:
269 if p == 0.0:
270 p = 1e-99
271 try:
272 lp = -math.log10(p) # large positive if low p value
273 except ValueError:
274 lp = 0.0
275 if sign < 0:
276 lp = -lp # if negative, swap p to negative
277 res.append((id,'%f' % lp))
278 else: # no duplicates
279 for i,row in enumerate(dat):
280 (id,p,sign) = (row[0],float(row[1]),float(row[2]))
281 if opts.input_ranks: # must be a rank file
282 res.append((id,'%f' % p))
283 else:
284 if p == 0.0:
285 p = 1e-99
286 try:
287 lp = -math.log10(p) # large positive if low p value
288 except ValueError:
289 lp = 0.0
290 if sign < 0:
291 lp = -lp # if negative, swap p to negative
292 res.append((id,'%f' % lp))
293 else:
294 for i,row in enumerate(dat):
295 (id,p,sign) = (row[0],float(row[1]),float(row[2]))
296 if opts.input_ranks: # must be a rank file
297 res.append((id,'%f' % p))
298 else:
299 if p == 0.0:
300 p = 1e-99
301 try:
302 lp = -math.log10(p) # large positive if low p value
303 except ValueError:
304 lp = 0.0
305 if sign < 0:
306 lp = -lp # if negative, swap p to negative
307 res.append((id,'%f' % lp))
308 len1 = len(ids)
309 len2 = len(ranks)
310 delta = len1 - len2
311 if delta <> 0:
312 print >> sys.stdout,'NOTE: %d of %d rank input file %s rows deleted - dup, null or NA IDs, pvals or logFCs' % (delta,len1,infname)
313 ranks = [(float(x[1]),x) for x in ranks] # decorate
314 ranks.sort()
315 ranks.reverse()
316 ranks = [x[1] for x in ranks] # undecorate
317 if opts.chip == '': # if mouse - need HUGO
318 ranks = [[x[0].upper(),x[1]] for x in ranks]
319 print >> sys.stdout, '## Fixed any lower case - now have',','.join([x[0] for x in ranks[:5]])
320 ranks = ['\t'.join(x) for x in ranks]
321 if len(ranks) < 2:
322 print >> sys.stderr,'Input %s has 1 or less rows with two tab delimited fields - please check the tool documentation' % infname
323 sys.exit(1)
324 print '### opening %s and writing %s' % (fakeRanks,str(ranks[:10]))
325 rclean = open(fakeRanks,'w')
326 rclean.write('contig\tscore\n')
327 rclean.write('\n'.join(ranks))
328 rclean.write('\n')
329 rclean.close()
330 cl = []
331 a = cl.append
332 a('java -Xmx6G -cp')
333 a(opts.gsea_jar)
334 a('xtools.gsea.GseaPreranked')
335 a('-gmx %s' % fakeGMT) # ensure .gmt extension as required by GSEA - gene sets to use
336 a('-gui false') # use preranked file mode and no gui
337 a('-make_sets true -rnd_seed timestamp') # more things from the GUI command line display
338 a('-norm meandiv -zip_report false -scoring_scheme weighted') # ? need to set these?
339 a('-rnk %s' % fakeRanks) # input ranks file symbol (the chip file is the crosswalk for ids in first column)
340 a('-out %s' % opts.output_dir)
341 a('-set_max %s' % opts.setMax)
342 a('-set_min %s' % opts.setMin)
343 a('-mode %s' % opts.mode)
344 if opts.chip > '':
345 #a('-chip %s -collapse true -include_only_symbols true' % opts.chip)
346 a('-chip %s -collapse true' % opts.chip)
347 else:
348 a("-collapse false") # needed if no chip
349 a('-nperm %s' % opts.nPerm)
350 a('-rpt_label %s' % opts.title)
351 a('-plot_top_x %s' % opts.plotTop)
352 self.cl = cl
353 self.comments.append('## GSEA command line:')
354 self.comments.append(' '.join(self.cl))
355 self.fakeRanks = fakeRanks
356 self.fakeGMT = fakeGMT
357
358 def grepIds(self):
359 """
360 """
361 found = []
362 allids = open(self.opts.input_ranks,'r').readlines()
363 allids = [x.strip().split() for x in allids]
364 allids = [x[0] for x in allids] # list of ids
365 gmtpath = os.path.split(self.opts.fakeGMT)[0] # get path to all chip
366
367 def run(self):
368 """
369
370 """
371 tlog = os.path.join(self.opts.output_dir,"gsea_runner.log")
372 sto = open(tlog,'w')
373 x = subprocess.Popen(' '.join(self.cl),shell=True,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
374 retval = x.wait()
375 sto.close()
376 d = glob.glob(os.path.join(self.opts.output_dir,'%s*' % self.opts.title))
377 if len(d) > 0:
378 fix_subdir(d[0],self.opts.output_dir)
379 htmlfname = os.path.join(self.opts.output_dir,'index.html')
380 try:
381 html = open(htmlfname,'r').readlines()
382 html = [x.strip() for x in html if len(x.strip()) > 0]
383 if len(self.comments) > 0:
384 s = ['<pre>']
385 s += self.comments
386 s.append('</pre>')
387 try:
388 i = html.index('<div id="footer">')
389 except:
390 i = len(html) - 7 # fudge
391 html = html[:i] + s + html[i:]
392 except:
393 html = []
394 htmlhead = '<html><head></head><body>'
395 html.append('## Galaxy GSEA wrapper failure')
396 html.append('## Unable to find index.html in %s - listdir=%s' % (d,' '.join(os.listdir(self.opts.output_dir))))
397 html.append('## Command line was %s' % (' '.join(self.cl)))
398 html.append('## commonly caused by mismatched ID/chip selection')
399 glog = open(os.path.join(self.opts.output_dir,'gsea_runner.log'),'r').readlines()
400 html.append('## gsea_runner.log=%s' % '\n'.join(glog))
401 #tryme = self.grepIds()
402 retval = 1
403 print >> sys.stderr,'\n'.join(html)
404 html = ['%s<br/>' % x for x in html]
405 html.insert(0,htmlhead)
406 html.append('</body></html>')
407 htmlf = file(self.opts.outhtml,'w')
408 htmlf.write('\n'.join(html))
409 htmlf.write('\n')
410 htmlf.close()
411 os.unlink(self.fakeRanks)
412 os.unlink(self.fakeGMT)
413 if opts.outtab_neg:
414 tabs = glob.glob(os.path.join(opts.output_dir,"gsea_report_for_*.xls"))
415 if len(tabs) > 0:
416 for tabi,t in enumerate(tabs):
417 tkind = os.path.basename(t).split('_')[4].lower()
418 if tkind == 'neg':
419 outtab = opts.outtab_neg
420 elif tkind == 'pos':
421 outtab = opts.outtab_pos
422 else:
423 print >> sys.stderr, '## tab file matched %s which is not "neg" or "pos" in 4th segment %s' % (t,tkind)
424 sys.exit()
425 content = open(t).readlines()
426 tabf = open(outtab,'w')
427 tabf.write(''.join(content))
428 tabf.close()
429 else:
430 print >> sys.stdout, 'Odd, maketab = %s but no matches - tabs = %s' % (makeTab,tabs)
431 return retval
432
433
434 if __name__ == "__main__":
435 """
436 called as:
437 <command interpreter="python">rgGSEA.py --input_ranks "$input1" --outhtml "$html_file"
438 --setMax "$setMax" --setMin "$setMin" --nPerm "$nPerm" --plotTop "$plotTop" --gsea_jar "$GALAXY_DATA_INDEX_DIR/shared/jars/gsea2-2.07.jar"
439 --output_dir "$html_file.files_path" --use_gmt ""${use_gmt.fields.path}"" --chip "${use_chip.fields.path}"
440 </command>
441 """
442 op = optparse.OptionParser()
443 a = op.add_option
444 a('--input_ranks',default=None)
445 a('--input_tab',default=None)
446 a('--input_name',default=None)
447 a('--use_gmt',default=None)
448 a('--history_gmt',default=None)
449 a('--builtin_gmt',default=None)
450 a('--history_gmt_name',default=None)
451 a('--setMax',default="500")
452 a('--setMin',default="15")
453 a('--nPerm',default="1000")
454 a('--title',default="GSEA report")
455 a('--chip',default='')
456 a('--plotTop',default='20')
457 a('--outhtml',default=None)
458 a('--makeTab',default=None)
459 a('--output_dir',default=None)
460 a('--outtab_neg',default=None)
461 a('--outtab_pos',default=None)
462 a('--adjpvalcol',default=None)
463 a('--signcol',default=None)
464 a('--idcol',default=None)
465 a('--mode',default='Max_probe')
466 a('-j','--gsea_jar',default='/usr/local/bin/gsea2-2.07.jar')
467 opts, args = op.parse_args()
468 assert os.path.isfile(opts.gsea_jar),'## GSEA runner unable to find supplied gsea java desktop executable file %s' % opts.gsea_jar
469 if opts.input_ranks:
470 inpf = opts.input_ranks
471 else:
472 inpf = opts.input_tab
473 assert opts.idcol <> None, '## GSEA runner needs an id column if a tabular file provided'
474 assert opts.signcol <> None, '## GSEA runner needs a sign column if a tabular file provided'
475 assert opts.adjpvalcol <> None, '## GSEA runner needs an adjusted p value column if a tabular file provided'
476 assert os.path.isfile(inpf),'## GSEA runner unable to open supplied input file %s' % inpf
477 if opts.chip > '':
478 assert os.path.isfile(opts.chip),'## GSEA runner unable to open supplied chip file %s' % opts.chip
479 some = None
480 if opts.history_gmt <> None:
481 some = 1
482 assert os.path.isfile(opts.history_gmt),'## GSEA runner unable to open supplied history gene set matrix (.gmt) file %s' % opts.history_gmt
483 if opts.builtin_gmt <> None:
484 some = 1
485 assert os.path.isfile(opts.builtin_gmt),'## GSEA runner unable to open supplied history gene set matrix (.gmt) file %s' % opts.builtin_gmt
486 assert some, '## GSEA runner needs a gene set matrix file - none chosen?'
487 opts.title = re.sub('[^a-zA-Z0-9_]+', '', opts.title)
488 myName=os.path.split(sys.argv[0])[-1]
489 gse = gsea_wrapper(myName, opts=opts)
490 retcode = gse.run()
491 if retcode <> 0:
492 sys.exit(retcode) # indicate failure to job runner
493
494