comparison picard_wrapper.py @ 124:5a39cfd995b3 draft

Uploaded
author devteam
date Wed, 26 Feb 2014 00:28:03 -0500
parents
children
comparison
equal deleted inserted replaced
123:b23e39dbfe30 124:5a39cfd995b3
1 #!/usr/bin/env python
2 """
3 Originally written by Kelly Vincent
4 pretty output and additional picard wrappers by Ross Lazarus for rgenetics
5 Runs all available wrapped Picard tools.
6 usage: picard_wrapper.py [options]
7 code Ross wrote licensed under the LGPL
8 see http://www.gnu.org/copyleft/lesser.html
9 """
10
11 import optparse, os, sys, subprocess, tempfile, shutil, time, logging
12
13 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
14 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
15 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
16 <head>
17 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
18 <meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" />
19 <title></title>
20 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
21 </head>
22 <body>
23 <div class="document">
24 """
25 galhtmlattr = """Galaxy tool %s run at %s</b><br/>"""
26 galhtmlpostfix = """</div></body></html>\n"""
27
28
29 def stop_err( msg ):
30 sys.stderr.write( '%s\n' % msg )
31 sys.exit()
32
33
34 def timenow():
35 """return current time as a string
36 """
37 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
38
39
40 class PicardBase():
41 """
42 simple base class with some utilities for Picard
43 adapted and merged with Kelly Vincent's code april 2011 Ross
44 lots of changes...
45 """
46
47 def __init__(self, opts=None,arg0=None):
48 """ common stuff needed at init for a picard tool
49 """
50 assert opts <> None, 'PicardBase needs opts at init'
51 self.opts = opts
52 if self.opts.outdir == None:
53 self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
54 assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
55 self.picname = self.baseName(opts.jar)
56 if self.picname.startswith('picard'):
57 self.picname = opts.picard_cmd # special case for some tools like replaceheader?
58 self.progname = self.baseName(arg0)
59 self.version = '0.002'
60 self.delme = [] # list of files to destroy
61 self.title = opts.title
62 self.inputfile = opts.input
63 try:
64 os.makedirs(opts.outdir)
65 except:
66 pass
67 try:
68 os.makedirs(opts.tmpdir)
69 except:
70 pass
71 self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
72 self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
73 self.setLogging(logfname=self.log_filename)
74
75 def baseName(self,name=None):
76 return os.path.splitext(os.path.basename(name))[0]
77
78 def setLogging(self,logfname="picard_wrapper.log"):
79 """setup a logger
80 """
81 logging.basicConfig(level=logging.INFO,
82 filename=logfname,
83 filemode='a')
84
85
86 def readLarge(self,fname=None):
87 """ read a potentially huge file.
88 """
89 try:
90 # get stderr, allowing for case where it's very large
91 tmp = open( fname, 'rb' )
92 s = ''
93 buffsize = 1048576
94 try:
95 while True:
96 more = tmp.read( buffsize )
97 if len(more) > 0:
98 s += more
99 else:
100 break
101 except OverflowError:
102 pass
103 tmp.close()
104 except Exception, e:
105 stop_err( 'Read Large Exception : %s' % str( e ) )
106 return s
107
108 def runCL(self,cl=None,output_dir=None):
109 """ construct and run a command line
110 we have galaxy's temp path as opt.temp_dir so don't really need isolation
111 sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
112 """
113 assert cl <> None, 'PicardBase runCL needs a command line as cl'
114 if output_dir == None:
115 output_dir = self.opts.outdir
116 if type(cl) == type([]):
117 cl = ' '.join(cl)
118 fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt')
119 tlf = open(templog,'wb')
120 fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt')
121 tef = open(temperr,'wb')
122 process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
123 rval = process.wait()
124 tlf.close()
125 tef.close()
126 stderrs = self.readLarge(temperr)
127 stdouts = self.readLarge(templog)
128 if rval > 0:
129 s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
130 stdouts = '%s\n%s' % (stdouts,stderrs)
131 else:
132 s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
133 logging.info(s)
134 os.unlink(templog) # always
135 os.unlink(temperr) # always
136 return s, stdouts, rval # sometimes s is an output
137
138 def runPic(self, jar, cl):
139 """
140 cl should be everything after the jar file name in the command
141 """
142 runme = ['java -Xmx%s' % self.opts.maxjheap]
143 runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
144 runme.append('-jar %s' % jar)
145 runme += cl
146
147 print runme,self.opts.outdir
148
149 s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
150 return stdouts,rval
151
152 def samToBam(self,infile=None,outdir=None):
153 """
154 use samtools view to convert sam to bam
155 """
156 fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
157 cl = ['samtools view -h -b -S -o ',tempbam,infile]
158 tlog,stdouts,rval = self.runCL(cl,outdir)
159 return tlog,tempbam,rval
160
161 def sortSam(self, infile=None,outfile=None,outdir=None):
162 """
163 """
164 print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
165 cl = ['samtools sort',infile,outfile]
166 tlog,stdouts,rval = self.runCL(cl,outdir)
167 return tlog
168
169 def cleanup(self):
170 for fname in self.delme:
171 try:
172 os.unlink(fname)
173 except:
174 pass
175
176 def prettyPicout(self,transpose,maxrows):
177 """organize picard outpouts into a report html page
178 """
179 res = []
180 try:
181 r = open(self.metricsOut,'r').readlines()
182 except:
183 r = []
184 if len(r) > 0:
185 res.append('<b>Picard on line resources</b><ul>\n')
186 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
187 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
188 if transpose:
189 res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
190 else:
191 res.append('<b>Picard output</b><hr/>\n')
192 res.append('<table cellpadding="3" >\n')
193 dat = []
194 heads = []
195 lastr = len(r) - 1
196 # special case for estimate library complexity hist
197 thist = False
198 for i,row in enumerate(r):
199 if row.strip() > '':
200 srow = row.split('\t')
201 if row.startswith('#'):
202 heads.append(row.strip()) # want strings
203 else:
204 dat.append(srow) # want lists
205 if row.startswith('## HISTOGRAM'):
206 thist = True
207 if len(heads) > 0:
208 hres = ['<tr class="d%d"><td colspan="2">%s</td></tr>' % (i % 2,x) for i,x in enumerate(heads)]
209 res += hres
210 heads = []
211 if len(dat) > 0:
212 if transpose and not thist:
213 tdat = map(None,*dat) # transpose an arbitrary list of lists
214 tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
215 else:
216 tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
217 tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
218 res += tdat
219 dat = []
220 res.append('</table>\n')
221 return res
222
223 def fixPicardOutputs(self,transpose,maxloglines):
224 """
225 picard produces long hard to read tab header files
226 make them available but present them transposed for readability
227 """
228 logging.shutdown()
229 self.cleanup() # remove temp files stored in delme
230 rstyle="""<style type="text/css">
231 tr.d0 td {background-color: oldlace; color: black;}
232 tr.d1 td {background-color: aliceblue; color: black;}
233 </style>"""
234 res = [rstyle,]
235 res.append(galhtmlprefix % self.progname)
236 res.append(galhtmlattr % (self.picname,timenow()))
237 flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
238 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
239 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
240 for p in pdflist:
241 pbase = os.path.splitext(p)[0] # removes .pdf
242 imghref = '%s.jpg' % pbase
243 mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking!
244 if mimghref in flist:
245 imghref=mimghref # only one for thumbnail...it's a multi page pdf
246 res.append('<table cellpadding="10"><tr><td>\n')
247 res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
248 res.append('</tr></td></table>\n')
249 if len(flist) > 0:
250 res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
251 res.append('<table>\n')
252 for i,f in enumerate(flist):
253 fn = os.path.split(f)[-1]
254 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
255 res.append('</table><p/>\n')
256 pres = self.prettyPicout(transpose,maxloglines)
257 if len(pres) > 0:
258 res += pres
259 l = open(self.log_filename,'r').readlines()
260 llen = len(l)
261 if llen > 0:
262 res.append('<b>Picard Tool Run Log</b><hr/>\n')
263 rlog = ['<pre>',]
264 if llen > maxloglines:
265 n = min(50,int(maxloglines/2))
266 rlog += l[:n]
267 rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
268 rlog += l[-n:]
269 else:
270 rlog += l
271 rlog.append('</pre>')
272 if llen > maxloglines:
273 rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
274 res += rlog
275 else:
276 res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
277 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
278 res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
279 res.append(galhtmlpostfix)
280 outf = open(self.opts.htmlout,'w')
281 outf.write(''.join(res))
282 outf.write('\n')
283 outf.close()
284
285 def makePicInterval(self,inbed=None,outf=None):
286 """
287 picard wants bait and target files to have the same header length as the incoming bam/sam
288 a meaningful (ie accurate) representation will fail because of this - so this hack
289 it would be far better to be able to supply the original bed untouched
290 Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
291 """
292 assert inbed <> None
293 bed = open(inbed,'r').readlines()
294 sbed = [x.split('\t') for x in bed] # lengths MUST be 5
295 lens = [len(x) for x in sbed]
296 strands = [x[3] for x in sbed if not x[3] in ['+','-']]
297 maxl = max(lens)
298 minl = min(lens)
299 e = []
300 if maxl <> minl:
301 e.append("## Input error: Inconsistent field count in %s - please read the documentation on bait/target format requirements, fix and try again" % inbed)
302 if maxl <> 5:
303 e.append("## Input error: %d fields found in %s, 5 required - please read the warning and documentation on bait/target format requirements, fix and try again" % (maxl,inbed))
304 if len(strands) > 0:
305 e.append("## Input error: Fourth column in %s is not the required strand (+ or -) - please read the warning and documentation on bait/target format requirements, fix and try again" % (inbed))
306 if len(e) > 0: # write to stderr and quit
307 print >> sys.stderr, '\n'.join(e)
308 sys.exit(1)
309 thead = os.path.join(self.opts.outdir,'tempSamHead.txt')
310 if self.opts.datatype == 'sam':
311 cl = ['samtools view -H -S',self.opts.input,'>',thead]
312 else:
313 cl = ['samtools view -H',self.opts.input,'>',thead]
314 self.runCL(cl=cl,output_dir=self.opts.outdir)
315 head = open(thead,'r').readlines()
316 s = '## got %d rows of header\n' % (len(head))
317 logging.info(s)
318 o = open(outf,'w')
319 o.write(''.join(head))
320 o.write(''.join(bed))
321 o.close()
322 return outf
323
324 def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
325 """
326 interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
327 Do the work of removing all the error sequences
328 pysam is cool
329 infile = pysam.Samfile( "-", "r" )
330 outfile = pysam.Samfile( "-", "w", template = infile )
331 for s in infile: outfile.write(s)
332
333 errors from ValidateSameFile.jar look like
334 WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
335 ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
336 ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
337
338 """
339 assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
340 assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
341 removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
342 remDict = dict(zip(removeNames,range(len(removeNames))))
343 infile = pysam.Samfile(insam,'rb')
344 info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict))
345 if len(removeNames) > 0:
346 outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file
347 i = 0
348 j = 0
349 for row in infile:
350 dropme = remDict.get(row.qname,None) # keep if None
351 if not dropme:
352 outfile.write(row)
353 j += 1
354 else: # discard
355 i += 1
356 info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam))
357 outfile.close()
358 infile.close()
359 else: # we really want a nullop or a simple pointer copy
360 infile.close()
361 if newsam:
362 shutil.copy(insam,newsam)
363 logging.info(info)
364
365
366
367 def __main__():
368 doFix = False # tools returning htmlfile don't need this
369 doTranspose = True # default
370 maxloglines = 100 # default
371 #Parse Command Line
372 op = optparse.OptionParser()
373 # All tools
374 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
375 op.add_option('-e', '--inputext', default=None)
376 op.add_option('-o', '--output', default=None)
377 op.add_option('-n', '--title', default="Pick a Picard Tool")
378 op.add_option('-t', '--htmlout', default=None)
379 op.add_option('-d', '--outdir', default=None)
380 op.add_option('-x', '--maxjheap', default='3000m')
381 op.add_option('-b', '--bisulphite', default='false')
382 op.add_option('-s', '--sortorder', default='query')
383 op.add_option('','--tmpdir', default='/tmp')
384 op.add_option('-j','--jar',default='')
385 op.add_option('','--picard-cmd',default=None)
386 # Many tools
387 op.add_option( '', '--output-format', dest='output_format', help='Output format' )
388 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
389 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
390 # CreateSequenceDictionary
391 op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None )
392 op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' )
393 op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' )
394 op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' )
395 # MarkDuplicates
396 op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' )
397 op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' )
398 # CollectInsertSizeMetrics
399 op.add_option('', '--taillimit', default="0")
400 op.add_option('', '--histwidth', default="0")
401 op.add_option('', '--minpct', default="0.01")
402 op.add_option('', '--malevel', default='')
403 op.add_option('', '--deviations', default="0.0")
404 # CollectAlignmentSummaryMetrics
405 op.add_option('', '--maxinsert', default="20")
406 op.add_option('', '--adaptors', default='')
407 # FixMateInformation and validate
408 # CollectGcBiasMetrics
409 op.add_option('', '--windowsize', default='100')
410 op.add_option('', '--mingenomefrac', default='0.00001')
411 # AddOrReplaceReadGroups
412 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
413 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
414 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
415 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
416 op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' )
417 op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' )
418 op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' )
419 op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' )
420 # ReorderSam
421 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
422 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
423 # ReplaceSamHeader
424 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
425
426 op.add_option('','--assumesorted', default='true')
427 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
428 #estimatelibrarycomplexity
429 op.add_option('','--minid', default="5")
430 op.add_option('','--maxdiff', default="0.03")
431 op.add_option('','--minmeanq', default="20")
432 #hsmetrics
433 op.add_option('','--baitbed', default=None)
434 op.add_option('','--targetbed', default=None)
435 #validate
436 op.add_option('','--ignoreflags', action='append', type="string")
437 op.add_option('','--maxerrors', default=None)
438 op.add_option('','--datatype', default=None)
439 op.add_option('','--bamout', default=None)
440 op.add_option('','--samout', default=None)
441 #downsample
442 op.add_option('','--probability', default="1")
443 op.add_option('','--seed', default="1")
444 #meanqualitybycycle
445 op.add_option('','--pfreadsonly', default='false')
446 op.add_option('','--alignedreadsonly', default='false')
447
448 opts, args = op.parse_args()
449 opts.sortme = opts.assumesorted == 'false'
450 assert opts.input <> None
451 # need to add
452 # instance that does all the work
453 pic = PicardBase(opts,sys.argv[0])
454
455 tmp_dir = opts.outdir
456 haveTempout = False # we use this where sam output is an option
457 rval = 0
458 stdouts = 'Not run yet'
459 # set ref and dict files to use (create if necessary)
460 ref_file_name = opts.ref
461 if opts.ref_file <> None:
462 csd = 'CreateSequenceDictionary'
463 realjarpath = os.path.split(opts.jar)[0]
464 jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq
465 tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname)
466 ref_file_name = '%s.fasta' % tmp_ref_name
467 # build dict
468 dict_file_name = '%s.dict' % tmp_ref_name
469 os.symlink( opts.ref_file, ref_file_name )
470 cl = ['REFERENCE=%s' % ref_file_name]
471 cl.append('OUTPUT=%s' % dict_file_name)
472 cl.append('URI=%s' % os.path.basename( opts.ref_file ))
473 cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names)
474 if opts.species_name:
475 cl.append('SPECIES=%s' % opts.species_name)
476 if opts.build_name:
477 cl.append('GENOME_ASSEMBLY=%s' % opts.build_name)
478 pic.delme.append(dict_file_name)
479 pic.delme.append(ref_file_name)
480 pic.delme.append(tmp_ref_name)
481 stdouts,rval = pic.runPic(jarpath, cl)
482 # run relevant command(s)
483
484 # define temporary output
485 # if output is sam, it must have that extension, otherwise bam will be produced
486 # specify sam or bam file with extension
487 if opts.output_format == 'sam':
488 suff = '.sam'
489 else:
490 suff = ''
491 tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff )
492
493 cl = ['VALIDATION_STRINGENCY=LENIENT',]
494
495 if pic.picname == 'AddOrReplaceReadGroups':
496 # sort order to match Galaxy's default
497 cl.append('SORT_ORDER=coordinate')
498 # input
499 cl.append('INPUT=%s' % opts.input)
500 # outputs
501 cl.append('OUTPUT=%s' % tempout)
502 # required read groups
503 cl.append('RGLB="%s"' % opts.rg_library)
504 cl.append('RGPL="%s"' % opts.rg_platform)
505 cl.append('RGPU="%s"' % opts.rg_plat_unit)
506 cl.append('RGSM="%s"' % opts.rg_sample)
507 if opts.rg_id:
508 cl.append('RGID="%s"' % opts.rg_id)
509 # optional read groups
510 if opts.rg_seq_center:
511 cl.append('RGCN="%s"' % opts.rg_seq_center)
512 if opts.rg_desc:
513 cl.append('RGDS="%s"' % opts.rg_desc)
514 stdouts,rval = pic.runPic(opts.jar, cl)
515 haveTempout = True
516
517 elif pic.picname == 'BamIndexStats':
518 tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir )
519 tmp_bam_name = '%s.bam' % tmp_name
520 tmp_bai_name = '%s.bai' % tmp_bam_name
521 os.symlink( opts.input, tmp_bam_name )
522 os.symlink( opts.bai_file, tmp_bai_name )
523 cl.append('INPUT=%s' % ( tmp_bam_name ))
524 pic.delme.append(tmp_bam_name)
525 pic.delme.append(tmp_bai_name)
526 pic.delme.append(tmp_name)
527 stdouts,rval = pic.runPic( opts.jar, cl )
528 f = open(pic.metricsOut,'a')
529 f.write(stdouts) # got this on stdout from runCl
530 f.write('\n')
531 f.close()
532 doTranspose = False # but not transposed
533
534 elif pic.picname == 'EstimateLibraryComplexity':
535 cl.append('I=%s' % opts.input)
536 cl.append('O=%s' % pic.metricsOut)
537 if float(opts.minid) > 0:
538 cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid)
539 if float(opts.maxdiff) > 0.0:
540 cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff)
541 if float(opts.minmeanq) > 0:
542 cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq)
543 if opts.readregex > '':
544 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
545 if float(opts.optdupdist) > 0:
546 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
547 stdouts,rval = pic.runPic(opts.jar, cl)
548
549 elif pic.picname == 'CollectAlignmentSummaryMetrics':
550 # Why do we do this fakefasta thing?
551 # Because we need NO fai to be available or picard barfs unless it matches the input data.
552 # why? Dunno Seems to work without complaining if the .bai file is AWOL....
553 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
554 try:
555 os.symlink(ref_file_name,fakefasta)
556 except:
557 s = '## unable to symlink %s to %s - different devices? Will shutil.copy'
558 info = s
559 shutil.copy(ref_file_name,fakefasta)
560 pic.delme.append(fakefasta)
561 cl.append('ASSUME_SORTED=true')
562 adaptlist = opts.adaptors.split(',')
563 adaptorseqs = ['ADAPTER_SEQUENCE=%s' % x for x in adaptlist]
564 cl += adaptorseqs
565 cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite)
566 cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert)
567 cl.append('OUTPUT=%s' % pic.metricsOut)
568 cl.append('R=%s' % fakefasta)
569 cl.append('TMP_DIR=%s' % opts.tmpdir)
570 if not opts.assumesorted.lower() == 'true': # we need to sort input
571 sortedfile = '%s.sorted' % os.path.basename(opts.input)
572 if opts.datatype == 'sam': # need to work with a bam
573 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
574 pic.delme.append(tempbam)
575 try:
576 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
577 except:
578 print '## exception on sorting sam file %s' % opts.input
579 else: # is already bam
580 try:
581 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
582 except : # bug - [bam_sort_core] not being ignored - TODO fixme
583 print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input)
584 cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
585 pic.delme.append(os.path.join(opts.outdir,sortedfile))
586 else:
587 cl.append('INPUT=%s' % os.path.abspath(opts.input))
588 stdouts,rval = pic.runPic(opts.jar, cl)
589
590
591 elif pic.picname == 'CollectGcBiasMetrics':
592 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
593 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
594 # why? Dunno
595 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
596 try:
597 os.symlink(ref_file_name,fakefasta)
598 except:
599 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
600 info = s
601 shutil.copy(ref_file_name,fakefasta)
602 pic.delme.append(fakefasta)
603 x = 'rgPicardGCBiasMetrics'
604 pdfname = '%s.pdf' % x
605 jpgname = '%s.jpg' % x
606 tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out')
607 temppdf = os.path.join(opts.outdir,pdfname)
608 cl.append('R=%s' % fakefasta)
609 cl.append('WINDOW_SIZE=%s' % opts.windowsize)
610 cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac)
611 cl.append('INPUT=%s' % opts.input)
612 cl.append('OUTPUT=%s' % tempout)
613 cl.append('TMP_DIR=%s' % opts.tmpdir)
614 cl.append('CHART_OUTPUT=%s' % temppdf)
615 cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut)
616 stdouts,rval = pic.runPic(opts.jar, cl)
617 if os.path.isfile(temppdf):
618 cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find
619 s,stdouts,rval = pic.runCL(cl=cl2,output_dir=opts.outdir)
620 else:
621 s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf
622 lf = open(pic.log_filename,'a')
623 lf.write(s)
624 lf.write('\n')
625 lf.close()
626
627 elif pic.picname == 'CollectInsertSizeMetrics':
628 """ <command interpreter="python">
629 picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
630 --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
631 -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
632 </command>
633 """
634 isPDF = 'InsertSizeHist.pdf'
635 pdfpath = os.path.join(opts.outdir,isPDF)
636 histpdf = 'InsertSizeHist.pdf'
637 cl.append('I=%s' % opts.input)
638 cl.append('O=%s' % pic.metricsOut)
639 cl.append('HISTOGRAM_FILE=%s' % histpdf)
640 #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
641 # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
642 if opts.histwidth <> '0':
643 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
644 if float( opts.minpct) > 0.0:
645 cl.append('MINIMUM_PCT=%s' % opts.minpct)
646 if float(opts.deviations) > 0.0:
647 cl.append('DEVIATIONS=%s' % opts.deviations)
648 if opts.malevel:
649 malists = opts.malevel.split(',')
650 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
651 cl += malist
652 stdouts,rval = pic.runPic(opts.jar, cl)
653 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
654 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
655 pic.runCL(cl=cl2,output_dir=opts.outdir)
656 else:
657 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
658 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
659 s += 'so please double check that your input data really is paired-end NGS data.<br/>\n'
660 s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n<br/>'
661 logging.info(s)
662 if len(stdouts) > 0:
663 logging.info(stdouts)
664
665 elif pic.picname == 'MarkDuplicates':
666 # assume sorted even if header says otherwise
667 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
668 # input
669 cl.append('INPUT=%s' % opts.input)
670 # outputs
671 cl.append('OUTPUT=%s' % opts.output)
672 cl.append('METRICS_FILE=%s' % pic.metricsOut )
673 # remove or mark duplicates
674 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
675 # the regular expression to be used to parse reads in incoming SAM file
676 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
677 # maximum offset between two duplicate clusters
678 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
679 stdouts,rval = pic.runPic(opts.jar, cl)
680
681 elif pic.picname == 'FixMateInformation':
682 cl.append('I=%s' % opts.input)
683 cl.append('O=%s' % tempout)
684 cl.append('SORT_ORDER=%s' % opts.sortorder)
685 stdouts,rval = pic.runPic(opts.jar,cl)
686 haveTempout = True
687
688 elif pic.picname == 'ReorderSam':
689 # input
690 cl.append('INPUT=%s' % opts.input)
691 # output
692 cl.append('OUTPUT=%s' % tempout)
693 # reference
694 cl.append('REFERENCE=%s' % ref_file_name)
695 # incomplete dict concordance
696 if opts.allow_inc_dict_concord == 'true':
697 cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
698 # contig length discordance
699 if opts.allow_contig_len_discord == 'true':
700 cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
701 stdouts,rval = pic.runPic(opts.jar, cl)
702 haveTempout = True
703
704 elif pic.picname == 'ReplaceSamHeader':
705 cl.append('INPUT=%s' % opts.input)
706 cl.append('OUTPUT=%s' % tempout)
707 cl.append('HEADER=%s' % opts.header_file)
708 stdouts,rval = pic.runPic(opts.jar, cl)
709 haveTempout = True
710
711 elif pic.picname == 'CalculateHsMetrics':
712 maxloglines = 100
713 baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
714 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
715 baitf = pic.makePicInterval(opts.baitbed,baitfname)
716 if opts.targetbed == opts.baitbed: # same file sometimes
717 targetf = baitf
718 else:
719 targetf = pic.makePicInterval(opts.targetbed,targetfname)
720 cl.append('BAIT_INTERVALS=%s' % baitf)
721 cl.append('TARGET_INTERVALS=%s' % targetf)
722 cl.append('INPUT=%s' % os.path.abspath(opts.input))
723 cl.append('OUTPUT=%s' % pic.metricsOut)
724 cl.append('TMP_DIR=%s' % opts.tmpdir)
725 stdouts,rval = pic.runPic(opts.jar,cl)
726
727 elif pic.picname == 'ValidateSamFile':
728 import pysam
729 doTranspose = False
730 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
731 stf = open(pic.log_filename,'w')
732 tlog = None
733 if opts.datatype == 'sam': # need to work with a bam
734 tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
735 try:
736 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
737 except:
738 print '## exception on sorting sam file %s' % opts.input
739 else: # is already bam
740 try:
741 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
742 except: # bug - [bam_sort_core] not being ignored - TODO fixme
743 print '## exception on sorting bam file %s' % opts.input
744 if tlog:
745 print '##tlog=',tlog
746 stf.write(tlog)
747 stf.write('\n')
748 sortedfile = '%s.bam' % sortedfile # samtools does that
749 cl.append('O=%s' % pic.metricsOut)
750 cl.append('TMP_DIR=%s' % opts.tmpdir)
751 cl.append('I=%s' % sortedfile)
752 opts.maxerrors = '99999999'
753 cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
754 if opts.ignoreflags[0] <> 'None': # picard error values to ignore
755 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
756 cl.append(' '.join(igs))
757 if opts.bisulphite.lower() <> 'false':
758 cl.append('IS_BISULFITE_SEQUENCED=true')
759 if opts.ref <> None or opts.ref_file <> None:
760 cl.append('R=%s' % ref_file_name)
761 stdouts,rval = pic.runPic(opts.jar,cl)
762 if opts.datatype == 'sam':
763 pic.delme.append(tempbam)
764 newsam = opts.output
765 outformat = 'bam'
766 pe = open(pic.metricsOut,'r').readlines()
767 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
768 pic.delme.append(sortedfile) # not wanted
769 stf.close()
770 pic.cleanup()
771
772 ####liubo added CleanSam tool####
773 elif pic.picname == 'CleanSam':
774 # input
775 cl.append('INPUT=%s' % opts.input)
776 # output
777 cl.append('OUTPUT=%s' % tempout)
778 stdouts,rval = pic.runPic(opts.jar, cl)
779 haveTempout = True
780
781 elif pic.picname == 'SortSam':
782 cl.append('I=%s' % opts.input)
783 cl.append('O=%s' % tempout)
784 cl.append('SORT_ORDER=%s' % opts.sortorder)
785 stdouts,rval = pic.runPic(opts.jar,cl)
786 haveTempout = True
787
788 elif pic.picname == 'BuildBamIndex':
789 cl.append('I=%s' % opts.input)
790 cl.append('O=%s' % tempout)
791 stdouts,rval = pic.runPic(opts.jar,cl)
792 haveTempout = True
793
794 elif pic.picname == 'SamFormatConverter':
795 cl.append('INPUT=%s' % opts.input)
796 cl.append('OUTPUT=%s' % tempout)
797 pic.runPic(opts.jar, cl)
798 haveTempout = True
799 elif pic.picname == "DownsampleSam":
800 cl.append('I=%s' % opts.input)
801 mystring = opts.output
802 mystringsam = mystring + ".sam"
803 cl.append('O=%s' % mystringsam)
804 if float(opts.probability) > 0:
805 cl.append('PROBABILITY=%s' % opts.probability)
806 if float(opts.seed) > 0:
807 cl.append('RANDOM_SEED=%s' % opts.seed)
808 stdouts,rval = pic.runPic(opts.jar, cl)
809 myoutput = mystringsam.replace(".sam", "")
810 os.rename(mystringsam,myoutput)
811
812 elif pic.picname == 'MeanQualityByCycle':
813 isPDF = 'MeanQualityByCycle.pdf'
814 pdfpath = os.path.join(opts.outdir,isPDF)
815 histpdf = isPDF
816 cl.append('I=%s' % opts.input)
817 cl.append('O=%s' % pic.metricsOut)
818 cl.append('CHART_OUTPUT=%s' % histpdf)
819 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
820 cl.append('ALIGNED_READS_ONLY=%s' % (opts.alignedreadsonly))
821 cl.append('PF_READS_ONLY=%s' % (opts.pfreadsonly))
822 stdouts,rval = pic.runPic(opts.jar, cl)
823 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
824 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
825 pic.runCL(cl=cl2,output_dir=opts.outdir)
826 else:
827 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
828 logging.info(s)
829 if len(stdouts) > 0:
830 logging.info(stdouts)
831
832 else:
833 print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
834 sys.exit(1)
835 if haveTempout:
836 # Some Picard tools produced a potentially intermediate bam file.
837 # Either just move to final location or create sam
838 if os.path.exists(tempout):
839 shutil.move(tempout, os.path.abspath(opts.output))
840 if opts.htmlout <> None or doFix: # return a pretty html page
841 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
842 if rval <> 0:
843 print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
844 # signal failure
845 if __name__=="__main__": __main__()