macs21: macs21_wrapper.py comparison

comparison macs21_wrapper.py @ 1:d0986d2be693 draft

Substantial reimplementation of internals, also renamed id and version.

author	pjbriggs
date	Thu, 29 Jan 2015 11:11:21 -0500
parents	fdad0c8c0957
children	15889783e759

comparison

equal deleted inserted replaced

-:fdad0c8c0957
+:d0986d2be693
-#purpose: macs2 python wrapper
+#!/bin/env python
-#author: Ziru Zhou
+#
-#date: November, 2012
+# Galaxy wrapper to run MACS 2.1
+#
+# Completely rewritten from the original macs2 wrapped by Ziru Zhou
+# taken from http://toolshed.g2.bx.psu.edu/view/modencode-dcc/macs2
-import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip
+import sys
-from galaxy import eggs
+import os
-import pkg_resources
+import subprocess
-pkg_resources.require( "simplejson" )
+import tempfile
-import simplejson
+import shutil
-CHUNK_SIZE = 1024
+def move_file(working_dir,name,destination):
+"""Move a file 'name' from 'working_dir' to 'destination'
-#==========================================================================================
+"""
-#functions
+if destination is None:
-#==========================================================================================
+# Nothing to do
-def gunzip_cat_glob_path( glob_path, target_filename, delete = False ):
+return
-out = open( target_filename, 'wb' )
+source = os.path.join(working_dir,name)
-for filename in glob.glob( glob_path ):
+if os.path.exists(source):
-fh = gzip.open( filename, 'rb' )
+shutil.move(source,destination)
-while True:
-data = fh.read( CHUNK_SIZE )
-if data:
-out.write( data )
-else:
-break
-fh.close()
-if delete:
-os.unlink( filename )
-out.close()
-def xls_to_interval( xls_file, interval_file, header = None ):
+def convert_xls_to_interval(xls_file,interval_file,header=None):
-out = open( interval_file, 'wb' )
+"""Convert MACS XLS file to interval
+From the MACS readme: "Coordinates in XLS is 1-based which is different with
+BED format."
+However this function no longer performs any coordinate conversions, it
+simply ensures that any blank or non-data lines are commented out
+"""
+fp = open(interval_file,'wb')
 if header:
-out.write( '#%s\n' % header )
+fp.write('#%s\n' % header)
-wrote_header = False
+for line in open(xls_file):
-#From macs readme: Coordinates in XLS is 1-based which is different with BED format.
+# Keep all existing comment lines
-#PJB: updated to keep original 'start' coordinate i.e. do not convert to BED
+if line.startswith('#'):
-for line in open( xls_file ):
+fp.write(line)
-#keep all existing comment lines
-if line.startswith( '#' ):
-out.write( line )
-	#added for macs2 since there is an extra newline
-##elif line.startswith( '\n' ):
-##    out.write( line )
-elif not wrote_header:
-out.write( '#%s' % line )
-	    print line
-wrote_header = True
 else:
-fields = line.split( '\t' )
+# Split line into fields and test to see if
-if len( fields ) > 1:
+# the 'start' field is actually an integer
+fields = line.split('\t')
+if len(fields) > 1:
 try:
-# Try to convert 'start' to int and shift
+int(fields[1])
-##fields[1] = str( int( fields[1] ) - 1 )
-# PJB don't correct, just test to see if it's an integer
-int( fields[1] )
 except ValueError:
 # Integer conversion failed so comment out
 # "bad" line instead
 fields[0] = "#%s" % fields[0]
-out.write( '\t'.join( fields ) )
+fp.write( '\t'.join( fields ) )
-out.close()
+fp.close()
-#==========================================================================================
+if __name__ == "__main__":
-#main
-#==========================================================================================
-def main():
-#take in options file and output file names
-options = simplejson.load( open( sys.argv[1] ) )
-outputs = simplejson.load( open( sys.argv[2] ) )
-#=================================================================================
+# Echo the command line
-#parse options and execute macs2
+print ' '.join(sys.argv)
-#=================================================================================
-#default inputs that are in every major command
-experiment_name = '_'.join( options['experiment_name'].split() ) #save experiment name here, it will be used by macs for some file names
-cmdline = "macs2 %s -t %s" % ( options['command'], ",".join( options['input_chipseq'] ) )
-if options['input_control']:
-cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) )
-#=================================================================================
+# Initialise output files - values are set by reading from
-if (options['command'] == "callpeak"):
+# the command line supplied by the Galaxy wrapper
-	output_summits_bed = outputs['output_summits_bed_file']
+output_extra_html = None
-	output_extra_html = outputs['output_extra_file']
+output_extra_path = None
-	output_extra_path = outputs['output_extra_file_path']
+output_broadpeaks = None
-	output_peaks =  outputs['output_peaks_file']
+output_gappedpeaks = None
-	output_narrowpeaks = outputs['output_narrowpeaks_file']
+output_narrowpeaks = None
-	output_broadpeaks = outputs['output_broadpeaks_file']
+output_treat_pileup = None
-	output_gappedpeaks = outputs['output_gappedpeaks_file']
+output_lambda_bedgraph = None
-	output_xls_to_interval_peaks_file = outputs['output_xls_to_interval_peaks_file']
+output_xls_to_interval_peaks_file = None
-output_treat_pileup = outputs['output_treat_pileup_file']
+output_peaks = None
-output_lambda_bedgraph = outputs['output_lambda_bedgraph_file']
+output_bdgcmp = None
-cmdline = "%s --format='%s' --name='%s' --gsize='%s' --bw='%s'" % ( cmdline, options['format'], experiment_name, options['gsize'], options['bw'] )
+# Build the MACS 2.1 command line
-if (options['broad'] == 'broad'):
+# Initial arguments are always the same: command & input ChIP-seq file name
-cmdline = "%s --broad --broad-cutoff='%s'" % ( cmdline, options['broad_cutoff'] )
+cmdline = ["macs2 %s -t %s" % (sys.argv[1],sys.argv[2])]
-	if 'pvalue' in options:
-cmdline = "%s --pvalue='%s'" % ( cmdline, options['pvalue'] )
-	elif 'qvalue' in options:
-cmdline = "%s --qvalue='%s'" % ( cmdline, options['qvalue'] )
-cmdline = "%s --mfold %s %s %s %s %s %s --keep-dup %s" % (cmdline, options['mfoldlo'], options['mfoldhi'], options['nolambda'], options['bdg'], options['spmr'], options['call_summits'], options['keep_dup'] )
-	if 'nomodel' in options:
-	cmdline = "%s --nomodel --extsize='%s'" % ( cmdline, options['nomodel'] )
-#=================================================================================
-if (options['command'] == "bdgcmp"):
-	output_bdgcmp = outputs['output_bdgcmp_file']
-	cmdline = "%s -m %s -p %s -o bdgcmp_out.bdg" % ( cmdline, options['m'], options['pseudocount'] )
+# Process remaining args
-#=================================================================================
+for arg in sys.argv[3:]:
+if arg.startswith('--format='):
+# Convert format to uppercase
+format_ = arg.split('=')[1].upper()
+cmdline.append("--format=%s" % format_)
+elif arg.startswith('--name='):
+# Replace whitespace in name with underscores
+experiment_name = '_'.join(arg.split('=')[1].split())
+cmdline.append("--name=%s" % experiment_name)
+elif arg.startswith('--output-'):
+# Handle destinations for output files
+arg0,filen = arg.split('=')
+if arg0 == '--output-summits':
+output_summits = filen
+elif arg0 == '--output-extra-files':
+output_extra_html = filen
+elif  arg0 == '--output-extra-files-path':
+output_extra_path = filen
+elif  arg0 == '--output-broadpeaks':
+output_broadpeaks = filen
+elif  arg0 == '--output-gappedpeaks':
+output_gappedpeaks = filen
+elif  arg0 == '--output-narrowpeaks':
+output_narrowpeaks = filen
+elif  arg0 == '--output-pileup':
+output_treat_pileup = filen
+elif  arg0 == '--output-lambda-bedgraph':
+output_lambda_bedgraph = filen
+elif  arg0 == '--output-xls-to-interval':
+output_xls_to_interval_peaks_file = filen
+elif  arg0 == '--output-peaks':
+output_peaks = filen
+else:
+# Pass remaining args directly to MACS
+# command line
+cmdline.append(arg)
+cmdline = ' '.join(cmdline)
+print "Generated command line:\n%s" % cmdline
-tmp_dir = tempfile.mkdtemp() #macs makes very messy output, need to contain it into a temp dir, then provide to user
+# Execute MACS2
-stderr_name = tempfile.NamedTemporaryFile().name # redirect stderr here, macs provides lots of info via stderr, make it into a report
+#
-proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) )
+# Make a working directory
+working_dir = tempfile.mkdtemp()
+#
+# Collect stderr in a file for reporting later
+stderr_filen = tempfile.NamedTemporaryFile().name
+#
+# Run MACS2
+proc = subprocess.Popen(args=cmdline,shell=True,cwd=working_dir,
+stderr=open(stderr_filen,'wb'))
 proc.wait()
-#We don't want to set tool run to error state if only warnings or info, e.g. mfold could be decreased to improve model, but let user view macs log
-#Do not terminate if error code, allow dataset (e.g. log) creation and cleanup
-if proc.returncode:
-stderr_f = open( stderr_name )
-while True:
-chunk = stderr_f.read( CHUNK_SIZE )
-if not chunk:
-stderr_f.close()
-break
-sys.stderr.write( chunk )
-#=================================================================================
+# Run R script to create PDF from model script
-#copy files created by macs2 to appripriate directory with the provided names
+if os.path.exists(os.path.join(working_dir,"%s_model.r" % experiment_name)):
-#=================================================================================
+cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % \
+(experiment_name, experiment_name)
+proc = subprocess.Popen(args=cmdline,shell=True,cwd=working_dir)
+proc.wait()
-#=================================================================================
+# Convert XLS to interval, if requested
-#move files generated by callpeak command
+if output_xls_to_interval_peaks_file is not None:
-if (options['command'] == "callpeak"):
+peaks_xls_file = os.path.join(working_dir,'%s_peaks.xls' % experiment_name )
-	#run R to create pdf from model script
+if os.path.exists(peaks_xls_file):
-	if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ):
+convert_xls_to_interval(peaks_xls_file,output_xls_to_interval_peaks_file,
-	   	cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name )
+header='peaks file')
-	   	proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir )
-		proc.wait()
-	#move bed out to proper output file
+# Move MACS2 output files from working dir to their final destinations
-	created_bed_name =  os.path.join( tmp_dir, "%s_summits.bed" % experiment_name )
+move_file(working_dir,"%s_summits.bed" % experiment_name,output_summits)
-	if os.path.exists( created_bed_name ):
+move_file(working_dir,"%s_peaks.xls" % experiment_name,output_peaks)
-	shutil.move( created_bed_name, output_summits_bed )
+move_file(working_dir,"%s_peaks.narrowPeak" % experiment_name,output_narrowpeaks)
+move_file(working_dir,"%s_peaks.broadPeak" % experiment_name,output_broadpeaks)
+move_file(working_dir,"%s_peaks.gappedPeak" % experiment_name,output_gappedpeaks)
+move_file(working_dir,"%s_treat_pileup.bdg" % experiment_name,output_treat_pileup)
+move_file(working_dir,"%s_control_lambda.bdg" % experiment_name,output_lambda_bedgraph)
+move_file(working_dir,"bdgcmp_out.bdg",output_bdgcmp)
-	#OICR peak_xls file
+# Move remaining file to the 'extra files' path and link from the HTML
-	created_peak_xls_file =  os.path.join( tmp_dir, "%s_peaks.xls" % experiment_name )
+# file to allow user to access them from within Galaxy
-	if os.path.exists( created_peak_xls_file ):
+html_file = open(output_extra_html,'wb')
-	# shutil.copy( created_peak_xls_file, os.path.join ( "/mnt/galaxyData/tmp/", "%s_peaks.xls" % ( os.path.basename(output_extra_path) )))
+html_file.write('<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name)
-		shutil.copyfile( created_peak_xls_file, output_peaks )
+# Make the 'extra files' directory
+os.mkdir(output_extra_path)
+# Move the files
+for filen in sorted(os.listdir(working_dir)):
+shutil.move(os.path.join(working_dir,filen),
+os.path.join(output_extra_path,filen))
+html_file.write( '<li><a href="%s">%s</a></li>\n' % (filen,filen))
+# All files moved, close out HTML
+html_file.write( '</ul></p>\n' )
+# Append any stderr output
+html_file.write('<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' %
+open(stderr_filen,'rb').read())
+html_file.write('</body></html>\n')
+html_file.close()
-	#peaks.encodepeaks (narrowpeaks) file
+# Clean up the working directory and files
-	created_narrowpeak_file = os.path.join (tmp_dir, "%s_peaks.narrowPeak" % experiment_name )
+os.unlink(stderr_filen)
-	if os.path.exists( created_narrowpeak_file ):
+os.rmdir(working_dir)
-		shutil.move (created_narrowpeak_file, output_narrowpeaks )
-#peaks.broadpeaks file
-	created_broadpeak_file = os.path.join (tmp_dir, "%s_peaks.broadPeak" % experiment_name )
-	if os.path.exists( created_broadpeak_file ):
-		shutil.move (created_broadpeak_file, output_broadpeaks )
-#peaks.gappedpeaks file
-	created_gappedpeak_file = os.path.join (tmp_dir, "%s_peaks.gappedPeak" % experiment_name )
-	if os.path.exists( created_gappedpeak_file ):
-		shutil.move (created_gappedpeak_file, output_gappedpeaks )
-	#parse xls files to interval files as needed
-	#if 'xls_to_interval' in options:
-	if (options['xls_to_interval'] == "True"):
-	create_peak_xls_file = os.path.join( tmp_dir, '%s_peaks.xls' % experiment_name )
-	if os.path.exists( create_peak_xls_file ):
-		xls_to_interval( create_peak_xls_file, output_xls_to_interval_peaks_file, header = 'peaks file' )
-#bedgraph bedgraph files
-if (options['bdg']):
-created_treat_pileup_file = os.path.join (tmp_dir, "%s_treat_pileup.bdg" % experiment_name )
-if os.path.exists( created_treat_pileup_file ):
-		shutil.move (created_treat_pileup_file, output_treat_pileup )
-created_lambda_bedgraph_file = os.path.join (tmp_dir, "%s_control_lambda.bdg" % experiment_name )
-if os.path.exists( created_lambda_bedgraph_file ):
-		shutil.move (created_lambda_bedgraph_file, output_lambda_bedgraph )
-	#move all remaining files to extra files path of html file output to allow user download
-	out_html = open( output_extra_html, 'wb' )
-	out_html.write( '<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name )
-	os.mkdir( output_extra_path )
-	for filename in sorted( os.listdir( tmp_dir ) ):
-		shutil.move( os.path.join( tmp_dir, filename ), os.path.join( output_extra_path, filename ) )
-	out_html.write( '<li><a href="%s">%s</a></li>\n' % ( filename, filename ) )
-		#out_html.write( '<li><a href="%s">%s</a>peakxls %s SomethingDifferent tmp_dir %s path %s exp_name %s</li>\n' % ( created_peak_xls_file, filename, filename, tmp_dir, output_extra_path, experiment_name ) )
-	out_html.write( '</ul></p>\n' )
-	out_html.write( '<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' % open( stderr_name, 'rb' ).read() )
-	out_html.write( '</body></html>\n' )
-	out_html.close()
-#=================================================================================
-#move files generated by bdgcmp command
-if (options['command'] == "bdgcmp"):
-	created_bdgcmp_file = os.path.join (tmp_dir, "bdgcmp_out.bdg" )
-	if os.path.exists( created_bdgcmp_file ):
-		shutil.move (created_bdgcmp_file, output_bdgcmp )
-#=================================================================================
-#cleanup
-#=================================================================================
-os.unlink( stderr_name )
-os.rmdir( tmp_dir )
-if __name__ == "__main__": main()

Mercurial > repos > pjbriggs > macs21

comparison macs21_wrapper.py @ 1:d0986d2be693 draft