Mercurial > repos > pjbriggs > macs21
diff macs21_wrapper.py @ 1:d0986d2be693 draft
Substantial reimplementation of internals, also renamed id and version.
author | pjbriggs |
---|---|
date | Thu, 29 Jan 2015 11:11:21 -0500 |
parents | fdad0c8c0957 |
children | 15889783e759 |
line wrap: on
line diff
--- a/macs21_wrapper.py Wed Jan 21 11:07:37 2015 -0500 +++ b/macs21_wrapper.py Thu Jan 29 11:11:21 2015 -0500 @@ -1,208 +1,177 @@ -#purpose: macs2 python wrapper -#author: Ziru Zhou -#date: November, 2012 +#!/bin/env python +# +# Galaxy wrapper to run MACS 2.1 +# +# Completely rewritten from the original macs2 wrapped by Ziru Zhou +# taken from http://toolshed.g2.bx.psu.edu/view/modencode-dcc/macs2 -import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip -from galaxy import eggs -import pkg_resources -pkg_resources.require( "simplejson" ) -import simplejson +import sys +import os +import subprocess +import tempfile +import shutil -CHUNK_SIZE = 1024 +def move_file(working_dir,name,destination): + """Move a file 'name' from 'working_dir' to 'destination' + + """ + if destination is None: + # Nothing to do + return + source = os.path.join(working_dir,name) + if os.path.exists(source): + shutil.move(source,destination) -#========================================================================================== -#functions -#========================================================================================== -def gunzip_cat_glob_path( glob_path, target_filename, delete = False ): - out = open( target_filename, 'wb' ) - for filename in glob.glob( glob_path ): - fh = gzip.open( filename, 'rb' ) - while True: - data = fh.read( CHUNK_SIZE ) - if data: - out.write( data ) - else: - break - fh.close() - if delete: - os.unlink( filename ) - out.close() +def convert_xls_to_interval(xls_file,interval_file,header=None): + """Convert MACS XLS file to interval + + From the MACS readme: "Coordinates in XLS is 1-based which is different with + BED format." -def xls_to_interval( xls_file, interval_file, header = None ): - out = open( interval_file, 'wb' ) + However this function no longer performs any coordinate conversions, it + simply ensures that any blank or non-data lines are commented out + + """ + fp = open(interval_file,'wb') if header: - out.write( '#%s\n' % header ) - wrote_header = False - #From macs readme: Coordinates in XLS is 1-based which is different with BED format. - #PJB: updated to keep original 'start' coordinate i.e. do not convert to BED - for line in open( xls_file ): - #keep all existing comment lines - if line.startswith( '#' ): - out.write( line ) - #added for macs2 since there is an extra newline - ##elif line.startswith( '\n' ): - ## out.write( line ) - elif not wrote_header: - out.write( '#%s' % line ) - print line - wrote_header = True + fp.write('#%s\n' % header) + for line in open(xls_file): + # Keep all existing comment lines + if line.startswith('#'): + fp.write(line) else: - fields = line.split( '\t' ) - if len( fields ) > 1: + # Split line into fields and test to see if + # the 'start' field is actually an integer + fields = line.split('\t') + if len(fields) > 1: try: - # Try to convert 'start' to int and shift - ##fields[1] = str( int( fields[1] ) - 1 ) - # PJB don't correct, just test to see if it's an integer - int( fields[1] ) + int(fields[1]) except ValueError: # Integer conversion failed so comment out # "bad" line instead fields[0] = "#%s" % fields[0] - out.write( '\t'.join( fields ) ) - out.close() - -#========================================================================================== -#main -#========================================================================================== -def main(): - #take in options file and output file names - options = simplejson.load( open( sys.argv[1] ) ) - outputs = simplejson.load( open( sys.argv[2] ) ) + fp.write( '\t'.join( fields ) ) + fp.close() - #================================================================================= - #parse options and execute macs2 - #================================================================================= - #default inputs that are in every major command - experiment_name = '_'.join( options['experiment_name'].split() ) #save experiment name here, it will be used by macs for some file names - cmdline = "macs2 %s -t %s" % ( options['command'], ",".join( options['input_chipseq'] ) ) - if options['input_control']: - cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) ) +if __name__ == "__main__": - #================================================================================= - if (options['command'] == "callpeak"): - output_summits_bed = outputs['output_summits_bed_file'] - output_extra_html = outputs['output_extra_file'] - output_extra_path = outputs['output_extra_file_path'] - output_peaks = outputs['output_peaks_file'] - output_narrowpeaks = outputs['output_narrowpeaks_file'] - output_broadpeaks = outputs['output_broadpeaks_file'] - output_gappedpeaks = outputs['output_gappedpeaks_file'] - output_xls_to_interval_peaks_file = outputs['output_xls_to_interval_peaks_file'] - output_treat_pileup = outputs['output_treat_pileup_file'] - output_lambda_bedgraph = outputs['output_lambda_bedgraph_file'] + # Echo the command line + print ' '.join(sys.argv) - cmdline = "%s --format='%s' --name='%s' --gsize='%s' --bw='%s'" % ( cmdline, options['format'], experiment_name, options['gsize'], options['bw'] ) - if (options['broad'] == 'broad'): - cmdline = "%s --broad --broad-cutoff='%s'" % ( cmdline, options['broad_cutoff'] ) - if 'pvalue' in options: - cmdline = "%s --pvalue='%s'" % ( cmdline, options['pvalue'] ) - elif 'qvalue' in options: - cmdline = "%s --qvalue='%s'" % ( cmdline, options['qvalue'] ) - cmdline = "%s --mfold %s %s %s %s %s %s --keep-dup %s" % (cmdline, options['mfoldlo'], options['mfoldhi'], options['nolambda'], options['bdg'], options['spmr'], options['call_summits'], options['keep_dup'] ) - - if 'nomodel' in options: - cmdline = "%s --nomodel --extsize='%s'" % ( cmdline, options['nomodel'] ) - #================================================================================= - if (options['command'] == "bdgcmp"): - output_bdgcmp = outputs['output_bdgcmp_file'] - - cmdline = "%s -m %s -p %s -o bdgcmp_out.bdg" % ( cmdline, options['m'], options['pseudocount'] ) - #================================================================================= + # Initialise output files - values are set by reading from + # the command line supplied by the Galaxy wrapper + output_extra_html = None + output_extra_path = None + output_broadpeaks = None + output_gappedpeaks = None + output_narrowpeaks = None + output_treat_pileup = None + output_lambda_bedgraph = None + output_xls_to_interval_peaks_file = None + output_peaks = None + output_bdgcmp = None - tmp_dir = tempfile.mkdtemp() #macs makes very messy output, need to contain it into a temp dir, then provide to user - stderr_name = tempfile.NamedTemporaryFile().name # redirect stderr here, macs provides lots of info via stderr, make it into a report - proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) ) - proc.wait() - #We don't want to set tool run to error state if only warnings or info, e.g. mfold could be decreased to improve model, but let user view macs log - #Do not terminate if error code, allow dataset (e.g. log) creation and cleanup - if proc.returncode: - stderr_f = open( stderr_name ) - while True: - chunk = stderr_f.read( CHUNK_SIZE ) - if not chunk: - stderr_f.close() - break - sys.stderr.write( chunk ) - - #================================================================================= - #copy files created by macs2 to appripriate directory with the provided names - #================================================================================= + # Build the MACS 2.1 command line + # Initial arguments are always the same: command & input ChIP-seq file name + cmdline = ["macs2 %s -t %s" % (sys.argv[1],sys.argv[2])] - #================================================================================= - #move files generated by callpeak command - if (options['command'] == "callpeak"): - #run R to create pdf from model script - if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ): - cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name ) - proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir ) - proc.wait() - - #move bed out to proper output file - created_bed_name = os.path.join( tmp_dir, "%s_summits.bed" % experiment_name ) - if os.path.exists( created_bed_name ): - shutil.move( created_bed_name, output_summits_bed ) - - #OICR peak_xls file - created_peak_xls_file = os.path.join( tmp_dir, "%s_peaks.xls" % experiment_name ) - if os.path.exists( created_peak_xls_file ): - # shutil.copy( created_peak_xls_file, os.path.join ( "/mnt/galaxyData/tmp/", "%s_peaks.xls" % ( os.path.basename(output_extra_path) ))) - shutil.copyfile( created_peak_xls_file, output_peaks ) - - #peaks.encodepeaks (narrowpeaks) file - created_narrowpeak_file = os.path.join (tmp_dir, "%s_peaks.narrowPeak" % experiment_name ) - if os.path.exists( created_narrowpeak_file ): - shutil.move (created_narrowpeak_file, output_narrowpeaks ) - - #peaks.broadpeaks file - created_broadpeak_file = os.path.join (tmp_dir, "%s_peaks.broadPeak" % experiment_name ) - if os.path.exists( created_broadpeak_file ): - shutil.move (created_broadpeak_file, output_broadpeaks ) - - #peaks.gappedpeaks file - created_gappedpeak_file = os.path.join (tmp_dir, "%s_peaks.gappedPeak" % experiment_name ) - if os.path.exists( created_gappedpeak_file ): - shutil.move (created_gappedpeak_file, output_gappedpeaks ) + # Process remaining args + for arg in sys.argv[3:]: + if arg.startswith('--format='): + # Convert format to uppercase + format_ = arg.split('=')[1].upper() + cmdline.append("--format=%s" % format_) + elif arg.startswith('--name='): + # Replace whitespace in name with underscores + experiment_name = '_'.join(arg.split('=')[1].split()) + cmdline.append("--name=%s" % experiment_name) + elif arg.startswith('--output-'): + # Handle destinations for output files + arg0,filen = arg.split('=') + if arg0 == '--output-summits': + output_summits = filen + elif arg0 == '--output-extra-files': + output_extra_html = filen + elif arg0 == '--output-extra-files-path': + output_extra_path = filen + elif arg0 == '--output-broadpeaks': + output_broadpeaks = filen + elif arg0 == '--output-gappedpeaks': + output_gappedpeaks = filen + elif arg0 == '--output-narrowpeaks': + output_narrowpeaks = filen + elif arg0 == '--output-pileup': + output_treat_pileup = filen + elif arg0 == '--output-lambda-bedgraph': + output_lambda_bedgraph = filen + elif arg0 == '--output-xls-to-interval': + output_xls_to_interval_peaks_file = filen + elif arg0 == '--output-peaks': + output_peaks = filen + else: + # Pass remaining args directly to MACS + # command line + cmdline.append(arg) + + cmdline = ' '.join(cmdline) + print "Generated command line:\n%s" % cmdline - #parse xls files to interval files as needed - #if 'xls_to_interval' in options: - if (options['xls_to_interval'] == "True"): - create_peak_xls_file = os.path.join( tmp_dir, '%s_peaks.xls' % experiment_name ) - if os.path.exists( create_peak_xls_file ): - xls_to_interval( create_peak_xls_file, output_xls_to_interval_peaks_file, header = 'peaks file' ) - - #bedgraph bedgraph files - if (options['bdg']): - created_treat_pileup_file = os.path.join (tmp_dir, "%s_treat_pileup.bdg" % experiment_name ) - if os.path.exists( created_treat_pileup_file ): - shutil.move (created_treat_pileup_file, output_treat_pileup ) - created_lambda_bedgraph_file = os.path.join (tmp_dir, "%s_control_lambda.bdg" % experiment_name ) - if os.path.exists( created_lambda_bedgraph_file ): - shutil.move (created_lambda_bedgraph_file, output_lambda_bedgraph ) + # Execute MACS2 + # + # Make a working directory + working_dir = tempfile.mkdtemp() + # + # Collect stderr in a file for reporting later + stderr_filen = tempfile.NamedTemporaryFile().name + # + # Run MACS2 + proc = subprocess.Popen(args=cmdline,shell=True,cwd=working_dir, + stderr=open(stderr_filen,'wb')) + proc.wait() + + # Run R script to create PDF from model script + if os.path.exists(os.path.join(working_dir,"%s_model.r" % experiment_name)): + cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % \ + (experiment_name, experiment_name) + proc = subprocess.Popen(args=cmdline,shell=True,cwd=working_dir) + proc.wait() - #move all remaining files to extra files path of html file output to allow user download - out_html = open( output_extra_html, 'wb' ) - out_html.write( '<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name ) - os.mkdir( output_extra_path ) - for filename in sorted( os.listdir( tmp_dir ) ): - shutil.move( os.path.join( tmp_dir, filename ), os.path.join( output_extra_path, filename ) ) - out_html.write( '<li><a href="%s">%s</a></li>\n' % ( filename, filename ) ) - #out_html.write( '<li><a href="%s">%s</a>peakxls %s SomethingDifferent tmp_dir %s path %s exp_name %s</li>\n' % ( created_peak_xls_file, filename, filename, tmp_dir, output_extra_path, experiment_name ) ) - out_html.write( '</ul></p>\n' ) - out_html.write( '<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' % open( stderr_name, 'rb' ).read() ) - out_html.write( '</body></html>\n' ) - out_html.close() + # Convert XLS to interval, if requested + if output_xls_to_interval_peaks_file is not None: + peaks_xls_file = os.path.join(working_dir,'%s_peaks.xls' % experiment_name ) + if os.path.exists(peaks_xls_file): + convert_xls_to_interval(peaks_xls_file,output_xls_to_interval_peaks_file, + header='peaks file') + + # Move MACS2 output files from working dir to their final destinations + move_file(working_dir,"%s_summits.bed" % experiment_name,output_summits) + move_file(working_dir,"%s_peaks.xls" % experiment_name,output_peaks) + move_file(working_dir,"%s_peaks.narrowPeak" % experiment_name,output_narrowpeaks) + move_file(working_dir,"%s_peaks.broadPeak" % experiment_name,output_broadpeaks) + move_file(working_dir,"%s_peaks.gappedPeak" % experiment_name,output_gappedpeaks) + move_file(working_dir,"%s_treat_pileup.bdg" % experiment_name,output_treat_pileup) + move_file(working_dir,"%s_control_lambda.bdg" % experiment_name,output_lambda_bedgraph) + move_file(working_dir,"bdgcmp_out.bdg",output_bdgcmp) - #================================================================================= - #move files generated by bdgcmp command - if (options['command'] == "bdgcmp"): - created_bdgcmp_file = os.path.join (tmp_dir, "bdgcmp_out.bdg" ) - if os.path.exists( created_bdgcmp_file ): - shutil.move (created_bdgcmp_file, output_bdgcmp ) - - #================================================================================= - #cleanup - #================================================================================= - os.unlink( stderr_name ) - os.rmdir( tmp_dir ) + # Move remaining file to the 'extra files' path and link from the HTML + # file to allow user to access them from within Galaxy + html_file = open(output_extra_html,'wb') + html_file.write('<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name) + # Make the 'extra files' directory + os.mkdir(output_extra_path) + # Move the files + for filen in sorted(os.listdir(working_dir)): + shutil.move(os.path.join(working_dir,filen), + os.path.join(output_extra_path,filen)) + html_file.write( '<li><a href="%s">%s</a></li>\n' % (filen,filen)) + # All files moved, close out HTML + html_file.write( '</ul></p>\n' ) + # Append any stderr output + html_file.write('<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' % + open(stderr_filen,'rb').read()) + html_file.write('</body></html>\n') + html_file.close() -if __name__ == "__main__": main() + # Clean up the working directory and files + os.unlink(stderr_filen) + os.rmdir(working_dir)