Mercurial > repos > greg > multigps
view multigps.py @ 9:32c8c38cd651 draft
Uploaded
author | greg |
---|---|
date | Tue, 05 Jan 2016 08:19:20 -0500 |
parents | afba8c2a3b32 |
children | a75e532f0441 |
line wrap: on
line source
import optparse import os import shutil import subprocess import sys import tempfile BUFF_SIZE = 1048576 DESIGN_FILE = 'design.tabular' def generate_design_file(input_items, filename): design_file = open(filename, 'wb') for items in input_items: filename, ext, signal_control, condition_name, replicate_name, read_distribution_file, fixed_read = items # MultiGPS version 0.5 does not support the newer scidx datatype. if ext == 'scidx': datatype = 'IDX' else: datatype = ext.upper() design_file.write('%s\n' % '\t'.join([filename, signal_control, datatype, condition_name, replicate_name, read_distribution_file, fixed_read])) design_file.close() def get_stderr_exception(tmp_err, tmp_stderr, tmp_out, tmp_stdout, include_stdout=False): tmp_stderr.close() # Get stderr, allowing for case where it's very large. tmp_stderr = open(tmp_err, 'rb') stderr_str = '' buffsize = BUFF_SIZE try: while True: stderr_str += tmp_stderr.read(buffsize) if not stderr_str or len(stderr_str) % buffsize != 0: break except OverflowError: pass tmp_stderr.close() if include_stdout: tmp_stdout = open(tmp_out, 'rb') stdout_str = '' buffsize = BUFF_SIZE try: while True: stdout_str += tmp_stdout.read(buffsize) if not stdout_str or len(stdout_str) % buffsize != 0: break except OverflowError: pass tmp_stdout.close() if include_stdout: return 'STDOUT\n%s\n\nSTDERR\n%s\n' % (stdout_str, stderr_str) return stderr_str def stop_err(msg): sys.stderr.write(msg) sys.exit(1) parser = optparse.OptionParser() parser.add_option('--input_item', dest='input_items', action='append', nargs=7, help="Input datasets and associated parameters") parser.add_option('--threads', dest='threads', type="int", default=4, help='The number of threads to run') parser.add_option('--multigps_jar', dest='multigps_jar', help='Path to the MultiGPS jar file') parser.add_option('--geninfo', dest='geninfo', help='File listing the lengths of all chromosomes') parser.add_option('--use_motif', dest='use_motif', help='Perform motif-finding or use a motif-prior') parser.add_option('--seq_dir', dest='seq_dir', default=None, help='Directory containing fasta files corresponding to every named chromosome') parser.add_option('--positional_prior', dest='positional_prior', default=None, help='Perform inter-experiment positional prior') parser.add_option('--events_shared_probability', dest='events_shared_probability', type=float, default=0.9, help='Probability that events are shared across conditions') parser.add_option('--motifs', dest='motifs', default=None, help='Perform motif-finding and motif priors') parser.add_option('--motif_finding_only', dest='motif_finding_only', default=None, help='Perform motif-finding only') parser.add_option('--num_motifs', dest='num_motifs', type="int", default=None, help='Number of motifs MEME should find for each condition') parser.add_option('--mememinw', dest='mememinw', type="int", default=None, help='Minimum motif width for MEME') parser.add_option('--mememaxw', dest='mememaxw', type="int", default=None, help='Maximum motif width for MEME') parser.add_option('--fixedpb', dest='fixedpb', type="int", default=None, help='Fixed per-base limit') parser.add_option('--poissongausspb', dest='poissongausspb', type="int", default=None, help='Poisson threshold for filtering per base') parser.add_option('--non_unique_reads', dest='non_unique_reads', default=None, help='Use non-unique reads') parser.add_option('--minqvalue', dest='minqvalue', type="int", default=None, help='Minimum Q-value (corrected p-value) of reported binding events') parser.add_option('--minfold', dest='minfold', type="int", default=None, help='Minimum event fold-change vs scaled control') parser.add_option('--diff_enrichment_tests', dest='diff_enrichment_tests', help='Run differential enrichment tests') parser.add_option('--edgerod', dest='edgerod', type="int", default=None, help='EdgeR over-dispersion parameter value') parser.add_option('--diffp', dest='diffp', type="int", default=None, help='Minimum p-value for reporting differential enrichment') parser.add_option('--noscaling', dest='noscaling', default=None, help='Do not use signal vs control scaling') parser.add_option('--medianscale', dest='medianscale', default=None, help='Use the median signal/control ratio as the scaling factor') parser.add_option('--sesscale', dest='sesscale', default=None, help='Estimate scaling factor by SES') parser.add_option('--scalewin', dest='scalewin', type="int", default=None, help='Window size for estimating scaling ratios') parser.add_option('--max_training_rounds', dest='max_training_rounds', type="int", default=None, help='Maximum number of training rounds for updating binding event read distributions') parser.add_option('--exclude_file', dest='exclude_file', default=None, help='File containing a set of regions to ignore during MultiGPS training') parser.add_option('--binding_model_updates', dest='binding_model_updates', default=None, help='Perform binding model updates') parser.add_option('--minmodelupdateevents', dest='minmodelupdateevents', type="int", default=None, help='Minimum number of events to support an update of the read distribution') parser.add_option('--binding_model_smoothing', dest='binding_model_smoothing', default=None, help='Binding model smoothing') parser.add_option('--spline_smooth', dest='spline_smooth', type="int", default=None, help='Binding model smoothing value') parser.add_option('--gauss_smooth', dest='gauss_smooth', type="int", default=None, help='Gaussian smoothing std dev') parser.add_option('--joint_in_model', dest='joint_in_model', default=None, help='Allow joint events in model updates') parser.add_option('--ml_config_not_shared', dest='ml_config_not_shared', default=None, help='Share component configs in the ML step') parser.add_option('--output_html_path', dest='output_html_path', help='Output html results file') parser.add_option('--output_html_files_path', dest='output_html_files_path', help='Output html extra files') parser.add_option('--output_process_path', dest='output_process_path', default=None, help='Output file for capturing stdout') options, args = parser.parse_args() try: # Preparation. tmp_dir = tempfile.mkdtemp(prefix="tmp-multigps-") generate_design_file(options.input_items, DESIGN_FILE) # Build the command line. cmd_list = ['java -jar %s --verbose' % options.multigps_jar] cmd_list.append('--threads %d' % options.threads) cmd_list.append('--geninfo %s' % options.geninfo) if options.use_motif == 'yes': cmd_list.append('--seq %s' % options.seq_dir) if options.positional_prior == 'no': cmd_list.append('--noposprior') cmd_list.append('--probshared %3f' % options.events_shared_probability) if options.motifs == 'no': cmd_list.append('--nomotifs') if options.motif_finding_only == 'yes': cmd_list.append('--nomotifprior') if options.num_motifs is not None: cmd_list.append('--memenmotifs %d' % options.num_motifs) if options.mememinw is not None: cmd_list.append('--mememinw %d' % options.mememinw) if options.mememaxw is not None: cmd_list.append('--mememaxw %d' % options.mememaxw) if options.fixedpb is not None: cmd_list.append('--fixedpb %d' % options.fixedpb) if options.poissongausspb is not None: cmd_list.append('--poissongausspb %d' % options.poissongausspb) if options.non_unique_reads == 'yes': cmd_list.append('--nonunique') if options.minqvalue is not None: cmd_list.append('--q %d' % options.minqvalue) if options.minfold is not None: cmd_list.append('--minfold %d' % options.minfold) if options.diff_enrichment_tests == 'no': cmd_list.append('--nodifftests') if options.edgerod is not None: cmd_list.append('--edgerod %d' % options.edgerod) if options.diffp is not None: cmd_list.append('--diffp %d' % options.diffp) if options.noscaling == 'no': cmd_list.append('--noscaling') if options.medianscale == "yes": cmd_list.append('--medianscale') if options.sesscale == "yes": cmd_list.append('--sesscale') if options.scalewin is not None: cmd_list.append('--scalewin %d' % options.scalewin) if options.max_training_rounds is not None: cmd_list.append('--r %d' % options.max_training_rounds) if options.exclude_file not in [None, 'None']: cmd_list.append('--exclude %s' % options.exclude_file) if options.binding_model_updates == 'no': cmd_list.append('--nomodelupdate') if options.minmodelupdateevents is not None: cmd_list.append('--minmodelupdateevents %d' % options.minmodelupdateevents) if options.binding_model_smoothing == 'yes': if options.spline_smooth is not None: cmd_list.append('--splinesmoothparam %d' % options.spline_smooth) else: cmd_list.append('--nomodelsmoothing') if options.gauss_smooth is not None: cmd_list.append('--gaussmodelsmoothing') cmd_list.append('--gausssmoothparam %d' % options.gauss_smooth) if options.joint_in_model == 'yes': cmd_list.append('--jointinmodel') if options.ml_config_not_shared == 'no': cmd_list.append('--mlconfignotshared') cmd_list.append('--design %s' % DESIGN_FILE) cmd_list.append('--out %s' % options.output_html_files_path) if options.output_process_path is None: cmd_list.append('2>/dev/null') else: cmd_list.append('2>%s' % options.output_process_path) cmd = ' '.join(cmd_list) # Define command response buffers. tmp_out = tempfile.NamedTemporaryFile(dir=tmp_dir).name tmp_stdout = open(tmp_out, 'wb') tmp_err = tempfile.NamedTemporaryFile(dir=tmp_dir).name tmp_stderr = open(tmp_err, 'wb') tmp_filename = tempfile.NamedTemporaryFile(dir=tmp_dir).name # Execute the command. proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, stdout=tmp_stdout, shell=True) rc = proc.wait() if rc != 0: error_message = get_stderr_exception(tmp_err, tmp_stderr, tmp_out, tmp_stdout) stop_err(error_message) # Move the output HTML file to the output dataset path. output_html_filename = 'multiGPS_%s_results.html' % os.path.split(options.output_html_files_path)[1] shutil.move(os.path.join(options.output_html_files_path, output_html_filename), options.output_html_path) except Exception, e: stop_err('Error running MultiGPS\n%s\n' % str(e)) finally: if os.path.exists(tmp_dir): shutil.rmtree(tmp_dir)