Mercurial > repos > crs4 > conifer3
changeset 8:1993dd87698b draft
Uploaded
author | crs4 |
---|---|
date | Fri, 17 Mar 2017 10:08:48 -0400 |
parents | 3e83b4218d6e |
children | ab730530065a |
files | conifer3/conifer.py |
diffstat | 1 files changed, 73 insertions(+), 73 deletions(-) [+] |
line wrap: on
line diff
--- a/conifer3/conifer.py Fri Mar 17 09:49:18 2017 -0400 +++ b/conifer3/conifer.py Fri Mar 17 10:08:48 2017 -0400 @@ -41,9 +41,9 @@ probe_fn = str(args.probes[0]) probes = cf.loadProbeList(probe_fn) num_probes = len(probes) - print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn) + print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)) except IOError as e: - print '[ERROR] Cannot read probes file: ', probe_fn + print('[ERROR] Cannot read probes file: ', probe_fn) sys.exit(0) try: @@ -51,7 +51,7 @@ h5file_out = openFile(svd_outfile_fn, mode='w') probe_group = h5file_out.createGroup("/", "probes", "probes") except IOError as e: - print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn + print('[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn) sys.exit(0) if args.write_svals != "": @@ -66,7 +66,7 @@ from matplotlib.lines import Line2D from matplotlib.patches import Rectangle except: - print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" + print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?") sys.exit(0) plt.gcf().clear() @@ -76,31 +76,31 @@ rpkm_dir = str(args.rpkm_dir[0]) rpkm_files = glob.glob(rpkm_dir + "/*") if len(rpkm_files) == 0: - print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir + print('[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir) sys.exit(0) elif len(rpkm_files) == 1: - print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.' + print('[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.') sys.exit(0) elif len(rpkm_files) < 8: - print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len( - rpkm_files) + print('[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len( + rpkm_files)) elif len(rpkm_files) <= int(args.svd): - print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % ( - len(rpkm_files), int(args.svd)) + print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % ( + len(rpkm_files), int(args.svd))) sys.exit(0) else: - print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir) + print('[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir)) # read in samples names and generate file list samples = {} for f in rpkm_files: s = '.'.join(f.split('/')[-1].split('.')[0:-1]) - print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s) + print("[INIT] Mapping file to sampleID: %s --> %s" % (f, s)) samples[s] = f # check uniqueness and total # of samples if len(set(samples)) != len(set(rpkm_files)): - print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!' + print('[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!') sys.exit(0) # LOAD RPKM DATA @@ -110,60 +110,60 @@ for i, s in enumerate(samples.keys()): t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2]) if len(t) != num_probes: - print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % ( - s, samples[s], probe_fn) + print("[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % ( + s, samples[s], probe_fn)) _ = samples.pop(s) failed_samples += 1 else: RPKM_data[:, i] = t - print "[INIT] Successfully read RPKM data for sampleID: %s" % s + print("[INIT] Successfully read RPKM data for sampleID: %s" % s) RPKM_data = RPKM_data[:, 0:len(samples)] - print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % ( - len(samples), failed_samples) + print("[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % ( + len(samples), failed_samples)) if len(samples) <= int(args.svd): - print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % ( - int(args.svd), len(samples)) + print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % ( + int(args.svd), len(samples))) sys.exit(0) # BEGIN chrs_to_process = set(map(operator.itemgetter("chr"), probes)) chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process]) - print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str + print('[INIT] Attempting to process chromosomes: ', chrs_to_process_str) for chr in chrs_to_process: - print "[RUNNING: chr%d] Now on: %s" % (chr, cf.chrInt2Str(chr)) + print("[RUNNING: chr%d] Now on: %s" % (chr, cf.chrInt2Str(chr))) chr_group_name = "chr%d" % chr chr_group = h5file_out.createGroup("/", chr_group_name, chr_group_name) - chr_probes = filter(lambda i: i["chr"] == chr, probes) + chr_probes = [i for i in probes if i["chr"] == chr] num_chr_probes = len(chr_probes) start_probeID = chr_probes[0]['probeID'] stop_probeID = chr_probes[-1]['probeID'] - print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID - 1, - stop_probeID) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term + print("[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID - 1, + stop_probeID)) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term rpkm = RPKM_data[start_probeID:stop_probeID, :] - print "[RUNNING: chr%d] Calculating median RPKM" % chr + print("[RUNNING: chr%d] Calculating median RPKM" % chr) median = np.median(rpkm, 1) sd = np.std(rpkm, 1) probe_mask = median >= float(args.min_rpkm) - print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % ( - chr, np.sum(probe_mask == False), float(args.min_rpkm)) + print("[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % ( + chr, np.sum(probe_mask == False), float(args.min_rpkm))) rpkm = rpkm[probe_mask, :] num_chr_probes = np.sum(probe_mask) if num_chr_probes <= len(samples): - print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file" + print("[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file") sys.exit(0) - probeIDs = np.array(map(operator.itemgetter("probeID"), chr_probes))[probe_mask] - probe_starts = np.array(map(operator.itemgetter("start"), chr_probes))[probe_mask] - probe_stops = np.array(map(operator.itemgetter("stop"), chr_probes))[probe_mask] - gene_names = np.array(map(operator.itemgetter("name"), chr_probes))[probe_mask] + probeIDs = np.array(list(map(operator.itemgetter("probeID"), chr_probes)))[probe_mask] + probe_starts = np.array(list(map(operator.itemgetter("start"), chr_probes)))[probe_mask] + probe_stops = np.array(list(map(operator.itemgetter("stop"), chr_probes)))[probe_mask] + gene_names = np.array(list(map(operator.itemgetter("name"), chr_probes)))[probe_mask] dt = np.dtype([('probeID', np.uint32), ('start', np.uint32), ('stop', np.uint32), ('name', np.str_, 20)]) @@ -175,11 +175,11 @@ probe_table = h5file_out.createTable(probe_group, "probes_chr%d" % chr, cf.probe, "chr%d" % chr) probe_table.append(out_probes) - print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr + print("[RUNNING: chr%d] Calculating ZRPKM scores..." % chr) rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask]) # svd transform - print "[RUNNING: chr%d] SVD decomposition..." % chr + print("[RUNNING: chr%d] SVD decomposition..." % chr) components_removed = int(args.svd) U, S, Vt = np.linalg.svd(rpkm, full_matrices=False) @@ -195,7 +195,7 @@ rpkm = np.dot(U, np.dot(new_S, Vt)) # save to HDF5 file - print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr + print("[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr) for i, s in enumerate(samples): out_data = np.empty(num_chr_probes, dtype='u4,f8') @@ -204,16 +204,16 @@ sample_tbl = h5file_out.createTable(chr_group, "sample_" + str(s), cf.rpkm_value, "%s" % str(s)) sample_tbl.append(out_data) - print "[RUNNING] Saving sampleIDs to file..." + print("[RUNNING] Saving sampleIDs to file...") sample_group = h5file_out.createGroup("/", "samples", "samples") sample_table = h5file_out.createTable(sample_group, "samples", cf.sample, "samples") dt = np.dtype([('sampleID', np.str_, 100)]) - out_samples = np.empty(len(samples.keys()), dtype=dt) - out_samples['sampleID'] = np.array(samples.keys()) + out_samples = np.empty(len(list(samples.keys())), dtype=dt) + out_samples['sampleID'] = np.array(list(samples.keys())) sample_table.append(out_samples) if args.write_sd != "": - print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..." + print("[RUNNING] Calculating standard deviations for all samples (this can take a while)...") sd_file = open(args.write_sd, 'w') @@ -244,7 +244,7 @@ plt.ylabel("Relative contributed variance") plt.savefig(args.plot_scree) - print "[FINISHED]" + print("[FINISHED]") h5file_out.close() sys.exit(0) @@ -254,7 +254,7 @@ h5file_in_fn = str(args.input) h5file_in = openFile(h5file_in_fn, mode='r') except IOError as e: - print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn + print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn) sys.exit(0) # read probes @@ -267,15 +267,15 @@ out_path = os.path.abspath(args.output) - print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path) + print("[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path)) for sample in all_samples: try: outfile_fn = out_path + "/" + sample + ".bed" outfile_f = open(outfile_fn, 'w') except IOError as e: - print '[ERROR] Cannot open output file for writing: ', outfile_fn + print('[ERROR] Cannot open output file for writing: ', outfile_fn) sys.exit(0) - print "[RUNNING] Exporting %s" % sample + print("[RUNNING] Exporting %s" % sample) cf.export_sample(h5file_in, sample, probes, outfile_f) outfile_f.close() @@ -283,7 +283,7 @@ elif len(args.sample) == 1: out_path = os.path.abspath(args.output) sample = args.sample[0] - print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path) + print("[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path)) try: if os.path.isdir(out_path): outfile_fn = out_path + "/" + sample + ".bed" @@ -291,16 +291,16 @@ outfile_fn = out_path outfile_f = open(outfile_fn, 'w') except IOError as e: - print '[ERROR] Cannot open output file for writing: ', outfile_fn + print('[ERROR] Cannot open output file for writing: ', outfile_fn) sys.exit(0) - print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn) + print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn)) cf.export_sample(h5file_in, sample, probes, outfile_f) outfile_f.close() else: out_path = os.path.abspath(args.output) - print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path) + print("[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path)) for sample in args.sample: try: if os.path.isdir(out_path): @@ -309,9 +309,9 @@ outfile_fn = out_path outfile_f = open(outfile_fn, 'w') except IOError as e: - print '[ERROR] Cannot open output file for writing: ', outfile_fn + print('[ERROR] Cannot open output file for writing: ', outfile_fn) sys.exit(0) - print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn) + print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn)) cf.export_sample(h5file_in, sample, probes, outfile_f) outfile_f.close() @@ -323,14 +323,14 @@ h5file_in_fn = str(args.input) h5file_in = openFile(h5file_in_fn, mode='r') except IOError as e: - print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn + print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn) sys.exit(0) try: callfile_fn = str(args.output) callfile_f = open(callfile_fn, mode='w') except IOError as e: - print '[ERROR] Cannot open output file for writing: ', callfile_fn + print('[ERROR] Cannot open output file for writing: ', callfile_fn) sys.exit(0) chrs_to_process = [] @@ -340,14 +340,14 @@ h5file_in.close() - print '[INIT] Initializing caller at threshold = %f' % (args.threshold) + print('[INIT] Initializing caller at threshold = %f' % (args.threshold)) r = cf.rpkm_reader(h5file_in_fn) all_calls = [] for chr in chrs_to_process: - print '[RUNNING] Now processing chr%s' % chr + print('[RUNNING] Now processing chr%s' % chr) data = r.getExonValuesByRegion(chr) # raw_data = copy.copy(data) @@ -407,7 +407,7 @@ callfile_f.write('\t'.join(header) + "\n") for call in all_calls: - print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]) + print("%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])) callfile_f.write( "%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])) @@ -425,7 +425,7 @@ from matplotlib.patches import Rectangle _ = locale.setlocale(locale.LC_ALL, '') except: - print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" + print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?") sys.exit(0) chr, start, stop = cf.parseLocString(args.region) @@ -448,7 +448,7 @@ try: color, sampleID = sample.split(":") except: - color = coloriter.next() + color = next(coloriter) sampleID = sample ax.plot(data.getSample([sampleID]), linewidth=1, c=color, label=sampleID) @@ -484,7 +484,7 @@ from matplotlib.lines import Line2D from matplotlib.patches import Rectangle except: - print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" + print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?") sys.exit(0) import locale @@ -497,7 +497,7 @@ callfile_fn = str(args.calls) callfile_f = open(callfile_fn, mode='r') except IOError as e: - print '[ERROR] Cannot open call file for reading: ', callfile_fn + print('[ERROR] Cannot open call file for reading: ', callfile_fn) sys.exit(0) all_calls = [] @@ -560,33 +560,33 @@ try: import pysam except: - print '[ERROR] Cannot load pysam module! Make sure it is insalled' + print('[ERROR] Cannot load pysam module! Make sure it is insalled') sys.exit(0) try: # read probes table probe_fn = str(args.probes[0]) probes = cf.loadProbeList(probe_fn) num_probes = len(probes) - print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn) + print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)) except IOError as e: - print '[ERROR] Cannot read probes file: ', probe_fn + print('[ERROR] Cannot read probes file: ', probe_fn) sys.exit(0) try: rpkm_f = open(args.output[0], 'w') except IOError as e: - print '[ERROR] Cannot open rpkm file for writing: ', args.output + print('[ERROR] Cannot open rpkm file for writing: ', args.output) sys.exit(0) - print "[RUNNING] Counting total number of reads in bam file..." + print("[RUNNING] Counting total number of reads in bam file...") total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n")) - print "[RUNNING] Found %d reads" % total_reads + print("[RUNNING] Found %d reads" % total_reads) f = pysam.Samfile(args.input[0], "rb") if not f._hasIndex(): - print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % \ - args.input[0] + print("[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % \ + args.input[0]) sys.exit(0) # will be storing values in these arrays @@ -609,11 +609,11 @@ elif cf.chrInt2Str(probes_contig).replace("chr", "") in bam_contigs: probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr", "") else: - print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % ( - probes_contig, probe_fn) + print("[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % ( + probes_contig, probe_fn)) sys.exit(0) - print "[RUNNING] Calculating RPKM values..." + print("[RUNNING] Calculating RPKM values...") # loop through each probe for p in probes: @@ -627,8 +627,8 @@ try: iter = f.fetch(p_chr, p_start, p_stop) except: - print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % ( - p_chr, p_start, p_stop) + print("[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % ( + p_chr, p_start, p_stop)) sys.exit(0) for i in iter: