changeset 8:1993dd87698b draft

Uploaded
author crs4
date Fri, 17 Mar 2017 10:08:48 -0400
parents 3e83b4218d6e
children ab730530065a
files conifer3/conifer.py
diffstat 1 files changed, 73 insertions(+), 73 deletions(-) [+]
line wrap: on
line diff
--- a/conifer3/conifer.py	Fri Mar 17 09:49:18 2017 -0400
+++ b/conifer3/conifer.py	Fri Mar 17 10:08:48 2017 -0400
@@ -41,9 +41,9 @@
         probe_fn = str(args.probes[0])
         probes = cf.loadProbeList(probe_fn)
         num_probes = len(probes)
-        print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
+        print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn))
     except IOError as e:
-        print '[ERROR] Cannot read probes file: ', probe_fn
+        print('[ERROR] Cannot read probes file: ', probe_fn)
         sys.exit(0)
 
     try:
@@ -51,7 +51,7 @@
         h5file_out = openFile(svd_outfile_fn, mode='w')
         probe_group = h5file_out.createGroup("/", "probes", "probes")
     except IOError as e:
-        print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn
+        print('[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn)
         sys.exit(0)
 
     if args.write_svals != "":
@@ -66,7 +66,7 @@
             from matplotlib.lines import Line2D
             from matplotlib.patches import Rectangle
         except:
-            print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
+            print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
             sys.exit(0)
 
         plt.gcf().clear()
@@ -76,31 +76,31 @@
     rpkm_dir = str(args.rpkm_dir[0])
     rpkm_files = glob.glob(rpkm_dir + "/*")
     if len(rpkm_files) == 0:
-        print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir
+        print('[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir)
         sys.exit(0)
     elif len(rpkm_files) == 1:
-        print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.'
+        print('[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.')
         sys.exit(0)
     elif len(rpkm_files) < 8:
-        print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(
-            rpkm_files)
+        print('[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(
+            rpkm_files))
     elif len(rpkm_files) <= int(args.svd):
-        print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (
-        len(rpkm_files), int(args.svd))
+        print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (
+        len(rpkm_files), int(args.svd)))
         sys.exit(0)
     else:
-        print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir)
+        print('[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir))
 
     # read in samples names and generate file list
     samples = {}
     for f in rpkm_files:
         s = '.'.join(f.split('/')[-1].split('.')[0:-1])
-        print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s)
+        print("[INIT] Mapping file to sampleID: %s --> %s" % (f, s))
         samples[s] = f
 
     # check uniqueness and total # of samples
     if len(set(samples)) != len(set(rpkm_files)):
-        print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!'
+        print('[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!')
         sys.exit(0)
 
     # LOAD RPKM DATA
@@ -110,60 +110,60 @@
     for i, s in enumerate(samples.keys()):
         t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2])
         if len(t) != num_probes:
-            print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (
-            s, samples[s], probe_fn)
+            print("[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (
+            s, samples[s], probe_fn))
             _ = samples.pop(s)
             failed_samples += 1
         else:
             RPKM_data[:, i] = t
-            print "[INIT] Successfully read RPKM data for sampleID: %s" % s
+            print("[INIT] Successfully read RPKM data for sampleID: %s" % s)
 
     RPKM_data = RPKM_data[:, 0:len(samples)]
-    print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (
-    len(samples), failed_samples)
+    print("[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (
+    len(samples), failed_samples))
 
     if len(samples) <= int(args.svd):
-        print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (
-        int(args.svd), len(samples))
+        print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (
+        int(args.svd), len(samples)))
         sys.exit(0)
 
     # BEGIN
     chrs_to_process = set(map(operator.itemgetter("chr"), probes))
     chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process])
-    print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str
+    print('[INIT] Attempting to process chromosomes: ', chrs_to_process_str)
 
     for chr in chrs_to_process:
-        print "[RUNNING: chr%d] Now on: %s" % (chr, cf.chrInt2Str(chr))
+        print("[RUNNING: chr%d] Now on: %s" % (chr, cf.chrInt2Str(chr)))
         chr_group_name = "chr%d" % chr
         chr_group = h5file_out.createGroup("/", chr_group_name, chr_group_name)
 
-        chr_probes = filter(lambda i: i["chr"] == chr, probes)
+        chr_probes = [i for i in probes if i["chr"] == chr]
         num_chr_probes = len(chr_probes)
         start_probeID = chr_probes[0]['probeID']
         stop_probeID = chr_probes[-1]['probeID']
-        print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID - 1,
-                                                                              stop_probeID)  # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term
+        print("[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID - 1,
+                                                                              stop_probeID))  # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term
 
         rpkm = RPKM_data[start_probeID:stop_probeID, :]
 
-        print "[RUNNING: chr%d] Calculating median RPKM" % chr
+        print("[RUNNING: chr%d] Calculating median RPKM" % chr)
         median = np.median(rpkm, 1)
         sd = np.std(rpkm, 1)
         probe_mask = median >= float(args.min_rpkm)
-        print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (
-        chr, np.sum(probe_mask == False), float(args.min_rpkm))
+        print("[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (
+        chr, np.sum(probe_mask == False), float(args.min_rpkm)))
 
         rpkm = rpkm[probe_mask, :]
         num_chr_probes = np.sum(probe_mask)
 
         if num_chr_probes <= len(samples):
-            print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file"
+            print("[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file")
             sys.exit(0)
 
-        probeIDs = np.array(map(operator.itemgetter("probeID"), chr_probes))[probe_mask]
-        probe_starts = np.array(map(operator.itemgetter("start"), chr_probes))[probe_mask]
-        probe_stops = np.array(map(operator.itemgetter("stop"), chr_probes))[probe_mask]
-        gene_names = np.array(map(operator.itemgetter("name"), chr_probes))[probe_mask]
+        probeIDs = np.array(list(map(operator.itemgetter("probeID"), chr_probes)))[probe_mask]
+        probe_starts = np.array(list(map(operator.itemgetter("start"), chr_probes)))[probe_mask]
+        probe_stops = np.array(list(map(operator.itemgetter("stop"), chr_probes)))[probe_mask]
+        gene_names = np.array(list(map(operator.itemgetter("name"), chr_probes)))[probe_mask]
 
         dt = np.dtype([('probeID', np.uint32), ('start', np.uint32), ('stop', np.uint32), ('name', np.str_, 20)])
 
@@ -175,11 +175,11 @@
         probe_table = h5file_out.createTable(probe_group, "probes_chr%d" % chr, cf.probe, "chr%d" % chr)
         probe_table.append(out_probes)
 
-        print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr
+        print("[RUNNING: chr%d] Calculating ZRPKM scores..." % chr)
         rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask])
 
         # svd transform
-        print "[RUNNING: chr%d] SVD decomposition..." % chr
+        print("[RUNNING: chr%d] SVD decomposition..." % chr)
         components_removed = int(args.svd)
 
         U, S, Vt = np.linalg.svd(rpkm, full_matrices=False)
@@ -195,7 +195,7 @@
         rpkm = np.dot(U, np.dot(new_S, Vt))
 
         # save to HDF5 file
-        print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr
+        print("[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr)
 
         for i, s in enumerate(samples):
             out_data = np.empty(num_chr_probes, dtype='u4,f8')
@@ -204,16 +204,16 @@
             sample_tbl = h5file_out.createTable(chr_group, "sample_" + str(s), cf.rpkm_value, "%s" % str(s))
             sample_tbl.append(out_data)
 
-    print "[RUNNING] Saving sampleIDs to file..."
+    print("[RUNNING] Saving sampleIDs to file...")
     sample_group = h5file_out.createGroup("/", "samples", "samples")
     sample_table = h5file_out.createTable(sample_group, "samples", cf.sample, "samples")
     dt = np.dtype([('sampleID', np.str_, 100)])
-    out_samples = np.empty(len(samples.keys()), dtype=dt)
-    out_samples['sampleID'] = np.array(samples.keys())
+    out_samples = np.empty(len(list(samples.keys())), dtype=dt)
+    out_samples['sampleID'] = np.array(list(samples.keys()))
     sample_table.append(out_samples)
 
     if args.write_sd != "":
-        print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..."
+        print("[RUNNING] Calculating standard deviations for all samples (this can take a while)...")
 
         sd_file = open(args.write_sd, 'w')
 
@@ -244,7 +244,7 @@
         plt.ylabel("Relative contributed variance")
         plt.savefig(args.plot_scree)
 
-    print "[FINISHED]"
+    print("[FINISHED]")
     h5file_out.close()
     sys.exit(0)
 
@@ -254,7 +254,7 @@
         h5file_in_fn = str(args.input)
         h5file_in = openFile(h5file_in_fn, mode='r')
     except IOError as e:
-        print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
+        print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn)
         sys.exit(0)
 
     # read probes
@@ -267,15 +267,15 @@
 
         out_path = os.path.abspath(args.output)
 
-        print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path)
+        print("[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path))
         for sample in all_samples:
             try:
                 outfile_fn = out_path + "/" + sample + ".bed"
                 outfile_f = open(outfile_fn, 'w')
             except IOError as e:
-                print '[ERROR] Cannot open output file for writing: ', outfile_fn
+                print('[ERROR] Cannot open output file for writing: ', outfile_fn)
                 sys.exit(0)
-            print "[RUNNING] Exporting %s" % sample
+            print("[RUNNING] Exporting %s" % sample)
 
             cf.export_sample(h5file_in, sample, probes, outfile_f)
             outfile_f.close()
@@ -283,7 +283,7 @@
     elif len(args.sample) == 1:
         out_path = os.path.abspath(args.output)
         sample = args.sample[0]
-        print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path)
+        print("[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path))
         try:
             if os.path.isdir(out_path):
                 outfile_fn = out_path + "/" + sample + ".bed"
@@ -291,16 +291,16 @@
                 outfile_fn = out_path
             outfile_f = open(outfile_fn, 'w')
         except IOError as e:
-            print '[ERROR] Cannot open output file for writing: ', outfile_fn
+            print('[ERROR] Cannot open output file for writing: ', outfile_fn)
             sys.exit(0)
-        print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
+        print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn))
 
         cf.export_sample(h5file_in, sample, probes, outfile_f)
         outfile_f.close()
 
     else:
         out_path = os.path.abspath(args.output)
-        print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path)
+        print("[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path))
         for sample in args.sample:
             try:
                 if os.path.isdir(out_path):
@@ -309,9 +309,9 @@
                     outfile_fn = out_path
                 outfile_f = open(outfile_fn, 'w')
             except IOError as e:
-                print '[ERROR] Cannot open output file for writing: ', outfile_fn
+                print('[ERROR] Cannot open output file for writing: ', outfile_fn)
                 sys.exit(0)
-            print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
+            print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn))
 
             cf.export_sample(h5file_in, sample, probes, outfile_f)
             outfile_f.close()
@@ -323,14 +323,14 @@
         h5file_in_fn = str(args.input)
         h5file_in = openFile(h5file_in_fn, mode='r')
     except IOError as e:
-        print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
+        print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn)
         sys.exit(0)
 
     try:
         callfile_fn = str(args.output)
         callfile_f = open(callfile_fn, mode='w')
     except IOError as e:
-        print '[ERROR] Cannot open output file for writing: ', callfile_fn
+        print('[ERROR] Cannot open output file for writing: ', callfile_fn)
         sys.exit(0)
 
     chrs_to_process = []
@@ -340,14 +340,14 @@
 
     h5file_in.close()
 
-    print '[INIT] Initializing caller at threshold = %f' % (args.threshold)
+    print('[INIT] Initializing caller at threshold = %f' % (args.threshold))
 
     r = cf.rpkm_reader(h5file_in_fn)
 
     all_calls = []
 
     for chr in chrs_to_process:
-        print '[RUNNING] Now processing chr%s' % chr
+        print('[RUNNING] Now processing chr%s' % chr)
         data = r.getExonValuesByRegion(chr)
 
         # raw_data = copy.copy(data)
@@ -407,7 +407,7 @@
 
     callfile_f.write('\t'.join(header) + "\n")
     for call in all_calls:
-        print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])
+        print("%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
         callfile_f.write(
             "%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
 
@@ -425,7 +425,7 @@
         from matplotlib.patches import Rectangle
         _ = locale.setlocale(locale.LC_ALL, '')
     except:
-        print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
+        print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
         sys.exit(0)
 
     chr, start, stop = cf.parseLocString(args.region)
@@ -448,7 +448,7 @@
             try:
                 color, sampleID = sample.split(":")
             except:
-                color = coloriter.next()
+                color = next(coloriter)
                 sampleID = sample
 
             ax.plot(data.getSample([sampleID]), linewidth=1, c=color, label=sampleID)
@@ -484,7 +484,7 @@
         from matplotlib.lines import Line2D
         from matplotlib.patches import Rectangle
     except:
-        print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
+        print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
         sys.exit(0)
 
     import locale
@@ -497,7 +497,7 @@
         callfile_fn = str(args.calls)
         callfile_f = open(callfile_fn, mode='r')
     except IOError as e:
-        print '[ERROR] Cannot open call file for reading: ', callfile_fn
+        print('[ERROR] Cannot open call file for reading: ', callfile_fn)
         sys.exit(0)
 
     all_calls = []
@@ -560,33 +560,33 @@
     try:
         import pysam
     except:
-        print '[ERROR] Cannot load pysam module! Make sure it is insalled'
+        print('[ERROR] Cannot load pysam module! Make sure it is insalled')
         sys.exit(0)
     try:
         # read probes table
         probe_fn = str(args.probes[0])
         probes = cf.loadProbeList(probe_fn)
         num_probes = len(probes)
-        print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
+        print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn))
     except IOError as e:
-        print '[ERROR] Cannot read probes file: ', probe_fn
+        print('[ERROR] Cannot read probes file: ', probe_fn)
         sys.exit(0)
 
     try:
         rpkm_f = open(args.output[0], 'w')
     except IOError as e:
-        print '[ERROR] Cannot open rpkm file for writing: ', args.output
+        print('[ERROR] Cannot open rpkm file for writing: ', args.output)
         sys.exit(0)
 
-    print "[RUNNING] Counting total number of reads in bam file..."
+    print("[RUNNING] Counting total number of reads in bam file...")
     total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n"))
-    print "[RUNNING] Found %d reads" % total_reads
+    print("[RUNNING] Found %d reads" % total_reads)
 
     f = pysam.Samfile(args.input[0], "rb")
 
     if not f._hasIndex():
-        print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % \
-              args.input[0]
+        print("[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % \
+              args.input[0])
         sys.exit(0)
 
     # will be storing values in these arrays
@@ -609,11 +609,11 @@
         elif cf.chrInt2Str(probes_contig).replace("chr", "") in bam_contigs:
             probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr", "")
         else:
-            print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (
-            probes_contig, probe_fn)
+            print("[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (
+            probes_contig, probe_fn))
             sys.exit(0)
 
-    print "[RUNNING] Calculating RPKM values..."
+    print("[RUNNING] Calculating RPKM values...")
 
     # loop through each probe
     for p in probes:
@@ -627,8 +627,8 @@
         try:
             iter = f.fetch(p_chr, p_start, p_stop)
         except:
-            print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (
-            p_chr, p_start, p_stop)
+            print("[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (
+            p_chr, p_start, p_stop))
             sys.exit(0)
 
         for i in iter: