diff genetrack.py @ 0:0368815ae4d5 draft

Uploaded
author greg
date Tue, 17 Nov 2015 14:19:45 -0500
parents
children fcc2f5992551
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/genetrack.py	Tue Nov 17 14:19:45 2015 -0500
@@ -0,0 +1,70 @@
+"""
+genetrack.py
+
+Input: either ssccidx or gff format of reads
+.ssccidx format: tab-separated chromosome (chr##), index, + reads, - reads
+.gff format: standard gff, score interpreted as number of reads
+
+Output: Called peaks in either gff or txt format
+.txt format: tab-separated chromosome, strand, start, end, read count
+.gff format: standard gff, score is read count
+"""
+import argparse
+import csv
+import os
+import genetrack_util
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_format', dest='input_format', help='Input format.')
+    parser.add_argument('--input', dest='inputs', action='append', nargs=2, help="Input datasets")
+    parser.add_argument('--sigma', dest='sigma', type=int, default=5, help='Sigma.')
+    parser.add_argument('--exclusion', dest='exclusion', type=int, default=20, help='Exclusion zone.')
+    parser.add_argument('--up_width', dest='up_width', type=int, default=10, help='Upstream width of called peaks.')
+    parser.add_argument('--down_width', dest='down_width', type=int, default=10, help='Downstream width of called peaks.')
+    parser.add_argument('--filter', dest='filter', type=int, default=3, help='Absolute read filter.')
+    parser.add_argument('--chunk_size', dest='chunk_size', type=int, default=10, help='Size, in millions of base pairs.')
+    args = parser.parse_args()
+
+    os.mkdir('output')
+    for (dataset_path, hid) in args.inputs:
+        if args.input_format == 'gff':
+            # Make sure the reads for each chromosome are sorted by index.
+            input_path = genetrack_util.sort_chromosome_reads_by_index(dataset_path)
+        else:
+            # We're processing ssccidx data.
+            input_path = dataset_path
+        output_name = 's%se%su%sd%sF%s_on_data_%s' % (str(args.sigma),
+                                                      str(args.exclusion),
+                                                      str(args.up_width),
+                                                      str(args.down_width),
+                                                      str(args.filter),
+                                                      str(hid))
+        output_path = os.path.join('output', output_name)
+        reader = csv.reader(open(input_path, 'rU'), delimiter='\t')
+        writer = csv.writer(open(output_path, 'wt'), delimiter='\t')
+        chunk_size = args.chunk_size * 10 ** 6
+        width = args.sigma * 5
+        manager = genetrack_util.ChromosomeManager(reader)
+        while not manager.done:
+            cname = manager.chromosome_name()
+            # Should we process this chromosome?
+            data = manager.load_chromosome()
+            if not data:
+                continue
+            keys = genetrack_util.make_keys(data)
+            lo, hi = genetrack_util.get_range(data)
+            for chunk in genetrack_util.get_chunks(lo, hi, size=chunk_size, overlap=width):
+                (slice_start, slice_end), process_bounds = chunk
+                window = genetrack_util.get_window(data, slice_start, slice_end, keys)
+                genetrack_util.process_chromosome(cname,
+                                                  window,
+                                                  writer,
+                                                  process_bounds,
+                                                  args.width,
+                                                  args.sigma,
+                                                  args.up_width,
+                                                  args.down_width,
+                                                  args.exclusion,
+                                                  args.filter)