annotate genetrack.py @ 0:0368815ae4d5 draft

Uploaded
author greg
date Tue, 17 Nov 2015 14:19:45 -0500
parents
children fcc2f5992551
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
1 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
2 genetrack.py
0368815ae4d5 Uploaded
greg
parents:
diff changeset
3
0368815ae4d5 Uploaded
greg
parents:
diff changeset
4 Input: either ssccidx or gff format of reads
0368815ae4d5 Uploaded
greg
parents:
diff changeset
5 .ssccidx format: tab-separated chromosome (chr##), index, + reads, - reads
0368815ae4d5 Uploaded
greg
parents:
diff changeset
6 .gff format: standard gff, score interpreted as number of reads
0368815ae4d5 Uploaded
greg
parents:
diff changeset
7
0368815ae4d5 Uploaded
greg
parents:
diff changeset
8 Output: Called peaks in either gff or txt format
0368815ae4d5 Uploaded
greg
parents:
diff changeset
9 .txt format: tab-separated chromosome, strand, start, end, read count
0368815ae4d5 Uploaded
greg
parents:
diff changeset
10 .gff format: standard gff, score is read count
0368815ae4d5 Uploaded
greg
parents:
diff changeset
11 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
12 import argparse
0368815ae4d5 Uploaded
greg
parents:
diff changeset
13 import csv
0368815ae4d5 Uploaded
greg
parents:
diff changeset
14 import os
0368815ae4d5 Uploaded
greg
parents:
diff changeset
15 import genetrack_util
0368815ae4d5 Uploaded
greg
parents:
diff changeset
16
0368815ae4d5 Uploaded
greg
parents:
diff changeset
17
0368815ae4d5 Uploaded
greg
parents:
diff changeset
18 if __name__ == '__main__':
0368815ae4d5 Uploaded
greg
parents:
diff changeset
19 parser = argparse.ArgumentParser()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
20 parser.add_argument('--input_format', dest='input_format', help='Input format.')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
21 parser.add_argument('--input', dest='inputs', action='append', nargs=2, help="Input datasets")
0368815ae4d5 Uploaded
greg
parents:
diff changeset
22 parser.add_argument('--sigma', dest='sigma', type=int, default=5, help='Sigma.')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
23 parser.add_argument('--exclusion', dest='exclusion', type=int, default=20, help='Exclusion zone.')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
24 parser.add_argument('--up_width', dest='up_width', type=int, default=10, help='Upstream width of called peaks.')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
25 parser.add_argument('--down_width', dest='down_width', type=int, default=10, help='Downstream width of called peaks.')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
26 parser.add_argument('--filter', dest='filter', type=int, default=3, help='Absolute read filter.')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
27 parser.add_argument('--chunk_size', dest='chunk_size', type=int, default=10, help='Size, in millions of base pairs.')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
28 args = parser.parse_args()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
29
0368815ae4d5 Uploaded
greg
parents:
diff changeset
30 os.mkdir('output')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
31 for (dataset_path, hid) in args.inputs:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
32 if args.input_format == 'gff':
0368815ae4d5 Uploaded
greg
parents:
diff changeset
33 # Make sure the reads for each chromosome are sorted by index.
0368815ae4d5 Uploaded
greg
parents:
diff changeset
34 input_path = genetrack_util.sort_chromosome_reads_by_index(dataset_path)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
35 else:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
36 # We're processing ssccidx data.
0368815ae4d5 Uploaded
greg
parents:
diff changeset
37 input_path = dataset_path
0368815ae4d5 Uploaded
greg
parents:
diff changeset
38 output_name = 's%se%su%sd%sF%s_on_data_%s' % (str(args.sigma),
0368815ae4d5 Uploaded
greg
parents:
diff changeset
39 str(args.exclusion),
0368815ae4d5 Uploaded
greg
parents:
diff changeset
40 str(args.up_width),
0368815ae4d5 Uploaded
greg
parents:
diff changeset
41 str(args.down_width),
0368815ae4d5 Uploaded
greg
parents:
diff changeset
42 str(args.filter),
0368815ae4d5 Uploaded
greg
parents:
diff changeset
43 str(hid))
0368815ae4d5 Uploaded
greg
parents:
diff changeset
44 output_path = os.path.join('output', output_name)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
45 reader = csv.reader(open(input_path, 'rU'), delimiter='\t')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
46 writer = csv.writer(open(output_path, 'wt'), delimiter='\t')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
47 chunk_size = args.chunk_size * 10 ** 6
0368815ae4d5 Uploaded
greg
parents:
diff changeset
48 width = args.sigma * 5
0368815ae4d5 Uploaded
greg
parents:
diff changeset
49 manager = genetrack_util.ChromosomeManager(reader)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
50 while not manager.done:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
51 cname = manager.chromosome_name()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
52 # Should we process this chromosome?
0368815ae4d5 Uploaded
greg
parents:
diff changeset
53 data = manager.load_chromosome()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
54 if not data:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
55 continue
0368815ae4d5 Uploaded
greg
parents:
diff changeset
56 keys = genetrack_util.make_keys(data)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
57 lo, hi = genetrack_util.get_range(data)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
58 for chunk in genetrack_util.get_chunks(lo, hi, size=chunk_size, overlap=width):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
59 (slice_start, slice_end), process_bounds = chunk
0368815ae4d5 Uploaded
greg
parents:
diff changeset
60 window = genetrack_util.get_window(data, slice_start, slice_end, keys)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
61 genetrack_util.process_chromosome(cname,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
62 window,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
63 writer,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
64 process_bounds,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
65 args.width,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
66 args.sigma,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
67 args.up_width,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
68 args.down_width,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
69 args.exclusion,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
70 args.filter)