0
|
1 """
|
|
2 genetrack.py
|
|
3
|
|
4 Input: either ssccidx or gff format of reads
|
|
5 .ssccidx format: tab-separated chromosome (chr##), index, + reads, - reads
|
|
6 .gff format: standard gff, score interpreted as number of reads
|
|
7
|
|
8 Output: Called peaks in either gff or txt format
|
|
9 .txt format: tab-separated chromosome, strand, start, end, read count
|
|
10 .gff format: standard gff, score is read count
|
|
11 """
|
2
|
12 import optparse
|
0
|
13 import csv
|
|
14 import os
|
|
15 import genetrack_util
|
|
16
|
|
17
|
|
18 if __name__ == '__main__':
|
2
|
19 parser = optparse.OptionParser()
|
|
20 parser.add_option('-t', '--input_format', dest='input_format', type='string', help='Input format')
|
|
21 parser.add_option('-i', '--input', dest='inputs', type='string', action='append', nargs=2, help='Input datasets')
|
|
22 parser.add_option('-s', '--sigma', dest='sigma', type='int', default=5, help='Sigma.')
|
|
23 parser.add_option('-e', '--exclusion', dest='exclusion', type='int', default=20, help='Exclusion zone.')
|
|
24 parser.add_option('-u', '--up_width', dest='up_width', type='int', default=10, help='Upstream width of called peaks.')
|
|
25 parser.add_option('-d', '--down_width', dest='down_width', type='int', default=10, help='Downstream width of called peaks.')
|
|
26 parser.add_option('-f', '--filter', dest='filter', type='int', default=3, help='Absolute read filter.')
|
|
27 parser.add_option('-c', '--chunk_size', dest='chunk_size', type='int', default=10, help='Size, in millions of base pairs.')
|
|
28 options, args = parser.parse_args()
|
0
|
29
|
|
30 os.mkdir('output')
|
2
|
31 for (dataset_path, hid) in options.inputs:
|
|
32 if options.input_format == 'gff':
|
0
|
33 # Make sure the reads for each chromosome are sorted by index.
|
|
34 input_path = genetrack_util.sort_chromosome_reads_by_index(dataset_path)
|
|
35 else:
|
|
36 # We're processing ssccidx data.
|
|
37 input_path = dataset_path
|
2
|
38 output_name = 's%se%su%sd%sF%s_on_data_%s' % (str(options.sigma),
|
|
39 str(options.exclusion),
|
|
40 str(options.up_width),
|
|
41 str(options.down_width),
|
|
42 str(options.filter),
|
0
|
43 str(hid))
|
|
44 output_path = os.path.join('output', output_name)
|
|
45 reader = csv.reader(open(input_path, 'rU'), delimiter='\t')
|
|
46 writer = csv.writer(open(output_path, 'wt'), delimiter='\t')
|
2
|
47 chunk_size = options.chunk_size * 10 ** 6
|
|
48 width = options.sigma * 5
|
0
|
49 manager = genetrack_util.ChromosomeManager(reader)
|
|
50 while not manager.done:
|
|
51 cname = manager.chromosome_name()
|
|
52 # Should we process this chromosome?
|
|
53 data = manager.load_chromosome()
|
|
54 if not data:
|
|
55 continue
|
|
56 keys = genetrack_util.make_keys(data)
|
|
57 lo, hi = genetrack_util.get_range(data)
|
|
58 for chunk in genetrack_util.get_chunks(lo, hi, size=chunk_size, overlap=width):
|
|
59 (slice_start, slice_end), process_bounds = chunk
|
|
60 window = genetrack_util.get_window(data, slice_start, slice_end, keys)
|
|
61 genetrack_util.process_chromosome(cname,
|
|
62 window,
|
|
63 writer,
|
|
64 process_bounds,
|
2
|
65 width,
|
|
66 options.sigma,
|
|
67 options.up_width,
|
|
68 options.down_width,
|
|
69 options.exclusion,
|
|
70 options.filter)
|