comparison genetrack.py @ 10:1a9f1a4fa36c draft

Uploaded
author greg
date Wed, 02 Dec 2015 16:14:58 -0500
parents 551630d1fae3
children 6ad44f393892
comparison
equal deleted inserted replaced
9:e10c1ddd440e 10:1a9f1a4fa36c
7 import optparse 7 import optparse
8 import csv 8 import csv
9 import os 9 import os
10 import genetrack_util 10 import genetrack_util
11 11
12 CHUNK_SIZE = 10000000
13
12 14
13 if __name__ == '__main__': 15 if __name__ == '__main__':
14 parser = optparse.OptionParser() 16 parser = optparse.OptionParser()
15 parser.add_option('-t', '--input_format', dest='input_format', type='string', help='Input format') 17 parser.add_option('-t', '--input_format', dest='input_format', type='string', help='Input format')
16 parser.add_option('-i', '--input', dest='inputs', type='string', action='append', nargs=2, help='Input datasets') 18 parser.add_option('-i', '--input', dest='inputs', type='string', action='append', nargs=2, help='Input datasets')
17 parser.add_option('-s', '--sigma', dest='sigma', type='int', default=5, help='Sigma.') 19 parser.add_option('-s', '--sigma', dest='sigma', type='int', default=5, help='Sigma.')
18 parser.add_option('-e', '--exclusion', dest='exclusion', type='int', default=20, help='Exclusion zone.') 20 parser.add_option('-e', '--exclusion', dest='exclusion', type='int', default=20, help='Exclusion zone.')
19 parser.add_option('-u', '--up_width', dest='up_width', type='int', default=10, help='Upstream width of called peaks.') 21 parser.add_option('-u', '--up_width', dest='up_width', type='int', default=10, help='Upstream width of called peaks.')
20 parser.add_option('-d', '--down_width', dest='down_width', type='int', default=10, help='Downstream width of called peaks.') 22 parser.add_option('-d', '--down_width', dest='down_width', type='int', default=10, help='Downstream width of called peaks.')
21 parser.add_option('-f', '--filter', dest='filter', type='int', default=3, help='Absolute read filter.') 23 parser.add_option('-f', '--filter', dest='filter', type='int', default=3, help='Absolute read filter.')
22 parser.add_option('-c', '--chunk_size', dest='chunk_size', type='int', default=10, help='Size, in millions of base pairs.')
23 options, args = parser.parse_args() 24 options, args = parser.parse_args()
24 25
25 os.mkdir('output') 26 os.mkdir('output')
26 for (dataset_path, hid) in options.inputs: 27 for (dataset_path, hid) in options.inputs:
27 if options.input_format == 'gff': 28 if options.input_format == 'gff':
37 str(options.filter), 38 str(options.filter),
38 str(hid)) 39 str(hid))
39 output_path = os.path.join('output', output_name) 40 output_path = os.path.join('output', output_name)
40 reader = csv.reader(open(input_path, 'rU'), delimiter='\t') 41 reader = csv.reader(open(input_path, 'rU'), delimiter='\t')
41 writer = csv.writer(open(output_path, 'wt'), delimiter='\t') 42 writer = csv.writer(open(output_path, 'wt'), delimiter='\t')
42 chunk_size = options.chunk_size * 10 ** 6
43 width = options.sigma * 5 43 width = options.sigma * 5
44 manager = genetrack_util.ChromosomeManager(reader) 44 manager = genetrack_util.ChromosomeManager(reader)
45 while not manager.done: 45 while not manager.done:
46 cname = manager.chromosome_name() 46 cname = manager.chromosome_name()
47 # Should we process this chromosome? 47 # Should we process this chromosome?
48 data = manager.load_chromosome() 48 data = manager.load_chromosome()
49 if not data: 49 if not data:
50 continue 50 continue
51 keys = genetrack_util.make_keys(data) 51 keys = genetrack_util.make_keys(data)
52 lo, hi = genetrack_util.get_range(data) 52 lo, hi = genetrack_util.get_range(data)
53 for chunk in genetrack_util.get_chunks(lo, hi, size=chunk_size, overlap=width): 53 for chunk in genetrack_util.get_chunks(lo, hi, size=CHUNK_SIZE, overlap=width):
54 (slice_start, slice_end), process_bounds = chunk 54 (slice_start, slice_end), process_bounds = chunk
55 window = genetrack_util.get_window(data, slice_start, slice_end, keys) 55 window = genetrack_util.get_window(data, slice_start, slice_end, keys)
56 genetrack_util.process_chromosome(cname, 56 genetrack_util.process_chromosome(cname,
57 window, 57 window,
58 writer, 58 writer,