Mercurial > repos > greg > genetrack
comparison genetrack.py @ 0:0368815ae4d5 draft
Uploaded
author | greg |
---|---|
date | Tue, 17 Nov 2015 14:19:45 -0500 |
parents | |
children | fcc2f5992551 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0368815ae4d5 |
---|---|
1 """ | |
2 genetrack.py | |
3 | |
4 Input: either ssccidx or gff format of reads | |
5 .ssccidx format: tab-separated chromosome (chr##), index, + reads, - reads | |
6 .gff format: standard gff, score interpreted as number of reads | |
7 | |
8 Output: Called peaks in either gff or txt format | |
9 .txt format: tab-separated chromosome, strand, start, end, read count | |
10 .gff format: standard gff, score is read count | |
11 """ | |
12 import argparse | |
13 import csv | |
14 import os | |
15 import genetrack_util | |
16 | |
17 | |
18 if __name__ == '__main__': | |
19 parser = argparse.ArgumentParser() | |
20 parser.add_argument('--input_format', dest='input_format', help='Input format.') | |
21 parser.add_argument('--input', dest='inputs', action='append', nargs=2, help="Input datasets") | |
22 parser.add_argument('--sigma', dest='sigma', type=int, default=5, help='Sigma.') | |
23 parser.add_argument('--exclusion', dest='exclusion', type=int, default=20, help='Exclusion zone.') | |
24 parser.add_argument('--up_width', dest='up_width', type=int, default=10, help='Upstream width of called peaks.') | |
25 parser.add_argument('--down_width', dest='down_width', type=int, default=10, help='Downstream width of called peaks.') | |
26 parser.add_argument('--filter', dest='filter', type=int, default=3, help='Absolute read filter.') | |
27 parser.add_argument('--chunk_size', dest='chunk_size', type=int, default=10, help='Size, in millions of base pairs.') | |
28 args = parser.parse_args() | |
29 | |
30 os.mkdir('output') | |
31 for (dataset_path, hid) in args.inputs: | |
32 if args.input_format == 'gff': | |
33 # Make sure the reads for each chromosome are sorted by index. | |
34 input_path = genetrack_util.sort_chromosome_reads_by_index(dataset_path) | |
35 else: | |
36 # We're processing ssccidx data. | |
37 input_path = dataset_path | |
38 output_name = 's%se%su%sd%sF%s_on_data_%s' % (str(args.sigma), | |
39 str(args.exclusion), | |
40 str(args.up_width), | |
41 str(args.down_width), | |
42 str(args.filter), | |
43 str(hid)) | |
44 output_path = os.path.join('output', output_name) | |
45 reader = csv.reader(open(input_path, 'rU'), delimiter='\t') | |
46 writer = csv.writer(open(output_path, 'wt'), delimiter='\t') | |
47 chunk_size = args.chunk_size * 10 ** 6 | |
48 width = args.sigma * 5 | |
49 manager = genetrack_util.ChromosomeManager(reader) | |
50 while not manager.done: | |
51 cname = manager.chromosome_name() | |
52 # Should we process this chromosome? | |
53 data = manager.load_chromosome() | |
54 if not data: | |
55 continue | |
56 keys = genetrack_util.make_keys(data) | |
57 lo, hi = genetrack_util.get_range(data) | |
58 for chunk in genetrack_util.get_chunks(lo, hi, size=chunk_size, overlap=width): | |
59 (slice_start, slice_end), process_bounds = chunk | |
60 window = genetrack_util.get_window(data, slice_start, slice_end, keys) | |
61 genetrack_util.process_chromosome(cname, | |
62 window, | |
63 writer, | |
64 process_bounds, | |
65 args.width, | |
66 args.sigma, | |
67 args.up_width, | |
68 args.down_width, | |
69 args.exclusion, | |
70 args.filter) |