Mercurial > repos > greg > genetrack
comparison genetrack.py @ 0:0368815ae4d5 draft
Uploaded
| author | greg |
|---|---|
| date | Tue, 17 Nov 2015 14:19:45 -0500 |
| parents | |
| children | fcc2f5992551 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0368815ae4d5 |
|---|---|
| 1 """ | |
| 2 genetrack.py | |
| 3 | |
| 4 Input: either ssccidx or gff format of reads | |
| 5 .ssccidx format: tab-separated chromosome (chr##), index, + reads, - reads | |
| 6 .gff format: standard gff, score interpreted as number of reads | |
| 7 | |
| 8 Output: Called peaks in either gff or txt format | |
| 9 .txt format: tab-separated chromosome, strand, start, end, read count | |
| 10 .gff format: standard gff, score is read count | |
| 11 """ | |
| 12 import argparse | |
| 13 import csv | |
| 14 import os | |
| 15 import genetrack_util | |
| 16 | |
| 17 | |
| 18 if __name__ == '__main__': | |
| 19 parser = argparse.ArgumentParser() | |
| 20 parser.add_argument('--input_format', dest='input_format', help='Input format.') | |
| 21 parser.add_argument('--input', dest='inputs', action='append', nargs=2, help="Input datasets") | |
| 22 parser.add_argument('--sigma', dest='sigma', type=int, default=5, help='Sigma.') | |
| 23 parser.add_argument('--exclusion', dest='exclusion', type=int, default=20, help='Exclusion zone.') | |
| 24 parser.add_argument('--up_width', dest='up_width', type=int, default=10, help='Upstream width of called peaks.') | |
| 25 parser.add_argument('--down_width', dest='down_width', type=int, default=10, help='Downstream width of called peaks.') | |
| 26 parser.add_argument('--filter', dest='filter', type=int, default=3, help='Absolute read filter.') | |
| 27 parser.add_argument('--chunk_size', dest='chunk_size', type=int, default=10, help='Size, in millions of base pairs.') | |
| 28 args = parser.parse_args() | |
| 29 | |
| 30 os.mkdir('output') | |
| 31 for (dataset_path, hid) in args.inputs: | |
| 32 if args.input_format == 'gff': | |
| 33 # Make sure the reads for each chromosome are sorted by index. | |
| 34 input_path = genetrack_util.sort_chromosome_reads_by_index(dataset_path) | |
| 35 else: | |
| 36 # We're processing ssccidx data. | |
| 37 input_path = dataset_path | |
| 38 output_name = 's%se%su%sd%sF%s_on_data_%s' % (str(args.sigma), | |
| 39 str(args.exclusion), | |
| 40 str(args.up_width), | |
| 41 str(args.down_width), | |
| 42 str(args.filter), | |
| 43 str(hid)) | |
| 44 output_path = os.path.join('output', output_name) | |
| 45 reader = csv.reader(open(input_path, 'rU'), delimiter='\t') | |
| 46 writer = csv.writer(open(output_path, 'wt'), delimiter='\t') | |
| 47 chunk_size = args.chunk_size * 10 ** 6 | |
| 48 width = args.sigma * 5 | |
| 49 manager = genetrack_util.ChromosomeManager(reader) | |
| 50 while not manager.done: | |
| 51 cname = manager.chromosome_name() | |
| 52 # Should we process this chromosome? | |
| 53 data = manager.load_chromosome() | |
| 54 if not data: | |
| 55 continue | |
| 56 keys = genetrack_util.make_keys(data) | |
| 57 lo, hi = genetrack_util.get_range(data) | |
| 58 for chunk in genetrack_util.get_chunks(lo, hi, size=chunk_size, overlap=width): | |
| 59 (slice_start, slice_end), process_bounds = chunk | |
| 60 window = genetrack_util.get_window(data, slice_start, slice_end, keys) | |
| 61 genetrack_util.process_chromosome(cname, | |
| 62 window, | |
| 63 writer, | |
| 64 process_bounds, | |
| 65 args.width, | |
| 66 args.sigma, | |
| 67 args.up_width, | |
| 68 args.down_width, | |
| 69 args.exclusion, | |
| 70 args.filter) |
