Mercurial > repos > greg > genetrack
diff genetrack.py @ 0:0368815ae4d5 draft
Uploaded
author | greg |
---|---|
date | Tue, 17 Nov 2015 14:19:45 -0500 |
parents | |
children | fcc2f5992551 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genetrack.py Tue Nov 17 14:19:45 2015 -0500 @@ -0,0 +1,70 @@ +""" +genetrack.py + +Input: either ssccidx or gff format of reads +.ssccidx format: tab-separated chromosome (chr##), index, + reads, - reads +.gff format: standard gff, score interpreted as number of reads + +Output: Called peaks in either gff or txt format +.txt format: tab-separated chromosome, strand, start, end, read count +.gff format: standard gff, score is read count +""" +import argparse +import csv +import os +import genetrack_util + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--input_format', dest='input_format', help='Input format.') + parser.add_argument('--input', dest='inputs', action='append', nargs=2, help="Input datasets") + parser.add_argument('--sigma', dest='sigma', type=int, default=5, help='Sigma.') + parser.add_argument('--exclusion', dest='exclusion', type=int, default=20, help='Exclusion zone.') + parser.add_argument('--up_width', dest='up_width', type=int, default=10, help='Upstream width of called peaks.') + parser.add_argument('--down_width', dest='down_width', type=int, default=10, help='Downstream width of called peaks.') + parser.add_argument('--filter', dest='filter', type=int, default=3, help='Absolute read filter.') + parser.add_argument('--chunk_size', dest='chunk_size', type=int, default=10, help='Size, in millions of base pairs.') + args = parser.parse_args() + + os.mkdir('output') + for (dataset_path, hid) in args.inputs: + if args.input_format == 'gff': + # Make sure the reads for each chromosome are sorted by index. + input_path = genetrack_util.sort_chromosome_reads_by_index(dataset_path) + else: + # We're processing ssccidx data. + input_path = dataset_path + output_name = 's%se%su%sd%sF%s_on_data_%s' % (str(args.sigma), + str(args.exclusion), + str(args.up_width), + str(args.down_width), + str(args.filter), + str(hid)) + output_path = os.path.join('output', output_name) + reader = csv.reader(open(input_path, 'rU'), delimiter='\t') + writer = csv.writer(open(output_path, 'wt'), delimiter='\t') + chunk_size = args.chunk_size * 10 ** 6 + width = args.sigma * 5 + manager = genetrack_util.ChromosomeManager(reader) + while not manager.done: + cname = manager.chromosome_name() + # Should we process this chromosome? + data = manager.load_chromosome() + if not data: + continue + keys = genetrack_util.make_keys(data) + lo, hi = genetrack_util.get_range(data) + for chunk in genetrack_util.get_chunks(lo, hi, size=chunk_size, overlap=width): + (slice_start, slice_end), process_bounds = chunk + window = genetrack_util.get_window(data, slice_start, slice_end, keys) + genetrack_util.process_chromosome(cname, + window, + writer, + process_bounds, + args.width, + args.sigma, + args.up_width, + args.down_width, + args.exclusion, + args.filter)