Mercurial > repos > mvdbeek > generate_sliding_windows
comparison generate_sliding_windows.py @ 0:559cf4ca1f2d draft
Uploaded
author | mvdbeek |
---|---|
date | Wed, 15 Apr 2015 06:34:23 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:559cf4ca1f2d |
---|---|
1 #!/usr/bin/env python | |
2 from Bio import SeqIO | |
3 import argparse | |
4 import sys | |
5 | |
def generate_windows(seq, window, step):
    '''
    Generate fixed-size windows of a sequence, with the distance between
    the starts of adjacent windows defined by *step*.

    seq -- string to split into windows.
    window -- integer specifying the size of the generated fragments.
    step -- integer specifying the distance between adjacent fragments.

    Yields (fragment, start, stop) tuples, where start and stop are the
    1-based, inclusive coordinates of the fragment within *seq*. Only full
    windows are produced; a trailing remainder shorter than *window* is
    skipped.
    '''
    # range() excludes its upper bound, so use len(seq) + 1: otherwise a
    # window ending exactly on the last character would be dropped.
    for stop in range(window, len(seq) + 1, step):
        start = stop - window
        # start + 1 converts the 0-based slice start to a 1-based coordinate.
        yield (seq[start:stop], start + 1, stop)
23 | |
24 | |
def write_fragment(description, output_handle, fragment, start, stop):
    '''
    Emit a single fragment as a fasta record on *output_handle*.

    The header line consists of the description followed by the start and
    stop coordinates; the fragment itself goes on the following line.
    '''
    record_template = ">{0}_start:{1}_stop:{2}\n{3}\n"
    output_handle.write(record_template.format(description, start, stop, fragment))
29 | |
30 | |
def handle_io(input, output, window=21, step=21):
    '''
    Read fasta records from *input* and write the windows generated from
    each record to *output* as fasta.

    Keyword arguments:
    input -- file handle for fasta file containing sequences for which you wish to generate fragments.
    output -- file handle for the multi-fasta that will contain the generated fragments.
    window -- integer specifying the size of the fragments.
    step -- integer specifying the distance between adjacent fragments.

    Both handles are closed before returning, also when parsing or writing
    raises an exception.
    '''
    try:
        for entry in SeqIO.parse(input, "fasta"):
            seq = str(entry.seq)
            description = str(entry.description)
            # Plain loop instead of a comprehension: the writes are side
            # effects and no result list is needed.
            for fragment, start, stop in generate_windows(seq, window, step):
                write_fragment(description, output, fragment, start, stop)
    finally:
        # Close the input even if closing the output fails.
        try:
            output.close()
        finally:
            input.close()
47 | |
def positive_int(val):
    '''
    Convert *val* to an integer and require it to be greater than zero.

    Intended as a ``type=`` callable for argparse: returns int(val) when it
    is a positive integer, otherwise raises argparse.ArgumentTypeError so
    that argparse reports a clean usage error.
    '''
    message = "'%s' is not a valid positive int" % val
    try:
        number = int(val)
    except (TypeError, ValueError, OverflowError):
        raise argparse.ArgumentTypeError(message)
    # Explicit check rather than assert: asserts are stripped under -O.
    if number <= 0:
        raise argparse.ArgumentTypeError(message)
    return number
54 | |
if __name__ == "__main__":

    # Command-line entry point: parse the options, then stream the windows
    # generated from the input fasta into the output.
    cli = argparse.ArgumentParser(
        description='Generate fixed size windows in fasta format from multi-fasta sequence.')

    # (flag, add_argument keyword settings), in the order they appear in --help.
    option_specs = (
        ('--input', dict(type=argparse.FileType('r'), required=True,
                         help='supply an input multi-fasta file.')),
        ('--output', dict(type=argparse.FileType('w'), default=sys.stdout,
                          help='supply an output multi-fasta file. If not specified use stdout.')),
        ('--window', dict(type=positive_int, default=21,
                          help='Set the size of the generated windows')),
        ('--step', dict(type=positive_int, default=21,
                        help='Set distance between the windows')),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    options = cli.parse_args()
    handle_io(options.input, options.output, options.window, options.step)