annotate generate_sliding_windows.py @ 2:22dc61eaed53 draft

Uploaded
author mvdbeek
date Wed, 15 Apr 2015 06:38:06 -0400
parents 559cf4ca1f2d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
1 #!/usr/bin/env python
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
2 from Bio import SeqIO
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
3 import argparse
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
4 import sys
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
5
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
6 def generate_windows(seq, window, step):
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
7 '''
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
8 Generates windows of a sequence, with the distance of windows
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
9 defined by *step*.
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
10
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
11 seq -- string to split into windows.
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
12 window -- integer specifying the size the generated fragments.
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
13 step -- integer specifiying the distance between adjacent fragments.
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
14 '''
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
15 stop = window
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
16 end = len(seq)
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
17 for i in range(stop, end, step):
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
18 start = stop-window
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
19 fragment = seq[start:stop]
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
20 stop_coordinate = stop #to return real stop coordinate
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
21 stop = stop+step
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
22 yield (fragment, start+1, stop_coordinate) #start+1 to adjust 0-based range
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
23
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
24
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
25 def write_fragment(description, output_handle, fragment, start, stop):
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
26 '''Write out fragments as fasta with description and start/stop coordinates as fasta header'''
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
27 output_string = ">{0}_start:{1}_stop:{2}\n{3}\n".format(description, start, stop, fragment)
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
28 output_handle.write(output_string)
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
29
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
30
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
31 def handle_io(input, output, window = 21, step= 21):
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
32 '''
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
33 Keyword arguments:
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
34 input -- file handle for fasta file containing sequences for which you wish to generate fragments.
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
35 output -- file handle for the multi-fasta that will contain the generated fragments.
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
36 window -- integer specifying the size of the fragments.
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
37 step -- integer specifiying the distance between adjacent fragments.
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
38 '''
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
39 record_iterator = SeqIO.parse(input, "fasta")
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
40 for entry in record_iterator:
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
41 seq = str(entry.seq)
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
42 description = str(entry.description)
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
43 windows = generate_windows(seq, window, step)
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
44 [write_fragment(description, output, *fragment) for fragment in windows]
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
45 output.close()
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
46 input.close()
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
47
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
48 def positive_int(val):
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
49 try:
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
50 assert(int(val) > 0)
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
51 except:
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
52 raise ArgumentTypeError("'%s' is not a valid positive int" % val)
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
53 return int(val)
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
54
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
55 if __name__ == "__main__":
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
56
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
57 parser = argparse.ArgumentParser(description='Generate fixed size windows in fasta format from multi-fasta sequence.')
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
58 parser.add_argument('--input', type=argparse.FileType('r'), required=True,
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
59 help='supply an input multi-fasta file.')
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
60 parser.add_argument('--output', type=argparse.FileType('w'), default=sys.stdout,
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
61 help='supply an output multi-fasta file. If not specified use stdout.')
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
62 parser.add_argument('--window', type=positive_int, default=21,
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
63 help='Set the size of the generated windows')
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
64 parser.add_argument('--step', type=positive_int, default=21,
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
65 help='Set distance between the windows')
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
66 args = parser.parse_args()
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
67
559cf4ca1f2d Uploaded
mvdbeek
parents:
diff changeset
68 handle_io(args.input, args.output, args.window, args.step)