0
|
1 #!/usr/bin/env python
|
|
2 from Bio import SeqIO
|
|
3 import argparse
|
|
4 import sys
|
|
5
|
|
def generate_windows(seq, window, step):
    '''
    Generate fixed-size windows of a sequence, with the distance between
    the starts of adjacent windows defined by *step*.

    seq -- string to split into windows.
    window -- integer specifying the size of the generated fragments.
    step -- integer specifying the distance between adjacent fragments.

    Yields (fragment, start, stop) tuples, where start and stop are the
    1-based, inclusive coordinates of the fragment within seq. For a
    positive window only full-length fragments are produced; a trailing
    remainder shorter than *window* is skipped.
    '''
    # The upper bound is len(seq) + 1 because range() excludes its end value:
    # without the +1 a window ending exactly on the last character of seq
    # (e.g. len(seq) == window) would never be yielded.
    for stop in range(window, len(seq) + 1, step):
        start = stop - window
        # start + 1 converts the 0-based slice offset to a 1-based coordinate.
        yield (seq[start:stop], start + 1, stop)
|
|
23
|
|
24
|
|
def write_fragment(description, output_handle, fragment, start, stop):
    '''
    Write one fragment to output_handle as a fasta record.

    The header line is the description followed by the start and stop
    coordinates; the fragment itself is written on the following line.
    '''
    record_template = ">{0}_start:{1}_stop:{2}\n{3}\n"
    fields = (description, start, stop, fragment)
    # Emit header and sequence in a single write call.
    output_handle.write(record_template.format(*fields))
|
|
29
|
|
30
|
|
def handle_io(input, output, window=21, step=21):
    '''
    Read fasta records from input and write their windows to output.

    Keyword arguments:
    input -- file handle for fasta file containing sequences for which you wish to generate fragments.
    output -- file handle for the multi-fasta that will contain the generated fragments.
    window -- integer specifying the size of the fragments.
    step -- integer specifying the distance between adjacent fragments.

    Both handles are closed before returning, including when parsing or
    writing raises an exception.
    '''
    # NOTE: the parameter name `input` shadows the builtin; it is kept so that
    # existing keyword-argument callers continue to work.
    try:
        for entry in SeqIO.parse(input, "fasta"):
            seq = str(entry.seq)
            description = str(entry.description)
            # Plain loop instead of a list comprehension: the writes are side
            # effects, so there is no result list worth building.
            for fragment, start, stop in generate_windows(seq, window, step):
                write_fragment(description, output, fragment, start, stop)
    finally:
        # Nested finally so that input is closed even if closing output fails.
        try:
            output.close()
        finally:
            input.close()
|
|
47
|
|
def positive_int(val):
    '''
    argparse type converter: return val as an int if it is strictly positive.

    val -- value to convert (argparse passes the raw command-line string).

    Raises argparse.ArgumentTypeError when val cannot be converted to an int
    or the resulting int is not greater than zero.
    '''
    try:
        number = int(val)
    except (TypeError, ValueError, OverflowError):
        number = None
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if number is None or number <= 0:
        # Must be referenced through the argparse module: the bare name
        # ArgumentTypeError is not imported and would raise NameError.
        raise argparse.ArgumentTypeError("'%s' is not a valid positive int" % val)
    return number
|
|
54
|
|
if __name__ == "__main__":

    # Command-line entry point: declare the options, parse them, then run.
    parser = argparse.ArgumentParser(description='Generate fixed size windows in fasta format from multi-fasta sequence.')

    # One (flag, keyword-arguments) pair per command-line option.
    argument_specs = (
        ('--input', dict(type=argparse.FileType('r'), required=True,
                         help='supply an input multi-fasta file.')),
        ('--output', dict(type=argparse.FileType('w'), default=sys.stdout,
                          help='supply an output multi-fasta file. If not specified use stdout.')),
        ('--window', dict(type=positive_int, default=21,
                          help='Set the size of the generated windows')),
        ('--step', dict(type=positive_int, default=21,
                        help='Set distance between the windows')),
    )
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    handle_io(args.input, args.output, args.window, args.step)
|