diff generate_sliding_windows.py @ 0:559cf4ca1f2d draft

Uploaded
author mvdbeek
date Wed, 15 Apr 2015 06:34:23 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/generate_sliding_windows.py	Wed Apr 15 06:34:23 2015 -0400
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+from Bio import SeqIO
+import argparse
+import sys
+
+def generate_windows(seq, window, step):
+    '''
+    Generates windows of a sequence, with the distance of windows
+    defined by *step*.
+    
+    seq -- string to split into windows.
+    window -- integer specifying the size the generated fragments.
+    step -- integer specifiying the distance between adjacent fragments.
+    '''
+    stop = window
+    end = len(seq)
+    for i in range(stop, end, step):
+        start = stop-window
+        fragment = seq[start:stop]
+        stop_coordinate = stop #to return real stop coordinate
+        stop = stop+step
+        yield (fragment, start+1, stop_coordinate) #start+1 to adjust 0-based range
+        
+        
+def write_fragment(description, output_handle, fragment, start, stop):
+    '''Write out fragments as fasta with description and start/stop coordinates as fasta header'''
+    output_string = ">{0}_start:{1}_stop:{2}\n{3}\n".format(description, start, stop, fragment)
+    output_handle.write(output_string)
+
+    
+def handle_io(input, output, window = 21, step= 21):
+    '''
+    Keyword arguments:
+    input -- file handle for fasta file containing sequences for which you wish to generate fragments.
+    output -- file handle for the multi-fasta that will contain the generated fragments.
+    window -- integer specifying the size of the fragments.
+    step -- integer specifiying the distance between adjacent fragments.
+    '''
+    record_iterator = SeqIO.parse(input, "fasta")
+    for entry in record_iterator:
+        seq = str(entry.seq)
+        description = str(entry.description)
+        windows = generate_windows(seq, window, step)
+        [write_fragment(description, output, *fragment) for fragment in windows]
+    output.close()
+    input.close()
+
+def positive_int(val):
+    try:
+        assert(int(val) > 0)
+    except:
+        raise ArgumentTypeError("'%s' is not a valid positive int" % val)
+    return int(val)
+    
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description='Generate fixed size windows in fasta format from multi-fasta sequence.')
+    parser.add_argument('--input', type=argparse.FileType('r'), required=True,
+                help='supply an input multi-fasta file.')
+    parser.add_argument('--output', type=argparse.FileType('w'), default=sys.stdout, 
+                help='supply an output multi-fasta file. If not specified use stdout.')
+    parser.add_argument('--window', type=positive_int, default=21,
+               help='Set the size of the generated windows')
+    parser.add_argument('--step', type=positive_int, default=21,
+               help='Set distance between the windows')
+    args = parser.parse_args()
+
+    handle_io(args.input, args.output, args.window, args.step)