comparison yac.py @ 1:e5ef40107f54 draft

Uploaded
author mvdbeek
date Mon, 30 Mar 2015 11:40:52 -0400
parents 2445856981a1
children
comparison
equal deleted inserted replaced
0:2445856981a1 1:e5ef40107f54
1 #!/usr/bin/python 1 #!/usr/bin/python
2 # yac = yet another clipper 2 # yac = yet another clipper
3 # v 1.2.1 - 23-08-2014 - Support FastQ output
3 # v 1.1.0 - 23-08-2014 - argparse implementation 4 # v 1.1.0 - 23-08-2014 - argparse implementation
4 # Usage yac.py $input $output $adapter_to_clip $min $max $Nmode 5 # Usage yac.py $input $output $adapter_to_clip $min $max $Nmode
5 # Christophe Antoniewski <drosofff@gmail.com> 6 # Christophe Antoniewski <drosofff@gmail.com>
6 7
7 import sys, string, argparse 8 import sys
9 import string
10 import argparse
11 from itertools import islice
12
8 13
9 def Parser(): 14 def Parser():
10 the_parser = argparse.ArgumentParser() 15 the_parser = argparse.ArgumentParser()
11 the_parser.add_argument('--input', action="store", type=str, help="input fastq file") 16 the_parser.add_argument(
12 the_parser.add_argument('--output', action="store", type=str, help="output, clipped fasta file") 17 '--input', action="store", nargs='+', help="input fastq files")
13 the_parser.add_argument('--adapter_to_clip', action="store", type=str, help="adapter sequence to clip") 18 the_parser.add_argument(
14 the_parser.add_argument('--min', action="store", type=int, help="minimal size of clipped sequence to keep") 19 '--output', action="store", type=str, help="output, clipped fasta file")
15 the_parser.add_argument('--max', action="store", type=int, help="maximal size of clipped sequence to keep") 20 the_parser.add_argument(
16 the_parser.add_argument('--Nmode', action="store", type=str, choices=["accept", "reject"], help="accept or reject sequences with N for clipping") 21 '--output_format', action="store", type=str, help="output format, fasta or fastq")
17 args = the_parser.parse_args() 22 the_parser.add_argument(
18 args.adapter_to_clip = args.adapter_to_clip.upper() 23 '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip")
19 return args 24 the_parser.add_argument(
25 '--min', action="store", type=int, help="minimal size of clipped sequence to keep")
26 the_parser.add_argument(
27 '--max', action="store", type=int, help="maximal size of clipped sequence to keep")
28 the_parser.add_argument('--Nmode', action="store", type=str, choices=[
29 "accept", "reject"], help="accept or reject sequences with N for clipping")
30 args = the_parser.parse_args()
31 args.adapter_to_clip = args.adapter_to_clip.upper()
32 return args
20 33
21 34
22 class Clip: 35 class Clip:
23 def __init__(self, inputfile, outputfile, adapter, minsize, maxsize):
24 self.inputfile = inputfile
25 self.outputfile = outputfile
26 self.adapter = adapter
27 self.minsize = int(minsize)
28 self.maxsize = int(maxsize)
29 def motives (sequence):
30 '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module'''
31 sequencevariants = [sequence[0:6]] # initializes the list with the 6mer perfect match
32 dicsubst= {"A":"TGCN", "T":"AGCN", "G":"TACN", "C":"GATN"}
33 for pos in enumerate(sequence[:6]):
34 for subst in dicsubst[pos[1]]:
35 sequencevariants.append(sequence[:pos[0]]+ subst + sequence[pos[0]+1:7])
36 return sequencevariants
37 self.adaptmotifs= motives(self.adapter)
38 36
39 def scanadapt(self, adaptmotives=[], sequence=""): 37 def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode):
40 '''scans sequence for adapter motives''' 38 self.inputfile = inputfile
41 if sequence.rfind(adaptmotives[0]) != -1: 39 self.outputfile = outputfile
42 return sequence[:sequence.rfind(adaptmotives[0])] 40 self.output_format = output_format
43 for motif in adaptmotives[1:]: 41 self.adapter = adapter
44 if sequence.rfind(motif) != -1: 42 self.minsize = int(minsize)
45 return sequence[:sequence.rfind(motif)] 43 self.maxsize = int(maxsize)
46 return sequence 44 self.Nmode = Nmode
47 45
48 def clip_with_N (self): 46 def motives(sequence):
49 '''clips adapter sequences from inputfile. 47 '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module'''
50 Reads containing N are retained.''' 48 sequencevariants = [
51 iterator = 0 49 sequence[0:6]] # initializes the list with the 6mer perfect match
50 dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"}
51 for pos in enumerate(sequence[:6]):
52 for subst in dicsubst[pos[1]]:
53 sequencevariants.append(
54 sequence[:pos[0]] + subst + sequence[pos[0] + 1:7])
55 return sequencevariants
56 self.adaptmotifs = motives(self.adapter)
57
58 def scanadapt(self, adaptmotives=[], sequence="", qscore=""):
59 '''scans sequence for adapter motives'''
60 match_position = sequence.rfind(adaptmotives[0])
61 if match_position != -1:
62 return sequence[:match_position], qscore[:match_position]
63 for motif in adaptmotives[1:]:
64 match_position = sequence.rfind(motif)
65 if match_position != -1:
66 return sequence[:match_position], qscore[:match_position]
67 return sequence, qscore
68
69 def write_output(self, id, read, qscore, output):
70 if self.output_format == "fasta":
71 block = ">{0}\n{1}\n".format(id, read)
72 else:
73 block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore)
74 output.write(block)
75
76 def handle_io(self):
77 '''Open input file, pass read sequence and read qscore to clipping function.
78 Pass clipped read and qscore to output function.'''
79 id = 0
80 output = open(self.outputfile, "a")
81 with open(self.inputfile, "r") as input:
82 block_gen = islice(input, 1, None, 2)
83 for i, line in enumerate(block_gen):
84 if i % 2:
85 qscore = line.rstrip()
86 else:
87 read = line.rstrip()
88 continue
89 trimmed_read, trimmed_qscore = self.scanadapt(
90 self.adaptmotifs, read, qscore)
91 if self.minsize <= len(trimmed_read) <= self.maxsize:
92 if (self.Nmode == "reject") and ("N" in trimmed_read):
93 continue
94 id += 1
95 self.write_output(id, trimmed_read, trimmed_qscore, output)
96 output.close()
97
98
99 def main(*argv):
100 instanceClip = Clip(*argv)
101 instanceClip.handle_io()
102
103 if __name__ == "__main__":
104 args = Parser()
52 id = 0 105 id = 0
53 F = open (self.inputfile, "r") 106 for inputfile in args.input:
54 O = open (self.outputfile, "w") 107 main(inputfile, args.output, args.output_format,
55 for line in F: 108 args.adapter_to_clip, args.min, args.max, args.Nmode)
56 iterator += 1
57 if iterator % 4 == 2:
58 trim = self.scanadapt (self.adaptmotifs, line.rstrip() )
59 if self.minsize <= len(trim) <= self.maxsize:
60 id += 1
61 print >> O, ">%i\n%s" % (id, trim)
62 F.close()
63 O.close()
64 def clip_without_N (self):
65 '''clips adapter sequences from inputfile.
66 Reads containing N are rejected.'''
67 iterator = 0
68 id = 0
69 F = open (self.inputfile, "r")
70 O = open (self.outputfile, "w")
71 for line in F:
72 iterator += 1
73 if iterator % 4 == 2:
74 trim = self.scanadapt (self.adaptmotifs, line.rstrip() )
75 if "N" in trim: continue
76 if self.minsize <= len(trim) <= self.maxsize:
77 id += 1
78 print >> O, ">%i\n%s" % (id, trim)
79 F.close()
80 O.close()
81
82 def __main__ (inputfile, outputfile, adapter, minsize, maxsize, Nmode):
83 instanceClip = Clip (inputfile, outputfile, adapter, minsize, maxsize)
84 if Nmode == "accept":
85 instanceClip.clip_with_N()
86 else:
87 instanceClip.clip_without_N()
88
89 if __name__ == "__main__" :
90 args = Parser()
91 __main__(args.input, args.output, args.adapter_to_clip, args.min, args.max, args.Nmode)