Mercurial > repos > drosofff > yac_clipper
comparison yac.py @ 1:e5ef40107f54 draft
Uploaded
author | mvdbeek |
---|---|
date | Mon, 30 Mar 2015 11:40:52 -0400 |
parents | 2445856981a1 |
children |
comparison
equal
deleted
inserted
replaced
0:2445856981a1 | 1:e5ef40107f54 |
---|---|
1 #!/usr/bin/python | 1 #!/usr/bin/python |
2 # yac = yet another clipper | 2 # yac = yet another clipper |
3 # v 1.2.1 - 23-08-2014 - Support FastQ output | |
3 # v 1.1.0 - 23-08-2014 - argparse implementation | 4 # v 1.1.0 - 23-08-2014 - argparse implementation |
4 # Usage yac.py $input $output $adapter_to_clip $min $max $Nmode | 5 # Usage yac.py $input $output $adapter_to_clip $min $max $Nmode |
5 # Christophe Antoniewski <drosofff@gmail.com> | 6 # Christophe Antoniewski <drosofff@gmail.com> |
6 | 7 |
7 import sys, string, argparse | 8 import sys |
9 import string | |
10 import argparse | |
11 from itertools import islice | |
12 | |
8 | 13 |
9 def Parser(): | 14 def Parser(): |
10 the_parser = argparse.ArgumentParser() | 15 the_parser = argparse.ArgumentParser() |
11 the_parser.add_argument('--input', action="store", type=str, help="input fastq file") | 16 the_parser.add_argument( |
12 the_parser.add_argument('--output', action="store", type=str, help="output, clipped fasta file") | 17 '--input', action="store", nargs='+', help="input fastq files") |
13 the_parser.add_argument('--adapter_to_clip', action="store", type=str, help="adapter sequence to clip") | 18 the_parser.add_argument( |
14 the_parser.add_argument('--min', action="store", type=int, help="minimal size of clipped sequence to keep") | 19 '--output', action="store", type=str, help="output, clipped fasta file") |
15 the_parser.add_argument('--max', action="store", type=int, help="maximal size of clipped sequence to keep") | 20 the_parser.add_argument( |
16 the_parser.add_argument('--Nmode', action="store", type=str, choices=["accept", "reject"], help="accept or reject sequences with N for clipping") | 21 '--output_format', action="store", type=str, help="output format, fasta or fastq") |
17 args = the_parser.parse_args() | 22 the_parser.add_argument( |
18 args.adapter_to_clip = args.adapter_to_clip.upper() | 23 '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip") |
19 return args | 24 the_parser.add_argument( |
25 '--min', action="store", type=int, help="minimal size of clipped sequence to keep") | |
26 the_parser.add_argument( | |
27 '--max', action="store", type=int, help="maximal size of clipped sequence to keep") | |
28 the_parser.add_argument('--Nmode', action="store", type=str, choices=[ | |
29 "accept", "reject"], help="accept or reject sequences with N for clipping") | |
30 args = the_parser.parse_args() | |
31 args.adapter_to_clip = args.adapter_to_clip.upper() | |
32 return args | |
20 | 33 |
21 | 34 |
22 class Clip: | 35 class Clip: |
23 def __init__(self, inputfile, outputfile, adapter, minsize, maxsize): | |
24 self.inputfile = inputfile | |
25 self.outputfile = outputfile | |
26 self.adapter = adapter | |
27 self.minsize = int(minsize) | |
28 self.maxsize = int(maxsize) | |
29 def motives (sequence): | |
30 '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module''' | |
31 sequencevariants = [sequence[0:6]] # initializes the list with the 6mer perfect match | |
32 dicsubst= {"A":"TGCN", "T":"AGCN", "G":"TACN", "C":"GATN"} | |
33 for pos in enumerate(sequence[:6]): | |
34 for subst in dicsubst[pos[1]]: | |
35 sequencevariants.append(sequence[:pos[0]]+ subst + sequence[pos[0]+1:7]) | |
36 return sequencevariants | |
37 self.adaptmotifs= motives(self.adapter) | |
38 | 36 |
39 def scanadapt(self, adaptmotives=[], sequence=""): | 37 def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode): |
40 '''scans sequence for adapter motives''' | 38 self.inputfile = inputfile |
41 if sequence.rfind(adaptmotives[0]) != -1: | 39 self.outputfile = outputfile |
42 return sequence[:sequence.rfind(adaptmotives[0])] | 40 self.output_format = output_format |
43 for motif in adaptmotives[1:]: | 41 self.adapter = adapter |
44 if sequence.rfind(motif) != -1: | 42 self.minsize = int(minsize) |
45 return sequence[:sequence.rfind(motif)] | 43 self.maxsize = int(maxsize) |
46 return sequence | 44 self.Nmode = Nmode |
47 | 45 |
48 def clip_with_N (self): | 46 def motives(sequence): |
49 '''clips adapter sequences from inputfile. | 47 '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module''' |
50 Reads containing N are retained.''' | 48 sequencevariants = [ |
51 iterator = 0 | 49 sequence[0:6]] # initializes the list with the 6mer perfect match |
50 dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"} | |
51 for pos in enumerate(sequence[:6]): | |
52 for subst in dicsubst[pos[1]]: | |
53 sequencevariants.append( | |
54 sequence[:pos[0]] + subst + sequence[pos[0] + 1:7]) | |
55 return sequencevariants | |
56 self.adaptmotifs = motives(self.adapter) | |
57 | |
58 def scanadapt(self, adaptmotives=[], sequence="", qscore=""): | |
59 '''scans sequence for adapter motives''' | |
60 match_position = sequence.rfind(adaptmotives[0]) | |
61 if match_position != -1: | |
62 return sequence[:match_position], qscore[:match_position] | |
63 for motif in adaptmotives[1:]: | |
64 match_position = sequence.rfind(motif) | |
65 if match_position != -1: | |
66 return sequence[:match_position], qscore[:match_position] | |
67 return sequence, qscore | |
68 | |
69 def write_output(self, id, read, qscore, output): | |
70 if self.output_format == "fasta": | |
71 block = ">{0}\n{1}\n".format(id, read) | |
72 else: | |
73 block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore) | |
74 output.write(block) | |
75 | |
76 def handle_io(self): | |
77 '''Open input file, pass read sequence and read qscore to clipping function. | |
78 Pass clipped read and qscore to output function.''' | |
79 id = 0 | |
80 output = open(self.outputfile, "a") | |
81 with open(self.inputfile, "r") as input: | |
82 block_gen = islice(input, 1, None, 2) | |
83 for i, line in enumerate(block_gen): | |
84 if i % 2: | |
85 qscore = line.rstrip() | |
86 else: | |
87 read = line.rstrip() | |
88 continue | |
89 trimmed_read, trimmed_qscore = self.scanadapt( | |
90 self.adaptmotifs, read, qscore) | |
91 if self.minsize <= len(trimmed_read) <= self.maxsize: | |
92 if (self.Nmode == "reject") and ("N" in trimmed_read): | |
93 continue | |
94 id += 1 | |
95 self.write_output(id, trimmed_read, trimmed_qscore, output) | |
96 output.close() | |
97 | |
98 | |
99 def main(*argv): | |
100 instanceClip = Clip(*argv) | |
101 instanceClip.handle_io() | |
102 | |
103 if __name__ == "__main__": | |
104 args = Parser() | |
52 id = 0 | 105 id = 0 |
53 F = open (self.inputfile, "r") | 106 for inputfile in args.input: |
54 O = open (self.outputfile, "w") | 107 main(inputfile, args.output, args.output_format, |
55 for line in F: | 108 args.adapter_to_clip, args.min, args.max, args.Nmode) |
56 iterator += 1 | |
57 if iterator % 4 == 2: | |
58 trim = self.scanadapt (self.adaptmotifs, line.rstrip() ) | |
59 if self.minsize <= len(trim) <= self.maxsize: | |
60 id += 1 | |
61 print >> O, ">%i\n%s" % (id, trim) | |
62 F.close() | |
63 O.close() | |
64 def clip_without_N (self): | |
65 '''clips adapter sequences from inputfile. | |
66 Reads containing N are rejected.''' | |
67 iterator = 0 | |
68 id = 0 | |
69 F = open (self.inputfile, "r") | |
70 O = open (self.outputfile, "w") | |
71 for line in F: | |
72 iterator += 1 | |
73 if iterator % 4 == 2: | |
74 trim = self.scanadapt (self.adaptmotifs, line.rstrip() ) | |
75 if "N" in trim: continue | |
76 if self.minsize <= len(trim) <= self.maxsize: | |
77 id += 1 | |
78 print >> O, ">%i\n%s" % (id, trim) | |
79 F.close() | |
80 O.close() | |
81 | |
82 def __main__ (inputfile, outputfile, adapter, minsize, maxsize, Nmode): | |
83 instanceClip = Clip (inputfile, outputfile, adapter, minsize, maxsize) | |
84 if Nmode == "accept": | |
85 instanceClip.clip_with_N() | |
86 else: | |
87 instanceClip.clip_without_N() | |
88 | |
89 if __name__ == "__main__" : | |
90 args = Parser() | |
91 __main__(args.input, args.output, args.adapter_to_clip, args.min, args.max, args.Nmode) |