0
|
1 #!/usr/bin/python
|
|
2 # yac = yet another clipper
|
|
3 # v 1.2.1 - 23-08-2014 - Support FastQ output
|
|
4 # v 1.1.0 - 23-08-2014 - argparse implementation
|
|
5 # Usage yac.py $input $output $adapter_to_clip $min $max $Nmode
|
|
6 # Christophe Antoniewski <drosofff@gmail.com>
|
|
7
|
|
8 import sys
|
|
9 import string
|
|
10 import argparse
|
|
11 from itertools import islice
|
|
12
|
|
13
|
|
14 def Parser():
|
|
15 the_parser = argparse.ArgumentParser()
|
|
16 the_parser.add_argument(
|
|
17 '--input', action="store", nargs='+', help="input fastq files")
|
|
18 the_parser.add_argument(
|
|
19 '--output', action="store", type=str, help="output, clipped fasta file")
|
|
20 the_parser.add_argument(
|
|
21 '--output_format', action="store", type=str, help="output format, fasta or fastq")
|
|
22 the_parser.add_argument(
|
|
23 '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip")
|
|
24 the_parser.add_argument(
|
|
25 '--min', action="store", type=int, help="minimal size of clipped sequence to keep")
|
|
26 the_parser.add_argument(
|
|
27 '--max', action="store", type=int, help="maximal size of clipped sequence to keep")
|
|
28 the_parser.add_argument('--Nmode', action="store", type=str, choices=[
|
|
29 "accept", "reject"], help="accept or reject sequences with N for clipping")
|
|
30 args = the_parser.parse_args()
|
|
31 args.adapter_to_clip = args.adapter_to_clip.upper()
|
|
32 return args
|
|
33
|
|
34
|
|
35 class Clip:
|
|
36
|
|
37 def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode):
|
|
38 self.inputfile = inputfile
|
|
39 self.outputfile = outputfile
|
|
40 self.output_format = output_format
|
|
41 self.adapter = adapter
|
|
42 self.minsize = int(minsize)
|
|
43 self.maxsize = int(maxsize)
|
|
44 self.Nmode = Nmode
|
|
45
|
|
46 def motives(sequence):
|
|
47 '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module'''
|
|
48 sequencevariants = [
|
|
49 sequence[0:6]] # initializes the list with the 6mer perfect match
|
|
50 dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"}
|
|
51 for pos in enumerate(sequence[:6]):
|
|
52 for subst in dicsubst[pos[1]]:
|
|
53 sequencevariants.append(
|
|
54 sequence[:pos[0]] + subst + sequence[pos[0] + 1:7])
|
|
55 return sequencevariants
|
|
56 self.adaptmotifs = motives(self.adapter)
|
|
57
|
|
58 def scanadapt(self, adaptmotives=[], sequence="", qscore=""):
|
|
59 '''scans sequence for adapter motives'''
|
|
60 match_position = sequence.rfind(adaptmotives[0])
|
|
61 if match_position != -1:
|
|
62 return sequence[:match_position], qscore[:match_position]
|
|
63 for motif in adaptmotives[1:]:
|
|
64 match_position = sequence.rfind(motif)
|
|
65 if match_position != -1:
|
|
66 return sequence[:match_position], qscore[:match_position]
|
|
67 return sequence, qscore
|
|
68
|
|
69 def write_output(self, id, read, qscore, output):
|
|
70 if self.output_format == "fasta":
|
|
71 block = ">{0}\n{1}\n".format(id, read)
|
|
72 else:
|
|
73 block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore)
|
|
74 output.write(block)
|
|
75
|
|
76 def handle_io(self):
|
|
77 '''Open input file, pass read sequence and read qscore to clipping function.
|
|
78 Pass clipped read and qscore to output function.'''
|
|
79 id = 0
|
|
80 output = open(self.outputfile, "a")
|
|
81 with open(self.inputfile, "r") as input:
|
|
82 block_gen = islice(input, 1, None, 2)
|
|
83 for i, line in enumerate(block_gen):
|
|
84 if i % 2:
|
|
85 qscore = line.rstrip()
|
|
86 else:
|
|
87 read = line.rstrip()
|
|
88 continue
|
|
89 trimmed_read, trimmed_qscore = self.scanadapt(
|
|
90 self.adaptmotifs, read, qscore)
|
|
91 if self.minsize <= len(trimmed_read) <= self.maxsize:
|
|
92 if (self.Nmode == "reject") and ("N" in trimmed_read):
|
|
93 continue
|
|
94 id += 1
|
|
95 self.write_output(id, trimmed_read, trimmed_qscore, output)
|
|
96 output.close()
|
|
97
|
|
98
|
|
99 def main(*argv):
|
|
100 instanceClip = Clip(*argv)
|
|
101 instanceClip.handle_io()
|
|
102
|
|
103 if __name__ == "__main__":
|
|
104 args = Parser()
|
|
105 id = 0
|
|
106 for inputfile in args.input:
|
|
107 main(inputfile, args.output, args.output_format,
|
|
108 args.adapter_to_clip, args.min, args.max, args.Nmode)
|