annotate yac.py @ 0:2445856981a1 draft

Imported from capsule None
author drosofff
date Mon, 03 Nov 2014 09:34:45 -0500
parents
children e5ef40107f54
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
1 #!/usr/bin/python
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
2 # yac = yet another clipper
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
3 # v 1.1.0 - 23-08-2014 - argparse implementation
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
4 # Usage: yac.py --input $input --output $output --adapter_to_clip $adapter_to_clip --min $min --max $max --Nmode $Nmode
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
5 # Christophe Antoniewski <drosofff@gmail.com>
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
6
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
7 import sys, string, argparse
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
8
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
def Parser():
    """Parse the command line and return the populated argparse namespace.

    The returned namespace has the attributes ``input``, ``output``,
    ``adapter_to_clip`` (converted to upper case), ``min``, ``max`` and
    ``Nmode``.

    ``--input``, ``--output``, ``--adapter_to_clip``, ``--min`` and ``--max``
    are mandatory: when one of them is missing argparse prints a usage
    message and exits, instead of the script failing later on a ``None``
    value. ``--Nmode`` stays optional and is ``None`` when omitted.
    """
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument('--input', action="store", type=str, required=True,
                            help="input fastq file")
    the_parser.add_argument('--output', action="store", type=str, required=True,
                            help="output, clipped fasta file")
    the_parser.add_argument('--adapter_to_clip', action="store", type=str, required=True,
                            help="adapter sequence to clip")
    the_parser.add_argument('--min', action="store", type=int, required=True,
                            help="minimal size of clipped sequence to keep")
    the_parser.add_argument('--max', action="store", type=int, required=True,
                            help="maximal size of clipped sequence to keep")
    the_parser.add_argument('--Nmode', action="store", type=str, choices=["accept", "reject"],
                            help="accept or reject sequences with N for clipping")
    args = the_parser.parse_args()
    # The motif table built by Clip only knows upper-case bases.
    args.adapter_to_clip = args.adapter_to_clip.upper()
    return args
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
20
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
21
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
class Clip:
    """Clip an adapter from the reads of a fastq file and write a fasta file.

    Attributes:
        inputfile: path of the file to read.
        outputfile: path of the file to write.
        adapter: adapter sequence, as given by the caller.
        minsize: smallest clipped read length that is kept (inclusive).
        maxsize: largest clipped read length that is kept (inclusive).
        adaptmotifs: search motifs derived from the adapter; the perfect
            6-mer comes first, followed by the 7-mers carrying one
            substitution in one of the first six positions.
    """

    def __init__(self, inputfile, outputfile, adapter, minsize, maxsize):
        """Store the settings and precompute the adapter search motifs.

        Raises:
            KeyError: if one of the first six adapter characters is not one
                of A, T, G, C or N.
        """
        self.inputfile = inputfile
        self.outputfile = outputfile
        self.adapter = adapter
        self.minsize = int(minsize)
        self.maxsize = int(maxsize)
        self.adaptmotifs = self._motives(self.adapter)

    def _motives(self, sequence):
        """Return the motifs used for perfect (6 nt) or imperfect (7 nt with
        one mismatch) adapter search."""
        # Each base maps to every other symbol of the ATGCN alphabet. The "N"
        # entry lets adapters that contain N be used instead of raising.
        substitutions = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN",
                         "N": "ATGC"}
        variants = [sequence[0:6]]  # the 6-mer perfect match is tried first
        for position, base in enumerate(sequence[:6]):
            for substitute in substitutions[base]:
                variants.append(sequence[:position] + substitute + sequence[position + 1:7])
        return variants

    def scanadapt(self, adaptmotives=None, sequence=""):
        """Return `sequence` truncated at the adapter, if a motif is found.

        The motifs are tried in order; for the first one present in
        `sequence`, everything from its last occurrence onwards is removed.
        When no motif matches (or no motifs are given) the sequence is
        returned unchanged.
        """
        if not adaptmotives:
            return sequence
        for motif in adaptmotives:
            position = sequence.rfind(motif)
            if position != -1:
                return sequence[:position]
        return sequence

    def _clip(self, reject_n):
        """Clip every read of inputfile and write the kept ones to outputfile.

        Only lines 2, 6, 10, ... of the input are treated as read sequences.
        A clipped read is kept when its length lies within
        [minsize, maxsize]; when `reject_n` is true, clipped reads that
        contain "N" are dropped as well. Kept reads are written as fasta
        records whose header is a running number starting at 1.
        """
        kept = 0
        # "with" closes both files even when clipping raises.
        with open(self.inputfile, "r") as reads, open(self.outputfile, "w") as clipped:
            for line_number, line in enumerate(reads, 1):
                if line_number % 4 != 2:
                    continue
                trim = self.scanadapt(self.adaptmotifs, line.rstrip())
                if reject_n and "N" in trim:
                    continue
                if self.minsize <= len(trim) <= self.maxsize:
                    kept += 1
                    # write() works on Python 2 and 3, unlike "print >> file".
                    clipped.write(">%i\n%s\n" % (kept, trim))

    def clip_with_N(self):
        """Clip adapter sequences from inputfile.

        Reads containing N are retained.
        """
        self._clip(reject_n=False)

    def clip_without_N(self):
        """Clip adapter sequences from inputfile.

        Reads containing N are rejected.
        """
        self._clip(reject_n=True)
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
81
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
def __main__ (inputfile, outputfile, adapter, minsize, maxsize, Nmode):
    """Build a Clip for the given settings and run the clipping pass.

    Reads containing N are retained only when `Nmode` equals "accept";
    any other value runs the pass that rejects them.
    """
    clipper = Clip(inputfile, outputfile, adapter, minsize, maxsize)
    if Nmode != "accept":
        clipper.clip_without_N()
        return
    clipper.clip_with_N()
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
88
2445856981a1 Imported from capsule None
drosofff
parents:
diff changeset
if __name__ == "__main__":
    # Script entry point: read the command line, then run the clipper with it.
    args = Parser()
    __main__(
        inputfile=args.input,
        outputfile=args.output,
        adapter=args.adapter_to_clip,
        minsize=args.min,
        maxsize=args.max,
        Nmode=args.Nmode,
    )