# HG changeset patch
# User mvdbeek
# Date 1427730052 14400
# Node ID e5ef40107f544b0dd632a76155ce369e3a09cc08
# Parent  2445856981a1671aeb145256ba533e8fbb6c4aec
Uploaded

diff -r 2445856981a1 -r e5ef40107f54 README.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.txt	Mon Mar 30 11:40:52 2015 -0400
@@ -0,0 +1,11 @@
+This tool clips adapter sequences from a fastq file and outputs either a    
+fasta or fastq file of clipped reads with renumbered fasta/fastq headers.
+
+Clipped sequences with Ns can be discarded.
+
+Min size and max size filter clipped reads on their size.
+
+Note that unclipped reads that satisfy the min and max size conditions are kept.
+
+Homepage: drosophile.org
+
diff -r 2445856981a1 -r e5ef40107f54 test-data/yac_fastq.out
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/yac_fastq.out	Mon Mar 30 11:40:52 2015 -0400
@@ -0,0 +1,24 @@
+@HWI-1
+TGTAAACATCCCCGACTGGCAGC
++
+B@BBCBCCBCBCCC8A<@#####
+@HWI-2
+AAAGTGCTACTACTTTTGAGTCT
++
+BAA@7?A@@A@@B<'25?6>59:
+@HWI-3
+ACTGGACTTGGAGTCCGAAGGC
++
+BBB@@ABAAB?9B42&9;####
+@HWI-4
+AAGTGCCGCCAGGTTTTGAGTGG
++
+AB?5;3>/=?>=;416481####
+@HWI-5
+TATTGCACTTGTCCCGGCCTGAATCNCGT
++
+BCB=:ACCBB=>BB8<-############
+@HWI-6
+TAGCTTATCAGACTGATGTTGAC
++
+BBBBBCBBCB;>AA',9=18?1:
diff -r 2445856981a1 -r e5ef40107f54 yac.py
--- a/yac.py	Mon Nov 03 09:34:45 2014 -0500
+++ b/yac.py	Mon Mar 30 11:40:52 2015 -0400
@@ -1,91 +1,108 @@
 #!/usr/bin/python
 # yac = yet another clipper
+# v 1.2.1 - 23-08-2014 - Support FastQ output
 # v 1.1.0 - 23-08-2014 - argparse implementation
 # Usage yac.py  $input $output $adapter_to_clip $min $max $Nmode
 # Christophe Antoniewski <drosofff@gmail.com>
 
-import sys, string, argparse
+import sys
+import string
+import argparse
+from itertools import islice
+
 
 def Parser():
-  the_parser = argparse.ArgumentParser()
-  the_parser.add_argument('--input', action="store", type=str, help="input fastq file")
-  the_parser.add_argument('--output', action="store", type=str, help="output, clipped fasta file")
-  the_parser.add_argument('--adapter_to_clip', action="store", type=str, help="adapter sequence to clip")
-  the_parser.add_argument('--min', action="store", type=int, help="minimal size of clipped sequence to keep")
-  the_parser.add_argument('--max', action="store", type=int, help="maximal size of clipped sequence to keep")
-  the_parser.add_argument('--Nmode', action="store", type=str, choices=["accept", "reject"], help="accept or reject sequences with N for clipping")
-  args = the_parser.parse_args()
-  args.adapter_to_clip = args.adapter_to_clip.upper()
-  return args
+    the_parser = argparse.ArgumentParser()
+    the_parser.add_argument(
+        '--input', action="store", nargs='+', help="input fastq files")
+    the_parser.add_argument(
+        '--output', action="store", type=str, help="output, clipped fasta file")
+    the_parser.add_argument(
+        '--output_format', action="store", type=str, help="output format, fasta or fastq")
+    the_parser.add_argument(
+        '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip")
+    the_parser.add_argument(
+        '--min', action="store", type=int, help="minimal size of clipped sequence to keep")
+    the_parser.add_argument(
+        '--max', action="store", type=int, help="maximal size of clipped sequence to keep")
+    the_parser.add_argument('--Nmode', action="store", type=str, choices=[
+                            "accept", "reject"], help="accept or reject sequences with N for clipping")
+    args = the_parser.parse_args()
+    args.adapter_to_clip = args.adapter_to_clip.upper()
+    return args
 
 
 class Clip:
-  def __init__(self, inputfile, outputfile, adapter, minsize, maxsize):
-    self.inputfile = inputfile
-    self.outputfile = outputfile
-    self.adapter = adapter
-    self.minsize = int(minsize)
-    self.maxsize = int(maxsize)
-    def motives (sequence):
-      '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module'''
-      sequencevariants = [sequence[0:6]] # initializes the list with the 6mer perfect match
-      dicsubst= {"A":"TGCN", "T":"AGCN", "G":"TACN", "C":"GATN"}
-      for pos in enumerate(sequence[:6]):
-        for subst in dicsubst[pos[1]]:
-          sequencevariants.append(sequence[:pos[0]]+ subst + sequence[pos[0]+1:7])
-      return sequencevariants
-    self.adaptmotifs= motives(self.adapter)
+
+    def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode):
+        self.inputfile = inputfile
+        self.outputfile = outputfile
+        self.output_format = output_format
+        self.adapter = adapter
+        self.minsize = int(minsize)
+        self.maxsize = int(maxsize)
+        self.Nmode = Nmode
+
+        def motives(sequence):
+            '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module'''
+            sequencevariants = [
+                sequence[0:6]]  # initializes the list with the 6mer perfect match
+            dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"}
+            for pos in enumerate(sequence[:6]):
+                for subst in dicsubst[pos[1]]:
+                    sequencevariants.append(
+                        sequence[:pos[0]] + subst + sequence[pos[0] + 1:7])
+            return sequencevariants
+        self.adaptmotifs = motives(self.adapter)
+
+    def scanadapt(self, adaptmotives=[], sequence="", qscore=""):
+        '''scans sequence for adapter motives'''
+        match_position = sequence.rfind(adaptmotives[0])
+        if match_position != -1:
+            return sequence[:match_position], qscore[:match_position]
+        for motif in adaptmotives[1:]:
+            match_position = sequence.rfind(motif)
+            if match_position != -1:
+                return sequence[:match_position], qscore[:match_position]
+        return sequence, qscore
 
-  def scanadapt(self, adaptmotives=[], sequence=""):
-    '''scans sequence for adapter motives'''
-    if sequence.rfind(adaptmotives[0]) != -1:
-      return sequence[:sequence.rfind(adaptmotives[0])]
-    for motif in adaptmotives[1:]:
-      if sequence.rfind(motif) != -1:
-        return sequence[:sequence.rfind(motif)]
-    return sequence
+    def write_output(self, id, read, qscore, output):
+        if self.output_format == "fasta":
+            block = ">{0}\n{1}\n".format(id, read)
+        else:
+            block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore)
+        output.write(block)
 
-  def clip_with_N (self):
-    '''clips adapter sequences from inputfile. 
-    Reads containing N are retained.'''
-    iterator = 0
+    def handle_io(self):
+        '''Open input file, pass read sequence and read qscore to clipping function.
+        Pass clipped read and qscore to output function.'''
+        id = 0
+        output = open(self.outputfile, "a")
+        with open(self.inputfile, "r") as input:
+            block_gen = islice(input, 1, None, 2)
+            for i, line in enumerate(block_gen):
+                if i % 2:
+                    qscore = line.rstrip()
+                else:
+                    read = line.rstrip()
+                    continue
+                trimmed_read, trimmed_qscore = self.scanadapt(
+                    self.adaptmotifs, read, qscore)
+                if self.minsize <= len(trimmed_read) <= self.maxsize:
+                    if (self.Nmode == "reject") and ("N" in trimmed_read):
+                        continue
+                    id += 1
+                    self.write_output(id, trimmed_read, trimmed_qscore, output)
+            output.close()
+
+
+def main(*argv):
+    instanceClip = Clip(*argv)
+    instanceClip.handle_io()
+
+if __name__ == "__main__":
+    args = Parser()
     id = 0
-    F = open (self.inputfile, "r")
-    O = open (self.outputfile, "w")
-    for line in F:
-      iterator += 1
-      if iterator % 4 == 2:
-        trim = self.scanadapt (self.adaptmotifs, line.rstrip() )
-        if self.minsize <= len(trim) <= self.maxsize:
-          id += 1
-          print >> O, ">%i\n%s" % (id, trim)
-    F.close()
-    O.close()
-  def clip_without_N (self):
-    '''clips adapter sequences from inputfile. 
-    Reads containing N are rejected.'''
-    iterator = 0
-    id = 0
-    F = open (self.inputfile, "r")
-    O = open (self.outputfile, "w")
-    for line in F:
-      iterator += 1
-      if iterator % 4 == 2:
-        trim = self.scanadapt (self.adaptmotifs, line.rstrip() )
-        if "N" in trim: continue
-        if self.minsize <= len(trim) <= self.maxsize:
-          id += 1
-          print >> O, ">%i\n%s" % (id, trim)
-    F.close()
-    O.close()
-
-def __main__ (inputfile, outputfile, adapter, minsize, maxsize, Nmode):
-  instanceClip = Clip (inputfile, outputfile, adapter, minsize, maxsize)
-  if Nmode == "accept":
-    instanceClip.clip_with_N()
-  else:
-    instanceClip.clip_without_N()
-
-if __name__ == "__main__" :
-  args = Parser()
-  __main__(args.input, args.output, args.adapter_to_clip, args.min, args.max, args.Nmode)
+    for inputfile in args.input:
+        main(inputfile, args.output, args.output_format,
+             args.adapter_to_clip, args.min, args.max, args.Nmode)
diff -r 2445856981a1 -r e5ef40107f54 yac.xml
--- a/yac.xml	Mon Nov 03 09:34:45 2014 -0500
+++ b/yac.xml	Mon Mar 30 11:40:52 2015 -0400
@@ -1,64 +1,79 @@
- <tool id="yac" name="Clip adapter" version="1.1.0">
-  <description></description>
-  <command interpreter="python">yac.py --input $input
+<tool id="yac" name="Clip adapter" version="1.3.3">
+    <description />
+    <command interpreter="python">yac.py --input $input
                                        --output $output
+                                       --output_format "$out_format"
                                        --adapter_to_clip $clip_source.clip_sequence
                                        --min $min
                                        --max $max
                                        --Nmode $Nmode
   </command>
-  <inputs>
-    <param format="fastq" name="input" type="data" label="Source file"/>
-    <param name="min" type="integer" size="4" value="15" label="min size"/>
-    <param name="max" type="integer" size="4" value="36" label="max size"/>
-    <param name="Nmode" type="select" label="Accept reads containing N?">
-        <option value="accept" selected="True">accept</option>
-        <option value="reject">reject</option>
-    </param>
-    <conditional name="clip_source">
-   	 <param name="clip_source_list" type="select" label="Source" help="Built-in adapters or User-provided">
-         	<option value="prebuilt" selected="True">Use a built-in adapter (select from the list below)</option>
-         	<option value="user">Use custom sequence</option>
-         </param>
-         <when value="prebuilt">
-         	<param name="clip_sequence" type="select" label="Select Adapter to clip" help="if your adapter is not listed, input your own sequence">
-                	<option value="TCGTATGCCGTCTTCTGCTTG">Solexa TCGTATGCCGTCTTCTGCTTG</option>
-                        <option value="ATCTCGTATGCCGTCTTCTGCTT">Illumina ATCTCGTATGCCGTCTTCTGCTT</option>
-                        <option value="TGGAATTCTCGGGTGCCAAG" selected="True">Illumina TruSeq  TGGAATTCTCGGGTGCCAAG</option>
-                        <option value="CTGTAGGCACCATCAATCGT">IdT CTGTAGGCACCATCAATCGT</option>
-		</param>
-	</when>
-        <when value="user">
-		 <param name="clip_sequence" type="text" size="35"  label="Enter your Sequence" value="GAATCC"/>
-	</when>
-    </conditional>
-  </inputs>
-  <outputs>
-    <data format="fasta" name="output"  metadata="input" />
-  </outputs>
+    <inputs>
+        <param format="fastq" label="Source file" name="input" type="data" />
+        <param label="min size" name="min" size="4" type="integer" value="15" />
+        <param label="max size" name="max" size="4" type="integer" value="36" />
+        <param label="Select output format" name="out_format" type="select">
+            <option selected="true" value="fasta">Fasta format</option>
+            <option value="fastq">Fastq format</option>
+        </param>
+        <param label="Accept reads containing N?" name="Nmode" type="select">
+            <option selected="True" value="accept">accept</option>
+            <option value="reject">reject</option>
+        </param>
+        <conditional name="clip_source">
+            <param help="Built-in adapters or User-provided" label="Source" name="clip_source_list" type="select">
+                <option selected="True" value="prebuilt">Use a built-in adapter (select from the list below)</option>
+                <option value="user">Use custom sequence</option>
+            </param>
+            <when value="prebuilt">
+                <param help="if your adapter is not listed, input your own sequence" label="Select Adapter to clip" name="clip_sequence" type="select">
+                    <option value="TCGTATGCCGTCTTCTGCTTG">Solexa TCGTATGCCGTCTTCTGCTTG</option>
+                    <option value="ATCTCGTATGCCGTCTTCTGCTT">Illumina ATCTCGTATGCCGTCTTCTGCTT</option>
+                    <option selected="True" value="TGGAATTCTCGGGTGCCAAG">Illumina TruSeq  TGGAATTCTCGGGTGCCAAG</option>
+                    <option value="CTGTAGGCACCATCAATCGT">IdT CTGTAGGCACCATCAATCGT</option>
+                </param>
+            </when>
+            <when value="user">
+                <param label="Enter your Sequence" name="clip_sequence" size="35" type="text" value="GAATCC" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="fasta" metadata="input" name="output" />
+        <change_format>
+            <when format="fastq" input="out_format" value="fastq" />
+        </change_format>
+    </outputs>
+    <tests>
+        <test>
+            <param ftype="fastqsanger" name="input" value="yac.fastq" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <output file="yac.out" name="output" />
+        </test>
+        <test>
+            <param ftype="fastqsanger" name="input" value="yac.fastq" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fastq" />
+            <output file="yac_fastq.out" name="output" />
+        </test>
+    </tests>
+    <help>
+This tool clips adapter sequences from a fastq file and outputs either a
+fasta or fastq file of clipped reads with renumbered fasta/fastq headers.
 
-  <help>
-<!-- write a decent doc ! -->
-This tool clips adapter sequences from a fastq file and fasta file of clipped reads with renumbered fasta headers.
-
-Clipped sequences with Ns can be discarded.
+By defualt clipped sequences with unknown nucleotides are kept, but 
+can be discarded by setting "Accept reads containing N?" to reject.
 
 Min size and max size filter clipped reads on their size.
 
 Note that unclipped reads that satisfy the min and max size conditions are kept.
   </help>
-
-<!-- write a <test> section -->
-	<tests>
-                <test>
-                        <param name="input" value="yac.fastq" ftype="fastqsanger"/>
-                        <param name="min" value="18" />
-                        <param name="max" value="29" />
-                        <param name="clip_source_list" value="prebuilt" />
-                        <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
-                        <param name="Nmode" value="accept" />
-                        <output name="output" file="yac.out" />
-                </test>
-        </tests>
-
 </tool>