# HG changeset patch # User mvdbeek # Date 1427730052 14400 # Node ID e5ef40107f544b0dd632a76155ce369e3a09cc08 # Parent 2445856981a1671aeb145256ba533e8fbb6c4aec Uploaded diff -r 2445856981a1 -r e5ef40107f54 README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Mon Mar 30 11:40:52 2015 -0400 @@ -0,0 +1,11 @@ +This tool clips adapter sequences from a fastq file and outputs either a +fasta or fastq file of clipped reads with renumbered fasta/fastq headers. + +Clipped sequences with Ns can be discarded. + +Min size and max size filter clipped reads on their size. + +Note that unclipped reads that satisfy the min and max size conditions are kept. + +Homepage: drosophile.org + diff -r 2445856981a1 -r e5ef40107f54 test-data/yac_fastq.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/yac_fastq.out Mon Mar 30 11:40:52 2015 -0400 @@ -0,0 +1,24 @@ +@HWI-1 +TGTAAACATCCCCGACTGGCAGC ++ +B@BBCBCCBCBCCC8A<@##### +@HWI-2 +AAAGTGCTACTACTTTTGAGTCT ++ +BAA@7?A@@A@@B<'25?6>59: +@HWI-3 +ACTGGACTTGGAGTCCGAAGGC ++ +BBB@@ABAAB?9B42&9;#### +@HWI-4 +AAGTGCCGCCAGGTTTTGAGTGG ++ +AB?5;3>/=?>=;416481#### +@HWI-5 +TATTGCACTTGTCCCGGCCTGAATCNCGT ++ +BCB=:ACCBB=>BB8<-############ +@HWI-6 +TAGCTTATCAGACTGATGTTGAC ++ +BBBBBCBBCB;>AA',9=18?1: diff -r 2445856981a1 -r e5ef40107f54 yac.py --- a/yac.py Mon Nov 03 09:34:45 2014 -0500 +++ b/yac.py Mon Mar 30 11:40:52 2015 -0400 @@ -1,91 +1,108 @@ #!/usr/bin/python # yac = yet another clipper +# v 1.2.1 - 23-08-2014 - Support FastQ output # v 1.1.0 - 23-08-2014 - argparse implementation # Usage yac.py $input $output $adapter_to_clip $min $max $Nmode # Christophe Antoniewski -import sys, string, argparse +import sys +import string +import argparse +from itertools import islice + def Parser(): - the_parser = argparse.ArgumentParser() - the_parser.add_argument('--input', action="store", type=str, help="input fastq file") - the_parser.add_argument('--output', action="store", type=str, help="output, clipped fasta file") - the_parser.add_argument('--adapter_to_clip', action="store", type=str, help="adapter sequence to clip") - the_parser.add_argument('--min', action="store", type=int, help="minimal size of clipped sequence to keep") - the_parser.add_argument('--max', action="store", type=int, help="maximal size of clipped sequence to keep") - the_parser.add_argument('--Nmode', action="store", type=str, choices=["accept", "reject"], help="accept or reject sequences with N for clipping") - args = the_parser.parse_args() - args.adapter_to_clip = args.adapter_to_clip.upper() - return args + the_parser = argparse.ArgumentParser() + the_parser.add_argument( + '--input', action="store", nargs='+', help="input fastq files") + the_parser.add_argument( + '--output', action="store", type=str, help="output, clipped fasta file") + the_parser.add_argument( + '--output_format', action="store", type=str, help="output format, fasta or fastq") + the_parser.add_argument( + '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip") + the_parser.add_argument( + '--min', action="store", type=int, help="minimal size of clipped sequence to keep") + the_parser.add_argument( + '--max', action="store", type=int, help="maximal size of clipped sequence to keep") + the_parser.add_argument('--Nmode', action="store", type=str, choices=[ + "accept", "reject"], help="accept or reject sequences with N for clipping") + args = the_parser.parse_args() + args.adapter_to_clip = args.adapter_to_clip.upper() + return args class Clip: - def __init__(self, inputfile, outputfile, adapter, minsize, maxsize): - self.inputfile = inputfile - self.outputfile = outputfile - self.adapter = adapter - self.minsize = int(minsize) - self.maxsize = int(maxsize) - def motives (sequence): - '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module''' - sequencevariants = [sequence[0:6]] # initializes the list with the 6mer perfect match - dicsubst= {"A":"TGCN", "T":"AGCN", "G":"TACN", "C":"GATN"} - for pos in enumerate(sequence[:6]): - for subst in dicsubst[pos[1]]: - sequencevariants.append(sequence[:pos[0]]+ subst + sequence[pos[0]+1:7]) - return sequencevariants - self.adaptmotifs= motives(self.adapter) + + def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode): + self.inputfile = inputfile + self.outputfile = outputfile + self.output_format = output_format + self.adapter = adapter + self.minsize = int(minsize) + self.maxsize = int(maxsize) + self.Nmode = Nmode + + def motives(sequence): + '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module''' + sequencevariants = [ + sequence[0:6]] # initializes the list with the 6mer perfect match + dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"} + for pos in enumerate(sequence[:6]): + for subst in dicsubst[pos[1]]: + sequencevariants.append( + sequence[:pos[0]] + subst + sequence[pos[0] + 1:7]) + return sequencevariants + self.adaptmotifs = motives(self.adapter) + + def scanadapt(self, adaptmotives=[], sequence="", qscore=""): + '''scans sequence for adapter motives''' + match_position = sequence.rfind(adaptmotives[0]) + if match_position != -1: + return sequence[:match_position], qscore[:match_position] + for motif in adaptmotives[1:]: + match_position = sequence.rfind(motif) + if match_position != -1: + return sequence[:match_position], qscore[:match_position] + return sequence, qscore - def scanadapt(self, adaptmotives=[], sequence=""): - '''scans sequence for adapter motives''' - if sequence.rfind(adaptmotives[0]) != -1: - return sequence[:sequence.rfind(adaptmotives[0])] - for motif in adaptmotives[1:]: - if sequence.rfind(motif) != -1: - return sequence[:sequence.rfind(motif)] - return sequence + def write_output(self, id, read, qscore, output): + if self.output_format == "fasta": + block = ">{0}\n{1}\n".format(id, read) + else: + block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore) + output.write(block) - def clip_with_N (self): - '''clips adapter sequences from inputfile. - Reads containing N are retained.''' - iterator = 0 + def handle_io(self): + '''Open input file, pass read sequence and read qscore to clipping function. + Pass clipped read and qscore to output function.''' + id = 0 + output = open(self.outputfile, "a") + with open(self.inputfile, "r") as input: + block_gen = islice(input, 1, None, 2) + for i, line in enumerate(block_gen): + if i % 2: + qscore = line.rstrip() + else: + read = line.rstrip() + continue + trimmed_read, trimmed_qscore = self.scanadapt( + self.adaptmotifs, read, qscore) + if self.minsize <= len(trimmed_read) <= self.maxsize: + if (self.Nmode == "reject") and ("N" in trimmed_read): + continue + id += 1 + self.write_output(id, trimmed_read, trimmed_qscore, output) + output.close() + + +def main(*argv): + instanceClip = Clip(*argv) + instanceClip.handle_io() + +if __name__ == "__main__": + args = Parser() id = 0 - F = open (self.inputfile, "r") - O = open (self.outputfile, "w") - for line in F: - iterator += 1 - if iterator % 4 == 2: - trim = self.scanadapt (self.adaptmotifs, line.rstrip() ) - if self.minsize <= len(trim) <= self.maxsize: - id += 1 - print >> O, ">%i\n%s" % (id, trim) - F.close() - O.close() - def clip_without_N (self): - '''clips adapter sequences from inputfile. - Reads containing N are rejected.''' - iterator = 0 - id = 0 - F = open (self.inputfile, "r") - O = open (self.outputfile, "w") - for line in F: - iterator += 1 - if iterator % 4 == 2: - trim = self.scanadapt (self.adaptmotifs, line.rstrip() ) - if "N" in trim: continue - if self.minsize <= len(trim) <= self.maxsize: - id += 1 - print >> O, ">%i\n%s" % (id, trim) - F.close() - O.close() - -def __main__ (inputfile, outputfile, adapter, minsize, maxsize, Nmode): - instanceClip = Clip (inputfile, outputfile, adapter, minsize, maxsize) - if Nmode == "accept": - instanceClip.clip_with_N() - else: - instanceClip.clip_without_N() - -if __name__ == "__main__" : - args = Parser() - __main__(args.input, args.output, args.adapter_to_clip, args.min, args.max, args.Nmode) + for inputfile in args.input: + main(inputfile, args.output, args.output_format, + args.adapter_to_clip, args.min, args.max, args.Nmode) diff -r 2445856981a1 -r e5ef40107f54 yac.xml --- a/yac.xml Mon Nov 03 09:34:45 2014 -0500 +++ b/yac.xml Mon Mar 30 11:40:52 2015 -0400 @@ -1,64 +1,79 @@ - - - yac.py --input $input + + + yac.py --input $input --output $output + --output_format "$out_format" --adapter_to_clip $clip_source.clip_sequence --min $min --max $max --Nmode $Nmode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool clips adapter sequences from a fastq file and outputs either a +fasta or fastq file of clipped reads with renumbered fasta/fastq headers. - - -This tool clips adapter sequences from a fastq file and fasta file of clipped reads with renumbered fasta headers. - -Clipped sequences with Ns can be discarded. +By defualt clipped sequences with unknown nucleotides are kept, but +can be discarded by setting "Accept reads containing N?" to reject. Min size and max size filter clipped reads on their size. Note that unclipped reads that satisfy the min and max size conditions are kept. - - - - - - - - - - - - - -