Mercurial > repos > peterjc > fastq_paired_unpaired
comparison tools/fastq/fastq_paired_unpaired.py @ 1:2feaef06d388 draft
Uploaded v0.0.6, adds unit test
| author | peterjc |
|---|---|
| date | Tue, 30 Apr 2013 14:07:59 -0400 |
| parents | 3a39d2053bc5 |
| children |
comparison
equal
deleted
inserted
replaced
| 0:3a39d2053bc5 | 1:2feaef06d388 |
|---|---|
| 7 suffices. See below or run the tool with no arguments for more details. | 7 suffices. See below or run the tool with no arguments for more details. |
| 8 | 8 |
| 9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even | 9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even |
| 10 Color Space should all work equally well). | 10 Color Space should all work equally well). |
| 11 | 11 |
| 12 This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. | 12 This script is copyright 2010-2011 by Peter Cock, The James Hutton Institute |
| 13 (formerly SCRI), Scotland, UK. All rights reserved. | |
| 14 | |
| 13 See accompanying text file for licence details (MIT/BSD style). | 15 See accompanying text file for licence details (MIT/BSD style). |
| 14 | |
| 15 This is version 0.0.4 of the script. | |
| 16 """ | 16 """ |
| 17 import os | 17 import os |
| 18 import sys | 18 import sys |
| 19 import re | 19 import re |
| 20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | 20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter |
| 21 | |
| 22 if "-v" in sys.argv or "--version" in sys.argv: | |
| 23 print "Version 0.0.6" | |
| 24 sys.exit(0) | |
| 21 | 25 |
| 22 def stop_err(msg, err=1): | 26 def stop_err(msg, err=1): |
| 23 sys.stderr.write(msg.rstrip() + "\n") | 27 sys.stderr.write(msg.rstrip() + "\n") |
| 24 sys.exit(err) | 28 sys.exit(err) |
| 25 | 29 |
| 42 any partner forward+reverse reads are consecutive. The output files all | 46 any partner forward+reverse reads are consecutive. The output files all |
| 43 preserve this sort order. | 47 preserve this sort order. |
| 44 | 48 |
| 45 Any reads where the forward/reverse naming suffix used is not recognised | 49 Any reads where the forward/reverse naming suffix used is not recognised |
| 46 are treated as orphan reads. The tool supports the /1 and /2 convention | 50 are treated as orphan reads. The tool supports the /1 and /2 convention |
| 47 used by Illumina, the .f and .r convention, and the Sanger convention | 51 originally used by Illumina, the .f and .r convention, and the Sanger |
| 48 (see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details). | 52 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html |
| 53 for details), and the new Illumina convention where the reads have the | |
| 54 same identifier with the fragment at the start of the description, e.g. | |
| 55 | |
| 56 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA | |
| 57 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA | |
| 49 | 58 |
| 50 Note that this does support multiple forward and reverse reads per template | 59 Note that this does support multiple forward and reverse reads per template |
| 51 (which is quite common with Sanger sequencing), e.g. this which is sorted | 60 (which is quite common with Sanger sequencing), e.g. this which is sorted |
| 52 alphabetically: | 61 alphabetically: |
| 53 | 62 |
| 111 assert not re_r.search("demo.p") | 120 assert not re_r.search("demo.p") |
| 112 assert not re_f.search("demo/2") | 121 assert not re_f.search("demo/2") |
| 113 assert not re_f.search("demo.r") | 122 assert not re_f.search("demo.r") |
| 114 assert not re_f.search("demo.q") | 123 assert not re_f.search("demo.q") |
| 115 | 124 |
| 125 re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$") | |
| 126 re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$") | |
| 127 assert re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") | |
| 128 assert re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") | |
| 129 assert not re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") | |
| 130 assert not re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") | |
| 131 | |
| 132 | |
| 116 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 | 133 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 |
| 117 in_handle = open(input_fastq) | 134 in_handle = open(input_fastq) |
| 118 if pairs_fastq: | 135 if pairs_fastq: |
| 119 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) | 136 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) |
| 120 pairs_r_writer = pairs_f_writer | 137 pairs_r_writer = pairs_f_writer |
| 126 | 143 |
| 127 for record in fastqReader(in_handle, format): | 144 for record in fastqReader(in_handle, format): |
| 128 count += 1 | 145 count += 1 |
| 129 name = record.identifier.split(None,1)[0] | 146 name = record.identifier.split(None,1)[0] |
| 130 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser | 147 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser |
| 148 is_forward = False | |
| 131 suffix = re_f.search(name) | 149 suffix = re_f.search(name) |
| 132 if suffix: | 150 if suffix: |
| 133 #============ | 151 #============ |
| 134 #Forward read | 152 #Forward read |
| 135 #============ | 153 #============ |
| 136 template = name[:suffix.start()] | 154 template = name[:suffix.start()] |
| 155 is_forward = True | |
| 156 elif re_illumina_f.match(record.identifier): | |
| 157 template = name #No suffix | |
| 158 is_forward = True | |
| 159 if is_forward: | |
| 137 #print name, "forward", template | 160 #print name, "forward", template |
| 138 forward += 1 | 161 forward += 1 |
| 139 if last_template == template: | 162 if last_template == template: |
| 140 buffered_reads.append(record) | 163 buffered_reads.append(record) |
| 141 else: | 164 else: |
| 143 for old in buffered_reads: | 166 for old in buffered_reads: |
| 144 singles_writer.write(old) | 167 singles_writer.write(old) |
| 145 singles += 1 | 168 singles += 1 |
| 146 #Save this read in buffer | 169 #Save this read in buffer |
| 147 buffered_reads = [record] | 170 buffered_reads = [record] |
| 148 last_template = template | 171 last_template = template |
| 149 else: | 172 else: |
| 173 is_reverse = False | |
| 150 suffix = re_r.search(name) | 174 suffix = re_r.search(name) |
| 151 if suffix: | 175 if suffix: |
| 152 #============ | 176 #============ |
| 153 #Reverse read | 177 #Reverse read |
| 154 #============ | 178 #============ |
| 155 template = name[:suffix.start()] | 179 template = name[:suffix.start()] |
| 180 is_reverse = True | |
| 181 elif re_illumina_r.match(record.identifier): | |
| 182 template = name #No suffix | |
| 183 is_reverse = True | |
| 184 if is_reverse: | |
| 156 #print name, "reverse", template | 185 #print name, "reverse", template |
| 157 reverse += 1 | 186 reverse += 1 |
| 158 if last_template == template and buffered_reads: | 187 if last_template == template and buffered_reads: |
| 159 #We have a pair! | 188 #We have a pair! |
| 160 #If there are multiple buffered forward reads, want to pick | 189 #If there are multiple buffered forward reads, want to pick |
| 206 else: | 235 else: |
| 207 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ | 236 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ |
| 208 % (count, forward, reverse, pairs, singles) | 237 % (count, forward, reverse, pairs, singles) |
| 209 | 238 |
| 210 assert count == pairs + singles == forward + reverse + neither, \ | 239 assert count == pairs + singles == forward + reverse + neither, \ |
| 211 "%i vs %i+%i=%i vs %i+%i=%i" \ | 240 "%i vs %i+%i=%i vs %i+%i+%i=%i" \ |
| 212 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither) | 241 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither) |
