Mercurial > repos > peterjc > fastq_paired_unpaired
comparison tools/fastq/fastq_paired_unpaired.py @ 3:81ac324efa49 draft
Re-uploaded v0.0.6 (to undo accidental commit of the BLAST+ wrappers here).
| author | peterjc |
|---|---|
| date | Thu, 02 May 2013 11:27:46 -0400 |
| parents | |
| children | 9b5cd13eb78b |
comparison
equal
deleted
inserted
replaced
| 2:fae4084a0bc0 | 3:81ac324efa49 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Divides a FASTQ into paired and single (orphan reads) as separate files. | |
| 3 | |
| 4 The input file should be a valid FASTQ file which has been sorted so that | |
| 5 any partner forward+reverse reads are consecutive. The output files all | |
| 6 preserve this sort order. Pairing are recognised based on standard name | |
| 7 suffices. See below or run the tool with no arguments for more details. | |
| 8 | |
| 9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even | |
| 10 Color Space should all work equally well). | |
| 11 | |
| 12 This script is copyright 2010-2011 by Peter Cock, The James Hutton Institute | |
| 13 (formerly SCRI), Scotland, UK. All rights reserved. | |
| 14 | |
| 15 See accompanying text file for licence details (MIT/BSD style). | |
| 16 """ | |
| 17 import os | |
| 18 import sys | |
| 19 import re | |
| 20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | |
| 21 | |
| 22 if "-v" in sys.argv or "--version" in sys.argv: | |
| 23 print "Version 0.0.6" | |
| 24 sys.exit(0) | |
| 25 | |
| 26 def stop_err(msg, err=1): | |
| 27 sys.stderr.write(msg.rstrip() + "\n") | |
| 28 sys.exit(err) | |
| 29 | |
| 30 msg = """Expect either 3 or 4 arguments, all FASTQ filenames. | |
| 31 | |
| 32 If you want two output files, use four arguments: | |
| 33 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) | |
| 34 - Sorted input FASTQ filename, | |
| 35 - Output paired FASTQ filename (forward then reverse interleaved), | |
| 36 - Output singles FASTQ filename (orphan reads) | |
| 37 | |
| 38 If you want three output files, use five arguments: | |
| 39 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) | |
| 40 - Sorted input FASTQ filename, | |
| 41 - Output forward paired FASTQ filename, | |
| 42 - Output reverse paired FASTQ filename, | |
| 43 - Output singles FASTQ filename (orphan reads) | |
| 44 | |
| 45 The input file should be a valid FASTQ file which has been sorted so that | |
| 46 any partner forward+reverse reads are consecutive. The output files all | |
| 47 preserve this sort order. | |
| 48 | |
| 49 Any reads where the forward/reverse naming suffix used is not recognised | |
| 50 are treated as orphan reads. The tool supports the /1 and /2 convention | |
| 51 originally used by Illumina, the .f and .r convention, and the Sanger | |
| 52 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html | |
| 53 for details), and the new Illumina convention where the reads have the | |
| 54 same identifier with the fragment at the start of the description, e.g. | |
| 55 | |
| 56 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA | |
| 57 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA | |
| 58 | |
| 59 Note that this does support multiple forward and reverse reads per template | |
| 60 (which is quite common with Sanger sequencing), e.g. this which is sorted | |
| 61 alphabetically: | |
| 62 | |
| 63 WTSI_1055_4p17.p1kapIBF | |
| 64 WTSI_1055_4p17.p1kpIBF | |
| 65 WTSI_1055_4p17.q1kapIBR | |
| 66 WTSI_1055_4p17.q1kpIBR | |
| 67 | |
| 68 or this where the reads already come in pairs: | |
| 69 | |
| 70 WTSI_1055_4p17.p1kapIBF | |
| 71 WTSI_1055_4p17.q1kapIBR | |
| 72 WTSI_1055_4p17.p1kpIBF | |
| 73 WTSI_1055_4p17.q1kpIBR | |
| 74 | |
| 75 both become: | |
| 76 | |
| 77 WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR | |
| 78 WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR | |
| 79 """ | |
| 80 | |
| 81 if len(sys.argv) == 5: | |
| 82 format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] | |
| 83 elif len(sys.argv) == 6: | |
| 84 pairs_fastq = None | |
| 85 format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] | |
| 86 else: | |
| 87 stop_err(msg) | |
| 88 | |
| 89 format = format.replace("fastq", "").lower() | |
| 90 if not format: | |
| 91 format="sanger" #safe default | |
| 92 elif format not in ["sanger","solexa","illumina","cssanger"]: | |
| 93 stop_err("Unrecognised format %s" % format) | |
| 94 | |
| 95 def f_match(name): | |
| 96 if name.endswith("/1") or name.endswith(".f"): | |
| 97 return True | |
| 98 | |
| 99 #Cope with three widely used suffix naming convensions, | |
| 100 #Illumina: /1 or /2 | |
| 101 #Forward/revered: .f or .r | |
| 102 #Sanger, e.g. .p1k and .q1k | |
| 103 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html | |
| 104 re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") | |
| 105 re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") | |
| 106 | |
| 107 #assert re_f.match("demo/1") | |
| 108 assert re_f.search("demo.f") | |
| 109 assert re_f.search("demo.s1") | |
| 110 assert re_f.search("demo.f1k") | |
| 111 assert re_f.search("demo.p1") | |
| 112 assert re_f.search("demo.p1k") | |
| 113 assert re_f.search("demo.p1lk") | |
| 114 assert re_r.search("demo/2") | |
| 115 assert re_r.search("demo.r") | |
| 116 assert re_r.search("demo.q1") | |
| 117 assert re_r.search("demo.q1lk") | |
| 118 assert not re_r.search("demo/1") | |
| 119 assert not re_r.search("demo.f") | |
| 120 assert not re_r.search("demo.p") | |
| 121 assert not re_f.search("demo/2") | |
| 122 assert not re_f.search("demo.r") | |
| 123 assert not re_f.search("demo.q") | |
| 124 | |
| 125 re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$") | |
| 126 re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$") | |
| 127 assert re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") | |
| 128 assert re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") | |
| 129 assert not re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") | |
| 130 assert not re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") | |
| 131 | |
| 132 | |
| 133 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 | |
| 134 in_handle = open(input_fastq) | |
| 135 if pairs_fastq: | |
| 136 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) | |
| 137 pairs_r_writer = pairs_f_writer | |
| 138 else: | |
| 139 pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format) | |
| 140 pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format) | |
| 141 singles_writer = fastqWriter(open(singles_fastq, "w"), format) | |
| 142 last_template, buffered_reads = None, [] | |
| 143 | |
| 144 for record in fastqReader(in_handle, format): | |
| 145 count += 1 | |
| 146 name = record.identifier.split(None,1)[0] | |
| 147 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser | |
| 148 is_forward = False | |
| 149 suffix = re_f.search(name) | |
| 150 if suffix: | |
| 151 #============ | |
| 152 #Forward read | |
| 153 #============ | |
| 154 template = name[:suffix.start()] | |
| 155 is_forward = True | |
| 156 elif re_illumina_f.match(record.identifier): | |
| 157 template = name #No suffix | |
| 158 is_forward = True | |
| 159 if is_forward: | |
| 160 #print name, "forward", template | |
| 161 forward += 1 | |
| 162 if last_template == template: | |
| 163 buffered_reads.append(record) | |
| 164 else: | |
| 165 #Any old buffered reads are orphans | |
| 166 for old in buffered_reads: | |
| 167 singles_writer.write(old) | |
| 168 singles += 1 | |
| 169 #Save this read in buffer | |
| 170 buffered_reads = [record] | |
| 171 last_template = template | |
| 172 else: | |
| 173 is_reverse = False | |
| 174 suffix = re_r.search(name) | |
| 175 if suffix: | |
| 176 #============ | |
| 177 #Reverse read | |
| 178 #============ | |
| 179 template = name[:suffix.start()] | |
| 180 is_reverse = True | |
| 181 elif re_illumina_r.match(record.identifier): | |
| 182 template = name #No suffix | |
| 183 is_reverse = True | |
| 184 if is_reverse: | |
| 185 #print name, "reverse", template | |
| 186 reverse += 1 | |
| 187 if last_template == template and buffered_reads: | |
| 188 #We have a pair! | |
| 189 #If there are multiple buffered forward reads, want to pick | |
| 190 #the first one (although we could try and do something more | |
| 191 #clever looking at the suffix to match them up...) | |
| 192 old = buffered_reads.pop(0) | |
| 193 pairs_f_writer.write(old) | |
| 194 pairs_r_writer.write(record) | |
| 195 pairs += 2 | |
| 196 else: | |
| 197 #As this is a reverse read, this and any buffered read(s) are | |
| 198 #all orphans | |
| 199 for old in buffered_reads: | |
| 200 singles_writer.write(old) | |
| 201 singles += 1 | |
| 202 buffered_reads = [] | |
| 203 singles_writer.write(record) | |
| 204 singles += 1 | |
| 205 last_template = None | |
| 206 else: | |
| 207 #=========================== | |
| 208 #Neither forward nor reverse | |
| 209 #=========================== | |
| 210 singles_writer.write(record) | |
| 211 singles += 1 | |
| 212 neither += 1 | |
| 213 for old in buffered_reads: | |
| 214 singles_writer.write(old) | |
| 215 singles += 1 | |
| 216 buffered_reads = [] | |
| 217 last_template = None | |
| 218 if last_template: | |
| 219 #Left over singles... | |
| 220 for old in buffered_reads: | |
| 221 singles_writer.write(old) | |
| 222 singles += 1 | |
| 223 in_handle.close | |
| 224 singles_writer.close() | |
| 225 if pairs_fastq: | |
| 226 pairs_f_writer.close() | |
| 227 assert pairs_r_writer.file.closed | |
| 228 else: | |
| 229 pairs_f_writer.close() | |
| 230 pairs_r_writer.close() | |
| 231 | |
| 232 if neither: | |
| 233 print "%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" \ | |
| 234 % (count, forward, reverse, neither, pairs, singles) | |
| 235 else: | |
| 236 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ | |
| 237 % (count, forward, reverse, pairs, singles) | |
| 238 | |
| 239 assert count == pairs + singles == forward + reverse + neither, \ | |
| 240 "%i vs %i+%i=%i vs %i+%i+%i=%i" \ | |
| 241 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither) |
