comparison tools/fastq/fastq_paired_unpaired.py @ 3:81ac324efa49 draft

Re-uploaded v0.0.6 (to undo accidental commit of the BLAST+ wrappers here).
author peterjc
date Thu, 02 May 2013 11:27:46 -0400
parents
children 9b5cd13eb78b
comparison
equal deleted inserted replaced
2:fae4084a0bc0 3:81ac324efa49
1 #!/usr/bin/env python
2 """Divides a FASTQ into paired and single (orphan reads) as separate files.
3
4 The input file should be a valid FASTQ file which has been sorted so that
5 any partner forward+reverse reads are consecutive. The output files all
6 preserve this sort order. Pairing are recognised based on standard name
7 suffices. See below or run the tool with no arguments for more details.
8
9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even
10 Color Space should all work equally well).
11
12 This script is copyright 2010-2011 by Peter Cock, The James Hutton Institute
13 (formerly SCRI), Scotland, UK. All rights reserved.
14
15 See accompanying text file for licence details (MIT/BSD style).
16 """
17 import os
18 import sys
19 import re
20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
21
22 if "-v" in sys.argv or "--version" in sys.argv:
23 print "Version 0.0.6"
24 sys.exit(0)
25
26 def stop_err(msg, err=1):
27 sys.stderr.write(msg.rstrip() + "\n")
28 sys.exit(err)
29
30 msg = """Expect either 3 or 4 arguments, all FASTQ filenames.
31
32 If you want two output files, use four arguments:
33 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger)
34 - Sorted input FASTQ filename,
35 - Output paired FASTQ filename (forward then reverse interleaved),
36 - Output singles FASTQ filename (orphan reads)
37
38 If you want three output files, use five arguments:
39 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger)
40 - Sorted input FASTQ filename,
41 - Output forward paired FASTQ filename,
42 - Output reverse paired FASTQ filename,
43 - Output singles FASTQ filename (orphan reads)
44
45 The input file should be a valid FASTQ file which has been sorted so that
46 any partner forward+reverse reads are consecutive. The output files all
47 preserve this sort order.
48
49 Any reads where the forward/reverse naming suffix used is not recognised
50 are treated as orphan reads. The tool supports the /1 and /2 convention
51 originally used by Illumina, the .f and .r convention, and the Sanger
52 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html
53 for details), and the new Illumina convention where the reads have the
54 same identifier with the fragment at the start of the description, e.g.
55
56 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA
57 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA
58
59 Note that this does support multiple forward and reverse reads per template
60 (which is quite common with Sanger sequencing), e.g. this which is sorted
61 alphabetically:
62
63 WTSI_1055_4p17.p1kapIBF
64 WTSI_1055_4p17.p1kpIBF
65 WTSI_1055_4p17.q1kapIBR
66 WTSI_1055_4p17.q1kpIBR
67
68 or this where the reads already come in pairs:
69
70 WTSI_1055_4p17.p1kapIBF
71 WTSI_1055_4p17.q1kapIBR
72 WTSI_1055_4p17.p1kpIBF
73 WTSI_1055_4p17.q1kpIBR
74
75 both become:
76
77 WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR
78 WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR
79 """
80
81 if len(sys.argv) == 5:
82 format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:]
83 elif len(sys.argv) == 6:
84 pairs_fastq = None
85 format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:]
86 else:
87 stop_err(msg)
88
89 format = format.replace("fastq", "").lower()
90 if not format:
91 format="sanger" #safe default
92 elif format not in ["sanger","solexa","illumina","cssanger"]:
93 stop_err("Unrecognised format %s" % format)
94
95 def f_match(name):
96 if name.endswith("/1") or name.endswith(".f"):
97 return True
98
99 #Cope with three widely used suffix naming convensions,
100 #Illumina: /1 or /2
101 #Forward/revered: .f or .r
102 #Sanger, e.g. .p1k and .q1k
103 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html
104 re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
105 re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
106
107 #assert re_f.match("demo/1")
108 assert re_f.search("demo.f")
109 assert re_f.search("demo.s1")
110 assert re_f.search("demo.f1k")
111 assert re_f.search("demo.p1")
112 assert re_f.search("demo.p1k")
113 assert re_f.search("demo.p1lk")
114 assert re_r.search("demo/2")
115 assert re_r.search("demo.r")
116 assert re_r.search("demo.q1")
117 assert re_r.search("demo.q1lk")
118 assert not re_r.search("demo/1")
119 assert not re_r.search("demo.f")
120 assert not re_r.search("demo.p")
121 assert not re_f.search("demo/2")
122 assert not re_f.search("demo.r")
123 assert not re_f.search("demo.q")
124
125 re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$")
126 re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$")
127 assert re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")
128 assert re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
129 assert not re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
130 assert not re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")
131
132
133 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0
134 in_handle = open(input_fastq)
135 if pairs_fastq:
136 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format)
137 pairs_r_writer = pairs_f_writer
138 else:
139 pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format)
140 pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format)
141 singles_writer = fastqWriter(open(singles_fastq, "w"), format)
142 last_template, buffered_reads = None, []
143
144 for record in fastqReader(in_handle, format):
145 count += 1
146 name = record.identifier.split(None,1)[0]
147 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser
148 is_forward = False
149 suffix = re_f.search(name)
150 if suffix:
151 #============
152 #Forward read
153 #============
154 template = name[:suffix.start()]
155 is_forward = True
156 elif re_illumina_f.match(record.identifier):
157 template = name #No suffix
158 is_forward = True
159 if is_forward:
160 #print name, "forward", template
161 forward += 1
162 if last_template == template:
163 buffered_reads.append(record)
164 else:
165 #Any old buffered reads are orphans
166 for old in buffered_reads:
167 singles_writer.write(old)
168 singles += 1
169 #Save this read in buffer
170 buffered_reads = [record]
171 last_template = template
172 else:
173 is_reverse = False
174 suffix = re_r.search(name)
175 if suffix:
176 #============
177 #Reverse read
178 #============
179 template = name[:suffix.start()]
180 is_reverse = True
181 elif re_illumina_r.match(record.identifier):
182 template = name #No suffix
183 is_reverse = True
184 if is_reverse:
185 #print name, "reverse", template
186 reverse += 1
187 if last_template == template and buffered_reads:
188 #We have a pair!
189 #If there are multiple buffered forward reads, want to pick
190 #the first one (although we could try and do something more
191 #clever looking at the suffix to match them up...)
192 old = buffered_reads.pop(0)
193 pairs_f_writer.write(old)
194 pairs_r_writer.write(record)
195 pairs += 2
196 else:
197 #As this is a reverse read, this and any buffered read(s) are
198 #all orphans
199 for old in buffered_reads:
200 singles_writer.write(old)
201 singles += 1
202 buffered_reads = []
203 singles_writer.write(record)
204 singles += 1
205 last_template = None
206 else:
207 #===========================
208 #Neither forward nor reverse
209 #===========================
210 singles_writer.write(record)
211 singles += 1
212 neither += 1
213 for old in buffered_reads:
214 singles_writer.write(old)
215 singles += 1
216 buffered_reads = []
217 last_template = None
218 if last_template:
219 #Left over singles...
220 for old in buffered_reads:
221 singles_writer.write(old)
222 singles += 1
223 in_handle.close
224 singles_writer.close()
225 if pairs_fastq:
226 pairs_f_writer.close()
227 assert pairs_r_writer.file.closed
228 else:
229 pairs_f_writer.close()
230 pairs_r_writer.close()
231
232 if neither:
233 print "%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" \
234 % (count, forward, reverse, neither, pairs, singles)
235 else:
236 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \
237 % (count, forward, reverse, pairs, singles)
238
239 assert count == pairs + singles == forward + reverse + neither, \
240 "%i vs %i+%i=%i vs %i+%i+%i=%i" \
241 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)