16
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import argparse
|
|
4 import random
|
|
5 from Bio.SeqIO.QualityIO import FastqGeneralIterator
|
|
6
|
|
7 def count_reads(fastq):
|
|
8 """
|
|
9 Count number of reads in a Fastq file
|
|
10 """
|
|
11 n = 0
|
|
12 with open(fastq,'r') as fq:
|
|
13 while True:
|
|
14 buf = fq.read()
|
|
15 n += buf.count('\n')
|
|
16 if buf == "": break
|
|
17 return n/4
|
|
18
|
|
19 def fastq_subset(fastq_in,fastq_out,indices):
|
|
20 """
|
|
21 Output a subset of reads from a Fastq file
|
|
22
|
|
23 The reads to output are specifed by a list
|
|
24 of integer indices; only reads at those
|
|
25 positions in the input file will be written
|
|
26 to the output.
|
|
27 """
|
|
28 with open(fastq_in,'r') as fq_in:
|
|
29 fq_out = open(fastq_out,'w')
|
|
30 i = 0
|
|
31 for title,seq,qual in FastqGeneralIterator(fq_in):
|
|
32 if i in indices:
|
|
33 fq_out.write("@%s\n%s\n+\n%s\n" % (title,
|
|
34 seq,
|
|
35 qual))
|
|
36 i += 1
|
|
37 fq_out.close()
|
|
38
|
|
39 if __name__ == "__main__":
|
|
40
|
|
41 p = argparse.ArgumentParser()
|
|
42 p.add_argument("fastq_r1")
|
|
43 p.add_argument("fastq_r2")
|
|
44 p.add_argument("-n",
|
|
45 dest="subset_size",
|
|
46 default=None,
|
|
47 help="subset size")
|
|
48 p.add_argument("-s",
|
|
49 dest="seed",
|
|
50 type=int,
|
|
51 default=None,
|
|
52 help="seed for random number generator")
|
|
53 args = p.parse_args()
|
|
54
|
|
55 print "Processing fastq pair:"
|
|
56 print "\t%s" % args.fastq_r1
|
|
57 print "\t%s" % args.fastq_r2
|
|
58
|
|
59 nreads = count_reads(args.fastq_r1)
|
|
60 print "Counted %d reads in %s" % (nreads,args.fastq_r1)
|
|
61
|
|
62 if args.subset_size is not None:
|
|
63 subset_size = float(args.subset_size)
|
|
64 if subset_size < 1.0:
|
|
65 subset_size = int(nreads*subset_size)
|
|
66 else:
|
|
67 subset_size = int(subset_size)
|
|
68 print "Extracting subset of reads: %s" % subset_size
|
|
69 if args.seed is not None:
|
|
70 print "Random number generator seed: %d" % args.seed
|
|
71 random.seed(args.seed)
|
|
72 subset = random.sample(xrange(nreads),subset_size)
|
|
73 fastq_subset(args.fastq_r1,"subset_r1.fq",subset)
|
|
74 fastq_subset(args.fastq_r2,"subset_r2.fq",subset)
|