16
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import argparse
|
|
4 import random
|
17
|
5 import gzip
|
|
6
|
|
7 CHUNKSIZE = 102400
|
|
8
|
|
9 def getlines(filen):
|
|
10 """
|
|
11 Efficiently fetch lines from a file one by one
|
|
12
|
|
13 Generator function implementing an efficient
|
|
14 method of reading lines sequentially from a file,
|
|
15 attempting to minimise the number of read operations
|
|
16 and performing the line splitting in memory:
|
|
17
|
|
18 >>> for line in getlines(filen):
|
|
19 >>> ...do something...
|
|
20
|
|
21 Input file can be gzipped; this function should
|
|
22 handle this invisibly provided the file names ends
|
|
23 with '.gz'.
|
|
24
|
|
25 Arguments:
|
|
26 filen (str): path of the file to read lines from
|
|
27
|
|
28 Yields:
|
|
29 String: next line of text from the file, with any
|
|
30 newline character removed.
|
|
31 """
|
|
32 if filen.split('.')[-1] == 'gz':
|
|
33 fp = gzip.open(filen,'rb')
|
|
34 else:
|
|
35 fp = open(filen,'rb')
|
|
36 # Read in data in chunks
|
|
37 buf = ''
|
|
38 lines = []
|
|
39 while True:
|
|
40 # Grab a chunk of data
|
|
41 data = fp.read(CHUNKSIZE)
|
|
42 # Check for EOF
|
|
43 if not data:
|
|
44 break
|
|
45 # Add to buffer and split into lines
|
|
46 buf = buf + data
|
|
47 if buf[0] == '\n':
|
|
48 buf = buf[1:]
|
|
49 if buf[-1] != '\n':
|
|
50 i = buf.rfind('\n')
|
|
51 if i == -1:
|
|
52 continue
|
|
53 else:
|
|
54 lines = buf[:i].split('\n')
|
|
55 buf = buf[i+1:]
|
|
56 else:
|
|
57 lines = buf[:-1].split('\n')
|
|
58 buf = ''
|
|
59 # Return the lines one at a time
|
|
60 for line in lines:
|
|
61 yield line
|
16
|
62
|
|
63 def count_reads(fastq):
|
|
64 """
|
|
65 Count number of reads in a Fastq file
|
|
66 """
|
|
67 n = 0
|
|
68 with open(fastq,'r') as fq:
|
|
69 while True:
|
|
70 buf = fq.read()
|
|
71 n += buf.count('\n')
|
|
72 if buf == "": break
|
|
73 return n/4
|
|
74
|
|
75 def fastq_subset(fastq_in,fastq_out,indices):
|
|
76 """
|
|
77 Output a subset of reads from a Fastq file
|
|
78
|
|
79 The reads to output are specifed by a list
|
|
80 of integer indices; only reads at those
|
|
81 positions in the input file will be written
|
|
82 to the output.
|
|
83 """
|
17
|
84 with open(fastq_out,'w') as fq_out:
|
|
85 # Current index
|
16
|
86 i = 0
|
17
|
87 # Read count
|
|
88 n = 0
|
|
89 # Read contents
|
|
90 rd = []
|
|
91 # Iterate through the file
|
|
92 for ii,line in enumerate(getlines(fastq_in),start=1):
|
|
93 rd.append(line)
|
|
94 if ii%4 == 0:
|
|
95 # Got a complete read
|
|
96 try:
|
|
97 # If read index matches the current index
|
|
98 # then output the read
|
|
99 if n == indices[i]:
|
|
100 fq_out.write("%s\n" % '\n'.join(rd))
|
|
101 i += 1
|
|
102 # Update for next read
|
|
103 n += 1
|
|
104 rd = []
|
|
105 except IndexError:
|
|
106 # Subset complete
|
|
107 return
|
|
108 # End of file: check nothing was left over
|
|
109 if rd:
|
|
110 raise Exception("Incomplete read at file end: %s"
|
|
111 % rd)
|
16
|
112
|
|
113 if __name__ == "__main__":
|
|
114
|
|
115 p = argparse.ArgumentParser()
|
|
116 p.add_argument("fastq_r1")
|
|
117 p.add_argument("fastq_r2")
|
|
118 p.add_argument("-n",
|
|
119 dest="subset_size",
|
|
120 default=None,
|
|
121 help="subset size")
|
|
122 p.add_argument("-s",
|
|
123 dest="seed",
|
|
124 type=int,
|
|
125 default=None,
|
|
126 help="seed for random number generator")
|
|
127 args = p.parse_args()
|
|
128
|
|
129 print "Processing fastq pair:"
|
|
130 print "\t%s" % args.fastq_r1
|
|
131 print "\t%s" % args.fastq_r2
|
|
132
|
|
133 nreads = count_reads(args.fastq_r1)
|
|
134 print "Counted %d reads in %s" % (nreads,args.fastq_r1)
|
|
135
|
|
136 if args.subset_size is not None:
|
|
137 subset_size = float(args.subset_size)
|
|
138 if subset_size < 1.0:
|
|
139 subset_size = int(nreads*subset_size)
|
|
140 else:
|
|
141 subset_size = int(subset_size)
|
|
142 print "Extracting subset of reads: %s" % subset_size
|
|
143 if args.seed is not None:
|
|
144 print "Random number generator seed: %d" % args.seed
|
|
145 random.seed(args.seed)
|
17
|
146 subset = sorted(random.sample(xrange(nreads),subset_size))
|
16
|
147 fastq_subset(args.fastq_r1,"subset_r1.fq",subset)
|
|
148 fastq_subset(args.fastq_r2,"subset_r2.fq",subset)
|