annotate fastq_subset.py @ 17:831c838927c7 draft

Test updated fastq_subset.py utility.
author pjbriggs
date Mon, 02 Jul 2018 10:57:46 -0400
parents 856cafcbf422
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
16
856cafcbf422 Uploaded
pjbriggs
parents:
diff changeset
1 #!/usr/bin/env python
856cafcbf422 Uploaded
pjbriggs
parents:
diff changeset
2
856cafcbf422 Uploaded
pjbriggs
parents:
diff changeset
3 import argparse
856cafcbf422 Uploaded
pjbriggs
parents:
diff changeset
4 import random
17
831c838927c7 Test updated fastq_subset.py utility.
pjbriggs
parents: 16
diff changeset
5 import gzip
831c838927c7 Test updated fastq_subset.py utility.
pjbriggs
parents: 16
diff changeset
6
831c838927c7 Test updated fastq_subset.py utility.
pjbriggs
parents: 16
diff changeset
# Default number of bytes fetched from disk per read operation
CHUNKSIZE = 102400

def _as_text(data):
    """
    Return 'data' as a native string

    Data read in binary mode is already a native string under
    Python 2; under Python 3 it is bytes and is decoded here
    (UTF-8, of which the ASCII used by Fastq is a subset).
    """
    if isinstance(data,str):
        return data
    return data.decode()

def getlines(filen,chunksize=None):
    """
    Efficiently fetch lines from a file one by one

    Generator function implementing an efficient
    method of reading lines sequentially from a file,
    attempting to minimise the number of read operations
    and performing the line splitting in memory:

    >>> for line in getlines(filen):
    >>>    ...do something...

    Input file can be gzipped; this function should
    handle this invisibly provided the file name ends
    with '.gz'.

    Every line in the file is returned, including blank
    lines and a final line which has no terminating
    newline.

    Arguments:
      filen (str): path of the file to read lines from
      chunksize (int): optional, number of bytes to fetch
        per read operation (defaults to CHUNKSIZE)

    Yields:
      String: next line of text from the file, with any
        newline character removed.
    """
    if chunksize is None:
        chunksize = CHUNKSIZE
    if filen.split('.')[-1] == 'gz':
        opener = gzip.open
    else:
        opener = open
    # Bytes carried over from the previous chunk which don't
    # yet form a complete line
    buf = b''
    # Using 'with' ensures the file is closed, including when
    # the caller stops iterating before the end of the file
    with opener(filen,'rb') as fp:
        while True:
            # Grab a chunk of data
            data = fp.read(chunksize)
            # Check for EOF
            if not data:
                break
            # Add to buffer and locate the last complete line
            buf += data
            i = buf.rfind(b'\n')
            if i == -1:
                # No complete line yet, need more data
                continue
            complete = buf[:i]
            buf = buf[i+1:]
            # Splitting on the last newline means multi-byte
            # characters are never cut in half before decoding
            for line in _as_text(complete).split('\n'):
                yield line
    # Final line had no terminating newline
    if buf:
        yield _as_text(buf)
16
856cafcbf422 Uploaded
pjbriggs
parents:
diff changeset
62
856cafcbf422 Uploaded
pjbriggs
parents:
diff changeset
def count_reads(fastq):
    """
    Count number of reads in a Fastq file

    The number of reads is the number of lines in the
    file divided by four (rounded down). The file is
    scanned in fixed-size chunks so that memory use is
    independent of the file size.

    Input file can be gzipped; it is decompressed on the
    fly provided the file name ends with '.gz'.

    Arguments:
      fastq (str): path of the Fastq file

    Returns:
      Integer: number of complete reads in the file.
    """
    if fastq.split('.')[-1] == 'gz':
        opener = gzip.open
    else:
        opener = open
    nlines = 0
    # Last byte seen, used to detect a final line which
    # has no terminating newline
    last = b''
    with opener(fastq,'rb') as fq:
        while True:
            buf = fq.read(1024*1024)
            if not buf:
                break
            nlines += buf.count(b'\n')
            last = buf[-1:]
    if last and last != b'\n':
        # Unterminated final line still counts as a line
        nlines += 1
    # Floor division keeps the result an integer under
    # both Python 2 and Python 3
    return nlines//4
856cafcbf422 Uploaded
pjbriggs
parents:
diff changeset
74
856cafcbf422 Uploaded
pjbriggs
parents:
diff changeset
def fastq_subset(fastq_in,fastq_out,indices):
    """
    Output a subset of reads from a Fastq file

    The reads to output are specified by a collection
    of integer indices; only reads at those
    positions in the input file will be written
    to the output.

    Indices are zero-based (the first read in the file
    is read 0). They may be supplied in any order;
    duplicated indices are only output once, and
    indices which don't correspond to a read in the
    file are ignored. Reads are always written in the
    order they appear in the input.

    Arguments:
      fastq_in (str): path of the input Fastq file
        (lines are fetched using 'getlines')
      fastq_out (str): path of the output Fastq file
        (overwritten if it already exists)
      indices (iterable): integer indices of the reads
        to output

    Raises:
      Exception: if the end of the input is reached
        part way through a read.
    """
    # The input is traversed once from start to end, so the
    # indices must be matched in ascending order; negative
    # values can never match and would block later indices
    wanted = sorted(set(ix for ix in indices if ix >= 0))
    lines = getlines(fastq_in)
    try:
        with open(fastq_out,'w') as fq_out:
            if not wanted:
                # Nothing to extract (output is left empty)
                return
            # Position in the list of wanted indices
            i = 0
            # Index of the read currently being assembled
            n = 0
            # Lines of the read currently being assembled
            rd = []
            # Iterate through the file
            for line in lines:
                rd.append(line)
                if len(rd) < 4:
                    continue
                # Got a complete read: output it if its
                # index is the next one wanted
                if n == wanted[i]:
                    fq_out.write("%s\n" % '\n'.join(rd))
                    i += 1
                    if i == len(wanted):
                        # Subset complete
                        return
                # Update for next read
                n += 1
                rd = []
            # End of file: check nothing was left over
            if rd:
                raise Exception("Incomplete read at file end: %s"
                                % rd)
    finally:
        # Release the input promptly when stopping early
        lines.close()
16
856cafcbf422 Uploaded
pjbriggs
parents:
diff changeset
112
856cafcbf422 Uploaded
pjbriggs
parents:
diff changeset
# Command line usage:
#
#   fastq_subset.py [-n SUBSET_SIZE] [-s SEED] FASTQ_R1 FASTQ_R2
#
# Counts the reads in FASTQ_R1 and, if -n is given, writes
# the same randomly selected subset of reads from each Fastq
# of the pair to 'subset_r1.fq' and 'subset_r2.fq' in the
# current directory.
if __name__ == "__main__":

    p = argparse.ArgumentParser()
    p.add_argument("fastq_r1")
    p.add_argument("fastq_r2")
    # Values below 1 are a fraction of the reads, anything
    # else is an absolute number of reads; 'type=float' makes
    # argparse reject non-numeric values with a usage message
    p.add_argument("-n",
                   dest="subset_size",
                   type=float,
                   default=None,
                   help="subset size")
    p.add_argument("-s",
                   dest="seed",
                   type=int,
                   default=None,
                   help="seed for random number generator")
    args = p.parse_args()

    # Single-argument print(...) behaves identically under
    # Python 2 and Python 3
    print("Processing fastq pair:")
    print("\t%s" % args.fastq_r1)
    print("\t%s" % args.fastq_r2)

    nreads = count_reads(args.fastq_r1)
    print("Counted %d reads in %s" % (nreads,args.fastq_r1))

    if args.subset_size is not None:
        subset_size = args.subset_size
        # Written as 'not >=' so that NaN is rejected as well
        if not subset_size >= 0.0:
            p.error("subset size must be zero or greater")
        if subset_size < 1.0:
            subset_size = int(nreads*subset_size)
        elif subset_size > nreads:
            # random.sample would otherwise fail with a traceback
            p.error("subset size (%s) exceeds number of reads (%d)"
                    % (subset_size,nreads))
        else:
            subset_size = int(subset_size)
        print("Extracting subset of reads: %s" % subset_size)
        if args.seed is not None:
            print("Random number generator seed: %d" % args.seed)
            random.seed(args.seed)
        # Use the lazy range type so that the full list of read
        # indices is never held in memory under Python 2
        try:
            index_range = xrange
        except NameError:
            index_range = range
        # The same indices are used for both files so that the
        # R1/R2 reads stay paired
        subset = sorted(random.sample(index_range(nreads),subset_size))
        fastq_subset(args.fastq_r1,"subset_r1.fq",subset)
        fastq_subset(args.fastq_r2,"subset_r2.fq",subset)