annotate fastq_paired_end_deinterlacer.py @ 0:e6e6498bf63c draft

Imported from capsule None
author devteam
date Thu, 23 Jan 2014 12:31:29 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6e6498bf63c Imported from capsule None
devteam
parents:
diff changeset
1 #Florent Angly
e6e6498bf63c Imported from capsule None
devteam
parents:
diff changeset
2 import sys
e6e6498bf63c Imported from capsule None
devteam
parents:
diff changeset
3 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner
e6e6498bf63c Imported from capsule None
devteam
parents:
diff changeset
4
e6e6498bf63c Imported from capsule None
devteam
parents:
diff changeset
def main():
    """De-interlace a FASTQ file of paired-end reads into four output files.

    Command-line arguments (read from ``sys.argv``):
        1. input FASTQ file holding the interlaced reads
        2. FASTQ format name; 'sanger' is used when this argument is empty
        3. output file for the first mate of every complete pair
        4. output file for the second mate of every complete pair
        5. output file for unpaired reads that are first mates
        6. output file for unpaired reads that are second mates

    Which read counts as the "first" mate is decided by
    ``fastqJoiner.is_first_mate``; the partner of a read is looked up through
    ``fastqJoiner.get_paired_identifier``.

    A short summary is printed to stdout once the whole input has been read.
    """
    input_filename = sys.argv[1]
    fastq_format = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    # Everything that needs closing is registered here as soon as it exists,
    # so the ``finally`` clause can release it even when processing fails.
    opened = []
    try:
        # Lookup-by-identifier reader, used to fetch a read and its partner.
        named_reader = fastqNamedReader(open(input_filename, 'rb'), format=fastq_format)
        opened.append(named_reader)
        mate1_out = fastqWriter(open(mate1_filename, 'wb'), format=fastq_format)
        opened.append(mate1_out)
        mate2_out = fastqWriter(open(mate2_filename, 'wb'), format=fastq_format)
        opened.append(mate2_out)
        single1_out = fastqWriter(open(single1_filename, 'wb'), format=fastq_format)
        opened.append(single1_out)
        single2_out = fastqWriter(open(single2_filename, 'wb'), format=fastq_format)
        opened.append(single2_out)
        joiner = fastqJoiner(fastq_format)

        # Second, independent handle on the input for the sequential pass.
        # It is kept in a variable so that it can be closed afterwards.
        sequential_handle = open(input_filename, 'rb')
        opened.append(sequential_handle)

        last_index = None      # stays None when the input yields no reads
        unpaired_count = 0
        pending_mates = set()  # partners already written with their pair
        for last_index, read in enumerate(fastqReader(sequential_handle, format=fastq_format)):
            if read.identifier in pending_mates:
                # Already emitted when its partner was encountered earlier.
                pending_mates.remove(read.identifier)
                continue

            mate1 = named_reader.get(read.identifier)
            mate2 = named_reader.get(joiner.get_paired_identifier(mate1))

            if mate2:
                # This is a mate pair
                pending_mates.add(mate2.identifier)
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                unpaired_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)

        # Single-argument print( ... ) behaves the same as a Python 2 print
        # statement and as the Python 3 print function.
        if last_index is None:
            print("Your input file contained no valid FASTQ sequences.")
        else:
            if unpaired_count:
                print('There were %i reads with no mate.' % unpaired_count)
            # last_index + 1 is the number of reads seen; every paired read is
            # half of a pair. '//' keeps the integer result on Python 2 and 3.
            print('De-interlaced %s pairs of sequences.' % ((last_index - unpaired_count + 1) // 2))
    finally:
        for resource in opened:
            resource.close()


if __name__ == "__main__":
    main()