comparison mismatch_frequencies.py @ 7:270681625775

Add help text and fix for skipping the beginning/end of reads aligned in reverse orientation.
author mvdbeek
date Wed, 28 Jan 2015 13:12:56 +0100
parents e8ebe5132737
children
comparison
equal deleted inserted replaced
6:a7bf987b8cc4 7:270681625775
29 len_dict={} 29 len_dict={}
30 for i in range(minimal_readlength, maximal_readlength+1): 30 for i in range(minimal_readlength, maximal_readlength+1):
31 len_dict[i]=mismatch_dict.copy() 31 len_dict[i]=mismatch_dict.copy()
32 for alignedread in pysam_alignment: 32 for alignedread in pysam_alignment:
33 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength): 33 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength):
34 len_dict[int(alignedread.rlen)]['total_mapped'] += 1 34 len_dict[int(alignedread.rlen)]['total valid reads'] += 1
35 MD=alignedread.opt('MD') 35 MD=alignedread.opt('MD')
36 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches): 36 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches):
37 (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse) 37 (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)
38 if ref_base == None: 38 if ref_base == None:
39 continue 39 continue
131 reference_base+=MD[result.end()-1] 131 reference_base+=MD[result.end()-1]
132 mismatched_base+=readseq[mismatch_position-1] 132 mismatched_base+=readseq[mismatch_position-1]
133 if is_reverse: 133 if is_reverse:
134 reference_base=reverseComplement(reference_base) 134 reference_base=reverseComplement(reference_base)
135 mismatched_base=reverseComplement(mismatched_base) 135 mismatched_base=reverseComplement(mismatched_base)
136 mismatch_position=len(readseq)-mismatch_position-1
136 if mismatched_base=='N': 137 if mismatched_base=='N':
137 return (None, None) 138 return (None, None)
138 if self.mismatch_in_allowed_region(readseq, mismatch_position): 139 if self.mismatch_in_allowed_region(readseq, mismatch_position):
139 return (reference_base, mismatched_base) 140 return (reference_base, mismatched_base)
140 else: 141 else:
141 return (None, None) 142 return (None, None)
142
143 143
144 def reverseComplement(sequence): 144 def reverseComplement(sequence):
145 '''do a reverse complement of DNA base. 145 '''do a reverse complement of DNA base.
146 >>> reverseComplement('ATGC')=='GCAT' 146 >>> reverseComplement('ATGC')=='GCAT'
147 True 147 True
152 return sequence.upper().translate(complement)[::-1] 152 return sequence.upper().translate(complement)[::-1]
153 153
154 def barplot(df, library, axes): 154 def barplot(df, library, axes):
155 df.plot(kind='bar', ax=axes, subplots=False,\ 155 df.plot(kind='bar', ax=axes, subplots=False,\
156 stacked=False, legend='test',\ 156 stacked=False, legend='test',\
157 title='Mismatches in TE small RNAs from {0}'.format(library)) 157 title='Mismatch frequencies for {0}'.format(library))
158 158
159 def result_dict_to_df(result_dict): 159 def result_dict_to_df(result_dict):
160 mismatches = [] 160 mismatches = []
161 libraries = [] 161 libraries = []
162 for mismatch, library in result_dict.iteritems(): 162 for mismatch, library in result_dict.iteritems():
176 for i,library in enumerate (names): 176 for i,library in enumerate (names):
177 axes=fig.add_subplot(nrows,2,i+1) 177 axes=fig.add_subplot(nrows,2,i+1)
178 library_dict=result_dict[library] 178 library_dict=result_dict[library]
179 for length in library_dict.keys(): 179 for length in library_dict.keys():
180 for mismatch in library_dict[length]: 180 for mismatch in library_dict[length]:
181 if mismatch == 'total_mapped': 181 if mismatch == 'total valid reads':
182 continue 182 continue
183 library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total_mapped'])*100 183 library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total valid reads'])*100
184 del library_dict[length]['total_mapped'] 184 del library_dict[length]['total valid reads']
185 df=pd.DataFrame(library_dict) 185 df=pd.DataFrame(library_dict)
186 barplot(df, library, axes), 186 barplot(df, library, axes),
187 axes.set_ylabel('Percent of mapped reads with mismatches') 187 axes.set_ylabel('Mismatch count / all valid reads * 100')
188 fig.savefig(args.output_pdf, format='pdf') 188 fig.savefig(args.output_pdf, format='pdf')
189 189
190 def setup_MismatchFrequencies(args): 190 def setup_MismatchFrequencies(args):
191 resultDict=OrderedDict() 191 resultDict=OrderedDict()
192 kw_list=[{'result_dict' : resultDict, 192 kw_list=[{'result_dict' : resultDict,