Mercurial > repos > mvdbeek > mismatch_frequencies
comparison mismatch_frequencies.py @ 7:270681625775
Add help text and fix for skipping the beginning/end of reads aligned in reverse orientation.
| author | mvdbeek |
|---|---|
| date | Wed, 28 Jan 2015 13:12:56 +0100 |
| parents | e8ebe5132737 |
| children |
comparison
equal
deleted
inserted
replaced
| 6:a7bf987b8cc4 | 7:270681625775 |
|---|---|
| 29 len_dict={} | 29 len_dict={} |
| 30 for i in range(minimal_readlength, maximal_readlength+1): | 30 for i in range(minimal_readlength, maximal_readlength+1): |
| 31 len_dict[i]=mismatch_dict.copy() | 31 len_dict[i]=mismatch_dict.copy() |
| 32 for alignedread in pysam_alignment: | 32 for alignedread in pysam_alignment: |
| 33 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength): | 33 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength): |
| 34 len_dict[int(alignedread.rlen)]['total_mapped'] += 1 | 34 len_dict[int(alignedread.rlen)]['total valid reads'] += 1 |
| 35 MD=alignedread.opt('MD') | 35 MD=alignedread.opt('MD') |
| 36 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches): | 36 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches): |
| 37 (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse) | 37 (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse) |
| 38 if ref_base == None: | 38 if ref_base == None: |
| 39 continue | 39 continue |
| 131 reference_base+=MD[result.end()-1] | 131 reference_base+=MD[result.end()-1] |
| 132 mismatched_base+=readseq[mismatch_position-1] | 132 mismatched_base+=readseq[mismatch_position-1] |
| 133 if is_reverse: | 133 if is_reverse: |
| 134 reference_base=reverseComplement(reference_base) | 134 reference_base=reverseComplement(reference_base) |
| 135 mismatched_base=reverseComplement(mismatched_base) | 135 mismatched_base=reverseComplement(mismatched_base) |
| 136 mismatch_position=len(readseq)-mismatch_position-1 | |
| 136 if mismatched_base=='N': | 137 if mismatched_base=='N': |
| 137 return (None, None) | 138 return (None, None) |
| 138 if self.mismatch_in_allowed_region(readseq, mismatch_position): | 139 if self.mismatch_in_allowed_region(readseq, mismatch_position): |
| 139 return (reference_base, mismatched_base) | 140 return (reference_base, mismatched_base) |
| 140 else: | 141 else: |
| 141 return (None, None) | 142 return (None, None) |
| 142 | |
| 143 | 143 |
| 144 def reverseComplement(sequence): | 144 def reverseComplement(sequence): |
| 145 '''do a reverse complement of DNA base. | 145 '''do a reverse complement of DNA base. |
| 146 >>> reverseComplement('ATGC')=='GCAT' | 146 >>> reverseComplement('ATGC')=='GCAT' |
| 147 True | 147 True |
| 152 return sequence.upper().translate(complement)[::-1] | 152 return sequence.upper().translate(complement)[::-1] |
| 153 | 153 |
| 154 def barplot(df, library, axes): | 154 def barplot(df, library, axes): |
| 155 df.plot(kind='bar', ax=axes, subplots=False,\ | 155 df.plot(kind='bar', ax=axes, subplots=False,\ |
| 156 stacked=False, legend='test',\ | 156 stacked=False, legend='test',\ |
| 157 title='Mismatches in TE small RNAs from {0}'.format(library)) | 157 title='Mismatch frequencies for {0}'.format(library)) |
| 158 | 158 |
| 159 def result_dict_to_df(result_dict): | 159 def result_dict_to_df(result_dict): |
| 160 mismatches = [] | 160 mismatches = [] |
| 161 libraries = [] | 161 libraries = [] |
| 162 for mismatch, library in result_dict.iteritems(): | 162 for mismatch, library in result_dict.iteritems(): |
| 176 for i,library in enumerate (names): | 176 for i,library in enumerate (names): |
| 177 axes=fig.add_subplot(nrows,2,i+1) | 177 axes=fig.add_subplot(nrows,2,i+1) |
| 178 library_dict=result_dict[library] | 178 library_dict=result_dict[library] |
| 179 for length in library_dict.keys(): | 179 for length in library_dict.keys(): |
| 180 for mismatch in library_dict[length]: | 180 for mismatch in library_dict[length]: |
| 181 if mismatch == 'total_mapped': | 181 if mismatch == 'total valid reads': |
| 182 continue | 182 continue |
| 183 library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total_mapped'])*100 | 183 library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total valid reads'])*100 |
| 184 del library_dict[length]['total_mapped'] | 184 del library_dict[length]['total valid reads'] |
| 185 df=pd.DataFrame(library_dict) | 185 df=pd.DataFrame(library_dict) |
| 186 barplot(df, library, axes), | 186 barplot(df, library, axes), |
| 187 axes.set_ylabel('Percent of mapped reads with mismatches') | 187 axes.set_ylabel('Mismatch count / all valid reads * 100') |
| 188 fig.savefig(args.output_pdf, format='pdf') | 188 fig.savefig(args.output_pdf, format='pdf') |
| 189 | 189 |
| 190 def setup_MismatchFrequencies(args): | 190 def setup_MismatchFrequencies(args): |
| 191 resultDict=OrderedDict() | 191 resultDict=OrderedDict() |
| 192 kw_list=[{'result_dict' : resultDict, | 192 kw_list=[{'result_dict' : resultDict, |
