diff mismatch_frequencies.py @ 7:270681625775

Add help text and fix the skipping of mismatches at the beginning/end of reads that are aligned in reverse orientation.
author mvdbeek
date Wed, 28 Jan 2015 13:12:56 +0100
parents e8ebe5132737
children
line wrap: on
line diff
--- a/mismatch_frequencies.py	Tue Jan 27 12:30:07 2015 -0500
+++ b/mismatch_frequencies.py	Wed Jan 28 13:12:56 2015 +0100
@@ -31,7 +31,7 @@
             len_dict[i]=mismatch_dict.copy()
         for alignedread in pysam_alignment:
             if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength):
-                len_dict[int(alignedread.rlen)]['total_mapped'] += 1
+                len_dict[int(alignedread.rlen)]['total valid reads'] += 1
                 MD=alignedread.opt('MD')
                 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches):
                     (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)
@@ -133,6 +133,7 @@
         if is_reverse:
             reference_base=reverseComplement(reference_base)
             mismatched_base=reverseComplement(mismatched_base)
+            mismatch_position=len(readseq)-mismatch_position-1
         if mismatched_base=='N':
             return (None, None)
         if self.mismatch_in_allowed_region(readseq, mismatch_position):
@@ -140,7 +141,6 @@
         else:
             return (None, None)
 
-
 def reverseComplement(sequence):
     '''do a reverse complement of DNA base.
     >>> reverseComplement('ATGC')=='GCAT'
@@ -154,7 +154,7 @@
 def barplot(df, library, axes):
     df.plot(kind='bar', ax=axes, subplots=False,\
             stacked=False, legend='test',\
-            title='Mismatches in TE small RNAs from {0}'.format(library))
+            title='Mismatch frequencies for {0}'.format(library))
   
 def result_dict_to_df(result_dict):
     mismatches = []
@@ -178,13 +178,13 @@
         library_dict=result_dict[library]
         for length in library_dict.keys():
             for mismatch in library_dict[length]:
-                if mismatch == 'total_mapped':
+                if mismatch == 'total valid reads':
                     continue
-                library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total_mapped'])*100
-            del library_dict[length]['total_mapped']
+                library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total valid reads'])*100
+            del library_dict[length]['total valid reads']
         df=pd.DataFrame(library_dict)
         barplot(df, library, axes),
-        axes.set_ylabel('Percent of mapped reads with mismatches')
+        axes.set_ylabel('Mismatch count / all valid reads * 100')
     fig.savefig(args.output_pdf, format='pdf')    
 
 def setup_MismatchFrequencies(args):