Mercurial > repos > mvdbeek > mismatch_frequencies

--- a/mismatch_frequencies.py	Tue Jan 27 12:30:07 2015 -0500
+++ b/mismatch_frequencies.py	Wed Jan 28 13:12:56 2015 +0100
@@ -31,7 +31,7 @@
             len_dict[i]=mismatch_dict.copy()
         for alignedread in pysam_alignment:
             if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength):
-                len_dict[int(alignedread.rlen)]['total_mapped'] += 1
+                len_dict[int(alignedread.rlen)]['total valid reads'] += 1
                 MD=alignedread.opt('MD')
                 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches):
                     (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)
@@ -133,6 +133,7 @@
         if is_reverse:
             reference_base=reverseComplement(reference_base)
             mismatched_base=reverseComplement(mismatched_base)
+            mismatch_position=len(readseq)-mismatch_position-1
         if mismatched_base=='N':
             return (None, None)
         if self.mismatch_in_allowed_region(readseq, mismatch_position):
@@ -140,7 +141,6 @@
         else:
             return (None, None)

-
 def reverseComplement(sequence):
     '''do a reverse complement of DNA base.
     >>> reverseComplement('ATGC')=='GCAT'
@@ -154,7 +154,7 @@
 def barplot(df, library, axes):
     df.plot(kind='bar', ax=axes, subplots=False,\
             stacked=False, legend='test',\
-            title='Mismatches in TE small RNAs from {0}'.format(library))
+            title='Mismatch frequencies for {0}'.format(library))

 def result_dict_to_df(result_dict):
     mismatches = []
@@ -178,13 +178,13 @@
         library_dict=result_dict[library]
         for length in library_dict.keys():
             for mismatch in library_dict[length]:
-                if mismatch == 'total_mapped':
+                if mismatch == 'total valid reads':
                     continue
-                library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total_mapped'])*100
-            del library_dict[length]['total_mapped']
+                library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total valid reads'])*100
+            del library_dict[length]['total valid reads']
         df=pd.DataFrame(library_dict)
         barplot(df, library, axes),
-        axes.set_ylabel('Percent of mapped reads with mismatches')
+        axes.set_ylabel('Mismatch count / all valid reads * 100')
     fig.savefig(args.output_pdf, format='pdf')

 def setup_MismatchFrequencies(args):
--- a/mismatch_frequencies.xml	Tue Jan 27 12:30:07 2015 -0500
+++ b/mismatch_frequencies.xml	Wed Jan 28 13:12:56 2015 +0100
@@ -1,4 +1,4 @@
-<tool id="mismatch_frequencies" name="Mismatch Frequencies" version="0.0.4" hidden="false" >
+<tool id="mismatch_frequencies" name="Mismatch Frequencies" version="0.0.5" hidden="false" >
   <description>Analyze mismatch frequencies in BAM/SAM alignments</description>
   <requirements>
     <requirement type="package" version="0.7.7">pysam</requirement>
@@ -19,8 +19,8 @@
                  --three_p $three_p
   </command>
   <inputs>
-    <repeat name="rep" title="alignment files">
-      <param name="input_file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) for which to analyze the mismatches."/>
+    <repeat name="rep" title="alignment files" min="1">
+      <param name="input_file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) for which you want to calculate mismatch frequencies."/>
     </repeat>
     <param name="number_of_mismatches" label="Maximum number of allowed mismatches per read" help="Discard reads with more than the chosen number of mismatches from the frequency calculation" type="integer" value="3"/>
     <param name="min_length" label="Minumum read length to analyse" type="integer" value="21"/>
@@ -35,12 +35,36 @@
   <tests>
     <test>
       <param name="rep_0|input_file" value="3mismatches_ago2ip_s2.bam" ftype="bam" />
-      <param name="rep_1|input_file" value="3mismatches_ago2ip_s2.bam" ftype="bam" />
+      <param name="rep_1|input_file" value="3mismatches_ago2ip_ovary.bam" ftype="bam" />
       <param name="number_of_mismatches" value="1" />
       <param name="min_length" value="21" />
       <param name="max_length" value="21" />
       <output name="tabular" file="mismatch.tab" ftype="tabular"/>
-      <output name="pdf" file="mismatch.pdf" ftype="pdf"/>
     </test>
   </tests>
+  <help>
+
+.. class:: infomark
+
+
+**What it does**
+
+This tool determines for each aligned read of an alignment file in SAM/BAM format whether
+a mismatch is annotated in the MD tag, and if that is the case counts the identity of the
+mismatch relative to the reference sequence. The output is a PDF document with the calculated
+frequency for each mismatch that occurred relative to the total number of valid reads and a table
+with the corresponding values. Read length can be limited to a specific read length, and 5 prime and
+3 prime-most nucleotides of a read can be ignored.
+
+----
+
+.. class:: warningmark
+
+**Warning**
+
+This tool skips all reads that have insertions and has been tested only with bowtie and bowtie2
+generated alignment files.
+
+Written by Marius van den Beek, m.vandenbeek at gmail . com
+  </help>
 </tool>
--- a/test-data/mismatch.tab	Tue Jan 27 12:30:07 2015 -0500
+++ b/test-data/mismatch.tab	Wed Jan 28 13:12:56 2015 +0100
@@ -1,3 +1,3 @@
-library	readsize	A to C	A to G	A to T	C to A	C to G	C to T	G to A	G to C	G to T	T to A	T to C	T to G	total_mapped
+library	readsize	A to C	A to G	A to T	C to A	C to G	C to T	G to A	G to C	G to T	T to A	T to C	T to G	total valid reads
 3mismatches_ago2ip_s2.bam	21	31	5484	69	25	40	137	156	109	188	51	196	29	43881
 3mismatches_ago2ip_ovary.bam	21	293	879	411	452	231	872	845	191	473	384	818	324	138649