Mercurial > repos > mvdbeek > mismatch_frequencies

--- a/mismatch_frequencies.py	Wed Apr 01 14:22:11 2015 +0200
+++ b/mismatch_frequencies.py	Sun May 24 17:33:33 2015 +0200
@@ -31,7 +31,7 @@
             len_dict[i]=mismatch_dict.copy()
         for alignedread in pysam_alignment:
             if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength):
-                len_dict[int(alignedread.rlen)]['total valid reads'] += 1
+                len_dict[int(alignedread.rlen)]['total_mapped'] += 1
                 MD=alignedread.opt('MD')
                 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches):
                     (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)
@@ -133,7 +133,6 @@
         if is_reverse:
             reference_base=reverseComplement(reference_base)
             mismatched_base=reverseComplement(mismatched_base)
-            mismatch_position=len(readseq)-mismatch_position-1
         if mismatched_base=='N':
             return (None, None)
         if self.mismatch_in_allowed_region(readseq, mismatch_position):
@@ -141,6 +140,7 @@
         else:
             return (None, None)

+
 def reverseComplement(sequence):
     '''do a reverse complement of DNA base.
     >>> reverseComplement('ATGC')=='GCAT'
@@ -154,7 +154,7 @@
 def barplot(df, library, axes):
     df.plot(kind='bar', ax=axes, subplots=False,\
             stacked=False, legend='test',\
-            title='Mismatch frequencies for {0}'.format(library))
+            title='Mismatches in TE small RNAs from {0}'.format(library))

 def result_dict_to_df(result_dict):
     mismatches = []
@@ -178,13 +178,13 @@
         library_dict=result_dict[library]
         for length in library_dict.keys():
             for mismatch in library_dict[length]:
-                if mismatch == 'total valid reads':
+                if mismatch == 'total_mapped':
                     continue
-                library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total valid reads'])*100
-            del library_dict[length]['total valid reads']
+                library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total_mapped'])*100
+            del library_dict[length]['total_mapped']
         df=pd.DataFrame(library_dict)
         barplot(df, library, axes),
-        axes.set_ylabel('Mismatch count / all valid reads * 100')
+        axes.set_ylabel('Percent of mapped reads with mismatches')
     fig.savefig(args.output_pdf, format='pdf')

 def setup_MismatchFrequencies(args):
--- a/mismatch_frequencies.xml	Wed Apr 01 14:22:11 2015 +0200
+++ b/mismatch_frequencies.xml	Sun May 24 17:33:33 2015 +0200
@@ -1,11 +1,11 @@
-<tool id="mismatch_frequencies" name="Mismatch Frequencies" version="0.0.6" hidden="false" >
-  <description>Analyze mismatch frequencies in BAM/SAM alignments</description>
-  <requirements>
-    <requirement type="package" version="0.7.7">pysam</requirement>
-    <requirement type="package" version="0.14.1">pandas</requirement>
-    <requirement type="package" version="1.4">matplotlib</requirement>
-  </requirements>
-  <command interpreter="python">mismatch_frequencies.py --input
+<tool id="mismatch_frequencies" name="Mismatch Frequencies" version="0.0.3" hidden="false" >
+	<description>Analyze mismatch frequencies in BAM/SAM alignments</description>
+        <requirements>
+    	    <requirement type="package" version="0.7.7">pysam</requirement>
+    	    <requirement type="package" version="0.14">pandas</requirement>
+    	    <requirement type="package" version="1.4">matplotlib</requirement>
+        </requirements>
+	<command interpreter="python">mismatch_frequencies.py --input
 		#for i in $rep
 			"$i.input_file"
 		#end for
@@ -17,26 +17,20 @@
                  --n_mm $number_of_mismatches
                  --five_p $five_p
                  --three_p $three_p
-  </command>
-  <inputs>
-    <repeat name="rep" title="alignment files">
-      <param name="input_file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) for which to analyze the mismatches."/>
-    </repeat>
-    <param name="number_of_mismatches" label="Maximum number of allowed mismatches per read" help="Discard reads with more than the chosen number of mismatches from the frequency calculation" type="integer" value="3"/>
-    <param name="min_length" label="Minumum read length to analyse" type="integer" value="21"/>
-    <param name="max_length" label="Maximum read length to analyse" type="integer" value="21"/>
-    <param name="five_p" label="Ignore mismatches in the first N nucleotides of a read" type="integer" value="0"/>
-    <param name="three_p" label="Ignore mismatches in the last N nucleotides of a read" help="useful to discriminate between tailing events and editing events" type="integer" value="3"/>
-  </inputs>
-  <outputs>
-    <data format="tabular" name="output_tab" />
-    <data format="pdf" name="output_pdf" />
-  </outputs>
-  <tests>
-    <test>
-      <param name="rep_0|input_file" value="3mismatches_ago2ip_s2.bam" ftype="bam" />
-      <param name="rep_1|input_file" value="3mismatches_ago2ip_s2.bam" ftype="bam" />
-      <param name="number_of_mismatches" value="1" />
-      <param name="min_length" value="21" />
-      <param name="max_length" value="21" />
+        </command>
+	<inputs>
+            <repeat name="rep" title="alignment files">
+        	<param name="input_file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) for which to analyze the mismatches."/>
+            </repeat>
+          <param name="number_of_mismatches" label="Maximum number of allowed mismatches per read" help="Discard reads with more than the chosen number of mismatches from the frequency calculation" type="integer" value="3"/>
+          <param name="min_length" label="Minimum read length to analyse" type="integer" value="21"/>
+	  <param name="max_length" label="Maximum read length to analyse" type="integer" value="21"/>
+	  <param name="five_p" label="Ignore mismatches in the first N nucleotides of a read" type="integer" value="0"/>
+	  <param name="three_p" label="Ignore mismatches in the last N nucleotides of a read" help="useful to discriminate between tailing events and editing events" type="integer" value="3"/>
+	</inputs>
+        <outputs>
+                <data format="pdf" name="output_pdf" />
+                <data format="tabular" name="output_tab" />
+        </outputs>
+
 </tool>
Binary file test-data/3mismatches_ago2ip_ovary.bam has changed
Binary file test-data/3mismatches_ago2ip_s2.bam has changed
Binary file test-data/mismatch.pdf has changed
--- a/tool_dependencies.xml	Wed Apr 01 14:22:11 2015 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="pysam" version="0.7.7">
-        <repository changeset_revision="ca10c522f37e" name="package_pysam_0_7_7" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="pandas" version="0.14.1">
-        <repository changeset_revision="ef98e20431a7" name="package_pandas_0_14" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="matplotlib" version="1.4">
-        <repository changeset_revision="62a48352f6a6" name="package_matplotlib_1_4" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>