Mercurial > repos > mvdbeek > mismatch_frequencies
changeset 23:ca7b7890ed20
merge heads
author | Marius van den Beek <m.vandenbeek@gmail.com> |
---|---|
date | Sun, 24 May 2015 17:33:33 +0200 |
parents | 942464ea4211 (diff) 2612bb9caf71 (current diff) |
children | 6590be3f8e3f |
files | mismatch_frequencies.xml |
diffstat | 6 files changed, 31 insertions(+), 49 deletions(-) [+] |
line wrap: on
line diff
--- a/mismatch_frequencies.py Wed Apr 01 14:22:11 2015 +0200 +++ b/mismatch_frequencies.py Sun May 24 17:33:33 2015 +0200 @@ -31,7 +31,7 @@ len_dict[i]=mismatch_dict.copy() for alignedread in pysam_alignment: if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength): - len_dict[int(alignedread.rlen)]['total valid reads'] += 1 + len_dict[int(alignedread.rlen)]['total_mapped'] += 1 MD=alignedread.opt('MD') if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches): (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse) @@ -133,7 +133,6 @@ if is_reverse: reference_base=reverseComplement(reference_base) mismatched_base=reverseComplement(mismatched_base) - mismatch_position=len(readseq)-mismatch_position-1 if mismatched_base=='N': return (None, None) if self.mismatch_in_allowed_region(readseq, mismatch_position): @@ -141,6 +140,7 @@ else: return (None, None) + def reverseComplement(sequence): '''do a reverse complement of DNA base. >>> reverseComplement('ATGC')=='GCAT' @@ -154,7 +154,7 @@ def barplot(df, library, axes): df.plot(kind='bar', ax=axes, subplots=False,\ stacked=False, legend='test',\ - title='Mismatch frequencies for {0}'.format(library)) + title='Mismatches in TE small RNAs from {0}'.format(library)) def result_dict_to_df(result_dict): mismatches = [] @@ -178,13 +178,13 @@ library_dict=result_dict[library] for length in library_dict.keys(): for mismatch in library_dict[length]: - if mismatch == 'total valid reads': + if mismatch == 'total_mapped': continue - library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total valid reads'])*100 - del library_dict[length]['total valid reads'] + library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total_mapped'])*100 + del library_dict[length]['total_mapped'] df=pd.DataFrame(library_dict) barplot(df, library, axes), - axes.set_ylabel('Mismatch count / all valid reads * 100') + axes.set_ylabel('Percent of mapped reads with mismatches') fig.savefig(args.output_pdf, format='pdf') def setup_MismatchFrequencies(args):
--- a/mismatch_frequencies.xml Wed Apr 01 14:22:11 2015 +0200 +++ b/mismatch_frequencies.xml Sun May 24 17:33:33 2015 +0200 @@ -1,11 +1,11 @@ -<tool id="mismatch_frequencies" name="Mismatch Frequencies" version="0.0.6" hidden="false" > - <description>Analyze mismatch frequencies in BAM/SAM alignments</description> - <requirements> - <requirement type="package" version="0.7.7">pysam</requirement> - <requirement type="package" version="0.14.1">pandas</requirement> - <requirement type="package" version="1.4">matplotlib</requirement> - </requirements> - <command interpreter="python">mismatch_frequencies.py --input +<tool id="mismatch_frequencies" name="Mismatch Frequencies" version="0.0.3" hidden="false" > + <description>Analyze mismatch frequencies in BAM/SAM alignments</description> + <requirements> + <requirement type="package" version="0.7.7">pysam</requirement> + <requirement type="package" version="0.14">pandas</requirement> + <requirement type="package" version="1.4">matplotlib</requirement> + </requirements> + <command interpreter="python">mismatch_frequencies.py --input #for i in $rep "$i.input_file" #end for @@ -17,26 +17,20 @@ --n_mm $number_of_mismatches --five_p $five_p --three_p $three_p - </command> - <inputs> - <repeat name="rep" title="alignment files"> - <param name="input_file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) for which to analyze the mismatches."/> - </repeat> - <param name="number_of_mismatches" label="Maximum number of allowed mismatches per read" help="Discard reads with more than the chosen number of mismatches from the frequency calculation" type="integer" value="3"/> - <param name="min_length" label="Minumum read length to analyse" type="integer" value="21"/> - <param name="max_length" label="Maximum read length to analyse" type="integer" value="21"/> - <param name="five_p" label="Ignore mismatches in the first N nucleotides of a read" type="integer" value="0"/> - <param name="three_p" label="Ignore mismatches in the last N nucleotides of a read" help="useful to discriminate between tailing events and editing events" type="integer" value="3"/> - </inputs> - <outputs> - <data format="tabular" name="output_tab" /> - <data format="pdf" name="output_pdf" /> - </outputs> - <tests> - <test> - <param name="rep_0|input_file" value="3mismatches_ago2ip_s2.bam" ftype="bam" /> - <param name="rep_1|input_file" value="3mismatches_ago2ip_s2.bam" ftype="bam" /> - <param name="number_of_mismatches" value="1" /> - <param name="min_length" value="21" /> - <param name="max_length" value="21" /> + </command> + <inputs> + <repeat name="rep" title="alignment files"> + <param name="input_file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) for which to analyze the mismatches."/> + </repeat> + <param name="number_of_mismatches" label="Maximum number of allowed mismatches per read" help="Discard reads with more than the chosen number of mismatches from the frequency calculation" type="integer" value="3"/> + <param name="min_length" label="Minumum read length to analyse" type="integer" value="21"/> + <param name="max_length" label="Maximum read length to analyse" type="integer" value="21"/> + <param name="five_p" label="Ignore mismatches in the first N nucleotides of a read" type="integer" value="0"/> + <param name="three_p" label="Ignore mismatches in the last N nucleotides of a read" help="useful to discriminate between tailing events and editing events" type="integer" value="3"/> + </inputs> + <outputs> + <data format="pdf" name="output_pdf" /> + <data format="tabular" name="output_tab" /> + </outputs> + </tool>
--- a/tool_dependencies.xml Wed Apr 01 14:22:11 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="pysam" version="0.7.7"> - <repository changeset_revision="ca10c522f37e" name="package_pysam_0_7_7" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> - <package name="pandas" version="0.14.1"> - <repository changeset_revision="ef98e20431a7" name="package_pandas_0_14" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> - <package name="matplotlib" version="1.4"> - <repository changeset_revision="62a48352f6a6" name="package_matplotlib_1_4" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>