Mercurial > repos > mvdbeek > mismatch_frequencies
changeset 7:270681625775
Add help text and fix for skipping the beginning/end of reads aligned in reverse orientation.
author | mvdbeek |
---|---|
date | Wed, 28 Jan 2015 13:12:56 +0100 |
parents | a7bf987b8cc4 |
children | 2e041a1564ad |
files | mismatch_frequencies.py mismatch_frequencies.xml test-data/mismatch.tab |
diffstat | 3 files changed, 37 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/mismatch_frequencies.py Tue Jan 27 12:30:07 2015 -0500 +++ b/mismatch_frequencies.py Wed Jan 28 13:12:56 2015 +0100 @@ -31,7 +31,7 @@ len_dict[i]=mismatch_dict.copy() for alignedread in pysam_alignment: if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength): - len_dict[int(alignedread.rlen)]['total_mapped'] += 1 + len_dict[int(alignedread.rlen)]['total valid reads'] += 1 MD=alignedread.opt('MD') if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches): (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse) @@ -133,6 +133,7 @@ if is_reverse: reference_base=reverseComplement(reference_base) mismatched_base=reverseComplement(mismatched_base) + mismatch_position=len(readseq)-mismatch_position-1 if mismatched_base=='N': return (None, None) if self.mismatch_in_allowed_region(readseq, mismatch_position): @@ -140,7 +141,6 @@ else: return (None, None) - def reverseComplement(sequence): '''do a reverse complement of DNA base. 
>>> reverseComplement('ATGC')=='GCAT' @@ -154,7 +154,7 @@ def barplot(df, library, axes): df.plot(kind='bar', ax=axes, subplots=False,\ stacked=False, legend='test',\ - title='Mismatches in TE small RNAs from {0}'.format(library)) + title='Mismatch frequencies for {0}'.format(library)) def result_dict_to_df(result_dict): mismatches = [] @@ -178,13 +178,13 @@ library_dict=result_dict[library] for length in library_dict.keys(): for mismatch in library_dict[length]: - if mismatch == 'total_mapped': + if mismatch == 'total valid reads': continue - library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total_mapped'])*100 - del library_dict[length]['total_mapped'] + library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total valid reads'])*100 + del library_dict[length]['total valid reads'] df=pd.DataFrame(library_dict) barplot(df, library, axes), - axes.set_ylabel('Percent of mapped reads with mismatches') + axes.set_ylabel('Mismatch count / all valid reads * 100') fig.savefig(args.output_pdf, format='pdf') def setup_MismatchFrequencies(args):
--- a/mismatch_frequencies.xml Tue Jan 27 12:30:07 2015 -0500 +++ b/mismatch_frequencies.xml Wed Jan 28 13:12:56 2015 +0100 @@ -1,4 +1,4 @@ -<tool id="mismatch_frequencies" name="Mismatch Frequencies" version="0.0.4" hidden="false" > +<tool id="mismatch_frequencies" name="Mismatch Frequencies" version="0.0.5" hidden="false" > <description>Analyze mismatch frequencies in BAM/SAM alignments</description> <requirements> <requirement type="package" version="0.7.7">pysam</requirement> @@ -19,8 +19,8 @@ --three_p $three_p </command> <inputs> - <repeat name="rep" title="alignment files"> - <param name="input_file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) for which to analyze the mismatches."/> + <repeat name="rep" title="alignment files" min="1"> + <param name="input_file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) for which you want to calculate mismatch frequencies."/> </repeat> <param name="number_of_mismatches" label="Maximum number of allowed mismatches per read" help="Discard reads with more than the chosen number of mismatches from the frequency calculation" type="integer" value="3"/> <param name="min_length" label="Minumum read length to analyse" type="integer" value="21"/> @@ -35,12 +35,36 @@ <tests> <test> <param name="rep_0|input_file" value="3mismatches_ago2ip_s2.bam" ftype="bam" /> - <param name="rep_1|input_file" value="3mismatches_ago2ip_s2.bam" ftype="bam" /> + <param name="rep_1|input_file" value="3mismatches_ago2ip_ovary.bam" ftype="bam" /> <param name="number_of_mismatches" value="1" /> <param name="min_length" value="21" /> <param name="max_length" value="21" /> <output name="tabular" file="mismatch.tab" ftype="tabular"/> - <output name="pdf" file="mismatch.pdf" ftype="pdf"/> </test> </tests> + <help> + +.. 
class:: infomark + + +***What it does*** + +This tool determines for each aligned read of an alignment file in SAM/BAM format whether +a mismatch is annotated in the MD tag, and if that is the case counts the identity of the +mismatch relative to the reference sequence. The output is a PDF document with the calculated +frequency for each mismatch that occurred relative to the total number of valid reads and a table +with the corresponding values. Reads can be limited to a specific range of read lengths, and the 5 prime and +3 prime-most nucleotides of a read can be ignored. + +---- + +.. class:: warningmark + +***Warning*** + +This tool skips all reads that have insertions and has been tested only with bowtie and bowtie2 +generated alignment files. + +Written by Marius van den Beek, m.vandenbeek at gmail . com + </help> </tool>
--- a/test-data/mismatch.tab Tue Jan 27 12:30:07 2015 -0500 +++ b/test-data/mismatch.tab Wed Jan 28 13:12:56 2015 +0100 @@ -1,3 +1,3 @@ -library readsize A to C A to G A to T C to A C to G C to T G to A G to C G to T T to A T to C T to G total_mapped +library readsize A to C A to G A to T C to A C to G C to T G to A G to C G to T T to A T to C T to G total valid reads 3mismatches_ago2ip_s2.bam 21 31 5484 69 25 40 137 156 109 188 51 196 29 43881 3mismatches_ago2ip_ovary.bam 21 293 879 411 452 231 872 845 191 473 384 818 324 138649