comparison mismatch_frequencies.py @ 19:f7da7f3e2c98

planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author mvdbeek
date Sun, 24 May 2015 11:12:24 -0400
parents 848d799e6fe8
children 942464ea4211
comparison
equal deleted inserted replaced
18:e3d950e52e38 19:f7da7f3e2c98
1 import pysam, re, string 1 import pysam, re, string
2 import matplotlib.pyplot as plt 2 import matplotlib.pyplot as plt
3 import pandas as pd 3 import pandas as pd
4 import json
4 from collections import defaultdict 5 from collections import defaultdict
5 from collections import OrderedDict 6 from collections import OrderedDict
6 import argparse 7 import argparse
8 import itertools
7 9
8 class MismatchFrequencies: 10 class MismatchFrequencies:
9 '''Iterate over a SAM/BAM alignment file, collecting reads with mismatches. One 11 '''Iterate over a SAM/BAM alignment file, collecting reads with mismatches. One
10 class instance per alignment file. The result_dict attribute will contain a 12 class instance per alignment file. The result_dict attribute will contain a
11 nested dictionary with name, readlength and mismatch count.''' 13 nested dictionary with name, readlength and mismatch count.'''
12 def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21, maximal_readlength=21, 14 def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21,
13 number_of_allowed_mismatches=1, ignore_5p_nucleotides=0, ignore_3p_nucleotides=0): 15 maximal_readlength=21,
16 number_of_allowed_mismatches=1,
17 ignore_5p_nucleotides=0,
18 ignore_3p_nucleotides=0,
19 possible_mismatches = [
20 'AC', 'AG', 'AT',
21 'CA', 'CG', 'CT',
22 'GA', 'GC', 'GT',
23 'TA', 'TC', 'TG'
24 ]):
14 25
15 self.result_dict = result_dict 26 self.result_dict = result_dict
16 self.name = name 27 self.name = name
17 self.minimal_readlength = minimal_readlength 28 self.minimal_readlength = minimal_readlength
18 self.maximal_readlength = maximal_readlength 29 self.maximal_readlength = maximal_readlength
19 self.number_of_allowed_mismatches = number_of_allowed_mismatches 30 self.number_of_allowed_mismatches = number_of_allowed_mismatches
20 self.ignore_5p_nucleotides = ignore_5p_nucleotides 31 self.ignore_5p_nucleotides = ignore_5p_nucleotides
21 self.ignore_3p_nucleotides = ignore_3p_nucleotides 32 self.ignore_3p_nucleotides = ignore_3p_nucleotides
33 self.possible_mismatches = possible_mismatches
22 34
23 if alignment_file: 35 if alignment_file:
24 self.pysam_alignment = pysam.Samfile(alignment_file) 36 self.pysam_alignment = pysam.Samfile(alignment_file)
25 result_dict[name]=self.get_mismatches(self.pysam_alignment, minimal_readlength, maximal_readlength) 37 self.references = self.pysam_alignment.references #names of fasta reference sequences
26 38 result_dict[name]=self.get_mismatches(
27 def get_mismatches(self, pysam_alignment, minimal_readlength, maximal_readlength): 39 self.pysam_alignment,
40 minimal_readlength,
41 maximal_readlength,
42 possible_mismatches
43 )
44
45 def get_mismatches(self, pysam_alignment, minimal_readlength,
46 maximal_readlength, possible_mismatches):
28 mismatch_dict = defaultdict(int) 47 mismatch_dict = defaultdict(int)
29 len_dict={} 48 rec_dd = lambda: defaultdict(rec_dd)
30 for i in range(minimal_readlength, maximal_readlength+1): 49 len_dict = rec_dd()
31 len_dict[i]=mismatch_dict.copy()
32 for alignedread in pysam_alignment: 50 for alignedread in pysam_alignment:
33 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength): 51 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength):
34 len_dict[int(alignedread.rlen)]['total valid reads'] += 1 52 chromosome = pysam_alignment.getrname(alignedread.rname)
35 MD=alignedread.opt('MD') 53 try:
54 len_dict[int(alignedread.rlen)][chromosome]['total valid reads'] += 1
55 except TypeError:
56 len_dict[int(alignedread.rlen)][chromosome]['total valid reads'] = 1
57 MD = alignedread.opt('MD')
36 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches): 58 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches):
37 (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse) 59 (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)
38 if ref_base == None: 60 if ref_base == None:
39 continue 61 continue
40 else: 62 else:
41 for i, base in enumerate(ref_base): 63 for i, base in enumerate(ref_base):
42 len_dict[int(alignedread.rlen)][ref_base[i]+' to '+mismatch_base[i]] += 1 64 if not ref_base[i]+mismatch_base[i] in possible_mismatches:
65 continue
66 try:
67 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] += 1
68 except TypeError:
69 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] = 1
43 return len_dict 70 return len_dict
44 71
45 def read_is_valid(self, read, min_readlength, max_readlength): 72 def read_is_valid(self, read, min_readlength, max_readlength):
46 '''Filter out reads that are unmatched, too short or 73 '''Filter out reads that are unmatched, too short or
47 too long or that contian insertions''' 74 too long or that contian insertions'''
153 180
154 def barplot(df, library, axes): 181 def barplot(df, library, axes):
155 df.plot(kind='bar', ax=axes, subplots=False,\ 182 df.plot(kind='bar', ax=axes, subplots=False,\
156 stacked=False, legend='test',\ 183 stacked=False, legend='test',\
157 title='Mismatch frequencies for {0}'.format(library)) 184 title='Mismatch frequencies for {0}'.format(library))
158 185
159 def result_dict_to_df(result_dict):
160 mismatches = []
161 libraries = []
162 for mismatch, library in result_dict.iteritems():
163 mismatches.append(mismatch)
164 libraries.append(pd.DataFrame.from_dict(library, orient='index'))
165 df=pd.concat(libraries, keys=mismatches)
166 df.index.names = ['library', 'readsize']
167 return df
168
169 def df_to_tab(df, output): 186 def df_to_tab(df, output):
170 df.to_csv(output, sep='\t') 187 df.to_csv(output, sep='\t')
171 188
172 def plot_result(result_dict, args): 189 def reduce_result(df, possible_mismatches):
173 names=args.name 190 '''takes a pandas dataframe with full mismatch details and
191 summarises the results for plotting.'''
192 alignments = df['Alignment_file'].unique()
193 readlengths = df['Readlength'].unique()
194 combinations = itertools.product(*[alignments, readlengths]) #generate all possible combinations of readlength and alignment files
195 reduced_dict = {}
196 frames = []
197 last_column = 3+len(possible_mismatches)
198 for combination in combinations:
199 library_subset = df[df['Alignment_file'] == combination[0]]
200 library_readlength_subset = library_subset[library_subset['Readlength'] == combination[1]]
201 sum_of_library_and_readlength = library_readlength_subset.iloc[:,3:last_column+1].sum()
202 if not reduced_dict.has_key(combination[0]):
203 reduced_dict[combination[0]] = {}
204 reduced_dict[combination[0]][combination[1]] = sum_of_library_and_readlength.to_dict()
205 return reduced_dict
206
207 def plot_result(reduced_dict, args):
208 names=reduced_dict.keys()
174 nrows=len(names)/2+1 209 nrows=len(names)/2+1
175 fig = plt.figure(figsize=(16,32)) 210 fig = plt.figure(figsize=(16,32))
176 for i,library in enumerate (names): 211 for i,library in enumerate (names):
177 axes=fig.add_subplot(nrows,2,i+1) 212 axes=fig.add_subplot(nrows,2,i+1)
178 library_dict=result_dict[library] 213 library_dict=reduced_dict[library]
179 for length in library_dict.keys():
180 for mismatch in library_dict[length]:
181 if mismatch == 'total valid reads':
182 continue
183 library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total valid reads'])*100
184 del library_dict[length]['total valid reads']
185 df=pd.DataFrame(library_dict) 214 df=pd.DataFrame(library_dict)
215 df.drop(['total aligned reads'], inplace=True)
186 barplot(df, library, axes), 216 barplot(df, library, axes),
187 axes.set_ylabel('Mismatch count / all valid reads * 100') 217 axes.set_ylabel('Mismatch count / all valid reads * readlength')
188 fig.savefig(args.output_pdf, format='pdf') 218 fig.savefig(args.output_pdf, format='pdf')
189 219
220 def format_result_dict(result_dict, chromosomes, possible_mismatches):
221 '''Turn nested dictionary into preformatted tab seperated lines'''
222 header = "Reference sequence\tAlignment_file\tReadlength\t" + "\t".join(
223 possible_mismatches) + "\ttotal aligned reads"
224 libraries = result_dict.keys()
225 readlengths = result_dict[libraries[0]].keys()
226 result = []
227 for chromosome in chromosomes:
228 for library in libraries:
229 for readlength in readlengths:
230 line = []
231 line.extend([chromosome, library, readlength])
232 try:
233 line.extend([result_dict[library][readlength][chromosome].get(mismatch, 0) for mismatch in possible_mismatches])
234 line.extend([result_dict[library][readlength][chromosome].get(u'total valid reads', 0)])
235 except KeyError:
236 line.extend([0 for mismatch in possible_mismatches])
237 line.extend([0])
238 result.append(line)
239 df = pd.DataFrame(result, columns=header.split('\t'))
240 last_column=3+len(possible_mismatches)
241 df['mismatches/per aligned nucleotides'] = df.iloc[:,3:last_column].sum(1)/(df.iloc[:,last_column]*df['Readlength'])
242 return df
243
190 def setup_MismatchFrequencies(args): 244 def setup_MismatchFrequencies(args):
191 resultDict=OrderedDict() 245 resultDict=OrderedDict()
192 kw_list=[{'result_dict' : resultDict, 246 kw_list=[{'result_dict' : resultDict,
193 'alignment_file' :alignment_file, 247 'alignment_file' :alignment_file,
194 'name' : name, 248 'name' : name,
195 'minimal_readlength' : args.min, 249 'minimal_readlength' : args.min,
196 'maximal_readlength' : args.max, 250 'maximal_readlength' : args.max,
197 'number_of_allowed_mismatches' : args.n_mm, 251 'number_of_allowed_mismatches' : args.n_mm,
198 'ignore_5p_nucleotides' : args.five_p, 252 'ignore_5p_nucleotides' : args.five_p,
199 'ignore_3p_nucleotides' : args.three_p} 253 'ignore_3p_nucleotides' : args.three_p,
254 'possible_mismatches' : args.possible_mismatches }
200 for alignment_file, name in zip(args.input, args.name)] 255 for alignment_file, name in zip(args.input, args.name)]
201 return (kw_list, resultDict) 256 return (kw_list, resultDict)
202 257
258 def nested_dict_to_df(dictionary):
259 dictionary = {(outerKey, innerKey): values for outerKey, innerDict in dictionary.iteritems() for innerKey, values in innerDict.iteritems()}
260 df=pd.DataFrame.from_dict(dictionary).transpose()
261 df.index.names = ['Library', 'Readlength']
262 return df
263
203 def run_MismatchFrequencies(args): 264 def run_MismatchFrequencies(args):
204 kw_list, resultDict=setup_MismatchFrequencies(args) 265 kw_list, resultDict=setup_MismatchFrequencies(args)
205 [MismatchFrequencies(**kw_dict) for kw_dict in kw_list] 266 references = [MismatchFrequencies(**kw_dict).references for kw_dict in kw_list]
206 return resultDict 267 return (resultDict, references[0])
207 268
208 def main(): 269 def main():
209 result_dict=run_MismatchFrequencies(args) 270 result_dict, references = run_MismatchFrequencies(args)
210 df=result_dict_to_df(result_dict) 271 df = format_result_dict(result_dict, references, args.possible_mismatches)
211 plot_result(result_dict, args) 272 reduced_dict = reduce_result(df, args.possible_mismatches)
212 df_to_tab(df, args.output_tab) 273 plot_result(reduced_dict, args)
274 reduced_df = nested_dict_to_df(reduced_dict)
275 df_to_tab(reduced_df, args.output_tab)
276 if not args.expanded_output_tab == None:
277 df_to_tab(df, args.expanded_output_tab)
278 return reduced_dict
213 279
214 if __name__ == "__main__": 280 if __name__ == "__main__":
215 281
216 parser = argparse.ArgumentParser(description='Produce mismatch statistics for BAM/SAM alignment files.') 282 parser = argparse.ArgumentParser(description='Produce mismatch statistics for BAM/SAM alignment files.')
217 parser.add_argument('--input', nargs='*', help='Input files in SAM/BAM format') 283 parser.add_argument('--input', nargs='*', help='Input files in SAM/BAM format')
218 parser.add_argument('--name', nargs='*', help='Name for input file to display in output file. Should have same length as the number of inputs') 284 parser.add_argument('--name', nargs='*', help='Name for input file to display in output file. Should have same length as the number of inputs')
219 parser.add_argument('--output_pdf', help='Output filename for graph') 285 parser.add_argument('--output_pdf', help='Output filename for graph')
220 parser.add_argument('--output_tab', help='Output filename for table') 286 parser.add_argument('--output_tab', help='Output filename for table')
287 parser.add_argument('--expanded_output_tab', default=None, help='Output filename for table')
288 parser.add_argument('--possible_mismatches', default=[
289 'AC', 'AG', 'AT','CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG'
290 ], nargs='+', help='specify mismatches that should be counted for the mismatch frequency. The format is Reference base -> observed base, eg AG for A to G mismatches.')
221 parser.add_argument('--min', '--minimal_readlength', type=int, help='minimum readlength') 291 parser.add_argument('--min', '--minimal_readlength', type=int, help='minimum readlength')
222 parser.add_argument('--max', '--maximal_readlength', type=int, help='maximum readlength') 292 parser.add_argument('--max', '--maximal_readlength', type=int, help='maximum readlength')
223 parser.add_argument('--n_mm', '--number_allowed_mismatches', type=int, default=1, help='discard reads with more than n mismatches') 293 parser.add_argument('--n_mm', '--number_allowed_mismatches', type=int, default=1, help='discard reads with more than n mismatches')
224 parser.add_argument('--five_p', '--ignore_5p_nucleotides', type=int, default=0, help='when calculating nucleotide mismatch frequencies ignore the first N nucleotides of the read') 294 parser.add_argument('--five_p', '--ignore_5p_nucleotides', type=int, default=0, help='when calculating nucleotide mismatch frequencies ignore the first N nucleotides of the read')
225 parser.add_argument('--three_p', '--ignore_3p_nucleotides', type=int, default=1, help='when calculating nucleotide mismatch frequencies ignore the last N nucleotides of the read') 295 parser.add_argument('--three_p', '--ignore_3p_nucleotides', type=int, default=1, help='when calculating nucleotide mismatch frequencies ignore the last N nucleotides of the read')
226 #args = parser.parse_args(['--input', '3mismatches_ago2ip.bam', '2mismatch.bam', '--name', 'Siomi1', 'Siomi2' , '--five_p', '3','--three_p','3','--output_pdf', 'out.pdf', '--output_tab', 'out.tab', '--min', '21', '--max', '21']) 296 #args = parser.parse_args(['--input', '3mismatches_ago2ip_s2.bam', '3mismatches_ago2ip_ovary.bam','--possible_mismatches','AC','AG', 'CG', 'TG', 'CT','--name', 'Siomi1', 'Siomi2' , '--five_p', '3','--three_p','3','--output_pdf', 'out.pdf', '--output_tab', 'out.tab', '--expanded_output_tab', 'expanded.tab', '--min', '20', '--max', '22'])
227 args = parser.parse_args() 297 args = parser.parse_args()
228 main() 298 reduced_dict = main()
229 299
300