comparison mismatch_frequencies.py @ 22:942464ea4211

remove heads
author Marius van den Beek <m.vandenbeek@gmail.com>
date Sun, 24 May 2015 17:29:41 +0200
parents f7da7f3e2c98
children 6590be3f8e3f
comparison
equal deleted inserted replaced
21:8d604c41010d 22:942464ea4211
1 import pysam, re, string 1 import pysam, re, string
2 import matplotlib.pyplot as plt 2 import matplotlib.pyplot as plt
3 import pandas as pd 3 import pandas as pd
4 import json
5 from collections import defaultdict 4 from collections import defaultdict
6 from collections import OrderedDict 5 from collections import OrderedDict
7 import argparse 6 import argparse
8 import itertools
9 7
10 class MismatchFrequencies: 8 class MismatchFrequencies:
11 '''Iterate over a SAM/BAM alignment file, collecting reads with mismatches. One 9 '''Iterate over a SAM/BAM alignment file, collecting reads with mismatches. One
12 class instance per alignment file. The result_dict attribute will contain a 10 class instance per alignment file. The result_dict attribute will contain a
13 nested dictionary with name, readlength and mismatch count.''' 11 nested dictionary with name, readlength and mismatch count.'''
14 def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21, 12 def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21, maximal_readlength=21,
15 maximal_readlength=21, 13 number_of_allowed_mismatches=1, ignore_5p_nucleotides=0, ignore_3p_nucleotides=0):
16 number_of_allowed_mismatches=1,
17 ignore_5p_nucleotides=0,
18 ignore_3p_nucleotides=0,
19 possible_mismatches = [
20 'AC', 'AG', 'AT',
21 'CA', 'CG', 'CT',
22 'GA', 'GC', 'GT',
23 'TA', 'TC', 'TG'
24 ]):
25 14
26 self.result_dict = result_dict 15 self.result_dict = result_dict
27 self.name = name 16 self.name = name
28 self.minimal_readlength = minimal_readlength 17 self.minimal_readlength = minimal_readlength
29 self.maximal_readlength = maximal_readlength 18 self.maximal_readlength = maximal_readlength
30 self.number_of_allowed_mismatches = number_of_allowed_mismatches 19 self.number_of_allowed_mismatches = number_of_allowed_mismatches
31 self.ignore_5p_nucleotides = ignore_5p_nucleotides 20 self.ignore_5p_nucleotides = ignore_5p_nucleotides
32 self.ignore_3p_nucleotides = ignore_3p_nucleotides 21 self.ignore_3p_nucleotides = ignore_3p_nucleotides
33 self.possible_mismatches = possible_mismatches
34 22
35 if alignment_file: 23 if alignment_file:
36 self.pysam_alignment = pysam.Samfile(alignment_file) 24 self.pysam_alignment = pysam.Samfile(alignment_file)
37 self.references = self.pysam_alignment.references #names of fasta reference sequences 25 result_dict[name]=self.get_mismatches(self.pysam_alignment, minimal_readlength, maximal_readlength)
38 result_dict[name]=self.get_mismatches( 26
39 self.pysam_alignment, 27 def get_mismatches(self, pysam_alignment, minimal_readlength, maximal_readlength):
40 minimal_readlength,
41 maximal_readlength,
42 possible_mismatches
43 )
44
45 def get_mismatches(self, pysam_alignment, minimal_readlength,
46 maximal_readlength, possible_mismatches):
47 mismatch_dict = defaultdict(int) 28 mismatch_dict = defaultdict(int)
48 rec_dd = lambda: defaultdict(rec_dd) 29 len_dict={}
49 len_dict = rec_dd() 30 for i in range(minimal_readlength, maximal_readlength+1):
31 len_dict[i]=mismatch_dict.copy()
50 for alignedread in pysam_alignment: 32 for alignedread in pysam_alignment:
51 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength): 33 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength):
52 chromosome = pysam_alignment.getrname(alignedread.rname) 34 len_dict[int(alignedread.rlen)]['total_mapped'] += 1
53 try: 35 MD=alignedread.opt('MD')
54 len_dict[int(alignedread.rlen)][chromosome]['total valid reads'] += 1
55 except TypeError:
56 len_dict[int(alignedread.rlen)][chromosome]['total valid reads'] = 1
57 MD = alignedread.opt('MD')
58 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches): 36 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches):
59 (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse) 37 (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)
60 if ref_base == None: 38 if ref_base == None:
61 continue 39 continue
62 else: 40 else:
63 for i, base in enumerate(ref_base): 41 for i, base in enumerate(ref_base):
64 if not ref_base[i]+mismatch_base[i] in possible_mismatches: 42 len_dict[int(alignedread.rlen)][ref_base[i]+' to '+mismatch_base[i]] += 1
65 continue
66 try:
67 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] += 1
68 except TypeError:
69 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] = 1
70 return len_dict 43 return len_dict
71 44
72 def read_is_valid(self, read, min_readlength, max_readlength): 45 def read_is_valid(self, read, min_readlength, max_readlength):
73 '''Filter out reads that are unmatched, too short or 46 '''Filter out reads that are unmatched, too short or
74 too long or that contian insertions''' 47 too long or that contian insertions'''
158 reference_base+=MD[result.end()-1] 131 reference_base+=MD[result.end()-1]
159 mismatched_base+=readseq[mismatch_position-1] 132 mismatched_base+=readseq[mismatch_position-1]
160 if is_reverse: 133 if is_reverse:
161 reference_base=reverseComplement(reference_base) 134 reference_base=reverseComplement(reference_base)
162 mismatched_base=reverseComplement(mismatched_base) 135 mismatched_base=reverseComplement(mismatched_base)
163 mismatch_position=len(readseq)-mismatch_position-1
164 if mismatched_base=='N': 136 if mismatched_base=='N':
165 return (None, None) 137 return (None, None)
166 if self.mismatch_in_allowed_region(readseq, mismatch_position): 138 if self.mismatch_in_allowed_region(readseq, mismatch_position):
167 return (reference_base, mismatched_base) 139 return (reference_base, mismatched_base)
168 else: 140 else:
169 return (None, None) 141 return (None, None)
142
170 143
171 def reverseComplement(sequence): 144 def reverseComplement(sequence):
172 '''do a reverse complement of DNA base. 145 '''do a reverse complement of DNA base.
173 >>> reverseComplement('ATGC')=='GCAT' 146 >>> reverseComplement('ATGC')=='GCAT'
174 True 147 True
179 return sequence.upper().translate(complement)[::-1] 152 return sequence.upper().translate(complement)[::-1]
180 153
181 def barplot(df, library, axes): 154 def barplot(df, library, axes):
182 df.plot(kind='bar', ax=axes, subplots=False,\ 155 df.plot(kind='bar', ax=axes, subplots=False,\
183 stacked=False, legend='test',\ 156 stacked=False, legend='test',\
184 title='Mismatch frequencies for {0}'.format(library)) 157 title='Mismatches in TE small RNAs from {0}'.format(library))
185 158
159 def result_dict_to_df(result_dict):
160 mismatches = []
161 libraries = []
162 for mismatch, library in result_dict.iteritems():
163 mismatches.append(mismatch)
164 libraries.append(pd.DataFrame.from_dict(library, orient='index'))
165 df=pd.concat(libraries, keys=mismatches)
166 df.index.names = ['library', 'readsize']
167 return df
168
186 def df_to_tab(df, output): 169 def df_to_tab(df, output):
187 df.to_csv(output, sep='\t') 170 df.to_csv(output, sep='\t')
188 171
189 def reduce_result(df, possible_mismatches): 172 def plot_result(result_dict, args):
190 '''takes a pandas dataframe with full mismatch details and 173 names=args.name
191 summarises the results for plotting.'''
192 alignments = df['Alignment_file'].unique()
193 readlengths = df['Readlength'].unique()
194 combinations = itertools.product(*[alignments, readlengths]) #generate all possible combinations of readlength and alignment files
195 reduced_dict = {}
196 frames = []
197 last_column = 3+len(possible_mismatches)
198 for combination in combinations:
199 library_subset = df[df['Alignment_file'] == combination[0]]
200 library_readlength_subset = library_subset[library_subset['Readlength'] == combination[1]]
201 sum_of_library_and_readlength = library_readlength_subset.iloc[:,3:last_column+1].sum()
202 if not reduced_dict.has_key(combination[0]):
203 reduced_dict[combination[0]] = {}
204 reduced_dict[combination[0]][combination[1]] = sum_of_library_and_readlength.to_dict()
205 return reduced_dict
206
207 def plot_result(reduced_dict, args):
208 names=reduced_dict.keys()
209 nrows=len(names)/2+1 174 nrows=len(names)/2+1
210 fig = plt.figure(figsize=(16,32)) 175 fig = plt.figure(figsize=(16,32))
211 for i,library in enumerate (names): 176 for i,library in enumerate (names):
212 axes=fig.add_subplot(nrows,2,i+1) 177 axes=fig.add_subplot(nrows,2,i+1)
213 library_dict=reduced_dict[library] 178 library_dict=result_dict[library]
179 for length in library_dict.keys():
180 for mismatch in library_dict[length]:
181 if mismatch == 'total_mapped':
182 continue
183 library_dict[length][mismatch]=library_dict[length][mismatch]/float(library_dict[length]['total_mapped'])*100
184 del library_dict[length]['total_mapped']
214 df=pd.DataFrame(library_dict) 185 df=pd.DataFrame(library_dict)
215 df.drop(['total aligned reads'], inplace=True)
216 barplot(df, library, axes), 186 barplot(df, library, axes),
217 axes.set_ylabel('Mismatch count / all valid reads * readlength') 187 axes.set_ylabel('Percent of mapped reads with mismatches')
218 fig.savefig(args.output_pdf, format='pdf') 188 fig.savefig(args.output_pdf, format='pdf')
219 189
220 def format_result_dict(result_dict, chromosomes, possible_mismatches):
221 '''Turn nested dictionary into preformatted tab seperated lines'''
222 header = "Reference sequence\tAlignment_file\tReadlength\t" + "\t".join(
223 possible_mismatches) + "\ttotal aligned reads"
224 libraries = result_dict.keys()
225 readlengths = result_dict[libraries[0]].keys()
226 result = []
227 for chromosome in chromosomes:
228 for library in libraries:
229 for readlength in readlengths:
230 line = []
231 line.extend([chromosome, library, readlength])
232 try:
233 line.extend([result_dict[library][readlength][chromosome].get(mismatch, 0) for mismatch in possible_mismatches])
234 line.extend([result_dict[library][readlength][chromosome].get(u'total valid reads', 0)])
235 except KeyError:
236 line.extend([0 for mismatch in possible_mismatches])
237 line.extend([0])
238 result.append(line)
239 df = pd.DataFrame(result, columns=header.split('\t'))
240 last_column=3+len(possible_mismatches)
241 df['mismatches/per aligned nucleotides'] = df.iloc[:,3:last_column].sum(1)/(df.iloc[:,last_column]*df['Readlength'])
242 return df
243
244 def setup_MismatchFrequencies(args): 190 def setup_MismatchFrequencies(args):
245 resultDict=OrderedDict() 191 resultDict=OrderedDict()
246 kw_list=[{'result_dict' : resultDict, 192 kw_list=[{'result_dict' : resultDict,
247 'alignment_file' :alignment_file, 193 'alignment_file' :alignment_file,
248 'name' : name, 194 'name' : name,
249 'minimal_readlength' : args.min, 195 'minimal_readlength' : args.min,
250 'maximal_readlength' : args.max, 196 'maximal_readlength' : args.max,
251 'number_of_allowed_mismatches' : args.n_mm, 197 'number_of_allowed_mismatches' : args.n_mm,
252 'ignore_5p_nucleotides' : args.five_p, 198 'ignore_5p_nucleotides' : args.five_p,
253 'ignore_3p_nucleotides' : args.three_p, 199 'ignore_3p_nucleotides' : args.three_p}
254 'possible_mismatches' : args.possible_mismatches }
255 for alignment_file, name in zip(args.input, args.name)] 200 for alignment_file, name in zip(args.input, args.name)]
256 return (kw_list, resultDict) 201 return (kw_list, resultDict)
257 202
258 def nested_dict_to_df(dictionary):
259 dictionary = {(outerKey, innerKey): values for outerKey, innerDict in dictionary.iteritems() for innerKey, values in innerDict.iteritems()}
260 df=pd.DataFrame.from_dict(dictionary).transpose()
261 df.index.names = ['Library', 'Readlength']
262 return df
263
264 def run_MismatchFrequencies(args): 203 def run_MismatchFrequencies(args):
265 kw_list, resultDict=setup_MismatchFrequencies(args) 204 kw_list, resultDict=setup_MismatchFrequencies(args)
266 references = [MismatchFrequencies(**kw_dict).references for kw_dict in kw_list] 205 [MismatchFrequencies(**kw_dict) for kw_dict in kw_list]
267 return (resultDict, references[0]) 206 return resultDict
268 207
269 def main(): 208 def main():
270 result_dict, references = run_MismatchFrequencies(args) 209 result_dict=run_MismatchFrequencies(args)
271 df = format_result_dict(result_dict, references, args.possible_mismatches) 210 df=result_dict_to_df(result_dict)
272 reduced_dict = reduce_result(df, args.possible_mismatches) 211 plot_result(result_dict, args)
273 plot_result(reduced_dict, args) 212 df_to_tab(df, args.output_tab)
274 reduced_df = nested_dict_to_df(reduced_dict)
275 df_to_tab(reduced_df, args.output_tab)
276 if not args.expanded_output_tab == None:
277 df_to_tab(df, args.expanded_output_tab)
278 return reduced_dict
279 213
280 if __name__ == "__main__": 214 if __name__ == "__main__":
281 215
282 parser = argparse.ArgumentParser(description='Produce mismatch statistics for BAM/SAM alignment files.') 216 parser = argparse.ArgumentParser(description='Produce mismatch statistics for BAM/SAM alignment files.')
283 parser.add_argument('--input', nargs='*', help='Input files in SAM/BAM format') 217 parser.add_argument('--input', nargs='*', help='Input files in SAM/BAM format')
284 parser.add_argument('--name', nargs='*', help='Name for input file to display in output file. Should have same length as the number of inputs') 218 parser.add_argument('--name', nargs='*', help='Name for input file to display in output file. Should have same length as the number of inputs')
285 parser.add_argument('--output_pdf', help='Output filename for graph') 219 parser.add_argument('--output_pdf', help='Output filename for graph')
286 parser.add_argument('--output_tab', help='Output filename for table') 220 parser.add_argument('--output_tab', help='Output filename for table')
287 parser.add_argument('--expanded_output_tab', default=None, help='Output filename for table')
288 parser.add_argument('--possible_mismatches', default=[
289 'AC', 'AG', 'AT','CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG'
290 ], nargs='+', help='specify mismatches that should be counted for the mismatch frequency. The format is Reference base -> observed base, eg AG for A to G mismatches.')
291 parser.add_argument('--min', '--minimal_readlength', type=int, help='minimum readlength') 221 parser.add_argument('--min', '--minimal_readlength', type=int, help='minimum readlength')
292 parser.add_argument('--max', '--maximal_readlength', type=int, help='maximum readlength') 222 parser.add_argument('--max', '--maximal_readlength', type=int, help='maximum readlength')
293 parser.add_argument('--n_mm', '--number_allowed_mismatches', type=int, default=1, help='discard reads with more than n mismatches') 223 parser.add_argument('--n_mm', '--number_allowed_mismatches', type=int, default=1, help='discard reads with more than n mismatches')
294 parser.add_argument('--five_p', '--ignore_5p_nucleotides', type=int, default=0, help='when calculating nucleotide mismatch frequencies ignore the first N nucleotides of the read') 224 parser.add_argument('--five_p', '--ignore_5p_nucleotides', type=int, default=0, help='when calculating nucleotide mismatch frequencies ignore the first N nucleotides of the read')
295 parser.add_argument('--three_p', '--ignore_3p_nucleotides', type=int, default=1, help='when calculating nucleotide mismatch frequencies ignore the last N nucleotides of the read') 225 parser.add_argument('--three_p', '--ignore_3p_nucleotides', type=int, default=1, help='when calculating nucleotide mismatch frequencies ignore the last N nucleotides of the read')
296 #args = parser.parse_args(['--input', '3mismatches_ago2ip_s2.bam', '3mismatches_ago2ip_ovary.bam','--possible_mismatches','AC','AG', 'CG', 'TG', 'CT','--name', 'Siomi1', 'Siomi2' , '--five_p', '3','--three_p','3','--output_pdf', 'out.pdf', '--output_tab', 'out.tab', '--expanded_output_tab', 'expanded.tab', '--min', '20', '--max', '22']) 226 #args = parser.parse_args(['--input', '3mismatches_ago2ip.bam', '2mismatch.bam', '--name', 'Siomi1', 'Siomi2' , '--five_p', '3','--three_p','3','--output_pdf', 'out.pdf', '--output_tab', 'out.tab', '--min', '21', '--max', '21'])
297 args = parser.parse_args() 227 args = parser.parse_args()
298 reduced_dict = main() 228 main()
299 229
300