comparison: dapars.py @ 1:1b20ba32b4c5 (draft)

planemo upload for repository https://github.com/mvdbeek/dapars commit a02ced0b40d8946aa78a58321bd8f9771148391e-dirty
author:   mvdbeek
date:     Wed, 28 Oct 2015 05:32:30 -0400
parents:  bb84ee2f2137
children: 47b0044bc7f8
comparing 0:bb84ee2f2137 with 1:1b20ba32b4c5
80 """ 80 """
81 coverage_files = [] 81 coverage_files = []
82 cmds = [] 82 cmds = []
83 for alignment_file in self.all_alignments: 83 for alignment_file in self.all_alignments:
84 cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | " 84 cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | "
85 cmd = cmd + "bedtools coverage -d -s -abam {alignment_file} -b stdin |" \ 85 cmd = cmd + "~/bin/bedtools coverage -d -s -abam {alignment_file} -b stdin |" \
86 " cut -f 4,7,8 > coverage_file_{alignment_name}".format( 86 " cut -f 4,7,8 > coverage_file_{alignment_name}".format(
87 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] ) 87 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] )
88 cmds.append(cmd) 88 cmds.append(cmd)
89 pool = Pool(self.n_cpus) 89 pool = Pool(self.n_cpus)
90 subprocesses = [subprocess.Popen([cmd], shell=True) for cmd in cmds] 90 subprocesses = [subprocess.Popen([cmd], shell=True) for cmd in cmds]
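
With `-d`, `bedtools coverage` reports every position of each interval on its own line, appending the position within the interval and its read depth after the six BED columns, so the `cut -f 4,7,8` keeps gene name, position, and depth. A minimal sketch of reading one such file back into per-gene numpy vectors (the file name and helper are illustrative, not part of this changeset):

    # hypothetical reader for the 3-column output above: gene, position, depth
    from collections import defaultdict
    import numpy as np

    def read_coverage_file(path):
        depths = defaultdict(list)
        with open(path) as handle:
            for line in handle:
                gene, position, depth = line.split("\t")
                depths[gene].append(int(depth))
        # bedtools emits positions of an interval in order, so the list is already positional
        return dict((gene, np.array(values)) for gene, values in depths.items())

    utr_coverage = read_coverage_file("coverage_file_sample1")  # assumed file name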
@@ -160,11 +160,12 @@
         coverages = np.array([ sum(x) for x in zip(*coverage_per_alignment) ])
         coverage_weights = coverages / np.mean(coverages) # TODO: probably the median is better suited?
         return coverage_weights

     def get_result_tuple(self):
-        static_desc = ["chr", "start", "end", "strand", "gene", "breakpoint", "control_mean_percent", "treatment_mean_percent" ]
+        static_desc = ["chr", "start", "end", "strand", "gene", "breakpoint",
+                       "breakpoint_type", "control_mean_percent", "treatment_mean_percent" ]
         samples_desc = []
         for statistic in ["coverage_long", "coverage_short", "percent_long"]:
             for i, sample in enumerate(self.control_alignments):
                 samples_desc.append("control_{i}_{statistic}".format(i=i, statistic = statistic))
             for i, sample in enumerate(self.treatment_alignments):
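
For concreteness, the namedtuple built from static_desc plus samples_desc carries one column per sample and statistic. A sketch of the resulting field layout, assuming two control and one treatment alignment (sample counts invented for illustration):

    from collections import namedtuple

    static_desc = ["chr", "start", "end", "strand", "gene", "breakpoint",
                   "breakpoint_type", "control_mean_percent", "treatment_mean_percent"]
    samples_desc = []
    for statistic in ["coverage_long", "coverage_short", "percent_long"]:
        for i in range(2):  # control alignments
            samples_desc.append("control_{i}_{statistic}".format(i=i, statistic=statistic))
        for i in range(1):  # treatment alignments
            samples_desc.append("treatment_{i}_{statistic}".format(i=i, statistic=statistic))

    Result = namedtuple("Result", static_desc + samples_desc)
    # Result._fields ends with ..., "control_0_percent_long", "control_1_percent_long",
    # "treatment_0_percent_long"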
179 "num_control":len(self.control_alignments), 180 "num_control":len(self.control_alignments),
180 "num_treatment":len(self.treatment_alignments), 181 "num_treatment":len(self.treatment_alignments),
181 "result_d":result_d} 182 "result_d":result_d}
182 pool = Pool(self.n_cpus) 183 pool = Pool(self.n_cpus)
183 tasks = [ (self.utr_coverages[utr], utr, utr_d, self.result_tuple._fields, self.coverage_weights, self.num_samples, 184 tasks = [ (self.utr_coverages[utr], utr, utr_d, self.result_tuple._fields, self.coverage_weights, self.num_samples,
184 len(self.control_alignments), len(self.treatment_alignments), result_d, self.search_start, self.coverage_threshold) for utr, utr_d in self.utr_dict.iteritems() ] 185 len(self.control_alignments), len(self.treatment_alignments), self.search_start,
186 self.coverage_threshold) for utr, utr_d in self.utr_dict.iteritems() ]
185 processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks] 187 processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks]
186 result = [res.get() for res in processed_tasks] 188 result = [res.get() for res in processed_tasks]
187 for d in result: 189 for res_control, res_treatment in result:
188 if isinstance(d, dict): 190 if isinstance(res_control, dict):
189 t = self.result_tuple(**d) 191 t = self.result_tuple(**res_control)
190 result_d[d["gene"]] = t 192 result_d[res_control["gene"]+"_bp_control"] = t
193 if isinstance(res_treatment, dict):
194 t = self.result_tuple(**res_treatment)
195 result_d[res_treatment["gene"]+"_bp_treatment"] = t
191 return result_d 196 return result_d
192 197
193 def write_results(self): 198 def write_results(self):
194 w = csv.writer(self.result_file, delimiter='\t') 199 w = csv.writer(self.result_file, delimiter='\t')
195 header = list(self.result_tuple._fields) 200 header = list(self.result_tuple._fields)
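
The dispatch above is the standard multiprocessing pattern: one argument tuple per UTR, one apply_async call per tuple, then a blocking .get() on every async result. A stripped-down, self-contained sketch of the same pattern (the worker and its inputs are placeholders, not the tool's code):

    from multiprocessing import Pool

    def worker(utr, length):
        # stands in for calculate_all_utr; must live at module level so it can be pickled
        return utr, length * 2

    if __name__ == "__main__":
        pool = Pool(4)
        tasks = [("utr_%d" % i, i) for i in range(10)]
        processed_tasks = [pool.apply_async(worker, t) for t in tasks]
        results = [res.get() for res in processed_tasks]  # blocks until all workers finish
        pool.close()
        pool.join()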
@@ -197,103 +202,113 @@
         w.writerow(header) # field header
         w.writerows( self.result_d.values())

     def write_bed(self):
         w = csv.writer(self.bed_output, delimiter='\t')
-        bed = [(result.chr, result.breakpoint, result.breakpoint+1, result.gene, 0, result.strand) for result in self.result_d.itervalues()]
+        bed = [(result.chr, result.breakpoint, result.breakpoint+1, result.gene+"_"+result.breakpoint_type, 0, result.strand) for result in self.result_d.itervalues()]
         w.writerows(bed)

+
 def calculate_all_utr(utr_coverage, utr, utr_d, result_tuple_fields, coverage_weights, num_samples, num_control,
-                      num_treatment, result_d, search_start, coverage_threshold):
-    res = dict(zip(result_tuple_fields, result_tuple_fields))
+                      num_treatment, search_start, coverage_threshold):
+    res_control = dict(zip(result_tuple_fields, result_tuple_fields))
+    res_treatment = res_control.copy()
     if utr_d["strand"] == "+":
         is_reverse = False
     else:
         is_reverse = True
-    mse, breakpoint, abundances = estimate_coverage_extended_utr(utr_coverage,
-                                                                 utr_d["new_start"],
-                                                                 utr_d["new_end"],
-                                                                 is_reverse,
-                                                                 coverage_weights,
-                                                                 search_start,
-                                                                 coverage_threshold)
+    control_breakpoint, \
+    control_abundance, \
+    treatment_breakpoint, \
+    treatment_abundance = optimize_breakpoint(utr_coverage, utr_d["new_start"], utr_d["new_end"], coverage_weights,
+                                              search_start, coverage_threshold, num_control)
-    if not str(mse) == "Na":
-        long_coverage_vector = abundances[0]
-        short_coverage_vector = abundances[1]
-        num_non_zero = sum((np.array(long_coverage_vector) + np.array(short_coverage_vector)) > 0) # TODO: This introduces bias
-        if num_non_zero == num_samples:
-            percentage_long = []
-            for i in range(num_samples):
-                ratio = float(long_coverage_vector[i]) / (long_coverage_vector[i] + short_coverage_vector[i]) # long 3'UTR percentage
-                percentage_long.append(ratio)
-            for i in range(num_control):
-                res["control_{i}_coverage_long".format(i=i)] = float(long_coverage_vector[i])
-                res["control_{i}_coverage_short".format(i=i)] = float(short_coverage_vector[i])
-                res["control_{i}_percent_long".format(i=i)] = percentage_long[i]
-            for k in range(num_treatment):
-                i = k + num_control
-                res["treatment_{i}_coverage_long".format(i=k)] = float(long_coverage_vector[i])
-                res["treatment_{i}_coverage_short".format(i=k)] = float(short_coverage_vector[i])
-                res["treatment_{i}_percent_long".format(i=k)] = percentage_long[i]
-            control_mean_percent = np.mean(np.array(percentage_long[:num_control]))
-            treatment_mean_percent = np.mean(np.array(percentage_long[num_control:]))
-            res["chr"] = utr_d["chr"]
-            res["start"] = utr_d["start"]
-            res["end"] = utr_d["end"]
-            res["strand"] = utr_d["strand"]
-            if is_reverse:
-                breakpoint = utr_d["new_end"] - breakpoint
-            else:
-                breakpoint = utr_d["new_start"] + breakpoint
-            res["breakpoint"] = breakpoint
-            res["control_mean_percent"] = control_mean_percent
-            res["treatment_mean_percent"] = treatment_mean_percent
-            res["gene"] = utr
-    return res
+    if control_breakpoint:
+        breakpoint_to_result(res_control, utr, utr_d, control_breakpoint, "control_breakpoint", control_abundance, is_reverse, num_samples,
+                             num_control, num_treatment)
+    if treatment_breakpoint:
+        breakpoint_to_result(res_treatment, utr, utr_d, treatment_breakpoint, "treatment_breakpoint", treatment_abundance, is_reverse,
+                             num_samples, num_control, num_treatment)
+    return res_control, res_treatment
+
+
+
+
+def breakpoint_to_result(res, utr, utr_d, breakpoint, breakpoint_type,
+                         abundances, is_reverse, num_samples, num_control, num_treatment):
+    """
+    Takes in a result dictionary res and fills the necessary fields
+    """
+    long_coverage_vector = abundances[0]
+    short_coverage_vector = abundances[1]
+    num_non_zero = sum((np.array(long_coverage_vector) + np.array(short_coverage_vector)) > 0) # TODO: This introduces bias
+    if num_non_zero == num_samples:
+        percentage_long = []
+        for i in range(num_samples):
+            ratio = float(long_coverage_vector[i]) / (long_coverage_vector[i] + short_coverage_vector[i]) # long 3'UTR percentage
+            percentage_long.append(ratio)
+        for i in range(num_control):
+            res["control_{i}_coverage_long".format(i=i)] = float(long_coverage_vector[i])
+            res["control_{i}_coverage_short".format(i=i)] = float(short_coverage_vector[i])
+            res["control_{i}_percent_long".format(i=i)] = percentage_long[i]
+        for k in range(num_treatment):
+            i = k + num_control
+            res["treatment_{i}_coverage_long".format(i=k)] = float(long_coverage_vector[i])
+            res["treatment_{i}_coverage_short".format(i=k)] = float(short_coverage_vector[i])
+            res["treatment_{i}_percent_long".format(i=k)] = percentage_long[i]
+        control_mean_percent = np.mean(np.array(percentage_long[:num_control]))
+        treatment_mean_percent = np.mean(np.array(percentage_long[num_control:]))
+        res["chr"] = utr_d["chr"]
+        res["start"] = utr_d["start"]
+        res["end"] = utr_d["end"]
+        res["strand"] = utr_d["strand"]
+        if is_reverse:
+            breakpoint = utr_d["new_end"] - breakpoint
+        else:
+            breakpoint = utr_d["new_start"] + breakpoint
+        res["breakpoint"] = breakpoint
+        res["breakpoint_type"] = breakpoint_type
+        res["control_mean_percent"] = control_mean_percent
+        res["treatment_mean_percent"] = treatment_mean_percent
+        res["gene"] = utr


-def estimate_coverage_extended_utr(utr_coverage, UTR_start,
-                                   UTR_end, is_reverse, coverage_weigths, search_start, coverage_threshold):
+def optimize_breakpoint(utr_coverage, UTR_start, UTR_end, coverage_weigths, search_start, coverage_threshold, num_control):
     """
-    We are searching for a breakpoint in coverage?!
-    utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample.
+    We are searching for a point within the UTR that minimizes the mean squared error, if the coverage vector was divided
+    at that point. utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample.
     """
     search_point_end = int(abs((UTR_end - UTR_start)) * 0.1) # TODO: This is 10% of total UTR end. Why?
     num_samples = len(utr_coverage)
-    ##read coverage filtering
-    normalized_utr_coverage = [coverage/ coverage_weigths[i] for i, coverage in enumerate( utr_coverage.values() )]
+    normalized_utr_coverage = np.array([coverage/ coverage_weigths[i] for i, coverage in enumerate( utr_coverage.values() )])
     start_coverage = [np.mean(coverage[0:99]) for coverage in utr_coverage.values()] # filters threshold on mean coverage over first 100 nt
     is_above_threshold = sum(np.array(start_coverage) >= coverage_threshold) >= num_samples # This filters on the raw threshold. Why?
     is_above_length = UTR_end - UTR_start >= 150
     if (is_above_threshold) and (is_above_length):
-        if not is_reverse:
-            search_region = range(UTR_start + search_start, UTR_end - search_point_end + 1)
-        else:
-            search_region = range(UTR_end - search_start, UTR_start + search_point_end - 1, -1)
         search_end = UTR_end - UTR_start - search_point_end
-        normalized_utr_coverage = np.array(normalized_utr_coverage)
         breakpoints = range(search_start, search_end + 1)
-        mse_list = [ estimate_mse(normalized_utr_coverage,bp, num_samples) for bp in breakpoints ]
+        mse_list = [ estimate_mse(normalized_utr_coverage, bp, num_samples, num_control) for bp in breakpoints ]
         if len(mse_list) > 0:
-            min_ele_index = mse_list.index(min(mse_list))
-            breakpoint = breakpoints[min_ele_index]
-            UTR_abundances = estimate_abundance(normalized_utr_coverage, breakpoint, num_samples)
-            select_mean_squared_error = mse_list[min_ele_index]
-            selected_break_point = breakpoint
-        else:
-            select_mean_squared_error = 'Na'
-            UTR_abundances = 'Na'
-            selected_break_point = 'Na'
-    else:
-        select_mean_squared_error = 'Na'
-        UTR_abundances = 'Na'
-        selected_break_point = 'Na'
-
-    return select_mean_squared_error, selected_break_point, UTR_abundances
+            return mse_to_breakpoint(mse_list, normalized_utr_coverage, breakpoints, num_samples)
+    return False, False, False, False


+def mse_to_breakpoint(mse_list, normalized_utr_coverage, breakpoints, num_samples):
+    """
+    Take in mse_list with control and treatment mse and return breakpoint and utr abundance
+    """
+    mse_control = [mse[0] for mse in mse_list]
+    mse_treatment = [mse[1] for mse in mse_list]
+    control_index = mse_control.index(min(mse_control))
+    treatment_index = mse_treatment.index(min(mse_treatment))
+    control_breakpoint = breakpoints[control_index]
+    treatment_breakpoint = breakpoints[treatment_index]
+    control_abundance = estimate_abundance(normalized_utr_coverage, control_breakpoint, num_samples)
+    treatment_abundance = estimate_abundance(normalized_utr_coverage, treatment_breakpoint, num_samples)
+    return control_breakpoint, control_abundance, treatment_breakpoint, treatment_abundance
+
+
-def estimate_mse(cov, bp, num_samples):
+def estimate_mse(cov, bp, num_samples, num_control):
     """
     get abundance of long utr vs short utr with breakpoint specifying the position of long and short utr.
     """
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=RuntimeWarning)
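
Note that mse_to_breakpoint minimizes the control and treatment MSE curves independently, so the same UTR can yield two different breakpoints. On a toy mse_list the selection looks like this (values invented for illustration):

    # one (mse_control, mse_treatment) pair per candidate breakpoint
    mse_list = [(4.0, 3.5), (1.2, 3.0), (2.5, 0.8)]
    breakpoints = [10, 11, 12]

    mse_control = [mse[0] for mse in mse_list]
    mse_treatment = [mse[1] for mse in mse_list]
    control_breakpoint = breakpoints[mse_control.index(min(mse_control))]        # 11
    treatment_breakpoint = breakpoints[mse_treatment.index(min(mse_treatment))]  # 12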
@@ -301,12 +316,14 @@
         short_utr_vector = cov[:num_samples, 0:bp]
         mean_long_utr = np.mean(long_utr_vector, 1)
         mean_short_utr = np.mean(short_utr_vector, 1)
         square_mean_centered_short_utr_vector = (short_utr_vector[:num_samples] - mean_short_utr[:, np.newaxis] )**2
         square_mean_centered_long_utr_vector = (long_utr_vector[:num_samples] - mean_long_utr[:, np.newaxis])**2
-        mse = np.mean(np.append(square_mean_centered_short_utr_vector[:num_samples], square_mean_centered_long_utr_vector[:num_samples]))
-        return mse
+        mse_control = np.mean(np.append(square_mean_centered_long_utr_vector[:num_control], square_mean_centered_short_utr_vector[:num_control]))
+        mse_treatment = np.mean(np.append(square_mean_centered_long_utr_vector[num_control:], square_mean_centered_short_utr_vector[num_control:]))
+        return mse_control, mse_treatment


 def estimate_abundance(cov, bp, num_samples):
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=RuntimeWarning)
         long_utr_vector = cov[:num_samples, bp:]
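
estimate_mse scores a candidate breakpoint by how tightly coverage clusters around its segment means on either side of the split, with rows 0 to num_control-1 taken as controls and the remaining rows as treatments; the true shift point yields the most homogeneous segments and hence the lowest MSE. A self-contained toy run on synthetic coverage with a depth drop at position 60, mirroring the scan in optimize_breakpoint but without the length and depth filters (all values invented for illustration):

    import numpy as np

    # 4 samples x 100 nt; depth drops from 50x to 10x at position 60 in every sample
    cov = np.tile(np.array([50.0] * 60 + [10.0] * 40), (4, 1))
    cov += np.random.RandomState(0).normal(0, 1, cov.shape)  # mild noise
    num_control = 2

    def toy_mse(cov, bp, num_control):
        # mean-centred squared deviations of the short (0:bp) and long (bp:) segments,
        # pooled per group, as in estimate_mse
        short, long_ = cov[:, 0:bp], cov[:, bp:]
        sq_short = (short - short.mean(axis=1)[:, np.newaxis]) ** 2
        sq_long = (long_ - long_.mean(axis=1)[:, np.newaxis]) ** 2
        mse_control = np.mean(np.append(sq_long[:num_control], sq_short[:num_control]))
        mse_treatment = np.mean(np.append(sq_long[num_control:], sq_short[num_control:]))
        return mse_control, mse_treatment

    breakpoints = range(1, 100)
    mse_list = [toy_mse(cov, bp, num_control) for bp in breakpoints]
    mse_control = [mse[0] for mse in mse_list]
    print(breakpoints[mse_control.index(min(mse_control))])  # -> 60, the simulated breakpoint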