# HG changeset patch
# User mvdbeek
# Date 1446024750 14400
# Node ID 1b20ba32b4c5d0ebdb86c06804552a8aba297c49
# Parent  bb84ee2f2137d9e6f6e5e9df6c7e50253bab1dc2
planemo upload for repository https://github.com/mvdbeek/dapars commit a02ced0b40d8946aa78a58321bd8f9771148391e-dirty

diff -r bb84ee2f2137 -r 1b20ba32b4c5 dapars.py
--- a/dapars.py	Tue Oct 27 10:14:33 2015 -0400
+++ b/dapars.py	Wed Oct 28 05:32:30 2015 -0400
@@ -82,7 +82,7 @@
         cmds = []
         for alignment_file in self.all_alignments:
             cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | "
-            cmd = cmd + "bedtools coverage -d -s -abam {alignment_file} -b stdin |" \
+            cmd = cmd + "~/bin/bedtools coverage -d -s -abam {alignment_file} -b stdin |" \
                         " cut -f 4,7,8 > coverage_file_{alignment_name}".format(
                 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] )
             cmds.append(cmd)
@@ -162,7 +162,8 @@
         return coverage_weights
 
     def get_result_tuple(self):
-        static_desc = ["chr", "start", "end", "strand", "gene", "breakpoint", "control_mean_percent", "treatment_mean_percent" ]
+        static_desc = ["chr", "start", "end", "strand", "gene", "breakpoint",
+                       "breakpoint_type", "control_mean_percent", "treatment_mean_percent" ]
         samples_desc = []
         for statistic in ["coverage_long", "coverage_short", "percent_long"]:
             for i, sample in enumerate(self.control_alignments):
@@ -181,13 +182,17 @@
                   "result_d":result_d}
         pool = Pool(self.n_cpus)
         tasks = [ (self.utr_coverages[utr], utr, utr_d, self.result_tuple._fields, self.coverage_weights, self.num_samples,
-                   len(self.control_alignments), len(self.treatment_alignments), result_d, self.search_start, self.coverage_threshold) for utr, utr_d in self.utr_dict.iteritems() ]
+                   len(self.control_alignments), len(self.treatment_alignments), self.search_start,
+                   self.coverage_threshold) for utr, utr_d in self.utr_dict.iteritems() ]
         processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks]
         result = [res.get() for res in processed_tasks]
-        for d in result:
-            if isinstance(d, dict):
-                t = self.result_tuple(**d)
-                result_d[d["gene"]] = t
+        for res_control, res_treatment in result:
+            if isinstance(res_control, dict):
+                t = self.result_tuple(**res_control)
+                result_d[res_control["gene"]+"_bp_control"] = t
+            if isinstance(res_treatment, dict):
+                t = self.result_tuple(**res_treatment)
+                result_d[res_treatment["gene"]+"_bp_treatment"] = t
         return result_d
 
     def write_results(self):
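# A minimal, self-contained sketch of the fan-out pattern in the hunk above:
# multiprocessing.Pool.apply_async submits one task per UTR and res.get()
# collects the results in submission order. `score_utr`, `utr_dict` and their
# payloads are hypothetical stand-ins for calculate_all_utr and its inputs;
# only the (control, treatment) pair-per-task shape mirrors the patched code.
from multiprocessing import Pool

def score_utr(utr, length):
    # Stand-in worker returning a (control, treatment) pair, like the
    # patched calculate_all_utr does.
    return {"gene": utr, "len": length}, {"gene": utr, "len": length}

if __name__ == "__main__":
    utr_dict = {"utr_a": 300, "utr_b": 450}  # hypothetical input
    tasks = [(utr, length) for utr, length in utr_dict.items()]
    pool = Pool(2)
    processed_tasks = [pool.apply_async(score_utr, t) for t in tasks]
    for res_control, res_treatment in (res.get() for res in processed_tasks):
        print(res_control["gene"], res_treatment["gene"])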
@@ -199,99 +204,109 @@
 
     def write_bed(self):
         w = csv.writer(self.bed_output, delimiter='\t')
-        bed = [(result.chr, result.breakpoint, result.breakpoint+1, result.gene, 0, result.strand) for result in self.result_d.itervalues()]
+        bed = [(result.chr, result.breakpoint, result.breakpoint+1, result.gene+"_"+result.breakpoint_type, 0, result.strand) for result in self.result_d.itervalues()]
         w.writerows(bed)
 
+
 def calculate_all_utr(utr_coverage, utr, utr_d, result_tuple_fields, coverage_weights, num_samples, num_control,
-                      num_treatment, result_d, search_start, coverage_threshold):
-    res = dict(zip(result_tuple_fields, result_tuple_fields))
+                      num_treatment, search_start, coverage_threshold):
+    res_control = dict(zip(result_tuple_fields, result_tuple_fields))
+    res_treatment = res_control.copy()
     if utr_d["strand"] == "+":
         is_reverse = False
     else:
         is_reverse = True
-    mse, breakpoint, abundances = estimate_coverage_extended_utr(utr_coverage,
-                                                                 utr_d["new_start"],
-                                                                 utr_d["new_end"],
-                                                                 is_reverse,
-                                                                 coverage_weights,
-                                                                 search_start,
-                                                                 coverage_threshold)
-    if not str(mse) == "Na":
-        long_coverage_vector = abundances[0]
-        short_coverage_vector = abundances[1]
-        num_non_zero = sum((np.array(long_coverage_vector) + np.array(short_coverage_vector)) > 0) # TODO: This introduces bias
-        if num_non_zero == num_samples:
-            percentage_long = []
-            for i in range(num_samples):
-                ratio = float(long_coverage_vector[i]) / (long_coverage_vector[i] + short_coverage_vector[i]) # long 3'UTR percentage
-                percentage_long.append(ratio)
-            for i in range(num_control):
-                res["control_{i}_coverage_long".format(i=i)] = float(long_coverage_vector[i])
-                res["control_{i}_coverage_short".format(i=i)] = float(short_coverage_vector[i])
-                res["control_{i}_percent_long".format(i=i)] = percentage_long[i]
-            for k in range(num_treatment):
-                i = k + num_control
-                res["treatment_{i}_coverage_long".format(i=k)] = float(long_coverage_vector[i])
-                res["treatment_{i}_coverage_short".format(i=k)] = float(short_coverage_vector[i])
-                res["treatment_{i}_percent_long".format(i=k)] = percentage_long[i]
-            control_mean_percent = np.mean(np.array(percentage_long[:num_control]))
-            treatment_mean_percent = np.mean(np.array(percentage_long[num_control:]))
-            res["chr"] = utr_d["chr"]
-            res["start"] = utr_d["start"]
-            res["end"] = utr_d["end"]
-            res["strand"] = utr_d["strand"]
-            if is_reverse:
-                breakpoint = utr_d["new_end"] - breakpoint
-            else:
-                breakpoint = utr_d["new_start"] + breakpoint
-            res["breakpoint"] = breakpoint
-            res["control_mean_percent"] = control_mean_percent
-            res["treatment_mean_percent"] = treatment_mean_percent
-            res["gene"] = utr
-            return res
+    control_breakpoint, \
+    control_abundance, \
+    treatment_breakpoint, \
+    treatment_abundance = optimize_breakpoint(utr_coverage, utr_d["new_start"], utr_d["new_end"], coverage_weights,
+                                              search_start, coverage_threshold, num_control)
+    if control_breakpoint:
+        breakpoint_to_result(res_control, utr, utr_d, control_breakpoint, "control_breakpoint", control_abundance, is_reverse, num_samples,
+                             num_control, num_treatment)
+    if treatment_breakpoint:
+        breakpoint_to_result(res_treatment, utr, utr_d, treatment_breakpoint, "treatment_breakpoint", treatment_abundance, is_reverse,
+                             num_samples, num_control, num_treatment)
+    return res_control, res_treatment
+
+
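# Why the patched calculate_all_utr starts from `res_control.copy()`: the
# template dict maps every result field name to itself, and copy() gives
# control and treatment independent dicts. A plain assignment would alias
# them, so filling one breakpoint's fields would overwrite the other's.
# The field list here is abbreviated for illustration.
fields = ("gene", "breakpoint", "breakpoint_type")
res_control = dict(zip(fields, fields))      # {'gene': 'gene', ...}
res_treatment = res_control.copy()           # independent copy, not an alias
res_control["breakpoint_type"] = "control_breakpoint"
res_treatment["breakpoint_type"] = "treatment_breakpoint"
assert res_control["breakpoint_type"] != res_treatment["breakpoint_type"]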
-def estimate_coverage_extended_utr(utr_coverage, UTR_start,
-                                   UTR_end, is_reverse, coverage_weigths, search_start, coverage_threshold):
+def breakpoint_to_result(res, utr, utr_d, breakpoint, breakpoint_type,
+                         abundances, is_reverse, num_samples, num_control, num_treatment):
+    """
+    Takes in a result dictionary res and fills in the necessary fields.
     """
-    We are searching for a breakpoint in coverage?!
-    utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample.
+    long_coverage_vector = abundances[0]
+    short_coverage_vector = abundances[1]
+    num_non_zero = sum((np.array(long_coverage_vector) + np.array(short_coverage_vector)) > 0) # TODO: This introduces bias
+    if num_non_zero == num_samples:
+        percentage_long = []
+        for i in range(num_samples):
+            ratio = float(long_coverage_vector[i]) / (long_coverage_vector[i] + short_coverage_vector[i]) # long 3'UTR percentage
+            percentage_long.append(ratio)
+        for i in range(num_control):
+            res["control_{i}_coverage_long".format(i=i)] = float(long_coverage_vector[i])
+            res["control_{i}_coverage_short".format(i=i)] = float(short_coverage_vector[i])
+            res["control_{i}_percent_long".format(i=i)] = percentage_long[i]
+        for k in range(num_treatment):
+            i = k + num_control
+            res["treatment_{i}_coverage_long".format(i=k)] = float(long_coverage_vector[i])
+            res["treatment_{i}_coverage_short".format(i=k)] = float(short_coverage_vector[i])
+            res["treatment_{i}_percent_long".format(i=k)] = percentage_long[i]
+        control_mean_percent = np.mean(np.array(percentage_long[:num_control]))
+        treatment_mean_percent = np.mean(np.array(percentage_long[num_control:]))
+        res["chr"] = utr_d["chr"]
+        res["start"] = utr_d["start"]
+        res["end"] = utr_d["end"]
+        res["strand"] = utr_d["strand"]
+        if is_reverse:
+            breakpoint = utr_d["new_end"] - breakpoint
+        else:
+            breakpoint = utr_d["new_start"] + breakpoint
+        res["breakpoint"] = breakpoint
+        res["breakpoint_type"] = breakpoint_type
+        res["control_mean_percent"] = control_mean_percent
+        res["treatment_mean_percent"] = treatment_mean_percent
+        res["gene"] = utr
+
+
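# A numeric walk-through of the bookkeeping in breakpoint_to_result above,
# with made-up coverage values: percent_long is each sample's long-3'UTR
# fraction, and the breakpoint, which is relative to the UTR, is mapped back
# to genome coordinates according to strand. All numbers are hypothetical.
import numpy as np

long_cov = np.array([30.0, 10.0])    # long-3'UTR abundance per sample
short_cov = np.array([70.0, 90.0])   # short-3'UTR abundance per sample
percent_long = long_cov / (long_cov + short_cov)   # -> array([0.3, 0.1])

new_start, new_end, rel_bp = 1000, 2000, 250       # UTR bounds, relative breakpoint
is_reverse = True                                   # minus strand
genomic_bp = new_end - rel_bp if is_reverse else new_start + rel_bp  # -> 1750
print(percent_long, genomic_bp)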
+def optimize_breakpoint(utr_coverage, UTR_start, UTR_end, coverage_weigths, search_start, coverage_threshold, num_control):
+    """
+    We are searching for the point within the UTR that minimizes the mean squared error when the coverage vector is
+    divided at that point. utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample.
     """
     search_point_end = int(abs((UTR_end - UTR_start)) * 0.1) # TODO: This is 10% of total UTR end. Why?
     num_samples = len(utr_coverage)
-    ##read coverage filtering
-    normalized_utr_coverage = [coverage/ coverage_weigths[i] for i, coverage in enumerate( utr_coverage.values() )]
+    normalized_utr_coverage = np.array([coverage/ coverage_weigths[i] for i, coverage in enumerate( utr_coverage.values() )])
     start_coverage = [np.mean(coverage[0:99]) for coverage in utr_coverage.values()] # filters threshold on mean coverage over first 100 nt
     is_above_threshold = sum(np.array(start_coverage) >= coverage_threshold) >= num_samples # This filters on the raw threshold. Why?
     is_above_length = UTR_end - UTR_start >= 150
     if (is_above_threshold) and (is_above_length):
-        if not is_reverse:
-            search_region = range(UTR_start + search_start, UTR_end - search_point_end + 1)
-        else:
-            search_region = range(UTR_end - search_start, UTR_start + search_point_end - 1, -1)
         search_end = UTR_end - UTR_start - search_point_end
-        normalized_utr_coverage = np.array(normalized_utr_coverage)
         breakpoints = range(search_start, search_end + 1)
-        mse_list = [ estimate_mse(normalized_utr_coverage,bp, num_samples) for bp in breakpoints ]
+        mse_list = [ estimate_mse(normalized_utr_coverage, bp, num_samples, num_control) for bp in breakpoints ]
         if len(mse_list) > 0:
-            min_ele_index = mse_list.index(min(mse_list))
-            breakpoint = breakpoints[min_ele_index]
-            UTR_abundances = estimate_abundance(normalized_utr_coverage, breakpoint, num_samples)
-            select_mean_squared_error = mse_list[min_ele_index]
-            selected_break_point = breakpoint
-        else:
-            select_mean_squared_error = 'Na'
-            UTR_abundances = 'Na'
-            selected_break_point = 'Na'
-    else:
-        select_mean_squared_error = 'Na'
-        UTR_abundances = 'Na'
-        selected_break_point = 'Na'
-
-    return select_mean_squared_error, selected_break_point, UTR_abundances
+            return mse_to_breakpoint(mse_list, normalized_utr_coverage, breakpoints, num_samples)
+    return False, False, False, False
 
 
-def estimate_mse(cov, bp, num_samples):
+def mse_to_breakpoint(mse_list, normalized_utr_coverage, breakpoints, num_samples):
+    """
+    Take in mse_list with control and treatment MSEs and return the breakpoint and UTR abundance for each group.
+    """
+    mse_control = [mse[0] for mse in mse_list]
+    mse_treatment = [mse[1] for mse in mse_list]
+    control_index = mse_control.index(min(mse_control))
+    treatment_index = mse_treatment.index(min(mse_treatment))
+    control_breakpoint = breakpoints[control_index]
+    treatment_breakpoint = breakpoints[treatment_index]
+    control_abundance = estimate_abundance(normalized_utr_coverage, control_breakpoint, num_samples)
+    treatment_abundance = estimate_abundance(normalized_utr_coverage, treatment_breakpoint, num_samples)
+    return control_breakpoint, control_abundance, treatment_breakpoint, treatment_abundance
+
+
+def estimate_mse(cov, bp, num_samples, num_control):
     """
     get abundance of long utr vs short utr with breakpoint specifying the position of long and short utr.
     """
@@ -303,8 +318,10 @@
     mean_short_utr = np.mean(short_utr_vector, 1)
     square_mean_centered_short_utr_vector = (short_utr_vector[:num_samples] - mean_short_utr[:, np.newaxis] )**2
     square_mean_centered_long_utr_vector = (long_utr_vector[:num_samples] - mean_long_utr[:, np.newaxis])**2
-    mse = np.mean(np.append(square_mean_centered_short_utr_vector[:num_samples], square_mean_centered_long_utr_vector[:num_samples]))
-    return mse
+    mse_control = np.mean(np.append(square_mean_centered_long_utr_vector[:num_control], square_mean_centered_short_utr_vector[:num_control]))
+    mse_treatment = np.mean(np.append(square_mean_centered_long_utr_vector[num_control:], square_mean_centered_short_utr_vector[num_control:]))
+    return mse_control, mse_treatment
+
 
 def estimate_abundance(cov, bp, num_samples):
     with warnings.catch_warnings():
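# A minimal sketch of the per-group MSE the patched estimate_mse computes,
# assuming `cov` is a (num_samples x utr_length) array with control samples
# first, and taking the long UTR as the segment after bp (the splitting
# convention is handled upstream in the patched code). Each segment is
# mean-centered per sample, and control and treatment squared deviations are
# averaged separately, so each group can propose its own breakpoint.
import numpy as np

def estimate_mse_sketch(cov, bp, num_control):
    long_utr = cov[:, bp:]
    short_utr = cov[:, :bp]
    sq_long = (long_utr - long_utr.mean(axis=1)[:, np.newaxis]) ** 2
    sq_short = (short_utr - short_utr.mean(axis=1)[:, np.newaxis]) ** 2
    mse_control = np.mean(np.append(sq_long[:num_control], sq_short[:num_control]))
    mse_treatment = np.mean(np.append(sq_long[num_control:], sq_short[num_control:]))
    return mse_control, mse_treatment

# One control sample with a coverage drop at position 50 and one flat
# treatment sample; the control MSE is minimized at the true drop.
cov = np.vstack([np.r_[np.full(50, 10.0), np.full(50, 2.0)],
                 np.full(100, 10.0)])
best_bp = min(range(10, 90), key=lambda bp: estimate_mse_sketch(cov, bp, 1)[0])
print(best_bp)  # -> 50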