comparison dapars.py @ 1:1b20ba32b4c5 draft
planemo upload for repository https://github.com/mvdbeek/dapars commit a02ced0b40d8946aa78a58321bd8f9771148391e-dirty
| author | mvdbeek |
|---|---|
| date | Wed, 28 Oct 2015 05:32:30 -0400 |
| parents | bb84ee2f2137 |
| children | 47b0044bc7f8 |
| 0:bb84ee2f2137 | 1:1b20ba32b4c5 |
|---|---|
| 80 """ | 80 """ |
| 81 coverage_files = [] | 81 coverage_files = [] |
| 82 cmds = [] | 82 cmds = [] |
| 83 for alignment_file in self.all_alignments: | 83 for alignment_file in self.all_alignments: |
| 84 cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | " | 84 cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | " |
| 85 cmd = cmd + "bedtools coverage -d -s -abam {alignment_file} -b stdin |" \ | 85 cmd = cmd + "~/bin/bedtools coverage -d -s -abam {alignment_file} -b stdin |" \ |
| 86 " cut -f 4,7,8 > coverage_file_{alignment_name}".format( | 86 " cut -f 4,7,8 > coverage_file_{alignment_name}".format( |
| 87 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] ) | 87 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] ) |
| 88 cmds.append(cmd) | 88 cmds.append(cmd) |
| 89 pool = Pool(self.n_cpus) | 89 pool = Pool(self.n_cpus) |
| 90 subprocesses = [subprocess.Popen([cmd], shell=True) for cmd in cmds] | 90 subprocesses = [subprocess.Popen([cmd], shell=True) for cmd in cmds] |
| 160 coverages = np.array([ sum(x) for x in zip(*coverage_per_alignment) ]) | 160 coverages = np.array([ sum(x) for x in zip(*coverage_per_alignment) ]) |
| 161 coverage_weights = coverages / np.mean(coverages) # TODO: probably median is better suited? | 161 coverage_weights = coverages / np.mean(coverages) # TODO: probably median is better suited? |
| 162 return coverage_weights | 162 return coverage_weights |
| 163 | 163 |
| 164 def get_result_tuple(self): | 164 def get_result_tuple(self): |
| 165 static_desc = ["chr", "start", "end", "strand", "gene", "breakpoint", "control_mean_percent", "treatment_mean_percent" ] | 165 static_desc = ["chr", "start", "end", "strand", "gene", "breakpoint", |
| 166 "breakpoint_type", "control_mean_percent", "treatment_mean_percent" ] | |
| 166 samples_desc = [] | 167 samples_desc = [] |
| 167 for statistic in ["coverage_long", "coverage_short", "percent_long"]: | 168 for statistic in ["coverage_long", "coverage_short", "percent_long"]: |
| 168 for i, sample in enumerate(self.control_alignments): | 169 for i, sample in enumerate(self.control_alignments): |
| 169 samples_desc.append("control_{i}_{statistic}".format(i=i, statistic = statistic)) | 170 samples_desc.append("control_{i}_{statistic}".format(i=i, statistic = statistic)) |
| 170 for i, sample in enumerate(self.treatment_alignments): | 171 for i, sample in enumerate(self.treatment_alignments): |
| 179 "num_control":len(self.control_alignments), | 180 "num_control":len(self.control_alignments), |
| 180 "num_treatment":len(self.treatment_alignments), | 181 "num_treatment":len(self.treatment_alignments), |
| 181 "result_d":result_d} | 182 "result_d":result_d} |
| 182 pool = Pool(self.n_cpus) | 183 pool = Pool(self.n_cpus) |
| 183 tasks = [ (self.utr_coverages[utr], utr, utr_d, self.result_tuple._fields, self.coverage_weights, self.num_samples, | 184 tasks = [ (self.utr_coverages[utr], utr, utr_d, self.result_tuple._fields, self.coverage_weights, self.num_samples, |
| 184 len(self.control_alignments), len(self.treatment_alignments), result_d, self.search_start, self.coverage_threshold) for utr, utr_d in self.utr_dict.iteritems() ] | 185 len(self.control_alignments), len(self.treatment_alignments), self.search_start, |
| | 186 self.coverage_threshold) for utr, utr_d in self.utr_dict.iteritems() ] |
| 185 processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks] | 187 processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks] |
| 186 result = [res.get() for res in processed_tasks] | 188 result = [res.get() for res in processed_tasks] |
| 187 for d in result: | 189 for res_control, res_treatment in result: |
| 188 if isinstance(d, dict): | 190 if isinstance(res_control, dict): |
| 189 t = self.result_tuple(**d) | 191 t = self.result_tuple(**res_control) |
| 190 result_d[d["gene"]] = t | 192 result_d[res_control["gene"]+"_bp_control"] = t |
| | 193 if isinstance(res_treatment, dict): |
| | 194 t = self.result_tuple(**res_treatment) |
| | 195 result_d[res_treatment["gene"]+"_bp_treatment"] = t |
| 191 return result_d | 196 return result_d |
| 192 | 197 |
| 193 def write_results(self): | 198 def write_results(self): |
| 194 w = csv.writer(self.result_file, delimiter='\t') | 199 w = csv.writer(self.result_file, delimiter='\t') |
| 195 header = list(self.result_tuple._fields) | 200 header = list(self.result_tuple._fields) |
| 197 w.writerow(header) # field header | 202 w.writerow(header) # field header |
| 198 w.writerows( self.result_d.values()) | 203 w.writerows( self.result_d.values()) |
| 199 | 204 |
| 200 def write_bed(self): | 205 def write_bed(self): |
| 201 w = csv.writer(self.bed_output, delimiter='\t') | 206 w = csv.writer(self.bed_output, delimiter='\t') |
| 202 bed = [(result.chr, result.breakpoint, result.breakpoint+1, result.gene, 0, result.strand) for result in self.result_d.itervalues()] | 207 bed = [(result.chr, result.breakpoint, result.breakpoint+1, result.gene+"_"+result.breakpoint_type, 0, result.strand) for result in self.result_d.itervalues()] |
| 203 w.writerows(bed) | 208 w.writerows(bed) |
| 204 | 209 |
| | 210 |
| 205 def calculate_all_utr(utr_coverage, utr, utr_d, result_tuple_fields, coverage_weights, num_samples, num_control, | 211 def calculate_all_utr(utr_coverage, utr, utr_d, result_tuple_fields, coverage_weights, num_samples, num_control, |
| 206 num_treatment, result_d, search_start, coverage_threshold): | 212 num_treatment, search_start, coverage_threshold): |
| 207 res = dict(zip(result_tuple_fields, result_tuple_fields)) | 213 res_control = dict(zip(result_tuple_fields, result_tuple_fields)) |
| | 214 res_treatment = res_control.copy() |
| 208 if utr_d["strand"] == "+": | 215 if utr_d["strand"] == "+": |
| 209 is_reverse = False | 216 is_reverse = False |
| 210 else: | 217 else: |
| 211 is_reverse = True | 218 is_reverse = True |
| 212 mse, breakpoint, abundances = estimate_coverage_extended_utr(utr_coverage, | 219 control_breakpoint, \ |
| 213 utr_d["new_start"], | 220 control_abundance, \ |
| 214 utr_d["new_end"], | 221 treatment_breakpoint, \ |
| 215 is_reverse, | 222 treatment_abundance = optimize_breakpoint(utr_coverage, utr_d["new_start"], utr_d["new_end"], coverage_weights, |
| 216 coverage_weights, | 223 search_start, coverage_threshold, num_control) |
| 217 search_start, | 224 if control_breakpoint: |
| 218 coverage_threshold) | 225 breakpoint_to_result(res_control, utr, utr_d, control_breakpoint, "control_breakpoint", control_abundance, is_reverse, num_samples, |
| 219 if not str(mse) == "Na": | 226 num_control, num_treatment) |
| 220 long_coverage_vector = abundances[0] | 227 if treatment_breakpoint: |
| 221 short_coverage_vector = abundances[1] | 228 breakpoint_to_result(res_treatment, utr, utr_d, treatment_breakpoint, "treatment_breakpoint", treatment_abundance, is_reverse, |
| 222 num_non_zero = sum((np.array(long_coverage_vector) + np.array(short_coverage_vector)) > 0) # TODO: This introduces bias | 229 num_samples, num_control, num_treatment) |
| 223 if num_non_zero == num_samples: | 230 return res_control, res_treatment |
| 224 percentage_long = [] | 231 |
| 225 for i in range(num_samples): | 232 |
| 226 ratio = float(long_coverage_vector[i]) / (long_coverage_vector[i] + short_coverage_vector[i]) # long 3'UTR percentage | 233 |
| 227 percentage_long.append(ratio) | 234 |
| 228 for i in range(num_control): | 235 def breakpoint_to_result(res, utr, utr_d, breakpoint, breakpoint_type, |
| 229 res["control_{i}_coverage_long".format(i=i)] = float(long_coverage_vector[i]) | 236 abundances, is_reverse, num_samples, num_control, num_treatment): |
| 230 res["control_{i}_coverage_short".format(i=i)] = float(short_coverage_vector[i]) | 237 """ |
| 231 res["control_{i}_percent_long".format(i=i)] = percentage_long[i] | 238 Takes in a result dictionary res and fills the necessary fields |
| 232 for k in range(num_treatment): | 239 """ |
| 233 i = k + num_control | 240 long_coverage_vector = abundances[0] |
| 234 res["treatment_{i}_coverage_long".format(i=k)] = float(long_coverage_vector[i]) | 241 short_coverage_vector = abundances[1] |
| 235 res["treatment_{i}_coverage_short".format(i=k)] = float(short_coverage_vector[i]) | 242 num_non_zero = sum((np.array(long_coverage_vector) + np.array(short_coverage_vector)) > 0) # TODO: This introduces bias |
| 236 res["treatment_{i}_percent_long".format(i=k)] = percentage_long[i] | 243 if num_non_zero == num_samples: |
| 237 control_mean_percent = np.mean(np.array(percentage_long[:num_control])) | 244 percentage_long = [] |
| 238 treatment_mean_percent = np.mean(np.array(percentage_long[num_control:])) | 245 for i in range(num_samples): |
| 239 res["chr"] = utr_d["chr"] | 246 ratio = float(long_coverage_vector[i]) / (long_coverage_vector[i] + short_coverage_vector[i]) # long 3'UTR percentage |
| 240 res["start"] = utr_d["start"] | 247 percentage_long.append(ratio) |
| 241 res["end"] = utr_d["end"] | 248 for i in range(num_control): |
| 242 res["strand"] = utr_d["strand"] | 249 res["control_{i}_coverage_long".format(i=i)] = float(long_coverage_vector[i]) |
| 243 if is_reverse: | 250 res["control_{i}_coverage_short".format(i=i)] = float(short_coverage_vector[i]) |
| 244 breakpoint = utr_d["new_end"] - breakpoint | 251 res["control_{i}_percent_long".format(i=i)] = percentage_long[i] |
| 245 else: | 252 for k in range(num_treatment): |
| 246 breakpoint = utr_d["new_start"] + breakpoint | 253 i = k + num_control |
| 247 res["breakpoint"] = breakpoint | 254 res["treatment_{i}_coverage_long".format(i=k)] = float(long_coverage_vector[i]) |
| 248 res["control_mean_percent"] = control_mean_percent | 255 res["treatment_{i}_coverage_short".format(i=k)] = float(short_coverage_vector[i]) |
| 249 res["treatment_mean_percent"] = treatment_mean_percent | 256 res["treatment_{i}_percent_long".format(i=k)] = percentage_long[i] |
| 250 res["gene"] = utr | 257 control_mean_percent = np.mean(np.array(percentage_long[:num_control])) |
| 251 return res | 258 treatment_mean_percent = np.mean(np.array(percentage_long[num_control:])) |
| 252 | 259 res["chr"] = utr_d["chr"] |
| 253 | 260 res["start"] = utr_d["start"] |
| 254 def estimate_coverage_extended_utr(utr_coverage, UTR_start, | 261 res["end"] = utr_d["end"] |
| 255 UTR_end, is_reverse, coverage_weigths, search_start, coverage_threshold): | 262 res["strand"] = utr_d["strand"] |
| 256 """ | 263 if is_reverse: |
| 257 We are searching for a breakpoint in coverage?! | 264 breakpoint = utr_d["new_end"] - breakpoint |
| 258 utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample. | 265 else: |
| | 266 breakpoint = utr_d["new_start"] + breakpoint |
| | 267 res["breakpoint"] = breakpoint |
| | 268 res["breakpoint_type"] = breakpoint_type |
| | 269 res["control_mean_percent"] = control_mean_percent |
| | 270 res["treatment_mean_percent"] = treatment_mean_percent |
| | 271 res["gene"] = utr |
| | 272 |
| | 273 |
| | 274 def optimize_breakpoint(utr_coverage, UTR_start, UTR_end, coverage_weigths, search_start, coverage_threshold, num_control): |
| | 275 """ |
| | 276 We are searching for a point within the UTR that minimizes the mean squared error, if the coverage vector was divided |
| | 277 at that point. utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample. |
| 259 """ | 278 """ |
| 260 search_point_end = int(abs((UTR_end - UTR_start)) * 0.1) # TODO: This is 10% of total UTR end. Why? | 279 search_point_end = int(abs((UTR_end - UTR_start)) * 0.1) # TODO: This is 10% of total UTR end. Why? |
| 261 num_samples = len(utr_coverage) | 280 num_samples = len(utr_coverage) |
| 262 ##read coverage filtering | 281 normalized_utr_coverage = np.array([coverage/ coverage_weigths[i] for i, coverage in enumerate( utr_coverage.values() )]) |
| 263 normalized_utr_coverage = [coverage/ coverage_weigths[i] for i, coverage in enumerate( utr_coverage.values() )] | |
| 264 start_coverage = [np.mean(coverage[0:99]) for coverage in utr_coverage.values()] # filters threshold on mean coverage over first 100 nt | 282 start_coverage = [np.mean(coverage[0:99]) for coverage in utr_coverage.values()] # filters threshold on mean coverage over first 100 nt |
| 265 is_above_threshold = sum(np.array(start_coverage) >= coverage_threshold) >= num_samples # This filters on the raw threshold. Why? | 283 is_above_threshold = sum(np.array(start_coverage) >= coverage_threshold) >= num_samples # This filters on the raw threshold. Why? |
| 266 is_above_length = UTR_end - UTR_start >= 150 | 284 is_above_length = UTR_end - UTR_start >= 150 |
| 267 if (is_above_threshold) and (is_above_length): | 285 if (is_above_threshold) and (is_above_length): |
| 268 if not is_reverse: | |
| 269 search_region = range(UTR_start + search_start, UTR_end - search_point_end + 1) | |
| 270 else: | |
| 271 search_region = range(UTR_end - search_start, UTR_start + search_point_end - 1, -1) | |
| 272 search_end = UTR_end - UTR_start - search_point_end | 286 search_end = UTR_end - UTR_start - search_point_end |
| 273 normalized_utr_coverage = np.array(normalized_utr_coverage) | |
| 274 breakpoints = range(search_start, search_end + 1) | 287 breakpoints = range(search_start, search_end + 1) |
| 275 mse_list = [ estimate_mse(normalized_utr_coverage,bp, num_samples) for bp in breakpoints ] | 288 mse_list = [ estimate_mse(normalized_utr_coverage, bp, num_samples, num_control) for bp in breakpoints ] |
| 276 if len(mse_list) > 0: | 289 if len(mse_list) > 0: |
| 277 min_ele_index = mse_list.index(min(mse_list)) | 290 return mse_to_breakpoint(mse_list, normalized_utr_coverage, breakpoints, num_samples) |
| 278 breakpoint = breakpoints[min_ele_index] | 291 return False, False, False, False |
| 279 UTR_abundances = estimate_abundance(normalized_utr_coverage, breakpoint, num_samples) | 292 |
| 280 select_mean_squared_error = mse_list[min_ele_index] | 293 |
| 281 selected_break_point = breakpoint | 294 def mse_to_breakpoint(mse_list, normalized_utr_coverage, breakpoints, num_samples): |
| 282 else: | 295 """ |
| 283 select_mean_squared_error = 'Na' | 296 Take in mse_list with control and treatment mse and return breakpoint and utr abundance |
| 284 UTR_abundances = 'Na' | 297 """ |
| 285 selected_break_point = 'Na' | 298 mse_control = [mse[0] for mse in mse_list] |
| 286 else: | 299 mse_treatment = [mse[1] for mse in mse_list] |
| 287 select_mean_squared_error = 'Na' | 300 control_index = mse_control.index(min(mse_control)) |
| 288 UTR_abundances = 'Na' | 301 treatment_index = mse_treatment.index(min(mse_treatment)) |
| 289 selected_break_point = 'Na' | 302 control_breakpoint = breakpoints[control_index] |
| 290 | 303 treatment_breakpoint = breakpoints[treatment_index] |
| 291 return select_mean_squared_error, selected_break_point, UTR_abundances | 304 control_abundance = estimate_abundance(normalized_utr_coverage, control_breakpoint, num_samples) |
| 292 | 305 treatment_abundance = estimate_abundance(normalized_utr_coverage, treatment_breakpoint, num_samples) |
| 293 | 306 return control_breakpoint, control_abundance, treatment_breakpoint, treatment_abundance |
| 294 def estimate_mse(cov, bp, num_samples): | 307 |
| | 308 |
| | 309 def estimate_mse(cov, bp, num_samples, num_control): |
| 295 """ | 310 """ |
| 296 get abundance of long utr vs short utr with breakpoint specifying the position of long and short utr. | 311 get abundance of long utr vs short utr with breakpoint specifying the position of long and short utr. |
| 297 """ | 312 """ |
| 298 with warnings.catch_warnings(): | 313 with warnings.catch_warnings(): |
| 299 warnings.simplefilter("ignore", category=RuntimeWarning) | 314 warnings.simplefilter("ignore", category=RuntimeWarning) |
| 300 long_utr_vector = cov[:num_samples, bp:] | 315 long_utr_vector = cov[:num_samples, bp:] |
| 301 short_utr_vector = cov[:num_samples, 0:bp] | 316 short_utr_vector = cov[:num_samples, 0:bp] |
| 302 mean_long_utr = np.mean(long_utr_vector, 1) | 317 mean_long_utr = np.mean(long_utr_vector, 1) |
| 303 mean_short_utr = np.mean(short_utr_vector, 1) | 318 mean_short_utr = np.mean(short_utr_vector, 1) |
| 304 square_mean_centered_short_utr_vector = (short_utr_vector[:num_samples] - mean_short_utr[:, np.newaxis] )**2 | 319 square_mean_centered_short_utr_vector = (short_utr_vector[:num_samples] - mean_short_utr[:, np.newaxis] )**2 |
| 305 square_mean_centered_long_utr_vector = (long_utr_vector[:num_samples] - mean_long_utr[:, np.newaxis])**2 | 320 square_mean_centered_long_utr_vector = (long_utr_vector[:num_samples] - mean_long_utr[:, np.newaxis])**2 |
| 306 mse = np.mean(np.append(square_mean_centered_short_utr_vector[:num_samples], square_mean_centered_long_utr_vector[:num_samples])) | 321 mse_control = np.mean(np.append(square_mean_centered_long_utr_vector[:num_control], square_mean_centered_short_utr_vector[:num_control])) |
| 307 return mse | 322 mse_treatment = np.mean(np.append(square_mean_centered_long_utr_vector[num_control:], square_mean_centered_short_utr_vector[num_control:])) |
| | 323 return mse_control, mse_treatment |
| | 324 |
| 308 | 325 |
| 309 def estimate_abundance(cov, bp, num_samples): | 326 def estimate_abundance(cov, bp, num_samples): |
| 310 with warnings.catch_warnings(): | 327 with warnings.catch_warnings(): |
| 311 warnings.simplefilter("ignore", category=RuntimeWarning) | 328 warnings.simplefilter("ignore", category=RuntimeWarning) |
| 312 long_utr_vector = cov[:num_samples, bp:] | 329 long_utr_vector = cov[:num_samples, bp:] |
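The coverage hunk (new lines 84–90) builds one shell pipeline per BAM file: sort the shared BED, run `bedtools coverage -d -s` for per-base, strand-aware depth, then `cut` the output down to name, position, and depth. A minimal runnable sketch of that fan-out, assuming `bedtools` is on `PATH` (the commit itself hard-codes `~/bin/bedtools`) and using illustrative file names:

```python
import subprocess

# Illustrative inputs; the tool derives these from its own configuration.
alignments = ["control_1.bam", "treatment_1.bam"]
names = {"control_1.bam": "control_1", "treatment_1.bam": "treatment_1"}

cmds = []
for bam in alignments:
    # sort the BED, compute per-base strand-aware coverage, keep name/position/depth
    cmd = ("sort -k1,1 -k2,2n tmp_bedfile.bed | "
           "bedtools coverage -d -s -abam {bam} -b stdin | "
           "cut -f 4,7,8 > coverage_file_{name}").format(bam=bam, name=names[bam])
    cmds.append(cmd)

# Popen already runs the pipelines concurrently; wait for every coverage file.
procs = [subprocess.Popen(cmd, shell=True) for cmd in cmds]
for p in procs:
    p.wait()
```

With `shell=True`, passing the command as a single string (rather than the one-element list used in the diff) is the conventional form.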
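The result rows are namedtuples whose field list combines the static columns (now including `breakpoint_type`) with per-sample statistics generated in nested loops. The `namedtuple` construction itself lies outside this excerpt; the sketch below shows one plausible assembly, with illustrative sample counts:

```python
from collections import namedtuple

static_desc = ["chr", "start", "end", "strand", "gene", "breakpoint",
               "breakpoint_type", "control_mean_percent", "treatment_mean_percent"]
samples_desc = []
for statistic in ["coverage_long", "coverage_short", "percent_long"]:
    for i in range(2):   # two control samples, illustrative
        samples_desc.append("control_{i}_{statistic}".format(i=i, statistic=statistic))
    for i in range(2):   # two treatment samples, illustrative
        samples_desc.append("treatment_{i}_{statistic}".format(i=i, statistic=statistic))

# one tuple type whose fields double as the TSV header written by write_results
Result = namedtuple("Result", static_desc + samples_desc)
```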
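Per-UTR work is fanned out with `multiprocessing.Pool.apply_async`, and after this commit each task returns a `(control, treatment)` pair whose members are keyed into `result_d` with `_bp_control`/`_bp_treatment` suffixes. A self-contained sketch of that collect loop, with a stub standing in for `calculate_all_utr`:

```python
from multiprocessing import Pool

def calculate_all_utr_stub(utr, utr_d):
    # stands in for the real worker; returns a (control, treatment) result pair
    return {"gene": utr}, {"gene": utr}

if __name__ == "__main__":
    utr_dict = {"geneA": {"strand": "+"}, "geneB": {"strand": "-"}}
    pool = Pool(2)
    processed_tasks = [pool.apply_async(calculate_all_utr_stub, (utr, utr_d))
                       for utr, utr_d in utr_dict.items()]
    result_d = {}
    for res in processed_tasks:
        res_control, res_treatment = res.get()  # blocks until the task finishes
        if isinstance(res_control, dict):
            result_d[res_control["gene"] + "_bp_control"] = res_control
        if isinstance(res_treatment, dict):
            result_d[res_treatment["gene"] + "_bp_treatment"] = res_treatment
    pool.close()
    pool.join()
```

`apply_async` pickles the callable, which is one reason `calculate_all_utr` lives at module level rather than on the class.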
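The central change is splitting the breakpoint search by group: `estimate_mse` now centers each coverage segment on its per-sample mean and averages the squared residuals separately over the control rows and the treatment rows, and `mse_to_breakpoint` picks the minimizing breakpoint for each group independently. A condensed, runnable sketch of that computation on a toy coverage matrix (names and shapes illustrative):

```python
import numpy as np

def per_group_mse(cov, bp, num_control):
    """MSE of modelling each segment (before/after bp) by its per-sample mean."""
    short, long_ = cov[:, :bp], cov[:, bp:]
    res_short = (short - short.mean(axis=1, keepdims=True)) ** 2
    res_long = (long_ - long_.mean(axis=1, keepdims=True)) ** 2
    # np.append flattens, so each MSE pools every residual within the group
    mse_control = np.mean(np.append(res_long[:num_control], res_short[:num_control]))
    mse_treatment = np.mean(np.append(res_long[num_control:], res_short[num_control:]))
    return mse_control, mse_treatment

# toy data: 2 control + 2 treatment samples, 200 nt UTR, coverage drops at 120
cov = np.ones((4, 200))
cov[:, 120:] = 0.2
breakpoints = range(10, 190)
mse_list = [per_group_mse(cov, bp, num_control=2) for bp in breakpoints]
control_index = min(range(len(mse_list)), key=lambda i: mse_list[i][0])
treatment_index = min(range(len(mse_list)), key=lambda i: mse_list[i][1])
print("control %d treatment %d" % (breakpoints[control_index],
                                   breakpoints[treatment_index]))  # both find 120
```

At the true breakpoint both segments are constant, so the pooled within-segment variance, and hence the MSE, drops to its minimum there.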
