comparison dapars.py @ 1:1b20ba32b4c5 draft
planemo upload for repository https://github.com/mvdbeek/dapars commit a02ced0b40d8946aa78a58321bd8f9771148391e-dirty
author | mvdbeek |
---|---|
date | Wed, 28 Oct 2015 05:32:30 -0400 |
parents | bb84ee2f2137 |
children | 47b0044bc7f8 |
comparison
0:bb84ee2f2137 | 1:1b20ba32b4c5 |
---|---|
80 """ | 80 """ |
81 coverage_files = [] | 81 coverage_files = [] |
82 cmds = [] | 82 cmds = [] |
83 for alignment_file in self.all_alignments: | 83 for alignment_file in self.all_alignments: |
84 cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | " | 84 cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | " |
85 cmd = cmd + "bedtools coverage -d -s -abam {alignment_file} -b stdin |" \ | 85 cmd = cmd + "~/bin/bedtools coverage -d -s -abam {alignment_file} -b stdin |" \ |
86 " cut -f 4,7,8 > coverage_file_{alignment_name}".format( | 86 " cut -f 4,7,8 > coverage_file_{alignment_name}".format( |
87 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] ) | 87 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] ) |
88 cmds.append(cmd) | 88 cmds.append(cmd) |
89 pool = Pool(self.n_cpus) | 89 pool = Pool(self.n_cpus) |
90 subprocesses = [subprocess.Popen([cmd], shell=True) for cmd in cmds] | 90 subprocesses = [subprocess.Popen([cmd], shell=True) for cmd in cmds] |
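For reference, a minimal sketch of the shell pipeline the loop above assembles (new side), assuming a hypothetical alignment file `sample1.bam` whose short name resolves to `sample1`, and assuming `tmp_bedfile.bed` is a six-column BED:

```python
# Illustrative only: the command string built for a hypothetical BAM
# "sample1.bam" with alignment name "sample1".
cmd = ("sort -k1,1 -k2,2n tmp_bedfile.bed | "                          # position-sort the BED
       "~/bin/bedtools coverage -d -s -abam sample1.bam -b stdin | "   # per-base, strand-aware depth
       "cut -f 4,7,8 > coverage_file_sample1")                         # keep name (4), position (7), depth (8)
```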
160 coverages = np.array([ sum(x) for x in zip(*coverage_per_alignment) ]) | 160 coverages = np.array([ sum(x) for x in zip(*coverage_per_alignment) ]) |
161 coverage_weights = coverages / np.mean(coverages) # TODO: probably the median is better suited? | 161 coverage_weights = coverages / np.mean(coverages) # TODO: probably the median is better suited? |
162 return coverage_weights | 162 return coverage_weights |
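A toy worked example of the normalization above (values invented): each sample's summed coverage is divided by the mean across samples, so the weights center on 1.

```python
import numpy as np

coverages = np.array([100.0, 200.0, 300.0])        # hypothetical per-sample coverage sums
coverage_weights = coverages / np.mean(coverages)  # mean is 200.0
print(coverage_weights)                            # [0.5 1.  1.5]
```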
163 | 163 |
164 def get_result_tuple(self): | 164 def get_result_tuple(self): |
165 static_desc = ["chr", "start", "end", "strand", "gene", "breakpoint", "control_mean_percent", "treatment_mean_percent" ] | 165 static_desc = ["chr", "start", "end", "strand", "gene", "breakpoint", |
166 "breakpoint_type", "control_mean_percent", "treatment_mean_percent" ] | |
166 samples_desc = [] | 167 samples_desc = [] |
167 for statistic in ["coverage_long", "coverage_short", "percent_long"]: | 168 for statistic in ["coverage_long", "coverage_short", "percent_long"]: |
168 for i, sample in enumerate(self.control_alignments): | 169 for i, sample in enumerate(self.control_alignments): |
169 samples_desc.append("control_{i}_{statistic}".format(i=i, statistic = statistic)) | 170 samples_desc.append("control_{i}_{statistic}".format(i=i, statistic = statistic)) |
170 for i, sample in enumerate(self.treatment_alignments): | 171 for i, sample in enumerate(self.treatment_alignments): |
179 "num_control":len(self.control_alignments), | 180 "num_control":len(self.control_alignments), |
180 "num_treatment":len(self.treatment_alignments), | 181 "num_treatment":len(self.treatment_alignments), |
181 "result_d":result_d} | 182 "result_d":result_d} |
182 pool = Pool(self.n_cpus) | 183 pool = Pool(self.n_cpus) |
183 tasks = [ (self.utr_coverages[utr], utr, utr_d, self.result_tuple._fields, self.coverage_weights, self.num_samples, | 184 tasks = [ (self.utr_coverages[utr], utr, utr_d, self.result_tuple._fields, self.coverage_weights, self.num_samples, |
184 len(self.control_alignments), len(self.treatment_alignments), result_d, self.search_start, self.coverage_threshold) for utr, utr_d in self.utr_dict.iteritems() ] | 185 len(self.control_alignments), len(self.treatment_alignments), self.search_start, |
186 self.coverage_threshold) for utr, utr_d in self.utr_dict.iteritems() ] | |
185 processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks] | 187 processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks] |
186 result = [res.get() for res in processed_tasks] | 188 result = [res.get() for res in processed_tasks] |
187 for d in result: | 189 for res_control, res_treatment in result: |
188 if isinstance(d, dict): | 190 if isinstance(res_control, dict): |
189 t = self.result_tuple(**d) | 191 t = self.result_tuple(**res_control) |
190 result_d[d["gene"]] = t | 192 result_d[res_control["gene"]+"_bp_control"] = t |
193 if isinstance(res_treatment, dict): | |
194 t = self.result_tuple(**res_treatment) | |
195 result_d[res_treatment["gene"]+"_bp_treatment"] = t | |
191 return result_d | 196 return result_d |
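A minimal, self-contained sketch of the fan-out/collect pattern used above, with a stand-in worker in place of calculate_all_utr; the `_bp_control` / `_bp_treatment` key suffixes mirror how the new side stores one result per condition.

```python
from multiprocessing import Pool

def toy_worker(gene):
    # Stand-in for calculate_all_utr: one result dict per condition.
    return {"gene": gene, "breakpoint": 10}, {"gene": gene, "breakpoint": 12}

if __name__ == "__main__":
    pool = Pool(2)
    processed = [pool.apply_async(toy_worker, (g,)) for g in ("geneA", "geneB")]
    result_d = {}
    for res_control, res_treatment in (r.get() for r in processed):
        result_d[res_control["gene"] + "_bp_control"] = res_control
        result_d[res_treatment["gene"] + "_bp_treatment"] = res_treatment
    # result_d now holds two entries per gene, one per condition.
```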
192 | 197 |
193 def write_results(self): | 198 def write_results(self): |
194 w = csv.writer(self.result_file, delimiter='\t') | 199 w = csv.writer(self.result_file, delimiter='\t') |
195 header = list(self.result_tuple._fields) | 200 header = list(self.result_tuple._fields) |
197 w.writerow(header) # field header | 202 w.writerow(header) # field header |
198 w.writerows( self.result_d.values()) | 203 w.writerows( self.result_d.values()) |
199 | 204 |
200 def write_bed(self): | 205 def write_bed(self): |
201 w = csv.writer(self.bed_output, delimiter='\t') | 206 w = csv.writer(self.bed_output, delimiter='\t') |
202 bed = [(result.chr, result.breakpoint, result.breakpoint+1, result.gene, 0, result.strand) for result in self.result_d.itervalues()] | 207 bed = [(result.chr, result.breakpoint, result.breakpoint+1, result.gene+"_"+result.breakpoint_type, 0, result.strand) for result in self.result_d.itervalues()] |
203 w.writerows(bed) | 208 w.writerows(bed) |
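For illustration, a row write_bed might now emit (coordinates and gene invented): a 1-bp interval at the breakpoint, with the gene name tagged by breakpoint type so control and treatment calls stay distinguishable.

```
chr1	10500	10501	geneA_control_breakpoint	0	+
```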
204 | 209 |
210 | |
205 def calculate_all_utr(utr_coverage, utr, utr_d, result_tuple_fields, coverage_weights, num_samples, num_control, | 211 def calculate_all_utr(utr_coverage, utr, utr_d, result_tuple_fields, coverage_weights, num_samples, num_control, |
206 num_treatment, result_d, search_start, coverage_threshold): | 212 num_treatment, search_start, coverage_threshold): |
207 res = dict(zip(result_tuple_fields, result_tuple_fields)) | 213 res_control = dict(zip(result_tuple_fields, result_tuple_fields)) |
214 res_treatment = res_control.copy() | |
208 if utr_d["strand"] == "+": | 215 if utr_d["strand"] == "+": |
209 is_reverse = False | 216 is_reverse = False |
210 else: | 217 else: |
211 is_reverse = True | 218 is_reverse = True |
212 mse, breakpoint, abundances = estimate_coverage_extended_utr(utr_coverage, | 219 control_breakpoint, \ |
213 utr_d["new_start"], | 220 control_abundance, \ |
214 utr_d["new_end"], | 221 treatment_breakpoint, \ |
215 is_reverse, | 222 treatment_abundance = optimize_breakpoint(utr_coverage, utr_d["new_start"], utr_d["new_end"], coverage_weights, |
216 coverage_weights, | 223 search_start, coverage_threshold, num_control) |
217 search_start, | 224 if control_breakpoint: |
218 coverage_threshold) | 225 breakpoint_to_result(res_control, utr, utr_d, control_breakpoint, "control_breakpoint", control_abundance, is_reverse, num_samples, |
219 if not str(mse) == "Na": | 226 num_control, num_treatment) |
220 long_coverage_vector = abundances[0] | 227 if treatment_breakpoint: |
221 short_coverage_vector = abundances[1] | 228 breakpoint_to_result(res_treatment, utr, utr_d, treatment_breakpoint, "treatment_breakpoint", treatment_abundance, is_reverse, |
222 num_non_zero = sum((np.array(long_coverage_vector) + np.array(short_coverage_vector)) > 0) # TODO: This introduces bias | 229 num_samples, num_control, num_treatment) |
223 if num_non_zero == num_samples: | 230 return res_control, res_treatment |
224 percentage_long = [] | 231 |
225 for i in range(num_samples): | 232 |
226 ratio = float(long_coverage_vector[i]) / (long_coverage_vector[i] + short_coverage_vector[i]) # long 3'UTR percentage | 233 |
227 percentage_long.append(ratio) | 234 |
228 for i in range(num_control): | 235 def breakpoint_to_result(res, utr, utr_d, breakpoint, breakpoint_type, |
229 res["control_{i}_coverage_long".format(i=i)] = float(long_coverage_vector[i]) | 236 abundances, is_reverse, num_samples, num_control, num_treatment): |
230 res["control_{i}_coverage_short".format(i=i)] = float(short_coverage_vector[i]) | 237 """ |
231 res["control_{i}_percent_long".format(i=i)] = percentage_long[i] | 238 Takes in a result dictionary res and fills the necessary fields |
232 for k in range(num_treatment): | 239 """ |
233 i = k + num_control | 240 long_coverage_vector = abundances[0] |
234 res["treatment_{i}_coverage_long".format(i=k)] = float(long_coverage_vector[i]) | 241 short_coverage_vector = abundances[1] |
235 res["treatment_{i}_coverage_short".format(i=k)] = float(short_coverage_vector[i]) | 242 num_non_zero = sum((np.array(long_coverage_vector) + np.array(short_coverage_vector)) > 0) # TODO: This introduces bias |
236 res["treatment_{i}_percent_long".format(i=k)] = percentage_long[i] | 243 if num_non_zero == num_samples: |
237 control_mean_percent = np.mean(np.array(percentage_long[:num_control])) | 244 percentage_long = [] |
238 treatment_mean_percent = np.mean(np.array(percentage_long[num_control:])) | 245 for i in range(num_samples): |
239 res["chr"] = utr_d["chr"] | 246 ratio = float(long_coverage_vector[i]) / (long_coverage_vector[i] + short_coverage_vector[i]) # long 3'UTR percentage |
240 res["start"] = utr_d["start"] | 247 percentage_long.append(ratio) |
241 res["end"] = utr_d["end"] | 248 for i in range(num_control): |
242 res["strand"] = utr_d["strand"] | 249 res["control_{i}_coverage_long".format(i=i)] = float(long_coverage_vector[i]) |
243 if is_reverse: | 250 res["control_{i}_coverage_short".format(i=i)] = float(short_coverage_vector[i]) |
244 breakpoint = utr_d["new_end"] - breakpoint | 251 res["control_{i}_percent_long".format(i=i)] = percentage_long[i] |
245 else: | 252 for k in range(num_treatment): |
246 breakpoint = utr_d["new_start"] + breakpoint | 253 i = k + num_control |
247 res["breakpoint"] = breakpoint | 254 res["treatment_{i}_coverage_long".format(i=k)] = float(long_coverage_vector[i]) |
248 res["control_mean_percent"] = control_mean_percent | 255 res["treatment_{i}_coverage_short".format(i=k)] = float(short_coverage_vector[i]) |
249 res["treatment_mean_percent"] = treatment_mean_percent | 256 res["treatment_{i}_percent_long".format(i=k)] = percentage_long[i] |
250 res["gene"] = utr | 257 control_mean_percent = np.mean(np.array(percentage_long[:num_control])) |
251 return res | 258 treatment_mean_percent = np.mean(np.array(percentage_long[num_control:])) |
252 | 259 res["chr"] = utr_d["chr"] |
253 | 260 res["start"] = utr_d["start"] |
254 def estimate_coverage_extended_utr(utr_coverage, UTR_start, | 261 res["end"] = utr_d["end"] |
255 UTR_end, is_reverse, coverage_weigths, search_start, coverage_threshold): | 262 res["strand"] = utr_d["strand"] |
256 """ | 263 if is_reverse: |
257 We are searching for a breakpoint in coverage?! | 264 breakpoint = utr_d["new_end"] - breakpoint |
258 utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample. | 265 else: |
266 breakpoint = utr_d["new_start"] + breakpoint | |
267 res["breakpoint"] = breakpoint | |
268 res["breakpoint_type"] = breakpoint_type | |
269 res["control_mean_percent"] = control_mean_percent | |
270 res["treatment_mean_percent"] = treatment_mean_percent | |
271 res["gene"] = utr | |
272 | |
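A toy example of the percent_long computation inside breakpoint_to_result (numbers invented): each sample's long-isoform fraction is its long-UTR coverage over total coverage.

```python
long_coverage_vector = [30.0, 10.0]   # hypothetical abundances[0]
short_coverage_vector = [10.0, 30.0]  # hypothetical abundances[1]
percentage_long = [l / (l + s) for l, s in
                   zip(long_coverage_vector, short_coverage_vector)]
print(percentage_long)  # [0.75, 0.25]
```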
273 | |
274 def optimize_breakpoint(utr_coverage, UTR_start, UTR_end, coverage_weigths, search_start, coverage_threshold, num_control): | |
275 """ | |
276 We are searching for the point within the UTR that minimizes the mean squared error when the coverage vector is divided |
277 at that point. utr_coverage is a list whose items are numpy arrays of per-sample coverage. |
259 """ | 278 """ |
260 search_point_end = int(abs((UTR_end - UTR_start)) * 0.1) # TODO: This is 10% of total UTR end. Why? | 279 search_point_end = int(abs((UTR_end - UTR_start)) * 0.1) # TODO: This is 10% of total UTR end. Why? |
261 num_samples = len(utr_coverage) | 280 num_samples = len(utr_coverage) |
262 ##read coverage filtering | 281 normalized_utr_coverage = np.array([coverage/ coverage_weigths[i] for i, coverage in enumerate( utr_coverage.values() )]) |
263 normalized_utr_coverage = [coverage/ coverage_weigths[i] for i, coverage in enumerate( utr_coverage.values() )] | |
264 start_coverage = [np.mean(coverage[0:100]) for coverage in utr_coverage.values()] # threshold filter on mean coverage over the first 100 nt | 282 start_coverage = [np.mean(coverage[0:100]) for coverage in utr_coverage.values()] # threshold filter on mean coverage over the first 100 nt |
265 is_above_threshold = sum(np.array(start_coverage) >= coverage_threshold) >= num_samples # This filters on the raw threshold. Why? | 283 is_above_threshold = sum(np.array(start_coverage) >= coverage_threshold) >= num_samples # This filters on the raw threshold. Why? |
266 is_above_length = UTR_end - UTR_start >= 150 | 284 is_above_length = UTR_end - UTR_start >= 150 |
267 if (is_above_threshold) and (is_above_length): | 285 if (is_above_threshold) and (is_above_length): |
268 if not is_reverse: | |
269 search_region = range(UTR_start + search_start, UTR_end - search_point_end + 1) | |
270 else: | |
271 search_region = range(UTR_end - search_start, UTR_start + search_point_end - 1, -1) | |
272 search_end = UTR_end - UTR_start - search_point_end | 286 search_end = UTR_end - UTR_start - search_point_end |
273 normalized_utr_coverage = np.array(normalized_utr_coverage) | |
274 breakpoints = range(search_start, search_end + 1) | 287 breakpoints = range(search_start, search_end + 1) |
275 mse_list = [ estimate_mse(normalized_utr_coverage,bp, num_samples) for bp in breakpoints ] | 288 mse_list = [ estimate_mse(normalized_utr_coverage, bp, num_samples, num_control) for bp in breakpoints ] |
276 if len(mse_list) > 0: | 289 if len(mse_list) > 0: |
277 min_ele_index = mse_list.index(min(mse_list)) | 290 return mse_to_breakpoint(mse_list, normalized_utr_coverage, breakpoints, num_samples) |
278 breakpoint = breakpoints[min_ele_index] | 291 return False, False, False, False |
279 UTR_abundances = estimate_abundance(normalized_utr_coverage, breakpoint, num_samples) | 292 |
280 select_mean_squared_error = mse_list[min_ele_index] | 293 |
281 selected_break_point = breakpoint | 294 def mse_to_breakpoint(mse_list, normalized_utr_coverage, breakpoints, num_samples): |
282 else: | 295 """ |
283 select_mean_squared_error = 'Na' | 296 Take in mse_list with control and treatment MSEs and return the breakpoint and UTR abundance per condition |
284 UTR_abundances = 'Na' | 297 """ |
285 selected_break_point = 'Na' | 298 mse_control = [mse[0] for mse in mse_list] |
286 else: | 299 mse_treatment = [mse[1] for mse in mse_list] |
287 select_mean_squared_error = 'Na' | 300 control_index = mse_control.index(min(mse_control)) |
288 UTR_abundances = 'Na' | 301 treatment_index = mse_treatment.index(min(mse_treatment)) |
289 selected_break_point = 'Na' | 302 control_breakpoint = breakpoints[control_index] |
290 | 303 treatment_breakpoint = breakpoints[treatment_index] |
291 return select_mean_squared_error, selected_break_point, UTR_abundances | 304 control_abundance = estimate_abundance(normalized_utr_coverage, control_breakpoint, num_samples) |
292 | 305 treatment_abundance = estimate_abundance(normalized_utr_coverage, treatment_breakpoint, num_samples) |
293 | 306 return control_breakpoint, control_abundance, treatment_breakpoint, treatment_abundance |
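A toy illustration of the selection in mse_to_breakpoint (numbers invented): the control and treatment MSE series are scanned independently, so each condition can settle on a different breakpoint from the shared candidate list.

```python
mse_list = [(4.0, 2.5), (1.0, 3.0), (2.0, 0.5)]  # hypothetical (control, treatment) pairs
breakpoints = [200, 210, 220]
mse_control = [m[0] for m in mse_list]
mse_treatment = [m[1] for m in mse_list]
print(breakpoints[mse_control.index(min(mse_control))])      # 210
print(breakpoints[mse_treatment.index(min(mse_treatment))])  # 220
```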
294 def estimate_mse(cov, bp, num_samples): | 307 |
308 | |
309 def estimate_mse(cov, bp, num_samples, num_control): | |
295 """ | 310 """ |
296 get abundance of long utr vs short utr with breakpoint specifying the position of long and short utr. | 311 get abundance of long utr vs short utr with breakpoint specifying the position of long and short utr. |
297 """ | 312 """ |
298 with warnings.catch_warnings(): | 313 with warnings.catch_warnings(): |
299 warnings.simplefilter("ignore", category=RuntimeWarning) | 314 warnings.simplefilter("ignore", category=RuntimeWarning) |
301 short_utr_vector = cov[:num_samples, 0:bp] | 316 short_utr_vector = cov[:num_samples, 0:bp] |
302 mean_long_utr = np.mean(long_utr_vector, 1) | 317 mean_long_utr = np.mean(long_utr_vector, 1) |
303 mean_short_utr = np.mean(short_utr_vector, 1) | 318 mean_short_utr = np.mean(short_utr_vector, 1) |
304 square_mean_centered_short_utr_vector = (short_utr_vector[:num_samples] - mean_short_utr[:, np.newaxis] )**2 | 319 square_mean_centered_short_utr_vector = (short_utr_vector[:num_samples] - mean_short_utr[:, np.newaxis] )**2 |
305 square_mean_centered_long_utr_vector = (long_utr_vector[:num_samples] - mean_long_utr[:, np.newaxis])**2 | 320 square_mean_centered_long_utr_vector = (long_utr_vector[:num_samples] - mean_long_utr[:, np.newaxis])**2 |
306 mse = np.mean(np.append(square_mean_centered_short_utr_vector[:num_samples], square_mean_centered_long_utr_vector[:num_samples])) | 321 mse_control = np.mean(np.append(square_mean_centered_long_utr_vector[:num_control], square_mean_centered_short_utr_vector[:num_control])) |
307 return mse | 322 mse_treatment = np.mean(np.append(square_mean_centered_long_utr_vector[num_control:], square_mean_centered_short_utr_vector[num_control:])) |
323 return mse_control, mse_treatment | |
324 | |
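A small numpy sketch of the two-segment fit scored by estimate_mse (data invented): each sample's coverage is split at the candidate breakpoint, each segment is centered on its own mean, and the squared deviations are averaged, control rows and treatment rows separately as on the new side.

```python
import numpy as np

# One control row, one treatment row, six positions, candidate breakpoint 3.
cov = np.array([[10., 10., 10., 2., 2., 2.],
                [10., 10., 10., 10., 10., 10.]])
bp, num_control = 3, 1
short_utr = cov[:, 0:bp]
long_utr = cov[:, bp:]
sq_short = (short_utr - np.mean(short_utr, 1)[:, np.newaxis]) ** 2
sq_long = (long_utr - np.mean(long_utr, 1)[:, np.newaxis]) ** 2
mse_control = np.mean(np.append(sq_long[:num_control], sq_short[:num_control]))
mse_treatment = np.mean(np.append(sq_long[num_control:], sq_short[num_control:]))
print(mse_control, mse_treatment)  # 0.0 0.0 -- flat segments fit perfectly
```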
308 | 325 |
309 def estimate_abundance(cov, bp, num_samples): | 326 def estimate_abundance(cov, bp, num_samples): |
310 with warnings.catch_warnings(): | 327 with warnings.catch_warnings(): |
311 warnings.simplefilter("ignore", category=RuntimeWarning) | 328 warnings.simplefilter("ignore", category=RuntimeWarning) |
312 long_utr_vector = cov[:num_samples, bp:] | 329 long_utr_vector = cov[:num_samples, bp:] |