comparison dapars.py @ 13:538c4e2b423e draft
planemo upload for repository https://github.com/mvdbeek/dapars commit deab588a5d5ec7022de63a395fbd04e415ba0a42-dirty
| field | value |
|---|---|
| author | mvdbeek |
| date | Fri, 30 Oct 2015 04:59:15 -0400 |
| parents | a5d8b08af089 |
| children | ea5d573d6d40 |
12:04895ae7ac58 (old) | 13:538c4e2b423e (new) |
---|---|
64 self.html_file = args.html_file | 64 self.html_file = args.html_file |
65 self.utr = args.utr_bed_file | 65 self.utr = args.utr_bed_file |
66 self.gtf_fields = filter_utr.get_gtf_fields() | 66 self.gtf_fields = filter_utr.get_gtf_fields() |
67 self.result_file = args.output_file | 67 self.result_file = args.output_file |
68 self.all_alignments = self.control_alignments + self.treatment_alignments | 68 self.all_alignments = self.control_alignments + self.treatment_alignments |
69 self.alignment_names = { file: os.path.basename(file) for file in self.all_alignments } | 69 self.alignment_names = OrderedDict(( file, os.path.basename(file)) for file in self.all_alignments ) |
70 self.num_samples = len(self.all_alignments) | 70 self.num_samples = len(self.all_alignments) |
71 self.utr_dict = self.get_utr_dict(0.2) | 71 self.utr_dict = self.get_utr_dict(0.2) |
72 self.dump_utr_dict_to_bedfile() | 72 self.dump_utr_dict_to_bedfile() |
73 print "Established dictionary of 3\'UTRs" | 73 print "Established dictionary of 3\'UTRs" |
74 self.coverage_files = self.run_bedtools_coverage() | 74 self.coverage_files = self.run_bedtools_coverage() |
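The substantive change in this hunk is `self.alignment_names` turning from a plain dict comprehension into an `OrderedDict`. This code targets Python 2.7 (`print` statements, `itervalues`), where plain dict iteration order is arbitrary; the later hunks index coverage arrays by sample position, so the names must come back in a fixed order. A minimal sketch of the difference, with hypothetical BAM names:

```python
# Sketch only: why the OrderedDict matters here. The BAM paths are
# hypothetical placeholders.
import os
from collections import OrderedDict

all_alignments = ["control_1.bam", "control_2.bam", "treatment_1.bam"]

# Under Python 2.7 a plain dict gives no ordering guarantee, so pairing
# .values() with positional coverage arrays could mix up samples.
alignment_names = OrderedDict(
    (path, os.path.basename(path)) for path in all_alignments
)

# Iteration now follows the control-then-treatment order of
# all_alignments, giving every sample a stable positional index.
names_in_order = list(alignment_names.values())
assert names_in_order == ["control_1.bam", "control_2.bam", "treatment_1.bam"]
```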
91 | 91 |
92 def run_bedtools_coverage(self): | 92 def run_bedtools_coverage(self): |
93 """ | 93 """ |
94 Use bedtools coverage to generate pileup data for all alignment files for the regions specified in utr_dict. | 94 Use bedtools coverage to generate pileup data for all alignment files for the regions specified in utr_dict. |
95 """ | 95 """ |
96 coverage_files = [] | |
97 cmds = [] | 96 cmds = [] |
98 for alignment_file in self.all_alignments: | 97 for alignment_file in self.all_alignments: |
99 cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | " | 98 cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | " |
100 cmd = cmd + "bedtools coverage -d -s -abam {alignment_file} -b stdin |" \ | 99 cmd = cmd + "bedtools coverage -d -s -abam {alignment_file} -b tmp_bedfile.bed |" \ |
101 " cut -f 4,7,8 > coverage_file_{alignment_name}".format( | 100 " cut -f 4,7,8 > coverage_file_{alignment_name}".format( |
102 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] ) | 101 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] ) |
103 cmds.append(cmd) | 102 cmds.append(cmd) |
104 pool = Pool(self.n_cpus) | |
105 subprocesses = [subprocess.Popen([cmd], shell=True) for cmd in cmds] | 103 subprocesses = [subprocess.Popen([cmd], shell=True) for cmd in cmds] |
106 [p.wait() for p in subprocesses] | 104 [p.wait() for p in subprocesses] |
107 coverage_files = ["gene_position_coverage_{alignment_name}".format( | 105 coverage_files = ["gene_position_coverage_{alignment_name}".format( |
108 alignment_name = self.alignment_names[alignment_file]) for alignment_file in self.all_alignments ] | 106 alignment_name = self.alignment_names[alignment_file]) for alignment_file in self.all_alignments ] |
109 return coverage_files | 107 return coverage_files |
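Two details of the revised command are worth flagging. `-b stdin` became `-b tmp_bedfile.bed`, so the `sort` feeding the pipe appears to go unused by `bedtools coverage`; and the pipelines write `coverage_file_<name>`, while the returned list is built from `gene_position_coverage_<name>`, even though `read_coverage_result` below opens the `coverage_file_<name>` form. The fan-out pattern itself looks like this sketch (placeholder file names):

```python
# Sketch of the fan-out pattern: one shell pipeline per sample, all
# started concurrently, then collected. File names are placeholders.
import os
import subprocess

all_alignments = ["control_1.bam", "treatment_1.bam"]  # hypothetical

cmds = [
    "sort -k1,1 -k2,2n tmp_bedfile.bed | "
    "bedtools coverage -d -s -abam {bam} -b tmp_bedfile.bed | "
    "cut -f 4,7,8 > coverage_file_{name}".format(
        bam=bam, name=os.path.basename(bam))
    for bam in all_alignments
]
# With shell=True a single command string is the conventional argument;
# the original's [cmd] list also works on POSIX, where extra list items
# become positional parameters of the shell.
procs = [subprocess.Popen(cmd, shell=True) for cmd in cmds]
for p in procs:
    p.wait()  # block until every pipeline has finished writing
```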
110 | 108 |
111 def read_coverage_result(self): | 109 def read_coverage_result(self): |
112 """ | 110 """ |
113 Read coverages back in and store as dictionary of numpy arrays | 111 Read coverages back in and store as dictionary of numpy arrays |
114 """ | 112 """ |
115 coverage_dict = { gene: { name: np.zeros(utr_d["new_end"]+1-utr_d["new_start"]) for name in self.alignment_names.itervalues() } for gene, utr_d in self.utr_dict.iteritems() } | 113 coverage_dict = { gene: [np.zeros(utr_d["new_end"]+1-utr_d["new_start"]) for name in self.alignment_names.values() ] for gene, utr_d in self.utr_dict.items() } |
116 for alignment_name in self.alignment_names.itervalues(): | 114 for i, alignment_name in enumerate(self.alignment_names.values()): |
117 with open("coverage_file_{alignment_name}".format(alignment_name = alignment_name)) as coverage_file: | 115 with open("coverage_file_{alignment_name}".format(alignment_name = alignment_name)) as coverage_file: |
118 for line in coverage_file: | 116 for line in coverage_file: |
119 gene, position, coverage= line.strip().split("\t") | 117 gene, position, coverage= line.strip().split("\t") |
120 coverage_dict[gene][alignment_name][int(position)-1] = coverage | 118 coverage_dict[gene][i][int(position)-1] = coverage |
121 for utr_d in self.utr_dict.itervalues(): | 119 for utr_d in self.utr_dict.values(): |
122 if utr_d["strand"] == "-": | 120 if utr_d["strand"] == "-": |
123 for alignment_name in self.alignment_names.values(): | 121 for i, alignment_name in enumerate(self.alignment_names.values()): |
124 coverage_dict[gene][alignment_name] = coverage_dict[gene][alignment_name][::-1] | 122 coverage_dict[gene][i] = coverage_dict[gene][i][::-1] |
125 return coverage_dict | 123 return coverage_dict |
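With `alignment_names` now ordered, each gene's coverage becomes a plain list indexed by sample position instead of a dict keyed by name. One caveat worth flagging: the minus-strand loop tests `utr_d["strand"]` but indexes `coverage_dict[gene]` with `gene` left over from the file-reading loop, so as written it appears to reverse arrays only for whichever gene was read last. A sketch of the presumably intended logic on simplified, invented inputs:

```python
import numpy as np

# Simplified stand-ins for self.utr_dict and the coverage files.
utr_dict = {
    "geneA": {"new_start": 100, "new_end": 104, "strand": "-"},
    "geneB": {"new_start": 200, "new_end": 204, "strand": "+"},
}
num_samples = 2

# One zero-filled array per sample, per gene.
coverage_dict = {
    gene: [np.zeros(d["new_end"] + 1 - d["new_start"]) for _ in range(num_samples)]
    for gene, d in utr_dict.items()
}

# Fill in per-base depths the way read_coverage_result does: each line
# of sample i's coverage file is (gene, 1-based position, depth).
line = "geneA\t1\t7"  # invented example record
gene, position, depth = line.strip().split("\t")
coverage_dict[gene][0][int(position) - 1] = float(depth)

# Reverse minus-strand genes so every array reads 5' -> 3'.
# Note: iterate gene and utr_d together, so the right gene is reversed.
for gene, d in utr_dict.items():
    if d["strand"] == "-":
        for i in range(num_samples):
            coverage_dict[gene][i] = coverage_dict[gene][i][::-1]
```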
126 | 124 |
127 def get_utr_dict(self, shift): | 125 def get_utr_dict(self, shift): |
128 """ | 126 """ |
129 The utr end is extended by UTR length * shift, to discover novel distal polyA sites. | 127 The utr end is extended by UTR length * shift, to discover novel distal polyA sites. |
152 utr_coverage is still confusing. | 150 utr_coverage is still confusing. |
153 """ | 151 """ |
154 coverage_per_alignment = [] | 152 coverage_per_alignment = [] |
155 for utr in self.utr_coverages.itervalues(): # TODO: be smarter about this. | 153 for utr in self.utr_coverages.itervalues(): # TODO: be smarter about this. |
156 utr_coverage = [] | 154 utr_coverage = [] |
157 for vector in utr.itervalues(): | 155 for vector in utr: |
158 utr_coverage.append(np.sum(vector)) | 156 utr_coverage.append(np.sum(vector)) |
159 coverage_per_alignment.append(utr_coverage) | 157 coverage_per_alignment.append(utr_coverage) |
160 coverages = np.array([ sum(x) for x in zip(*coverage_per_alignment) ]) | 158 coverages = np.array([ sum(x) for x in zip(*coverage_per_alignment) ]) |
161 coverage_weights = coverages / np.mean(coverages) # TODO: probably median is better suited? Or even no normalization! | 159 coverage_weights = coverages / np.mean(coverages) # TODO: probably median is better suited? Or even no normalization! |
162 return coverage_weights | 160 return coverage_weights |
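For reference, the weight computation above reduces to: sum each sample's coverage over all UTRs, then divide by the mean so an average-depth sample gets weight 1.0. A toy example with invented numbers; as the TODO notes, dividing by the median instead would be less sensitive to one unusually deep sample:

```python
import numpy as np

# Invented numbers: two UTRs, two samples; inner lists hold one
# per-base coverage vector per sample, matching the new list layout.
utr_coverages = [
    [np.array([10, 12, 8]), np.array([20, 22, 18])],
    [np.array([5, 5, 5]), np.array([9, 11, 10])],
]

# Total signal per sample within each UTR ...
coverage_per_alignment = [[np.sum(v) for v in utr] for utr in utr_coverages]
# ... summed over UTRs, then scaled by the mean so that an
# average-depth sample gets weight 1.0.
coverages = np.array([sum(x) for x in zip(*coverage_per_alignment)])
coverage_weights = coverages / np.mean(coverages)
print(coverage_weights)  # ~[0.667, 1.333]: sample 2 is twice as deep
```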
184 tasks = [ (self.utr_coverages[utr], self.plot_path, utr, utr_d, self.coverage_weights, len(self.control_alignments), | 182 tasks = [ (self.utr_coverages[utr], self.plot_path, utr, utr_d, self.coverage_weights, len(self.control_alignments), |
185 len(self.treatment_alignments), self.search_start, self.coverage_threshold) \ | 183 len(self.treatment_alignments), self.search_start, self.coverage_threshold) \ |
186 for utr, utr_d in self.utr_dict.iteritems() ] | 184 for utr, utr_d in self.utr_dict.iteritems() ] |
187 processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks] | 185 processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks] |
188 result_list = [res.get() for res in processed_tasks] | 186 result_list = [res.get() for res in processed_tasks] |
| 187 #result_list = [calculate_all_utr(*t) for t in tasks] # uncomment for easier debugging |
189 for res_control, res_treatment in result_list: | 188 for res_control, res_treatment in result_list: |
190 if not res_control: | 189 if not res_control: |
191 continue | 190 continue |
192 for i, result in enumerate(res_control): | 191 for i, result in enumerate(res_control): |
193 if isinstance(result, dict): | 192 if isinstance(result, dict): |
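The commented-out line added on the new side documents a standard `multiprocessing` trade-off: with `apply_async`, exceptions only surface when `.get()` is called and tracebacks point into the pool machinery, so a synchronous list comprehension is much easier to step through. A generic sketch of that toggle (`calculate` and its tasks are placeholders for `calculate_all_utr` and its argument tuples):

```python
from multiprocessing import Pool

def calculate(x):  # placeholder worker
    return x * x

def main(debug=False):
    tasks = [1, 2, 3]
    if debug:
        # Synchronous: exceptions surface immediately, pdb works normally.
        return [calculate(t) for t in tasks]
    pool = Pool(4)
    async_results = [pool.apply_async(calculate, (t,)) for t in tasks]
    # .get() blocks and re-raises any worker exception in the parent.
    results = [r.get() for r in async_results]
    pool.close()
    pool.join()
    return results

if __name__ == "__main__":  # guard required on spawn-based platforms
    print(main())
```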
280 """ | 279 """ |
281 We are searching for a point within the UTR that minimizes the mean squared error, if the coverage vector was divided | 280 We are searching for a point within the UTR that minimizes the mean squared error, if the coverage vector was divided |
282 at that point. utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample. | 281 at that point. utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample. |
283 """ | 282 """ |
284 num_samples = len(utr_coverage) | 283 num_samples = len(utr_coverage) |
285 normalized_utr_coverage = np.array(utr_coverage.values())/np.expand_dims(coverage_weigths, axis=1) | 284 normalized_utr_coverage = np.array(utr_coverage)/np.expand_dims(coverage_weigths, axis=1) |
286 start_coverage = [np.mean(coverage[0:99]) for coverage in utr_coverage.values()] # filters threshold on mean coverage over first 100 nt | 285 start_coverage = [np.mean(coverage[0:99]) for coverage in utr_coverage] # filters threshold on mean coverage over first 100 nt |
287 is_above_threshold = sum(np.array(start_coverage) >= coverage_threshold) >= num_samples # This filters on the raw threshold. Why? | 286 is_above_threshold = sum(np.array(start_coverage) >= coverage_threshold) >= num_samples # This filters on the raw threshold. Why? |
288 is_above_length = UTR_end - UTR_start >= 150 | 287 is_above_length = UTR_end - UTR_start >= 150 |
289 if (is_above_threshold) and (is_above_length): | 288 if (is_above_threshold) and (is_above_length): |
290 search_end = UTR_end - UTR_start | 289 search_end = UTR_end - UTR_start |
291 breakpoints = range(search_start, search_end + 1) | 290 breakpoints = range(search_start, search_end + 1) |
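For orientation, here is what the search over `breakpoints` amounts to, reduced to one sample: try every split point, model each side as a flat segment, and keep the split with the lowest combined squared error. The real code pools this across all samples, which is why `normalized_utr_coverage` is built first. A simplified, self-contained illustration:

```python
import numpy as np

def best_breakpoint(coverage, search_start=10):
    """Simplified single-sample version of the segmentation idea: model
    coverage as two flat segments and return the split index with the
    lowest combined mean squared error."""
    best_bp, best_mse = None, np.inf
    for bp in range(search_start, len(coverage) - search_start):
        left, right = coverage[:bp], coverage[bp:]
        mse = np.mean((left - left.mean()) ** 2) + \
              np.mean((right - right.mean()) ** 2)
        if mse < best_mse:
            best_bp, best_mse = bp, mse
    return best_bp

# Coverage dropping from ~20x to ~5x at position 50, plus noise:
cov = np.concatenate([np.full(50, 20.0), np.full(50, 5.0)])
cov += np.random.default_rng(0).normal(0, 1, cov.size)
print(best_breakpoint(cov))  # prints an index near 50
```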