comparison dapars.py @ 13:538c4e2b423e draft

planemo upload for repository https://github.com/mvdbeek/dapars commit deab588a5d5ec7022de63a395fbd04e415ba0a42-dirty
author mvdbeek
date Fri, 30 Oct 2015 04:59:15 -0400
parents a5d8b08af089
children ea5d573d6d40
comparison
equal deleted inserted replaced
12:04895ae7ac58 13:538c4e2b423e
64 self.html_file = args.html_file 64 self.html_file = args.html_file
65 self.utr = args.utr_bed_file 65 self.utr = args.utr_bed_file
66 self.gtf_fields = filter_utr.get_gtf_fields() 66 self.gtf_fields = filter_utr.get_gtf_fields()
67 self.result_file = args.output_file 67 self.result_file = args.output_file
68 self.all_alignments = self.control_alignments + self.treatment_alignments 68 self.all_alignments = self.control_alignments + self.treatment_alignments
69 self.alignment_names = { file: os.path.basename(file) for file in self.all_alignments } 69 self.alignment_names = OrderedDict(( file, os.path.basename(file)) for file in self.all_alignments )
70 self.num_samples = len(self.all_alignments) 70 self.num_samples = len(self.all_alignments)
71 self.utr_dict = self.get_utr_dict(0.2) 71 self.utr_dict = self.get_utr_dict(0.2)
72 self.dump_utr_dict_to_bedfile() 72 self.dump_utr_dict_to_bedfile()
73 print "Established dictionary of 3\'UTRs" 73 print "Established dictionary of 3\'UTRs"
74 self.coverage_files = self.run_bedtools_coverage() 74 self.coverage_files = self.run_bedtools_coverage()
91 91
92 def run_bedtools_coverage(self): 92 def run_bedtools_coverage(self):
93 """ 93 """
94 Use bedtools coverage to generate pileup data for all alignment files for the regions specified in utr_dict. 94 Use bedtools coverage to generate pileup data for all alignment files for the regions specified in utr_dict.
95 """ 95 """
96 coverage_files = []
97 cmds = [] 96 cmds = []
98 for alignment_file in self.all_alignments: 97 for alignment_file in self.all_alignments:
99 cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | " 98 cmd = "sort -k1,1 -k2,2n tmp_bedfile.bed | "
100 cmd = cmd + "bedtools coverage -d -s -abam {alignment_file} -b stdin |" \ 99 cmd = cmd + "bedtools coverage -d -s -abam {alignment_file} -b tmp_bedfile.bed |" \
101 " cut -f 4,7,8 > coverage_file_{alignment_name}".format( 100 " cut -f 4,7,8 > coverage_file_{alignment_name}".format(
102 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] ) 101 alignment_file = alignment_file, alignment_name= self.alignment_names[alignment_file] )
103 cmds.append(cmd) 102 cmds.append(cmd)
104 pool = Pool(self.n_cpus)
105 subprocesses = [subprocess.Popen([cmd], shell=True) for cmd in cmds] 103 subprocesses = [subprocess.Popen([cmd], shell=True) for cmd in cmds]
106 [p.wait() for p in subprocesses] 104 [p.wait() for p in subprocesses]
107 coverage_files = ["gene_position_coverage_{alignment_name}".format( 105 coverage_files = ["gene_position_coverage_{alignment_name}".format(
108 alignment_name = self.alignment_names[alignment_file]) for alignment_file in self.all_alignments ] 106 alignment_name = self.alignment_names[alignment_file]) for alignment_file in self.all_alignments ]
109 return coverage_files 107 return coverage_files
110 108
111 def read_coverage_result(self): 109 def read_coverage_result(self):
112 """ 110 """
113 Read coverages back in and store as dictionary of numpy arrays 111 Read coverages back in and store as dictionary of numpy arrays
114 """ 112 """
115 coverage_dict = { gene: { name: np.zeros(utr_d["new_end"]+1-utr_d["new_start"]) for name in self.alignment_names.itervalues() } for gene, utr_d in self.utr_dict.iteritems() } 113 coverage_dict = { gene: [np.zeros(utr_d["new_end"]+1-utr_d["new_start"]) for name in self.alignment_names.values() ] for gene, utr_d in self.utr_dict.items() }
116 for alignment_name in self.alignment_names.itervalues(): 114 for i, alignment_name in enumerate(self.alignment_names.values()):
117 with open("coverage_file_{alignment_name}".format(alignment_name = alignment_name)) as coverage_file: 115 with open("coverage_file_{alignment_name}".format(alignment_name = alignment_name)) as coverage_file:
118 for line in coverage_file: 116 for line in coverage_file:
119 gene, position, coverage= line.strip().split("\t") 117 gene, position, coverage= line.strip().split("\t")
120 coverage_dict[gene][alignment_name][int(position)-1] = coverage 118 coverage_dict[gene][i][int(position)-1] = coverage
121 for utr_d in self.utr_dict.itervalues(): 119 for utr_d in self.utr_dict.values():
122 if utr_d["strand"] == "-": 120 if utr_d["strand"] == "-":
123 for alignment_name in self.alignment_names.values(): 121 for i, alignment_name in enumerate(self.alignment_names.values()):
124 coverage_dict[gene][alignment_name] = coverage_dict[gene][alignment_name][::-1] 122 coverage_dict[gene][i] = coverage_dict[gene][i][::-1]
125 return coverage_dict 123 return coverage_dict
126 124
127 def get_utr_dict(self, shift): 125 def get_utr_dict(self, shift):
128 """ 126 """
129 The utr end is extended by UTR length * shift, to discover novel distal polyA sites. 127 The utr end is extended by UTR length * shift, to discover novel distal polyA sites.
152 utr_coverage is still confusing. 150 utr_coverage is still confusing.
153 """ 151 """
154 coverage_per_alignment = [] 152 coverage_per_alignment = []
155 for utr in self.utr_coverages.itervalues(): # TODO: be smarter about this. 153 for utr in self.utr_coverages.itervalues(): # TODO: be smarter about this.
156 utr_coverage = [] 154 utr_coverage = []
157 for vector in utr.itervalues(): 155 for vector in utr:
158 utr_coverage.append(np.sum(vector)) 156 utr_coverage.append(np.sum(vector))
159 coverage_per_alignment.append(utr_coverage) 157 coverage_per_alignment.append(utr_coverage)
160 coverages = np.array([ sum(x) for x in zip(*coverage_per_alignment) ]) 158 coverages = np.array([ sum(x) for x in zip(*coverage_per_alignment) ])
161 coverage_weights = coverages / np.mean(coverages) # TODO: proabably median is better suited? Or even no normalization! 159 coverage_weights = coverages / np.mean(coverages) # TODO: proabably median is better suited? Or even no normalization!
162 return coverage_weights 160 return coverage_weights
184 tasks = [ (self.utr_coverages[utr], self.plot_path, utr, utr_d, self.coverage_weights, len(self.control_alignments), 182 tasks = [ (self.utr_coverages[utr], self.plot_path, utr, utr_d, self.coverage_weights, len(self.control_alignments),
185 len(self.treatment_alignments), self.search_start, self.coverage_threshold) \ 183 len(self.treatment_alignments), self.search_start, self.coverage_threshold) \
186 for utr, utr_d in self.utr_dict.iteritems() ] 184 for utr, utr_d in self.utr_dict.iteritems() ]
187 processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks] 185 processed_tasks = [ pool.apply_async(calculate_all_utr, t) for t in tasks]
188 result_list = [res.get() for res in processed_tasks] 186 result_list = [res.get() for res in processed_tasks]
187 #result_list = [calculate_all_utr(*t) for t in tasks] # uncomment for easier debugging
189 for res_control, res_treatment in result_list: 188 for res_control, res_treatment in result_list:
190 if not res_control: 189 if not res_control:
191 continue 190 continue
192 for i, result in enumerate(res_control): 191 for i, result in enumerate(res_control):
193 if isinstance(result, dict): 192 if isinstance(result, dict):
280 """ 279 """
281 We are searching for a point within the UTR that minimizes the mean squared error, if the coverage vector was divided 280 We are searching for a point within the UTR that minimizes the mean squared error, if the coverage vector was divided
282 at that point. utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample. 281 at that point. utr_coverage is a list with items corresponding to numpy arrays of coverage for a sample.
283 """ 282 """
284 num_samples = len(utr_coverage) 283 num_samples = len(utr_coverage)
285 normalized_utr_coverage = np.array(utr_coverage.values())/np.expand_dims(coverage_weigths, axis=1) 284 normalized_utr_coverage = np.array(utr_coverage)/np.expand_dims(coverage_weigths, axis=1)
286 start_coverage = [np.mean(coverage[0:99]) for coverage in utr_coverage.values()] # filters threshold on mean coverage over first 100 nt 285 start_coverage = [np.mean(coverage[0:99]) for coverage in utr_coverage] # filters threshold on mean coverage over first 100 nt
287 is_above_threshold = sum(np.array(start_coverage) >= coverage_threshold) >= num_samples # This filters on the raw threshold. Why? 286 is_above_threshold = sum(np.array(start_coverage) >= coverage_threshold) >= num_samples # This filters on the raw threshold. Why?
288 is_above_length = UTR_end - UTR_start >= 150 287 is_above_length = UTR_end - UTR_start >= 150
289 if (is_above_threshold) and (is_above_length): 288 if (is_above_threshold) and (is_above_length):
290 search_end = UTR_end - UTR_start 289 search_end = UTR_end - UTR_start
291 breakpoints = range(search_start, search_end + 1) 290 breakpoints = range(search_start, search_end + 1)