Mercurial > repos > rnateam > graphprot_predict_profile
comparison graphprot_predict_wrapper.py @ 5:58ebf089377e draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit 902e994cb04db968ce797afa676f5fa6512ab6d3
author | bgruening |
---|---|
date | Tue, 06 Aug 2024 14:54:58 +0000 |
parents | 2d6de5b769e6 |
children |
comparison
equal
deleted
inserted
replaced
4:2d6de5b769e6 | 5:58ebf089377e |
---|---|
4 import os | 4 import os |
5 import subprocess | 5 import subprocess |
6 import sys | 6 import sys |
7 | 7 |
8 import gplib | 8 import gplib |
9 | |
10 | 9 |
11 """ | 10 """ |
12 | 11 |
13 TOOL DEPENDENCIES | 12 TOOL DEPENDENCIES |
14 ================= | 13 ================= |
79 --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 | 78 --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 |
80 | 79 |
81 """ | 80 """ |
82 | 81 |
83 | 82 |
84 ############################################################################### | 83 ####################################################################### |
84 | |
85 | 85 |
86 def setup_argument_parser(): | 86 def setup_argument_parser(): |
87 """Setup argparse parser.""" | 87 """Setup argparse parser.""" |
88 help_description = """ | 88 help_description = """ |
89 Galaxy wrapper script for GraphProt (-action predict and -action | 89 Galaxy wrapper script for GraphProt (-action predict and -action |
98 If --gen-site-bed .bed file is provided, peak regions will be output | 98 If --gen-site-bed .bed file is provided, peak regions will be output |
99 with genomic coordinates too. | 99 with genomic coordinates too. |
100 | 100 |
101 """ | 101 """ |
102 # Define argument parser. | 102 # Define argument parser. |
103 p = ap.ArgumentParser(add_help=False, | 103 p = ap.ArgumentParser( |
104 prog="graphprot_predict_wrapper.py", | 104 add_help=False, |
105 description=help_description, | 105 prog="graphprot_predict_wrapper.py", |
106 formatter_class=ap.MetavarTypeHelpFormatter) | 106 description=help_description, |
107 formatter_class=ap.MetavarTypeHelpFormatter, | |
108 ) | |
107 | 109 |
108 # Argument groups. | 110 # Argument groups. |
109 p_man = p.add_argument_group("REQUIRED ARGUMENTS") | 111 p_man = p.add_argument_group("REQUIRED ARGUMENTS") |
110 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") | 112 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") |
111 | 113 |
112 # Required arguments. | 114 # Required arguments. |
113 p_opt.add_argument("-h", "--help", | 115 p_opt.add_argument("-h", "--help", action="help", help="Print help message") |
114 action="help", | 116 p_man.add_argument( |
115 help="Print help message") | 117 "--fasta", |
116 p_man.add_argument("--fasta", | 118 dest="in_fa", |
117 dest="in_fa", | 119 type=str, |
118 type=str, | 120 required=True, |
119 required=True, | 121 help="Sequences .fa file to predict" " on (option -fasta)", |
120 help="Sequences .fa file to predict" | 122 ) |
121 " on (option -fasta)") | 123 p_man.add_argument( |
122 p_man.add_argument("--model", | 124 "--model", |
123 dest="in_model", | 125 dest="in_model", |
124 type=str, | 126 type=str, |
125 required=True, | 127 required=True, |
126 help="GraphProt model file to use for predictions" | 128 help="GraphProt model file to use for predictions" " (option -model)", |
127 " (option -model)") | 129 ) |
128 p_man.add_argument("--params", | 130 p_man.add_argument( |
129 dest="in_params", | 131 "--params", |
130 type=str, | 132 dest="in_params", |
131 required=True, | 133 type=str, |
132 help="Parameter file for given model") | 134 required=True, |
133 p_man.add_argument("--data-id", | 135 help="Parameter file for given model", |
134 dest="data_id", | 136 ) |
135 type=str, | 137 p_man.add_argument( |
136 required=True, | 138 "--data-id", |
137 help="Data ID (option -prefix)") | 139 dest="data_id", |
140 type=str, | |
141 required=True, | |
142 help="Data ID (option -prefix)", | |
143 ) | |
138 # ---> I'm a conditional argument <--- | 144 # ---> I'm a conditional argument <--- |
139 p_opt.add_argument("--ws-pred", | 145 p_opt.add_argument( |
140 dest="ws_pred", | 146 "--ws-pred", |
141 default=False, | 147 dest="ws_pred", |
142 action="store_true", | 148 default=False, |
143 help="Run a whole site prediction instead " | 149 action="store_true", |
144 "of calculating profiles (default: false)") | 150 help="Run a whole site prediction instead " |
151 "of calculating profiles (default: false)", | |
152 ) | |
145 # Additional arguments. | 153 # Additional arguments. |
146 p_opt.add_argument("--sc-thr", | 154 p_opt.add_argument( |
147 dest="score_thr", | 155 "--sc-thr", |
148 type=float, | 156 dest="score_thr", |
149 default=0, | 157 type=float, |
150 help="Score threshold for extracting " | 158 default=0, |
151 "average profile peak regions (default: 0)") | 159 help="Score threshold for extracting " |
152 p_opt.add_argument("--max-merge-dist", | 160 "average profile peak regions (default: 0)", |
153 dest="max_merge_dist", | 161 ) |
154 type=int, | 162 p_opt.add_argument( |
155 default=0, | 163 "--max-merge-dist", |
156 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], | 164 dest="max_merge_dist", |
157 help="Maximum merge distance for nearby peak regions" | 165 type=int, |
158 " (default: report all non-overlapping regions)") | 166 default=0, |
159 p_opt.add_argument("--gen-site-bed", | 167 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], |
160 dest="genomic_sites_bed", | 168 help="Maximum merge distance for nearby peak regions" |
161 type=str, | 169 " (default: report all non-overlapping regions)", |
162 help=".bed file specifying the genomic regions of " | 170 ) |
163 "the input .fa sequences. Corrupt .bed " | 171 p_opt.add_argument( |
164 "information will be punished (default: false)") | 172 "--gen-site-bed", |
165 p_opt.add_argument("--conf-out", | 173 dest="genomic_sites_bed", |
166 dest="conf_out", | 174 type=str, |
167 default=False, | 175 help=".bed file specifying the genomic regions of " |
168 action="store_true", | 176 "the input .fa sequences. Corrupt .bed " |
169 help="Output filtered peak regions BED file or " | 177 "information will be punished (default: false)", |
170 "predictions file (if --ws-pred) using the median " | 178 ) |
171 "positive training site score for filtering " | 179 p_opt.add_argument( |
172 "(default: false)") | 180 "--conf-out", |
173 p_opt.add_argument("--gp-output", | 181 dest="conf_out", |
174 dest="gp_output", | 182 default=False, |
175 default=False, | 183 action="store_true", |
176 action="store_true", | 184 help="Output filtered peak regions BED file or " |
177 help="Print output produced by GraphProt " | 185 "predictions file (if --ws-pred) using the median " |
178 "(default: false)") | 186 "positive training site score for filtering " |
179 p_opt.add_argument("--ap-extlr", | 187 "(default: false)", |
180 dest="ap_extlr", | 188 ) |
181 type=int, | 189 p_opt.add_argument( |
182 default=5, | 190 "--gp-output", |
183 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], | 191 dest="gp_output", |
184 help="Define average profile up- and downstream " | 192 default=False, |
185 "extension to produce the average profile. " | 193 action="store_true", |
186 "The mean over small sequence windows " | 194 help="Print output produced by GraphProt " "(default: false)", |
187 "(window length = --ap-extlr*2 + 1) is used to " | 195 ) |
188 "get position scores, thus the average profile " | 196 p_opt.add_argument( |
189 "is more smooth than the initial profile output " | 197 "--ap-extlr", |
190 "by GraphProt (default: 5)") | 198 dest="ap_extlr", |
199 type=int, | |
200 default=5, | |
201 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], | |
202 help="Define average profile up- and downstream " | |
203 "extension to produce the average profile. " | |
204 "The mean over small sequence windows " | |
205 "(window length = --ap-extlr*2 + 1) is used to " | |
206 "get position scores, thus the average profile " | |
207 "is more smooth than the initial profile output " | |
208 "by GraphProt (default: 5)", | |
209 ) | |
191 return p | 210 return p |
192 | 211 |
193 | 212 |
194 ############################################################################### | 213 ####################################################################### |
195 | 214 |
196 if __name__ == '__main__': | 215 if __name__ == "__main__": |
197 | 216 |
198 # Setup argparse. | 217 # Setup argparse. |
199 parser = setup_argument_parser() | 218 parser = setup_argument_parser() |
200 # Read in command line arguments. | 219 # Read in command line arguments. |
201 args = parser.parse_args() | 220 args = parser.parse_args() |
207 # Check for Linux. | 226 # Check for Linux. |
208 assert "linux" in sys.platform, "please use Linux" | 227 assert "linux" in sys.platform, "please use Linux" |
209 # Check tool availability. | 228 # Check tool availability. |
210 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" | 229 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" |
211 # Check file inputs. | 230 # Check file inputs. |
212 assert os.path.exists(args.in_fa), \ | 231 assert os.path.exists(args.in_fa), 'input .fa file "%s" not found' % (args.in_fa) |
213 "input .fa file \"%s\" not found" % (args.in_fa) | 232 assert os.path.exists(args.in_model), 'input .model file "%s" not found' % ( |
214 assert os.path.exists(args.in_model), \ | 233 args.in_model |
215 "input .model file \"%s\" not found" % (args.in_model) | 234 ) |
216 assert os.path.exists(args.in_params), \ | 235 assert os.path.exists(args.in_params), 'input .params file "%s" not found' % ( |
217 "input .params file \"%s\" not found" % (args.in_params) | 236 args.in_params |
237 ) | |
218 # Count .fa entries. | 238 # Count .fa entries. |
219 c_in_fa = gplib.count_fasta_headers(args.in_fa) | 239 c_in_fa = gplib.count_fasta_headers(args.in_fa) |
220 assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa) | 240 assert c_in_fa, 'input .fa file "%s" no headers found' % (args.in_fa) |
221 print("# input .fa sequences: %i" % (c_in_fa)) | 241 print("# input .fa sequences: %i" % (c_in_fa)) |
222 # Read in FASTA sequences to check for uppercase sequences. | 242 # Read in FASTA sequences to check for uppercase sequences. |
223 seqs_dic = gplib.read_fasta_into_dic(args.in_fa) | 243 seqs_dic = gplib.read_fasta_into_dic(args.in_fa) |
224 # Check for lowercase only sequences, which cause GP to crash. | 244 # Check for lowercase only sequences, which cause GP to crash. |
225 error_mess = "input sequences encountered containing "\ | 245 error_mess = ( |
226 "only lowercase characters or lowercase characters in between "\ | 246 "input sequences encountered containing " |
227 "uppercase characters. Please provide either all uppercase "\ | 247 "only lowercase characters or lowercase characters in between " |
228 "sequences or sequences containing uppercase regions surrounded "\ | 248 "uppercase characters. Please provide either all uppercase " |
229 "by lowercase context regions for structure calculation (see "\ | 249 "sequences or sequences containing uppercase regions surrounded " |
230 "viewpoint concept in original GraphProt publication "\ | 250 "by lowercase context regions for structure calculation (see " |
251 "viewpoint concept in original GraphProt publication " | |
231 "for more details)" | 252 "for more details)" |
253 ) | |
232 if args.ws_pred: | 254 if args.ws_pred: |
233 bad_ids = gplib.check_seqs_dic_format(seqs_dic) | 255 bad_ids = gplib.check_seqs_dic_format(seqs_dic) |
234 assert not bad_ids, "%s" % (error_mess) | 256 assert not bad_ids, "%s" % (error_mess) |
235 | 257 |
236 c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic) | 258 c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic) |
237 assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\ | 259 assert c_uc_nt, ( |
238 "Please change sequences to uppercase (keep in mind "\ | 260 "no uppercase nucleotides in input .fa sequences. " |
239 "GraphProt only scores uppercase regions (according "\ | 261 "Please change sequences to uppercase (keep in mind " |
240 "to its viewpoint concept)" | 262 "GraphProt only scores uppercase regions (according " |
263 "to its viewpoint concept)" | |
264 ) | |
241 if not args.ws_pred: | 265 if not args.ws_pred: |
242 # Check for lowercase sequences. | 266 # Check for lowercase sequences. |
243 c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic) | 267 c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic) |
244 assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\ | 268 assert not c_lc_nt, ( |
245 " in profile predictions, since GraphProt only scores "\ | 269 "lowercase nucleotides in input .fa not allowed" |
270 " in profile predictions, since GraphProt only scores " | |
246 "uppercase regions (according to its viewpoint concept)" | 271 "uppercase regions (according to its viewpoint concept)" |
272 ) | |
247 # Check .bed. | 273 # Check .bed. |
248 if args.genomic_sites_bed: | 274 if args.genomic_sites_bed: |
249 # An array of checks, marvelous. | 275 # An array of checks, marvelous. |
250 assert \ | 276 assert os.path.exists( |
251 os.path.exists(args.genomic_sites_bed), \ | 277 args.genomic_sites_bed |
252 "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed) | 278 ), 'genomic .bed file "%s" not found' % (args.genomic_sites_bed) |
253 # Check .bed for content. | 279 # Check .bed for content. |
254 assert gplib.count_file_rows(args.genomic_sites_bed), \ | 280 assert gplib.count_file_rows( |
255 "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed) | 281 args.genomic_sites_bed |
282 ), 'genomic .bed file "%s" is empty' % (args.genomic_sites_bed) | |
256 # Check .bed for 6-column format. | 283 # Check .bed for 6-column format. |
257 assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \ | 284 assert gplib.bed_check_six_col_format( |
258 "genomic .bed file \"%s\" appears to not "\ | 285 args.genomic_sites_bed |
259 "be in 6-column .bed format" % (args.genomic_sites_bed) | 286 ), 'genomic .bed file "%s" appears to not ' "be in 6-column .bed format" % ( |
287 args.genomic_sites_bed | |
288 ) | |
260 # Check for unique column 4 IDs. | 289 # Check for unique column 4 IDs. |
261 assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \ | 290 assert gplib.bed_check_unique_ids( |
262 "genomic .bed file \"%s\" column 4 IDs not unique" % \ | 291 args.genomic_sites_bed |
263 (args.genomic_sites_bed) | 292 ), 'genomic .bed file "%s" column 4 IDs not unique' % (args.genomic_sites_bed) |
264 # Read in .bed regions, compare to FASTA sequences. | 293 # Read in .bed regions, compare to FASTA sequences. |
265 seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic) | 294 seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic) |
266 reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed) | 295 reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed) |
267 for seq_id in seq_len_dic: | 296 for seq_id in seq_len_dic: |
268 seq_l = seq_len_dic[seq_id] | 297 seq_l = seq_len_dic[seq_id] |
269 assert seq_id in reg_len_dic, \ | 298 assert ( |
270 "sequence ID \"%s\" missing in input .bed \"%s\"" % \ | 299 seq_id in reg_len_dic |
271 (seq_id, args.genomic_sites_bed) | 300 ), 'sequence ID "%s" missing in input .bed "%s"' % ( |
301 seq_id, | |
302 args.genomic_sites_bed, | |
303 ) | |
272 reg_l = reg_len_dic[seq_id] | 304 reg_l = reg_len_dic[seq_id] |
273 assert seq_l == reg_l, \ | 305 assert ( |
274 "sequence length differs from .bed region length (%i != %i)" \ | 306 seq_l == reg_l |
275 % (seq_l, reg_l) | 307 ), "sequence length differs from .bed region length (%i != %i)" % ( |
308 seq_l, | |
309 reg_l, | |
310 ) | |
276 # Read in model parameters. | 311 # Read in model parameters. |
277 param_dic = gplib.graphprot_get_param_dic(args.in_params) | 312 param_dic = gplib.graphprot_get_param_dic(args.in_params) |
278 # Create GraphProt parameter string. | 313 # Create GraphProt parameter string. |
279 param_string = gplib.graphprot_get_param_string(args.in_params) | 314 param_string = gplib.graphprot_get_param_string(args.in_params) |
280 | 315 |
282 Run predictions. | 317 Run predictions. |
283 | 318 |
284 """ | 319 """ |
285 if args.ws_pred: | 320 if args.ws_pred: |
286 # Do whole site prediction. | 321 # Do whole site prediction. |
287 print("Starting whole site predictions on input .fa file" | 322 print( |
288 " (-action predict) ... ") | 323 "Starting whole site predictions on input .fa file" |
289 check_cmd = "GraphProt.pl -action predict -prefix " \ | 324 " (-action predict) ... " |
290 + args.data_id + " -fasta " + args.in_fa + " " \ | 325 ) |
291 + param_string + " -model " + args.in_model | 326 check_cmd = ( |
327 "GraphProt.pl -action predict -prefix " | |
328 + args.data_id | |
329 + " -fasta " | |
330 + args.in_fa | |
331 + " " | |
332 + param_string | |
333 + " -model " | |
334 + args.in_model | |
335 ) | |
292 output = subprocess.getoutput(check_cmd) | 336 output = subprocess.getoutput(check_cmd) |
293 assert output, "the following call of GraphProt.pl "\ | 337 assert ( |
294 "produced no output:\n%s" % (check_cmd) | 338 output |
339 ), "the following call of GraphProt.pl " "produced no output:\n%s" % (check_cmd) | |
295 if args.gp_output: | 340 if args.gp_output: |
296 print(output) | 341 print(output) |
297 ws_predictions_file = args.data_id + ".predictions" | 342 ws_predictions_file = args.data_id + ".predictions" |
298 assert os.path.exists(ws_predictions_file), \ | 343 assert os.path.exists( |
299 "Whole site prediction output .predictions file \"%s\" not found" \ | 344 ws_predictions_file |
300 % (ws_predictions_file) | 345 ), 'Whole site prediction output .predictions file "%s" not found' % ( |
346 ws_predictions_file | |
347 ) | |
301 if args.conf_out: | 348 if args.conf_out: |
302 # Filter by pos_train_ws_pred_median median. | 349 # Filter by pos_train_ws_pred_median median. |
303 assert "pos_train_ws_pred_median" in param_dic, \ | 350 assert "pos_train_ws_pred_median" in param_dic, ( |
304 "whole site top scores median information "\ | 351 "whole site top scores median information " "missing in .params file" |
305 "missing in .params file" | 352 ) |
306 pos_tr_ws_pred_med = \ | 353 pos_tr_ws_pred_med = float(param_dic["pos_train_ws_pred_median"]) |
307 float(param_dic["pos_train_ws_pred_median"]) | |
308 # Filtered file. | 354 # Filtered file. |
309 filt_ws_predictions_file = args.data_id + ".p50.predictions" | 355 filt_ws_predictions_file = args.data_id + ".p50.predictions" |
310 print("Extracting p50 sites from whole site predictions" | 356 print( |
311 " (score threshold = %f) ... " % (pos_tr_ws_pred_med)) | 357 "Extracting p50 sites from whole site predictions" |
312 gplib.graphprot_filter_predictions_file(ws_predictions_file, | 358 " (score threshold = %f) ... " % (pos_tr_ws_pred_med) |
313 filt_ws_predictions_file, | 359 ) |
314 sc_thr=pos_tr_ws_pred_med) | 360 gplib.graphprot_filter_predictions_file( |
361 ws_predictions_file, filt_ws_predictions_file, sc_thr=pos_tr_ws_pred_med | |
362 ) | |
315 else: | 363 else: |
316 # Do profile prediction. | 364 # Do profile prediction. |
317 print("Starting profile predictions on on input .fa file " | 365 print( |
318 "(-action predict_profile) ... ") | 366 "Starting profile predictions on on input .fa file " |
319 check_cmd = "GraphProt.pl -action predict_profile -prefix " \ | 367 "(-action predict_profile) ... " |
320 + args.data_id + " -fasta " + args.in_fa + " " + \ | 368 ) |
321 param_string + " -model " + args.in_model | 369 check_cmd = ( |
370 "GraphProt.pl -action predict_profile -prefix " | |
371 + args.data_id | |
372 + " -fasta " | |
373 + args.in_fa | |
374 + " " | |
375 + param_string | |
376 + " -model " | |
377 + args.in_model | |
378 ) | |
322 output = subprocess.getoutput(check_cmd) | 379 output = subprocess.getoutput(check_cmd) |
323 assert output, "the following call of GraphProt.pl produced "\ | 380 assert ( |
324 "no output:\n%s" % (check_cmd) | 381 output |
382 ), "the following call of GraphProt.pl produced " "no output:\n%s" % (check_cmd) | |
325 if args.gp_output: | 383 if args.gp_output: |
326 print(output) | 384 print(output) |
327 profile_predictions_file = args.data_id + ".profile" | 385 profile_predictions_file = args.data_id + ".profile" |
328 assert os.path.exists(profile_predictions_file), \ | 386 assert os.path.exists( |
329 "Profile prediction output .profile file \"%s\" not found" % \ | 387 profile_predictions_file |
330 (profile_predictions_file) | 388 ), 'Profile prediction output .profile file "%s" not found' % ( |
389 profile_predictions_file | |
390 ) | |
331 | 391 |
332 # Profile prediction output files. | 392 # Profile prediction output files. |
333 avg_prof_file = args.data_id + ".avg_profile" | 393 avg_prof_file = args.data_id + ".avg_profile" |
334 avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed" | 394 avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed" |
335 avg_prof_gen_peaks_file = args.data_id + \ | 395 avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed" |
336 ".avg_profile.genomic_peaks.bed" | 396 avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed" |
337 avg_prof_peaks_p50_file = args.data_id + \ | 397 avg_prof_gen_peaks_p50_file = ( |
338 ".avg_profile.p50.peaks.bed" | 398 args.data_id + ".avg_profile.p50.genomic_peaks.bed" |
339 avg_prof_gen_peaks_p50_file = args.data_id + \ | 399 ) |
340 ".avg_profile.p50.genomic_peaks.bed" | |
341 | 400 |
342 # Get sequence IDs in order from input .fa file. | 401 # Get sequence IDs in order from input .fa file. |
343 seq_ids_list = gplib.fasta_read_in_ids(args.in_fa) | 402 seq_ids_list = gplib.fasta_read_in_ids(args.in_fa) |
344 # Calculate average profiles. | 403 # Calculate average profiles. |
345 print("Getting average profile from profile " | 404 print( |
346 "(extlr for smoothing: %i) ... " % (args.ap_extlr)) | 405 "Getting average profile from profile " |
347 gplib.graphprot_profile_calc_avg_profile(profile_predictions_file, | 406 "(extlr for smoothing: %i) ... " % (args.ap_extlr) |
348 avg_prof_file, | 407 ) |
349 ap_extlr=args.ap_extlr, | 408 gplib.graphprot_profile_calc_avg_profile( |
350 seq_ids_list=seq_ids_list, | 409 profile_predictions_file, |
351 method=2) | 410 avg_prof_file, |
411 ap_extlr=args.ap_extlr, | |
412 seq_ids_list=seq_ids_list, | |
413 method=2, | |
414 ) | |
352 # Extract peak regions on sequences with threshold score 0. | 415 # Extract peak regions on sequences with threshold score 0. |
353 print("Extracting peak regions from average profile " | 416 print( |
354 "(score threshold = 0) ... ") | 417 "Extracting peak regions from average profile " "(score threshold = 0) ... " |
418 ) | |
355 killpep8 = args.max_merge_dist | 419 killpep8 = args.max_merge_dist |
356 gplib.graphprot_profile_extract_peak_regions(avg_prof_file, | 420 gplib.graphprot_profile_extract_peak_regions( |
357 avg_prof_peaks_file, | 421 avg_prof_file, |
358 max_merge_dist=killpep8, | 422 avg_prof_peaks_file, |
359 sc_thr=args.score_thr) | 423 max_merge_dist=killpep8, |
424 sc_thr=args.score_thr, | |
425 ) | |
360 # Convert peaks to genomic coordinates. | 426 # Convert peaks to genomic coordinates. |
361 if args.genomic_sites_bed: | 427 if args.genomic_sites_bed: |
362 print("Converting peak regions to genomic coordinates ... ") | 428 print("Converting peak regions to genomic coordinates ... ") |
363 killit = args.genomic_sites_bed | 429 killit = args.genomic_sites_bed |
364 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, | 430 gplib.bed_peaks_to_genomic_peaks( |
365 avg_prof_gen_peaks_file, | 431 avg_prof_peaks_file, |
366 print_rows=False, | 432 avg_prof_gen_peaks_file, |
367 genomic_sites_bed=killit) | 433 print_rows=False, |
434 genomic_sites_bed=killit, | |
435 ) | |
368 # Extract peak regions with threshold score p50. | 436 # Extract peak regions with threshold score p50. |
369 if args.conf_out: | 437 if args.conf_out: |
370 sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr) | 438 sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr) |
371 # Filter by pos_tr_ws_pred_med median. | 439 # Filter by pos_tr_ws_pred_med median. |
372 assert sc_id in param_dic, "average profile extlr %i median "\ | 440 assert sc_id in param_dic, ( |
441 "average profile extlr %i median " | |
373 "information missing in .params file" % (args.ap_extlr) | 442 "information missing in .params file" % (args.ap_extlr) |
443 ) | |
374 p50_sc_thr = float(param_dic[sc_id]) | 444 p50_sc_thr = float(param_dic[sc_id]) |
375 print("Extracting p50 peak regions from average profile " | 445 print( |
376 "(score threshold = %f) ... " % (p50_sc_thr)) | 446 "Extracting p50 peak regions from average profile " |
447 "(score threshold = %f) ... " % (p50_sc_thr) | |
448 ) | |
377 despair = avg_prof_peaks_p50_file | 449 despair = avg_prof_peaks_p50_file |
378 pain = args.max_merge_dist | 450 pain = args.max_merge_dist |
379 gplib.graphprot_profile_extract_peak_regions(avg_prof_file, | 451 gplib.graphprot_profile_extract_peak_regions( |
380 despair, | 452 avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr |
381 max_merge_dist=pain, | 453 ) |
382 sc_thr=p50_sc_thr) | |
383 # Convert peaks to genomic coordinates. | 454 # Convert peaks to genomic coordinates. |
384 if args.genomic_sites_bed: | 455 if args.genomic_sites_bed: |
385 print("Converting p50 peak regions to " | 456 print("Converting p50 peak regions to " "genomic coordinates ... ") |
386 "genomic coordinates ... ") | |
387 madness = args.genomic_sites_bed | 457 madness = args.genomic_sites_bed |
388 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, | 458 gplib.bed_peaks_to_genomic_peaks( |
389 avg_prof_gen_peaks_p50_file, | 459 avg_prof_peaks_p50_file, |
390 genomic_sites_bed=madness) | 460 avg_prof_gen_peaks_p50_file, |
461 genomic_sites_bed=madness, | |
462 ) | |
391 # Done. | 463 # Done. |
392 print("Script: I'm done.") | 464 print("Script: I'm done.") |
393 print("Author: ... ") | 465 print("Author: ... ") |