Mercurial > repos > rnateam > graphprot_predict_profile
comparison graphprot_train_wrapper.py @ 5:58ebf089377e draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit 902e994cb04db968ce797afa676f5fa6512ab6d3
author | bgruening |
---|---|
date | Tue, 06 Aug 2024 14:54:58 +0000 |
parents | 9a83a84a25a7 |
children |
comparison
equal
deleted
inserted
replaced
4:2d6de5b769e6 | 5:58ebf089377e |
---|---|
4 import os | 4 import os |
5 import subprocess | 5 import subprocess |
6 import sys | 6 import sys |
7 | 7 |
8 import gplib | 8 import gplib |
9 | |
10 | 9 |
11 """ | 10 """ |
12 | 11 |
13 TOOL DEPENDENCIES | 12 TOOL DEPENDENCIES |
14 ================= | 13 ================= |
60 | 59 |
61 | 60 |
62 """ | 61 """ |
63 | 62 |
64 | 63 |
65 ############################################################################### | 64 ####################################################################### |
65 | |
66 | 66 |
67 def setup_argument_parser(): | 67 def setup_argument_parser(): |
68 """Setup argparse parser.""" | 68 """Setup argparse parser.""" |
69 help_description = """ | 69 help_description = """ |
70 Galaxy wrapper script for GraphProt to train a GraphProt model on | 70 Galaxy wrapper script for GraphProt to train a GraphProt model on |
82 score out of these to store in .params file, using it later | 82 score out of these to store in .params file, using it later |
83 for outputting binding sites or peaks with higher confidence. | 83 for outputting binding sites or peaks with higher confidence. |
84 | 84 |
85 """ | 85 """ |
86 # Define argument parser. | 86 # Define argument parser. |
87 p = ap.ArgumentParser(add_help=False, | 87 p = ap.ArgumentParser( |
88 prog="graphprot_train_wrapper.py", | 88 add_help=False, |
89 description=help_description, | 89 prog="graphprot_train_wrapper.py", |
90 formatter_class=ap.MetavarTypeHelpFormatter) | 90 description=help_description, |
91 formatter_class=ap.MetavarTypeHelpFormatter, | |
92 ) | |
91 | 93 |
92 # Argument groups. | 94 # Argument groups. |
93 p_man = p.add_argument_group("REQUIRED ARGUMENTS") | 95 p_man = p.add_argument_group("REQUIRED ARGUMENTS") |
94 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") | 96 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") |
95 | 97 |
96 # Required arguments. | 98 # Required arguments. |
97 p_opt.add_argument("-h", "--help", | 99 p_opt.add_argument("-h", "--help", action="help", help="Print help message") |
98 action="help", | 100 p_man.add_argument( |
99 help="Print help message") | 101 "--pos", |
100 p_man.add_argument("--pos", | 102 dest="in_pos_fa", |
101 dest="in_pos_fa", | 103 type=str, |
102 type=str, | 104 required=True, |
103 required=True, | 105 help="Positive (= binding site) sequences .fa file " |
104 help="Positive (= binding site) sequences .fa file " | 106 "for model training (option -fasta)", |
105 "for model training (option -fasta)") | 107 ) |
106 p_man.add_argument("--neg", | 108 p_man.add_argument( |
107 dest="in_neg_fa", | 109 "--neg", |
108 type=str, | 110 dest="in_neg_fa", |
109 required=True, | 111 type=str, |
110 help="Negative sequences .fa file for model " | 112 required=True, |
111 "training (option -negfasta)") | 113 help="Negative sequences .fa file for model " "training (option -negfasta)", |
112 p_man.add_argument("--data-id", | 114 ) |
113 dest="data_id", | 115 p_man.add_argument( |
114 type=str, | 116 "--data-id", |
115 required=True, | 117 dest="data_id", |
116 help="Data ID (option -prefix)") | 118 type=str, |
119 required=True, | |
120 help="Data ID (option -prefix)", | |
121 ) | |
117 # Additional arguments. | 122 # Additional arguments. |
118 p_opt.add_argument("--opt-set-size", | 123 p_opt.add_argument( |
119 dest="opt_set_size", | 124 "--opt-set-size", |
120 type=int, | 125 dest="opt_set_size", |
121 default=500, | 126 type=int, |
122 help="Hyperparameter optimization set size (taken " | 127 default=500, |
123 "away from both --pos and --neg) (default: 500)") | 128 help="Hyperparameter optimization set size (taken " |
124 p_opt.add_argument("--opt-pos", | 129 "away from both --pos and --neg) (default: 500)", |
125 dest="opt_pos_fa", | 130 ) |
126 type=str, | 131 p_opt.add_argument( |
127 help="Positive (= binding site) sequences .fa file " | 132 "--opt-pos", |
128 "for hyperparameter optimization (default: take " | 133 dest="opt_pos_fa", |
129 "--opt-set-size from --pos)") | 134 type=str, |
130 p_opt.add_argument("--opt-neg", | 135 help="Positive (= binding site) sequences .fa file " |
131 dest="opt_neg_fa", | 136 "for hyperparameter optimization (default: take " |
132 type=str, | 137 "--opt-set-size from --pos)", |
133 help="Negative sequences .fa file for hyperparameter " | 138 ) |
134 "optimization (default: take --opt-set-size " | 139 p_opt.add_argument( |
135 "from --neg)") | 140 "--opt-neg", |
136 p_opt.add_argument("--min-train", | 141 dest="opt_neg_fa", |
137 dest="min_train", | 142 type=str, |
138 type=int, | 143 help="Negative sequences .fa file for hyperparameter " |
139 default=500, | 144 "optimization (default: take --opt-set-size " |
140 help="Minimum amount of training sites demanded " | 145 "from --neg)", |
141 "(default: 500)") | 146 ) |
142 p_opt.add_argument("--disable-cv", | 147 p_opt.add_argument( |
143 dest="disable_cv", | 148 "--min-train", |
144 default=False, | 149 dest="min_train", |
145 action="store_true", | 150 type=int, |
146 help="Disable cross validation step (default: false)") | 151 default=500, |
147 p_opt.add_argument("--disable-motifs", | 152 help="Minimum amount of training sites demanded " "(default: 500)", |
148 dest="disable_motifs", | 153 ) |
149 default=False, | 154 p_opt.add_argument( |
150 action="store_true", | 155 "--disable-cv", |
151 help="Disable motif generation step (default: false)") | 156 dest="disable_cv", |
152 p_opt.add_argument("--gp-output", | 157 default=False, |
153 dest="gp_output", | 158 action="store_true", |
154 default=False, | 159 help="Disable cross validation step (default: false)", |
155 action="store_true", | 160 ) |
156 help="Print output produced by GraphProt " | 161 p_opt.add_argument( |
157 "(default: false)") | 162 "--disable-motifs", |
158 p_opt.add_argument("--str-model", | 163 dest="disable_motifs", |
159 dest="train_str_model", | 164 default=False, |
160 default=False, | 165 action="store_true", |
161 action="store_true", | 166 help="Disable motif generation step (default: false)", |
162 help="Train a structure model (default: train " | 167 ) |
163 "a sequence model)") | 168 p_opt.add_argument( |
169 "--gp-output", | |
170 dest="gp_output", | |
171 default=False, | |
172 action="store_true", | |
173 help="Print output produced by GraphProt " "(default: false)", | |
174 ) | |
175 p_opt.add_argument( | |
176 "--str-model", | |
177 dest="train_str_model", | |
178 default=False, | |
179 action="store_true", | |
180 help="Train a structure model (default: train " "a sequence model)", | |
181 ) | |
164 return p | 182 return p |
165 | 183 |
166 | 184 |
167 ############################################################################### | 185 ####################################################################### |
168 | 186 |
169 if __name__ == '__main__': | 187 if __name__ == "__main__": |
170 | 188 |
171 # Setup argparse. | 189 # Setup argparse. |
172 parser = setup_argument_parser() | 190 parser = setup_argument_parser() |
173 # Read in command line arguments. | 191 # Read in command line arguments. |
174 args = parser.parse_args() | 192 args = parser.parse_args() |
180 # Check for Linux. | 198 # Check for Linux. |
181 assert "linux" in sys.platform, "please use Linux" | 199 assert "linux" in sys.platform, "please use Linux" |
182 # Check tool availability. | 200 # Check tool availability. |
183 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" | 201 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" |
184 # Check file inputs. | 202 # Check file inputs. |
185 assert os.path.exists(args.in_pos_fa), \ | 203 assert os.path.exists(args.in_pos_fa), 'positives .fa file "%s" not found' % ( |
186 "positives .fa file \"%s\" not found" % (args.in_pos_fa) | 204 args.in_pos_fa |
187 assert os.path.exists(args.in_neg_fa), \ | 205 ) |
188 "negatives .fa file \"%s\" not found" % (args.in_neg_fa) | 206 assert os.path.exists(args.in_neg_fa), 'negatives .fa file "%s" not found' % ( |
207 args.in_neg_fa | |
208 ) | |
189 # Count .fa entries. | 209 # Count .fa entries. |
190 c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa) | 210 c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa) |
191 c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa) | 211 c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa) |
192 assert c_pos_fa, "positives .fa file \"%s\" no headers found" % \ | 212 assert c_pos_fa, 'positives .fa file "%s" no headers found' % (args.in_pos_fa) |
193 (args.in_pos_fa) | 213 assert c_neg_fa, 'negatives .fa file "%s" no headers found' % (args.in_neg_fa) |
194 assert c_neg_fa, "negatives .fa file \"%s\" no headers found" % \ | |
195 (args.in_neg_fa) | |
196 print("# positive .fa sequences: %i" % (c_pos_fa)) | 214 print("# positive .fa sequences: %i" % (c_pos_fa)) |
197 print("# negative .fa sequences: %i" % (c_neg_fa)) | 215 print("# negative .fa sequences: %i" % (c_neg_fa)) |
198 # Check additional files. | 216 # Check additional files. |
199 if args.opt_pos_fa: | 217 if args.opt_pos_fa: |
200 assert args.opt_neg_fa, "--opt-pos but no --opt-neg given" | 218 assert args.opt_neg_fa, "--opt-pos but no --opt-neg given" |
201 if args.opt_neg_fa: | 219 if args.opt_neg_fa: |
202 assert args.opt_pos_fa, "--opt-neg but no --opt-pos given" | 220 assert args.opt_pos_fa, "--opt-neg but no --opt-pos given" |
203 # Check for lowercase only sequences, which cause GP to crash. | 221 # Check for lowercase only sequences, which cause GP to crash. |
204 error_mess = "input sequences encountered containing "\ | 222 error_mess = ( |
205 "only lowercase characters or lowercase characters in between "\ | 223 "input sequences encountered containing " |
206 "uppercase characters. Please provide either all uppercase "\ | 224 "only lowercase characters or lowercase characters in between " |
207 "sequences or sequences containing uppercase regions surrounded "\ | 225 "uppercase characters. Please provide either all uppercase " |
208 "by lowercase context regions for structure calculation (see "\ | 226 "sequences or sequences containing uppercase regions surrounded " |
209 "viewpoint concept in original GraphProt publication "\ | 227 "by lowercase context regions for structure calculation (see " |
228 "viewpoint concept in original GraphProt publication " | |
210 "for more details)" | 229 "for more details)" |
230 ) | |
211 seqs_dic = gplib.read_fasta_into_dic(args.in_pos_fa) | 231 seqs_dic = gplib.read_fasta_into_dic(args.in_pos_fa) |
212 bad_ids = gplib.check_seqs_dic_format(seqs_dic) | 232 bad_ids = gplib.check_seqs_dic_format(seqs_dic) |
213 assert not bad_ids, "%s" % (error_mess) | 233 assert not bad_ids, "%s" % (error_mess) |
214 seqs_dic = gplib.read_fasta_into_dic(args.in_neg_fa) | 234 seqs_dic = gplib.read_fasta_into_dic(args.in_neg_fa) |
215 bad_ids = gplib.check_seqs_dic_format(seqs_dic) | 235 bad_ids = gplib.check_seqs_dic_format(seqs_dic) |
225 | 245 |
226 # If parop .fa files given. | 246 # If parop .fa files given. |
227 if args.opt_pos_fa and args.opt_neg_fa: | 247 if args.opt_pos_fa and args.opt_neg_fa: |
228 c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa) | 248 c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa) |
229 c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa) | 249 c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa) |
230 assert c_parop_pos_fa, "--opt-pos .fa file \"%s\" no headers found" \ | 250 assert c_parop_pos_fa, '--opt-pos .fa file "%s" no headers found' % ( |
231 % (args.opt_pos_fa) | 251 args.opt_pos_fa |
232 assert c_parop_neg_fa, "--opt-neg .fa file \"%s\" no headers found" \ | 252 ) |
233 % (args.opt_neg_fa) | 253 assert c_parop_neg_fa, '--opt-neg .fa file "%s" no headers found' % ( |
254 args.opt_neg_fa | |
255 ) | |
234 # Less than 500 for training?? You gotta be kidding. | 256 # Less than 500 for training?? You gotta be kidding. |
235 assert c_pos_fa >= args.min_train, \ | 257 assert c_pos_fa >= args.min_train, ( |
236 "--pos for training < %i, please provide more (try at least "\ | 258 "--pos for training < %i, please provide more (try at least " |
237 "> 1000, the more the better)" % (args.min_train) | 259 "> 1000, the more the better)" % (args.min_train) |
238 assert c_neg_fa >= args.min_train, \ | 260 ) |
239 "--neg for training < %i, please provide more (try at least "\ | 261 assert c_neg_fa >= args.min_train, ( |
262 "--neg for training < %i, please provide more (try at least " | |
240 "> 1000, the more the better)" % (args.min_train) | 263 "> 1000, the more the better)" % (args.min_train) |
264 ) | |
241 # Looking closer at ratios. | 265 # Looking closer at ratios. |
242 pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa | 266 pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa |
243 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: | 267 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: |
244 assert 0, "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 "\ | 268 assert 0, ( |
245 "(ratio = %f). Try to keep ratio closer to 1 or better use "\ | 269 "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 " |
246 "identical numbers (keep in mind that performance measures "\ | 270 "(ratio = %f). Try to keep ratio closer to 1 or better use " |
247 "such as accuracy or AUROC are not suitable for imbalanced "\ | 271 "identical numbers (keep in mind that performance measures " |
272 "such as accuracy or AUROC are not suitable for imbalanced " | |
248 " datasets!)" % (pos_neg_ratio) | 273 " datasets!)" % (pos_neg_ratio) |
274 ) | |
249 else: | 275 else: |
250 # Define some minimum amount of training sites for the sake of sanity. | 276 # Define some minimum amount of training sites for the sake of sanity. |
251 c_pos_train = c_pos_fa - args.opt_set_size | 277 c_pos_train = c_pos_fa - args.opt_set_size |
252 c_neg_train = c_neg_fa - args.opt_set_size | 278 c_neg_train = c_neg_fa - args.opt_set_size |
253 # Start complaining. | 279 # Start complaining. |
254 assert c_pos_fa >= args.opt_set_size, \ | 280 assert ( |
255 "# positives < --opt-set-size (%i < %i)" \ | 281 c_pos_fa >= args.opt_set_size |
256 % (c_pos_fa, args.opt_set_size) | 282 ), "# positives < --opt-set-size (%i < %i)" % (c_pos_fa, args.opt_set_size) |
257 assert c_neg_fa >= args.opt_set_size, \ | 283 assert ( |
258 "# negatives < --opt-set-size (%i < %i)" \ | 284 c_neg_fa >= args.opt_set_size |
259 % (c_neg_fa, args.opt_set_size) | 285 ), "# negatives < --opt-set-size (%i < %i)" % (c_neg_fa, args.opt_set_size) |
260 assert c_pos_train >= args.opt_set_size, \ | 286 assert ( |
261 "# positives remaining for training < --opt-set-size "\ | 287 c_pos_train >= args.opt_set_size |
262 "(%i < %i)" % (c_pos_train, args.opt_set_size) | 288 ), "# positives remaining for training < --opt-set-size " "(%i < %i)" % ( |
263 assert c_neg_train >= args.opt_set_size, "# negatives remaining "\ | 289 c_pos_train, |
264 "for training < --opt-set-size (%i < %i)" \ | 290 args.opt_set_size, |
265 % (c_neg_train, args.opt_set_size) | 291 ) |
292 assert ( | |
293 c_neg_train >= args.opt_set_size | |
294 ), "# negatives remaining " "for training < --opt-set-size (%i < %i)" % ( | |
295 c_neg_train, | |
296 args.opt_set_size, | |
297 ) | |
266 # Less than 500?? You gotta be kidding. | 298 # Less than 500?? You gotta be kidding. |
267 assert c_pos_train >= args.min_train, \ | 299 assert c_pos_train >= args.min_train, ( |
268 "# positives remaining for training < %i, please provide more "\ | 300 "# positives remaining for training < %i, please provide more " |
269 " (try at least > 1000, the more the better)" % (args.min_train) | 301 " (try at least > 1000, the more the better)" % (args.min_train) |
270 assert c_neg_train >= args.min_train, \ | 302 ) |
271 "# negatives remaining for training < %i, please provide more "\ | 303 assert c_neg_train >= args.min_train, ( |
304 "# negatives remaining for training < %i, please provide more " | |
272 "(try at least > 1000, the more the better)" % (args.min_train) | 305 "(try at least > 1000, the more the better)" % (args.min_train) |
306 ) | |
273 # Looking closer at ratios. | 307 # Looking closer at ratios. |
274 pos_neg_ratio = c_pos_train / c_neg_train | 308 pos_neg_ratio = c_pos_train / c_neg_train |
275 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: | 309 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: |
276 assert 0, "ratio of --pos to --neg < 0.8 or > 1.25 "\ | 310 assert 0, ( |
277 "(ratio = %f). Try to keep ratio closer to 1 or better use "\ | 311 "ratio of --pos to --neg < 0.8 or > 1.25 " |
278 "identical numbers (keep in mind that performance measures "\ | 312 "(ratio = %f). Try to keep ratio closer to 1 or better use " |
279 "such as accuracy or AUROC are not suitable for imbalanced "\ | 313 "identical numbers (keep in mind that performance measures " |
314 "such as accuracy or AUROC are not suitable for imbalanced " | |
280 "datasets!)" % (pos_neg_ratio) | 315 "datasets!)" % (pos_neg_ratio) |
316 ) | |
281 | 317 |
282 """ | 318 """ |
283 Generate parop + train .fa output files for hyperparameter | 319 Generate parop + train .fa output files for hyperparameter |
284 optimization + training. | 320 optimization + training. |
285 | 321 |
297 gplib.make_file_copy(args.opt_neg_fa, neg_parop_fa) | 333 gplib.make_file_copy(args.opt_neg_fa, neg_parop_fa) |
298 gplib.make_file_copy(args.in_pos_fa, pos_train_fa) | 334 gplib.make_file_copy(args.in_pos_fa, pos_train_fa) |
299 gplib.make_file_copy(args.in_neg_fa, neg_train_fa) | 335 gplib.make_file_copy(args.in_neg_fa, neg_train_fa) |
300 else: | 336 else: |
301 # Generate parop + train .fa files from input .fa files. | 337 # Generate parop + train .fa files from input .fa files. |
302 gplib.split_fasta_into_test_train_files(args.in_pos_fa, pos_parop_fa, | 338 gplib.split_fasta_into_test_train_files( |
303 pos_train_fa, | 339 args.in_pos_fa, pos_parop_fa, pos_train_fa, test_size=args.opt_set_size |
304 test_size=args.opt_set_size) | 340 ) |
305 gplib.split_fasta_into_test_train_files(args.in_neg_fa, neg_parop_fa, | 341 gplib.split_fasta_into_test_train_files( |
306 neg_train_fa, | 342 args.in_neg_fa, neg_parop_fa, neg_train_fa, test_size=args.opt_set_size |
307 test_size=args.opt_set_size) | 343 ) |
308 | 344 |
309 """ | 345 """ |
310 Do the hyperparameter optimization. | 346 Do the hyperparameter optimization. |
311 | 347 |
312 """ | 348 """ |
313 print("Starting hyperparameter optimization (-action ls) ... ") | 349 print("Starting hyperparameter optimization (-action ls) ... ") |
314 check_cmd = "GraphProt.pl -action ls -prefix " + args.data_id + \ | 350 check_cmd = ( |
315 " -fasta " + pos_parop_fa + " -negfasta " + neg_parop_fa | 351 "GraphProt.pl -action ls -prefix " |
352 + args.data_id | |
353 + " -fasta " | |
354 + pos_parop_fa | |
355 + " -negfasta " | |
356 + neg_parop_fa | |
357 ) | |
316 # If sequence model should be trained (default). | 358 # If sequence model should be trained (default). |
317 if not args.train_str_model: | 359 if not args.train_str_model: |
318 check_cmd += " -onlyseq" | 360 check_cmd += " -onlyseq" |
319 print(check_cmd) | 361 print(check_cmd) |
320 output = subprocess.getoutput(check_cmd) | 362 output = subprocess.getoutput(check_cmd) |
321 params_file = args.data_id + ".params" | 363 params_file = args.data_id + ".params" |
322 assert os.path.exists(params_file), "Hyperparameter optimization output "\ | 364 assert os.path.exists( |
323 " .params file \"%s\" not found" % (params_file) | 365 params_file |
366 ), "Hyperparameter optimization output " ' .params file "%s" not found' % ( | |
367 params_file | |
368 ) | |
324 # Add model type to params file. | 369 # Add model type to params file. |
325 if args.train_str_model: | 370 if args.train_str_model: |
326 gplib.echo_add_to_file("model_type: structure", params_file) | 371 gplib.echo_add_to_file("model_type: structure", params_file) |
327 else: | 372 else: |
328 gplib.echo_add_to_file("model_type: sequence", params_file) | 373 gplib.echo_add_to_file("model_type: sequence", params_file) |
332 """ | 377 """ |
333 Do the model training. (Yowza!) | 378 Do the model training. (Yowza!) |
334 | 379 |
335 """ | 380 """ |
336 print("Starting model training (-action train) ... ") | 381 print("Starting model training (-action train) ... ") |
337 check_cmd = "GraphProt.pl -action train -prefix " + args.data_id \ | 382 check_cmd = ( |
338 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ | 383 "GraphProt.pl -action train -prefix " |
339 + " " + param_string | 384 + args.data_id |
385 + " -fasta " | |
386 + pos_train_fa | |
387 + " -negfasta " | |
388 + neg_train_fa | |
389 + " " | |
390 + param_string | |
391 ) | |
340 print(check_cmd) | 392 print(check_cmd) |
341 output = subprocess.getoutput(check_cmd) | 393 output = subprocess.getoutput(check_cmd) |
342 assert output, \ | 394 assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( |
343 "The following call of GraphProt.pl produced no output:\n%s" \ | 395 check_cmd |
344 % (check_cmd) | 396 ) |
345 if args.gp_output: | 397 if args.gp_output: |
346 print(output) | 398 print(output) |
347 model_file = args.data_id + ".model" | 399 model_file = args.data_id + ".model" |
348 assert os.path.exists(model_file), \ | 400 assert os.path.exists(model_file), 'Training output .model file "%s" not found' % ( |
349 "Training output .model file \"%s\" not found" % (model_file) | 401 model_file |
402 ) | |
350 | 403 |
351 """ | 404 """ |
352 Do the 10-fold cross validation. | 405 Do the 10-fold cross validation. |
353 | 406 |
354 """ | 407 """ |
355 if not args.disable_cv: | 408 if not args.disable_cv: |
356 print("Starting 10-fold cross validation (-action cv) ... ") | 409 print("Starting 10-fold cross validation (-action cv) ... ") |
357 check_cmd = "GraphProt.pl -action cv -prefix " + args.data_id \ | 410 check_cmd = ( |
358 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ | 411 "GraphProt.pl -action cv -prefix " |
359 + " " + param_string + " -model " + model_file | 412 + args.data_id |
413 + " -fasta " | |
414 + pos_train_fa | |
415 + " -negfasta " | |
416 + neg_train_fa | |
417 + " " | |
418 + param_string | |
419 + " -model " | |
420 + model_file | |
421 ) | |
360 print(check_cmd) | 422 print(check_cmd) |
361 output = subprocess.getoutput(check_cmd) | 423 output = subprocess.getoutput(check_cmd) |
362 assert output, \ | 424 assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( |
363 "The following call of GraphProt.pl produced no output:\n%s" \ | 425 check_cmd |
364 % (check_cmd) | 426 ) |
365 if args.gp_output: | 427 if args.gp_output: |
366 print(output) | 428 print(output) |
367 cv_results_file = args.data_id + ".cv_results" | 429 cv_results_file = args.data_id + ".cv_results" |
368 assert os.path.exists(cv_results_file), \ | 430 assert os.path.exists( |
369 "CV output .cv_results file \"%s\" not found" % (cv_results_file) | 431 cv_results_file |
432 ), 'CV output .cv_results file "%s" not found' % (cv_results_file) | |
370 | 433 |
371 """ | 434 """ |
372 Do the motif generation. | 435 Do the motif generation. |
373 | 436 |
374 """ | 437 """ |
375 if not args.disable_motifs: | 438 if not args.disable_motifs: |
376 print("Starting motif generation (-action motif) ... ") | 439 print("Starting motif generation (-action motif) ... ") |
377 check_cmd = "GraphProt.pl -action motif -prefix " + args.data_id \ | 440 check_cmd = ( |
378 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ | 441 "GraphProt.pl -action motif -prefix " |
379 + " " + param_string + " -model " + model_file | 442 + args.data_id |
443 + " -fasta " | |
444 + pos_train_fa | |
445 + " -negfasta " | |
446 + neg_train_fa | |
447 + " " | |
448 + param_string | |
449 + " -model " | |
450 + model_file | |
451 ) | |
380 print(check_cmd) | 452 print(check_cmd) |
381 output = subprocess.getoutput(check_cmd) | 453 output = subprocess.getoutput(check_cmd) |
382 assert output, \ | 454 assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( |
383 "The following call of GraphProt.pl produced no output:\n%s" \ | 455 check_cmd |
384 % (check_cmd) | 456 ) |
385 if args.gp_output: | 457 if args.gp_output: |
386 print(output) | 458 print(output) |
387 seq_motif_file = args.data_id + ".sequence_motif" | 459 seq_motif_file = args.data_id + ".sequence_motif" |
388 seq_motif_png_file = args.data_id + ".sequence_motif.png" | 460 seq_motif_png_file = args.data_id + ".sequence_motif.png" |
389 assert os.path.exists(seq_motif_file), \ | 461 assert os.path.exists( |
390 "Motif output .sequence_motif file \"%s\" not found" \ | 462 seq_motif_file |
391 % (seq_motif_file) | 463 ), 'Motif output .sequence_motif file "%s" not found' % (seq_motif_file) |
392 assert os.path.exists(seq_motif_png_file), \ | 464 assert os.path.exists( |
393 "Motif output .sequence_motif.png file \"%s\" not found" \ | 465 seq_motif_png_file |
394 % (seq_motif_png_file) | 466 ), 'Motif output .sequence_motif.png file "%s" not found' % (seq_motif_png_file) |
395 if args.train_str_model: | 467 if args.train_str_model: |
396 str_motif_file = args.data_id + ".structure_motif" | 468 str_motif_file = args.data_id + ".structure_motif" |
397 str_motif_png_file = args.data_id + ".structure_motif.png" | 469 str_motif_png_file = args.data_id + ".structure_motif.png" |
398 assert os.path.exists(str_motif_file), \ | 470 assert os.path.exists( |
399 "Motif output .structure_motif file \"%s\" not found" \ | 471 str_motif_file |
400 % (str_motif_file) | 472 ), 'Motif output .structure_motif file "%s" not found' % (str_motif_file) |
401 assert os.path.exists(str_motif_png_file), \ | 473 assert os.path.exists( |
402 "Motif output .structure_motif.png file \"%s\" not found" \ | 474 str_motif_png_file |
403 % (str_motif_png_file) | 475 ), 'Motif output .structure_motif.png file "%s" not found' % ( |
476 str_motif_png_file | |
477 ) | |
404 | 478 |
405 """ | 479 """ |
406 Do whole site predictions on positive training set. | 480 Do whole site predictions on positive training set. |
407 | 481 |
408 """ | 482 """ |
409 print("Starting whole site predictions on positive training set " | 483 print( |
410 " (-action predict) ... ") | 484 "Starting whole site predictions on positive training set " |
411 check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id \ | 485 " (-action predict) ... " |
412 + " -fasta " + pos_train_fa + " " + param_string \ | 486 ) |
413 + " -model " + model_file | 487 check_cmd = ( |
488 "GraphProt.pl -action predict -prefix " | |
489 + args.data_id | |
490 + " -fasta " | |
491 + pos_train_fa | |
492 + " " | |
493 + param_string | |
494 + " -model " | |
495 + model_file | |
496 ) | |
414 print(check_cmd) | 497 print(check_cmd) |
415 output = subprocess.getoutput(check_cmd) | 498 output = subprocess.getoutput(check_cmd) |
416 assert output, \ | 499 assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( |
417 "The following call of GraphProt.pl produced no output:\n%s" \ | 500 check_cmd |
418 % (check_cmd) | 501 ) |
419 if args.gp_output: | 502 if args.gp_output: |
420 print(output) | 503 print(output) |
421 ws_predictions_file = args.data_id + ".predictions" | 504 ws_predictions_file = args.data_id + ".predictions" |
422 assert os.path.exists(ws_predictions_file), \ | 505 assert os.path.exists( |
423 "Whole site prediction output .predictions file \"%s\" not found" \ | 506 ws_predictions_file |
424 % (ws_predictions_file) | 507 ), 'Whole site prediction output .predictions file "%s" not found' % ( |
508 ws_predictions_file | |
509 ) | |
425 | 510 |
426 """ | 511 """ |
427 Do profile predictions on positive training set. | 512 Do profile predictions on positive training set. |
428 | 513 |
429 """ | 514 """ |
430 print("Starting profile predictions on positive training set " | 515 print( |
431 "-action predict_profile) ... ") | 516 "Starting profile predictions on positive training set " |
432 check_cmd = "GraphProt.pl -action predict_profile -prefix " \ | 517 "-action predict_profile) ... " |
433 + args.data_id + " -fasta " + pos_train_fa + " " \ | 518 ) |
434 + param_string + " -model " + model_file | 519 check_cmd = ( |
520 "GraphProt.pl -action predict_profile -prefix " | |
521 + args.data_id | |
522 + " -fasta " | |
523 + pos_train_fa | |
524 + " " | |
525 + param_string | |
526 + " -model " | |
527 + model_file | |
528 ) | |
435 print(check_cmd) | 529 print(check_cmd) |
436 output = subprocess.getoutput(check_cmd) | 530 output = subprocess.getoutput(check_cmd) |
437 assert output, \ | 531 assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( |
438 "The following call of GraphProt.pl produced no output:\n%s" \ | 532 check_cmd |
439 % (check_cmd) | 533 ) |
440 if args.gp_output: | 534 if args.gp_output: |
441 print(output) | 535 print(output) |
442 profile_predictions_file = args.data_id + ".profile" | 536 profile_predictions_file = args.data_id + ".profile" |
443 assert os.path.exists(profile_predictions_file), \ | 537 assert os.path.exists( |
444 "Profile prediction output .profile file \"%s\" not found" \ | 538 profile_predictions_file |
445 % (profile_predictions_file) | 539 ), 'Profile prediction output .profile file "%s" not found' % ( |
540 profile_predictions_file | |
541 ) | |
446 | 542 |
447 """ | 543 """ |
448 Get 50 % score (median) for .predictions and .profile file. | 544 Get 50 % score (median) for .predictions and .profile file. |
449 For .profile, first extract for each site the maximum score, and then | 545 For .profile, first extract for each site the maximum score, and then |
450 from the list of maximum site scores get the median. | 546 from the list of maximum site scores get the median. |
452 | 548 |
453 """ | 549 """ |
454 print("Getting .profile and .predictions median scores ... ") | 550 print("Getting .profile and .predictions median scores ... ") |
455 | 551 |
456 # Whole site scores median. | 552 # Whole site scores median. |
457 ws_pred_median = \ | 553 ws_pred_median = gplib.graphprot_predictions_get_median(ws_predictions_file) |
458 gplib.graphprot_predictions_get_median(ws_predictions_file) | |
459 # Profile top site scores median. | 554 # Profile top site scores median. |
460 profile_median = \ | 555 profile_median = gplib.graphprot_profile_get_tsm( |
461 gplib.graphprot_profile_get_tsm(profile_predictions_file, | 556 profile_predictions_file, profile_type="profile" |
462 profile_type="profile") | 557 ) |
463 ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median) | 558 ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median) |
464 profile_string = "pos_train_profile_median: %f" % (profile_median) | 559 profile_string = "pos_train_profile_median: %f" % (profile_median) |
465 gplib.echo_add_to_file(ws_pred_string, params_file) | 560 gplib.echo_add_to_file(ws_pred_string, params_file) |
466 gplib.echo_add_to_file(profile_string, params_file) | 561 gplib.echo_add_to_file(profile_string, params_file) |
467 # Average profile top site scores median for extlr 1 to 10. | 562 # Average profile top site scores median for extlr 1 to 10. |
468 for i in range(10): | 563 for i in range(10): |
469 i += 1 | 564 i += 1 |
470 avg_profile_median = \ | 565 avg_profile_median = gplib.graphprot_profile_get_tsm( |
471 gplib.graphprot_profile_get_tsm(profile_predictions_file, | 566 profile_predictions_file, profile_type="avg_profile", avg_profile_extlr=i |
472 profile_type="avg_profile", | 567 ) |
473 avg_profile_extlr=i) | 568 |
474 | 569 avg_profile_string = "pos_train_avg_profile_median_%i: %f" % ( |
475 avg_profile_string = "pos_train_avg_profile_median_%i: %f" \ | 570 i, |
476 % (i, avg_profile_median) | 571 avg_profile_median, |
572 ) | |
477 gplib.echo_add_to_file(avg_profile_string, params_file) | 573 gplib.echo_add_to_file(avg_profile_string, params_file) |
478 | 574 |
479 print("Script: I'm done.") | 575 print("Script: I'm done.") |
480 print("Author: Good. Now go back to your file system directory.") | 576 print("Author: Good. Now go back to your file system directory.") |
481 print("Script: Ok.") | 577 print("Script: Ok.") |