graphprot_predict_profile: graphprot_predict

comparison graphprot_predict_wrapper.py @ 3:9a83a84a25a7 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"

author	bgruening
date	Wed, 27 Jan 2021 19:27:12 +0000
parents	4cebb3439e1a
children	2d6de5b769e6

comparison

equal deleted inserted replaced

-:4cebb3439e1a
+:9a83a84a25a7
 #!/usr/bin/env python3
+import argparse as ap
+import os
 import subprocess
-import argparse
+import sys
-import shutil
 import gplib
-import gzip
-import sys
-import os
 """
 TOOL DEPENDENCIES
 EXAMPLE CALLS
 =============
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gp-output
+flake8 coming out of hotel room. FB enters.
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed
+FB: Who is this f*** ???
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed --conf-out
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
-python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest
+--fasta gp_data/test10_predict.fa --data-id test2pred --gp-output
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
-python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5
+--fasta gp_data/test10_predict.fa --data-id test2pred
+--gen-site-bed gp_data/test10_predict.bed
-python graphprot_predict_wrapper.py --data-id GraphProt --fasta test-data/test_predict.fa --model test-data/test.model --params test-data/test.params --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
+--fasta gp_data/test10_predict.fa --data-id test2pred
-pwd && python '/home/uhlm/Dokumente/Projekte/GraphProt_galaxy_new/galaxytools/tools/rna_tools/graphprot/graphprot_predict_wrapper.py' --data-id GraphProt --fasta /tmp/tmpmuslpc1h/files/0/8/c/dataset_08c48d88-e3b5-423b-acf6-bf89b8c60660.dat --model /tmp/tmpmuslpc1h/files/e/6/4/dataset_e6471bb4-e74c-4372-bc49-656f900e7191.dat --params /tmp/tmpmuslpc1h/files/b/6/5/dataset_b65e8cf4-d3e6-429e-8d57-1d401adf4b3c.dat --gen-site-bed /tmp/tmpmuslpc1h/files/5/1/a/dataset_51a38b65-5943-472d-853e-5d845fa8ac3e.dat --sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5
+--gen-site-bed gp_data/test10_predict.bed --conf-out
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
+--fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred
+python graphprot_predict_wrapper.py --model test-data/test.model
+--params test-data/test.params --fasta test-data/test_predict.fa
+--data-id predtest
+python graphprot_predict_wrapper.py --model test-data/test.model
+--params test-data/test.params --fasta test-data/test_predict.fa
+--data-id predtest --gen-site-bed test-data/test_predict.bed
+--sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5
+python graphprot_predict_wrapper.py --data-id GraphProt
+--fasta test-data/test_predict.fa --model test-data/test.model
+--params test-data/test.params --gen-site-bed test-data/test_predict.bed
+--sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5
 """
-################################################################################
+###############################################################################
 def setup_argument_parser():
 """Setup argparse parser."""
 help_description = """
 Galaxy wrapper script for GraphProt (-action predict and -action
 predict_profile) to compute whole site or position-wise scores for input
 FASTA sequences.
 By default, profile predictions are calculated, followed by average
 profiles computions and peak regions extraction from average profiles.
 If --ws-pred is set, whole site score predictions on input sequences
 will be run instead.
 If --conf-out is set, sites or peak regions with a score >= the median
 score of positive training sites will be output.
 If --gen-site-bed .bed file is provided, peak regions will be output
 with genomic coordinates too.
 """
 # Define argument parser.
-p = argparse.ArgumentParser(add_help=False,
+p = ap.ArgumentParser(add_help=False,
 prog="graphprot_predict_wrapper.py",
 description=help_description,
-formatter_class=argparse.MetavarTypeHelpFormatter)
+formatter_class=ap.MetavarTypeHelpFormatter)
 # Argument groups.
 p_man = p.add_argument_group("REQUIRED ARGUMENTS")
 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")
 # Required arguments.
 p_opt.add_argument("-h", "--help",
 action="help",
 help="Print help message")
 p_man.add_argument("--fasta",
 dest="in_fa",
 type=str,
-required = True,
+required=True,
-help = "Sequences .fa file to predict on (option -fasta)")
+help="Sequences .fa file to predict"
+" on (option -fasta)")
 p_man.add_argument("--model",
 dest="in_model",
 type=str,
-required = True,
+required=True,
-help = "GraphProt model file to use for predictions (option -model)")
+help="GraphProt model file to use for predictions"
+" (option -model)")
 p_man.add_argument("--params",
 dest="in_params",
 type=str,
-required = True,
+required=True,
-help = "Parameter file for given model")
+help="Parameter file for given model")
 p_man.add_argument("--data-id",
 dest="data_id",
 type=str,
-required = True,
+required=True,
-help = "Data ID (option -prefix)")
+help="Data ID (option -prefix)")
 # ---> I'm  a conditional argument <---
 p_opt.add_argument("--ws-pred",
-dest = "ws_pred",
+dest="ws_pred",
-default = False,
+default=False,
-action = "store_true",
+action="store_true",
-help = "Run a whole site prediction instead of calculating profiles (default: false)")
+help="Run a whole site prediction instead "
+"of calculating profiles (default: false)")
 # Additional arguments.
 p_opt.add_argument("--sc-thr",
 dest="score_thr",
-type = float,
+type=float,
-default = 0,
+default=0,
-help = "Score threshold for extracting average profile peak regions (default: 0)")
+help="Score threshold for extracting "
+"average profile peak regions (default: 0)")
 p_opt.add_argument("--max-merge-dist",
 dest="max_merge_dist",
-type = int,
+type=int,
-default = 0,
+default=0,
-choices = [0,1,2,3,4,5,6,7,8,9,10],
+choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-help = "Maximum merge distance for nearby peak regions (default: report all non-overlapping regions)")
+help="Maximum merge distance for nearby peak regions"
+" (default: report all non-overlapping regions)")
 p_opt.add_argument("--gen-site-bed",
 dest="genomic_sites_bed",
 type=str,
-help = ".bed file specifying the genomic regions of the input .fa sequences. Corrupt .bed information will be punished (default: false)")
+help=".bed file specifying the genomic regions of "
+"the input .fa sequences. Corrupt .bed "
+"information will be punished (default: false)")
 p_opt.add_argument("--conf-out",
 dest="conf_out",
-default = False,
+default=False,
-action = "store_true",
+action="store_true",
-help = "Output filtered peak regions BED file or predictions file (if --ws-pred) using the median positive training site score for filtering (default: false)")
+help="Output filtered peak regions BED file or "
+"predictions file (if --ws-pred) using the median "
+"positive training site score for filtering "
+"(default: false)")
 p_opt.add_argument("--gp-output",
-dest = "gp_output",
+dest="gp_output",
-default = False,
+default=False,
-action = "store_true",
+action="store_true",
-help = "Print output produced by GraphProt (default: false)")
+help="Print output produced by GraphProt "
+"(default: false)")
 p_opt.add_argument("--ap-extlr",
 dest="ap_extlr",
-type = int,
+type=int,
-default = 5,
+default=5,
-choices = [0,1,2,3,4,5,6,7,8,9,10],
+choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-help = "Define average profile up- and downstream extension to produce the average profile. The mean over small sequence windows (window length = --ap-extlr*2 + 1) is used to get position scores, thus the average profile is more smooth than the initial profile output by GraphProt (default: 5)")
+help="Define average profile up- and downstream "
+"extension to produce the average profile. "
+"The mean over small sequence windows "
+"(window length = --ap-extlr*2 + 1) is used to "
+"get position scores, thus the average profile "
+"is more smooth than the initial profile output "
+"by GraphProt (default: 5)")
 return p
-################################################################################
+###############################################################################
 if __name__ == '__main__':
 # Setup argparse.
 parser = setup_argument_parser()
 # Read in command line arguments.
 args = parser.parse_args()
 """
 Do all sorts of sanity checking.
 """
 # Check for Linux.
 assert "linux" in sys.platform, "please use Linux"
 # Check tool availability.
 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
 # Check file inputs.
-assert os.path.exists(args.in_fa), "input .fa file \"%s\" not found" %(args.in_fa)
+assert os.path.exists(args.in_fa), \
-assert os.path.exists(args.in_model), "input .model file \"%s\" not found" %(args.in_model)
+"input .fa file \"%s\" not found" % (args.in_fa)
-assert os.path.exists(args.in_params), "input .params file \"%s\" not found" %(args.in_params)
+assert os.path.exists(args.in_model), \
+"input .model file \"%s\" not found" % (args.in_model)
+assert os.path.exists(args.in_params), \
+"input .params file \"%s\" not found" % (args.in_params)
 # Count .fa entries.
 c_in_fa = gplib.count_fasta_headers(args.in_fa)
-assert c_in_fa, "input .fa file \"%s\" no headers found" %(args.in_fa)
+assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa)
-print("# input .fa sequences:   %i" %(c_in_fa))
+print("# input .fa sequences:   %i" % (c_in_fa))
 # Read in FASTA sequences to check for uppercase sequences.
 seqs_dic = gplib.read_fasta_into_dic(args.in_fa)
 c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic)
-assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. Please change sequences to uppercase (keep in mind GraphProt only scores uppercase regions (according to its viewpoint concept)"
+assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\
+"Please change sequences to uppercase (keep in mind "\
+"GraphProt only scores uppercase regions (according "\
+"to its viewpoint concept)"
 if not args.ws_pred:
 # Check for lowercase sequences.
 c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic)
-assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed in profile predictions, since GraphProt only scores uppercase regions (according to its viewpoint concept)"
+assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\
+" in profile predictions, since GraphProt only scores "\
+"uppercase regions (according to its viewpoint concept)"
 # Check .bed.
 if args.genomic_sites_bed:
 # An array of checks, marvelous.
-assert os.path.exists(args.genomic_sites_bed), "genomic .bed file \"%s\" not found" %(args.genomic_sites_bed)
+assert \
+os.path.exists(args.genomic_sites_bed), \
+"genomic .bed file \"%s\" not found" % (args.genomic_sites_bed)
 # Check .bed for content.
-assert gplib.count_file_rows(args.genomic_sites_bed), "genomic .bed file \"%s\" is empty" %(args.genomic_sites_bed)
+assert gplib.count_file_rows(args.genomic_sites_bed), \
+"genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed)
 # Check .bed for 6-column format.
-assert gplib.bed_check_six_col_format(args.genomic_sites_bed), "genomic .bed file \"%s\" appears to not be in 6-column .bed format" %(args.genomic_sites_bed)
+assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \
+"genomic .bed file \"%s\" appears to not "\
+"be in 6-column .bed format" % (args.genomic_sites_bed)
 # Check for unique column 4 IDs.
-assert gplib.bed_check_unique_ids(args.genomic_sites_bed), "genomic .bed file \"%s\" column 4 IDs not unique" %(args.genomic_sites_bed)
+assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \
-# Read in .bed regions, compare to FASTA sequences (compare IDs + lengths)
+"genomic .bed file \"%s\" column 4 IDs not unique" % \
+(args.genomic_sites_bed)
+# Read in .bed regions, compare to FASTA sequences.
 seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic)
 reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed)
 for seq_id in seq_len_dic:
 seq_l = seq_len_dic[seq_id]
-assert seq_id in reg_len_dic, "sequence ID \"\" missing in input .bed \"\"" %(seq_id, args.genomic_sites_bed)
+assert seq_id in reg_len_dic, \
+"sequence ID \"%s\" missing in input .bed \"%s\"" % \
+(seq_id, args.genomic_sites_bed)
 reg_l = reg_len_dic[seq_id]
-assert seq_l == reg_l, "sequence length differs from .bed region length (%i != %i)" %(seq_l, reg_l)
+assert seq_l == reg_l, \
+"sequence length differs from .bed region length (%i != %i)" \
+% (seq_l, reg_l)
 # Read in model parameters.
 param_dic = gplib.graphprot_get_param_dic(args.in_params)
 # Create GraphProt parameter string.
 param_string = gplib.graphprot_get_param_string(args.in_params)
 """
 Run predictions.
 """
 if args.ws_pred:
 # Do whole site prediction.
-print("Starting whole site predictions on input .fa file (-action predict) ... ")
+print("Starting whole site predictions on input .fa file"
-check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model
+" (-action predict) ... ")
+check_cmd = "GraphProt.pl -action predict -prefix " \
++ args.data_id + " -fasta " + args.in_fa + " " \
++ param_string + " -model " + args.in_model
 output = subprocess.getoutput(check_cmd)
-assert output, "the following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+assert output, "the following call of GraphProt.pl "\
+"produced no output:\n%s" % (check_cmd)
 if args.gp_output:
 print(output)
 ws_predictions_file = args.data_id + ".predictions"
-assert os.path.exists(ws_predictions_file), "Whole site prediction output .predictions file \"%s\" not found" %(ws_predictions_file)
+assert os.path.exists(ws_predictions_file), \
+"Whole site prediction output .predictions file \"%s\" not found" \
+% (ws_predictions_file)
 if args.conf_out:
 # Filter by pos_train_ws_pred_median median.
-assert "pos_train_ws_pred_median" in param_dic, "whole site top scores median information missing in .params file"
+assert "pos_train_ws_pred_median" in param_dic, \
-pos_train_ws_pred_median = float(param_dic["pos_train_ws_pred_median"])
+"whole site top scores median information "\
+"missing in .params file"
+pos_tr_ws_pred_med = \
+float(param_dic["pos_train_ws_pred_median"])
 # Filtered file.
 filt_ws_predictions_file = args.data_id + ".p50.predictions"
-print("Extracting p50 sites from whole site predictions (score threshold = %f) ... " %(pos_train_ws_pred_median))
+print("Extracting p50 sites from whole site predictions"
-gplib.graphprot_filter_predictions_file(ws_predictions_file, filt_ws_predictions_file,
+" (score threshold = %f) ... " % (pos_tr_ws_pred_med))
-sc_thr=pos_train_ws_pred_median)
+gplib.graphprot_filter_predictions_file(ws_predictions_file,
+filt_ws_predictions_file,
+sc_thr=pos_tr_ws_pred_med)
 else:
 # Do profile prediction.
-print("Starting profile predictions on on input .fa file (-action predict_profile) ... ")
+print("Starting profile predictions on on input .fa file "
-check_cmd = "GraphProt.pl -action predict_profile -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model
+"(-action predict_profile) ... ")
+check_cmd = "GraphProt.pl -action predict_profile -prefix " \
++ args.data_id + " -fasta " + args.in_fa + " " + \
+param_string + " -model " + args.in_model
 output = subprocess.getoutput(check_cmd)
-assert output, "the following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+assert output, "the following call of GraphProt.pl produced "\
+"no output:\n%s" % (check_cmd)
 if args.gp_output:
 print(output)
 profile_predictions_file = args.data_id + ".profile"
-assert os.path.exists(profile_predictions_file), "Profile prediction output .profile file \"%s\" not found" %(profile_predictions_file)
+assert os.path.exists(profile_predictions_file), \
+"Profile prediction output .profile file \"%s\" not found" % \
+(profile_predictions_file)
 # Profile prediction output files.
 avg_prof_file = args.data_id + ".avg_profile"
 avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed"
-avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed"
+avg_prof_gen_peaks_file = args.data_id + \
-avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed"
+".avg_profile.genomic_peaks.bed"
-avg_prof_gen_peaks_p50_file = args.data_id + ".avg_profile.p50.genomic_peaks.bed"
+avg_prof_peaks_p50_file = args.data_id + \
+".avg_profile.p50.peaks.bed"
+avg_prof_gen_peaks_p50_file = args.data_id + \
+".avg_profile.p50.genomic_peaks.bed"
 # Get sequence IDs in order from input .fa file.
 seq_ids_list = gplib.fasta_read_in_ids(args.in_fa)
 # Calculate average profiles.
-print("Getting average profile from profile (extlr for smoothing: %i) ... " %(args.ap_extlr))
+print("Getting average profile from profile "
-gplib.graphprot_profile_calculate_avg_profile(profile_predictions_file,
+"(extlr for smoothing: %i) ... " % (args.ap_extlr))
-avg_prof_file,
+gplib.graphprot_profile_calc_avg_profile(profile_predictions_file,
-ap_extlr=args.ap_extlr,
+avg_prof_file,
-seq_ids_list=seq_ids_list,
+ap_extlr=args.ap_extlr,
-method=2)
+seq_ids_list=seq_ids_list,
+method=2)
 # Extract peak regions on sequences with threshold score 0.
-print("Extracting peak regions from average profile (score threshold = 0) ... ")
+print("Extracting peak regions from average profile "
-gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_file,
+"(score threshold = 0) ... ")
-max_merge_dist=args.max_merge_dist,
+killpep8 = args.max_merge_dist
-sc_thr=args.score_thr)
+gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
+avg_prof_peaks_file,
+max_merge_dist=killpep8,
+sc_thr=args.score_thr)
 # Convert peaks to genomic coordinates.
 if args.genomic_sites_bed:
 print("Converting peak regions to genomic coordinates ... ")
-gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, avg_prof_gen_peaks_file,
+killit = args.genomic_sites_bed
+gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,
+avg_prof_gen_peaks_file,
 print_rows=False,
-genomic_sites_bed=args.genomic_sites_bed)
+genomic_sites_bed=killit)
-# gplib.make_file_copy(avg_prof_gen_peaks_file, avg_prof_peaks_file)
 # Extract peak regions with threshold score p50.
 if args.conf_out:
-sc_id = "pos_train_avg_profile_median_%i" %(args.ap_extlr)
+sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)
-# Filter by pos_train_ws_pred_median median.
+# Filter by pos_tr_ws_pred_med median.
-assert sc_id in param_dic, "average profile extlr %i median information missing in .params file" %(args.ap_extlr)
+assert sc_id in param_dic, "average profile extlr %i median "\
+"information missing in .params file" % (args.ap_extlr)
 p50_sc_thr = float(param_dic[sc_id])
-print("Extracting p50 peak regions from average profile (score threshold = %f) ... " %(p50_sc_thr))
+print("Extracting p50 peak regions from average profile "
-gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_p50_file,
+"(score threshold = %f) ... " % (p50_sc_thr))
-max_merge_dist=args.max_merge_dist,
+despair = avg_prof_peaks_p50_file
+pain = args.max_merge_dist
+gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
+despair,
+max_merge_dist=pain,
 sc_thr=p50_sc_thr)
 # Convert peaks to genomic coordinates.
 if args.genomic_sites_bed:
-print("Converting p50 peak regions to genomic coordinates ... ")
+print("Converting p50 peak regions to "
-gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, avg_prof_gen_peaks_p50_file,
+"genomic coordinates ... ")
-genomic_sites_bed=args.genomic_sites_bed)
+madness = args.genomic_sites_bed
+gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,
+avg_prof_gen_peaks_p50_file,
+genomic_sites_bed=madness)
 # Done.
 print("Script: I'm done.")
 print("Author: ... ")

Mercurial > repos > rnateam > graphprot_predict_profile

comparison graphprot_predict_wrapper.py @ 3:9a83a84a25a7 draft