Galaxy |

Changeset 5:d4d531006735 (2022-03-10)

Previous changeset 4:cfc65ae227f8 (2022-03-07) Next changeset 6:922d309640db (2022-03-11)

Commit message:
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"

modified:
MaxQuantProcessingScript.R
mqppep_anova.R
mqppep_anova.xml
mqppep_mrgfltr.py
repository_dependencies.xml
search_ppep.py

diff -r cfc65ae227f8 -r d4d531006735 MaxQuantProcessingScript.R
--- a/MaxQuantProcessingScript.R Mon Mar 07 20:43:57 2022 +0000
+++ b/MaxQuantProcessingScript.R Thu Mar 10 23:42:48 2022 +0000

[

b'@@ -1,6 +1,6 @@\n #!/usr/bin/env Rscript\n \n-# This is the implementation for the \n+# This is the implementation for the\n # "MaxQuant Phosphopeptide Localization Probability Cutoff"\n # Galaxy tool (mqppep_lclztn_filter)\n # It is adapted from the MaxQuant Processing Script written by Larry Cheng.\n@@ -10,16 +10,14 @@\n library(data.table)\n library(stringr)\n library(ggplot2)\n-#library(PTXQC)\n-#require(PTXQC)\n-#require(methods)\n \n # title: "MaxQuant Processing Script"\n # author: "Larry Cheng"\n # date: "February 19, 2018"\n #\n # # MaxQuant Processing Script\n-# Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order):\n+# Takes MaxQuant Phospho (STY)sites.txt file as input\n+# and performs the following (in order):\n # 1) Runs the Proteomics Quality Control software\n # 2) Remove contaminant and reverse sequence rows\n # 3) Filters rows based on localization probability\n@@ -27,24 +25,32 @@\n # 5) Sequences phosphopeptides\n # 6) Merges multiply phosphorylated peptides\n # 7) Filters out phosphopeptides based on enrichment\n-# The output file contains the phosphopeptide (first column) and the quantitative values for each sample\n+# The output file contains the phosphopeptide (first column)\n+# and the quantitative values for each sample.\n #\n # ## Revision History\n # Rev. 2022-02-10 :wrap for inclusion in Galaxy\n-# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script"\n+# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script"\n+# and "Phosphopeptide Processing Script"\n # Rev. 2017-12-12 :added PTXQC\n # added additional plots and table outputs for quality control\n-# allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <-\n-# "\\\\.(\\\\d+)[A-Z]$"\n-# converted from .r to .rmd file to knit report for quality control\n-# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times\n-# Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows\n-# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output\n-# Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\\\.(\\\\d+)[AB]$" so that it looks at the end of string\n+# allowed for more than 2 samples to be grouped together\n+# (up to 26 (eg, 1A, 1B, 1C, etc))\n+# converted from .r to .rmd file to knit report\n+# for quality control\n+# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data\n+# impute multiple times\n+# Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows\n+# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to\n+# preANOVA file output\n+# Rev. 2016-08-22 :use regexSampleNames <- "\\\\.(\\\\d + )[AB]$"\n+# so that it looks at the end of string\n # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)\n-# Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA\n+# Rev. 2016-07-03 :Removed row names from the write.table() output for\n+# ANOVA and PreANOVA\n # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75\n-# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards\n+# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting\n+# the row numbers afterwards\n # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol\n \n \n@@ -52,9 +58,9 @@\n \n # Read first line of file at filePath\n # adapted from: https://stackoverflow.com/a/35761217/15509512\n-readFirstLine <- function(filepath) {\n- con = file(filepath, "r")\n- line = readLines(con, n = 1)\n+read_first_line <- function(filepath) {\n+ con <- file(filepath, "r")\n+ line <- readLines(con'..b'hGraphData$value\n-phosphoresidue <- enrichGraphData$group\n+slices <- enrich_graph_data$value\n+phosphoresidue <- enrich_graph_data$group\n pct <- round(100 * slices / sum(slices))\n-lbls <- paste(enrichGraphData$group,"\\n",pct, "%\\n(", slices, ")", sep="")\n+lbls <-\n+ paste(enrich_graph_data$group, "\\n", pct, "%\\n(", slices, ")", sep = "")\n slc_ctr <- c()\n run_tot <- 0\n for (p in pct) {\n- slc_ctr <- c(slc_ctr, run_tot + p/2.0)\n+ slc_ctr <- c(slc_ctr, run_tot + p / 2.0)\n run_tot <- run_tot + p\n }\n lbl_y <- 100 - slc_ctr\n-df <- data.frame(slices, pct, lbls, phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))\n-gigi <- ggplot(\n- df\n-, aes(x = 1, y = pct, fill = phosphoresidue)) +\n+df <-\n+ data.frame(slices,\n+ pct,\n+ lbls,\n+ phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))\n+gigi <- ggplot(df\n+ , aes(x = 1, y = pct, fill = phosphoresidue)) +\n geom_col(position = "stack", orientation = "x") +\n geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") +\n coord_polar(theta = "y", direction = -1) +\n labs(\n x = NULL\n- , y = NULL\n- , title = "Percentages (and counts) of phosphosites, by type of residue"\n- , caption = sprintf("Roughly %s of peptides have multiple phosphosites.", pct_multiphos)\n+ ,\n+ y = NULL\n+ ,\n+ title = "Percentages (and counts) of phosphosites, by type of residue"\n+ ,\n+ caption = sprintf(\n+ "Roughly %s of peptides have multiple phosphosites.",\n+ pct_multiphos\n+ )\n ) +\n labs(x = NULL, y = NULL, fill = NULL) +\n theme_classic() +\n- theme( legend.position="right"\n- , axis.line = element_blank()\n- , axis.text = element_blank()\n- , axis.ticks = element_blank()\n- , plot.title = element_text(hjust = 0.5)\n- , plot.subtitle = element_text(hjust = 0.5)\n- , plot.caption = element_text(hjust = 0.5)\n- , plot.title.position = "plot"\n- ) +\n- scale_fill_manual(breaks = phosphoresidue, values=c("#c7eae5", "#f6e8c3", "#dfc27d"))\n+ theme(\n+ legend.position = "right"\n+ ,\n+ axis.line = element_blank()\n+ ,\n+ axis.text = element_blank()\n+ ,\n+ axis.ticks = element_blank()\n+ ,\n+ plot.title = element_text(hjust = 0.5)\n+ ,\n+ plot.subtitle = element_text(hjust = 0.5)\n+ ,\n+ plot.caption = element_text(hjust = 0.5)\n+ ,\n+ plot.title.position = "plot"\n+ ) +\n+ scale_fill_manual(breaks = phosphoresidue,\n+ values = c("#c7eae5", "#f6e8c3", "#dfc27d"))\n \n-pdf(enrichGraphFilename)\n+pdf(enrich_graph_filename)\n print(gigi)\n dev.off()\n-svg(enrichGraphFilename_svg)\n+svg(enrich_graph_filename_svg)\n print(gigi)\n dev.off()\n # ...\n@@ -484,17 +676,31 @@\n \n # Filter phosphopeptides by enrichment\n # --\n-if (enriched == "Y"){\n- quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]\n-} else if ( enriched == "ST" ) {\n- quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]\n+if (enriched == "Y") {\n+ quant_data_qc_enrichment <- quant_data_qc_collapsed[\n+ str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),\n+ ]\n+} else if (enriched == "ST") {\n+ quant_data_qc_enrichment <- quant_data_qc_collapsed[\n+ str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS") |\n+ str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),\n+ ]\n } else {\n print("Error in enriched variable. Set to either \'Y\' or \'ST\'")\n }\n # ...\n \n+print("quant_data_qc_enrichment")\n+head(quant_data_qc_enrichment)\n \n # Write phosphopeptides filtered by enrichment\n # --\n-write.table(quantData_qc_enrichment, file=outputfilename, sep="\\t", quote = FALSE, row.names = FALSE)\n+#ACE colnames(quant_data_qc_enrichment)[1] <- "Phosphopeptide"\n+write.table(\n+ quant_data_qc_enrichment,\n+ file = output_filename,\n+ sep = "\\t",\n+ quote = FALSE,\n+ row.names = FALSE\n+)\n # ...\n'

diff -r cfc65ae227f8 -r d4d531006735 mqppep_anova.R
--- a/mqppep_anova.R Mon Mar 07 20:43:57 2022 +0000
+++ b/mqppep_anova.R Thu Mar 10 23:42:48 2022 +0000

[

b'@@ -3,10 +3,6 @@\n library(optparse)\r\n library(data.table)\r\n library(stringr)\r\n-#library(ggplot2)\r\n-#library(PTXQC)\r\n-#require(PTXQC)\r\n-#require(methods)\r\n # bioconductor-preprocesscore\r\n # - libopenblas\r\n # - r-data.table\r\n@@ -18,7 +14,6 @@\n \r\n # parse options\r\n option_list <- list(\r\n- # <param name="inputFilename" type="data" format="tabular" label="Phosphopeptide Intensities" help="First column label \'Phosphopeptide\'; sample-intensities must begin in column 10 and must have column labels to match argument regexSampleNames"/>\r\n make_option(\r\n c("-i", "--inputFile"),\r\n action = "store",\r\n@@ -31,7 +26,8 @@\n action = "store",\r\n default = NA,\r\n type = "character",\r\n- help = "List of alpha cutoff values for significance testing; path to text file having one column and no header"\r\n+ help = paste0("List of alpha cutoff values for significance testing;",\r\n+ " path to text file having one column and no header")\r\n ),\r\n make_option(\r\n c("-f", "--firstDataColumn"),\r\n@@ -40,26 +36,29 @@\n type = "character",\r\n help = "First column of intensity values"\r\n ),\r\n- make_option( # imputationMethod <- c("group-median","median","mean","random")[1]\r\n+ make_option(\r\n c("-m", "--imputationMethod"),\r\n action = "store",\r\n default = "group-median",\r\n type = "character",\r\n- help = "Method for missing-value imputation, one of c(\'group-median\',\'median\',\'mean\',\'random\')"\r\n+ help = paste0("Method for missing-value imputation,",\r\n+ " one of c(\'group-median\',\'median\',\'mean\',\'random\')")\r\n ),\r\n make_option(\r\n c("-p", "--meanPercentile"),\r\n action = "store",\r\n default = 3,\r\n type = "integer",\r\n- help = "Mean percentile for randomly generated imputed values; range [1,99]"\r\n+ help = paste0("Mean percentile for randomly generated imputed values;",\r\n+ ", range [1,99]")\r\n ),\r\n make_option(\r\n c("-d", "--sdPercentile"),\r\n action = "store",\r\n default = 3,\r\n type = "double",\r\n- help = "Adjustment value for standard deviation of randomly generated imputed values; real"\r\n+ help = paste0("Adjustment value for standard deviation of",\r\n+ " randomly generated imputed values; real")\r\n ),\r\n make_option(\r\n c("-s", "--regexSampleNames"),\r\n@@ -73,9 +72,9 @@\n action = "store",\r\n default = "(\\\\d+)",\r\n type = "character",\r\n- help = "Regular expression extracting sample-group from an extracted sample-name"\r\n+ help = paste0("Regular expression extracting sample-group",\r\n+ " from an extracted sample-name")\r\n ),\r\n- # <data name="imputed_data_file" format="tabular" label="${input_file.name}.intensities_${imputation.imputation_method}-imputed_QN_LT" ></data>\r\n make_option(\r\n c("-o", "--imputedDataFile"),\r\n action = "store",\r\n@@ -83,7 +82,6 @@\n type = "character",\r\n help = "Imputed Phosphopeptide Intensities output file path"\r\n ),\r\n- # <data name="report_file" format="html" label="report (download/unzip to view)" ></data>\r\n make_option(\r\n c("-r", "--reportFile"),\r\n action = "store",\r\n@@ -92,82 +90,96 @@\n help = "HTML report file path"\r\n )\r\n )\r\n-args <- parse_args(OptionParser(option_list=option_list))\r\n+args <- parse_args(OptionParser(option_list = option_list))\r\n+\r\n # Check parameter values\r\n \r\n if (! file.exists(args$inputFile)) {\r\n stop((paste("Input file", args$inputFile, "does not exist")))\r\n }\r\n-inputFile <- args$inputFile\r\n-alphaFile <- args$alphaFile\r\n-firstDataColumn <- args$firstDataColumn\r\n-imputationMethod <- args$imputationMethod\r\n-meanPercentile <- args$meanPercentile\r\n-sdPercentile <- args$sdPercentile\r\n+input_file <- args$inputFile\r\n+alpha_file <- args$alphaFile\r\n+first_data_column <- args$firstDataColumn\r\n+imputation_method <- args$imputationMethod\r\n+mean_percentile <- args$meanPercentile\r\n+sd_percentile <- args$sdPercentile\r\n \r\n-regexSampleNames <- gsub(\'^[ \\t\\n]*\', \'\' , readChar(args$regexSampleNames, 1000))\r\n-regexSam'..b'mpleNames is:")\r\n-cat(str(regexSampleNames))\r\n+print("regex_sample_names is:")\r\n+cat(str(regex_sample_names))\r\n \r\n-print("regexSampleGrouping is:")\r\n-cat(str(regexSampleGrouping))\r\n+print("regex_sample_grouping is:")\r\n+cat(str(regex_sample_grouping))\r\n \r\n-# from: https://github.com/molgenis/molgenis-pipelines/wiki/How-to-source-another_file.R-from-within-your-R-script\r\n-LocationOfThisScript = function() # Function LocationOfThisScript returns the location of this .R script (may be needed to source other files in same dir)\r\n-{\r\n- this.file = NULL\r\n+# from: https://github.com/molgenis/molgenis-pipelines/wiki/\r\n+# How-to-source-another_file.R-from-within-your-R-script\r\n+# Function location_of_this_script returns the location of this .R script\r\n+# (may be needed to source other files in same dir)\r\n+location_of_this_script <- function() {\r\n+ this_file <- NULL\r\n # This file may be \'sourced\'\r\n- for (i in -(1:sys.nframe())) {\r\n- if (identical(sys.function(i), base::source)) this.file = (normalizePath(sys.frame(i)$ofile))\r\n+ for (i in - (1:sys.nframe())) {\r\n+ if (identical(sys.function(i), base::source)) {\r\n+ this_file <- (normalizePath(sys.frame(i)$ofile))\r\n+ }\r\n }\r\n \r\n- if (!is.null(this.file)) return(dirname(this.file))\r\n+ if (!is.null(this_file)) return(dirname(this_file))\r\n \r\n # But it may also be called from the command line\r\n- cmd.args = commandArgs(trailingOnly = FALSE)\r\n- cmd.args.trailing = commandArgs(trailingOnly = TRUE)\r\n- cmd.args = cmd.args[seq.int(from=1, length.out=length(cmd.args) - length(cmd.args.trailing))]\r\n- res = gsub("^(?:--file=(.*)|.*)$", "\\\\1", cmd.args)\r\n+ cmd_args <- commandArgs(trailingOnly = FALSE)\r\n+ cmd_args_trailing <- commandArgs(trailingOnly = TRUE)\r\n+ cmd_args <- cmd_args[\r\n+ seq.int(\r\n+ from = 1,\r\n+ length.out = length(cmd_args) - length(cmd_args_trailing)\r\n+ )\r\n+ ]\r\n+ res <- gsub("^(?:--file=(.*)|.*)$", "\\\\1", cmd_args)\r\n \r\n # If multiple --file arguments are given, R uses the last one\r\n- res = tail(res[res != ""], 1)\r\n+ res <- tail(res[res != ""], 1)\r\n if (0 < length(res)) return(dirname(res))\r\n \r\n # Both are not the case. Maybe we are in an R GUI?\r\n return(NULL)\r\n }\r\n \r\n-script.dir <- LocationOfThisScript()\r\n+script_dir <- location_of_this_script()\r\n \r\n rmarkdown_params <- list(\r\n- inputFile = inputFile\r\n- , alphaFile = alphaFile\r\n- , firstDataColumn = firstDataColumn\r\n- , imputationMethod = imputationMethod\r\n- , meanPercentile = meanPercentile\r\n- , sdPercentile = sdPercentile\r\n- , regexSampleNames = regexSampleNames\r\n- , regexSampleGrouping = regexSampleGrouping\r\n- , imputedDataFilename = imputedDataFilename\r\n+ inputFile = input_file\r\n+ , alphaFile = alpha_file\r\n+ , firstDataColumn = first_data_column\r\n+ , imputationMethod = imputation_method\r\n+ , meanPercentile = mean_percentile\r\n+ , sdPercentile = sd_percentile\r\n+ , regexSampleNames = regex_sample_names\r\n+ , regexSampleGrouping = regex_sample_grouping\r\n+ , imputedDataFilename = imputed_data_file_name\r\n )\r\n \r\n str(rmarkdown_params)\r\n@@ -178,14 +190,15 @@\n # for reason:\r\n # "The following dependencies are not available in conda"\r\n # reported here:\r\n-# https://github.com/ami-iit/bipedal-locomotion-framework/pull/457/commits/e98ccef8c8cb63e207df36628192af6ce22feb13\r\n+# https://github.com/ami-iit/bipedal-locomotion-framework/pull/457\r\n \r\n-# freeze the random number generator so the same results will be produced from run to run\r\n+# freeze the random number generator so the same results will be produced\r\n+# from run to run\r\n set.seed(28571)\r\n \r\n rmarkdown::render(\r\n- input = paste(script.dir, "mqppep_anova_script.Rmd", sep="/")\r\n+ input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/")\r\n , output_format = rmarkdown::html_document(pandoc_args = "--self-contained")\r\n-, output_file = reportFileName\r\n+, output_file = report_file_name\r\n , params = rmarkdown_params\r\n )\r\n'

diff -r cfc65ae227f8 -r d4d531006735 mqppep_anova.xml
--- a/mqppep_anova.xml Mon Mar 07 20:43:57 2022 +0000
+++ b/mqppep_anova.xml Thu Mar 10 23:42:48 2022 +0000

[

@@ -4,33 +4,37 @@
         <import>macros.xml</import>
     </macros>
     <requirements>
-        <requirement type="package" version="1.7.1">r-optparse</requirement>
-        <requirement type="package" version="1.4.0">r-stringr</requirement>
-        <requirement type="package" version="1.14.2">r-data.table</requirement>
-        <requirement type="package" version="3.3.5">r-ggplot2</requirement>
-        <requirement type="package" version="1.56.0">bioconductor-preprocesscore</requirement>
-        <requirement type="package" version="0.3.3" >openblas</requirement>
-        <requirement type="package" version="2.11"  >r-rmarkdown</requirement>
-        <requirement type="package" version="0.4.0" >r-sass</requirement>
-        <requirement type="package"                 >texlive-core</requirement>
+        <requirement type="package" version="1.7.1"   >r-optparse</requirement>
+        <requirement type="package" version="1.4.0"   >r-stringr</requirement>
+        <requirement type="package" version="1.14.2"  >r-data.table</requirement>
+        <requirement type="package" version="3.3.5"   >r-ggplot2</requirement>
+        <requirement type="package" version="1.56.0"  >bioconductor-preprocesscore</requirement>
+        <requirement type="package" version="0.3.3"   >openblas</requirement>
+        <requirement type="package" version="2.11"    >r-rmarkdown</requirement>
+        <requirement type="package" version="0.4.0"   >r-sass</requirement>
+        <requirement type="package" version="20210325">texlive-core</requirement>

     </requirements>
     
     <command detect_errors="exit_code"><![CDATA[
-cat $sample_names_regex_f; cat $sample_grouping_regex_f;
-Rscript '$__tool_directory__/mqppep_anova.R'
---inputFile '$input_file'
---alphaFile $alpha_file
---firstDataColumn $first_data_column
---imputationMethod $imputation.imputation_method
-#if '$imputation_method' == 'random':
-  --meanPercentile '$meanPercentile'
-  --sdPercentile   '$sdPercentile'
-#end if
---regexSampleNames $sample_names_regex_f
---regexSampleGrouping $sample_grouping_regex_f
---imputedDataFile $imputed_data_file
---reportFile $report_file
+      cd /tmp;
+      cp '$__tool_directory__/mqppep_anova_script.Rmd' . || exit 0;
+      cp '$__tool_directory__/mqppep_anova.R' . || exit 0;
+      \${CONDA_PREFIX}/bin/Rscript /tmp/mqppep_anova.R
+      --inputFile '$input_file'
+      --alphaFile $alpha_file
+      --firstDataColumn $first_data_column
+      --imputationMethod $imputation.imputation_method
+      #if '$imputation_method' == 'random':
+        --meanPercentile '$meanPercentile'
+        --sdPercentile   '$sdPercentile'
+      #end if
+      --regexSampleNames $sample_names_regex_f
+      --regexSampleGrouping $sample_grouping_regex_f
+      --imputedDataFile $imputed_data_file
+      --reportFile $report_file;
+      rm mqppep_anova_script.Rmd;
+      rm mqppep_anova.R
     ]]></command>
     <configfiles>
       <configfile name="sample_names_regex_f">
@@ -109,9 +113,9 @@
             <output name="imputed_data_file">
                 <assert_contents>
                     <has_text text="Phosphopeptide" />
-                    <has_text text="AAAAAAAGDpSDpSWDADAFSVEDPVRK" />
-                    <has_text text="23574000" />
-                    <has_text text="pSESELIDELSEDFDR" />
+                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
+                    <has_text text="7.935878" />
+                    <has_text text="pSQKQEEENPAEETGEEK" />
                 </assert_contents>
             </output>
         </test>
@@ -125,9 +129,9 @@
             <output name="imputed_data_file">
                 <assert_contents>
                     <has_text text="Phosphopeptide" />
-                    <has_text text="AAAAAAAGDpSDpSWDADAFSVEDPVRK" />
-                    <has_text text="997800000" />
-                    <has_text text="pSESELIDELSEDFDR" />
+                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
+                    <has_text text="0.82258" />
+                    <has_text text="pSQKQEEENPAEETGEEK" />
                 </assert_contents>
             </output>
         </test>
@@ -209,7 +213,7 @@
PERL-compatible regular expressions
===================================

-Note that the PERL-compatible regular expressions accepted by this tool are documented at https://rdrr.io/r/base/regex.html
+Note that the PERL-compatible regular expressions accepted by this tool are documented at http://rdrr.io/r/base/regex.html

     ]]></help>
     <citations>

diff -r cfc65ae227f8 -r d4d531006735 mqppep_mrgfltr.py
--- a/mqppep_mrgfltr.py Mon Mar 07 20:43:57 2022 +0000
+++ b/mqppep_mrgfltr.py Thu Mar 10 23:42:48 2022 +0000

[

b'@@ -1,1337 +1,1500 @@\n-#!/usr/bin/env python\r\n-\r\n-# Import the packages needed\r\n-import argparse\r\n-import os.path\r\n-import sys\r\n-\r\n-import pandas\r\n-import re\r\n-import time\r\n-import sqlite3 as sql\r\n-from codecs import getreader as cx_getreader\r\n-import sys\r\n-import numpy as np\r\n-\r\n-# for sorting list of lists using operator.itemgetter\r\n-import operator\r\n-\r\n-# for formatting stack-trace\r\n-import traceback\r\n-\r\n-# for Aho-Corasick search for fixed set of substrings\r\n-import ahocorasick\r\n-import operator\r\n-import hashlib\r\n-\r\n-# for shutil.copyfile(src, dest)\r\n-import shutil\r\n-\r\n-# global constants\r\n-N_A = \'N/A\'\r\n-\r\n-# ref: https://stackoverflow.com/a/8915613/15509512\r\n-# answers: "How to handle exceptions in a list comprehensions"\r\n-# usage:\r\n-# from math import log\r\n-# eggs = [1,3,0,3,2]\r\n-# print([x for x in [catch(log, egg) for egg in eggs] if x is not None])\r\n-# producing:\r\n-# for <built-in function log>\r\n-# with args (0,)\r\n-# exception: math domain error\r\n-# [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]\r\n-def catch(func, *args, handle=lambda e : e, **kwargs):\r\n- try:\r\n- return func(*args, **kwargs)\r\n- except Exception as e:\r\n- print("For %s" % str(func))\r\n- print(" with args %s" % str(args))\r\n- print(" caught exception: %s" % str(e))\r\n- (ty, va, tb) = sys.exc_info()\r\n- print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))\r\n- exit(-1)\r\n- return None # was handle(e)\r\n-\r\n-def ppep_join(x):\r\n- x = [i for i in x if N_A != i]\r\n- result = "%s" % \' | \'.join(x)\r\n- if result != "":\r\n- return result\r\n- else:\r\n- return N_A\r\n-\r\n-def melt_join(x):\r\n- tmp = {key.lower(): key for key in x}\r\n- result = "%s" % \' | \'.join([tmp[key] for key in tmp])\r\n- return result\r\n-\r\n-def __main__():\r\n- # Parse Command Line\r\n- parser = argparse.ArgumentParser(\r\n- description=\'Phopsphoproteomic Enrichment Pipeline Merge and Filter.\'\r\n- )\r\n-\r\n- # inputs:\r\n- # Phosphopeptide data for experimental results, including the intensities\r\n- # and the mapping to kinase domains, in tabular format.\r\n- parser.add_argument(\r\n- \'--phosphopeptides\', \'-p\',\r\n- nargs=1,\r\n- required=True,\r\n- dest=\'phosphopeptides\',\r\n- help=\'Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format\'\r\n- )\r\n- # UniProtKB/SwissProt DB input, SQLite\r\n- parser.add_argument(\r\n- \'--ppep_mapping_db\', \'-d\',\r\n- nargs=1,\r\n- required=True,\r\n- dest=\'ppep_mapping_db\',\r\n- help=\'UniProtKB/SwissProt SQLite Database\'\r\n- )\r\n- #ACE # PhosPhositesPlus DB input, csv\r\n- #ACE parser.add_argument(\r\n- #ACE \'--psp_regulatory_sites\', \'-s\',\r\n- #ACE nargs=1,\r\n- #ACE required=True,\r\n- #ACE dest=\'psp_regulatory_sites_csv\',\r\n- #ACE help=\'PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header\'\r\n- #ACE )\r\n- # species to limit records chosed from PhosPhositesPlus\r\n- parser.add_argument(\r\n- \'--species\', \'-x\',\r\n- nargs=1,\r\n- required=False,\r\n- default=[],\r\n- dest=\'species\',\r\n- help=\'limit PhosphoSitePlus records to indicated species (field may be empty)\'\r\n- )\r\n-\r\n- # outputs:\r\n- # tabular output\r\n- parser.add_argument(\r\n- \'--mrgfltr_tab\', \'-o\',\r\n- nargs=1,\r\n- required=True,\r\n- dest=\'mrgfltr_tab\',\r\n- help=\'Tabular output file for results\'\r\n- )\r\n- # CSV output\r\n- parser.add_argument(\r\n- \'--mrgfltr_csv\', \'-c\',\r\n- nargs=1,\r\n- required=True,\r\n- dest=\'mrgfltr_csv\',\r\n- help=\'CSV output file for results\'\r\n- )\r\n- # SQLite output\r\n- parser.add_argument(\r\n- \'--mrgfltr_sqlite\', \'-S\',\r\n- nargs=1,\r\n- required=T'..b'cur.execute(\n+ CITATION_INSERT_STMT,\n+ ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF),\n+ )\n+ cur.execute(\n+ CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF)\n+ )\n+\n+ # Read ppep-to-sequence LUT\n+ ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)\n+ # write only metadata for merged/filtered records to SQLite\n+ mrgfltr_metadata_df = output_df.copy()\n+ # replace phosphopeptide seq with ppep.id\n+ mrgfltr_metadata_df = ppep_lut_df.merge(\n+ mrgfltr_metadata_df,\n+ left_on="ppep_seq",\n+ right_on=PHOSPHOPEPTIDE,\n+ how="inner",\n+ )\n+ mrgfltr_metadata_df.drop(\n+ columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True\n+ )\n+ # rename columns\n+ mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS\n+ mrgfltr_metadata_df.to_sql(\n+ "mrgfltr_metadata",\n+ con=conn,\n+ if_exists="append",\n+ index=False,\n+ method="multi",\n+ )\n+\n+ # Close SwissProt SQLite database\n+ conn.close()\n+ # ----------- Write merge/filter metadata to SQLite database (finish) -----------\n+\n+ output_df = output_df.merge(\n+ quant_data,\n+ how="right",\n+ left_on=PHOSPHOPEPTIDE,\n+ right_on=PHOSPHOPEPTIDE_MATCH,\n+ )\n+ output_cols = output_df.columns.tolist()\n+ output_cols = output_cols[:-1]\n+ output_df = output_df[output_cols]\n+\n+ # cosmetic changes to Upstream column\n+ output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[\n+ PUTATIVE_UPSTREAM_DOMAINS\n+ ].fillna(\n+ ""\n+ ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping\n+ us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])\n+ i = 0\n+ while i < len(us_series):\n+ # turn blanks into N_A to signify the info was searched for but cannot be found\n+ if us_series[i] == "":\n+ us_series[i] = N_A\n+ i += 1\n+ output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series\n+\n+ end_time = time.process_time() # timer\n+ print(\n+ "%0.6f establisheed output [3]" % (end_time - start_time,),\n+ file=sys.stderr,\n+ ) # timer\n+\n+ (output_rows, output_cols) = output_df.shape\n+\n+ output_df = output_df.convert_dtypes(convert_integer=True)\n+\n+ # Output onto Final CSV file\n+ output_df.to_csv(output_filename_csv, index=False)\n+ output_df.to_csv(\n+ output_filename_tab, quoting=None, sep="\\t", index=False\n+ )\n+\n+ end_time = time.process_time() # timer\n+ print(\n+ "%0.6f wrote output [4]" % (end_time - start_time,),\n+ file=sys.stderr,\n+ ) # timer\n+\n+ print(\n+ "{:>10} phosphopeptides written to output".format(str(output_rows))\n+ )\n+\n+ end_time = time.process_time() # timer\n+ print(\n+ "%0.6f seconds of non-system CPU time were consumed"\n+ % (end_time - start_time,),\n+ file=sys.stderr,\n+ ) # timer\n+\n+ # Rev. 7/1/2016\n+ # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A\'s\n+ # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS\n+ # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \\\n+ # read from SwissProt SQLite database\n+ # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper\n+\n+ #\n+ # copied from Excel Output Script.ipynb END #\n+ #\n+\n+ try:\n+ catch(\n+ mqpep_getswissprot,\n+ )\n+ exit(0)\n+ except Exception as e:\n+ exit("Internal error running mqpep_getswissprot(): %s" % (e))\n+\n+\n+if __name__ == "__main__":\n+ __main__()\n'

diff -r cfc65ae227f8 -r d4d531006735 repository_dependencies.xml
--- a/repository_dependencies.xml Mon Mar 07 20:43:57 2022 +0000
+++ b/repository_dependencies.xml Thu Mar 10 23:42:48 2022 +0000

@@ -1,5 +1,5 @@
<?xml version="1.0" ?>
<repositories description="Suite for preprocessing and ANOVA of MaxQuant results using LC-MS proteomics data from phosphoproteomic enrichment.">
     <repository name="mqppep_preproc" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="8be24b489992"/>
-    <repository name="mqppep_anova" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="f9c13bc8e7ad"/>
+    <repository name="mqppep_anova" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="cfc65ae227f8"/>
</repositories>
\ No newline at end of file

diff -r cfc65ae227f8 -r d4d531006735 search_ppep.py
--- a/search_ppep.py Mon Mar 07 20:43:57 2022 +0000
+++ b/search_ppep.py Thu Mar 10 23:42:48 2022 +0000

[

b'@@ -3,20 +3,19 @@\n \n import argparse\n import os.path\n+import re\n import sqlite3\n-import re\n+import sys # import the sys module for exc_info\n+import time\n+import traceback # import the traceback module for format_exception\n from codecs import getreader as cx_getreader\n-import time\n \n # For Aho-Corasick search for fixed set of substrings\n # - add_word\n # - make_automaton\n # - iter\n import ahocorasick\n-# Support map over auto.iter(...)\n-# - itemgetter\n-import operator\n-#import hashlib\n+\n \n # ref: https://stackoverflow.com/a/8915613/15509512\n # answers: "How to handle exceptions in a list comprehensions"\n@@ -29,7 +28,8 @@\n # with args (0,)\n # exception: math domain error\n # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]\n-def catch(func, *args, handle=lambda e : e, **kwargs):\n+def catch(func, *args, handle=lambda e: e, **kwargs):\n+\n try:\n return func(*args, **kwargs)\n except Exception as e:\n@@ -38,13 +38,13 @@\n print(" caught exception: %s" % str(e))\n (ty, va, tb) = sys.exc_info()\n print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))\n- #exit(-1)\n- return None # was handle(e)\n+ # exit(-1)\n+ return None # was handle(e)\n+\n \n def __main__():\n- ITEM_GETTER = operator.itemgetter(1)\n \n- DROP_TABLES_SQL = \'\'\'\n+ DROP_TABLES_SQL = """\n DROP VIEW IF EXISTS ppep_gene_site_view;\n DROP VIEW IF EXISTS uniprot_view;\n DROP VIEW IF EXISTS uniprotkb_pep_ppep_view;\n@@ -59,9 +59,9 @@\n DROP TABLE IF EXISTS ppep_gene_site;\n DROP TABLE IF EXISTS ppep_metadata;\n DROP TABLE IF EXISTS ppep_intensity;\n- \'\'\'\n+ """\n \n- CREATE_TABLES_SQL = \'\'\'\n+ CREATE_TABLES_SQL = """\n CREATE TABLE deppep\n ( id INTEGER PRIMARY KEY\n , seq TEXT UNIQUE ON CONFLICT IGNORE\n@@ -213,53 +213,55 @@\n AND\n ppep_intensity.ppep_id = ppep.id\n ;\n- \'\'\'\n+ """\n \n- UNIPROT_SEQ_AND_ID_SQL = \'\'\'\n+ UNIPROT_SEQ_AND_ID_SQL = """\n select Sequence, Uniprot_ID\n from UniProtKB\n- \'\'\'\n+ """\n \n # Parse Command Line\n parser = argparse.ArgumentParser(\n- description=\'Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB).\'\n- )\n+ description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."\n+ )\n \n # inputs:\n # Phosphopeptide data for experimental results, including the intensities\n # and the mapping to kinase domains, in tabular format.\n parser.add_argument(\n- \'--phosphopeptides\', \'-p\',\n+ "--phosphopeptides",\n+ "-p",\n nargs=1,\n required=True,\n- dest=\'phosphopeptides\',\n- help=\'Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool\'\n- )\n+ dest="phosphopeptides",\n+ help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",\n+ )\n parser.add_argument(\n- \'--uniprotkb\', \'-u\',\n+ "--uniprotkb",\n+ "-u",\n nargs=1,\n required=True,\n- dest=\'uniprotkb\',\n- help=\'UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool\'\n- )\n+ dest="uniprotkb",\n+ help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",\n+ )\n parser.add_argument(\n- \'--schema\',\n- action=\'store_true\',\n- dest=\'db_schema\',\n- help=\'show updated database schema\'\n- )\n+ "--schema",\n+ action="store_true",\n+ dest="db_schema",\n+ help="show updated database schema",\n+ )\n parser.add_argument(\n- \'--warn-duplicates\',\n- action=\'store_true\',\n- dest='..b' to an automaton (a finite-state machine)\n auto.make_automaton()\n \n # Execute query for seqs and metadata without fetching the results yet\n uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)\n- while batch := uniprot_seq_and_id.fetchmany(size=50):\n- if None == batch:\n- break\n- for Sequence, UniProtKB_id in batch:\n- if Sequence is not None:\n- for end_index, (insert_order, original_value) in auto.iter(Sequence):\n- ker.execute(\'\'\'\n+ while 1:\n+ batch = uniprot_seq_and_id.fetchmany(size=50)\n+ if not batch:\n+ break\n+ for Sequence, UniProtKB_id in batch:\n+ if Sequence is not None:\n+ for end_index, (insert_order, original_value) in auto.iter(\n+ Sequence\n+ ):\n+ ker.execute(\n+ """\n INSERT INTO deppep_UniProtKB\n (deppep_id,UniProtKB_id,pos_start,pos_end)\n VALUES (?,?,?,?)\n- \'\'\', (\n- insert_order,\n- UniProtKB_id,\n- 1 + end_index - len(original_value),\n- end_index\n- )\n- )\n- else:\n- raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,))\n- ker.execute("""\n+ """,\n+ (\n+ insert_order,\n+ UniProtKB_id,\n+ 1 + end_index - len(original_value),\n+ end_index,\n+ ),\n+ )\n+ else:\n+ raise ValueError(\n+ "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"\n+ % (UniProtKB_id,)\n+ )\n+ ker.execute(\n+ """\n SELECT count(*) || \' accession-peptide-phosphopeptide combinations were found\'\n FROM uniprotkb_pep_ppep_view\n """\n- )\n+ )\n for row in ker.fetchall():\n print(row[0])\n \n- ker.execute("""\n+ ker.execute(\n+ """\n SELECT count(*) || \' accession matches were found\', count(*) AS accession_count\n FROM (\n SELECT accession\n@@ -467,12 +486,12 @@\n GROUP BY accession\n )\n """\n- )\n+ )\n for row in ker.fetchall():\n- print(row[0])\n- accession_count = row[1]\n+ print(row[0])\n \n- ker.execute("""\n+ ker.execute(\n+ """\n SELECT count(*) || \' peptide matches were found\'\n FROM (\n SELECT peptide\n@@ -480,11 +499,12 @@\n GROUP BY peptide\n )\n """\n- )\n+ )\n for row in ker.fetchall():\n- print(row[0])\n+ print(row[0])\n \n- ker.execute("""\n+ ker.execute(\n+ """\n SELECT count(*) || \' phosphopeptide matches were found\', count(*) AS phosphopeptide_count\n FROM (\n SELECT phosphopeptide\n@@ -492,21 +512,24 @@\n GROUP BY phosphopeptide\n )\n """\n- )\n+ )\n for row in ker.fetchall():\n- print(row[0])\n- phosphopeptide_count = row[1]\n+ print(row[0])\n \n con.commit()\n- ker.execute(\'vacuum\')\n+ ker.execute("vacuum")\n con.close()\n \n+\n if __name__ == "__main__":\n wrap_start_time = time.perf_counter()\n __main__()\n wrap_stop_time = time.perf_counter()\n # print(wrap_start_time)\n # print(wrap_stop_time)\n- print("\\nThe matching process took %d milliseconds to run.\\n" % ((wrap_stop_time - wrap_start_time)*1000),)\n+ print(\n+ "\\nThe matching process took %d milliseconds to run.\\n"\n+ % ((wrap_stop_time - wrap_start_time) * 1000),\n+ )\n \n- # vim: sw=4 ts=4 et ai :\n+# vim: sw=4 ts=4 et ai :\n'