Mercurial > repos > eschen42 > mqppep_preproc
changeset 12:ae1044bcf13d draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit e87d28ea433cc26db7fe44768685d08c06f7a0d0"
author | eschen42 |
---|---|
date | Tue, 15 Mar 2022 18:17:19 +0000 |
parents | 302918bd77e0 |
children | 28a126da9b28 |
files | macros.xml mqppep_anova.R mqppep_anova_script.Rmd repository_dependencies.xml test-data/alpha_levels.tabular test-data/test_input_for_anova.tabular workflow/ppenrich_suite_wf.ga |
diffstat | 7 files changed, 1 insertions(+), 1718 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Tue Mar 15 12:44:04 2022 +0000 +++ b/macros.xml Tue Mar 15 18:17:19 2022 +0000 @@ -1,5 +1,5 @@ <macros> - <token name="@TOOL_VERSION@">0.1.2</token> + <token name="@TOOL_VERSION@">0.1.3</token> <token name="@VERSION_SUFFIX@">0</token> <xml name="requirements"> <requirements>
--- a/mqppep_anova.R Tue Mar 15 12:44:04 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,207 +0,0 @@ -#!/usr/bin/env Rscript -# libraries -library(optparse) -library(data.table) -library(stringr) -# bioconductor-preprocesscore -# - libopenblas -# - r-data.table -# - r-rmarkdown -# - r-ggplot2 -# - texlive-core - -# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 - -# parse options -option_list <- list( - make_option( - c("-i", "--inputFile"), - action = "store", - default = NA, - type = "character", - help = "Phosphopeptide Intensities sparse input file path" - ), - make_option( - c("-a", "--alphaFile"), - action = "store", - default = NA, - type = "character", - help = paste0("List of alpha cutoff values for significance testing;", - " path to text file having one column and no header") - ), - make_option( - c("-f", "--firstDataColumn"), - action = "store", - default = "10", - type = "character", - help = "First column of intensity values" - ), - make_option( - c("-m", "--imputationMethod"), - action = "store", - default = "group-median", - type = "character", - help = paste0("Method for missing-value imputation,", - " one of c('group-median','median','mean','random')") - ), - make_option( - c("-p", "--meanPercentile"), - action = "store", - default = 3, - type = "integer", - help = paste0("Mean percentile for randomly generated imputed values;", - ", range [1,99]") - ), - make_option( - c("-d", "--sdPercentile"), - action = "store", - default = 3, - type = "double", - help = paste0("Adjustment value for standard deviation of", - " randomly generated imputed values; real") - ), - make_option( - c("-s", "--regexSampleNames"), - action = "store", - default = "\\.(\\d+)[A-Z]$", - type = "character", - help = "Regular expression extracting sample-names" - ), - make_option( - c("-g", "--regexSampleGrouping"), - action = "store", - default = "(\\d+)", - type = "character", - help = paste0("Regular expression extracting sample-group", - " from an extracted sample-name") - ), - make_option( - c("-o", "--imputedDataFile"), - action = "store", - default = "output_imputed.tsv", - type = "character", - help = "Imputed Phosphopeptide Intensities output file path" - ), - make_option( - c("-r", "--reportFile"), - action = "store", - default = "QuantDataProcessingScript.html", - type = "character", - help = "HTML report file path" - ) -) -args <- parse_args(OptionParser(option_list = option_list)) - -# Check parameter values - -if (! file.exists(args$inputFile)) { - stop((paste("Input file", args$inputFile, "does not exist"))) -} -input_file <- args$inputFile -alpha_file <- args$alphaFile -first_data_column <- args$firstDataColumn -imputation_method <- args$imputationMethod -mean_percentile <- args$meanPercentile -sd_percentile <- args$sdPercentile - -regex_sample_names <- gsub("^[ \t\n]*", "", - readChar(args$regexSampleNames, 1000) - ) -regex_sample_names <- gsub("[ \t\n]*$", "", - regex_sample_names - ) -cat(regex_sample_names) -cat("\n") - -regex_sample_grouping <- gsub("^[ \t\n]*", "", - readChar(args$regexSampleGrouping, 1000) - ) -regex_sample_grouping <- gsub("[ \t\n]*$", "", - regex_sample_grouping - ) -cat(regex_sample_grouping) -cat("\n") - -imputed_data_file_name <- args$imputedDataFile -report_file_name <- args$reportFile - -print("args is:") -cat(str(args)) - -print("regex_sample_names is:") -cat(str(regex_sample_names)) - -print("regex_sample_grouping is:") -cat(str(regex_sample_grouping)) - -# from: https://github.com/molgenis/molgenis-pipelines/wiki/ -# How-to-source-another_file.R-from-within-your-R-script -# Function location_of_this_script returns the location of this .R script -# (may be needed to source other files in same dir) -location_of_this_script <- function() { - this_file <- NULL - # This file may be 'sourced' - for (i in - (1:sys.nframe())) { - if (identical(sys.function(i), base::source)) { - this_file <- (normalizePath(sys.frame(i)$ofile)) - } - } - - if (!is.null(this_file)) return(dirname(this_file)) - - # But it may also be called from the command line - cmd_args <- commandArgs(trailingOnly = FALSE) - cmd_args_trailing <- commandArgs(trailingOnly = TRUE) - cmd_args <- cmd_args[ - seq.int( - from = 1, - length.out = length(cmd_args) - length(cmd_args_trailing) - ) - ] - res <- gsub("^(?:--file=(.*)|.*)$", "\\1", cmd_args) - - # If multiple --file arguments are given, R uses the last one - res <- tail(res[res != ""], 1) - if (0 < length(res)) return(dirname(res)) - - # Both are not the case. Maybe we are in an R GUI? - return(NULL) -} - -script_dir <- location_of_this_script() - -rmarkdown_params <- list( - inputFile = input_file - , alphaFile = alpha_file - , firstDataColumn = first_data_column - , imputationMethod = imputation_method - , meanPercentile = mean_percentile - , sdPercentile = sd_percentile - , regexSampleNames = regex_sample_names - , regexSampleGrouping = regex_sample_grouping - , imputedDataFilename = imputed_data_file_name - ) - -str(rmarkdown_params) - -# BUG -# Must render as HTML for the time being until this issue is resolved: -# https://github.com/conda-forge/texlive-core-feedstock/issues/19 -# for reason: -# "The following dependencies are not available in conda" -# reported here: -# https://github.com/ami-iit/bipedal-locomotion-framework/pull/457 - -# freeze the random number generator so the same results will be produced -# from run to run -set.seed(28571) - - -library(tinytex) -tinytex::install_tinytex() -rmarkdown::render( - input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/") -, output_format = rmarkdown::pdf_document() -, output_file = report_file_name -, params = rmarkdown_params -)
--- a/mqppep_anova_script.Rmd Tue Mar 15 12:44:04 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,801 +0,0 @@ ---- -title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" -author: "Larry Cheng; Art Eschenlauer" -date: "May 28, 2018; Nov 16, 2021" -output: - pdf_document: default -params: - inputFile: "test-data/test_input_for_anova.tabular" - alphaFile: "test-data/alpha_levels.tabular" - firstDataColumn: "Intensity" - imputationMethod: !r c("group-median", "median", "mean", "random")[1] - meanPercentile: 1 - sdPercentile: 0.2 - regexSampleNames: "\\.(\\d+)[A-Z]$" - regexSampleGrouping: "(\\d+)" - imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" ---- -```{r setup, include = FALSE} -# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 -knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) - -### FUNCTIONS - -#ANOVA filter function -anova_func <- function(x, grouping_factor) { - x_aov <- aov(as.numeric(x) ~ grouping_factor) - pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] - pvalue -} -``` - -## Purpose: -Perform imputation of missing values, quantile normalization, and ANOVA. - -<!-- -## Variables to change for each input file ---> -```{r include = FALSE} -# Input Filename -input_file <- params$inputFile - -# First data column - ideally, this could be detected via regexSampleNames, -# but for now leave it as is. -first_data_column <- params$firstDataColumn -fdc_is_integer <- TRUE -first_data_column <- withCallingHandlers( - as.integer(first_data_column) - , warning = function(w) fdc_is_integer <<- FALSE - ) -if (FALSE == fdc_is_integer) { - first_data_column <- params$firstDataColumn -} - -# False discovery rate adjustment for ANOVA -# Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 -val_fdr <- - read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1] - -#Imputed Data filename -imputed_data_filename <- params$imputedDataFilename - -#ANOVA data filename -``` - -```{r echo = FALSE} -# Imputation method, should be one of -# "random", "group-median", "median", or "mean" -imputation_method <- params$imputationMethod - -# Selection of percentile of logvalue data to set the mean for random number -# generation when using random imputation -mean_percentile <- params$meanPercentile / 100.0 - -# deviation adjustment-factor for random values; real number. -sd_percentile <- params$sdPercentile - -# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" -regex_sample_names <- params$regexSampleNames - -# Regular expression to extract Sample Grouping from Sample Name; -# if error occurs, compare sample_factor_levels and temp_matches -# to see if groupings/pairs line up -# e.g., "(\\d+)" -regex_sample_grouping <- params$regexSampleGrouping - -``` - -```{r echo = FALSE} -### READ DATA - -library(data.table) - -# read.table reads a file in table format and creates a data frame from it. -# - note that `quote = ""` means that quotation marks are treated literally. -full_data <- read.table( - file = input_file, - sep = "\t", - header = T, - quote = "", - check.names = FALSE - ) -``` - -### Column names from input file - -```{r echo = FALSE, results = 'markup'} -print(colnames(full_data)) -data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) -cat(sprintf("First data column: %d\n", min(data_column_indices))) -cat(sprintf("Last data column: %d\n", max(data_column_indices))) -``` - -```{r echo = FALSE, results = 'asis'} -cat("\\newpage\n") -``` - -### Checking that log-transformed sample distributions are similar: - -```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} - -if (FALSE == fdc_is_integer) { - - if (length(data_column_indices) > 0) { - first_data_column <- data_column_indices[1] - } else { - stop(paste("failed to convert firstDataColumn:", first_data_column)) - } -} - -quant_data0 <- full_data[first_data_column:length(full_data)] -quant_data <- full_data[first_data_column:length(full_data)] -quant_data[quant_data == 0] <- NA #replace 0 with NA -quant_data_log <- log10(quant_data) - -rownames(quant_data_log) <- full_data$Phosphopeptide - -# data visualization -old_par <- par( - mai = par("mai") + c(0.5, 0, 0, 0) -) -boxplot( - quant_data_log -, las = 2 -) -par(old_par) - - - -cat("\\newline\n") -cat("\\newline\n") - -``` - -```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), warning = FALSE} -quant_data_log_stack <- stack(quant_data_log) -library(ggplot2) -ggplot( - quant_data_log_stack, - aes(x = values)) + geom_density(aes(group = ind, colour = ind)) -``` - -### Globally, are phosphopeptide intensities are approximately unimodal? - -<!-- -# ref for bquote below particularly and plotting math expressions generally: -# https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ ---> -```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} - -# identify the location of missing values -fin <- is.finite(as.numeric(as.matrix(quant_data_log))) - -logvalues <- as.numeric(as.matrix(quant_data_log))[fin] -plot( - density(logvalues), - main = bquote( - "Smoothed estimated probability density vs." ~ log[10](intensity)), - xlab = bquote(log[10](intensity)) - ) -hist( - x = as.numeric(as.matrix(quant_data_log)) -, breaks = 100 -, main = bquote("Frequency vs." ~ log[10](intensity)) -, xlab = bquote(log[10](intensity)) -) -``` - -### Distribution of standard deviations of phosphopeptides, ignoring missing values: - -```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} -# determine quantile -q1 <- quantile(logvalues, probs = mean_percentile)[1] - -# determine standard deviation of quantile to impute -sd_finite <- function(x) { - ok <- is.finite(x) - sd(x[ok]) * sd_percentile -} -# 1 = row of matrix (ie, phosphopeptide) -sds <- apply(quant_data_log, 1, sd_finite) -plot( - density(sds, na.rm = T) -, main = "Smoothed estimated probability density vs. std. deviation" -, sub = "(probability estimation made with Gaussian smoothing)" -) - -m1 <- median(sds, na.rm = T) #sd to be used is the median sd - -``` - - - -<!-- -The number of missing values are: ---> -```{r echo = FALSE} -#Determine number of cells to impute -temp <- quant_data[is.na(quant_data)] - -#Determine number of values to impute -number_to_impute <- length(temp) -``` - -<!-- -% of values that are missing: ---> -```{r echo = FALSE} -pct_missing_values <- length(temp) / (length(logvalues) + length(temp)) * 100 -``` - -<!-- -First few rows of data before imputation: ---> -```{r echo = FALSE, results = 'asis'} -cat("\\newpage\n") -``` - -## Parse sample names - -Parse the names of the samples to deduce the factor level for each sample: - -```{r echo = FALSE} - -# prep for trt-median based imputation - -# Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$" -# get factors -> -# group runs (samples) by ignoring terminal [A-Z] in sample names - -m <- regexpr(regex_sample_names, names(quant_data), perl = TRUE) -temp_matches <- regmatches(names(quant_data), m) -print("Extracted sample names") -print(temp_matches) -m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE) -sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) -print("Factor levels") -print(sample_factor_levels) - -``` -## Impute missing values - -```{r echo = FALSE} - -#Determine number of cells to impute -cat("Before imputation,", - sprintf( - "there are:\n %d peptides\n %d missing values (%2.0f%s)", - sum(rep.int(TRUE, nrow(quant_data))), - sum(is.na(quant_data)), - pct_missing_values, - "%" - ) -) - -``` -```{r echo = FALSE} - -#Impute data -quant_data_imp <- quant_data - -# Identify which values are missing and need to be imputed -ind <- which(is.na(quant_data_imp), arr.ind = TRUE) - -``` -```{r echo = FALSE} - -# Apply imputation -switch( - imputation_method -, "group-median" = { - cat("Imputation method:\n substitute missing value", - "with median peptide-intensity for sample-group\n") - - sample_level_integers <- as.integer(sample_factor_levels) - for (i in seq_len(length(levels(sample_factor_levels)))) { - level_cols <- i == sample_level_integers - ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) - quant_data_imp[ind, level_cols] <- - apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] - } - good_rows <- !is.na(rowMeans(quant_data_imp)) - } -, "median" = { - cat("Imputation method:\n substitute missing value with", - "median peptide-intensity across all sample classes\n") - quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] - good_rows <- !is.na(rowMeans(quant_data_imp)) - } -, "mean" = { - cat("Imputation method:\n substitute missing value with", - "mean peptide-intensity across all sample classes\n") - quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] - good_rows <- !is.na(rowMeans(quant_data_imp)) - } -, "random" = { - cat( - "Imputation method:\n substitute missing value with\n ", - sprintf( - "random intensity N ~ (%0.2f, %0.2f)\n" - , q1, m1 - ) - ) - quant_data_imp[is.na(quant_data_imp)] <- - 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) - good_rows <- !is.na(rowMeans(quant_data_imp)) - } -) - -``` -```{r echo = FALSE} - -#Determine number of cells to impute -temp <- quant_data_imp[is.na(quant_data_imp)] -cat("After imputation, there are:", - sprintf( - "\n %d missing values\n %d usable peptides analysis" - , sum(is.na(quant_data_imp[good_rows, ])) - , sum(good_rows) - ), - sprintf( - "\n %d peptides with too many missing values for further analysis" - , sum(!good_rows) - ) -) -``` -```{r echo = FALSE} - - -# Zap rows where imputation was ineffective -full_data <- full_data [good_rows, ] -quant_data <- quant_data [good_rows, ] -quant_data_imp <- quant_data_imp[good_rows, ] - -``` -```{r echo = FALSE} - -d_combined <- (density(as.numeric(as.matrix( - log10(quant_data_imp) -)))) -d_original <- - density(as.numeric(as.matrix( - log10(quant_data_imp[!is.na(quant_data)])))) - -``` -```{r echo = FALSE} - -if (sum(is.na(quant_data)) > 0) { - # There ARE missing values - d_imputed <- - (density(as.numeric(as.matrix( - log10(quant_data_imp[is.na(quant_data)]) - )))) -} else { - # There are NO missing values - d_imputed <- d_combined -} - -``` - -```{r echo = FALSE, fig.dim = c(9, 5)} -ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) -plot( - d_combined, - ylim = ylim, - sub = "Blue = data before imputation; Red = imputed data", - main = "Density vs. log10(intensity) before and after imputation" -) -lines(d_original, col = "blue") -lines(d_imputed, col = "red") -``` - -## Perform Quantile Normalization - -<!-- -# Apply quantile normalization using preprocessCore::normalize.quantiles -# --- -# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html -# except this: https://support.bioconductor.org/p/122925/#9135989 -# says to install it like this: -# ``` -# BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) -# ``` -# conda installation (necessary because of a bug in recent openblas): -# conda install bioconductor-preprocesscore openblas=0.3.3 -# ... -# --- -# normalize.quantiles {preprocessCore} -- Quantile Normalization -# -# Description: -# Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. -# -# Usage: -# normalize.quantiles(x, copy = TRUE, keep.names = FALSE) -# -# Arguments: -# -# - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. -# -# - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, -# but in certain situations not making a copy of the matrix, but instead normalizing -# it in place will be more memory friendly. -# -# - keep.names: Boolean option to preserve matrix row and column names in output. -# -# Details: -# This method is based upon the concept of a quantile-quantile plot extended to n dimensions. -# No special allowances are made for outliers. If you make use of quantile normalization -# please cite Bolstad et al, Bioinformatics (2003). -# -# This functions will handle missing data (ie NA values), based on -# the assumption that the data is missing at random. -# -# Note that the current implementation optimizes for better memory usage -# at the cost of some additional run-time. -# -# Value: A normalized matrix. -# -# Author: Ben Bolstad, bmbolstad.com -# -# References -# -# - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide -# Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf -# -# - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of -# Normalization Methods for High Density Oligonucleotide Array Data Based on Bias -# and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 -# http://bmbolstad.com/misc/normalize/normalize.html -# ... ---> -```{r echo = FALSE} -library(preprocessCore) - -if (TRUE) { - quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) -} else { - quant_data_imp_qn <- as.matrix(quant_data_imp) -} - -quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) -names(quant_data_imp_qn) <- names(quant_data_imp) -quant_data_imp_qn_log <- log10(quant_data_imp_qn) - -rownames(quant_data_imp_qn_log) <- full_data[, 1] - -quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) -any_nan <- function(x) { - !any(x == "NaN") -} -sel <- apply(quant_data_imp_qn_ls, 1, any_nan) -quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] -quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) - -#output quantile normalized data -data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) -write.table( - data_table_imp_qn_lt, - file = paste(paste( - strsplit(imputed_data_filename, ".txt"), "QN_LT", sep = "_" - ), ".txt", sep = ""), - sep = "\t", - col.names = TRUE, - row.names = FALSE -) - -``` - -<!-- ACE insertion begin --> -### Checking that normalized, imputed, log-transformed sample distributions are similar: - -```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} - - -# Save unimputed quant_data_log for plotting below -unimputed_quant_data_log <- quant_data_log - -# log10 transform (after preparing for zero values, -# which should never happen...) -quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 -quant_data_log <- log10(quant_data_imp_qn) - -# Output quantile-normalized log-transformed dataset -# with imputed, normalized data - -data_table_imputed <- cbind(full_data[1:9], quant_data_log) -write.table( - data_table_imputed - , file = imputed_data_filename - , sep = "\t" - , col.names = TRUE - , row.names = FALSE - , quote = FALSE - ) - - - -# data visualization -old_par <- par( - mai = par("mai") + c(0.5, 0, 0, 0) -, oma = par("oma") + c(0.5, 0, 0, 0) -) -boxplot( - quant_data_log -, las = 2 -) -par(old_par) - - - -cat("\\newline\n") -cat("\\newline\n") - -``` - -```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4)} -quant_data_log_stack <- stack(quant_data_log) -ggplot( - quant_data_log_stack, - aes(x = values) - ) + geom_density(aes(group = ind, colour = ind)) -``` - -## Perform ANOVA filters - -(see following pages) - -```{r, echo = FALSE} -# Make new data frame containing only Phosphopeptides -# to connect preANOVA to ANOVA (connect_df) -connect_df <- data.frame( - data_table_imp_qn_lt$Phosphopeptide - , data_table_imp_qn_lt[, first_data_column] - ) -colnames(connect_df) <- c("Phosphopeptide", "Intensity") -``` - -```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} -# Get factors -> group replicates (as indicated by terminal letter) -# by the preceding digits; -# e.g., group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc.. -m <- - regexpr(regex_sample_names, names(quant_data_imp_qn_log), perl = TRUE) - -temp_matches <- regmatches(names(quant_data_imp_qn_log), m) - -number_of_samples <- length(temp_matches) - -m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE) - - -sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) - - -if (length(levels(sample_factor_levels)) < 2) { - cat( - "ERROR!!!! Cannot perform ANOVA analysis", - "because it requires two or more factor levels\n" - ) - cat("Unparsed sample names are:\n") - print(names(quant_data_imp_qn_log)) - cat(sprintf("Parsing rule for SampleNames is '%s'\n", regex_sample_names)) - cat("Parsed names are:\n") - print(temp_matches) - cat(sprintf( - "Parsing rule for SampleGrouping is '%s'\n", - regex_sample_grouping - )) - cat("Sample group assignments are:\n") - print(regmatches(temp_matches, m2)) -} else { - p_value_data_anova_ps <- - apply( - quant_data_imp_qn_log, - 1, - anova_func, - grouping_factor = sample_factor_levels - ) - - p_value_data_anova_ps_fdr <- - p.adjust(p_value_data_anova_ps, method = "fdr") - p_value_data <- data.frame( - phosphopeptide = full_data[, 1] - , - raw_anova_p = p_value_data_anova_ps - , - fdr_adjusted_anova_p = p_value_data_anova_ps_fdr - ) - - # output ANOVA file to constructed filename, - # e.g. "Outputfile_pST_ANOVA_STEP5.txt" - # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" - - # Re-output quantile-normalized log-transformed dataset - # with imputed, normalized data to include p-values - - data_table_imputed <- - cbind(full_data[1:9], p_value_data[, 2:3], quant_data_log) - write.table( - data_table_imputed, - file = imputed_data_filename, - sep = "\t", - col.names = TRUE, - row.names = FALSE, - quote = FALSE - ) - - - p_value_data <- - p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] - - cutoff <- val_fdr[1] - for (cutoff in val_fdr) { - #loop through FDR cutoffs - - filtered_p <- - p_value_data[ - which(p_value_data$fdr_adjusted_anova_p < cutoff), - , - drop = FALSE - ] - filtered_data_filtered <- - quant_data_imp_qn_log[ - rownames(filtered_p), - , - drop = FALSE - ] - filtered_data_filtered <- - filtered_data_filtered[ - order(filtered_p$fdr_adjusted_anova_p), - , - drop = FALSE - ] - - # <!-- ACE insertion start --> - old_oma <- par("oma") - old_par <- par( - mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), - oma = old_oma * c(1, 1, 0.3, 1), - cex.main = 0.9, - cex.axis = 0.7 - ) - - cat("\\newpage\n") - if (nrow(filtered_data_filtered) > 0) { - cat(sprintf( - "Intensities for peptides whose adjusted p-value < %0.2f\n", - cutoff - )) - cat("\\newline\n") - cat("\\newline\n") - - boxplot( - filtered_data_filtered, - main = "Imputed, normalized intensities", # no line plot - las = 2, - ylab = expression(log[10](intensity)) - ) - } else { - cat(sprintf( - "No peptides were found to have cutoff adjusted p-value < %0.2f\n", - cutoff - )) - } - par(old_par) - - if (nrow(filtered_data_filtered) > 0) { - #Add Phosphopeptide column to anova_filtered table - anova_filtered_merge <- merge( - x = connect_df - , - y = filtered_data_filtered - , - by.x = "Intensity" - , - by.y = 1 - ) - anova_filtered_merge_order <- rownames(filtered_p) - - anova_filtered_merge_format <- sapply( - X = filtered_p$fdr_adjusted_anova_p - , - FUN = function(x) { - if (x > 0.0001) - paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") - else - paste0("(%0.4e) %s") - } - ) - - - - anova_filtered <- data.table( - anova_filtered_merge$Phosphopeptide - , - anova_filtered_merge$Intensity - , - anova_filtered_merge[, 2:number_of_samples + 1] - ) - colnames(anova_filtered) <- - c("Phosphopeptide", colnames(filtered_data_filtered)) - - # merge qualitative columns into the ANOVA data - output_table <- data.frame(anova_filtered$Phosphopeptide) - output_table <- merge( - x = output_table - , - y = data_table_imp_qn_lt - , - by.x = "anova_filtered.Phosphopeptide" - , - by.y = "Phosphopeptide" - ) - - #Produce heatmap to visualize significance and the effect of imputation - m <- - as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) - if (nrow(m) > 0) { - rownames_m <- rownames(m) - rownames(m) <- sapply( - X = seq_len(nrow(m)) - , - FUN = function(i) { - sprintf( - anova_filtered_merge_format[i] - , - filtered_p$fdr_adjusted_anova_p[i] - , - rownames_m[i] - ) - } - ) - margins <- c(max(nchar(colnames(m))) * 10 / 16 # col - , max(nchar(rownames(m))) * 5 / 16 # row - ) - how_many_peptides <- min(50, nrow(m)) - - cat("\\newpage\n") - if (nrow(m) > 50) { - cat("Heatmap for the 50 most-significant peptides", - sprintf( - "whose adjusted p-value < %0.2f\n", - cutoff) - ) - } else { - cat("Heatmap for peptides whose", - sprintf("adjusted p-value < %0.2f\n", - cutoff) - ) - } - cat("\\newline\n") - cat("\\newline\n") - op <- par("cex.main") - try( - if (nrow(m) > 1) { - par(cex.main = 0.6) - heatmap( - m[how_many_peptides:1, ], - Rowv = NA, - Colv = NA, - cexRow = 0.7, - cexCol = 0.8, - scale = "row", - margins = margins, - main = - "Heatmap of unimputed, unnormalized intensities", - xlab = "" - ) - } - ) - par(op) - } - } - } -} -``` - -<!-- -## Peptide IDs, etc. - -See output files. --->
--- a/repository_dependencies.xml Tue Mar 15 12:44:04 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -<?xml version="1.0" ?> -<repositories description="Suite for preprocessing and ANOVA of MaxQuant results using LC-MS proteomics data from phosphoproteomic enrichment."> - <repository name="mqppep_preproc" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="07fb0e756c69"/> - <repository name="mqppep_anova" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="6c22e8563a93"/> -</repositories> \ No newline at end of file
--- a/test-data/alpha_levels.tabular Tue Mar 15 12:44:04 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -0.05 -0.1 -0.2
--- a/test-data/test_input_for_anova.tabular Tue Mar 15 12:44:04 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -Phosphopeptide Sequence10 Sequence7 Gene_Name Phosphoresidue UniProt_ID Description Function Phosphoresidue(PSP=PhosphoSitePlus.org) Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains Intensity.shL.1A Intensity.shL.1B Intensity.shL.1C Intensity.shR.2A Intensity.shR.2B Intensity.shR.2C -AAAAPDSRVpSEEENLK MAAAAPDSRVpSEEENLKKTPK AAPDSRVsEEENLKK RRP15 pS11 Q9Y3B9 RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | BARD1 BRCT domain binding | PKA | CK1 | CK2 38150000 39445000 56305000 55338000 7010600 70203000 -AAAITDMADLEELSRLpSPLPPGpSPGSAAR MADLEELSRLpSPLPPGSPGSA; LSRLSPLPPGpSPGSAARGRAE LEELSRLsPLPPGSP | LSPLPPGsPGSAARG AEBP2; AEBP2 pS18, pS24; pS18, pS24 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A N/A 5416400 7101800 385280000 208060000 41426000 352400000 -ADALQAGASQFETpSAAK LQAGASQFETpSAAKLKRKYWW GASQFETsAAKLKRK VAMP2; VAMP3 pS80; pS63 P63027; Q15836 VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKD3 | PKCiota 44627000 41445000 69094000 42521000 5738000 61819000 -DQKLpSELDDR DKVLERDQKLpSELDDRADALQ LERDQKLsELDDRAD VAMP1; VAMP1; VAMP1; VAMP2; VAMP3 pS63; pS63; pS63; pS61; pS44 P23763; P23763-2; P23763-3; P63027; Q15836 VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1; VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A CK2alpha | PKAbeta | PKAgamma | PKCiota | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Pyruvate dehydrogenase kinase substrate 75542000 44814000 32924000 35016000 11023000 4669900 -EFVpSSDESSSGENK SESFKSKEFVpSSDESSSGENK FKSKEFVsSDESSSG SSRP1 pS667 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate | CK2 | GSK3 12562000 16302000 23000000 7857800 0 18830000 -EGMNPSYDEYADpSDEDQHDAYLER MNPSYDEYADpSDEDQHDAYLE SYDEYADsDEDQHDA SSRP1 pS444 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | CK1alpha | Casein kinase II substrate | b-Adrenergic Receptor kinase substrate | Pyruvate dehydrogenase kinase substrate 0 0 0 0 0 0 -IGNEEpSDLEEACILPHpSPINVDK DDEEKIGNEEpSDLEEACILPH; DLEEACILPHpSPINVDKRPIA EKIGNEEsDLEEACI | EACILPHsPINVDKR HERC2 pS1577, pS1588 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | b-Adrenergic Receptor kinase substrate | WW domain binding | ERK/MAPK | CK2 | NEK6 167764000 121218000 155736000 140640000 83642000 128468000 -IRAEEEDLAAVPFLApSDNEEEEDEK EDLAAVPFLApSDNEEEEDEKG AAVPFLAsDNEEEED HERC2 pS2928 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | CK2 22562000 18225000 9119700 11689000 0 0 -KGLLApTpSGNDGTIR VWCNKKGLLApTSGNDGTIRVW; WCNKKGLLATpSGNDGTIRVWN NKKGLLAtSGNDGTI | KKGLLATsGNDGTIR HERC1 pT3445, pS3446 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A N/A 7843600 0 241700000 0 0 10042600 -KpSSLVTSK PTPQDLPQRKpSSLVTSKLAGG; PTPQDLPQRKpSSLVTSKLAG QDLPQRKsSLVTSKL ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS108; pS108; pS124; pS131; pS104; pS104; pS120; pS124 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A G protein-coupled receptor kinase 1 substrate 0 0 18629000 0 0 0 -KSpSLVTSK TPQDLPQRKSpSLVTSKLAGGQ; TPQDLPQRKSpSLVTSKLAG DLPQRKSsLVTSKLA ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS109; pS109; pS125; pS132; pS105; pS105; pS121; pS125 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; protein conformation; SNCA(DISRUPTS) G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Casein Kinase I substrate | MDC1 BRCT domain binding | GSK3 | AURORA 7090300 8341200 9691500 10030000 1675200 9952100 -LpSPNPWQEK MLAVDIEDRLpSPNPWQEKREI VDIEDRLsPNPWQEK HERC2 pS3462 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding 0 11706000 12495000 0 7273000 8877800 -NLLEDDpSDEEEDFFLR SERRNLLEDDpSDEEEDFFLRG RNLLEDDsDEEEDFF VAMP4 pS30 O75379 VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | Casein Kinase I substrate | b-Adrenergic Receptor kinase substrate | BARD1 BRCT domain binding | CK2 | Csnk2a1 1592100000 973800000 1011600000 1450300000 631970000 878760000 -pSQKQEEENPAEETGEEK MpSQKQEEENPAE ______MsQKQEEEN ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS2; pS2; pS2; pS2; pS2; pS2 O43768; O43768-2; O43768-3; O43768-4; O43768-8; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A ATM kinase substrate | PKC kinase substrate | PKA kinase substrate 0 0 8765300 0 2355900 14706000 -QLSEpSFK SKSSSRQLSEpSFKSKEFVSSD SSRQLSEsFKSKEFV SSRP1 pS659 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2a2 | CDK7 | PKCalpha | PKCbeta | DNAPK | PKC kinase substrate | PKA kinase substrate | NEK6 68201000 87774000 138300000 95357000 19966000 149110000 -RGpSLEMSSDGEPLSR SSATSGGRRGpSLEMSSDGEPL TSGGRRGsLEMSSDG AEBP2; AEBP2 pS206; pS206 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A Casein Kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | PKA | GSK3 | AURORA 19262000 11103000 19454000 0 1816900 22028000 -SDGpSLEDGDDVHR IEDGGARSDGpSLEDGDDVHRA GGARSDGsLEDGDDV SERINC1 pS364 Q9NRX5 SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1 N/A Casein kinase II substrate | Plk1 kinase substrate | Pyruvate dehydrogenase kinase substrate | CK1 | PLK | PLK1 31407000 17665000 20892000 23194000 5132400 54893000 -SEpSLTAESR EGGGLMTRSEpSLTAESRLVHT GLMTRSEsLTAESRL HERC1 pS1491 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A b-Adrenergic Receptor kinase substrate 11766000 13176000 20540000 16963000 4364700 21308000 -STGPTAATGpSNRR MSTGPTAATGpSNRRLQQTQNQ GPTAATGsNRRLQQT VAMP3 pS11 Q15836 VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKCalpha | PKCbeta | PKCzeta | PKC kinase substrate | PKA kinase substrate 3057100 4718800 12052000 5047700 1070900 8333500 -TEDLEATpSEHFK RNKTEDLEATpSEHFKTTSQKV TEDLEATsEHFKTTS VAMP8 pS55 Q9BV40 VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1 activity, inhibited; abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate 20400000 9738500 7862300 0 0 76518000 -TFWpSPELK SSMNSIKTFWpSPELKKERVLR NSIKTFWsPELKKER ERC2 pS187 O15083 ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3 N/A IKKalpha | IKKbeta | HIPK2 | Casein Kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding 29764000 20957000 24855000 30752000 8304800 23771000 -YFDpSGDYNMAK CADEMQKYFDpSGDYNMAKAKM; RLQKGQKYFDpSGDYNMAKAKM; MKSVEQKYFDpSGDYNMAKAKM EMQKYFDsGDYNMAK | KGQKYFDsGDYNMAK | VEQKYFDsGDYNMAK ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS67; pS67; pS83; pS90; pS63; pS63; pS79; pS83 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; cell cycle regulation; PPP2CA(INDUCES) b-Adrenergic Receptor kinase substrate 323250000 127970000 0 67123000 12790000 71378000
--- a/workflow/ppenrich_suite_wf.ga Tue Mar 15 12:44:04 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,678 +0,0 @@ -{ - "a_galaxy_workflow": "true", - "annotation": "phoshpoproteomic enrichment data pre-processing and ANOVA", - "creator": [ - { - "class": "Person", - "identifier": "0000-0002-2882-0508", - "name": "Art Eschenlauer" - } - ], - "format-version": "0.1", - "license": "MIT", - "name": "ppenrich_suite_wf", - "steps": { - "0": { - "annotation": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).", - "content_id": null, - "errors": null, - "id": 0, - "input_connections": {}, - "inputs": [ - { - "description": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).", - "name": "Phospho (STY)Sites.txt" - } - ], - "label": "Phospho (STY)Sites.txt", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": -36.30000305175781, - "height": 82.19999694824219, - "left": 150, - "right": 350, - "top": -118.5, - "width": 200, - "x": 150, - "y": -118.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "f4273d40-f2b8-4ad0-8bcc-91e72bd25fe1", - "workflow_outputs": [] - }, - "1": { - "annotation": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)", - "content_id": null, - "errors": null, - "id": 1, - "input_connections": {}, - "inputs": [ - { - "description": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)", - "name": "SwissProt_Human_Canonical_Isoform.fasta" - } - ], - "label": "SwissProt_Human_Canonical_Isoform.fasta", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 278.1000061035156, - "height": 102.60000610351562, - "left": 376, - "right": 576, - "top": 175.5, - "width": 200, - "x": 376, - "y": 175.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"fasta\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "cb31b0ac-cacc-42ee-bd42-f42d0bdae128", - "workflow_outputs": [] - }, - "2": { - "annotation": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)", - "content_id": null, - "errors": null, - "id": 2, - "input_connections": {}, - "inputs": [ - { - "description": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)", - "name": "NetworKIN_cutoffscore2.0.tabular" - } - ], - "label": "NetworKIN_cutoffscore2.0.tabular", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 423.1000061035156, - "height": 102.60000610351562, - "left": 387, - "right": 587, - "top": 320.5, - "width": 200, - "x": 387, - "y": 320.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "e6ec01b8-ff1a-4c90-a064-b40c5cad75bb", - "workflow_outputs": [] - }, - "3": { - "annotation": "Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx", - "content_id": null, - "errors": null, - "id": 3, - "input_connections": {}, - "inputs": [ - { - "description": "Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx", - "name": "pSTY_Motifs.tabular" - } - ], - "label": "pSTY_Motifs.tabular", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 546.6999969482422, - "height": 82.19999694824219, - "left": 399, - "right": 599, - "top": 464.5, - "width": 200, - "x": 399, - "y": 464.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "2c59056a-c1b4-4a20-a194-991d56c8b6c2", - "workflow_outputs": [] - }, - "4": { - "annotation": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", - "content_id": null, - "errors": null, - "id": 4, - "input_connections": {}, - "inputs": [ - { - "description": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", - "name": "PSP_Kinase_Substrate_Dataset.tabular" - } - ], - "label": "PSP_Kinase_Substrate_Dataset.tabular", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 696.1000061035156, - "height": 102.60000610351562, - "left": 420, - "right": 620, - "top": 593.5, - "width": 200, - "x": 420, - "y": 593.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "987a5891-15f1-4f70-89a8-386447f0bf24", - "workflow_outputs": [] - }, - "5": { - "annotation": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", - "content_id": null, - "errors": null, - "id": 5, - "input_connections": {}, - "inputs": [ - { - "description": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", - "name": "PSP_Regulatory_sites.tabular" - } - ], - "label": "PSP_Regulatory_sites.tabular", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 820.6999969482422, - "height": 82.19999694824219, - "left": 436, - "right": 636, - "top": 738.5, - "width": 200, - "x": 436, - "y": 738.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "964d8d21-b063-411a-aee8-372a0d0dfba3", - "workflow_outputs": [] - }, - "6": { - "annotation": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.", - "content_id": null, - "errors": null, - "id": 6, - "input_connections": {}, - "inputs": [ - { - "description": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.", - "name": "alpha_levels.tabular" - } - ], - "label": "alpha_levels.tabular", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 1071.1999969482422, - "height": 82.19999694824219, - "left": 418, - "right": 618, - "top": 989, - "width": 200, - "x": 418, - "y": 989 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "42577db7-d5e5-4f39-b3ad-d0648abb9df3", - "workflow_outputs": [] - }, - "7": { - "annotation": "", - "content_id": "mqppep_preproc", - "errors": null, - "id": 7, - "input_connections": { - "networkin": { - "id": 2, - "output_name": "output" - }, - "p_sty_motifs": { - "id": 3, - "output_name": "output" - }, - "phosphoSites": { - "id": 0, - "output_name": "output" - }, - "protein_fasta": { - "id": 1, - "output_name": "output" - }, - "psp_kinase_substrate": { - "id": 4, - "output_name": "output" - }, - "psp_regulatory_sites": { - "id": 5, - "output_name": "output" - } - }, - "inputs": [ - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "networkin" - }, - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "p_sty_motifs" - }, - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "phosphoSites" - }, - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "protein_fasta" - }, - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "psp_kinase_substrate" - }, - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "psp_regulatory_sites" - } - ], - "label": null, - "name": "MaxQuant Phosphopeptide Preprocessing", - "outputs": [ - { - "name": "phosphoPepIntensities", - "type": "tabular" - }, - { - "name": "enrichGraph", - "type": "pdf" - }, - { - "name": "locProbCutoffGraph", - "type": "pdf" - }, - { - "name": "enrichGraph_svg", - "type": "svg" - }, - { - "name": "locProbCutoffGraph_svg", - "type": "svg" - }, - { - "name": "filteredData_tabular", - "type": "tabular" - }, - { - "name": "quantData_tabular", - "type": "tabular" - }, - { - "name": "mapped_phophopeptides", - "type": "tabular" - }, - { - "name": "melted_phophopeptide_map", - "type": "tabular" - }, - { - "name": "mqppep_output_sqlite", - "type": "sqlite" - }, - { - "name": "preproc_tab", - "type": "tabular" - }, - { - "name": "preproc_csv", - "type": "csv" - }, - { - "name": "preproc_sqlite", - "type": "sqlite" - } - ], - "position": { - "bottom": 964.0999755859375, - "height": 793.5999755859375, - "left": 826.5, - "right": 1026.5, - "top": 170.5, - "width": 200, - "x": 826.5, - "y": 170.5 - }, - "post_job_actions": { - "RenameDatasetActionenrichGraph": { - "action_arguments": { - "newname": "#{phosphoSites}.enrichGraph_pdf" - }, - "action_type": "RenameDatasetAction", - "output_name": "enrichGraph" - }, - "RenameDatasetActionenrichGraph_svg": { - "action_arguments": { - "newname": "#{phosphoSites}.enrichGraph_svg" - }, - "action_type": "RenameDatasetAction", - "output_name": "enrichGraph_svg" - }, - "RenameDatasetActionfilteredData_tabular": { - "action_arguments": { - "newname": "#{phosphoSites}.filteredData" - }, - "action_type": "RenameDatasetAction", - "output_name": "filteredData_tabular" - }, - "RenameDatasetActionlocProbCutoffGraph": { - "action_arguments": { - "newname": "#{phosphoSites}.locProbCutoffGraph_pdf" - }, - "action_type": "RenameDatasetAction", - "output_name": "locProbCutoffGraph" - }, - "RenameDatasetActionlocProbCutoffGraph_svg": { - "action_arguments": { - "newname": "#{phosphoSites}.locProbCutoffGraph_svg" - }, - "action_type": "RenameDatasetAction", - "output_name": "locProbCutoffGraph_svg" - }, - "RenameDatasetActionmapped_phophopeptides": { - "action_arguments": { - "newname": "#{phosphoSites}.ppep_map" - }, - "action_type": "RenameDatasetAction", - "output_name": "mapped_phophopeptides" - }, - "RenameDatasetActionmelted_phophopeptide_map": { - "action_arguments": { - "newname": "#{phosphoSites}.melted" - }, - "action_type": "RenameDatasetAction", - "output_name": "melted_phophopeptide_map" - }, - "RenameDatasetActionmqppep_output_sqlite": { - "action_arguments": { - "newname": "#{phosphoSites}.ppep_mapping_sqlite" - }, - "action_type": "RenameDatasetAction", - "output_name": "mqppep_output_sqlite" - }, - "RenameDatasetActionphosphoPepIntensities": { - "action_arguments": { - "newname": "#{phosphoSites}.ppep_intensities" - }, - "action_type": "RenameDatasetAction", - "output_name": "phosphoPepIntensities" - }, - "RenameDatasetActionpreproc_csv": { - "action_arguments": { - "newname": "#{phosphoSites}.preproc_csv" - }, - "action_type": "RenameDatasetAction", - "output_name": "preproc_csv" - }, - "RenameDatasetActionpreproc_sqlite": { - "action_arguments": { - "newname": "#{phosphoSites}.preproc_sqlite" - }, - "action_type": "RenameDatasetAction", - "output_name": "preproc_sqlite" - }, - "RenameDatasetActionpreproc_tab": { - "action_arguments": { - "newname": "#{phosphoSites}.preproc_tab" - }, - "action_type": "RenameDatasetAction", - "output_name": "preproc_tab" - }, - "RenameDatasetActionquantData_tabular": { - "action_arguments": { - "newname": "#{phosphoSites}.quantData" - }, - "action_type": "RenameDatasetAction", - "output_name": "quantData_tabular" - } - }, - "tool_id": "mqppep_preproc", - "tool_state": "{\"collapseFunc\": \"sum\", \"intervalCol\": \"1\", \"localProbCutoff\": \"0.75\", \"merge_function\": \"sum\", \"networkin\": {\"__class__\": \"RuntimeValue\"}, \"p_sty_motifs\": {\"__class__\": \"RuntimeValue\"}, \"phosphoCol\": \"^Number of Phospho [(]STY[)]$\", \"phosphoSites\": {\"__class__\": \"RuntimeValue\"}, \"protein_fasta\": {\"__class__\": \"RuntimeValue\"}, \"psp_kinase_substrate\": {\"__class__\": \"RuntimeValue\"}, \"psp_regulatory_sites\": {\"__class__\": \"RuntimeValue\"}, \"pst_not_py\": \"true\", \"species\": \"human\", \"startCol\": \"^Intensity[^_]\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", - "tool_version": null, - "type": "tool", - "uuid": "886043ce-8d9b-474e-b970-4fe9ee6a74fa", - "workflow_outputs": [ - { - "label": "ppep_intensities", - "output_name": "phosphoPepIntensities", - "uuid": "e19a64d1-edee-4119-a72e-456af7a6c056" - }, - { - "label": "enrichGraph_pdf", - "output_name": "enrichGraph", - "uuid": "7e9936d9-9617-4df4-9133-7a04f8d05d26" - }, - { - "label": "locProbCutoffGraph_pdf", - "output_name": "locProbCutoffGraph", - "uuid": "5656cba7-25e2-4362-ae92-1ddac67dee07" - }, - { - "label": "enrichGraph_svg", - "output_name": "enrichGraph_svg", - "uuid": "ca13a22e-a41b-481c-ab87-1f97bbf768e9" - }, - { - "label": "locProbCutoffGraph_svg", - "output_name": "locProbCutoffGraph_svg", - "uuid": "fc7a11f5-30d8-4409-878a-d3b70366711c" - }, - { - "label": "filteredData", - "output_name": "filteredData_tabular", - "uuid": "aab49fc5-a3cf-4479-ac23-8e9272dadf28" - }, - { - "label": "quantData", - "output_name": "quantData_tabular", - "uuid": "23940202-403e-4256-916b-92539db07cdb" - }, - { - "label": "ppep_map", - "output_name": "mapped_phophopeptides", - "uuid": "08ad13d4-c103-4f18-92cc-2c3b58565981" - }, - { - "label": "melted_phosphopeptide_map", - "output_name": "melted_phophopeptide_map", - "uuid": "77cecaeb-8f7c-482e-b78a-e4809b194eb7" - }, - { - "label": "ppep_mapping_sqlite", - "output_name": "mqppep_output_sqlite", - "uuid": "8e53e05a-a47c-4b97-87e4-ebab133ccaea" - }, - { - "label": "preproc_tab", - "output_name": "preproc_tab", - "uuid": "530a8140-9eba-4c87-a76b-4922febc12e7" - }, - { - "label": "preproc_csv", - "output_name": "preproc_csv", - "uuid": "c5f22f05-0bf7-48cf-adc0-c2beffe33169" - }, - { - "label": "preproc_sqlite", - "output_name": "preproc_sqlite", - "uuid": "53424150-7673-40af-ad60-0b4035e0c302" - } - ] - }, - "8": { - "annotation": "Perform ANOVA. For imputing missing values, use median of non-missing values from the same treatment group.", - "content_id": "mqppep_anova", - "errors": null, - "id": 8, - "input_connections": { - "alpha_file": { - "id": 6, - "output_name": "output" - }, - "input_file": { - "id": 7, - "output_name": "preproc_tab" - } - }, - "inputs": [], - "label": "MaxQuant Phosphopeptide ANOVA group-median imputed", - "name": "MaxQuant Phosphopeptide ANOVA", - "outputs": [ - { - "name": "imputed_data_file", - "type": "tabular" - }, - { - "name": "report_file", - "type": "html" - } - ], - "position": { - "bottom": 1349, - "height": 256, - "left": 1058, - "right": 1258, - "top": 1093, - "width": 200, - "x": 1058, - "y": 1093 - }, - "post_job_actions": { - "RenameDatasetActionimputed_data_file": { - "action_arguments": { - "newname": "#{input_file}.intensities_group-mean-imputed_QN_LT" - }, - "action_type": "RenameDatasetAction", - "output_name": "imputed_data_file" - }, - "RenameDatasetActionreport_file": { - "action_arguments": { - "newname": "#{input_file}.intensities_group-mean-imputed_report (download/unzip to view)" - }, - "action_type": "RenameDatasetAction", - "output_name": "report_file" - } - }, - "tool_id": "mqppep_anova", - "tool_state": "{\"alpha_file\": {\"__class__\": \"ConnectedValue\"}, \"first_data_column\": \"Intensity\", \"imputation\": {\"imputation_method\": \"group-median\", \"__current_case__\": 0}, \"input_file\": {\"__class__\": \"ConnectedValue\"}, \"sample_grouping_regex\": \"(\\\\d+)\", \"sample_names_regex\": \"\\\\.(\\\\d+)[A-Z]$\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", - "tool_version": null, - "type": "tool", - "uuid": "a3cb902d-8ef6-4f84-bed3-80b2b20d1916", - "workflow_outputs": [ - { - "label": "intensities_group-mean-imputed_QN_LT", - "output_name": "imputed_data_file", - "uuid": "ef19dcd3-8f3e-4fc4-829e-dae6719ff1cc" - }, - { - "label": "intensities_group-mean-imputed_report", - "output_name": "report_file", - "uuid": "26bb93b0-bc11-4455-a280-241253b21981" - } - ] - }, - "9": { - "annotation": "Perform ANOVA. For imputing missing values, create random values.", - "content_id": "mqppep_anova", - "errors": null, - "id": 9, - "input_connections": { - "alpha_file": { - "id": 6, - "output_name": "output" - }, - "input_file": { - "id": 7, - "output_name": "preproc_tab" - } - }, - "inputs": [], - "label": "MaxQuant Phosphopeptide ANOVA randomly imputed", - "name": "MaxQuant Phosphopeptide ANOVA", - "outputs": [ - { - "name": "imputed_data_file", - "type": "tabular" - }, - { - "name": "report_file", - "type": "html" - } - ], - "position": { - "bottom": 1186, - "height": 256, - "left": 1308, - "right": 1508, - "top": 930, - "width": 200, - "x": 1308, - "y": 930 - }, - "post_job_actions": { - "RenameDatasetActionimputed_data_file": { - "action_arguments": { - "newname": "#{input_file}.intensities_randomly-imputed_QN_LT" - }, - "action_type": "RenameDatasetAction", - "output_name": "imputed_data_file" - }, - "RenameDatasetActionreport_file": { - "action_arguments": { - "newname": "#{input_file}.intensities_randomly-imputed_report (download/unzip to view)" - }, - "action_type": "RenameDatasetAction", - "output_name": "report_file" - } - }, - "tool_id": "mqppep_anova", - "tool_state": "{\"alpha_file\": {\"__class__\": \"ConnectedValue\"}, \"first_data_column\": \"Intensity\", \"imputation\": {\"imputation_method\": \"random\", \"__current_case__\": 3, \"meanPercentile\": \"1\", \"sdPercentile\": \"0.2\"}, \"input_file\": {\"__class__\": \"ConnectedValue\"}, \"sample_grouping_regex\": \"(\\\\d+)\", \"sample_names_regex\": \"\\\\.(\\\\d+)[A-Z]$\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", - "tool_version": null, - "type": "tool", - "uuid": "217d92af-f6d6-4fd3-a78a-090d8afd3ae0", - "workflow_outputs": [ - { - "label": "intensities_randomly-imputed_QN_LT", - "output_name": "imputed_data_file", - "uuid": "925d734f-f9d8-49e8-aebb-c8d7598d45b2" - }, - { - "label": "intensities_randomly-imputed_report", - "output_name": "report_file", - "uuid": "4ab5f1b1-d04e-4634-8765-265122bc1064" - } - ] - } - }, - "tags": [ - "ppenrich" - ], - "uuid": "c54c2b2e-8080-445c-bc3e-43950c89d4e4", - "version": 3 -} \ No newline at end of file