Galaxy |

Changeset 0:c1403d18c189 (2022-03-07)

Next changeset 1:5ccf4e985c6a (2022-03-07)

Commit message:
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"

added:
MaxQuantProcessingScript.R
PhosphoPeptide_Upstream_Kinase_Mapping.pl
macros.xml
mqppep_anova.R
mqppep_anova.xml
mqppep_anova_script.Rmd
mqppep_mrgfltr.py
search_ppep.py
test-data/alpha_levels.tabular
test-data/pSTY_motifs.tabular
test-data/test_input_for_anova.tabular
test-data/test_input_for_preproc.tabular
test-data/test_kinase_substrate.tabular
test-data/test_networkin.tabular
test-data/test_regulatory_sites.tabular
test-data/test_swissprot.fasta
workflow/ppenrich_suite_wf.ga

diff -r 000000000000 -r c1403d18c189 MaxQuantProcessingScript.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MaxQuantProcessingScript.R Mon Mar 07 19:05:01 2022 +0000

[

b'@@ -0,0 +1,500 @@\n+#!/usr/bin/env Rscript\n+\n+# This is the implementation for the \n+# "MaxQuant Phosphopeptide Localization Probability Cutoff"\n+# Galaxy tool (mqppep_lclztn_filter)\n+# It is adapted from the MaxQuant Processing Script written by Larry Cheng.\n+\n+# libraries\n+library(optparse)\n+library(data.table)\n+library(stringr)\n+library(ggplot2)\n+#library(PTXQC)\n+#require(PTXQC)\n+#require(methods)\n+\n+# title: "MaxQuant Processing Script"\n+# author: "Larry Cheng"\n+# date: "February 19, 2018"\n+#\n+# # MaxQuant Processing Script\n+# Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order):\n+# 1) Runs the Proteomics Quality Control software\n+# 2) Remove contaminant and reverse sequence rows\n+# 3) Filters rows based on localization probability\n+# 4) Extract the quantitative data\n+# 5) Sequences phosphopeptides\n+# 6) Merges multiply phosphorylated peptides\n+# 7) Filters out phosphopeptides based on enrichment\n+# The output file contains the phosphopeptide (first column) and the quantitative values for each sample\n+#\n+# ## Revision History\n+# Rev. 2022-02-10 :wrap for inclusion in Galaxy\n+# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script"\n+# Rev. 2017-12-12 :added PTXQC\n+# added additional plots and table outputs for quality control\n+# allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <-\n+# "\\\\.(\\\\d+)[A-Z]$"\n+# converted from .r to .rmd file to knit report for quality control\n+# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times\n+# Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows\n+# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output\n+# Rev. 
2016-08-22 :changed regexpression to regexSampleNames <- "\\\\.(\\\\d+)[AB]$" so that it looks at the end of string\n+# Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)\n+# Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA\n+# Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75\n+# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards\n+# Rev. 2016-06-21 :test18 + standardized the regexpression in protocol\n+\n+\n+### FUNCTION DECLARATIONS begin ----------------------------------------------\n+\n+# Read first line of file at filePath\n+# adapted from: https://stackoverflow.com/a/35761217/15509512\n+readFirstLine <- function(filepath) {\n+ con = file(filepath, "r")\n+ line = readLines(con, n = 1)\n+ close(con)\n+ return(line)\n+}\n+\n+# Move columns to the end of dataframe\n+# - data: the dataframe\n+# - move: a vector of column names, each of which is an element of names(data)\n+movetolast <- function(data, move) {\n+ data[c(setdiff(names(data), move), move)]\n+}\n+\n+# Generate phosphopeptide and build list when applied\n+phosphopeptide_func <- function(df) {\n+\n+ #generate peptide sequence and list of phosphopositions\n+ phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]\n+ output <- vector()\n+ phosphopeptide <- ""\n+ counter <- 0 #keep track of position in peptide\n+ phosphopositions <- vector() #keep track of phosphorylation positions in peptide\n+ score_diff <- ""\n+ for (chara in phosphoprobsequence){\n+ #build peptide sequence\n+ if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) {\n+ phosphopeptide <- paste(phosphopeptide,chara,sep="")\n+ counter <- counter + 1\n+ }\n+ #generate score_diff\n+ if (chara == "-" | chara =="." 
| chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7'..b'teger(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames\n+colnames(merge_df) <- c("rn", "Phosphopeptide")\n+# ...\n+\n+\n+# Add Phosphopeptide column to quant columns for quality control checking\n+# ---\n+quantData_qc <- as.data.frame(quantData)\n+setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data\n+quantData_qc$rn <- as.integer(quantData_qc$rn)\n+quantData_qc <- merge(merge_df,quantData_qc, by="rn")\n+quantData_qc$rn <- NULL #remove rn column\n+# ...\n+\n+\n+# Collapse multiphosphorylated peptides\n+# ---\n+quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide")\n+quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN)\n+# ...\n+\n+\n+# Compute (as string) % of phosphopeptides that are multiphosphorylated (for use in next step)\n+# ---\n+pct_multiphos <- (nrow(quantData_qc) - nrow(quantData_qc_collapsed)) / (2 * nrow(quantData_qc))\n+pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")\n+# ...\n+\n+\n+# Compute and visualize breakdown of pY, pS, and pT before enrichment filter\n+# ---\n+pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]\n+pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),]\n+pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]\n+\n+pY_num <- nrow(pY_data)\n+pS_num <- nrow(pS_data)\n+pT_num <- nrow(pT_data)\n+\n+# Visualize enrichment\n+enrichGraphData <- data.frame(\n+ group = c("pY", "pS", "pT"),\n+ value = c(pY_num, pS_num, pT_num)\n+)\n+\n+enrichGraphData <- enrichGraphData[enrichGraphData$value > 0,]\n+\n+# Plot pie chart with legend\n+# start: https://stackoverflow.com/a/62522478/15509512\n+# refine: https://www.statology.org/ggplot-pie-chart/\n+# colors: 
https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8\n+slices <- enrichGraphData$value\n+phosphoresidue <- enrichGraphData$group\n+pct <- round(100 * slices / sum(slices))\n+lbls <- paste(enrichGraphData$group,"\\n",pct, "%\\n(", slices, ")", sep="")\n+slc_ctr <- c()\n+run_tot <- 0\n+for (p in pct) {\n+ slc_ctr <- c(slc_ctr, run_tot + p/2.0)\n+ run_tot <- run_tot + p\n+}\n+lbl_y <- 100 - slc_ctr\n+df <- data.frame(slices, pct, lbls, phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))\n+gigi <- ggplot(\n+ df\n+, aes(x = 1, y = pct, fill = phosphoresidue)) +\n+ geom_col(position = "stack", orientation = "x") +\n+ geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") +\n+ coord_polar(theta = "y", direction = -1) +\n+ labs(\n+ x = NULL\n+ , y = NULL\n+ , title = "Percentages (and counts) of phosphosites, by type of residue"\n+ , caption = sprintf("Roughly %s of peptides have multiple phosphosites.", pct_multiphos)\n+ ) +\n+ labs(x = NULL, y = NULL, fill = NULL) +\n+ theme_classic() +\n+ theme( legend.position="right"\n+ , axis.line = element_blank()\n+ , axis.text = element_blank()\n+ , axis.ticks = element_blank()\n+ , plot.title = element_text(hjust = 0.5)\n+ , plot.subtitle = element_text(hjust = 0.5)\n+ , plot.caption = element_text(hjust = 0.5)\n+ , plot.title.position = "plot"\n+ ) +\n+ scale_fill_manual(breaks = phosphoresidue, values=c("#c7eae5", "#f6e8c3", "#dfc27d"))\n+\n+pdf(enrichGraphFilename)\n+print(gigi)\n+dev.off()\n+svg(enrichGraphFilename_svg)\n+print(gigi)\n+dev.off()\n+# ...\n+\n+\n+# Filter phosphopeptides by enrichment\n+# --\n+if (enriched == "Y"){\n+ quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]\n+} else if ( enriched == "ST" ) {\n+ quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]\n+} else {\n+ print("Error in enriched variable. 
Set to either \'Y\' or \'ST\'")\n+}\n+# ...\n+\n+\n+# Write phosphopeptides filtered by enrichment\n+# --\n+write.table(quantData_qc_enrichment, file=outputfilename, sep="\\t", quote = FALSE, row.names = FALSE)\n+# ...\n'

diff -r 000000000000 -r c1403d18c189 PhosphoPeptide_Upstream_Kinase_Mapping.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl Mon Mar 07 19:05:01 2022 +0000

[

b'@@ -0,0 +1,2124 @@\n+#!/usr/local/bin/perl\r\n+###############################################################################################################################\r\n+# perl Kinase_enrichment_analysis_complete_v0.pl\r\n+#\r\n+# Nick Graham, USC\r\n+# 2016-02-27\r\n+#\r\n+# Built from scripts written by NG at UCLA in Tom Graeber\'s lab:\r\n+# CombinePhosphoSites.pl\r\n+# Retrieve_p_motifs.pl\r\n+# NetworKIN_Motif_Finder_v7.pl\r\n+#\r\n+# Given a list of phospho-peptides, find protein information and upstream kinases.\r\n+# Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl\r\n+#\r\n+# Updated 2022-01-13, Art Eschenlauer, UMN on behalf of Justin Drake\'s lab:\r\n+# Added warnings and used strict;\r\n+# fixed some code paths resulting in more NetworKIN matches;\r\n+# applied Aho-Corasick algorithm (via external Python script because Perl implementation was still too slow)\r\n+# to speed up "Match the non_p_peptides to the @sequences array";\r\n+# added support for SQLite-formatted UniProtKB/Swiss-Prot data as an alternative to FASTA-formatted data;\r\n+# added support for SQLite output in addition to tabular files.\r\n+#\r\n+#\r\n+###############################################################################################################################\r\n+\r\n+use strict;\r\n+use warnings;\r\n+\r\n+use Getopt::Std;\r\n+use DBD::SQLite::Constants qw/:file_open/;\r\n+use DBI qw(:sql_types);\r\n+use File::Copy;\r\n+use File::Basename;\r\n+use POSIX qw(strftime);\r\n+use Time::HiRes qw(gettimeofday);\r\n+#use Data::Dump qw(dump);\r\n+\r\n+my $USE_SEARCH_PPEP_PY = 1;\r\n+\r\n+my $dirname = dirname(__FILE__);\r\n+my %opts;\r\n+my ($file_in, $average_or_sum, $db_out, $file_out, $file_melt, $phospho_type);\r\n+my $dbtype;\r\n+my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in);\r\n+my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n);\r\n+my $line = 
0;\r\n+my @failed_match = ("Failed match");\r\n+my @failed_matches;\r\n+my (%all_data);\r\n+my (@p_peptides, @non_p_peptides);\r\n+my @parsed_fasta;\r\n+my (@accessions, @names, @sequences, @databases, $database);\r\n+my ($dbfile, $dbh, $stmth);\r\n+my @col_names;\r\n+my (%matched_sequences, %accessions, %names, %sites, );\r\n+my (@tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites);\r\n+my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues);\r\n+my (@kinases_observed, $kinases);\r\n+my (@kinases_observed_lbl, @phosphosites_observed_lbl);\r\n+my ($p_sequence_kinase, $p_sequence, $kinase);\r\n+my (@motif_sequence, %motif_type, %motif_count);\r\n+my (@kinases_PhosphoSite, $kinases_PhosphoSite);\r\n+my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite);\r\n+my (%regulatory_sites_PhosphoSite_hash);\r\n+#ACE my %psp_regsite_protein;\r\n+my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism);\r\n+my (%unique_motifs);\r\n+my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches);\r\n+my %psp_regsite_protein_2;\r\n+my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2);\r\n+my @timeData;\r\n+my $PhosphoSitePlusCitation;\r\n+my %site_description;\r\n+\r\n+my %kinase_substrate_NetworKIN_matches;\r\n+my %kinase_motif_matches;\r\n+my $regulatory_sites_PhosphoSite;\r\n+my ($seq_plus5aa, $seq_plus7aa, %seq_plus7aa_2);\r\n+my %kinase_substrate_PhosphoSite_matches;\r\n+my @formatted_sequence;\r\n+my $pSTY_sequence;\r\n+my $i;\r\n+my @a;\r\n+my $use_sqlite;\r\n+my $verbose;\r\n+\r\n+##########\r\n+## opts ##\r\n+##########\r\n+ ## input files\r\n+ # i : path to input file, e.g., \'outputfile_STEP2.txt\'\r\n+ # f : path to UniProtKB/SwissProt FASTA\r\n+ # s : optional species argument\r\n+ # n : path to 
NetworKIN_201612_cutoffscore2.0.txt\r\n+ # m : path to pSTY_Motifs.txt\r\n+ # p : path to 2017-03_PSP_Kinase_Sub'..b'$ppep_gene_site_stmth->bind_param(4, $SITE_KINASE_SUBSTRATE); # ppep_gene_site.site_type_id\r\n+ if (not $ppep_gene_site_stmth->execute()) {\r\n+ print "Error writing tuple ($peptide,$gene_names,$kinases_observed[$i]): $ppep_gene_site_stmth->errstr\\n";\r\n+ }\r\n+ # ...\r\n+ # end store-to-SQLite "ppep_gene_site" table\r\n+ }\r\n+ else { print OUT "\\t";}\r\n+ }\r\n+ #ACE my %wrote_motif = {};\r\n+ my %wrote_motif;\r\n+ my $motif_parts_0;\r\n+ for my $i (0 .. $#motif_sequence) {\r\n+ if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) {\r\n+ print OUT "X\\t";\r\n+ #ACE my @motif_parts = split(/ motif /, $motif_type{$motif_sequence[$i]});\r\n+ $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i];\r\n+ my $key = "$peptide\\t$gene_names\\t$motif_parts_0";\r\n+ if (!exists($wrote_motif{$key})) {\r\n+ $wrote_motif{$key} = $key;\r\n+ print MELT "$peptide\\t$gene_names\\t$site_description{$SITE_MOTIF}\\t$motif_parts_0\\n";\r\n+ # print "Line 657: i is $i\\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\\n"; #debug\r\n+ # begin store-to-SQLite "ppep_gene_site" table\r\n+ # ---\r\n+ $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id\r\n+ $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names\r\n+ $ppep_gene_site_stmth->bind_param(3, $motif_parts_0); # ppep_gene_site.kinase_map\r\n+ $ppep_gene_site_stmth->bind_param(4, $SITE_MOTIF); # ppep_gene_site.site_type_id\r\n+ if (not $ppep_gene_site_stmth->execute()) {\r\n+ print "Error writing tuple ($peptide,$gene_names,$motif_parts_0): $ppep_gene_site_stmth->errstr\\n";\r\n+ }\r\n+ # ...\r\n+ # end store-to-SQLite "ppep_gene_site" table\r\n+ }\r\n+ }\r\n+ else { print OUT "\\t";}\r\n+ }\r\n+ for my $i (0 .. 
$#kinases_PhosphoSite) {\r\n+ if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]})) {\r\n+ print MELT "$peptide\\t$gene_names\\t$site_description{$SITE_PHOSPHOSITE}\\t$phosphosites_observed_lbl[$i]\\n";\r\n+ if ($i < $#kinases_PhosphoSite) {\r\n+ print OUT "X\\t";\r\n+ }\r\n+ else {\r\n+ print OUT "X\\n";\r\n+ }\r\n+ # begin store-to-SQLite "ppep_gene_site" table\r\n+ # ---\r\n+ $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id\r\n+ $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names\r\n+ $ppep_gene_site_stmth->bind_param(3, $phosphosites_observed_lbl[$i]); # ppep_gene_site.kinase_map\r\n+ $ppep_gene_site_stmth->bind_param(4, $SITE_PHOSPHOSITE); # ppep_gene_site.site_type_id\r\n+ if (not $ppep_gene_site_stmth->execute()) {\r\n+ print "Error writing tuple ($peptide,$gene_names,$phosphosites_observed_lbl[$i]): $ppep_gene_site_stmth->errstr\\n";\r\n+ }\r\n+ # ...\r\n+ # end store-to-SQLite "ppep_gene_site" table\r\n+ }\r\n+ else {\r\n+ if ($i < $#kinases_PhosphoSite) {\r\n+ print OUT "\\t";\r\n+ }\r\n+ elsif ($i == $#kinases_PhosphoSite) {\r\n+ print OUT "\\n";\r\n+ }\r\n+ }\r\n+ }\r\n+}\r\n+\r\n+close OUT;\r\n+close MELT;\r\n+$ppep_gene_site_stmth->finish;\r\n+print "begin DB commit at " . format_localtime_iso8601() . "\\n";\r\n+$dbh->{AutoCommit} = $auto_commit;\r\n+$dbh->disconnect if ( defined $dbh );\r\n+\r\n+print "\\nFinished writing output at " . format_localtime_iso8601() ."\\n\\n";\r\n+\r\n+###############################################################################################################################\r\n'

diff -r 000000000000 -r c1403d18c189 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Mar 07 19:05:01 2022 +0000

@@ -0,0 +1,4 @@
+<macros>
+ <token name="@TOOL_VERSION@">0.1.0</token>
+ <token name="@VERSION_SUFFIX@">0</token>
+</macros>

diff -r 000000000000 -r c1403d18c189 mqppep_anova.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova.R Mon Mar 07 19:05:01 2022 +0000

[

@@ -0,0 +1,191 @@
+#!/usr/bin/env Rscript
+# libraries
+library(optparse)
+library(data.table)
+library(stringr)
+#library(ggplot2)
+#library(PTXQC)
+#require(PTXQC)
+#require(methods)
+# bioconductor-preprocesscore
+#  - libopenblas
+#  - r-data.table
+#  - r-rmarkdown
+#  - r-ggplot2
+#  - texlive-core
+
+# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
+
+# parse options
+# Command-line interface of this wrapper script.
+# Every option takes exactly one value (action "store"); the parsed values are
+# collected in `args`, keyed by the long option name.
+option_list <- list(
+  # Input table of phosphopeptide intensities.
+  # Galaxy: <param name="inputFilename" type="data" format="tabular" label="Phosphopeptide Intensities" help="First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument regexSampleNames"/>
+  make_option(
+    c("-i", "--inputFile"),
+    type = "character",
+    default = NA,
+    action = "store",
+    help = "Phosphopeptide Intensities sparse input file path"
+  ),
+  # File listing the alpha (significance) cutoffs.
+  make_option(
+    c("-a", "--alphaFile"),
+    type = "character",
+    default = NA,
+    action = "store",
+    help = "List of alpha cutoff values for significance testing; path to text file having one column and no header"
+  ),
+  # Kept as text here; it is forwarded unchanged to the R Markdown document.
+  make_option(
+    c("-f", "--firstDataColumn"),
+    type = "character",
+    default = "10",
+    action = "store",
+    help = "First column of intensity values"
+  ),
+  # One of c("group-median","median","mean","random"); the first is the default.
+  make_option(
+    c("-m", "--imputationMethod"),
+    type = "character",
+    default = "group-median",
+    action = "store",
+    help = "Method for missing-value imputation, one of c('group-median','median','mean','random')"
+  ),
+  make_option(
+    c("-p", "--meanPercentile"),
+    type = "integer",
+    default = 3,
+    action = "store",
+    help = "Mean percentile for randomly generated imputed values; range [1,99]"
+  ),
+  make_option(
+    c("-d", "--sdPercentile"),
+    type = "double",
+    default = 3,
+    action = "store",
+    help = "Adjustment value for standard deviation of randomly generated imputed values; real"
+  ),
+  make_option(
+    c("-s", "--regexSampleNames"),
+    type = "character",
+    default = "\\.(\\d+)[A-Z]$",
+    action = "store",
+    help = "Regular expression extracting sample-names"
+  ),
+  make_option(
+    c("-g", "--regexSampleGrouping"),
+    type = "character",
+    default = "(\\d+)",
+    action = "store",
+    help = "Regular expression extracting sample-group from an extracted sample-name"
+  ),
+  # Output table of imputed intensities.
+  # Galaxy: <data name="imputed_data_file" format="tabular" label="${input_file.name}.intensities_${imputation.imputation_method}-imputed_QN_LT" ></data>
+  make_option(
+    c("-o", "--imputedDataFile"),
+    type = "character",
+    default = "output_imputed.tsv",
+    action = "store",
+    help = "Imputed Phosphopeptide Intensities output file path"
+  ),
+  # Output report.
+  # Galaxy: <data name="report_file" format="html" label="report (download/unzip to view)" ></data>
+  make_option(
+    c("-r", "--reportFile"),
+    type = "character",
+    default = "QuantDataProcessingScript.html",
+    action = "store",
+    help = "HTML report file path"
+  )
+)
+
+# Parse the command line against the option list above.
+args <- parse_args(OptionParser(option_list = option_list))
+# Check parameter values
+
+# The input file must exist; fail early with a readable message otherwise.
+if (! file.exists(args$inputFile)) {
+  stop((paste("Input file", args$inputFile, "does not exist")))
+}
+inputFile <- args$inputFile
+alphaFile <- args$alphaFile
+firstDataColumn <- args$firstDataColumn
+imputationMethod <- args$imputationMethod
+meanPercentile <- args$meanPercentile
+sdPercentile <- args$sdPercentile
+
+# Resolve a regular-expression option to the pattern itself.
+# - arg_value: either the path of a file whose content is the pattern, or the
+#   pattern itself (which is what the option defaults are).
+# Returns the pattern with leading and trailing blanks, tabs, CRs, and
+# newlines removed.
+# Previously the value was always handed to readChar(), so the literal-regex
+# defaults could never be used.
+read_regex_arg <- function(arg_value) {
+  # A directory (e.g. ".") "exists" but cannot be read as a pattern file.
+  is_pattern_file <- file.exists(arg_value) && !dir.exists(arg_value)
+  pattern <- if (is_pattern_file) {
+    # at most the first 1000 characters of the file are used
+    readChar(arg_value, 1000)
+  } else {
+    arg_value
+  }
+  trimws(pattern, which = "both", whitespace = "[ \t\r\n]")
+}
+
+regexSampleNames <- read_regex_arg(args$regexSampleNames)
+cat(regexSampleNames)
+cat('\n')
+
+regexSampleGrouping <- read_regex_arg(args$regexSampleGrouping)
+cat(regexSampleGrouping)
+cat('\n')
+
+imputedDataFilename <- args$imputedDataFile
+reportFileName <- args$reportFile
+
+# Echo the effective settings to the job log.
+# str() prints its argument itself and returns NULL, so it needs no cat().
+print("args is:")
+str(args)
+
+print("regexSampleNames is:")
+str(regexSampleNames)
+
+print("regexSampleGrouping is:")
+str(regexSampleGrouping)
+
+# from: https://github.com/molgenis/molgenis-pipelines/wiki/How-to-source-another_file.R-from-within-your-R-script
+# Return the directory containing this .R script, or NULL when it cannot be
+# determined (may be needed to source other files in the same directory).
+# Two situations are handled:
+#   1. the script is being run through source();
+#   2. the script was started from the command line with a --file= argument.
+LocationOfThisScript <- function() {
+  # Case 1: walk the calling frames looking for base::source; when several
+  # frames match, the last one visited wins.
+  # NOTE: sys.nframe(), sys.function() and sys.frame() are relative to this
+  # function's own frame, so they must stay directly in this body.
+  sourced_path <- NULL
+  for (offset in -(1:sys.nframe())) {
+    if (identical(sys.function(offset), base::source)) {
+      sourced_path <- normalizePath(sys.frame(offset)$ofile)
+    }
+  }
+  if (!is.null(sourced_path)) {
+    return(dirname(sourced_path))
+  }
+
+  # Case 2: inspect only the leading (non-trailing) command-line arguments.
+  all_args <- commandArgs(trailingOnly = FALSE)
+  n_leading <- length(all_args) - length(commandArgs(trailingOnly = TRUE))
+  leading_args <- all_args[seq.int(from = 1, length.out = n_leading)]
+
+  # Keep the value of each --file= argument; anything else becomes "".
+  file_values <- gsub("^(?:--file=(.*)|.*)$", "\\1", leading_args)
+  file_values <- file_values[file_values != ""]
+
+  # If multiple --file arguments are given, R uses the last one
+  script_path <- tail(file_values, 1)
+  if (length(script_path) > 0) {
+    dirname(script_path)
+  } else {
+    # Neither case applies (perhaps an R GUI session).
+    NULL
+  }
+}
+
+# Directory of this script; the R Markdown document is looked up beside it.
+script.dir <- LocationOfThisScript()
+
+# Values passed to the parameterized R Markdown document.
+rmarkdown_params <- list(
+  inputFile = inputFile,
+  alphaFile = alphaFile,
+  firstDataColumn = firstDataColumn,
+  imputationMethod = imputationMethod,
+  meanPercentile = meanPercentile,
+  sdPercentile = sdPercentile,
+  regexSampleNames = regexSampleNames,
+  regexSampleGrouping = regexSampleGrouping,
+  imputedDataFilename = imputedDataFilename
+)
+
+# Show the parameters in the job log.
+str(rmarkdown_params)
+
+# BUG
+# The report has to be rendered as HTML for the time being, until this issue
+# is resolved:
+#   https://github.com/conda-forge/texlive-core-feedstock/issues/19
+# The reason given is:
+#   "The following dependencies are not available in conda"
+# as reported here:
+#   https://github.com/ami-iit/bipedal-locomotion-framework/pull/457/commits/e98ccef8c8cb63e207df36628192af6ce22feb13
+
+# Fix the random-number seed so that repeated runs produce the same results.
+set.seed(28571)
+
+rmarkdown::render(
+  input = paste(script.dir, "mqppep_anova_script.Rmd", sep = "/"),
+  output_format = rmarkdown::html_document(pandoc_args = "--self-contained"),
+  output_file = reportFileName,
+  params = rmarkdown_params
+)

diff -r 000000000000 -r c1403d18c189 mqppep_anova.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova.xml Mon Mar 07 19:05:01 2022 +0000

[

b'@@ -0,0 +1,219 @@\n+<tool id="mqppep_anova" name="MaxQuant Phosphopeptide ANOVA" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5">\n+ <description>Perform ANOVA on merged and filtered data from phospho-peptide enrichment/MaxQuant pipeline</description>\n+ <macros>\n+ <import>macros.xml</import>\n+ </macros>\n+ <requirements>\n+ <requirement type="package" version="1.7.1">r-optparse</requirement>\n+ <requirement type="package" version="1.4.0">r-stringr</requirement>\n+ <requirement type="package" version="1.14.2">r-data.table</requirement>\n+ <requirement type="package" version="3.3.5">r-ggplot2</requirement>\n+ <requirement type="package" version="1.56.0">bioconductor-preprocesscore</requirement>\n+ <requirement type="package" version="0.3.3" >openblas</requirement>\n+ <requirement type="package" version="2.11" >r-rmarkdown</requirement>\n+ <requirement type="package" version="0.4.0" >r-sass</requirement>\n+ <requirement type="package" >texlive-core</requirement>\n+\n+ </requirements>\n+ \n+ <command detect_errors="exit_code"><![CDATA[\n+cat $sample_names_regex_f; cat $sample_grouping_regex_f;\n+Rscript \'$__tool_directory__/mqppep_anova.R\' \n+--inputFile \'$input_file\' \n+--alphaFile $alpha_file\n+--firstDataColumn $first_data_column\n+--imputationMethod $imputation.imputation_method\n+#if \'$imputation_method\' == \'random\':\n+ --meanPercentile \'$meanPercentile\'\n+ --sdPercentile \'$sdPercentile\'\n+#end if\n+--regexSampleNames $sample_names_regex_f\n+--regexSampleGrouping $sample_grouping_regex_f\n+--imputedDataFile $imputed_data_file\n+--reportFile $report_file\n+ ]]></command>\n+ <configfiles>\n+ <configfile name="sample_names_regex_f">\n+ $sample_names_regex\n+ </configfile>\n+ <configfile name="sample_grouping_regex_f">\n+ $sample_grouping_regex\n+ </configfile>\n+ </configfiles>\n+ <inputs>\n+ <param name="input_file" type="data" format="tabular" label="Filtered Phosphopeptide Intensities"\n+ help="[input_file] 
Phosphopeptide intensities filtered for minimal quality. First column label \'Phosphopeptide\'; sample-intensities must begin in column 10 and must have column labels to match argument [sample_names_regex]"\n+ />\n+ <param name="alpha_file" type="data" format="tabular" label="alpha cutoff level"\n+ help="[alpha_file] List of alpha cutoff values for significance testing; text file having one column and no header"\n+ />\n+ <param name="first_data_column" type="text" value="Intensity"\n+ label="First data column"\n+ help="[first_data_column] First column having intensity values (integer or PERL-compatible regular expression matching column label)"\n+ />\n+ \n+ <conditional name="imputation">\n+ <param name="imputation_method" type="select" label="Imputation Method"\n+ help="[imputation_method] Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same std. dev. as across all samples (with mean specified by [meanPercentile])"\n+ >\n+ <option value="random" selected="true">random</option>\n+ <option value="group-median">group-median</option>\n+ <option value="median">median</option>\n+ <option value="mean">mean</option>\n+ </param>\n+ <when value="group-median" />\n+ <when value="median" />\n+ <when value="mean" />\n+ <when value="random">\n+ <param name="meanPercentile" type="integer" value="1" min="1" max="99"\n+ label="Mean percentile f'..b'putation_method" value="random"/>\n+ <param name="sample_names_regex" value="\\.\\d+[A-Z]$"/>\n+ <param name="sample_grouping_regex" value="\\d+"/>\n+ <output name="imputed_data_file">\n+ <assert_contents>\n+ <has_text text="Phosphopeptide" />\n+ <has_text text="AAAAAAAGDpSDpSWDADAFSVEDPVRK" />\n+ <has_text text="997800000" />\n+ <has_text text="pSESELIDELSEDFDR" />\n+ </assert_contents>\n+ </output>\n+ </test>\n+ </tests>\n+ 
<help><![CDATA[\n+===========================================\n+Phopsphoproteomic Enrichment Pipeline ANOVA\n+===========================================\n+\n+**Input files**\n+\n+``input_file``\n+ Phosphopeptides annotated with SwissProt and phosphosite metadata (in tabular format).\n+ This is the output from the "Phopsphoproteomic Enrichment Pipeline Merge and Filter"\n+ (``mqppep_mrgflt``) tool.\n+\n+``alpha_file``\n+ List of alpha cutoff values for significance testing; text file having one column and no header. For example:\n+\n+::\n+\n+ 0.2\n+ 0.1\n+ 0.05\n+\n+**Input parameters**\n+\n+``first_data_column``\n+ First column of ``input_file`` having intensity values (integer or PERL-compatible regular expression matching column label). Default: **Intensity**\n+\n+``imputation_method``\n+ Impute missing values by:\n+\n+ 1. using median for each sample-group;\n+ 2. using median across all samples;\n+ 3. using mean across all samples; or\n+ 4. using randomly generated values where:\n+\n+ - ``meanPercentile`` specifies the percentile among non-missing values to be used as mean of random values, and\n+ - ``sdPercentile`` specifies the factor to be mulitplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values.\n+\n+``sample_names_regex``\n+ PERL-compatible regular expression extracting the sample-name from the the name of a column of instensities (from ``input_file``) for one sample.\n+\n+ - For example, ``"\\.\\d+[A-Z]$"`` applied to ``Intensity.splunge.10A`` would produce ``.10A``\n+ - Note that *this is case sensitive* by default.\n+\n+``sample_grouping_regex``\n+ PERL-compatible regular expression extracting the sample-grouping from the sample-name that was extracted with ``sample_names_regex`` from a column of intensites (from ``input_file``).\n+\n+ - For example, ``"\\d+$"`` applied to ``.10A`` would produce ``10``\n+ - Note that *this is case sensitive* by 
default.\n+\n+\n+**Outputs**\n+\n+``intensities_*-imputed_QN_LT``\n+ Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format.\n+\n+``report_file``\n+ (download/unzip to view) Summary report for normalization, imputation, and ANOVA.\n+ This dataset is displayed in Galaxy as having a datatype of ``html`` in Galaxy,\n+ but it is in fact a zipfile; the zip file contains\n+ an HTML file. Please download and unzip it locally to view the report.\n+ Ideally this report would be a PDF, but there is an issue\n+ `(linked here)\n+ <https://github.com/conda-forge/texlive-core-feedstock/issues/19>`_.\n+ that needs to be resolved first.\n+\n+**Authors**\n+\n+``Larry C. Cheng``\n+ (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script.\n+\n+``Arthur C. Eschenlauer``\n+ (`ORCiD 0000-0002-2882-0508 <https://orcid.org/0000-0002-2882-0508>`_) adapted the script to run in Galaxy.\n+\n+===================================\n+PERL-compatible regular expressions\n+===================================\n+\n+Note that the PERL-compatible regular expressions accepted by this tool are documented at https://rdrr.io/r/base/regex.html\n+\n+ ]]></help>\n+ <citations>\n+ \n+ <citation type="doi">10.3791/57996</citation>\n+ </citations>\n+</tool>\n'

diff -r 000000000000 -r c1403d18c189 mqppep_anova_script.Rmd
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova_script.Rmd Mon Mar 07 19:05:01 2022 +0000

[

b'@@ -0,0 +1,657 @@\n+---\n+title: "Quant Data Processing Script"\n+author: "Larry Cheng; Art Eschenlauer"\n+date: "May 28, 2018; Nov 16, 2021"\n+output:\n+ html_document: default\n+ pdf_document: default\n+params:\n+ inputFile: "Upstream_Map_pST_outputfile_STEP4.txt"\n+ alphaFile: "alpha_levels.txt"\n+ firstDataColumn: "Intensity"\n+ imputationMethod: !r c("group-median","median","mean","random")[4]\n+ meanPercentile: 1\n+ sdPercentile: 0.2\n+ regexSampleNames: "\\\\.(\\\\d+)[A-Z]$"\n+ regexSampleGrouping: "(\\\\d+)"\n+ imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt"\n+---\n+```{r setup, include=FALSE}\n+# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285\n+knitr::opts_chunk$set(echo = FALSE, fig.dim=c(9,10))\n+```\n+\n+## Purpose:\n+Perform imputation of missing values, quantile normalization, and ANOVA.\n+\n+\n+```{r include = FALSE}\n+#Input Filename\n+inputFile <- params$inputFile\n+\n+#First data column - ideally, this could be detected via regexSampleNames, but for now leave it as is.\n+firstDataColumn <- params$firstDataColumn\n+FDC_is_integer <- TRUE\n+firstDataColumn <- withCallingHandlers(\n+ as.integer(firstDataColumn)\n+ , warning = function(w) FDC_is_integer <<- FALSE\n+ )\n+if (FALSE == FDC_is_integer) {\n+ firstDataColumn <- params$firstDataColumn\n+}\n+\n+#False discovery rate adjustment for ANOVA (Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05)\n+valFDR <- read.table(file = params$alphaFile, sep = "\\t", header=F, quote="")[,1]\n+\n+#Imputed Data filename\n+imputedDataFilename <- params$imputedDataFilename\n+\n+#ANOVA data filename\n+```\n+\n+```{r include = FALSE}\n+#Imputation method, should be one of c("random","group-median","median","mean")\n+imputationMethod <- params$imputationMethod\n+\n+#Selection of percentile of logvalue data to set the mean for random number generation when using random imputation\n+meanPercentile <- params$meanPercentile / 100.0\n+\n+#deviation 
adjustment-factor for random values; real number.\n+sdPercentile <- params$sdPercentile\n+\n+#Regular expression of Sample Names, e.g., "\\\\.(\\\\d+)[A-Z]$"\n+regexSampleNames <- params$regexSampleNames\n+\n+#Regular expression to extract Sample Grouping from Sample Name (if error occurs, compare sampleNumbers and tempMatches to see if groupings/pairs line up)\n+# e.g., "(\\\\d+)"\n+regexSampleGrouping <- params$regexSampleGrouping\n+\n+```\n+\n+\n+```{r include = FALSE}\n+### FUNCTIONS\n+\n+#ANOVA filter function\n+anovaFunc <- function(x, groupingFactor) {\n+ x.aov = aov(as.numeric(x) ~ groupingFactor)\n+ pvalue = summary(x.aov)[[1]][["Pr(>F)"]][1]\n+ pvalue\n+}\n+```\n+\n+\n+\n+### Checking that log-transformed sample distributions are similar:\n+```{r echo=FALSE}\n+\n+library(data.table)\n+\n+# read.table reads a file in table format and creates a data frame from it.\n+# - note that `quote=""` means that quotation marks are treated literally.\n+fullData <- read.table(file = inputFile, sep = "\\t", header=T, quote="", check.names=FALSE)\n+print(colnames(fullData))\n+#head(fullData)\n+\n+if (FALSE == FDC_is_integer) {\n+ dataColumnIndices <- grep(firstDataColumn, names(fullData), perl=TRUE)\n+ str(dataColumnIndices)\n+ if (length(dataColumnIndices) > 0) {\n+ firstDataColumn <- dataColumnIndices[1]\n+ } else {\n+ stop(paste("failed to convert firstDataColumn:", firstDataColumn))\n+ }\n+}\n+ \n+quantData0 <- fullData[firstDataColumn:length(fullData)]\n+quantData <- fullData[firstDataColumn:length(fullData)]\n+quantData[quantData==0] <- NA #replace 0 with NA\n+quantDataLog <- log10(quantData)\n+\n+rownames(quantDataLog) <- fullData$Phosphopeptide\n+\n+summary(quantDataLog)\n+\n+#data visualization\n+old_par <- par(\n+ mai=par("mai") + c(0.5,0,0,0)\n+)\n+boxplot(\n+ quantDataLog\n+, las=2\n+)\n+par(old_par)\n+\n+quantDataLog_stack <- stack(quantDataLog)\n+```\n+\n+```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)}\n+library(ggplot2)\n+ggplot(quantDataLog_stack, 
aes(x=values)) + geom_density(aes(group=ind, colour=ind))\n+```\n+\n+###'..b'de p-values\n+\n+ dataTableImputed <- cbind(fullData[1:9], pValueData[,2:3], quantDataLog)\n+ write.table(\n+ dataTableImputed\n+ , file=imputedDataFilename\n+ , sep="\\t"\n+ , col.names=TRUE\n+ , row.names=FALSE\n+ , quote=FALSE\n+ )\n+\n+\n+ pValueData <- pValueData[order(pValueData$FDRadjustedANOVAp),]\n+\n+ cutoff <- valFDR[1]\n+ for (cutoff in valFDR){ #loop through FDR cutoffs\n+\n+ filtered_p <- pValueData[which(pValueData$FDRadjustedANOVAp < cutoff),, drop = FALSE]\n+ filteredData.filtered <- quantDataImputed_QN_log[rownames(filtered_p),, drop = FALSE]\n+ filteredData.filtered <- filteredData.filtered[order(filtered_p$FDRadjustedANOVAp),, drop = FALSE]\n+\n+ # \n+ old_oma <- par("oma")\n+ old_par <- par(\n+ mai=(par("mai") + c(0.7,0,0,0)) * c(1,1,0.3,1)\n+ , oma=old_oma * c(1,1,0.3,1)\n+ , cex.main=0.9\n+ , cex.axis=0.7\n+ )\n+ \n+ if (nrow(filteredData.filtered) > 0) {\n+ boxplot(\n+ filteredData.filtered\n+ , main = sprintf("Imputed, normalized intensities where adjusted p-value < %0.2f", cutoff)\n+ # no line plot , main = ""\n+ , las = 2\n+ # , ylim = c(5.5,10)\n+ , ylab = expression(log[10](intensity))\n+ )\n+ } else {\n+ cat(sprintf("No peptides were found to have cutoff adjusted p-value < %0.2f\\n", cutoff))\n+ }\n+ par(old_par)\n+ \n+ #Add Phosphopeptide column to ANOVA filtered table\n+ ANOVA.filtered_merge <- merge(\n+ x = connect_df\n+ , y = filteredData.filtered\n+ , by.x="Intensity"\n+ , by.y=1\n+ )\n+ ANOVA.filtered_merge.order <- rownames(filtered_p)\n+ \n+ ANOVA.filtered_merge.format <- sapply(\n+ X = filtered_p$FDRadjustedANOVAp\n+ , FUN = function(x) {\n+ if (x > 0.0001)\n+ paste0("(%0.",1+ceiling(-log10(x)),"f) %s")\n+ else\n+ paste0("(%0.4e) %s")\n+ }\n+ )\n+\n+ #ANOVA.filtered_merge.format <- paste0("(%0.",1+ceiling(-log10(filtered_p$FDRadjustedANOVAp)),"f) %s")\n+\n+ ANOVA.filtered <- data.table(\n+ ANOVA.filtered_merge$Phosphopeptide\n+ , 
ANOVA.filtered_merge$Intensity\n+ , ANOVA.filtered_merge[, 2:numSamples+1]\n+ )\n+ colnames(ANOVA.filtered) <- c("Phosphopeptide", colnames(filteredData.filtered))\n+ \n+ # merge qualitative columns into the ANOVA data\n+ output_table <- data.frame(ANOVA.filtered$Phosphopeptide)\n+ output_table <- merge(\n+ x = output_table\n+ , y = dataTableImputed_QN_LT\n+ , by.x = "ANOVA.filtered.Phosphopeptide"\n+ , by.y="Phosphopeptide"\n+ )\n+\n+ #Produce heatmap to visualize significance and the effect of imputation\n+ m <- as.matrix(unimputedQuantDataLog[ANOVA.filtered_merge.order,])\n+ if (nrow(m) > 0) {\n+ rownames_m <- rownames(m)\n+ rownames(m) <- sapply(\n+ X = 1:nrow(m)\n+ , FUN = function(i) {\n+ sprintf(\n+ ANOVA.filtered_merge.format[i]\n+ , filtered_p$FDRadjustedANOVAp[i]\n+ , rownames_m[i]\n+ )\n+ }\n+ )\n+ margins <- c(\n+ max(nchar(colnames(m))) * 10 / 16 # col\n+ , max(nchar(rownames(m))) * 5 / 16 # row\n+ )\n+ how_many_peptides <- min(50, nrow(m))\n+\n+ op <- par("cex.main")\n+ try(\n+ if (nrow(m) > 1) {\n+ par(cex.main=0.6)\n+ heatmap(\n+ m[how_many_peptides:1,]\n+ , Rowv = NA\n+ , Colv = NA\n+ , cexRow = 0.7\n+ , cexCol = 0.8\n+ , scale="row"\n+ , margins = margins\n+ , main = "Heatmap of unimputed, unnormalized intensities"\n+ , xlab = ""\n+ # , main = bquote(\n+ # .( how_many_peptides )\n+ # ~ " peptides with adjusted p-value <"\n+ # ~ .(sprintf("%0.2f", cutoff))\n+ # )\n+ )\n+ } \n+ )\n+ #ACE fig_dim knitr::opts_chunk$set(fig.dim = fig_dim)\n+ par(op)\n+ }\n+ \n+ }\n+}\n+```\n+\n+## Peptide IDs, etc.\n+\n+See output files.\n'

diff -r 000000000000 -r c1403d18c189 mqppep_mrgfltr.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_mrgfltr.py Mon Mar 07 19:05:01 2022 +0000

[

b'@@ -0,0 +1,1337 @@\n+#!/usr/bin/env python\r\n+\r\n+# Import the packages needed\r\n+import argparse\r\n+import os.path\r\n+import sys\r\n+\r\n+import pandas\r\n+import re\r\n+import time\r\n+import sqlite3 as sql\r\n+from codecs import getreader as cx_getreader\r\n+import sys\r\n+import numpy as np\r\n+\r\n+# for sorting list of lists using operator.itemgetter\r\n+import operator\r\n+\r\n+# for formatting stack-trace\r\n+import traceback\r\n+\r\n+# for Aho-Corasick search for fixed set of substrings\r\n+import ahocorasick\r\n+import operator\r\n+import hashlib\r\n+\r\n+# for shutil.copyfile(src, dest)\r\n+import shutil\r\n+\r\n+# global constants\r\n+N_A = \'N/A\'\r\n+\r\n+# ref: https://stackoverflow.com/a/8915613/15509512\r\n+# answers: "How to handle exceptions in a list comprehensions"\r\n+# usage:\r\n+# from math import log\r\n+# eggs = [1,3,0,3,2]\r\n+# print([x for x in [catch(log, egg) for egg in eggs] if x is not None])\r\n+# producing:\r\n+# for <built-in function log>\r\n+# with args (0,)\r\n+# exception: math domain error\r\n+# [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]\r\n+def catch(func, *args, handle=lambda e : e, **kwargs):\r\n+ try:\r\n+ return func(*args, **kwargs)\r\n+ except Exception as e:\r\n+ print("For %s" % str(func))\r\n+ print(" with args %s" % str(args))\r\n+ print(" caught exception: %s" % str(e))\r\n+ (ty, va, tb) = sys.exc_info()\r\n+ print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))\r\n+ exit(-1)\r\n+ return None # was handle(e)\r\n+\r\n+def ppep_join(x):\r\n+ x = [i for i in x if N_A != i]\r\n+ result = "%s" % \' | \'.join(x)\r\n+ if result != "":\r\n+ return result\r\n+ else:\r\n+ return N_A\r\n+\r\n+def melt_join(x):\r\n+ tmp = {key.lower(): key for key in x}\r\n+ result = "%s" % \' | \'.join([tmp[key] for key in tmp])\r\n+ return result\r\n+\r\n+def __main__():\r\n+ # Parse Command Line\r\n+ parser = argparse.ArgumentParser(\r\n+ description=\'Phopsphoproteomic Enrichment Pipeline 
Merge and Filter.\'\r\n+ )\r\n+\r\n+ # inputs:\r\n+ # Phosphopeptide data for experimental results, including the intensities\r\n+ # and the mapping to kinase domains, in tabular format.\r\n+ parser.add_argument(\r\n+ \'--phosphopeptides\', \'-p\',\r\n+ nargs=1,\r\n+ required=True,\r\n+ dest=\'phosphopeptides\',\r\n+ help=\'Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format\'\r\n+ )\r\n+ # UniProtKB/SwissProt DB input, SQLite\r\n+ parser.add_argument(\r\n+ \'--ppep_mapping_db\', \'-d\',\r\n+ nargs=1,\r\n+ required=True,\r\n+ dest=\'ppep_mapping_db\',\r\n+ help=\'UniProtKB/SwissProt SQLite Database\'\r\n+ )\r\n+ #ACE # PhosPhositesPlus DB input, csv\r\n+ #ACE parser.add_argument(\r\n+ #ACE \'--psp_regulatory_sites\', \'-s\',\r\n+ #ACE nargs=1,\r\n+ #ACE required=True,\r\n+ #ACE dest=\'psp_regulatory_sites_csv\',\r\n+ #ACE help=\'PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header\'\r\n+ #ACE )\r\n+ # species to limit records chosed from PhosPhositesPlus\r\n+ parser.add_argument(\r\n+ \'--species\', \'-x\',\r\n+ nargs=1,\r\n+ required=False,\r\n+ default=[],\r\n+ dest=\'species\',\r\n+ help=\'limit PhosphoSitePlus records to indicated species (field may be empty)\'\r\n+ )\r\n+\r\n+ # outputs:\r\n+ # tabular output\r\n+ parser.add_argument(\r\n+ \'--mrgfltr_tab\', \'-o\',\r\n+ nargs=1,\r\n+ required=True,\r\n+ dest=\'mrgfltr_tab\',\r\n+ help=\'Tabular output file for results\'\r\n+ )\r\n+ # CSV output\r\n+ parser.add_argument(\r\n+ \'--mrgfltr_csv\', \'-c\',\r\n+ nargs=1,\r\n+ required=True,\r\n+ dest=\'mrgfltr_csv\',\r\n+ help=\'CSV output file for results\'\r\n+ )\r\n+ # SQLite output\r\n+ parser.add_argument(\r\n+ \'--mrgfltr_sqlite\', \'-S\',\r\n+ nargs=1,\r\n+ required=True'..b' CITATION_INSERT_STMT,\r\n+ (\'mrgfltr_metadata\', CITATION_INSERT_PSP_REF)\r\n+ )\r\n+\r\n+ # Read ppep-to-sequence LUT\r\n+ ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)\r\n+ #ACE 
ppep_lut_df.info(verbose=True)\r\n+ # write only metadata for merged/filtered records to SQLite\r\n+ mrgfltr_metadata_df = output_df.copy()\r\n+ # replace phosphopeptide seq with ppep.id\r\n+ mrgfltr_metadata_df = ppep_lut_df.merge(\r\n+ mrgfltr_metadata_df,\r\n+ left_on=\'ppep_seq\',\r\n+ right_on=PHOSPHOPEPTIDE,\r\n+ how=\'inner\'\r\n+ )\r\n+ mrgfltr_metadata_df.drop(\r\n+ columns=[PHOSPHOPEPTIDE, \'ppep_seq\'],\r\n+ inplace=True\r\n+ )\r\n+ #rename columns\r\n+ mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS\r\n+ #ACE mrgfltr_metadata_df.info(verbose=True)\r\n+ mrgfltr_metadata_df.to_sql(\r\n+ \'mrgfltr_metadata\',\r\n+ con=conn,\r\n+ if_exists=\'append\',\r\n+ index=False,\r\n+ method=\'multi\'\r\n+ )\r\n+\r\n+ # Close SwissProt SQLite database\r\n+ conn.close()\r\n+ ## ----------- Write merge/filter metadata to SQLite database (finish) -----------\r\n+\r\n+ output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH)\r\n+ output_cols = output_df.columns.tolist()\r\n+ output_cols = output_cols[:-1]\r\n+ output_df = output_df[output_cols]\r\n+\r\n+ #cosmetic changes to Upstream column\r\n+ output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping\r\n+ us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])\r\n+ i = 0\r\n+ while i < len(us_series):\r\n+ #turn blanks into N_A to signify the info was searched for but cannot be found\r\n+ if us_series[i] == "":\r\n+ us_series[i] = N_A\r\n+ i += 1\r\n+ output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series\r\n+\r\n+ end_time = time.process_time() #timer\r\n+ print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer\r\n+\r\n+ (output_rows, output_cols) = output_df.shape\r\n+\r\n+ #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, 
convert_boolean=True, convert_floating=True)\r\n+ output_df = output_df.convert_dtypes(convert_integer=True)\r\n+\r\n+\r\n+ #Output onto Final CSV file\r\n+ output_df.to_csv(output_filename_csv, index=False)\r\n+ output_df.to_csv(output_filename_tab, quoting=None, sep=\'\\t\', index=False)\r\n+\r\n+ end_time = time.process_time() #timer\r\n+ print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer\r\n+\r\n+ print(\'{:>10} phosphopeptides written to output\'.format(str(output_rows)))\r\n+\r\n+ end_time = time.process_time() #timer\r\n+ print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer\r\n+\r\n+\r\n+ #Rev. 7/1/2016\r\n+ #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A\'s\r\n+ #Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS\r\n+ #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \\\r\n+ # read from SwissProt SQLite database\r\n+ #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper\r\n+\r\n+ #############################################\r\n+ # copied from Excel Output Script.ipynb END #\r\n+ #############################################\r\n+\r\n+ try:\r\n+ catch(mqpep_getswissprot,)\r\n+ exit(0)\r\n+ except Exception as e:\r\n+ exit(\'Internal error running mqpep_getswissprot(): %s\' % (e))\r\n+\r\n+if __name__ == "__main__":\r\n+ __main__()\r\n+\r\n'

diff -r 000000000000 -r c1403d18c189 search_ppep.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/search_ppep.py Mon Mar 07 19:05:01 2022 +0000

[

b'@@ -0,0 +1,512 @@\n+#!/usr/bin/env python\n+# Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB\n+\n+import argparse\n+import os.path\n+import sqlite3\n+import re\n+from codecs import getreader as cx_getreader\n+import time\n+\n+# For Aho-Corasick search for fixed set of substrings\n+# - add_word\n+# - make_automaton\n+# - iter\n+import ahocorasick\n+# Support map over auto.iter(...)\n+# - itemgetter\n+import operator\n+#import hashlib\n+\n+# ref: https://stackoverflow.com/a/8915613/15509512\n+# answers: "How to handle exceptions in a list comprehensions"\n+# usage:\n+# from math import log\n+# eggs = [1,3,0,3,2]\n+# print([x for x in [catch(log, egg) for egg in eggs] if x is not None])\n+# producing:\n+# for <built-in function log>\n+# with args (0,)\n+# exception: math domain error\n+# [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]\n+def catch(func, *args, handle=lambda e : e, **kwargs):\n+ try:\n+ return func(*args, **kwargs)\n+ except Exception as e:\n+ print("For %s" % str(func))\n+ print(" with args %s" % str(args))\n+ print(" caught exception: %s" % str(e))\n+ (ty, va, tb) = sys.exc_info()\n+ print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))\n+ #exit(-1)\n+ return None # was handle(e)\n+\n+def __main__():\n+ ITEM_GETTER = operator.itemgetter(1)\n+\n+ DROP_TABLES_SQL = \'\'\'\n+ DROP VIEW IF EXISTS ppep_gene_site_view;\n+ DROP VIEW IF EXISTS uniprot_view;\n+ DROP VIEW IF EXISTS uniprotkb_pep_ppep_view;\n+ DROP VIEW IF EXISTS ppep_intensity_view;\n+ DROP VIEW IF EXISTS ppep_metadata_view;\n+\n+ DROP TABLE IF EXISTS sample;\n+ DROP TABLE IF EXISTS ppep;\n+ DROP TABLE IF EXISTS site_type;\n+ DROP TABLE IF EXISTS deppep_UniProtKB;\n+ DROP TABLE IF EXISTS deppep;\n+ DROP TABLE IF EXISTS ppep_gene_site;\n+ DROP TABLE IF EXISTS ppep_metadata;\n+ DROP TABLE IF EXISTS ppep_intensity;\n+ \'\'\'\n+\n+ CREATE_TABLES_SQL = \'\'\'\n+ CREATE TABLE deppep\n+ ( id INTEGER PRIMARY KEY\n+ , seq TEXT UNIQUE ON 
CONFLICT IGNORE\n+ )\n+ ;\n+ CREATE TABLE deppep_UniProtKB\n+ ( deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE\n+ , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE\n+ , pos_start INTEGER\n+ , pos_end INTEGER\n+ , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end)\n+ ON CONFLICT IGNORE\n+ )\n+ ;\n+ CREATE TABLE ppep\n+ ( id INTEGER PRIMARY KEY\n+ , deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE\n+ , seq TEXT UNIQUE ON CONFLICT IGNORE\n+ , scrubbed TEXT\n+ );\n+ CREATE TABLE site_type\n+ ( id INTEGER PRIMARY KEY\n+ , type_name TEXT UNIQUE ON CONFLICT IGNORE\n+ );\n+ CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed)\n+ ;\n+ CREATE TABLE sample\n+ ( id INTEGER PRIMARY KEY\n+ , name TEXT UNIQUE ON CONFLICT IGNORE\n+ )\n+ ;\n+ CREATE VIEW uniprot_view AS\n+ SELECT DISTINCT\n+ Uniprot_ID\n+ , Description\n+ , Organism_Name\n+ , Organism_ID\n+ , Gene_Name\n+ , PE\n+ , SV\n+ , Sequence\n+ , Description || \' OS=\' ||\n+ Organism_Name || \' OX=\' || Organism_ID ||\n+ CASE WHEN Gene_Name = \'N/A\' THEN \'\' ELSE \' GN=\'|| Gene_Name END ||\n+ CASE WHEN PE = \'N/A\' THEN \'\' ELSE \' PE=\'|| PE END ||\n+ CASE WHEN SV = \'N/A\' THEN \'\' ELSE \' SV=\'|| SV END\n+ AS long_description\n+ , '..b'sphopeptides are represented in input" % deppep_count\n+ )\n+ # Look for cases where both Gene_Name and Sequence are identical\n+ cur.execute(\'\'\'\n+ SELECT Uniprot_ID, Gene_Name, Sequence\n+ FROM UniProtKB\n+ WHERE Sequence IN (\n+ SELECT Sequence\n+ FROM UniProtKB\n+ GROUP BY Sequence, Gene_Name\n+ HAVING count(*) > 1\n+ )\n+ ORDER BY Sequence\n+ \'\'\')\n+ duplicate_count = 0\n+ old_seq = \'\'\n+ for row in cur.fetchall():\n+ if duplicate_count == 0:\n+ print("\\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column).")\n+ if row[2] != old_seq:\n+ old_seq = row[2]\n+ duplicate_count += 1\n+ if options.warn_duplicates:\n+ 
print("\\n%s\\t%s\\t%s" % row)\n+ else:\n+ if options.warn_duplicates:\n+ print("%s\\t%s" % (row[0], row[1]))\n+ if duplicate_count > 0:\n+ print("\\n%d sequences have duplicated accession IDs\\n" % duplicate_count)\n+\n+ print(\n+ "%s accession sequences will be searched\\n" % sequence_count\n+ )\n+\n+ #print(auto.dump())\n+\n+ # Convert the trie to an automaton (a finite-state machine)\n+ auto.make_automaton()\n+\n+ # Execute query for seqs and metadata without fetching the results yet\n+ uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)\n+ while batch := uniprot_seq_and_id.fetchmany(size=50):\n+ if None == batch:\n+ break\n+ for Sequence, UniProtKB_id in batch:\n+ if Sequence is not None:\n+ for end_index, (insert_order, original_value) in auto.iter(Sequence):\n+ ker.execute(\'\'\'\n+ INSERT INTO deppep_UniProtKB\n+ (deppep_id,UniProtKB_id,pos_start,pos_end)\n+ VALUES (?,?,?,?)\n+ \'\'\', (\n+ insert_order,\n+ UniProtKB_id,\n+ 1 + end_index - len(original_value),\n+ end_index\n+ )\n+ )\n+ else:\n+ raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,))\n+ ker.execute("""\n+ SELECT count(*) || \' accession-peptide-phosphopeptide combinations were found\'\n+ FROM uniprotkb_pep_ppep_view\n+ """\n+ )\n+ for row in ker.fetchall():\n+ print(row[0])\n+\n+ ker.execute("""\n+ SELECT count(*) || \' accession matches were found\', count(*) AS accession_count\n+ FROM (\n+ SELECT accession\n+ FROM uniprotkb_pep_ppep_view\n+ GROUP BY accession\n+ )\n+ """\n+ )\n+ for row in ker.fetchall():\n+ print(row[0])\n+ accession_count = row[1]\n+\n+ ker.execute("""\n+ SELECT count(*) || \' peptide matches were found\'\n+ FROM (\n+ SELECT peptide\n+ FROM uniprotkb_pep_ppep_view\n+ GROUP BY peptide\n+ )\n+ """\n+ )\n+ for row in ker.fetchall():\n+ print(row[0])\n+\n+ ker.execute("""\n+ SELECT count(*) || \' phosphopeptide matches were found\', count(*) AS phosphopeptide_count\n+ FROM (\n+ SELECT 
phosphopeptide\n+ FROM uniprotkb_pep_ppep_view\n+ GROUP BY phosphopeptide\n+ )\n+ """\n+ )\n+ for row in ker.fetchall():\n+ print(row[0])\n+ phosphopeptide_count = row[1]\n+\n+ con.commit()\n+ ker.execute(\'vacuum\')\n+ con.close()\n+\n+if __name__ == "__main__":\n+ wrap_start_time = time.perf_counter()\n+ __main__()\n+ wrap_stop_time = time.perf_counter()\n+ # print(wrap_start_time)\n+ # print(wrap_stop_time)\n+ print("\\nThe matching process took %d milliseconds to run.\\n" % ((wrap_stop_time - wrap_start_time)*1000),)\n+\n+ # vim: sw=4 ts=4 et ai :\n'

diff -r 000000000000 -r c1403d18c189 test-data/alpha_levels.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/alpha_levels.tabular Mon Mar 07 19:05:01 2022 +0000

@@ -0,0 +1,3 @@
+0.05
+0.1
+0.2

diff -r 000000000000 -r c1403d18c189 test-data/pSTY_motifs.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pSTY_motifs.tabular Mon Mar 07 19:05:01 2022 +0000

b'@@ -0,0 +1,196 @@\n+1\t((E|D|A)(D|E)(E|D)(E|D)pS(E|D|A)(D|E|A)(E|D)(E|D))|(pS.(E|pS|pT))|(pS..(E|pS|pT))|((pS|pT)..(E|D))|(pS(D|E).(D|E).(D|E))|((D|E)pS(D|E).(D|E))|(pS(D|E)(D|E)(D|E))|((pS|pT)..(D|E))|((pS|pT)..(E|D|pS|pY))|((S|E|P|G)(D|S|N|E|P)(E|D|G|Q|W)(Y|E|D|S|W|T)(W|E|D)pS(D|E)(D|E|W|N)(E|D)(E|D|N|Q))\tCasein Kinase II substrate motif (HPRD)\n+2\t((L|F|I)...R(Q|S|T)L(pS|pT)(M|L|I|V))|(..B.R..pS..)|(pS...(pS|pT))\tMAPKAPK2 kinase substrate motif (HPRD)\n+3\t((M|V|L|I|F)(R|K|H)...(pS|pT)...(M|V|L|I|F))|((M|V|L|I)..(R|K|H).(pS|pT)...(M|V|L|I))|((M|V|L|I|F)(R|K|H)..(pS|pT)...(M|V|L|I|F))\tAMP-activated protein kinase substrate motif (HPRD)\n+4\t((P|L|I|M).(L|I|D|E)pSQ)|(LpSQE)|(pSQ)\tATM kinase substrate motif (HPRD)\n+5\t((R|K).R..(pS|pT)(M|L|V|I))|(VFLGFpTYVAP)\tp70 Ribosomal S6 kinase substrate motif (HPRD)\n+6\t((R|K).R..pS)|(RRR.pS)\tMAPKAPK1 kinase substrate motif (HPRD)\n+7\t((R|K)pSP(R|P)(R|K|H))|((pS|pT)P.(R|K))|(HHH(R|K)pSPR(R|K)R)\tCdc2 kinase substrate motif (HPRD)\n+8\t((R|N)(F|L|M)(R|K)(R|K)pS(R|I|V|M)(R|I|M|V)(M|I|F|V)(I|F|M))|(FR.(pS|pT))|(RF(R|K)(R|K)pS(R|I)(R|I)MI)\tNIMA kinase substrate motif (HPRD)\n+9\t((pS|pT)P.(K|R))|((K|R)(pS|pT)P)|((pS|pT)P(K|R))\tGrowth associated histone HI kinase substrate motif (HPRD)\n+10\t(..(pS|pT)E)|(.(pS|pT)...(A|P|S|T))\tG protein-coupled receptor kinase 1 substrate motif (HPRD)\n+11\t(.R..(pS|pT).R.)|((pS|pT).(R|K))|((R|K)..(pS|pT))|((R|K)..(pS|pT).(R|K))|((K|R).(pS|pT))|((R|K).(pS|pT).(R|K))\tPKC kinase substrate motif (HPRD)\n+12\t(.pSQ)|(P(pS|pT).)\tDNA dependent Protein kinase substrate motif (HPRD)\n+13\t(AKRRRLSpSLRA)|(VRKRpTLRRL)\tPAK1 kinase substrate motif (HPRD)\n+14\t(ARKGpSLRQ)|(R(R|F)RR(R|K)GpSF(R|K)(R|K))\tPKC alpha kinase substrate motif (HPRD)\n+15\t(HpSTSDD)|(YRpSVDE)\tBranched chain alpha-ketoacid dehydrogenase kinase substrate motif (HPRD)\n+16\t(KCSpTWP)|(R..pS)|(R.R..pS.P)|(YpTV)|(RS.(pS|pT).P)|(R.(Y|F).pS.P)|(RPVSSAApSVY)\t14-3-3 domain binding motif 
(HPRD)\n+17\t(KK.RRpT(L|V).)|(KKR.RpT(L|V).)|((R|K).RR.(pS|pT)(L|V).)\tDMPK1 kinase substrate motif (HPRD)\n+18\t(KKKKKK(pS|pT)...)|((R|K|Q|N)(M|C|W)(R|T|S|N)(E|D|S|N)(R|K|E|D|N)pS(S|D|E)(S|GC|D)(SM|R|N)(N|H|S|R|C))\tTGF beta receptor kinase substrate motif (HPRD)\n+19\t(KRKQIpSVR)|((F|M|K)(R|K)(M|R|Q|F)(M|F|L|I)pS(F|I|M|L)(F|R|K)(L|I)(F|L|I))|((K|R)..pS(V|I))\tPhosphorylase kinase substrate motif (HPRD)\n+20\t(KRQGpSVRR)|(R(K|E|R).pS)\tPKC epsilon kinase substrate motif (HPRD)\n+21\t(P.(pS|pT)P)|(pSP)\tERK1, ERK2 Kinase substrate motif (HPRD)\n+22\t(P.(pS|pT)PP)|(..P.(pS|pT)PPP.)\tERK1,2 kinase substrate motif (HPRD)\n+23\t(PL(pS|pT)PIP(K|R|H))|(PL(pS|pT)P.(K|R|H))\tCDK4 kinase substrate motif (HPRD)\n+24\t(PLpTLP)|(PLLpTP)|(PLpTP)|(PpTLP)|(PLpTLP)|(PpTLP)|(LpTP)\tRAF1 kinase substrate motif (HPRD)\n+25\t(R..(pS|pT))|((K|F)(R|K)(Q|M)(Q|M|K|L|F)pS(F|I|M|L|V)(D|E|I)(L|M|K|I)(F|K))|((M|V|L|I|F).(R|K)..(pS|pT)..)|(R..pS)\tCalmodulin-dependent protein kinase II substrate motif (HPRD)\n+26\t(R..pSPV)|(K(pS|pT)P.K)|(KpSP...K)|(KpSP..K)|(KpSP....K)|(KpTPAKEE)|(P.pSP)|(.(pS|pT)P)|(..pSP)\tGSK-3, ERK1, ERK2, CDK5 substrate motif (HPRD)\n+27\t(R.R..(pS|pT)(F|L))|(R.R..(pS|pT))|(GRART(S|T)pSFAE)|((R|Q|K)(R|K|N|Q|P|H)(R|K)(R|S|T)(N|K|Q|H|D|P)pS(F|W|I|M|N|S)(S|T|H)(R|S|K)(S|T|P|Q))|((R|K).(R|K)(S|T).pS)\tAkt kinase substrate motif (HPRD)\n+28\t(RR..pS)|(KR.RpS)|(KRR.pT)\tZIP kinase substrate motif (HPRD)\n+29\t(RR.pS(M|I|L|V|F|Y))|(R.pS)|(KR..pS)|(R..pS)|((R|K).(pS|pT))|(K..(pS|pT))|((R|K)(R|K).(pS|pT))|(K...(pS|pT))|((pS|pT).(R|K))|(RRRRpSIIFI)|(RR.pS)|(R(R|K).(pS|pT)(I|L|V|F|Y)(D|C|.).D)|(RR.pS)|(RRR(R|N)pSII(F|D))|((R|C|P|K)(R|A|P)(R|K)(R|K|S)(N|L|S|M|P)Ps(I|L|V|C)(S|P|H|Q)(S|W|Q)(S|L|G))\tPKA kinase substrate motif (HPRD)\n+30\t(RRFGpSBRRF)|(RRFGpS(M|L|V|I|F)RR(M|L|V|I|F))\tMEKK kinase substrate motif (HPRD)\n+31\t(VPGKARKKpSSCQLL)|(PLARTLpSVAGLP)|((M|I|L|V|F|Y).R..(pS|pT))\tCalmodulin-dependent protein kinase IV substrate motif (HPRD)\n+32\t(pSD.E)|(pS..(E|D))\tCasein 
kinase II substrate motif (HPRD)\n+33\t(pSP..(pS|pT))|((D|E)..(pS|pT))|((pS|pT)..(S|T))|((pS|pT)...(S|T)(M|L|V|I|F))\tCasein Kinase I substrate'..b'+140\t(pY..P)|(pYDHP)\tCrk SH2 domain binding motif (HPRD)\n+141\t(pY..Q)|(pY(M|L|V|I|F)(P|R|K|H)Q)\tSTAT3 SH2 domain binding motif (HPRD)\n+142\t(pY..YY)|(pY(D|E).(I|L|V|M))|((D|E)..pY)|(pY....(F|Y))\tALK kinase substrate motif (HPRD)\n+143\t(pYIDL)|(pYASI)|(EFpYA.(V|I)G(R|K|H)S)\tSHP2 phosphatase substrate motif (HPRD)\n+144\t(pYM.M)|(EDAIpY)|(.VIpYAAPF)|(EAIpYAAPF)|(EEIpYEEpY)|(E.IpY..P.)|(EEIpYYYVH)|(ERIpYARTK)|(AEV(I|V|L|F)pYAA(P|F)F)\tAbl kinase substrate motif (HPRD)\n+145\t(pYM.M)|(EE(E|N|D)pY(M|F)(M|F)(M|F|I|E)(M|F))|(.EEEpYMMMM)|(KKSRGDpYMTMQIG)|(KKKLPATGDpYMNMSPVGD)\tInsulin receptor kinase substrate motif (HPRD)\n+146\t(pYM.M)|(YIpYGSFK)|(EEEIpY(G|E)EFD)|(D(D|E)(E|D|G)(I|V|L)pY(G|E)E(F|I)F)|((D|E).......(D|E)..pY..L.......Y..(L|I))|((D|E)(D|E)(E|D|G)(I|V|L)pY(G|E|D)E(F|I|L|V)(D|E))|(pY(A|G|S|T|D|E))\tSrc kinase substrate motif (HPRD)\n+147\t(pYM.M)|(pY..M)|(pYMPMS)\tPI3 Kinase p85 SH2 domain binding motif (HPRD)\n+148\tME(E|N)(I|V)pY(G|E)IFF\tFgr kinase substrate motif (HPRD)\n+149\tKKKSPGEpYVNIEFG\tIGF1 receptor kinase substrate motif (HPRD)\n+150\tpY..(L|I|V)\tJAK2 kinase substrate motif (HPRD)\n+151\tpTPpY\tJNK kinase substrate motif (HPRD)\n+152\t(E|D|pT|pY).pYEE\tSyk kinase substrate motif (HPRD)\n+153\tDpYpYR\tPTP1B, TC-PTP phosphatase substrate motif (HPRD)\n+154\t(D|E)FpY(G|A)(F|Y)(A|G)\tPTPRH phosphatase substrate motif (HPRD)\n+155\tF(M|L|V|I)pY\tPTPRJ phosphatase substrate motif (HPRD)\n+156\tpY(E|M|V)(N|V|I)\t3BP2 SH2 domain binding motif (HPRD)\n+157\tpYENP\tAbl SH2 domain binding motif (HPRD)\n+158\tpY(T|A|S)(K|R|Q|N)(M|I|V|R)\tCsk SH2 domain binding motif (HPRD)\n+159\tpYE.(V|I)\tFes SH2 domain binding motif (HPRD)\n+160\tpYEE(I|V)\tFgr SH2 domain binding motif (HPRD)\n+161\tpYEDP\tFyn SH2 domain binding motif (HPRD)\n+162\tpY(M|I|L|V).(M|I|L|V)\tGRB2, 3BP2, Csk, Fes, Syk C-terminal SH2 domain 
binding motif (HPRD)\n+163\t(F|Y)pY(E|T|Y|S)N(I|L|V|P|T|Y|S)\tGRB7, GRB10 SH2 domain binding motif (HPRD)\n+164\tpYF.(F|P|L|Y)\tHCP SH2 domain binding motif (HPRD)\n+165\tpY(A|E|V)(Y|F|E|S|N|V)(P|F|I|H)\tItk SH2 domain binding motif (HPRD)\n+166\tpYDYV\tLck and Src SH2 domain binding motif (HPRD)\n+167\tpYDEP\tNck SH2 domain binding motif (HPRD)\n+168\tpY(L|I|V)E(L|I|V)\tPLCgamma C and N-terminal SH2 domain binding motif (HPRD)\n+169\tpY..P\tRasGAP C-terminal SH2 domain binding motif (HPRD)\n+170\tpYILV.(M|L|I|V|P)\tRasGAP N-terminal SH2 domain binding motif (HPRD)\n+171\tTIpY..(V|I)\tSAP and EAT2 SH2 domain binding motif (HPRD)\n+172\tpY(L|V)N(V|P)\tSem5 SH2 domain binding motif (HPRD)\n+173\tpY(T|V|I).L\tShb SH2 domain binding motif (HPRD)\n+174\tpY(I|E|Y|L).(I|L|M)\tSHC SH2 domain binding motif (HPRD)\n+175\t(I|V|L|S).pY..(L|I)\tSHIP2 SH2 domain binding motif (HPRD)\n+176\t(I|V).pY..(L|V)\tSHP1 SH2 domain binding motif (HPRD)\n+177\t(V|I|L).pY(M|L|F).P\tSHP1, SHP2 SH2 domain binding motif (HPRD)\n+178\t(T|V|I|Y).pY(A|S|T|V).(I|V|L)\tSHP2 CSH2 domain binding motif (HPRD)\n+179\t(I|L|V)(I|L|V)(I|L|V|F|T|Y)pY(T|I|L|V)(I|L)(I|L|V|P)\tSHP2 C-terminal SH2 domain binding motif (HPRD)\n+180\tpYIPP\tSHP2, PLCgamma SH2 domain binding motif (HPRD)\n+181\tpYM.M\tSrc and Abl SH2 domain binding motif (HPRD)\n+182\tpY(R|K|H|Q|E|D)(R|K|H|Q|E|D)(I|P)\tSrc, Fyn, Lck, Fgr, Abl, Crk, Nck SH2 domain binding motif (HPRD)\n+183\tPP.pY\tSrc, Fyn,Csk, Nck and SHC SH2 domain binding motif (HPRD)\n+184\tpYEEI\tSrc,Lck and Fyn SH2 domains binding motif (HPRD)\n+185\tpY(D|E)(P|R)(R|P|Q)\tSTAT1 SH2 domain binding motif (HPRD)\n+186\tpY(Q|T|E)(E|Q)(L|I)\tSyk C-terminal SH2 domain binding motif (HPRD)\n+187\tpYTT(I|L|M)\tSyk N-terminal SH2 domain binding motif (HPRD)\n+188\t(D|E).......(D|E)..pY..L.......Y..(L|I)\tSyk, ZAP-70, Shc, Lyn SH2 domain binding motif (HPRD)\n+189\tpYEN(F|I|V)\tTensin SH2 domain binding motif (HPRD)\n+190\tD(N|D).pY\tCbl PTB domain binding motif 
(HPRD)\n+191\tN.LpY\tDok1 PTB domain binding motif (HPRD)\n+192\tN..pY\tFRIP PTB domain binding motif (HPRD)\n+193\tNP.pY\tShc PTB domain binding motif (HPRD)\n+194\tDD.pY\tShb PTB domain binding motif (HPRD)\n+195\tNP.pYF.R\tShcA PTB domain binding motif (HPRD)\n+196\tHN(M|L|V|I)(M|L|V|I|N)NP(S|T)pY\tShcC PTB domain binding motif (HPRD)\n'

diff -r 000000000000 -r c1403d18c189 test-data/test_input_for_anova.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_for_anova.tabular Mon Mar 07 19:05:01 2022 +0000

b'@@ -0,0 +1,23 @@\n+Phosphopeptide\tSequence10\tSequence7\tGene_Name\tPhosphoresidue\tUniProt_ID\tDescription\tFunction Phosphoresidue(PSP=PhosphoSitePlus.org)\tPutative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains\tIntensity.shL.1A\tIntensity.shL.1B\tIntensity.shL.1C\tIntensity.shR.2A\tIntensity.shR.2B\tIntensity.shR.2C\n+AAAAPDSRVpSEEENLK\tMAAAAPDSRVpSEEENLKKTPK\tAAPDSRVsEEENLKK\tRRP15\tpS11\tQ9Y3B9\tRRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2\tN/A\tCK2alpha | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | BARD1 BRCT domain binding | PKA | CK1 | CK2\t38150000\t39445000\t56305000\t55338000\t7010600\t70203000\n+AAAITDMADLEELSRLpSPLPPGpSPGSAAR\tMADLEELSRLpSPLPPGSPGSA; LSRLSPLPPGpSPGSAARGRAE\tLEELSRLsPLPPGSP | LSPLPPGsPGSAARG\tAEBP2; AEBP2\tpS18, pS24; pS18, pS24\tQ6ZN18; Q6ZN18-2\tAEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2\tN/A\tN/A\t5416400\t7101800\t385280000\t208060000\t41426000\t352400000\n+ADALQAGASQFETpSAAK\tLQAGASQFETpSAAKLKRKYWW\tGASQFETsAAKLKRK\tVAMP2; VAMP3\tpS80; pS63\tP63027; Q15836\tVAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3\tN/A\tPKD3 | PKCiota\t44627000\t41445000\t69094000\t42521000\t5738000\t61819000\n+DQKLpSELDDR\tDKVLERDQKLpSELDDRADALQ\tLERDQKLsELDDRAD\tVAMP1; VAMP1; VAMP1; VAMP2; VAMP3\tpS63; pS63; pS63; pS61; pS44\tP23763; P23763-2; P23763-3; P63027; Q15836\tVAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1; VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; 
VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3\tN/A\tCK2alpha | PKAbeta | PKAgamma | PKCiota | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Pyruvate dehydrogenase kinase substrate\t75542000\t44814000\t32924000\t35016000\t11023000\t4669900\n+EFVpSSDESSSGENK\tSESFKSKEFVpSSDESSSGENK\tFKSKEFVsSDESSSG\tSSRP1\tpS667\tQ08945\tSSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1\tN/A\tCK2alpha | CK2a2 | CDK7 | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate | CK2 | GSK3\t12562000\t16302000\t23000000\t7857800\t0\t18830000\n+EGMNPSYDEYADpSDEDQHDAYLER\tMNPSYDEYADpSDEDQHDAYLE\tSYDEYADsDEDQHDA\tSSRP1\tpS444\tQ08945\tSSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1\tN/A\tCK2alpha | CK2a2 | CDK7 | CK1alpha | Casein kinase II substrate | b-Adrenergic Receptor kinase substrate | Pyruvate dehydrogenase kinase substrate\t0\t0\t0\t0\t0\t0\n+IGNEEpSDLEEACILPHpSPINVDK\tDDEEKIGNEEpSDLEEACILPH; DLEEACILPHpSPINVDKRPIA\tEKIGNEEsDLEEACI | EACILPHsPINVDKR\tHERC2\tpS1577, pS1588\tO95714\tHERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2\tN/A\tCK2alpha | Casein kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | b-Adrenergic Receptor kinase substrate | WW domain binding | ERK/MAPK | CK2 | NEK6\t167764000\t121218000\t155736000\t140640000\t83642000\t128468000\n+IRAEEEDLAAVPFLApSDNEEEEDEK\tEDLAAVPFLApSDNEEEEDEKG\tAAVPFLAsDNEEEED\tHERC2\tpS2928\tO95714\tHERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2\tN/A\tCK2alpha | Casein kinase II substrate | CK2\t22562000\t18225000\t9119700\t11689000\t0\t0\n+KGLLApTpSGNDGTIR\tVWCNKKGLLApTSGNDGTIRVW; 
WCNKKGLLATpSGNDGTIRVWN\tNKKGLLAtSGNDGTI | KKGLLATsGNDGTIR\tHERC1\tpT3445, pS3446\tQ15751\tHERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1'..b'-9\tENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA\tN/A\tATM kinase substrate | PKC kinase substrate | PKA kinase substrate\t0\t0\t8765300\t0\t2355900\t14706000\n+QLSEpSFK\tSKSSSRQLSEpSFKSKEFVSSD\tSSRQLSEsFKSKEFV\tSSRP1\tpS659\tQ08945\tSSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1\tN/A\tCK2a2 | CDK7 | PKCalpha | PKCbeta | DNAPK | PKC kinase substrate | PKA kinase substrate | NEK6\t68201000\t87774000\t138300000\t95357000\t19966000\t149110000\n+RGpSLEMSSDGEPLSR\tSSATSGGRRGpSLEMSSDGEPL\tTSGGRRGsLEMSSDG\tAEBP2; AEBP2\tpS206; pS206\tQ6ZN18; Q6ZN18-2\tAEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2\tN/A\tCasein Kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | PKA | GSK3 | AURORA\t19262000\t11103000\t19454000\t0\t1816900\t22028000\n+SDGpSLEDGDDVHR\tIEDGGARSDGpSLEDGDDVHRA\tGGARSDGsLEDGDDV\tSERINC1\tpS364\tQ9NRX5\tSERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1\tN/A\tCasein kinase II substrate | Plk1 kinase substrate | Pyruvate dehydrogenase kinase substrate | CK1 | PLK | PLK1\t31407000\t17665000\t20892000\t23194000\t5132400\t54893000\n+SEpSLTAESR\tEGGGLMTRSEpSLTAESRLVHT\tGLMTRSEsLTAESRL\tHERC1\tpS1491\tQ15751\tHERC1_HUMAN Probable E3 ubiquitin-protein 
ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2\tN/A\tb-Adrenergic Receptor kinase substrate\t11766000\t13176000\t20540000\t16963000\t4364700\t21308000\n+STGPTAATGpSNRR\tMSTGPTAATGpSNRRLQQTQNQ\tGPTAATGsNRRLQQT\tVAMP3\tpS11\tQ15836\tVAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3\tN/A\tPKCalpha | PKCbeta | PKCzeta | PKC kinase substrate | PKA kinase substrate\t3057100\t4718800\t12052000\t5047700\t1070900\t8333500\n+TEDLEATpSEHFK\tRNKTEDLEATpSEHFKTTSQKV\tTEDLEATsEHFKTTS\tVAMP8\tpS55\tQ9BV40\tVAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1\tactivity, inhibited; abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion\tG protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate\t20400000\t9738500\t7862300\t0\t0\t76518000\n+TFWpSPELK\tSSMNSIKTFWpSPELKKERVLR\tNSIKTFWsPELKKER\tERC2\tpS187\tO15083\tERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3\tN/A\tIKKalpha | IKKbeta | HIPK2 | Casein Kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding\t29764000\t20957000\t24855000\t30752000\t8304800\t23771000\n+YFDpSGDYNMAK\tCADEMQKYFDpSGDYNMAKAKM; RLQKGQKYFDpSGDYNMAKAKM; MKSVEQKYFDpSGDYNMAKAKM\tEMQKYFDsGDYNMAK | KGQKYFDsGDYNMAK | VEQKYFDsGDYNMAK\tENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA\tpS67; pS67; pS83; pS90; pS63; pS63; pS79; pS83\tO43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9\tENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; 
ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA\tmolecular association, regulation; cell cycle regulation; PPP2CA(INDUCES)\tb-Adrenergic Receptor kinase substrate\t323250000\t127970000\t0\t67123000\t12790000\t71378000\n'

diff -r 000000000000 -r c1403d18c189 test-data/test_input_for_preproc.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_for_preproc.tabular Mon Mar 07 19:05:01 2022 +0000

[

b'@@ -0,0 +1,38 @@\n+Proteins\tPositions within proteins\tLeading proteins\tProtein\tFasta headers\tLocalization prob\tScore diff\tPEP\tScore\tDelta score\tScore for localization\tLocalization prob shL.1A\tScore diff shL.1A\tPEP shL.1A\tScore shL.1A\tLocalization prob shL.1B\tScore diff shL.1B\tPEP shL.1B\tScore shL.1B\tLocalization prob shL.1C\tScore diff shL.1C\tPEP shL.1C\tScore shL.1C\tLocalization prob shR.2A\tScore diff shR.2A\tPEP shR.2A\tScore shR.2A\tLocalization prob shR.2B\tScore diff shR.2B\tPEP shR.2B\tScore shR.2B\tLocalization prob shR.2C\tScore diff shR.2C\tPEP shR.2C\tScore shR.2C\tDiagnostic peak\tNumber of Phospho (STY)\tAmino acid\tSequence window\tModification window\tPeptide window coverage\tPhospho (STY) Probabilities\tPhospho (STY) Score diffs\tPosition in peptide\tCharge\tMass error [ppm]\tIdentification type shL.1A\tIdentification type shL.1B\tIdentification type shL.1C\tIdentification type shR.2A\tIdentification type shR.2B\tIdentification type shR.2C\tIntensity\tIntensity___1\tIntensity___2\tIntensity___3\tRatio mod/base\tIntensity shL.1A\tIntensity shL.1B\tIntensity shL.1C\tIntensity shR.2A\tIntensity shR.2B\tIntensity shR.2C\tRatio mod/base shL.1A\tRatio mod/base shL.1B\tRatio mod/base shL.1C\tRatio mod/base shR.2A\tRatio mod/base shR.2B\tRatio mod/base shR.2C\tIntensity shL.1A___1\tIntensity shL.1A___2\tIntensity shL.1A___3\tIntensity shL.1B___1\tIntensity shL.1B___2\tIntensity shL.1B___3\tIntensity shL.1C___1\tIntensity shL.1C___2\tIntensity shL.1C___3\tIntensity shR.2A___1\tIntensity shR.2A___2\tIntensity shR.2A___3\tIntensity shR.2B___1\tIntensity shR.2B___2\tIntensity shR.2B___3\tIntensity shR.2C___1\tIntensity shR.2C___2\tIntensity shR.2C___3\tOccupancy shL.1A\tOccupancy ratioshL.1A\tOccupancy error scale shL.1A\tOccupancy shL.1B\tOccupancy ratioshL.1B\tOccupancy error scale shL.1B\tOccupancy shL.1C\tOccupancy ratioshL.1C\tOccupancy error scale shL.1C\tOccupancy shR.2A\tOccupancy ratioshR.2A\tOccupancy error scale 
shR.2A\tOccupancy shR.2B\tOccupancy ratioshR.2B\tOccupancy error scale shR.2B\tOccupancy shR.2C\tOccupancy ratioshR.2C\tOccupancy error scale shR.2C\tReverse\tPotential contaminant\tid\tProtein group IDs\tPositions\tPosition\tPeptide IDs\tMod. peptide IDs\tEvidence IDs\tMS/MS IDs\tBest localization evidence ID\tBest localization MS/MS ID\tBest localization raw file\tBest localization scan number\tBest score evidence ID\tBest score MS/MS ID\tBest score raw file\tBest score scan number\tBest PEP evidence ID\tBest PEP MS/MS ID\tBest PEP raw file\tBest PEP scan number\n+sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN\t108;108;124;124;131;104;104;120\tsp|O43768-2|ENSA_HUMAN\tsp|O43768-2|ENSA_HUMAN\t\t0.877317\t8.54376\t0.001041\t110.11\t55.028\t110.11\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t1\tS\tTGDHIPTPQDLPQRKSSLVTSKLAG______\tX;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X\tXXXXXXXXXXXXXXPPPPPPPPXXXXXXXXX\tKS(0.877)S(0.123)LVTSK\tKS(8.54)S(-8.54)LVT(-58.58)S(-72.01)K\t2\t2\t0.022801\t\t\tBy MS/MS\t\t\t\t18629000\t18629000\t0\t0\t\t0\t0\t18629000\t0\t0\t0\t\t\t\t\t\t\t0\t0\t0\t0\t0\t0\t18629000\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t700\t529\t108\t108\t12310;20039\t13742;22688\t99166\t91729\t99166\t91729\tQE05099\t5593\t99166\t91729\tQE05099\t5593\t99166\t91729\tQE05099\t5593\n+sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN\t109;109;125;125;132;105;105;121\tsp|O43768-2|ENSA_HUMAN\tsp|O43768-2|ENSA_HUMAN\t\t0.877764\t9.23011\t0.00135208\t98.182\t25.939\t55.754\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t1\tS\tGDHIPTPQDLPQRKSSLVTSKLAG_______\tX;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho 
(STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X\tXXXXXXXXXXXXXPPPPPPPPXXXXXXXXXX\tKS(0.105)S(0.878)LVT(0.015)S(0.002)K\tKS(-9.23)S(9.23)LVT(-17.65)S(-25.69)K\t3\t2\t-0.061619\tBy MS/MS\tBy MS/MS\tBy matching\tBy matching\tBy matching\tBy MS/MS\t81973000\t81973000\t0\t0\t\t7090300\t8341200\t9691500\t10030000\t1675200\t9952100\t\t\t\t\t\t\t7090300\t0\t0\t8341200\t0\t0\t9691500\t0\t0\t10030000\t0\t0\t1675200\t0\t0\t99'..b'tching\tBy matching\t86590000\t86590000\t0\t0\t0.032027\t17447000\t15753000\t20219000\t14001000\t6284700\t12885000\t0.028348\t0.025719\t0.032895\t0.033925\t0.083789\t0.034516\t17447000\t0\t0\t15753000\t0\t0\t20219000\t0\t0\t14001000\t0\t0\t6284700\t0\t0\t12885000\t0\t0\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\t\t\t1189\t809\t48\t48\t17891\t20149\t142427;142428;142429;142430;142431;142432\t127454\t142427\t127454\tQE05099\t48504\t142427\t127454\tQE05099\t48504\t142427\t127454\tQE05099\t48504\n+sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN\t63;80\tsp|Q15836|VAMP3_HUMAN\tsp|Q15836|VAMP3_HUMAN\t\t0.920811\t10.6555\t1.81E-09\t124.1\t98.278\t107.25\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t1\tS\tDRADALQAGASQFETSAAKLKRKYWWKNCKM\tX;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X\tXXPPPPPPPPPPPPPPPPPXXXXXXXXXXXX\tADALQAGASQFET(0.079)S(0.921)AAK\tADALQAGAS(-49.99)QFET(-10.66)S(10.66)AAK\t14\t2\t0.23449\tBy MS/MS\tBy MS/MS\tBy MS/MS\tBy MS/MS\tBy matching\tBy 
MS/MS\t265240000\t265240000\t0\t0\t0.036151\t44627000\t41445000\t69094000\t42521000\t5738000\t61819000\t0.03226\t0.028442\t0.039791\t0.036967\t0.030963\t0.043392\t44627000\t0\t0\t41445000\t0\t0\t69094000\t0\t0\t42521000\t0\t0\t5738000\t0\t0\t61819000\t0\t0\t0.47624\t0.90925\t12.188\t0.51677\t1.0694\t7.2217\tNaN\tNaN\tNaN\t0.81588\t4.4311\t19.209\tNaN\tNaN\tNaN\t0.4388\t0.78189\t5.9861\t\t\t4442\t2836\t63\t63\t279\t319\t2297;2298;2299;2300;2301;2302\t1992;1993;1994;1995;1996\t2300\t1995\tQE05100\t30086\t2301\t1996\tQE05102\t30007\t2301\t1996\tQE05102\t30007\n+sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN;sp|P23763-2|VAMP1_HUMAN;sp|P23763-3|VAMP1_HUMAN;sp|P23763|VAMP1_HUMAN\t44;61;63;63;63\tsp|Q15836|VAMP3_HUMAN\tsp|Q15836|VAMP3_HUMAN\t\t1\t65.4951\t2.36E-06\t126.19\t98.602\t65.495\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t1\tS\tMRVNVDKVLERDQKLSELDDRADALQAGASQ\tX;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X\tXXXXXXXXXXXPPPPPPPPPPXXXXXXXXXX\tDQKLS(1)ELDDR\tDQKLS(65.5)ELDDR\t5\t3\t-0.72518\tBy MS/MS\tBy MS/MS\tBy MS/MS\tBy MS/MS\tBy matching\tBy MS/MS\t412950000\t412950000\t0\t0\tNaN\t75542000\t44814000\t32924000\t35016000\t11023000\t4669900\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\t75542000\t0\t0\t44814000\t0\t0\t32924000\t0\t0\t35016000\t0\t0\t11023000\t0\t0\t4669900\t0\t0\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\t\t\t4443\t2836\t44\t44\t4530\t5083\t37093;37094;37095;37096;37097;37098;37099;37100;37101;37102;37103;37104\t34712;34713;34714;34715;34716;34717;34718;34719\t37100\t34719\tQE05102\t18436\t37093\t34712\tQE05097\t18245\t37093\t34712\tQE05097\t18245\n+sp|Q15836|VAMP3_HUMAN\t11\tsp|Q15836|VAMP3_HUMAN\tsp|Q15836|VAMP3_HUMAN\t\t0.97018\t15.1316\t0.000117365\t79.652\t72.041\t79.652\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t1\tS\t_____MSTGPTAATGSNRRLQQTQNQVDEVV\tX;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho 
(STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X\tXXXXXXPPPPPPPPPPPPPXXXXXXXXXXXX\tSTGPTAAT(0.03)GS(0.97)NRR\tS(-66.94)T(-63.48)GPT(-42.47)AAT(-15.13)GS(15.13)NRR\t10\t2\t-0.15791\tBy matching\tBy matching\tBy MS/MS\tBy matching\tBy matching\tBy MS/MS\t34280000\t34280000\t0\t0\tNaN\t3057100\t4718800\t12052000\t5047700\t1070900\t8333500\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\t3057100\t0\t0\t4718800\t0\t0\t12052000\t0\t0\t5047700\t0\t0\t1070900\t0\t0\t8333500\t0\t0\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\t\t\t4444\t2836\t11\t11\t20280\t22978\t162490;162491;162492;162493;162494;162495\t144222;144223\t162490\t144222\tQE05099\t7582\t162490\t144222\tQE05099\t7582\t162490\t144222\tQE05099\t7582\n+sp|Q9BV40|VAMP8_HUMAN\t55\tsp|Q9BV40|VAMP8_HUMAN\tsp|Q9BV40|VAMP8_HUMAN\t\t0.959784\t13.7778\t3.78E-05\t91.969\t27.98\t91.969\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t1\tS\tNLEHLRNKTEDLEATSEHFKTTSQKVARKFW\tX;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X\tXXXXXXXXPPPPPPPPPPPPXXXXXXXXXXX\tTEDLEAT(0.04)S(0.96)EHFK\tT(-83.18)EDLEAT(-13.78)S(13.78)EHFK\t8\t2\t0.40785\tBy matching\tBy matching\tBy matching\t\t\tBy MS/MS\t114520000\t114520000\t0\t0\tNaN\t20400000\t9738500\t7862300\t0\t0\t76518000\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\t20400000\t0\t0\t9738500\t0\t0\t7862300\t0\t0\t0\t0\t0\t0\t0\t0\t76518000\t0\t0\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\tNaN\t\t\t7902\t4687\t55\t55\t21013\t23827\t168874;168875;168876;168877\t150433\t168874\t150433\tQE05102\t19524\t168874\t150433\tQE05102\t19524\t168874\t150433\tQE05102\t19524\n'

diff -r 000000000000 -r c1403d18c189 test-data/test_kinase_substrate.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_kinase_substrate.tabular Mon Mar 07 19:05:01 2022 +0000

@@ -0,0 +1,2 @@
+GENE KINASE KIN_ACC_ID KIN_ORGANISM SUBSTRATE SUB_GENE_ID SUB_ACC_ID SUB_GENE SUB_ORGANISM SUB_MOD_RSD SITE_GRP_ID SITE_+/-7_AA DOMAIN IN_VIVO_RXN IN_VITRO_RXN CST_CAT#
+Csnk2a1 CK2A1 Q60737 human VAMP4 53330 O70480 Vamp4 human S30 454285 RNLLEDDsDEEEDFF X

diff -r 000000000000 -r c1403d18c189 test-data/test_networkin.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_networkin.tabular Mon Mar 07 19:05:01 2022 +0000

@@ -0,0 +1,33 @@
+#substrate position id networkin_score tree netphorest_group netphorest_score string_identifier string_score substrate_name sequence string_path
+VAMP4 (ENSP00000236192) 30 CK2alpha 35.6396 KIN CK2_group 0.5228 ENSP00000236192 0.85 VAMP4 LLEDDsDEEED "ENSP00000217244, 0.68 ENSP00000236192"
+SSRP1 (ENSP00000278412) 444 CK2alpha 28.6345 KIN CK2_group 0.3768 ENSP00000278412 0.874 SSRP1 DEYADsDEDQH "ENSP00000217244, 0.6992 ENSP00000278412"
+SSRP1 (ENSP00000278412) 667 CK2alpha 22.2088 KIN CK2_group 0.3168 ENSP00000278412 0.874 SSRP1 SKEFVsSDESS "ENSP00000217244, 0.6992 ENSP00000278412"
+HERC2 (ENSP00000261609) 1577 CK2alpha 10.7686 KIN CK2_group 0.5253 ENSP00000261609 0.4514 HERC2 IGNEEsDLEEA "ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609"
+HERC2 (ENSP00000261609) 2928 CK2alpha 10.7686 KIN CK2_group 0.4698 ENSP00000261609 0.4514 HERC2 VPFLAsDNEEE "ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609"
+RRP15 (ENSP00000355899) 11 CK2alpha 8.5484 KIN CK2_group 0.3566 ENSP00000355899 0.461 RRP15 PDSRVsEEENL "ENSP00000217244, 0.3688 ENSP00000355899"
+SSRP1 (ENSP00000278412) 444 CK2a2 7.8435 KIN CK2_group 0.3768 ENSP00000278412 0.615 SSRP1 DEYADsDEDQH "ENSP00000262506, 0.492 ENSP00000278412"
+SSRP1 (ENSP00000278412) 667 CK2a2 7.7757 KIN CK2_group 0.3168 ENSP00000278412 0.615 SSRP1 SKEFVsSDESS "ENSP00000262506, 0.492 ENSP00000278412"
+VAMP2 (ENSP00000314214) 80 PKD3 6.9217 KIN PKD_group 0.0744 ENSP00000314214 0.949 VAMP2 SQFETsAAKLK "ENSP00000234179, 0.7592 ENSP00000314214"
+VAMP2 (ENSP00000314214) 61 CK2alpha 6.3122 KIN CK2_group 0.3338 ENSP00000314214 0.4391 VAMP2 RDQKLsELDDR "ENSP00000217244, 0.7992 ENSP00000222812, 0.7544 ENSP00000314214"
+VAMP1 (ENSP00000380148) 63 CK2alpha 6.1363 KIN CK2_group 0.3338 ENSP00000380148 0.4364 VAMP1 RDQKLsELDDR "ENSP00000217244, 0.7944 ENSP00000222812, 0.7544 ENSP00000380148"
+ERC1 (ENSP00000354158) 191 IKKalpha 5.3194 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.96 ERC1 IKTFWsPELKK "ENSP00000359424, 0.768 ENSP00000354158"
+ERC1 (ENSP00000354158) 191 IKKalpha 5.3194 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.96 ERC1 IKTFWsPELKK "ENSP00000359424, 0.768 ENSP00000354158"
+VAMP2 (ENSP00000314214) 61 PKAbeta 4.9293 KIN PKA_group 0.1153 ENSP00000314214 0.8 VAMP2 RDQKLsELDDR "ENSP00000359719, 0.64 ENSP00000314214"
+VAMP2 (ENSP00000314214) 61 PKAgamma 4.9293 KIN PKA_group 0.1153 ENSP00000314214 0.8 VAMP2 RDQKLsELDDR "ENSP00000366488, 0.64 ENSP00000314214"
+VAMP3 (ENSP00000054666) 44 CK2alpha 4.2842 KIN CK2_group 0.3338 ENSP00000054666 0.4201 VAMP3 RDQKLsELDDR "ENSP00000217244, 0.7992 ENSP00000317714, 0.6792 ENSP00000054666"
+VAMP2 (ENSP00000314214) 80 PKCiota 3.8971 KIN PKC_group 0.0928 ENSP00000314214 0.899 VAMP2 SQFETsAAKLK "ENSP00000295797, 0.7192 ENSP00000314214"
+SSRP1 (ENSP00000278412) 444 CDK7 3.6159 KIN CDK7 0.0186 ENSP00000278412 0.903 SSRP1 DEYADsDEDQH "ENSP00000256443, 0.7224 ENSP00000278412"
+SSRP1 (ENSP00000278412) 444 CK1alpha 3.3573 KIN CK1_group 0.1264 ENSP00000278412 0.404 SSRP1 DEYADsDEDQH "ENSP00000261798, 0.3232 ENSP00000278412"
+VAMP3 (ENSP00000054666) 11 PKCalpha 3.0633 KIN PKC_group 0.4633 ENSP00000054666 0.3277 VAMP3 TAATGsNRRLQ "ENSP00000284384, 0.6232 ENSP00000359025, 0.6352 ENSP00000054666"
+SSRP1 (ENSP00000278412) 659 PKCalpha 3.0524 KIN PKC_group 0.4345 ENSP00000278412 0.237 SSRP1 RQLSEsFKSKE "ENSP00000284384, 0.4552 ENSP00000351885, 0.76 ENSP00000278412"
+VAMP2 (ENSP00000314214) 61 PKCiota 2.7785 KIN PKC_group 0.0463 ENSP00000314214 0.899 VAMP2 RDQKLsELDDR "ENSP00000295797, 0.7192 ENSP00000314214"
+SSRP1 (ENSP00000278412) 659 CDK7 2.5961 KIN CDK7 0.0104 ENSP00000278412 0.903 SSRP1 RQLSEsFKSKE "ENSP00000256443, 0.7224 ENSP00000278412"
+SSRP1 (ENSP00000278412) 667 CDK7 2.5961 KIN CDK7 0.0124 ENSP00000278412 0.903 SSRP1 SKEFVsSDESS "ENSP00000256443, 0.7224 ENSP00000278412"
+ERC1 (ENSP00000354158) 191 IKKbeta 2.571 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.946 ERC1 IKTFWsPELKK "ENSP00000339151, 0.7568 ENSP00000354158"
+ERC1 (ENSP00000354158) 191 IKKbeta 2.571 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.946 ERC1 IKTFWsPELKK "ENSP00000339151, 0.7568 ENSP00000354158"
+SSRP1 (ENSP00000278412) 659 PKCbeta 2.4948 KIN PKC_group 0.4345 ENSP00000278412 0.1743 SSRP1 RQLSEsFKSKE "ENSP00000305355, 0.7976 ENSP00000366013, 0.7192 ENSP00000284811, 0.7448 ENSP00000278412"
+VAMP3 (ENSP00000054666) 11 PKCbeta 2.4948 KIN PKC_group 0.4633 ENSP00000054666 0.2393 VAMP3 TAATGsNRRLQ "ENSP00000305355, 0.512 ENSP00000348986, 0.7616 ENSP00000054666"
+SSRP1 (ENSP00000278412) 659 CK2a2 2.4345 KIN CK2_group 0.0356 ENSP00000278412 0.615 SSRP1 RQLSEsFKSKE "ENSP00000262506, 0.492 ENSP00000278412"
+ERC1 (ENSP00000354158) 191 HIPK2 2.2748 KIN HIPK1_HIPK2_group 0.0463 ENSP00000354158 0.4159 ERC1 IKTFWsPELKK "ENSP00000263551, 0.7696 ENSP00000286332, 0.7192 ENSP00000354158"
+VAMP3 (ENSP00000054666) 11 PKCzeta 2.0773 KIN PKC_group 0.4633 ENSP00000054666 0.4263 VAMP3 TAATGsNRRLQ "ENSP00000367830, 0.7688 ENSP00000320935, 0.796 ENSP00000054666"
+SSRP1 (ENSP00000278412) 659 DNAPK 2.0042 KIN DNAPK 0.0584 ENSP00000278412 0.56 SSRP1 RQLSEsFKSKE "ENSP00000313420, 0.448 ENSP00000278412"

diff -r 000000000000 -r c1403d18c189 test-data/test_regulatory_sites.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_regulatory_sites.tabular Mon Mar 07 19:05:01 2022 +0000

@@ -0,0 +1,8 @@
+32017
+"PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words ""PhosphoSitePlus(R), www.phosphosite.org"" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: ""Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."""
+
+GENE PROTEIN PROT_TYPE ACC_ID GENE_ID HU_CHR_LOC ORGANISM MOD_RSD SITE_GRP_ID SITE_+/-7_AA DOMAIN ON_FUNCTION ON_PROCESS ON_PROT_INTERACT ON_OTHER_INTERACT PMIDs LT_LIT MS_LIT MS_CST NOTES
+ENSA ENSA "Inhibitor; Protein phosphatase, regulatory subunit" O43768 2029 1q21.3 human S109-p 477819 DLPQRKSsLVTSKLA Endosulfine "molecular association, regulation; protein conformation" SNCA(DISRUPTS) 18973346 1 34 50
+VAMP8 VAMP8 "Membrane protein, integral; Vesicle" Q9BV40 8673 2p11.2 human S55-p 12738929 TEDLEATsEHFKTTS Synaptobrevin "activity, inhibited" 27402227 1 8 0 "abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion"
+ENSA ENSA "Inhibitor; Protein phosphatase, regulatory subunit" O43768 2029 1q21.3 human S67-p 455934 KGQKYFDsGDYNMAK Endosulfine "molecular association, regulation" cell cycle regulation PPP2CA(INDUCES) 27889260 3 56 47
+Vamp4 VAMP4 "Membrane protein, integral; Vesicle" O70480 53330 1 H2.1|1 70.29 cM mouse S30-p 454285 RNLLEDDsDEEEDFF "molecular association, regulation; intracellular localization" PACS-1(INDUCES) 14608369 1 64 10

diff -r 000000000000 -r c1403d18c189 test-data/test_swissprot.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_swissprot.fasta Mon Mar 07 19:05:01 2022 +0000

b'@@ -0,0 +1,68 @@\n+>sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2\n+MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDDDAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEKEKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKHQKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKSEEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT\n+>sp|Q08945|SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1\n+MAETLEFNDVYQEVKGSMNDGRLRLSRQGIIFKNSKTGKVDNIQAGELTEGIWRRVALGHGLKLLTKNGHVYKYDGFRESEFEKLSDFFKTHYRLELMEKDLCVKGWNWGTVKFGGQLLSFDIGDQPVFEIPLSNVSQCTTGKNEVTLEFHQNDDAEVSLMEVRFYVPPTQEDGVDPVEAFAQNVLSKADVIQATGDAICIFRELQCLTPRGRYDIRIYPTFLHLHGKTFDYKIPYTTVLRLFLLPHKDQRQMFFVISLDPPIKQGQTRYHFLILLFSKDEDISLTLNMNEEEVEKRFEGRLTKNMSGSLYEMVSRVMKALVNRKITVPGNFQGHSGAQCITCSYKASSGLLYPLERGFIYVHKPPVHIRFDEISFVNFARGTTTTRSFDFEIETKQGTQYTFSSIEREEYGKLFDFVNAKKLNIKNRGLKEGMNPSYDEYADSDEDQHDAYLERMKEEGKIREENANDSSDDSGEETDESFNPGEEEEDVAEEFDSNASASSSSNEGDSDRDEKKRKQLKKAKMAKDRKSRKKPVEVKKGKDPNAPKRPMSAYMLWLNASREKIKSDHPGISITDLSKKAGEIWKGMSKEKKEEWDRKAEDARRDYEKAMKEYEGGRGESSKRDKSKKKKKVKVKMEKKSTPSRGSSSKSSSRQLSESFKSKEFVSSDESSSGENKSKKKRRRSEDSEEEELASTPPSSEDSASGSDE\n+>sp|Q96SA4|SERC2_HUMAN Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 PE=2 SV=3\n+MGACLGACSLLSCASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS\n+>sp|Q96SA4-2|SERC2_HUMAN Isoform 2 of Serine incorporator 2 OS=Homo sapiens OX=9606 
GN=SERINC2\n+MGAEGAPDFLSCPRVRRASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS\n+>sp|Q96SA4-3|SERC2_HUMAN Isoform 3 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2\n+MRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS\n+>sp|Q96SA4-4|SERC2_HUMAN Isoform 4 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2\n+MDGRMMRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS\n+>sp|Q9NRX5|SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 
SV=1\n+MGSVLGLCSMASWIPCLCGSAPCLLCRCCPSGNNSTVTRLIYALFLLVGVCVACVMLIPGMEEQLNKIPGFCENEKGVVPCNILVGYKAVYRLCFGLAMFYLLLSLLMIKVKSSSDPRAAVHNGFWFFKFAAAIAIIIGAFFIPEGTFTTVWFYVGMAGAFCFILIQLVLLIDFAHSWNESWVEKMEEGNSRCWYAALLSATALNYLLSLVAIVLFFVYYTHPASCSENKAFISVNMLLCVGASVMSILPKIQESQPRSGLLQSSVITVYTMYLTWSAMTNEPETNCNPSLLSIIGYNTTSTVPKEGQSVQWWHAQGIIGLILFLLCVFYSSIRTSNNSQVNKLTLTSDESTLIEDGGARSDGSLEDGDDVHRAVDNERDGVTYSYSFFHFMLFLASLYIMMTLTNWYRYEPSREMKSQWTAVWVKISSSWIGIVLYVWTLVAPLVLTNRDFD\n+>sp|O43768|ENSA_HUMAN Alpha-endosulf'..b'=1 SV=3\n+MYGSARTITNLEGSPSRSPRLPRSPRLGHRRTSSGGGGGTGKTLSMENIQSLNAAYATSGPMYLSDHEGVASTTYPKGTMTLGRATNRAVYGGRVTAMGSSPNIASAGLSHTDVLSYTDQHGGLTGSSHHHHHQVPSMLRQVRDSTMLDLQAQLKELQRENDLLRKELDIKDSKLGSSMNSIKTFWSPELKKERVLRKEEAARMSVLKEQMRVSHEENQHLQLTIQALQDELRTQRDLNHLLQQESGNRGAEHFTIELTEENFRRLQAEHDRQAKELFLLRKTLEEMELRIETQKQTLNARDESIKKLLEMLQSKGLPSKSLEDDNERTRRMAEAESQVSHLEVILDQKEKENIHLREELHRRSQLQPEPAKTKALQTVIEMKDTKIASLERNIRDLEDEIQMLKANGVLNTEDREEEIKQIEVYKSHSKFMKTKIDQLKQELSKKESELLALQTKLETLSNQNSDCKQHIEVLKESLTAKEQRAAILQTEVDALRLRLEEKESFLNKKTKQLQDLTEEKGTLAGEIRDMKDMLEVKERKINVLQKKIENLQEQLRDKDKQLTNLKDRVKSLQTDSSNTDTALATLEEALSEKERIIERLKEQRERDDRERLEEIESFRKENKDLKEKVNALQAELTEKESSLIDLKEHASSLASAGLKRDSKLKSLEIAIEQKKEECSKLEAQLKKAHNIEDDSRMNPEFADQIKQLDKEASYYRDECGKAQAEVDRLLEILKEVENEKNDKDKKIAELESLTLRHMKDQNKKVANLKHNQQLEKKKNAQLLEEVRRREDSMADNSQHLQIEELMNALEKTRQELDATKARLASTQQSLAEKEAHLANLRIERRKQLEEILEMKQEALLAAISEKDANIALLELSASKKKKTQEEVMALKREKDRLVHQLKQQTQNRMKLMADNYDDDHHHYHHHHHHHHHRSPGRSQHSNHRPSPDQDDEEGIWA\n+>sp|P23763|VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1\n+MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVIYFFT\n+>sp|P23763-3|VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1\n+MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVSKYR\n+>sp|P23763-2|VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane 
protein 1 OS=Homo sapiens OX=9606 GN=VAMP1\n+MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVRRD\n+>sp|Q15836|VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3\n+MSTGPTAATGSNRRLQQTQNQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNCKMWAIGITVLVIFIIIIIVWVVSS\n+>sp|P63027|VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3\n+MSATAATAPPAAPAGEGGPPAPPPNLTSNRRLQQTQAQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNLKMMIILGVICAIILIIIIVYFST\n+>sp|O75379|VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2\n+MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLRGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT\n+>sp|O75379-2|VAMP4_HUMAN_Isoform 2 of Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4\n+MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT\n+>sp|O95183|VAMP5_HUMAN_Vesicle-associated membrane protein 5 OS=Homo sapiens OX=9606 GN=VAMP5 PE=1 SV=1\n+MAGIELERCQQQANEVTEIMRNNFGKVLERGVKLAELQQRSDQLLDMSSTFNKTTQNLAQKKCWENIRYRICVGLVVVGVLLIILIVLLVVFLPQSSDSSSAPRTQDAGIASGPGN\n+>sp|P51809|VAMP7_HUMAN_Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 PE=1 SV=3\n+MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK\n+>sp|P51809-2|VAMP7_HUMAN_Isoform 2 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 
GN=VAMP7\n+MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIVCHLQNYQQKSCSSHVYEEPQAHYYHHHRINCVHLYHCFTSLWWIYMAKLCEEIGKKKLPLTKDMREQGVKSNPCDSSLSHTDRWYLPVSSTLFSLFKILFHASRFIFVLSTSLFL\n+>sp|P51809-3|VAMP7_HUMAN_Isoform 3 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7\n+MAILFAVVARGTTILAKHAWCGGNFLEDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK\n+>sp|Q9BV40|VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1\n+MEEASEGGGNDRVRNLQSEVEGVKNIMTQNVERILARGENLEHLRNKTEDLEATSEHFKTTSQKVARKFWWKNVKMIVLICVIVFIIILFIVLFATGAFS\n'

diff -r 000000000000 -r c1403d18c189 workflow/ppenrich_suite_wf.ga
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/workflow/ppenrich_suite_wf.ga Mon Mar 07 19:05:01 2022 +0000

[

b'@@ -0,0 +1,653 @@\n+{\n+ "a_galaxy_workflow": "true",\n+ "annotation": "phoshpoproteomic enrichment data pre-processing and ANOVA",\n+ "creator": [\n+ {\n+ "class": "Person",\n+ "identifier": "0000-0002-2882-0508",\n+ "name": "Art Eschenlauer"\n+ }\n+ ],\n+ "format-version": "0.1",\n+ "license": "MIT",\n+ "name": "ppenrich_suite_wf",\n+ "steps": {\n+ "0": {\n+ "annotation": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).",\n+ "content_id": null,\n+ "errors": null,\n+ "id": 0,\n+ "input_connections": {},\n+ "inputs": [\n+ {\n+ "description": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).",\n+ "name": "Phospho (STY)Sites.txt"\n+ }\n+ ],\n+ "label": "Phospho (STY)Sites.txt",\n+ "name": "Input dataset",\n+ "outputs": [],\n+ "position": {\n+ "bottom": 257.06666564941406,\n+ "height": 81.39999389648438,\n+ "left": 339.95001220703125,\n+ "right": 539.9500122070312,\n+ "top": 175.6666717529297,\n+ "width": 200,\n+ "x": 339.95001220703125,\n+ "y": 175.6666717529297\n+ },\n+ "tool_id": null,\n+ "tool_state": "{\\"optional\\": false, \\"format\\": [\\"tabular\\"], \\"tag\\": null}",\n+ "tool_version": null,\n+ "type": "data_input",\n+ "uuid": "002d55e6-29a5-426d-9248-70ec33424b15",\n+ "workflow_outputs": []\n+ },\n+ "1": {\n+ "annotation": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)",\n+ "content_id": null,\n+ "errors": null,\n+ "id": 1,\n+ "input_connections": {},\n+ "inputs": [\n+ {\n+ "description": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and 
https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)",\n+ "name": "SwissProt_Human_Canonical_Isoform.fasta"\n+ }\n+ ],\n+ "label": "SwissProt_Human_Canonical_Isoform.fasta",\n+ "name": "Input dataset",\n+ "outputs": [],\n+ "position": {\n+ "bottom": 411.4666748046875,\n+ "height": 101.79998779296875,\n+ "left": 379.95001220703125,\n+ "right": 579.9500122070312,\n+ "top": 309.66668701171875,\n+ "width": 200,\n+ "x": 379.95001220703125,\n+ "y": 309.66668701171875\n+ },\n+ "tool_id": null,\n+ "tool_state": "{\\"optional\\": false, \\"format\\": [\\"fasta\\"], \\"tag\\": null}",\n+ "tool_version": null,\n+ "type": "data_input",\n+ "uuid": "8f079dcc-1843-47cd-b4dc-1830e4466430",\n+ "workflow_outputs": []\n+ },\n+ "2": {\n+ "annotation": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)",\n+ "content_id": null,\n+ "errors": null,\n+ "id": 2,\n+ "input_connections": {},\n+ "inputs": [\n+ {\n+ "description": "Derived from https://networkin.info/download/networkin_human_predic'..b'pe": "tool",\n+ "uuid": "2257286b-6f9a-45c1-90a3-bf5b972959d5",\n+ "workflow_outputs": [\n+ {\n+ "label": "intensities_group-mean-imputed_QN_LT",\n+ "output_name": "imputed_data_file",\n+ "uuid": "8e7317c6-95e9-4454-b4d7-31b4de6167a8"\n+ },\n+ {\n+ "label": "intensities_group-mean-imputed_report",\n+ "output_name": "report_file",\n+ "uuid": "dfe9b34e-1f3e-4971-8382-41178104e253"\n+ }\n+ ]\n+ },\n+ "9": {\n+ "annotation": "Perform ANOVA. 
For imputing missing values, create random values.",\n+ "content_id": "mqppep_anova",\n+ "errors": null,\n+ "id": 9,\n+ "input_connections": {\n+ "alpha_file": {\n+ "id": 6,\n+ "output_name": "output"\n+ },\n+ "input_file": {\n+ "id": 7,\n+ "output_name": "preproc_tab"\n+ }\n+ },\n+ "inputs": [],\n+ "label": "MaxQuant Phosphopeptide ANOVA randomly imputed",\n+ "name": "MaxQuant Phosphopeptide ANOVA",\n+ "outputs": [\n+ {\n+ "name": "imputed_data_file",\n+ "type": "tabular"\n+ },\n+ {\n+ "name": "report_file",\n+ "type": "html"\n+ }\n+ ],\n+ "position": {\n+ "bottom": 1325.0999603271484,\n+ "height": 254.93333435058594,\n+ "left": 1452.949951171875,\n+ "right": 1652.949951171875,\n+ "top": 1070.1666259765625,\n+ "width": 200,\n+ "x": 1452.949951171875,\n+ "y": 1070.1666259765625\n+ },\n+ "post_job_actions": {\n+ "RenameDatasetActionimputed_data_file": {\n+ "action_arguments": {\n+ "newname": "#{input_file}.intensities_randomly-imputed_QN_LT"\n+ },\n+ "action_type": "RenameDatasetAction",\n+ "output_name": "imputed_data_file"\n+ },\n+ "RenameDatasetActionreport_file": {\n+ "action_arguments": {\n+ "newname": "#{input_file}.intensities_randomly-imputed_report (download/unzip to view)"\n+ },\n+ "action_type": "RenameDatasetAction",\n+ "output_name": "report_file"\n+ }\n+ },\n+ "tool_id": "mqppep_anova",\n+ "tool_state": "{\\"alpha_file\\": {\\"__class__\\": \\"ConnectedValue\\"}, \\"first_data_column\\": \\"Intensity\\", \\"imputation\\": {\\"imputation_method\\": \\"random\\", \\"__current_case__\\": 3, \\"meanPercentile\\": \\"1\\", \\"sdPercentile\\": \\"0.2\\"}, \\"input_file\\": {\\"__class__\\": \\"ConnectedValue\\"}, \\"sample_grouping_regex\\": \\"(\\\\\\\\d+)\\", \\"sample_names_regex\\": \\"\\\\\\\\.(\\\\\\\\d+)[A-Z]$\\", \\"__page__\\": null, \\"__rerun_remap_job_id__\\": null}",\n+ "tool_version": "0.1.0+galaxy0",\n+ "type": "tool",\n+ "uuid": "9516971c-8532-4797-8bf9-4655ff104dbd",\n+ "workflow_outputs": [\n+ {\n+ "label": 
"intensities_randomly-imputed_QN_LT",\n+ "output_name": "imputed_data_file",\n+ "uuid": "8ceda029-d5fd-4d75-a2b3-ac582bb137c3"\n+ },\n+ {\n+ "label": "intensities_randomly-imputed_report",\n+ "output_name": "report_file",\n+ "uuid": "84bedf25-c15b-4cc7-97e0-92f746e89f9c"\n+ }\n+ ]\n+ }\n+ },\n+ "tags": [\n+ "ppenrich"\n+ ],\n+ "uuid": "ac7bf2d1-89fe-4bf6-920a-d5508842d3f9",\n+ "version": 7\n+}\n\\ No newline at end of file\n'