comparison MaxQuantProcessingScript.R @ 0:b09ed1684301 draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/maxquant_phosphopeptide_intensity commit c41945617c468ca66813c95de6a5e6fe0edb0c15"
author galaxyp
date Thu, 04 Nov 2021 18:35:10 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b09ed1684301
1 #!/usr/bin/env Rscript
2 # libraries
3 library(optparse)
4 library(data.table)
5 library(stringr)
6 #library(ggplot2)
7 #library(PTXQC)
8 #require(PTXQC)
9 #require(methods)
10
11 # title: "MaxQuant Processing Script"
12 # author: "Larry Cheng"
13 # date: "February 19, 2018"
14 #
15 # # MaxQuant Processing Script
16 # Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order):
17 # 1) Runs the Proteomics Quality Control software
18 # 2) Remove contaminant and reverse sequence rows
19 # 3) Filters rows based on localization probability
20 # 4) Extract the quantitative data
21 # 5) Sequences phosphopeptides
22 # 6) Merges multiply phosphorylated peptides
23 # 7) Filters out phosphopeptides based on enrichment
24 # The output file contains the phosphopeptide (first column) and the quantitative values for each sample
25 #
26 # ## Revision History
27 # Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script"
28 # Rev. 2017-12-12 :added PTXQC
29 # added additional plots and table outputs for quality control
30 # allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <-
31 # "\\.(\\d+)[A-Z]$"
32 # converted from .r to .rmd file to knit report for quality control
33 # Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times
34 # Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows
35 # Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output
36 # Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string
37 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
38 # Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA
39 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
40 # Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards
41 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol
42
43 ## Variables to change for each input file
44 ## ```{r}
45 ## #Location of MaxQuant txt folder
46 ## # C:\Users\art\src\ppenrich\postMQanalysis\step2.1
47 ## dir_txtfolder <- "C:/Users/art/src/ppenrich/postMQanalysis/step2.1"
48 ## #Input Filename
49 ## inputFilename <- paste0(dir"/Phospho (STY)Sites.txt"
50 ## #Column number for the Intensity of the first sample
51 ## startCol <- 58
52 ## #Number of runs
53 ## numSamples <- 6
54 ## #Column number of "Number.of.Phospho..STY."
55 ## phosphoCol <- 37
56 ## #pY or pST enriched samples (ie, "Y" or "ST")
57 ## enriched <- "ST"
58 ## #output filename
59 ## outputfilename = "outputfile_STEP2.txt"
60 ## ```
61 ##
62 ## ## Other modifiable variables
63 ## ```{r}
64 ## #Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column)
65 ## intervalCol <- 1
66 ## #Localization Probability Cutoff
67 ## localProbCutoff <- 0.75
68 ## #merge identical phosphopeptides by ("sum" or "average") the intensities
69 ## collapse_FUN = "sum"
70 ## ```
71
72 # parse options
73 option_list <- list(
74 make_option(
75 c("-i", "--input"),
76 action = "store",
77 default = NA,
78 type = "character",
79 help = "A MaxQuant Phospho (STY)Sites.txt"
80 ),
81 make_option(
82 c("-o", "--output"),
83 action = "store",
84 default = "output.tsv",
85 type = "character",
86 help = "output file"
87 ),
88 make_option(
89 c("-e", "--enriched"),
90 action = "store",
91 default = "output.tsv",
92 type = "character",
93 help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
94 ),
95 make_option(
96 c("-n", "--numSamples"),
97 action = "store",
98 default = 3,
99 type = "integer",
100 help = "Number of samples"
101 ),
102 make_option(
103 c("-p", "--phosphoCol"),
104 action = "store",
105 default = 13,
106 type = "integer",
107 help = "Column number of Number.of.Phospho..STY."
108 ),
109 make_option(
110 c("-s", "--startCol"),
111 action = "store",
112 default = 24,
113 type = "integer",
114 help = "Column number for the Intensity of the first sample"
115 ),
116 make_option(
117 c("-I", "--intervalCol"),
118 action = "store",
119 default = 1,
120 type = "integer",
121 help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column"
122 ),
123 make_option(
124 c("-l", "--localProbCutoff"),
125 action = "store",
126 default = 0.75,
127 type = "double",
128 help = "Localization Probability Cutoff"
129 ),
130 make_option(
131 c("-f", "--collapse_func"),
132 action = "store",
133 default = "sum",
134 type = "character",
135 help = "merge identical phosphopeptides by ('sum' or 'average') the intensities"
136 )
137 )
138 args <- parse_args(OptionParser(option_list=option_list))
139 # Check parameter values
140
141 if (! file.exists(args$input)) {
142 stop((paste("File", args$input, "does not exist")))
143 }
144 inputFilename <- args$input
145 outputfilename <- args$output
146 localProbCutoff <- args$localProbCutoff
147 numSamples <- args$numSamples
148 phosphoCol <- args$phosphoCol
149 startCol <- args$startCol
150 intervalCol <- args$intervalCol
151 enriched <- "ST"
152 collapse_FUN = args$collapse_func
153
154 ### FUNCTIONS
155
156 #function to generate phosphopeptide and build list when applied
157 phosphopeptide_func <- function(df) {
158
159 #generate peptide sequence and list of phosphopositions
160 phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
161 output <- vector()
162 phosphopeptide <- ""
163 counter <- 0 #keep track of position in peptide
164 phosphopositions <- vector() #keep track of phosphorylation positions in peptide
165 score_diff <- ""
166 for (chara in phosphoprobsequence){
167 #build peptide sequence
168 if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) {
169 phosphopeptide <- paste(phosphopeptide,chara,sep="")
170 counter <- counter + 1
171 }
172 #generate score_diff
173 if (chara == "-" | chara =="." | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9"){
174 score_diff <- paste(score_diff,chara,sep="")
175 }
176 #evaluate score_diff
177 if (chara == ")" ){
178 score_diff <- as.numeric(score_diff)
179 #only consider a phosphoresidue if score_diff > 0
180 if (score_diff > 0) {
181 phosphopositions <- append(phosphopositions, counter)
182 }
183 score_diff <- ""
184 }
185 }
186
187 #generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
188 counter <- 1
189 phosphoposition_correction1 <- -1 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
190 phosphoposition_correction2 <- 0 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
191 while (counter <= length(phosphopositions) ) {
192 phosphopeptide <- paste(substr(phosphopeptide,0,phosphopositions[counter]+phosphoposition_correction1),"p",substr(phosphopeptide,phosphopositions[counter]+phosphoposition_correction2,nchar(phosphopeptide)),sep="")
193 counter <- counter + 1
194 phosphoposition_correction1 <- phosphoposition_correction1 + 1
195 phosphoposition_correction2 <- phosphoposition_correction2 + 1
196 }
197
198 #building phosphopeptide list
199 output <- append(output,phosphopeptide)
200 return(output)
201 }
202 ##
203 ## ## Run Proteomics Quality Control for MaxQuant Results
204 ## (Bielow C et al. J Proteome Res. 2016 PMID: 26653327)
205 ##
206 ## ```{r echo=FALSE}
207 ## library(PTXQC)
208 ## require(PTXQC)
209 ## require(methods)
210 ## txt_folder <- dir_txtfolder
211 ## r = createReport(txt_folder)
212 ## cat(paste0("\nReport has been generated inside the MaxQuant txt folder '", r$report_file, "'\n\n"))
213 ## ```
214 ##
215 ##
216 ## Perform filters on contaminants, reverse sequences, and localization probability
217
218 fullData <- read.table(file = inputFilename, sep ="\t", header=T, quote="")
219
220 #Filter out contaminant rows and reverse rows
221 filteredData <- subset(fullData,!grepl("CON__", Proteins))
222 filteredData <- subset(filteredData,!grepl("_MYCOPLASMA", Proteins))
223 filteredData <- subset(filteredData,!grepl("CONTAMINANT_", Proteins))
224 filteredData <- subset(filteredData,!grepl("REV__", Protein)) #since REV__ rows are blank in the first column (Proteins)
225 write.table(filteredData, file = "filteredData.txt", sep = "\t", col.names=TRUE, row.names=FALSE)
226
227 #Data filtered by localization probability
228 locProbFilteredData <- filteredData[filteredData$Localization.prob>=localProbCutoff,]
229
230 ## Localization probability
231 ## #visualize locprob cutoff
232 locProbGraphData <- data.frame(
233 group = c(paste(">",toString(localProbCutoff),sep=""), paste("<",toString(localProbCutoff),sep="")),
234 value = c(nrow(locProbFilteredData)/nrow(filteredData)*100, (nrow(filteredData)-nrow(locProbFilteredData))/nrow(filteredData)*100)
235 )
236 ## library(ggplot2)
237 ## locProbGraph <- ggplot(locProbGraphData, aes(x="", y=value, fill=group)) +
238 ## geom_bar(width = 0.5, stat = "identity", color="black")
239 ##
240 ## locProbGraph
241
242 ## Extract out quantitative values
243 #Extract out quantitative values
244 quantData <- locProbFilteredData[,seq(from=startCol, by=intervalCol, length.out=numSamples)]
245 write.table(quantData, file = "quantdata.txt", sep = "\t", col.names=TRUE, row.names=FALSE)
246 #head(quantData)
247 ## ```
248 ##
249 ##
250 ## ## Generate Phosphopeptide Sequence
251 ## First few rows of the phosphopeptide column:
252 #for latest version of MaxQuant (Version 1.5.3.30)
253 dataTable <- data.frame(locProbFilteredData[,1:8],locProbFilteredData[,phosphoCol],locProbFilteredData[,phosphoCol+1],locProbFilteredData[,phosphoCol+2],locProbFilteredData[,phosphoCol+3],locProbFilteredData[,phosphoCol+4],locProbFilteredData[,phosphoCol+5],locProbFilteredData[,phosphoCol+6],locProbFilteredData[,phosphoCol+7],quantData)
254 colnames(dataTable) <- c("Proteins","Positions within proteins", "Leading proteins", "Protein", "Protein names", "Gene names", "Fasta headers", "Localization prob", "Number of Phospho (STY)", "Amino Acid", "Sequence window","Modification window", "Peptide window coverage", "Phospho (STY) Probabilities", "Phospho (STY) Score diffs", "Position in peptide", colnames(quantData))
255 ##
256 #Generate column in dataTable for the predicted phosphopeptide
257 dataTable$Phosphopeptide <- apply(dataTable,1,phosphopeptide_func)
258 ##
259 #Move Quant columns to the end
260 movetolast <- function(data, move) { #moves columns to the end of dataframe
261 data[c(setdiff(names(data), move), move)]
262 }
263 dataTable <- movetolast(dataTable,c(colnames(quantData)))
264 ##
265 #Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df)
266 dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map
267 merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames
268 colnames(merge_df) <- c("rn", "Phosphopeptide")
269 ##
270 ##
271 head(merge_df)
272 ##
273 ##
274 ## First few rows of the MaxQuant phosphoresidue score differences: Sequence representation for each of the possible PTM positions in each possible configuration, the difference is calculated between the identification score with the PTM added to that position and the best scoring identification where no PTM is added to that position. When this value is negative, it is unlikely that the particular modification is located at this position.
275 ## ```{r echo=FALSE}
276 ##
277 ## head(locProbFilteredData$Phospho..STY..Score.diffs)
278 ## ```
279 ##
280 ## First few rows of data with Phosphopeptide Sequence.
281 ## ```{r echo=FALSE }
282 ## # Add Phosphopeptide column to quant columns for quality control checking
283 quantData_qc <- as.data.frame(quantData)
284 setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data
285 quantData_qc$rn <- as.integer(quantData_qc$rn)
286 quantData_qc <- merge(merge_df,quantData_qc, by="rn")
287 quantData_qc$rn <- NULL #remove rn column
288 head(quantData_qc)
289 ##
290 ## ```
291 ##
292 ##
293 ## ## Collapse multiphosphorylated peptides
294 ##
295 ## Before collapse:
296 ## ```{r echo=FALSE}
297 ## summary(quantData_qc)
298 ## ```
299 ##
300 ##
301 ## After collapse:
302 ## ```{r echo=FALSE}
303 #Collapse multiphosphorylated peptides
304 quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide")
305 quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN)
306 summary(quantData_qc_collapsed)
307 ##
308 ## ```
309 ##
310 ## % of phosphopeptides that are multiphosphorylated is roughly ~
311 ## ```{r echo=FALSE}
312 (nrow(quantData_qc) - nrow(quantData_qc_collapsed))/nrow(quantData_qc)/2
313 ## ```
314 ##
315 ## Breakdown of pY, pS, and pT before enrichment filter
316 ## ```{r echo=FALSE}
317 ## library(stringr)
318 pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
319 pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),]
320 pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
321 ##
322 pY_num <- nrow(pY_data)
323 pS_num <- nrow(pS_data)
324 pT_num <- nrow(pT_data)
325 ##
326 ## #visualize enrichment
327 enrichGraphData <- data.frame(
328 group = c("pY", "pS", "pT"),
329 value = c(pY_num, pS_num, pT_num)
330 )
331 ##
332 ## library(ggplot2)
333 ## enrichGraph <- ggplot(enrichGraphData, aes(x="", y=value, fill=group)) +
334 ## geom_bar(width = 0.5, stat = "identity", color="black") + coord_polar("y", start=0)
335 ##
336 ## enrichGraph
337 ## ```
338 ##
339 ## ## Filter phosphopeptides by enrichment
340 ## ```{r echo=FALSE}
341 ## #Data filtered by enrichment
342 ## library(stringr)
343 if (enriched == "Y"){
344 quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
345 } else if ( enriched == "ST" ) {
346 quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
347 } else { print("Error in enriched variable. Set to either 'Y' or 'ST'")}
348 ## ```
349 ##
350 ##
351 ## ```{r echo=FALSE}
352 ## #output files
353 ## #write.table(quantData_qc, file="BeforeCollapse.txt", sep="\t", row.names = FALSE) #for quality control
354 ## #write.table(quantData_qc_collapsed, file="AfterCollapse.txt", sep="\t", row.names = FALSE) #for quality control
355 write.table(quantData_qc_enrichment, file=outputfilename, sep="\t", quote = FALSE, row.names = FALSE)
356 ## ```