Mercurial > repos > galaxyp > maxquant_phosphopeptide_intensity
comparison MaxQuantProcessingScript.R @ 0:b09ed1684301 draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/maxquant_phosphopeptide_intensity commit c41945617c468ca66813c95de6a5e6fe0edb0c15"
author | galaxyp |
---|---|
date | Thu, 04 Nov 2021 18:35:10 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b09ed1684301 |
---|---|
1 #!/usr/bin/env Rscript | |
2 # libraries | |
3 library(optparse) | |
4 library(data.table) | |
5 library(stringr) | |
6 #library(ggplot2) | |
7 #library(PTXQC) | |
8 #require(PTXQC) | |
9 #require(methods) | |
10 | |
11 # title: "MaxQuant Processing Script" | |
12 # author: "Larry Cheng" | |
13 # date: "February 19, 2018" | |
14 # | |
15 # # MaxQuant Processing Script | |
16 # Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order): | |
17 # 1) Runs the Proteomics Quality Control software | |
18 # 2) Remove contaminant and reverse sequence rows | |
19 # 3) Filters rows based on localization probability | |
20 # 4) Extract the quantitative data | |
21 # 5) Sequences phosphopeptides | |
22 # 6) Merges multiply phosphorylated peptides | |
23 # 7) Filters out phosphopeptides based on enrichment | |
24 # The output file contains the phosphopeptide (first column) and the quantitative values for each sample | |
25 # | |
26 # ## Revision History | |
27 # Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script" | |
28 # Rev. 2017-12-12 :added PTXQC | |
29 # added additional plots and table outputs for quality control | |
30 # allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <- | |
31 # "\\.(\\d+)[A-Z]$" | |
32 # converted from .r to .rmd file to knit report for quality control | |
33 # Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times | |
34 # Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows | |
35 # Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output | |
36 # Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string | |
37 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....) | |
38 # Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA | |
39 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75 | |
40 # Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards | |
41 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol | |
42 | |
43 ## Variables to change for each input file | |
44 ## ```{r} | |
45 ## #Location of MaxQuant txt folder | |
46 ## # C:\Users\art\src\ppenrich\postMQanalysis\step2.1 | |
47 ## dir_txtfolder <- "C:/Users/art/src/ppenrich/postMQanalysis/step2.1" | |
48 ## #Input Filename | |
49 ## inputFilename <- paste0(dir"/Phospho (STY)Sites.txt" | |
50 ## #Column number for the Intensity of the first sample | |
51 ## startCol <- 58 | |
52 ## #Number of runs | |
53 ## numSamples <- 6 | |
54 ## #Column number of "Number.of.Phospho..STY." | |
55 ## phosphoCol <- 37 | |
56 ## #pY or pST enriched samples (ie, "Y" or "ST") | |
57 ## enriched <- "ST" | |
58 ## #output filename | |
59 ## outputfilename = "outputfile_STEP2.txt" | |
60 ## ``` | |
61 ## | |
62 ## ## Other modifiable variables | |
63 ## ```{r} | |
64 ## #Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column) | |
65 ## intervalCol <- 1 | |
66 ## #Localization Probability Cutoff | |
67 ## localProbCutoff <- 0.75 | |
68 ## #merge identical phosphopeptides by ("sum" or "average") the intensities | |
69 ## collapse_FUN = "sum" | |
70 ## ``` | |
71 | |
72 # parse options | |
73 option_list <- list( | |
74 make_option( | |
75 c("-i", "--input"), | |
76 action = "store", | |
77 default = NA, | |
78 type = "character", | |
79 help = "A MaxQuant Phospho (STY)Sites.txt" | |
80 ), | |
81 make_option( | |
82 c("-o", "--output"), | |
83 action = "store", | |
84 default = "output.tsv", | |
85 type = "character", | |
86 help = "output file" | |
87 ), | |
88 make_option( | |
89 c("-e", "--enriched"), | |
90 action = "store", | |
91 default = "output.tsv", | |
92 type = "character", | |
93 help = "pY or pST enriched samples (ie, 'Y' or 'ST')" | |
94 ), | |
95 make_option( | |
96 c("-n", "--numSamples"), | |
97 action = "store", | |
98 default = 3, | |
99 type = "integer", | |
100 help = "Number of samples" | |
101 ), | |
102 make_option( | |
103 c("-p", "--phosphoCol"), | |
104 action = "store", | |
105 default = 13, | |
106 type = "integer", | |
107 help = "Column number of Number.of.Phospho..STY." | |
108 ), | |
109 make_option( | |
110 c("-s", "--startCol"), | |
111 action = "store", | |
112 default = 24, | |
113 type = "integer", | |
114 help = "Column number for the Intensity of the first sample" | |
115 ), | |
116 make_option( | |
117 c("-I", "--intervalCol"), | |
118 action = "store", | |
119 default = 1, | |
120 type = "integer", | |
121 help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column" | |
122 ), | |
123 make_option( | |
124 c("-l", "--localProbCutoff"), | |
125 action = "store", | |
126 default = 0.75, | |
127 type = "double", | |
128 help = "Localization Probability Cutoff" | |
129 ), | |
130 make_option( | |
131 c("-f", "--collapse_func"), | |
132 action = "store", | |
133 default = "sum", | |
134 type = "character", | |
135 help = "merge identical phosphopeptides by ('sum' or 'average') the intensities" | |
136 ) | |
137 ) | |
138 args <- parse_args(OptionParser(option_list=option_list)) | |
139 # Check parameter values | |
140 | |
141 if (! file.exists(args$input)) { | |
142 stop((paste("File", args$input, "does not exist"))) | |
143 } | |
144 inputFilename <- args$input | |
145 outputfilename <- args$output | |
146 localProbCutoff <- args$localProbCutoff | |
147 numSamples <- args$numSamples | |
148 phosphoCol <- args$phosphoCol | |
149 startCol <- args$startCol | |
150 intervalCol <- args$intervalCol | |
151 enriched <- "ST" | |
152 collapse_FUN = args$collapse_func | |
153 | |
154 ### FUNCTIONS | |
155 | |
156 #function to generate phosphopeptide and build list when applied | |
157 phosphopeptide_func <- function(df) { | |
158 | |
159 #generate peptide sequence and list of phosphopositions | |
160 phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]] | |
161 output <- vector() | |
162 phosphopeptide <- "" | |
163 counter <- 0 #keep track of position in peptide | |
164 phosphopositions <- vector() #keep track of phosphorylation positions in peptide | |
165 score_diff <- "" | |
166 for (chara in phosphoprobsequence){ | |
167 #build peptide sequence | |
168 if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) { | |
169 phosphopeptide <- paste(phosphopeptide,chara,sep="") | |
170 counter <- counter + 1 | |
171 } | |
172 #generate score_diff | |
173 if (chara == "-" | chara =="." | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9"){ | |
174 score_diff <- paste(score_diff,chara,sep="") | |
175 } | |
176 #evaluate score_diff | |
177 if (chara == ")" ){ | |
178 score_diff <- as.numeric(score_diff) | |
179 #only consider a phosphoresidue if score_diff > 0 | |
180 if (score_diff > 0) { | |
181 phosphopositions <- append(phosphopositions, counter) | |
182 } | |
183 score_diff <- "" | |
184 } | |
185 } | |
186 | |
187 #generate phosphopeptide sequence (ie, peptide sequence with "p"'s) | |
188 counter <- 1 | |
189 phosphoposition_correction1 <- -1 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string | |
190 phosphoposition_correction2 <- 0 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string | |
191 while (counter <= length(phosphopositions) ) { | |
192 phosphopeptide <- paste(substr(phosphopeptide,0,phosphopositions[counter]+phosphoposition_correction1),"p",substr(phosphopeptide,phosphopositions[counter]+phosphoposition_correction2,nchar(phosphopeptide)),sep="") | |
193 counter <- counter + 1 | |
194 phosphoposition_correction1 <- phosphoposition_correction1 + 1 | |
195 phosphoposition_correction2 <- phosphoposition_correction2 + 1 | |
196 } | |
197 | |
198 #building phosphopeptide list | |
199 output <- append(output,phosphopeptide) | |
200 return(output) | |
201 } | |
202 ## | |
203 ## ## Run Proteomics Quality Control for MaxQuant Results | |
204 ## (Bielow C et al. J Proteome Res. 2016 PMID: 26653327) | |
205 ## | |
206 ## ```{r echo=FALSE} | |
207 ## library(PTXQC) | |
208 ## require(PTXQC) | |
209 ## require(methods) | |
210 ## txt_folder <- dir_txtfolder | |
211 ## r = createReport(txt_folder) | |
212 ## cat(paste0("\nReport has been generated inside the MaxQuant txt folder '", r$report_file, "'\n\n")) | |
213 ## ``` | |
214 ## | |
215 ## | |
216 ## Perform filters on contaminants, reverse sequences, and localization probability | |
217 | |
218 fullData <- read.table(file = inputFilename, sep ="\t", header=T, quote="") | |
219 | |
220 #Filter out contaminant rows and reverse rows | |
221 filteredData <- subset(fullData,!grepl("CON__", Proteins)) | |
222 filteredData <- subset(filteredData,!grepl("_MYCOPLASMA", Proteins)) | |
223 filteredData <- subset(filteredData,!grepl("CONTAMINANT_", Proteins)) | |
224 filteredData <- subset(filteredData,!grepl("REV__", Protein)) #since REV__ rows are blank in the first column (Proteins) | |
225 write.table(filteredData, file = "filteredData.txt", sep = "\t", col.names=TRUE, row.names=FALSE) | |
226 | |
227 #Data filtered by localization probability | |
228 locProbFilteredData <- filteredData[filteredData$Localization.prob>=localProbCutoff,] | |
229 | |
230 ## Localization probability | |
231 ## #visualize locprob cutoff | |
232 locProbGraphData <- data.frame( | |
233 group = c(paste(">",toString(localProbCutoff),sep=""), paste("<",toString(localProbCutoff),sep="")), | |
234 value = c(nrow(locProbFilteredData)/nrow(filteredData)*100, (nrow(filteredData)-nrow(locProbFilteredData))/nrow(filteredData)*100) | |
235 ) | |
236 ## library(ggplot2) | |
237 ## locProbGraph <- ggplot(locProbGraphData, aes(x="", y=value, fill=group)) + | |
238 ## geom_bar(width = 0.5, stat = "identity", color="black") | |
239 ## | |
240 ## locProbGraph | |
241 | |
242 ## Extract out quantitative values | |
243 #Extract out quantitative values | |
244 quantData <- locProbFilteredData[,seq(from=startCol, by=intervalCol, length.out=numSamples)] | |
245 write.table(quantData, file = "quantdata.txt", sep = "\t", col.names=TRUE, row.names=FALSE) | |
246 #head(quantData) | |
247 ## ``` | |
248 ## | |
249 ## | |
250 ## ## Generate Phosphopeptide Sequence | |
251 ## First few rows of the phosphopeptide column: | |
252 #for latest version of MaxQuant (Version 1.5.3.30) | |
253 dataTable <- data.frame(locProbFilteredData[,1:8],locProbFilteredData[,phosphoCol],locProbFilteredData[,phosphoCol+1],locProbFilteredData[,phosphoCol+2],locProbFilteredData[,phosphoCol+3],locProbFilteredData[,phosphoCol+4],locProbFilteredData[,phosphoCol+5],locProbFilteredData[,phosphoCol+6],locProbFilteredData[,phosphoCol+7],quantData) | |
254 colnames(dataTable) <- c("Proteins","Positions within proteins", "Leading proteins", "Protein", "Protein names", "Gene names", "Fasta headers", "Localization prob", "Number of Phospho (STY)", "Amino Acid", "Sequence window","Modification window", "Peptide window coverage", "Phospho (STY) Probabilities", "Phospho (STY) Score diffs", "Position in peptide", colnames(quantData)) | |
255 ## | |
256 #Generate column in dataTable for the predicted phosphopeptide | |
257 dataTable$Phosphopeptide <- apply(dataTable,1,phosphopeptide_func) | |
258 ## | |
259 #Move Quant columns to the end | |
260 movetolast <- function(data, move) { #moves columns to the end of dataframe | |
261 data[c(setdiff(names(data), move), move)] | |
262 } | |
263 dataTable <- movetolast(dataTable,c(colnames(quantData))) | |
264 ## | |
265 #Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df) | |
266 dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map | |
267 merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames | |
268 colnames(merge_df) <- c("rn", "Phosphopeptide") | |
269 ## | |
270 ## | |
271 head(merge_df) | |
272 ## | |
273 ## | |
274 ## First few rows of the MaxQuant phosphoresidue score differences: Sequence representation for each of the possible PTM positions in each possible configuration, the difference is calculated between the identification score with the PTM added to that position and the best scoring identification where no PTM is added to that position. When this value is negative, it is unlikely that the particular modification is located at this position. | |
275 ## ```{r echo=FALSE} | |
276 ## | |
277 ## head(locProbFilteredData$Phospho..STY..Score.diffs) | |
278 ## ``` | |
279 ## | |
280 ## First few rows of data with Phosphopeptide Sequence. | |
281 ## ```{r echo=FALSE } | |
282 ## # Add Phosphopeptide column to quant columns for quality control checking | |
283 quantData_qc <- as.data.frame(quantData) | |
284 setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data | |
285 quantData_qc$rn <- as.integer(quantData_qc$rn) | |
286 quantData_qc <- merge(merge_df,quantData_qc, by="rn") | |
287 quantData_qc$rn <- NULL #remove rn column | |
288 head(quantData_qc) | |
289 ## | |
290 ## ``` | |
291 ## | |
292 ## | |
293 ## ## Collapse multiphosphorylated peptides | |
294 ## | |
295 ## Before collapse: | |
296 ## ```{r echo=FALSE} | |
297 ## summary(quantData_qc) | |
298 ## ``` | |
299 ## | |
300 ## | |
301 ## After collapse: | |
302 ## ```{r echo=FALSE} | |
303 #Collapse multiphosphorylated peptides | |
304 quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide") | |
305 quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN) | |
306 summary(quantData_qc_collapsed) | |
307 ## | |
308 ## ``` | |
309 ## | |
310 ## % of phosphopeptides that are multiphosphorylated is roughly ~ | |
311 ## ```{r echo=FALSE} | |
312 (nrow(quantData_qc) - nrow(quantData_qc_collapsed))/nrow(quantData_qc)/2 | |
313 ## ``` | |
314 ## | |
315 ## Breakdown of pY, pS, and pT before enrichment filter | |
316 ## ```{r echo=FALSE} | |
317 ## library(stringr) | |
318 pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),] | |
319 pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),] | |
320 pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),] | |
321 ## | |
322 pY_num <- nrow(pY_data) | |
323 pS_num <- nrow(pS_data) | |
324 pT_num <- nrow(pT_data) | |
325 ## | |
326 ## #visualize enrichment | |
327 enrichGraphData <- data.frame( | |
328 group = c("pY", "pS", "pT"), | |
329 value = c(pY_num, pS_num, pT_num) | |
330 ) | |
331 ## | |
332 ## library(ggplot2) | |
333 ## enrichGraph <- ggplot(enrichGraphData, aes(x="", y=value, fill=group)) + | |
334 ## geom_bar(width = 0.5, stat = "identity", color="black") + coord_polar("y", start=0) | |
335 ## | |
336 ## enrichGraph | |
337 ## ``` | |
338 ## | |
339 ## ## Filter phosphopeptides by enrichment | |
340 ## ```{r echo=FALSE} | |
341 ## #Data filtered by enrichment | |
342 ## library(stringr) | |
343 if (enriched == "Y"){ | |
344 quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),] | |
345 } else if ( enriched == "ST" ) { | |
346 quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),] | |
347 } else { print("Error in enriched variable. Set to either 'Y' or 'ST'")} | |
348 ## ``` | |
349 ## | |
350 ## | |
351 ## ```{r echo=FALSE} | |
352 ## #output files | |
353 ## #write.table(quantData_qc, file="BeforeCollapse.txt", sep="\t", row.names = FALSE) #for quality control | |
354 ## #write.table(quantData_qc_collapsed, file="AfterCollapse.txt", sep="\t", row.names = FALSE) #for quality control | |
355 write.table(quantData_qc_enrichment, file=outputfilename, sep="\t", quote = FALSE, row.names = FALSE) | |
356 ## ``` |