Mercurial > repos > galaxyp > psm_to_sam
diff PSM2SAM.R @ 7:cd69250e1150 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/psm2sam commit 9910ed076e4b8a3f083351b89fa861d0a4a93beb
| author | galaxyp |
|---|---|
| date | Wed, 17 May 2017 20:22:39 -0400 |
| parents | ce09f1a1bbad |
| children | d9eba92cce55 |
line wrap: on
line diff
--- a/PSM2SAM.R Wed Feb 03 12:38:19 2016 -0500 +++ b/PSM2SAM.R Wed May 17 20:22:39 2017 -0400 @@ -1,398 +1,149 @@ -#!/usr/bin/env Rscript - -initial.options <- commandArgs(trailingOnly = FALSE) -script_parent_dir <- dirname(sub("--file=", "", initial.options[grep("--file=", initial.options)])) - -## begin warning handler -withCallingHandlers({ - -library(methods) # Because Rscript does not always do this - -options('useFancyQuotes' = FALSE) - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("RGalaxy")) - - -option_list <- list() - -option_list$passedPSM <- make_option('--passedPSM', type='character') -option_list$XScolumn <- make_option('--XScolumn', type='character') -option_list$exon_anno <- make_option('--exon_anno', type='character') -option_list$proteinseq <- make_option('--proteinseq', type='character') -option_list$procodingseq <- make_option('--procodingseq', type='character') -option_list$header <- make_option('--header', type='character') -option_list$OutputFile <- make_option('--OutputFile', type='character') - - -opt <- parse_args(OptionParser(option_list=option_list)) - - - - -PSMtab2SAM <- function( - passedPSM_file = GalaxyInputFile(required=TRUE), - exon_anno_file = GalaxyInputFile(required=TRUE), - proteinseq_file = GalaxyInputFile(required=TRUE), - procodingseq_file = GalaxyInputFile(required=TRUE), - header_file = GalaxyInputFile(required=TRUE), - XScolumn = GalaxyCharacterParam(required=TRUE), - OutputFile = GalaxyOutput("proSAM","sam")) -{ - if (!file.exists(header_file)) { gstop("failed to read header file") } - if (file.exists(OutputFile)) - { - if (file.info(OutputFile)$size > 0) { gstop("output file already exists") } - else - { - tryCatch( - { - file.remove(OutputFile) - }, error=function(err) - { - gstop("failed to remove empty existing file") - }) - } - } - - suppressPackageStartupMessages(library(GenomicRanges)) - suppressPackageStartupMessages(library(Biostrings)) - - options(stringsAsFactors=FALSE) - - scoreName = XScolumn - columnName = gsub("[^A-Z0-9]", "_", scoreName) - - passedPSM <- tryCatch({ - #read.delim(passedPSM_file, row.names=1) - suppressPackageStartupMessages(library(RSQLite)) - drv <- dbDriver("SQLite") - con <- dbConnect(drv, passedPSM_file) - - # do case-insensitive search for the score name - res <- dbSendQuery(con, paste("SELECT Id, Name FROM PeptideSpectrumMatchScoreName WHERE lower(Name)=lower('", scoreName, "')", sep="")) - scoreInfo = fetch(res, n=1) - scoreId = scoreInfo["Id"] - realScoreName = scoreInfo["Name"] # original case - dbClearResult(res) - - sql <- paste("SELECT ss.Name as SourceName, s.NativeID", - ", psm.ObservedNeutralMass AS precursor_neutral_mass", - ", psm.Charge AS assumed_charge", - ", s.ScanTimeInSeconds AS retention_time_sec", - ", psm.Rank AS hit_rank", - ", IFNULL(SUBSTR(pd.Sequence, pi.Offset+1, pi.Length), pep.DecoySequence) AS peptide", - ", pro.Accession AS protein", - ", COUNT(DISTINCT pro.Id) AS num_tot_proteins", - ", psm.ObservedNeutralMass - (psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026) AS calc_neutral_pep_mass", - ", psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026 AS massdiff", - ", CTerminusIsSpecific+NTerminusIsSpecific AS num_tol_term", - ", MissedCleavages AS num_missed_cleavages", - ", psm.QValue AS qvalue", - ", psmScore.Value AS ", columnName, - ", GROUP_CONCAT(DISTINCT pm.Offset || ';' || mod.MonoMassDelta) AS modification", - "FROM PeptideSpectrumMatch psm", - "JOIN Spectrum s ON psm.Spectrum=s.Id", - "JOIN SpectrumSource ss ON s.Source=ss.Id", - "JOIN PeptideInstance pi ON psm.Peptide=pi.Peptide", - "JOIN Protein pro ON pi.Protein=pro.Id", - "JOIN Peptide pep ON pi.Peptide=pep.Id", - "JOIN PeptideSpectrumMatchScore psmScore ON psmScore.PsmId=psm.Id AND ScoreNameId=", scoreId, - "LEFT JOIN ProteinData pd ON pro.Id=pd.Id", - "LEFT JOIN PeptideModification pm ON psm.Id=pm.PeptideSpectrumMatch", - "LEFT JOIN Modification mod ON pm.Modification=mod.Id", - "GROUP BY psm.Id" - ) - - res <- dbSendQuery(con, sql) - passedPSM <- fetch(res, n=-1) - dbClearResult(res) - dbDisconnect(con) - passedPSM - }, error=function(err) { - gstop("failed to read passedPSM: ", err) - }) - - tryCatch({ - load(exon_anno_file) - exon_anno <- exon - }, error=function(err) { - gstop("failed to read exon_anno: ", conditionMessage(err)) - }) - - tryCatch({ - load(proteinseq_file) - }, error=function(err) { - gstop("failed to read proteinseq: ", conditionMessage(err)) - }) - - tryCatch({ - load(procodingseq_file) - }, error=function(err) { - gstop("failed to read procodingseq: ", conditionMessage(err)) - }) - - PEP <- passedPSM[, 'peptide'] - #Spectrumid <- paste(passedPSM[, 'SourceName'], gsub(" ", "_", passedPSM[, 'NativeID']), sep="/") - Spectrumid <- paste(passedPSM[, 'SourceName'], passedPSM[, 'NativeID'], sep="/") - #PEP_SEQ <- formatPep(spectra[, 'Sequence']) - - SAM <- c() - - spectrumcount <- table(Spectrumid) - - for(i in 1:dim(passedPSM)[1]){ - #print(i) - peptide <- PEP[i] - QNAME <- Spectrumid[i] - idx <- grep(peptide, proteinseq[, 'peptide'], fixed=TRUE) - if(length(idx) == 0){ - RNAME <- '*' - MAPQ <- 255 - RNEXT <- '*' - PNEXT <- 0 - TLEN <- 0 - QUAL <- '*' - POS <- 0 - SEQ <- '*' - CIGAR <- '*' - FLAG <- 0x4 - annoted <- '?' - XA <- paste('XA:Z:', annoted, sep='') - res <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, - as.character(SEQ), QUAL, XA) - res <- unique(data.frame(t(res))) - }else{ - pro <- proteinseq[idx, ] - sta_pos <- unlist(lapply(pro[, 'peptide'], function(x) - regexpr(peptide, x, fixed=TRUE))) - pep_len <- nchar(peptide) - end_pos <- sta_pos + pep_len - 1 - - coding <- procodingseq[match(pro[, 'pro_name'], - procodingseq[, 'pro_name']), ] - code_s <- (sta_pos-1) * 3 + 1 - code_e <- end_pos * 3 - codingseq <- substring(coding[, 'coding'], code_s, code_e) - - - - exonp <- lapply(pro[, 'tx_name'], function(x) - exon_anno[exon_anno[, 'tx_name']==x, ]) - - exonp <- lapply(exonp, function(x){ - if(length(unique(x[, 'tx_id'])) > 1){ - x[grep(x[1, 'tx_id'], x[, 'tx_id'], - fixed=TRUE), ] - }else x - }) - - if(passedPSM[i, 'hit_rank'] == 1) pri <- TRUE else pri <- FALSE - - res <- mapply(function(x, y, z, m) - if(dim(z)[1] == 0){ - .proteinUnannotated(x, y, z, m, primary=pri) - }else{ - if((nchar(m) != 3*pep_len) | (y > max(z[, 'cds_end'], - na.rm = TRUE))){ - #if(toString(translate(DNAString(m))) != peptide){ - .peptideUnannotated(x, y, z, m, primary=pri) - }else{ - .MapCoding2genome(x, y, z, m, primary=pri) - } - }, - code_s, code_e, exonp, codingseq) - - res <- unique(data.frame(t(res))) - - } - XL <- paste('XL:i:', as.numeric(spectrumcount[QNAME]), sep='') - NH <- paste('NH:i:', dim(res)[1], sep='') - XP <- paste('XP:Z:', peptide, sep='') - #XF <- paste('XF:f:', round(passedPSM[i, XFcolumn], digits=4), sep='') - XC <- paste('XC:i:', passedPSM[i, 'assumed_charge'], sep='') - XS <- paste('XS:f:', round(as.numeric(passedPSM[i, columnName]), - digits=4), sep='') - #XA <- paste('XA:Z:', annoted, sep='') - XN <- paste('XN:i:', passedPSM[i, 'num_missed_cleavages'], sep='') - XT <- paste('XT:i:', passedPSM[i, 'num_tol_term'], sep='') - - XM <- ifelse(is.na(passedPSM[i, 'modification']), paste('XM:Z:-'), - paste('XM:Z:', passedPSM[i, 'modification'], sep='')) - - res <- cbind(QNAME, res, NH, XL, XP, XC, XS, XM, XN, XT) - SAM <- rbind(SAM, res) - } - - file.copy(header_file, OutputFile) - write.table(SAM, file=OutputFile, sep='\t', quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE) -} - - - -.proteinUnannotated <-function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...) -{ - RNAME <- '*' - MAPQ <- 255 - RNEXT <- '*' - PNEXT <- 0 - TLEN <- 0 - QUAL <- '*' - POS <- 0 - SEQ <- '*' - CIGAR <- '*' - annoted <- 2 - XA <- paste('XA:Z:', annoted, sep='') - FLAG <- 0x4 - - tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, - as.character(SEQ), QUAL, XA) - tmp -} - - - -.peptideUnannotated <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...) -{ - #RNAME <- as.character(exon_anno[1, 'chromosome_name']) - RNAME <- '*' - MAPQ <- 255 - RNEXT <- '*' - PNEXT <- 0 - TLEN <- 0 - QUAL <- '*' - POS <- 0 - SEQ <- '*' - CIGAR <- '*' - annoted <- 1 - XA <- paste('XA:Z:', annoted, sep='') - FLAG <- 0x4 - #if(exon_anno[1, 'strand'] == '+'){ - # FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) - #}else{ - # FLAG <- ifelse(primary==TRUE, 0x10, 0x10+0x100) - #} - - tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, - as.character(SEQ), QUAL, XA) - tmp -} - - - -.MapCoding2genome <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...) -{ - idxs <- intersect(which(exon_anno[, 'cds_start'] <= c_sta), - which(exon_anno[, 'cds_end'] >= c_sta)) - idxe <- intersect(which(exon_anno[, 'cds_start'] <= c_end), - which(exon_anno[, 'cds_end'] >= c_end)) - len <- c_end - c_sta + 1 - RNAME <- as.character(exon_anno[1, 'chromosome_name']) - MAPQ <- 255 - RNEXT <- '*' - PNEXT <- 0 - TLEN <- 0 - QUAL <- '*' - annoted <- 0 - XA <- paste('XA:Z:', annoted, sep='') - - if(exon_anno[1, 'strand'] == '+'){ - POS <- exon_anno[idxs, 'cds_chr_start'] + c_sta - - exon_anno[idxs, 'cds_start'] - SEQ <- DNAString(cseq) - FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) - - }else{ - POS <- exon_anno[idxe, 'cds_chr_start'] + - exon_anno[idxe, 'cds_end'] - c_end - SEQ <- reverseComplement(DNAString(cseq)) - FLAG <- ifelse(primary==TRUE, 0x10, 0x10 + 0x100) - } - - if(idxe == idxs){ - CIGAR <- paste(len, 'M', sep='') - }else{ - if(exon_anno[1, 'strand'] == '+'){ - #insert <- exon_anno[idxe, 'cds_chr_start'] - exon_anno[idxs, - # 'cds_chr_end']- 1 - part1 <- exon_anno[idxs, 'cds_end'] - c_sta + 1 - part2 <- c_end - exon_anno[idxe, 'cds_start'] + 1 - - insert <- unlist(lapply(1:(idxe - idxs), function(x) - paste(exon_anno[idxs + x, 'cds_chr_start'] - - exon_anno[idxs+x-1, 'cds_chr_end']- 1, 'N', sep=''))) - if(idxe-idxs >1){ - innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) - paste(exon_anno[idxs + x, 'cds_chr_end'] - - exon_anno[idxs + x, 'cds_chr_start'] + 1, 'M', - sep=''))) - }else{ innerexon <- ''} - - #ifelse(idxe-idxs >1, unlist(lapply(1:(idxe-idxs-1), function(x) - #paste(exon_anno[idxs+x, 'cds_chr_end'] - - # exon_anno[idxs+x, 'cds_chr_start']+1, - # 'M', sep=''))), '') - midpattern <- rep(NA, length(insert) + length(innerexon)) - midpattern[seq(1, length(insert) + length(innerexon), - by=2)] <- insert - midpattern[seq(2, length(insert) + length(innerexon), - by=2)] <- innerexon - midpattern <- paste(midpattern, collapse='') - - }else{ - #insert <- exon_anno[idxs, 'cds_chr_start'] - - # exon_anno[idxe, 'cds_chr_end']- 1 - part1 <- c_end- exon_anno[idxe, 'cds_start'] + 1 - part2 <- exon_anno[idxs, 'cds_end'] - c_sta + 1 - - insert <- unlist(lapply(1:(idxe-idxs), function(x) - paste(exon_anno[idxe - x, 'cds_chr_start'] - - exon_anno[idxe-x + 1, 'cds_chr_end']- 1, - 'N', sep=''))) - if(idxe-idxs >1){ - innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) - paste(exon_anno[idxe-x, 'cds_chr_end'] - - exon_anno[idxe-x, 'cds_chr_start']+1, 'M', sep=''))) - }else{ innerexon <-''} - - midpattern <- rep(NA, length(insert)+length(innerexon)) - midpattern[seq(1, length(insert) + length(innerexon), - by=2)] <- insert - midpattern[seq(2, length(insert) + length(innerexon), - by=2)] <- innerexon - midpattern <- paste(midpattern, collapse='') - - } - - CIGAR <- paste(part1, 'M', midpattern, part2, 'M', sep='') - } - - tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, - as.character(SEQ), QUAL, XA) - tmp -} - - -params <- list() -for(param in names(opt)) -{ - if (!param == "help") - params[param] <- opt[param] -} - -setClass("GalaxyRemoteError", contains="character") -wrappedFunction <- function(f) -{ - tryCatch(do.call(f, params), - error=function(e) new("GalaxyRemoteError", conditionMessage(e))) -} - - -suppressPackageStartupMessages(library(RGalaxy)) -do.call(PSMtab2SAM, params) - -## end warning handler -}, warning = function(w) { - cat(paste("Warning:", conditionMessage(w), "\n")) - invokeRestart("muffleWarning") -}) +#!/usr/bin/env Rscript + +## begin warning handler +withCallingHandlers({ + +library(methods) # Because Rscript does not always do this + +options('useFancyQuotes' = FALSE) + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("RGalaxy")) + + +option_list <- list() + +option_list$exon_anno <- make_option('--exon_anno', type='character') +option_list$proteinseq <- make_option('--proteinseq', type='character') +option_list$procodingseq <- make_option('--procodingseq', type='character') +option_list$bam_file <- make_option('--bam', type='character') +option_list$idpDB_file <- make_option('--idpDB', type='character') +option_list$pepXmlTab_file <- make_option('--pepXmlTab', type='character') +option_list$peptideShakerPsmReport_file <- make_option('--peptideShakerPsmReport', type='character') +option_list$variantAnnotation_file <- make_option('--variantAnnotation', type='character') +option_list$searchEngineScore <- make_option('--searchEngineScore', type='character') + + +opt <- parse_args(OptionParser(option_list=option_list)) + + +psm2sam <- function( + exon_anno_file = GalaxyInputFile(required=TRUE), + proteinseq_file = GalaxyInputFile(required=TRUE), + procodingseq_file = GalaxyInputFile(required=TRUE), + bam_file = GalaxyInputFile(required=TRUE), + idpDB_file = GalaxyInputFile(required=FALSE), + pepXmlTab_file = GalaxyInputFile(required=FALSE), + peptideShakerPsmReport_file = GalaxyInputFile(required=FALSE), + variantAnnotation_file = GalaxyInputFile(required=FALSE), + searchEngineScore = GalaxyCharacterParam(required=FALSE) + ) +{ + options(stringsAsFactors = FALSE) + + if (length(bam_file) == 0) + { + stop("BAM file must be specified to provide sequence headers") + } + + outputHeader = grep("^@(?!PG)", readLines(bam_file, n=500, warn=FALSE), value=TRUE, perl=TRUE) + if (length(outputHeader) == 0) + { + stop("failed to read header lines from bam_file") + } + + # load customProDB from GitHub (NOTE: downloading the zip is faster than cloning the repo with git2r or devtools::install_github) + download.file("https://github.com/chambm/customProDB/archive/master.zip", "customProDB.zip", quiet=TRUE) + unzip("customProDB.zip") + devtools::load_all("customProDB-master") + + # load proBAMr from GitHub + download.file("https://github.com/chambm/proBAMr/archive/master.zip", "proBAMr.zip", quiet=TRUE) + unzip("proBAMr.zip") + devtools::load_all("proBAMr-master") + + psmInputLength = length(idpDB_file)+length(pepXmlTab_file)+length(peptideShakerPsmReport_file) + if (psmInputLength == 0) + { + stop("one of the input PSM file parameters must be specified") + } + else if (psmInputLength > 1) + { + stop("only one of the input PSM file parameters can be specified") + } + + if (length(idpDB_file) > 0) + { + if (length(searchEngineScore) == 0) + stop("searchEngineScore parameter must be specified when reading IDPicker PSMs, e.g. 'MyriMatch:MVH'") + passedPSM = readIdpDB(idpDB_file, searchEngineScore) + } + else if (length(pepXmlTab_file) > 0) + { + if (length(searchEngineScore) == 0) + stop("searchEngineScore parameter must be specified when reading pepXmlTab PSMs, e.g. 'mvh'") + passedPSM = readPepXmlTab(pepXmlTab_file, searchEngineScore) + } + else if (length(peptideShakerPsmReport_file) > 0) + { + if (length(searchEngineScore) > 0) + warning("searchEngineScore parameter is ignored when reading PeptideShaker PSM report") + passedPSM = readPeptideShakerPsmReport(peptideShakerPsmReport_file) + } + + load(exon_anno_file) + load(proteinseq_file) + load(procodingseq_file) + + if (length(variantAnnotation_file) > 0) + { + load(variantAnnotation_file) # variantAnnotation list, with members snvprocoding/snvproseq and indelprocoding/indelproseq + + varprocoding = unique(rbind(variantAnnotation$snvprocoding, variantAnnotation$indelprocoding)) + varproseq = unique(rbind(variantAnnotation$snvproseq, variantAnnotation$indelproseq)) + } + else + { + varprocoding = NULL + varproseq = NULL + } + + # add proBAMr program key + outputHeader = c(outputHeader, paste0("@PG\tID:proBAMr\tVN:", packageVersion("proBAMr"))) + + # first write header lines to the output SAM + writeLines(outputHeader, "output.sam") + + # then write the PSM "reads" + PSMtab2SAM(passedPSM, exon, + proteinseq, procodingseq, + varproseq, varprocoding, + outfile = "output.sam", + show_progress = FALSE) + + invisible(NULL) +} + +params <- list() +for(param in names(opt)) +{ + if (!param == "help") + params[param] <- opt[param] +} + +setClass("GalaxyRemoteError", contains="character") +wrappedFunction <- function(f) +{ + tryCatch(do.call(f, params), + error=function(e) new("GalaxyRemoteError", conditionMessage(e))) +} + + +suppressPackageStartupMessages(library(RGalaxy)) +do.call(psm2sam, params) + +## end warning handler +}, warning = function(w) { + cat(paste("Warning:", conditionMessage(w), "\n")) + invokeRestart("muffleWarning") +})
