diff PSM2SAM.R @ 7:cd69250e1150 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/psm2sam commit 9910ed076e4b8a3f083351b89fa861d0a4a93beb
author galaxyp
date Wed, 17 May 2017 20:22:39 -0400
parents ce09f1a1bbad
children d9eba92cce55
line wrap: on
line diff
--- a/PSM2SAM.R	Wed Feb 03 12:38:19 2016 -0500
+++ b/PSM2SAM.R	Wed May 17 20:22:39 2017 -0400
@@ -1,398 +1,149 @@
-#!/usr/bin/env Rscript
-
-initial.options <- commandArgs(trailingOnly = FALSE)
-script_parent_dir <- dirname(sub("--file=", "", initial.options[grep("--file=", initial.options)]))
-
-## begin warning handler
-withCallingHandlers({
-
-library(methods) # Because Rscript does not always do this
-
-options('useFancyQuotes' = FALSE)
-
-suppressPackageStartupMessages(library("optparse"))
-suppressPackageStartupMessages(library("RGalaxy"))
-
-
-option_list <- list()
-
-option_list$passedPSM <- make_option('--passedPSM', type='character')
-option_list$XScolumn <- make_option('--XScolumn', type='character')
-option_list$exon_anno <- make_option('--exon_anno', type='character')
-option_list$proteinseq <- make_option('--proteinseq', type='character')
-option_list$procodingseq <- make_option('--procodingseq', type='character')
-option_list$header <- make_option('--header', type='character')
-option_list$OutputFile <- make_option('--OutputFile', type='character')
-
-
-opt <- parse_args(OptionParser(option_list=option_list))
-
-
-
-
-PSMtab2SAM <- function(
-	passedPSM_file = GalaxyInputFile(required=TRUE), 
-	exon_anno_file = GalaxyInputFile(required=TRUE),
-	proteinseq_file = GalaxyInputFile(required=TRUE),
-	procodingseq_file = GalaxyInputFile(required=TRUE),
-	header_file = GalaxyInputFile(required=TRUE),
-	XScolumn = GalaxyCharacterParam(required=TRUE), 
-	OutputFile = GalaxyOutput("proSAM","sam"))
-{
-    if (!file.exists(header_file)) { gstop("failed to read header file") }
-    if (file.exists(OutputFile))
-    {
-        if (file.info(OutputFile)$size > 0) { gstop("output file already exists") }
-        else
-        {
-            tryCatch(
-            {
-                file.remove(OutputFile)
-            }, error=function(err)
-            {
-                gstop("failed to remove empty existing file")
-            })
-        }
-    }
-
-    suppressPackageStartupMessages(library(GenomicRanges))
-    suppressPackageStartupMessages(library(Biostrings))
-    
-    options(stringsAsFactors=FALSE)
-
-    scoreName = XScolumn
-    columnName = gsub("[^A-Z0-9]", "_", scoreName)
-    
-    passedPSM <- tryCatch({
-        #read.delim(passedPSM_file, row.names=1)
-        suppressPackageStartupMessages(library(RSQLite))
-        drv <- dbDriver("SQLite")
-        con <- dbConnect(drv, passedPSM_file)
-        
-        # do case-insensitive search for the score name
-        res <- dbSendQuery(con, paste("SELECT Id, Name FROM PeptideSpectrumMatchScoreName WHERE lower(Name)=lower('", scoreName, "')", sep=""))
-        scoreInfo = fetch(res, n=1)
-        scoreId = scoreInfo["Id"]
-        realScoreName = scoreInfo["Name"] # original case
-        dbClearResult(res)
-
-        sql <- paste("SELECT ss.Name as SourceName, s.NativeID",
-                     ", psm.ObservedNeutralMass AS precursor_neutral_mass",
-                     ", psm.Charge AS assumed_charge",
-                     ", s.ScanTimeInSeconds AS retention_time_sec",
-                     ", psm.Rank AS hit_rank",
-                     ", IFNULL(SUBSTR(pd.Sequence, pi.Offset+1, pi.Length), pep.DecoySequence) AS peptide",
-                     ", pro.Accession AS protein",
-                     ", COUNT(DISTINCT pro.Id) AS num_tot_proteins",
-                     ", psm.ObservedNeutralMass - (psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026) AS calc_neutral_pep_mass",
-                     ", psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026 AS massdiff",
-                     ", CTerminusIsSpecific+NTerminusIsSpecific AS num_tol_term",
-                     ", MissedCleavages AS num_missed_cleavages",
-                     ", psm.QValue AS qvalue",
-                     ", psmScore.Value AS ", columnName,
-                     ", GROUP_CONCAT(DISTINCT pm.Offset || ';' || mod.MonoMassDelta) AS modification",
-                     "FROM PeptideSpectrumMatch psm",
-                     "JOIN Spectrum s ON psm.Spectrum=s.Id",
-                     "JOIN SpectrumSource ss ON s.Source=ss.Id",
-                     "JOIN PeptideInstance pi ON psm.Peptide=pi.Peptide",
-                     "JOIN Protein pro ON pi.Protein=pro.Id",
-                     "JOIN Peptide pep ON pi.Peptide=pep.Id",
-                     "JOIN PeptideSpectrumMatchScore psmScore ON psmScore.PsmId=psm.Id AND ScoreNameId=", scoreId,
-                     "LEFT JOIN ProteinData pd ON pro.Id=pd.Id",
-                     "LEFT JOIN PeptideModification pm ON psm.Id=pm.PeptideSpectrumMatch",
-                     "LEFT JOIN Modification mod ON pm.Modification=mod.Id",
-                     "GROUP BY psm.Id"
-                    )
-
-        res <- dbSendQuery(con, sql)
-        passedPSM <- fetch(res, n=-1)
-        dbClearResult(res)
-        dbDisconnect(con)
-        passedPSM
-    }, error=function(err) {
-        gstop("failed to read passedPSM: ", err)
-    })
-
-    tryCatch({
-        load(exon_anno_file)
-        exon_anno <- exon
-    }, error=function(err) {
-        gstop("failed to read exon_anno: ", conditionMessage(err))
-    })
-
-    tryCatch({
-        load(proteinseq_file)
-    }, error=function(err) {
-        gstop("failed to read proteinseq: ", conditionMessage(err))
-    })
-
-    tryCatch({
-        load(procodingseq_file)
-    }, error=function(err) {
-        gstop("failed to read procodingseq: ", conditionMessage(err))
-    })
-
-    PEP <- passedPSM[, 'peptide']
-    #Spectrumid <- paste(passedPSM[, 'SourceName'], gsub(" ", "_", passedPSM[, 'NativeID']), sep="/")
-    Spectrumid <- paste(passedPSM[, 'SourceName'], passedPSM[, 'NativeID'], sep="/")
-    #PEP_SEQ <- formatPep(spectra[, 'Sequence'])
-
-    SAM <- c()
-
-    spectrumcount <- table(Spectrumid)
-
-    for(i in 1:dim(passedPSM)[1]){
-        #print(i)
-        peptide <- PEP[i]
-        QNAME <- Spectrumid[i]
-        idx <- grep(peptide, proteinseq[, 'peptide'], fixed=TRUE)
-        if(length(idx) == 0){
-            RNAME <- '*' 
-            MAPQ <- 255
-            RNEXT <- '*'
-            PNEXT <- 0
-            TLEN <- 0
-            QUAL <- '*'
-            POS <- 0
-            SEQ <- '*'
-            CIGAR <- '*'            
-            FLAG <- 0x4 
-            annoted <- '?'
-            XA <- paste('XA:Z:', annoted, sep='')
-            res <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
-                    as.character(SEQ), QUAL, XA)
-            res <-  unique(data.frame(t(res))) 
-        }else{
-            pro <- proteinseq[idx, ]
-            sta_pos <- unlist(lapply(pro[, 'peptide'], function(x) 
-                        regexpr(peptide, x, fixed=TRUE)))
-            pep_len <- nchar(peptide)
-            end_pos <- sta_pos + pep_len - 1
-            
-            coding <- procodingseq[match(pro[, 'pro_name'], 
-                                        procodingseq[, 'pro_name']), ]
-            code_s <- (sta_pos-1) * 3 + 1
-            code_e <- end_pos * 3
-            codingseq <- substring(coding[, 'coding'], code_s, code_e)
-            
-            
-            
-            exonp <- lapply(pro[, 'tx_name'], function(x)
-                        exon_anno[exon_anno[, 'tx_name']==x, ])
-            
-            exonp <- lapply(exonp, function(x){
-                            if(length(unique(x[, 'tx_id'])) > 1){
-                                x[grep(x[1, 'tx_id'], x[, 'tx_id'], 
-                                fixed=TRUE), ]
-                            }else x
-                        })
-                       
-            if(passedPSM[i, 'hit_rank'] == 1) pri <- TRUE else pri <- FALSE
-                
-            res <- mapply(function(x, y, z, m) 
-                    if(dim(z)[1] == 0){
-                        .proteinUnannotated(x, y, z, m, primary=pri)
-                    }else{
-                        if((nchar(m) != 3*pep_len) | (y > max(z[, 'cds_end'], 
-                            na.rm = TRUE))){
-                        #if(toString(translate(DNAString(m))) != peptide){
-                            .peptideUnannotated(x, y, z, m, primary=pri)
-                        }else{
-                            .MapCoding2genome(x, y, z, m, primary=pri)
-                        }
-                    }, 
-                    code_s, code_e, exonp, codingseq) 
-                    
-            res <-  unique(data.frame(t(res)))
-            
-        }
-        XL <- paste('XL:i:', as.numeric(spectrumcount[QNAME]), sep='')
-        NH <- paste('NH:i:', dim(res)[1], sep='')
-        XP <- paste('XP:Z:', peptide, sep='')
-        #XF <- paste('XF:f:', round(passedPSM[i, XFcolumn], digits=4), sep='')
-        XC <- paste('XC:i:', passedPSM[i, 'assumed_charge'], sep='')
-        XS <- paste('XS:f:', round(as.numeric(passedPSM[i, columnName]), 
-                        digits=4), sep='')
-        #XA <- paste('XA:Z:', annoted, sep='')
-        XN <- paste('XN:i:', passedPSM[i, 'num_missed_cleavages'], sep='')
-        XT <- paste('XT:i:', passedPSM[i, 'num_tol_term'], sep='')
-
-        XM <-  ifelse(is.na(passedPSM[i, 'modification']), paste('XM:Z:-'),
-                    paste('XM:Z:', passedPSM[i, 'modification'], sep=''))
-
-        res <- cbind(QNAME, res, NH, XL, XP, XC, XS, XM, XN, XT)    
-        SAM <- rbind(SAM, res)
-    }
-
-    file.copy(header_file, OutputFile)
-    write.table(SAM, file=OutputFile, sep='\t', quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE)
-}
-
-
-
-.proteinUnannotated <-function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...)
-{ 
-    RNAME <- '*' 
-    MAPQ <- 255
-    RNEXT <- '*'
-    PNEXT <- 0
-    TLEN <- 0
-    QUAL <- '*'
-    POS <- 0
-    SEQ <- '*'
-    CIGAR <- '*'
-    annoted <- 2
-    XA <- paste('XA:Z:', annoted, sep='')
-    FLAG <- 0x4 
-    
-    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
-                as.character(SEQ), QUAL, XA)
-    tmp
-}
-
-
-
-.peptideUnannotated <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...)
-{    
-    #RNAME <- as.character(exon_anno[1, 'chromosome_name']) 
-    RNAME <- '*'
-    MAPQ <- 255
-    RNEXT <- '*'
-    PNEXT <- 0
-    TLEN <- 0
-    QUAL <- '*'
-    POS <- 0
-    SEQ <- '*'
-    CIGAR <- '*'
-    annoted <- 1
-    XA <- paste('XA:Z:', annoted, sep='')
-    FLAG <- 0x4 
-    #if(exon_anno[1, 'strand'] == '+'){        
-    #    FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) 
-    #}else{
-    #    FLAG <- ifelse(primary==TRUE, 0x10, 0x10+0x100)   
-    #}
-    
-    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
-                as.character(SEQ), QUAL, XA)
-    tmp
-}
-
-
-
-.MapCoding2genome <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...)
-{
-    idxs <- intersect(which(exon_anno[, 'cds_start'] <= c_sta), 
-                        which(exon_anno[, 'cds_end'] >= c_sta))
-    idxe <- intersect(which(exon_anno[, 'cds_start'] <= c_end), 
-                        which(exon_anno[, 'cds_end'] >= c_end))              
-    len <- c_end - c_sta + 1 
-    RNAME <- as.character(exon_anno[1, 'chromosome_name']) 
-    MAPQ <- 255
-    RNEXT <- '*'
-    PNEXT <- 0
-    TLEN <- 0
-    QUAL <- '*'
-    annoted <- 0
-    XA <- paste('XA:Z:', annoted, sep='')
-    
-    if(exon_anno[1, 'strand'] == '+'){
-            POS <- exon_anno[idxs, 'cds_chr_start'] + c_sta - 
-                    exon_anno[idxs, 'cds_start']
-            SEQ <- DNAString(cseq)
-            FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) 
-            
-     }else{
-            POS <- exon_anno[idxe, 'cds_chr_start'] +  
-                    exon_anno[idxe, 'cds_end'] - c_end
-            SEQ <- reverseComplement(DNAString(cseq))
-            FLAG <- ifelse(primary==TRUE, 0x10, 0x10 + 0x100) 
-     }
-    
-    if(idxe == idxs){
-        CIGAR <- paste(len, 'M', sep='')        
-    }else{
-         if(exon_anno[1, 'strand'] == '+'){
-            #insert <- exon_anno[idxe, 'cds_chr_start'] - exon_anno[idxs, 
-            #                        'cds_chr_end']- 1
-            part1 <- exon_anno[idxs, 'cds_end'] - c_sta + 1 
-            part2 <- c_end - exon_anno[idxe, 'cds_start'] + 1  
-            
-            insert <- unlist(lapply(1:(idxe - idxs), function(x) 
-                paste(exon_anno[idxs + x, 'cds_chr_start'] - 
-                    exon_anno[idxs+x-1, 'cds_chr_end']- 1, 'N', sep='')))
-            if(idxe-idxs >1){
-                innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) 
-                        paste(exon_anno[idxs + x, 'cds_chr_end'] - 
-                        exon_anno[idxs + x, 'cds_chr_start'] + 1, 'M', 
-                        sep=''))) 
-            }else{ innerexon <- ''}
-            
-            #ifelse(idxe-idxs >1, unlist(lapply(1:(idxe-idxs-1), function(x) 
-            #paste(exon_anno[idxs+x, 'cds_chr_end'] - 
-            #             exon_anno[idxs+x, 'cds_chr_start']+1, 
-            #             'M', sep=''))), '')
-            midpattern <- rep(NA, length(insert) + length(innerexon))
-            midpattern[seq(1, length(insert) + length(innerexon), 
-                                                        by=2)] <- insert
-            midpattern[seq(2, length(insert) + length(innerexon), 
-                                                        by=2)] <- innerexon
-            midpattern <- paste(midpattern, collapse='') 
-            
-        }else{
-            #insert <- exon_anno[idxs, 'cds_chr_start'] - 
-            #            exon_anno[idxe, 'cds_chr_end']- 1
-            part1 <- c_end- exon_anno[idxe, 'cds_start'] + 1 
-            part2 <- exon_anno[idxs, 'cds_end'] - c_sta + 1  
-            
-            insert <- unlist(lapply(1:(idxe-idxs), function(x) 
-                paste(exon_anno[idxe - x, 'cds_chr_start'] - 
-                        exon_anno[idxe-x + 1, 'cds_chr_end']- 1, 
-                        'N', sep='')))
-            if(idxe-idxs >1){
-                innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) 
-                        paste(exon_anno[idxe-x, 'cds_chr_end'] - 
-                        exon_anno[idxe-x, 'cds_chr_start']+1, 'M', sep='')))
-            }else{ innerexon <-''}
-        
-            midpattern <- rep(NA, length(insert)+length(innerexon))
-            midpattern[seq(1, length(insert) + length(innerexon), 
-                                                        by=2)] <- insert
-            midpattern[seq(2, length(insert) + length(innerexon), 
-                                                        by=2)] <- innerexon
-            midpattern <- paste(midpattern, collapse='') 
-
-        }
-
-        CIGAR <- paste(part1, 'M', midpattern, part2, 'M', sep='')    
-    }
-
-    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
-                as.character(SEQ), QUAL, XA)
-    tmp
-}
-
-
-params <- list()
-for(param in names(opt))
-{
-    if (!param == "help")
-        params[param] <- opt[param]
-}
-
-setClass("GalaxyRemoteError", contains="character")
-wrappedFunction <- function(f)
-{
-    tryCatch(do.call(f, params),
-        error=function(e) new("GalaxyRemoteError", conditionMessage(e)))
-}
-
-
-suppressPackageStartupMessages(library(RGalaxy))
-do.call(PSMtab2SAM, params)
-
-## end warning handler
-}, warning = function(w) {
-    cat(paste("Warning:", conditionMessage(w), "\n"))
-    invokeRestart("muffleWarning")
-})
+#!/usr/bin/env Rscript
+
+## begin warning handler
+withCallingHandlers({
+
+library(methods) # Because Rscript does not always do this
+
+options('useFancyQuotes' = FALSE)
+
+suppressPackageStartupMessages(library("optparse"))
+suppressPackageStartupMessages(library("RGalaxy"))
+
+
+option_list <- list()
+
+option_list$exon_anno <- make_option('--exon_anno', type='character')
+option_list$proteinseq <- make_option('--proteinseq', type='character')
+option_list$procodingseq <- make_option('--procodingseq', type='character')
+option_list$bam_file <- make_option('--bam', type='character')
+option_list$idpDB_file <- make_option('--idpDB', type='character')
+option_list$pepXmlTab_file <- make_option('--pepXmlTab', type='character')
+option_list$peptideShakerPsmReport_file <- make_option('--peptideShakerPsmReport', type='character')
+option_list$variantAnnotation_file <- make_option('--variantAnnotation', type='character')
+option_list$searchEngineScore <- make_option('--searchEngineScore', type='character')
+
+
+opt <- parse_args(OptionParser(option_list=option_list))
+
+
+psm2sam <- function(
+    exon_anno_file = GalaxyInputFile(required=TRUE),
+    proteinseq_file = GalaxyInputFile(required=TRUE),
+    procodingseq_file = GalaxyInputFile(required=TRUE),
+    bam_file = GalaxyInputFile(required=TRUE),
+    idpDB_file = GalaxyInputFile(required=FALSE),
+    pepXmlTab_file = GalaxyInputFile(required=FALSE),
+    peptideShakerPsmReport_file = GalaxyInputFile(required=FALSE),
+    variantAnnotation_file = GalaxyInputFile(required=FALSE),
+    searchEngineScore = GalaxyCharacterParam(required=FALSE)
+    )
+{
+    options(stringsAsFactors = FALSE)
+
+    if (length(bam_file) == 0)
+    {
+        stop("BAM file must be specified to provide sequence headers")
+    }
+
+    outputHeader = grep("^@(?!PG)", readLines(bam_file, n=500, warn=FALSE), value=TRUE, perl=TRUE)
+    if (length(outputHeader) == 0)
+    {
+        stop("failed to read header lines from bam_file")
+    }
+
+    # load customProDB from GitHub (NOTE: downloading the zip is faster than cloning the repo with git2r or devtools::install_github)
+    download.file("https://github.com/chambm/customProDB/archive/master.zip", "customProDB.zip", quiet=TRUE)
+    unzip("customProDB.zip")
+    devtools::load_all("customProDB-master")
+
+    # load proBAMr from GitHub
+    download.file("https://github.com/chambm/proBAMr/archive/master.zip", "proBAMr.zip", quiet=TRUE)
+    unzip("proBAMr.zip")
+    devtools::load_all("proBAMr-master")
+    
+    psmInputLength = length(idpDB_file)+length(pepXmlTab_file)+length(peptideShakerPsmReport_file)
+    if (psmInputLength == 0)
+    {
+        stop("one of the input PSM file parameters must be specified")
+    }
+    else if (psmInputLength > 1)
+    {
+        stop("only one of the input PSM file parameters can be specified")
+    }
+    
+    if (length(idpDB_file) > 0)
+    {
+        if (length(searchEngineScore) == 0)
+            stop("searchEngineScore parameter must be specified when reading IDPicker PSMs, e.g. 'MyriMatch:MVH'")
+        passedPSM = readIdpDB(idpDB_file, searchEngineScore)
+    }
+    else if (length(pepXmlTab_file) > 0)
+    {
+        if (length(searchEngineScore) == 0)
+            stop("searchEngineScore parameter must be specified when reading pepXmlTab PSMs, e.g. 'mvh'")
+        passedPSM = readPepXmlTab(pepXmlTab_file, searchEngineScore)
+    }
+    else if (length(peptideShakerPsmReport_file) > 0)
+    {
+        if (length(searchEngineScore) > 0)
+            warning("searchEngineScore parameter is ignored when reading PeptideShaker PSM report")
+        passedPSM = readPeptideShakerPsmReport(peptideShakerPsmReport_file)
+    }
+
+    load(exon_anno_file)
+    load(proteinseq_file)
+    load(procodingseq_file)
+
+    if (length(variantAnnotation_file) > 0)
+    {
+        load(variantAnnotation_file) # variantAnnotation list, with members snvprocoding/snvproseq and indelprocoding/indelproseq
+
+        varprocoding = unique(rbind(variantAnnotation$snvprocoding, variantAnnotation$indelprocoding))
+        varproseq = unique(rbind(variantAnnotation$snvproseq, variantAnnotation$indelproseq))
+    }
+    else
+    {
+        varprocoding = NULL
+        varproseq = NULL
+    }
+
+    # add proBAMr program key
+    outputHeader = c(outputHeader, paste0("@PG\tID:proBAMr\tVN:", packageVersion("proBAMr")))
+
+    # first write header lines to the output SAM
+    writeLines(outputHeader, "output.sam")
+
+    # then write the PSM "reads"
+    PSMtab2SAM(passedPSM, exon,
+               proteinseq, procodingseq,
+               varproseq, varprocoding,
+               outfile = "output.sam",
+               show_progress = FALSE)
+
+    invisible(NULL)
+}
+
+params <- list()
+for(param in names(opt))
+{
+    if (!param == "help")
+        params[param] <- opt[param]
+}
+
+setClass("GalaxyRemoteError", contains="character")
+wrappedFunction <- function(f)
+{
+    tryCatch(do.call(f, params),
+        error=function(e) new("GalaxyRemoteError", conditionMessage(e)))
+}
+
+
+suppressPackageStartupMessages(library(RGalaxy))
+do.call(psm2sam, params)
+
+## end warning handler
+}, warning = function(w) {
+    cat(paste("Warning:", conditionMessage(w), "\n"))
+    invokeRestart("muffleWarning")
+})