Mercurial > repos > galaxyp > custom_pro_db
comparison customProDB.R @ 14:bc10f130dbec draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit e025f5b4d590c44537cf0702e2fb040a28f98fec
| author | galaxyp |
|---|---|
| date | Fri, 12 May 2017 13:17:27 -0400 |
| parents | 982fb2cde6c5 |
| children | 87274e4ef662 |
comparison
equal
deleted
inserted
replaced
| 13:37cd89a40cea | 14:bc10f130dbec |
|---|---|
| 25 option_list$ids <- make_option('--ids', type='character') | 25 option_list$ids <- make_option('--ids', type='character') |
| 26 option_list$dbsnpinCoding <- make_option('--dbsnpinCoding', type='character') | 26 option_list$dbsnpinCoding <- make_option('--dbsnpinCoding', type='character') |
| 27 option_list$cosmic <- make_option('--cosmic', type='character') | 27 option_list$cosmic <- make_option('--cosmic', type='character') |
| 28 option_list$annotationFromHistory <- make_option('--annotationFromHistory', type='logical', action="store_true", default=FALSE) | 28 option_list$annotationFromHistory <- make_option('--annotationFromHistory', type='logical', action="store_true", default=FALSE) |
| 29 option_list$rpkmCutoff <- make_option('--rpkmCutoff', type='character') | 29 option_list$rpkmCutoff <- make_option('--rpkmCutoff', type='character') |
| 30 #option_list$outputIndels <- make_option('--outputIndels', type='logical', action="store_true", default=FALSE) | 30 option_list$outputIndels <- make_option('--outputIndels', type='logical', action="store_true", default=FALSE) |
| 31 #option_list$outputNovelJunctions <- make_option('--outputNovelJunctions', type='logical', action="store_true", default=FALSE) | 31 #option_list$outputNovelJunctions <- make_option('--outputNovelJunctions', type='logical', action="store_true", default=FALSE) |
| 32 option_list$outputFile <- make_option('--outputFile', type='character') | 32 #option_list$bedFile <- make_option('--bedFile', type='character') |
| 33 #option_list$bsGenome <- make_option('--bsGenome', type='character') | |
| 34 option_list$outputRData <- make_option('--outputRData', type='logical', action="store_true", default=FALSE) | |
| 35 option_list$outputSQLite <- make_option('--outputSQLite', type='logical', action="store_true", default=FALSE) | |
| 33 | 36 |
| 34 | 37 |
| 35 opt <- parse_args(OptionParser(option_list=option_list)) | 38 opt <- parse_args(OptionParser(option_list=option_list)) |
| 36 | 39 |
| 37 | 40 |
| 38 customProDB <- function( | 41 customProDB <- function( |
| 39 bam_file = GalaxyInputFile(required=TRUE), | 42 bam_file = GalaxyInputFile(required=TRUE), |
| 40 bai_file = GalaxyInputFile(required=TRUE), | 43 bai_file = GalaxyInputFile(required=TRUE), |
| 41 vcf_file = GalaxyInputFile(required=TRUE), | 44 vcf_file = GalaxyInputFile(required=TRUE), |
| 42 exon_anno_file = GalaxyInputFile(required=TRUE), | 45 exon_anno_file = GalaxyInputFile(required=TRUE), |
| 43 proteinseq_file = GalaxyInputFile(required=TRUE), | 46 proteinseq_file = GalaxyInputFile(required=TRUE), |
| 44 procodingseq_file = GalaxyInputFile(required=TRUE), | 47 procodingseq_file = GalaxyInputFile(required=TRUE), |
| 45 ids_file = GalaxyInputFile(required=TRUE), | 48 ids_file = GalaxyInputFile(required=TRUE), |
| 46 dbsnpinCoding_file = GalaxyInputFile(required=FALSE), | 49 dbsnpinCoding_file = GalaxyInputFile(required=FALSE), |
| 47 cosmic_file = GalaxyInputFile(required=FALSE), | 50 cosmic_file = GalaxyInputFile(required=FALSE), |
| 48 annotationFromHistory = GalaxyLogicalParam(required=FALSE), | 51 annotationFromHistory = GalaxyLogicalParam(required=FALSE), |
| 49 rpkmCutoff = GalaxyNumericParam(required=TRUE), | 52 rpkmCutoff = GalaxyNumericParam(required=TRUE), |
| 50 #outputIndels = GalaxyLogicalParam(required=FALSE), | 53 outputIndels = GalaxyLogicalParam(required=FALSE), |
| 51 #outputNovelJunctions = GalaxyLogicalParam(required=FALSE), | 54 outputRData = GalaxyLogicalParam(required=FALSE), |
| 52 outputFile = GalaxyOutput("FASTA","fasta")) | 55 outputSQLite = GalaxyLogicalParam(required=FALSE) |
| 56 #,outputNovelJunctions = GalaxyLogicalParam(required=FALSE) | |
| 57 #,bedFile = GalaxyInputFile(required=FALSE) | |
| 58 #,bsGenome = GalaxyCharacterParam(required=FALSE) | |
| 59 ) | |
| 53 { | 60 { |
| 61 old <- options(stringsAsFactors = FALSE, gsubfn.engine = "R") | |
| 62 on.exit(options(old), add = TRUE) | |
| 63 | |
| 54 file.symlink(exon_anno_file, paste(getwd(), "exon_anno.RData", sep="/")) | 64 file.symlink(exon_anno_file, paste(getwd(), "exon_anno.RData", sep="/")) |
| 55 file.symlink(proteinseq_file, paste(getwd(), "proseq.RData", sep="/")) | 65 file.symlink(proteinseq_file, paste(getwd(), "proseq.RData", sep="/")) |
| 56 file.symlink(procodingseq_file, paste(getwd(), "procodingseq.RData", sep="/")) | 66 file.symlink(procodingseq_file, paste(getwd(), "procodingseq.RData", sep="/")) |
| 57 file.symlink(ids_file, paste(getwd(), "ids.RData", sep="/")) | 67 file.symlink(ids_file, paste(getwd(), "ids.RData", sep="/")) |
| 58 | 68 |
| 69 load(exon_anno_file) | |
| 70 load(proteinseq_file) | |
| 71 load(procodingseq_file) | |
| 72 load(ids_file) | |
| 73 | |
| 59 if (length(dbsnpinCoding_file) > 0) | 74 if (length(dbsnpinCoding_file) > 0) |
| 60 { | 75 { |
| 61 file.symlink(dbsnpinCoding_file, paste(getwd(), "dbsnpinCoding.RData", sep="/")) | 76 file.symlink(dbsnpinCoding_file, paste(getwd(), "dbsnpinCoding.RData", sep="/")) |
| 62 labelrsid = T | 77 labelrsid = TRUE |
| 78 load(dbsnpinCoding_file) | |
| 63 } | 79 } |
| 64 else | 80 else |
| 65 { | 81 { |
| 66 labelrsid = F | 82 dbsnpinCoding = NULL |
| 83 labelrsid = FALSE | |
| 67 } | 84 } |
| 68 | 85 |
| 69 if (length(cosmic_file) > 0) | 86 if (length(cosmic_file) > 0) |
| 70 { | 87 { |
| 71 file.symlink(cosmic_file, paste(getwd(), "cosmic.RData", sep="/")) | 88 file.symlink(cosmic_file, paste(getwd(), "cosmic.RData", sep="/")) |
| 72 cosmic = T | 89 use_cosmic = TRUE |
| 90 load(cosmic_file) | |
| 73 } | 91 } |
| 74 else | 92 else |
| 75 { | 93 { |
| 76 cosmic = F | 94 cosmic = NULL |
| 95 use_cosmic = FALSE | |
| 77 } | 96 } |
| 78 | 97 |
| 79 bamLink = "input.bam" | 98 bamLink = "input.bam" |
| 80 file.symlink(bam_file, bamLink) | 99 file.symlink(bam_file, bamLink) |
| 81 file.symlink(bai_file, paste(bamLink, ".bai", sep="")) | 100 file.symlink(bai_file, paste(bamLink, ".bai", sep="")) |
| 82 | 101 |
| 83 suppressPackageStartupMessages(library(customProDB)) | 102 # load from GitHub until conda package is available |
| 103 download.file("https://github.com/ggrothendieck/sqldf/archive/master.zip", "sqldf.zip", quiet=TRUE) | |
| 104 unzip("sqldf.zip") | |
| 105 devtools::load_all("sqldf-master") | |
| 106 | |
| 107 # load customProDB from GitHub (NOTE: downloading the zip is faster than cloning the repo with git2r or devtools::install_github) | |
| 108 download.file("https://github.com/chambm/customProDB/archive/master.zip", "customProDB.zip", quiet=TRUE) | |
| 109 unzip("customProDB.zip") | |
| 110 devtools::load_all("customProDB-master") | |
| 84 | 111 |
| 85 easyRun(bamFile=bamLink, vcfFile=vcf_file, annotation_path=getwd(), | 112 easyRun(bamFile=bamLink, vcfFile=vcf_file, annotation_path=getwd(), |
| 86 rpkm_cutoff=rpkmCutoff, outfile_path=".", outfile_name="output", | 113 rpkm_cutoff=rpkmCutoff, outfile_path=".", outfile_name="output", |
| 87 nov_junction=F, INDEL=T, lablersid=labelrsid, COSMIC=cosmic) | 114 nov_junction=FALSE, INDEL=outputIndels, |
| 115 lablersid=labelrsid, COSMIC=use_cosmic) | |
| 116 | |
| 117 # save variant annotations to an RData file (needed by proBAMr) | |
| 118 if (outputRData || outputSQLite) | |
| 119 { | |
| 120 variantAnnotation = getVariantAnnotation(vcf_file, ids, exon, proteinseq, procodingseq, dbsnpinCoding, cosmic) | |
| 121 if (outputRData) save(variantAnnotation, file="output.rdata") | |
| 122 } | |
| 123 | |
| 124 if (outputSQLite) | |
| 125 { | |
| 126 # create protein-centric variant annotation table (needed by Galaxy-P viewer MVP) | |
| 127 varproseq = unique(rbind(variantAnnotation$snvproseq, variantAnnotation$indelproseq)) | |
| 128 ref_vs_var_seq = sqldf::sqldf("SELECT reference.pro_name, variant.pro_name AS var_pro_name, reference.peptide AS ref_seq, variant.peptide AS var_seq | |
| 129 FROM proteinseq reference, varproseq variant | |
| 130 WHERE reference.tx_name=variant.tx_name | |
| 131 GROUP BY variant.pro_name") | |
| 132 getCigarishString = function(ref, var) | |
| 133 { | |
| 134 a = Biostrings::pairwiseAlignment(ref, var) | |
| 135 d = gsub("[A-Z]", "=", Biostrings::compareStrings(a@pattern, a@subject)) | |
| 136 r = rle(strsplit(d, "")[[1]]) | |
| 137 gsub("-", "D", gsub("\\+", "I", gsub("\\?", "X", paste0(r$lengths, r$values, collapse="")))) | |
| 138 } | |
| 139 ref_vs_var_seq$cigar = mapply(FUN=getCigarishString, ref_vs_var_seq$ref_seq, ref_vs_var_seq$var_seq, USE.NAMES=FALSE) | |
| 140 ref_vs_var_seq$annotation = substring(ref_vs_var_seq$var_pro_name, stringr::str_length(ref_vs_var_seq$pro_name)+2) | |
| 141 | |
| 142 variant_annotation_sqlite = dbConnect(RSQLite::SQLite(), "output_variant_annotation.sqlite") | |
| 143 dbWriteTable(variant_annotation_sqlite, | |
| 144 "variant_annotation", | |
| 145 sqldf::sqldf("SELECT var_pro_name, pro_name, cigar, annotation FROM ref_vs_var_seq")) | |
| 146 DBI::dbExecute(variant_annotation_sqlite, "CREATE INDEX variant_annotation_var_pro_name ON variant_annotation (var_pro_name)") | |
| 147 | |
| 148 # save genomic mapping to a SQLite file (needed by Galaxy-P viewer MVP) | |
| 149 exon$cds_start = as.integer(exon$cds_start) | |
| 150 exon$cds_end = as.integer(exon$cds_end) | |
| 151 genomic_mapping_sqlite = dbConnect(RSQLite::SQLite(), "output_genomic_mapping.sqlite") | |
| 152 varprocoding = unique(rbind(variantAnnotation$snvprocoding, variantAnnotation$indelprocoding)) | |
| 153 dbWriteTable(genomic_mapping_sqlite, | |
| 154 "genomic_mapping", | |
| 155 sqldf::sqldf("SELECT exon.gene_name, exon.tx_name, varprocoding.pro_name, cds_start, cds_end, | |
| 156 chromosome_name AS chr_name, cds_chr_start, cds_chr_end, exon.strand | |
| 157 FROM exon, varprocoding | |
| 158 WHERE exon.tx_id=varprocoding.tx_id AND cds_chr_start > 0 | |
| 159 GROUP BY exon.tx_id, rank | |
| 160 UNION | |
| 161 SELECT gene_name, tx_name, pro_name, cds_start, cds_end, | |
| 162 chromosome_name AS chr_name, cds_chr_start, cds_chr_end, exon.strand | |
| 163 FROM exon | |
| 164 WHERE cds_chr_start > 0 | |
| 165 GROUP BY tx_id, rank")) | |
| 166 DBI::dbExecute(genomic_mapping_sqlite, "CREATE INDEX genomic_mapping_pro_name ON genomic_mapping (pro_name)") | |
| 167 } | |
| 168 | |
| 169 invisible(NULL) | |
| 88 } | 170 } |
| 89 | 171 |
| 90 | 172 |
| 91 params <- list() | 173 params <- list() |
| 92 for(param in names(opt)) | 174 for(param in names(opt)) |
