Mercurial > repos > davidvanzessen > mutation_analysis
comparison baseline/Baseline_Functions.r @ 114:e7b550d52eb7 draft
Uploaded
| author | davidvanzessen | 
|---|---|
| date | Tue, 09 Aug 2016 07:20:41 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 113:b84477f57318 | 114:e7b550d52eb7 | 
|---|---|
| 1 ######################################################################################### | |
| 2 # License Agreement | |
| 3 # | |
| 4 # THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE | |
| 5 # ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER | |
| 6 # APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE | |
| 7 # OR COPYRIGHT LAW IS PROHIBITED. | |
| 8 # | |
| 9 # BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE | |
| 10 # BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED | |
| 11 # TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN | |
| 12 # CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. | |
| 13 # | |
| 14 # BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences | |
| 15 # Coded by: Mohamed Uduman & Gur Yaari | |
| 16 # Copyright 2012 Kleinstein Lab | |
| 17 # Version: 1.3 (01/23/2014) | |
| 18 ######################################################################################### | |
| 19 | |
| 20 # Global variables | |
| 21 | |
| 22 FILTER_BY_MUTATIONS = 1000 | |
| 23 | |
| 24 # Nucleotides | |
| 25 NUCLEOTIDES = c("A","C","G","T") | |
| 26 | |
| 27 # Amino Acids | |
| 28 AMINO_ACIDS <- c("F", "F", "L", "L", "S", "S", "S", "S", "Y", "Y", "*", "*", "C", "C", "*", "W", "L", "L", "L", "L", "P", "P", "P", "P", "H", "H", "Q", "Q", "R", "R", "R", "R", "I", "I", "I", "M", "T", "T", "T", "T", "N", "N", "K", "K", "S", "S", "R", "R", "V", "V", "V", "V", "A", "A", "A", "A", "D", "D", "E", "E", "G", "G", "G", "G") | |
| 29 names(AMINO_ACIDS) <- c("TTT", "TTC", "TTA", "TTG", "TCT", "TCC", "TCA", "TCG", "TAT", "TAC", "TAA", "TAG", "TGT", "TGC", "TGA", "TGG", "CTT", "CTC", "CTA", "CTG", "CCT", "CCC", "CCA", "CCG", "CAT", "CAC", "CAA", "CAG", "CGT", "CGC", "CGA", "CGG", "ATT", "ATC", "ATA", "ATG", "ACT", "ACC", "ACA", "ACG", "AAT", "AAC", "AAA", "AAG", "AGT", "AGC", "AGA", "AGG", "GTT", "GTC", "GTA", "GTG", "GCT", "GCC", "GCA", "GCG", "GAT", "GAC", "GAA", "GAG", "GGT", "GGC", "GGA", "GGG") | |
| 30 names(AMINO_ACIDS) <- names(AMINO_ACIDS) | |
| 31 | |
| 32 #Amino Acid Traits | |
| 33 #"*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V" "W" "Y" | |
| 34 #B = "Hydrophobic/Burried" N = "Intermediate/Neutral" S="Hydrophilic/Surface") | |
| 35 TRAITS_AMINO_ACIDS_CHOTHIA98 <- c("*","N","B","S","S","B","N","N","B","S","B","B","S","N","S","S","N","N","B","B","N") | |
| 36 names(TRAITS_AMINO_ACIDS_CHOTHIA98) <- sort(unique(AMINO_ACIDS)) | |
| 37 TRAITS_AMINO_ACIDS <- array(NA,21) | |
| 38 | |
| 39 # Codon Table | |
| 40 CODON_TABLE <- as.data.frame(matrix(NA,ncol=64,nrow=12)) | |
| 41 | |
| 42 # Substitution Model: Smith DS et al. 1996 | |
| 43 substitution_Literature_Mouse <- matrix(c(0, 0.156222928, 0.601501588, 0.242275484, 0.172506739, 0, 0.241239892, 0.586253369, 0.54636291, 0.255795364, 0, 0.197841727, 0.290240811, 0.467680608, 0.24207858, 0),nrow=4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES)) | |
| 44 substitution_Flu_Human <- matrix(c(0,0.2795596,0.5026927,0.2177477,0.1693210,0,0.3264723,0.5042067,0.4983549,0.3328321,0,0.1688130,0.2021079,0.4696077,0.3282844,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES)) | |
| 45 substitution_Flu25_Human <- matrix(c(0,0.2580641,0.5163685,0.2255674,0.1541125,0,0.3210224,0.5248651,0.5239281,0.3101292,0,0.1659427,0.1997207,0.4579444,0.3423350,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES)) | |
| 46 load("FiveS_Substitution.RData") | |
| 47 | |
| 48 # Mutability Models: Shapiro GS et al. 2002 | |
| 49 triMutability_Literature_Human <- matrix(c(0.24, 1.2, 0.96, 0.43, 2.14, 2, 1.11, 1.9, 0.85, 1.83, 2.36, 1.31, 0.82, 0.52, 0.89, 1.33, 1.4, 0.82, 1.83, 0.73, 1.83, 1.62, 1.53, 0.57, 0.92, 0.42, 0.42, 1.47, 3.44, 2.58, 1.18, 0.47, 0.39, 1.12, 1.8, 0.68, 0.47, 2.19, 2.35, 2.19, 1.05, 1.84, 1.26, 0.28, 0.98, 2.37, 0.66, 1.58, 0.67, 0.92, 1.76, 0.83, 0.97, 0.56, 0.75, 0.62, 2.26, 0.62, 0.74, 1.11, 1.16, 0.61, 0.88, 0.67, 0.37, 0.07, 1.08, 0.46, 0.31, 0.94, 0.62, 0.57, 0.29, NA, 1.44, 0.46, 0.69, 0.57, 0.24, 0.37, 1.1, 0.99, 1.39, 0.6, 2.26, 1.24, 1.36, 0.52, 0.33, 0.26, 1.25, 0.37, 0.58, 1.03, 1.2, 0.34, 0.49, 0.33, 2.62, 0.16, 0.4, 0.16, 0.35, 0.75, 1.85, 0.94, 1.61, 0.85, 2.09, 1.39, 0.3, 0.52, 1.33, 0.29, 0.51, 0.26, 0.51, 3.83, 2.01, 0.71, 0.58, 0.62, 1.07, 0.28, 1.2, 0.74, 0.25, 0.59, 1.09, 0.91, 1.36, 0.45, 2.89, 1.27, 3.7, 0.69, 0.28, 0.41, 1.17, 0.56, 0.93, 3.41, 1, 1, NA, 5.9, 0.74, 2.51, 2.24, 2.24, 1.95, 3.32, 2.34, 1.3, 2.3, 1, 0.66, 0.73, 0.93, 0.41, 0.65, 0.89, 0.65, 0.32, NA, 0.43, 0.85, 0.43, 0.31, 0.31, 0.23, 0.29, 0.57, 0.71, 0.48, 0.44, 0.76, 0.51, 1.7, 0.85, 0.74, 2.23, 2.08, 1.16, 0.51, 0.51, 1, 0.5, NA, NA, 0.71, 2.14), nrow=64,byrow=T) | |
| 50 triMutability_Literature_Mouse <- matrix(c(1.31, 1.35, 1.42, 1.18, 2.02, 2.02, 1.02, 1.61, 1.99, 1.42, 2.01, 1.03, 2.02, 0.97, 0.53, 0.71, 1.19, 0.83, 0.96, 0.96, 0, 1.7, 2.22, 0.59, 1.24, 1.07, 0.51, 1.68, 3.36, 3.36, 1.14, 0.29, 0.33, 0.9, 1.11, 0.63, 1.08, 2.07, 2.27, 1.74, 0.22, 1.19, 2.37, 1.15, 1.15, 1.56, 0.81, 0.34, 0.87, 0.79, 2.13, 0.49, 0.85, 0.97, 0.36, 0.82, 0.66, 0.63, 1.15, 0.94, 0.85, 0.25, 0.93, 1.19, 0.4, 0.2, 0.44, 0.44, 0.88, 1.06, 0.77, 0.39, 0, 0, 0, 0, 0, 0, 0.43, 0.43, 0.86, 0.59, 0.59, 0, 1.18, 0.86, 2.9, 1.66, 0.4, 0.2, 1.54, 0.43, 0.69, 1.71, 0.68, 0.55, 0.91, 0.7, 1.71, 0.09, 0.27, 0.63, 0.2, 0.45, 1.01, 1.63, 0.96, 1.48, 2.18, 1.2, 1.31, 0.66, 2.13, 0.49, 0, 0, 0, 2.97, 2.8, 0.79, 0.4, 0.5, 0.4, 0.11, 1.68, 0.42, 0.13, 0.44, 0.93, 0.71, 1.11, 1.19, 2.71, 1.08, 3.43, 0.4, 0.67, 0.47, 1.02, 0.14, 1.56, 1.98, 0.53, 0.33, 0.63, 2.06, 1.77, 1.46, 3.74, 2.93, 2.1, 2.18, 0.78, 0.73, 2.93, 0.63, 0.57, 0.17, 0.85, 0.52, 0.31, 0.31, 0, 0, 0.51, 0.29, 0.83, 0.54, 0.28, 0.47, 0.9, 0.99, 1.24, 2.47, 0.73, 0.23, 1.13, 0.24, 2.12, 0.24, 0.33, 0.83, 1.41, 0.62, 0.28, 0.35, 0.77, 0.17, 0.72, 0.58, 0.45, 0.41), nrow=64,byrow=T) | |
| 51 triMutability_Names <- c("AAA", "AAC", "AAG", "AAT", "ACA", "ACC", "ACG", "ACT", "AGA", "AGC", "AGG", "AGT", "ATA", "ATC", "ATG", "ATT", "CAA", "CAC", "CAG", "CAT", "CCA", "CCC", "CCG", "CCT", "CGA", "CGC", "CGG", "CGT", "CTA", "CTC", "CTG", "CTT", "GAA", "GAC", "GAG", "GAT", "GCA", "GCC", "GCG", "GCT", "GGA", "GGC", "GGG", "GGT", "GTA", "GTC", "GTG", "GTT", "TAA", "TAC", "TAG", "TAT", "TCA", "TCC", "TCG", "TCT", "TGA", "TGC", "TGG", "TGT", "TTA", "TTC", "TTG", "TTT") | |
| 52 load("FiveS_Mutability.RData") | |
| 53 | |
| 54 # Functions | |
| 55 | |
| 56 # Translate codon to amino acid | |
| 57 translateCodonToAminoAcid<-function(Codon){ | |
| 58 return(AMINO_ACIDS[Codon]) | |
| 59 } | |
| 60 | |
| 61 # Translate amino acid to trait change | |
| 62 translateAminoAcidToTraitChange<-function(AminoAcid){ | |
| 63 return(TRAITS_AMINO_ACIDS[AminoAcid]) | |
| 64 } | |
| 65 | |
| 66 # Initialize Amino Acid Trait Changes | |
| 67 initializeTraitChange <- function(traitChangeModel=1,species=1,traitChangeFileName=NULL){ | |
| 68 if(!is.null(traitChangeFileName)){ | |
| 69 tryCatch( | |
| 70 traitChange <- read.delim(traitChangeFileName,sep="\t",header=T) | |
| 71 , error = function(ex){ | |
| 72 cat("Error|Error reading trait changes. Please check file name/path and format.\n") | |
| 73 q() | |
| 74 } | |
| 75 ) | |
| 76 }else{ | |
| 77 traitChange <- TRAITS_AMINO_ACIDS_CHOTHIA98 | |
| 78 } | |
| 79 TRAITS_AMINO_ACIDS <<- traitChange | |
| 80 } | |
| 81 | |
| 82 # Read in formatted nucleotide substitution matrix | |
| 83 initializeSubstitutionMatrix <- function(substitutionModel,species,subsMatFileName=NULL){ | |
| 84 if(!is.null(subsMatFileName)){ | |
| 85 tryCatch( | |
| 86 subsMat <- read.delim(subsMatFileName,sep="\t",header=T) | |
| 87 , error = function(ex){ | |
| 88 cat("Error|Error reading substitution matrix. Please check file name/path and format.\n") | |
| 89 q() | |
| 90 } | |
| 91 ) | |
| 92 if(sum(apply(subsMat,1,sum)==1)!=4) subsMat = t(apply(subsMat,1,function(x)x/sum(x))) | |
| 93 }else{ | |
| 94 if(substitutionModel==1)subsMat <- substitution_Literature_Mouse | |
| 95 if(substitutionModel==2)subsMat <- substitution_Flu_Human | |
| 96 if(substitutionModel==3)subsMat <- substitution_Flu25_Human | |
| 97 | |
| 98 } | |
| 99 | |
| 100 if(substitutionModel==0){ | |
| 101 subsMat <- matrix(1,4,4) | |
| 102 subsMat[,] = 1/3 | |
| 103 subsMat[1,1] = 0 | |
| 104 subsMat[2,2] = 0 | |
| 105 subsMat[3,3] = 0 | |
| 106 subsMat[4,4] = 0 | |
| 107 } | |
| 108 | |
| 109 | |
| 110 NUCLEOTIDESN = c(NUCLEOTIDES,"N", "-") | |
| 111 if(substitutionModel==5){ | |
| 112 subsMat <- FiveS_Substitution | |
| 113 return(subsMat) | |
| 114 }else{ | |
| 115 subsMat <- rbind(subsMat,rep(NA,4),rep(NA,4)) | |
| 116 return( matrix(data.matrix(subsMat),6,4,dimnames=list(NUCLEOTIDESN,NUCLEOTIDES) ) ) | |
| 117 } | |
| 118 } | |
| 119 | |
| 120 | |
| 121 # Read in formatted Mutability file | |
| 122 initializeMutabilityMatrix <- function(mutabilityModel=1, species=1,mutabilityMatFileName=NULL){ | |
| 123 if(!is.null(mutabilityMatFileName)){ | |
| 124 tryCatch( | |
| 125 mutabilityMat <- read.delim(mutabilityMatFileName,sep="\t",header=T) | |
| 126 , error = function(ex){ | |
| 127 cat("Error|Error reading mutability matrix. Please check file name/path and format.\n") | |
| 128 q() | |
| 129 } | |
| 130 ) | |
| 131 }else{ | |
| 132 mutabilityMat <- triMutability_Literature_Human | |
| 133 if(species==2) mutabilityMat <- triMutability_Literature_Mouse | |
| 134 } | |
| 135 | |
| 136 if(mutabilityModel==0){ mutabilityMat <- matrix(1,64,3)} | |
| 137 | |
| 138 if(mutabilityModel==5){ | |
| 139 mutabilityMat <- FiveS_Mutability | |
| 140 return(mutabilityMat) | |
| 141 }else{ | |
| 142 return( matrix( data.matrix(mutabilityMat), 64, 3, dimnames=list(triMutability_Names,1:3)) ) | |
| 143 } | |
| 144 } | |
| 145 | |
| 146 # Read FASTA file formats | |
| 147 # Modified from read.fasta from the seqinR package | |
| 148 baseline.read.fasta <- | |
| 149 function (file = system.file("sequences/sample.fasta", package = "seqinr"), | |
| 150 seqtype = c("DNA", "AA"), as.string = FALSE, forceDNAtolower = TRUE, | |
| 151 set.attributes = TRUE, legacy.mode = TRUE, seqonly = FALSE, | |
| 152 strip.desc = FALSE, sizeof.longlong = .Machine$sizeof.longlong, | |
| 153 endian = .Platform$endian, apply.mask = TRUE) | |
| 154 { | |
| 155 seqtype <- match.arg(seqtype) | |
| 156 | |
| 157 lines <- readLines(file) | |
| 158 | |
| 159 if (legacy.mode) { | |
| 160 comments <- grep("^;", lines) | |
| 161 if (length(comments) > 0) | |
| 162 lines <- lines[-comments] | |
| 163 } | |
| 164 | |
| 165 | |
| 166 ind_groups<-which(substr(lines, 1L, 3L) == ">>>") | |
| 167 lines_mod<-lines | |
| 168 | |
| 169 if(!length(ind_groups)){ | |
| 170 lines_mod<-c(">>>All sequences combined",lines) | |
| 171 } | |
| 172 | |
| 173 ind_groups<-which(substr(lines_mod, 1L, 3L) == ">>>") | |
| 174 | |
| 175 lines <- array("BLA",dim=(length(ind_groups)+length(lines_mod))) | |
| 176 id<-sapply(1:length(ind_groups),function(i)ind_groups[i]+i-1)+1 | |
| 177 lines[id] <- "THIS IS A FAKE SEQUENCE" | |
| 178 lines[-id] <- lines_mod | |
| 179 rm(lines_mod) | |
| 180 | |
| 181 ind <- which(substr(lines, 1L, 1L) == ">") | |
| 182 nseq <- length(ind) | |
| 183 if (nseq == 0) { | |
| 184 stop("no line starting with a > character found") | |
| 185 } | |
| 186 start <- ind + 1 | |
| 187 end <- ind - 1 | |
| 188 | |
| 189 while( any(which(ind%in%end)) ){ | |
| 190 ind=ind[-which(ind%in%end)] | |
| 191 nseq <- length(ind) | |
| 192 if (nseq == 0) { | |
| 193 stop("no line starting with a > character found") | |
| 194 } | |
| 195 start <- ind + 1 | |
| 196 end <- ind - 1 | |
| 197 } | |
| 198 | |
| 199 end <- c(end[-1], length(lines)) | |
| 200 sequences <- lapply(seq_len(nseq), function(i) paste(lines[start[i]:end[i]], collapse = "")) | |
| 201 if (seqonly) | |
| 202 return(sequences) | |
| 203 nomseq <- lapply(seq_len(nseq), function(i) { | |
| 204 | |
| 205 #firstword <- strsplit(lines[ind[i]], " ")[[1]][1] | |
| 206 substr(lines[ind[i]], 2, nchar(lines[ind[i]])) | |
| 207 | |
| 208 }) | |
| 209 if (seqtype == "DNA") { | |
| 210 if (forceDNAtolower) { | |
| 211 sequences <- as.list(tolower(chartr(".","-",sequences))) | |
| 212 }else{ | |
| 213 sequences <- as.list(toupper(chartr(".","-",sequences))) | |
| 214 } | |
| 215 } | |
| 216 if (as.string == FALSE) | |
| 217 sequences <- lapply(sequences, s2c) | |
| 218 if (set.attributes) { | |
| 219 for (i in seq_len(nseq)) { | |
| 220 Annot <- lines[ind[i]] | |
| 221 if (strip.desc) | |
| 222 Annot <- substr(Annot, 2L, nchar(Annot)) | |
| 223 attributes(sequences[[i]]) <- list(name = nomseq[[i]], | |
| 224 Annot = Annot, class = switch(seqtype, AA = "SeqFastaAA", | |
| 225 DNA = "SeqFastadna")) | |
| 226 } | |
| 227 } | |
| 228 names(sequences) <- nomseq | |
| 229 return(sequences) | |
| 230 } | |
| 231 | |
| 232 | |
| 233 # Replaces non FASTA characters in input files with N | |
| 234 replaceNonFASTAChars <-function(inSeq="ACGTN-AApA"){ | |
| 235 gsub('[^ACGTNacgt[:punct:]-[:punct:].]','N',inSeq,perl=TRUE) | |
| 236 } | |
| 237 | |
| 238 # Find the germlines in the FASTA list | |
| 239 germlinesInFile <- function(seqIDs){ | |
| 240 firstChar = sapply(seqIDs,function(x){substr(x,1,1)}) | |
| 241 secondChar = sapply(seqIDs,function(x){substr(x,2,2)}) | |
| 242 return(firstChar==">" & secondChar!=">") | |
| 243 } | |
| 244 | |
| 245 # Find the groups in the FASTA list | |
| 246 groupsInFile <- function(seqIDs){ | |
| 247 sapply(seqIDs,function(x){substr(x,1,2)})==">>" | |
| 248 } | |
| 249 | |
| 250 # In the process of finding germlines/groups, expand from the start to end of the group | |
| 251 expandTillNext <- function(vecPosToID){ | |
| 252 IDs = names(vecPosToID) | |
| 253 posOfInterests = which(vecPosToID) | |
| 254 | |
| 255 expandedID = rep(NA,length(IDs)) | |
| 256 expandedIDNames = gsub(">","",IDs[posOfInterests]) | |
| 257 startIndexes = c(1,posOfInterests[-1]) | |
| 258 stopIndexes = c(posOfInterests[-1]-1,length(IDs)) | |
| 259 expandedID = unlist(sapply(1:length(startIndexes),function(i){ | |
| 260 rep(i,stopIndexes[i]-startIndexes[i]+1) | |
| 261 })) | |
| 262 names(expandedID) = unlist(sapply(1:length(startIndexes),function(i){ | |
| 263 rep(expandedIDNames[i],stopIndexes[i]-startIndexes[i]+1) | |
| 264 })) | |
| 265 return(expandedID) | |
| 266 } | |
| 267 | |
| 268 # Process FASTA (list) to return a matrix[input, germline) | |
| 269 processInputAdvanced <- function(inputFASTA){ | |
| 270 | |
| 271 seqIDs = names(inputFASTA) | |
| 272 numbSeqs = length(seqIDs) | |
| 273 posGermlines1 = germlinesInFile(seqIDs) | |
| 274 numbGermlines = sum(posGermlines1) | |
| 275 posGroups1 = groupsInFile(seqIDs) | |
| 276 numbGroups = sum(posGroups1) | |
| 277 consDef = NA | |
| 278 | |
| 279 if(numbGermlines==0){ | |
| 280 posGermlines = 2 | |
| 281 numbGermlines = 1 | |
| 282 } | |
| 283 | |
| 284 glPositionsSum = cumsum(posGermlines1) | |
| 285 glPositions = table(glPositionsSum) | |
| 286 #Find the position of the conservation row | |
| 287 consDefPos = as.numeric(names(glPositions[names(glPositions)!=0 & glPositions==1]))+1 | |
| 288 if( length(consDefPos)> 0 ){ | |
| 289 consDefID = match(consDefPos, glPositionsSum) | |
| 290 #The coservation rows need to be pulled out and stores seperately | |
| 291 consDef = inputFASTA[consDefID] | |
| 292 inputFASTA = inputFASTA[-consDefID] | |
| 293 | |
| 294 seqIDs = names(inputFASTA) | |
| 295 numbSeqs = length(seqIDs) | |
| 296 posGermlines1 = germlinesInFile(seqIDs) | |
| 297 numbGermlines = sum(posGermlines1) | |
| 298 posGroups1 = groupsInFile(seqIDs) | |
| 299 numbGroups = sum(posGroups1) | |
| 300 if(numbGermlines==0){ | |
| 301 posGermlines = 2 | |
| 302 numbGermlines = 1 | |
| 303 } | |
| 304 } | |
| 305 | |
| 306 posGroups <- expandTillNext(posGroups1) | |
| 307 posGermlines <- expandTillNext(posGermlines1) | |
| 308 posGermlines[posGroups1] = 0 | |
| 309 names(posGermlines)[posGroups1] = names(posGroups)[posGroups1] | |
| 310 posInput = rep(TRUE,numbSeqs) | |
| 311 posInput[posGroups1 | posGermlines1] = FALSE | |
| 312 | |
| 313 matInput = matrix(NA, nrow=sum(posInput), ncol=2) | |
| 314 rownames(matInput) = seqIDs[posInput] | |
| 315 colnames(matInput) = c("Input","Germline") | |
| 316 | |
| 317 vecInputFASTA = unlist(inputFASTA) | |
| 318 matInput[,1] = vecInputFASTA[posInput] | |
| 319 matInput[,2] = vecInputFASTA[ which( names(inputFASTA)%in%paste(">",names(posGermlines)[posInput],sep="") )[ posGermlines[posInput]] ] | |
| 320 | |
| 321 germlines = posGermlines[posInput] | |
| 322 groups = posGroups[posInput] | |
| 323 | |
| 324 return( list("matInput"=matInput, "germlines"=germlines, "groups"=groups, "conservationDefinition"=consDef )) | |
| 325 } | |
| 326 | |
| 327 | |
| 328 # Replace leading and trailing dashes in the sequence | |
| 329 replaceLeadingTrailingDashes <- function(x,readEnd){ | |
| 330 iiGap = unlist(gregexpr("-",x[1])) | |
| 331 ggGap = unlist(gregexpr("-",x[2])) | |
| 332 #posToChange = intersect(iiGap,ggGap) | |
| 333 | |
| 334 | |
| 335 seqIn = replaceLeadingTrailingDashesHelper(x[1]) | |
| 336 seqGL = replaceLeadingTrailingDashesHelper(x[2]) | |
| 337 seqTemplate = rep('N',readEnd) | |
| 338 seqIn <- c(seqIn,seqTemplate[(length(seqIn)+1):readEnd]) | |
| 339 seqGL <- c(seqGL,seqTemplate[(length(seqGL)+1):readEnd]) | |
| 340 # if(posToChange!=-1){ | |
| 341 # seqIn[posToChange] = "-" | |
| 342 # seqGL[posToChange] = "-" | |
| 343 # } | |
| 344 | |
| 345 seqIn = c2s(seqIn[1:readEnd]) | |
| 346 seqGL = c2s(seqGL[1:readEnd]) | |
| 347 | |
| 348 lenGL = nchar(seqGL) | |
| 349 if(lenGL<readEnd){ | |
| 350 seqGL = paste(seqGL,c2s(rep("N",readEnd-lenGL)),sep="") | |
| 351 } | |
| 352 | |
| 353 lenInput = nchar(seqIn) | |
| 354 if(lenInput<readEnd){ | |
| 355 seqIn = paste(seqIn,c2s(rep("N",readEnd-lenInput)),sep="") | |
| 356 } | |
| 357 return( c(seqIn,seqGL) ) | |
| 358 } | |
| 359 | |
| 360 replaceLeadingTrailingDashesHelper <- function(x){ | |
| 361 grepResults = gregexpr("-*",x) | |
| 362 grepResultsPos = unlist(grepResults) | |
| 363 grepResultsLen = attr(grepResults[[1]],"match.length") | |
| 364 #print(paste("x = '", x, "'", sep="")) | |
| 365 x = s2c(x) | |
| 366 if(x[1]=="-"){ | |
| 367 x[1:grepResultsLen[1]] = "N" | |
| 368 } | |
| 369 if(x[length(x)]=="-"){ | |
| 370 x[(length(x)-grepResultsLen[length(grepResultsLen)]+1):length(x)] = "N" | |
| 371 } | |
| 372 return(x) | |
| 373 } | |
| 374 | |
| 375 | |
| 376 | |
| 377 | |
| 378 # Check sequences for indels | |
| 379 checkForInDels <- function(matInputP){ | |
| 380 insPos <- checkInsertion(matInputP) | |
| 381 delPos <- checkDeletions(matInputP) | |
| 382 return(list("Insertions"=insPos, "Deletions"=delPos)) | |
| 383 } | |
| 384 | |
| 385 # Check sequences for insertions | |
| 386 checkInsertion <- function(matInputP){ | |
| 387 insertionCheck = apply( matInputP,1, function(x){ | |
| 388 inputGaps <- as.vector( gregexpr("-",x[1])[[1]] ) | |
| 389 glGaps <- as.vector( gregexpr("-",x[2])[[1]] ) | |
| 390 return( is.finite( match(FALSE, glGaps%in%inputGaps ) ) ) | |
| 391 }) | |
| 392 return(as.vector(insertionCheck)) | |
| 393 } | |
| 394 # Fix inserstions | |
| 395 fixInsertions <- function(matInputP){ | |
| 396 insPos <- checkInsertion(matInputP) | |
| 397 sapply((1:nrow(matInputP))[insPos],function(rowIndex){ | |
| 398 x <- matInputP[rowIndex,] | |
| 399 inputGaps <- gregexpr("-",x[1])[[1]] | |
| 400 glGaps <- gregexpr("-",x[2])[[1]] | |
| 401 posInsertions <- glGaps[!(glGaps%in%inputGaps)] | |
| 402 inputInsertionToN <- s2c(x[2]) | |
| 403 inputInsertionToN[posInsertions]!="-" | |
| 404 inputInsertionToN[posInsertions] <- "N" | |
| 405 inputInsertionToN <- c2s(inputInsertionToN) | |
| 406 matInput[rowIndex,2] <<- inputInsertionToN | |
| 407 }) | |
| 408 return(insPos) | |
| 409 } | |
| 410 | |
| 411 # Check sequences for deletions | |
| 412 checkDeletions <-function(matInputP){ | |
| 413 deletionCheck = apply( matInputP,1, function(x){ | |
| 414 inputGaps <- as.vector( gregexpr("-",x[1])[[1]] ) | |
| 415 glGaps <- as.vector( gregexpr("-",x[2])[[1]] ) | |
| 416 return( is.finite( match(FALSE, inputGaps%in%glGaps ) ) ) | |
| 417 }) | |
| 418 return(as.vector(deletionCheck)) | |
| 419 } | |
| 420 # Fix sequences with deletions | |
| 421 fixDeletions <- function(matInputP){ | |
| 422 delPos <- checkDeletions(matInputP) | |
| 423 sapply((1:nrow(matInputP))[delPos],function(rowIndex){ | |
| 424 x <- matInputP[rowIndex,] | |
| 425 inputGaps <- gregexpr("-",x[1])[[1]] | |
| 426 glGaps <- gregexpr("-",x[2])[[1]] | |
| 427 posDeletions <- inputGaps[!(inputGaps%in%glGaps)] | |
| 428 inputDeletionToN <- s2c(x[1]) | |
| 429 inputDeletionToN[posDeletions] <- "N" | |
| 430 inputDeletionToN <- c2s(inputDeletionToN) | |
| 431 matInput[rowIndex,1] <<- inputDeletionToN | |
| 432 }) | |
| 433 return(delPos) | |
| 434 } | |
| 435 | |
| 436 | |
| 437 # Trim DNA sequence to the last codon | |
| 438 trimToLastCodon <- function(seqToTrim){ | |
| 439 seqLen = nchar(seqToTrim) | |
| 440 trimmedSeq = s2c(seqToTrim) | |
| 441 poi = seqLen | |
| 442 tailLen = 0 | |
| 443 | |
| 444 while(trimmedSeq[poi]=="-" || trimmedSeq[poi]=="."){ | |
| 445 tailLen = tailLen + 1 | |
| 446 poi = poi - 1 | |
| 447 } | |
| 448 | |
| 449 trimmedSeq = c2s(trimmedSeq[1:(seqLen-tailLen)]) | |
| 450 seqLen = nchar(trimmedSeq) | |
| 451 # Trim sequence to last codon | |
| 452 if( getCodonPos(seqLen)[3] > seqLen ) | |
| 453 trimmedSeq = substr(seqToTrim,1, ( (getCodonPos(seqLen)[1])-1 ) ) | |
| 454 | |
| 455 return(trimmedSeq) | |
| 456 } | |
| 457 | |
| 458 # Given a nuclotide position, returns the pos of the 3 nucs that made the codon | |
| 459 # e.g. nuc 86 is part of nucs 85,86,87 | |
| 460 getCodonPos <- function(nucPos){ | |
| 461 codonNum = (ceiling(nucPos/3))*3 | |
| 462 return( (codonNum-2):codonNum) | |
| 463 } | |
| 464 | |
| 465 # Given a nuclotide position, returns the codon number | |
| 466 # e.g. nuc 86 = codon 29 | |
| 467 getCodonNumb <- function(nucPos){ | |
| 468 return( ceiling(nucPos/3) ) | |
| 469 } | |
| 470 | |
| 471 # Given a codon, returns all the nuc positions that make the codon | |
| 472 getCodonNucs <- function(codonNumb){ | |
| 473 getCodonPos(codonNumb*3) | |
| 474 } | |
| 475 | |
| 476 computeCodonTable <- function(testID=1){ | |
| 477 | |
| 478 if(testID<=4){ | |
| 479 # Pre-compute every codons | |
| 480 intCounter = 1 | |
| 481 for(pOne in NUCLEOTIDES){ | |
| 482 for(pTwo in NUCLEOTIDES){ | |
| 483 for(pThree in NUCLEOTIDES){ | |
| 484 codon = paste(pOne,pTwo,pThree,sep="") | |
| 485 colnames(CODON_TABLE)[intCounter] = codon | |
| 486 intCounter = intCounter + 1 | |
| 487 CODON_TABLE[,codon] = mutationTypeOptimized(cbind(permutateAllCodon(codon),rep(codon,12))) | |
| 488 } | |
| 489 } | |
| 490 } | |
| 491 chars = c("N","A","C","G","T", "-") | |
| 492 for(a in chars){ | |
| 493 for(b in chars){ | |
| 494 for(c in chars){ | |
| 495 if(a=="N" | b=="N" | c=="N"){ | |
| 496 #cat(paste(a,b,c),sep="","\n") | |
| 497 CODON_TABLE[,paste(a,b,c,sep="")] = rep(NA,12) | |
| 498 } | |
| 499 } | |
| 500 } | |
| 501 } | |
| 502 | |
| 503 chars = c("-","A","C","G","T") | |
| 504 for(a in chars){ | |
| 505 for(b in chars){ | |
| 506 for(c in chars){ | |
| 507 if(a=="-" | b=="-" | c=="-"){ | |
| 508 #cat(paste(a,b,c),sep="","\n") | |
| 509 CODON_TABLE[,paste(a,b,c,sep="")] = rep(NA,12) | |
| 510 } | |
| 511 } | |
| 512 } | |
| 513 } | |
| 514 CODON_TABLE <<- as.matrix(CODON_TABLE) | |
| 515 } | |
| 516 } | |
| 517 | |
| 518 collapseClone <- function(vecInputSeqs,glSeq,readEnd,nonTerminalOnly=0){ | |
| 519 #print(length(vecInputSeqs)) | |
| 520 vecInputSeqs = unique(vecInputSeqs) | |
| 521 if(length(vecInputSeqs)==1){ | |
| 522 return( list( c(vecInputSeqs,glSeq), F) ) | |
| 523 }else{ | |
| 524 charInputSeqs <- sapply(vecInputSeqs, function(x){ | |
| 525 s2c(x)[1:readEnd] | |
| 526 }) | |
| 527 charGLSeq <- s2c(glSeq) | |
| 528 matClone <- sapply(1:readEnd, function(i){ | |
| 529 posNucs = unique(charInputSeqs[i,]) | |
| 530 posGL = charGLSeq[i] | |
| 531 error = FALSE | |
| 532 if(posGL=="-" & sum(!(posNucs%in%c("-","N")))==0 ){ | |
| 533 return(c("-",error)) | |
| 534 } | |
| 535 if(length(posNucs)==1) | |
| 536 return(c(posNucs[1],error)) | |
| 537 else{ | |
| 538 if("N"%in%posNucs){ | |
| 539 error=TRUE | |
| 540 } | |
| 541 if(sum(!posNucs[posNucs!="N"]%in%posGL)==0){ | |
| 542 return( c(posGL,error) ) | |
| 543 }else{ | |
| 544 #return( c(sample(posNucs[posNucs!="N"],1),error) ) | |
| 545 if(nonTerminalOnly==0){ | |
| 546 return( c(sample(charInputSeqs[i,charInputSeqs[i,]!="N" & charInputSeqs[i,]!=posGL],1),error) ) | |
| 547 }else{ | |
| 548 posNucs = charInputSeqs[i,charInputSeqs[i,]!="N" & charInputSeqs[i,]!=posGL] | |
| 549 posNucsTable = table(posNucs) | |
| 550 if(sum(posNucsTable>1)==0){ | |
| 551 return( c(posGL,error) ) | |
| 552 }else{ | |
| 553 return( c(sample( posNucs[posNucs%in%names(posNucsTable)[posNucsTable>1]],1),error) ) | |
| 554 } | |
| 555 } | |
| 556 | |
| 557 } | |
| 558 } | |
| 559 }) | |
| 560 | |
| 561 | |
| 562 #print(length(vecInputSeqs)) | |
| 563 return(list(c(c2s(matClone[1,]),glSeq),"TRUE"%in%matClone[2,])) | |
| 564 } | |
| 565 } | |
| 566 | |
| 567 # Compute the expected for each sequence-germline pair | |
| 568 getExpectedIndividual <- function(matInput){ | |
| 569 if( any(grep("multicore",search())) ){ | |
| 570 facGL <- factor(matInput[,2]) | |
| 571 facLevels = levels(facGL) | |
| 572 LisGLs_MutabilityU = mclapply(1:length(facLevels), function(x){ | |
| 573 computeMutabilities(facLevels[x]) | |
| 574 }) | |
| 575 facIndex = match(facGL,facLevels) | |
| 576 | |
| 577 LisGLs_Mutability = mclapply(1:nrow(matInput), function(x){ | |
| 578 cInput = rep(NA,nchar(matInput[x,1])) | |
| 579 cInput[s2c(matInput[x,1])!="N"] = 1 | |
| 580 LisGLs_MutabilityU[[facIndex[x]]] * cInput | |
| 581 }) | |
| 582 | |
| 583 LisGLs_Targeting = mclapply(1:dim(matInput)[1], function(x){ | |
| 584 computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) | |
| 585 }) | |
| 586 | |
| 587 LisGLs_MutationTypes = mclapply(1:length(matInput[,2]),function(x){ | |
| 588 #print(x) | |
| 589 computeMutationTypes(matInput[x,2]) | |
| 590 }) | |
| 591 | |
| 592 LisGLs_Exp = mclapply(1:dim(matInput)[1], function(x){ | |
| 593 computeExpected(LisGLs_Targeting[[x]],LisGLs_MutationTypes[[x]]) | |
| 594 }) | |
| 595 | |
| 596 ul_LisGLs_Exp = unlist(LisGLs_Exp) | |
| 597 return(matrix(ul_LisGLs_Exp,ncol=4,nrow=(length(ul_LisGLs_Exp)/4),byrow=T)) | |
| 598 }else{ | |
| 599 facGL <- factor(matInput[,2]) | |
| 600 facLevels = levels(facGL) | |
| 601 LisGLs_MutabilityU = lapply(1:length(facLevels), function(x){ | |
| 602 computeMutabilities(facLevels[x]) | |
| 603 }) | |
| 604 facIndex = match(facGL,facLevels) | |
| 605 | |
| 606 LisGLs_Mutability = lapply(1:nrow(matInput), function(x){ | |
| 607 cInput = rep(NA,nchar(matInput[x,1])) | |
| 608 cInput[s2c(matInput[x,1])!="N"] = 1 | |
| 609 LisGLs_MutabilityU[[facIndex[x]]] * cInput | |
| 610 }) | |
| 611 | |
| 612 LisGLs_Targeting = lapply(1:dim(matInput)[1], function(x){ | |
| 613 computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) | |
| 614 }) | |
| 615 | |
| 616 LisGLs_MutationTypes = lapply(1:length(matInput[,2]),function(x){ | |
| 617 #print(x) | |
| 618 computeMutationTypes(matInput[x,2]) | |
| 619 }) | |
| 620 | |
| 621 LisGLs_Exp = lapply(1:dim(matInput)[1], function(x){ | |
| 622 computeExpected(LisGLs_Targeting[[x]],LisGLs_MutationTypes[[x]]) | |
| 623 }) | |
| 624 | |
| 625 ul_LisGLs_Exp = unlist(LisGLs_Exp) | |
| 626 return(matrix(ul_LisGLs_Exp,ncol=4,nrow=(length(ul_LisGLs_Exp)/4),byrow=T)) | |
| 627 | |
| 628 } | |
| 629 } | |
| 630 | |
| 631 # Compute mutabilities of sequence based on the tri-nucleotide model | |
| 632 computeMutabilities <- function(paramSeq){ | |
| 633 seqLen = nchar(paramSeq) | |
| 634 seqMutabilites = rep(NA,seqLen) | |
| 635 | |
| 636 gaplessSeq = gsub("-", "", paramSeq) | |
| 637 gaplessSeqLen = nchar(gaplessSeq) | |
| 638 gaplessSeqMutabilites = rep(NA,gaplessSeqLen) | |
| 639 | |
| 640 if(mutabilityModel!=5){ | |
| 641 pos<- 3:(gaplessSeqLen) | |
| 642 subSeq = substr(rep(gaplessSeq,gaplessSeqLen-2),(pos-2),(pos+2)) | |
| 643 gaplessSeqMutabilites[pos] = | |
| 644 tapply( c( | |
| 645 getMutability( substr(subSeq,1,3), 3) , | |
| 646 getMutability( substr(subSeq,2,4), 2), | |
| 647 getMutability( substr(subSeq,3,5), 1) | |
| 648 ),rep(1:(gaplessSeqLen-2),3),mean,na.rm=TRUE | |
| 649 ) | |
| 650 #Pos 1 | |
| 651 subSeq = substr(gaplessSeq,1,3) | |
| 652 gaplessSeqMutabilites[1] = getMutability(subSeq , 1) | |
| 653 #Pos 2 | |
| 654 subSeq = substr(gaplessSeq,1,4) | |
| 655 gaplessSeqMutabilites[2] = mean( c( | |
| 656 getMutability( substr(subSeq,1,3), 2) , | |
| 657 getMutability( substr(subSeq,2,4), 1) | |
| 658 ),na.rm=T | |
| 659 ) | |
| 660 seqMutabilites[which(s2c(paramSeq)!="-")]<- gaplessSeqMutabilites | |
| 661 return(seqMutabilites) | |
| 662 }else{ | |
| 663 | |
| 664 pos<- 3:(gaplessSeqLen) | |
| 665 subSeq = substr(rep(gaplessSeq,gaplessSeqLen-2),(pos-2),(pos+2)) | |
| 666 gaplessSeqMutabilites[pos] = sapply(subSeq,function(x){ getMutability5(x) }, simplify=T) | |
| 667 seqMutabilites[which(s2c(paramSeq)!="-")]<- gaplessSeqMutabilites | |
| 668 return(seqMutabilites) | |
| 669 } | |
| 670 | |
| 671 } | |
| 672 | |
| 673 # Returns the mutability of a triplet at a given position | |
| 674 getMutability <- function(codon, pos=1:3){ | |
| 675 triplets <- rownames(mutability) | |
| 676 mutability[ match(codon,triplets) ,pos] | |
| 677 } | |
| 678 | |
| 679 getMutability5 <- function(fivemer){ | |
| 680 return(mutability[fivemer]) | |
| 681 } | |
| 682 | |
| 683 # Returns the substitution probabilty | |
| 684 getTransistionProb <- function(nuc){ | |
| 685 substitution[nuc,] | |
| 686 } | |
| 687 | |
| 688 getTransistionProb5 <- function(fivemer){ | |
| 689 if(any(which(fivemer==colnames(substitution)))){ | |
| 690 return(substitution[,fivemer]) | |
| 691 }else{ | |
| 692 return(array(NA,4)) | |
| 693 } | |
| 694 } | |
| 695 | |
| 696 # Given a nuc, returns the other 3 nucs it can mutate to | |
| 697 canMutateTo <- function(nuc){ | |
| 698 NUCLEOTIDES[- which(NUCLEOTIDES==nuc)] | |
| 699 } | |
| 700 | |
| 701 # Given a nucleotide, returns the probabilty of other nucleotide it can mutate to | |
| 702 canMutateToProb <- function(nuc){ | |
| 703 substitution[nuc,canMutateTo(nuc)] | |
| 704 } | |
| 705 | |
| 706 # Compute targeting, based on precomputed mutatbility & substitution | |
| 707 computeTargeting <- function(param_strSeq,param_vecMutabilities){ | |
| 708 | |
| 709 if(substitutionModel!=5){ | |
| 710 vecSeq = s2c(param_strSeq) | |
| 711 matTargeting = sapply( 1:length(vecSeq), function(x) { param_vecMutabilities[x] * getTransistionProb(vecSeq[x]) } ) | |
| 712 #matTargeting = apply( rbind(vecSeq,param_vecMutabilities),2, function(x) { as.vector(as.numeric(x[2]) * getTransistionProb(x[1])) } ) | |
| 713 dimnames( matTargeting ) = list(NUCLEOTIDES,1:(length(vecSeq))) | |
| 714 return (matTargeting) | |
| 715 }else{ | |
| 716 | |
| 717 seqLen = nchar(param_strSeq) | |
| 718 seqsubstitution = matrix(NA,ncol=seqLen,nrow=4) | |
| 719 paramSeq <- param_strSeq | |
| 720 gaplessSeq = gsub("-", "", paramSeq) | |
| 721 gaplessSeqLen = nchar(gaplessSeq) | |
| 722 gaplessSeqSubstitution = matrix(NA,ncol=gaplessSeqLen,nrow=4) | |
| 723 | |
| 724 pos<- 3:(gaplessSeqLen) | |
| 725 subSeq = substr(rep(gaplessSeq,gaplessSeqLen-2),(pos-2),(pos+2)) | |
| 726 gaplessSeqSubstitution[,pos] = sapply(subSeq,function(x){ getTransistionProb5(x) }, simplify=T) | |
| 727 seqsubstitution[,which(s2c(paramSeq)!="-")]<- gaplessSeqSubstitution | |
| 728 #matTargeting <- param_vecMutabilities %*% seqsubstitution | |
| 729 matTargeting <- sweep(seqsubstitution,2,param_vecMutabilities,`*`) | |
| 730 dimnames( matTargeting ) = list(NUCLEOTIDES,1:(seqLen)) | |
| 731 return (matTargeting) | |
| 732 } | |
| 733 } | |
| 734 | |
| 735 # Compute the mutations types | |
| 736 computeMutationTypes <- function(param_strSeq){ | |
| 737 #cat(param_strSeq,"\n") | |
| 738 #vecSeq = trimToLastCodon(param_strSeq) | |
| 739 lenSeq = nchar(param_strSeq) | |
| 740 vecCodons = sapply({1:(lenSeq/3)}*3-2,function(x){substr(param_strSeq,x,x+2)}) | |
| 741 matMutationTypes = matrix( unlist(CODON_TABLE[,vecCodons]) ,ncol=lenSeq,nrow=4, byrow=F) | |
| 742 dimnames( matMutationTypes ) = list(NUCLEOTIDES,1:(ncol(matMutationTypes))) | |
| 743 return(matMutationTypes) | |
| 744 } | |
| 745 computeMutationTypesFast <- function(param_strSeq){ | |
| 746 matMutationTypes = matrix( CODON_TABLE[,param_strSeq] ,ncol=3,nrow=4, byrow=F) | |
| 747 #dimnames( matMutationTypes ) = list(NUCLEOTIDES,1:(length(vecSeq))) | |
| 748 return(matMutationTypes) | |
| 749 } | |
| 750 mutationTypeOptimized <- function( matOfCodons ){ | |
| 751 apply( matOfCodons,1,function(x){ mutationType(x[2],x[1]) } ) | |
| 752 } | |
| 753 | |
| 754 # Returns a vector of codons 1 mutation away from the given codon | |
| 755 permutateAllCodon <- function(codon){ | |
| 756 cCodon = s2c(codon) | |
| 757 matCodons = t(array(cCodon,dim=c(3,12))) | |
| 758 matCodons[1:4,1] = NUCLEOTIDES | |
| 759 matCodons[5:8,2] = NUCLEOTIDES | |
| 760 matCodons[9:12,3] = NUCLEOTIDES | |
| 761 apply(matCodons,1,c2s) | |
| 762 } | |
| 763 | |
| 764 # Given two codons, tells you if the mutation is R or S (based on your definition) | |
| 765 mutationType <- function(codonFrom,codonTo){ | |
| 766 if(testID==4){ | |
| 767 if( is.na(codonFrom) | is.na(codonTo) | is.na(translateCodonToAminoAcid(codonFrom)) | is.na(translateCodonToAminoAcid(codonTo)) ){ | |
| 768 return(NA) | |
| 769 }else{ | |
| 770 mutationType = "S" | |
| 771 if( translateAminoAcidToTraitChange(translateCodonToAminoAcid(codonFrom)) != translateAminoAcidToTraitChange(translateCodonToAminoAcid(codonTo)) ){ | |
| 772 mutationType = "R" | |
| 773 } | |
| 774 if(translateCodonToAminoAcid(codonTo)=="*" | translateCodonToAminoAcid(codonFrom)=="*"){ | |
| 775 mutationType = "Stop" | |
| 776 } | |
| 777 return(mutationType) | |
| 778 } | |
| 779 }else if(testID==5){ | |
| 780 if( is.na(codonFrom) | is.na(codonTo) | is.na(translateCodonToAminoAcid(codonFrom)) | is.na(translateCodonToAminoAcid(codonTo)) ){ | |
| 781 return(NA) | |
| 782 }else{ | |
| 783 if(codonFrom==codonTo){ | |
| 784 mutationType = "S" | |
| 785 }else{ | |
| 786 codonFrom = s2c(codonFrom) | |
| 787 codonTo = s2c(codonTo) | |
| 788 mutationType = "Stop" | |
| 789 nucOfI = codonFrom[which(codonTo!=codonFrom)] | |
| 790 if(nucOfI=="C"){ | |
| 791 mutationType = "R" | |
| 792 }else if(nucOfI=="G"){ | |
| 793 mutationType = "S" | |
| 794 } | |
| 795 } | |
| 796 return(mutationType) | |
| 797 } | |
| 798 }else{ | |
| 799 if( is.na(codonFrom) | is.na(codonTo) | is.na(translateCodonToAminoAcid(codonFrom)) | is.na(translateCodonToAminoAcid(codonTo)) ){ | |
| 800 return(NA) | |
| 801 }else{ | |
| 802 mutationType = "S" | |
| 803 if( translateCodonToAminoAcid(codonFrom) != translateCodonToAminoAcid(codonTo) ){ | |
| 804 mutationType = "R" | |
| 805 } | |
| 806 if(translateCodonToAminoAcid(codonTo)=="*" | translateCodonToAminoAcid(codonFrom)=="*"){ | |
| 807 mutationType = "Stop" | |
| 808 } | |
| 809 return(mutationType) | |
| 810 } | |
| 811 } | |
| 812 } | |
| 813 | |
| 814 | |
| 815 #given a mat of targeting & it's corresponding mutationtypes returns | |
| 816 #a vector of Exp_RCDR,Exp_SCDR,Exp_RFWR,Exp_RFWR | |
| 817 computeExpected <- function(paramTargeting,paramMutationTypes){ | |
| 818 # Replacements | |
| 819 RPos = which(paramMutationTypes=="R") | |
| 820 #FWR | |
| 821 Exp_R_FWR = sum(paramTargeting[ RPos[which(FWR_Nuc_Mat[RPos]==T)] ],na.rm=T) | |
| 822 #CDR | |
| 823 Exp_R_CDR = sum(paramTargeting[ RPos[which(CDR_Nuc_Mat[RPos]==T)] ],na.rm=T) | |
| 824 # Silents | |
| 825 SPos = which(paramMutationTypes=="S") | |
| 826 #FWR | |
| 827 Exp_S_FWR = sum(paramTargeting[ SPos[which(FWR_Nuc_Mat[SPos]==T)] ],na.rm=T) | |
| 828 #CDR | |
| 829 Exp_S_CDR = sum(paramTargeting[ SPos[which(CDR_Nuc_Mat[SPos]==T)] ],na.rm=T) | |
| 830 | |
| 831 return(c(Exp_R_CDR,Exp_S_CDR,Exp_R_FWR,Exp_S_FWR)) | |
| 832 } | |
| 833 | |
| 834 # Count the mutations in a sequence | |
| 835 # each mutation is treated independently | |
| 836 analyzeMutations2NucUri_website <- function( rev_in_matrix ){ | |
| 837 paramGL = rev_in_matrix[2,] | |
| 838 paramSeq = rev_in_matrix[1,] | |
| 839 | |
| 840 #Fill seq with GL seq if gapped | |
| 841 #if( any(paramSeq=="-") ){ | |
| 842 # gapPos_Seq = which(paramSeq=="-") | |
| 843 # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "-"] | |
| 844 # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] | |
| 845 #} | |
| 846 | |
| 847 | |
| 848 #if( any(paramSeq=="N") ){ | |
| 849 # gapPos_Seq = which(paramSeq=="N") | |
| 850 # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "N"] | |
| 851 # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] | |
| 852 #} | |
| 853 | |
| 854 analyzeMutations2NucUri( matrix(c( paramGL, paramSeq ),2,length(paramGL),byrow=T) ) | |
| 855 | |
| 856 } | |
| 857 | |
| 858 #1 = GL | |
| 859 #2 = Seq | |
| 860 analyzeMutations2NucUri <- function( in_matrix=matrix(c(c("A","A","A","C","C","C"),c("A","G","G","C","C","A")),2,6,byrow=T) ){ | |
| 861 paramGL = in_matrix[2,] | |
| 862 paramSeq = in_matrix[1,] | |
| 863 paramSeqUri = paramGL | |
| 864 #mutations = apply(rbind(paramGL,paramSeq), 2, function(x){!x[1]==x[2]}) | |
| 865 mutations_val = paramGL != paramSeq | |
| 866 if(any(mutations_val)){ | |
| 867 mutationPos = {1:length(mutations_val)}[mutations_val] | |
| 868 mutationPos = mutationPos[sapply(mutationPos, function(x){!any(paramSeq[getCodonPos(x)]=="N")})] | |
| 869 length_mutations =length(mutationPos) | |
| 870 mutationInfo = rep(NA,length_mutations) | |
| 871 if(any(mutationPos)){ | |
| 872 | |
| 873 pos<- mutationPos | |
| 874 pos_array<-array(sapply(pos,getCodonPos)) | |
| 875 codonGL = paramGL[pos_array] | |
| 876 | |
| 877 codonSeq = sapply(pos,function(x){ | |
| 878 seqP = paramGL[getCodonPos(x)] | |
| 879 muCodonPos = {x-1}%%3+1 | |
| 880 seqP[muCodonPos] = paramSeq[x] | |
| 881 return(seqP) | |
| 882 }) | |
| 883 GLcodons = apply(matrix(codonGL,length_mutations,3,byrow=TRUE),1,c2s) | |
| 884 Seqcodons = apply(codonSeq,2,c2s) | |
| 885 mutationInfo = apply(rbind(GLcodons , Seqcodons),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
| 886 names(mutationInfo) = mutationPos | |
| 887 } | |
| 888 if(any(!is.na(mutationInfo))){ | |
| 889 return(mutationInfo[!is.na(mutationInfo)]) | |
| 890 }else{ | |
| 891 return(NA) | |
| 892 } | |
| 893 | |
| 894 | |
| 895 }else{ | |
| 896 return (NA) | |
| 897 } | |
| 898 } | |
| 899 | |
| 900 processNucMutations2 <- function(mu){ | |
| 901 if(!is.na(mu)){ | |
| 902 #R | |
| 903 if(any(mu=="R")){ | |
| 904 Rs = mu[mu=="R"] | |
| 905 nucNumbs = as.numeric(names(Rs)) | |
| 906 R_CDR = sum(as.integer(CDR_Nuc[nucNumbs]),na.rm=T) | |
| 907 R_FWR = sum(as.integer(FWR_Nuc[nucNumbs]),na.rm=T) | |
| 908 }else{ | |
| 909 R_CDR = 0 | |
| 910 R_FWR = 0 | |
| 911 } | |
| 912 | |
| 913 #S | |
| 914 if(any(mu=="S")){ | |
| 915 Ss = mu[mu=="S"] | |
| 916 nucNumbs = as.numeric(names(Ss)) | |
| 917 S_CDR = sum(as.integer(CDR_Nuc[nucNumbs]),na.rm=T) | |
| 918 S_FWR = sum(as.integer(FWR_Nuc[nucNumbs]),na.rm=T) | |
| 919 }else{ | |
| 920 S_CDR = 0 | |
| 921 S_FWR = 0 | |
| 922 } | |
| 923 | |
| 924 | |
| 925 retVec = c(R_CDR,S_CDR,R_FWR,S_FWR) | |
| 926 retVec[is.na(retVec)]=0 | |
| 927 return(retVec) | |
| 928 }else{ | |
| 929 return(rep(0,4)) | |
| 930 } | |
| 931 } | |
| 932 | |
| 933 | |
| 934 ## Z-score Test | |
| 935 computeZScore <- function(mat, test="Focused"){ | |
| 936 matRes <- matrix(NA,ncol=2,nrow=(nrow(mat))) | |
| 937 if(test=="Focused"){ | |
| 938 #Z_Focused_CDR | |
| 939 #P_Denom = sum( mat[1,c(5,6,8)], na.rm=T ) | |
| 940 P = apply(mat[,c(5,6,8)],1,function(x){(x[1]/sum(x))}) | |
| 941 R_mean = apply(cbind(mat[,c(1,2,4)],P),1,function(x){x[4]*(sum(x[1:3]))}) | |
| 942 R_sd=sqrt(R_mean*(1-P)) | |
| 943 matRes[,1] = (mat[,1]-R_mean)/R_sd | |
| 944 | |
| 945 #Z_Focused_FWR | |
| 946 #P_Denom = sum( mat[1,c(7,6,8)], na.rm=T ) | |
| 947 P = apply(mat[,c(7,6,8)],1,function(x){(x[1]/sum(x))}) | |
| 948 R_mean = apply(cbind(mat[,c(3,2,4)],P),1,function(x){x[4]*(sum(x[1:3]))}) | |
| 949 R_sd=sqrt(R_mean*(1-P)) | |
| 950 matRes[,2] = (mat[,3]-R_mean)/R_sd | |
| 951 } | |
| 952 | |
| 953 if(test=="Local"){ | |
| 954 #Z_Focused_CDR | |
| 955 #P_Denom = sum( mat[1,c(5,6,8)], na.rm=T ) | |
| 956 P = apply(mat[,c(5,6)],1,function(x){(x[1]/sum(x))}) | |
| 957 R_mean = apply(cbind(mat[,c(1,2)],P),1,function(x){x[3]*(sum(x[1:2]))}) | |
| 958 R_sd=sqrt(R_mean*(1-P)) | |
| 959 matRes[,1] = (mat[,1]-R_mean)/R_sd | |
| 960 | |
| 961 #Z_Focused_FWR | |
| 962 #P_Denom = sum( mat[1,c(7,6,8)], na.rm=T ) | |
| 963 P = apply(mat[,c(7,8)],1,function(x){(x[1]/sum(x))}) | |
| 964 R_mean = apply(cbind(mat[,c(3,4)],P),1,function(x){x[3]*(sum(x[1:2]))}) | |
| 965 R_sd=sqrt(R_mean*(1-P)) | |
| 966 matRes[,2] = (mat[,3]-R_mean)/R_sd | |
| 967 } | |
| 968 | |
| 969 if(test=="Imbalanced"){ | |
| 970 #Z_Focused_CDR | |
| 971 #P_Denom = sum( mat[1,c(5,6,8)], na.rm=T ) | |
| 972 P = apply(mat[,5:8],1,function(x){((x[1]+x[2])/sum(x))}) | |
| 973 R_mean = apply(cbind(mat[,1:4],P),1,function(x){x[5]*(sum(x[1:4]))}) | |
| 974 R_sd=sqrt(R_mean*(1-P)) | |
| 975 matRes[,1] = (mat[,1]-R_mean)/R_sd | |
| 976 | |
| 977 #Z_Focused_FWR | |
| 978 #P_Denom = sum( mat[1,c(7,6,8)], na.rm=T ) | |
| 979 P = apply(mat[,5:8],1,function(x){((x[3]+x[4])/sum(x))}) | |
| 980 R_mean = apply(cbind(mat[,1:4],P),1,function(x){x[5]*(sum(x[1:4]))}) | |
| 981 R_sd=sqrt(R_mean*(1-P)) | |
| 982 matRes[,2] = (mat[,3]-R_mean)/R_sd | |
| 983 } | |
| 984 | |
| 985 matRes[is.nan(matRes)] = NA | |
| 986 return(matRes) | |
| 987 } | |
| 988 | |
| 989 # Return a p-value for a z-score | |
| 990 z2p <- function(z){ | |
| 991 p=NA | |
| 992 if( !is.nan(z) && !is.na(z)){ | |
| 993 if(z>0){ | |
| 994 p = (1 - pnorm(z,0,1)) | |
| 995 } else if(z<0){ | |
| 996 p = (-1 * pnorm(z,0,1)) | |
| 997 } else{ | |
| 998 p = 0.5 | |
| 999 } | |
| 1000 }else{ | |
| 1001 p = NA | |
| 1002 } | |
| 1003 return(p) | |
| 1004 } | |
| 1005 | |
| 1006 | |
| 1007 ## Bayesian Test | |
| 1008 | |
| 1009 # Fitted parameter for the bayesian framework | |
| 1010 BAYESIAN_FITTED<-c(0.407277142798302, 0.554007336744485, 0.63777155771234, 0.693989162719009, 0.735450014674917, 0.767972534429806, 0.794557287143399, 0.816906816601605, 0.83606796225341, 0.852729446430296, 0.867370424541641, 0.880339760590323, 0.891900995024999, 0.902259181289864, 0.911577919359,0.919990301665853, 0.927606458124537, 0.934518806350661, 0.940805863754375, 0.946534836475715, 0.951763691199255, 0.95654428191308, 0.960920179487397, 0.964930893680829, 0.968611312149038, 0.971992459313836, 0.975102110004818, 0.977964943023096, 0.980603428208439, 0.983037660179428, 0.985285800977406, 0.987364285326685, 0.989288037855441, 0.991070478823525, 0.992723699729969, 0.994259575477392, 0.995687688867975, 0.997017365051493, 0.998257085153047, 0.999414558305388, 1.00049681357804, 1.00151036237481, 1.00246080204981, 1.00335370751909, 1.0041939329768, 1.0049859393417, 1.00573382091263, 1.00644127217376, 1.00711179729107, 1.00774845526417, 1.00835412715854, 1.00893143010366, 1.00948275846309, 1.01001030293661, 1.01051606798079, 1.01100188771288, 1.01146944044216, 1.01192026195449, 1.01235575766094, 1.01277721370986) | |
| 1011 CONST_i <- sort(c(((2^(seq(-39,0,length.out=201)))/2)[1:200],(c(0:11,13:99)+0.5)/100,1-(2^(seq(-39,0,length.out=201)))/2)) | |
| 1012 | |
| 1013 # Given x, M & p, returns a pdf | |
| 1014 calculate_bayes <- function ( x=3, N=10, p=0.33, | |
| 1015 i=CONST_i, | |
| 1016 max_sigma=20,length_sigma=4001 | |
| 1017 ){ | |
| 1018 if(!0%in%N){ | |
| 1019 G <- max(length(x),length(N),length(p)) | |
| 1020 x=array(x,dim=G) | |
| 1021 N=array(N,dim=G) | |
| 1022 p=array(p,dim=G) | |
| 1023 sigma_s<-seq(-max_sigma,max_sigma,length.out=length_sigma) | |
| 1024 sigma_1<-log({i/{1-i}}/{p/{1-p}}) | |
| 1025 index<-min(N,60) | |
| 1026 y<-dbeta(i,x+BAYESIAN_FITTED[index],N+BAYESIAN_FITTED[index]-x)*(1-p)*p*exp(sigma_1)/({1-p}^2+2*p*{1-p}*exp(sigma_1)+{p^2}*exp(2*sigma_1)) | |
| 1027 if(!sum(is.na(y))){ | |
| 1028 tmp<-approx(sigma_1,y,sigma_s)$y | |
| 1029 tmp/sum(tmp)/{2*max_sigma/{length_sigma-1}} | |
| 1030 }else{ | |
| 1031 return(NA) | |
| 1032 } | |
| 1033 }else{ | |
| 1034 return(NA) | |
| 1035 } | |
| 1036 } | |
| 1037 # Given a mat of observed & expected, return a list of CDR & FWR pdf for selection | |
| 1038 computeBayesianScore <- function(mat, test="Focused", max_sigma=20,length_sigma=4001){ | |
| 1039 flagOneSeq = F | |
| 1040 if(nrow(mat)==1){ | |
| 1041 mat=rbind(mat,mat) | |
| 1042 flagOneSeq = T | |
| 1043 } | |
| 1044 if(test=="Focused"){ | |
| 1045 #CDR | |
| 1046 P = c(apply(mat[,c(5,6,8)],1,function(x){(x[1]/sum(x))}),0.5) | |
| 1047 N = c(apply(mat[,c(1,2,4)],1,function(x){(sum(x))}),0) | |
| 1048 X = c(mat[,1],0) | |
| 1049 bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
| 1050 bayesCDR = bayesCDR[-length(bayesCDR)] | |
| 1051 | |
| 1052 #FWR | |
| 1053 P = c(apply(mat[,c(7,6,8)],1,function(x){(x[1]/sum(x))}),0.5) | |
| 1054 N = c(apply(mat[,c(3,2,4)],1,function(x){(sum(x))}),0) | |
| 1055 X = c(mat[,3],0) | |
| 1056 bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
| 1057 bayesFWR = bayesFWR[-length(bayesFWR)] | |
| 1058 } | |
| 1059 | |
| 1060 if(test=="Local"){ | |
| 1061 #CDR | |
| 1062 P = c(apply(mat[,c(5,6)],1,function(x){(x[1]/sum(x))}),0.5) | |
| 1063 N = c(apply(mat[,c(1,2)],1,function(x){(sum(x))}),0) | |
| 1064 X = c(mat[,1],0) | |
| 1065 bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
| 1066 bayesCDR = bayesCDR[-length(bayesCDR)] | |
| 1067 | |
| 1068 #FWR | |
| 1069 P = c(apply(mat[,c(7,8)],1,function(x){(x[1]/sum(x))}),0.5) | |
| 1070 N = c(apply(mat[,c(3,4)],1,function(x){(sum(x))}),0) | |
| 1071 X = c(mat[,3],0) | |
| 1072 bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
| 1073 bayesFWR = bayesFWR[-length(bayesFWR)] | |
| 1074 } | |
| 1075 | |
| 1076 if(test=="Imbalanced"){ | |
| 1077 #CDR | |
| 1078 P = c(apply(mat[,c(5:8)],1,function(x){((x[1]+x[2])/sum(x))}),0.5) | |
| 1079 N = c(apply(mat[,c(1:4)],1,function(x){(sum(x))}),0) | |
| 1080 X = c(apply(mat[,c(1:2)],1,function(x){(sum(x))}),0) | |
| 1081 bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
| 1082 bayesCDR = bayesCDR[-length(bayesCDR)] | |
| 1083 | |
| 1084 #FWR | |
| 1085 P = c(apply(mat[,c(5:8)],1,function(x){((x[3]+x[4])/sum(x))}),0.5) | |
| 1086 N = c(apply(mat[,c(1:4)],1,function(x){(sum(x))}),0) | |
| 1087 X = c(apply(mat[,c(3:4)],1,function(x){(sum(x))}),0) | |
| 1088 bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
| 1089 bayesFWR = bayesFWR[-length(bayesFWR)] | |
| 1090 } | |
| 1091 | |
| 1092 if(test=="ImbalancedSilent"){ | |
| 1093 #CDR | |
| 1094 P = c(apply(mat[,c(6,8)],1,function(x){((x[1])/sum(x))}),0.5) | |
| 1095 N = c(apply(mat[,c(2,4)],1,function(x){(sum(x))}),0) | |
| 1096 X = c(apply(mat[,c(2,4)],1,function(x){(x[1])}),0) | |
| 1097 bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
| 1098 bayesCDR = bayesCDR[-length(bayesCDR)] | |
| 1099 | |
| 1100 #FWR | |
| 1101 P = c(apply(mat[,c(6,8)],1,function(x){((x[2])/sum(x))}),0.5) | |
| 1102 N = c(apply(mat[,c(2,4)],1,function(x){(sum(x))}),0) | |
| 1103 X = c(apply(mat[,c(2,4)],1,function(x){(x[2])}),0) | |
| 1104 bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
| 1105 bayesFWR = bayesFWR[-length(bayesFWR)] | |
| 1106 } | |
| 1107 | |
| 1108 if(flagOneSeq==T){ | |
| 1109 bayesCDR = bayesCDR[1] | |
| 1110 bayesFWR = bayesFWR[1] | |
| 1111 } | |
| 1112 return( list("CDR"=bayesCDR, "FWR"=bayesFWR) ) | |
| 1113 } | |
| 1114 | |
| 1115 ##Covolution | |
| 1116 break2chunks<-function(G=1000){ | |
| 1117 base<-2^round(log(sqrt(G),2),0) | |
| 1118 return(c(rep(base,floor(G/base)-1),base+G-(floor(G/base)*base))) | |
| 1119 } | |
| 1120 | |
| 1121 PowersOfTwo <- function(G=100){ | |
| 1122 exponents <- array() | |
| 1123 i = 0 | |
| 1124 while(G > 0){ | |
| 1125 i=i+1 | |
| 1126 exponents[i] <- floor( log2(G) ) | |
| 1127 G <- G-2^exponents[i] | |
| 1128 } | |
| 1129 return(exponents) | |
| 1130 } | |
| 1131 | |
| 1132 convolutionPowersOfTwo <- function( cons, length_sigma=4001 ){ | |
| 1133 G = ncol(cons) | |
| 1134 if(G>1){ | |
| 1135 for(gen in log(G,2):1){ | |
| 1136 ll<-seq(from=2,to=2^gen,by=2) | |
| 1137 sapply(ll,function(l){cons[,l/2]<<-weighted_conv(cons[,l],cons[,l-1],length_sigma=length_sigma)}) | |
| 1138 } | |
| 1139 } | |
| 1140 return( cons[,1] ) | |
| 1141 } | |
| 1142 | |
| 1143 convolutionPowersOfTwoByTwos <- function( cons, length_sigma=4001,G=1 ){ | |
| 1144 if(length(ncol(cons))) G<-ncol(cons) | |
| 1145 groups <- PowersOfTwo(G) | |
| 1146 matG <- matrix(NA, ncol=length(groups), nrow=length(cons)/G ) | |
| 1147 startIndex = 1 | |
| 1148 for( i in 1:length(groups) ){ | |
| 1149 stopIndex <- 2^groups[i] + startIndex - 1 | |
| 1150 if(stopIndex!=startIndex){ | |
| 1151 matG[,i] <- convolutionPowersOfTwo( cons[,startIndex:stopIndex], length_sigma=length_sigma ) | |
| 1152 startIndex = stopIndex + 1 | |
| 1153 } | |
| 1154 else { | |
| 1155 if(G>1) matG[,i] <- cons[,startIndex:stopIndex] | |
| 1156 else matG[,i] <- cons | |
| 1157 #startIndex = stopIndex + 1 | |
| 1158 } | |
| 1159 } | |
| 1160 return( list( matG, groups ) ) | |
| 1161 } | |
| 1162 | |
| 1163 weighted_conv<-function(x,y,w=1,m=100,length_sigma=4001){ | |
| 1164 lx<-length(x) | |
| 1165 ly<-length(y) | |
| 1166 if({lx<m}| {{lx*w}<m}| {{ly}<m}| {{ly*w}<m}){ | |
| 1167 if(w<1){ | |
| 1168 y1<-approx(1:ly,y,seq(1,ly,length.out=m))$y | |
| 1169 x1<-approx(1:lx,x,seq(1,lx,length.out=m/w))$y | |
| 1170 lx<-length(x1) | |
| 1171 ly<-length(y1) | |
| 1172 } | |
| 1173 else { | |
| 1174 y1<-approx(1:ly,y,seq(1,ly,length.out=m*w))$y | |
| 1175 x1<-approx(1:lx,x,seq(1,lx,length.out=m))$y | |
| 1176 lx<-length(x1) | |
| 1177 ly<-length(y1) | |
| 1178 } | |
| 1179 } | |
| 1180 else{ | |
| 1181 x1<-x | |
| 1182 y1<-approx(1:ly,y,seq(1,ly,length.out=floor(lx*w)))$y | |
| 1183 ly<-length(y1) | |
| 1184 } | |
| 1185 tmp<-approx(x=1:(lx+ly-1),y=convolve(x1,rev(y1),type="open"),xout=seq(1,lx+ly-1,length.out=length_sigma))$y | |
| 1186 tmp[tmp<=0] = 0 | |
| 1187 return(tmp/sum(tmp)) | |
| 1188 } | |
| 1189 | |
| 1190 calculate_bayesGHelper <- function( listMatG,length_sigma=4001 ){ | |
| 1191 matG <- listMatG[[1]] | |
| 1192 groups <- listMatG[[2]] | |
| 1193 i = 1 | |
| 1194 resConv <- matG[,i] | |
| 1195 denom <- 2^groups[i] | |
| 1196 if(length(groups)>1){ | |
| 1197 while( i<length(groups) ){ | |
| 1198 i = i + 1 | |
| 1199 resConv <- weighted_conv(resConv, matG[,i], w= {{2^groups[i]}/denom} ,length_sigma=length_sigma) | |
| 1200 #cat({{2^groups[i]}/denom},"\n") | |
| 1201 denom <- denom + 2^groups[i] | |
| 1202 } | |
| 1203 } | |
| 1204 return(resConv) | |
| 1205 } | |
| 1206 | |
| 1207 # Given a list of PDFs, returns a convoluted PDF | |
| 1208 groupPosteriors <- function( listPosteriors, max_sigma=20, length_sigma=4001 ,Threshold=2 ){ | |
| 1209 listPosteriors = listPosteriors[ !is.na(listPosteriors) ] | |
| 1210 Length_Postrior<-length(listPosteriors) | |
| 1211 if(Length_Postrior>1 & Length_Postrior<=Threshold){ | |
| 1212 cons = matrix(unlist(listPosteriors),length(listPosteriors[[1]]),length(listPosteriors)) | |
| 1213 listMatG <- convolutionPowersOfTwoByTwos(cons,length_sigma=length_sigma) | |
| 1214 y<-calculate_bayesGHelper(listMatG,length_sigma=length_sigma) | |
| 1215 return( y/sum(y)/(2*max_sigma/(length_sigma-1)) ) | |
| 1216 }else if(Length_Postrior==1) return(listPosteriors[[1]]) | |
| 1217 else if(Length_Postrior==0) return(NA) | |
| 1218 else { | |
| 1219 cons = matrix(unlist(listPosteriors),length(listPosteriors[[1]]),length(listPosteriors)) | |
| 1220 y = fastConv(cons,max_sigma=max_sigma, length_sigma=length_sigma ) | |
| 1221 return( y/sum(y)/(2*max_sigma/(length_sigma-1)) ) | |
| 1222 } | |
| 1223 } | |
| 1224 | |
| 1225 fastConv<-function(cons, max_sigma=20, length_sigma=4001){ | |
| 1226 chunks<-break2chunks(G=ncol(cons)) | |
| 1227 if(ncol(cons)==3) chunks<-2:1 | |
| 1228 index_chunks_end <- cumsum(chunks) | |
| 1229 index_chunks_start <- c(1,index_chunks_end[-length(index_chunks_end)]+1) | |
| 1230 index_chunks <- cbind(index_chunks_start,index_chunks_end) | |
| 1231 | |
| 1232 case <- sum(chunks!=chunks[1]) | |
| 1233 if(case==1) End <- max(1,((length(index_chunks)/2)-1)) | |
| 1234 else End <- max(1,((length(index_chunks)/2))) | |
| 1235 | |
| 1236 firsts <- sapply(1:End,function(i){ | |
| 1237 indexes<-index_chunks[i,1]:index_chunks[i,2] | |
| 1238 convolutionPowersOfTwoByTwos(cons[ ,indexes])[[1]] | |
| 1239 }) | |
| 1240 if(case==0){ | |
| 1241 result<-calculate_bayesGHelper( convolutionPowersOfTwoByTwos(firsts) ) | |
| 1242 }else if(case==1){ | |
| 1243 last<-list(calculate_bayesGHelper( | |
| 1244 convolutionPowersOfTwoByTwos( cons[ ,index_chunks[length(index_chunks)/2,1]:index_chunks[length(index_chunks)/2,2]] ) | |
| 1245 ),0) | |
| 1246 result_first<-calculate_bayesGHelper(convolutionPowersOfTwoByTwos(firsts)) | |
| 1247 result<-calculate_bayesGHelper( | |
| 1248 list( | |
| 1249 cbind( | |
| 1250 result_first,last[[1]]), | |
| 1251 c(log(index_chunks_end[length(index_chunks)/2-1],2),log(index_chunks[length(index_chunks)/2,2]-index_chunks[length(index_chunks)/2,1]+1,2)) | |
| 1252 ) | |
| 1253 ) | |
| 1254 } | |
| 1255 return(as.vector(result)) | |
| 1256 } | |
| 1257 | |
| 1258 # Computes the 95% CI for a pdf | |
| 1259 calcBayesCI <- function(Pdf,low=0.025,up=0.975,max_sigma=20, length_sigma=4001){ | |
| 1260 if(length(Pdf)!=length_sigma) return(NA) | |
| 1261 sigma_s=seq(-max_sigma,max_sigma,length.out=length_sigma) | |
| 1262 cdf = cumsum(Pdf) | |
| 1263 cdf = cdf/cdf[length(cdf)] | |
| 1264 return( c(sigma_s[findInterval(low,cdf)-1] , sigma_s[findInterval(up,cdf)]) ) | |
| 1265 } | |
| 1266 | |
| 1267 # Computes a mean for a pdf | |
| 1268 calcBayesMean <- function(Pdf,max_sigma=20,length_sigma=4001){ | |
| 1269 if(length(Pdf)!=length_sigma) return(NA) | |
| 1270 sigma_s=seq(-max_sigma,max_sigma,length.out=length_sigma) | |
| 1271 norm = {length_sigma-1}/2/max_sigma | |
| 1272 return( (Pdf%*%sigma_s/norm) ) | |
| 1273 } | |
| 1274 | |
| 1275 # Returns the mean, and the 95% CI for a pdf | |
| 1276 calcBayesOutputInfo <- function(Pdf,low=0.025,up=0.975,max_sigma=20, length_sigma=4001){ | |
| 1277 if(is.na(Pdf)) | |
| 1278 return(rep(NA,3)) | |
| 1279 bCI = calcBayesCI(Pdf=Pdf,low=low,up=up,max_sigma=max_sigma,length_sigma=length_sigma) | |
| 1280 bMean = calcBayesMean(Pdf=Pdf,max_sigma=max_sigma,length_sigma=length_sigma) | |
| 1281 return(c(bMean, bCI)) | |
| 1282 } | |
| 1283 | |
| 1284 # Computes the p-value of a pdf | |
| 1285 computeSigmaP <- function(Pdf, length_sigma=4001, max_sigma=20){ | |
| 1286 if(length(Pdf)>1){ | |
| 1287 norm = {length_sigma-1}/2/max_sigma | |
| 1288 pVal = {sum(Pdf[1:{{length_sigma-1}/2}]) + Pdf[{{length_sigma+1}/2}]/2}/norm | |
| 1289 if(pVal>0.5){ | |
| 1290 pVal = pVal-1 | |
| 1291 } | |
| 1292 return(pVal) | |
| 1293 }else{ | |
| 1294 return(NA) | |
| 1295 } | |
| 1296 } | |
| 1297 | |
| 1298 # Compute p-value of two distributions | |
| 1299 compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){ | |
| 1300 #print(c(length(dens1),length(dens2))) | |
| 1301 if(length(dens1)>1 & length(dens2)>1 ){ | |
| 1302 dens1<-dens1/sum(dens1) | |
| 1303 dens2<-dens2/sum(dens2) | |
| 1304 cum2 <- cumsum(dens2)-dens2/2 | |
| 1305 tmp<- sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i]))) | |
| 1306 #print(tmp) | |
| 1307 if(tmp>0.5)tmp<-tmp-1 | |
| 1308 return( tmp ) | |
| 1309 } | |
| 1310 else { | |
| 1311 return(NA) | |
| 1312 } | |
| 1313 #return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N) | |
| 1314 } | |
| 1315 | |
| 1316 # get number of seqeunces contributing to the sigma (i.e. seqeunces with mutations) | |
| 1317 numberOfSeqsWithMutations <- function(matMutations,test=1){ | |
| 1318 if(test==4)test=2 | |
| 1319 cdrSeqs <- 0 | |
| 1320 fwrSeqs <- 0 | |
| 1321 if(test==1){#focused | |
| 1322 cdrMutations <- apply(matMutations, 1, function(x){ sum(x[c(1,2,4)]) }) | |
| 1323 fwrMutations <- apply(matMutations, 1, function(x){ sum(x[c(3,4,2)]) }) | |
| 1324 if( any(which(cdrMutations>0)) ) cdrSeqs <- sum(cdrMutations>0) | |
| 1325 if( any(which(fwrMutations>0)) ) fwrSeqs <- sum(fwrMutations>0) | |
| 1326 } | |
| 1327 if(test==2){#local | |
| 1328 cdrMutations <- apply(matMutations, 1, function(x){ sum(x[c(1,2)]) }) | |
| 1329 fwrMutations <- apply(matMutations, 1, function(x){ sum(x[c(3,4)]) }) | |
| 1330 if( any(which(cdrMutations>0)) ) cdrSeqs <- sum(cdrMutations>0) | |
| 1331 if( any(which(fwrMutations>0)) ) fwrSeqs <- sum(fwrMutations>0) | |
| 1332 } | |
| 1333 return(c("CDR"=cdrSeqs, "FWR"=fwrSeqs)) | |
| 1334 } | |
| 1335 | |
| 1336 | |
| 1337 | |
| 1338 shadeColor <- function(sigmaVal=NA,pVal=NA){ | |
| 1339 if(is.na(sigmaVal) & is.na(pVal)) return(NA) | |
| 1340 if(is.na(sigmaVal) & !is.na(pVal)) sigmaVal=sign(pVal) | |
| 1341 if(is.na(pVal) || pVal==1 || pVal==0){ | |
| 1342 returnColor = "#FFFFFF"; | |
| 1343 }else{ | |
| 1344 colVal=abs(pVal); | |
| 1345 | |
| 1346 if(sigmaVal<0){ | |
| 1347 if(colVal>0.1) | |
| 1348 returnColor = "#CCFFCC"; | |
| 1349 if(colVal<=0.1) | |
| 1350 returnColor = "#99FF99"; | |
| 1351 if(colVal<=0.050) | |
| 1352 returnColor = "#66FF66"; | |
| 1353 if(colVal<=0.010) | |
| 1354 returnColor = "#33FF33"; | |
| 1355 if(colVal<=0.005) | |
| 1356 returnColor = "#00FF00"; | |
| 1357 | |
| 1358 }else{ | |
| 1359 if(colVal>0.1) | |
| 1360 returnColor = "#FFCCCC"; | |
| 1361 if(colVal<=0.1) | |
| 1362 returnColor = "#FF9999"; | |
| 1363 if(colVal<=0.05) | |
| 1364 returnColor = "#FF6666"; | |
| 1365 if(colVal<=0.01) | |
| 1366 returnColor = "#FF3333"; | |
| 1367 if(colVal<0.005) | |
| 1368 returnColor = "#FF0000"; | |
| 1369 } | |
| 1370 } | |
| 1371 | |
| 1372 return(returnColor) | |
| 1373 } | |
| 1374 | |
| 1375 | |
| 1376 | |
| 1377 plotHelp <- function(xfrac=0.05,yfrac=0.05,log=FALSE){ | |
| 1378 if(!log){ | |
| 1379 x = par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac | |
| 1380 y = par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac | |
| 1381 }else { | |
| 1382 if(log==2){ | |
| 1383 x = par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac | |
| 1384 y = 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac) | |
| 1385 } | |
| 1386 if(log==1){ | |
| 1387 x = 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac) | |
| 1388 y = par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac | |
| 1389 } | |
| 1390 if(log==3){ | |
| 1391 x = 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac) | |
| 1392 y = 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac) | |
| 1393 } | |
| 1394 } | |
| 1395 return(c("x"=x,"y"=y)) | |
| 1396 } | |
| 1397 | |
| 1398 # SHMulation | |
| 1399 | |
| 1400 # Based on targeting, introduce a single mutation & then update the targeting | |
| 1401 oneMutation <- function(){ | |
| 1402 # Pick a postion + mutation | |
| 1403 posMutation = sample(1:(seqGermlineLen*4),1,replace=F,prob=as.vector(seqTargeting)) | |
| 1404 posNucNumb = ceiling(posMutation/4) # Nucleotide number | |
| 1405 posNucKind = 4 - ( (posNucNumb*4) - posMutation ) # Nuc the position mutates to | |
| 1406 | |
| 1407 #mutate the simulation sequence | |
| 1408 seqSimVec <- s2c(seqSim) | |
| 1409 seqSimVec[posNucNumb] <- NUCLEOTIDES[posNucKind] | |
| 1410 seqSim <<- c2s(seqSimVec) | |
| 1411 | |
| 1412 #update Mutability, Targeting & MutationsTypes | |
| 1413 updateMutabilityNTargeting(posNucNumb) | |
| 1414 | |
| 1415 #return(c(posNucNumb,NUCLEOTIDES[posNucKind])) | |
| 1416 return(posNucNumb) | |
| 1417 } | |
| 1418 | |
| 1419 updateMutabilityNTargeting <- function(position){ | |
| 1420 min_i<-max((position-2),1) | |
| 1421 max_i<-min((position+2),nchar(seqSim)) | |
| 1422 min_ii<-min(min_i,3) | |
| 1423 | |
| 1424 #mutability - update locally | |
| 1425 seqMutability[(min_i):(max_i)] <<- computeMutabilities(substr(seqSim,position-4,position+4))[(min_ii):(max_i-min_i+min_ii)] | |
| 1426 | |
| 1427 | |
| 1428 #targeting - compute locally | |
| 1429 seqTargeting[,min_i:max_i] <<- computeTargeting(substr(seqSim,min_i,max_i),seqMutability[min_i:max_i]) | |
| 1430 seqTargeting[is.na(seqTargeting)] <<- 0 | |
| 1431 #mutCodonPos = getCodonPos(position) | |
| 1432 mutCodonPos = seq(getCodonPos(min_i)[1],getCodonPos(max_i)[3]) | |
| 1433 #cat(mutCodonPos,"\n") | |
| 1434 mutTypeCodon = getCodonPos(position) | |
| 1435 seqMutationTypes[,mutTypeCodon] <<- computeMutationTypesFast( substr(seqSim,mutTypeCodon[1],mutTypeCodon[3]) ) | |
| 1436 # Stop = 0 | |
| 1437 if(any(seqMutationTypes[,mutCodonPos]=="Stop",na.rm=T )){ | |
| 1438 seqTargeting[,mutCodonPos][seqMutationTypes[,mutCodonPos]=="Stop"] <<- 0 | |
| 1439 } | |
| 1440 | |
| 1441 | |
| 1442 #Selection | |
| 1443 selectedPos = (min_i*4-4)+(which(seqMutationTypes[,min_i:max_i]=="R")) | |
| 1444 # CDR | |
| 1445 selectedCDR = selectedPos[which(matCDR[selectedPos]==T)] | |
| 1446 seqTargeting[selectedCDR] <<- seqTargeting[selectedCDR] * exp(selCDR) | |
| 1447 seqTargeting[selectedCDR] <<- seqTargeting[selectedCDR]/baseLineCDR_K | |
| 1448 | |
| 1449 # FWR | |
| 1450 selectedFWR = selectedPos[which(matFWR[selectedPos]==T)] | |
| 1451 seqTargeting[selectedFWR] <<- seqTargeting[selectedFWR] * exp(selFWR) | |
| 1452 seqTargeting[selectedFWR] <<- seqTargeting[selectedFWR]/baseLineFWR_K | |
| 1453 | |
| 1454 } | |
| 1455 | |
| 1456 | |
| 1457 | |
| 1458 # Validate the mutation: if the mutation has not been sampled before validate it, else discard it. | |
| 1459 validateMutation <- function(){ | |
| 1460 if( !(mutatedPos%in%mutatedPositions) ){ # if it's a new mutation | |
| 1461 uniqueMutationsIntroduced <<- uniqueMutationsIntroduced + 1 | |
| 1462 mutatedPositions[uniqueMutationsIntroduced] <<- mutatedPos | |
| 1463 }else{ | |
| 1464 if(substr(seqSim,mutatedPos,mutatedPos)==substr(seqGermline,mutatedPos,mutatedPos)){ # back to germline mutation | |
| 1465 mutatedPositions <<- mutatedPositions[-which(mutatedPositions==mutatedPos)] | |
| 1466 uniqueMutationsIntroduced <<- uniqueMutationsIntroduced - 1 | |
| 1467 } | |
| 1468 } | |
| 1469 } | |
| 1470 | |
| 1471 | |
| 1472 | |
| 1473 # Places text (labels) at normalized coordinates | |
| 1474 myaxis <- function(xfrac=0.05,yfrac=0.05,log=FALSE,w="text",cex=1,adj=1,thecol="black"){ | |
| 1475 par(xpd=TRUE) | |
| 1476 if(!log) | |
| 1477 text(par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac,par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac,w,cex=cex,adj=adj,col=thecol) | |
| 1478 else { | |
| 1479 if(log==2) | |
| 1480 text( | |
| 1481 par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac, | |
| 1482 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac), | |
| 1483 w,cex=cex,adj=adj,col=thecol) | |
| 1484 if(log==1) | |
| 1485 text( | |
| 1486 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac), | |
| 1487 par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac, | |
| 1488 w,cex=cex,adj=adj,col=thecol) | |
| 1489 if(log==3) | |
| 1490 text( | |
| 1491 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac), | |
| 1492 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac), | |
| 1493 w,cex=cex,adj=adj,col=thecol) | |
| 1494 } | |
| 1495 par(xpd=FALSE) | |
| 1496 } | |
| 1497 | |
| 1498 | |
| 1499 | |
| 1500 # Count the mutations in a sequence | |
| 1501 analyzeMutations <- function( inputMatrixIndex, model = 0 , multipleMutation=0, seqWithStops=0){ | |
| 1502 | |
| 1503 paramGL = s2c(matInput[inputMatrixIndex,2]) | |
| 1504 paramSeq = s2c(matInput[inputMatrixIndex,1]) | |
| 1505 | |
| 1506 #if( any(paramSeq=="N") ){ | |
| 1507 # gapPos_Seq = which(paramSeq=="N") | |
| 1508 # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "N"] | |
| 1509 # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] | |
| 1510 #} | |
| 1511 mutations_val = paramGL != paramSeq | |
| 1512 | |
| 1513 if(any(mutations_val)){ | |
| 1514 mutationPos = which(mutations_val)#{1:length(mutations_val)}[mutations_val] | |
| 1515 length_mutations =length(mutationPos) | |
| 1516 mutationInfo = rep(NA,length_mutations) | |
| 1517 | |
| 1518 pos<- mutationPos | |
| 1519 pos_array<-array(sapply(pos,getCodonPos)) | |
| 1520 codonGL = paramGL[pos_array] | |
| 1521 codonSeqWhole = paramSeq[pos_array] | |
| 1522 codonSeq = sapply(pos,function(x){ | |
| 1523 seqP = paramGL[getCodonPos(x)] | |
| 1524 muCodonPos = {x-1}%%3+1 | |
| 1525 seqP[muCodonPos] = paramSeq[x] | |
| 1526 return(seqP) | |
| 1527 }) | |
| 1528 GLcodons = apply(matrix(codonGL,length_mutations,3,byrow=TRUE),1,c2s) | |
| 1529 SeqcodonsWhole = apply(matrix(codonSeqWhole,length_mutations,3,byrow=TRUE),1,c2s) | |
| 1530 Seqcodons = apply(codonSeq,2,c2s) | |
| 1531 | |
| 1532 mutationInfo = apply(rbind(GLcodons , Seqcodons),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
| 1533 names(mutationInfo) = mutationPos | |
| 1534 | |
| 1535 mutationInfoWhole = apply(rbind(GLcodons , SeqcodonsWhole),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
| 1536 names(mutationInfoWhole) = mutationPos | |
| 1537 | |
| 1538 mutationInfo <- mutationInfo[!is.na(mutationInfo)] | |
| 1539 mutationInfoWhole <- mutationInfoWhole[!is.na(mutationInfoWhole)] | |
| 1540 | |
| 1541 if(any(!is.na(mutationInfo))){ | |
| 1542 | |
| 1543 #Filter based on Stop (at the codon level) | |
| 1544 if(seqWithStops==1){ | |
| 1545 nucleotidesAtStopCodons = names(mutationInfoWhole[mutationInfoWhole!="Stop"]) | |
| 1546 mutationInfo = mutationInfo[nucleotidesAtStopCodons] | |
| 1547 mutationInfoWhole = mutationInfo[nucleotidesAtStopCodons] | |
| 1548 }else{ | |
| 1549 countStops = sum(mutationInfoWhole=="Stop") | |
| 1550 if(seqWithStops==2 & countStops==0) mutationInfo = NA | |
| 1551 if(seqWithStops==3 & countStops>0) mutationInfo = NA | |
| 1552 } | |
| 1553 | |
| 1554 if(any(!is.na(mutationInfo))){ | |
| 1555 #Filter mutations based on multipleMutation | |
| 1556 if(multipleMutation==1 & !is.na(mutationInfo)){ | |
| 1557 mutationCodons = getCodonNumb(as.numeric(names(mutationInfoWhole))) | |
| 1558 tableMutationCodons <- table(mutationCodons) | |
| 1559 codonsWithMultipleMutations <- as.numeric(names(tableMutationCodons[tableMutationCodons>1])) | |
| 1560 if(any(codonsWithMultipleMutations)){ | |
| 1561 #remove the nucleotide mutations in the codons with multiple mutations | |
| 1562 mutationInfo <- mutationInfo[!(mutationCodons %in% codonsWithMultipleMutations)] | |
| 1563 #replace those codons with Ns in the input sequence | |
| 1564 paramSeq[unlist(lapply(codonsWithMultipleMutations, getCodonNucs))] = "N" | |
| 1565 matInput[inputMatrixIndex,1] <<- c2s(paramSeq) | |
| 1566 } | |
| 1567 } | |
| 1568 | |
| 1569 #Filter mutations based on the model | |
| 1570 if(any(mutationInfo)==T | is.na(any(mutationInfo))){ | |
| 1571 | |
| 1572 if(model==1 & !is.na(mutationInfo)){ | |
| 1573 mutationInfo <- mutationInfo[mutationInfo=="S"] | |
| 1574 } | |
| 1575 if(any(mutationInfo)==T | is.na(any(mutationInfo))) return(mutationInfo) | |
| 1576 else return(NA) | |
| 1577 }else{ | |
| 1578 return(NA) | |
| 1579 } | |
| 1580 }else{ | |
| 1581 return(NA) | |
| 1582 } | |
| 1583 | |
| 1584 | |
| 1585 }else{ | |
| 1586 return(NA) | |
| 1587 } | |
| 1588 | |
| 1589 | |
| 1590 }else{ | |
| 1591 return (NA) | |
| 1592 } | |
| 1593 } | |
| 1594 | |
| 1595 analyzeMutationsFixed <- function( inputArray, model = 0 , multipleMutation=0, seqWithStops=0){ | |
| 1596 | |
| 1597 paramGL = s2c(inputArray[2]) | |
| 1598 paramSeq = s2c(inputArray[1]) | |
| 1599 inputSeq <- inputArray[1] | |
| 1600 #if( any(paramSeq=="N") ){ | |
| 1601 # gapPos_Seq = which(paramSeq=="N") | |
| 1602 # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "N"] | |
| 1603 # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] | |
| 1604 #} | |
| 1605 mutations_val = paramGL != paramSeq | |
| 1606 | |
| 1607 if(any(mutations_val)){ | |
| 1608 mutationPos = which(mutations_val)#{1:length(mutations_val)}[mutations_val] | |
| 1609 length_mutations =length(mutationPos) | |
| 1610 mutationInfo = rep(NA,length_mutations) | |
| 1611 | |
| 1612 pos<- mutationPos | |
| 1613 pos_array<-array(sapply(pos,getCodonPos)) | |
| 1614 codonGL = paramGL[pos_array] | |
| 1615 codonSeqWhole = paramSeq[pos_array] | |
| 1616 codonSeq = sapply(pos,function(x){ | |
| 1617 seqP = paramGL[getCodonPos(x)] | |
| 1618 muCodonPos = {x-1}%%3+1 | |
| 1619 seqP[muCodonPos] = paramSeq[x] | |
| 1620 return(seqP) | |
| 1621 }) | |
| 1622 GLcodons = apply(matrix(codonGL,length_mutations,3,byrow=TRUE),1,c2s) | |
| 1623 SeqcodonsWhole = apply(matrix(codonSeqWhole,length_mutations,3,byrow=TRUE),1,c2s) | |
| 1624 Seqcodons = apply(codonSeq,2,c2s) | |
| 1625 | |
| 1626 mutationInfo = apply(rbind(GLcodons , Seqcodons),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
| 1627 names(mutationInfo) = mutationPos | |
| 1628 | |
| 1629 mutationInfoWhole = apply(rbind(GLcodons , SeqcodonsWhole),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
| 1630 names(mutationInfoWhole) = mutationPos | |
| 1631 | |
| 1632 mutationInfo <- mutationInfo[!is.na(mutationInfo)] | |
| 1633 mutationInfoWhole <- mutationInfoWhole[!is.na(mutationInfoWhole)] | |
| 1634 | |
| 1635 if(any(!is.na(mutationInfo))){ | |
| 1636 | |
| 1637 #Filter based on Stop (at the codon level) | |
| 1638 if(seqWithStops==1){ | |
| 1639 nucleotidesAtStopCodons = names(mutationInfoWhole[mutationInfoWhole!="Stop"]) | |
| 1640 mutationInfo = mutationInfo[nucleotidesAtStopCodons] | |
| 1641 mutationInfoWhole = mutationInfo[nucleotidesAtStopCodons] | |
| 1642 }else{ | |
| 1643 countStops = sum(mutationInfoWhole=="Stop") | |
| 1644 if(seqWithStops==2 & countStops==0) mutationInfo = NA | |
| 1645 if(seqWithStops==3 & countStops>0) mutationInfo = NA | |
| 1646 } | |
| 1647 | |
| 1648 if(any(!is.na(mutationInfo))){ | |
| 1649 #Filter mutations based on multipleMutation | |
| 1650 if(multipleMutation==1 & !is.na(mutationInfo)){ | |
| 1651 mutationCodons = getCodonNumb(as.numeric(names(mutationInfoWhole))) | |
| 1652 tableMutationCodons <- table(mutationCodons) | |
| 1653 codonsWithMultipleMutations <- as.numeric(names(tableMutationCodons[tableMutationCodons>1])) | |
| 1654 if(any(codonsWithMultipleMutations)){ | |
| 1655 #remove the nucleotide mutations in the codons with multiple mutations | |
| 1656 mutationInfo <- mutationInfo[!(mutationCodons %in% codonsWithMultipleMutations)] | |
| 1657 #replace those codons with Ns in the input sequence | |
| 1658 paramSeq[unlist(lapply(codonsWithMultipleMutations, getCodonNucs))] = "N" | |
| 1659 #matInput[inputMatrixIndex,1] <<- c2s(paramSeq) | |
| 1660 inputSeq <- c2s(paramSeq) | |
| 1661 } | |
| 1662 } | |
| 1663 | |
| 1664 #Filter mutations based on the model | |
| 1665 if(any(mutationInfo)==T | is.na(any(mutationInfo))){ | |
| 1666 | |
| 1667 if(model==1 & !is.na(mutationInfo)){ | |
| 1668 mutationInfo <- mutationInfo[mutationInfo=="S"] | |
| 1669 } | |
| 1670 if(any(mutationInfo)==T | is.na(any(mutationInfo))) return(list(mutationInfo,inputSeq)) | |
| 1671 else return(list(NA,inputSeq)) | |
| 1672 }else{ | |
| 1673 return(list(NA,inputSeq)) | |
| 1674 } | |
| 1675 }else{ | |
| 1676 return(list(NA,inputSeq)) | |
| 1677 } | |
| 1678 | |
| 1679 | |
| 1680 }else{ | |
| 1681 return(list(NA,inputSeq)) | |
| 1682 } | |
| 1683 | |
| 1684 | |
| 1685 }else{ | |
| 1686 return (list(NA,inputSeq)) | |
| 1687 } | |
| 1688 } | |
| 1689 | |
| 1690 # triMutability Background Count | |
| 1691 buildMutabilityModel <- function( inputMatrixIndex, model=0 , multipleMutation=0, seqWithStops=0, stopMutations=0){ | |
| 1692 | |
| 1693 #rowOrigMatInput = matInput[inputMatrixIndex,] | |
| 1694 seqGL = gsub("-", "", matInput[inputMatrixIndex,2]) | |
| 1695 seqInput = gsub("-", "", matInput[inputMatrixIndex,1]) | |
| 1696 #matInput[inputMatrixIndex,] <<- cbind(seqInput,seqGL) | |
| 1697 tempInput <- cbind(seqInput,seqGL) | |
| 1698 seqLength = nchar(seqGL) | |
| 1699 list_analyzeMutationsFixed<- analyzeMutationsFixed(tempInput, model, multipleMutation, seqWithStops) | |
| 1700 mutationCount <- list_analyzeMutationsFixed[[1]] | |
| 1701 seqInput <- list_analyzeMutationsFixed[[2]] | |
| 1702 BackgroundMatrix = mutabilityMatrix | |
| 1703 MutationMatrix = mutabilityMatrix | |
| 1704 MutationCountMatrix = mutabilityMatrix | |
| 1705 if(!is.na(mutationCount)){ | |
| 1706 if((stopMutations==0 & model==0) | (stopMutations==1 & (sum(mutationCount=="Stop")<length(mutationCount))) | (model==1 & (sum(mutationCount=="S")>0)) ){ | |
| 1707 | |
| 1708 fivermerStartPos = 1:(seqLength-4) | |
| 1709 fivemerLength <- length(fivermerStartPos) | |
| 1710 fivemerGL <- substr(rep(seqGL,length(fivermerStartPos)),(fivermerStartPos),(fivermerStartPos+4)) | |
| 1711 fivemerSeq <- substr(rep(seqInput,length(fivermerStartPos)),(fivermerStartPos),(fivermerStartPos+4)) | |
| 1712 | |
| 1713 #Background | |
| 1714 for(fivemerIndex in 1:fivemerLength){ | |
| 1715 fivemer = fivemerGL[fivemerIndex] | |
| 1716 if(!any(grep("N",fivemer))){ | |
| 1717 fivemerCodonPos = fivemerCodon(fivemerIndex) | |
| 1718 fivemerReadingFrameCodon = substr(fivemer,fivemerCodonPos[1],fivemerCodonPos[3]) | |
| 1719 fivemerReadingFrameCodonInputSeq = substr(fivemerSeq[fivemerIndex],fivemerCodonPos[1],fivemerCodonPos[3]) | |
| 1720 | |
| 1721 # All mutations model | |
| 1722 #if(!any(grep("N",fivemerReadingFrameCodon))){ | |
| 1723 if(model==0){ | |
| 1724 if(stopMutations==0){ | |
| 1725 if(!any(grep("N",fivemerReadingFrameCodonInputSeq))) | |
| 1726 BackgroundMatrix[fivemer] <- (BackgroundMatrix[fivemer] + 1) | |
| 1727 }else{ | |
| 1728 if( !any(grep("N",fivemerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(fivemerReadingFrameCodon)!="*" ){ | |
| 1729 positionWithinCodon = which(fivemerCodonPos==3)#positionsWithinCodon[(fivemerCodonPos[1]%%3)+1] | |
| 1730 BackgroundMatrix[fivemer] <- (BackgroundMatrix[fivemer] + probNonStopMutations[fivemerReadingFrameCodon,positionWithinCodon]) | |
| 1731 } | |
| 1732 } | |
| 1733 }else{ # Only silent mutations | |
| 1734 if( !any(grep("N",fivemerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(fivemerReadingFrameCodon)!="*" & translateCodonToAminoAcid(fivemerReadingFrameCodonInputSeq)==translateCodonToAminoAcid(fivemerReadingFrameCodon)){ | |
| 1735 positionWithinCodon = which(fivemerCodonPos==3) | |
| 1736 BackgroundMatrix[fivemer] <- (BackgroundMatrix[fivemer] + probSMutations[fivemerReadingFrameCodon,positionWithinCodon]) | |
| 1737 } | |
| 1738 } | |
| 1739 #} | |
| 1740 } | |
| 1741 } | |
| 1742 | |
| 1743 #Mutations | |
| 1744 if(stopMutations==1) mutationCount = mutationCount[mutationCount!="Stop"] | |
| 1745 if(model==1) mutationCount = mutationCount[mutationCount=="S"] | |
| 1746 mutationPositions = as.numeric(names(mutationCount)) | |
| 1747 mutationCount = mutationCount[mutationPositions>2 & mutationPositions<(seqLength-1)] | |
| 1748 mutationPositions = mutationPositions[mutationPositions>2 & mutationPositions<(seqLength-1)] | |
| 1749 countMutations = 0 | |
| 1750 for(mutationPosition in mutationPositions){ | |
| 1751 fivemerIndex = mutationPosition-2 | |
| 1752 fivemer = fivemerSeq[fivemerIndex] | |
| 1753 GLfivemer = fivemerGL[fivemerIndex] | |
| 1754 fivemerCodonPos = fivemerCodon(fivemerIndex) | |
| 1755 fivemerReadingFrameCodon = substr(fivemer,fivemerCodonPos[1],fivemerCodonPos[3]) | |
| 1756 fivemerReadingFrameCodonGL = substr(GLfivemer,fivemerCodonPos[1],fivemerCodonPos[3]) | |
| 1757 if(!any(grep("N",fivemer)) & !any(grep("N",GLfivemer))){ | |
| 1758 if(model==0){ | |
| 1759 countMutations = countMutations + 1 | |
| 1760 MutationMatrix[GLfivemer] <- (MutationMatrix[GLfivemer] + 1) | |
| 1761 MutationCountMatrix[GLfivemer] <- (MutationCountMatrix[GLfivemer] + 1) | |
| 1762 }else{ | |
| 1763 if( translateCodonToAminoAcid(fivemerReadingFrameCodonGL)!="*" ){ | |
| 1764 countMutations = countMutations + 1 | |
| 1765 positionWithinCodon = which(fivemerCodonPos==3) | |
| 1766 glNuc = substr(fivemerReadingFrameCodonGL,positionWithinCodon,positionWithinCodon) | |
| 1767 inputNuc = substr(fivemerReadingFrameCodon,positionWithinCodon,positionWithinCodon) | |
| 1768 MutationMatrix[GLfivemer] <- (MutationMatrix[GLfivemer] + substitution[glNuc,inputNuc]) | |
| 1769 MutationCountMatrix[GLfivemer] <- (MutationCountMatrix[GLfivemer] + 1) | |
| 1770 } | |
| 1771 } | |
| 1772 } | |
| 1773 } | |
| 1774 | |
| 1775 seqMutability = MutationMatrix/BackgroundMatrix | |
| 1776 seqMutability = seqMutability/sum(seqMutability,na.rm=TRUE) | |
| 1777 #cat(inputMatrixIndex,"\t",countMutations,"\n") | |
| 1778 return(list("seqMutability" = seqMutability,"numbMutations" = countMutations,"seqMutabilityCount" = MutationCountMatrix, "BackgroundMatrix"=BackgroundMatrix)) | |
| 1779 | |
| 1780 } | |
| 1781 } | |
| 1782 | |
| 1783 } | |
| 1784 | |
| 1785 #Returns the codon position containing the middle nucleotide | |
| 1786 fivemerCodon <- function(fivemerIndex){ | |
| 1787 codonPos = list(2:4,1:3,3:5) | |
| 1788 fivemerType = fivemerIndex%%3 | |
| 1789 return(codonPos[[fivemerType+1]]) | |
| 1790 } | |
| 1791 | |
| 1792 #returns probability values for one mutation in codons resulting in R, S or Stop | |
| 1793 probMutations <- function(typeOfMutation){ | |
| 1794 matMutationProb <- matrix(0,ncol=3,nrow=125,dimnames=list(words(alphabet = c(NUCLEOTIDES,"N"), length=3),c(1:3))) | |
| 1795 for(codon in rownames(matMutationProb)){ | |
| 1796 if( !any(grep("N",codon)) ){ | |
| 1797 for(muPos in 1:3){ | |
| 1798 matCodon = matrix(rep(s2c(codon),3),nrow=3,ncol=3,byrow=T) | |
| 1799 glNuc = matCodon[1,muPos] | |
| 1800 matCodon[,muPos] = canMutateTo(glNuc) | |
| 1801 substitutionRate = substitution[glNuc,matCodon[,muPos]] | |
| 1802 typeOfMutations = apply(rbind(rep(codon,3),apply(matCodon,1,c2s)),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
| 1803 matMutationProb[codon,muPos] <- sum(substitutionRate[typeOfMutations==typeOfMutation]) | |
| 1804 } | |
| 1805 } | |
| 1806 } | |
| 1807 | |
| 1808 return(matMutationProb) | |
| 1809 } | |
| 1810 | |
| 1811 | |
| 1812 | |
| 1813 | |
| 1814 #Mapping Trinucleotides to fivemers | |
| 1815 mapTriToFivemer <- function(triMutability=triMutability_Literature_Human){ | |
| 1816 rownames(triMutability) <- triMutability_Names | |
| 1817 Fivemer<-rep(NA,1024) | |
| 1818 names(Fivemer)<-words(alphabet=NUCLEOTIDES,length=5) | |
| 1819 Fivemer<-sapply(names(Fivemer),function(Word)return(sum( c(triMutability[substring(Word,3,5),1],triMutability[substring(Word,2,4),2],triMutability[substring(Word,1,3),3]),na.rm=TRUE))) | |
| 1820 Fivemer<-Fivemer/sum(Fivemer) | |
| 1821 return(Fivemer) | |
| 1822 } | |
| 1823 | |
| 1824 collapseFivemerToTri<-function(Fivemer,Weights=MutabilityWeights,position=1,NUC="A"){ | |
| 1825 Indices<-substring(names(Fivemer),3,3)==NUC | |
| 1826 Factors<-substring(names(Fivemer[Indices]),(4-position),(6-position)) | |
| 1827 tapply(which(Indices),Factors,function(i)weighted.mean(Fivemer[i],Weights[i],na.rm=TRUE)) | |
| 1828 } | |
| 1829 | |
| 1830 | |
| 1831 | |
| 1832 CountFivemerToTri<-function(Fivemer,Weights=MutabilityWeights,position=1,NUC="A"){ | |
| 1833 Indices<-substring(names(Fivemer),3,3)==NUC | |
| 1834 Factors<-substring(names(Fivemer[Indices]),(4-position),(6-position)) | |
| 1835 tapply(which(Indices),Factors,function(i)sum(Weights[i],na.rm=TRUE)) | |
| 1836 } | |
| 1837 | |
| 1838 #Uses the real counts of the mutated fivemers | |
| 1839 CountFivemerToTri2<-function(Fivemer,Counts=MutabilityCounts,position=1,NUC="A"){ | |
| 1840 Indices<-substring(names(Fivemer),3,3)==NUC | |
| 1841 Factors<-substring(names(Fivemer[Indices]),(4-position),(6-position)) | |
| 1842 tapply(which(Indices),Factors,function(i)sum(Counts[i],na.rm=TRUE)) | |
| 1843 } | |
| 1844 | |
| 1845 bootstrap<-function(x=c(33,12,21),M=10000,alpha=0.05){ | |
| 1846 N<-sum(x) | |
| 1847 if(N){ | |
| 1848 p<-x/N | |
| 1849 k<-length(x)-1 | |
| 1850 tmp<-rmultinom(M, size = N, prob=p) | |
| 1851 tmp_p<-apply(tmp,2,function(y)y/N) | |
| 1852 (apply(tmp_p,1,function(y)quantile(y,c(alpha/2/k,1-alpha/2/k)))) | |
| 1853 } | |
| 1854 else return(matrix(0,2,length(x))) | |
| 1855 } | |
| 1856 | |
| 1857 | |
| 1858 | |
| 1859 | |
| 1860 bootstrap2<-function(x=c(33,12,21),n=10,M=10000,alpha=0.05){ | |
| 1861 | |
| 1862 N<-sum(x) | |
| 1863 k<-length(x) | |
| 1864 y<-rep(1:k,x) | |
| 1865 tmp<-sapply(1:M,function(i)sample(y,n)) | |
| 1866 if(n>1)tmp_p<-sapply(1:M,function(j)sapply(1:k,function(i)sum(tmp[,j]==i)))/n | |
| 1867 if(n==1)tmp_p<-sapply(1:M,function(j)sapply(1:k,function(i)sum(tmp[j]==i)))/n | |
| 1868 (apply(tmp_p,1,function(z)quantile(z,c(alpha/2/(k-1),1-alpha/2/(k-1))))) | |
| 1869 } | |
| 1870 | |
| 1871 | |
| 1872 | |
| 1873 p_value<-function(x=c(33,12,21),M=100000,x_obs=c(2,5,3)){ | |
| 1874 n=sum(x_obs) | |
| 1875 N<-sum(x) | |
| 1876 k<-length(x) | |
| 1877 y<-rep(1:k,x) | |
| 1878 tmp<-sapply(1:M,function(i)sample(y,n)) | |
| 1879 if(n>1)tmp_p<-sapply(1:M,function(j)sapply(1:k,function(i)sum(tmp[,j]==i))) | |
| 1880 if(n==1)tmp_p<-sapply(1:M,function(j)sapply(1:k,function(i)sum(tmp[j]==i))) | |
| 1881 tmp<-rbind(sapply(1:3,function(i)sum(tmp_p[i,]>=x_obs[i])/M), | |
| 1882 sapply(1:3,function(i)sum(tmp_p[i,]<=x_obs[i])/M)) | |
| 1883 sapply(1:3,function(i){if(tmp[1,i]>=tmp[2,i])return(-tmp[2,i])else return(tmp[1,i])}) | |
| 1884 } | |
| 1885 | |
| 1886 #"D:\\Sequences\\IMGT Germlines\\Human_SNPless_IGHJ.FASTA" | |
| 1887 # Remove SNPs from IMGT germline segment alleles | |
| 1888 generateUnambiguousRepertoire <- function(repertoireInFile,repertoireOutFile){ | |
| 1889 repertoireIn <- read.fasta(repertoireInFile, seqtype="DNA",as.string=T,set.attributes=F,forceDNAtolower=F) | |
| 1890 alleleNames <- sapply(names(repertoireIn),function(x)strsplit(x,"|",fixed=TRUE)[[1]][2]) | |
| 1891 SNPs <- tapply(repertoireIn,sapply(alleleNames,function(x)strsplit(x,"*",fixed=TRUE)[[1]][1]),function(x){ | |
| 1892 Indices<-NULL | |
| 1893 for(i in 1:length(x)){ | |
| 1894 firstSeq = s2c(x[[1]]) | |
| 1895 iSeq = s2c(x[[i]]) | |
| 1896 Indices<-c(Indices,which(firstSeq[1:320]!=iSeq[1:320] & firstSeq[1:320]!="." & iSeq[1:320]!="." )) | |
| 1897 } | |
| 1898 return(sort(unique(Indices))) | |
| 1899 }) | |
| 1900 repertoireOut <- repertoireIn | |
| 1901 repertoireOut <- lapply(names(repertoireOut), function(repertoireName){ | |
| 1902 alleleName <- strsplit(repertoireName,"|",fixed=TRUE)[[1]][2] | |
| 1903 geneSegmentName <- strsplit(alleleName,"*",fixed=TRUE)[[1]][1] | |
| 1904 alleleSeq <- s2c(repertoireOut[[repertoireName]]) | |
| 1905 alleleSeq[as.numeric(unlist(SNPs[geneSegmentName]))] <- "N" | |
| 1906 alleleSeq <- c2s(alleleSeq) | |
| 1907 repertoireOut[[repertoireName]] <- alleleSeq | |
| 1908 }) | |
| 1909 names(repertoireOut) <- names(repertoireIn) | |
| 1910 write.fasta(repertoireOut,names(repertoireOut),file.out=repertoireOutFile) | |
| 1911 | |
| 1912 } | |
| 1913 | |
| 1914 | |
| 1915 | |
| 1916 | |
| 1917 | |
| 1918 | |
| 1919 ############ | |
| 1920 groupBayes2 = function(indexes, param_resultMat){ | |
| 1921 | |
| 1922 BayesGDist_Focused_CDR = calculate_bayesG( x=param_resultMat[indexes,1], N=apply(param_resultMat[indexes,c(1,2,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[1]/(x[1]+x[2]+x[4])})) | |
| 1923 BayesGDist_Focused_FWR = calculate_bayesG( x=param_resultMat[indexes,3], N=apply(param_resultMat[indexes,c(3,2,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[3]/(x[3]+x[2]+x[4])})) | |
| 1924 #BayesGDist_Local_CDR = calculate_bayesG( x=param_resultMat[indexes,1], N=apply(param_resultMat[indexes,c(1,2)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[1]/(x[1]+x[2])})) | |
| 1925 #BayesGDist_Local_FWR = calculate_bayesG( x=param_resultMat[indexes,3], N=apply(param_resultMat[indexes,c(3,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[3]/(x[3]+x[4])})) | |
| 1926 #BayesGDist_Global_CDR = calculate_bayesG( x=param_resultMat[indexes,1], N=apply(param_resultMat[indexes,c(1,2,3,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[1]/(x[1]+x[2]+x[3]+x[4])})) | |
| 1927 #BayesGDist_Global_FWR = calculate_bayesG( x=param_resultMat[indexes,3], N=apply(param_resultMat[indexes,c(1,2,3,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[3]/(x[1]+x[2]+x[3]+x[4])})) | |
| 1928 return ( list("BayesGDist_Focused_CDR"=BayesGDist_Focused_CDR, | |
| 1929 "BayesGDist_Focused_FWR"=BayesGDist_Focused_FWR) ) | |
| 1930 #"BayesGDist_Local_CDR"=BayesGDist_Local_CDR, | |
| 1931 #"BayesGDist_Local_FWR" = BayesGDist_Local_FWR)) | |
| 1932 # "BayesGDist_Global_CDR" = BayesGDist_Global_CDR, | |
| 1933 # "BayesGDist_Global_FWR" = BayesGDist_Global_FWR) ) | |
| 1934 | |
| 1935 | |
| 1936 } | |
| 1937 | |
| 1938 | |
| 1939 calculate_bayesG <- function( x=array(), N=array(), p=array(), max_sigma=20, length_sigma=4001){ | |
| 1940 G <- max(length(x),length(N),length(p)) | |
| 1941 x=array(x,dim=G) | |
| 1942 N=array(N,dim=G) | |
| 1943 p=array(p,dim=G) | |
| 1944 | |
| 1945 indexOfZero = N>0 & p>0 | |
| 1946 N = N[indexOfZero] | |
| 1947 x = x[indexOfZero] | |
| 1948 p = p[indexOfZero] | |
| 1949 G <- length(x) | |
| 1950 | |
| 1951 if(G){ | |
| 1952 | |
| 1953 cons<-array( dim=c(length_sigma,G) ) | |
| 1954 if(G==1) { | |
| 1955 return(calculate_bayes(x=x[G],N=N[G],p=p[G],max_sigma=max_sigma,length_sigma=length_sigma)) | |
| 1956 } | |
| 1957 else { | |
| 1958 for(g in 1:G) cons[,g] <- calculate_bayes(x=x[g],N=N[g],p=p[g],max_sigma=max_sigma,length_sigma=length_sigma) | |
| 1959 listMatG <- convolutionPowersOfTwoByTwos(cons,length_sigma=length_sigma) | |
| 1960 y<-calculate_bayesGHelper(listMatG,length_sigma=length_sigma) | |
| 1961 return( y/sum(y)/(2*max_sigma/(length_sigma-1)) ) | |
| 1962 } | |
| 1963 }else{ | |
| 1964 return(NA) | |
| 1965 } | |
| 1966 } | |
| 1967 | |
| 1968 | |
| 1969 calculate_bayesGHelper <- function( listMatG,length_sigma=4001 ){ | |
| 1970 matG <- listMatG[[1]] | |
| 1971 groups <- listMatG[[2]] | |
| 1972 i = 1 | |
| 1973 resConv <- matG[,i] | |
| 1974 denom <- 2^groups[i] | |
| 1975 if(length(groups)>1){ | |
| 1976 while( i<length(groups) ){ | |
| 1977 i = i + 1 | |
| 1978 resConv <- weighted_conv(resConv, matG[,i], w= {{2^groups[i]}/denom} ,length_sigma=length_sigma) | |
| 1979 #cat({{2^groups[i]}/denom},"\n") | |
| 1980 denom <- denom + 2^groups[i] | |
| 1981 } | |
| 1982 } | |
| 1983 return(resConv) | |
| 1984 } | |
| 1985 | |
| 1986 weighted_conv<-function(x,y,w=1,m=100,length_sigma=4001){ | |
| 1987 lx<-length(x) | |
| 1988 ly<-length(y) | |
| 1989 if({lx<m}| {{lx*w}<m}| {{ly}<m}| {{ly*w}<m}){ | |
| 1990 if(w<1){ | |
| 1991 y1<-approx(1:ly,y,seq(1,ly,length.out=m))$y | |
| 1992 x1<-approx(1:lx,x,seq(1,lx,length.out=m/w))$y | |
| 1993 lx<-length(x1) | |
| 1994 ly<-length(y1) | |
| 1995 } | |
| 1996 else { | |
| 1997 y1<-approx(1:ly,y,seq(1,ly,length.out=m*w))$y | |
| 1998 x1<-approx(1:lx,x,seq(1,lx,length.out=m))$y | |
| 1999 lx<-length(x1) | |
| 2000 ly<-length(y1) | |
| 2001 } | |
| 2002 } | |
| 2003 else{ | |
| 2004 x1<-x | |
| 2005 y1<-approx(1:ly,y,seq(1,ly,length.out=floor(lx*w)))$y | |
| 2006 ly<-length(y1) | |
| 2007 } | |
| 2008 tmp<-approx(x=1:(lx+ly-1),y=convolve(x1,rev(y1),type="open"),xout=seq(1,lx+ly-1,length.out=length_sigma))$y | |
| 2009 tmp[tmp<=0] = 0 | |
| 2010 return(tmp/sum(tmp)) | |
| 2011 } | |
| 2012 | |
| 2013 ######################## | |
| 2014 | |
| 2015 | |
| 2016 | |
| 2017 | |
| 2018 mutabilityMatrixONE<-rep(0,4) | |
| 2019 names(mutabilityMatrixONE)<-NUCLEOTIDES | |
| 2020 | |
| 2021 # triMutability Background Count | |
| 2022 buildMutabilityModelONE <- function( inputMatrixIndex, model=0 , multipleMutation=0, seqWithStops=0, stopMutations=0){ | |
| 2023 | |
| 2024 #rowOrigMatInput = matInput[inputMatrixIndex,] | |
| 2025 seqGL = gsub("-", "", matInput[inputMatrixIndex,2]) | |
| 2026 seqInput = gsub("-", "", matInput[inputMatrixIndex,1]) | |
| 2027 matInput[inputMatrixIndex,] <<- c(seqInput,seqGL) | |
| 2028 seqLength = nchar(seqGL) | |
| 2029 mutationCount <- analyzeMutations(inputMatrixIndex, model, multipleMutation, seqWithStops) | |
| 2030 BackgroundMatrix = mutabilityMatrixONE | |
| 2031 MutationMatrix = mutabilityMatrixONE | |
| 2032 MutationCountMatrix = mutabilityMatrixONE | |
| 2033 if(!is.na(mutationCount)){ | |
| 2034 if((stopMutations==0 & model==0) | (stopMutations==1 & (sum(mutationCount=="Stop")<length(mutationCount))) | (model==1 & (sum(mutationCount=="S")>0)) ){ | |
| 2035 | |
| 2036 # ONEmerStartPos = 1:(seqLength) | |
| 2037 # ONEmerLength <- length(ONEmerStartPos) | |
| 2038 ONEmerGL <- s2c(seqGL) | |
| 2039 ONEmerSeq <- s2c(seqInput) | |
| 2040 | |
| 2041 #Background | |
| 2042 for(ONEmerIndex in 1:seqLength){ | |
| 2043 ONEmer = ONEmerGL[ONEmerIndex] | |
| 2044 if(ONEmer!="N"){ | |
| 2045 ONEmerCodonPos = getCodonPos(ONEmerIndex) | |
| 2046 ONEmerReadingFrameCodon = c2s(ONEmerGL[ONEmerCodonPos]) | |
| 2047 ONEmerReadingFrameCodonInputSeq = c2s(ONEmerSeq[ONEmerCodonPos] ) | |
| 2048 | |
| 2049 # All mutations model | |
| 2050 #if(!any(grep("N",ONEmerReadingFrameCodon))){ | |
| 2051 if(model==0){ | |
| 2052 if(stopMutations==0){ | |
| 2053 if(!any(grep("N",ONEmerReadingFrameCodonInputSeq))) | |
| 2054 BackgroundMatrix[ONEmer] <- (BackgroundMatrix[ONEmer] + 1) | |
| 2055 }else{ | |
| 2056 if( !any(grep("N",ONEmerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(ONEmerReadingFrameCodonInputSeq)!="*"){ | |
| 2057 positionWithinCodon = which(ONEmerCodonPos==ONEmerIndex)#positionsWithinCodon[(ONEmerCodonPos[1]%%3)+1] | |
| 2058 BackgroundMatrix[ONEmer] <- (BackgroundMatrix[ONEmer] + probNonStopMutations[ONEmerReadingFrameCodon,positionWithinCodon]) | |
| 2059 } | |
| 2060 } | |
| 2061 }else{ # Only silent mutations | |
| 2062 if( !any(grep("N",ONEmerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(ONEmerReadingFrameCodonInputSeq)!="*" & translateCodonToAminoAcid(ONEmerReadingFrameCodonInputSeq)==translateCodonToAminoAcid(ONEmerReadingFrameCodon) ){ | |
| 2063 positionWithinCodon = which(ONEmerCodonPos==ONEmerIndex) | |
| 2064 BackgroundMatrix[ONEmer] <- (BackgroundMatrix[ONEmer] + probSMutations[ONEmerReadingFrameCodon,positionWithinCodon]) | |
| 2065 } | |
| 2066 } | |
| 2067 } | |
| 2068 } | |
| 2069 } | |
| 2070 | |
| 2071 #Mutations | |
| 2072 if(stopMutations==1) mutationCount = mutationCount[mutationCount!="Stop"] | |
| 2073 if(model==1) mutationCount = mutationCount[mutationCount=="S"] | |
| 2074 mutationPositions = as.numeric(names(mutationCount)) | |
| 2075 mutationCount = mutationCount[mutationPositions>2 & mutationPositions<(seqLength-1)] | |
| 2076 mutationPositions = mutationPositions[mutationPositions>2 & mutationPositions<(seqLength-1)] | |
| 2077 countMutations = 0 | |
| 2078 for(mutationPosition in mutationPositions){ | |
| 2079 ONEmerIndex = mutationPosition | |
| 2080 ONEmer = ONEmerSeq[ONEmerIndex] | |
| 2081 GLONEmer = ONEmerGL[ONEmerIndex] | |
| 2082 ONEmerCodonPos = getCodonPos(ONEmerIndex) | |
| 2083 ONEmerReadingFrameCodon = c2s(ONEmerSeq[ONEmerCodonPos]) | |
| 2084 ONEmerReadingFrameCodonGL =c2s(ONEmerGL[ONEmerCodonPos]) | |
| 2085 if(!any(grep("N",ONEmer)) & !any(grep("N",GLONEmer))){ | |
| 2086 if(model==0){ | |
| 2087 countMutations = countMutations + 1 | |
| 2088 MutationMatrix[GLONEmer] <- (MutationMatrix[GLONEmer] + 1) | |
| 2089 MutationCountMatrix[GLONEmer] <- (MutationCountMatrix[GLONEmer] + 1) | |
| 2090 }else{ | |
| 2091 if( translateCodonToAminoAcid(ONEmerReadingFrameCodonGL)!="*" ){ | |
| 2092 countMutations = countMutations + 1 | |
| 2093 positionWithinCodon = which(ONEmerCodonPos==ONEmerIndex) | |
| 2094 glNuc = substr(ONEmerReadingFrameCodonGL,positionWithinCodon,positionWithinCodon) | |
| 2095 inputNuc = substr(ONEmerReadingFrameCodon,positionWithinCodon,positionWithinCodon) | |
| 2096 MutationMatrix[GLONEmer] <- (MutationMatrix[GLONEmer] + substitution[glNuc,inputNuc]) | |
| 2097 MutationCountMatrix[GLONEmer] <- (MutationCountMatrix[GLONEmer] + 1) | |
| 2098 } | |
| 2099 } | |
| 2100 } | |
| 2101 } | |
| 2102 | |
| 2103 seqMutability = MutationMatrix/BackgroundMatrix | |
| 2104 seqMutability = seqMutability/sum(seqMutability,na.rm=TRUE) | |
| 2105 #cat(inputMatrixIndex,"\t",countMutations,"\n") | |
| 2106 return(list("seqMutability" = seqMutability,"numbMutations" = countMutations,"seqMutabilityCount" = MutationCountMatrix, "BackgroundMatrix"=BackgroundMatrix)) | |
| 2107 # tmp<-list("seqMutability" = seqMutability,"numbMutations" = countMutations,"seqMutabilityCount" = MutationCountMatrix) | |
| 2108 } | |
| 2109 } | |
| 2110 | |
| 2111 ################ | |
| 2112 # $Id: trim.R 989 2006-10-29 15:28:26Z ggorjan $ | |
| 2113 | |
| 2114 trim <- function(s, recode.factor=TRUE, ...) | |
| 2115 UseMethod("trim", s) | |
| 2116 | |
| 2117 trim.default <- function(s, recode.factor=TRUE, ...) | |
| 2118 s | |
| 2119 | |
| 2120 trim.character <- function(s, recode.factor=TRUE, ...) | |
| 2121 { | |
| 2122 s <- sub(pattern="^ +", replacement="", x=s) | |
| 2123 s <- sub(pattern=" +$", replacement="", x=s) | |
| 2124 s | |
| 2125 } | |
| 2126 | |
| 2127 trim.factor <- function(s, recode.factor=TRUE, ...) | |
| 2128 { | |
| 2129 levels(s) <- trim(levels(s)) | |
| 2130 if(recode.factor) { | |
| 2131 dots <- list(x=s, ...) | |
| 2132 if(is.null(dots$sort)) dots$sort <- sort | |
| 2133 s <- do.call(what=reorder.factor, args=dots) | |
| 2134 } | |
| 2135 s | |
| 2136 } | |
| 2137 | |
| 2138 trim.list <- function(s, recode.factor=TRUE, ...) | |
| 2139 lapply(s, trim, recode.factor=recode.factor, ...) | |
| 2140 | |
| 2141 trim.data.frame <- function(s, recode.factor=TRUE, ...) | |
| 2142 { | |
| 2143 s[] <- trim.list(s, recode.factor=recode.factor, ...) | |
| 2144 s | |
| 2145 } | |
| 2146 ####################################### | |
| 2147 # Compute the expected for each sequence-germline pair by codon | |
| 2148 getExpectedIndividualByCodon <- function(matInput){ | |
| 2149 if( any(grep("multicore",search())) ){ | |
| 2150 facGL <- factor(matInput[,2]) | |
| 2151 facLevels = levels(facGL) | |
| 2152 LisGLs_MutabilityU = mclapply(1:length(facLevels), function(x){ | |
| 2153 computeMutabilities(facLevels[x]) | |
| 2154 }) | |
| 2155 facIndex = match(facGL,facLevels) | |
| 2156 | |
| 2157 LisGLs_Mutability = mclapply(1:nrow(matInput), function(x){ | |
| 2158 cInput = rep(NA,nchar(matInput[x,1])) | |
| 2159 cInput[s2c(matInput[x,1])!="N"] = 1 | |
| 2160 LisGLs_MutabilityU[[facIndex[x]]] * cInput | |
| 2161 }) | |
| 2162 | |
| 2163 LisGLs_Targeting = mclapply(1:dim(matInput)[1], function(x){ | |
| 2164 computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) | |
| 2165 }) | |
| 2166 | |
| 2167 LisGLs_MutationTypes = mclapply(1:length(matInput[,2]),function(x){ | |
| 2168 #print(x) | |
| 2169 computeMutationTypes(matInput[x,2]) | |
| 2170 }) | |
| 2171 | |
| 2172 LisGLs_R_Exp = mclapply(1:nrow(matInput), function(x){ | |
| 2173 Exp_R <- rollapply(as.zoo(1:readEnd),width=3,by=3, | |
| 2174 function(codonNucs){ | |
| 2175 RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R") | |
| 2176 sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T ) | |
| 2177 } | |
| 2178 ) | |
| 2179 }) | |
| 2180 | |
| 2181 LisGLs_S_Exp = mclapply(1:nrow(matInput), function(x){ | |
| 2182 Exp_S <- rollapply(as.zoo(1:readEnd),width=3,by=3, | |
| 2183 function(codonNucs){ | |
| 2184 SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S") | |
| 2185 sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T ) | |
| 2186 } | |
| 2187 ) | |
| 2188 }) | |
| 2189 | |
| 2190 Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) | |
| 2191 Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) | |
| 2192 return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) ) | |
| 2193 }else{ | |
| 2194 facGL <- factor(matInput[,2]) | |
| 2195 facLevels = levels(facGL) | |
| 2196 LisGLs_MutabilityU = lapply(1:length(facLevels), function(x){ | |
| 2197 computeMutabilities(facLevels[x]) | |
| 2198 }) | |
| 2199 facIndex = match(facGL,facLevels) | |
| 2200 | |
| 2201 LisGLs_Mutability = lapply(1:nrow(matInput), function(x){ | |
| 2202 cInput = rep(NA,nchar(matInput[x,1])) | |
| 2203 cInput[s2c(matInput[x,1])!="N"] = 1 | |
| 2204 LisGLs_MutabilityU[[facIndex[x]]] * cInput | |
| 2205 }) | |
| 2206 | |
| 2207 LisGLs_Targeting = lapply(1:dim(matInput)[1], function(x){ | |
| 2208 computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) | |
| 2209 }) | |
| 2210 | |
| 2211 LisGLs_MutationTypes = lapply(1:length(matInput[,2]),function(x){ | |
| 2212 #print(x) | |
| 2213 computeMutationTypes(matInput[x,2]) | |
| 2214 }) | |
| 2215 | |
| 2216 LisGLs_R_Exp = lapply(1:nrow(matInput), function(x){ | |
| 2217 Exp_R <- rollapply(as.zoo(1:readEnd),width=3,by=3, | |
| 2218 function(codonNucs){ | |
| 2219 RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R") | |
| 2220 sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T ) | |
| 2221 } | |
| 2222 ) | |
| 2223 }) | |
| 2224 | |
| 2225 LisGLs_S_Exp = lapply(1:nrow(matInput), function(x){ | |
| 2226 Exp_S <- rollapply(as.zoo(1:readEnd),width=3,by=3, | |
| 2227 function(codonNucs){ | |
| 2228 SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S") | |
| 2229 sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T ) | |
| 2230 } | |
| 2231 ) | |
| 2232 }) | |
| 2233 | |
| 2234 Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) | |
| 2235 Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) | |
| 2236 return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) ) | |
| 2237 } | |
| 2238 } | |
| 2239 | |
| 2240 # getObservedMutationsByCodon <- function(listMutations){ | |
| 2241 # numbSeqs <- length(listMutations) | |
| 2242 # obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3)))) | |
| 2243 # obsMu_S <- obsMu_R | |
| 2244 # temp <- mclapply(1:length(listMutations), function(i){ | |
| 2245 # arrMutations = listMutations[[i]] | |
| 2246 # RPos = as.numeric(names(arrMutations)[arrMutations=="R"]) | |
| 2247 # RPos <- sapply(RPos,getCodonNumb) | |
| 2248 # if(any(RPos)){ | |
| 2249 # tabR <- table(RPos) | |
| 2250 # obsMu_R[i,as.numeric(names(tabR))] <<- tabR | |
| 2251 # } | |
| 2252 # | |
| 2253 # SPos = as.numeric(names(arrMutations)[arrMutations=="S"]) | |
| 2254 # SPos <- sapply(SPos,getCodonNumb) | |
| 2255 # if(any(SPos)){ | |
| 2256 # tabS <- table(SPos) | |
| 2257 # obsMu_S[i,names(tabS)] <<- tabS | |
| 2258 # } | |
| 2259 # } | |
| 2260 # ) | |
| 2261 # return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) | |
| 2262 # } | |
| 2263 | |
| 2264 getObservedMutationsByCodon <- function(listMutations){ | |
| 2265 numbSeqs <- length(listMutations) | |
| 2266 obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3)))) | |
| 2267 obsMu_S <- obsMu_R | |
| 2268 temp <- lapply(1:length(listMutations), function(i){ | |
| 2269 arrMutations = listMutations[[i]] | |
| 2270 RPos = as.numeric(names(arrMutations)[arrMutations=="R"]) | |
| 2271 RPos <- sapply(RPos,getCodonNumb) | |
| 2272 if(any(RPos)){ | |
| 2273 tabR <- table(RPos) | |
| 2274 obsMu_R[i,as.numeric(names(tabR))] <<- tabR | |
| 2275 } | |
| 2276 | |
| 2277 SPos = as.numeric(names(arrMutations)[arrMutations=="S"]) | |
| 2278 SPos <- sapply(SPos,getCodonNumb) | |
| 2279 if(any(SPos)){ | |
| 2280 tabS <- table(SPos) | |
| 2281 obsMu_S[i,names(tabS)] <<- tabS | |
| 2282 } | |
| 2283 } | |
| 2284 ) | |
| 2285 return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) | |
| 2286 } | |
| 2287 | 
