Mercurial > repos > davidvanzessen > mutation_analysis
changeset 91:5e237c243088 draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 03 Jun 2016 10:35:52 -0400 |
parents | f0e8dac22c6e |
children | b869a126e2c4 |
files | merge_and_filter.r sequence_overview.r |
diffstat | 2 files changed, 17 insertions(+), 22 deletions(-) [+] |
line wrap: on
line diff
--- a/merge_and_filter.r Wed Jun 01 05:03:24 2016 -0400
+++ b/merge_and_filter.r Fri Jun 03 10:35:52 2016 -0400
@@ -116,15 +116,7 @@
 print(paste("Number of sequences in result after merging with sequences:", nrow(result)))
-print(paste("Number of N in CDR1:", sum(grepl("n|N", result$CDR1.IMGT.seq))))
-print(paste("Number of N in FR2:", sum(grepl("n|N", result$FR2.IMGT.seq))))
-print(paste("Number of N in CDR2:", sum(grepl("n|N", result$CDR2.IMGT.seq))))
-print(paste("Number of N in FR3:", sum(grepl("n|N", result$FR3.IMGT.seq))))
-
-print(paste("Number of sequences with N in CDR1 or FR2 or CDR2 or FR3:", sum(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq))))
-
-
-result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq)),]
+result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]
 print(paste("Number of sequences in result after n filtering:", nrow(result)))
--- a/sequence_overview.r Wed Jun 01 05:03:24 2016 -0400
+++ b/sequence_overview.r Fri Jun 03 10:35:52 2016 -0400
@@ -22,11 +22,13 @@
 #dat = data.frame(data.table(dat)[, list(freq=.N), by=c("best_match", "seq_conc")])
-dat = data.frame(table(merged$seq_conc, merged$Functionality))
+dat = data.frame(table(merged$seq_conc))
+#dat = data.frame(table(merged$seq_conc, merged$Functionality))
 #dat = dat[dat$Freq > 1,]
-names(dat) = c("seq_conc", "Functionality", "Freq")
+#names(dat) = c("seq_conc", "Functionality", "Freq")
+names(dat) = c("seq_conc", "Freq")
 dat$seq_conc = factor(dat$seq_conc)
@@ -43,15 +45,16 @@
 cat("<tr><th>Sequence</th><th>Functionality</th><th>ca1</th><th>ca2</th><th>cg1</th><th>cg2</th><th>cg3</th><th>cg4</th><th>cm</th></tr>", file=main.html, append=T)
 for(i in 1:nrow(dat)){
- ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "ca1",]
- ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "ca2",]
+ ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca1", IDs$best_match),]
+ ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca2", IDs$best_match),]
- cg1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg1",]
- cg2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg2",]
- cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg3",]
- cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg4",]
+ cg1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg1", IDs$best_match),]
+ cg2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg2", IDs$best_match),]
+ cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg3", IDs$best_match),]
+ cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg4", IDs$best_match),]
- cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cm",]
+ cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cm", IDs$best_match),]
+ allc = rbind(ca1, ca2, cg1, cg2, cg3, cg4, cm)
  classes = c(nrow(ca1), nrow(ca2), nrow(cg1), nrow(cg2), nrow(cg3), nrow(cg4), nrow(cm))
@@ -63,7 +66,7 @@
  id = as.numeric(dat[i,"seq_conc"])
- functionality = dat[i,"Functionality"]
+ functionality = paste(unique(allc[,"Functionality"], sep=","))
  if(nrow(ca1) > 0){
   cat(tbl(ca1), file=paste("ca1_", id, ".html", sep=""))
@@ -104,6 +107,7 @@
  cm.html = make.link(id, "cm", nrow(cm))
  rw = c(as.character(dat[i,"seq_conc"]), as.character(functionality), ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html)
+ #print(rw)
  cat(tr(rw), file=main.html, append=T)
 }
@@ -132,7 +136,7 @@
 NTresult = data.frame(nt=c("A", "C", "T", "G"))
 for(clazz in gene.classes){
- NToverview.sub = NToverview[grepl(clazz, paste("^", NToverview$best_match, sep="")),]
+ NToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]
  new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
  new.col.y = sum(new.col.x)
  new.col.z = round(new.col.x / new.col.y * 100, 2)
@@ -158,8 +162,7 @@
 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
-
-write.table(NToverview, NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
+write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)