# HG changeset patch # User davidvanzessen # Date 1462779939 14400 # Node ID becea91089ed44623660dfea15052e42db8f7d04 # Parent 14749ced7ff2bd16c5bd1fe876114f0aec516e3d Uploaded diff -r 14749ced7ff2 -r becea91089ed merge_and_filter.r diff -r 14749ced7ff2 -r becea91089ed mutation_analysis.py --- a/mutation_analysis.py Wed May 04 08:44:16 2016 -0400 +++ b/mutation_analysis.py Mon May 09 03:45:39 2016 -0400 @@ -57,7 +57,6 @@ linesplt = line.split("\t") ID = linesplt[IDIndex] genedic[ID] = linesplt[best_matchIndex] - print line try: mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] diff -r 14749ced7ff2 -r becea91089ed mutation_analysis.r diff -r 14749ced7ff2 -r becea91089ed sequence_overview.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sequence_overview.r Mon May 09 03:45:39 2016 -0400 @@ -0,0 +1,99 @@ +library(reshape2) + +args <- commandArgs(trailingOnly = TRUE) + +gene.matches = args[1] +sequence.file = args[2] +outputdir = args[3] +main.html = "index.html" + +setwd(outputdir) + +genes = read.table(gene.matches, header=T, sep="\t", fill=T) +sequences = read.table(sequence.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") + +dat = merge(sequences, genes, by="Sequence.ID") + +dat$seq_conc = paste(dat$CDR1.IMGT, dat$CDR2.IMGT, dat$CDR3.IMGT, dat$FR2.IMGT, dat$FR3.IMGT) + +IDs = dat[,c("Sequence.ID", "seq_conc", "best_match")] +IDs$best_match = as.character(IDs$best_match) + +#dat = data.frame(data.table(dat)[, list(freq=.N), by=c("best_match", "seq_conc")]) + +dat = data.frame(table(dat$best_match, dat$seq_conc)) + +dat = dat[dat$Freq > 1,] + +names(dat) = c("best_match", "seq_conc", "Freq") + +dat$seq_conc = factor(dat$seq_conc) + +dat = dat[order(nchar(as.character(dat$seq_conc))),] + +#writing html from R... +td = function(val) { paste("", val, "", sep="") } +tr = function(val) { capture.output(cat("", td(val), "", sep="")) } +make.link = function(id, clss, val) { paste("", val, "", sep="") } +tbl = function(df) { res = ""; for(i in 1:nrow(df)){ res = paste(res, tr(df[i,]), sep=""); }; res = paste(res, "

"); } + +cat("", file=main.html, append=F) +cat("", file=main.html, append=T) + +for(i in 1:nrow(dat)){ + ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "ca1",] + ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "ca2",] + + cg1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg1",] + cg2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg2",] + cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg3",] + cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg4",] + + cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cm",] + + id = as.numeric(dat[i,"seq_conc"]) + + if(nrow(ca1) > 0){ + cat(tbl(ca1), file=paste("ca1_", id, ".html", sep="")) + } + + if(nrow(ca2) > 0){ + cat(tbl(ca2), file=paste("ca2_", id, ".html", sep="")) + } + + if(nrow(cg1) > 0){ + cat(tbl(cg1), file=paste("cg1_", id, ".html", sep="")) + } + + if(nrow(cg2) > 0){ + cat(tbl(cg2), file=paste("cg2_", id, ".html", sep="")) + } + + if(nrow(cg3) > 0){ + cat(tbl(cg3), file=paste("cg3_", id, ".html", sep="")) + } + + if(nrow(cg4) > 0){ + cat(tbl(cg4), file=paste("cg4_", id, ".html", sep="")) + } + + if(nrow(cm) > 0){ + cat(tbl(cm), file=paste("cm_", id, ".html", sep="")) + } + + ca1.html = make.link(id, "ca1", nrow(ca1)) + ca2.html = make.link(id, "ca2", nrow(ca2)) + + cg1.html = make.link(id, "cg1", nrow(cg1)) + cg2.html = make.link(id, "cg2", nrow(cg2)) + cg3.html = make.link(id, "cg3", nrow(cg3)) + cg4.html = make.link(id, "cg4", nrow(cg4)) + + cm.html = make.link(id, "cm", nrow(cm)) + + rw = c(as.character(dat[i,"seq_conc"]), ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html) + + cat(tr(rw), file=main.html, append=T) +} + +cat("

Sequence	ca1	ca2	cg1	cg2	cg3	cg4	cm

", file=main.html, append=T) diff -r 14749ced7ff2 -r becea91089ed wrapper.sh --- a/wrapper.sh Wed May 04 08:44:16 2016 -0400 +++ b/wrapper.sh Mon May 09 03:45:39 2016 -0400 @@ -229,7 +229,11 @@ Rscript $dir/naive_output.r $naive_output_ca $outdir/merged.txt $naive_output_ca $naive_output_cg $naive_output_cm 2>&1 fi +echo "---------------- sequence_overview.r ----------------" +mkdir $outdir/sequence_overview + +Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/sequence_overview 2>&1