changeset 110:ade5cf6fd2dc draft

Uploaded
author davidvanzessen
date Tue, 02 Aug 2016 08:30:23 -0400
parents 0096cd454380
children de3331cf6368
files aa_histogram.r merge_and_filter.r mutation_analysis.py mutation_analysis.r mutation_analysis.xml style.tar.gz tmp/igat.r wrapper.sh
diffstat 8 files changed, 244 insertions(+), 138 deletions(-) [+]
line wrap: on
line diff
--- a/aa_histogram.r	Thu Jul 14 07:29:56 2016 -0400
+++ b/aa_histogram.r	Tue Aug 02 08:30:23 2016 -0400
@@ -15,47 +15,47 @@
 absent.aa.by.id = read.table(absent.aa.by.id.file, sep="\t", fill=T, header=T, quote="")
 
 for(gene in genes){
-	
-	if(gene == ""){
-		mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),]
-		absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),]
-	} else {
-		mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),]
-		absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),]
-	}
-	if(nrow(mutations.by.id.gene) == 0){
-		next
-	}
-	
-	print(paste("nrow", gene, nrow(absent.aa.by.id.gene)))
-	
-	mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)])
-	aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)])
 
-	dat_freq = mutations.at.position / aa.at.position
-	dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
+        if(gene == ""){
+                mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),]
+                absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),]
+        } else {
+                mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),]
+                absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),]
+        }
+        print(paste("nrow", gene, nrow(absent.aa.by.id.gene)))
+        if(nrow(mutations.by.id.gene) == 0){
+                next
+        }
 
-	print("---------------- plot ----------------")
+        mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)])
+        aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)])
+
+        dat_freq = mutations.at.position / aa.at.position
+        dat_freq[is.na(dat_freq)] = 0
+        dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
+
+        print("---------------- plot ----------------")
 
-	m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
-	m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=1:length(dat_freq), labels=1:length(dat_freq))
-	m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
-	m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
-	m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
-	m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
-	m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
-	m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(paste(gene, "AA mutation frequency"))
+        m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
+        m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i)
+        m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
+        m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
+        m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
+        m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
+        m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
+        m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(paste(gene, "AA mutation frequency"))
+
+        print("---------------- write/print ----------------")
 
-	print("---------------- write/print ----------------")
-	
-	png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720)
-	print(m)
-	dev.off()
-	
-	dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position)	
-	
-	write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-	write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-	write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-	write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+        png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720)
+        print(m)
+        dev.off()
+
+        dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position)
+
+        write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+        write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+        write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+        write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
 }
--- a/merge_and_filter.r	Thu Jul 14 07:29:56 2016 -0400
+++ b/merge_and_filter.r	Tue Aug 02 08:30:23 2016 -0400
@@ -34,13 +34,24 @@
 	
 }
 
-print(paste("Number of sequences in summary file:", nrow(summ)))
+input.sequence.count = nrow(summ)
+print(paste("Number of sequences in summary file:", input.sequence.count))
+
+filtering.steps = data.frame(character(0), numeric(0))
+
+filtering.steps = rbind(filtering.steps, c("Input", input.sequence.count))
+
+filtering.steps[,1] = as.character(filtering.steps[,1])
+filtering.steps[,2] = as.character(filtering.steps[,2])
+#filtering.steps[,3] = as.numeric(filtering.steps[,3])
 
 summ = merge(summ, gene_identification, by="Sequence.ID")
 
 summ = summ[summ$Functionality != "No results",]
 
-print(paste("Number of sequences after merging with annotation:", nrow(summ)))
+print(paste("Number of sequences after 'No results' filter:", nrow(summ)))
+
+filtering.steps = rbind(filtering.steps, c("After 'No results' filter", nrow(summ)))
 
 if(functionality == "productive"){
 	summ = summ[summ$Functionality == "productive (see comment)" | summ$Functionality == "productive",]
@@ -52,6 +63,8 @@
 
 print(paste("Number of sequences after productive filter:", nrow(summ)))
 
+filtering.steps = rbind(filtering.steps, c("After productive filter", nrow(summ)))
+
 splt = strsplit(class_filter, "_")[[1]]
 chunk_hit_threshold = as.numeric(splt[1])
 nt_hit_threshold = as.numeric(splt[2])
@@ -70,7 +83,6 @@
 if(any(higher_than, na.rm=T)){
 	#summ = summ[higher_than,]
 }
-print(paste("Number of matched sequences:", sum(!grepl("^unmatched", summ$best_match))))
 
 if(nrow(summ) == 0){
 	stop("No data remaining after filter")
@@ -109,6 +121,8 @@
 
 print(paste("Number of sequences in result after", unique_type, "filtering:", nrow(result)))
 
+filtering.steps = rbind(filtering.steps, c("After duplicate filter", nrow(result)))
+
 #remove the sequences that have an 'n' (or 'N') in the FR2, FR3, CDR1 and CDR2 regions.
 sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")]
 names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq")
@@ -124,10 +138,12 @@
 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
 
 print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result)))
+filtering.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result)))
 
 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]
 
 print(paste("Number of sequences in result after n filtering:", nrow(result)))
+filtering.steps = rbind(filtering.steps, c("After N filter", nrow(result)))
 
 cleanup_columns = c("FR1.IMGT.Nb.of.mutations", 
                     "CDR1.IMGT.Nb.of.mutations", 
@@ -176,8 +192,20 @@
 print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
 print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),])))
 
+filtering.steps = rbind(filtering.steps, c("After unique filter", nrow(result)))
+
 print(paste("Number of rows in result:", nrow(result)))
 print(paste("Number of rows in unmatched:", nrow(unmatched)))
 
+matched.sequences.count = sum(!grepl("^unmatched", result$best_match))
+unmatched.sequences.count = sum(grepl("^unmatched", result$best_match))
+
+filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))
+filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))
+filtering.steps[,2] = as.numeric(filtering.steps[,2])
+filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2)
+
+write.table(x=filtering.steps, file=gsub("unmatched", "filtering_steps", unmatchedfile), sep="\t",quote=F,row.names=F,col.names=F)
+
 write.table(x=result, file=output, sep="\t",quote=F,row.names=F,col.names=T)
 write.table(x=unmatched, file=unmatchedfile, sep="\t",quote=F,row.names=F,col.names=T)
--- a/mutation_analysis.py	Thu Jul 14 07:29:56 2016 -0400
+++ b/mutation_analysis.py	Tue Aug 02 08:30:23 2016 -0400
@@ -22,6 +22,7 @@
 
 mutationdic = dict()
 mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
+NAMatchResult = (None, None, None, None, None, None, '')
 linecount = 0
 
 IDIndex = 0
@@ -58,15 +59,19 @@
 		ID = linesplt[IDIndex]
 		genedic[ID] = linesplt[best_matchIndex]
 		try:
-			mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else []
-			mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x]
-			mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x]
-			mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]
+			if linesplt[fr1Index] != "NA":
+				mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else []
+			else:
+				mutationdic[ID + "_FR1"] = []
+			mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if linesplt[cdr1Index] != "NA" else []
+			mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if linesplt[fr2Index] != "NA" else []
+			mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if linesplt[cdr2Index] != "NA" else []
 			mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
-			mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]
-		except:
+			mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []
+		except e:
 			print linesplt
 			print linecount
+			print e
 		mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 		mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 
@@ -79,6 +84,8 @@
 		IDlist += [ID]
 
 AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1)  # [4] is the position of the AA mutation, None if silent
+if AALength < 60:
+	AALength = 64
 
 AA_mutation = [0] * AALength
 AA_mutation_dic = {"ca": AA_mutation[:], "cg": AA_mutation[:], "cm": AA_mutation[:], "un": AA_mutation[:]}
--- a/mutation_analysis.r	Thu Jul 14 07:29:56 2016 -0400
+++ b/mutation_analysis.r	Tue Aug 02 08:30:23 2016 -0400
@@ -307,26 +307,28 @@
 	write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F)
 }
 
+sum.table = read.table("mutations_sum.txt", sep=",", header=F)
+median.table = read.table("mutations_median.txt", sep=",", header=F)
+
+#sum.table["Median of Number of Mutations (%)",] = median.table[1,]
+
+new.table = sum.table[1,]
+new.table[2,] = median.table[1,]
+new.table[3:12,] = sum.table[2:11,]
+new.table[,1] = as.character(new.table[,1])
+new.table[2,1] = "Median of Number of Mutations (%)"
+
+#sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),]
+
+write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F)
+
+
 
 if (!("ggplot2" %in% rownames(installed.packages()))) {
 	install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
 }
 
-genesForPlot = gsub("[0-9]", "", dat$best_match)
-genesForPlot = data.frame(table(genesForPlot))
-colnames(genesForPlot) = c("Gene","Freq")
-genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
-write.table(genesForPlot, "all.txt", sep="\t",quote=F,row.names=F,col.names=T)
-
-
-pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
-pc = pc + geom_bar(width = 1, stat = "identity")
-pc = pc + coord_polar(theta="y")
-pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("Classes", "( n =", sum(genesForPlot$Freq), ")"))
-
-png(filename="all.png")
-pc
-dev.off()
+dat = dat[!grepl("^unmatched", dat$best_match),]
 
 #blegh
 genesForPlot = dat[grepl("ca", dat$best_match),]$best_match
@@ -377,11 +379,6 @@
 
 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T)
 
-
-
-
-
-
 dat$best_match_class = substr(dat$best_match, 0, 2)
 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")
 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)
--- a/mutation_analysis.xml	Thu Jul 14 07:29:56 2016 -0400
+++ b/mutation_analysis.xml	Tue Aug 02 08:30:23 2016 -0400
@@ -5,7 +5,7 @@
 	</command>
 	<inputs>
 		<param name="in_file" type="data" label="IMGT zip file to be analysed" />
-		<param name="method" type="select" label="Identification method" help="" >
+		<param name="method" type="select" label="Method of identification of C region" help="" >
 			<option value="custom" selected="true">custom</option>
 			<option value="blastn">blastn</option>
 		</param>
@@ -27,10 +27,10 @@
 			<option value="no" selected="true">No</option>
 		</param>
 		<param name="unique" type="select" label="Remove duplicates based on" help="" >
-			<option value="AA.JUNCTION_V_subclass" selected="true">AA.JUNCTION + V + subclass</option>
-			<option value="AA.JUNCTION_subclass">AA.JUNCTION + subclass</option>
-			<option value="AA.JUNCTION_V">AA.JUNCTION + V</option>
-			<option value="AA.JUNCTION">AA.JUNCTION</option>
+			<option value="AA.JUNCTION_V_subclass" selected="true">Top.V.Gene, CDR3.Seq, C region</option>
+			<option value="AA.JUNCTION_subclass">CDR3.Seq + C region</option>
+			<option value="AA.JUNCTION_V">CDR3.seq + Top.V.Gene</option>
+			<option value="AA.JUNCTION">CDR3.seq</option>
 			<option value="none">Don't remove duplicates</option>
 		</param>
 		<param name="class_filter" type="select" label="Class/Subclass filter" help="" >
Binary file style.tar.gz has changed
--- a/tmp/igat.r	Thu Jul 14 07:29:56 2016 -0400
+++ b/tmp/igat.r	Tue Aug 02 08:30:23 2016 -0400
@@ -15,9 +15,9 @@
 for(f in list.files(imgt.dir, pattern="*.txt$")){
 	#print(paste("filtering", f))
 	path = paste(imgt.dir, f, sep="")
-	dat = read.table(path, header=T, sep="\t", fill=T, quote="", stringsAsFactors=F)
+	dat = read.table(path, header=T, sep="\t", fill=T, quote="", stringsAsFactors=F, check.names=FALSE)
 	
-	dat = dat[dat$Sequence.ID %in% merged$Sequence.ID,]
+	dat = dat[dat[,"Sequence ID"] %in% merged$Sequence.ID,]
 	
 	if(nrow(dat) > 0 & grepl("^8_", f)){ #change the FR1 columns to 0 in the "8_..." file
 		dat[,grepl("^FR1", names(dat))] = 0
--- a/wrapper.sh	Thu Jul 14 07:29:56 2016 -0400
+++ b/wrapper.sh	Tue Aug 02 08:30:23 2016 -0400
@@ -1,5 +1,5 @@
 #!/bin/bash
-set -e
+#set -e
 dir="$(cd "$(dirname "$0")" && pwd)"
 input=$1
 method=$2
@@ -17,13 +17,13 @@
 class_filter=${13}
 mkdir $outdir
 
+tar -xzf $dir/style.tar.gz -C $outdir
+
 echo "---------------- read parameters ----------------"
 echo "---------------- read parameters ----------------<br />" > $log
 
 echo "unpacking IMGT file"
 
-
-
 type="`file $input`"
 if [[ "$type" == *"Zip archive"* ]] ; then
 	echo "Zip archive"
@@ -67,7 +67,11 @@
 	sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
 	sequence_index=$((sequence_index+1))
 	
-	cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
+	cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.tmp
+	
+	cat $PWD/sequences.tmp | grep -B1 -vE "^$" sequences.fasta #filter out empty sequences
+	
+	rm $PWD/sequences.tmp
 	
 	echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
 	${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
@@ -149,12 +153,15 @@
 echo "---------------- aa_histogram.r ----------------<br />" >> $log
 
 Rscript $dir/aa_histogram.r $outdir/aa_id_mutations.txt $outdir/absent_aa_id.txt "ca,cg,cm" $outdir/ 2>&1
-mv $outdir/aa_histogram_.png $outdir/aa_histogram.png
-mv $outdir/aa_histogram_.txt $outdir/aa_histogram.txt
+if [ -e "$outdir/aa_histogram_.png" ]; then
+        mv $outdir/aa_histogram_.png $outdir/aa_histogram.png
+        mv $outdir/aa_histogram_.txt $outdir/aa_histogram.txt
+fi
 
 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
 
 funcs=(sum mean median)
+funcs=(sum)
 
 echo "---------------- sequence_overview.r ----------------"
 echo "---------------- sequence_overview.r ----------------<br />" >> $log
@@ -172,6 +179,10 @@
 done < $outdir/sequence_overview/ntoverview.txt
 
 echo "<html><center><h1>$title</h1></center>" > $output
+echo "<script type='text/javascript' src='jquery-1.11.0.min.js'></script>" >> $output
+echo "<script type='text/javascript' src='tabber.js'></script>" >> $output
+echo "<script type='text/javascript' src='script.js'></script>" >> $output
+echo "<link rel='stylesheet' type='text/css' href='style.css'>" >> $output
 
 #display the matched/unmatched for clearity
 
@@ -188,6 +199,10 @@
 
 echo "---------------- main tables ----------------"
 echo "---------------- main tables ----------------<br />" >> $log
+
+echo "<div class='tabber'>" >> $output
+echo "<div class='tabbertab' title='SHM Overview'>" >> $output
+
 for func in ${funcs[@]}
 do
 
@@ -221,9 +236,84 @@
 	#echo "<a href='data_${func}.txt'>Download data</a>" >> $output
 done
 
-echo "---------------- download links ----------------"
-echo "---------------- download links ----------------<br />" >> $log
+echo "</div>" >> $output #SHM overview tab end
+
+echo "---------------- images ----------------"
+echo "---------------- images ----------------<br />" >> $log
+
+echo "<div class='tabbertab' title='SHM Frequency'>" >> $output
+
+if [ -a $outdir/scatter.png ]
+then
+	echo "<img src='scatter.png'/><br />" >> $output
+	echo "<a href='scatter.txt'>download data</a><br />" >> $output
+fi
+if [ -a $outdir/frequency_ranges.png ]
+then
+	echo "<img src='frequency_ranges.png'/><br />" >> $output
+	echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
+	echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
+fi
+
+echo "</div>" >> $output #SHM frequency tab end
+
+echo "<div class='tabbertab' title='Transition tables'>" >> $output
+
+for gene in ${genes[@]}
+do
+	echo "<table border='1'><caption>$gene transition table</caption>" >> $output
+	while IFS=, read from a c g t
+		do
+			echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
+	done < $outdir/transitions_${gene}_sum.txt
+	echo "</table>" >> $output
+done
 
+echo "<table border='1'><caption>All transition table</caption>" >> $output
+while IFS=, read from a c g t
+	do
+		echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
+done < $outdir/transitions_all_sum.txt
+echo "</table>" >> $output
+
+echo "</div>" >> $output #transition tables tab end
+
+echo "<div class='tabbertab' title='Antigen Selection'>" >> $output
+
+if [ -a $outdir/aa_histogram.png ]
+then
+	echo "<img src='aa_histogram.png'/><br />" >> $output
+	echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
+	echo "<img src='aa_histogram_ca.png'/><br />" >> $output
+	echo "<a href='aa_histogram_ca.txt'>download data</a><br />" >> $output
+	echo "<img src='aa_histogram_cg.png'/><br />" >> $output
+	echo "<a href='aa_histogram_cg.txt'>download data</a><br />" >> $output
+	echo "<img src='aa_histogram_cm.png'/><br />" >> $output
+	echo "<a href='aa_histogram_cm.txt'>download data</a><br />" >> $output
+fi
+
+echo "<embed src='baseline_ca.pdf' width='700px' height='1000px'>" >> $output
+echo "<embed src='baseline_cg.pdf' width='700px' height='1000px'>" >> $output
+echo "<embed src='baseline_cm.pdf' width='700px' height='1000px'>" >> $output
+
+echo "</div>" >> $output #antigen selection tab end
+
+echo "<div class='tabbertab' title='CSR'>" >> $output
+
+if [ -a $outdir/ca.png ] 
+then
+	echo "<img src='ca.png'/><br />" >> $output
+	echo "<a href='ca.txt'>download data</a><br />" >> $output
+fi
+if [ -a $outdir/cg.png ]
+then
+	echo "<img src='cg.png'/><br />" >> $output
+	echo "<a href='cg.txt'>download data</a><br />" >> $output
+fi
+
+echo "</div>" >> $output #CSR tab end
+
+echo "<div class='tabbertab' title='Downloads'>" >> $output
 
 echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
 echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
@@ -250,61 +340,9 @@
 echo "<a href='new_IMGT_cg.txz'>Filtered cg IMGT zip</a><br />" >> $output
 echo "<a href='new_IMGT_cm.txz'>Filtered cm IMGT zip</a><br />" >> $output
 
-
-echo "---------------- images ----------------"
-echo "---------------- images ----------------<br />" >> $log
+echo "</div>" >> $output #downloads tab end
 
-echo "<img src='all.png'/><br />" >> $output
-echo "<a href='all.txt'>download data</a><br />" >> $output
-if [ -a $outdir/ca.png ] 
-then
-	echo "<img src='ca.png'/><br />" >> $output
-	echo "<a href='ca.txt'>download data</a><br />" >> $output
-fi
-if [ -a $outdir/cg.png ]
-then
-	echo "<img src='cg.png'/><br />" >> $output
-	echo "<a href='cg.txt'>download data</a><br />" >> $output
-fi
-if [ -a $outdir/scatter.png ]
-then
-	echo "<img src='scatter.png'/><br />" >> $output
-	echo "<a href='scatter.txt'>download data</a><br />" >> $output
-fi
-if [ -a $outdir/frequency_ranges.png ]
-then
-	echo "<img src='frequency_ranges.png'/><br />" >> $output
-	echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
-	echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
-fi
-if [ -a $outdir/aa_histogram.png ]
-then
-	echo "<img src='aa_histogram.png'/><br />" >> $output
-	echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
-	echo "<img src='aa_histogram_ca.png'/><br />" >> $output
-	echo "<a href='aa_histogram_ca.txt'>download data</a><br />" >> $output
-	echo "<img src='aa_histogram_cg.png'/><br />" >> $output
-	echo "<a href='aa_histogram_cg.txt'>download data</a><br />" >> $output
-	echo "<img src='aa_histogram_cm.png'/><br />" >> $output
-	echo "<a href='aa_histogram_cm.txt'>download data</a><br />" >> $output
-fi
-
-for gene in ${genes[@]}
-do
-	echo "<table border='1'><caption>$gene transition table</caption>" >> $output
-	while IFS=, read from a c g t
-		do
-			echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
-	done < $outdir/transitions_${gene}_sum.txt
-	echo "</table>" >> $output
-done
-
-echo "<table border='1'><caption>All transition table</caption>" >> $output
-while IFS=, read from a c g t
-	do
-		echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
-done < $outdir/transitions_all_sum.txt
-echo "</table>" >> $output
+echo "</div>" >> $output #tabs end 
 
 echo "</html>" >> $output
 
@@ -368,8 +406,44 @@
 
 mv $log $outdir/log.html
 
-cp $outdir/index.html $log
+echo "<html><center><h1><a href='index.html'>Click here for the results</a></h1>Tip: Open it in a new tab (middle mouse button or right mouse button -> 'open in new tab' on the link above)<br />" > $log
+echo "<table border = 1>" >> $log
+echo "<thead><tr><th>Info</th><th>Sequences</th><th>Percentage</th></tr></thead>" >> $log
+tIFS="$TMP"
+IFS=$'\t'
+while read step seq perc
+	do
+		echo "<tr>" >> $log
+		echo "<td>$step</td>" >> $log
+		echo "<td>$seq</td>" >> $log
+		echo "<td>${perc}%</td>" >> $log
+		echo "</tr>" >> $log
+done < $outdir/filtering_steps.txt
+echo "</table border></center></html>" >> $log
+
+IFS="$tIFS"
+
 
 echo "---------------- Done! ----------------"
 echo "---------------- Done! ----------------<br />" >> $outdir/log.html
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+