Mercurial > repos > davidvanzessen > mutation_analysis

--- a/merge_and_filter.r	Thu Jun 16 05:05:47 2016 -0400
+++ b/merge_and_filter.r	Thu Jun 16 10:01:54 2016 -0400
@@ -155,7 +155,6 @@
 		result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq, result$best_match)
 	} else {
 		result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
-
 	}

 	#fltr = result$unique.def %in% result.filtered$unique.def
--- a/sequence_overview.r	Thu Jun 16 05:05:47 2016 -0400
+++ b/sequence_overview.r	Thu Jun 16 10:01:54 2016 -0400
@@ -3,10 +3,9 @@
 args <- commandArgs(trailingOnly = TRUE)

 input.file = args[1]
-input.file = args[2]
-outputdir = args[3]
-gene.classes = unlist(strsplit(args[4], ","))
-hotspot.analysis.sum.file = args[5]
+outputdir = args[2]
+gene.classes = unlist(strsplit(args[3], ","))
+hotspot.analysis.sum.file = args[4]
 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")
 NTsum.file = paste(outputdir, "ntsum.txt", sep="/")
 main.html = "index.html"
@@ -41,12 +40,17 @@
 make.link = function(id, clss, val) { paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="") }
 tbl = function(df) { res = "<table border='1'>"; for(i in 1:nrow(df)){ res = paste(res, tr(df[i,]), sep=""); }; res = paste(res, "</table>"); }

-print(paste("Number of unique sequences to be written to the sequence overview page", nrow(dat)))
-
 cat("<table border='1'>", file=main.html, append=F)
 cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
 cat("<tr><th>Sequence</th><th>Functionality</th><th>ca1</th><th>ca2</th><th>cg1</th><th>cg2</th><th>cg3</th><th>cg4</th><th>cm</th></tr>", file=main.html, append=T)

+
+
+single.sequences=0 #sequence only found once, skipped
+in.multiple=0 #same sequence across multiple subclasses
+multiple.in.one=0 #same sequence multiple times in one subclass
+matched=0 #should be the same als matched sequences
+
 for(i in 1:nrow(dat)){

 	ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca1", IDs$best_match),]
@@ -65,10 +69,16 @@
 	classes.sum = sum(classes)

 	if(classes.sum == 1){
-		print(paste("next", i, classes.sum))
+		single.sequences = single.sequences + 1
 		next
+	}
+
+	matched = matched + sum(classes > 0) #count in how many subclasses the sequence occurs.
+
+	if(any(classes  == classes.sum)){
+		in.multiple = in.multiple + 1
 	} else {
-		print(i)
+		multiple.in.one = multiple.in.one + 1
 	}

 	id = as.numeric(dat[i,"seq_conc"])
@@ -120,6 +130,10 @@

 cat("</table>", file=main.html, append=T)

+print(paste("Single sequences:", single.sequences))
+print(paste("Sequences in multiple subclasses:", in.multiple))
+print(paste("Multiple sequences in one subclass:", multiple.in.one))
+print(paste("Count that should match 'matched' sequences:", matched))

 #ACGT overview

@@ -131,8 +145,6 @@
 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))
 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))

-print(sum(colSums(NToverview[,c("A", "C", "T", "G")])))
-
 #Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))

 #NToverview = rbind(NToverview, NTsum)
@@ -166,8 +178,6 @@

 hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)

-print(hotspot.analysis.sum)
-
 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
--- a/tmp/igat.r	Thu Jun 16 05:05:47 2016 -0400
+++ b/tmp/igat.r	Thu Jun 16 10:01:54 2016 -0400
@@ -2,9 +2,14 @@

 imgt.dir = args[1]
 merged.file = args[2]
+gene = args[3]

 merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F)

+if(gene != "-"){
+	merged = merged[grepl(gene, merged$best_match),]
+}
+
 merged = merged[!grepl("unmatched", merged$best_match),]

 for(f in list.files(imgt.dir, pattern="*.txt$")){
@@ -14,7 +19,7 @@

 	dat = dat[dat$Sequence.ID %in% merged$Sequence.ID,]

-	if("FR1.IMGT" %in% colnames(dat)){
+	if(nrow(dat) > 0 &  "FR1.IMGT" %in% colnames(dat)){
 		dat$FR1.IMGT = ""
 	}
--- a/wrapper.sh	Thu Jun 16 05:05:47 2016 -0400
+++ b/wrapper.sh	Thu Jun 16 10:01:54 2016 -0400
@@ -93,21 +93,44 @@
 cat `find $PWD/files/ -name "9_*"` > "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt"
 cat `find $PWD/files/ -name "10_*"` > "$outdir/new_IMGT/10_V-REGION-mutation-hotspots.txt"

-Rscript $dir/tmp/igat.r $outdir/new_IMGT/ $outdir/merged.txt 2>&1
+mkdir $outdir/new_IMGT_ca
+cp $outdir/new_IMGT/* $outdir/new_IMGT_ca
+
+mkdir $outdir/new_IMGT_cg
+cp $outdir/new_IMGT/* $outdir/new_IMGT_cg
+
+mkdir $outdir/new_IMGT_cm
+cp $outdir/new_IMGT/* $outdir/new_IMGT_cm
+
+Rscript $dir/tmp/igat.r $outdir/new_IMGT/ $outdir/merged.txt "-" 2>&1
+Rscript $dir/tmp/igat.r $outdir/new_IMGT_ca/ $outdir/merged.txt "ca "2>&1
+Rscript $dir/tmp/igat.r $outdir/new_IMGT_cg/ $outdir/merged.txt "cg "2>&1
+Rscript $dir/tmp/igat.r $outdir/new_IMGT_cm/ $outdir/merged.txt "cm "2>&1


 tmp="$PWD"
 cd $outdir/new_IMGT/ #tar weirdness...
 tar -cJf ../new_IMGT.txz *
-
 cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT/IgAT.xlsm
-
-#tar -cJf ../IgAT.txz *
 zip -r ../IgAT.zip *

+cd $outdir/new_IMGT_ca/
+tar -cJf ../new_IMGT_ca.txz *
+cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_ca/IgAT.xlsm
+zip -r ../IgAT_ca.zip *
+
+cd $outdir/new_IMGT_cg/
+tar -cJf ../new_IMGT_cg.txz *
+cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_cg/IgAT.xlsm
+zip -r ../IgAT_cg.zip *
+
+cd $outdir/new_IMGT_cm/
+tar -cJf ../new_IMGT_cm.txz *
+cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_cm/IgAT.xlsm
+zip -r ../IgAT_cm.zip *
+
 cd $tmp

-
 echo "---------------- mutation_analysis.r ----------------"
 echo "---------------- mutation_analysis.r ----------------<br />" >> $output

@@ -136,7 +159,7 @@
 mkdir $outdir/sequence_overview

 #Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
-Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
+Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1

 echo "<table border='1'>" > $outdir/base_overview.html

@@ -207,7 +230,21 @@
 echo "<a href='base_overview.html'>Base overview</a><br />" >> $output
 echo "<a href='baseline.pdf'>Baseline PDF</a><br />" >> $output
 echo "<a href='baseline.txt'>Baseline Table</a><br />" >> $output
+echo "<a href='baseline_ca.pdf'>Baseline ca PDF</a><br />" >> $output
+echo "<a href='baseline_ca.txt'>Baseline ca Table</a><br />" >> $output
+echo "<a href='baseline_cg.pdf'>Baseline cg PDF</a><br />" >> $output
+echo "<a href='baseline_cg.txt'>Baseline cg Table</a><br />" >> $output
+echo "<a href='baseline_cm.pdf'>Baseline cm PDF</a><br />" >> $output
+echo "<a href='baseline_cm.txt'>Baseline cm Table</a><br />" >> $output
 echo "<a href='IgAT.zip'>IgAT zip</a><br />" >> $output
+echo "<a href='IgAT_ca.zip'>IgAT ca zip</a><br />" >> $output
+echo "<a href='IgAT_cg.zip'>IgAT cg zip</a><br />" >> $output
+echo "<a href='IgAT_cm.zip'>IgAT cm zip</a><br />" >> $output
+echo "<a href='new_IMGT.txz'>Filtered IMGT zip</a><br />" >> $output
+echo "<a href='new_IMGT_ca.txz'>Filtered ca IMGT zip</a><br />" >> $output
+echo "<a href='new_IMGT_cg.txz'>Filtered cg IMGT zip</a><br />" >> $output
+echo "<a href='new_IMGT_cm.txz'>Filtered cm IMGT zip</a><br />" >> $output
+

 echo "---------------- images ----------------"

@@ -260,7 +297,10 @@
 echo "</html>" >> $output

 echo "---------------- baseline ----------------"
-bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "sample name" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt"
+bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "ca_cg_cm" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt"
+bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_ca.txz "ca" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_ca.pdf" "Sequence.ID" "$outdir/baseline_ca.txt"
+bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cg.txz "cg" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cg.pdf" "Sequence.ID" "$outdir/baseline_cg.txt"
+bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cm.txz "cm" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cm.pdf" "Sequence.ID" "$outdir/baseline_cm.txt"

 #optional output for naive

@@ -280,11 +320,3 @@

 echo "---------------- Done! ----------------"

-
-
-
-
-
-
-#rm $outdir/HS12RSS.txt
-#rm $outdir/HS23RSS.txt