changeset 98:5ffbf40cdd4b draft

Uploaded
author davidvanzessen
date Thu, 16 Jun 2016 05:05:47 -0400
parents 6e8dfbe164c6
children 86206431cbb0
files merge_and_filter.r mutation_analysis.py mutation_analysis.r sequence_overview.r wrapper.sh
diffstat 5 files changed, 33 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/merge_and_filter.r	Wed Jun 15 04:48:41 2016 -0400
+++ b/merge_and_filter.r	Thu Jun 16 05:05:47 2016 -0400
@@ -116,6 +116,11 @@
 
 print(paste("Number of sequences in result after merging with sequences:", nrow(result)))
 
+print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
+print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
+print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
+print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "")))
+
 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
 
 print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result)))
@@ -170,6 +175,7 @@
 }
 
 print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
+print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),])))
 
 print(paste("Number of rows in result:", nrow(result)))
 print(paste("Number of rows in unmatched:", nrow(unmatched)))
--- a/mutation_analysis.py	Wed Jun 15 04:48:41 2016 -0400
+++ b/mutation_analysis.py	Thu Jun 16 05:05:47 2016 -0400
@@ -161,12 +161,12 @@
 	sys.exit()
 
 hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)")
-RGYWCount = {g: 0 for g in genes}
-WRCYCount = {g: 0 for g in genes}
-WACount = {g: 0 for g in genes}
-TWCount = {g: 0 for g in genes}
+RGYWCount = {}
+WRCYCount = {}
+WACount = {}
+TWCount = {}
 
-IDIndex = 0
+#IDIndex = 0
 ataIndex = 0
 tatIndex = 0
 aggctatIndex = 0
@@ -185,6 +185,8 @@
 		linesplt = line.split("\t")
 		gene = linesplt[best_matchIndex]
 		ID = linesplt[IDIndex]
+		if ID == "ca2":
+			print linesplt
 		RGYW = [(int(x), int(y), z) for (x, y, z) in
 				[hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
 		WRCY = [(int(x), int(y), z) for (x, y, z) in
@@ -249,12 +251,14 @@
 def get_xyz(lst, gene, f, fname):
 	x = int(round(f(lst)))
 	y = valuedic[gene + "_" + fname]
-	z = str(round(x / float(valuedic[gene + "_" + fname]) * 100, 1)) if valuedic[gene + "_" + fname] != 0 else "0"
+	z = str(round(x / float(y) * 100, 1)) if y != 0 else "0"
 	return (str(x), str(y), z)
 
 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
 arr = ["RGYW", "WRCY", "WA", "TW"]
 
+geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
+
 for fname in funcs.keys():
 	func = funcs[fname]
 	foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
@@ -263,14 +267,14 @@
 			o.write(typ + " (%)")
 			curr = dic[typ]
 			for gene in genes:
-				geneMatcher = re.compile("^" + gene + ".*")
+				geneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop....
 				if valuedic[gene + "_" + fname] is 0:
 					o.write(",0,0,0")
 				else:
 					x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
 					o.write("," + x + "," + y + "," + z)
-			# for total
-			x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
+
+			x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)
 			o.write("," + x + "," + y + "," + z + "\n")
 
 
--- a/mutation_analysis.r	Wed Jun 15 04:48:41 2016 -0400
+++ b/mutation_analysis.r	Thu Jun 16 05:05:47 2016 -0400
@@ -295,7 +295,7 @@
 	  matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i])
 	}
 
-	matrx = calculate_result(i + 1, ".*", dat, matrx, func, fname, name="all")
+	matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all")
 	
 	result = data.frame(matrx)
 	if(fname == "sum"){
--- a/sequence_overview.r	Wed Jun 15 04:48:41 2016 -0400
+++ b/sequence_overview.r	Thu Jun 16 05:05:47 2016 -0400
@@ -41,11 +41,14 @@
 make.link = function(id, clss, val) { paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="") }
 tbl = function(df) { res = "<table border='1'>"; for(i in 1:nrow(df)){ res = paste(res, tr(df[i,]), sep=""); }; res = paste(res, "</table>"); }
 
+print(paste("Number of unique sequences to be written to the sequence overview page", nrow(dat)))
+
 cat("<table border='1'>", file=main.html, append=F)
 cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
 cat("<tr><th>Sequence</th><th>Functionality</th><th>ca1</th><th>ca2</th><th>cg1</th><th>cg2</th><th>cg3</th><th>cg4</th><th>cm</th></tr>", file=main.html, append=T)
 
 for(i in 1:nrow(dat)){
+	
 	ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca1", IDs$best_match),]
 	ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca2", IDs$best_match),]
 	
@@ -62,7 +65,10 @@
 	classes.sum = sum(classes)
 	
 	if(classes.sum == 1){
+		print(paste("next", i, classes.sum))
 		next
+	} else {
+		print(i)
 	}
 	
 	id = as.numeric(dat[i,"seq_conc"])
@@ -144,6 +150,10 @@
 	names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))
 }
 
+write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
+
+NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]
+
 new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))
 new.col.y = sum(new.col.x)
 new.col.z = round(new.col.x / new.col.y * 100, 2)
@@ -160,8 +170,6 @@
 
 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
 
-write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
-
 
 
 
--- a/wrapper.sh	Wed Jun 15 04:48:41 2016 -0400
+++ b/wrapper.sh	Thu Jun 16 05:05:47 2016 -0400
@@ -77,7 +77,7 @@
 
 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} 2>&1
 
-echo "---------------- creating new IMGT zip ----------------<br />"
+echo "---------------- creating new IMGT zip ----------------"
 echo "---------------- creating new IMGT zip ----------------<br />" >> $output
 
 mkdir $outdir/new_IMGT
@@ -150,7 +150,7 @@
 
 #display the matched/unmatched for clearity
 
-matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
+matched_count="`cat $outdir/merged.txt | grep -v 'unmatched' | tail -n +2 | wc -l`"
 unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
 total_count=$((matched_count + unmatched_count))
 perc_count=$((unmatched_count / total_count * 100))
@@ -169,7 +169,7 @@
 	
 	cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt
 
-	echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output
+	echo "<table border='1' width='100%'><caption><h3><a href='data_${func}.txt'>${func} table</a></h3></caption>" >> $output
 	echo "<tr><th>info</th>" >> $output
 	for gene in ${genes[@]}
 	do