# HG changeset patch
# User davidvanzessen
# Date 1466067947 14400
# Node ID 5ffbf40cdd4b13f86d4e773a71a0996099cc2e08
# Parent 6e8dfbe164c62770b1badade0649e4cd5cfec53f
Uploaded
diff -r 6e8dfbe164c6 -r 5ffbf40cdd4b merge_and_filter.r
--- a/merge_and_filter.r Wed Jun 15 04:48:41 2016 -0400
+++ b/merge_and_filter.r Thu Jun 16 05:05:47 2016 -0400
@@ -116,6 +116,11 @@
print(paste("Number of sequences in result after merging with sequences:", nrow(result)))
+print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
+print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
+print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
+print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "")))
+
result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result)))
@@ -170,6 +175,7 @@
}
print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
+print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),])))
print(paste("Number of rows in result:", nrow(result)))
print(paste("Number of rows in unmatched:", nrow(unmatched)))
diff -r 6e8dfbe164c6 -r 5ffbf40cdd4b mutation_analysis.py
--- a/mutation_analysis.py Wed Jun 15 04:48:41 2016 -0400
+++ b/mutation_analysis.py Thu Jun 16 05:05:47 2016 -0400
@@ -161,12 +161,12 @@
sys.exit()
hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)")
-RGYWCount = {g: 0 for g in genes}
-WRCYCount = {g: 0 for g in genes}
-WACount = {g: 0 for g in genes}
-TWCount = {g: 0 for g in genes}
+RGYWCount = {}
+WRCYCount = {}
+WACount = {}
+TWCount = {}
-IDIndex = 0
+#IDIndex = 0
ataIndex = 0
tatIndex = 0
aggctatIndex = 0
@@ -185,6 +185,8 @@
linesplt = line.split("\t")
gene = linesplt[best_matchIndex]
ID = linesplt[IDIndex]
+ if ID == "ca2":
+ print linesplt
RGYW = [(int(x), int(y), z) for (x, y, z) in
[hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
WRCY = [(int(x), int(y), z) for (x, y, z) in
@@ -249,12 +251,14 @@
def get_xyz(lst, gene, f, fname):
x = int(round(f(lst)))
y = valuedic[gene + "_" + fname]
- z = str(round(x / float(valuedic[gene + "_" + fname]) * 100, 1)) if valuedic[gene + "_" + fname] != 0 else "0"
+ z = str(round(x / float(y) * 100, 1)) if y != 0 else "0"
return (str(x), str(y), z)
dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
arr = ["RGYW", "WRCY", "WA", "TW"]
+geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
+
for fname in funcs.keys():
func = funcs[fname]
foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
@@ -263,14 +267,14 @@
o.write(typ + " (%)")
curr = dic[typ]
for gene in genes:
- geneMatcher = re.compile("^" + gene + ".*")
+ geneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop....
if valuedic[gene + "_" + fname] is 0:
o.write(",0,0,0")
else:
x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
o.write("," + x + "," + y + "," + z)
- # for total
- x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
+
+ x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)
o.write("," + x + "," + y + "," + z + "\n")
diff -r 6e8dfbe164c6 -r 5ffbf40cdd4b mutation_analysis.r
--- a/mutation_analysis.r Wed Jun 15 04:48:41 2016 -0400
+++ b/mutation_analysis.r Thu Jun 16 05:05:47 2016 -0400
@@ -295,7 +295,7 @@
matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i])
}
- matrx = calculate_result(i + 1, ".*", dat, matrx, func, fname, name="all")
+ matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all")
result = data.frame(matrx)
if(fname == "sum"){
diff -r 6e8dfbe164c6 -r 5ffbf40cdd4b sequence_overview.r
--- a/sequence_overview.r Wed Jun 15 04:48:41 2016 -0400
+++ b/sequence_overview.r Thu Jun 16 05:05:47 2016 -0400
@@ -41,11 +41,14 @@
make.link = function(id, clss, val) { paste("", val, "", sep="") }
tbl = function(df) { res = "
"; for(i in 1:nrow(df)){ res = paste(res, tr(df[i,]), sep=""); }; res = paste(res, "
"); }
+print(paste("Number of unique sequences to be written to the sequence overview page", nrow(dat)))
+
cat("", file=main.html, append=F)
cat("CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once", file=main.html, append=T)
cat("Sequence | Functionality | ca1 | ca2 | cg1 | cg2 | cg3 | cg4 | cm |
", file=main.html, append=T)
for(i in 1:nrow(dat)){
+
ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca1", IDs$best_match),]
ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca2", IDs$best_match),]
@@ -62,7 +65,10 @@
classes.sum = sum(classes)
if(classes.sum == 1){
+ print(paste("next", i, classes.sum))
next
+ } else {
+ print(i)
}
id = as.numeric(dat[i,"seq_conc"])
@@ -144,6 +150,10 @@
names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))
}
+write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
+
+NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]
+
new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))
new.col.y = sum(new.col.x)
new.col.z = round(new.col.x / new.col.y * 100, 2)
@@ -160,8 +170,6 @@
write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
-write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
-
diff -r 6e8dfbe164c6 -r 5ffbf40cdd4b wrapper.sh
--- a/wrapper.sh Wed Jun 15 04:48:41 2016 -0400
+++ b/wrapper.sh Thu Jun 16 05:05:47 2016 -0400
@@ -77,7 +77,7 @@
Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} 2>&1
-echo "---------------- creating new IMGT zip ----------------
"
+echo "---------------- creating new IMGT zip ----------------"
echo "---------------- creating new IMGT zip ----------------
" >> $output
mkdir $outdir/new_IMGT
@@ -150,7 +150,7 @@
#display the matched/unmatched for clearity
-matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
+matched_count="`cat $outdir/merged.txt | grep -v 'unmatched' | tail -n +2 | wc -l`"
unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
total_count=$((matched_count + unmatched_count))
perc_count=$((unmatched_count / total_count * 100))
@@ -169,7 +169,7 @@
cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt
- echo "${func} table
" >> $output
+ echo "" >> $output
echo "info | " >> $output
for gene in ${genes[@]}
do
---|