changeset 89:480fdd383fdb draft

Uploaded
author davidvanzessen
date Tue, 31 May 2016 08:30:50 -0400
parents d57c624a9aa9
children f0e8dac22c6e
files merge_and_filter.r sequence_overview.r wrapper.sh
diffstat 3 files changed, 51 insertions(+), 53 deletions(-) [+]
line wrap: on
line diff
--- a/merge_and_filter.r	Mon May 30 10:11:20 2016 -0400
+++ b/merge_and_filter.r	Tue May 31 08:30:50 2016 -0400
@@ -87,48 +87,6 @@
 
 print(paste("Number of sequences after merging with hotspots:", nrow(result)))
 
-cleanup_columns = c("FR1.IMGT.Nb.of.mutations", 
-                    "CDR1.IMGT.Nb.of.mutations", 
-                    "FR2.IMGT.Nb.of.mutations", 
-                    "CDR2.IMGT.Nb.of.mutations", 
-                    "FR3.IMGT.Nb.of.mutations")
-
-for(col in cleanup_columns){
-  result[,col] = gsub("\\(.*\\)", "", result[,col])
-  result[,col] = as.numeric(result[,col])
-  result[is.na(result[,col]),] = 0
-}
-
-result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele)
-result$VGene = gsub("[*].*", "", result$VGene)
-result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
-result$JGene = gsub("[*].*", "", result$JGene)
-
-if(filter_unique != "no"){
-	clmns = names(result)
-	
-	if(grepl("_c", filter_unique)){
-		result$unique.def = paste(result$CDR1.IMGT, result$CDR2.IMGT, result$CDR3.IMGT, result$FR2.IMGT, result$FR3.IMGT, result$best_match)
-	} else {
-		result$unique.def = paste(result$CDR1.IMGT, result$CDR2.IMGT, result$CDR3.IMGT, result$FR2.IMGT, result$FR3.IMGT)
-	}
-
-	#fltr = result$unique.def %in% result.filtered$unique.def
-		
-	if(grepl("keep", filter_unique)){
-		result = result[!duplicated(result$unique.def),]
-	} else {
-		result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
-		result = result[!duplicated(result$unique.def),]
-	}
-	
-	result = result[,clmns]
-	
-	#write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
-}
-
-print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
-
 #result$past = paste(result$AA.JUNCTION, result$VGene, result$JGene, (result$FR1.IMGT.Nb.of.mutations + result$CDR1.IMGT.Nb.of.mutations + result$FR2.IMGT.Nb.of.mutations + result$CDR2.IMGT.Nb.of.mutations + result$FR3.IMGT.Nb.of.mutations), result$best_match)
 if(unique_type == "AA.JUNCTION_V_subclass"){
 	result$past = paste(result$AA.JUNCTION, result$VGene, result$best_match)
@@ -157,6 +115,50 @@
 
 print(paste("Number of sequences in result after n filtering:", nrow(result)))
 
+cleanup_columns = c("FR1.IMGT.Nb.of.mutations", 
+                    "CDR1.IMGT.Nb.of.mutations", 
+                    "FR2.IMGT.Nb.of.mutations", 
+                    "CDR2.IMGT.Nb.of.mutations", 
+                    "FR3.IMGT.Nb.of.mutations")
+
+for(col in cleanup_columns){
+  result[,col] = gsub("\\(.*\\)", "", result[,col])
+  result[,col] = as.numeric(result[,col])
+  result[is.na(result[,col]),] = 0
+}
+
+result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele)
+result$VGene = gsub("[*].*", "", result$VGene)
+result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
+result$JGene = gsub("[*].*", "", result$JGene)
+
+write.table(result, "before_unique_filter.txt", sep="\t",quote=F,row.names=F,col.names=T)
+
+if(filter_unique != "no"){
+	#clmns = names(result)
+	
+	if(grepl("_c", filter_unique)){
+		result$unique.def = paste(result$CDR1.IMGT, result$FR2.IMGT, result$CDR2.IMGT, result$FR3.IMGT, result$CDR3.IMGT, result$best_match)
+	} else {
+		result$unique.def = paste(result$CDR1.IMGT, result$FR2.IMGT, result$CDR2.IMGT, result$FR3.IMGT, result$CDR3.IMGT)
+	}
+
+	#fltr = result$unique.def %in% result.filtered$unique.def
+		
+	if(grepl("keep", filter_unique)){
+		result = result[!duplicated(result$unique.def),]
+	} else {
+		result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
+		result = result[!duplicated(result$unique.def),]
+	}
+	
+	#result = result[,clmns]
+	
+	#write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
+}
+
+print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
+
 print(paste("Number of rows in result:", nrow(result)))
 print(paste("Number of rows in unmatched:", nrow(unmatched)))
 
--- a/sequence_overview.r	Mon May 30 10:11:20 2016 -0400
+++ b/sequence_overview.r	Tue May 31 08:30:50 2016 -0400
@@ -143,7 +143,7 @@
 NTresult = data.frame(nt=c("A", "C", "T", "G"))
 
 for(clazz in gene.classes){
-	NToverview.sub = NToverview[grepl(clazz, NToverview$best_match),]
+	NToverview.sub = NToverview[grepl(clazz, paste("^", NToverview$best_match, sep="")),]
 	new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
 	new.col.y = sum(new.col.x)
 	new.col.z = round(new.col.x / new.col.y * 100, 2)
--- a/wrapper.sh	Mon May 30 10:11:20 2016 -0400
+++ b/wrapper.sh	Tue May 31 08:30:50 2016 -0400
@@ -21,6 +21,8 @@
 
 echo "unpacking IMGT file"
 
+
+
 type="`file $input`"
 if [[ "$type" == *"Zip archive"* ]] ; then
 	echo "Zip archive"
@@ -41,8 +43,6 @@
 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
 
-
-
 #cat $PWD/files/*/1_* > $PWD/summary.txt
 #cat $PWD/files/*/3_* > $PWD/sequences.txt
 #cat $PWD/files/*/5_* > $PWD/aa.txt
@@ -56,22 +56,17 @@
 echo "${BLASTN_DIR}"
 
 echo "identification ($method)"
-echo "identification ($method)<br />" >> $output
-
-echo "blast or custom"
+echo "---------------- identification ($method) ----------------"
+echo "---------------- identification ($method) ----------------<br />" >> $output
 
 if [[ "${method}" == "custom" ]] ; then
-	echo "custom"
 	python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
 else
-	echo "blast"
 	ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
 	ID_index=$((ID_index+1))
 	sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
 	sequence_index=$((sequence_index+1))
 	
-	echo "$ID_index ${sequence_index}"
-	
 	cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
 	
 	echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
@@ -110,6 +105,7 @@
 mkdir $outdir/sequence_overview
 
 Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
+#Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
 
 echo "<table border='1'>" > $outdir/base_overview.html
 
@@ -153,7 +149,7 @@
 	tmp=`cat $outdir/unmatched_${func}_n.txt`
 	echo "<th><a href='unmatched.txt'>unmatched (N = ${unmatched_count})</a></th>" >> $output
 	tmp=`cat $outdir/all_${func}_n.txt`
-	echo "<th><a href='matched_${func}_all.txt'>all (N = $tmp)</a></th>" >> $output
+	echo "<th><a href='matched_all_${func}.txt'>all (N = $tmp)</a></th>" >> $output
 
 	while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz
 	do