changeset 62:4262e880472d draft

Uploaded
author davidvanzessen
date Fri, 25 Mar 2016 04:39:18 -0400
parents 64e6a7803e07
children a7381fd96dad
files gene_identification.py merge_and_filter.r wrapper.sh
diffstat 3 files changed, 77 insertions(+), 30 deletions(-) [+]
line wrap: on
line diff
--- a/gene_identification.py	Fri Mar 18 08:17:08 2016 -0400
+++ b/gene_identification.py	Fri Mar 25 04:39:18 2016 -0400
@@ -41,10 +41,11 @@
 print "Number of input sequences:", len(dic)
 
 #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc
+#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag
 
 #lambda/kappa reference sequence
 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",
-                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag",
+                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",
                  "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence
 
 compiledregex = {"ca": [],
@@ -59,6 +60,12 @@
 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'}
 
+#remove last snp for shorter cg sequence --- note, also change varsInCG
+del cg1[132]
+del cg2[132]
+del cg3[132]
+del cg4[132]
+
 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap
 chunklength = 8
 
@@ -152,7 +159,7 @@
 chunksInCM = len(compiledregex["cm"])
 requiredChunkPercentage = 0.7
 varsInCA = float(len(ca1.keys()) * 2)
-varsInCG = float(len(cg1.keys()) * 2) + 1
+varsInCG = float(len(cg1.keys()) * 2) - 1 # -1 because the sliding window doesn't hit the first nt twice
 varsInCM = 0
 requiredVarPercentage = 0.7
 
--- a/merge_and_filter.r	Fri Mar 18 08:17:08 2016 -0400
+++ b/merge_and_filter.r	Fri Mar 25 04:39:18 2016 -0400
@@ -32,10 +32,14 @@
 	
 }
 
+print(paste("Number of sequences in summary file:", nrow(summ)))
+
 summ = merge(summ, gene_identification, by="Sequence.ID")
 
 summ = summ[summ$Functionality != "No results",]
 
+print(paste("Number of sequences after merging with annotation:", nrow(summ)))
+
 if(functionality == "productive"){
 	summ = summ[summ$Functionality == "productive (see comment)" | summ$Functionality == "productive",]
 } else if (functionality == "unproductive"){
@@ -44,20 +48,35 @@
 	summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",]
 }
 
+print(paste("Number of sequences after productive filter:", nrow(summ)))
+
 higher_than=(summ$chunk_hit_percentage >= 70 & summ$nt_hit_percentage >= 70)
+
+print(paste("test: ", sum(higher_than), sum(!higher_than), (sum(higher_than) + sum(!higher_than))))
+
 unmatched = summ[!higher_than,]
 unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
 unmatched$best_match = paste("unmatched,", unmatched$best_match)
 summ = summ[higher_than,]
 
+print(paste("Number of matched sequences:", nrow(summ)))
+
 if(length(summ$Sequence.ID) == 0){
 	stop("No data remaining after filter")
 }
 
 result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID")
+
+print(paste("Number of sequences after merging with mutation analysis:", nrow(result)))
+
 result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID")
+
+print(paste("Number of sequences after merging with mutation stats:", nrow(result)))
+
 result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID")
 
+print(paste("Number of sequences after merging with hotspots:", nrow(result)))
+
 cleanup_columns = c("FR1.IMGT.Nb.of.mutations", 
                     "CDR1.IMGT.Nb.of.mutations", 
                     "FR2.IMGT.Nb.of.mutations", 
@@ -75,6 +94,27 @@
 result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
 result$JGene = gsub("[*].*", "", result$JGene)
 
+if(filter_unique != "no"){
+	clmns = names(result)
+	
+	result$unique.def = paste(result$CDR1.Seq, result$CDR2.Seq, result$CDR3.Seq, result$FR1.IMGT, result$FR2.IMGT, result$FR3.IMGT)
+	result.filtered = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
+	fltr = result$Sequence.ID %in% result.filtered$Sequence.ID
+	#fltr = result$unique.def %in% result.filtered$unique.def
+	
+	result = result[fltr,]
+	
+	if(filter_unique == "keep"){
+		result = result[!fltr,]
+	}
+	
+	result = result[,clmns]
+	
+	#write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
+}
+
+print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
+
 #result$past = paste(result$AA.JUNCTION, result$VGene, result$JGene, (result$FR1.IMGT.Nb.of.mutations + result$CDR1.IMGT.Nb.of.mutations + result$FR2.IMGT.Nb.of.mutations + result$CDR2.IMGT.Nb.of.mutations + result$FR3.IMGT.Nb.of.mutations), result$best_match)
 if(unique_type == "AA.JUNCTION_V_subclass"){
 	result$past = paste(result$AA.JUNCTION, result$VGene, result$best_match)
@@ -90,41 +130,21 @@
 	result$past = 1:nrow(result)
 }
 
-print(paste("filter uniques: ", filter_unique))
-
-if(filter_unique != "no"){
-	clmns = names(result)
-	
-	result$unique.def = paste(result$CDR1.Seq, result$CDR2.Seq, result$CDR3.Seq, result$FR1.IMGT, result$FR2.IMGT, result$FR3.IMGT)
-	result.filtered = result[duplicated(result$unique.def),]
-	fltr = result$unique.def %in% result.filtered$unique.def
-	
-	result.removed = result[!fltr,]
-	
-	result = result[fltr,]
-	
-	if(filter_unique == "keep"){
-		result = result.removed
-	}
-	
-	result = result[,clmns]
-	
-	#write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
-}
-
-
-result = result[!duplicated(result$past), ]
+result = result[!(duplicated(result$past)), ]
 
 result = result[,!(names(result) %in% c("past"))]
 
-print(paste("Number of rows in result:", nrow(result)))
-print(paste("Number of rows in unmatched:", nrow(unmatched)))
-
+print(paste("Number of sequences in result after", unique_type, "filtering:", nrow(result)))
 
 #remove the sequences that have an 'n' (or 'N') in the FR2, FR3, CDR1 and CDR2 regions.
 sequences = sequences[grepl("n|N", sequences$FR2.IMGT) | grepl("n|N", sequences$FR3.IMGT) | grepl("n|N", sequences$CDR1.IMGT) | grepl("n|N", sequences$CDR2.IMGT),]
 
 result = result[!(result$Sequence.ID %in% sequences$Sequence.ID),]
 
+print(paste("Number of sequences in result after n filtering:", nrow(result)))
+
+print(paste("Number of rows in result:", nrow(result)))
+print(paste("Number of rows in unmatched:", nrow(unmatched)))
+
 write.table(x=result, file=output, sep="\t",quote=F,row.names=F,col.names=T)
 write.table(x=unmatched, file=unmatchedfile, sep="\t",quote=F,row.names=F,col.names=T)
--- a/wrapper.sh	Fri Mar 18 08:17:08 2016 -0400
+++ b/wrapper.sh	Fri Mar 25 04:39:18 2016 -0400
@@ -14,6 +14,7 @@
 mkdir $outdir
 
 echo "---------------- read parameters ----------------"
+echo "---------------- read parameters ----------------" > $output
 
 echo "unpacking IMGT file"
 
@@ -42,6 +43,7 @@
 echo "${BLASTN_DIR}"
 
 echo "identification ($method)"
+echo "identification ($method)" >> $output
 
 echo "blast or custom"
 
@@ -64,10 +66,12 @@
 fi
 
 echo "---------------- merge_and_filter.r ----------------"
+echo "---------------- merge_and_filter.r ----------------" >> $output
 
 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique}
 
 echo "---------------- mutation_analysis.r ----------------"
+echo "---------------- mutation_analysis.r ----------------" >> $output
 
 genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm"
 echo "R mutation analysis"
@@ -79,11 +83,13 @@
 
 
 echo "---------------- mutation_analysis.py ----------------"
+echo "---------------- mutation_analysis.py ----------------" >> $output
 
 python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
 echo "R AA histogram"
 
 echo "---------------- aa_histogram.r ----------------"
+echo "---------------- aa_histogram.r ----------------" >> $output
 
 Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
 
@@ -92,7 +98,21 @@
 funcs=(sum mean median)
 
 
-echo "<html><center><h1>$title</h1></center>" >> $output
+echo "<html><center><h1>$title</h1></center>" > $output
+
+#display the matched/unmatched for clearity
+
+matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
+unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
+total_count=$((matched_count + unmatched_count))
+perc_count=$((unmatched_count / total_count * 100))
+perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
+perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`
+
+echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
+echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
+echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output
+
 echo "---------------- main tables ----------------"
 for func in ${funcs[@]}
 do