changeset 93:53fb2948726e draft

Uploaded
author davidvanzessen
date Tue, 07 Jun 2016 04:45:50 -0400
parents b869a126e2c4
children e39176ccddc8
files gene_identification.py merge_and_filter.r
diffstat 2 files changed, 5 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/gene_identification.py	Mon Jun 06 08:26:54 2016 -0400
+++ b/gene_identification.py	Tue Jun 07 04:45:50 2016 -0400
@@ -160,7 +160,7 @@
 chunksInCM = len(compiledregex["cm"])
 requiredChunkPercentage = 0.7
 varsInCA = float(len(ca1.keys()) * 2)
-varsInCG = float(len(cg1.keys()) * 2) - 2 # -1 because the sliding window doesn't hit the first nt twice
+varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice
 varsInCM = 0
 
 
--- a/merge_and_filter.r	Mon Jun 06 08:26:54 2016 -0400
+++ b/merge_and_filter.r	Tue Jun 07 04:45:50 2016 -0400
@@ -140,20 +140,23 @@
 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
 
 if(filter_unique != "no"){
-	#clmns = names(result)
+	clmns = names(result)
 	
 	if(grepl("_c", filter_unique)){
 		result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq, result$best_match)
 	} else {
 		result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+		
 	}
 
 	#fltr = result$unique.def %in% result.filtered$unique.def
 		
 	if(grepl("keep", filter_unique)){
+		result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
 		result = result[!duplicated(result$unique.def),]
 	} else {
 		result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
+		result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
 		result = result[!duplicated(result$unique.def),]
 	}