# HG changeset patch # User davidvanzessen # Date 1465289150 14400 # Node ID 53fb2948726e09f83a81e3dc2506456629be417b # Parent b869a126e2c4b3d199b935ee0f8521b393eb11f7 Uploaded diff -r b869a126e2c4 -r 53fb2948726e gene_identification.py --- a/gene_identification.py Mon Jun 06 08:26:54 2016 -0400 +++ b/gene_identification.py Tue Jun 07 04:45:50 2016 -0400 @@ -160,7 +160,7 @@ chunksInCM = len(compiledregex["cm"]) requiredChunkPercentage = 0.7 varsInCA = float(len(ca1.keys()) * 2) -varsInCG = float(len(cg1.keys()) * 2) - 2 # -1 because the sliding window doesn't hit the first nt twice +varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice varsInCM = 0 diff -r b869a126e2c4 -r 53fb2948726e merge_and_filter.r --- a/merge_and_filter.r Mon Jun 06 08:26:54 2016 -0400 +++ b/merge_and_filter.r Tue Jun 07 04:45:50 2016 -0400 @@ -140,20 +140,23 @@ write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) if(filter_unique != "no"){ - #clmns = names(result) + clmns = names(result) if(grepl("_c", filter_unique)){ result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq, result$best_match) } else { result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) + } #fltr = result$unique.def %in% result.filtered$unique.def if(grepl("keep", filter_unique)){ + result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes result = result[!duplicated(result$unique.def),] } else { result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] + result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes result = result[!duplicated(result$unique.def),] }