# HG changeset patch # User davidvanzessen # Date 1465980521 14400 # Node ID 6e8dfbe164c62770b1badade0649e4cd5cfec53f # Parent 925efcd00c5855be8604e19a282a8098c35df1a0 Uploaded diff -r 925efcd00c58 -r 6e8dfbe164c6 merge_and_filter.r --- a/merge_and_filter.r Wed Jun 08 03:58:40 2016 -0400 +++ b/merge_and_filter.r Wed Jun 15 04:48:41 2016 -0400 @@ -116,6 +116,10 @@ print(paste("Number of sequences in result after merging with sequences:", nrow(result))) +result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] + +print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result))) + result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] print(paste("Number of sequences in result after n filtering:", nrow(result))) diff -r 925efcd00c58 -r 6e8dfbe164c6 tmp/igat.r --- a/tmp/igat.r Wed Jun 08 03:58:40 2016 -0400 +++ b/tmp/igat.r Wed Jun 15 04:48:41 2016 -0400 @@ -5,6 +5,8 @@ merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F) +merged = merged[!grepl("unmatched", merged$best_match),] + for(f in list.files(imgt.dir, pattern="*.txt$")){ print(paste("filtering", f)) path = paste(imgt.dir, f, sep="")