# HG changeset patch
# User davidvanzessen
# Date 1446474013 18000
# Node ID 1b45c7d7d94175f878c39eadd83fe77940ca572d
# Parent  e022c21f8c479df0392d4b71097cdc52ad459946
Uploaded

diff -r e022c21f8c47 -r 1b45c7d7d941 merge_and_filter.r
--- a/merge_and_filter.r	Mon Nov 02 07:10:36 2015 -0500
+++ b/merge_and_filter.r	Mon Nov 02 09:20:13 2015 -0500
@@ -2,7 +2,7 @@
 
 
 summaryfile = args[1]
-junctionfile = args[2]
+sequencesfile = args[2]
 mutationanalysisfile = args[3]
 mutationstatsfile = args[4]
 hotspotsfile = args[5]
@@ -14,7 +14,7 @@
 unique_type=args[11]
 
 summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
-junctions = read.table(junctionfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
+sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 hotspots = read.table(hotspotsfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
@@ -98,10 +98,10 @@
 print(paste("Number of rows in unmatched:", nrow(unmatched)))
 
 
-#remove the sequences that have an 'n' (or 'N') in the junction.
-junctions = junctions[grepl("n|N", junctions$JUNCTION),]
+#remove the sequences that have an 'n' (or 'N') in any of the FR2, FR3, CDR1 or CDR2 regions.
+sequences = sequences[grepl("n|N", sequences$FR2.IMGT) | grepl("n|N", sequences$FR3.IMGT) | grepl("n|N", sequences$CDR1.IMGT) | grepl("n|N", sequences$CDR2.IMGT),]
 
-result = result[!(result$Sequence.ID %in% junctions$Sequence.ID),]
+result = result[!(result$Sequence.ID %in% sequences$Sequence.ID),]
 
 write.table(x=result, file=output, sep="\t",quote=F,row.names=F,col.names=T)
 write.table(x=unmatched, file=unmatchedfile, sep="\t",quote=F,row.names=F,col.names=T)
diff -r e022c21f8c47 -r 1b45c7d7d941 wrapper.sh
--- a/wrapper.sh	Mon Nov 02 07:10:36 2015 -0500
+++ b/wrapper.sh	Mon Nov 02 09:20:13 2015 -0500
@@ -24,6 +24,7 @@
 fi
 
 cat $PWD/files/*/1_* > $PWD/summary.txt
+cat $PWD/files/*/3_* > $PWD/sequences.txt
 cat $PWD/files/*/6_* > $PWD/junction.txt
 cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
 cat $PWD/files/*/8_* > $PWD/mutationstats.txt
@@ -54,7 +55,7 @@
 
 
 echo "merging"
-Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/junction.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique
+Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique
 
 genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm"
 echo "R mutation analysis"