# HG changeset patch
# User davidvanzessen
# Date 1464697850 14400
# Node ID 480fdd383fdbec050e5d02fc9948e4d522724688
# Parent d57c624a9aa9551b5234884bd41b6d038a5cbae0
Uploaded
diff -r d57c624a9aa9 -r 480fdd383fdb merge_and_filter.r
--- a/merge_and_filter.r Mon May 30 10:11:20 2016 -0400
+++ b/merge_and_filter.r Tue May 31 08:30:50 2016 -0400
@@ -87,48 +87,6 @@
print(paste("Number of sequences after merging with hotspots:", nrow(result)))
-cleanup_columns = c("FR1.IMGT.Nb.of.mutations",
- "CDR1.IMGT.Nb.of.mutations",
- "FR2.IMGT.Nb.of.mutations",
- "CDR2.IMGT.Nb.of.mutations",
- "FR3.IMGT.Nb.of.mutations")
-
-for(col in cleanup_columns){
- result[,col] = gsub("\\(.*\\)", "", result[,col])
- result[,col] = as.numeric(result[,col])
- result[is.na(result[,col]),] = 0
-}
-
-result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele)
-result$VGene = gsub("[*].*", "", result$VGene)
-result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
-result$JGene = gsub("[*].*", "", result$JGene)
-
-if(filter_unique != "no"){
- clmns = names(result)
-
- if(grepl("_c", filter_unique)){
- result$unique.def = paste(result$CDR1.IMGT, result$CDR2.IMGT, result$CDR3.IMGT, result$FR2.IMGT, result$FR3.IMGT, result$best_match)
- } else {
- result$unique.def = paste(result$CDR1.IMGT, result$CDR2.IMGT, result$CDR3.IMGT, result$FR2.IMGT, result$FR3.IMGT)
- }
-
- #fltr = result$unique.def %in% result.filtered$unique.def
-
- if(grepl("keep", filter_unique)){
- result = result[!duplicated(result$unique.def),]
- } else {
- result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
- result = result[!duplicated(result$unique.def),]
- }
-
- result = result[,clmns]
-
- #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
-}
-
-print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
-
#result$past = paste(result$AA.JUNCTION, result$VGene, result$JGene, (result$FR1.IMGT.Nb.of.mutations + result$CDR1.IMGT.Nb.of.mutations + result$FR2.IMGT.Nb.of.mutations + result$CDR2.IMGT.Nb.of.mutations + result$FR3.IMGT.Nb.of.mutations), result$best_match)
if(unique_type == "AA.JUNCTION_V_subclass"){
result$past = paste(result$AA.JUNCTION, result$VGene, result$best_match)
@@ -157,6 +115,50 @@
print(paste("Number of sequences in result after n filtering:", nrow(result)))
+cleanup_columns = c("FR1.IMGT.Nb.of.mutations",
+ "CDR1.IMGT.Nb.of.mutations",
+ "FR2.IMGT.Nb.of.mutations",
+ "CDR2.IMGT.Nb.of.mutations",
+ "FR3.IMGT.Nb.of.mutations")
+
+for(col in cleanup_columns){
+ result[,col] = gsub("\\(.*\\)", "", result[,col])
+ result[,col] = as.numeric(result[,col])
+ result[is.na(result[,col]),] = 0
+}
+
+result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele)
+result$VGene = gsub("[*].*", "", result$VGene)
+result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
+result$JGene = gsub("[*].*", "", result$JGene)
+
+write.table(result, "before_unique_filter.txt", sep="\t",quote=F,row.names=F,col.names=T)
+
+if(filter_unique != "no"){
+ #clmns = names(result)
+
+ if(grepl("_c", filter_unique)){
+ result$unique.def = paste(result$CDR1.IMGT, result$FR2.IMGT, result$CDR2.IMGT, result$FR3.IMGT, result$CDR3.IMGT, result$best_match)
+ } else {
+ result$unique.def = paste(result$CDR1.IMGT, result$FR2.IMGT, result$CDR2.IMGT, result$FR3.IMGT, result$CDR3.IMGT)
+ }
+
+ #fltr = result$unique.def %in% result.filtered$unique.def
+
+ if(grepl("keep", filter_unique)){
+ result = result[!duplicated(result$unique.def),]
+ } else {
+ result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
+ result = result[!duplicated(result$unique.def),]
+ }
+
+ #result = result[,clmns]
+
+ #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
+}
+
+print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
+
print(paste("Number of rows in result:", nrow(result)))
print(paste("Number of rows in unmatched:", nrow(unmatched)))
diff -r d57c624a9aa9 -r 480fdd383fdb sequence_overview.r
--- a/sequence_overview.r Mon May 30 10:11:20 2016 -0400
+++ b/sequence_overview.r Tue May 31 08:30:50 2016 -0400
@@ -143,7 +143,7 @@
NTresult = data.frame(nt=c("A", "C", "T", "G"))
for(clazz in gene.classes){
- NToverview.sub = NToverview[grepl(clazz, NToverview$best_match),]
+ NToverview.sub = NToverview[grepl(clazz, paste("^", NToverview$best_match, sep="")),]
new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
new.col.y = sum(new.col.x)
new.col.z = round(new.col.x / new.col.y * 100, 2)
diff -r d57c624a9aa9 -r 480fdd383fdb wrapper.sh
--- a/wrapper.sh Mon May 30 10:11:20 2016 -0400
+++ b/wrapper.sh Tue May 31 08:30:50 2016 -0400
@@ -21,6 +21,8 @@
echo "unpacking IMGT file"
+
+
type="`file $input`"
if [[ "$type" == *"Zip archive"* ]] ; then
echo "Zip archive"
@@ -41,8 +43,6 @@
cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
-
-
#cat $PWD/files/*/1_* > $PWD/summary.txt
#cat $PWD/files/*/3_* > $PWD/sequences.txt
#cat $PWD/files/*/5_* > $PWD/aa.txt
@@ -56,22 +56,17 @@
echo "${BLASTN_DIR}"
echo "identification ($method)"
-echo "identification ($method)
" >> $output
-
-echo "blast or custom"
+echo "---------------- identification ($method) ----------------"
+echo "---------------- identification ($method) ----------------
" >> $output
if [[ "${method}" == "custom" ]] ; then
- echo "custom"
python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
else
- echo "blast"
ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
ID_index=$((ID_index+1))
sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
sequence_index=$((sequence_index+1))
- echo "$ID_index ${sequence_index}"
-
cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
@@ -110,6 +105,7 @@
mkdir $outdir/sequence_overview
Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
+#Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
echo "
unmatched (N = ${unmatched_count}) | " >> $output tmp=`cat $outdir/all_${func}_n.txt` - echo "all (N = $tmp) | " >> $output + echo "all (N = $tmp) | " >> $output while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz do
---|