diff wrapper.sh @ 57:16c7fc1c4bf8 draft

Uploaded
author davidvanzessen
date Fri, 18 Mar 2016 07:50:34 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wrapper.sh	Fri Mar 18 07:50:34 2016 -0400
@@ -0,0 +1,210 @@
+#!/bin/bash
+set -e
+dir="$(cd "$(dirname "$0")" && pwd)"
+input=$1
+method=$2
+output=$3
+outdir=$4
+title=$5
+include_fr1=$6
+functionality=$7
+unique=$8
+naive_output=$9
+filter_unique=${10}
+mkdir $outdir
+
+echo "---------------- read parameters ----------------"
+
+echo "unpacking IMGT file"
+
+type="`file $input`"
+if [[ "$type" == *"Zip archive"* ]] ; then
+	echo "Zip archive"
+	echo "unzip $input -d $PWD/files/"
+	unzip $input -d $PWD/files/
+elif [[ "$type" == *"XZ compressed data"* ]] ; then
+	echo "ZX archive"
+	echo "tar -xJf $input -C $PWD/files/"
+	mkdir -p $PWD/files/$title
+	tar -xJf $input -C $PWD/files/$title
+fi
+
+cat $PWD/files/*/1_* > $PWD/summary.txt
+cat $PWD/files/*/3_* > $PWD/sequences.txt
+cat $PWD/files/*/5_* > $PWD/aa.txt
+cat $PWD/files/*/6_* > $PWD/junction.txt
+cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
+cat $PWD/files/*/8_* > $PWD/mutationstats.txt
+cat $PWD/files/*/10_* > $PWD/hotspots.txt
+
+#BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
+
+echo "${BLASTN_DIR}"
+
+echo "identification ($method)"
+
+echo "blast or custom"
+
+if [[ "${method}" == "custom" ]] ; then
+	echo "custom"
+	python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
+else
+	echo "blast"
+	ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
+	ID_index=$((ID_index+1))
+	sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
+	sequence_index=$((sequence_index+1))
+	
+	echo "$ID_index ${sequence_index}"
+	
+	cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
+	
+	echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
+	${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
+fi
+
+echo "---------------- merge_and_filter.r ----------------"
+
+Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique}
+
+echo "---------------- mutation_analysis.r ----------------"
+
+genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm"
+echo "R mutation analysis"
+Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1
+
+#echo "." > $output
+#exit 0
+
+
+
+echo "---------------- mutation_analysis.py ----------------"
+
+python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
+echo "R AA histogram"
+
+echo "---------------- aa_histogram.r ----------------"
+
+Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
+
+genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
+
+funcs=(sum mean median)
+
+
+echo "<html><center><h1>$title</h1></center>" >> $output
+echo "---------------- main tables ----------------"
+for func in ${funcs[@]}
+do
+
+	echo "---------------- $func table ----------------"
+	
+	cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/result.txt
+
+	echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output
+	echo "<tr><th>info</th>" >> $output
+	for gene in ${genes[@]}
+	do
+		tmp=`cat $outdir/${gene}_${func}_n.txt`
+		echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
+	done
+	tmp=`cat $outdir/all_${func}_n.txt`
+	echo "<th><a href='matched_all.txt'>all (N = $tmp)</a></th>" >> $output
+
+	while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz
+	do
+		if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
+			echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
+		else
+			echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
+		fi
+	done < $outdir/result.txt
+
+done
+
+echo "---------------- download links ----------------"
+
+echo "</table>" >> $output
+echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
+echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
+echo "<a href='merged.txt'>all data</a><br />" >> $output
+echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
+echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
+echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output
+
+echo "---------------- images ----------------"
+
+echo "<img src='all.png'/><br />" >> $output
+echo "<a href='all.txt'>download data</a><br />" >> $output
+if [ -a $outdir/ca.png ] 
+then
+	echo "<img src='ca.png'/><br />" >> $output
+	echo "<a href='ca.txt'>download data</a><br />" >> $output
+fi
+if [ -a $outdir/cg.png ]
+then
+	echo "<img src='cg.png'/><br />" >> $output
+	echo "<a href='cg.txt'>download data</a><br />" >> $output
+fi
+if [ -a $outdir/scatter.png ]
+then
+	echo "<img src='scatter.png'/><br />" >> $output
+	echo "<a href='scatter.txt'>download data</a><br />" >> $output
+fi
+if [ -a $outdir/frequency_ranges.png ]
+then
+	echo "<img src='frequency_ranges.png'/><br />" >> $output
+	echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
+	echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
+fi
+if [ -a $outdir/aa_histogram.png ]
+then
+	echo "<img src='aa_histogram.png'/><br />" >> $output
+	echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
+fi
+
+for gene in ${genes[@]}
+do
+	echo "<table border='1'><caption>$gene transition table</caption>" >> $output
+	while IFS=, read from a c g t
+		do
+			echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
+	done < $outdir/transitions_${gene}_sum.txt
+	echo "</table>" >> $output
+done
+
+echo "<table border='1'><caption>All transition table</caption>" >> $output
+while IFS=, read from a c g t
+	do
+		echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
+done < $outdir/transitions_all_sum.txt
+echo "</table>" >> $output
+
+echo "</html>" >> $output
+
+
+#optional output for naive
+
+echo "---------------- aa_histogram.r ----------------"
+
+if [[ "$naive_output" != "None" ]]
+then
+	echo "---------------- imgt_loader.r ----------------"
+	#python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
+	Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt ${naive_output} 2>&1
+	echo "---------------- naive_output.r ----------------"
+	Rscript $dir/naive_output.r $naive_output $outdir/merged.txt $naive_output 2>&1
+fi
+
+
+
+
+
+
+
+
+
+
+
+#rm $outdir/HS12RSS.txt
+#rm $outdir/HS23RSS.txt