Mercurial > repos > davidvanzessen > mutation_analysis

#!/bin/bash
set -e
dir="$(cd "$(dirname "$0")" && pwd)"
input=$1
method=$2
output=$3
outdir=$4
title=$5
include_fr1=$6
functionality=$7
unique=$8
naive_output_ca=$9
naive_output_cg=${10}
naive_output_cm=${11}
filter_unique=${12}
class_filter=${13}
mkdir $outdir

echo "---------------- read parameters ----------------"
echo "---------------- read parameters ----------------<br />" > $output

echo "unpacking IMGT file"

type="`file $input`"
if [[ "$type" == *"Zip archive"* ]] ; then
	echo "Zip archive"
	echo "unzip $input -d $PWD/files/"
	unzip $input -d $PWD/files/
elif [[ "$type" == *"XZ compressed data"* ]] ; then
	echo "ZX archive"
	echo "tar -xJf $input -C $PWD/files/"
	mkdir -p $PWD/files/$title
	tar -xJf $input -C $PWD/files/$title
fi

cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt


#cat $PWD/files/*/1_* > $PWD/summary.txt
#cat $PWD/files/*/3_* > $PWD/sequences.txt
#cat $PWD/files/*/5_* > $PWD/aa.txt
#cat $PWD/files/*/6_* > $PWD/junction.txt
#cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
#cat $PWD/files/*/8_* > $PWD/mutationstats.txt
#cat $PWD/files/*/10_* > $PWD/hotspots.txt

#BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"

echo "${BLASTN_DIR}"

echo "identification ($method)"
echo "identification ($method)<br />" >> $output

echo "blast or custom"

if [[ "${method}" == "custom" ]] ; then
	echo "custom"
	python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
else
	echo "blast"
	ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
	ID_index=$((ID_index+1))
	sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
	sequence_index=$((sequence_index+1))

	echo "$ID_index ${sequence_index}"

	cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta

	echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
	${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
fi

echo "---------------- merge_and_filter.r ----------------"
echo "---------------- merge_and_filter.r ----------------<br />" >> $output

Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter}

echo "---------------- mutation_analysis.r ----------------"
echo "---------------- mutation_analysis.r ----------------<br />" >> $output

genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm"
echo "R mutation analysis"
Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1

#echo "." > $output
#exit 0


echo "---------------- mutation_analysis.py ----------------"
echo "---------------- mutation_analysis.py ----------------<br />" >> $output

python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
echo "R AA histogram"

echo "---------------- aa_histogram.r ----------------"
echo "---------------- aa_histogram.r ----------------<br />" >> $output

Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1

genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)

funcs=(sum mean median)


echo "<html><center><h1>$title</h1></center>" > $output

#display the matched/unmatched for clearity

matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
total_count=$((matched_count + unmatched_count))
perc_count=$((unmatched_count / total_count * 100))
perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`

echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output

echo "---------------- main tables ----------------"
for func in ${funcs[@]}
do

	echo "---------------- $func table ----------------"

	cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/result.txt

	echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output
	echo "<tr><th>info</th>" >> $output
	for gene in ${genes[@]}
	do
		tmp=`cat $outdir/${gene}_${func}_n.txt`
		echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
	done
	tmp=`cat $outdir/all_${func}_n.txt`
	echo "<th><a href='matched_all.txt'>all (N = $tmp)</a></th>" >> $output

	while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz
	do
		if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
			echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
		else
			echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
		fi
	done < $outdir/result.txt

done

echo "---------------- download links ----------------"

echo "</table>" >> $output
echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
echo "<a href='merged.txt'>all data</a><br />" >> $output
echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output

echo "---------------- images ----------------"

echo "<img src='all.png'/><br />" >> $output
echo "<a href='all.txt'>download data</a><br />" >> $output
if [ -a $outdir/ca.png ]
then
	echo "<img src='ca.png'/><br />" >> $output
	echo "<a href='ca.txt'>download data</a><br />" >> $output
fi
if [ -a $outdir/cg.png ]
then
	echo "<img src='cg.png'/><br />" >> $output
	echo "<a href='cg.txt'>download data</a><br />" >> $output
fi
if [ -a $outdir/scatter.png ]
then
	echo "<img src='scatter.png'/><br />" >> $output
	echo "<a href='scatter.txt'>download data</a><br />" >> $output
fi
if [ -a $outdir/frequency_ranges.png ]
then
	echo "<img src='frequency_ranges.png'/><br />" >> $output
	echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
	echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
fi
if [ -a $outdir/aa_histogram.png ]
then
	echo "<img src='aa_histogram.png'/><br />" >> $output
	echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
fi

for gene in ${genes[@]}
do
	echo "<table border='1'><caption>$gene transition table</caption>" >> $output
	while IFS=, read from a c g t
		do
			echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
	done < $outdir/transitions_${gene}_sum.txt
	echo "</table>" >> $output
done

echo "<table border='1'><caption>All transition table</caption>" >> $output
while IFS=, read from a c g t
	do
		echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
done < $outdir/transitions_all_sum.txt
echo "</table>" >> $output

echo "</html>" >> $output


#optional output for naive

echo "---------------- aa_histogram.r ----------------"

if [[ "$naive_output" != "None" ]]
then
	echo "---------------- imgt_loader.r ----------------"
	#python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
	Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt ${naive_output_ca} 2>&1
	echo "---------------- naive_output.r ----------------"
	Rscript $dir/naive_output.r $naive_output_ca $outdir/merged.txt $naive_output_ca $naive_output_cg $naive_output_cm 2>&1
fi

echo "---------------- sequence_overview.r ----------------"

mkdir $outdir/sequence_overview

Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/sequence_overview 2>&1


#rm $outdir/HS12RSS.txt
#rm $outdir/HS23RSS.txt
author	davidvanzessen
date	Mon, 09 May 2016 03:45:39 -0400
parents	7acdcd5c52ef
children	c5c86d15cb94