Mercurial > repos > davidvanzessen > mutation_analysis

#!/bin/bash
set -e
dir="$(cd "$(dirname "$0")" && pwd)"
input=$1
method=$2
log=$3 #becomes the main html page at the end
outdir=$4
output="$outdir/index.html" #copied to $log location at the end
title=$5
include_fr1=$6
functionality=$7
unique=$8
naive_output_ca=$9
naive_output_cg=${10}
naive_output_cm=${11}
filter_unique=${12}
class_filter=${13}
mkdir $outdir

echo "---------------- read parameters ----------------"
echo "---------------- read parameters ----------------<br />" > $log

echo "unpacking IMGT file"


type="`file $input`"
if [[ "$type" == *"Zip archive"* ]] ; then
	echo "Zip archive"
	echo "unzip $input -d $PWD/files/"
	unzip $input -d $PWD/files/
elif [[ "$type" == *"XZ compressed data"* ]] ; then
	echo "ZX archive"
	echo "tar -xJf $input -C $PWD/files/"
	mkdir -p $PWD/files/$title
	tar -xJf $input -C $PWD/files/$title
fi

cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt

#cat $PWD/files/*/1_* > $PWD/summary.txt
#cat $PWD/files/*/3_* > $PWD/sequences.txt
#cat $PWD/files/*/5_* > $PWD/aa.txt
#cat $PWD/files/*/6_* > $PWD/junction.txt
#cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
#cat $PWD/files/*/8_* > $PWD/mutationstats.txt
#cat $PWD/files/*/10_* > $PWD/hotspots.txt

#BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"

echo "${BLASTN_DIR}"

echo "---------------- identification ($method) ----------------"
echo "---------------- identification ($method) ----------------<br />" >> $log

if [[ "${method}" == "custom" ]] ; then
	python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
else
	ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
	ID_index=$((ID_index+1))
	sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
	sequence_index=$((sequence_index+1))

	cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta

	echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
	${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
fi

echo "---------------- merge_and_filter.r ----------------"
echo "---------------- merge_and_filter.r ----------------<br />" >> $log

Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} 2>&1

echo "---------------- creating new IMGT zip ----------------"
echo "---------------- creating new IMGT zip ----------------<br />" >> $log

mkdir $outdir/new_IMGT

cat `find $PWD/files/ -name "1_*"` > "$outdir/new_IMGT/1_Summary.txt"
cat `find $PWD/files/ -name "2_*"` > "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt"
cat `find $PWD/files/ -name "3_*"` > "$outdir/new_IMGT/3_Nt-sequences.txt"
cat `find $PWD/files/ -name "4_*"` > "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt"
cat `find $PWD/files/ -name "5_*"` > "$outdir/new_IMGT/5_AA-sequences.txt"
cat `find $PWD/files/ -name "6_*"` > "$outdir/new_IMGT/6_Junction.txt"
cat `find $PWD/files/ -name "7_*"` > "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt"
cat `find $PWD/files/ -name "8_*"` > "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt"
cat `find $PWD/files/ -name "9_*"` > "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt"
cat `find $PWD/files/ -name "10_*"` > "$outdir/new_IMGT/10_V-REGION-mutation-hotspots.txt"

mkdir $outdir/new_IMGT_ca
cp $outdir/new_IMGT/* $outdir/new_IMGT_ca

mkdir $outdir/new_IMGT_cg
cp $outdir/new_IMGT/* $outdir/new_IMGT_cg

mkdir $outdir/new_IMGT_cm
cp $outdir/new_IMGT/* $outdir/new_IMGT_cm

Rscript $dir/tmp/igat.r $outdir/new_IMGT/ $outdir/merged.txt "-" 2>&1
Rscript $dir/tmp/igat.r $outdir/new_IMGT_ca/ $outdir/merged.txt "ca" 2>&1
Rscript $dir/tmp/igat.r $outdir/new_IMGT_cg/ $outdir/merged.txt "cg" 2>&1
Rscript $dir/tmp/igat.r $outdir/new_IMGT_cm/ $outdir/merged.txt "cm" 2>&1


tmp="$PWD"
cd $outdir/new_IMGT/ #tar weirdness...
tar -cJf ../new_IMGT.txz *
cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT/IgAT.xlsm
zip -r ../IgAT.zip *

cd $outdir/new_IMGT_ca/
tar -cJf ../new_IMGT_ca.txz *
cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_ca/IgAT.xlsm
zip -r ../IgAT_ca.zip *

cd $outdir/new_IMGT_cg/
tar -cJf ../new_IMGT_cg.txz *
cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_cg/IgAT.xlsm
zip -r ../IgAT_cg.zip *

cd $outdir/new_IMGT_cm/
tar -cJf ../new_IMGT_cm.txz *
cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_cm/IgAT.xlsm
zip -r ../IgAT_cm.zip *

cd $tmp

echo "---------------- mutation_analysis.r ----------------"
echo "---------------- mutation_analysis.r ----------------<br />" >> $log

classes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm,unmatched"
echo "R mutation analysis"
Rscript $dir/mutation_analysis.r $outdir/merged.txt $classes $outdir ${include_fr1} 2>&1


echo "---------------- mutation_analysis.py ----------------"
echo "---------------- mutation_analysis.py ----------------<br />" >> $log

python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $classes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt

echo "---------------- aa_histogram.r ----------------"
echo "---------------- aa_histogram.r ----------------<br />" >> $log

Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png "" 2>&1

echo "---------------- aa_histogram.r ca ----------------"
echo "---------------- aa_histogram.r ca ----------------<br />" >> $log

Rscript $dir/aa_histogram.r $outdir/aa_mutations_ca.txt $outdir/aa_histogram_ca.png "ca" 2>&1

echo "---------------- aa_histogram.r cg ----------------"
echo "---------------- aa_histogram.r cg ----------------<br />" >> $log

Rscript $dir/aa_histogram.r $outdir/aa_mutations_cg.txt $outdir/aa_histogram_cg.png "cg" 2>&1

echo "---------------- aa_histogram.r cm ----------------"
echo "---------------- aa_histogram.r cm ----------------<br />" >> $log

Rscript $dir/aa_histogram.r $outdir/aa_mutations_cm.txt $outdir/aa_histogram_cm.png "cm" 2>&1

genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)

funcs=(sum mean median)

echo "---------------- sequence_overview.r ----------------"
echo "---------------- sequence_overview.r ----------------<br />" >> $log

mkdir $outdir/sequence_overview

#Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1

echo "<table border='1'>" > $outdir/base_overview.html

while IFS=$'\t' read ID class seq A C G T
do
	echo "<tr><td>$ID</td><td>$seq</td><td>$class</td><td>$A</td><td>$C</td><td>$G</td><td>$T</td></tr>" >> $outdir/base_overview.html
done < $outdir/sequence_overview/ntoverview.txt

echo "<html><center><h1>$title</h1></center>" > $output

#display the matched/unmatched for clearity

matched_count="`cat $outdir/merged.txt | grep -v 'unmatched' | tail -n +2 | wc -l`"
unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
total_count=$((matched_count + unmatched_count))
perc_count=$((unmatched_count / total_count * 100))
perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`

echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output

echo "---------------- main tables ----------------"
echo "---------------- main tables ----------------<br />" >> $log
for func in ${funcs[@]}
do

	echo "---------------- $func table ----------------"
	echo "---------------- $func table ----------------<br />" >> $log

	cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt

	echo "<table border='1' width='100%'><caption><h3><a href='data_${func}.txt'>${func} table</a></h3></caption>" >> $output
	echo "<tr><th>info</th>" >> $output
	for gene in ${genes[@]}
	do
		tmp=`cat $outdir/${gene}_${func}_n.txt`
		echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
	done

	tmp=`cat $outdir/unmatched_${func}_n.txt`
	echo "<th><a href='unmatched.txt'>unmatched (N = ${unmatched_count})</a></th>" >> $output
	tmp=`cat $outdir/all_${func}_n.txt`
	echo "<th><a href='matched_all_${func}.txt'>all (N = $tmp)</a></th>" >> $output

	while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz
	do
		if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
			echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
		else
			echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${unx}/${uny} (${unz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
		fi
	done < $outdir/data_${func}.txt
	echo "</table>" >> $output
	#echo "<a href='data_${func}.txt'>Download data</a>" >> $output
done

echo "---------------- download links ----------------"
echo "---------------- download links ----------------<br />" >> $log


echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
echo "<a href='merged.txt'>all data</a><br />" >> $output
echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output
echo "<a href='sequence_overview/index.html'>Sequence Overview</a><br />" >> $output
echo "<a href='base_overview.html'>Base overview</a><br />" >> $output
echo "<a href='baseline.pdf'>Baseline PDF</a><br />" >> $output
echo "<a href='baseline.txt'>Baseline Table</a><br />" >> $output
echo "<a href='baseline_ca.pdf'>Baseline ca PDF</a><br />" >> $output
echo "<a href='baseline_ca.txt'>Baseline ca Table</a><br />" >> $output
echo "<a href='baseline_cg.pdf'>Baseline cg PDF</a><br />" >> $output
echo "<a href='baseline_cg.txt'>Baseline cg Table</a><br />" >> $output
echo "<a href='baseline_cm.pdf'>Baseline cm PDF</a><br />" >> $output
echo "<a href='baseline_cm.txt'>Baseline cm Table</a><br />" >> $output
echo "<a href='IgAT.zip'>IgAT zip</a><br />" >> $output
echo "<a href='IgAT_ca.zip'>IgAT ca zip</a><br />" >> $output
echo "<a href='IgAT_cg.zip'>IgAT cg zip</a><br />" >> $output
echo "<a href='IgAT_cm.zip'>IgAT cm zip</a><br />" >> $output
echo "<a href='new_IMGT.txz'>Filtered IMGT zip</a><br />" >> $output
echo "<a href='new_IMGT_ca.txz'>Filtered ca IMGT zip</a><br />" >> $output
echo "<a href='new_IMGT_cg.txz'>Filtered cg IMGT zip</a><br />" >> $output
echo "<a href='new_IMGT_cm.txz'>Filtered cm IMGT zip</a><br />" >> $output


echo "---------------- images ----------------"
echo "---------------- images ----------------<br />" >> $log

echo "<img src='all.png'/><br />" >> $output
echo "<a href='all.txt'>download data</a><br />" >> $output
if [ -a $outdir/ca.png ]
then
	echo "<img src='ca.png'/><br />" >> $output
	echo "<a href='ca.txt'>download data</a><br />" >> $output
fi
if [ -a $outdir/cg.png ]
then
	echo "<img src='cg.png'/><br />" >> $output
	echo "<a href='cg.txt'>download data</a><br />" >> $output
fi
if [ -a $outdir/scatter.png ]
then
	echo "<img src='scatter.png'/><br />" >> $output
	echo "<a href='scatter.txt'>download data</a><br />" >> $output
fi
if [ -a $outdir/frequency_ranges.png ]
then
	echo "<img src='frequency_ranges.png'/><br />" >> $output
	echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
	echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
fi
if [ -a $outdir/aa_histogram.png ]
then
	echo "<img src='aa_histogram.png'/><br />" >> $output
	echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
	echo "<img src='aa_histogram_ca.png'/><br />" >> $output
	echo "<a href='aa_histogram_ca.txt'>download data</a><br />" >> $output
	echo "<img src='aa_histogram_cg.png'/><br />" >> $output
	echo "<a href='aa_histogram_cg.txt'>download data</a><br />" >> $output
	echo "<img src='aa_histogram_cm.png'/><br />" >> $output
	echo "<a href='aa_histogram_cm.txt'>download data</a><br />" >> $output
fi

for gene in ${genes[@]}
do
	echo "<table border='1'><caption>$gene transition table</caption>" >> $output
	while IFS=, read from a c g t
		do
			echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
	done < $outdir/transitions_${gene}_sum.txt
	echo "</table>" >> $output
done

echo "<table border='1'><caption>All transition table</caption>" >> $output
while IFS=, read from a c g t
	do
		echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
done < $outdir/transitions_all_sum.txt
echo "</table>" >> $output

echo "</html>" >> $output

echo "---------------- baseline ----------------"
echo "---------------- baseline ----------------<br />" >> $log
tmp="$PWD"

mkdir $outdir/baseline


mkdir $outdir/baseline/ca_cg_cm
if [[ $(wc -l < $outdir/new_IMGT/1_Summary.txt) -gt "1" ]]; then
	cd $outdir/baseline/ca_cg_cm
	bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "ca_cg_cm" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt"
else
	echo "No sequences" > "$outdir/baseline.txt"
fi

mkdir $outdir/baseline/ca
if [[ $(wc -l < $outdir/new_IMGT_ca/1_Summary.txt) -gt "1" ]]; then
	cd $outdir/baseline/ca
	bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_ca.txz "ca" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_ca.pdf" "Sequence.ID" "$outdir/baseline_ca.txt"
else
	echo "No ca sequences" > "$outdir/baseline_ca.txt"
fi

mkdir $outdir/baseline/cg
if [[ $(wc -l < $outdir/new_IMGT_cg/1_Summary.txt) -gt "1" ]]; then
	cd $outdir/baseline/cg
	bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cg.txz "cg" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cg.pdf" "Sequence.ID" "$outdir/baseline_cg.txt"
else
	echo "No cg sequences" > "$outdir/baseline_cg.txt"
fi

mkdir $outdir/baseline/cm
if [[ $(wc -l < $outdir/new_IMGT_cm/1_Summary.txt) -gt "1" ]]; then
	cd $outdir/baseline/cm
	bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cm.txz "cm" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cm.pdf" "Sequence.ID" "$outdir/baseline_cm.txt"
else
	echo "No cm sequences" > "$outdir/baseline_cm.txt"
fi

cd $tmp

echo "---------------- naive_output.r ----------------"
echo "---------------- naive_output.r ----------------<br />" >> $log

if [[ "$naive_output" != "None" ]]
then
	echo "---------------- imgt_loader.r ----------------"
	echo "---------------- imgt_loader.r ----------------<br />" >> $log
	#python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
	Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt $outdir/loader_output.txt 2>&1

	echo "---------------- naive_output.r ----------------"
	echo "---------------- naive_output.r ----------------<br />" >> $log
	Rscript $dir/naive_output.r $outdir/loader_output.txt $outdir/merged.txt ${naive_output_ca} ${naive_output_cg} ${naive_output_cm} $outdir/ntoverview.txt $outdir/ntsum.txt 2>&1
fi

echo "</table>" >> $outdir/base_overview.html

mv $log $outdir/log.html

cp $outdir/index.html $log

echo "---------------- Done! ----------------"
echo "---------------- Done! ----------------<br />" >> $log
author	davidvanzessen
date	Thu, 23 Jun 2016 03:58:03 -0400
parents	603a10976e9c
children	01c9993865af