Mercurial > repos > davidvanzessen > mutation_analysis
view wrapper.sh @ 100:ff5be711382b draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 17 Jun 2016 05:36:32 -0400 |
parents | 86206431cbb0 |
children | 3cffb8a38bb1 |
line wrap: on
line source
#!/bin/bash set -e dir="$(cd "$(dirname "$0")" && pwd)" input=$1 method=$2 output=$3 outdir=$4 title=$5 include_fr1=$6 functionality=$7 unique=$8 naive_output_ca=$9 naive_output_cg=${10} naive_output_cm=${11} filter_unique=${12} class_filter=${13} mkdir $outdir echo "---------------- read parameters ----------------" echo "---------------- read parameters ----------------<br />" > $output echo "unpacking IMGT file" type="`file $input`" if [[ "$type" == *"Zip archive"* ]] ; then echo "Zip archive" echo "unzip $input -d $PWD/files/" unzip $input -d $PWD/files/ elif [[ "$type" == *"XZ compressed data"* ]] ; then echo "ZX archive" echo "tar -xJf $input -C $PWD/files/" mkdir -p $PWD/files/$title tar -xJf $input -C $PWD/files/$title fi cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt #cat $PWD/files/*/1_* > $PWD/summary.txt #cat $PWD/files/*/3_* > $PWD/sequences.txt #cat $PWD/files/*/5_* > $PWD/aa.txt #cat $PWD/files/*/6_* > $PWD/junction.txt #cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt #cat $PWD/files/*/8_* > $PWD/mutationstats.txt #cat $PWD/files/*/10_* > $PWD/hotspots.txt #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin" echo "${BLASTN_DIR}" echo "---------------- identification ($method) ----------------" echo "---------------- identification ($method) ----------------<br />" >> $output if [[ "${method}" == "custom" ]] ; then python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt else ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l) ID_index=$((ID_index+1)) sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l) sequence_index=$((sequence_index+1)) cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt fi echo "---------------- merge_and_filter.r ----------------" echo "---------------- merge_and_filter.r ----------------<br />" >> $output Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} 2>&1 echo "---------------- creating new IMGT zip ----------------" echo "---------------- creating new IMGT zip ----------------<br />" >> $output mkdir $outdir/new_IMGT cat `find $PWD/files/ -name "1_*"` > "$outdir/new_IMGT/1_Summary.txt" cat `find $PWD/files/ -name "2_*"` > "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt" cat `find $PWD/files/ -name "3_*"` > "$outdir/new_IMGT/3_Nt-sequences.txt" cat `find $PWD/files/ -name "4_*"` > "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt" cat `find $PWD/files/ -name "5_*"` > "$outdir/new_IMGT/5_AA-sequences.txt" cat `find $PWD/files/ -name "6_*"` > "$outdir/new_IMGT/6_Junction.txt" cat `find $PWD/files/ -name "7_*"` > "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt" cat `find $PWD/files/ -name "8_*"` > "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt" cat `find $PWD/files/ -name "9_*"` > "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt" cat `find $PWD/files/ -name "10_*"` > "$outdir/new_IMGT/10_V-REGION-mutation-hotspots.txt" mkdir $outdir/new_IMGT_ca cp $outdir/new_IMGT/* $outdir/new_IMGT_ca mkdir $outdir/new_IMGT_cg cp $outdir/new_IMGT/* $outdir/new_IMGT_cg mkdir $outdir/new_IMGT_cm cp $outdir/new_IMGT/* $outdir/new_IMGT_cm Rscript $dir/tmp/igat.r $outdir/new_IMGT/ $outdir/merged.txt "-" 2>&1 Rscript $dir/tmp/igat.r $outdir/new_IMGT_ca/ $outdir/merged.txt "ca "2>&1 Rscript $dir/tmp/igat.r $outdir/new_IMGT_cg/ $outdir/merged.txt "cg "2>&1 Rscript $dir/tmp/igat.r $outdir/new_IMGT_cm/ $outdir/merged.txt "cm "2>&1 tmp="$PWD" cd $outdir/new_IMGT/ #tar weirdness... tar -cJf ../new_IMGT.txz * cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT/IgAT.xlsm zip -r ../IgAT.zip * cd $outdir/new_IMGT_ca/ tar -cJf ../new_IMGT_ca.txz * cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_ca/IgAT.xlsm zip -r ../IgAT_ca.zip * cd $outdir/new_IMGT_cg/ tar -cJf ../new_IMGT_cg.txz * cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_cg/IgAT.xlsm zip -r ../IgAT_cg.zip * cd $outdir/new_IMGT_cm/ tar -cJf ../new_IMGT_cm.txz * cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_cm/IgAT.xlsm zip -r ../IgAT_cm.zip * cd $tmp echo "---------------- mutation_analysis.r ----------------" echo "---------------- mutation_analysis.r ----------------<br />" >> $output classes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm,unmatched" echo "R mutation analysis" Rscript $dir/mutation_analysis.r $outdir/merged.txt $classes $outdir ${include_fr1} 2>&1 echo "---------------- mutation_analysis.py ----------------" echo "---------------- mutation_analysis.py ----------------<br />" >> $output python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $classes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt echo "---------------- aa_histogram.r ----------------" echo "---------------- aa_histogram.r ----------------<br />" >> $output Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm) funcs=(sum mean median) echo "---------------- sequence_overview.r ----------------" echo "---------------- sequence_overview.r ----------------" >> $output mkdir $outdir/sequence_overview #Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1 Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1 echo "<table border='1'>" > $outdir/base_overview.html while IFS=$'\t' read ID class seq A C G T do echo "<tr><td>$ID</td><td>$seq</td><td>$class</td><td>$A</td><td>$C</td><td>$G</td><td>$T</td></tr>" >> $outdir/base_overview.html done < $outdir/sequence_overview/ntoverview.txt echo "<html><center><h1>$title</h1></center>" > $output #display the matched/unmatched for clearity matched_count="`cat $outdir/merged.txt | grep -v 'unmatched' | tail -n +2 | wc -l`" unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`" total_count=$((matched_count + unmatched_count)) perc_count=$((unmatched_count / total_count * 100)) perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"` perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"` echo "<center><h2>Total: ${total_count}</h2></center>" >> $output echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output echo "---------------- main tables ----------------" for func in ${funcs[@]} do echo "---------------- $func table ----------------" cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt echo "<table border='1' width='100%'><caption><h3><a href='data_${func}.txt'>${func} table</a></h3></caption>" >> $output echo "<tr><th>info</th>" >> $output for gene in ${genes[@]} do tmp=`cat $outdir/${gene}_${func}_n.txt` echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output done tmp=`cat $outdir/unmatched_${func}_n.txt` echo "<th><a href='unmatched.txt'>unmatched (N = ${unmatched_count})</a></th>" >> $output tmp=`cat $outdir/all_${func}_n.txt` echo "<th><a href='matched_all_${func}.txt'>all (N = $tmp)</a></th>" >> $output while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz do if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output else echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${unx}/${uny} (${unz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output fi done < $outdir/data_${func}.txt echo "</table>" >> $output #echo "<a href='data_${func}.txt'>Download data</a>" >> $output done echo "---------------- download links ----------------" echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output echo "<a href='merged.txt'>all data</a><br />" >> $output echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output echo "<a href='sequence_overview/index.html'>Sequence Overview</a><br />" >> $output echo "<a href='base_overview.html'>Base overview</a><br />" >> $output echo "<a href='baseline.pdf'>Baseline PDF</a><br />" >> $output echo "<a href='baseline.txt'>Baseline Table</a><br />" >> $output echo "<a href='baseline_ca.pdf'>Baseline ca PDF</a><br />" >> $output echo "<a href='baseline_ca.txt'>Baseline ca Table</a><br />" >> $output echo "<a href='baseline_cg.pdf'>Baseline cg PDF</a><br />" >> $output echo "<a href='baseline_cg.txt'>Baseline cg Table</a><br />" >> $output echo "<a href='baseline_cm.pdf'>Baseline cm PDF</a><br />" >> $output echo "<a href='baseline_cm.txt'>Baseline cm Table</a><br />" >> $output echo "<a href='IgAT.zip'>IgAT zip</a><br />" >> $output echo "<a href='IgAT_ca.zip'>IgAT ca zip</a><br />" >> $output echo "<a href='IgAT_cg.zip'>IgAT cg zip</a><br />" >> $output echo "<a href='IgAT_cm.zip'>IgAT cm zip</a><br />" >> $output echo "<a href='new_IMGT.txz'>Filtered IMGT zip</a><br />" >> $output echo "<a href='new_IMGT_ca.txz'>Filtered ca IMGT zip</a><br />" >> $output echo "<a href='new_IMGT_cg.txz'>Filtered cg IMGT zip</a><br />" >> $output echo "<a href='new_IMGT_cm.txz'>Filtered cm IMGT zip</a><br />" >> $output echo "---------------- images ----------------" echo "<img src='all.png'/><br />" >> $output echo "<a href='all.txt'>download data</a><br />" >> $output if [ -a $outdir/ca.png ] then echo "<img src='ca.png'/><br />" >> $output echo "<a href='ca.txt'>download data</a><br />" >> $output fi if [ -a $outdir/cg.png ] then echo "<img src='cg.png'/><br />" >> $output echo "<a href='cg.txt'>download data</a><br />" >> $output fi if [ -a $outdir/scatter.png ] then echo "<img src='scatter.png'/><br />" >> $output echo "<a href='scatter.txt'>download data</a><br />" >> $output fi if [ -a $outdir/frequency_ranges.png ] then echo "<img src='frequency_ranges.png'/><br />" >> $output echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output fi if [ -a $outdir/aa_histogram.png ] then echo "<img src='aa_histogram.png'/><br />" >> $output echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output fi for gene in ${genes[@]} do echo "<table border='1'><caption>$gene transition table</caption>" >> $output while IFS=, read from a c g t do echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output done < $outdir/transitions_${gene}_sum.txt echo "</table>" >> $output done echo "<table border='1'><caption>All transition table</caption>" >> $output while IFS=, read from a c g t do echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output done < $outdir/transitions_all_sum.txt echo "</table>" >> $output echo "</html>" >> $output echo "---------------- baseline ----------------" bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "ca_cg_cm" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt" bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_ca.txz "ca" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_ca.pdf" "Sequence.ID" "$outdir/baseline_ca.txt" bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cg.txz "cg" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cg.pdf" "Sequence.ID" "$outdir/baseline_cg.txt" bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cm.txz "cm" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cm.pdf" "Sequence.ID" "$outdir/baseline_cm.txt" #optional output for naive echo "---------------- naive_output.r ----------------" if [[ "$naive_output" != "None" ]] then echo "---------------- imgt_loader.r ----------------" #python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt $outdir/loader_output.txt 2>&1 echo "---------------- naive_output.r ----------------" Rscript $dir/naive_output.r $outdir/loader_output.txt $outdir/merged.txt ${naive_output_ca} ${naive_output_cg} ${naive_output_cm} $outdir/ntoverview.txt $outdir/ntsum.txt 2>&1 fi echo "</table>" >> $outdir/base_overview.html echo "---------------- Done! ----------------"