0
|
1 #!/bin/bash
|
4
|
2 set -e
|
0
|
3 dir="$(cd "$(dirname "$0")" && pwd)"
|
|
4 input=$1
|
19
|
5 method=$2
|
|
6 output=$3
|
|
7 outdir=$4
|
|
8 title=$5
|
22
|
9 include_fr1=$6
|
34
|
10 functionality=$7
|
|
11 unique=$8
|
69
|
12 naive_output_ca=$9
|
|
13 naive_output_cg=${10}
|
|
14 naive_output_cm=${11}
|
|
15 filter_unique=${12}
|
|
16 class_filter=${13}
|
0
|
17 mkdir $outdir
|
|
18
|
55
|
19 echo "---------------- read parameters ----------------"
|
63
|
20 echo "---------------- read parameters ----------------<br />" > $output
|
55
|
21
|
|
22 echo "unpacking IMGT file"
|
|
23
|
89
|
24
|
|
25
|
35
|
26 type="`file $input`"
|
|
27 if [[ "$type" == *"Zip archive"* ]] ; then
|
|
28 echo "Zip archive"
|
|
29 echo "unzip $input -d $PWD/files/"
|
|
30 unzip $input -d $PWD/files/
|
|
31 elif [[ "$type" == *"XZ compressed data"* ]] ; then
|
|
32 echo "ZX archive"
|
|
33 echo "tar -xJf $input -C $PWD/files/"
|
|
34 mkdir -p $PWD/files/$title
|
|
35 tar -xJf $input -C $PWD/files/$title
|
|
36 fi
|
|
37
|
64
|
38 cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
|
|
39 cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
|
|
40 cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
|
|
41 cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
|
|
42 cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
|
|
43 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
|
|
44 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
|
|
45
|
|
46 #cat $PWD/files/*/1_* > $PWD/summary.txt
|
|
47 #cat $PWD/files/*/3_* > $PWD/sequences.txt
|
|
48 #cat $PWD/files/*/5_* > $PWD/aa.txt
|
|
49 #cat $PWD/files/*/6_* > $PWD/junction.txt
|
|
50 #cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
|
|
51 #cat $PWD/files/*/8_* > $PWD/mutationstats.txt
|
|
52 #cat $PWD/files/*/10_* > $PWD/hotspots.txt
|
3
|
53
|
26
|
54 #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
|
19
|
55
|
|
56 echo "${BLASTN_DIR}"
|
|
57
|
89
|
58 echo "---------------- identification ($method) ----------------"
|
|
59 echo "---------------- identification ($method) ----------------<br />" >> $output
|
55
|
60
|
19
|
61 if [[ "${method}" == "custom" ]] ; then
|
|
62 python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
|
|
63 else
|
|
64 ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
|
|
65 ID_index=$((ID_index+1))
|
|
66 sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
|
|
67 sequence_index=$((sequence_index+1))
|
|
68
|
|
69 cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
|
|
70
|
|
71 echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
|
|
72 ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
|
|
73 fi
|
|
74
|
55
|
75 echo "---------------- merge_and_filter.r ----------------"
|
63
|
76 echo "---------------- merge_and_filter.r ----------------<br />" >> $output
|
19
|
77
|
90
|
78 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} 2>&1
|
0
|
79
|
55
|
80 echo "---------------- mutation_analysis.r ----------------"
|
63
|
81 echo "---------------- mutation_analysis.r ----------------<br />" >> $output
|
55
|
82
|
82
|
83 classes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm,unmatched"
|
4
|
84 echo "R mutation analysis"
|
82
|
85 Rscript $dir/mutation_analysis.r $outdir/merged.txt $classes $outdir ${include_fr1} 2>&1
|
53
|
86
|
55
|
87
|
|
88 echo "---------------- mutation_analysis.py ----------------"
|
63
|
89 echo "---------------- mutation_analysis.py ----------------<br />" >> $output
|
55
|
90
|
82
|
91 python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $classes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
|
55
|
92
|
|
93 echo "---------------- aa_histogram.r ----------------"
|
63
|
94 echo "---------------- aa_histogram.r ----------------<br />" >> $output
|
55
|
95
|
26
|
96 Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
|
4
|
97
|
0
|
98 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
|
|
99
|
53
|
100 funcs=(sum mean median)
|
0
|
101
|
82
|
102 echo "---------------- sequence_overview.r ----------------"
|
|
103
|
|
104 mkdir $outdir/sequence_overview
|
|
105
|
90
|
106 #Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
|
94
|
107 Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
|
82
|
108
|
|
109 echo "<table border='1'>" > $outdir/base_overview.html
|
|
110
|
92
|
111 while IFS=$'\t' read ID class seq A C G T
|
82
|
112 do
|
85
|
113 echo "<tr><td>$ID</td><td>$seq</td><td>$class</td><td>$A</td><td>$C</td><td>$G</td><td>$T</td></tr>" >> $outdir/base_overview.html
|
82
|
114 done < $outdir/sequence_overview/ntoverview.txt
|
|
115
|
53
|
116
|
62
|
117 echo "<html><center><h1>$title</h1></center>" > $output
|
|
118
|
|
119 #display the matched/unmatched for clearity
|
|
120
|
|
121 matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
|
|
122 unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
|
|
123 total_count=$((matched_count + unmatched_count))
|
|
124 perc_count=$((unmatched_count / total_count * 100))
|
|
125 perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
|
|
126 perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`
|
|
127
|
|
128 echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
|
|
129 echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
|
|
130 echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output
|
|
131
|
55
|
132 echo "---------------- main tables ----------------"
|
53
|
133 for func in ${funcs[@]}
|
4
|
134 do
|
55
|
135
|
|
136 echo "---------------- $func table ----------------"
|
|
137
|
94
|
138 cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt
|
53
|
139
|
|
140 echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output
|
58
|
141 echo "<tr><th>info</th>" >> $output
|
53
|
142 for gene in ${genes[@]}
|
|
143 do
|
|
144 tmp=`cat $outdir/${gene}_${func}_n.txt`
|
|
145 echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
|
|
146 done
|
78
|
147
|
|
148 tmp=`cat $outdir/unmatched_${func}_n.txt`
|
79
|
149 echo "<th><a href='unmatched.txt'>unmatched (N = ${unmatched_count})</a></th>" >> $output
|
53
|
150 tmp=`cat $outdir/all_${func}_n.txt`
|
89
|
151 echo "<th><a href='matched_all_${func}.txt'>all (N = $tmp)</a></th>" >> $output
|
4
|
152
|
78
|
153 while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz
|
53
|
154 do
|
|
155 if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
|
|
156 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
|
|
157 else
|
78
|
158 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${unx}/${uny} (${unz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
|
53
|
159 fi
|
94
|
160 done < $outdir/data_${func}.txt
|
|
161 echo "</table>" >> $output
|
|
162 #echo "<a href='data_${func}.txt'>Download data</a>" >> $output
|
53
|
163 done
|
|
164
|
55
|
165 echo "---------------- download links ----------------"
|
|
166
|
94
|
167
|
53
|
168 echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
|
|
169 echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
|
|
170 echo "<a href='merged.txt'>all data</a><br />" >> $output
|
|
171 echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
|
|
172 echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
|
|
173 echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output
|
77
|
174 echo "<a href='sequence_overview/index.html'>Sequence Overview</a><br />" >> $output
|
81
|
175 echo "<a href='base_overview.html'>Base overview</a><br />" >> $output
|
2
|
176
|
55
|
177 echo "---------------- images ----------------"
|
|
178
|
4
|
179 echo "<img src='all.png'/><br />" >> $output
|
26
|
180 echo "<a href='all.txt'>download data</a><br />" >> $output
|
4
|
181 if [ -a $outdir/ca.png ]
|
|
182 then
|
|
183 echo "<img src='ca.png'/><br />" >> $output
|
26
|
184 echo "<a href='ca.txt'>download data</a><br />" >> $output
|
4
|
185 fi
|
|
186 if [ -a $outdir/cg.png ]
|
|
187 then
|
|
188 echo "<img src='cg.png'/><br />" >> $output
|
26
|
189 echo "<a href='cg.txt'>download data</a><br />" >> $output
|
4
|
190 fi
|
22
|
191 if [ -a $outdir/scatter.png ]
|
|
192 then
|
|
193 echo "<img src='scatter.png'/><br />" >> $output
|
26
|
194 echo "<a href='scatter.txt'>download data</a><br />" >> $output
|
|
195 fi
|
49
|
196 if [ -a $outdir/frequency_ranges.png ]
|
|
197 then
|
|
198 echo "<img src='frequency_ranges.png'/><br />" >> $output
|
|
199 echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
|
|
200 echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
|
|
201 fi
|
26
|
202 if [ -a $outdir/aa_histogram.png ]
|
|
203 then
|
|
204 echo "<img src='aa_histogram.png'/><br />" >> $output
|
|
205 echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
|
22
|
206 fi
|
2
|
207
|
0
|
208 for gene in ${genes[@]}
|
|
209 do
|
|
210 echo "<table border='1'><caption>$gene transition table</caption>" >> $output
|
|
211 while IFS=, read from a c g t
|
|
212 do
|
|
213 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
214 done < $outdir/transitions_${gene}_sum.txt
|
0
|
215 echo "</table>" >> $output
|
|
216 done
|
|
217
|
|
218 echo "<table border='1'><caption>All transition table</caption>" >> $output
|
|
219 while IFS=, read from a c g t
|
|
220 do
|
|
221 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
222 done < $outdir/transitions_all_sum.txt
|
0
|
223 echo "</table>" >> $output
|
|
224
|
|
225 echo "</html>" >> $output
|
2
|
226
|
47
|
227
|
|
228 #optional output for naive
|
|
229
|
82
|
230 echo "---------------- naive_output.r ----------------"
|
55
|
231
|
47
|
232 if [[ "$naive_output" != "None" ]]
|
|
233 then
|
55
|
234 echo "---------------- imgt_loader.r ----------------"
|
50
|
235 #python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
|
80
|
236 Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt $outdir/loader_output.txt 2>&1
|
55
|
237 echo "---------------- naive_output.r ----------------"
|
81
|
238 Rscript $dir/naive_output.r $outdir/loader_output.txt $outdir/merged.txt ${naive_output_ca} ${naive_output_cg} ${naive_output_cm} $outdir/ntoverview.txt $outdir/ntsum.txt 2>&1
|
47
|
239 fi
|
|
240
|
81
|
241 echo "</table>" >> $outdir/base_overview.html
|
|
242
|
|
243 echo "---------------- Done! ----------------"
|
47
|
244
|
|
245
|
|
246
|
|
247
|
|
248
|
|
249
|
|
250
|
2
|
251 #rm $outdir/HS12RSS.txt
|
|
252 #rm $outdir/HS23RSS.txt
|