0
|
1 #!/bin/bash
|
4
|
2 set -e
|
0
|
3 dir="$(cd "$(dirname "$0")" && pwd)"
|
|
4 input=$1
|
19
|
5 method=$2
|
|
6 output=$3
|
|
7 outdir=$4
|
|
8 title=$5
|
22
|
9 include_fr1=$6
|
34
|
10 functionality=$7
|
|
11 unique=$8
|
69
|
12 naive_output_ca=$9
|
|
13 naive_output_cg=${10}
|
|
14 naive_output_cm=${11}
|
|
15 filter_unique=${12}
|
|
16 class_filter=${13}
|
0
|
17 mkdir $outdir
|
|
18
|
55
|
19 echo "---------------- read parameters ----------------"
|
63
|
20 echo "---------------- read parameters ----------------<br />" > $output
|
55
|
21
|
|
22 echo "unpacking IMGT file"
|
|
23
|
89
|
24
|
|
25
|
35
|
26 type="`file $input`"
|
|
27 if [[ "$type" == *"Zip archive"* ]] ; then
|
|
28 echo "Zip archive"
|
|
29 echo "unzip $input -d $PWD/files/"
|
|
30 unzip $input -d $PWD/files/
|
|
31 elif [[ "$type" == *"XZ compressed data"* ]] ; then
|
|
32 echo "ZX archive"
|
|
33 echo "tar -xJf $input -C $PWD/files/"
|
|
34 mkdir -p $PWD/files/$title
|
|
35 tar -xJf $input -C $PWD/files/$title
|
|
36 fi
|
|
37
|
64
|
38 cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
|
|
39 cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
|
|
40 cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
|
|
41 cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
|
|
42 cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
|
|
43 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
|
|
44 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
|
|
45
|
|
46 #cat $PWD/files/*/1_* > $PWD/summary.txt
|
|
47 #cat $PWD/files/*/3_* > $PWD/sequences.txt
|
|
48 #cat $PWD/files/*/5_* > $PWD/aa.txt
|
|
49 #cat $PWD/files/*/6_* > $PWD/junction.txt
|
|
50 #cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
|
|
51 #cat $PWD/files/*/8_* > $PWD/mutationstats.txt
|
|
52 #cat $PWD/files/*/10_* > $PWD/hotspots.txt
|
3
|
53
|
26
|
54 #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
|
19
|
55
|
|
56 echo "${BLASTN_DIR}"
|
|
57
|
89
|
58 echo "---------------- identification ($method) ----------------"
|
|
59 echo "---------------- identification ($method) ----------------<br />" >> $output
|
55
|
60
|
19
|
61 if [[ "${method}" == "custom" ]] ; then
|
|
62 python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
|
|
63 else
|
|
64 ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
|
|
65 ID_index=$((ID_index+1))
|
|
66 sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
|
|
67 sequence_index=$((sequence_index+1))
|
|
68
|
|
69 cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
|
|
70
|
|
71 echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
|
|
72 ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
|
|
73 fi
|
|
74
|
55
|
75 echo "---------------- merge_and_filter.r ----------------"
|
63
|
76 echo "---------------- merge_and_filter.r ----------------<br />" >> $output
|
19
|
77
|
90
|
78 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} 2>&1
|
0
|
79
|
98
|
80 echo "---------------- creating new IMGT zip ----------------"
|
95
|
81 echo "---------------- creating new IMGT zip ----------------<br />" >> $output
|
|
82
|
|
83 mkdir $outdir/new_IMGT
|
|
84
|
|
85 cat `find $PWD/files/ -name "1_*"` > "$outdir/new_IMGT/1_Summary.txt"
|
|
86 cat `find $PWD/files/ -name "2_*"` > "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt"
|
|
87 cat `find $PWD/files/ -name "3_*"` > "$outdir/new_IMGT/3_Nt-sequences.txt"
|
|
88 cat `find $PWD/files/ -name "4_*"` > "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt"
|
|
89 cat `find $PWD/files/ -name "5_*"` > "$outdir/new_IMGT/5_AA-sequences.txt"
|
|
90 cat `find $PWD/files/ -name "6_*"` > "$outdir/new_IMGT/6_Junction.txt"
|
|
91 cat `find $PWD/files/ -name "7_*"` > "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt"
|
|
92 cat `find $PWD/files/ -name "8_*"` > "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt"
|
|
93 cat `find $PWD/files/ -name "9_*"` > "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt"
|
|
94 cat `find $PWD/files/ -name "10_*"` > "$outdir/new_IMGT/10_V-REGION-mutation-hotspots.txt"
|
|
95
|
99
|
96 mkdir $outdir/new_IMGT_ca
|
|
97 cp $outdir/new_IMGT/* $outdir/new_IMGT_ca
|
|
98
|
|
99 mkdir $outdir/new_IMGT_cg
|
|
100 cp $outdir/new_IMGT/* $outdir/new_IMGT_cg
|
|
101
|
|
102 mkdir $outdir/new_IMGT_cm
|
|
103 cp $outdir/new_IMGT/* $outdir/new_IMGT_cm
|
|
104
|
|
105 Rscript $dir/tmp/igat.r $outdir/new_IMGT/ $outdir/merged.txt "-" 2>&1
|
|
106 Rscript $dir/tmp/igat.r $outdir/new_IMGT_ca/ $outdir/merged.txt "ca "2>&1
|
|
107 Rscript $dir/tmp/igat.r $outdir/new_IMGT_cg/ $outdir/merged.txt "cg "2>&1
|
|
108 Rscript $dir/tmp/igat.r $outdir/new_IMGT_cm/ $outdir/merged.txt "cm "2>&1
|
95
|
109
|
|
110
|
|
111 tmp="$PWD"
|
|
112 cd $outdir/new_IMGT/ #tar weirdness...
|
|
113 tar -cJf ../new_IMGT.txz *
|
|
114 cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT/IgAT.xlsm
|
|
115 zip -r ../IgAT.zip *
|
|
116
|
99
|
117 cd $outdir/new_IMGT_ca/
|
|
118 tar -cJf ../new_IMGT_ca.txz *
|
|
119 cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_ca/IgAT.xlsm
|
|
120 zip -r ../IgAT_ca.zip *
|
|
121
|
|
122 cd $outdir/new_IMGT_cg/
|
|
123 tar -cJf ../new_IMGT_cg.txz *
|
|
124 cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_cg/IgAT.xlsm
|
|
125 zip -r ../IgAT_cg.zip *
|
|
126
|
|
127 cd $outdir/new_IMGT_cm/
|
|
128 tar -cJf ../new_IMGT_cm.txz *
|
|
129 cp $dir/tmp/IgAT.xlsm $outdir/new_IMGT_cm/IgAT.xlsm
|
|
130 zip -r ../IgAT_cm.zip *
|
|
131
|
95
|
132 cd $tmp
|
|
133
|
55
|
134 echo "---------------- mutation_analysis.r ----------------"
|
63
|
135 echo "---------------- mutation_analysis.r ----------------<br />" >> $output
|
55
|
136
|
82
|
137 classes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm,unmatched"
|
4
|
138 echo "R mutation analysis"
|
82
|
139 Rscript $dir/mutation_analysis.r $outdir/merged.txt $classes $outdir ${include_fr1} 2>&1
|
53
|
140
|
55
|
141
|
|
142 echo "---------------- mutation_analysis.py ----------------"
|
63
|
143 echo "---------------- mutation_analysis.py ----------------<br />" >> $output
|
55
|
144
|
82
|
145 python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $classes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
|
55
|
146
|
|
147 echo "---------------- aa_histogram.r ----------------"
|
63
|
148 echo "---------------- aa_histogram.r ----------------<br />" >> $output
|
55
|
149
|
26
|
150 Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
|
4
|
151
|
0
|
152 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
|
|
153
|
53
|
154 funcs=(sum mean median)
|
0
|
155
|
82
|
156 echo "---------------- sequence_overview.r ----------------"
|
95
|
157 echo "---------------- sequence_overview.r ----------------" >> $output
|
82
|
158
|
|
159 mkdir $outdir/sequence_overview
|
|
160
|
90
|
161 #Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
|
100
|
162 Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
|
82
|
163
|
|
164 echo "<table border='1'>" > $outdir/base_overview.html
|
|
165
|
92
|
166 while IFS=$'\t' read ID class seq A C G T
|
82
|
167 do
|
85
|
168 echo "<tr><td>$ID</td><td>$seq</td><td>$class</td><td>$A</td><td>$C</td><td>$G</td><td>$T</td></tr>" >> $outdir/base_overview.html
|
82
|
169 done < $outdir/sequence_overview/ntoverview.txt
|
|
170
|
53
|
171
|
62
|
172 echo "<html><center><h1>$title</h1></center>" > $output
|
|
173
|
|
174 #display the matched/unmatched for clearity
|
|
175
|
98
|
176 matched_count="`cat $outdir/merged.txt | grep -v 'unmatched' | tail -n +2 | wc -l`"
|
62
|
177 unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
|
|
178 total_count=$((matched_count + unmatched_count))
|
|
179 perc_count=$((unmatched_count / total_count * 100))
|
|
180 perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
|
|
181 perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`
|
|
182
|
|
183 echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
|
|
184 echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
|
|
185 echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output
|
|
186
|
55
|
187 echo "---------------- main tables ----------------"
|
53
|
188 for func in ${funcs[@]}
|
4
|
189 do
|
55
|
190
|
|
191 echo "---------------- $func table ----------------"
|
|
192
|
94
|
193 cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt
|
53
|
194
|
98
|
195 echo "<table border='1' width='100%'><caption><h3><a href='data_${func}.txt'>${func} table</a></h3></caption>" >> $output
|
58
|
196 echo "<tr><th>info</th>" >> $output
|
53
|
197 for gene in ${genes[@]}
|
|
198 do
|
|
199 tmp=`cat $outdir/${gene}_${func}_n.txt`
|
|
200 echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
|
|
201 done
|
78
|
202
|
|
203 tmp=`cat $outdir/unmatched_${func}_n.txt`
|
79
|
204 echo "<th><a href='unmatched.txt'>unmatched (N = ${unmatched_count})</a></th>" >> $output
|
53
|
205 tmp=`cat $outdir/all_${func}_n.txt`
|
89
|
206 echo "<th><a href='matched_all_${func}.txt'>all (N = $tmp)</a></th>" >> $output
|
4
|
207
|
78
|
208 while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz
|
53
|
209 do
|
|
210 if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
|
|
211 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
|
|
212 else
|
78
|
213 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${unx}/${uny} (${unz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
|
53
|
214 fi
|
94
|
215 done < $outdir/data_${func}.txt
|
|
216 echo "</table>" >> $output
|
|
217 #echo "<a href='data_${func}.txt'>Download data</a>" >> $output
|
53
|
218 done
|
|
219
|
55
|
220 echo "---------------- download links ----------------"
|
|
221
|
94
|
222
|
53
|
223 echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
|
|
224 echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
|
|
225 echo "<a href='merged.txt'>all data</a><br />" >> $output
|
|
226 echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
|
|
227 echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
|
|
228 echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output
|
77
|
229 echo "<a href='sequence_overview/index.html'>Sequence Overview</a><br />" >> $output
|
81
|
230 echo "<a href='base_overview.html'>Base overview</a><br />" >> $output
|
95
|
231 echo "<a href='baseline.pdf'>Baseline PDF</a><br />" >> $output
|
|
232 echo "<a href='baseline.txt'>Baseline Table</a><br />" >> $output
|
99
|
233 echo "<a href='baseline_ca.pdf'>Baseline ca PDF</a><br />" >> $output
|
|
234 echo "<a href='baseline_ca.txt'>Baseline ca Table</a><br />" >> $output
|
|
235 echo "<a href='baseline_cg.pdf'>Baseline cg PDF</a><br />" >> $output
|
|
236 echo "<a href='baseline_cg.txt'>Baseline cg Table</a><br />" >> $output
|
|
237 echo "<a href='baseline_cm.pdf'>Baseline cm PDF</a><br />" >> $output
|
|
238 echo "<a href='baseline_cm.txt'>Baseline cm Table</a><br />" >> $output
|
95
|
239 echo "<a href='IgAT.zip'>IgAT zip</a><br />" >> $output
|
99
|
240 echo "<a href='IgAT_ca.zip'>IgAT ca zip</a><br />" >> $output
|
|
241 echo "<a href='IgAT_cg.zip'>IgAT cg zip</a><br />" >> $output
|
|
242 echo "<a href='IgAT_cm.zip'>IgAT cm zip</a><br />" >> $output
|
|
243 echo "<a href='new_IMGT.txz'>Filtered IMGT zip</a><br />" >> $output
|
|
244 echo "<a href='new_IMGT_ca.txz'>Filtered ca IMGT zip</a><br />" >> $output
|
|
245 echo "<a href='new_IMGT_cg.txz'>Filtered cg IMGT zip</a><br />" >> $output
|
|
246 echo "<a href='new_IMGT_cm.txz'>Filtered cm IMGT zip</a><br />" >> $output
|
|
247
|
2
|
248
|
55
|
249 echo "---------------- images ----------------"
|
|
250
|
4
|
251 echo "<img src='all.png'/><br />" >> $output
|
26
|
252 echo "<a href='all.txt'>download data</a><br />" >> $output
|
4
|
253 if [ -a $outdir/ca.png ]
|
|
254 then
|
|
255 echo "<img src='ca.png'/><br />" >> $output
|
26
|
256 echo "<a href='ca.txt'>download data</a><br />" >> $output
|
4
|
257 fi
|
|
258 if [ -a $outdir/cg.png ]
|
|
259 then
|
|
260 echo "<img src='cg.png'/><br />" >> $output
|
26
|
261 echo "<a href='cg.txt'>download data</a><br />" >> $output
|
4
|
262 fi
|
22
|
263 if [ -a $outdir/scatter.png ]
|
|
264 then
|
|
265 echo "<img src='scatter.png'/><br />" >> $output
|
26
|
266 echo "<a href='scatter.txt'>download data</a><br />" >> $output
|
|
267 fi
|
49
|
268 if [ -a $outdir/frequency_ranges.png ]
|
|
269 then
|
|
270 echo "<img src='frequency_ranges.png'/><br />" >> $output
|
|
271 echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
|
|
272 echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
|
|
273 fi
|
26
|
274 if [ -a $outdir/aa_histogram.png ]
|
|
275 then
|
|
276 echo "<img src='aa_histogram.png'/><br />" >> $output
|
|
277 echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
|
22
|
278 fi
|
2
|
279
|
0
|
280 for gene in ${genes[@]}
|
|
281 do
|
|
282 echo "<table border='1'><caption>$gene transition table</caption>" >> $output
|
|
283 while IFS=, read from a c g t
|
|
284 do
|
|
285 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
286 done < $outdir/transitions_${gene}_sum.txt
|
0
|
287 echo "</table>" >> $output
|
|
288 done
|
|
289
|
|
290 echo "<table border='1'><caption>All transition table</caption>" >> $output
|
|
291 while IFS=, read from a c g t
|
|
292 do
|
|
293 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
294 done < $outdir/transitions_all_sum.txt
|
0
|
295 echo "</table>" >> $output
|
|
296
|
|
297 echo "</html>" >> $output
|
2
|
298
|
95
|
299 echo "---------------- baseline ----------------"
|
99
|
300 bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "ca_cg_cm" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt"
|
|
301 bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_ca.txz "ca" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_ca.pdf" "Sequence.ID" "$outdir/baseline_ca.txt"
|
|
302 bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cg.txz "cg" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cg.pdf" "Sequence.ID" "$outdir/baseline_cg.txt"
|
|
303 bash $dir/tmp/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cm.txz "cm" "$dir/tmp/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cm.pdf" "Sequence.ID" "$outdir/baseline_cm.txt"
|
47
|
304
|
|
305 #optional output for naive
|
|
306
|
82
|
307 echo "---------------- naive_output.r ----------------"
|
55
|
308
|
47
|
309 if [[ "$naive_output" != "None" ]]
|
|
310 then
|
55
|
311 echo "---------------- imgt_loader.r ----------------"
|
50
|
312 #python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
|
80
|
313 Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt $outdir/loader_output.txt 2>&1
|
95
|
314
|
55
|
315 echo "---------------- naive_output.r ----------------"
|
81
|
316 Rscript $dir/naive_output.r $outdir/loader_output.txt $outdir/merged.txt ${naive_output_ca} ${naive_output_cg} ${naive_output_cm} $outdir/ntoverview.txt $outdir/ntsum.txt 2>&1
|
47
|
317 fi
|
|
318
|
81
|
319 echo "</table>" >> $outdir/base_overview.html
|
|
320
|
|
321 echo "---------------- Done! ----------------"
|
47
|
322
|