0
|
1 #!/bin/bash
|
4
|
2 set -e
|
0
|
3 dir="$(cd "$(dirname "$0")" && pwd)"
|
|
4 input=$1
|
19
|
5 method=$2
|
|
6 output=$3
|
|
7 outdir=$4
|
|
8 title=$5
|
22
|
9 include_fr1=$6
|
34
|
10 functionality=$7
|
|
11 unique=$8
|
69
|
12 naive_output_ca=$9
|
|
13 naive_output_cg=${10}
|
|
14 naive_output_cm=${11}
|
|
15 filter_unique=${12}
|
|
16 class_filter=${13}
|
0
|
17 mkdir $outdir
|
|
18
|
55
|
19 echo "---------------- read parameters ----------------"
|
63
|
20 echo "---------------- read parameters ----------------<br />" > $output
|
55
|
21
|
|
22 echo "unpacking IMGT file"
|
|
23
|
35
|
24 type="`file $input`"
|
|
25 if [[ "$type" == *"Zip archive"* ]] ; then
|
|
26 echo "Zip archive"
|
|
27 echo "unzip $input -d $PWD/files/"
|
|
28 unzip $input -d $PWD/files/
|
|
29 elif [[ "$type" == *"XZ compressed data"* ]] ; then
|
|
30 echo "ZX archive"
|
|
31 echo "tar -xJf $input -C $PWD/files/"
|
|
32 mkdir -p $PWD/files/$title
|
|
33 tar -xJf $input -C $PWD/files/$title
|
|
34 fi
|
|
35
|
64
|
36 cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
|
|
37 cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
|
|
38 cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
|
|
39 cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
|
|
40 cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
|
|
41 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
|
|
42 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
|
|
43
|
|
44
|
|
45
|
|
46 #cat $PWD/files/*/1_* > $PWD/summary.txt
|
|
47 #cat $PWD/files/*/3_* > $PWD/sequences.txt
|
|
48 #cat $PWD/files/*/5_* > $PWD/aa.txt
|
|
49 #cat $PWD/files/*/6_* > $PWD/junction.txt
|
|
50 #cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
|
|
51 #cat $PWD/files/*/8_* > $PWD/mutationstats.txt
|
|
52 #cat $PWD/files/*/10_* > $PWD/hotspots.txt
|
3
|
53
|
26
|
54 #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
|
19
|
55
|
|
56 echo "${BLASTN_DIR}"
|
|
57
|
|
58 echo "identification ($method)"
|
63
|
59 echo "identification ($method)<br />" >> $output
|
19
|
60
|
55
|
61 echo "blast or custom"
|
|
62
|
19
|
63 if [[ "${method}" == "custom" ]] ; then
|
55
|
64 echo "custom"
|
19
|
65 python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
|
|
66 else
|
55
|
67 echo "blast"
|
19
|
68 ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
|
|
69 ID_index=$((ID_index+1))
|
|
70 sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
|
|
71 sequence_index=$((sequence_index+1))
|
|
72
|
|
73 echo "$ID_index ${sequence_index}"
|
|
74
|
|
75 cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
|
|
76
|
|
77 echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
|
|
78 ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
|
|
79 fi
|
|
80
|
55
|
81 echo "---------------- merge_and_filter.r ----------------"
|
63
|
82 echo "---------------- merge_and_filter.r ----------------<br />" >> $output
|
19
|
83
|
84
|
84 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} 2>&1
|
0
|
85
|
55
|
86 echo "---------------- mutation_analysis.r ----------------"
|
63
|
87 echo "---------------- mutation_analysis.r ----------------<br />" >> $output
|
55
|
88
|
82
|
89 classes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm,unmatched"
|
4
|
90 echo "R mutation analysis"
|
82
|
91 Rscript $dir/mutation_analysis.r $outdir/merged.txt $classes $outdir ${include_fr1} 2>&1
|
53
|
92
|
55
|
93
|
|
94 echo "---------------- mutation_analysis.py ----------------"
|
63
|
95 echo "---------------- mutation_analysis.py ----------------<br />" >> $output
|
55
|
96
|
82
|
97 python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $classes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
|
55
|
98
|
|
99 echo "---------------- aa_histogram.r ----------------"
|
63
|
100 echo "---------------- aa_histogram.r ----------------<br />" >> $output
|
55
|
101
|
26
|
102 Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
|
4
|
103
|
0
|
104 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
|
|
105
|
53
|
106 funcs=(sum mean median)
|
0
|
107
|
82
|
108 echo "---------------- sequence_overview.r ----------------"
|
|
109
|
|
110 mkdir $outdir/sequence_overview
|
|
111
|
|
112 Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
|
|
113
|
|
114 echo "<table border='1'>" > $outdir/base_overview.html
|
|
115
|
|
116 while read ID class seq A C G T
|
|
117 do
|
85
|
118 echo "<tr><td>$ID</td><td>$seq</td><td>$class</td><td>$A</td><td>$C</td><td>$G</td><td>$T</td></tr>" >> $outdir/base_overview.html
|
82
|
119 done < $outdir/sequence_overview/ntoverview.txt
|
|
120
|
53
|
121
|
62
|
122 echo "<html><center><h1>$title</h1></center>" > $output
|
|
123
|
|
124 #display the matched/unmatched for clearity
|
|
125
|
|
126 matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
|
|
127 unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
|
|
128 total_count=$((matched_count + unmatched_count))
|
|
129 perc_count=$((unmatched_count / total_count * 100))
|
|
130 perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
|
|
131 perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`
|
|
132
|
|
133 echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
|
|
134 echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
|
|
135 echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output
|
|
136
|
55
|
137 echo "---------------- main tables ----------------"
|
53
|
138 for func in ${funcs[@]}
|
4
|
139 do
|
55
|
140
|
|
141 echo "---------------- $func table ----------------"
|
|
142
|
53
|
143 cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/result.txt
|
|
144
|
|
145 echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output
|
58
|
146 echo "<tr><th>info</th>" >> $output
|
53
|
147 for gene in ${genes[@]}
|
|
148 do
|
|
149 tmp=`cat $outdir/${gene}_${func}_n.txt`
|
|
150 echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
|
|
151 done
|
78
|
152
|
|
153 tmp=`cat $outdir/unmatched_${func}_n.txt`
|
79
|
154 echo "<th><a href='unmatched.txt'>unmatched (N = ${unmatched_count})</a></th>" >> $output
|
53
|
155 tmp=`cat $outdir/all_${func}_n.txt`
|
78
|
156 echo "<th><a href='matched_${func}_all.txt'>all (N = $tmp)</a></th>" >> $output
|
4
|
157
|
78
|
158 while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz
|
53
|
159 do
|
|
160 if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
|
|
161 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
|
|
162 else
|
78
|
163 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${unx}/${uny} (${unz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
|
53
|
164 fi
|
|
165 done < $outdir/result.txt
|
|
166
|
|
167 done
|
|
168
|
55
|
169 echo "---------------- download links ----------------"
|
|
170
|
0
|
171 echo "</table>" >> $output
|
53
|
172 echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
|
|
173 echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
|
|
174 echo "<a href='merged.txt'>all data</a><br />" >> $output
|
|
175 echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
|
|
176 echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
|
|
177 echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output
|
77
|
178 echo "<a href='sequence_overview/index.html'>Sequence Overview</a><br />" >> $output
|
81
|
179 echo "<a href='base_overview.html'>Base overview</a><br />" >> $output
|
2
|
180
|
55
|
181 echo "---------------- images ----------------"
|
|
182
|
4
|
183 echo "<img src='all.png'/><br />" >> $output
|
26
|
184 echo "<a href='all.txt'>download data</a><br />" >> $output
|
4
|
185 if [ -a $outdir/ca.png ]
|
|
186 then
|
|
187 echo "<img src='ca.png'/><br />" >> $output
|
26
|
188 echo "<a href='ca.txt'>download data</a><br />" >> $output
|
4
|
189 fi
|
|
190 if [ -a $outdir/cg.png ]
|
|
191 then
|
|
192 echo "<img src='cg.png'/><br />" >> $output
|
26
|
193 echo "<a href='cg.txt'>download data</a><br />" >> $output
|
4
|
194 fi
|
22
|
195 if [ -a $outdir/scatter.png ]
|
|
196 then
|
|
197 echo "<img src='scatter.png'/><br />" >> $output
|
26
|
198 echo "<a href='scatter.txt'>download data</a><br />" >> $output
|
|
199 fi
|
49
|
200 if [ -a $outdir/frequency_ranges.png ]
|
|
201 then
|
|
202 echo "<img src='frequency_ranges.png'/><br />" >> $output
|
|
203 echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
|
|
204 echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
|
|
205 fi
|
26
|
206 if [ -a $outdir/aa_histogram.png ]
|
|
207 then
|
|
208 echo "<img src='aa_histogram.png'/><br />" >> $output
|
|
209 echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
|
22
|
210 fi
|
2
|
211
|
0
|
212 for gene in ${genes[@]}
|
|
213 do
|
|
214 echo "<table border='1'><caption>$gene transition table</caption>" >> $output
|
|
215 while IFS=, read from a c g t
|
|
216 do
|
|
217 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
218 done < $outdir/transitions_${gene}_sum.txt
|
0
|
219 echo "</table>" >> $output
|
|
220 done
|
|
221
|
|
222 echo "<table border='1'><caption>All transition table</caption>" >> $output
|
|
223 while IFS=, read from a c g t
|
|
224 do
|
|
225 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
226 done < $outdir/transitions_all_sum.txt
|
0
|
227 echo "</table>" >> $output
|
|
228
|
|
229 echo "</html>" >> $output
|
2
|
230
|
47
|
231
|
|
232 #optional output for naive
|
|
233
|
82
|
234 echo "---------------- naive_output.r ----------------"
|
55
|
235
|
47
|
236 if [[ "$naive_output" != "None" ]]
|
|
237 then
|
55
|
238 echo "---------------- imgt_loader.r ----------------"
|
50
|
239 #python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
|
80
|
240 Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt $outdir/loader_output.txt 2>&1
|
55
|
241 echo "---------------- naive_output.r ----------------"
|
81
|
242 Rscript $dir/naive_output.r $outdir/loader_output.txt $outdir/merged.txt ${naive_output_ca} ${naive_output_cg} ${naive_output_cm} $outdir/ntoverview.txt $outdir/ntsum.txt 2>&1
|
47
|
243 fi
|
|
244
|
81
|
245 echo "</table>" >> $outdir/base_overview.html
|
|
246
|
|
247 echo "---------------- Done! ----------------"
|
47
|
248
|
|
249
|
|
250
|
|
251
|
|
252
|
|
253
|
|
254
|
2
|
255 #rm $outdir/HS12RSS.txt
|
|
256 #rm $outdir/HS23RSS.txt
|