0
|
1 #!/bin/bash
|
4
|
2 set -e
|
0
|
3 dir="$(cd "$(dirname "$0")" && pwd)"
|
|
4 input=$1
|
19
|
5 method=$2
|
|
6 output=$3
|
|
7 outdir=$4
|
|
8 title=$5
|
22
|
9 include_fr1=$6
|
34
|
10 functionality=$7
|
|
11 unique=$8
|
69
|
12 naive_output_ca=$9
|
|
13 naive_output_cg=${10}
|
|
14 naive_output_cm=${11}
|
|
15 filter_unique=${12}
|
|
16 class_filter=${13}
|
0
|
17 mkdir $outdir
|
|
18
|
55
|
19 echo "---------------- read parameters ----------------"
|
63
|
20 echo "---------------- read parameters ----------------<br />" > $output
|
55
|
21
|
|
22 echo "unpacking IMGT file"
|
|
23
|
35
|
24 type="`file $input`"
|
|
25 if [[ "$type" == *"Zip archive"* ]] ; then
|
|
26 echo "Zip archive"
|
|
27 echo "unzip $input -d $PWD/files/"
|
|
28 unzip $input -d $PWD/files/
|
|
29 elif [[ "$type" == *"XZ compressed data"* ]] ; then
|
|
30 echo "ZX archive"
|
|
31 echo "tar -xJf $input -C $PWD/files/"
|
|
32 mkdir -p $PWD/files/$title
|
|
33 tar -xJf $input -C $PWD/files/$title
|
|
34 fi
|
|
35
|
64
|
36 cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
|
|
37 cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
|
|
38 cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
|
|
39 cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
|
|
40 cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
|
|
41 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
|
|
42 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
|
|
43
|
|
44
|
|
45
|
|
46 #cat $PWD/files/*/1_* > $PWD/summary.txt
|
|
47 #cat $PWD/files/*/3_* > $PWD/sequences.txt
|
|
48 #cat $PWD/files/*/5_* > $PWD/aa.txt
|
|
49 #cat $PWD/files/*/6_* > $PWD/junction.txt
|
|
50 #cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
|
|
51 #cat $PWD/files/*/8_* > $PWD/mutationstats.txt
|
|
52 #cat $PWD/files/*/10_* > $PWD/hotspots.txt
|
3
|
53
|
26
|
54 #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
|
19
|
55
|
|
56 echo "${BLASTN_DIR}"
|
|
57
|
|
58 echo "identification ($method)"
|
63
|
59 echo "identification ($method)<br />" >> $output
|
19
|
60
|
55
|
61 echo "blast or custom"
|
|
62
|
19
|
63 if [[ "${method}" == "custom" ]] ; then
|
55
|
64 echo "custom"
|
19
|
65 python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
|
|
66 else
|
55
|
67 echo "blast"
|
19
|
68 ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
|
|
69 ID_index=$((ID_index+1))
|
|
70 sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
|
|
71 sequence_index=$((sequence_index+1))
|
|
72
|
|
73 echo "$ID_index ${sequence_index}"
|
|
74
|
|
75 cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
|
|
76
|
|
77 echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
|
|
78 ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
|
|
79 fi
|
|
80
|
55
|
81 echo "---------------- merge_and_filter.r ----------------"
|
63
|
82 echo "---------------- merge_and_filter.r ----------------<br />" >> $output
|
19
|
83
|
66
|
84 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter}
|
0
|
85
|
55
|
86 echo "---------------- mutation_analysis.r ----------------"
|
63
|
87 echo "---------------- mutation_analysis.r ----------------<br />" >> $output
|
55
|
88
|
78
|
89 genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm,unmatched"
|
4
|
90 echo "R mutation analysis"
|
22
|
91 Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1
|
53
|
92
|
|
93 #echo "." > $output
|
|
94 #exit 0
|
|
95
|
55
|
96
|
|
97
|
|
98 echo "---------------- mutation_analysis.py ----------------"
|
63
|
99 echo "---------------- mutation_analysis.py ----------------<br />" >> $output
|
55
|
100
|
32
|
101 python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
|
55
|
102
|
|
103 echo "---------------- aa_histogram.r ----------------"
|
63
|
104 echo "---------------- aa_histogram.r ----------------<br />" >> $output
|
55
|
105
|
26
|
106 Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
|
4
|
107
|
0
|
108 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
|
|
109
|
53
|
110 funcs=(sum mean median)
|
0
|
111
|
53
|
112
|
62
|
113 echo "<html><center><h1>$title</h1></center>" > $output
|
|
114
|
|
115 #display the matched/unmatched for clearity
|
|
116
|
|
117 matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
|
|
118 unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
|
|
119 total_count=$((matched_count + unmatched_count))
|
|
120 perc_count=$((unmatched_count / total_count * 100))
|
|
121 perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
|
|
122 perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`
|
|
123
|
|
124 echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
|
|
125 echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
|
|
126 echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output
|
|
127
|
55
|
128 echo "---------------- main tables ----------------"
|
53
|
129 for func in ${funcs[@]}
|
4
|
130 do
|
55
|
131
|
|
132 echo "---------------- $func table ----------------"
|
|
133
|
53
|
134 cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/result.txt
|
|
135
|
|
136 echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output
|
58
|
137 echo "<tr><th>info</th>" >> $output
|
53
|
138 for gene in ${genes[@]}
|
|
139 do
|
|
140 tmp=`cat $outdir/${gene}_${func}_n.txt`
|
|
141 echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
|
|
142 done
|
78
|
143
|
|
144 tmp=`cat $outdir/unmatched_${func}_n.txt`
|
79
|
145 echo "<th><a href='unmatched.txt'>unmatched (N = ${unmatched_count})</a></th>" >> $output
|
53
|
146 tmp=`cat $outdir/all_${func}_n.txt`
|
78
|
147 echo "<th><a href='matched_${func}_all.txt'>all (N = $tmp)</a></th>" >> $output
|
4
|
148
|
78
|
149 while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz
|
53
|
150 do
|
|
151 if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
|
|
152 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
|
|
153 else
|
78
|
154 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${unx}/${uny} (${unz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
|
53
|
155 fi
|
|
156 done < $outdir/result.txt
|
|
157
|
|
158 done
|
|
159
|
55
|
160 echo "---------------- download links ----------------"
|
|
161
|
0
|
162 echo "</table>" >> $output
|
53
|
163 echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
|
|
164 echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
|
|
165 echo "<a href='merged.txt'>all data</a><br />" >> $output
|
|
166 echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
|
|
167 echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
|
|
168 echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output
|
77
|
169 echo "<a href='sequence_overview/index.html'>Sequence Overview</a><br />" >> $output
|
81
|
170 echo "<a href='base_overview.html'>Base overview</a><br />" >> $output
|
2
|
171
|
55
|
172 echo "---------------- images ----------------"
|
|
173
|
4
|
174 echo "<img src='all.png'/><br />" >> $output
|
26
|
175 echo "<a href='all.txt'>download data</a><br />" >> $output
|
4
|
176 if [ -a $outdir/ca.png ]
|
|
177 then
|
|
178 echo "<img src='ca.png'/><br />" >> $output
|
26
|
179 echo "<a href='ca.txt'>download data</a><br />" >> $output
|
4
|
180 fi
|
|
181 if [ -a $outdir/cg.png ]
|
|
182 then
|
|
183 echo "<img src='cg.png'/><br />" >> $output
|
26
|
184 echo "<a href='cg.txt'>download data</a><br />" >> $output
|
4
|
185 fi
|
22
|
186 if [ -a $outdir/scatter.png ]
|
|
187 then
|
|
188 echo "<img src='scatter.png'/><br />" >> $output
|
26
|
189 echo "<a href='scatter.txt'>download data</a><br />" >> $output
|
|
190 fi
|
49
|
191 if [ -a $outdir/frequency_ranges.png ]
|
|
192 then
|
|
193 echo "<img src='frequency_ranges.png'/><br />" >> $output
|
|
194 echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
|
|
195 echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
|
|
196 fi
|
26
|
197 if [ -a $outdir/aa_histogram.png ]
|
|
198 then
|
|
199 echo "<img src='aa_histogram.png'/><br />" >> $output
|
|
200 echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
|
22
|
201 fi
|
2
|
202
|
0
|
203 for gene in ${genes[@]}
|
|
204 do
|
|
205 echo "<table border='1'><caption>$gene transition table</caption>" >> $output
|
|
206 while IFS=, read from a c g t
|
|
207 do
|
|
208 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
209 done < $outdir/transitions_${gene}_sum.txt
|
0
|
210 echo "</table>" >> $output
|
|
211 done
|
|
212
|
|
213 echo "<table border='1'><caption>All transition table</caption>" >> $output
|
|
214 while IFS=, read from a c g t
|
|
215 do
|
|
216 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
217 done < $outdir/transitions_all_sum.txt
|
0
|
218 echo "</table>" >> $output
|
|
219
|
|
220 echo "</html>" >> $output
|
2
|
221
|
47
|
222
|
|
223 #optional output for naive
|
|
224
|
55
|
225 echo "---------------- aa_histogram.r ----------------"
|
|
226
|
47
|
227 if [[ "$naive_output" != "None" ]]
|
|
228 then
|
55
|
229 echo "---------------- imgt_loader.r ----------------"
|
50
|
230 #python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
|
80
|
231 Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt $outdir/loader_output.txt 2>&1
|
55
|
232 echo "---------------- naive_output.r ----------------"
|
81
|
233 Rscript $dir/naive_output.r $outdir/loader_output.txt $outdir/merged.txt ${naive_output_ca} ${naive_output_cg} ${naive_output_cm} $outdir/ntoverview.txt $outdir/ntsum.txt 2>&1
|
47
|
234 fi
|
|
235
|
76
|
236 echo "---------------- sequence_overview.r ----------------"
|
47
|
237
|
76
|
238 mkdir $outdir/sequence_overview
|
|
239
|
|
240 Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/sequence_overview 2>&1
|
47
|
241
|
81
|
242 echo "<table border='1'>" > $outdir/base_overview.html
|
47
|
243
|
81
|
244 while read ID class seq A C G T
|
|
245 do
|
|
246 echo "<tr><td>$ID</td><td>$class</td><td>$A</td><td>$C</td><td>$G</td><td>$T</td></tr>" >> $outdir/base_overview.html
|
|
247 done < $outdir/sequence_overview/ntoverview.txt
|
|
248
|
|
249 echo "</table>" >> $outdir/base_overview.html
|
|
250
|
|
251 echo "---------------- Done! ----------------"
|
47
|
252
|
|
253
|
|
254
|
|
255
|
|
256
|
|
257
|
|
258
|
2
|
259 #rm $outdir/HS12RSS.txt
|
|
260 #rm $outdir/HS23RSS.txt
|