0
|
1 #!/bin/bash
|
4
|
2 set -e
|
0
|
3 dir="$(cd "$(dirname "$0")" && pwd)"
|
|
4 input=$1
|
19
|
5 method=$2
|
|
6 output=$3
|
|
7 outdir=$4
|
|
8 title=$5
|
22
|
9 include_fr1=$6
|
34
|
10 functionality=$7
|
|
11 unique=$8
|
69
|
12 naive_output_ca=$9
|
|
13 naive_output_cg=${10}
|
|
14 naive_output_cm=${11}
|
|
15 filter_unique=${12}
|
|
16 class_filter=${13}
|
0
|
17 mkdir $outdir
|
|
18
|
55
|
19 echo "---------------- read parameters ----------------"
|
63
|
20 echo "---------------- read parameters ----------------<br />" > $output
|
55
|
21
|
|
22 echo "unpacking IMGT file"
|
|
23
|
35
|
24 type="`file $input`"
|
|
25 if [[ "$type" == *"Zip archive"* ]] ; then
|
|
26 echo "Zip archive"
|
|
27 echo "unzip $input -d $PWD/files/"
|
|
28 unzip $input -d $PWD/files/
|
|
29 elif [[ "$type" == *"XZ compressed data"* ]] ; then
|
|
30 echo "ZX archive"
|
|
31 echo "tar -xJf $input -C $PWD/files/"
|
|
32 mkdir -p $PWD/files/$title
|
|
33 tar -xJf $input -C $PWD/files/$title
|
|
34 fi
|
|
35
|
64
|
36 cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
|
|
37 cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
|
|
38 cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
|
|
39 cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
|
|
40 cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
|
|
41 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
|
|
42 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
|
|
43
|
|
44
|
|
45
|
|
46 #cat $PWD/files/*/1_* > $PWD/summary.txt
|
|
47 #cat $PWD/files/*/3_* > $PWD/sequences.txt
|
|
48 #cat $PWD/files/*/5_* > $PWD/aa.txt
|
|
49 #cat $PWD/files/*/6_* > $PWD/junction.txt
|
|
50 #cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
|
|
51 #cat $PWD/files/*/8_* > $PWD/mutationstats.txt
|
|
52 #cat $PWD/files/*/10_* > $PWD/hotspots.txt
|
3
|
53
|
26
|
54 #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
|
19
|
55
|
|
56 echo "${BLASTN_DIR}"
|
|
57
|
|
58 echo "identification ($method)"
|
63
|
59 echo "identification ($method)<br />" >> $output
|
19
|
60
|
55
|
61 echo "blast or custom"
|
|
62
|
19
|
63 if [[ "${method}" == "custom" ]] ; then
|
55
|
64 echo "custom"
|
19
|
65 python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
|
|
66 else
|
55
|
67 echo "blast"
|
19
|
68 ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
|
|
69 ID_index=$((ID_index+1))
|
|
70 sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
|
|
71 sequence_index=$((sequence_index+1))
|
|
72
|
|
73 echo "$ID_index ${sequence_index}"
|
|
74
|
|
75 cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
|
|
76
|
|
77 echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
|
|
78 ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
|
|
79 fi
|
|
80
|
55
|
81 echo "---------------- merge_and_filter.r ----------------"
|
63
|
82 echo "---------------- merge_and_filter.r ----------------<br />" >> $output
|
19
|
83
|
84
|
84 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} 2>&1
|
0
|
85
|
55
|
86 echo "---------------- mutation_analysis.r ----------------"
|
63
|
87 echo "---------------- mutation_analysis.r ----------------<br />" >> $output
|
55
|
88
|
82
|
89 classes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm,unmatched"
|
4
|
90 echo "R mutation analysis"
|
82
|
91 Rscript $dir/mutation_analysis.r $outdir/merged.txt $classes $outdir ${include_fr1} 2>&1
|
53
|
92
|
|
93 #echo "." > $output
|
|
94 #exit 0
|
|
95
|
55
|
96
|
|
97
|
|
98 echo "---------------- mutation_analysis.py ----------------"
|
63
|
99 echo "---------------- mutation_analysis.py ----------------<br />" >> $output
|
55
|
100
|
82
|
101 python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $classes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
|
55
|
102
|
|
103 echo "---------------- aa_histogram.r ----------------"
|
63
|
104 echo "---------------- aa_histogram.r ----------------<br />" >> $output
|
55
|
105
|
26
|
106 Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
|
4
|
107
|
0
|
108 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
|
|
109
|
53
|
110 funcs=(sum mean median)
|
0
|
111
|
82
|
112 echo "---------------- sequence_overview.r ----------------"
|
|
113
|
|
114 mkdir $outdir/sequence_overview
|
|
115
|
|
116 Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1
|
|
117
|
|
118 echo "<table border='1'>" > $outdir/base_overview.html
|
|
119
|
|
120 while read ID class seq A C G T
|
|
121 do
|
|
122 echo "<tr><td>$ID</td><td>$class</td><td>$A</td><td>$C</td><td>$G</td><td>$T</td></tr>" >> $outdir/base_overview.html
|
|
123 done < $outdir/sequence_overview/ntoverview.txt
|
|
124
|
53
|
125
|
62
|
126 echo "<html><center><h1>$title</h1></center>" > $output
|
|
127
|
|
128 #display the matched/unmatched for clearity
|
|
129
|
|
130 matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
|
|
131 unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
|
|
132 total_count=$((matched_count + unmatched_count))
|
|
133 perc_count=$((unmatched_count / total_count * 100))
|
|
134 perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
|
|
135 perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`
|
|
136
|
|
137 echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
|
|
138 echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
|
|
139 echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output
|
|
140
|
55
|
141 echo "---------------- main tables ----------------"
|
53
|
142 for func in ${funcs[@]}
|
4
|
143 do
|
55
|
144
|
|
145 echo "---------------- $func table ----------------"
|
|
146
|
53
|
147 cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/result.txt
|
|
148
|
|
149 echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output
|
58
|
150 echo "<tr><th>info</th>" >> $output
|
53
|
151 for gene in ${genes[@]}
|
|
152 do
|
|
153 tmp=`cat $outdir/${gene}_${func}_n.txt`
|
|
154 echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
|
|
155 done
|
78
|
156
|
|
157 tmp=`cat $outdir/unmatched_${func}_n.txt`
|
79
|
158 echo "<th><a href='unmatched.txt'>unmatched (N = ${unmatched_count})</a></th>" >> $output
|
53
|
159 tmp=`cat $outdir/all_${func}_n.txt`
|
78
|
160 echo "<th><a href='matched_${func}_all.txt'>all (N = $tmp)</a></th>" >> $output
|
4
|
161
|
78
|
162 while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz
|
53
|
163 do
|
|
164 if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
|
|
165 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
|
|
166 else
|
78
|
167 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${unx}/${uny} (${unz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
|
53
|
168 fi
|
|
169 done < $outdir/result.txt
|
|
170
|
|
171 done
|
|
172
|
55
|
173 echo "---------------- download links ----------------"
|
|
174
|
0
|
175 echo "</table>" >> $output
|
53
|
176 echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
|
|
177 echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
|
|
178 echo "<a href='merged.txt'>all data</a><br />" >> $output
|
|
179 echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
|
|
180 echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
|
|
181 echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output
|
77
|
182 echo "<a href='sequence_overview/index.html'>Sequence Overview</a><br />" >> $output
|
81
|
183 echo "<a href='base_overview.html'>Base overview</a><br />" >> $output
|
2
|
184
|
55
|
185 echo "---------------- images ----------------"
|
|
186
|
4
|
187 echo "<img src='all.png'/><br />" >> $output
|
26
|
188 echo "<a href='all.txt'>download data</a><br />" >> $output
|
4
|
189 if [ -a $outdir/ca.png ]
|
|
190 then
|
|
191 echo "<img src='ca.png'/><br />" >> $output
|
26
|
192 echo "<a href='ca.txt'>download data</a><br />" >> $output
|
4
|
193 fi
|
|
194 if [ -a $outdir/cg.png ]
|
|
195 then
|
|
196 echo "<img src='cg.png'/><br />" >> $output
|
26
|
197 echo "<a href='cg.txt'>download data</a><br />" >> $output
|
4
|
198 fi
|
22
|
199 if [ -a $outdir/scatter.png ]
|
|
200 then
|
|
201 echo "<img src='scatter.png'/><br />" >> $output
|
26
|
202 echo "<a href='scatter.txt'>download data</a><br />" >> $output
|
|
203 fi
|
49
|
204 if [ -a $outdir/frequency_ranges.png ]
|
|
205 then
|
|
206 echo "<img src='frequency_ranges.png'/><br />" >> $output
|
|
207 echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
|
|
208 echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
|
|
209 fi
|
26
|
210 if [ -a $outdir/aa_histogram.png ]
|
|
211 then
|
|
212 echo "<img src='aa_histogram.png'/><br />" >> $output
|
|
213 echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
|
22
|
214 fi
|
2
|
215
|
0
|
216 for gene in ${genes[@]}
|
|
217 do
|
|
218 echo "<table border='1'><caption>$gene transition table</caption>" >> $output
|
|
219 while IFS=, read from a c g t
|
|
220 do
|
|
221 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
222 done < $outdir/transitions_${gene}_sum.txt
|
0
|
223 echo "</table>" >> $output
|
|
224 done
|
|
225
|
|
226 echo "<table border='1'><caption>All transition table</caption>" >> $output
|
|
227 while IFS=, read from a c g t
|
|
228 do
|
|
229 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
230 done < $outdir/transitions_all_sum.txt
|
0
|
231 echo "</table>" >> $output
|
|
232
|
|
233 echo "</html>" >> $output
|
2
|
234
|
47
|
235
|
|
236 #optional output for naive
|
|
237
|
82
|
238 echo "---------------- naive_output.r ----------------"
|
55
|
239
|
47
|
240 if [[ "$naive_output" != "None" ]]
|
|
241 then
|
55
|
242 echo "---------------- imgt_loader.r ----------------"
|
50
|
243 #python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
|
80
|
244 Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt $outdir/loader_output.txt 2>&1
|
55
|
245 echo "---------------- naive_output.r ----------------"
|
81
|
246 Rscript $dir/naive_output.r $outdir/loader_output.txt $outdir/merged.txt ${naive_output_ca} ${naive_output_cg} ${naive_output_cm} $outdir/ntoverview.txt $outdir/ntsum.txt 2>&1
|
47
|
247 fi
|
|
248
|
81
|
249 echo "</table>" >> $outdir/base_overview.html
|
|
250
|
|
251 echo "---------------- Done! ----------------"
|
47
|
252
|
|
253
|
|
254
|
|
255
|
|
256
|
|
257
|
|
258
|
2
|
259 #rm $outdir/HS12RSS.txt
|
|
260 #rm $outdir/HS23RSS.txt
|