0
|
1 #!/bin/bash
|
4
|
2 set -e
|
0
|
3 dir="$(cd "$(dirname "$0")" && pwd)"
|
|
4 input=$1
|
19
|
5 method=$2
|
|
6 output=$3
|
|
7 outdir=$4
|
|
8 title=$5
|
22
|
9 include_fr1=$6
|
34
|
10 functionality=$7
|
|
11 unique=$8
|
69
|
12 naive_output_ca=$9
|
|
13 naive_output_cg=${10}
|
|
14 naive_output_cm=${11}
|
|
15 filter_unique=${12}
|
|
16 class_filter=${13}
|
0
|
17 mkdir $outdir
|
|
18
|
55
|
19 echo "---------------- read parameters ----------------"
|
63
|
20 echo "---------------- read parameters ----------------<br />" > $output
|
55
|
21
|
|
22 echo "unpacking IMGT file"
|
|
23
|
35
|
24 type="`file $input`"
|
|
25 if [[ "$type" == *"Zip archive"* ]] ; then
|
|
26 echo "Zip archive"
|
|
27 echo "unzip $input -d $PWD/files/"
|
|
28 unzip $input -d $PWD/files/
|
|
29 elif [[ "$type" == *"XZ compressed data"* ]] ; then
|
|
30 echo "ZX archive"
|
|
31 echo "tar -xJf $input -C $PWD/files/"
|
|
32 mkdir -p $PWD/files/$title
|
|
33 tar -xJf $input -C $PWD/files/$title
|
|
34 fi
|
|
35
|
64
|
36 cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
|
|
37 cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
|
|
38 cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
|
|
39 cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
|
|
40 cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
|
|
41 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
|
|
42 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
|
|
43
|
|
44
|
|
45
|
|
46 #cat $PWD/files/*/1_* > $PWD/summary.txt
|
|
47 #cat $PWD/files/*/3_* > $PWD/sequences.txt
|
|
48 #cat $PWD/files/*/5_* > $PWD/aa.txt
|
|
49 #cat $PWD/files/*/6_* > $PWD/junction.txt
|
|
50 #cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
|
|
51 #cat $PWD/files/*/8_* > $PWD/mutationstats.txt
|
|
52 #cat $PWD/files/*/10_* > $PWD/hotspots.txt
|
3
|
53
|
26
|
54 #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
|
19
|
55
|
|
56 echo "${BLASTN_DIR}"
|
|
57
|
|
58 echo "identification ($method)"
|
63
|
59 echo "identification ($method)<br />" >> $output
|
19
|
60
|
55
|
61 echo "blast or custom"
|
|
62
|
19
|
63 if [[ "${method}" == "custom" ]] ; then
|
55
|
64 echo "custom"
|
19
|
65 python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
|
|
66 else
|
55
|
67 echo "blast"
|
19
|
68 ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
|
|
69 ID_index=$((ID_index+1))
|
|
70 sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
|
|
71 sequence_index=$((sequence_index+1))
|
|
72
|
|
73 echo "$ID_index ${sequence_index}"
|
|
74
|
|
75 cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
|
|
76
|
|
77 echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
|
|
78 ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
|
|
79 fi
|
|
80
|
55
|
81 echo "---------------- merge_and_filter.r ----------------"
|
63
|
82 echo "---------------- merge_and_filter.r ----------------<br />" >> $output
|
19
|
83
|
66
|
84 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter}
|
0
|
85
|
55
|
86 echo "---------------- mutation_analysis.r ----------------"
|
63
|
87 echo "---------------- mutation_analysis.r ----------------<br />" >> $output
|
55
|
88
|
78
|
89 genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm,unmatched"
|
4
|
90 echo "R mutation analysis"
|
22
|
91 Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1
|
53
|
92
|
|
93 #echo "." > $output
|
|
94 #exit 0
|
|
95
|
55
|
96
|
|
97
|
|
98 echo "---------------- mutation_analysis.py ----------------"
|
63
|
99 echo "---------------- mutation_analysis.py ----------------<br />" >> $output
|
55
|
100
|
32
|
101 python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
|
55
|
102
|
|
103 echo "---------------- aa_histogram.r ----------------"
|
63
|
104 echo "---------------- aa_histogram.r ----------------<br />" >> $output
|
55
|
105
|
26
|
106 Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
|
4
|
107
|
0
|
108 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
|
|
109
|
53
|
110 funcs=(sum mean median)
|
0
|
111
|
53
|
112
|
62
|
113 echo "<html><center><h1>$title</h1></center>" > $output
|
|
114
|
|
115 #display the matched/unmatched for clearity
|
|
116
|
|
117 matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
|
|
118 unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
|
|
119 total_count=$((matched_count + unmatched_count))
|
|
120 perc_count=$((unmatched_count / total_count * 100))
|
|
121 perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
|
|
122 perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`
|
|
123
|
|
124 echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
|
|
125 echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
|
|
126 echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output
|
|
127
|
55
|
128 echo "---------------- main tables ----------------"
|
53
|
129 for func in ${funcs[@]}
|
4
|
130 do
|
55
|
131
|
|
132 echo "---------------- $func table ----------------"
|
|
133
|
53
|
134 cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/result.txt
|
|
135
|
|
136 echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output
|
58
|
137 echo "<tr><th>info</th>" >> $output
|
53
|
138 for gene in ${genes[@]}
|
|
139 do
|
|
140 tmp=`cat $outdir/${gene}_${func}_n.txt`
|
|
141 echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
|
|
142 done
|
78
|
143
|
|
144 tmp=`cat $outdir/unmatched_${func}_n.txt`
|
79
|
145 echo "<th><a href='unmatched.txt'>unmatched (N = ${unmatched_count})</a></th>" >> $output
|
53
|
146 tmp=`cat $outdir/all_${func}_n.txt`
|
78
|
147 echo "<th><a href='matched_${func}_all.txt'>all (N = $tmp)</a></th>" >> $output
|
4
|
148
|
78
|
149 while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz
|
53
|
150 do
|
|
151 if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
|
|
152 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
|
|
153 else
|
78
|
154 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${unx}/${uny} (${unz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
|
53
|
155 fi
|
|
156 done < $outdir/result.txt
|
|
157
|
|
158 done
|
|
159
|
55
|
160 echo "---------------- download links ----------------"
|
|
161
|
0
|
162 echo "</table>" >> $output
|
53
|
163 echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
|
|
164 echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
|
|
165 echo "<a href='merged.txt'>all data</a><br />" >> $output
|
|
166 echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
|
|
167 echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
|
|
168 echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output
|
77
|
169 echo "<a href='sequence_overview/index.html'>Sequence Overview</a><br />" >> $output
|
2
|
170
|
55
|
171 echo "---------------- images ----------------"
|
|
172
|
4
|
173 echo "<img src='all.png'/><br />" >> $output
|
26
|
174 echo "<a href='all.txt'>download data</a><br />" >> $output
|
4
|
175 if [ -a $outdir/ca.png ]
|
|
176 then
|
|
177 echo "<img src='ca.png'/><br />" >> $output
|
26
|
178 echo "<a href='ca.txt'>download data</a><br />" >> $output
|
4
|
179 fi
|
|
180 if [ -a $outdir/cg.png ]
|
|
181 then
|
|
182 echo "<img src='cg.png'/><br />" >> $output
|
26
|
183 echo "<a href='cg.txt'>download data</a><br />" >> $output
|
4
|
184 fi
|
22
|
185 if [ -a $outdir/scatter.png ]
|
|
186 then
|
|
187 echo "<img src='scatter.png'/><br />" >> $output
|
26
|
188 echo "<a href='scatter.txt'>download data</a><br />" >> $output
|
|
189 fi
|
49
|
190 if [ -a $outdir/frequency_ranges.png ]
|
|
191 then
|
|
192 echo "<img src='frequency_ranges.png'/><br />" >> $output
|
|
193 echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
|
|
194 echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
|
|
195 fi
|
26
|
196 if [ -a $outdir/aa_histogram.png ]
|
|
197 then
|
|
198 echo "<img src='aa_histogram.png'/><br />" >> $output
|
|
199 echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
|
22
|
200 fi
|
2
|
201
|
0
|
202 for gene in ${genes[@]}
|
|
203 do
|
|
204 echo "<table border='1'><caption>$gene transition table</caption>" >> $output
|
|
205 while IFS=, read from a c g t
|
|
206 do
|
|
207 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
208 done < $outdir/transitions_${gene}_sum.txt
|
0
|
209 echo "</table>" >> $output
|
|
210 done
|
|
211
|
|
212 echo "<table border='1'><caption>All transition table</caption>" >> $output
|
|
213 while IFS=, read from a c g t
|
|
214 do
|
|
215 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
216 done < $outdir/transitions_all_sum.txt
|
0
|
217 echo "</table>" >> $output
|
|
218
|
|
219 echo "</html>" >> $output
|
2
|
220
|
47
|
221
|
|
222 #optional output for naive
|
|
223
|
55
|
224 echo "---------------- aa_histogram.r ----------------"
|
|
225
|
47
|
226 if [[ "$naive_output" != "None" ]]
|
|
227 then
|
55
|
228 echo "---------------- imgt_loader.r ----------------"
|
50
|
229 #python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
|
80
|
230 Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt $outdir/loader_output.txt 2>&1
|
55
|
231 echo "---------------- naive_output.r ----------------"
|
80
|
232 Rscript $dir/naive_output.r $outdir/loader_output.txt $outdir/merged.txt ${naive_output_ca} ${naive_output_cg} ${naive_output_cm} 2>&1
|
47
|
233 fi
|
|
234
|
76
|
235 echo "---------------- sequence_overview.r ----------------"
|
47
|
236
|
76
|
237 mkdir $outdir/sequence_overview
|
|
238
|
|
239 Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/sequence_overview 2>&1
|
47
|
240
|
|
241
|
|
242
|
|
243
|
|
244
|
|
245
|
|
246
|
|
247
|
|
248
|
2
|
249 #rm $outdir/HS12RSS.txt
|
|
250 #rm $outdir/HS23RSS.txt
|