0
|
1 #!/bin/bash
|
4
|
2 set -e
|
0
|
3 dir="$(cd "$(dirname "$0")" && pwd)"
|
|
4 input=$1
|
19
|
5 method=$2
|
|
6 output=$3
|
|
7 outdir=$4
|
|
8 title=$5
|
22
|
9 include_fr1=$6
|
34
|
10 functionality=$7
|
|
11 unique=$8
|
69
|
12 naive_output_ca=$9
|
|
13 naive_output_cg=${10}
|
|
14 naive_output_cm=${11}
|
|
15 filter_unique=${12}
|
|
16 class_filter=${13}
|
0
|
17 mkdir $outdir
|
|
18
|
55
|
19 echo "---------------- read parameters ----------------"
|
63
|
20 echo "---------------- read parameters ----------------<br />" > $output
|
55
|
21
|
|
22 echo "unpacking IMGT file"
|
|
23
|
35
|
24 type="`file $input`"
|
|
25 if [[ "$type" == *"Zip archive"* ]] ; then
|
|
26 echo "Zip archive"
|
|
27 echo "unzip $input -d $PWD/files/"
|
|
28 unzip $input -d $PWD/files/
|
|
29 elif [[ "$type" == *"XZ compressed data"* ]] ; then
|
|
30 echo "ZX archive"
|
|
31 echo "tar -xJf $input -C $PWD/files/"
|
|
32 mkdir -p $PWD/files/$title
|
|
33 tar -xJf $input -C $PWD/files/$title
|
|
34 fi
|
|
35
|
64
|
36 cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
|
|
37 cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
|
|
38 cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
|
|
39 cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
|
|
40 cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
|
|
41 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
|
|
42 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
|
|
43
|
|
44
|
|
45
|
|
46 #cat $PWD/files/*/1_* > $PWD/summary.txt
|
|
47 #cat $PWD/files/*/3_* > $PWD/sequences.txt
|
|
48 #cat $PWD/files/*/5_* > $PWD/aa.txt
|
|
49 #cat $PWD/files/*/6_* > $PWD/junction.txt
|
|
50 #cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
|
|
51 #cat $PWD/files/*/8_* > $PWD/mutationstats.txt
|
|
52 #cat $PWD/files/*/10_* > $PWD/hotspots.txt
|
3
|
53
|
26
|
54 #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
|
19
|
55
|
|
56 echo "${BLASTN_DIR}"
|
|
57
|
|
58 echo "identification ($method)"
|
63
|
59 echo "identification ($method)<br />" >> $output
|
19
|
60
|
55
|
61 echo "blast or custom"
|
|
62
|
19
|
63 if [[ "${method}" == "custom" ]] ; then
|
55
|
64 echo "custom"
|
19
|
65 python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
|
|
66 else
|
55
|
67 echo "blast"
|
19
|
68 ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
|
|
69 ID_index=$((ID_index+1))
|
|
70 sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
|
|
71 sequence_index=$((sequence_index+1))
|
|
72
|
|
73 echo "$ID_index ${sequence_index}"
|
|
74
|
|
75 cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
|
|
76
|
|
77 echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
|
|
78 ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
|
|
79 fi
|
|
80
|
55
|
81 echo "---------------- merge_and_filter.r ----------------"
|
63
|
82 echo "---------------- merge_and_filter.r ----------------<br />" >> $output
|
19
|
83
|
66
|
84 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter}
|
0
|
85
|
55
|
86 echo "---------------- mutation_analysis.r ----------------"
|
63
|
87 echo "---------------- mutation_analysis.r ----------------<br />" >> $output
|
55
|
88
|
4
|
89 genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm"
|
|
90 echo "R mutation analysis"
|
22
|
91 Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1
|
53
|
92
|
|
93 #echo "." > $output
|
|
94 #exit 0
|
|
95
|
55
|
96
|
|
97
|
|
98 echo "---------------- mutation_analysis.py ----------------"
|
63
|
99 echo "---------------- mutation_analysis.py ----------------<br />" >> $output
|
55
|
100
|
32
|
101 python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
|
26
|
102 echo "R AA histogram"
|
55
|
103
|
|
104 echo "---------------- aa_histogram.r ----------------"
|
63
|
105 echo "---------------- aa_histogram.r ----------------<br />" >> $output
|
55
|
106
|
26
|
107 Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
|
4
|
108
|
0
|
109 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
|
|
110
|
53
|
111 funcs=(sum mean median)
|
0
|
112
|
53
|
113
|
62
|
114 echo "<html><center><h1>$title</h1></center>" > $output
|
|
115
|
|
116 #display the matched/unmatched for clearity
|
|
117
|
|
118 matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
|
|
119 unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
|
|
120 total_count=$((matched_count + unmatched_count))
|
|
121 perc_count=$((unmatched_count / total_count * 100))
|
|
122 perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
|
|
123 perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`
|
|
124
|
|
125 echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
|
|
126 echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
|
|
127 echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output
|
|
128
|
55
|
129 echo "---------------- main tables ----------------"
|
53
|
130 for func in ${funcs[@]}
|
4
|
131 do
|
55
|
132
|
|
133 echo "---------------- $func table ----------------"
|
|
134
|
53
|
135 cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/result.txt
|
|
136
|
|
137 echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output
|
58
|
138 echo "<tr><th>info</th>" >> $output
|
53
|
139 for gene in ${genes[@]}
|
|
140 do
|
|
141 tmp=`cat $outdir/${gene}_${func}_n.txt`
|
|
142 echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
|
|
143 done
|
|
144 tmp=`cat $outdir/all_${func}_n.txt`
|
|
145 echo "<th><a href='matched_all.txt'>all (N = $tmp)</a></th>" >> $output
|
4
|
146
|
53
|
147 while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz
|
|
148 do
|
|
149 if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
|
|
150 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
|
|
151 else
|
|
152 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
|
|
153 fi
|
|
154 done < $outdir/result.txt
|
|
155
|
|
156 done
|
|
157
|
55
|
158 echo "---------------- download links ----------------"
|
|
159
|
0
|
160 echo "</table>" >> $output
|
53
|
161 echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
|
|
162 echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
|
|
163 echo "<a href='merged.txt'>all data</a><br />" >> $output
|
|
164 echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
|
|
165 echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
|
|
166 echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output
|
77
|
167 echo "<a href='sequence_overview/index.html'>Sequence Overview</a><br />" >> $output
|
2
|
168
|
55
|
169 echo "---------------- images ----------------"
|
|
170
|
4
|
171 echo "<img src='all.png'/><br />" >> $output
|
26
|
172 echo "<a href='all.txt'>download data</a><br />" >> $output
|
4
|
173 if [ -a $outdir/ca.png ]
|
|
174 then
|
|
175 echo "<img src='ca.png'/><br />" >> $output
|
26
|
176 echo "<a href='ca.txt'>download data</a><br />" >> $output
|
4
|
177 fi
|
|
178 if [ -a $outdir/cg.png ]
|
|
179 then
|
|
180 echo "<img src='cg.png'/><br />" >> $output
|
26
|
181 echo "<a href='cg.txt'>download data</a><br />" >> $output
|
4
|
182 fi
|
22
|
183 if [ -a $outdir/scatter.png ]
|
|
184 then
|
|
185 echo "<img src='scatter.png'/><br />" >> $output
|
26
|
186 echo "<a href='scatter.txt'>download data</a><br />" >> $output
|
|
187 fi
|
49
|
188 if [ -a $outdir/frequency_ranges.png ]
|
|
189 then
|
|
190 echo "<img src='frequency_ranges.png'/><br />" >> $output
|
|
191 echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
|
|
192 echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
|
|
193 fi
|
26
|
194 if [ -a $outdir/aa_histogram.png ]
|
|
195 then
|
|
196 echo "<img src='aa_histogram.png'/><br />" >> $output
|
|
197 echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
|
22
|
198 fi
|
2
|
199
|
0
|
200 for gene in ${genes[@]}
|
|
201 do
|
|
202 echo "<table border='1'><caption>$gene transition table</caption>" >> $output
|
|
203 while IFS=, read from a c g t
|
|
204 do
|
|
205 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
206 done < $outdir/transitions_${gene}_sum.txt
|
0
|
207 echo "</table>" >> $output
|
|
208 done
|
|
209
|
|
210 echo "<table border='1'><caption>All transition table</caption>" >> $output
|
|
211 while IFS=, read from a c g t
|
|
212 do
|
|
213 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
214 done < $outdir/transitions_all_sum.txt
|
0
|
215 echo "</table>" >> $output
|
|
216
|
|
217 echo "</html>" >> $output
|
2
|
218
|
47
|
219
|
|
220 #optional output for naive
|
|
221
|
55
|
222 echo "---------------- aa_histogram.r ----------------"
|
|
223
|
47
|
224 if [[ "$naive_output" != "None" ]]
|
|
225 then
|
55
|
226 echo "---------------- imgt_loader.r ----------------"
|
50
|
227 #python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
|
69
|
228 Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt ${naive_output_ca} 2>&1
|
55
|
229 echo "---------------- naive_output.r ----------------"
|
69
|
230 Rscript $dir/naive_output.r $naive_output_ca $outdir/merged.txt $naive_output_ca $naive_output_cg $naive_output_cm 2>&1
|
47
|
231 fi
|
|
232
|
76
|
233 echo "---------------- sequence_overview.r ----------------"
|
47
|
234
|
76
|
235 mkdir $outdir/sequence_overview
|
|
236
|
|
237 Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/sequence_overview 2>&1
|
47
|
238
|
|
239
|
|
240
|
|
241
|
|
242
|
|
243
|
|
244
|
|
245
|
|
246
|
2
|
247 #rm $outdir/HS12RSS.txt
|
|
248 #rm $outdir/HS23RSS.txt
|