0
|
1 #!/bin/bash
|
4
|
2 set -e
|
0
|
3 dir="$(cd "$(dirname "$0")" && pwd)"
|
|
4 input=$1
|
19
|
5 method=$2
|
|
6 output=$3
|
|
7 outdir=$4
|
|
8 title=$5
|
22
|
9 include_fr1=$6
|
34
|
10 functionality=$7
|
|
11 unique=$8
|
69
|
12 naive_output_ca=$9
|
|
13 naive_output_cg=${10}
|
|
14 naive_output_cm=${11}
|
|
15 filter_unique=${12}
|
|
16 class_filter=${13}
|
0
|
17 mkdir $outdir
|
|
18
|
55
|
19 echo "---------------- read parameters ----------------"
|
63
|
20 echo "---------------- read parameters ----------------<br />" > $output
|
55
|
21
|
|
22 echo "unpacking IMGT file"
|
|
23
|
35
|
24 type="`file $input`"
|
|
25 if [[ "$type" == *"Zip archive"* ]] ; then
|
|
26 echo "Zip archive"
|
|
27 echo "unzip $input -d $PWD/files/"
|
|
28 unzip $input -d $PWD/files/
|
|
29 elif [[ "$type" == *"XZ compressed data"* ]] ; then
|
|
30 echo "ZX archive"
|
|
31 echo "tar -xJf $input -C $PWD/files/"
|
|
32 mkdir -p $PWD/files/$title
|
|
33 tar -xJf $input -C $PWD/files/$title
|
|
34 fi
|
|
35
|
64
|
36 cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
|
|
37 cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
|
|
38 cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
|
|
39 cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
|
|
40 cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
|
|
41 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
|
|
42 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
|
|
43
|
|
44
|
|
45
|
|
46 #cat $PWD/files/*/1_* > $PWD/summary.txt
|
|
47 #cat $PWD/files/*/3_* > $PWD/sequences.txt
|
|
48 #cat $PWD/files/*/5_* > $PWD/aa.txt
|
|
49 #cat $PWD/files/*/6_* > $PWD/junction.txt
|
|
50 #cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
|
|
51 #cat $PWD/files/*/8_* > $PWD/mutationstats.txt
|
|
52 #cat $PWD/files/*/10_* > $PWD/hotspots.txt
|
3
|
53
|
26
|
54 #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
|
19
|
55
|
|
56 echo "${BLASTN_DIR}"
|
|
57
|
|
58 echo "identification ($method)"
|
63
|
59 echo "identification ($method)<br />" >> $output
|
19
|
60
|
55
|
61 echo "blast or custom"
|
|
62
|
19
|
63 if [[ "${method}" == "custom" ]] ; then
|
55
|
64 echo "custom"
|
19
|
65 python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
|
|
66 else
|
55
|
67 echo "blast"
|
19
|
68 ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
|
|
69 ID_index=$((ID_index+1))
|
|
70 sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
|
|
71 sequence_index=$((sequence_index+1))
|
|
72
|
|
73 echo "$ID_index ${sequence_index}"
|
|
74
|
|
75 cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta
|
|
76
|
|
77 echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
|
|
78 ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
|
|
79 fi
|
|
80
|
55
|
81 echo "---------------- merge_and_filter.r ----------------"
|
63
|
82 echo "---------------- merge_and_filter.r ----------------<br />" >> $output
|
19
|
83
|
66
|
84 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter}
|
0
|
85
|
55
|
86 echo "---------------- mutation_analysis.r ----------------"
|
63
|
87 echo "---------------- mutation_analysis.r ----------------<br />" >> $output
|
55
|
88
|
4
|
89 genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm"
|
|
90 echo "R mutation analysis"
|
22
|
91 Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1
|
53
|
92
|
|
93 #echo "." > $output
|
|
94 #exit 0
|
|
95
|
55
|
96
|
|
97
|
|
98 echo "---------------- mutation_analysis.py ----------------"
|
63
|
99 echo "---------------- mutation_analysis.py ----------------<br />" >> $output
|
55
|
100
|
32
|
101 python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
|
26
|
102 echo "R AA histogram"
|
55
|
103
|
|
104 echo "---------------- aa_histogram.r ----------------"
|
63
|
105 echo "---------------- aa_histogram.r ----------------<br />" >> $output
|
55
|
106
|
26
|
107 Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1
|
4
|
108
|
0
|
109 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
|
|
110
|
53
|
111 funcs=(sum mean median)
|
0
|
112
|
53
|
113
|
62
|
114 echo "<html><center><h1>$title</h1></center>" > $output
|
|
115
|
|
116 #display the matched/unmatched for clearity
|
|
117
|
|
118 matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`"
|
|
119 unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`"
|
|
120 total_count=$((matched_count + unmatched_count))
|
|
121 perc_count=$((unmatched_count / total_count * 100))
|
|
122 perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"`
|
|
123 perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"`
|
|
124
|
|
125 echo "<center><h2>Total: ${total_count}</h2></center>" >> $output
|
|
126 echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output
|
|
127 echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output
|
|
128
|
55
|
129 echo "---------------- main tables ----------------"
|
53
|
130 for func in ${funcs[@]}
|
4
|
131 do
|
55
|
132
|
|
133 echo "---------------- $func table ----------------"
|
|
134
|
53
|
135 cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/result.txt
|
|
136
|
|
137 echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output
|
58
|
138 echo "<tr><th>info</th>" >> $output
|
53
|
139 for gene in ${genes[@]}
|
|
140 do
|
|
141 tmp=`cat $outdir/${gene}_${func}_n.txt`
|
|
142 echo "<th><a href='matched_${gene}_${func}.txt'>${gene} (N = $tmp)</a></th>" >> $output
|
|
143 done
|
|
144 tmp=`cat $outdir/all_${func}_n.txt`
|
|
145 echo "<th><a href='matched_all.txt'>all (N = $tmp)</a></th>" >> $output
|
4
|
146
|
53
|
147 while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz
|
|
148 do
|
|
149 if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh
|
|
150 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${allx}/${ally} (${allz})</td></tr>" >> $output
|
|
151 else
|
|
152 echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
|
|
153 fi
|
|
154 done < $outdir/result.txt
|
|
155
|
|
156 done
|
|
157
|
55
|
158 echo "---------------- download links ----------------"
|
|
159
|
0
|
160 echo "</table>" >> $output
|
53
|
161 echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
|
|
162 echo "<a href='motif_per_seq.txt'>motif per sequence</a><br />" >> $output
|
|
163 echo "<a href='merged.txt'>all data</a><br />" >> $output
|
|
164 echo "<a href='mutation_by_id.txt'>mutations by id</a><br />" >> $output
|
|
165 echo "<a href='aa_id_mutations.txt'>AA mutations location by id</a><br />" >> $output
|
|
166 echo "<a href='absent_aa_id.txt'>Absant AA locations by id</a><br />" >> $output
|
2
|
167
|
55
|
168 echo "---------------- images ----------------"
|
|
169
|
4
|
170 echo "<img src='all.png'/><br />" >> $output
|
26
|
171 echo "<a href='all.txt'>download data</a><br />" >> $output
|
4
|
172 if [ -a $outdir/ca.png ]
|
|
173 then
|
|
174 echo "<img src='ca.png'/><br />" >> $output
|
26
|
175 echo "<a href='ca.txt'>download data</a><br />" >> $output
|
4
|
176 fi
|
|
177 if [ -a $outdir/cg.png ]
|
|
178 then
|
|
179 echo "<img src='cg.png'/><br />" >> $output
|
26
|
180 echo "<a href='cg.txt'>download data</a><br />" >> $output
|
4
|
181 fi
|
22
|
182 if [ -a $outdir/scatter.png ]
|
|
183 then
|
|
184 echo "<img src='scatter.png'/><br />" >> $output
|
26
|
185 echo "<a href='scatter.txt'>download data</a><br />" >> $output
|
|
186 fi
|
49
|
187 if [ -a $outdir/frequency_ranges.png ]
|
|
188 then
|
|
189 echo "<img src='frequency_ranges.png'/><br />" >> $output
|
|
190 echo "<a href='frequency_ranges_classes.txt'>download class data</a><br />" >> $output
|
|
191 echo "<a href='frequency_ranges_subclasses.txt'>download subclass data</a><br />" >> $output
|
|
192 fi
|
26
|
193 if [ -a $outdir/aa_histogram.png ]
|
|
194 then
|
|
195 echo "<img src='aa_histogram.png'/><br />" >> $output
|
|
196 echo "<a href='aa_histogram.txt'>download data</a><br />" >> $output
|
22
|
197 fi
|
2
|
198
|
0
|
199 for gene in ${genes[@]}
|
|
200 do
|
|
201 echo "<table border='1'><caption>$gene transition table</caption>" >> $output
|
|
202 while IFS=, read from a c g t
|
|
203 do
|
|
204 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
205 done < $outdir/transitions_${gene}_sum.txt
|
0
|
206 echo "</table>" >> $output
|
|
207 done
|
|
208
|
|
209 echo "<table border='1'><caption>All transition table</caption>" >> $output
|
|
210 while IFS=, read from a c g t
|
|
211 do
|
|
212 echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
|
53
|
213 done < $outdir/transitions_all_sum.txt
|
0
|
214 echo "</table>" >> $output
|
|
215
|
|
216 echo "</html>" >> $output
|
2
|
217
|
47
|
218
|
|
219 #optional output for naive
|
|
220
|
55
|
221 echo "---------------- aa_histogram.r ----------------"
|
|
222
|
47
|
223 if [[ "$naive_output" != "None" ]]
|
|
224 then
|
55
|
225 echo "---------------- imgt_loader.r ----------------"
|
50
|
226 #python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
|
69
|
227 Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt ${naive_output_ca} 2>&1
|
55
|
228 echo "---------------- naive_output.r ----------------"
|
69
|
229 Rscript $dir/naive_output.r $naive_output_ca $outdir/merged.txt $naive_output_ca $naive_output_cg $naive_output_cm 2>&1
|
47
|
230 fi
|
|
231
|
76
|
232 echo "---------------- sequence_overview.r ----------------"
|
47
|
233
|
76
|
234 mkdir $outdir/sequence_overview
|
|
235
|
|
236 Rscript $dir/sequence_overview.r $outdir/identified_genes.txt $PWD/sequences.txt $outdir/sequence_overview 2>&1
|
47
|
237
|
|
238
|
|
239
|
|
240
|
|
241
|
|
242
|
|
243
|
|
244
|
|
245
|
2
|
246 #rm $outdir/HS12RSS.txt
|
|
247 #rm $outdir/HS23RSS.txt
|