Mercurial > repos > rnateam > rnacode
comparison breakMAF.pl @ 1:0554fa091710 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/rnacode commit b6d3800e260cdfcbe99e5f056344c3df101dad00
author | rnateam |
---|---|
date | Fri, 13 Apr 2018 07:37:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:95340f016498 | 1:0554fa091710 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 use strict; | |
4 use Getopt::Long; | |
5 use POSIX qw(ceil floor); | |
6 | |
7 my $maxLength = 400; | |
8 my $desiredLength = 200; | |
9 my $help = 0; | |
10 | |
11 GetOptions( | |
12 "maxLength:i" => \$maxLength, | |
13 "desiredLength:i" => \$desiredLength, | |
14 "help" => \$help, | |
15 "h" => \$help | |
16 ); | |
17 | |
18 if ($help) { | |
19 print STDERR "\nbreakMAF [options] < input.maf > output.maf\n\n"; | |
20 print STDERR "Options:\n"; | |
21 print STDERR " maxLength: Break all blocks longer than that (default: 400 columns)\n"; | |
22 print STDERR " desiredLength: Try to create blocks of this size (default: 200 columns)\n"; | |
23 exit(1); | |
24 } | |
25 | |
26 $/ = 'a score='; | |
27 | |
28 while ( my $block = <> ) { | |
29 | |
30 $block = 'a score=' . $block; | |
31 | |
32 my $currAln = readMAF($block)->[0]; | |
33 | |
34 next if not defined($currAln); | |
35 | |
36 my $length = length( $currAln->[0]->{seq} ); | |
37 | |
38 if ( $length > $maxLength ) { | |
39 | |
40 my $breakN = int( $length / $desiredLength ); | |
41 my $chunkLength = ceil( $length / $breakN ); | |
42 | |
43 my $from = 0; | |
44 | |
45 while (1) { | |
46 my $to = $from + $chunkLength; | |
47 | |
48 if ( $to > $length ) { | |
49 $to = $length; | |
50 } | |
51 | |
52 my $slice = sliceAlnByColumn( $currAln, $from, $to ); | |
53 | |
54 print formatAln( $slice, 'maf' ), "\n"; | |
55 | |
56 $from = $to; | |
57 | |
58 last if $from == $length; | |
59 } | |
60 | |
61 } else { | |
62 print formatAln( $currAln, 'maf' ), "\n"; | |
63 } | |
64 } | |
65 | |
66 ###################################################################### | |
67 # | |
68 # readMAF($string) | |
69 # | |
70 # Converts the MAF in string to internal alignment format. Returns | |
71 # list of usual array references. | |
72 # | |
73 ###################################################################### | |
74 | |
75 sub readMAF { | |
76 | |
77 my $string = shift; | |
78 | |
79 return [] if $string eq ''; | |
80 | |
81 my @input = split( "\n", $string ); | |
82 | |
83 #open(FILE,"<$file") || die("Could not read $file ($!)"); | |
84 | |
85 my @outAlns = (); | |
86 my @aln = (); | |
87 | |
88 foreach my $i ( 0 .. $#input ) { | |
89 | |
90 $_ = $input[$i]; | |
91 | |
92 next if (/^\s?\#/); | |
93 next if (/^\s?a/); | |
94 | |
95 if (/^\s?s/) { | |
96 ( my $dummy, my $name, my $start, my $length, my $strand, my $fullLength, my $seq ) = split; | |
97 | |
98 my $end = $start + $length; | |
99 | |
100 ( my $org, my $chrom ) = split( /\./, $name ); | |
101 | |
102 push @aln, { | |
103 name => $name, | |
104 org => $org, | |
105 chrom => $chrom, | |
106 start => $start, | |
107 end => $end, | |
108 seq => $seq, | |
109 strand => $strand | |
110 }; | |
111 } | |
112 | |
113 if ( /^\s?$/ and @aln ) { | |
114 push @outAlns, [@aln]; | |
115 @aln = (); | |
116 } | |
117 if ( ( not defined $input[ $i + 1 ] ) and (@aln) ) { | |
118 push @outAlns, [@aln]; | |
119 @aln = (); | |
120 } | |
121 } | |
122 return \@outAlns; | |
123 } | |
124 | |
125 sub formatAln { | |
126 | |
127 my @aln = @{ $_[0] }; | |
128 | |
129 my $format = $_[1]; | |
130 | |
131 $format = lc($format); | |
132 | |
133 my @alnSeqs = (); | |
134 my @alnNames = (); | |
135 | |
136 my $counter = 1; | |
137 | |
138 foreach my $row (@aln) { | |
139 | |
140 my $name = "seq$counter"; | |
141 $counter++; | |
142 if ( $row->{name} ) { | |
143 $name = ( $row->{name} ); | |
144 } elsif ( $row->{org} and $row->{chrom} ) { | |
145 $name = "$row->{org}.$row->{chrom}"; | |
146 } | |
147 | |
148 my $start = $row->{start}; | |
149 my $end = $row->{end}; | |
150 my $strand = $row->{strand}; | |
151 | |
152 my $pos = ''; | |
153 | |
154 if ( defined $start | |
155 and defined $end | |
156 and defined $strand | |
157 and ( $format ne 'phylip' ) ) { | |
158 ( $start, $end ) = ( $end, $start ) if ( $strand eq '-' ); | |
159 $pos = "/$start-$end"; | |
160 } | |
161 | |
162 push @alnNames, "$name$pos"; | |
163 push @alnSeqs, $row->{seq}; | |
164 | |
165 } | |
166 | |
167 my $output = ''; | |
168 | |
169 if ( $format eq 'clustal' ) { | |
170 | |
171 $output = "CLUSTAL W(1.81) multiple sequence alignment\n\n\n"; | |
172 my $maxName = 0; | |
173 | |
174 foreach my $name (@alnNames) { | |
175 $maxName = ( $maxName < length($name) ) ? length($name) : $maxName; | |
176 } | |
177 | |
178 for my $i ( 0 .. $#alnNames ) { | |
179 my $buffer = " " x ( ( $maxName + 6 ) - length( $alnNames[$i] ) ); | |
180 $alnNames[$i] .= $buffer; | |
181 } | |
182 my $columnWidth = 60; | |
183 my $currPos = 0; | |
184 my $length = length( $alnSeqs[0] ); | |
185 | |
186 while ( $currPos < $length ) { | |
187 for my $i ( 0 .. $#alnNames ) { | |
188 $output .= $alnNames[$i]; | |
189 $output .= substr( $alnSeqs[$i], $currPos, $columnWidth ); | |
190 $output .= "\n"; | |
191 } | |
192 $output .= "\n\n"; | |
193 $currPos += $columnWidth; | |
194 } | |
195 } elsif ( $format eq 'fasta' ) { | |
196 foreach my $i ( 0 .. $#alnNames ) { | |
197 my $name = $alnNames[$i]; | |
198 my $seq = $alnSeqs[$i]; | |
199 $seq =~ s/(.{60})/$1\n/g; | |
200 $output .= ">$name\n$seq\n"; | |
201 } | |
202 } elsif ( $format eq 'phylip' ) { | |
203 my $length = length( $alnSeqs[0] ); | |
204 my $N = @alnSeqs; | |
205 $output .= " $N $length\n"; | |
206 foreach my $i ( 0 .. $#alnNames ) { | |
207 my $name = $alnNames[$i]; | |
208 my $seq = $alnSeqs[$i]; | |
209 $seq =~ s/(.{60})/$1\n/g; | |
210 $output .= "$name\n$seq\n"; | |
211 } | |
212 } elsif ( $format eq 'maf' ) { | |
213 $output .= "a score=0\n"; | |
214 foreach my $row (@aln) { | |
215 my $length = $row->{end} - $row->{start}; | |
216 $output .= "s $row->{org}.$row->{chrom} $row->{start} $length $row->{strand} 0 $row->{seq}\n"; | |
217 } | |
218 } | |
219 return $output; | |
220 | |
221 } | |
222 | |
223 ###################################################################### | |
224 # | |
225 # sliceAlnByColumn(\@aln ref-to-alignment, $start int, $end int) | |
226 # | |
227 # Returns slice of alignment specified by alignment column. | |
228 # | |
229 # \@aln ... alignment in list of hash format | |
230 # $start, $end ... slice to cut | |
231 # | |
232 # Returns reference to alignment in list of hash format. This is a new | |
233 # alignment, i.e. the input is not sliced in place | |
234 # | |
235 ###################################################################### | |
236 | |
237 sub sliceAlnByColumn { | |
238 | |
239 my @aln = @{ $_[0] }; | |
240 shift; | |
241 ( my $start, my $end ) = @_; | |
242 | |
243 # correct ends without warning if outside of valid range | |
244 $start = 0 if ( $start < 0 ); | |
245 $end = length( $aln[0]->{seq} ) if ( $end > length( $aln[0]->{seq} ) ); | |
246 | |
247 #my @newAln=@aln; | |
248 | |
249 # make deep copy of list of hash | |
250 my @newAln = (); | |
251 foreach (@aln) { | |
252 push @newAln, { %{$_} }; | |
253 } | |
254 | |
255 foreach my $i ( 0 .. $#newAln ) { | |
256 | |
257 my $oldStart = $newAln[$i]->{start}; | |
258 my $oldEnd = $newAln[$i]->{end}; | |
259 | |
260 $newAln[$i]->{start} = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $start ); | |
261 $newAln[$i]->{end} = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $end - 1 ) + 1; | |
262 $newAln[$i]->{seq} = substr( $newAln[$i]->{seq}, $start, $end - $start ); | |
263 | |
264 } | |
265 | |
266 return ( [@newAln] ); | |
267 | |
268 } | |
269 | |
270 ###################################################################### | |
271 # | |
272 # alnCol2genomePos($seq string, $start int, $col int) | |
273 # | |
274 # Calculates the genomic position corresponding to a column in an | |
275 # alignment. | |
276 # | |
277 # $seq ... sequence from alignment (i.e. letters with gaps) | |
278 # $start ... Genomic position of first letter in $seq | |
279 # $col ... column in the alignment that is to be mapped | |
280 # | |
281 # Returns genomic position. No error handling, so $col must be a valid | |
282 # column of the string $seq. | |
283 # | |
284 ####################################################################### | |
285 | |
286 sub alnCol2genomePos { | |
287 | |
288 ( my $seq, my $start, my $col ) = @_; | |
289 | |
290 $seq =~ s/\./-/g; #Convert all gaps to "-" | |
291 | |
292 my $newPos = $start; | |
293 | |
294 # if gap only... | |
295 return $start if ( $seq =~ /^-+$/ ); | |
296 | |
297 ( my $tmp ) = $seq =~ /(-*)[^-]/; | |
298 | |
299 my $leadingGaps = length($tmp); | |
300 | |
301 # if column is in gap before first letter, | |
302 # return position of the first letter | |
303 return $start if ( $col < $leadingGaps ); | |
304 | |
305 $newPos = $start - 1; | |
306 | |
307 for my $i ( $leadingGaps .. $col ) { | |
308 $newPos++ if ( ( my $t = substr( $seq, $i, 1 ) ) ne '-' ); | |
309 } | |
310 return $newPos; | |
311 } | |
312 |