8
|
1 import copy
|
|
2 import os
|
|
3 import subprocess
|
|
4 import sys
|
|
5 import tempfile
|
|
6
|
|
7 from bx.intervals.io import Comment, Header, GenomicInterval
|
|
8 from bx.intervals.io import GenomicIntervalReader, NiceReaderWrapper, ParseError
|
|
9
|
|
10 # Default chrom, start, end, strand cols for a bed file
|
|
11 BED_DEFAULT_COLS = 0, 1, 2, 5
|
|
12
|
|
13
|
|
14 class GFFInterval(GenomicInterval):
|
|
15 """
|
|
16 A GFF interval, including attributes. If file is strictly a GFF file,
|
|
17 only attribute is 'group.'
|
|
18 """
|
|
19
|
|
20 def __init__(self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
|
|
21 strand_col=6, score_col=5, default_strand='.', fix_strand=False):
|
|
22 # GFF format allows '.' for strand but GenomicInterval does not. To get around this,
|
|
23 # temporarily set strand and then unset after initing GenomicInterval.
|
|
24 unknown_strand = False
|
|
25 if not fix_strand and fields[strand_col] == '.':
|
|
26 unknown_strand = True
|
|
27 fields[strand_col] = '+'
|
|
28 GenomicInterval.__init__(self, reader, fields, chrom_col, start_col, end_col,
|
|
29 strand_col, default_strand, fix_strand=fix_strand)
|
|
30 if unknown_strand:
|
|
31 self.strand = '.'
|
|
32 self.fields[strand_col] = '.'
|
|
33 # Handle feature, score column.
|
|
34 self.feature_col = feature_col
|
|
35 if self.feature_col >= self.nfields:
|
|
36 stop_err("No field for feature_col (%d)" % feature_col)
|
|
37 self.feature = self.fields[self.feature_col]
|
|
38 self.score_col = score_col
|
|
39 if self.score_col >= self.nfields:
|
|
40 stop_err("No field for score_col (%d)" % score_col)
|
|
41 self.score = self.fields[self.score_col]
|
|
42 # GFF attributes.
|
|
43 self.attributes = parse_gff_attributes(fields[8])
|
|
44
|
|
45 def copy(self):
|
|
46 return GFFInterval(self.reader, list(self.fields), self.chrom_col, self.feature_col,
|
|
47 self.start_col, self.end_col, self.strand_col, self.score_col, self.strand)
|
|
48
|
|
49
|
|
50 class GFFFeature(GFFInterval):
|
|
51 """
|
|
52 A GFF feature, which can include multiple intervals.
|
|
53 """
|
|
54
|
|
55 def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6,
|
|
56 score_col=5, default_strand='.', fix_strand=False, intervals=[], raw_size=0):
|
|
57 # Use copy so that first interval and feature do not share fields.
|
|
58 GFFInterval.__init__(self, reader, copy.deepcopy(intervals[0].fields), chrom_col, feature_col,
|
|
59 start_col, end_col, strand_col, score_col, default_strand, fix_strand=fix_strand)
|
|
60 self.intervals = intervals
|
|
61 self.raw_size = raw_size
|
|
62 # Use intervals to set feature attributes.
|
|
63 for interval in self.intervals:
|
|
64 # Error checking. NOTE: intervals need not share the same strand.
|
|
65 if interval.chrom != self.chrom:
|
|
66 stop_err("interval chrom does not match self chrom: %s != %s" % (interval.chrom, self.chrom))
|
|
67 # Set start, end of interval.
|
|
68 if interval.start < self.start:
|
|
69 self.start = interval.start
|
|
70 if interval.end > self.end:
|
|
71 self.end = interval.end
|
|
72
|
|
73 def name(self):
|
|
74 """
|
|
75 Returns feature's name.
|
|
76 """
|
|
77 name = None
|
|
78 # Preference for name:
|
|
79 # GTF: 'gene_id', 'transcript_id'
|
|
80 # GFF3: 'ID', 'id'
|
|
81 # GFF: 'group'
|
|
82 for attr_name in ['gene_id', 'transcript_id', 'ID', 'id', 'group']:
|
|
83 name = self.attributes.get(attr_name, None)
|
|
84 if name is not None:
|
|
85 break
|
|
86 return name
|
|
87
|
|
88 def copy(self):
|
|
89 intervals_copy = []
|
|
90 for interval in self.intervals:
|
|
91 intervals_copy.append(interval.copy())
|
|
92 return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col,
|
|
93 self.strand_col, self.score_col, self.strand, intervals=intervals_copy)
|
|
94
|
|
95 def lines(self):
|
|
96 lines = []
|
|
97 for interval in self.intervals:
|
|
98 lines.append('\t'.join(interval.fields))
|
|
99 return lines
|
|
100
|
|
101
|
|
102 class GFFReaderWrapper(NiceReaderWrapper):
|
|
103 """
|
|
104 Reader wrapper for GFF files which has two major functions:
|
|
105 1. group entries for GFF file (via group column), GFF3 (via id attribute),
|
|
106 or GTF (via gene_id/transcript id);
|
|
107 2. convert coordinates from GFF format--starting and ending coordinates
|
|
108 are 1-based, closed--to the 'traditional'/BED interval format--0 based,
|
|
109 half-open. This is useful when using GFF files as inputs to tools that
|
|
110 expect traditional interval format.
|
|
111 """
|
|
112
|
|
113 def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6,
|
|
114 score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs):
|
|
115 NiceReaderWrapper.__init__(self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
|
|
116 strand_col=strand_col, fix_strand=fix_strand, **kwargs)
|
|
117 self.feature_col = feature_col
|
|
118 self.score_col = score_col
|
|
119 self.convert_to_bed_coord = convert_to_bed_coord
|
|
120 self.last_line = None
|
|
121 self.cur_offset = 0
|
|
122 self.seed_interval = None
|
|
123 self.seed_interval_line_len = 0
|
|
124
|
|
125 def parse_row(self, line):
|
|
126 interval = GFFInterval(self, line.split("\t"), self.chrom_col, self.feature_col, self.start_col,
|
|
127 self.end_col, self.strand_col, self.score_col, self.default_strand,
|
|
128 fix_strand=self.fix_strand)
|
|
129 return interval
|
|
130
|
|
131 def next(self):
|
|
132 """
|
|
133 Returns next GFFFeature.
|
|
134 """
|
|
135
|
|
136 def handle_parse_error(parse_error):
|
|
137 """
|
|
138 Actions to take when ParseError found.
|
|
139 """
|
|
140 if self.outstream:
|
|
141 if self.print_delegate and hasattr(self.print_delegate, "__call__"):
|
|
142 self.print_delegate(self.outstream, e, self)
|
|
143 self.skipped += 1
|
|
144 # No reason to stuff an entire bad file into memory.
|
|
145 if self.skipped < 10:
|
|
146 self.skipped_lines.append((self.linenum, self.current_line, str(e)))
|
|
147 # Get next GFFFeature
|
|
148 raw_size = self.seed_interval_line_len
|
|
149 # If there is no seed interval, set one. Also, if there are no more
|
|
150 # intervals to read, this is where iterator dies.
|
|
151 if not self.seed_interval:
|
|
152 while not self.seed_interval:
|
|
153 try:
|
|
154 self.seed_interval = GenomicIntervalReader.next(self)
|
|
155 except ParseError as e:
|
|
156 handle_parse_error(e)
|
|
157 finally:
|
|
158 raw_size += len(self.current_line)
|
|
159 # If header or comment, clear seed interval and return it with its size.
|
|
160 if isinstance(self.seed_interval, (Header, Comment)):
|
|
161 return_val = self.seed_interval
|
|
162 return_val.raw_size = len(self.current_line)
|
|
163 self.seed_interval = None
|
|
164 self.seed_interval_line_len = 0
|
|
165 return return_val
|
|
166 # Initialize feature identifier from seed.
|
|
167 # For GFF.
|
|
168 feature_group = self.seed_interval.attributes.get('group', None)
|
|
169 # For GFF3
|
|
170 feature_id = self.seed_interval.attributes.get('ID', None)
|
|
171 # For GTF.
|
|
172 feature_transcript_id = self.seed_interval.attributes.get('transcript_id', None)
|
|
173 # Read all intervals associated with seed.
|
|
174 feature_intervals = []
|
|
175 feature_intervals.append(self.seed_interval)
|
|
176 while True:
|
|
177 try:
|
|
178 interval = GenomicIntervalReader.next(self)
|
|
179 raw_size += len(self.current_line)
|
|
180 except StopIteration as e:
|
|
181 # No more intervals to read, but last feature needs to be
|
|
182 # returned.
|
|
183 interval = None
|
|
184 raw_size += len(self.current_line)
|
|
185 break
|
|
186 except ParseError as e:
|
|
187 handle_parse_error(e)
|
|
188 raw_size += len(self.current_line)
|
|
189 continue
|
|
190 # Ignore comments.
|
|
191 if isinstance(interval, Comment):
|
|
192 continue
|
|
193 # Determine if interval is part of feature.
|
|
194 part_of = False
|
|
195 group = interval.attributes.get('group', None)
|
|
196 # GFF test:
|
|
197 if group and feature_group == group:
|
|
198 part_of = True
|
|
199 # GFF3 test:
|
|
200 parent_id = interval.attributes.get('Parent', None)
|
|
201 cur_id = interval.attributes.get('ID', None)
|
|
202 if (cur_id and cur_id == feature_id) or (parent_id and parent_id == feature_id):
|
|
203 part_of = True
|
|
204 # GTF test:
|
|
205 transcript_id = interval.attributes.get('transcript_id', None)
|
|
206 if transcript_id and transcript_id == feature_transcript_id:
|
|
207 part_of = True
|
|
208 # If interval is not part of feature, clean up and break.
|
|
209 if not part_of:
|
|
210 # Adjust raw size because current line is not part of feature.
|
|
211 raw_size -= len(self.current_line)
|
|
212 break
|
|
213 # Interval associated with feature.
|
|
214 feature_intervals.append(interval)
|
|
215 # Last interval read is the seed for the next interval.
|
|
216 self.seed_interval = interval
|
|
217 self.seed_interval_line_len = len(self.current_line)
|
|
218 # Return feature.
|
|
219 feature = GFFFeature(self, self.chrom_col, self.feature_col, self.start_col,
|
|
220 self.end_col, self.strand_col, self.score_col,
|
|
221 self.default_strand, fix_strand=self.fix_strand,
|
|
222 intervals=feature_intervals, raw_size=raw_size)
|
|
223 # Convert to BED coords?
|
|
224 if self.convert_to_bed_coord:
|
|
225 convert_gff_coords_to_bed(feature)
|
|
226 return feature
|
|
227
|
|
228
|
|
229 def convert_bed_coords_to_gff(interval):
|
|
230 """
|
|
231 Converts an interval object's coordinates from BED format to GFF format.
|
|
232 Accepted object types include GenomicInterval and list (where the first
|
|
233 element in the list is the interval's start, and the second element is
|
|
234 the interval's end).
|
|
235 """
|
|
236 if isinstance(interval, GenomicInterval):
|
|
237 interval.start += 1
|
|
238 if isinstance(interval, GFFFeature):
|
|
239 for subinterval in interval.intervals:
|
|
240 convert_bed_coords_to_gff(subinterval)
|
|
241 elif isinstance(interval, list):
|
|
242 interval[0] += 1
|
|
243 return interval
|
|
244
|
|
245
|
|
246 def convert_gff_coords_to_bed(interval):
|
|
247 """
|
|
248 Converts an interval object's coordinates from GFF format to BED format.
|
|
249 Accepted object types include GFFFeature, GenomicInterval, and list (where
|
|
250 the first element in the list is the interval's start, and the second
|
|
251 element is the interval's end).
|
|
252 """
|
|
253 if isinstance(interval, GenomicInterval):
|
|
254 interval.start -= 1
|
|
255 if isinstance(interval, GFFFeature):
|
|
256 for subinterval in interval.intervals:
|
|
257 convert_gff_coords_to_bed(subinterval)
|
|
258 elif isinstance(interval, list):
|
|
259 interval[0] -= 1
|
|
260 return interval
|
|
261
|
|
262
|
|
263 def convert_to_twobit(reference_genome):
|
|
264 """
|
|
265 Create 2bit file history fasta dataset.
|
|
266 """
|
|
267 try:
|
|
268 seq_path = tempfile.NamedTemporaryFile(dir=".").name
|
|
269 cmd = "faToTwoBit %s %s" % (reference_genome, seq_path)
|
|
270 tmp_name = tempfile.NamedTemporaryFile(dir=".").name
|
|
271 tmp_stderr = open(tmp_name, 'wb')
|
|
272 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
|
|
273 returncode = proc.wait()
|
|
274 tmp_stderr.close()
|
|
275 if returncode != 0:
|
|
276 # Get stderr, allowing for case where it's very large.
|
|
277 tmp_stderr = open(tmp_name, 'rb')
|
|
278 stderr = ''
|
|
279 buffsize = 1048576
|
|
280 try:
|
|
281 while True:
|
|
282 stderr += tmp_stderr.read(buffsize)
|
|
283 if not stderr or len(stderr) % buffsize != 0:
|
|
284 break
|
|
285 except OverflowError:
|
|
286 pass
|
|
287 tmp_stderr.close()
|
|
288 os.remove(tmp_name)
|
|
289 stop_err(stderr)
|
|
290 return seq_path
|
|
291 except Exception, e:
|
|
292 stop_err('Error running faToTwoBit. ' + str(e))
|
|
293
|
|
294
|
16
|
295 def get_bedtools_getfasta_default_header(chrom, start, end, strand, includes_strand_col):
|
|
296 """
|
|
297 Return a fasta header that is the default produced by the bedtools
|
|
298 getfasta tool, assuming "force strandedness". This will produce a
|
|
299 header with this format: <chrom>:<start>-<end>(strand). If the input
|
|
300 data includes a strand column and the strand is '+' or '-', then use it.
|
|
301 If the input data includes a strand column and the value of strand is
|
|
302 anything but '+' or '-', set strand to '.' in the header. If the input
|
|
303 data does not include a strand column, set strand to '.' in the header.
|
|
304 """
|
|
305 if includes_strand_col and strand in ['+', '-']:
|
|
306 strand_val = strand
|
|
307 else:
|
|
308 strand_val = '.'
|
|
309 return '%s:%s-%s(%s)' % (chrom, start, end, strand_val)
|
|
310
|
|
311
|
|
312 def get_fasta_header_delimiter(delimiter):
|
|
313 # Return a specified fasta header delimiter.
|
|
314 if delimiter == 'underscore':
|
14
|
315 return '_'
|
16
|
316 if delimiter == 'semicolon':
|
14
|
317 return ';'
|
16
|
318 if delimiter == 'comma':
|
14
|
319 return ','
|
21
|
320 if delimiter == 'tilde':
|
14
|
321 return '~'
|
21
|
322 if delimiter == 'vertical_bar':
|
14
|
323 return '|'
|
|
324 # Set the default to underscore.
|
|
325 return '_'
|
|
326
|
|
327
|
8
|
328 def get_lines(feature):
|
|
329 # Get feature's line(s).
|
|
330 if isinstance(feature, GFFFeature):
|
|
331 return feature.lines()
|
|
332 else:
|
|
333 return [feature.rstrip('\r\n')]
|
|
334
|
|
335
|
|
336 def gff_attributes_to_str(attrs, gff_format):
|
|
337 """
|
|
338 Convert GFF attributes to string. Supported formats are GFF3, GTF.
|
|
339 """
|
|
340 if gff_format == 'GTF':
|
|
341 format_string = '%s "%s"'
|
|
342 # Convert group (GFF) and ID, parent (GFF3) attributes to
|
|
343 # transcript_id, gene_id.
|
|
344 id_attr = None
|
|
345 if 'group' in attrs:
|
|
346 id_attr = 'group'
|
|
347 elif 'ID' in attrs:
|
|
348 id_attr = 'ID'
|
|
349 elif 'Parent' in attrs:
|
|
350 id_attr = 'Parent'
|
|
351 if id_attr:
|
|
352 attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
|
|
353 elif gff_format == 'GFF3':
|
|
354 format_string = '%s=%s'
|
|
355 attrs_strs = []
|
|
356 for name, value in attrs.items():
|
|
357 attrs_strs.append(format_string % (name, value))
|
|
358 return " ; ".join(attrs_strs)
|
|
359
|
|
360
|
|
361 def parse_cols_arg(cols):
|
|
362 """
|
|
363 Parse a columns command line argument into a four-tuple.
|
|
364 """
|
|
365 if cols:
|
|
366 # Handle case where no strand column included - in this case, cols
|
|
367 # looks something like 1,2,3,
|
|
368 if cols.endswith(','):
|
|
369 cols += '0'
|
|
370 col_list = map(lambda x: int(x) - 1, cols.split(","))
|
|
371 return col_list
|
|
372 else:
|
|
373 return BED_DEFAULT_COLS
|
|
374
|
|
375
|
|
376 def parse_gff_attributes(attr_str):
|
|
377 """
|
|
378 Parses a GFF/GTF attribute string and returns a dictionary of name-value
|
|
379 pairs. The general format for a GFF3 attributes string is
|
|
380 name1=value1;name2=value2
|
|
381 The general format for a GTF attribute string is
|
|
382 name1 "value1" ; name2 "value2"
|
|
383 The general format for a GFF attribute string is a single string that
|
|
384 denotes the interval's group; in this case, method returns a dictionary
|
|
385 with a single key-value pair, and key name is 'group'.
|
|
386 """
|
|
387 attributes_list = attr_str.split(";")
|
|
388 attributes = {}
|
|
389 for name_value_pair in attributes_list:
|
|
390 # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
|
|
391 # attribute; next, try double quotes for GTF.
|
|
392 pair = name_value_pair.strip().split("=")
|
|
393 if len(pair) == 1:
|
|
394 pair = name_value_pair.strip().split("\"")
|
|
395 if len(pair) == 1:
|
|
396 # Could not split for some reason.
|
|
397 continue
|
|
398 if pair == '':
|
|
399 continue
|
|
400 name = pair[0].strip()
|
|
401 if name == '':
|
|
402 continue
|
|
403 # Need to strip double quote from values
|
|
404 value = pair[1].strip(" \"")
|
|
405 attributes[name] = value
|
|
406 if len(attributes) == 0:
|
|
407 # Could not split attributes string, so entire string must be
|
|
408 # 'group' attribute. This is the case for strictly GFF files.
|
|
409 attributes['group'] = attr_str
|
|
410 return attributes
|
|
411
|
|
412
|
|
413 def reverse_complement(s):
|
|
414 complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"}
|
|
415 reversed_s = []
|
|
416 for i in s:
|
|
417 reversed_s.append(complement_dna[i])
|
|
418 reversed_s.reverse()
|
|
419 return "".join(reversed_s)
|
|
420
|
|
421
|
|
422 def stop_err(msg):
|
|
423 sys.stderr.write(msg)
|
|
424 sys.exit(1)
|