annotate extract_genomic_dna_utils.py @ 8:32c6057529a4 draft

Uploaded
author greg
date Fri, 15 Jan 2016 08:43:41 -0500
parents
children 1a10864abc1f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8
32c6057529a4 Uploaded
greg
parents:
diff changeset
1 import copy
32c6057529a4 Uploaded
greg
parents:
diff changeset
2 import os
32c6057529a4 Uploaded
greg
parents:
diff changeset
3 import subprocess
32c6057529a4 Uploaded
greg
parents:
diff changeset
4 import sys
32c6057529a4 Uploaded
greg
parents:
diff changeset
5 import tempfile
32c6057529a4 Uploaded
greg
parents:
diff changeset
6
32c6057529a4 Uploaded
greg
parents:
diff changeset
7 from bx.intervals.io import Comment, Header, GenomicInterval
32c6057529a4 Uploaded
greg
parents:
diff changeset
8 from bx.intervals.io import GenomicIntervalReader, NiceReaderWrapper, ParseError
32c6057529a4 Uploaded
greg
parents:
diff changeset
9
32c6057529a4 Uploaded
greg
parents:
diff changeset
# Default column indexes (chrom, start, end, strand) used for a BED file.
BED_DEFAULT_COLS = (0, 1, 2, 5)
32c6057529a4 Uploaded
greg
parents:
diff changeset
12
32c6057529a4 Uploaded
greg
parents:
diff changeset
13
32c6057529a4 Uploaded
greg
parents:
diff changeset
class GFFInterval(GenomicInterval):
    """
    One interval (line) of a GFF-style file together with its parsed
    attributes. For a strictly-GFF file the only attribute is 'group'.
    """

    def __init__(self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
                 strand_col=6, score_col=5, default_strand='.', fix_strand=False):
        """
        Build the interval from the already-split ``fields`` of one line.

        The process is terminated through stop_err() when ``feature_col`` or
        ``score_col`` is not smaller than ``self.nfields``. The attributes
        are always parsed from ``fields[8]``.
        """
        # GenomicInterval rejects the '.' strand that GFF permits, so a '+'
        # placeholder is put in before the base initializer runs and the '.'
        # is restored afterwards.
        placeholder_strand = (not fix_strand) and fields[strand_col] == '.'
        if placeholder_strand:
            fields[strand_col] = '+'
        GenomicInterval.__init__(self, reader, fields, chrom_col, start_col, end_col,
                                 strand_col, default_strand, fix_strand=fix_strand)
        if placeholder_strand:
            self.strand = '.'
            self.fields[strand_col] = '.'

        # Feature column.
        self.feature_col = feature_col
        if self.feature_col >= self.nfields:
            stop_err("No field for feature_col (%d)" % feature_col)
        self.feature = self.fields[self.feature_col]

        # Score column.
        self.score_col = score_col
        if self.score_col >= self.nfields:
            stop_err("No field for score_col (%d)" % score_col)
        self.score = self.fields[self.score_col]

        # Attributes column.
        self.attributes = parse_gff_attributes(fields[8])

    def copy(self):
        """
        Return a new GFFInterval built from a copy of this interval's field
        list and the same column settings; this interval's strand is passed
        as the new interval's default strand.
        """
        fields_copy = list(self.fields)
        return GFFInterval(self.reader, fields_copy, self.chrom_col, self.feature_col,
                           self.start_col, self.end_col, self.strand_col, self.score_col,
                           self.strand)
32c6057529a4 Uploaded
greg
parents:
diff changeset
48
32c6057529a4 Uploaded
greg
parents:
diff changeset
49
32c6057529a4 Uploaded
greg
parents:
diff changeset
class GFFFeature(GFFInterval):
    """
    A GFF feature: a group of one or more GFFInterval objects that is itself
    presented as a single interval spanning all of them.
    """

    def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6,
                 score_col=5, default_strand='.', fix_strand=False, intervals=[], raw_size=0):
        """
        Build the feature from ``intervals``; the first interval supplies the
        feature's own fields, so ``intervals`` must not be empty.

        ``raw_size`` is stored unchanged on the feature. The process is
        terminated through stop_err() when an interval's chrom differs from
        the feature's chrom.
        """
        # Deep-copy the first interval's fields so that the feature and that
        # interval never share a field list.
        seed_fields = copy.deepcopy(intervals[0].fields)
        GFFInterval.__init__(self, reader, seed_fields, chrom_col, feature_col,
                             start_col, end_col, strand_col, score_col, default_strand,
                             fix_strand=fix_strand)
        self.intervals = intervals
        self.raw_size = raw_size
        # Grow the feature's span to cover every member interval.
        for member in self.intervals:
            # Members must share the chrom; they need not share the strand.
            if member.chrom != self.chrom:
                stop_err("interval chrom does not match self chrom: %s != %s" % (member.chrom, self.chrom))
            if member.start < self.start:
                self.start = member.start
            if member.end > self.end:
                self.end = member.end

    def name(self):
        """
        Return the feature's name, or None when no naming attribute is set.

        Attributes are consulted in this order of preference:
        'gene_id', 'transcript_id' (GTF); 'ID', 'id' (GFF3); 'group' (GFF).
        """
        for key in ['gene_id', 'transcript_id', 'ID', 'id', 'group']:
            candidate = self.attributes.get(key, None)
            if candidate is not None:
                return candidate
        return None

    def copy(self):
        """
        Return a new GFFFeature built from copies of the member intervals.
        This feature's strand is passed as the new default strand; raw_size
        is not passed on, so the copy gets the default of 0.
        """
        cloned_intervals = [member.copy() for member in self.intervals]
        return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col,
                          self.strand_col, self.score_col, self.strand, intervals=cloned_intervals)

    def lines(self):
        """
        Return one tab-joined text line per member interval.
        """
        return ['\t'.join(member.fields) for member in self.intervals]
32c6057529a4 Uploaded
greg
parents:
diff changeset
100
32c6057529a4 Uploaded
greg
parents:
diff changeset
101
32c6057529a4 Uploaded
greg
parents:
diff changeset
class GFFReaderWrapper(NiceReaderWrapper):
    """
    Reader wrapper for GFF files which has two major functions:
    1. group entries for GFF file (via group column), GFF3 (via id attribute),
       or GTF (via gene_id/transcript id);
    2. convert coordinates from GFF format--starting and ending coordinates
       are 1-based, closed--to the 'traditional'/BED interval format--0 based,
       half-open. This is useful when using GFF files as inputs to tools that
       expect traditional interval format.
    """

    def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6,
                 score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs):
        """
        Set up the wrapper. The column settings and ``fix_strand`` are passed
        on to NiceReaderWrapper along with any extra keyword arguments;
        ``convert_to_bed_coord`` makes next() convert each returned feature
        with convert_gff_coords_to_bed().
        """
        NiceReaderWrapper.__init__(self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
                                   strand_col=strand_col, fix_strand=fix_strand, **kwargs)
        self.feature_col = feature_col
        self.score_col = score_col
        self.convert_to_bed_coord = convert_to_bed_coord
        self.last_line = None
        self.cur_offset = 0
        # Interval that starts the next feature; it was read while finishing
        # the previous feature but did not belong to it.
        self.seed_interval = None
        self.seed_interval_line_len = 0

    def parse_row(self, line):
        """
        Split ``line`` on tabs and return it as a GFFInterval that uses this
        reader's column settings.
        """
        interval = GFFInterval(self, line.split("\t"), self.chrom_col, self.feature_col, self.start_col,
                               self.end_col, self.strand_col, self.score_col, self.default_strand,
                               fix_strand=self.fix_strand)
        return interval

    def next(self):
        """
        Returns next GFFFeature. A Header or Comment that is read while
        looking for the first interval of a feature is returned as is, with
        its raw_size set to the length of its line.
        """

        def handle_parse_error(parse_error):
            """
            Actions to take when ParseError found.
            """
            if self.outstream:
                if self.print_delegate and hasattr(self.print_delegate, "__call__"):
                    self.print_delegate(self.outstream, parse_error, self)
            self.skipped += 1
            # No reason to stuff an entire bad file into memory.
            if self.skipped < 10:
                self.skipped_lines.append((self.linenum, self.current_line, str(parse_error)))

        # Get next GFFFeature
        raw_size = self.seed_interval_line_len
        # If there is no seed interval, set one. Also, if there are no more
        # intervals to read, this is where iterator dies.
        if not self.seed_interval:
            while not self.seed_interval:
                try:
                    self.seed_interval = GenomicIntervalReader.next(self)
                except ParseError as e:
                    handle_parse_error(e)
                finally:
                    raw_size += len(self.current_line)
            # If header or comment, clear seed interval and return it with its size.
            if isinstance(self.seed_interval, (Header, Comment)):
                return_val = self.seed_interval
                return_val.raw_size = len(self.current_line)
                self.seed_interval = None
                self.seed_interval_line_len = 0
                return return_val
        # Initialize feature identifier from seed.
        # For GFF.
        feature_group = self.seed_interval.attributes.get('group', None)
        # For GFF3
        feature_id = self.seed_interval.attributes.get('ID', None)
        # For GTF.
        feature_transcript_id = self.seed_interval.attributes.get('transcript_id', None)
        # Read all intervals associated with seed.
        feature_intervals = []
        feature_intervals.append(self.seed_interval)
        while True:
            try:
                interval = GenomicIntervalReader.next(self)
                raw_size += len(self.current_line)
            except StopIteration:
                # No more intervals to read, but last feature needs to be
                # returned.
                interval = None
                raw_size += len(self.current_line)
                break
            except ParseError as e:
                handle_parse_error(e)
                raw_size += len(self.current_line)
                continue
            # Ignore comments.
            if isinstance(interval, Comment):
                continue
            # Determine if interval is part of feature.
            part_of = False
            group = interval.attributes.get('group', None)
            # GFF test:
            if group and feature_group == group:
                part_of = True
            # GFF3 test:
            parent_id = interval.attributes.get('Parent', None)
            cur_id = interval.attributes.get('ID', None)
            if (cur_id and cur_id == feature_id) or (parent_id and parent_id == feature_id):
                part_of = True
            # GTF test:
            transcript_id = interval.attributes.get('transcript_id', None)
            if transcript_id and transcript_id == feature_transcript_id:
                part_of = True
            # If interval is not part of feature, clean up and break.
            if not part_of:
                # Adjust raw size because current line is not part of feature.
                raw_size -= len(self.current_line)
                break
            # Interval associated with feature.
            feature_intervals.append(interval)
        # Last interval read is the seed for the next interval.
        self.seed_interval = interval
        self.seed_interval_line_len = len(self.current_line)
        # Return feature.
        feature = GFFFeature(self, self.chrom_col, self.feature_col, self.start_col,
                             self.end_col, self.strand_col, self.score_col,
                             self.default_strand, fix_strand=self.fix_strand,
                             intervals=feature_intervals, raw_size=raw_size)
        # Convert to BED coords?
        if self.convert_to_bed_coord:
            convert_gff_coords_to_bed(feature)
        return feature
32c6057529a4 Uploaded
greg
parents:
diff changeset
227
32c6057529a4 Uploaded
greg
parents:
diff changeset
228
32c6057529a4 Uploaded
greg
parents:
diff changeset
def convert_bed_coords_to_gff(interval):
    """
    Shift an interval's start from BED to GFF coordinates by adding 1 to it.

    The object is changed in place and also returned. Handled types are
    GenomicInterval (for a GFFFeature every sub-interval is converted as
    well) and list, where the first element is the interval's start and the
    second element is the interval's end. Any other object is returned
    untouched.
    """
    if not isinstance(interval, GenomicInterval):
        if isinstance(interval, list):
            interval[0] += 1
        return interval
    interval.start += 1
    if isinstance(interval, GFFFeature):
        for member in interval.intervals:
            convert_bed_coords_to_gff(member)
    return interval
32c6057529a4 Uploaded
greg
parents:
diff changeset
244
32c6057529a4 Uploaded
greg
parents:
diff changeset
245
32c6057529a4 Uploaded
greg
parents:
diff changeset
def convert_gff_coords_to_bed(interval):
    """
    Shift an interval's start from GFF to BED coordinates by subtracting 1
    from it.

    The object is changed in place and also returned. Handled types are
    GenomicInterval (for a GFFFeature every sub-interval is converted as
    well) and list, where the first element is the interval's start and the
    second element is the interval's end. Any other object is returned
    untouched.
    """
    if not isinstance(interval, GenomicInterval):
        if isinstance(interval, list):
            interval[0] -= 1
        return interval
    interval.start -= 1
    if isinstance(interval, GFFFeature):
        for member in interval.intervals:
            convert_gff_coords_to_bed(member)
    return interval
32c6057529a4 Uploaded
greg
parents:
diff changeset
261
32c6057529a4 Uploaded
greg
parents:
diff changeset
262
32c6057529a4 Uploaded
greg
parents:
diff changeset
def convert_to_twobit(reference_genome):
    """
    Create a 2bit file from a history fasta dataset by running faToTwoBit.

    ``reference_genome`` is the path of the fasta file. Returns the path of
    the generated 2bit file, which is created in the current directory.

    On any failure the process is terminated through stop_err(): with the
    tool's captured stderr when faToTwoBit exits with a non-zero status, or
    with a generic message when an exception is raised.
    """
    try:
        # Only the generated names are used; the files are (re)created below
        # and by faToTwoBit itself.
        seq_path = tempfile.NamedTemporaryFile(dir=".").name
        tmp_name = tempfile.NamedTemporaryFile(dir=".").name
        try:
            with open(tmp_name, 'wb') as tmp_stderr:
                # An argument list (no shell) keeps paths that contain spaces
                # or shell metacharacters intact.
                proc = subprocess.Popen(args=['faToTwoBit', reference_genome, seq_path],
                                        stderr=tmp_stderr.fileno())
                returncode = proc.wait()
            if returncode != 0:
                with open(tmp_name, 'rb') as tmp_stderr:
                    stderr = tmp_stderr.read()
                # Bytes under Python 3; stop_err() writes text.
                if not isinstance(stderr, str):
                    stderr = stderr.decode('utf-8', 'replace')
                stop_err(stderr)
        finally:
            # The stderr scratch file is no longer needed, whatever happened.
            if os.path.exists(tmp_name):
                os.remove(tmp_name)
        return seq_path
    except Exception as e:
        stop_err('Error running faToTwoBit. ' + str(e))
32c6057529a4 Uploaded
greg
parents:
diff changeset
293
32c6057529a4 Uploaded
greg
parents:
diff changeset
294
32c6057529a4 Uploaded
greg
parents:
diff changeset
def get_lines(feature):
    """
    Return the text line(s) of ``feature`` as a list: the result of
    lines() for a GFFFeature, otherwise a one-element list holding the value
    with trailing carriage-return/newline characters removed.
    """
    if not isinstance(feature, GFFFeature):
        return [feature.rstrip('\r\n')]
    return feature.lines()
32c6057529a4 Uploaded
greg
parents:
diff changeset
301
32c6057529a4 Uploaded
greg
parents:
diff changeset
302
32c6057529a4 Uploaded
greg
parents:
diff changeset
def gff_attributes_to_str(attrs, gff_format):
    """
    Convert GFF attributes to string. Supported formats are GFF3, GTF.

    ``attrs`` is a dictionary of attribute names and values; it is not
    modified. ``gff_format`` is either 'GTF' (pairs rendered as
    ``name "value"``) or 'GFF3' (pairs rendered as ``name=value``). Pairs are
    joined with " ; ".

    For GTF, 'transcript_id' and 'gene_id' are set from the first of the
    'group' (GFF), 'ID' or 'Parent' (GFF3) attributes that is present.

    Raises ValueError for any other ``gff_format``.
    """
    if gff_format == 'GTF':
        format_string = '%s "%s"'
    elif gff_format == 'GFF3':
        format_string = '%s=%s'
    else:
        raise ValueError("Unsupported GFF format: %s" % gff_format)
    # Work on a copy so that the caller's dictionary is left untouched.
    attrs = dict(attrs)
    if gff_format == 'GTF':
        # Convert group (GFF) and ID, parent (GFF3) attributes to
        # transcript_id, gene_id.
        for id_attr in ('group', 'ID', 'Parent'):
            if id_attr in attrs:
                attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
                break
    return " ; ".join([format_string % (name, value) for name, value in attrs.items()])
32c6057529a4 Uploaded
greg
parents:
diff changeset
326
32c6057529a4 Uploaded
greg
parents:
diff changeset
327
32c6057529a4 Uploaded
greg
parents:
diff changeset
def parse_cols_arg(cols):
    """
    Parse a columns command line argument into a list of column indexes.

    ``cols`` is a comma-separated string of 1-based column numbers; each one
    is returned reduced by 1. When no strand column is included (``cols``
    ends with a comma, e.g. "1,2,3,") the missing entry becomes -1.
    BED_DEFAULT_COLS is returned when ``cols`` is empty or None.
    """
    if not cols:
        return BED_DEFAULT_COLS
    # Handle case where no strand column included - in this case, cols
    # looks something like 1,2,3,
    if cols.endswith(','):
        cols += '0'
    # A real list (not a lazy map object) so callers can index the result.
    return [int(col) - 1 for col in cols.split(",")]
32c6057529a4 Uploaded
greg
parents:
diff changeset
341
32c6057529a4 Uploaded
greg
parents:
diff changeset
342
32c6057529a4 Uploaded
greg
parents:
diff changeset
def parse_gff_attributes(attr_str):
    """
    Parses a GFF/GTF attribute string and returns a dictionary of name-value
    pairs. The general format for a GFF3 attributes string is
        name1=value1;name2=value2
    The general format for a GTF attribute string is
        name1 "value1" ; name2 "value2"
    The general format for a GFF attribute string is a single string that
    denotes the interval's group; in this case, method returns a dictionary
    with a single key-value pair, and key name is 'group'.

    A GFF3 value may itself contain '=' characters: only the first '=' of a
    pair separates the name from the value.
    """
    attributes = {}
    for name_value_pair in attr_str.split(";"):
        stripped_pair = name_value_pair.strip()
        # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
        # attribute; next, try double quotes for GTF.
        pair = stripped_pair.split("=", 1)
        if len(pair) == 1:
            pair = stripped_pair.split("\"")
            if len(pair) == 1:
                # Could not split for some reason.
                continue
        name = pair[0].strip()
        if name == '':
            continue
        # Need to strip double quote from values
        value = pair[1].strip(" \"")
        attributes[name] = value
    if len(attributes) == 0:
        # Could not split attributes string, so entire string must be
        # 'group' attribute. This is the case for strictly GFF files.
        attributes['group'] = attr_str
    return attributes
32c6057529a4 Uploaded
greg
parents:
diff changeset
378
32c6057529a4 Uploaded
greg
parents:
diff changeset
379
32c6057529a4 Uploaded
greg
parents:
diff changeset
def reverse_complement(s):
    """
    Return the reverse complement of the DNA sequence ``s``.

    Upper- and lower-case A, C, G, T and N are supported and keep their case;
    any other character raises KeyError.
    """
    complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"}
    return "".join([complement_dna[base] for base in reversed(s)])
32c6057529a4 Uploaded
greg
parents:
diff changeset
387
32c6057529a4 Uploaded
greg
parents:
diff changeset
388
32c6057529a4 Uploaded
greg
parents:
diff changeset
def stop_err(msg):
    """
    Write ``msg`` to standard error and terminate the process with exit
    status 1.
    """
    error_stream = sys.stderr
    error_stream.write(msg)
    sys.exit(1)