comparison extract_genomic_dna.py @ 0:cff5b7c9be55 draft

Uploaded
author greg
date Thu, 14 Jan 2016 07:55:22 -0500
parents
children 311febbd33d6
comparison
equal deleted inserted replaced
-1:000000000000 0:cff5b7c9be55
1 #!/usr/bin/env python
2 import argparse
3 import os
4 import subprocess
5 import sys
6 import tempfile
7
8 import bx.seq.nib
9 import bx.seq.twobit
10 from bx.intervals.io import Header, Comment
11 from galaxy.datatypes.util import gff_util
12 from galaxy.tools.util.galaxyops import parse_cols_arg
13
14
15 def convert_to_twobit(reference_genome):
16 """
17 Create 2bit file history fasta dataset.
18 """
19 try:
20 seq_path = tempfile.NamedTemporaryFile(dir=".").name
21 cmd = "faToTwoBit %s %s" % (reference_genome, seq_path)
22 tmp_name = tempfile.NamedTemporaryFile(dir=".").name
23 tmp_stderr = open(tmp_name, 'wb')
24 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
25 returncode = proc.wait()
26 tmp_stderr.close()
27 if returncode != 0:
28 # Get stderr, allowing for case where it's very large.
29 tmp_stderr = open(tmp_name, 'rb')
30 stderr = ''
31 buffsize = 1048576
32 try:
33 while True:
34 stderr += tmp_stderr.read(buffsize)
35 if not stderr or len(stderr) % buffsize != 0:
36 break
37 except OverflowError:
38 pass
39 tmp_stderr.close()
40 os.remove(tmp_name)
41 stop_err(stderr)
42 return seq_path
43 except Exception, e:
44 stop_err('Error running faToTwoBit. ' + str(e))
45
46
47 def get_lines(feature):
48 # Get feature's line(s).
49 if isinstance(feature, gff_util.GFFFeature):
50 return feature.lines()
51 else:
52 return [feature.rstrip('\r\n')]
53
54
55 def reverse_complement(s):
56 complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"}
57 reversed_s = []
58 for i in s:
59 reversed_s.append(complement_dna[i])
60 reversed_s.reverse()
61 return "".join(reversed_s)
62
63
64 def stop_err(msg):
65 sys.stderr.write(msg)
66 sys.exit(1)
67
68 parser = argparse.ArgumentParser()
69 parser.add_option('--input_format', dest='input_format', help="Input dataset format")
70 parser.add_option('--input', dest='input', help="Input dataset")
71 parser.add_option('--dbkey', dest='dbkey', help="Input dataset genome build")
72 parser.add_option('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff")
73 parser.add_option('--columns', dest='columns', help="Columns to use in input file")
74 parser.add_option('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file")
75 parser.add_option('--reference_genome', dest='reference_genome', help="Reference genome file")
76 parser.add_option('--output_format', dest='output_format', help="Output format")
77 parser.add_option('--output', dest='output', help="Output dataset")
78 args = parser.parse_args()
79
80 input_is_gff = args.input_format == 'gff'
81 interpret_features = args.interpret_features == "yes"
82 if len(args.cols.split(',')) == 5:
83 # Bed file.
84 chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.cols)
85 else:
86 # Gff file.
87 chrom_col, start_col, end_col, strand_col = parse_cols_arg(args.cols)
88 name_col = False
89
90 if args.reference_genome_source == "history":
91 seq_path = convert_to_twobit(args.reference_genome)
92 else:
93 seq_path = args.reference_genome
94 seq_dir = os.path.split(seq_path)[0]
95
96 includes_strand_col = strand_col >= 0
97 strand = None
98 nibs = {}
99 skipped_lines = 0
100 first_invalid_line = 0
101 invalid_lines = []
102 warnings = []
103 warning = ''
104 twobitfile = None
105 line_count = 1
106 file_iterator = open(args.input)
107 if interpret_features:
108 file_iterator = gff_util.GFFReaderWrapper(file_iterator, fix_strand=False)
109 out = open(args.output, 'wt')
110
111 for feature in file_iterator:
112 # Ignore comments, headers.
113 if isinstance(feature, (Header, Comment)):
114 line_count += 1
115 continue
116 name = ""
117 if interpret_features:
118 # Processing features.
119 gff_util.convert_gff_coords_to_bed(feature)
120 chrom = feature.chrom
121 start = feature.start
122 end = feature.end
123 strand = feature.strand
124 else:
125 # Processing lines, either interval or GFF format.
126 line = feature.rstrip('\r\n')
127 if line and not line.startswith("#"):
128 fields = line.split('\t')
129 try:
130 chrom = fields[chrom_col]
131 start = int(fields[start_col])
132 end = int(fields[end_col])
133 if name_col:
134 name = fields[name_col]
135 if input_is_gff:
136 start, end = gff_util.convert_gff_coords_to_bed([start, end])
137 if includes_strand_col:
138 strand = fields[strand_col]
139 except:
140 warning = "Invalid chrom, start or end column values. "
141 warnings.append(warning)
142 if not invalid_lines:
143 invalid_lines = get_lines(feature)
144 first_invalid_line = line_count
145 skipped_lines += len(invalid_lines)
146 continue
147 if start > end:
148 warning = "Invalid interval, start '%d' > end '%d'. " % (start, end)
149 warnings.append(warning)
150 if not invalid_lines:
151 invalid_lines = get_lines(feature)
152 first_invalid_line = line_count
153 skipped_lines += len(invalid_lines)
154 continue
155 if strand not in ['+', '-']:
156 strand = '+'
157 sequence = ''
158 else:
159 continue
160 # Open sequence file and get sequence for feature/interval.
161 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)):
162 if chrom in nibs:
163 nib = nibs[chrom]
164 else:
165 nibs[chrom] = nib = bx.seq.nib.NibFile(file("%s/%s.nib" % (seq_path, chrom)))
166 try:
167 sequence = nib.get(start, end - start)
168 except Exception, e:
169 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.dbkey)
170 warnings.append(warning)
171 if not invalid_lines:
172 invalid_lines = get_lines(feature)
173 first_invalid_line = line_count
174 skipped_lines += len(invalid_lines)
175 continue
176 elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.dbkey)):
177 if not(twobitfile):
178 twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path))
179 try:
180 if interpret_features:
181 # Create sequence from intervals within a feature.
182 sequence = ''
183 for interval in feature.intervals:
184 sequence += twobitfile[interval.chrom][interval.start:interval.end]
185 else:
186 sequence = twobitfile[chrom][start:end]
187 except:
188 warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom)
189 warnings.append(warning)
190 if not invalid_lines:
191 invalid_lines = get_lines(feature)
192 first_invalid_line = line_count
193 skipped_lines += len(invalid_lines)
194 continue
195 else:
196 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.dbkey)
197 warnings.append(warning)
198 if not invalid_lines:
199 invalid_lines = get_lines(feature)
200 first_invalid_line = line_count
201 skipped_lines += len(invalid_lines)
202 continue
203 if sequence == '':
204 warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (chrom, start, end, args.dbkey)
205 warnings.append(warning)
206 if not invalid_lines:
207 invalid_lines = get_lines(feature)
208 first_invalid_line = line_count
209 skipped_lines += len(invalid_lines)
210 continue
211 if includes_strand_col and strand == "-":
212 sequence = reverse_complement(sequence)
213 if args.output_format == "fasta":
214 l = len(sequence)
215 c = 0
216 if input_is_gff:
217 start, end = gff_util.convert_bed_coords_to_gff([start, end])
218 fields = [args.dbkey, str(chrom), str(start), str(end), strand]
219 meta_data = "_".join(fields)
220 if name.strip():
221 out.write(">%s %s\n" % (meta_data, name))
222 else:
223 out.write(">%s\n" % meta_data)
224 while c < l:
225 b = min(c + 50, l)
226 out.write("%s\n" % str(sequence[c:b]))
227 c = b
228 else:
229 # output_format == "interval".
230 if interpret_features:
231 meta_data = "\t".join([feature.chrom,
232 "galaxy_extract_genomic_dna",
233 "interval",
234 str(feature.start),
235 str(feature.end),
236 feature.score,
237 feature.strand,
238 ".",
239 gff_util.gff_attributes_to_str(feature.attributes, "GTF")])
240 else:
241 # Where is fields being set here?
242 meta_data = "\t".join(fields)
243 if input_is_gff:
244 format_str = "%s seq \"%s\";\n"
245 else:
246 format_str = "%s\t%s\n"
247 out.write(format_str % (meta_data, str(sequence)))
248 # Update line count.
249 if isinstance(feature, gff_util.GFFFeature):
250 line_count += len(feature.intervals)
251 else:
252 line_count += 1
253 out.close()
254
255 if warnings:
256 warn_msg = "%d warnings, 1st is: " % len(warnings)
257 warn_msg += warnings[0]
258 print warn_msg
259 if skipped_lines:
260 # Error message includes up to the first 10 skipped lines.
261 print 'Skipped %d invalid lines, 1st is #%d, "%s"' % (skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10]))
262
263 if args.reference_genome_source == "history":
264 os.remove(seq_path)