Mercurial > repos > greg > extract_genomic_dna
comparison extract_genomic_dna.py @ 0:cff5b7c9be55 draft
Uploaded
author | greg |
---|---|
date | Thu, 14 Jan 2016 07:55:22 -0500 |
parents | |
children | 311febbd33d6 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:cff5b7c9be55 |
---|---|
1 #!/usr/bin/env python | |
2 import argparse | |
3 import os | |
4 import subprocess | |
5 import sys | |
6 import tempfile | |
7 | |
8 import bx.seq.nib | |
9 import bx.seq.twobit | |
10 from bx.intervals.io import Header, Comment | |
11 from galaxy.datatypes.util import gff_util | |
12 from galaxy.tools.util.galaxyops import parse_cols_arg | |
13 | |
14 | |
15 def convert_to_twobit(reference_genome): | |
16 """ | |
17 Create 2bit file history fasta dataset. | |
18 """ | |
19 try: | |
20 seq_path = tempfile.NamedTemporaryFile(dir=".").name | |
21 cmd = "faToTwoBit %s %s" % (reference_genome, seq_path) | |
22 tmp_name = tempfile.NamedTemporaryFile(dir=".").name | |
23 tmp_stderr = open(tmp_name, 'wb') | |
24 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno()) | |
25 returncode = proc.wait() | |
26 tmp_stderr.close() | |
27 if returncode != 0: | |
28 # Get stderr, allowing for case where it's very large. | |
29 tmp_stderr = open(tmp_name, 'rb') | |
30 stderr = '' | |
31 buffsize = 1048576 | |
32 try: | |
33 while True: | |
34 stderr += tmp_stderr.read(buffsize) | |
35 if not stderr or len(stderr) % buffsize != 0: | |
36 break | |
37 except OverflowError: | |
38 pass | |
39 tmp_stderr.close() | |
40 os.remove(tmp_name) | |
41 stop_err(stderr) | |
42 return seq_path | |
43 except Exception, e: | |
44 stop_err('Error running faToTwoBit. ' + str(e)) | |
45 | |
46 | |
47 def get_lines(feature): | |
48 # Get feature's line(s). | |
49 if isinstance(feature, gff_util.GFFFeature): | |
50 return feature.lines() | |
51 else: | |
52 return [feature.rstrip('\r\n')] | |
53 | |
54 | |
55 def reverse_complement(s): | |
56 complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"} | |
57 reversed_s = [] | |
58 for i in s: | |
59 reversed_s.append(complement_dna[i]) | |
60 reversed_s.reverse() | |
61 return "".join(reversed_s) | |
62 | |
63 | |
64 def stop_err(msg): | |
65 sys.stderr.write(msg) | |
66 sys.exit(1) | |
67 | |
68 parser = argparse.ArgumentParser() | |
69 parser.add_option('--input_format', dest='input_format', help="Input dataset format") | |
70 parser.add_option('--input', dest='input', help="Input dataset") | |
71 parser.add_option('--dbkey', dest='dbkey', help="Input dataset genome build") | |
72 parser.add_option('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") | |
73 parser.add_option('--columns', dest='columns', help="Columns to use in input file") | |
74 parser.add_option('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") | |
75 parser.add_option('--reference_genome', dest='reference_genome', help="Reference genome file") | |
76 parser.add_option('--output_format', dest='output_format', help="Output format") | |
77 parser.add_option('--output', dest='output', help="Output dataset") | |
78 args = parser.parse_args() | |
79 | |
80 input_is_gff = args.input_format == 'gff' | |
81 interpret_features = args.interpret_features == "yes" | |
82 if len(args.cols.split(',')) == 5: | |
83 # Bed file. | |
84 chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.cols) | |
85 else: | |
86 # Gff file. | |
87 chrom_col, start_col, end_col, strand_col = parse_cols_arg(args.cols) | |
88 name_col = False | |
89 | |
90 if args.reference_genome_source == "history": | |
91 seq_path = convert_to_twobit(args.reference_genome) | |
92 else: | |
93 seq_path = args.reference_genome | |
94 seq_dir = os.path.split(seq_path)[0] | |
95 | |
96 includes_strand_col = strand_col >= 0 | |
97 strand = None | |
98 nibs = {} | |
99 skipped_lines = 0 | |
100 first_invalid_line = 0 | |
101 invalid_lines = [] | |
102 warnings = [] | |
103 warning = '' | |
104 twobitfile = None | |
105 line_count = 1 | |
106 file_iterator = open(args.input) | |
107 if interpret_features: | |
108 file_iterator = gff_util.GFFReaderWrapper(file_iterator, fix_strand=False) | |
109 out = open(args.output, 'wt') | |
110 | |
111 for feature in file_iterator: | |
112 # Ignore comments, headers. | |
113 if isinstance(feature, (Header, Comment)): | |
114 line_count += 1 | |
115 continue | |
116 name = "" | |
117 if interpret_features: | |
118 # Processing features. | |
119 gff_util.convert_gff_coords_to_bed(feature) | |
120 chrom = feature.chrom | |
121 start = feature.start | |
122 end = feature.end | |
123 strand = feature.strand | |
124 else: | |
125 # Processing lines, either interval or GFF format. | |
126 line = feature.rstrip('\r\n') | |
127 if line and not line.startswith("#"): | |
128 fields = line.split('\t') | |
129 try: | |
130 chrom = fields[chrom_col] | |
131 start = int(fields[start_col]) | |
132 end = int(fields[end_col]) | |
133 if name_col: | |
134 name = fields[name_col] | |
135 if input_is_gff: | |
136 start, end = gff_util.convert_gff_coords_to_bed([start, end]) | |
137 if includes_strand_col: | |
138 strand = fields[strand_col] | |
139 except: | |
140 warning = "Invalid chrom, start or end column values. " | |
141 warnings.append(warning) | |
142 if not invalid_lines: | |
143 invalid_lines = get_lines(feature) | |
144 first_invalid_line = line_count | |
145 skipped_lines += len(invalid_lines) | |
146 continue | |
147 if start > end: | |
148 warning = "Invalid interval, start '%d' > end '%d'. " % (start, end) | |
149 warnings.append(warning) | |
150 if not invalid_lines: | |
151 invalid_lines = get_lines(feature) | |
152 first_invalid_line = line_count | |
153 skipped_lines += len(invalid_lines) | |
154 continue | |
155 if strand not in ['+', '-']: | |
156 strand = '+' | |
157 sequence = '' | |
158 else: | |
159 continue | |
160 # Open sequence file and get sequence for feature/interval. | |
161 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)): | |
162 if chrom in nibs: | |
163 nib = nibs[chrom] | |
164 else: | |
165 nibs[chrom] = nib = bx.seq.nib.NibFile(file("%s/%s.nib" % (seq_path, chrom))) | |
166 try: | |
167 sequence = nib.get(start, end - start) | |
168 except Exception, e: | |
169 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.dbkey) | |
170 warnings.append(warning) | |
171 if not invalid_lines: | |
172 invalid_lines = get_lines(feature) | |
173 first_invalid_line = line_count | |
174 skipped_lines += len(invalid_lines) | |
175 continue | |
176 elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.dbkey)): | |
177 if not(twobitfile): | |
178 twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path)) | |
179 try: | |
180 if interpret_features: | |
181 # Create sequence from intervals within a feature. | |
182 sequence = '' | |
183 for interval in feature.intervals: | |
184 sequence += twobitfile[interval.chrom][interval.start:interval.end] | |
185 else: | |
186 sequence = twobitfile[chrom][start:end] | |
187 except: | |
188 warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom) | |
189 warnings.append(warning) | |
190 if not invalid_lines: | |
191 invalid_lines = get_lines(feature) | |
192 first_invalid_line = line_count | |
193 skipped_lines += len(invalid_lines) | |
194 continue | |
195 else: | |
196 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.dbkey) | |
197 warnings.append(warning) | |
198 if not invalid_lines: | |
199 invalid_lines = get_lines(feature) | |
200 first_invalid_line = line_count | |
201 skipped_lines += len(invalid_lines) | |
202 continue | |
203 if sequence == '': | |
204 warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (chrom, start, end, args.dbkey) | |
205 warnings.append(warning) | |
206 if not invalid_lines: | |
207 invalid_lines = get_lines(feature) | |
208 first_invalid_line = line_count | |
209 skipped_lines += len(invalid_lines) | |
210 continue | |
211 if includes_strand_col and strand == "-": | |
212 sequence = reverse_complement(sequence) | |
213 if args.output_format == "fasta": | |
214 l = len(sequence) | |
215 c = 0 | |
216 if input_is_gff: | |
217 start, end = gff_util.convert_bed_coords_to_gff([start, end]) | |
218 fields = [args.dbkey, str(chrom), str(start), str(end), strand] | |
219 meta_data = "_".join(fields) | |
220 if name.strip(): | |
221 out.write(">%s %s\n" % (meta_data, name)) | |
222 else: | |
223 out.write(">%s\n" % meta_data) | |
224 while c < l: | |
225 b = min(c + 50, l) | |
226 out.write("%s\n" % str(sequence[c:b])) | |
227 c = b | |
228 else: | |
229 # output_format == "interval". | |
230 if interpret_features: | |
231 meta_data = "\t".join([feature.chrom, | |
232 "galaxy_extract_genomic_dna", | |
233 "interval", | |
234 str(feature.start), | |
235 str(feature.end), | |
236 feature.score, | |
237 feature.strand, | |
238 ".", | |
239 gff_util.gff_attributes_to_str(feature.attributes, "GTF")]) | |
240 else: | |
241 # Where is fields being set here? | |
242 meta_data = "\t".join(fields) | |
243 if input_is_gff: | |
244 format_str = "%s seq \"%s\";\n" | |
245 else: | |
246 format_str = "%s\t%s\n" | |
247 out.write(format_str % (meta_data, str(sequence))) | |
248 # Update line count. | |
249 if isinstance(feature, gff_util.GFFFeature): | |
250 line_count += len(feature.intervals) | |
251 else: | |
252 line_count += 1 | |
253 out.close() | |
254 | |
255 if warnings: | |
256 warn_msg = "%d warnings, 1st is: " % len(warnings) | |
257 warn_msg += warnings[0] | |
258 print warn_msg | |
259 if skipped_lines: | |
260 # Error message includes up to the first 10 skipped lines. | |
261 print 'Skipped %d invalid lines, 1st is #%d, "%s"' % (skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10])) | |
262 | |
263 if args.reference_genome_source == "history": | |
264 os.remove(seq_path) |