Mercurial > repos > greg > extract_genomic_dna
comparison extract_genomic_dna.py @ 9:8cc00c5cf33e draft
Uploaded
author | greg |
---|---|
date | Fri, 15 Jan 2016 08:43:52 -0500 |
parents | ee00b3c68801 |
children | 1a10864abc1f |
comparison
equal
deleted
inserted
replaced
8:32c6057529a4 | 9:8cc00c5cf33e |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 import argparse | 2 import argparse |
3 import os | 3 import os |
4 import subprocess | 4 |
5 import sys | 5 import extract_genomic_dna_utils as egdu |
6 import tempfile | |
7 | |
8 import bx.seq.nib | 6 import bx.seq.nib |
9 import bx.seq.twobit | 7 import bx.seq.twobit |
10 from bx.intervals.io import Header, Comment | 8 from bx.intervals.io import Header, Comment |
11 from galaxy.datatypes.util import gff_util | 9 |
12 from galaxy.tools.util.galaxyops import parse_cols_arg | |
13 | |
14 | |
15 def convert_to_twobit(reference_genome): | |
16 """ | |
17 Create 2bit file history fasta dataset. | |
18 """ | |
19 try: | |
20 seq_path = tempfile.NamedTemporaryFile(dir=".").name | |
21 cmd = "faToTwoBit %s %s" % (reference_genome, seq_path) | |
22 tmp_name = tempfile.NamedTemporaryFile(dir=".").name | |
23 tmp_stderr = open(tmp_name, 'wb') | |
24 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno()) | |
25 returncode = proc.wait() | |
26 tmp_stderr.close() | |
27 if returncode != 0: | |
28 # Get stderr, allowing for case where it's very large. | |
29 tmp_stderr = open(tmp_name, 'rb') | |
30 stderr = '' | |
31 buffsize = 1048576 | |
32 try: | |
33 while True: | |
34 stderr += tmp_stderr.read(buffsize) | |
35 if not stderr or len(stderr) % buffsize != 0: | |
36 break | |
37 except OverflowError: | |
38 pass | |
39 tmp_stderr.close() | |
40 os.remove(tmp_name) | |
41 stop_err(stderr) | |
42 return seq_path | |
43 except Exception, e: | |
44 stop_err('Error running faToTwoBit. ' + str(e)) | |
45 | |
46 | |
47 def get_lines(feature): | |
48 # Get feature's line(s). | |
49 if isinstance(feature, gff_util.GFFFeature): | |
50 return feature.lines() | |
51 else: | |
52 return [feature.rstrip('\r\n')] | |
53 | |
54 | |
55 def reverse_complement(s): | |
56 complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"} | |
57 reversed_s = [] | |
58 for i in s: | |
59 reversed_s.append(complement_dna[i]) | |
60 reversed_s.reverse() | |
61 return "".join(reversed_s) | |
62 | |
63 | |
64 def stop_err(msg): | |
65 sys.stderr.write(msg) | |
66 sys.exit(1) | |
67 | 10 |
68 parser = argparse.ArgumentParser() | 11 parser = argparse.ArgumentParser() |
69 parser.add_argument('--input_format', dest='input_format', help="Input dataset format") | 12 parser.add_argument('--input_format', dest='input_format', help="Input dataset format") |
70 parser.add_argument('--input', dest='input', help="Input dataset") | 13 parser.add_argument('--input', dest='input', help="Input dataset") |
71 parser.add_argument('--genome', dest='genome', help="Input dataset genome build") | 14 parser.add_argument('--genome', dest='genome', help="Input dataset genome build") |
79 | 22 |
80 input_is_gff = args.input_format == 'gff' | 23 input_is_gff = args.input_format == 'gff' |
81 interpret_features = input_is_gff and args.interpret_features == "yes" | 24 interpret_features = input_is_gff and args.interpret_features == "yes" |
82 if len(args.columns.split(',')) == 5: | 25 if len(args.columns.split(',')) == 5: |
83 # Bed file. | 26 # Bed file. |
84 chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.columns) | 27 chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(args.columns) |
85 else: | 28 else: |
86 # Gff file. | 29 # Gff file. |
87 chrom_col, start_col, end_col, strand_col = parse_cols_arg(args.columns) | 30 chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns) |
88 name_col = False | 31 name_col = False |
89 | 32 |
90 if args.reference_genome_source == "history": | 33 if args.reference_genome_source == "history": |
91 seq_path = convert_to_twobit(args.reference_genome) | 34 seq_path = egdu.convert_to_twobit(args.reference_genome) |
92 else: | 35 else: |
93 seq_path = args.reference_genome | 36 seq_path = args.reference_genome |
94 seq_dir = os.path.split(seq_path)[0] | 37 seq_dir = os.path.split(seq_path)[0] |
95 | 38 |
96 includes_strand_col = strand_col >= 0 | 39 includes_strand_col = strand_col >= 0 |
103 warning = '' | 46 warning = '' |
104 twobitfile = None | 47 twobitfile = None |
105 line_count = 1 | 48 line_count = 1 |
106 file_iterator = open(args.input) | 49 file_iterator = open(args.input) |
107 if interpret_features: | 50 if interpret_features: |
108 file_iterator = gff_util.GFFReaderWrapper(file_iterator, fix_strand=False) | 51 file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False) |
109 out = open(args.output, 'wt') | 52 out = open(args.output, 'wt') |
110 | 53 |
111 for feature in file_iterator: | 54 for feature in file_iterator: |
112 # Ignore comments, headers. | 55 # Ignore comments, headers. |
113 if isinstance(feature, (Header, Comment)): | 56 if isinstance(feature, (Header, Comment)): |
114 line_count += 1 | 57 line_count += 1 |
115 continue | 58 continue |
116 name = "" | 59 name = "" |
117 if interpret_features: | 60 if interpret_features: |
118 # Processing features. | 61 # Processing features. |
119 gff_util.convert_gff_coords_to_bed(feature) | 62 egdu.convert_gff_coords_to_bed(feature) |
120 chrom = feature.chrom | 63 chrom = feature.chrom |
121 start = feature.start | 64 start = feature.start |
122 end = feature.end | 65 end = feature.end |
123 strand = feature.strand | 66 strand = feature.strand |
124 else: | 67 else: |
131 start = int(fields[start_col]) | 74 start = int(fields[start_col]) |
132 end = int(fields[end_col]) | 75 end = int(fields[end_col]) |
133 if name_col: | 76 if name_col: |
134 name = fields[name_col] | 77 name = fields[name_col] |
135 if input_is_gff: | 78 if input_is_gff: |
136 start, end = gff_util.convert_gff_coords_to_bed([start, end]) | 79 start, end = egdu.convert_gff_coords_to_bed([start, end]) |
137 if includes_strand_col: | 80 if includes_strand_col: |
138 strand = fields[strand_col] | 81 strand = fields[strand_col] |
139 except: | 82 except: |
140 warning = "Invalid chrom, start or end column values. " | 83 warning = "Invalid chrom, start or end column values. " |
141 warnings.append(warning) | 84 warnings.append(warning) |
142 if not invalid_lines: | 85 if not invalid_lines: |
143 invalid_lines = get_lines(feature) | 86 invalid_lines = egdu.get_lines(feature) |
144 first_invalid_line = line_count | 87 first_invalid_line = line_count |
145 skipped_lines += len(invalid_lines) | 88 skipped_lines += len(invalid_lines) |
146 continue | 89 continue |
147 if start > end: | 90 if start > end: |
148 warning = "Invalid interval, start '%d' > end '%d'. " % (start, end) | 91 warning = "Invalid interval, start '%d' > end '%d'. " % (start, end) |
149 warnings.append(warning) | 92 warnings.append(warning) |
150 if not invalid_lines: | 93 if not invalid_lines: |
151 invalid_lines = get_lines(feature) | 94 invalid_lines = egdu.get_lines(feature) |
152 first_invalid_line = line_count | 95 first_invalid_line = line_count |
153 skipped_lines += len(invalid_lines) | 96 skipped_lines += len(invalid_lines) |
154 continue | 97 continue |
155 if strand not in ['+', '-']: | 98 if strand not in ['+', '-']: |
156 strand = '+' | 99 strand = '+' |
160 # Open sequence file and get sequence for feature/interval. | 103 # Open sequence file and get sequence for feature/interval. |
161 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)): | 104 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)): |
162 if chrom in nibs: | 105 if chrom in nibs: |
163 nib = nibs[chrom] | 106 nib = nibs[chrom] |
164 else: | 107 else: |
165 nibs[chrom] = nib = bx.seq.nib.NibFile(file("%s/%s.nib" % (seq_path, chrom))) | 108 nibs[chrom] = nib = bx.seq.nib.NibFile(open("%s/%s.nib" % (seq_path, chrom))) |
166 try: | 109 try: |
167 sequence = nib.get(start, end - start) | 110 sequence = nib.get(start, end - start) |
168 except Exception, e: | 111 except Exception, e: |
169 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) | 112 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) |
170 warnings.append(warning) | 113 warnings.append(warning) |
171 if not invalid_lines: | 114 if not invalid_lines: |
172 invalid_lines = get_lines(feature) | 115 invalid_lines = egdu.get_lines(feature) |
173 first_invalid_line = line_count | 116 first_invalid_line = line_count |
174 skipped_lines += len(invalid_lines) | 117 skipped_lines += len(invalid_lines) |
175 continue | 118 continue |
176 elif os.path.isfile(seq_path): | 119 elif os.path.isfile(seq_path): |
177 if not(twobitfile): | 120 if not(twobitfile): |
178 twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path)) | 121 twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path)) |
179 try: | 122 try: |
180 if interpret_features: | 123 if interpret_features: |
181 # Create sequence from intervals within a feature. | 124 # Create sequence from intervals within a feature. |
182 sequence = '' | 125 sequence = '' |
183 for interval in feature.intervals: | 126 for interval in feature.intervals: |
186 sequence = twobitfile[chrom][start:end] | 129 sequence = twobitfile[chrom][start:end] |
187 except: | 130 except: |
188 warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom) | 131 warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom) |
189 warnings.append(warning) | 132 warnings.append(warning) |
190 if not invalid_lines: | 133 if not invalid_lines: |
191 invalid_lines = get_lines(feature) | 134 invalid_lines = egdu.get_lines(feature) |
192 first_invalid_line = line_count | 135 first_invalid_line = line_count |
193 skipped_lines += len(invalid_lines) | 136 skipped_lines += len(invalid_lines) |
194 continue | 137 continue |
195 else: | 138 else: |
196 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) | 139 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) |
197 warnings.append(warning) | 140 warnings.append(warning) |
198 if not invalid_lines: | 141 if not invalid_lines: |
199 invalid_lines = get_lines(feature) | 142 invalid_lines = egdu.get_lines(feature) |
200 first_invalid_line = line_count | 143 first_invalid_line = line_count |
201 skipped_lines += len(invalid_lines) | 144 skipped_lines += len(invalid_lines) |
202 continue | 145 continue |
203 if sequence == '': | 146 if sequence == '': |
204 warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) | 147 warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) |
205 warnings.append(warning) | 148 warnings.append(warning) |
206 if not invalid_lines: | 149 if not invalid_lines: |
207 invalid_lines = get_lines(feature) | 150 invalid_lines = egdu.get_lines(feature) |
208 first_invalid_line = line_count | 151 first_invalid_line = line_count |
209 skipped_lines += len(invalid_lines) | 152 skipped_lines += len(invalid_lines) |
210 continue | 153 continue |
211 if includes_strand_col and strand == "-": | 154 if includes_strand_col and strand == "-": |
212 sequence = reverse_complement(sequence) | 155 sequence = egdu.reverse_complement(sequence) |
213 if args.output_format == "fasta": | 156 if args.output_format == "fasta": |
214 l = len(sequence) | 157 l = len(sequence) |
215 c = 0 | 158 c = 0 |
216 if input_is_gff: | 159 if input_is_gff: |
217 start, end = gff_util.convert_bed_coords_to_gff([start, end]) | 160 start, end = egdu.convert_bed_coords_to_gff([start, end]) |
218 fields = [args.genome, str(chrom), str(start), str(end), strand] | 161 fields = [args.genome, str(chrom), str(start), str(end), strand] |
219 meta_data = "_".join(fields) | 162 meta_data = "_".join(fields) |
220 if name.strip(): | 163 if name.strip(): |
221 out.write(">%s %s\n" % (meta_data, name)) | 164 out.write(">%s %s\n" % (meta_data, name)) |
222 else: | 165 else: |
234 str(feature.start), | 177 str(feature.start), |
235 str(feature.end), | 178 str(feature.end), |
236 feature.score, | 179 feature.score, |
237 feature.strand, | 180 feature.strand, |
238 ".", | 181 ".", |
239 gff_util.gff_attributes_to_str(feature.attributes, "GTF")]) | 182 egdu.gff_attributes_to_str(feature.attributes, "GTF")]) |
240 else: | 183 else: |
241 # Where is fields being set here? | 184 # Where is fields being set here? |
242 meta_data = "\t".join(fields) | 185 meta_data = "\t".join(fields) |
243 if input_is_gff: | 186 if input_is_gff: |
244 format_str = "%s seq \"%s\";\n" | 187 format_str = "%s seq \"%s\";\n" |
245 else: | 188 else: |
246 format_str = "%s\t%s\n" | 189 format_str = "%s\t%s\n" |
247 out.write(format_str % (meta_data, str(sequence))) | 190 out.write(format_str % (meta_data, str(sequence))) |
248 # Update line count. | 191 # Update line count. |
249 if isinstance(feature, gff_util.GFFFeature): | 192 if isinstance(feature, egdu.GFFFeature): |
250 line_count += len(feature.intervals) | 193 line_count += len(feature.intervals) |
251 else: | 194 else: |
252 line_count += 1 | 195 line_count += 1 |
253 out.close() | 196 out.close() |
254 | 197 |