comparison extract_genomic_dna.py @ 9:8cc00c5cf33e draft

Uploaded
author greg
date Fri, 15 Jan 2016 08:43:52 -0500
parents ee00b3c68801
children 1a10864abc1f
comparison
equal deleted inserted replaced
8:32c6057529a4 9:8cc00c5cf33e
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 import argparse 2 import argparse
3 import os 3 import os
4 import subprocess 4
5 import sys 5 import extract_genomic_dna_utils as egdu
6 import tempfile
7
8 import bx.seq.nib 6 import bx.seq.nib
9 import bx.seq.twobit 7 import bx.seq.twobit
10 from bx.intervals.io import Header, Comment 8 from bx.intervals.io import Header, Comment
11 from galaxy.datatypes.util import gff_util 9
12 from galaxy.tools.util.galaxyops import parse_cols_arg
13
14
15 def convert_to_twobit(reference_genome):
16 """
17 Create 2bit file history fasta dataset.
18 """
19 try:
20 seq_path = tempfile.NamedTemporaryFile(dir=".").name
21 cmd = "faToTwoBit %s %s" % (reference_genome, seq_path)
22 tmp_name = tempfile.NamedTemporaryFile(dir=".").name
23 tmp_stderr = open(tmp_name, 'wb')
24 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
25 returncode = proc.wait()
26 tmp_stderr.close()
27 if returncode != 0:
28 # Get stderr, allowing for case where it's very large.
29 tmp_stderr = open(tmp_name, 'rb')
30 stderr = ''
31 buffsize = 1048576
32 try:
33 while True:
34 stderr += tmp_stderr.read(buffsize)
35 if not stderr or len(stderr) % buffsize != 0:
36 break
37 except OverflowError:
38 pass
39 tmp_stderr.close()
40 os.remove(tmp_name)
41 stop_err(stderr)
42 return seq_path
43 except Exception, e:
44 stop_err('Error running faToTwoBit. ' + str(e))
45
46
47 def get_lines(feature):
48 # Get feature's line(s).
49 if isinstance(feature, gff_util.GFFFeature):
50 return feature.lines()
51 else:
52 return [feature.rstrip('\r\n')]
53
54
55 def reverse_complement(s):
56 complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"}
57 reversed_s = []
58 for i in s:
59 reversed_s.append(complement_dna[i])
60 reversed_s.reverse()
61 return "".join(reversed_s)
62
63
64 def stop_err(msg):
65 sys.stderr.write(msg)
66 sys.exit(1)
67 10
68 parser = argparse.ArgumentParser() 11 parser = argparse.ArgumentParser()
69 parser.add_argument('--input_format', dest='input_format', help="Input dataset format") 12 parser.add_argument('--input_format', dest='input_format', help="Input dataset format")
70 parser.add_argument('--input', dest='input', help="Input dataset") 13 parser.add_argument('--input', dest='input', help="Input dataset")
71 parser.add_argument('--genome', dest='genome', help="Input dataset genome build") 14 parser.add_argument('--genome', dest='genome', help="Input dataset genome build")
79 22
80 input_is_gff = args.input_format == 'gff' 23 input_is_gff = args.input_format == 'gff'
81 interpret_features = input_is_gff and args.interpret_features == "yes" 24 interpret_features = input_is_gff and args.interpret_features == "yes"
82 if len(args.columns.split(',')) == 5: 25 if len(args.columns.split(',')) == 5:
83 # Bed file. 26 # Bed file.
84 chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.columns) 27 chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(args.columns)
85 else: 28 else:
86 # Gff file. 29 # Gff file.
87 chrom_col, start_col, end_col, strand_col = parse_cols_arg(args.columns) 30 chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns)
88 name_col = False 31 name_col = False
89 32
90 if args.reference_genome_source == "history": 33 if args.reference_genome_source == "history":
91 seq_path = convert_to_twobit(args.reference_genome) 34 seq_path = egdu.convert_to_twobit(args.reference_genome)
92 else: 35 else:
93 seq_path = args.reference_genome 36 seq_path = args.reference_genome
94 seq_dir = os.path.split(seq_path)[0] 37 seq_dir = os.path.split(seq_path)[0]
95 38
96 includes_strand_col = strand_col >= 0 39 includes_strand_col = strand_col >= 0
103 warning = '' 46 warning = ''
104 twobitfile = None 47 twobitfile = None
105 line_count = 1 48 line_count = 1
106 file_iterator = open(args.input) 49 file_iterator = open(args.input)
107 if interpret_features: 50 if interpret_features:
108 file_iterator = gff_util.GFFReaderWrapper(file_iterator, fix_strand=False) 51 file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False)
109 out = open(args.output, 'wt') 52 out = open(args.output, 'wt')
110 53
111 for feature in file_iterator: 54 for feature in file_iterator:
112 # Ignore comments, headers. 55 # Ignore comments, headers.
113 if isinstance(feature, (Header, Comment)): 56 if isinstance(feature, (Header, Comment)):
114 line_count += 1 57 line_count += 1
115 continue 58 continue
116 name = "" 59 name = ""
117 if interpret_features: 60 if interpret_features:
118 # Processing features. 61 # Processing features.
119 gff_util.convert_gff_coords_to_bed(feature) 62 egdu.convert_gff_coords_to_bed(feature)
120 chrom = feature.chrom 63 chrom = feature.chrom
121 start = feature.start 64 start = feature.start
122 end = feature.end 65 end = feature.end
123 strand = feature.strand 66 strand = feature.strand
124 else: 67 else:
131 start = int(fields[start_col]) 74 start = int(fields[start_col])
132 end = int(fields[end_col]) 75 end = int(fields[end_col])
133 if name_col: 76 if name_col:
134 name = fields[name_col] 77 name = fields[name_col]
135 if input_is_gff: 78 if input_is_gff:
136 start, end = gff_util.convert_gff_coords_to_bed([start, end]) 79 start, end = egdu.convert_gff_coords_to_bed([start, end])
137 if includes_strand_col: 80 if includes_strand_col:
138 strand = fields[strand_col] 81 strand = fields[strand_col]
139 except: 82 except:
140 warning = "Invalid chrom, start or end column values. " 83 warning = "Invalid chrom, start or end column values. "
141 warnings.append(warning) 84 warnings.append(warning)
142 if not invalid_lines: 85 if not invalid_lines:
143 invalid_lines = get_lines(feature) 86 invalid_lines = egdu.get_lines(feature)
144 first_invalid_line = line_count 87 first_invalid_line = line_count
145 skipped_lines += len(invalid_lines) 88 skipped_lines += len(invalid_lines)
146 continue 89 continue
147 if start > end: 90 if start > end:
148 warning = "Invalid interval, start '%d' > end '%d'. " % (start, end) 91 warning = "Invalid interval, start '%d' > end '%d'. " % (start, end)
149 warnings.append(warning) 92 warnings.append(warning)
150 if not invalid_lines: 93 if not invalid_lines:
151 invalid_lines = get_lines(feature) 94 invalid_lines = egdu.get_lines(feature)
152 first_invalid_line = line_count 95 first_invalid_line = line_count
153 skipped_lines += len(invalid_lines) 96 skipped_lines += len(invalid_lines)
154 continue 97 continue
155 if strand not in ['+', '-']: 98 if strand not in ['+', '-']:
156 strand = '+' 99 strand = '+'
160 # Open sequence file and get sequence for feature/interval. 103 # Open sequence file and get sequence for feature/interval.
161 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)): 104 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)):
162 if chrom in nibs: 105 if chrom in nibs:
163 nib = nibs[chrom] 106 nib = nibs[chrom]
164 else: 107 else:
165 nibs[chrom] = nib = bx.seq.nib.NibFile(file("%s/%s.nib" % (seq_path, chrom))) 108 nibs[chrom] = nib = bx.seq.nib.NibFile(open("%s/%s.nib" % (seq_path, chrom)))
166 try: 109 try:
167 sequence = nib.get(start, end - start) 110 sequence = nib.get(start, end - start)
168 except Exception, e: 111 except Exception, e:
169 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) 112 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome)
170 warnings.append(warning) 113 warnings.append(warning)
171 if not invalid_lines: 114 if not invalid_lines:
172 invalid_lines = get_lines(feature) 115 invalid_lines = egdu.get_lines(feature)
173 first_invalid_line = line_count 116 first_invalid_line = line_count
174 skipped_lines += len(invalid_lines) 117 skipped_lines += len(invalid_lines)
175 continue 118 continue
176 elif os.path.isfile(seq_path): 119 elif os.path.isfile(seq_path):
177 if not(twobitfile): 120 if not(twobitfile):
178 twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path)) 121 twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path))
179 try: 122 try:
180 if interpret_features: 123 if interpret_features:
181 # Create sequence from intervals within a feature. 124 # Create sequence from intervals within a feature.
182 sequence = '' 125 sequence = ''
183 for interval in feature.intervals: 126 for interval in feature.intervals:
186 sequence = twobitfile[chrom][start:end] 129 sequence = twobitfile[chrom][start:end]
187 except: 130 except:
188 warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom) 131 warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom)
189 warnings.append(warning) 132 warnings.append(warning)
190 if not invalid_lines: 133 if not invalid_lines:
191 invalid_lines = get_lines(feature) 134 invalid_lines = egdu.get_lines(feature)
192 first_invalid_line = line_count 135 first_invalid_line = line_count
193 skipped_lines += len(invalid_lines) 136 skipped_lines += len(invalid_lines)
194 continue 137 continue
195 else: 138 else:
196 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) 139 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome)
197 warnings.append(warning) 140 warnings.append(warning)
198 if not invalid_lines: 141 if not invalid_lines:
199 invalid_lines = get_lines(feature) 142 invalid_lines = egdu.get_lines(feature)
200 first_invalid_line = line_count 143 first_invalid_line = line_count
201 skipped_lines += len(invalid_lines) 144 skipped_lines += len(invalid_lines)
202 continue 145 continue
203 if sequence == '': 146 if sequence == '':
204 warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) 147 warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome)
205 warnings.append(warning) 148 warnings.append(warning)
206 if not invalid_lines: 149 if not invalid_lines:
207 invalid_lines = get_lines(feature) 150 invalid_lines = egdu.get_lines(feature)
208 first_invalid_line = line_count 151 first_invalid_line = line_count
209 skipped_lines += len(invalid_lines) 152 skipped_lines += len(invalid_lines)
210 continue 153 continue
211 if includes_strand_col and strand == "-": 154 if includes_strand_col and strand == "-":
212 sequence = reverse_complement(sequence) 155 sequence = egdu.reverse_complement(sequence)
213 if args.output_format == "fasta": 156 if args.output_format == "fasta":
214 l = len(sequence) 157 l = len(sequence)
215 c = 0 158 c = 0
216 if input_is_gff: 159 if input_is_gff:
217 start, end = gff_util.convert_bed_coords_to_gff([start, end]) 160 start, end = egdu.convert_bed_coords_to_gff([start, end])
218 fields = [args.genome, str(chrom), str(start), str(end), strand] 161 fields = [args.genome, str(chrom), str(start), str(end), strand]
219 meta_data = "_".join(fields) 162 meta_data = "_".join(fields)
220 if name.strip(): 163 if name.strip():
221 out.write(">%s %s\n" % (meta_data, name)) 164 out.write(">%s %s\n" % (meta_data, name))
222 else: 165 else:
234 str(feature.start), 177 str(feature.start),
235 str(feature.end), 178 str(feature.end),
236 feature.score, 179 feature.score,
237 feature.strand, 180 feature.strand,
238 ".", 181 ".",
239 gff_util.gff_attributes_to_str(feature.attributes, "GTF")]) 182 egdu.gff_attributes_to_str(feature.attributes, "GTF")])
240 else: 183 else:
241 # Where is fields being set here? 184 # Where is fields being set here?
242 meta_data = "\t".join(fields) 185 meta_data = "\t".join(fields)
243 if input_is_gff: 186 if input_is_gff:
244 format_str = "%s seq \"%s\";\n" 187 format_str = "%s seq \"%s\";\n"
245 else: 188 else:
246 format_str = "%s\t%s\n" 189 format_str = "%s\t%s\n"
247 out.write(format_str % (meta_data, str(sequence))) 190 out.write(format_str % (meta_data, str(sequence)))
248 # Update line count. 191 # Update line count.
249 if isinstance(feature, gff_util.GFFFeature): 192 if isinstance(feature, egdu.GFFFeature):
250 line_count += len(feature.intervals) 193 line_count += len(feature.intervals)
251 else: 194 else:
252 line_count += 1 195 line_count += 1
253 out.close() 196 out.close()
254 197