comparison extract_genomic_dna.py @ 1:311febbd33d6 draft

Uploaded
author greg
date Thu, 14 Jan 2016 09:24:51 -0500
parents cff5b7c9be55
children c46db4f7c869
comparison
equal deleted inserted replaced
0:cff5b7c9be55 1:311febbd33d6
66 sys.exit(1) 66 sys.exit(1)
67 67
68 parser = argparse.ArgumentParser() 68 parser = argparse.ArgumentParser()
69 parser.add_option('--input_format', dest='input_format', help="Input dataset format") 69 parser.add_option('--input_format', dest='input_format', help="Input dataset format")
70 parser.add_option('--input', dest='input', help="Input dataset") 70 parser.add_option('--input', dest='input', help="Input dataset")
71 parser.add_option('--dbkey', dest='dbkey', help="Input dataset genome build") 71 parser.add_option('--genome', dest='genome', help="Input dataset genome build")
72 parser.add_option('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") 72 parser.add_option('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff")
73 parser.add_option('--columns', dest='columns', help="Columns to use in input file") 73 parser.add_option('--columns', dest='columns', help="Columns to use in input file")
74 parser.add_option('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") 74 parser.add_option('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file")
75 parser.add_option('--reference_genome', dest='reference_genome', help="Reference genome file") 75 parser.add_option('--reference_genome', dest='reference_genome', help="Reference genome file")
76 parser.add_option('--output_format', dest='output_format', help="Output format") 76 parser.add_option('--output_format', dest='output_format', help="Output format")
77 parser.add_option('--output', dest='output', help="Output dataset") 77 parser.add_option('--output', dest='output', help="Output dataset")
78 args = parser.parse_args() 78 args = parser.parse_args()
79 79
80 input_is_gff = args.input_format == 'gff' 80 input_is_gff = args.input_format == 'gff'
81 interpret_features = args.interpret_features == "yes" 81 interpret_features = input_is_gff and args.interpret_features == "yes"
82 if len(args.cols.split(',')) == 5: 82 if len(args.cols.split(',')) == 5:
83 # Bed file. 83 # Bed file.
84 chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.cols) 84 chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.cols)
85 else: 85 else:
86 # Gff file. 86 # Gff file.
164 else: 164 else:
165 nibs[chrom] = nib = bx.seq.nib.NibFile(file("%s/%s.nib" % (seq_path, chrom))) 165 nibs[chrom] = nib = bx.seq.nib.NibFile(file("%s/%s.nib" % (seq_path, chrom)))
166 try: 166 try:
167 sequence = nib.get(start, end - start) 167 sequence = nib.get(start, end - start)
168 except Exception, e: 168 except Exception, e:
169 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.dbkey) 169 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome)
170 warnings.append(warning) 170 warnings.append(warning)
171 if not invalid_lines: 171 if not invalid_lines:
172 invalid_lines = get_lines(feature) 172 invalid_lines = get_lines(feature)
173 first_invalid_line = line_count 173 first_invalid_line = line_count
174 skipped_lines += len(invalid_lines) 174 skipped_lines += len(invalid_lines)
175 continue 175 continue
176 elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.dbkey)): 176 elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.genome)):
177 if not(twobitfile): 177 if not(twobitfile):
178 twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path)) 178 twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path))
179 try: 179 try:
180 if interpret_features: 180 if interpret_features:
181 # Create sequence from intervals within a feature. 181 # Create sequence from intervals within a feature.
191 invalid_lines = get_lines(feature) 191 invalid_lines = get_lines(feature)
192 first_invalid_line = line_count 192 first_invalid_line = line_count
193 skipped_lines += len(invalid_lines) 193 skipped_lines += len(invalid_lines)
194 continue 194 continue
195 else: 195 else:
196 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.dbkey) 196 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome)
197 warnings.append(warning) 197 warnings.append(warning)
198 if not invalid_lines: 198 if not invalid_lines:
199 invalid_lines = get_lines(feature) 199 invalid_lines = get_lines(feature)
200 first_invalid_line = line_count 200 first_invalid_line = line_count
201 skipped_lines += len(invalid_lines) 201 skipped_lines += len(invalid_lines)
202 continue 202 continue
203 if sequence == '': 203 if sequence == '':
204 warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (chrom, start, end, args.dbkey) 204 warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome)
205 warnings.append(warning) 205 warnings.append(warning)
206 if not invalid_lines: 206 if not invalid_lines:
207 invalid_lines = get_lines(feature) 207 invalid_lines = get_lines(feature)
208 first_invalid_line = line_count 208 first_invalid_line = line_count
209 skipped_lines += len(invalid_lines) 209 skipped_lines += len(invalid_lines)
213 if args.output_format == "fasta": 213 if args.output_format == "fasta":
214 l = len(sequence) 214 l = len(sequence)
215 c = 0 215 c = 0
216 if input_is_gff: 216 if input_is_gff:
217 start, end = gff_util.convert_bed_coords_to_gff([start, end]) 217 start, end = gff_util.convert_bed_coords_to_gff([start, end])
218 fields = [args.dbkey, str(chrom), str(start), str(end), strand] 218 fields = [args.genome, str(chrom), str(start), str(end), strand]
219 meta_data = "_".join(fields) 219 meta_data = "_".join(fields)
220 if name.strip(): 220 if name.strip():
221 out.write(">%s %s\n" % (meta_data, name)) 221 out.write(">%s %s\n" % (meta_data, name))
222 else: 222 else:
223 out.write(">%s\n" % meta_data) 223 out.write(">%s\n" % meta_data)