Mercurial > repos > greg > extract_genomic_dna
comparison extract_genomic_dna.py @ 1:311febbd33d6 draft
Uploaded
author | greg |
---|---|
date | Thu, 14 Jan 2016 09:24:51 -0500 |
parents | cff5b7c9be55 |
children | c46db4f7c869 |
comparison
equal
deleted
inserted
replaced
0:cff5b7c9be55 | 1:311febbd33d6 |
---|---|
66 sys.exit(1) | 66 sys.exit(1) |
67 | 67 |
68 parser = argparse.ArgumentParser() | 68 parser = argparse.ArgumentParser() |
69 parser.add_option('--input_format', dest='input_format', help="Input dataset format") | 69 parser.add_option('--input_format', dest='input_format', help="Input dataset format") |
70 parser.add_option('--input', dest='input', help="Input dataset") | 70 parser.add_option('--input', dest='input', help="Input dataset") |
71 parser.add_option('--dbkey', dest='dbkey', help="Input dataset genome build") | 71 parser.add_option('--genome', dest='genome', help="Input dataset genome build") |
72 parser.add_option('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") | 72 parser.add_option('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") |
73 parser.add_option('--columns', dest='columns', help="Columns to use in input file") | 73 parser.add_option('--columns', dest='columns', help="Columns to use in input file") |
74 parser.add_option('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") | 74 parser.add_option('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") |
75 parser.add_option('--reference_genome', dest='reference_genome', help="Reference genome file") | 75 parser.add_option('--reference_genome', dest='reference_genome', help="Reference genome file") |
76 parser.add_option('--output_format', dest='output_format', help="Output format") | 76 parser.add_option('--output_format', dest='output_format', help="Output format") |
77 parser.add_option('--output', dest='output', help="Output dataset") | 77 parser.add_option('--output', dest='output', help="Output dataset") |
78 args = parser.parse_args() | 78 args = parser.parse_args() |
79 | 79 |
80 input_is_gff = args.input_format == 'gff' | 80 input_is_gff = args.input_format == 'gff' |
81 interpret_features = args.interpret_features == "yes" | 81 interpret_features = input_is_gff and args.interpret_features == "yes" |
82 if len(args.cols.split(',')) == 5: | 82 if len(args.cols.split(',')) == 5: |
83 # Bed file. | 83 # Bed file. |
84 chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.cols) | 84 chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.cols) |
85 else: | 85 else: |
86 # Gff file. | 86 # Gff file. |
164 else: | 164 else: |
165 nibs[chrom] = nib = bx.seq.nib.NibFile(file("%s/%s.nib" % (seq_path, chrom))) | 165 nibs[chrom] = nib = bx.seq.nib.NibFile(file("%s/%s.nib" % (seq_path, chrom))) |
166 try: | 166 try: |
167 sequence = nib.get(start, end - start) | 167 sequence = nib.get(start, end - start) |
168 except Exception, e: | 168 except Exception, e: |
169 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.dbkey) | 169 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) |
170 warnings.append(warning) | 170 warnings.append(warning) |
171 if not invalid_lines: | 171 if not invalid_lines: |
172 invalid_lines = get_lines(feature) | 172 invalid_lines = get_lines(feature) |
173 first_invalid_line = line_count | 173 first_invalid_line = line_count |
174 skipped_lines += len(invalid_lines) | 174 skipped_lines += len(invalid_lines) |
175 continue | 175 continue |
176 elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.dbkey)): | 176 elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.genome)): |
177 if not(twobitfile): | 177 if not(twobitfile): |
178 twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path)) | 178 twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path)) |
179 try: | 179 try: |
180 if interpret_features: | 180 if interpret_features: |
181 # Create sequence from intervals within a feature. | 181 # Create sequence from intervals within a feature. |
191 invalid_lines = get_lines(feature) | 191 invalid_lines = get_lines(feature) |
192 first_invalid_line = line_count | 192 first_invalid_line = line_count |
193 skipped_lines += len(invalid_lines) | 193 skipped_lines += len(invalid_lines) |
194 continue | 194 continue |
195 else: | 195 else: |
196 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.dbkey) | 196 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) |
197 warnings.append(warning) | 197 warnings.append(warning) |
198 if not invalid_lines: | 198 if not invalid_lines: |
199 invalid_lines = get_lines(feature) | 199 invalid_lines = get_lines(feature) |
200 first_invalid_line = line_count | 200 first_invalid_line = line_count |
201 skipped_lines += len(invalid_lines) | 201 skipped_lines += len(invalid_lines) |
202 continue | 202 continue |
203 if sequence == '': | 203 if sequence == '': |
204 warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (chrom, start, end, args.dbkey) | 204 warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) |
205 warnings.append(warning) | 205 warnings.append(warning) |
206 if not invalid_lines: | 206 if not invalid_lines: |
207 invalid_lines = get_lines(feature) | 207 invalid_lines = get_lines(feature) |
208 first_invalid_line = line_count | 208 first_invalid_line = line_count |
209 skipped_lines += len(invalid_lines) | 209 skipped_lines += len(invalid_lines) |
213 if args.output_format == "fasta": | 213 if args.output_format == "fasta": |
214 l = len(sequence) | 214 l = len(sequence) |
215 c = 0 | 215 c = 0 |
216 if input_is_gff: | 216 if input_is_gff: |
217 start, end = gff_util.convert_bed_coords_to_gff([start, end]) | 217 start, end = gff_util.convert_bed_coords_to_gff([start, end]) |
218 fields = [args.dbkey, str(chrom), str(start), str(end), strand] | 218 fields = [args.genome, str(chrom), str(start), str(end), strand] |
219 meta_data = "_".join(fields) | 219 meta_data = "_".join(fields) |
220 if name.strip(): | 220 if name.strip(): |
221 out.write(">%s %s\n" % (meta_data, name)) | 221 out.write(">%s %s\n" % (meta_data, name)) |
222 else: | 222 else: |
223 out.write(">%s\n" % meta_data) | 223 out.write(">%s\n" % meta_data) |