Mercurial > repos > greg > extract_genomic_dna
diff extract_genomic_dna.py @ 1:311febbd33d6 draft
Uploaded
author | greg |
---|---|
date | Thu, 14 Jan 2016 09:24:51 -0500 |
parents | cff5b7c9be55 |
children | c46db4f7c869 |
line wrap: on
line diff
--- a/extract_genomic_dna.py Thu Jan 14 07:55:22 2016 -0500 +++ b/extract_genomic_dna.py Thu Jan 14 09:24:51 2016 -0500 @@ -68,7 +68,7 @@ parser = argparse.ArgumentParser() parser.add_option('--input_format', dest='input_format', help="Input dataset format") parser.add_option('--input', dest='input', help="Input dataset") -parser.add_option('--dbkey', dest='dbkey', help="Input dataset genome build") +parser.add_option('--genome', dest='genome', help="Input dataset genome build") parser.add_option('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") parser.add_option('--columns', dest='columns', help="Columns to use in input file") parser.add_option('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") @@ -78,7 +78,7 @@ args = parser.parse_args() input_is_gff = args.input_format == 'gff' -interpret_features = args.interpret_features == "yes" +interpret_features = input_is_gff and args.interpret_features == "yes" if len(args.cols.split(',')) == 5: # Bed file. chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.cols) @@ -166,14 +166,14 @@ try: sequence = nib.get(start, end - start) except Exception, e: - warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.dbkey) + warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) warnings.append(warning) if not invalid_lines: invalid_lines = get_lines(feature) first_invalid_line = line_count skipped_lines += len(invalid_lines) continue - elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.dbkey)): + elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.genome)): if not(twobitfile): twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path)) try: @@ -193,7 +193,7 @@ skipped_lines += len(invalid_lines) continue else: - warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.dbkey) + warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) warnings.append(warning) if not invalid_lines: invalid_lines = get_lines(feature) @@ -201,7 +201,7 @@ skipped_lines += len(invalid_lines) continue if sequence == '': - warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (chrom, start, end, args.dbkey) + warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) warnings.append(warning) if not invalid_lines: invalid_lines = get_lines(feature) @@ -215,7 +215,7 @@ c = 0 if input_is_gff: start, end = gff_util.convert_bed_coords_to_gff([start, end]) - fields = [args.dbkey, str(chrom), str(start), str(end), strand] + fields = [args.genome, str(chrom), str(start), str(end), strand] meta_data = "_".join(fields) if name.strip(): out.write(">%s %s\n" % (meta_data, name))