diff extract_genomic_dna.py @ 1:311febbd33d6 draft

Uploaded
author greg
date Thu, 14 Jan 2016 09:24:51 -0500
parents cff5b7c9be55
children c46db4f7c869
line wrap: on
line diff
--- a/extract_genomic_dna.py	Thu Jan 14 07:55:22 2016 -0500
+++ b/extract_genomic_dna.py	Thu Jan 14 09:24:51 2016 -0500
@@ -68,7 +68,7 @@
 parser = argparse.ArgumentParser()
 parser.add_option('--input_format', dest='input_format', help="Input dataset format")
 parser.add_option('--input', dest='input', help="Input dataset")
-parser.add_option('--dbkey', dest='dbkey', help="Input dataset genome build")
+parser.add_option('--genome', dest='genome', help="Input dataset genome build")
 parser.add_option('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff")
 parser.add_option('--columns', dest='columns', help="Columns to use in input file")
 parser.add_option('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file")
@@ -78,7 +78,7 @@
 args = parser.parse_args()
 
 input_is_gff = args.input_format == 'gff'
-interpret_features = args.interpret_features == "yes"
+interpret_features = input_is_gff and args.interpret_features == "yes"
 if len(args.cols.split(',')) == 5:
     # Bed file.
     chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.cols)
@@ -166,14 +166,14 @@
         try:
             sequence = nib.get(start, end - start)
         except Exception, e:
-            warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.dbkey)
+            warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome)
             warnings.append(warning)
             if not invalid_lines:
                 invalid_lines = get_lines(feature)
                 first_invalid_line = line_count
             skipped_lines += len(invalid_lines)
             continue
-    elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.dbkey)):
+    elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.genome)):
         if not(twobitfile):
             twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path))
         try:
@@ -193,7 +193,7 @@
             skipped_lines += len(invalid_lines)
             continue
     else:
-        warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.dbkey)
+        warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome)
         warnings.append(warning)
         if not invalid_lines:
             invalid_lines = get_lines(feature)
@@ -201,7 +201,7 @@
         skipped_lines += len(invalid_lines)
         continue
     if sequence == '':
-        warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (chrom, start, end, args.dbkey)
+        warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome)
         warnings.append(warning)
         if not invalid_lines:
             invalid_lines = get_lines(feature)
@@ -215,7 +215,7 @@
         c = 0
         if input_is_gff:
             start, end = gff_util.convert_bed_coords_to_gff([start, end])
-        fields = [args.dbkey, str(chrom), str(start), str(end), strand]
+        fields = [args.genome, str(chrom), str(start), str(end), strand]
         meta_data = "_".join(fields)
         if name.strip():
             out.write(">%s %s\n" % (meta_data, name))