Mercurial > repos > greg > extract_genomic_dna

--- a/extract_genomic_dna.py	Thu Jan 14 07:55:22 2016 -0500
+++ b/extract_genomic_dna.py	Thu Jan 14 09:24:51 2016 -0500
@@ -68,7 +68,7 @@
 parser = argparse.ArgumentParser()
 parser.add_option('--input_format', dest='input_format', help="Input dataset format")
 parser.add_option('--input', dest='input', help="Input dataset")
-parser.add_option('--dbkey', dest='dbkey', help="Input dataset genome build")
+parser.add_option('--genome', dest='genome', help="Input dataset genome build")
 parser.add_option('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff")
 parser.add_option('--columns', dest='columns', help="Columns to use in input file")
 parser.add_option('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file")
@@ -78,7 +78,7 @@
 args = parser.parse_args()

 input_is_gff = args.input_format == 'gff'
-interpret_features = args.interpret_features == "yes"
+interpret_features = input_is_gff and args.interpret_features == "yes"
 if len(args.cols.split(',')) == 5:
     # Bed file.
     chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.cols)
@@ -166,14 +166,14 @@
         try:
             sequence = nib.get(start, end - start)
         except Exception, e:
-            warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.dbkey)
+            warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome)
             warnings.append(warning)
             if not invalid_lines:
                 invalid_lines = get_lines(feature)
                 first_invalid_line = line_count
             skipped_lines += len(invalid_lines)
             continue
-    elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.dbkey)):
+    elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.genome)):
         if not(twobitfile):
             twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path))
         try:
@@ -193,7 +193,7 @@
             skipped_lines += len(invalid_lines)
             continue
     else:
-        warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.dbkey)
+        warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome)
         warnings.append(warning)
         if not invalid_lines:
             invalid_lines = get_lines(feature)
@@ -201,7 +201,7 @@
         skipped_lines += len(invalid_lines)
         continue
     if sequence == '':
-        warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (chrom, start, end, args.dbkey)
+        warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome)
         warnings.append(warning)
         if not invalid_lines:
             invalid_lines = get_lines(feature)
@@ -215,7 +215,7 @@
         c = 0
         if input_is_gff:
             start, end = gff_util.convert_bed_coords_to_gff([start, end])
-        fields = [args.dbkey, str(chrom), str(start), str(end), strand]
+        fields = [args.genome, str(chrom), str(start), str(end), strand]
         meta_data = "_".join(fields)
         if name.strip():
             out.write(">%s %s\n" % (meta_data, name))
--- a/extract_genomic_dna.xml	Thu Jan 14 07:55:22 2016 -0500
+++ b/extract_genomic_dna.xml	Thu Jan 14 09:24:51 2016 -0500
@@ -1,18 +1,20 @@
 <tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="3.0.0">
     <description>using coordinates from assembled/unassembled genomes</description>
+    <requirements>
+        <requirement type="package" version="35x1">faToTwoBit</requirement>
+    </requirements>
     <command>
         <![CDATA[
-            #set input_format $input_format_cond.input_format
-            #set input $input_format_cond.input
-            #set dbkey = $input.metadata.dbkey
+            #set input_format $input.ext
+            #set genome = $input.metadata.dbkey
             #set datatype = $input.datatype
             mkdir -p output_dir &&
             python $__tool_directory__/extract_genomic_dna.py
             --input_format $input_format
             --input "$input"
-            --dbkey $dbkey
+            --genome "$genome"
             #if str($input_format) == "gff":
-                --interpret_features $input_format_cond.interpret_features
+                --interpret_features $interpret_features
             #end if
             #if isinstance($datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
                 --columns "1,4,5,7"
@@ -30,26 +32,13 @@
         ]]>
     </command>
     <inputs>
-        <conditional name="input_format_cond">
-            <param name="input_format" type="select" label="Input file format">
-                <option value="gff" selected="True">Gff</option>
-                <option value="interval">Interval</option>
-            </param>
-            <when value="gff">
-                <param name="input" type="data" format="gff" label="Fetch sequences for intervals in">
-                    <validator type="unspecified_build" />
-                </param>
-                <param name="interpret_features" type="select" label="Interpret features when possible">
-                    <option value="yes">Yes</option>
-                    <option value="no">No</option>
-                </param>
-            </when>
-            <when value="interval">
-                <param name="input" type="data" format="interval" label="Fetch sequences for intervals in">
-                    <validator type="unspecified_build" />
-                </param>
-            </when>
-        </conditional>
+        <param name="input" type="data" format="gff,interval" label="Fetch sequences for intervals in" help="Supported formats are gff, interval">
+            <validator type="unspecified_build" />
+        </param>
+        <param name="interpret_features" type="select" label="Interpret features when possible" help="Applicable only when input dataset format is gff">
+            <option value="yes">Yes</option>
+            <option value="no">No</option>
+        </param>
         <conditional name="reference_genome_cond">
             <param name="reference_genome_source" type="select" label="Choose the source for the reference genome">
                 <option value="cached">locally cached</option>
@@ -57,8 +46,8 @@
             </param>
             <when value="cached">
                 <param name="reference_genome" type="select" label="Using reference genome">
-                    <options from_data_table="alignseq_seq">
-                        <filter type="data_meta" key="dbkey" ref="input" column="dbkey"/>
+                    <options from_data_table="twobit">
+                        <filter type="data_meta" key="dbkey" ref="input" column="0"/>
                     </options>
                     <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
                 </param>
@@ -66,7 +55,7 @@
             <when value="history">
                 <param name="reference_genome" type="data" format="fasta" label="Using reference genome">
                     <options>
-                        <filter type="data_meta" key="dbkey" ref="input_bam" />
+                        <filter type="data_meta" key="dbkey" ref="input"/>
                     </options>
                     <validator type="no_options" message="The current history does not include a fasta dataset with the build associated with the selected input file"/>
                 </param>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Thu Jan 14 09:24:51 2016 -0500
@@ -0,0 +1,6 @@
+<tables>
+    <table name="twobit" comment_char="#">
+        <columns>dbkey, value</columns>
+        <file path="tool-data/twobit.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/twobit.loc.sample	Thu Jan 14 09:24:51 2016 -0500
@@ -0,0 +1,26 @@
+#This is a sample of the twobit.loc file, which is distributed with
+#Galaxy and used by some tools.  The twobit.loc file has this format
+#(columns are separated by TAB characters, not spaces):
+#
+#<Build>    <FullPathToFile>
+#
+#So, for example, if you had droPer1 twobit files stored in
+#/depot/data2/galaxy/droPer1/, then the twobit.loc entry
+#would look like this:
+#
+#droPer1    /depot/data2/galaxy/droPer1/droPer1.2bit
+#
+#and your /depot/data2/galaxy/droPer1/ directory would
+#contain the twobit file(s) for that build (e.g.):
+#
+#-rw-rw-r--   1 nate   galaxy 48972650 2007-05-04 11:27 droPer1.2bit
+#...etc...
+#
+#Your twobit.loc file should include one entry per line for each
+#twobit file you have stored.  For example:
+#
+#droPer1    /depot/data2/galaxy/droPer1/droPer1.2bit
+#apiMel2    /depot/data2/galaxy/apiMel2/apiMel2.2bit
+#droAna1    /depot/data2/galaxy/droAna1/droAna1.2bit
+#droAna2    /depot/data2/galaxy/droAna2/droAna2.2bit
+#...etc...