Mercurial > repos > greg > extract_genomic_dna
changeset 1:311febbd33d6 draft
Uploaded
author | greg |
---|---|
date | Thu, 14 Jan 2016 09:24:51 -0500 |
parents | cff5b7c9be55 |
children | cc1879e0b0ae |
files | extract_genomic_dna.py extract_genomic_dna.xml tool_data_table_conf.xml.sample twobit.loc.sample |
diffstat | 4 files changed, 56 insertions(+), 35 deletions(-) [+] |
line wrap: on
line diff
--- a/extract_genomic_dna.py Thu Jan 14 07:55:22 2016 -0500 +++ b/extract_genomic_dna.py Thu Jan 14 09:24:51 2016 -0500 @@ -68,7 +68,7 @@ parser = argparse.ArgumentParser() parser.add_option('--input_format', dest='input_format', help="Input dataset format") parser.add_option('--input', dest='input', help="Input dataset") -parser.add_option('--dbkey', dest='dbkey', help="Input dataset genome build") +parser.add_option('--genome', dest='genome', help="Input dataset genome build") parser.add_option('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") parser.add_option('--columns', dest='columns', help="Columns to use in input file") parser.add_option('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") @@ -78,7 +78,7 @@ args = parser.parse_args() input_is_gff = args.input_format == 'gff' -interpret_features = args.interpret_features == "yes" +interpret_features = input_is_gff and args.interpret_features == "yes" if len(args.cols.split(',')) == 5: # Bed file. chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.cols) @@ -166,14 +166,14 @@ try: sequence = nib.get(start, end - start) except Exception, e: - warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.dbkey) + warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) warnings.append(warning) if not invalid_lines: invalid_lines = get_lines(feature) first_invalid_line = line_count skipped_lines += len(invalid_lines) continue - elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.dbkey)): + elif os.path.isfile(os.path.join(seq_dir, '%s.2bit' % args.genome)): if not(twobitfile): twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path)) try: @@ -193,7 +193,7 @@ skipped_lines += len(invalid_lines) continue else: - warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.dbkey) + warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) warnings.append(warning) if not invalid_lines: invalid_lines = get_lines(feature) @@ -201,7 +201,7 @@ skipped_lines += len(invalid_lines) continue if sequence == '': - warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (chrom, start, end, args.dbkey) + warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) warnings.append(warning) if not invalid_lines: invalid_lines = get_lines(feature) @@ -215,7 +215,7 @@ c = 0 if input_is_gff: start, end = gff_util.convert_bed_coords_to_gff([start, end]) - fields = [args.dbkey, str(chrom), str(start), str(end), strand] + fields = [args.genome, str(chrom), str(start), str(end), strand] meta_data = "_".join(fields) if name.strip(): out.write(">%s %s\n" % (meta_data, name))
--- a/extract_genomic_dna.xml Thu Jan 14 07:55:22 2016 -0500 +++ b/extract_genomic_dna.xml Thu Jan 14 09:24:51 2016 -0500 @@ -1,18 +1,20 @@ <tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="3.0.0"> <description>using coordinates from assembled/unassembled genomes</description> + <requirements> + <requirement type="package" version="35x1">faToTwoBit</requirement> + </requirements> <command> <![CDATA[ - #set input_format $input_format_cond.input_format - #set input $input_format_cond.input - #set dbkey = $input.metadata.dbkey + #set input_format $input.ext + #set genome = $input.metadata.dbkey #set datatype = $input.datatype mkdir -p output_dir && python $__tool_directory__/extract_genomic_dna.py --input_format $input_format --input "$input" - --dbkey $dbkey + --genome "$genome" #if str($input_format) == "gff": - --interpret_features $input_format_cond.interpret_features + --interpret_features $interpret_features #end if #if isinstance($datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): --columns "1,4,5,7" @@ -30,26 +32,13 @@ ]]> </command> <inputs> - <conditional name="input_format_cond"> - <param name="input_format" type="select" label="Input file format"> - <option value="gff" selected="True">Gff</option> - <option value="interval">Interval</option> - </param> - <when value="gff"> - <param name="input" type="data" format="gff" label="Fetch sequences for intervals in"> - <validator type="unspecified_build" /> - </param> - <param name="interpret_features" type="select" label="Interpret features when possible"> - <option value="yes">Yes</option> - <option value="no">No</option> - </param> - </when> - <when value="interval"> - <param name="input" type="data" format="interval" label="Fetch sequences for intervals in"> - <validator type="unspecified_build" /> - </param> - </when> - </conditional> + <param name="input" type="data" format="gff,interval" label="Fetch sequences for intervals in" help="Supported formats are gff, interval"> + <validator type="unspecified_build" /> + </param> + <param name="interpret_features" type="select" label="Interpret features when possible" help="Applicable only when input dataset format is gff"> + <option value="yes">Yes</option> + <option value="no">No</option> + </param> <conditional name="reference_genome_cond"> <param name="reference_genome_source" type="select" label="Choose the source for the reference genome"> <option value="cached">locally cached</option> @@ -57,8 +46,8 @@ </param> <when value="cached"> <param name="reference_genome" type="select" label="Using reference genome"> - <options from_data_table="alignseq_seq"> - <filter type="data_meta" key="dbkey" ref="input" column="dbkey"/> + <options from_data_table="twobit"> + <filter type="data_meta" key="dbkey" ref="input" column="0"/> </options> <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/> </param> @@ -66,7 +55,7 @@ <when value="history"> <param name="reference_genome" type="data" format="fasta" label="Using reference genome"> <options> - <filter type="data_meta" key="dbkey" ref="input_bam" /> + <filter type="data_meta" key="dbkey" ref="input"/> </options> <validator type="no_options" message="The current history does not include a fasta dataset with the build associated with the selected input file"/> </param>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Jan 14 09:24:51 2016 -0500 @@ -0,0 +1,6 @@ +<tables> + <table name="twobit" comment_char="#"> + <columns>dbkey, value</columns> + <file path="tool-data/twobit.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/twobit.loc.sample Thu Jan 14 09:24:51 2016 -0500 @@ -0,0 +1,26 @@ +#This is a sample file distributed with Galaxy that is used by some +#tools. The twobit.loc file has this format (white space characters +#are TAB characters): +# +#<Build> <FullPathToFile> +# +#So, for example, if you had droPer1 twobit files stored in +#/depot/data2/galaxy/droPer1/, then the twobit.loc entry +#would look like this: +# +#droPer1 /depot/data2/galaxy/droPer1/droPer1.2bit +# +#and your /depot/data2/galaxy/droPer1/ directory would +#contain all of your twobit files (e.g.): +# +#-rw-rw-r-- 1 nate galaxy 48972650 2007-05-04 11:27 droPer1.2bit +#...etc... +# +#Your twobit.loc file should include an entry per line for each twobit +#file you have stored. For example: +# +#droPer1 /depot/data2/galaxy/droPer1/droPer1.2bit +#apiMel2 /depot/data2/galaxy/apiMel2/apiMel2.2bit +#droAna1 /depot/data2/galaxy/droAna1/droAna1.2bit +#droAna2 /depot/data2/galaxy/droAna2/droAna2.2bit +#...etc...