Mercurial > repos > greg > extract_genomic_dna

<tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="3.0.0">
    <description>using coordinates from assembled/unassembled genomes</description>
    <!--
        Package dependencies resolved before the tool runs.
        NOTE(review): faToTwoBit is presumably used by extract_genomic_dna.py to
        convert a fasta reference taken from the history; confirm against the script.
    -->
    <requirements>
        <requirement type="package" version="35x1">faToTwoBit</requirement>
    </requirements>
    <!--
        Command line for extract_genomic_dna.py (Cheetah template).

        * The genome build is read from the dbkey metadata of the input dataset.
        * Inputs whose datatype is gff (or a subclass of it) use the fixed gff
          columns 1,4,5,7 and forward the interpret_features choice; any other
          input is treated as interval and the column numbers come from the
          dataset metadata (chrom, start, end, strand, name).
        * A cached reference genome is passed as the path stored in the selected
          data table entry; a history reference genome is passed as the dataset.

        Every substituted value is double quoted so that paths containing
        whitespace or shell metacharacters reach the script as one argument.
    -->
    <command>
        <![CDATA[
            #set genome = $input.metadata.dbkey
            #set datatype = $input.datatype
            mkdir -p output_dir &&
            python "$__tool_directory__/extract_genomic_dna.py"
            --input "$input"
            --genome "$genome"
            #if isinstance($datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
                --input_format "gff"
                --columns "1,4,5,7"
                --interpret_features "$interpret_features"
            #else:
                --input_format "interval"
                --columns "${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol},${input.metadata.nameCol}"
            #end if
            --reference_genome_source "$reference_genome_cond.reference_genome_source"
            #if str($reference_genome_cond.reference_genome_source) == "cached":
                --reference_genome "$reference_genome_cond.reference_genome.fields.path"
            #else:
                --reference_genome "$reference_genome_cond.reference_genome"
            #end if
            --output_format "$output_format"
            --output "$output"
        ]]>
    </command>
    <inputs>
        <!-- Regions to extract; the unspecified_build validator rejects datasets without a genome build. -->
        <param name="input"
               type="data"
               format="gff,interval"
               label="Fetch sequences for intervals in"
               help="Supported formats are gff, interval">
            <validator type="unspecified_build"/>
        </param>

        <!-- Only forwarded to the script when the input dataset is gff. -->
        <param name="interpret_features"
               type="select"
               label="Interpret features when possible"
               help="Applicable only when input dataset format is gff">
            <option value="yes">Yes</option>
            <option value="no">No</option>
        </param>

        <!-- Reference genome: either an entry of the "twobit" data table or a fasta dataset from the history. -->
        <conditional name="reference_genome_cond">
            <param name="reference_genome_source"
                   type="select"
                   label="Choose the source for the reference genome">
                <option value="cached">locally cached</option>
                <option value="history">from history</option>
            </param>
            <when value="cached">
                <param name="reference_genome"
                       type="select"
                       label="Using reference genome">
                    <options from_data_table="twobit">
                        <filter type="data_meta" key="dbkey" ref="input" column="0"/>
                    </options>
                    <validator type="no_options"
                               message="A built-in reference genome is not available for the build associated with the selected input file"/>
                </param>
            </when>
            <when value="history">
                <param name="reference_genome"
                       type="data"
                       format="fasta"
                       label="Using reference genome">
                    <options>
                        <filter type="data_meta" key="dbkey" ref="input"/>
                    </options>
                    <validator type="no_options"
                               message="The current history does not include a fasta dataset with the build associated with the selected input file"/>
                </param>
            </when>
        </conditional>

        <!-- Format of the produced dataset; fasta is preselected. -->
        <param name="output_format"
               type="select"
               label="Select output format">
            <option value="fasta" selected="True">fasta</option>
            <option value="interval">interval</option>
        </param>
    </inputs>
    <!--
        Single output dataset. Its datatype follows the "output_format" select:
        fasta (the preselected option) by default, switched to interval when
        interval is chosen. The change_format rule needs input= and value=
        attributes to be evaluated.
        NOTE(review): one gff test compares interval-mode output against a .gff
        file; confirm whether gff input should keep a gff label in that mode.
    -->
    <outputs>
        <data name="output" format="fasta">
            <change_format>
                <when input="output_format" value="interval" format="interval"/>
            </change_format>
        </data>
    </outputs>
    <!--
        Functional tests. Parameter and output names must match the names
        declared in the inputs and outputs sections of this tool:
        reference_genome_source, reference_genome, output_format and output.
    -->
    <tests>
        <!-- Interval (bed) input, cached reference, fasta output. -->
        <test>
            <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
            <param name="interpret_features" value="yes"/>
            <param name="reference_genome_source" value="cached"/>
            <param name="output_format" value="fasta"/>
            <output name="output" file="extract_genomic_dna_out1.fasta">
                <assert_contents>
                    <!-- First few lines... -->
                    <has_text text=">hg17_chr1_147962192_147962580_- CCDS989.1_cds_0_0_chr1_147962193_r" />
                    <has_text text="ACTTGATCCTGCTCCCTCGGTGTCTGCATTGACTCCTCATGCTGGGACTG" />
                    <has_text text="GACCCGTCAACCCCCCTGCTCGCTGCTCACGTACCTTCATCACTTTTAGT" />
                    <has_text text="GATGATGCAACTTTCGAGGAATGGTTCCCCCAAGGGCGGCCCCCAAAAGT" />
                    <!-- Last few lines... -->
                    <has_text text="GCTGTGGCACAGAACATGGACTCTGTGTTTAAGGAGCTCTTGGGAAAGAC" />
                    <has_text text="CTCTGTCCGCCAGGGCCTTGGGCCAGCATCTACCACCTCTCCCAGTCCTG" />
                    <has_text text="GGCCCCGAAGCCCAAAGGCCCCGCCCAGCAGCCGCCTGGGCAGGAACAAA" />
                    <has_text text="GGCTTCTCCCGGGGCCCTGGGGCCCCAGCCTCACCCTCAGCTTCCCACCC" />
                    <has_text text="CCAGGGCCTAGACACGACCCCCAAGCCACACTGA" />
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="input" value="droPer1.bed" dbkey="droPer1" ftype="bed" />
            <param name="interpret_features" value="yes"/>
            <param name="reference_genome_source" value="cached"/>
            <param name="output_format" value="fasta"/>
            <output name="output" file="extract_genomic_dna_out2.fasta" compare="contains"/>
        </test>
        <!-- Interval (bed) input, cached reference, interval output. -->
        <test>
            <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
            <param name="interpret_features" value="yes"/>
            <param name="reference_genome_source" value="cached"/>
            <param name="output_format" value="interval"/>
            <output name="output" file="extract_genomic_dna_out3.interval" />
        </test>
        <!-- Test GFF file support. -->
        <test>
            <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
            <param name="interpret_features" value="no"/>
            <param name="reference_genome_source" value="cached"/>
            <param name="output_format" value="interval"/>
            <output name="output" file="extract_genomic_dna_out4.gff" compare="contains"/>
        </test>
        <test>
            <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
            <param name="interpret_features" value="no"/>
            <param name="output_format" value="fasta"/>
            <param name="reference_genome_source" value="cached"/>
            <output name="output" file="extract_genomic_dna_out5.fasta" compare="contains"/>
        </test>
        <!--
            Test custom sequences support and GFF feature interpretation.
            The history reference is tagged with the same build as the input so
            that it passes the dbkey filter on the history reference parameter.
        -->
        <test>
            <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
            <param name="interpret_features" value="no"/>
            <param name="reference_genome_source" value="history"/>
            <param name="reference_genome" value="tophat_in1.fasta" dbkey="mm9"/>
            <param name="output_format" value="fasta"/>
            <output name="output" file="extract_genomic_dna_out6.fasta" compare="contains"/>
        </test>
        <test>
            <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
            <param name="interpret_features" value="yes"/>
            <param name="reference_genome_source" value="history"/>
            <param name="reference_genome" value="tophat_in1.fasta" dbkey="mm9"/>
            <param name="output_format" value="fasta"/>
            <output name="output" file="extract_genomic_dna_out7.fasta" compare="contains"/>
        </test>
    </tests>
    <help>

.. class:: warningmark

The following will cause a line from the input dataset to be skipped and a warning generated.

 - Sequences that fall outside of the range of a line's start and end coordinates.
 - Chromosome, start or end coordinates that are invalid for the specified build.

-----

**What it does**

This tool uses coordinate, strand, and build information to fetch genomic DNA for the regions in a gff or interval dataset, producing fasta or interval data.

-----

**Example**

If the input dataset is::

    chr7  127475281  127475310  NM_000230  0  +
    chr7  127485994  127486166  NM_000230  0  +
    chr7  127486011  127486166  D49487     0  +

Extracting sequences returns::

    &gt;hg17_chr7_127475281_127475310_+ NM_000230
    GTAGGAATCGCAGCGCCAGCGGTTGCAAG
    &gt;hg17_chr7_127485994_127486166_+ NM_000230
    GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCG
    GATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATC
    CAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAG
    GATCAATGACATTTCACACACG
    &gt;hg17_chr7_127486011_127486166_+ D49487
    TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGG
    CCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGA
    CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC
    ACACG

    </help>
    <!--
        Citation shown with the tool.
        NOTE(review): the BibTeX entry below is a placeholder (empty author,
        title/year/eprint set to None); replace it with a real reference.
    -->
    <citations>
        <citation type="bibtex">
            @unpublished{None,
            author = {},
            title = {None},
            year = {None},
            eprint = {None},
            url = {http://www.bx.psu.edu/~anton/labSite/}
        }</citation>
    </citations>
</tool>
author	greg
date	Thu, 14 Jan 2016 10:51:45 -0500
parents	cc1879e0b0ae
children	338e991cdd1f