Mercurial > repos > greg > extract_genomic_dna

--- a/extract_genomic_dna.xml	Thu Jan 14 10:51:45 2016 -0500
+++ b/extract_genomic_dna.xml	Thu Jan 14 11:01:00 2016 -0500
@@ -77,35 +77,21 @@
             <param name="interpret_features" value="yes"/>
             <param name="index_source" value="cached"/>
             <param name="out_format" value="fasta"/>
-            <output name="out_file1" file="extract_genomic_dna_out1.fasta">
-                <assert_contents>
-                    <!-- First few lines... -->
-                    <has_text text=">hg17_chr1_147962192_147962580_- CCDS989.1_cds_0_0_chr1_147962193_r" />
-                    <has_text text="ACTTGATCCTGCTCCCTCGGTGTCTGCATTGACTCCTCATGCTGGGACTG" />
-                    <has_text text="GACCCGTCAACCCCCCTGCTCGCTGCTCACGTACCTTCATCACTTTTAGT" />
-                    <has_text text="GATGATGCAACTTTCGAGGAATGGTTCCCCCAAGGGCGGCCCCCAAAAGT" />
-                    <!-- Last few lines... -->
-                    <has_text text="GCTGTGGCACAGAACATGGACTCTGTGTTTAAGGAGCTCTTGGGAAAGAC" />
-                    <has_text text="CTCTGTCCGCCAGGGCCTTGGGCCAGCATCTACCACCTCTCCCAGTCCTG" />
-                    <has_text text="GGCCCCGAAGCCCAAAGGCCCCGCCCAGCAGCCGCCTGGGCAGGAACAAA" />
-                    <has_text text="GGCTTCTCCCGGGGCCCTGGGGCCCCAGCCTCACCCTCAGCTTCCCACCC" />
-                    <has_text text="CCAGGGCCTAGACACGACCCCCAAGCCACACTGA" />
-                </assert_contents>
-            </output>
+            <output name="out_file1" file="extract_genomic_dna_out1.fasta" compare="contains" />
         </test>
         <test>
             <param name="input" value="droPer1.bed" dbkey="droPer1" ftype="bed" />
             <param name="interpret_features" value="yes"/>
             <param name="index_source" value="cached"/>
             <param name="out_format" value="fasta"/>
-            <output name="out_file1" file="extract_genomic_dna_out2.fasta" compare="contains"/>
+            <output name="out_file1" file="extract_genomic_dna_out2.fasta" compare="contains" />
         </test>
         <test>
             <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
             <param name="interpret_features" value="yes"/>
             <param name="index_source" value="cached"/>
             <param name="out_format" value="interval"/>
-            <output name="out_file1" file="extract_genomic_dna_out3.interval" />
+            <output name="out_file1" file="extract_genomic_dna_out3.interval" compare="contains" />
         </test>
         <!-- Test GFF file support. -->
         <test>
@@ -113,14 +99,14 @@
             <param name="interpret_features" value="no"/>
             <param name="index_source" value="cached"/>
             <param name="out_format" value="interval"/>
-            <output name="out_file1" file="extract_genomic_dna_out4.gff" compare="contains"/>
+            <output name="out_file1" file="extract_genomic_dna_out4.gff" compare="contains" />
         </test>
         <test>
             <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
             <param name="interpret_features" value="no"/>
             <param name="out_format" value="fasta"/>
             <param name="index_source" value="cached"/>
-            <output name="out_file1" file="extract_genomic_dna_out5.fasta" compare="contains"/>
+            <output name="out_file1" file="extract_genomic_dna_out5.fasta" compare="contains" />
         </test>
         <!-- Test custom sequences support and GFF feature interpretation. -->
         <test>
@@ -129,7 +115,7 @@
             <param name="index_source" value="history"/>
             <param name="ref_file" value="tophat_in1.fasta"/>
             <param name="out_format" value="fasta"/>
-            <output name="out_file1" file="extract_genomic_dna_out6.fasta" compare="contains"/>
+            <output name="out_file1" file="extract_genomic_dna_out6.fasta" compare="contains" />
         </test>
         <test>
             <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
@@ -137,23 +123,38 @@
             <param name="index_source" value="history"/>
             <param name="ref_file" value="tophat_in1.fasta"/>
             <param name="out_format" value="fasta"/>
-            <output name="out_file1" file="extract_genomic_dna_out7.fasta" compare="contains"/>
+            <output name="out_file1" file="extract_genomic_dna_out7.fasta" compare="contains" />
         </test>
     </tests>
     <help>

 .. class:: warningmark

-The following will cause a line from the input dataset to be skipped and a warning generated.
+This tool requires interval or GFF input (specially formatted tabular data).  If your data is not TAB delimited, first use *Text Manipulation-&gt;Convert*.
+
+.. class:: warningmark
+
+Make sure that the genome build is specified for the dataset from which you are extracting sequences (click the pencil icon in the history item if it is not specified).
+
+.. class:: warningmark

- - Sequences that fall outside of the range of a line's start and end coordinates.
- - Chromosome start or end coordinates that are invalid for the specified build.
+All of the following will cause a line from the input dataset to be skipped and a warning generated.  The number of warnings and skipped lines is documented in the resulting history item.
+ - Any lines that do not contain at least 3 columns: a chromosome and numerical start and end coordinates.
+ - Sequences that fall outside of the range of a line's start and end coordinates.
+ - Chromosome, start or end coordinates that are invalid for the specified build.
+ - Any lines whose data columns are not separated by a **TAB** character (other white-space characters are invalid).
+
+.. class:: infomark
+
+ **Extract genomic DNA using coordinates from ASSEMBLED genomes and UNassembled genomes** were previously handled by two separate tools.

 -----

 **What it does**

-This tool uses coordinate, strand, and build information to fetch genomic DNA from gff data, producing fasta data.
+This tool uses coordinate, strand, and build information to fetch genomic DNA sequences, returning them in FASTA or interval format.
+
+If strand is not defined, the default value is "+".

 -----

@@ -165,7 +166,7 @@
     chr7  127485994  127486166  NM_000230  0  +
     chr7  127486011  127486166  D49487     0  +

-Extracting sequences returns::
+Extracting sequences with **FASTA** output data type returns::

     &gt;hg17_chr7_127475281_127475310_+ NM_000230
     GTAGGAATCGCAGCGCCAGCGGTTGCAAG
@@ -180,6 +181,12 @@
     CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC
     ACACG

+Extracting sequences with **Interval** output data type returns::
+
+    chr7    127475281       127475310       NM_000230       0       +       GTAGGAATCGCAGCGCCAGCGGTTGCAAG
+    chr7    127485994       127486166       NM_000230       0       +       GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
+    chr7    127486011       127486166       D49487  0       +       TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
+
     </help>
     <citations>
         <citation type="bibtex">