Mercurial > repos > iuc > ampvis2_load

diff load.xml @ 0:301ee8d3a0f4 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ampvis2 commit 9ed0c3078be166bd22136771f517ae91a5198ecf
author: iuc
date: Fri, 16 Aug 2024 08:49:16 +0000
children: 07e7ec7ab1ac
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/load.xml	Fri Aug 16 08:49:16 2024 +0000
@@ -0,0 +1,423 @@
+<tool id="ampvis2_load" name="ampvis2 load" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
+    <description></description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="header"/>
+    <command detect_errors="exit_code"><![CDATA[
+        #if $otutable.is_of_type("biom1") or $otutable.is_of_type("biom2")
+            ln -s '$otutable' otutable.biom &&
+        #else if not $otutable.is_of_type("phyloseq")
+            ## asv/otu column can not be specified so set the needed name
+            ## if empty https://github.com/KasperSkytte/ampvis2/issues/166 
+            ## also done in taxonomy.tsv
+            #if $asv_otu_col_empty
+                sed -e '1 s/^\t/ASV\t/' '$otutable' > otutable.tsv &&
+            #else
+                ln -s '$otutable' otutable.tsv &&
+            #end if
+        #end if
+        #if $taxonomy
+            #if $asv_otu_col_empty
+                sed -e '1 s/^\t/ASV\t/' '$taxonomy' > taxonomy.tsv &&
+            #else
+                ln -s '$taxonomy' taxonomy.tsv &&
+            #end if
+        #end if
+        Rscript '$rscript'
+    ]]></command>
+    <configfiles>
+        <configfile name="rscript"><![CDATA[
+            library(ampvis2, quietly = TRUE)
+            library(readr, quietly = TRUE)
+            ## 'manually' load metadata treating all columns as character
+            ## giving colClasses to amp_load seems not possible
+            ## - check.names=F: leave empty column names empty .. fixed below
+            #if $metadata
+                metadata <- read.table("$metadata", header = TRUE, sep = "\t", colClasses = "character", check.names=F)
+                ## we do not require the metadata to have a 1st column named "SampleID",
+                ## but it should not be empty
+                if(colnames(metadata)[1] == ""){
+                    colnames(metadata)[1] <- "SampleID"
+                }
+                if(exists("SampleID", where = metadata)){
+                    rownames(metadata) <- metadata[["SampleID"]]
+                }else{
+                    rownames(metadata) <- metadata[[1]]
+                }
+            #end if
+
+            #if $otutable.is_of_type("phyloseq")
+                otutable <- readRDS("$otutable")
+                print(class(otutable))
+            #end if
+            data <- amp_load(
+                #if $otutable.is_of_type("phyloseq")
+                    otutable = otutable,
+                #else if $otutable.is_of_type("biom1") or $otutable.is_of_type("biom2")
+                    otutable = "otutable.biom",
+                #else
+                    otutable = "otutable.tsv",
+                #end if
+                #if $metadata
+                    metadata = metadata,
+                #end if
+                #if $taxonomy
+                    taxonomy = "taxonomy.tsv",
+                #end if
+                #if $fasta
+                    fasta = "$fasta",
+                #end if
+                #if $tree
+                    tree = "$tree",
+                #end if
+                pruneSingletons = $pruneSingletons
+            )
+
+            #if $asv_sequences
+                library(ape, quietly = TRUE)
+
+                seq <- as.DNAbin(strsplit(rownames(data\$abund), ""))
+                names(seq) <- paste0("ASV", seq_along(seq))
+                data\$refseq <- seq
+                data <- matchOTUs(data, seq)
+            #end if
+
+            ## try to guess column types with plyr::type.convert
+            #if $guess_column_types
+                data\$metadata <- readr::type_convert(data\$metadata, guess_integer=TRUE)
+            #end if
+
+            saveRDS(data, "$ampvis")
+            ## write metadata list for biom input or if metadata is given 
+            #if "metadata" in $write_lists
+                @SAVE_METADATA_LIST@
+            #end if
+
+            #if "tax" in $write_lists
+                @SAVE_TAX_LIST@
+            #end if
+            ## print overview of the data to stdout
+            data
+        ]]></configfile>
+    </configfiles>
+    <inputs>
+        <param argument="otutable" type="data" format="phyloseq,dada2_sequencetable,tabular,biom1,biom2" label="OTU table"/>
+        <param name="asv_otu_col_empty" type="boolean" checked="false" label="OTU/ASV column has empty header" help="By default ampvis2 expects a column named ASV or OTU containing the ASV or OTU identifiers. By checking this a column with an empty header will be used (as produced by dada2)."/>
+        <param name="asv_sequences" type="boolean" checked="false" label="ASV identifiers are the ASV sequences" help="By checking this the identifiers will be renamed to ASV1, ASV2, etc and the sequences will be stored in the ampvis2 object." />
+        <param argument="metadata" type="data" format="tabular,tsv" optional="true" label="Sample metadata">
+            <validator type="expression" message="Table must have at least 1 column"><![CDATA[value.metadata.columns > 0]]></validator>
+        </param>
+        <param name="guess_column_types" type="boolean" checked="true" label="Guess metadata column types" help="See help"/>
+        <param argument="taxonomy" type="data" format="tabular" optional="true" label="Taxonomy table"/>
+        <param argument="fasta" type="data" format="fasta" optional="true" label="Fasta file"/>
+        <param argument="tree" type="data" format="newick" optional="true" label="Phylogenetic tree"/>
+        <param argument="pruneSingletons" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Remove singleton OTUs"/>
+        <param name="write_lists" type="select" optional="true" multiple="true" label="Output list data sets" help="Needed by most downstream tools. Select if the inputs contain taxonomic / metadata information.">
+            <option value="tax" selected="true">Taxonomy list</option>
+            <option value="metadata" selected="true">Metadata list</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="ampvis" format="ampvis2"/>
+        <data name="metadata_list_out" format="tabular" label="${tool.name} on ${on_string}: metadata list">
+            <filter>write_lists and "metadata" in write_lists</filter>
+        </data>
+        <data name="taxonomy_list_out" format="tabular" label="${tool.name} on ${on_string}: taxonomy list">
+            <filter>write_lists and "tax" in write_lists</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- load otu table + metadata + taxonomy -->
+        <test expect_num_outputs="3">
+            <param name="otutable" value="AalborgWWTPs.otu.csv"/>
+            <param name="metadata" value="AalborgWWTPs.tsv" ftype="tsv"/>
+            <param name="taxonomy" value="AalborgWWTPs.tax"/>
+            <output name="ampvis" value="AalborgWWTPs.rds" ftype="ampvis2" compare="sim_size"/>
+            <output name="metadata_list_out" value="AalborgWWTPs-metadata.list"/>
+            <output name="taxonomy_list_out" value="AalborgWWTPs-taxonomy.list"/>
+            <assert_stdout>
+                <has_text text="ampvis2 object with 3 elements."/>
+                <has_text text="575.79"/>
+                <has_text text="SampleID, Plant, Date, Year, Period"/>
+                <has_text text="200(100%)   194(97%) 177(88.5%)   170(85%)   152(76%) 113(56.5%)      2(1%)"/>
+            </assert_stdout>
+        </test>
+        <!-- load otu table + metadata + taxonomy + tree + fasta -->
+        <test expect_num_outputs="3">
+            <param name="otutable" value="AalborgWWTPs.otu.csv"/>
+            <param name="metadata" value="AalborgWWTPs.tsv" ftype="tsv"/>
+            <param name="taxonomy" value="AalborgWWTPs.tax"/>
+            <param name="fasta" value="AalborgWWTPs.fa" ftype="fasta"/>
+            <param name="tree" value="AalborgWWTPs.nwk" ftype="newick"/>
+            <output name="ampvis" value="AalborgWWTPs-complete.rds" ftype="ampvis2" compare="sim_size"/>
+            <output name="metadata_list_out" value="AalborgWWTPs-metadata.list"/>
+            <output name="taxonomy_list_out" value="AalborgWWTPs-taxonomy.list"/>
+            <assert_stdout>
+                <has_text text="ampvis2 object with 5 elements."/>
+                <has_text text="575.79"/>
+                <has_text text="SampleID, Plant, Date, Year, Period"/>
+                <has_text text="200(100%)   194(97%) 177(88.5%)   170(85%)   152(76%) 113(56.5%)      2(1%)"/>
+            </assert_stdout>
+        </test>
+        <!-- test biom 1/2 input (taken from https://github.com/biocore/biom-format/tree/master/examples)
+             metadata seems not to be loaded from a biom file https://github.com/MadsAlbertsen/ampvis2/issues/129
+             taxonomy is loaded from all but 1 
+            -->
+        <test expect_num_outputs="1">
+            <param name="otutable" value="rich-dense.biom" ftype="biom1"/>
+            <param name="write_lists" value=""/>
+            <output name="ampvis" ftype="ampvis2">
+                <assert_contents>
+                    <has_size value="748"/>
+                </assert_contents>
+            </output>
+            <assert_stdout>
+                <has_text text="ampvis2 object with 3 elements."/>
+                <has_text text="4.5"/>
+                <has_text text="SampleID, BarcodeSequence, LinkerPrimerSequence, BODY_SITE, Description"/>
+                <has_text text="5(100%) 5(100%) 5(100%) 5(100%) 5(100%) 5(100%)  1(20%)"/>
+            </assert_stdout>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="otutable" value="rich-sparse.biom" ftype="biom1"/>
+            <param name="write_lists" value=""/>
+            <output name="ampvis" ftype="ampvis2">
+                <assert_contents>
+                    <has_size value="751"/>
+                </assert_contents>
+            </output>
+            <assert_stdout>
+                <has_text text="ampvis2 object with 3 elements."/>
+                <has_text text="4.5"/>
+                <has_text text="SampleID, BarcodeSequence, LinkerPrimerSequence, BODY_SITE, Description"/>
+                <has_text text="5(100%) 5(100%) 5(100%) 5(100%) 5(100%) 5(100%)  1(20%)"/>
+            </assert_stdout>
+        </test>
+        <!-- input file seems to miss metadata check that no metadata & taxonomy is loaded (ampvis2 adds dummy metadata) -->
+        <test expect_num_outputs="1">
+            <param name="otutable" value="min_sparse_otu_table_hdf5.biom" ftype="biom2"/>
+            <output name="ampvis" ftype="ampvis2">
+                <assert_contents>
+                    <has_size value="395"/>
+                </assert_contents>
+            </output>
+            <param name="write_lists" value=""/>
+            <assert_stdout>
+                <has_text text="ampvis2 object with 3 elements."/>
+                <has_text text="4.5"/>
+                <has_text text="SampleID, DummyVariable"/>
+                <has_text text="0(0%)   0(0%)   0(0%)   0(0%)   0(0%)   0(0%)   0(0%)"/>
+            </assert_stdout>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="otutable" value="rich_sparse_otu_table_hdf5.biom" ftype="biom2"/>
+            <output name="ampvis" ftype="ampvis2">
+                <assert_contents>
+                    <has_size value="753"/>
+                </assert_contents>
+            </output>
+            <param name="write_lists" value=""/>
+            <assert_stdout>
+                <has_text text="ampvis2 object with 3 elements."/>
+                <has_text text="4.5"/>
+                <has_text text="SampleID, BODY_SITE, BarcodeSequence, Description, LinkerPrimerSequence"/>
+                <has_text text="5(100%) 5(100%) 5(100%) 5(100%) 5(100%) 5(100%)  1(20%)"/>
+            </assert_stdout>
+        </test>
+        <!-- load dada2 ASV table + metadata + taxonomy -->
+        <test expect_num_outputs="3">
+            <param name="otutable" value="dada2-removeBimeraDenovo.tab" ftype="dada2_sequencetable"/>
+            <param name="metadata" value="dada2-metadata.tsv" ftype="tsv"/>
+            <param name="taxonomy" value="dada2-assignTaxonomy.tabular"/>
+            <param name="asv_otu_col_empty" value="true"/>
+            <param name="asv_sequences" value="true"/>
+            <output name="ampvis" ftype="ampvis2">
+                <assert_contents>
+                    <has_size min="100"/>
+                </assert_contents>
+            </output>
+            <output name="metadata_list_out">
+                <assert_contents>
+                    <has_n_lines n="23"/>
+                    <has_n_columns n="4"/>
+                    <has_text text="Sample"/>
+                </assert_contents>
+            </output>
+            <output name="taxonomy_list_out">
+                <assert_contents>
+                    <has_n_lines n="370"/>
+                    <has_n_columns n="2"/>
+                    <has_line line="Bacteria&#009;Kingdom"/>
+                </assert_contents>
+            </output>
+            <assert_stdout>
+                <has_text text="ampvis2 object with 4 elements."/> <!-- this also has fasta, i.e. 4 -->
+                <has_text text="6212.45"/>
+                <has_text text="Sample, time"/>
+                <has_text text="232(100%)   232(100%)   232(100%) 231(99.57%) 209(90.09%) 127(54.74%)"/>
+            </assert_stdout>
+        </test>
+        <!-- load data from phyloseq -->
+        <test expect_num_outputs="3">
+            <param name="otutable" value="output.phyloseq" ftype="phyloseq"/>
+            <output name="ampvis" ftype="ampvis2">
+                <assert_contents>
+                    <has_size min="100"/>
+                </assert_contents>
+            </output>
+            <output name="metadata_list_out">
+                <assert_contents>
+                    <has_n_lines n="6"/>
+                    <has_n_columns n="4"/>
+                    <has_text text="SampleID"/>
+                </assert_contents>
+            </output>
+            <output name="taxonomy_list_out">
+                <assert_contents>
+                    <has_n_lines n="147"/>
+                    <has_n_columns n="2"/>
+                    <has_line line="Bacteria&#009;Kingdom"/>
+                </assert_contents>
+            </output>
+            <assert_stdout>
+                <has_text text="ampvis2 object with 4 elements."/> <!-- this also has fasta, i.e. 4 -->
+                <has_text text="SampleID, Property, Number"/>
+                <has_text text="64(100%)   64(100%)   64(100%)   64(100%) 62(96.88%)  56(87.5%)      0(0%)"/>
+            </assert_stdout>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+What it does
+============
+
+This tool reads an OTU or ASV table and corresponding sample metadata, and returns
+a RDS data set for use in all ampvis2 tools. It is therefore required to load
+data with this tool before any other ampvis2 tools can be used.
+
+The Galaxy tool calls the `amp_load <https://kasperskytte.github.io/ampvis2/reference/amp_load.html>`_
+function of the ampvis2 package. This function validates and corrects the
+provided data frames in different ways to make it suitable for the rest of the
+ampvis2 tools. It is important that the provided data sets match the
+requirements as described in the following to work properly.
+
+Input
+=====
+
+**The OTU-table**
+
+contains information about the OTU/ASVs, their read counts in each sample, and
+optionally their assigned taxonomy. The OTU table can be given as
+
+- Tabular data set
+- BIOM version (1 and 2)
+
+Metadata and taxonomy in the tabular or BIOM files that are given via the
+``OTU table`` parameter can is overwritten if by data presented via the
+``Sample metadata`` or ``Taxonomy table`` parameters.
+
+If given in tabular format the provided OTU-table must be a table with the
+following requirements:
+
+- The rows are OTU IDs and the columns are samples.
+- The OTU IDs are by default expected to be in a column called "OTU", "ASV", or "#OTU ID".
+  For data using an empty header for the OTU/ASV colum enable the option *OTU/ASV column has empty header*
+  (this allows to process data as produced e.g. by dada2).
+- The column names of the table are the sample IDs, exactly matching those in
+  the metadata
+- The last 7 columns are optionally the corresponding taxonomy assigned to the
+  OTUs, named "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species".
+
+If the ASV IDs are actually the ASV Sequences then enabling
+*ASV identifiers are the ASV sequences* will rename the identifiers to ASV1, ASV2,...
+(and save the sequences in the ampvis2 object).
+
+Generally avoid special characters and spaces in row- and column names.
+
+The OTU table can also contain the taxonomic information in additional columns:
+Kingdom, Phylum, Class, Order, Family, Genus.
+
+Check `here <https://biom-format.org/>`_ for information on the BIOM formats.
+
+**The metadata**
+
+contains additional information about the samples, for example where each sample
+was taken, date, pH, treatment etc, which is used to compare and group the
+samples during analysis. The amount of information in the metadata is unlimited,
+it can contain any number of columns (variables), however there are a few
+requirements:
+    
+- The sample IDs must be in the first column. The sample IDs must match exactly
+  to those in the OTU-table. Any unmatched samples between the otutable and
+  metadata will be removed with a warning.
+- Generally avoid special characters and spaces in row- and column names.
+
+By default the data types of metadata columns are guessed with
+``readr::type_convert``. The guessed column types can be seen in the last (4th)
+column of the ``metadata list`` output and also stdout of the tool.  Guessing of
+data types can be disabled using the parameter ``Guess metadata column types``.
+If disabled matadata from separate tabular input is treated as character data,
+and if loaded from biom files that data is used as is. Metadata types can be set
+manually using the tool ``ampvis2: set metadata``
+
+Dates should be given in the format ``YYYY-MM-DD`` (Y: year, M: month, D: day).
+
+In addition to the RDS data set a metadata (resp. taxonomy) list data set is returned
+if metadata (resp. taxonomic information) is given to this tool. It contains
+restructured metadata (taxonomic information) that is used in downstream ampvis2
+Galaxy tools in order to select metadata / metadata values (resp. taxonomic levels).
+
+**Taxonomy**
+
+is a tabular data set with 7 columns and one row per ASV/OTU:
+
+- the 1st column is identical to the 1st column of the OTU table parameter
+- the remaining columns contain data for Kingdom, Phylum, Class, Order, Family, Genus
+
+Note that the taxonomic information can also be embedded in the OTU table.
+
+**Tree**
+
+a tree with branch lengths in Newick format. 
+
+This is needed / usefull only if the data is used as input of: ``ampvis2:
+ordination plot`` for ordination methods NNDS / MMDS with (un)weighted UniFrac
+distances. Note that the loaded tree is also filtered by the ``ampvis2: subset
+...`` tools.
+
+**Fasta**
+
+a fasta file containing the sequences of the OTUs. Note that this information is
+only used in ``ampvis2: export fasta``. If the OTU table is modified by 
+``ampvis2: mergereplicates`` or the ``ampvis2: subset ...`` tools this might be
+useful to obtain a filtered list of sequences.
+
+
+Output
+======
+
+**RDS**
+
+The main output of the tool is an RDS data set that contains the R representation of
+the ampvis2 object containing the provided data (OTU table, metadata, taxonomy,
+phylogenetic tree, and fasta).
+
+**List files**
+
+Summarize the metadata and taxonomy information:
+
+- the taxonomy list file lists all taxa in a 1 column tabular data set
+- the metadata list file lists the Metadata variables (column 1), and the corresponding
+  available metadata values (column 2), if the variable is the SampleID (column 3), and
+  the data type of the corresponding metadata variable (column 4)
+
+These files are auxilliary files that are needed in downstream ``ampvis2`` Galaxy tools
+to allow selecting metadata and taxonomy. They are not passed to the underlying R functions.
+
+Note that, if the no taxonomy (or metadata) is given then the underlying ``ampvis2`` R
+function adds dummy taxonomy (resp. metadata). In this case the output of the list datasets
+can be disabled with the ``Output list data sets`` parameter.
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
\ No newline at end of file
author	iuc
date	Fri, 16 Aug 2024 08:49:16 +0000
parents
children	07e7ec7ab1ac