Mercurial > repos > iuc > decontam
diff decontam.xml @ 0:86da5c894956 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontam commit 9e0ce6d02ee71d5b974ef615b1c5286bc45d8e6b
| author | iuc |
|---|---|
| date | Tue, 15 Oct 2024 13:36:46 +0000 |
| parents | |
| children | 871214ac722e |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/decontam.xml Tue Oct 15 13:36:46 2024 +0000
@@ -0,0 +1,180 @@
+<tool id="decontam" name="Decontam" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <!-- Wraps the R package decontam: flags contaminant features with isContaminant (prevalence method, negative controls), writes two diagnostic plots and the pruned feature table / phyloseq object. -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="bio_tools"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        Rscript '$rscript'
+    ]]></command>
+    <configfiles>
+        <configfile name="rscript"><![CDATA[
+            library(tidyverse)
+            library(phyloseq)
+            library(ggplot2)
+            library(decontam)
+
+            ## build the phyloseq object "ps" with a logical "control" sample variable
+            #if $input_type.select_input == 'phyloseq':
+            ps <- readRDS("$input_type.phyloseq_object")
+            sample_data(ps)\$control <- as.logical(sample_data(ps)[["$input_type.control_metadata"]])
+            #else
+            ## get feature table (one row per sample; the first column holds the sample ID)
+            otu <- read_tsv("$input_type.otu")
+            otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1]) #use first column as rownames
+            OTU <- otu_table(otu2, taxa_are_rows = FALSE)
+
+            ## get metadata table; its first column must hold the same sample IDs as the feature table
+            meta <- read_tsv("$input_type.metadata")
+            meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1]) #use first column as rownames
+
+            control_column = as.integer("$input_type.control") - 1 ##remove one index since the dataframe uses the first column as index
+
+            ## convert 0/1 to bool for the control column and store in control column
+            meta2\$control <- as.logical(meta2[[control_column]])
+            sampledata <- sample_data(meta2)
+
+            ps <- phyloseq(OTU, FALSE, sampledata)
+
+            #end if
+
+            ## stop with a clear message if the control column could not be interpreted as logical
+            if (anyNA(sample_data(ps)\$control)) {
+                stop("The control column must only contain 0/1 (or TRUE/FALSE) values without missing entries.")
+            }
+
+            ## plot library_size_vs_control
+            df <- as.data.frame(sample_data(ps)) # Put sample_data into a ggplot-friendly data.frame
+            df\$LibrarySize <- sample_sums(ps)
+            df <- df[order(df\$LibrarySize),]
+            df\$Index <- seq(nrow(df))
+            library_size_plot <- ggplot(data=df, aes(x=Index, y=LibrarySize, color=control)) + geom_point()
+            ggsave("$library_size_vs_control", plot = library_size_plot, device = "png", width = 10, height = 8, units = "cm")
+
+            ## plot prevalence
+            contamdf.prev <- isContaminant(ps, method="prevalence", neg="control", threshold=$threshold)
+            table(contamdf.prev\$contaminant)
+
+            ps.pa <- transform_sample_counts(ps, function(abund) 1*(abund>0))
+            ps.pa.neg <- prune_samples(sample_data(ps.pa)\$control == TRUE, ps.pa)
+            ps.pa.pos <- prune_samples(sample_data(ps.pa)\$control == FALSE, ps.pa)
+
+            ## Make data.frame of prevalence in positive and negative samples
+            df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg),
+                                contaminant=contamdf.prev\$contaminant)
+            prevalence_plot <- ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_point() +
+                xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)")
+            ggsave("$prevalence", plot = prevalence_plot, device = "png", width = 10, height = 8, units = "cm")
+
+            ## remove contamination features from original data
+            #if $input_type.select_input == 'phyloseq':
+            id_name <- "SampleID"
+            #else
+            id_name <- colnames(otu)[1] ## we use the same name for the ID column as the OTU input
+            #end if
+
+            ps.noncontam <- prune_taxa(!contamdf.prev\$contaminant, ps)
+
+            otu_table(ps.noncontam) %>%
+                as.data.frame() %>%
+                rownames_to_column(id_name) -> otu
+
+            write.table(otu,
+                        file="$decontam_otu",
+                        sep = "\t",
+                        row.names=FALSE,
+                        quote = FALSE)
+
+            saveRDS(ps.noncontam, "$decontam_phyloseq")
+
+        ]]></configfile>
+    </configfiles>
+    <inputs>
+        <conditional name="input_type">
+            <param name="select_input" type="select" label="Phyloseq or Feature table input" help="This tool can work with phyloseq objects or feature table inputs.">
+                <option value="phyloseq">Phyloseq</option>
+                <option value="feature_table">Feature table</option>
+            </param>
+            <when value="phyloseq">
+                <param name="phyloseq_object" type="data" format="phyloseq" label="Phyloseq object"/>
+                <param name="control_metadata" type="text" label="Control column" help="Column in the phyloseq metadata specifying whether a sample is a negative control (0 for normal samples / 1 for control)"/>
+            </when>
+            <when value="feature_table">
+                <param name="otu" type="data" format="tabular" label="Feature table" help="OTU/ASV or other feature table. The first column must have corresponding IDs to the metadata table."/>
+                <param name="metadata" type="data" format="tabular" label="Metadata" help="Metadata that contains a column specifying whether a sample is a negative control (0 for normal samples / 1 for control). The first column must have corresponding IDs to the feature table."/>
+                <param name="control" type="data_column" data_ref="metadata" use_header_names="true" multiple="false" optional="false" label="Control column" help="Column specifying whether a sample is a negative control (0 for normal samples / 1 for control)."/>
+            </when>
+        </conditional>
+        <param name="threshold" type="float" label="Threshold to detect a contaminant" value="0.1" min="0" max="1" help="Probability threshold below which a feature is classified as a contaminant in the statistical test being performed." />
+    </inputs>
+    <outputs>
+        <data name="library_size_vs_control" format="png" label="${tool.name} on ${on_string}: Library Size vs Control Plot"/>
+        <data name="prevalence" format="png" label="${tool.name} on ${on_string}: Prevalence Plot"/>
+        <data name="decontam_otu" format="tabular" label="${tool.name} on ${on_string}: Removed Contaminants - Feature Table"/>
+        <data name="decontam_phyloseq" format="phyloseq" label="${tool.name} on ${on_string}: Removed Contaminants - Phyloseq Object"/>
+    </outputs>
+    <tests>
+        <test>
+            <conditional name="input_type">
+                <param name="select_input" value="phyloseq"/>
+                <param name="phyloseq_object" value="phyloseq_input.rds"/>
+                <param name="control_metadata" value="control"/>
+            </conditional>
+            <param name="threshold" value="0.5"/>
+            <output name="decontam_phyloseq" file="phyloseq_output.rds" ftype="phyloseq"/>
+            <output name="decontam_otu" file="otu_output.tsv" ftype="tabular"/>
+            <output name="prevalence" file="Prevalence_Plot.png" ftype="png"/>
+            <output name="library_size_vs_control" file="Library_Size_vs_Control_Plot.png" ftype="png"/>
+        </test>
+        <test>
+            <conditional name="input_type">
+                <param name="select_input" value="feature_table"/>
+                <param name="otu" value="otu_input.tsv"/>
+                <param name="metadata" value="metadata_input.tsv"/>
+                <!-- using the index of the column -->
+                <param name="control" value="8"/>
+            </conditional>
+            <param name="threshold" value="0.5"/>
+            <output name="decontam_phyloseq" file="phyloseq_output2.rds" ftype="phyloseq"/>
+            <output name="decontam_otu" file="otu_output.tsv" ftype="tabular"/>
+            <output name="prevalence" file="Prevalence_Plot.png" ftype="png"/>
+            <output name="library_size_vs_control" file="Library_Size_vs_Control_Plot.png" ftype="png"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+Simple Statistical Identification of Contaminating Sequence Features in Marker-Gene or Metagenomics Data
+========================================================================================================
+
+This tool identifies contaminating sequence features in marker-gene or
+metagenomics datasets. It can be applied to any type of feature derived from
+environmental sequencing data (e.g., ASVs, OTUs, taxonomic groups, MAGs,
+etc.). The method requires either DNA quantitation data or sequenced negative
+control samples.
+
+.. note::
+
+    Currently, only the negative control sample method is implemented in this
+    wrapper.
+
+**Output**
+
+- If a phyloseq object is provided as input, the output will be a phyloseq
+  object pruned to include only non-contaminant features.
+- If a feature table and metadata are provided, the output will be a
+  pruned phyloseq object containing only non-contaminant features, without
+  the TAX table. The output feature table will also be pruned to include only
+  non-contaminant features.
+
+**Threshold**
+
+The default threshold for identifying a contaminant is a probability of 0.1
+in the statistical test being performed.
+
+In the prevalence test, a special threshold value of 0.5 is notable: this
+will identify as contaminants any sequences that are more prevalent in
+negative controls than in positive samples.
+    ]]>
+    </help>
+    <expand macro="citations"/>
+</tool>
