changeset 0:725b160c91f0 draft

Uploaded
author greg
date Thu, 25 Oct 2018 10:50:53 -0400
parents
children ba2df0561b12
files .shed.yml multilocus_genotype.R multilocus_genotype.xml
diffstat 3 files changed, 211 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Thu Oct 25 10:50:53 2018 -0400
@@ -0,0 +1,14 @@
+name: multilocus_genotype
+owner: greg
+description: |
+  Contains a tool that renders the unique combination of the alleles for two or more loci for each individual.
+homepage_url: https://github.com/freeseek/gtc2vcf
+long_description: |
+  Contains a tool that renders the unique combination of the alleles for two or more loci for each individual.
+  The multilocus genotypes are critically important for tracking dispersal and population structure of
+  organisms, especially those that reproduce clonally (plants, sponges, cnidarians, flatworms, annelids,
+  sea stars, and many more).
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/corals/multilocus_genotype
+type: unrestricted
+categories:
+  - Statistics
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multilocus_genotype.R	Thu Oct 25 10:50:53 2018 -0400
@@ -0,0 +1,144 @@
+#!/usr/bin/env Rscript
+
+suppressPackageStartupMessages(library("optparse"))
+suppressPackageStartupMessages(library("vcfR"))
+suppressPackageStartupMessages(library("poppr"))
+suppressPackageStartupMessages(library("adegenet"))
+suppressPackageStartupMessages(library("ape"))
+suppressPackageStartupMessages(library("ggplot2"))
+suppressPackageStartupMessages(library("knitr"))
+
+option_list <- list(
+    make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file")
+    make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file")
+)
+
+parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
+args <- parse_args(parser, positional_arguments=TRUE);
+opt <- args$options;
+
+get_file_path = function(file_name) {
+    file_path = paste("output_plots_dir", file_name, sep="/");
+    return(file_path);
+}
+
+#extract Provesti's distance from the distance matrix
+provesti_distance <- function(distance, selection) {
+  eval(parse(text=paste("as.matrix(distance)[", selection, "]")));
+}
+
+#Read in VCF input file.
+vcf <- read.vcfR(opts$input_vcf);
+
+# Convert VCF file into formats compatiable with the Poppr package.
+gind <- vcfR2genind(vcf);
+# Add population information to the genind object.
+poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA"));
+gind@pop <- as.factor(poptab$region);
+# Convert genind to genclone object
+gclo <- as.genclone(gind);
+# Calculate the bitwise distance between individuals,
+# the following is similar to Provesti's distance.
+xdis <- bitwise.dist(gclo);
+# All alleles must match to make a unique multilocus
+# genotype (“original” naive approach). This is the
+# default behavior of poppr.
+mll(gclo) <- "original";
+# The method above does not take the genetic distance
+# into account, but we can use this matrix to collapse
+# MLGs that are under a specified distance threshold.
+# To determine the distance threshold, we will generate
+# a neighbor-joining tree for all samples.
+
+# Start PDF device driver.
+dev.new(width=20, height=30);
+file_path = get_file_path("phylogeny_tree.pdf")
+pdf(file=file_path, width=20, height=30, bg="white");
+# Create a phylogeny of samples based on distance matrices
+# colors.
+cols <- c("skyblue2","#C38D9E", '#E8A87C',"darkcyan","#e27d60");
+set.seed(999);
+
+theTree <- gclo %>%
+  aboot(dist=provesti.dist, sample=50, tree="nj", cutoff=50, quiet=TRUE) %>%
+   # Organize branches by clade.
+  ladderize();
+plot.phylo(theTree, tip.color=cols[gclo$pop], label.offset=0.0125, cex=0.7, font=2, lwd=4);
+ # Add a scale bar showing 5% difference.
+add.scale.bar(length=0.05, cex=0.65);
+nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
+# Turn off device driver to flush output.
+dev.off();
+
+# Start PDF device driver.
+dev.new(width=20, height=30);
+file_path = get_file_path("dissimiliarity_distance_matrix.pdf")
+pdf(file=file_path, width=20, height=30, bg="white");
+# Use of mlg.filter() will create a dissimiliarity distance
+# matrix from the data and then filter based off of that
+# matrix. Here we will use the bitwise distance matrix
+# calculated above.
+
+# Multilocus genotypes (threshold of 1%).
+mlg.filter(gclo, distance= xdis) <- 0.01;
+m <- mlg.table(gclo, background=TRUE, color=TRUE);
+# Turn off device driver to flush output.
+dev.off();
+
+# Start PDF device driver.
+dev.new(width=20, height=30);
+file_path = get_file_path("filter_stats.pdf")
+pdf(file=file_path, width=20, height=30, bg="white");
+# Different clustering methods for tie breakers used by
+# mlg.filter, default is farthest neighbor.
+gclo_filtered <- filter_stats(gclo, distance=xdis, plot=TRUE);
+# Turn off device driver to flush output.
+dev.off();
+
+# Create table of MLGs.
+id <- mlg.id(gclo);
+df <- data.frame(matrix((id), byrow=T));
+
+# Start PDF device driver.
+dev.new(width=20, height=30);
+file_path = get_file_path("genotype_accumulation_curve.pdf")
+pdf(file=file_path, width=20, height=30, bg="white");
+# We can use the genotype_curve() function to create a
+# genotype accumulation curve to determine the minimum
+# number of loci to identify unique MLGs.
+gac <- genotype_curve(gind, sample=5, quiet=TRUE);
+# Turn off device driver to flush output.
+dev.off();
+
+# Start PDF device driver.
+dev.new(width=20, height=30);
+file_path = get_file_path("genotype_accumulation_curve_for_gind.pdf")
+pdf(file=file_path, width=20, height=30, bg="white");
+p <- last_plot();
+p + geom_smooth() + xlim(0, 100) + theme_bw();
+
+# From the collapsed MLGs, we can calculate genotypic
+# richness, diversity and eveness.
+kable(poppr(gclo));
+kable(diversity_ci(gclo, n=100L, raw=FALSE ));
+
+# Now we can correct the original data for clones using
+# clonecorrect. This step will reduce the dataset to
+# only have 1 representative genotype per multilocus
+# lineages (MLL).
+gclo_cor <- clonecorrect(gclo, strata=NA);
+
+# Lastly, we can use a discriminant analysis of principal
+# components to cluster genetically related individuals.
+# This multivariate statistical approach partions the
+# sample into a between-group and within- group component,
+# in an effort to maximize discrimination between groups.
+# Data is first transformed using a principal components
+# analysis (PCA) and subsequently clusters are identified
+# using discriminant analysis (DA).More information can be
+# found here.
+dapc.coral <- dapc(gclo_cor, var.contrib=TRUE, scale=FALSE, n.pca=62, n.da=nPop(gclo_cor)-1);
+scatter(dapc.coral, cell=0, pch=18:23, cstar=0, lwd=2, lty=2, legend=TRUE, cleg=0.75, clabel=TRUE, col=cols);
+# Turn off device driver to flush output.
+dev.off();
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multilocus_genotype.xml	Thu Oct 25 10:50:53 2018 -0400
@@ -0,0 +1,53 @@
+<tool id="multilocus_genotype" name="Multilocus genotype" version="1.0.0">
+    <description>unique combination of alleles for loci</description>
+    <requirements>
+        <requirement type="package" version="1.6.0">r-optparse</requirement>
+        <requirement type="package" version="1.8.0">r-vcfr</requirement>
+        <requirement type="package" version="2.8.1">r-poppr</requirement>
+        <requirement type="package" version="2.1.1">r-adegenet</requirement>
+        <requirement type="package" version="5.1">r-ape</requirement>
+        <requirement type="package" version="5.1">r-ape</requirement>
+        <requirement type="package" version="3.0.0">r-ggplot2</requirement>
+        <requirement type="package" version="1.20">r-knitr</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+#set output_plots_dir = 'output_plots_dir'
+mkdir $output_plots_dir &&
+Rscript '$__tool_directory__/multilocus_genotype.R'
+--input_vcf '$input_vcf'
+--input_pop_info 'input_pop_info']]></command>
+    <inputs>
+        <param name="input_vcf" type="data" format="vcf" label="VCF file" />
+        <param name="input_pop_info" type="data" format="txt" label="Population information file" />
+    </inputs>
+    <outputs>
+        <collection name="output_plot_collection" type="list" label="${tool.name} (plots), on ${on_string}">
+            <discover_datasets pattern="__name__" directory="output_plots_dir" format="pdf"/>
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_vcf" value="baitssnv.recode.vcf" ftype="vcf"/>
+            <param name="input_pop_info" value="pop_info.txt" ftype="txt"/>
+            <output_collection name="output_plot_collection" type="list">
+                <element name="phylogeny_tree.pdf" file="phylogeny_tree.pdf" ftype="pdf" compare="contains"/>
+                <element name="dissimiliarity_distance_matrix.pdf" file="dissimiliarity_distance_matrix.pdf" ftype="pdf" compare="contains"/>
+                <element name="filter_stats.pdf" file="filter_stats.pdf" ftype="pdf" compare="contains"/>
+                <element name="genotype_accumulation_curve.pdf" file="genotype_accumulation_curve.pdf" ftype="pdf" compare="contains"/>
+                <element name="genotype_accumulation_curve_for_gind.pdf" file="genotype_accumulation_curve_for_gind.pdf" ftype="pdf" compare="contains"/>
+            </output_collection>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Renders the unique combination of the alleles for two or more loci for each individual. The multilocus genotypes
+are critically important for tracking dispersal and population structure of organisms, especially those that
+reproduce clonally (plants, sponges, cnidarians, flatworms, annelids, sea stars, and many more).
+-----
+
+**Required options**
+    </help>
+    <citations>
+    </citations>
+</tool>