Mercurial > repos > greg > multilocus_genotype
changeset 0:725b160c91f0 draft
Uploaded
author | greg |
---|---|
date | Thu, 25 Oct 2018 10:50:53 -0400 |
parents | |
children | ba2df0561b12 |
files | .shed.yml multilocus_genotype.R multilocus_genotype.xml |
diffstat | 3 files changed, 211 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.shed.yml Thu Oct 25 10:50:53 2018 -0400 @@ -0,0 +1,14 @@ +name: multilocus_genotype +owner: greg +description: | + Contains a tool that renders the unique combination of the alleles for two or more loci for each individual. +homepage_url: https://github.com/freeseek/gtc2vcf +long_description: | + Contains a tool that renders the unique combination of the alleles for two or more loci for each individual. + The multilocus genotypes are critically important for tracking dispersal and population structure of + organisms, especially those that reproduce clonally (plants, sponges, cnidarians, flatworms, annelids, + sea stars, and many more). +remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/corals/multilocus_genotype +type: unrestricted +categories: + - Statistics
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multilocus_genotype.R Thu Oct 25 10:50:53 2018 -0400 @@ -0,0 +1,144 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("vcfR")) +suppressPackageStartupMessages(library("poppr")) +suppressPackageStartupMessages(library("adegenet")) +suppressPackageStartupMessages(library("ape")) +suppressPackageStartupMessages(library("ggplot2")) +suppressPackageStartupMessages(library("knitr")) + +option_list <- list( + make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file") + make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file") +) + +parser <- OptionParser(usage="%prog [options] file", option_list=option_list); +args <- parse_args(parser, positional_arguments=TRUE); +opt <- args$options; + +get_file_path = function(file_name) { + file_path = paste("output_plots_dir", file_name, sep="/"); + return(file_path); +} + +#extract Provesti's distance from the distance matrix +provesti_distance <- function(distance, selection) { + eval(parse(text=paste("as.matrix(distance)[", selection, "]"))); +} + +#Read in VCF input file. +vcf <- read.vcfR(opts$input_vcf); + +# Convert VCF file into formats compatiable with the Poppr package. +gind <- vcfR2genind(vcf); +# Add population information to the genind object. +poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA")); +gind@pop <- as.factor(poptab$region); +# Convert genind to genclone object +gclo <- as.genclone(gind); +# Calculate the bitwise distance between individuals, +# the following is similar to Provesti's distance. +xdis <- bitwise.dist(gclo); +# All alleles must match to make a unique multilocus +# genotype (“original” naive approach). This is the +# default behavior of poppr. +mll(gclo) <- "original"; +# The method above does not take the genetic distance +# into account, but we can use this matrix to collapse +# MLGs that are under a specified distance threshold. +# To determine the distance threshold, we will generate +# a neighbor-joining tree for all samples. + +# Start PDF device driver. +dev.new(width=20, height=30); +file_path = get_file_path("phylogeny_tree.pdf") +pdf(file=file_path, width=20, height=30, bg="white"); +# Create a phylogeny of samples based on distance matrices +# colors. +cols <- c("skyblue2","#C38D9E", '#E8A87C',"darkcyan","#e27d60"); +set.seed(999); + +theTree <- gclo %>% + aboot(dist=provesti.dist, sample=50, tree="nj", cutoff=50, quiet=TRUE) %>% + # Organize branches by clade. + ladderize(); +plot.phylo(theTree, tip.color=cols[gclo$pop], label.offset=0.0125, cex=0.7, font=2, lwd=4); + # Add a scale bar showing 5% difference. +add.scale.bar(length=0.05, cex=0.65); +nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); +# Turn off device driver to flush output. +dev.off(); + +# Start PDF device driver. +dev.new(width=20, height=30); +file_path = get_file_path("dissimiliarity_distance_matrix.pdf") +pdf(file=file_path, width=20, height=30, bg="white"); +# Use of mlg.filter() will create a dissimiliarity distance +# matrix from the data and then filter based off of that +# matrix. Here we will use the bitwise distance matrix +# calculated above. + +# Multilocus genotypes (threshold of 1%). +mlg.filter(gclo, distance= xdis) <- 0.01; +m <- mlg.table(gclo, background=TRUE, color=TRUE); +# Turn off device driver to flush output. +dev.off(); + +# Start PDF device driver. +dev.new(width=20, height=30); +file_path = get_file_path("filter_stats.pdf") +pdf(file=file_path, width=20, height=30, bg="white"); +# Different clustering methods for tie breakers used by +# mlg.filter, default is farthest neighbor. +gclo_filtered <- filter_stats(gclo, distance=xdis, plot=TRUE); +# Turn off device driver to flush output. +dev.off(); + +# Create table of MLGs. +id <- mlg.id(gclo); +df <- data.frame(matrix((id), byrow=T)); + +# Start PDF device driver. +dev.new(width=20, height=30); +file_path = get_file_path("genotype_accumulation_curve.pdf") +pdf(file=file_path, width=20, height=30, bg="white"); +# We can use the genotype_curve() function to create a +# genotype accumulation curve to determine the minimum +# number of loci to identify unique MLGs. +gac <- genotype_curve(gind, sample=5, quiet=TRUE); +# Turn off device driver to flush output. +dev.off(); + +# Start PDF device driver. +dev.new(width=20, height=30); +file_path = get_file_path("genotype_accumulation_curve_for_gind.pdf") +pdf(file=file_path, width=20, height=30, bg="white"); +p <- last_plot(); +p + geom_smooth() + xlim(0, 100) + theme_bw(); + +# From the collapsed MLGs, we can calculate genotypic +# richness, diversity and eveness. +kable(poppr(gclo)); +kable(diversity_ci(gclo, n=100L, raw=FALSE )); + +# Now we can correct the original data for clones using +# clonecorrect. This step will reduce the dataset to +# only have 1 representative genotype per multilocus +# lineages (MLL). +gclo_cor <- clonecorrect(gclo, strata=NA); + +# Lastly, we can use a discriminant analysis of principal +# components to cluster genetically related individuals. +# This multivariate statistical approach partions the +# sample into a between-group and within- group component, +# in an effort to maximize discrimination between groups. +# Data is first transformed using a principal components +# analysis (PCA) and subsequently clusters are identified +# using discriminant analysis (DA).More information can be +# found here. +dapc.coral <- dapc(gclo_cor, var.contrib=TRUE, scale=FALSE, n.pca=62, n.da=nPop(gclo_cor)-1); +scatter(dapc.coral, cell=0, pch=18:23, cstar=0, lwd=2, lty=2, legend=TRUE, cleg=0.75, clabel=TRUE, col=cols); +# Turn off device driver to flush output. +dev.off(); +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multilocus_genotype.xml Thu Oct 25 10:50:53 2018 -0400 @@ -0,0 +1,53 @@ +<tool id="multilocus_genotype" name="Multilocus genotype" version="1.0.0"> + <description>unique combination of alleles for loci</description> + <requirements> + <requirement type="package" version="1.6.0">r-optparse</requirement> + <requirement type="package" version="1.8.0">r-vcfr</requirement> + <requirement type="package" version="2.8.1">r-poppr</requirement> + <requirement type="package" version="2.1.1">r-adegenet</requirement> + <requirement type="package" version="5.1">r-ape</requirement> + <requirement type="package" version="5.1">r-ape</requirement> + <requirement type="package" version="3.0.0">r-ggplot2</requirement> + <requirement type="package" version="1.20">r-knitr</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ +#set output_plots_dir = 'output_plots_dir' +mkdir $output_plots_dir && +Rscript '$__tool_directory__/multilocus_genotype.R' +--input_vcf '$input_vcf' +--input_pop_info 'input_pop_info']]></command> + <inputs> + <param name="input_vcf" type="data" format="vcf" label="VCF file" /> + <param name="input_pop_info" type="data" format="txt" label="Population information file" /> + </inputs> + <outputs> + <collection name="output_plot_collection" type="list" label="${tool.name} (plots), on ${on_string}"> + <discover_datasets pattern="__name__" directory="output_plots_dir" format="pdf"/> + </collection> + </outputs> + <tests> + <test> + <param name="input_vcf" value="baitssnv.recode.vcf" ftype="vcf"/> + <param name="input_pop_info" value="pop_info.txt" ftype="txt"/> + <output_collection name="output_plot_collection" type="list"> + <element name="phylogeny_tree.pdf" file="phylogeny_tree.pdf" ftype="pdf" compare="contains"/> + <element name="dissimiliarity_distance_matrix.pdf" file="dissimiliarity_distance_matrix.pdf" ftype="pdf" compare="contains"/> + <element name="filter_stats.pdf" file="filter_stats.pdf" ftype="pdf" compare="contains"/> + <element name="genotype_accumulation_curve.pdf" file="genotype_accumulation_curve.pdf" ftype="pdf" compare="contains"/> + <element name="genotype_accumulation_curve_for_gind.pdf" file="genotype_accumulation_curve_for_gind.pdf" ftype="pdf" compare="contains"/> + </output_collection> + </test> + </tests> + <help> +**What it does** + +Renders the unique combination of the alleles for two or more loci for each individual. The multilocus genotypes +are critically important for tracking dispersal and population structure of organisms, especially those that +reproduce clonally (plants, sponges, cnidarians, flatworms, annelids, sea stars, and many more). +----- + +**Required options** + </help> + <citations> + </citations> +</tool>