Mercurial > repos > greg > multilocus_genotype
changeset 4:a7cce4091e80 draft
Uploaded
author | greg |
---|---|
date | Wed, 21 Nov 2018 09:14:09 -0500 |
parents | 1bc815d9c8c5 |
children | 73cfe1260e98 |
files | multilocus_genotype.R multilocus_genotype.xml |
diffstat | 2 files changed, 53 insertions(+), 100 deletions(-) [+] |
line wrap: on
line diff
--- a/multilocus_genotype.R Thu Oct 25 13:33:47 2018 -0400 +++ b/multilocus_genotype.R Wed Nov 21 09:14:09 2018 -0500 @@ -1,16 +1,20 @@ #!/usr/bin/env Rscript -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("vcfR")) -suppressPackageStartupMessages(library("poppr")) suppressPackageStartupMessages(library("adegenet")) suppressPackageStartupMessages(library("ape")) suppressPackageStartupMessages(library("ggplot2")) suppressPackageStartupMessages(library("knitr")) +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("poppr")) +suppressPackageStartupMessages(library("RColorBrewer")) +suppressPackageStartupMessages(library("vcfR")) +suppressPackageStartupMessages(library("vegan")) option_list <- list( make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), - make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file") + make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), + make_option(c("--output_missing_data"), action="store", dest="output_missing_data", help="Missing data outputfile"), + make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data outputfile") ) parser <- OptionParser(usage="%prog [options] file", option_list=option_list); @@ -30,6 +34,13 @@ # Read in VCF input file. vcf <- read.vcfR(opt$input_vcf); +#Missing GT in samples submitted +gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); +myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); +myMiss <- (myMiss / nrow(vcf)) * 100; +miss <- data.frame(myMiss); +write.table(miss, file=opt$output_missing_data, quote=FALSE); + # Convert VCF file into formats compatiable with the Poppr package. genind <- vcfR2genind(vcf); # Add population information to the genind object. @@ -39,106 +50,43 @@ gclo <- as.genclone(genind); # Calculate the bitwise distance between individuals, # the following is similar to Provesti's distance. -xdis <- bitwise.dist(gclo); -# All alleles must match to make a unique multilocus -# genotype (“original” naive approach). This is the -# default behavior of poppr. -mll(gclo) <- "original"; -# The method above does not take the genetic distance -# into account, but we can use this matrix to collapse -# MLGs that are under a specified distance threshold. -# To determine the distance threshold, we will generate -# a neighbor-joining tree for all samples. - -# Start PDF device driver. -dev.new(width=20, height=30); -file_path = get_file_path("phylogeny_tree.pdf"); -pdf(file=file_path, width=20, height=30, bg="white"); -# Create a phylogeny of samples based on distance matrices -# colors. -cols <- c("skyblue2","#C38D9E", '#E8A87C',"darkcyan","#e27d60"); -set.seed(999); +xdis <- bitwise.dist(gclo, missing_match=FALSE); -theTree <- gclo %>% - aboot(dist=provesti.dist, sample=50, tree="nj", cutoff=50, quiet=TRUE) %>% - # Organize branches by clade. - ladderize(); -plot.phylo(theTree, tip.color=cols[gclo$pop], label.offset=0.0125, cex=0.7, font=2, lwd=4); - # Add a scale bar showing 5% difference. -add.scale.bar(length=0.05, cex=0.65); -nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); -# Turn off device driver to flush output. -dev.off(); - +# Multilocus genotypes (threshold of 1.6%). +mlg.filter(gclo, distance=xdis) <- 0.016; # Start PDF device driver. -dev.new(width=20, height=30); -file_path = get_file_path("dissimiliarity_distance_matrix.pdf"); -pdf(file=file_path, width=20, height=30, bg="white"); -# Use of mlg.filter() will create a dissimiliarity distance -# matrix from the data and then filter based off of that -# matrix. Here we will use the bitwise distance matrix -# calculated above. - -# Multilocus genotypes (threshold of 1%). -mlg.filter(gclo, distance= xdis) <- 0.01; +dev.new(width=10, height=7); +file_path = get_file_path("mlg_table.pdf"); +pdf(file=file_path, width=10, height=7); m <- mlg.table(gclo, background=TRUE, color=TRUE); -# Turn off device driver to flush output. -dev.off(); - -# Start PDF device driver. -dev.new(width=20, height=30); -file_path = get_file_path("filter_stats.pdf"); -pdf(file=file_path, width=20, height=30, bg="white"); -# Different clustering methods for tie breakers used by -# mlg.filter, default is farthest neighbor. -gclo_filtered <- filter_stats(gclo, distance=xdis, plot=TRUE); -# Turn off device driver to flush output. dev.off(); # Create table of MLGs. id <- mlg.id(gclo); df <- data.frame(matrix((id), byrow=T)); +write.table(df, file=opt$output_mlg_id); +# Rarifaction curve. +H.obj <- mlg.table(gclo, plot=TRUE); # Start PDF device driver. -dev.new(width=20, height=30); -file_path = get_file_path("genotype_accumulation_curve.pdf"); -pdf(file=file_path, width=20, height=30, bg="white"); -# We can use the genotype_curve() function to create a -# genotype accumulation curve to determine the minimum -# number of loci to identify unique MLGs. -gac <- genotype_curve(genind, sample=5, quiet=TRUE); -# Turn off device driver to flush output. -dev.off(); - -# Start PDF device driver. -dev.new(width=20, height=30); -file_path = get_file_path("genotype_accumulation_curve_for_genind.pdf"); -pdf(file=file_path, width=20, height=30, bg="white"); -p <- last_plot(); -p + geom_smooth() + xlim(0, 100) + theme_bw(); +dev.new(width=10, height=7); +file_path = get_file_path("geno_rarifaction_curve.pdf"); +pdf(file=file_path, width=10, height=7); +rarecurve(H.obj, ylab="Number of expected MLGs", sample=min(rowSums(H.obj)), border=NA, fill=NA, font=2, cex=1, col="blue"); +dev.off() -# From the collapsed MLGs, we can calculate genotypic -# richness, diversity and eveness. -kable(poppr(gclo)); -kable(diversity_ci(gclo, n=100L, raw=FALSE )); - -# Now we can correct the original data for clones using -# clonecorrect. This step will reduce the dataset to -# only have 1 representative genotype per multilocus -# lineages (MLL). -gclo_cor <- clonecorrect(gclo, strata=NA); +# Create a phylogeny of samples based on distance matrices. +cols <- palette(brewer.pal(n=12, name='Set3')); +set.seed(999); +# Start PDF device driver. +dev.new(width=10, height=7); +file_path = get_file_path("nj_phylogeny.pdf"); +pdf(file=file_path, width=10, height=7); +# Organize branches by clade. +tree <- gclo %>% aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% ladderize(); +plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); +# Add a scale bar showing 5% difference.. +add.scale.bar(length=0.05, cex=0.65); +nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); +dev.off() -# Lastly, we can use a discriminant analysis of principal -# components to cluster genetically related individuals. -# This multivariate statistical approach partions the -# sample into a between-group and within- group component, -# in an effort to maximize discrimination between groups. -# Data is first transformed using a principal components -# analysis (PCA) and subsequently clusters are identified -# using discriminant analysis (DA).More information can be -# found here. -dapc.coral <- dapc(gclo_cor, var.contrib=TRUE, scale=FALSE, n.pca=62, n.da=nPop(gclo_cor)-1); -scatter(dapc.coral, cell=0, pch=18:23, cstar=0, lwd=2, lty=2, legend=TRUE, cleg=0.75, clabel=TRUE, col=cols); -# Turn off device driver to flush output. -dev.off(); -
--- a/multilocus_genotype.xml Thu Oct 25 13:33:47 2018 -0400 +++ b/multilocus_genotype.xml Wed Nov 21 09:14:09 2018 -0500 @@ -1,26 +1,31 @@ <tool id="multilocus_genotype" name="Multilocus genotype" version="1.0.0"> <description>unique combination of alleles for loci</description> <requirements> - <requirement type="package" version="1.6.0">r-optparse</requirement> - <requirement type="package" version="1.8.0">r-vcfr</requirement> - <requirement type="package" version="2.8.1">r-poppr</requirement> <requirement type="package" version="2.1.1">r-adegenet</requirement> <requirement type="package" version="5.1">r-ape</requirement> - <requirement type="package" version="5.1">r-ape</requirement> <requirement type="package" version="3.0.0">r-ggplot2</requirement> <requirement type="package" version="1.20">r-knitr</requirement> + <requirement type="package" version="1.6.0">r-optparse</requirement> + <requirement type="package" version="2.8.1">r-poppr</requirement> + <requirement type="package" version="1.1.2">r-rcolorbrewer</requirement> + <requirement type="package" version="2.5_3">r-vegan</requirement> + <requirement type="package" version="1.8.0">r-vcfr</requirement> </requirements> <command detect_errors="exit_code"><![CDATA[ #set output_plots_dir = 'output_plots_dir' mkdir $output_plots_dir && Rscript '$__tool_directory__/multilocus_genotype.R' --input_vcf '$input_vcf' ---input_pop_info '$input_pop_info']]></command> +--input_pop_info '$input_pop_info' +--output_missing_data '$output_missing_data' +--output_mlg_id '$output_mlg_id']]></command> <inputs> <param name="input_vcf" type="data" format="vcf" label="VCF file" /> <param name="input_pop_info" type="data" format="txt" label="Population information file" /> </inputs> <outputs> + <data name="output_mlg_id" format="txt"/> + <data name="output_missing_data" format="txt"/> <collection name="output_plot_collection" type="list" label="${tool.name} (plots), on ${on_string}"> <discover_datasets pattern="__name__" directory="output_plots_dir" format="pdf"/> </collection>