changeset 4:a7cce4091e80 draft

Uploaded
author greg
date Wed, 21 Nov 2018 09:14:09 -0500
parents 1bc815d9c8c5
children 73cfe1260e98
files multilocus_genotype.R multilocus_genotype.xml
diffstat 2 files changed, 53 insertions(+), 100 deletions(-) [+]
line wrap: on
line diff
--- a/multilocus_genotype.R	Thu Oct 25 13:33:47 2018 -0400
+++ b/multilocus_genotype.R	Wed Nov 21 09:14:09 2018 -0500
@@ -1,16 +1,20 @@
 #!/usr/bin/env Rscript
 
-suppressPackageStartupMessages(library("optparse"))
-suppressPackageStartupMessages(library("vcfR"))
-suppressPackageStartupMessages(library("poppr"))
 suppressPackageStartupMessages(library("adegenet"))
 suppressPackageStartupMessages(library("ape"))
 suppressPackageStartupMessages(library("ggplot2"))
 suppressPackageStartupMessages(library("knitr"))
+suppressPackageStartupMessages(library("optparse"))
+suppressPackageStartupMessages(library("poppr"))
+suppressPackageStartupMessages(library("RColorBrewer"))
+suppressPackageStartupMessages(library("vcfR"))
+suppressPackageStartupMessages(library("vegan"))
 
 option_list <- list(
     make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
-    make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file")
+    make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
+    make_option(c("--output_missing_data"), action="store", dest="output_missing_data", help="Missing data outputfile"),
+    make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data outputfile")
 )
 
 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
@@ -30,6 +34,13 @@
 # Read in VCF input file.
 vcf <- read.vcfR(opt$input_vcf);
 
+#Missing GT in samples submitted
+gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
+myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
+myMiss <- (myMiss / nrow(vcf)) * 100;
+miss <- data.frame(myMiss);
+write.table(miss, file=opt$output_missing_data, quote=FALSE);
+
 # Convert VCF file into formats compatiable with the Poppr package.
 genind <- vcfR2genind(vcf);
 # Add population information to the genind object.
@@ -39,106 +50,43 @@
 gclo <- as.genclone(genind);
 # Calculate the bitwise distance between individuals,
 # the following is similar to Provesti's distance.
-xdis <- bitwise.dist(gclo);
-# All alleles must match to make a unique multilocus
-# genotype (“original” naive approach). This is the
-# default behavior of poppr.
-mll(gclo) <- "original";
-# The method above does not take the genetic distance
-# into account, but we can use this matrix to collapse
-# MLGs that are under a specified distance threshold.
-# To determine the distance threshold, we will generate
-# a neighbor-joining tree for all samples.
-
-# Start PDF device driver.
-dev.new(width=20, height=30);
-file_path = get_file_path("phylogeny_tree.pdf");
-pdf(file=file_path, width=20, height=30, bg="white");
-# Create a phylogeny of samples based on distance matrices
-# colors.
-cols <- c("skyblue2","#C38D9E", '#E8A87C',"darkcyan","#e27d60");
-set.seed(999);
+xdis <- bitwise.dist(gclo, missing_match=FALSE);
 
-theTree <- gclo %>%
-  aboot(dist=provesti.dist, sample=50, tree="nj", cutoff=50, quiet=TRUE) %>%
-   # Organize branches by clade.
-  ladderize();
-plot.phylo(theTree, tip.color=cols[gclo$pop], label.offset=0.0125, cex=0.7, font=2, lwd=4);
- # Add a scale bar showing 5% difference.
-add.scale.bar(length=0.05, cex=0.65);
-nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
-# Turn off device driver to flush output.
-dev.off();
-
+# Multilocus genotypes (threshold of 1.6%).
+mlg.filter(gclo, distance=xdis) <- 0.016;
 # Start PDF device driver.
-dev.new(width=20, height=30);
-file_path = get_file_path("dissimiliarity_distance_matrix.pdf");
-pdf(file=file_path, width=20, height=30, bg="white");
-# Use of mlg.filter() will create a dissimiliarity distance
-# matrix from the data and then filter based off of that
-# matrix. Here we will use the bitwise distance matrix
-# calculated above.
-
-# Multilocus genotypes (threshold of 1%).
-mlg.filter(gclo, distance= xdis) <- 0.01;
+dev.new(width=10, height=7);
+file_path = get_file_path("mlg_table.pdf");
+pdf(file=file_path, width=10, height=7);
 m <- mlg.table(gclo, background=TRUE, color=TRUE);
-# Turn off device driver to flush output.
-dev.off();
-
-# Start PDF device driver.
-dev.new(width=20, height=30);
-file_path = get_file_path("filter_stats.pdf");
-pdf(file=file_path, width=20, height=30, bg="white");
-# Different clustering methods for tie breakers used by
-# mlg.filter, default is farthest neighbor.
-gclo_filtered <- filter_stats(gclo, distance=xdis, plot=TRUE);
-# Turn off device driver to flush output.
 dev.off();
 
 # Create table of MLGs.
 id <- mlg.id(gclo);
 df <- data.frame(matrix((id), byrow=T));
+write.table(df, file=opt$output_mlg_id);
 
+# Rarifaction curve.
+H.obj <- mlg.table(gclo, plot=TRUE);
 # Start PDF device driver.
-dev.new(width=20, height=30);
-file_path = get_file_path("genotype_accumulation_curve.pdf");
-pdf(file=file_path, width=20, height=30, bg="white");
-# We can use the genotype_curve() function to create a
-# genotype accumulation curve to determine the minimum
-# number of loci to identify unique MLGs.
-gac <- genotype_curve(genind, sample=5, quiet=TRUE);
-# Turn off device driver to flush output.
-dev.off();
-
-# Start PDF device driver.
-dev.new(width=20, height=30);
-file_path = get_file_path("genotype_accumulation_curve_for_genind.pdf");
-pdf(file=file_path, width=20, height=30, bg="white");
-p <- last_plot();
-p + geom_smooth() + xlim(0, 100) + theme_bw();
+dev.new(width=10, height=7);
+file_path = get_file_path("geno_rarifaction_curve.pdf");
+pdf(file=file_path, width=10, height=7);
+rarecurve(H.obj, ylab="Number of expected MLGs", sample=min(rowSums(H.obj)), border=NA, fill=NA, font=2, cex=1, col="blue");
+dev.off()
 
-# From the collapsed MLGs, we can calculate genotypic
-# richness, diversity and eveness.
-kable(poppr(gclo));
-kable(diversity_ci(gclo, n=100L, raw=FALSE ));
-
-# Now we can correct the original data for clones using
-# clonecorrect. This step will reduce the dataset to
-# only have 1 representative genotype per multilocus
-# lineages (MLL).
-gclo_cor <- clonecorrect(gclo, strata=NA);
+# Create a phylogeny of samples based on distance matrices.
+cols <- palette(brewer.pal(n=12, name='Set3'));
+set.seed(999);
+# Start PDF device driver.
+dev.new(width=10, height=7);
+file_path = get_file_path("nj_phylogeny.pdf");
+pdf(file=file_path, width=10, height=7);
+# Organize branches by clade.
+tree <- gclo %>% aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% ladderize();
+plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4);
+# Add a scale bar showing 5% difference..
+add.scale.bar(length=0.05, cex=0.65);
+nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
+dev.off()
 
-# Lastly, we can use a discriminant analysis of principal
-# components to cluster genetically related individuals.
-# This multivariate statistical approach partions the
-# sample into a between-group and within- group component,
-# in an effort to maximize discrimination between groups.
-# Data is first transformed using a principal components
-# analysis (PCA) and subsequently clusters are identified
-# using discriminant analysis (DA).More information can be
-# found here.
-dapc.coral <- dapc(gclo_cor, var.contrib=TRUE, scale=FALSE, n.pca=62, n.da=nPop(gclo_cor)-1);
-scatter(dapc.coral, cell=0, pch=18:23, cstar=0, lwd=2, lty=2, legend=TRUE, cleg=0.75, clabel=TRUE, col=cols);
-# Turn off device driver to flush output.
-dev.off();
-
--- a/multilocus_genotype.xml	Thu Oct 25 13:33:47 2018 -0400
+++ b/multilocus_genotype.xml	Wed Nov 21 09:14:09 2018 -0500
@@ -1,26 +1,31 @@
 <tool id="multilocus_genotype" name="Multilocus genotype" version="1.0.0">
     <description>unique combination of alleles for loci</description>
     <requirements>
-        <requirement type="package" version="1.6.0">r-optparse</requirement>
-        <requirement type="package" version="1.8.0">r-vcfr</requirement>
-        <requirement type="package" version="2.8.1">r-poppr</requirement>
         <requirement type="package" version="2.1.1">r-adegenet</requirement>
         <requirement type="package" version="5.1">r-ape</requirement>
-        <requirement type="package" version="5.1">r-ape</requirement>
         <requirement type="package" version="3.0.0">r-ggplot2</requirement>
         <requirement type="package" version="1.20">r-knitr</requirement>
+        <requirement type="package" version="1.6.0">r-optparse</requirement>
+        <requirement type="package" version="2.8.1">r-poppr</requirement>
+        <requirement type="package" version="1.1.2">r-rcolorbrewer</requirement>
+        <requirement type="package" version="2.5_3">r-vegan</requirement>
+        <requirement type="package" version="1.8.0">r-vcfr</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
 #set output_plots_dir = 'output_plots_dir'
 mkdir $output_plots_dir &&
 Rscript '$__tool_directory__/multilocus_genotype.R'
 --input_vcf '$input_vcf'
---input_pop_info '$input_pop_info']]></command>
+--input_pop_info '$input_pop_info'
+--output_missing_data '$output_missing_data'
+--output_mlg_id '$output_mlg_id']]></command>
     <inputs>
         <param name="input_vcf" type="data" format="vcf" label="VCF file" />
         <param name="input_pop_info" type="data" format="txt" label="Population information file" />
     </inputs>
     <outputs>
+        <data name="output_mlg_id" format="txt"/>
+        <data name="output_missing_data" format="txt"/>
         <collection name="output_plot_collection" type="list" label="${tool.name} (plots), on ${on_string}">
             <discover_datasets pattern="__name__" directory="output_plots_dir" format="pdf"/>
         </collection>