annotate multilocus_genotype.R @ 3:1bc815d9c8c5 draft

Uploaded
author greg
date Thu, 25 Oct 2018 13:33:47 -0400
parents 86aaadf36a4f
children a7cce4091e80
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env Rscript
725b160c91f0 Uploaded
greg
parents:
diff changeset
2
725b160c91f0 Uploaded
greg
parents:
diff changeset
3 suppressPackageStartupMessages(library("optparse"))
725b160c91f0 Uploaded
greg
parents:
diff changeset
4 suppressPackageStartupMessages(library("vcfR"))
725b160c91f0 Uploaded
greg
parents:
diff changeset
5 suppressPackageStartupMessages(library("poppr"))
725b160c91f0 Uploaded
greg
parents:
diff changeset
6 suppressPackageStartupMessages(library("adegenet"))
725b160c91f0 Uploaded
greg
parents:
diff changeset
7 suppressPackageStartupMessages(library("ape"))
725b160c91f0 Uploaded
greg
parents:
diff changeset
8 suppressPackageStartupMessages(library("ggplot2"))
725b160c91f0 Uploaded
greg
parents:
diff changeset
9 suppressPackageStartupMessages(library("knitr"))
725b160c91f0 Uploaded
greg
parents:
diff changeset
10
725b160c91f0 Uploaded
greg
parents:
diff changeset
11 option_list <- list(
2
86aaadf36a4f Uploaded
greg
parents: 0
diff changeset
12 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
13 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file")
725b160c91f0 Uploaded
greg
parents:
diff changeset
14 )
725b160c91f0 Uploaded
greg
parents:
diff changeset
15
725b160c91f0 Uploaded
greg
parents:
diff changeset
16 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
725b160c91f0 Uploaded
greg
parents:
diff changeset
17 args <- parse_args(parser, positional_arguments=TRUE);
725b160c91f0 Uploaded
greg
parents:
diff changeset
18 opt <- args$options;
725b160c91f0 Uploaded
greg
parents:
diff changeset
19
725b160c91f0 Uploaded
greg
parents:
diff changeset
20 get_file_path = function(file_name) {
725b160c91f0 Uploaded
greg
parents:
diff changeset
21 file_path = paste("output_plots_dir", file_name, sep="/");
725b160c91f0 Uploaded
greg
parents:
diff changeset
22 return(file_path);
725b160c91f0 Uploaded
greg
parents:
diff changeset
23 }
725b160c91f0 Uploaded
greg
parents:
diff changeset
24
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
25 # Extract Provesti's distance from the distance matrix.
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
26 provesti_distance <- function(distance, selection) {
725b160c91f0 Uploaded
greg
parents:
diff changeset
27 eval(parse(text=paste("as.matrix(distance)[", selection, "]")));
725b160c91f0 Uploaded
greg
parents:
diff changeset
28 }
725b160c91f0 Uploaded
greg
parents:
diff changeset
29
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
30 # Read in VCF input file.
2
86aaadf36a4f Uploaded
greg
parents: 0
diff changeset
31 vcf <- read.vcfR(opt$input_vcf);
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
32
725b160c91f0 Uploaded
greg
parents:
diff changeset
33 # Convert VCF file into formats compatiable with the Poppr package.
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
34 genind <- vcfR2genind(vcf);
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
35 # Add population information to the genind object.
725b160c91f0 Uploaded
greg
parents:
diff changeset
36 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA"));
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
37 genind@pop <- as.factor(poptab$region);
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
38 # Convert genind to genclone object
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
39 gclo <- as.genclone(genind);
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
40 # Calculate the bitwise distance between individuals,
725b160c91f0 Uploaded
greg
parents:
diff changeset
41 # the following is similar to Provesti's distance.
725b160c91f0 Uploaded
greg
parents:
diff changeset
42 xdis <- bitwise.dist(gclo);
725b160c91f0 Uploaded
greg
parents:
diff changeset
43 # All alleles must match to make a unique multilocus
725b160c91f0 Uploaded
greg
parents:
diff changeset
44 # genotype (“original” naive approach). This is the
725b160c91f0 Uploaded
greg
parents:
diff changeset
45 # default behavior of poppr.
725b160c91f0 Uploaded
greg
parents:
diff changeset
46 mll(gclo) <- "original";
725b160c91f0 Uploaded
greg
parents:
diff changeset
47 # The method above does not take the genetic distance
725b160c91f0 Uploaded
greg
parents:
diff changeset
48 # into account, but we can use this matrix to collapse
725b160c91f0 Uploaded
greg
parents:
diff changeset
49 # MLGs that are under a specified distance threshold.
725b160c91f0 Uploaded
greg
parents:
diff changeset
50 # To determine the distance threshold, we will generate
725b160c91f0 Uploaded
greg
parents:
diff changeset
51 # a neighbor-joining tree for all samples.
725b160c91f0 Uploaded
greg
parents:
diff changeset
52
725b160c91f0 Uploaded
greg
parents:
diff changeset
53 # Start PDF device driver.
725b160c91f0 Uploaded
greg
parents:
diff changeset
54 dev.new(width=20, height=30);
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
55 file_path = get_file_path("phylogeny_tree.pdf");
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
56 pdf(file=file_path, width=20, height=30, bg="white");
725b160c91f0 Uploaded
greg
parents:
diff changeset
57 # Create a phylogeny of samples based on distance matrices
725b160c91f0 Uploaded
greg
parents:
diff changeset
58 # colors.
725b160c91f0 Uploaded
greg
parents:
diff changeset
59 cols <- c("skyblue2","#C38D9E", '#E8A87C',"darkcyan","#e27d60");
725b160c91f0 Uploaded
greg
parents:
diff changeset
60 set.seed(999);
725b160c91f0 Uploaded
greg
parents:
diff changeset
61
725b160c91f0 Uploaded
greg
parents:
diff changeset
62 theTree <- gclo %>%
725b160c91f0 Uploaded
greg
parents:
diff changeset
63 aboot(dist=provesti.dist, sample=50, tree="nj", cutoff=50, quiet=TRUE) %>%
725b160c91f0 Uploaded
greg
parents:
diff changeset
64 # Organize branches by clade.
725b160c91f0 Uploaded
greg
parents:
diff changeset
65 ladderize();
725b160c91f0 Uploaded
greg
parents:
diff changeset
66 plot.phylo(theTree, tip.color=cols[gclo$pop], label.offset=0.0125, cex=0.7, font=2, lwd=4);
725b160c91f0 Uploaded
greg
parents:
diff changeset
67 # Add a scale bar showing 5% difference.
725b160c91f0 Uploaded
greg
parents:
diff changeset
68 add.scale.bar(length=0.05, cex=0.65);
725b160c91f0 Uploaded
greg
parents:
diff changeset
69 nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
725b160c91f0 Uploaded
greg
parents:
diff changeset
70 # Turn off device driver to flush output.
725b160c91f0 Uploaded
greg
parents:
diff changeset
71 dev.off();
725b160c91f0 Uploaded
greg
parents:
diff changeset
72
725b160c91f0 Uploaded
greg
parents:
diff changeset
73 # Start PDF device driver.
725b160c91f0 Uploaded
greg
parents:
diff changeset
74 dev.new(width=20, height=30);
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
75 file_path = get_file_path("dissimiliarity_distance_matrix.pdf");
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
76 pdf(file=file_path, width=20, height=30, bg="white");
725b160c91f0 Uploaded
greg
parents:
diff changeset
77 # Use of mlg.filter() will create a dissimiliarity distance
725b160c91f0 Uploaded
greg
parents:
diff changeset
78 # matrix from the data and then filter based off of that
725b160c91f0 Uploaded
greg
parents:
diff changeset
79 # matrix. Here we will use the bitwise distance matrix
725b160c91f0 Uploaded
greg
parents:
diff changeset
80 # calculated above.
725b160c91f0 Uploaded
greg
parents:
diff changeset
81
725b160c91f0 Uploaded
greg
parents:
diff changeset
82 # Multilocus genotypes (threshold of 1%).
725b160c91f0 Uploaded
greg
parents:
diff changeset
83 mlg.filter(gclo, distance= xdis) <- 0.01;
725b160c91f0 Uploaded
greg
parents:
diff changeset
84 m <- mlg.table(gclo, background=TRUE, color=TRUE);
725b160c91f0 Uploaded
greg
parents:
diff changeset
85 # Turn off device driver to flush output.
725b160c91f0 Uploaded
greg
parents:
diff changeset
86 dev.off();
725b160c91f0 Uploaded
greg
parents:
diff changeset
87
725b160c91f0 Uploaded
greg
parents:
diff changeset
88 # Start PDF device driver.
725b160c91f0 Uploaded
greg
parents:
diff changeset
89 dev.new(width=20, height=30);
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
90 file_path = get_file_path("filter_stats.pdf");
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
91 pdf(file=file_path, width=20, height=30, bg="white");
725b160c91f0 Uploaded
greg
parents:
diff changeset
92 # Different clustering methods for tie breakers used by
725b160c91f0 Uploaded
greg
parents:
diff changeset
93 # mlg.filter, default is farthest neighbor.
725b160c91f0 Uploaded
greg
parents:
diff changeset
94 gclo_filtered <- filter_stats(gclo, distance=xdis, plot=TRUE);
725b160c91f0 Uploaded
greg
parents:
diff changeset
95 # Turn off device driver to flush output.
725b160c91f0 Uploaded
greg
parents:
diff changeset
96 dev.off();
725b160c91f0 Uploaded
greg
parents:
diff changeset
97
725b160c91f0 Uploaded
greg
parents:
diff changeset
98 # Create table of MLGs.
725b160c91f0 Uploaded
greg
parents:
diff changeset
99 id <- mlg.id(gclo);
725b160c91f0 Uploaded
greg
parents:
diff changeset
100 df <- data.frame(matrix((id), byrow=T));
725b160c91f0 Uploaded
greg
parents:
diff changeset
101
725b160c91f0 Uploaded
greg
parents:
diff changeset
102 # Start PDF device driver.
725b160c91f0 Uploaded
greg
parents:
diff changeset
103 dev.new(width=20, height=30);
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
104 file_path = get_file_path("genotype_accumulation_curve.pdf");
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
105 pdf(file=file_path, width=20, height=30, bg="white");
725b160c91f0 Uploaded
greg
parents:
diff changeset
106 # We can use the genotype_curve() function to create a
725b160c91f0 Uploaded
greg
parents:
diff changeset
107 # genotype accumulation curve to determine the minimum
725b160c91f0 Uploaded
greg
parents:
diff changeset
108 # number of loci to identify unique MLGs.
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
109 gac <- genotype_curve(genind, sample=5, quiet=TRUE);
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
110 # Turn off device driver to flush output.
725b160c91f0 Uploaded
greg
parents:
diff changeset
111 dev.off();
725b160c91f0 Uploaded
greg
parents:
diff changeset
112
725b160c91f0 Uploaded
greg
parents:
diff changeset
113 # Start PDF device driver.
725b160c91f0 Uploaded
greg
parents:
diff changeset
114 dev.new(width=20, height=30);
3
1bc815d9c8c5 Uploaded
greg
parents: 2
diff changeset
115 file_path = get_file_path("genotype_accumulation_curve_for_genind.pdf");
0
725b160c91f0 Uploaded
greg
parents:
diff changeset
116 pdf(file=file_path, width=20, height=30, bg="white");
725b160c91f0 Uploaded
greg
parents:
diff changeset
117 p <- last_plot();
725b160c91f0 Uploaded
greg
parents:
diff changeset
118 p + geom_smooth() + xlim(0, 100) + theme_bw();
725b160c91f0 Uploaded
greg
parents:
diff changeset
119
725b160c91f0 Uploaded
greg
parents:
diff changeset
120 # From the collapsed MLGs, we can calculate genotypic
725b160c91f0 Uploaded
greg
parents:
diff changeset
121 # richness, diversity and eveness.
725b160c91f0 Uploaded
greg
parents:
diff changeset
122 kable(poppr(gclo));
725b160c91f0 Uploaded
greg
parents:
diff changeset
123 kable(diversity_ci(gclo, n=100L, raw=FALSE ));
725b160c91f0 Uploaded
greg
parents:
diff changeset
124
725b160c91f0 Uploaded
greg
parents:
diff changeset
125 # Now we can correct the original data for clones using
725b160c91f0 Uploaded
greg
parents:
diff changeset
126 # clonecorrect. This step will reduce the dataset to
725b160c91f0 Uploaded
greg
parents:
diff changeset
127 # only have 1 representative genotype per multilocus
725b160c91f0 Uploaded
greg
parents:
diff changeset
128 # lineages (MLL).
725b160c91f0 Uploaded
greg
parents:
diff changeset
129 gclo_cor <- clonecorrect(gclo, strata=NA);
725b160c91f0 Uploaded
greg
parents:
diff changeset
130
725b160c91f0 Uploaded
greg
parents:
diff changeset
131 # Lastly, we can use a discriminant analysis of principal
725b160c91f0 Uploaded
greg
parents:
diff changeset
132 # components to cluster genetically related individuals.
725b160c91f0 Uploaded
greg
parents:
diff changeset
133 # This multivariate statistical approach partions the
725b160c91f0 Uploaded
greg
parents:
diff changeset
134 # sample into a between-group and within- group component,
725b160c91f0 Uploaded
greg
parents:
diff changeset
135 # in an effort to maximize discrimination between groups.
725b160c91f0 Uploaded
greg
parents:
diff changeset
136 # Data is first transformed using a principal components
725b160c91f0 Uploaded
greg
parents:
diff changeset
137 # analysis (PCA) and subsequently clusters are identified
725b160c91f0 Uploaded
greg
parents:
diff changeset
138 # using discriminant analysis (DA).More information can be
725b160c91f0 Uploaded
greg
parents:
diff changeset
139 # found here.
725b160c91f0 Uploaded
greg
parents:
diff changeset
140 dapc.coral <- dapc(gclo_cor, var.contrib=TRUE, scale=FALSE, n.pca=62, n.da=nPop(gclo_cor)-1);
725b160c91f0 Uploaded
greg
parents:
diff changeset
141 scatter(dapc.coral, cell=0, pch=18:23, cstar=0, lwd=2, lty=2, legend=TRUE, cleg=0.75, clabel=TRUE, col=cols);
725b160c91f0 Uploaded
greg
parents:
diff changeset
142 # Turn off device driver to flush output.
725b160c91f0 Uploaded
greg
parents:
diff changeset
143 dev.off();
725b160c91f0 Uploaded
greg
parents:
diff changeset
144