Mercurial > repos > greg > multilocus_genotype
comparison multilocus_genotype.R @ 0:725b160c91f0 draft
Uploaded
author | greg |
---|---|
date | Thu, 25 Oct 2018 10:50:53 -0400 |
parents | |
children | 86aaadf36a4f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:725b160c91f0 |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 suppressPackageStartupMessages(library("optparse")) | |
4 suppressPackageStartupMessages(library("vcfR")) | |
5 suppressPackageStartupMessages(library("poppr")) | |
6 suppressPackageStartupMessages(library("adegenet")) | |
7 suppressPackageStartupMessages(library("ape")) | |
8 suppressPackageStartupMessages(library("ggplot2")) | |
9 suppressPackageStartupMessages(library("knitr")) | |
10 | |
11 option_list <- list( | |
12 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file") | |
13 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file") | |
14 ) | |
15 | |
16 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); | |
17 args <- parse_args(parser, positional_arguments=TRUE); | |
18 opt <- args$options; | |
19 | |
20 get_file_path = function(file_name) { | |
21 file_path = paste("output_plots_dir", file_name, sep="/"); | |
22 return(file_path); | |
23 } | |
24 | |
25 #extract Provesti's distance from the distance matrix | |
26 provesti_distance <- function(distance, selection) { | |
27 eval(parse(text=paste("as.matrix(distance)[", selection, "]"))); | |
28 } | |
29 | |
30 #Read in VCF input file. | |
31 vcf <- read.vcfR(opts$input_vcf); | |
32 | |
33 # Convert VCF file into formats compatiable with the Poppr package. | |
34 gind <- vcfR2genind(vcf); | |
35 # Add population information to the genind object. | |
36 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA")); | |
37 gind@pop <- as.factor(poptab$region); | |
38 # Convert genind to genclone object | |
39 gclo <- as.genclone(gind); | |
40 # Calculate the bitwise distance between individuals, | |
41 # the following is similar to Provesti's distance. | |
42 xdis <- bitwise.dist(gclo); | |
43 # All alleles must match to make a unique multilocus | |
44 # genotype (“original” naive approach). This is the | |
45 # default behavior of poppr. | |
46 mll(gclo) <- "original"; | |
47 # The method above does not take the genetic distance | |
48 # into account, but we can use this matrix to collapse | |
49 # MLGs that are under a specified distance threshold. | |
50 # To determine the distance threshold, we will generate | |
51 # a neighbor-joining tree for all samples. | |
52 | |
53 # Start PDF device driver. | |
54 dev.new(width=20, height=30); | |
55 file_path = get_file_path("phylogeny_tree.pdf") | |
56 pdf(file=file_path, width=20, height=30, bg="white"); | |
57 # Create a phylogeny of samples based on distance matrices | |
58 # colors. | |
59 cols <- c("skyblue2","#C38D9E", '#E8A87C',"darkcyan","#e27d60"); | |
60 set.seed(999); | |
61 | |
62 theTree <- gclo %>% | |
63 aboot(dist=provesti.dist, sample=50, tree="nj", cutoff=50, quiet=TRUE) %>% | |
64 # Organize branches by clade. | |
65 ladderize(); | |
66 plot.phylo(theTree, tip.color=cols[gclo$pop], label.offset=0.0125, cex=0.7, font=2, lwd=4); | |
67 # Add a scale bar showing 5% difference. | |
68 add.scale.bar(length=0.05, cex=0.65); | |
69 nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); | |
70 # Turn off device driver to flush output. | |
71 dev.off(); | |
72 | |
73 # Start PDF device driver. | |
74 dev.new(width=20, height=30); | |
75 file_path = get_file_path("dissimiliarity_distance_matrix.pdf") | |
76 pdf(file=file_path, width=20, height=30, bg="white"); | |
77 # Use of mlg.filter() will create a dissimiliarity distance | |
78 # matrix from the data and then filter based off of that | |
79 # matrix. Here we will use the bitwise distance matrix | |
80 # calculated above. | |
81 | |
82 # Multilocus genotypes (threshold of 1%). | |
83 mlg.filter(gclo, distance= xdis) <- 0.01; | |
84 m <- mlg.table(gclo, background=TRUE, color=TRUE); | |
85 # Turn off device driver to flush output. | |
86 dev.off(); | |
87 | |
88 # Start PDF device driver. | |
89 dev.new(width=20, height=30); | |
90 file_path = get_file_path("filter_stats.pdf") | |
91 pdf(file=file_path, width=20, height=30, bg="white"); | |
92 # Different clustering methods for tie breakers used by | |
93 # mlg.filter, default is farthest neighbor. | |
94 gclo_filtered <- filter_stats(gclo, distance=xdis, plot=TRUE); | |
95 # Turn off device driver to flush output. | |
96 dev.off(); | |
97 | |
98 # Create table of MLGs. | |
99 id <- mlg.id(gclo); | |
100 df <- data.frame(matrix((id), byrow=T)); | |
101 | |
102 # Start PDF device driver. | |
103 dev.new(width=20, height=30); | |
104 file_path = get_file_path("genotype_accumulation_curve.pdf") | |
105 pdf(file=file_path, width=20, height=30, bg="white"); | |
106 # We can use the genotype_curve() function to create a | |
107 # genotype accumulation curve to determine the minimum | |
108 # number of loci to identify unique MLGs. | |
109 gac <- genotype_curve(gind, sample=5, quiet=TRUE); | |
110 # Turn off device driver to flush output. | |
111 dev.off(); | |
112 | |
113 # Start PDF device driver. | |
114 dev.new(width=20, height=30); | |
115 file_path = get_file_path("genotype_accumulation_curve_for_gind.pdf") | |
116 pdf(file=file_path, width=20, height=30, bg="white"); | |
117 p <- last_plot(); | |
118 p + geom_smooth() + xlim(0, 100) + theme_bw(); | |
119 | |
120 # From the collapsed MLGs, we can calculate genotypic | |
121 # richness, diversity and eveness. | |
122 kable(poppr(gclo)); | |
123 kable(diversity_ci(gclo, n=100L, raw=FALSE )); | |
124 | |
125 # Now we can correct the original data for clones using | |
126 # clonecorrect. This step will reduce the dataset to | |
127 # only have 1 representative genotype per multilocus | |
128 # lineages (MLL). | |
129 gclo_cor <- clonecorrect(gclo, strata=NA); | |
130 | |
131 # Lastly, we can use a discriminant analysis of principal | |
132 # components to cluster genetically related individuals. | |
133 # This multivariate statistical approach partions the | |
134 # sample into a between-group and within- group component, | |
135 # in an effort to maximize discrimination between groups. | |
136 # Data is first transformed using a principal components | |
137 # analysis (PCA) and subsequently clusters are identified | |
138 # using discriminant analysis (DA).More information can be | |
139 # found here. | |
140 dapc.coral <- dapc(gclo_cor, var.contrib=TRUE, scale=FALSE, n.pca=62, n.da=nPop(gclo_cor)-1); | |
141 scatter(dapc.coral, cell=0, pch=18:23, cstar=0, lwd=2, lty=2, legend=TRUE, cleg=0.75, clabel=TRUE, col=cols); | |
142 # Turn off device driver to flush output. | |
143 dev.off(); | |
144 |