comparison multilocus_genotype.R @ 4:a7cce4091e80 draft

Uploaded
author greg
date Wed, 21 Nov 2018 09:14:09 -0500
parents 1bc815d9c8c5
children 18001e7cb199
comparison
equal deleted inserted replaced
3:1bc815d9c8c5 4:a7cce4091e80
1 #!/usr/bin/env Rscript 1 #!/usr/bin/env Rscript
2 2
3 suppressPackageStartupMessages(library("optparse"))
4 suppressPackageStartupMessages(library("vcfR"))
5 suppressPackageStartupMessages(library("poppr"))
6 suppressPackageStartupMessages(library("adegenet")) 3 suppressPackageStartupMessages(library("adegenet"))
7 suppressPackageStartupMessages(library("ape")) 4 suppressPackageStartupMessages(library("ape"))
8 suppressPackageStartupMessages(library("ggplot2")) 5 suppressPackageStartupMessages(library("ggplot2"))
9 suppressPackageStartupMessages(library("knitr")) 6 suppressPackageStartupMessages(library("knitr"))
7 suppressPackageStartupMessages(library("optparse"))
8 suppressPackageStartupMessages(library("poppr"))
9 suppressPackageStartupMessages(library("RColorBrewer"))
10 suppressPackageStartupMessages(library("vcfR"))
11 suppressPackageStartupMessages(library("vegan"))
10 12
11 option_list <- list( 13 option_list <- list(
12 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), 14 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
13 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file") 15 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
16 make_option(c("--output_missing_data"), action="store", dest="output_missing_data", help="Missing data outputfile"),
17 make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data outputfile")
14 ) 18 )
15 19
16 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); 20 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
17 args <- parse_args(parser, positional_arguments=TRUE); 21 args <- parse_args(parser, positional_arguments=TRUE);
18 opt <- args$options; 22 opt <- args$options;
28 } 32 }
29 33
30 # Read in VCF input file. 34 # Read in VCF input file.
31 vcf <- read.vcfR(opt$input_vcf); 35 vcf <- read.vcfR(opt$input_vcf);
32 36
37 #Missing GT in samples submitted
38 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
39 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
40 myMiss <- (myMiss / nrow(vcf)) * 100;
41 miss <- data.frame(myMiss);
42 write.table(miss, file=opt$output_missing_data, quote=FALSE);
43
33 # Convert VCF file into formats compatiable with the Poppr package. 44 # Convert VCF file into formats compatiable with the Poppr package.
34 genind <- vcfR2genind(vcf); 45 genind <- vcfR2genind(vcf);
35 # Add population information to the genind object. 46 # Add population information to the genind object.
36 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA")); 47 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA"));
37 genind@pop <- as.factor(poptab$region); 48 genind@pop <- as.factor(poptab$region);
38 # Convert genind to genclone object 49 # Convert genind to genclone object
39 gclo <- as.genclone(genind); 50 gclo <- as.genclone(genind);
40 # Calculate the bitwise distance between individuals, 51 # Calculate the bitwise distance between individuals,
41 # the following is similar to Provesti's distance. 52 # the following is similar to Provesti's distance.
42 xdis <- bitwise.dist(gclo); 53 xdis <- bitwise.dist(gclo, missing_match=FALSE);
43 # All alleles must match to make a unique multilocus
44 # genotype (“original” naive approach). This is the
45 # default behavior of poppr.
46 mll(gclo) <- "original";
47 # The method above does not take the genetic distance
48 # into account, but we can use this matrix to collapse
49 # MLGs that are under a specified distance threshold.
50 # To determine the distance threshold, we will generate
51 # a neighbor-joining tree for all samples.
52 54
55 # Multilocus genotypes (threshold of 1.6%).
56 mlg.filter(gclo, distance=xdis) <- 0.016;
53 # Start PDF device driver. 57 # Start PDF device driver.
54 dev.new(width=20, height=30); 58 dev.new(width=10, height=7);
55 file_path = get_file_path("phylogeny_tree.pdf"); 59 file_path = get_file_path("mlg_table.pdf");
56 pdf(file=file_path, width=20, height=30, bg="white"); 60 pdf(file=file_path, width=10, height=7);
57 # Create a phylogeny of samples based on distance matrices
58 # colors.
59 cols <- c("skyblue2","#C38D9E", '#E8A87C',"darkcyan","#e27d60");
60 set.seed(999);
61
62 theTree <- gclo %>%
63 aboot(dist=provesti.dist, sample=50, tree="nj", cutoff=50, quiet=TRUE) %>%
64 # Organize branches by clade.
65 ladderize();
66 plot.phylo(theTree, tip.color=cols[gclo$pop], label.offset=0.0125, cex=0.7, font=2, lwd=4);
67 # Add a scale bar showing 5% difference.
68 add.scale.bar(length=0.05, cex=0.65);
69 nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
70 # Turn off device driver to flush output.
71 dev.off();
72
73 # Start PDF device driver.
74 dev.new(width=20, height=30);
75 file_path = get_file_path("dissimiliarity_distance_matrix.pdf");
76 pdf(file=file_path, width=20, height=30, bg="white");
77 # Use of mlg.filter() will create a dissimiliarity distance
78 # matrix from the data and then filter based off of that
79 # matrix. Here we will use the bitwise distance matrix
80 # calculated above.
81
82 # Multilocus genotypes (threshold of 1%).
83 mlg.filter(gclo, distance= xdis) <- 0.01;
84 m <- mlg.table(gclo, background=TRUE, color=TRUE); 61 m <- mlg.table(gclo, background=TRUE, color=TRUE);
85 # Turn off device driver to flush output.
86 dev.off();
87
88 # Start PDF device driver.
89 dev.new(width=20, height=30);
90 file_path = get_file_path("filter_stats.pdf");
91 pdf(file=file_path, width=20, height=30, bg="white");
92 # Different clustering methods for tie breakers used by
93 # mlg.filter, default is farthest neighbor.
94 gclo_filtered <- filter_stats(gclo, distance=xdis, plot=TRUE);
95 # Turn off device driver to flush output.
96 dev.off(); 62 dev.off();
97 63
98 # Create table of MLGs. 64 # Create table of MLGs.
99 id <- mlg.id(gclo); 65 id <- mlg.id(gclo);
100 df <- data.frame(matrix((id), byrow=T)); 66 df <- data.frame(matrix((id), byrow=T));
67 write.table(df, file=opt$output_mlg_id);
101 68
69 # Rarifaction curve.
70 H.obj <- mlg.table(gclo, plot=TRUE);
102 # Start PDF device driver. 71 # Start PDF device driver.
103 dev.new(width=20, height=30); 72 dev.new(width=10, height=7);
104 file_path = get_file_path("genotype_accumulation_curve.pdf"); 73 file_path = get_file_path("geno_rarifaction_curve.pdf");
105 pdf(file=file_path, width=20, height=30, bg="white"); 74 pdf(file=file_path, width=10, height=7);
106 # We can use the genotype_curve() function to create a 75 rarecurve(H.obj, ylab="Number of expected MLGs", sample=min(rowSums(H.obj)), border=NA, fill=NA, font=2, cex=1, col="blue");
107 # genotype accumulation curve to determine the minimum 76 dev.off()
108 # number of loci to identify unique MLGs.
109 gac <- genotype_curve(genind, sample=5, quiet=TRUE);
110 # Turn off device driver to flush output.
111 dev.off();
112 77
78 # Create a phylogeny of samples based on distance matrices.
79 cols <- palette(brewer.pal(n=12, name='Set3'));
80 set.seed(999);
113 # Start PDF device driver. 81 # Start PDF device driver.
114 dev.new(width=20, height=30); 82 dev.new(width=10, height=7);
115 file_path = get_file_path("genotype_accumulation_curve_for_genind.pdf"); 83 file_path = get_file_path("nj_phylogeny.pdf");
116 pdf(file=file_path, width=20, height=30, bg="white"); 84 pdf(file=file_path, width=10, height=7);
117 p <- last_plot(); 85 # Organize branches by clade.
118 p + geom_smooth() + xlim(0, 100) + theme_bw(); 86 tree <- gclo %>% aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% ladderize();
87 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4);
88 # Add a scale bar showing 5% difference..
89 add.scale.bar(length=0.05, cex=0.65);
90 nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
91 dev.off()
119 92
120 # From the collapsed MLGs, we can calculate genotypic
121 # richness, diversity and eveness.
122 kable(poppr(gclo));
123 kable(diversity_ci(gclo, n=100L, raw=FALSE ));
124
125 # Now we can correct the original data for clones using
126 # clonecorrect. This step will reduce the dataset to
127 # only have 1 representative genotype per multilocus
128 # lineages (MLL).
129 gclo_cor <- clonecorrect(gclo, strata=NA);
130
131 # Lastly, we can use a discriminant analysis of principal
132 # components to cluster genetically related individuals.
133 # This multivariate statistical approach partions the
134 # sample into a between-group and within- group component,
135 # in an effort to maximize discrimination between groups.
136 # Data is first transformed using a principal components
137 # analysis (PCA) and subsequently clusters are identified
138 # using discriminant analysis (DA).More information can be
139 # found here.
140 dapc.coral <- dapc(gclo_cor, var.contrib=TRUE, scale=FALSE, n.pca=62, n.da=nPop(gclo_cor)-1);
141 scatter(dapc.coral, cell=0, pch=18:23, cstar=0, lwd=2, lty=2, legend=TRUE, cleg=0.75, clabel=TRUE, col=cols);
142 # Turn off device driver to flush output.
143 dev.off();
144