Mercurial > repos > greg > multilocus_genotype
comparison multilocus_genotype.R @ 4:a7cce4091e80 draft
Uploaded
| author | greg |
|---|---|
| date | Wed, 21 Nov 2018 09:14:09 -0500 |
| parents | 1bc815d9c8c5 |
| children | 18001e7cb199 |
comparison
equal
deleted
inserted
replaced
| 3:1bc815d9c8c5 | 4:a7cce4091e80 |
|---|---|
| 1 #!/usr/bin/env Rscript | 1 #!/usr/bin/env Rscript |
| 2 | 2 |
| 3 suppressPackageStartupMessages(library("optparse")) | |
| 4 suppressPackageStartupMessages(library("vcfR")) | |
| 5 suppressPackageStartupMessages(library("poppr")) | |
| 6 suppressPackageStartupMessages(library("adegenet")) | 3 suppressPackageStartupMessages(library("adegenet")) |
| 7 suppressPackageStartupMessages(library("ape")) | 4 suppressPackageStartupMessages(library("ape")) |
| 8 suppressPackageStartupMessages(library("ggplot2")) | 5 suppressPackageStartupMessages(library("ggplot2")) |
| 9 suppressPackageStartupMessages(library("knitr")) | 6 suppressPackageStartupMessages(library("knitr")) |
| 7 suppressPackageStartupMessages(library("optparse")) | |
| 8 suppressPackageStartupMessages(library("poppr")) | |
| 9 suppressPackageStartupMessages(library("RColorBrewer")) | |
| 10 suppressPackageStartupMessages(library("vcfR")) | |
| 11 suppressPackageStartupMessages(library("vegan")) | |
| 10 | 12 |
| 11 option_list <- list( | 13 option_list <- list( |
| 12 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), | 14 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), |
| 13 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file") | 15 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), |
| 16 make_option(c("--output_missing_data"), action="store", dest="output_missing_data", help="Missing data outputfile"), | |
| 17 make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data outputfile") | |
| 14 ) | 18 ) |
| 15 | 19 |
| 16 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); | 20 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); |
| 17 args <- parse_args(parser, positional_arguments=TRUE); | 21 args <- parse_args(parser, positional_arguments=TRUE); |
| 18 opt <- args$options; | 22 opt <- args$options; |
| 28 } | 32 } |
| 29 | 33 |
| 30 # Read in VCF input file. | 34 # Read in VCF input file. |
| 31 vcf <- read.vcfR(opt$input_vcf); | 35 vcf <- read.vcfR(opt$input_vcf); |
| 32 | 36 |
| 37 #Missing GT in samples submitted | |
| 38 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); | |
| 39 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); | |
| 40 myMiss <- (myMiss / nrow(vcf)) * 100; | |
| 41 miss <- data.frame(myMiss); | |
| 42 write.table(miss, file=opt$output_missing_data, quote=FALSE); | |
| 43 | |
| 33 # Convert VCF file into formats compatiable with the Poppr package. | 44 # Convert VCF file into formats compatiable with the Poppr package. |
| 34 genind <- vcfR2genind(vcf); | 45 genind <- vcfR2genind(vcf); |
| 35 # Add population information to the genind object. | 46 # Add population information to the genind object. |
| 36 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA")); | 47 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA")); |
| 37 genind@pop <- as.factor(poptab$region); | 48 genind@pop <- as.factor(poptab$region); |
| 38 # Convert genind to genclone object | 49 # Convert genind to genclone object |
| 39 gclo <- as.genclone(genind); | 50 gclo <- as.genclone(genind); |
| 40 # Calculate the bitwise distance between individuals, | 51 # Calculate the bitwise distance between individuals, |
| 41 # the following is similar to Provesti's distance. | 52 # the following is similar to Provesti's distance. |
| 42 xdis <- bitwise.dist(gclo); | 53 xdis <- bitwise.dist(gclo, missing_match=FALSE); |
| 43 # All alleles must match to make a unique multilocus | |
| 44 # genotype (“original” naive approach). This is the | |
| 45 # default behavior of poppr. | |
| 46 mll(gclo) <- "original"; | |
| 47 # The method above does not take the genetic distance | |
| 48 # into account, but we can use this matrix to collapse | |
| 49 # MLGs that are under a specified distance threshold. | |
| 50 # To determine the distance threshold, we will generate | |
| 51 # a neighbor-joining tree for all samples. | |
| 52 | 54 |
| 55 # Multilocus genotypes (threshold of 1.6%). | |
| 56 mlg.filter(gclo, distance=xdis) <- 0.016; | |
| 53 # Start PDF device driver. | 57 # Start PDF device driver. |
| 54 dev.new(width=20, height=30); | 58 dev.new(width=10, height=7); |
| 55 file_path = get_file_path("phylogeny_tree.pdf"); | 59 file_path = get_file_path("mlg_table.pdf"); |
| 56 pdf(file=file_path, width=20, height=30, bg="white"); | 60 pdf(file=file_path, width=10, height=7); |
| 57 # Create a phylogeny of samples based on distance matrices | |
| 58 # colors. | |
| 59 cols <- c("skyblue2","#C38D9E", '#E8A87C',"darkcyan","#e27d60"); | |
| 60 set.seed(999); | |
| 61 | |
| 62 theTree <- gclo %>% | |
| 63 aboot(dist=provesti.dist, sample=50, tree="nj", cutoff=50, quiet=TRUE) %>% | |
| 64 # Organize branches by clade. | |
| 65 ladderize(); | |
| 66 plot.phylo(theTree, tip.color=cols[gclo$pop], label.offset=0.0125, cex=0.7, font=2, lwd=4); | |
| 67 # Add a scale bar showing 5% difference. | |
| 68 add.scale.bar(length=0.05, cex=0.65); | |
| 69 nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); | |
| 70 # Turn off device driver to flush output. | |
| 71 dev.off(); | |
| 72 | |
| 73 # Start PDF device driver. | |
| 74 dev.new(width=20, height=30); | |
| 75 file_path = get_file_path("dissimiliarity_distance_matrix.pdf"); | |
| 76 pdf(file=file_path, width=20, height=30, bg="white"); | |
| 77 # Use of mlg.filter() will create a dissimiliarity distance | |
| 78 # matrix from the data and then filter based off of that | |
| 79 # matrix. Here we will use the bitwise distance matrix | |
| 80 # calculated above. | |
| 81 | |
| 82 # Multilocus genotypes (threshold of 1%). | |
| 83 mlg.filter(gclo, distance= xdis) <- 0.01; | |
| 84 m <- mlg.table(gclo, background=TRUE, color=TRUE); | 61 m <- mlg.table(gclo, background=TRUE, color=TRUE); |
| 85 # Turn off device driver to flush output. | |
| 86 dev.off(); | |
| 87 | |
| 88 # Start PDF device driver. | |
| 89 dev.new(width=20, height=30); | |
| 90 file_path = get_file_path("filter_stats.pdf"); | |
| 91 pdf(file=file_path, width=20, height=30, bg="white"); | |
| 92 # Different clustering methods for tie breakers used by | |
| 93 # mlg.filter, default is farthest neighbor. | |
| 94 gclo_filtered <- filter_stats(gclo, distance=xdis, plot=TRUE); | |
| 95 # Turn off device driver to flush output. | |
| 96 dev.off(); | 62 dev.off(); |
| 97 | 63 |
| 98 # Create table of MLGs. | 64 # Create table of MLGs. |
| 99 id <- mlg.id(gclo); | 65 id <- mlg.id(gclo); |
| 100 df <- data.frame(matrix((id), byrow=T)); | 66 df <- data.frame(matrix((id), byrow=T)); |
| 67 write.table(df, file=opt$output_mlg_id); | |
| 101 | 68 |
| 69 # Rarifaction curve. | |
| 70 H.obj <- mlg.table(gclo, plot=TRUE); | |
| 102 # Start PDF device driver. | 71 # Start PDF device driver. |
| 103 dev.new(width=20, height=30); | 72 dev.new(width=10, height=7); |
| 104 file_path = get_file_path("genotype_accumulation_curve.pdf"); | 73 file_path = get_file_path("geno_rarifaction_curve.pdf"); |
| 105 pdf(file=file_path, width=20, height=30, bg="white"); | 74 pdf(file=file_path, width=10, height=7); |
| 106 # We can use the genotype_curve() function to create a | 75 rarecurve(H.obj, ylab="Number of expected MLGs", sample=min(rowSums(H.obj)), border=NA, fill=NA, font=2, cex=1, col="blue"); |
| 107 # genotype accumulation curve to determine the minimum | 76 dev.off() |
| 108 # number of loci to identify unique MLGs. | |
| 109 gac <- genotype_curve(genind, sample=5, quiet=TRUE); | |
| 110 # Turn off device driver to flush output. | |
| 111 dev.off(); | |
| 112 | 77 |
| 78 # Create a phylogeny of samples based on distance matrices. | |
| 79 cols <- palette(brewer.pal(n=12, name='Set3')); | |
| 80 set.seed(999); | |
| 113 # Start PDF device driver. | 81 # Start PDF device driver. |
| 114 dev.new(width=20, height=30); | 82 dev.new(width=10, height=7); |
| 115 file_path = get_file_path("genotype_accumulation_curve_for_genind.pdf"); | 83 file_path = get_file_path("nj_phylogeny.pdf"); |
| 116 pdf(file=file_path, width=20, height=30, bg="white"); | 84 pdf(file=file_path, width=10, height=7); |
| 117 p <- last_plot(); | 85 # Organize branches by clade. |
| 118 p + geom_smooth() + xlim(0, 100) + theme_bw(); | 86 tree <- gclo %>% aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% ladderize(); |
| 87 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); | |
| 88 # Add a scale bar showing 5% difference.. | |
| 89 add.scale.bar(length=0.05, cex=0.65); | |
| 90 nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); | |
| 91 dev.off() | |
| 119 | 92 |
| 120 # From the collapsed MLGs, we can calculate genotypic | |
| 121 # richness, diversity and eveness. | |
| 122 kable(poppr(gclo)); | |
| 123 kable(diversity_ci(gclo, n=100L, raw=FALSE )); | |
| 124 | |
| 125 # Now we can correct the original data for clones using | |
| 126 # clonecorrect. This step will reduce the dataset to | |
| 127 # only have 1 representative genotype per multilocus | |
| 128 # lineages (MLL). | |
| 129 gclo_cor <- clonecorrect(gclo, strata=NA); | |
| 130 | |
| 131 # Lastly, we can use a discriminant analysis of principal | |
| 132 # components to cluster genetically related individuals. | |
| 133 # This multivariate statistical approach partions the | |
| 134 # sample into a between-group and within- group component, | |
| 135 # in an effort to maximize discrimination between groups. | |
| 136 # Data is first transformed using a principal components | |
| 137 # analysis (PCA) and subsequently clusters are identified | |
| 138 # using discriminant analysis (DA).More information can be | |
| 139 # found here. | |
| 140 dapc.coral <- dapc(gclo_cor, var.contrib=TRUE, scale=FALSE, n.pca=62, n.da=nPop(gclo_cor)-1); | |
| 141 scatter(dapc.coral, cell=0, pch=18:23, cstar=0, lwd=2, lty=2, legend=TRUE, cleg=0.75, clabel=TRUE, col=cols); | |
| 142 # Turn off device driver to flush output. | |
| 143 dev.off(); | |
| 144 |
