Mercurial > repos > greg > multilocus_genotype
changeset 17:85f8fc57eee4 draft
Uploaded
author | greg |
---|---|
date | Wed, 17 Apr 2019 09:07:04 -0400 |
parents | c4ec8727b50c |
children | 1190ee1456f6 |
files | multilocus_genotype.R |
diffstat | 1 files changed, 195 insertions(+), 29 deletions(-) [+] |
line wrap: on
line diff
--- a/multilocus_genotype.R Fri Feb 15 10:25:04 2019 -0500 +++ b/multilocus_genotype.R Wed Apr 17 09:07:04 2019 -0400 @@ -10,17 +10,24 @@ suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("poppr")) suppressPackageStartupMessages(library("RColorBrewer")) +suppressPackageStartupMessages(library("rnaturalearth")) +suppressPackageStartupMessages(library("rnaturalearthdata")) suppressPackageStartupMessages(library("RPostgres")) +suppressPackageStartupMessages(library("sf")) +suppressPackageStartupMessages(library(SNPRelate)) suppressPackageStartupMessages(library("tidyr")) suppressPackageStartupMessages(library("vcfR")) suppressPackageStartupMessages(library("vegan")) +suppressPackageStartupMessages(library("yarrr")) +theme_set(theme_bw()) option_list <- list( make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), - make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file") + make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file"), + make_option(c("--nj_tree"), action="store", dest="nj_tree", help="neighbor-joining tree output file") ) parser <- OptionParser(usage="%prog [options] file", option_list=option_list); @@ -57,13 +64,17 @@ # Convert VCF file into a genind for the Poppr package. # TODO: probably should not hard-code 2 cores. -gl <- vcfR2genlight(vcf, n.cores=2); -gind <- new("genind", (as.matrix(gl))); +# changed to genind format for extracting alleles later +# trade-off is it is a bit slower to import data +# gl <- vcfR2genlight(vcf, n.cores=2) +# gind <- new("genind", (as.matrix(gl))) +gind <- vcfR2genind(vcf); # Add population information to the genind object. poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); gind@pop <- as.factor(poptab$region); +strata(gind)<-data.frame(pop(gind)); # Convert genind object to a genclone object. obj2 <- as.genclone(gind); @@ -71,28 +82,30 @@ # Calculate the bitwise distance between individuals. xdis <- bitwise.dist(obj2); -# Multilocus genotypes (threshold of 16%). -mlg.filter(obj2, distance=xdis) <- 0.016; +# Multilocus genotypes (threshold of 3.2%). +# threshold doubled because of how the data is formatted in genind compared to genlight +mlg.filter(obj2, distance=xdis) <- 0.032; m <- mlg.table(obj2, background=TRUE, color=TRUE); # Create table of MLGs. id <- mlg.id(obj2); -dt <- data.table(id, keep.rownames=TRUE); -setnames(dt, c("id"), c("affy_id")); +#dt <- data.table(id, keep.rownames=TRUE); +#setnames(dt, c("id"), c("affy_id")); # Read user's Affymetrix 96 well plate tabular file. -pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t"); +pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA")); colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", "region", "latitude", "longitude", "geographic_origin", "sample_location", - "latitude_outplant", "longitude_outplant", "depth", "dist_shore", "disease_resist", + "latitude_outplant", "longitude_outplant", "depth", "disease_resist", "bleach_resist", "mortality","tle", "spawning", "collector_last_name", - "collector_first_name", "org", "collection_date", "contact_email", "seq_facility", + "collector_first_name", "organization", "collection_date", "email", "seq_facility", "array_version", "public", "public_after_date", "sperm_motility", "healing_time", - "dna_extraction_method", "dna_concentration"); + "dna_extraction_method", "dna_concentration", "registry_id"); pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); pinfo2 <- as.character(pinfo$user_specimen_id); -pi <- data.table(pinfo2); +pi <- data.table(pinfo2, pinfo$field_call); setnames(pi, c("pinfo2"), c("user_specimen_id")); +setnames(pi, c("V2"), c("field_call")); # Connect to database. conn <- get_database_connection(opt$database_connection_string); @@ -129,6 +142,7 @@ # Round missing data to two digits. mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); +#heterozygous alleles hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); hets <- (hets / nrow(vcf)) * 100; ht <- data.frame(hets); @@ -140,6 +154,7 @@ # Round missing data to two digits. ht$percent_mixed<-round(ht$percent_mixed, digits=2); +#reference alleles refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); refA <- (refA / nrow(vcf)) * 100; rA <- data.frame(refA); @@ -151,6 +166,7 @@ # round missing data to two digits. rA$percent_reference<-round(rA$percent_reference, digits=2); +#alternative alleles altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); altB <- (altB / nrow(vcf)) * 100; aB <- data.frame(altB); @@ -198,7 +214,7 @@ # List of new group ids, the sequence starts at the number of # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if # the df4 file contains all ids. If it doesn't then look below -# to change the seq() function. +# to change the seq() function. n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); # Pair group with new ids. rat <- cbind(unique(n.g), n.g_ids); @@ -232,26 +248,82 @@ by="affy_id") %>% mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% + mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 40 & percent_alternative_coral<= 44.5,"A.palmata","other")) %>% + mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral<= 50,"A.cervicornis",genetic_coral_species_call)) %>% + mutate(genetic_coral_species_call=ifelse(percent_heterozygous_coral > 40,"A.prolifera",genetic_coral_species_call)) %>% ungroup() %>% select(-group); write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE); -# Combine sample information for database. -report_db <- pinfo %>% - left_join(report_user %>% - select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", - "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", - "percent_alternative_coral"), - by="user_specimen_id"); +# Database tables +## Sample.table +sample_db <- pinfo %>% + left_join( + report_user %>% + select("user_specimen_id","affy_id", + "percent_missing_data_coral","percent_heterozygous_coral","percent_reference_coral", + "percent_alternative_coral"), + by='user_specimen_id'); + +###representative clone for genotype.table +cc<-clonecorrect(obj2, strata= ~pop.gind.); +id_rep<-mlg.id(cc); +dt_cc<-data.table(id_rep,keep.rownames = TRUE); +setnames(dt_cc, c("id_rep"), c("affy_id")); + +###transform mlg data.table +df_cc <- dt_cc %>% + group_by(row_number()) %>% + rename(group='row_number()') %>% + unnest(affy_id) %>% + left_join(report_user %>% + select("coral_mlg_clonal_id","user_specimen_id","affy_id"), + by='affy_id') %>% + mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id),"",affy_id)) %>% + ungroup() %>% + select(-group); -# Create vector indicating number of individuals desired -# made from affy_id collumn of report_user data table. -i <- report_user[[2]]; -sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; +##geno.table +geno_db <- df4 %>% + left_join(df_cc %>% + select("affy_id","coral_mlg_rep_sample_id","user_specimen_id"), + by='affy_id') %>% + ungroup() %>% + select(-group); + +##taxonomy.table + +tax_db <- report_user %>% + select(genetic_coral_species_call, affy_id) %>% + mutate(genus_name =ifelse(genetic_coral_species_call== + genetic_coral_species_call[grep("^A.*",genetic_coral_species_call)],"Acropora","other")) %>% + mutate(species_name=ifelse(genetic_coral_species_call=="A.palmata","palmata","other"))%>% + mutate(species_name=ifelse(genetic_coral_species_call =="A.cervicornis","cervicornis",species_name))%>% + mutate(species_name=ifelse(genetic_coral_species_call=="A.prolifera","prolifera", species_name)); + + +# Table of alleles for the new samples +## subset to new plate data +### create vector indicating number of individuals desired +### made from affy_id collumn from report_user data table +i<-ifelse(is.na(report_user[1]),"",report_user[[1]]); +i<-i[!apply(i == "", 1, all),]; +sub96<-obj2[i, mlg.reset = FALSE, drop = FALSE]; + +# convert to data frame +at_96<-genind2df(sub96, sep=""); +at_96<- at_96 %>% + select(-pop); + +# allele string for Allele.table in database +uat_96<-unite(at_96, alleles, 1:19696, sep = " ", remove = TRUE); +uat_96<-setDT(uat_96, keep.rownames = TRUE)[]; +setnames(uat_96, c("rn"), c("user_specimen_id")); +# write.csv(uat_96,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE); # Create a phylogeny of samples based on distance matrices. -cols <- palette(brewer.pal(n=12, name="Set3")); +cols <- piratepal("basel"); set.seed(999); # Start PDF device driver. dev.new(width=10, height=7); @@ -259,15 +331,110 @@ pdf(file=file_path, width=10, height=7); # Organize branches by clade. theTree <- sub96 %>% - aboot(dist=provesti.dist, sample=1, tree="nj", cutoff=50, quiet=TRUE) %>% + aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% ladderize(); theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)]; plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); # Add a scale bar showing 5% difference.. add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); -legend("topright", legend=c("Antigua", "Bahamas", "Belize", "Cuba", "Curacao", "Florida", "PuertoRico", "USVI"), text.col=cols, xpd=T, cex=0.8); -dev.off(); +legend("topright", legend=c(levels(sub96$pop)), text.col=cols, xpd=T, cex=0.8); +dev.off() + +write.tree(theTree, file =opt$nj_tree, quote=FALSE); + +# identity-by-state analysis +#if (!requireNamespace("BiocManager", quietly = TRUE)) +# install.packages("BiocManager") +#BiocManager::install("SNPRelate", version = "3.8") + +#subset VCF to the user samples +l<-length(i); +n<-ncol(vcf@gt); +s<-n-l; +svcf<-vcf[,s:n]; +write.vcf(svcf, "subset.vcf.gz"); + +vcf.fn <- "subset.vcf.gz"; +snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only"); + +genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); +hd<-read.gdsn(index.gdsn(genofile, "sample.id")); +hd<-data.frame(hd); +hd<-setDT(hd, keep.rownames = FALSE)[]; +setnames(hd, c("hd"), c("user_specimen_id")); + +subpop2<- poptab[c(2,4)]; +poptab_sub <- hd %>% + left_join( + subpop2 %>% + select("affy_id","region"), + by='affy_id')%>% + drop_na(); + +samp.annot <- data.frame(pop.group = c(poptab_sub$region)); +add.gdsn(genofile, "sample.annot", samp.annot); + +pop_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); +pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); + +# Identity-By-State Analysis - distance matrix calculation +ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); + +# cluster analysis on the genome-wide IBS pairwise distance matrix +set.seed(100); +par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); +ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE)); + +# default clustering. +dev.new(width=10, height=7); +file_path = get_file_path("IBS_default.pdf"); +pdf (file=file_path, width=10, height=7); +rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); +snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular",y.label=0.2); +legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); +dev.off() + +# color cluster by region. +dev.new(width=10, height=7); +file_path = get_file_path("IBS_Region.pdf"); +pdf (file=file_path, width=10, height=7); +race <- as.factor(pop_code); +rv2 <- snpgdsCutTree(ibs.hc,samp.group=race,col.list=cols,pch.list=15); +snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular",y.label=0.2); +legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); +dev.off() + +#close GDS file +snpgdsClose(genofile); + +# Sample MLG on a map. +world <- ne_countries(scale = "medium", returnclass = "sf"); +class(world); + +pinfo$mlg<-report_user$coral_mlg_clonal_id; +n <- nrow(pinfo); + +mxlat<-max(pinfo$latitude,na.rm = TRUE); +mnlat<-min(pinfo$latitude,na.rm = TRUE); +mxlong<-max(pinfo$longitude,na.rm = TRUE); +mnlong<-min(pinfo$longitude,na.rm = TRUE); + +p5<-ggplot(data = world) + + geom_sf() + + coord_sf(xlim = c(mnlong-3, mxlong+3), ylim = c(mnlat-3,mxlat+3), expand = FALSE); + +colourCount = length(unique(pinfo$mlg)); +getPalette = colorRampPalette(piratepal("basel")); +dev.new(width=10, height=7); +file_path = get_file_path("mlg_map.pdf"); +pdf (file=file_path, width=10, height=7); +p6<-p5+ geom_point(data = pinfo,aes(x =longitude, y=latitude, group=mlg, color = mlg), alpha=.7, size=3)+ + scale_color_manual(values=getPalette(colourCount))+ + theme(legend.position="bottom")+ + guides(color=guide_legend(nrow=8,byrow=F)); +p6; +dev.off() # Missing data barplot. poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)]; @@ -319,4 +486,3 @@ pie(tdt1_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); } dev.off() -