Mercurial > repos > greg > multilocus_genotype
changeset 12:ea2914ddea50 draft
Uploaded
author | greg |
---|---|
date | Fri, 07 Dec 2018 09:13:02 -0500 |
parents | c755744296ca |
children | 6b8061a95c04 |
files | multilocus_genotype.R |
diffstat | 1 files changed, 69 insertions(+), 40 deletions(-) [+] |
line wrap: on
line diff
--- a/multilocus_genotype.R Fri Dec 07 09:12:54 2018 -0500 +++ b/multilocus_genotype.R Fri Dec 07 09:13:02 2018 -0500 @@ -3,6 +3,7 @@ suppressPackageStartupMessages(library("adegenet")) suppressPackageStartupMessages(library("ape")) suppressPackageStartupMessages(library("data.table")) +suppressPackageStartupMessages(library("dbplyr")) suppressPackageStartupMessages(library("dplyr")) suppressPackageStartupMessages(library("ggplot2")) suppressPackageStartupMessages(library("knitr")) @@ -10,6 +11,7 @@ suppressPackageStartupMessages(library("poppr")) suppressPackageStartupMessages(library("RColorBrewer")) suppressPackageStartupMessages(library("RPostgres")) +suppressPackageStartupMessages(library("tidyr")) suppressPackageStartupMessages(library("vcfR")) suppressPackageStartupMessages(library("vegan")) @@ -53,24 +55,6 @@ # Read in VCF input file. vcf <- read.vcfR(opt$input_vcf); -#Missing GT in samples submitted -gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); -myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); -myMiss <- (myMiss / nrow(vcf)) * 100; -miss <- data.frame(myMiss); - -hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); -hets <- (hets / nrow(vcf)) * 100; -ht <- data.frame(hets); - -refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); -refA <- (refA / nrow(vcf)) * 100; -rA <- data.frame(refA); - -altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); -altB <- (altB / nrow(vcf)) * 100; -aB <- data.frame(altB); - # Convert VCF file into a genind for the Poppr package. # TODO: probably should not hard-code 2 cores. gl <- vcfR2genlight(vcf, n.cores=2); @@ -97,7 +81,13 @@ setnames(dt, c("id"), c("affy_id")); # Read user's Affymetrix 96 well plate csv file. -pinfo <- read.table(opt$input_affy_metadata, header=TRUE, stringsAsFactors=FALSE, sep="\t"); +pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t"); +colnames(pinfo) <- c("date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", + "reef", "region", "latitude", "longitude", "geographic_origin", + "sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore", + "disease_resist", "bleach_resist", "mortality","tle", "spawning", + "collector_last_name", "collector_first_name", "org", "collection_date", "contact_email", + "seq_facility", "array_version", "public", "public_after_date"); pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); pinfo2 <- as.character(pinfo$user_specimen_id); pi <- data.table(pinfo2); @@ -116,6 +106,12 @@ sm <- data.frame(smlg); sm[sm==""] <- NA; +# Missing GT in samples submitted. +gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); +myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); +myMiss <- (myMiss / nrow(vcf)) * 100; +miss <- data.frame(myMiss); + # Convert missing data into data table. mi <-setDT(miss, keep.rownames=TRUE)[]; setnames(mi, c("rn"), c("affy_id")); @@ -123,6 +119,10 @@ # Round missing data to two digits. mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); +hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); +hets <- (hets / nrow(vcf)) * 100; +ht <- data.frame(hets); + # Convert heterozygosity data into data table. ht <-setDT(ht, keep.rownames=TRUE)[]; setnames(ht, c("rn"), c("affy_id")); @@ -130,6 +130,10 @@ # Round missing data to two digits. ht$percent_mixed<-round(ht$percent_mixed, digits=2); +refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); +refA <- (refA / nrow(vcf)) * 100; +rA <- data.frame(refA); + # Convert refA data into data.table. rA <-setDT(rA, keep.rownames=TRUE)[]; setnames(rA, c("rn"), c("affy_id")); @@ -137,6 +141,10 @@ # round missing data to two digits. rA$percent_reference<-round(rA$percent_reference, digits=2); +altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); +altB <- (altB / nrow(vcf)) * 100; +aB <- data.frame(altB); + # Convert altB data into data table. aB <-setDT(aB, keep.rownames=TRUE)[]; setnames(aB, c("rn"), c("affy_id")); @@ -152,7 +160,7 @@ df3 <- dt %>% group_by(row_number()) %>% dplyr::rename(group='row_number()') %>% - unnest (user_specimen_id) %>% + unnest (affy_id) %>% # Join with mlg table. left_join(sm %>% select("affy_id","coral_mlg_clonal_id"), @@ -168,7 +176,7 @@ # Determine if the sample mlg matched previous genotyped sample. df4<- df3 %>% group_by(group) %>% - mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match","match")); + mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match")); # Create new mlg id for samples that did not match those in the database. none <- unique(df4[c("group", "coral_mlg_clonal_id")]); @@ -184,7 +192,7 @@ n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); # This is a key for pairing group with new ids. rat <- cbind(unique(n.g), n.g_ids); -# this for loop assigns the new id iteratively for all that have NA. +# This for loop assigns the new id iteratively for all that have NA. for (i in 1:length(na.mlg2)) { df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; } @@ -213,11 +221,11 @@ select("affy_id", "percent_alternative_coral"), by='affy_id') %>% mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% - mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>% + mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% ungroup() %>% select(-group); -write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE); +write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE); # Combine sample information for database. report_db <- pinfo %>% @@ -225,7 +233,7 @@ select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", "percent_alternative_coral"), - by='user_specimen_id') + by='user_specimen_id'); # Create vector indicating number of individuals desired # made from affy_id collumn of report_user data table. @@ -264,19 +272,40 @@ text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); dev.off() -# Rarifaction curve. -# Start PDF device driver. -#dev.new(width=10, height=7); -#file_path = get_file_path("geno_rarifaction_curve.pdf"); -#pdf(file=file_path, width=10, height=7); -#rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue"); -#dev.off(); +# Generate 96 pie charts. Make a table to subset the numerical +# and user_specimen_id values out of report_user for the 96 pies +# (user_specimen_id names will be used to label each pie). +dt1 <- data.table(report_user); +dt1 <- report_user[c(-2, -3, -4)]; +dt1 <- na.omit(dt1); +# Translate to 96 columns and 5 rows. +tdt1 <- t(dt1); +# Make another data table and transpose it the same as dt1 to +# just get numerics; these will feed into the creation of 96 +# vectors, "x" in the for loop below. +dt2 <- data.table(report_user); +dt2 <- report_user[c(-1, -2, -3, -4)]; +# Translate to 96 columns and 5 rows. +tdt2 <- t(dt2); +# Create 96 vectors +x <- tdt2[1:96]; +tdt1_matrix <- as.matrix(tdt1[-1,]); +mode(tdt1_matrix) <- "numeric"; +spy <- rowMeans(tdt1_matrix); +dev.new(width=10, height=7); +file_path = get_file_path("percent_breakdown.pdf"); +pdf(file=file_path, width=10, height=7); +# Average pie of all samples. +labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep=""); +col <- c("GREY", "#006DDB", "#24FF24", "#920000"); +main <- "Average breakdown of SNP assignments across all samples"; +pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); +par(mfrow=c(3, 2)); +for (i in 1:96) { + labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep=""); + col <- c("GREY", "#006DDB", "#24FF24", "#920000"); + main <- paste("Breakdown of SNP assignments for", tdt1[1, i]); + pie(tdt1_matrix[,i], labels=labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); +} +dev.off() -# Genotype accumulation curve, sample is number of -# loci randomly selected for to make the curve. -#dev.new(width=10, height=7); -#file_path = get_file_path("geno_accumulation_curve.pdf"); -#pdf(file=file_path, width=10, height=7); -#genotype_curve(gind, sample=5, quiet=TRUE); -#dev.off(); -