multilocus_genotype: multilocus_genotype.R comparison

comparison multilocus_genotype.R @ 12:ea2914ddea50 draft

Uploaded

author	greg
date	Fri, 07 Dec 2018 09:13:02 -0500
parents	6c93244a36e2
children	96ee9122823e

comparison

equal deleted inserted replaced

-:c755744296ca
+:ea2914ddea50
 #!/usr/bin/env Rscript
 suppressPackageStartupMessages(library("adegenet"))
 suppressPackageStartupMessages(library("ape"))
 suppressPackageStartupMessages(library("data.table"))
+suppressPackageStartupMessages(library("dbplyr"))
 suppressPackageStartupMessages(library("dplyr"))
 suppressPackageStartupMessages(library("ggplot2"))
 suppressPackageStartupMessages(library("knitr"))
 suppressPackageStartupMessages(library("optparse"))
 suppressPackageStartupMessages(library("poppr"))
 suppressPackageStartupMessages(library("RColorBrewer"))
 suppressPackageStartupMessages(library("RPostgres"))
+suppressPackageStartupMessages(library("tidyr"))
 suppressPackageStartupMessages(library("vcfR"))
 suppressPackageStartupMessages(library("vegan"))
 option_list <- list(
 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"),
 }
 # Read in VCF input file.
 vcf <- read.vcfR(opt$input_vcf);
-#Missing GT in samples submitted
+# Convert VCF file into a genind for the Poppr package.
+# TODO: probably should not hard-code 2 cores.
+gl <- vcfR2genlight(vcf, n.cores=2);
+gind <- new("genind", (as.matrix(gl)));
+# Add population information to the genind object.
+poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t");
+colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region");
+gind@pop <- as.factor(poptab$region);
+# Convert genind object to a genclone object.
+obj2 <- as.genclone(gind);
+# Calculate the bitwise distance between individuals.
+xdis <- bitwise.dist(obj2);
+# Multilocus genotypes (threshold of 16%).
+mlg.filter(obj2, distance=xdis) <- 0.016;
+m <- mlg.table(obj2, background=TRUE, color=TRUE);
+# Create table of MLGs.
+id <- mlg.id(obj2);
+dt <- data.table(id, keep.rownames=TRUE);
+setnames(dt, c("id"), c("affy_id"));
+# Read user's Affymetrix 96 well plate csv file.
+pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t");
+colnames(pinfo) <- c("date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id",
+"reef", "region", "latitude", "longitude", "geographic_origin",
+"sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore",
+"disease_resist", "bleach_resist", "mortality","tle", "spawning",
+"collector_last_name", "collector_first_name", "org", "collection_date", "contact_email",
+"seq_facility", "array_version", "public", "public_after_date");
+pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
+pinfo2 <- as.character(pinfo$user_specimen_id);
+pi <- data.table(pinfo2);
+setnames(pi, c("pinfo2"), c("user_specimen_id"));
+# Connect to database.
+conn <- get_database_connection(opt$database_connection_string);
+# Import the sample table.
+mD <- tbl(conn, "sample");
+# Select user_specimen_id and mlg columns.
+smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id);
+# Convert to dataframe.
+sm <- data.frame(smlg);
+sm[sm==""] <- NA;
+# Missing GT in samples submitted.
 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
 myMiss <- (myMiss / nrow(vcf)) * 100;
 miss <- data.frame(myMiss);
-hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} );
-hets <- (hets / nrow(vcf)) * 100;
-ht <- data.frame(hets);
-refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} );
-refA <- (refA / nrow(vcf)) * 100;
-rA <- data.frame(refA);
-altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} );
-altB <- (altB / nrow(vcf)) * 100;
-aB <- data.frame(altB);
-# Convert VCF file into a genind for the Poppr package.
-# TODO: probably should not hard-code 2 cores.
-gl <- vcfR2genlight(vcf, n.cores=2);
-gind <- new("genind", (as.matrix(gl)));
-# Add population information to the genind object.
-poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t");
-colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region");
-gind@pop <- as.factor(poptab$region);
-# Convert genind object to a genclone object.
-obj2 <- as.genclone(gind);
-# Calculate the bitwise distance between individuals.
-xdis <- bitwise.dist(obj2);
-# Multilocus genotypes (threshold of 16%).
-mlg.filter(obj2, distance=xdis) <- 0.016;
-m <- mlg.table(obj2, background=TRUE, color=TRUE);
-# Create table of MLGs.
-id <- mlg.id(obj2);
-dt <- data.table(id, keep.rownames=TRUE);
-setnames(dt, c("id"), c("affy_id"));
-# Read user's Affymetrix 96 well plate csv file.
-pinfo <- read.table(opt$input_affy_metadata, header=TRUE, stringsAsFactors=FALSE, sep="\t");
-pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
-pinfo2 <- as.character(pinfo$user_specimen_id);
-pi <- data.table(pinfo2);
-setnames(pi, c("pinfo2"), c("user_specimen_id"));
-# Connect to database.
-conn <- get_database_connection(opt$database_connection_string);
-# Import the sample table.
-mD <- tbl(conn, "sample");
-# Select user_specimen_id and mlg columns.
-smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id);
-# Convert to dataframe.
-sm <- data.frame(smlg);
-sm[sm==""] <- NA;
 # Convert missing data into data table.
 mi <-setDT(miss, keep.rownames=TRUE)[];
 setnames(mi, c("rn"), c("affy_id"));
 setnames(mi, c("myMiss"), c("percent_missing_data_coral"));
 # Round missing data to two digits.
 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2);
+hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} );
+hets <- (hets / nrow(vcf)) * 100;
+ht <- data.frame(hets);
 # Convert heterozygosity data into data table.
 ht <-setDT(ht, keep.rownames=TRUE)[];
 setnames(ht, c("rn"), c("affy_id"));
 setnames(ht, c("hets"), c("percent_mixed_coral"));
 # Round missing data to two digits.
 ht$percent_mixed<-round(ht$percent_mixed, digits=2);
+refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} );
+refA <- (refA / nrow(vcf)) * 100;
+rA <- data.frame(refA);
 # Convert refA data into data.table.
 rA <-setDT(rA, keep.rownames=TRUE)[];
 setnames(rA, c("rn"), c("affy_id"));
 setnames(rA, c("refA"), c("percent_reference_coral"));
 # round missing data to two digits.
 rA$percent_reference<-round(rA$percent_reference, digits=2);
+altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} );
+altB <- (altB / nrow(vcf)) * 100;
+aB <- data.frame(altB);
 # Convert altB data into data table.
 aB <-setDT(aB, keep.rownames=TRUE)[];
 setnames(aB, c("rn"), c("affy_id"));
 setnames(aB, c("altB"), c("percent_alternative_coral"));
 # Round missing data to two digits.
 # Transform.
 df3 <- dt %>%
 group_by(row_number()) %>%
 dplyr::rename(group='row_number()') %>%
-unnest (user_specimen_id) %>%
+unnest (affy_id) %>%
 # Join with mlg table.
 left_join(sm %>%
 select("affy_id","coral_mlg_clonal_id"),
 by='affy_id');
 df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
 # Determine if the sample mlg matched previous genotyped sample.
 df4<- df3 %>%
 group_by(group) %>%
-mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match","match"));
+mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match"));
 # Create new mlg id for samples that did not match those in the database.
 none <- unique(df4[c("group", "coral_mlg_clonal_id")]);
 none <- none[is.na(none$coral_mlg_clonal_id),];
 na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id));
 # the df4 file contains all ids.  If it doesn't then look below
 # to change the seq() function.
 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
 # This is a key for pairing group with new ids.
 rat <- cbind(unique(n.g), n.g_ids);
-# this for loop assigns the new id iteratively for all that have NA.
+# This for loop assigns the new id iteratively for all that have NA.
 for (i in 1:length(na.mlg2)) {
 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
 }
 # subset poptab for all samples.
 by='affy_id') %>%
 left_join(aB %>%
 select("affy_id", "percent_alternative_coral"),
 by='affy_id') %>%
 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
-mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>%
+mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
 ungroup() %>%
 select(-group);
-write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE);
+write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE);
 # Combine sample information for database.
 report_db <- pinfo %>%
 left_join(report_user %>%
 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match",
 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral",
 "percent_alternative_coral"),
-by='user_specimen_id')
+by='user_specimen_id');
 # Create vector indicating number of individuals desired
 # made from affy_id collumn of report_user data table.
 i <- report_user[[2]];
 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE];
 par(mar = c(8, 4, 4, 2));
 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n");
 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
 dev.off()
-# Rarifaction curve.
+# Generate 96 pie charts.  Make a table to subset the numerical
-# Start PDF device driver.
+# and user_specimen_id values out of report_user for the 96 pies
-#dev.new(width=10, height=7);
+# (user_specimen_id names will be used to label each pie).
-#file_path = get_file_path("geno_rarifaction_curve.pdf");
+dt1 <- data.table(report_user);
-#pdf(file=file_path, width=10, height=7);
+dt1 <- report_user[c(-2, -3, -4)];
-#rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue");
+dt1 <- na.omit(dt1);
-#dev.off();
+# Translate to 96 columns and 5 rows.
+tdt1 <- t(dt1);
-# Genotype accumulation curve, sample is number of
+# Make another data table and transpose it the same as dt1 to
-# loci randomly selected for to make the curve.
+# just get numerics; these will feed into the creation of 96
-#dev.new(width=10, height=7);
+# vectors, "x" in the for loop below.
-#file_path = get_file_path("geno_accumulation_curve.pdf");
+dt2 <- data.table(report_user);
-#pdf(file=file_path, width=10, height=7);
+dt2 <- report_user[c(-1, -2, -3, -4)];
-#genotype_curve(gind, sample=5, quiet=TRUE);
+# Translate to 96 columns and 5 rows.
-#dev.off();
+tdt2 <- t(dt2);
+# Create 96 vectors
+x <- tdt2[1:96];
+tdt1_matrix <- as.matrix(tdt1[-1,]);
+mode(tdt1_matrix) <- "numeric";
+spy <- rowMeans(tdt1_matrix);
+dev.new(width=10, height=7);
+file_path = get_file_path("percent_breakdown.pdf");
+pdf(file=file_path, width=10, height=7);
+# Average pie of all samples.
+labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep="");
+col <- c("GREY", "#006DDB", "#24FF24", "#920000");
+main <- "Average breakdown of SNP assignments across all samples";
+pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75);
+par(mfrow=c(3, 2));
+for (i in 1:96) {
+labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep="");
+col <- c("GREY", "#006DDB", "#24FF24", "#920000");
+main <- paste("Breakdown of SNP assignments for", tdt1[1, i]);
+pie(tdt1_matrix[,i], labels=labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
+}
+dev.off()

Mercurial > repos > greg > multilocus_genotype

comparison multilocus_genotype.R @ 12:ea2914ddea50 draft