Mercurial > repos > greg > multilocus_genotype

--- a/multilocus_genotype.R	Fri Dec 07 09:12:54 2018 -0500
+++ b/multilocus_genotype.R	Fri Dec 07 09:13:02 2018 -0500
@@ -3,6 +3,7 @@
 suppressPackageStartupMessages(library("adegenet"))
 suppressPackageStartupMessages(library("ape"))
 suppressPackageStartupMessages(library("data.table"))
+suppressPackageStartupMessages(library("dbplyr"))
 suppressPackageStartupMessages(library("dplyr"))
 suppressPackageStartupMessages(library("ggplot2"))
 suppressPackageStartupMessages(library("knitr"))
@@ -10,6 +11,7 @@
 suppressPackageStartupMessages(library("poppr"))
 suppressPackageStartupMessages(library("RColorBrewer"))
 suppressPackageStartupMessages(library("RPostgres"))
+suppressPackageStartupMessages(library("tidyr"))
 suppressPackageStartupMessages(library("vcfR"))
 suppressPackageStartupMessages(library("vegan"))

@@ -53,24 +55,6 @@
 # Read in VCF input file.
 vcf <- read.vcfR(opt$input_vcf);

-#Missing GT in samples submitted
-gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
-myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
-myMiss <- (myMiss / nrow(vcf)) * 100;
-miss <- data.frame(myMiss);
-
-hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} );
-hets <- (hets / nrow(vcf)) * 100;
-ht <- data.frame(hets);
-
-refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} );
-refA <- (refA / nrow(vcf)) * 100;
-rA <- data.frame(refA);
-
-altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} );
-altB <- (altB / nrow(vcf)) * 100;
-aB <- data.frame(altB);
-
 # Convert VCF file into a genind for the Poppr package.
 # TODO: probably should not hard-code 2 cores.
 gl <- vcfR2genlight(vcf, n.cores=2);
@@ -97,7 +81,13 @@
 setnames(dt, c("id"), c("affy_id"));

 # Read user's Affymetrix 96 well plate csv file.
-pinfo <- read.table(opt$input_affy_metadata, header=TRUE, stringsAsFactors=FALSE, sep="\t");
+pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t");
+colnames(pinfo) <- c("date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id",
+                     "reef", "region", "latitude", "longitude", "geographic_origin",
+                     "sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore",
+                     "disease_resist", "bleach_resist", "mortality","tle", "spawning",
+                     "collector_last_name", "collector_first_name", "org", "collection_date", "contact_email",
+                     "seq_facility", "array_version", "public", "public_after_date");
 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
 pinfo2 <- as.character(pinfo$user_specimen_id);
 pi <- data.table(pinfo2);
@@ -116,6 +106,12 @@
 sm <- data.frame(smlg);
 sm[sm==""] <- NA;

+# Missing GT in samples submitted.
+gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
+myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
+myMiss <- (myMiss / nrow(vcf)) * 100;
+miss <- data.frame(myMiss);
+
 # Convert missing data into data table.
 mi <-setDT(miss, keep.rownames=TRUE)[];
 setnames(mi, c("rn"), c("affy_id"));
@@ -123,6 +119,10 @@
 # Round missing data to two digits.
 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2);

+hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} );
+hets <- (hets / nrow(vcf)) * 100;
+ht <- data.frame(hets);
+
 # Convert heterozygosity data into data table.
 ht <-setDT(ht, keep.rownames=TRUE)[];
 setnames(ht, c("rn"), c("affy_id"));
@@ -130,6 +130,10 @@
 # Round missing data to two digits.
 ht$percent_mixed<-round(ht$percent_mixed, digits=2);

+refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} );
+refA <- (refA / nrow(vcf)) * 100;
+rA <- data.frame(refA);
+
 # Convert refA data into data.table.
 rA <-setDT(rA, keep.rownames=TRUE)[];
 setnames(rA, c("rn"), c("affy_id"));
@@ -137,6 +141,10 @@
 # round missing data to two digits.
 rA$percent_reference<-round(rA$percent_reference, digits=2);

+altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} );
+altB <- (altB / nrow(vcf)) * 100;
+aB <- data.frame(altB);
+
 # Convert altB data into data table.
 aB <-setDT(aB, keep.rownames=TRUE)[];
 setnames(aB, c("rn"), c("affy_id"));
@@ -152,7 +160,7 @@
 df3 <- dt %>%
     group_by(row_number()) %>%
     dplyr::rename(group='row_number()') %>%
-    unnest (user_specimen_id) %>%
+    unnest (affy_id) %>%
     # Join with mlg table.
     left_join(sm %>%
               select("affy_id","coral_mlg_clonal_id"),
@@ -168,7 +176,7 @@
 # Determine if the sample mlg matched previous genotyped sample.
 df4<- df3 %>%
     group_by(group) %>%
-    mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match","match"));
+    mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match"));

 # Create new mlg id for samples that did not match those in the database.
 none <- unique(df4[c("group", "coral_mlg_clonal_id")]);
@@ -184,7 +192,7 @@
 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
 # This is a key for pairing group with new ids.
 rat <- cbind(unique(n.g), n.g_ids);
-# this for loop assigns the new id iteratively for all that have NA.
+# This for loop assigns the new id iteratively for all that have NA.
 for (i in 1:length(na.mlg2)) {
     df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
 }
@@ -213,11 +221,11 @@
         select("affy_id", "percent_alternative_coral"),
         by='affy_id') %>%
     mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
-    mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>%
+    mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
     ungroup() %>%
     select(-group);

-write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE);
+write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE);

 # Combine sample information for database.
 report_db <- pinfo %>%
@@ -225,7 +233,7 @@
         select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match",
                "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral",
                "percent_alternative_coral"),
-        by='user_specimen_id')
+        by='user_specimen_id');

 # Create vector indicating number of individuals desired
 # made from affy_id collumn of report_user data table.
@@ -264,19 +272,40 @@
 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
 dev.off()

-# Rarifaction curve.
-# Start PDF device driver.
-#dev.new(width=10, height=7);
-#file_path = get_file_path("geno_rarifaction_curve.pdf");
-#pdf(file=file_path, width=10, height=7);
-#rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue");
-#dev.off();
+# Generate 96 pie charts.  Make a table to subset the numerical
+# and user_specimen_id values out of report_user for the 96 pies
+# (user_specimen_id names will be used to label each pie).
+dt1 <- data.table(report_user);
+dt1 <- report_user[c(-2, -3, -4)];
+dt1 <- na.omit(dt1);
+# Translate to 96 columns and 5 rows.
+tdt1 <- t(dt1);
+# Make another data table and transpose it the same as dt1 to
+# just get numerics; these will feed into the creation of 96
+# vectors, "x" in the for loop below.
+dt2 <- data.table(report_user);
+dt2 <- report_user[c(-1, -2, -3, -4)];
+# Translate to 96 columns and 5 rows.
+tdt2 <- t(dt2);
+# Create 96 vectors
+x <- tdt2[1:96];
+tdt1_matrix <- as.matrix(tdt1[-1,]);
+mode(tdt1_matrix) <- "numeric";
+spy <- rowMeans(tdt1_matrix);
+dev.new(width=10, height=7);
+file_path = get_file_path("percent_breakdown.pdf");
+pdf(file=file_path, width=10, height=7);
+# Average pie of all samples.
+labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep="");
+col <- c("GREY", "#006DDB", "#24FF24", "#920000");
+main <- "Average breakdown of SNP assignments across all samples";
+pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75);
+par(mfrow=c(3, 2));
+for (i in 1:96) {
+    labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep="");
+    col <- c("GREY", "#006DDB", "#24FF24", "#920000");
+    main <- paste("Breakdown of SNP assignments for", tdt1[1, i]);
+    pie(tdt1_matrix[,i], labels=labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
+}
+dev.off()

-# Genotype accumulation curve, sample is number of
-# loci randomly selected for to make the curve.
-#dev.new(width=10, height=7);
-#file_path = get_file_path("geno_accumulation_curve.pdf");
-#pdf(file=file_path, width=10, height=7);
-#genotype_curve(gind, sample=5, quiet=TRUE);
-#dev.off();
-