Mercurial > repos > greg > multilocus_genotype

diff multilocus_genotype.R @ 18:1190ee1456f6 draft default tip
Uploaded
author: greg
date: Mon, 13 May 2019 08:37:37 -0400
parents: 85f8fc57eee4
--- a/multilocus_genotype.R	Wed Apr 17 09:07:04 2019 -0400
+++ b/multilocus_genotype.R	Mon May 13 08:37:37 2019 -0400
@@ -7,14 +7,13 @@
 suppressPackageStartupMessages(library("dplyr"))
 suppressPackageStartupMessages(library("ggplot2"))
 suppressPackageStartupMessages(library("knitr"))
+suppressPackageStartupMessages(library("maps"))
+suppressPackageStartupMessages(library("mapproj"))
 suppressPackageStartupMessages(library("optparse"))
 suppressPackageStartupMessages(library("poppr"))
 suppressPackageStartupMessages(library("RColorBrewer"))
-suppressPackageStartupMessages(library("rnaturalearth"))
-suppressPackageStartupMessages(library("rnaturalearthdata"))
 suppressPackageStartupMessages(library("RPostgres"))
-suppressPackageStartupMessages(library("sf"))
-suppressPackageStartupMessages(library(SNPRelate))
+suppressPackageStartupMessages(library("SNPRelate"))
 suppressPackageStartupMessages(library("tidyr"))
 suppressPackageStartupMessages(library("vcfR"))
 suppressPackageStartupMessages(library("vegan"))
@@ -26,8 +25,8 @@
     make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"),
     make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
     make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
-    make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file"),
-    make_option(c("--nj_tree"), action="store", dest="nj_tree", help="neighbor-joining tree output file")
+    make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"),
+    make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file")
 )
 
 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
@@ -59,430 +58,520 @@
     return (conn);
 }
 
+time_elapsed <- function(start_time) {
+    cat("Elapsed time: ", proc.time() - start_time, "\n\n");
+}
+
+time_start <- function(msg) {
+    start_time <- proc.time();
+    cat(msg, "...\n");
+    return(start_time);
+}
+
 # Read in VCF input file.
+start_time <- time_start("Reading VCF input");
 vcf <- read.vcfR(opt$input_vcf);
+time_elapsed(start_time);
 
 # Convert VCF file into a genind for the Poppr package.
-# TODO: probably should not hard-code 2 cores.
-# changed to genind format for extracting alleles later
-# trade-off is it is a bit slower to import data
-# gl <- vcfR2genlight(vcf, n.cores=2)
-# gind <- new("genind", (as.matrix(gl)))
-gind <- vcfR2genind(vcf);
+start_time <- time_start("Converting VCF data to a genind object");
+genind_obj <- vcfR2genind(vcf);
+time_elapsed(start_time);
 
 # Add population information to the genind object.
-poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t");
-colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region");
-gind@pop <- as.factor(poptab$region);
-strata(gind)<-data.frame(pop(gind));
+population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t");
+colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region");
+genind_obj@pop <- as.factor(population_info_data_table$region);
+strata(genind_obj) <- data.frame(pop(genind_obj));
 
 # Convert genind object to a genclone object.
-obj2 <- as.genclone(gind);
+start_time <- time_start("Converting the genind object to a genclone object");
+genind_clone <- as.genclone(genind_obj);
+time_elapsed(start_time);
 
 # Calculate the bitwise distance between individuals.
-xdis <- bitwise.dist(obj2);
+start_time <- time_start("Calculating the bitwise distance between individuals");
+bitwise_distance <- bitwise.dist(genind_clone);
+time_elapsed(start_time);
 
 # Multilocus genotypes (threshold of 3.2%).
-# threshold doubled because of how the data is formatted in genind compared to genlight
-mlg.filter(obj2, distance=xdis) <- 0.032;
-m <- mlg.table(obj2, background=TRUE, color=TRUE);
+mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032;
+m <- mlg.table(genind_clone, background=TRUE, color=TRUE);
 
-# Create table of MLGs.
-id <- mlg.id(obj2);
-#dt <- data.table(id, keep.rownames=TRUE);
-#setnames(dt, c("id"), c("affy_id"));
+# Create list of MLGs.
+mlg_ids <- mlg.id(genind_clone);
 
 # Read user's Affymetrix 96 well plate tabular file.
-pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA"));
-colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
-                     "region", "latitude", "longitude", "geographic_origin", "sample_location",
-                     "latitude_outplant", "longitude_outplant", "depth", "disease_resist",
-                     "bleach_resist", "mortality","tle", "spawning", "collector_last_name",
-                     "collector_first_name", "organization", "collection_date", "email", "seq_facility",
-                     "array_version", "public", "public_after_date", "sperm_motility", "healing_time",
-                     "dna_extraction_method", "dna_concentration", "registry_id");
-pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
-pinfo2 <- as.character(pinfo$user_specimen_id);
-pi <- data.table(pinfo2, pinfo$field_call);
-setnames(pi, c("pinfo2"), c("user_specimen_id"));
-setnames(pi, c("V2"), c("field_call"));
+affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA"));
+colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
+                                        "region", "latitude", "longitude", "geographic_origin", "sample_location",
+                                        "latitude_outplant", "longitude_outplant", "depth", "disease_resist",
+                                        "bleach_resist", "mortality","tle", "spawning", "collector_last_name",
+                                        "collector_first_name", "organization", "collection_date", "email", "seq_facility",
+                                        "array_version", "public", "public_after_date", "sperm_motility", "healing_time",
+                                        "dna_extraction_method", "dna_concentration", "registry_id");
+affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id);
+user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id);
+# The specimen_id_field_call_data_table looks like this:
+# user_specimen_ids V2
+# test_002          prolifera
+# test_003          prolifera
+specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call);
+# Rename the user_specimen_ids column.
+setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id"));
+# Rename the V2 column.
+setnames(specimen_id_field_call_data_table, c("V2"), c("field_call"));
 
 # Connect to database.
 conn <- get_database_connection(opt$database_connection_string);
-
 # Import the sample table.
 sample_table <- tbl(conn, "sample");
-
 # Import the genotype table.
 genotype_table <- tbl(conn, "genotype");
-
 # Select columns from the sample table and the
 # genotype table joined by genotype_id.
 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id);
 smlg <- sample_table_columns %>%
     left_join(genotype_table %>%
               select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"),
-              by=c("genotype_id" = "id"));
-
+              by=c("genotype_id"="id"));
 # Convert to dataframe.
-sm <- data.frame(smlg);
+smlg_data_frame <- data.frame(smlg);
 # Name the columns.
-colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id");
+colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id");
 
 # Missing GT in samples submitted.
+start_time <- time_start("Discovering missing GT in samples");
 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
-myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
-myMiss <- (myMiss / nrow(vcf)) * 100;
-miss <- data.frame(myMiss);
-
-# Convert missing data into data table.
-mi <-setDT(miss, keep.rownames=TRUE)[];
-setnames(mi, c("rn"), c("affy_id"));
-setnames(mi, c("myMiss"), c("percent_missing_data_coral"));
-# Round missing data to two digits.
-mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2);
+missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
+missing_gt <- (missing_gt / nrow(vcf)) * 100;
+missing_gt_data_frame <- data.frame(missing_gt);
+# The specimen_id_field_call_data_table looks like this:
+# rn                                 missing_gt
+# a100000-4368120-060520-256_I07.CEL 0.06092608
+# a100000-4368120-060520-256_K07.CEL 0.05077173
+missing_gt_data_table <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[];
+# Rename the rn column.
+setnames(missing_gt_data_table, c("rn"), c("affy_id"));
+# Rename the missing_gt column.
+setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral"));
+# Round data to two digits.
+missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2);
+time_elapsed(start_time);
 
-#heterozygous alleles
-hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} );
-hets <- (hets / nrow(vcf)) * 100;
-ht <- data.frame(hets);
-
-# Convert heterozygosity data into data table.
-ht <-setDT(ht, keep.rownames=TRUE)[];
-setnames(ht, c("rn"), c("affy_id"));
-setnames(ht, c("hets"), c("percent_mixed_coral"));
-# Round missing data to two digits.
-ht$percent_mixed<-round(ht$percent_mixed, digits=2);
+# Heterozygous alleles.
+start_time <- time_start("Discovering heterozygous alleles");
+heterozygous_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))});
+heterozygous_alleles <- (heterozygous_alleles / nrow(vcf)) * 100;
+heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles);
+# The heterozygous_alleles_data_table looks like this:
+# rn                                 heterozygous_alleles
+# a100000-4368120-060520-256_I07.CEL 73.94903
+# a100000-4368120-060520-256_K07.CEL 74.40089
+heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[];
+# Rename the rn column.
+setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id"));
+# Rename the heterozygous_alleles column.
+setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral"));
+# Round data to two digits.
+heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2);
+time_elapsed(start_time);
 
-#reference alleles
-refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} );
-refA <- (refA / nrow(vcf)) * 100;
-rA <- data.frame(refA);
-
-# Convert refA data into data.table.
-rA <-setDT(rA, keep.rownames=TRUE)[];
-setnames(rA, c("rn"), c("affy_id"));
-setnames(rA, c("refA"), c("percent_reference_coral"));
-# round missing data to two digits.
-rA$percent_reference<-round(rA$percent_reference, digits=2);
+# Reference alleles.
+start_time <- time_start("Discovering reference alleles");
+reference_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))});
+reference_alleles <- (reference_alleles / nrow(vcf)) * 100;
+reference_alleles_data_frame <- data.frame(reference_alleles);
+# The reference_alleles_data_table looks like this:
+# rn                                 reference_alleles
+# a100000-4368120-060520-256_I07.CEL 11.60642
+# a100000-4368120-060520-256_K07.CEL 11.45918
+reference_alleles_data_table  <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[];
+# Rename the rn column.
+setnames(reference_alleles_data_table, c("rn"), c("affy_id"));
+# Rename the reference_alleles column.
+setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral"));
+# Round data to two digits.
+reference_alleles_data_table$percent_reference <- round(reference_alleles_data_table$percent_reference, digits=2);
+time_elapsed(start_time);
 
-#alternative alleles
-altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} );
-altB <- (altB / nrow(vcf)) * 100;
-aB <- data.frame(altB);
+# Alternative alleles
+start_time <- time_start("Discovering alternative alleles");
+alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))});
+alternative_alleles <- (alternative_alleles / nrow(vcf)) * 100;
+alternative_alleles_data_frame <- data.frame(alternative_alleles);
+# The alternative_alleles_data_table looks like this:
+# rn                                 alternative_alleles
+# a100000-4368120-060520-256_I07.CEL 14.38363
+# a100000-4368120-060520-256_K07.CEL 14.08916
+alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[];
+# Rename the rn column.
+setnames(alternative_alleles_data_table, c("rn"), c("affy_id"));
+# Rename the alternative_alleles column.
+setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral"));
+# Round data to two digits.
+alternative_alleles_data_table$percent_alternative <- round(alternative_alleles_data_table$percent_alternative, digits=2);
+time_elapsed(start_time);
 
-# Convert altB data into data table.
-aB <-setDT(aB, keep.rownames=TRUE)[];
-setnames(aB, c("rn"), c("affy_id"));
-setnames(aB, c("altB"), c("percent_alternative_coral"));
-# Round missing data to two digits.
-aB$percent_alternative<-round(aB$percent_alternative, digits=2);
+# The mlg_ids_data_table looks like this:
+# mlg_ids
+# a550962-4368120-060520-500_M23.CEL
+# a550962-4368120-060520-256_A19.CEL
+mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE);
+# Rename the mlg_ids column.
+setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id"));
 
-#convert mlg id to data.table format
-dt <- data.table(id, keep.rownames=TRUE);
-setnames(dt, c("id"), c("affy_id"));
-
-# Transform.
-df3 <- dt %>%
+# sample_mlg_tibble looks like this:
+# A tibble: 262 x 3
+# Groups:   group [?]
+# group affy_id                            coral_mlg_clonal_id
+# <int> <chr>                              <chr>              
+# 1     a550962-4368120-060520-500_M23.CEL NA                 
+# 2     a550962-4368120-060520-256_A19.CEL HG0006
+sample_mlg_tibble <- mlg_ids_data_table %>%
     group_by(row_number()) %>%
     dplyr::rename(group="row_number()") %>%
     unnest (affy_id) %>%
     # Join with mlg table.
-    left_join(sm %>%
+    left_join(smlg_data_frame %>%
               select("affy_id","coral_mlg_clonal_id"),
               by="affy_id");
 
 # If found in database, group members on previous mlg id.
-uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]);
+uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]);
 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),];
-na.mlg <- which(is.na(df3$coral_mlg_clonal_id));
-na.group <- df3$group[na.mlg];
-df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
+na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id));
+na.group <- sample_mlg_tibble$group[na.mlg];
+sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
 
-# Determine if the sample mlg matched previous genotyped sample.
-df4<- df3 %>%
+# Find out if the sample mlg matched a previous genotyped sample.
+# sample_mlg_match_tibble looks like this:
+# A tibble: 262 x 4
+# Groups:   group [230]
+# group affy_id                            coral_mlg_clonal_id DB_match
+# <int> <chr>                              <chr>               <chr>   
+# 1     a550962-4368120-060520-500_M23.CEL NA                  no_match
+# 2     a550962-4368120-060520-256_A19.CEL HG0006              match 
+sample_mlg_match_tibble <- sample_mlg_tibble %>%
     group_by(group) %>%
     mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match"));
 
-# Create new mlg id for samples that did not match those in the database.
-none <- unique(df4[c("group", "coral_mlg_clonal_id")]);
+# Create new mlg id for samples with no matches in the database.
+none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]);
 none <- none[is.na(none$coral_mlg_clonal_id),];
-na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id));
-n.g <- df4$group[na.mlg2];
+na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id));
+n.g <- sample_mlg_match_tibble$group[na.mlg2];
 ct <- length(unique(n.g));
 
 # List of new group ids, the sequence starts at the number of
-# ids present in df4$coral_mlg_clonal_ids plus 1.  Not sure if
-# the df4 file contains all ids.  If it doesn't then look below
-# to change the seq() function.
-n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
-# Pair group with new ids.
-rat <- cbind(unique(n.g), n.g_ids);
+# ids present in sample_mlg_match_tibble$coral_mlg_clonal_ids
+# plus 1.
+# FIXME: Not sure if # the sample_mlg_match_tibble file
+# contains all ids.  If it doesn't then look below to change
+# the seq() function.
+n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
+
 # Assign the new id iteratively for all that have NA.
 for (i in 1:length(na.mlg2)) {
-    df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
+    sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))];
 }
 
-# Subset poptab for all samples.
-subpop <- poptab[c(2, 3)];
+# Subset population_info_data_table for all samples.
+# affy_id_user_specimen_id_vector looks like this:
+# affy_id                            user_specimen_id
+# a100000-4368120-060520-256_I07.CEL 13704
+# a100000-4368120-060520-256_K07.CEL 13706
+affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)];
 
 # Merge data frames for final table.
-report_user <- pi %>%
-    left_join(subpop %>%
+start_time <- time_start("Merging data frames");
+stag_db_report <- specimen_id_field_call_data_table %>%
+    left_join(affy_id_user_specimen_id_vector %>%
         select("affy_id", "user_specimen_id"),
         by="user_specimen_id") %>%
-    left_join(df4 %>%
+    left_join(sample_mlg_match_tibble %>%
         select("affy_id", "coral_mlg_clonal_id", "DB_match"),
         by="affy_id") %>%
-    left_join(mi %>%
+    left_join(missing_gt_data_table %>%
         select("affy_id", "percent_missing_data_coral"),
         by="affy_id") %>%
-    left_join(ht %>%
-        select("affy_id", "percent_mixed_coral"),
+    left_join(heterozygous_alleles_data_table %>%
+        select("affy_id", "percent_heterozygous_coral"),
         by="affy_id") %>%
-    left_join(rA %>%
+    left_join(reference_alleles_data_table %>%
         select("affy_id", "percent_reference_coral"),
         by="affy_id") %>%
-    left_join(aB %>%
+    left_join(alternative_alleles_data_table %>%
         select("affy_id", "percent_alternative_coral"),
         by="affy_id") %>%
     mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
     mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
-    mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 40 & percent_alternative_coral<= 44.5,"A.palmata","other")) %>%
-    mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral<= 50,"A.cervicornis",genetic_coral_species_call)) %>%
-    mutate(genetic_coral_species_call=ifelse(percent_heterozygous_coral > 40,"A.prolifera",genetic_coral_species_call)) %>%
+    mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.5, "A.palmata","other")) %>%
+    mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral <= 50, "A.cervicornis", genetic_coral_species_call)) %>%
+    mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>%
+    ungroup() %>%
+    select(-group);
+time_elapsed(start_time);
+
+start_time <- time_start("Writing csv output");
+write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE);
+time_elapsed(start_time);
+
+# Database sample table.
+sample_db <- affy_metadata_data_frame %>%
+  left_join(stag_db_report %>%
+      select("user_specimen_id","affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", "percent_reference_coral", "percent_alternative_coral"),
+      by='user_specimen_id');
+
+# Representative clone for genotype table.
+start_time <- time_start("Creating representative clone for genotype table");
+no_dup_genotypes_genind <- clonecorrect(genind_clone, strata = ~pop.genind_obj.);
+id_rep <- mlg.id(no_dup_genotypes_genind);
+id_data_table <- data.table(id_rep, keep.rownames=TRUE);
+# Rename the id_rep column.
+setnames(id_data_table, c("id_rep"), c("affy_id"));
+time_elapsed(start_time);
+
+# # Combine with previously genotyped samples in the database.
+start_time <- time_start("Selecting from various database tables");
+representative_mlg_tibble <- id_data_table %>%
+    group_by(row_number()) %>%
+    rename(group='row_number()') %>%
+    unnest(affy_id) %>%
+    left_join(stag_db_report %>%
+              select("coral_mlg_clonal_id", "user_specimen_id", "affy_id"),
+              by='affy_id') %>%
+    mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id), "", affy_id)) %>%
+    ungroup() %>%
+    select(-group);
+
+# Database genotype table.
+genotype_table_join <- sample_mlg_match_tibble %>%
+    left_join(representative_mlg_tibble %>%
+    select("affy_id", "coral_mlg_rep_sample_id", "user_specimen_id"),
+    by='affy_id') %>%
     ungroup() %>%
     select(-group);
 
-write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE);
-
-# Database tables
-## Sample.table
-sample_db <- pinfo %>%
-  left_join(
-    report_user %>%
-      select("user_specimen_id","affy_id",
-             "percent_missing_data_coral","percent_heterozygous_coral","percent_reference_coral",
-             "percent_alternative_coral"),
-    by='user_specimen_id');
-
-###representative clone for genotype.table
-cc<-clonecorrect(obj2, strata= ~pop.gind.);
-id_rep<-mlg.id(cc);
-dt_cc<-data.table(id_rep,keep.rownames = TRUE);
-setnames(dt_cc, c("id_rep"), c("affy_id"));
-
-###transform mlg data.table
-df_cc <- dt_cc %>%
-  group_by(row_number()) %>%
-  rename(group='row_number()') %>%
-  unnest(affy_id) %>%
-  left_join(report_user %>%
-              select("coral_mlg_clonal_id","user_specimen_id","affy_id"),
-            by='affy_id') %>%
-  mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id),"",affy_id)) %>%
-  ungroup() %>%
-  select(-group);
+# Database taxonomy table.
+taxonomy_table_join <- stag_db_report %>%
+    select(genetic_coral_species_call, affy_id) %>%
+    mutate(genus_name = ifelse(genetic_coral_species_call == genetic_coral_species_call[grep("^A.*", genetic_coral_species_call)], "Acropora", "other")) %>%
+    mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>%
+    mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>%
+    mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name));
+time_elapsed(start_time);
+# Table of alleles for the new samples subset to new plate data.
+# Create vector indicating number of individuals desired from
+# affy_id column of stag_db_report data table.
+i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]);
+i <- i[!apply(i== "", 1, all),];
+sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE];
 
-##geno.table
-geno_db <- df4 %>%
- left_join(df_cc %>%
-    select("affy_id","coral_mlg_rep_sample_id","user_specimen_id"),
-    by='affy_id') %>%
-  ungroup() %>%
-  select(-group);
-
-##taxonomy.table
-
-tax_db <- report_user %>%
-  select(genetic_coral_species_call, affy_id)  %>%
-  mutate(genus_name =ifelse(genetic_coral_species_call==
-                              genetic_coral_species_call[grep("^A.*",genetic_coral_species_call)],"Acropora","other")) %>%
-  mutate(species_name=ifelse(genetic_coral_species_call=="A.palmata","palmata","other"))%>%
-  mutate(species_name=ifelse(genetic_coral_species_call =="A.cervicornis","cervicornis",species_name))%>%
-  mutate(species_name=ifelse(genetic_coral_species_call=="A.prolifera","prolifera", species_name));
-
-
-# Table of alleles for the new samples
-## subset to new plate data
-### create vector indicating number of individuals desired
-### made from affy_id collumn from report_user data table
-i<-ifelse(is.na(report_user[1]),"",report_user[[1]]);
-i<-i[!apply(i == "", 1, all),];
-sub96<-obj2[i, mlg.reset = FALSE, drop = FALSE];
-
-# convert to data frame
-at_96<-genind2df(sub96, sep="");
-at_96<- at_96 %>%
-  select(-pop);
-
-# allele string for Allele.table in database
-uat_96<-unite(at_96, alleles, 1:19696, sep = " ", remove = TRUE);
-uat_96<-setDT(uat_96, keep.rownames = TRUE)[];
-setnames(uat_96, c("rn"), c("user_specimen_id"));
-# write.csv(uat_96,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE);
-
-# Create a phylogeny of samples based on distance matrices.
+# cols looks like this:
+#       blue1         red       green        pink      orange       blue2 
+# "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF" 
+#      green2      yellow   turquoise        poop 
+# "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF"
 cols <- piratepal("basel");
 set.seed(999);
-# Start PDF device driver.
-dev.new(width=10, height=7);
-file_path = get_file_path("nj_phylogeny.pdf");
-pdf(file=file_path, width=10, height=7);
-# Organize branches by clade.
-theTree <- sub96 %>%
-    aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>%
-    ladderize();
-theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)];
-plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T);
-# Add a scale bar showing 5% difference..
-add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3);
-nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
-legend("topright", legend=c(levels(sub96$pop)), text.col=cols, xpd=T, cex=0.8);
-dev.off()
 
-write.tree(theTree, file =opt$nj_tree, quote=FALSE);
-
-# identity-by-state analysis
-#if (!requireNamespace("BiocManager", quietly = TRUE))
-#  install.packages("BiocManager")
-#BiocManager::install("SNPRelate", version = "3.8")
+if (!is.null(opt$output_nj_phylogeny_tree)) {
+    # Create a phylogeny tree of samples based on distance matrices.
+    # Start PDF device driver.
+    start_time <- time_start("Creating nj_phylogeny_tree.pdf");
+    dev.new(width=10, height=7);
+    file_path = get_file_path("nj_phylogeny_tree.pdf");
+    pdf(file=file_path, width=10, height=7);
+    # Organize branches by clade.
+    nj_phylogeny_tree <- sample_alleles_vector %>%
+        aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>%
+        ladderize();
+    nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)];
+    plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T);
+    # Add a scale bar showing 5% difference.
+    add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3);
+    nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
+    legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8);
+    dev.off()
+    time_elapsed(start_time);
+}
 
-#subset VCF to the user samples
-l<-length(i);
-n<-ncol(vcf@gt);
-s<-n-l;
-svcf<-vcf[,s:n];
+# Subset VCF to the user samples.
+start_time <- time_start("Subsetting vcf to the user samples");
+l <- length(i);
+n <- ncol(vcf@gt);
+s <- n - l;
+svcf <- vcf[, s:n];
 write.vcf(svcf, "subset.vcf.gz");
-
 vcf.fn <- "subset.vcf.gz";
 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only");
-
-genofile <- snpgdsOpen(filename="test3.gds",  readonly=FALSE);
-hd<-read.gdsn(index.gdsn(genofile, "sample.id"));
-hd<-data.frame(hd);
-hd<-setDT(hd, keep.rownames = FALSE)[];
-setnames(hd, c("hd"), c("user_specimen_id"));
+genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE);
+gds_array <- read.gdsn(index.gdsn(genofile, "sample.id"));
+# gds_array looks like this:
+# [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL"
+# [3] "a550962-4368120-060520-500_A09.CEL" "a550962-4368120-060520-500_A11.CEL"
+gds_data_frame <- data.frame(gds_array);
+# gds_data_frame looks like this:
+# gds_array
+# a550962-4368120-060520-500_A03.CEL
+# a550962-4368120-060520-500_A05.CEL
+gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[];
+# Rename the gds_array column.
+setnames(gds_data_table, c("gds_array"), c("affy_id"));
+# affy_id_region_list looks like this:
+# affy_id                            region
+# a100000-4368120-060520-256_I07.CEL USVI
+# a100000-4368120-060520-256_K07.CEL USVI
+affy_id_region_list <- population_info_data_table[c(2,4)];
+gds_data_table_join <- gds_data_table %>%
+    left_join(affy_id_region_list %>%
+        select("affy_id", "region"),
+        by='affy_id')%>%
+        drop_na();
+samp.annot <- data.frame(pop.group=c(gds_data_table_join$region));
+add.gdsn(genofile, "sample.annot", samp.annot);
+# population_code looks like this:
+# [1] 18.361733   18.361733   18.361733   18.361733   18.361733   18.361733  
+# [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
+population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"));
+pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group")));
+# pop.group looks like this:
+# [1] 18.361733   18.361733   18.361733   18.361733   18.361733   18.361733  
+# [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
+time_elapsed(start_time);
 
-subpop2<- poptab[c(2,4)];
-poptab_sub <- hd %>%
-  left_join(
-    subpop2 %>%
-      select("affy_id","region"),
-    by='affy_id')%>%
-    drop_na();
+# Distance matrix calculation.
+start_time <- time_start("Calculating distance matrix");
+ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE);
+time_elapsed(start_time);
 
-samp.annot <- data.frame(pop.group = c(poptab_sub$region));
-add.gdsn(genofile, "sample.annot", samp.annot);
-
-pop_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"));
-pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group")));
-
-# Identity-By-State Analysis - distance matrix calculation
-ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE);
-
-# cluster analysis on the genome-wide IBS pairwise distance matrix
+# Cluster analysis on the genome-wide IBS pairwise distance matrix.
+start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix");
 set.seed(100);
 par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2);
 ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE));
+time_elapsed(start_time);
 
-# default clustering.
+# Default clustering.
+start_time <- time_start("Creating ibs_default.pdf");
+# Start PDF device driver.
 dev.new(width=10, height=7);
-file_path = get_file_path("IBS_default.pdf");
-pdf (file=file_path, width=10, height=7);
+file_path = get_file_path("ibs_default.pdf");
+pdf(file=file_path, width=10, height=7);
 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15);
-snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular",y.label=0.2);
+snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", y.label=0.2);
 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2);
 dev.off()
+time_elapsed(start_time);
 
-# color cluster by region.
+# Color cluster by region.
+start_time <- time_start("Creating ibs_region.pdf");
+# Start PDF device driver.
 dev.new(width=10, height=7);
-file_path = get_file_path("IBS_Region.pdf");
-pdf (file=file_path, width=10, height=7);
-race <- as.factor(pop_code);
-rv2 <- snpgdsCutTree(ibs.hc,samp.group=race,col.list=cols,pch.list=15);
-snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular",y.label=0.2);
+file_path = get_file_path("ibs_region.pdf");
+pdf(file=file_path, width=10, height=7);
+race <- as.factor(population_code);
+rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15);
+snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", y.label=0.2);
 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2);
 dev.off()
+time_elapsed(start_time);
 
-#close GDS file
+# close GDS file.
 snpgdsClose(genofile);
 
 # Sample MLG on a map.
-world <- ne_countries(scale = "medium", returnclass = "sf");
-class(world);
-
-pinfo$mlg<-report_user$coral_mlg_clonal_id;
-n <- nrow(pinfo);
-
-mxlat<-max(pinfo$latitude,na.rm = TRUE);
-mnlat<-min(pinfo$latitude,na.rm = TRUE);
-mxlong<-max(pinfo$longitude,na.rm = TRUE);
-mnlong<-min(pinfo$longitude,na.rm = TRUE);
-
-p5<-ggplot(data = world) +
-  geom_sf() +
-  coord_sf(xlim = c(mnlong-3, mxlong+3), ylim = c(mnlat-3,mxlat+3), expand = FALSE);
-
-colourCount = length(unique(pinfo$mlg));
-getPalette = colorRampPalette(piratepal("basel"));
+start_time <- time_start("Creating mlg_map.pdf");
+# Get the lattitude and longitude boundaries for rendering
+# the map.  Tese boundaries will restrict the map to focus
+# (i.e., zoom) on the region of the world map from which
+# the samples were taken.
+max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE);
+min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE);
+latitude_range_vector <- c(min_latitude-3, max_latitude+3);
+max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE);
+min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE);
+longitude_range_vector <- c(min_longitude-3, max_longitude+3);
+# Get the palette colors for rendering plots.
+colors <- length(unique(stag_db_report$coral_mlg_clonal_id));
+# Get a color palette.
+palette <- colorRampPalette(piratepal("basel"));
+# Start PDF device driver.
 dev.new(width=10, height=7);
 file_path = get_file_path("mlg_map.pdf");
-pdf (file=file_path, width=10, height=7);
-p6<-p5+ geom_point(data = pinfo,aes(x =longitude, y=latitude, group=mlg, color = mlg), alpha=.7, size=3)+
-  scale_color_manual(values=getPalette(colourCount))+
-  theme(legend.position="bottom")+
-  guides(color=guide_legend(nrow=8,byrow=F));
-p6;
+pdf(file=file_path, width=10, height=7);
+world_data = map_data("world");
+# Add the coral_mlg_clonal_id column from the stag_db_report
+# data fram to the affy_metadata_data_frame.
+affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id;
+# Get the number of colors needed from the palette for plotting
+# the sample locations on the world map.
+num_colors = length(unique(affy_metadata_data_frame$mlg));
+# Get a color palette.
+palette = colorRampPalette(piratepal("basel"));
+ggplot() + 
+geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") +
+coord_map(xlim=longitude_range_vector, ylim=latitude_range_vector) +
+geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, color=mlg), alpha=.7, size=3) +
+scale_color_manual(values=palette(num_colors)) +
+theme(legend.position="bottom") +
+guides(color=guide_legend(nrow=8, byrow=F));
 dev.off()
+time_elapsed(start_time);
 
 # Missing data barplot.
-poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)];
-test2 <- which(!is.na(poptab$miss));
-miss96 <- poptab$miss[test2];
-name96 <- poptab$user_specimen_id[test2];
+start_time <- time_start("Creating missing_data.pdf");
+population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)];
+test2 <- which(!is.na(population_info_data_table$miss));
+miss96 <- population_info_data_table$miss[test2];
+name96 <- population_info_data_table$user_specimen_id[test2];
+# Start PDF device driver.
 dev.new(width=10, height=7);
 file_path = get_file_path("missing_data.pdf");
-pdf (file=file_path, width=10, height=7);
+pdf(file=file_path, width=10, height=7);
 par(mar = c(8, 4, 4, 2));
 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n");
 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
 dev.off()
+time_elapsed(start_time);
 
 # Generate a pie chart for each sample with a genotype.
 # Store the numerical and user_specimen_id values from
-# report_user for the charts (user_specimen_id names
+# stag_db_report for the charts (user_specimen_id names
 # will be used to label each chart).
-dt1 <- data.table(report_user);
-dt1 <- report_user[c(-2, -3, -4)];
-dt1 <- na.omit(dt1);
-# Translate to N (i.e., number of samples with a
-# genotype) columns and 5 rows.
-tdt1 <- t(dt1);
-# Make another data table and transpose it the same as dt1 to
-# get numerics. These will feed into the creation of N vectors.
-dt2 <- data.table(report_user);
-dt2 <- report_user[c(-1, -2, -3, -4)];
-# Translate to N columns and 5 rows.
-tdt2 <- t(dt2);
-tdt1_matrix <- as.matrix(tdt1[-1,]);
-# The number of columns is the number of samples with genotypes.
-nc <- ncol(tdt1_matrix);
-mode(tdt1_matrix) <- "numeric";
-spy <- rowMeans(tdt1_matrix);
+start_time <- time_start("Creating percent_breakdown.pdf");
+stag_db_report_data_table <- stag_db_report[c(-2, -3, -4)];
+# Remove NA and NaN values.
+stag_db_report_data_table <- na.omit(stag_db_report_data_table);
+# Translate to N (i.e., number of samples with a genotype)
+# columns and 5 rows.
+translated_stag_db_report_data_table <- t(stag_db_report_data_table);
+translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]);
+# Set the storage mode of the matrix to numeric.  In some
+# cases this could result in the following:
+# Warning message:
+# In mde(x) : NAs introduced by coercion
+mode(translated_stag_db_report_matrix) <- "numeric";
+# Remove NA and NaN values that may have been introduced
+# by coercion.
+translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix);
+tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE);
 dev.new(width=10, height=7);
 file_path = get_file_path("percent_breakdown.pdf");
 pdf(file=file_path, width=10, height=7);
 # Average pie of all samples.
-labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep="");
+labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(tsdbrm_row_means, 1), "%)", sep="");
 col <- c("GREY", "#006DDB", "#24FF24", "#920000");
 main <- "Average breakdown of SNP assignments across all samples";
-pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75);
+pie(tsdbrm_row_means, labels=labels, radius=0.60, col=col, main=main, cex.main=.75);
 par(mfrow=c(3, 2));
 col <- c("GREY", "#006DDB", "#24FF24", "#920000");
-for (i in 1:nc) {
-    tmp_labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep="");
-    main <- paste("Breakdown of SNP assignments for", tdt1[1, i]);
-    pie(tdt1_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
+# Generate a pie chart for each sample with genotypes.
+for (i in 1:ncol(translated_stag_db_report_matrix)) {
+    tmp_labels <- paste(labels, " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep="");
+    main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]);
+    pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
 }
 dev.off()
+time_elapsed(start_time);
author	greg
date	Mon, 13 May 2019 08:37:37 -0400
parents	85f8fc57eee4
children