multilocus_genotype: multilocus_genotype.R comparison

comparison multilocus_genotype.R @ 18:1190ee1456f6 draft default tip

Uploaded

author	greg
date	Mon, 13 May 2019 08:37:37 -0400
parents	85f8fc57eee4
children

comparison

equal deleted inserted replaced

-:85f8fc57eee4
+:1190ee1456f6
 suppressPackageStartupMessages(library("data.table"))
 suppressPackageStartupMessages(library("dbplyr"))
 suppressPackageStartupMessages(library("dplyr"))
 suppressPackageStartupMessages(library("ggplot2"))
 suppressPackageStartupMessages(library("knitr"))
+suppressPackageStartupMessages(library("maps"))
+suppressPackageStartupMessages(library("mapproj"))
 suppressPackageStartupMessages(library("optparse"))
 suppressPackageStartupMessages(library("poppr"))
 suppressPackageStartupMessages(library("RColorBrewer"))
-suppressPackageStartupMessages(library("rnaturalearth"))
-suppressPackageStartupMessages(library("rnaturalearthdata"))
 suppressPackageStartupMessages(library("RPostgres"))
-suppressPackageStartupMessages(library("sf"))
+suppressPackageStartupMessages(library("SNPRelate"))
-suppressPackageStartupMessages(library(SNPRelate))
 suppressPackageStartupMessages(library("tidyr"))
 suppressPackageStartupMessages(library("vcfR"))
 suppressPackageStartupMessages(library("vegan"))
 suppressPackageStartupMessages(library("yarrr"))
 theme_set(theme_bw())
 option_list <- list(
 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"),
 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"),
 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
-make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file"),
+make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"),
-make_option(c("--nj_tree"), action="store", dest="nj_tree", help="neighbor-joining tree output file")
+make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file")
 )
 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
 args <- parse_args(parser, positional_arguments=TRUE);
 opt <- args$options;
 # FIXME: is there a way to not hard-code the port?
 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass);
 return (conn);
 }
+time_elapsed <- function(start_time) {
+cat("Elapsed time: ", proc.time() - start_time, "\n\n");
+}
+time_start <- function(msg) {
+start_time <- proc.time();
+cat(msg, "...\n");
+return(start_time);
+}
 # Read in VCF input file.
+start_time <- time_start("Reading VCF input");
 vcf <- read.vcfR(opt$input_vcf);
+time_elapsed(start_time);
 # Convert VCF file into a genind for the Poppr package.
-# TODO: probably should not hard-code 2 cores.
+start_time <- time_start("Converting VCF data to a genind object");
-# changed to genind format for extracting alleles later
+genind_obj <- vcfR2genind(vcf);
-# trade-off is it is a bit slower to import data
+time_elapsed(start_time);
-# gl <- vcfR2genlight(vcf, n.cores=2)
-# gind <- new("genind", (as.matrix(gl)))
-gind <- vcfR2genind(vcf);
 # Add population information to the genind object.
-poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t");
+population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t");
-colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region");
+colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region");
-gind@pop <- as.factor(poptab$region);
+genind_obj@pop <- as.factor(population_info_data_table$region);
-strata(gind)<-data.frame(pop(gind));
+strata(genind_obj) <- data.frame(pop(genind_obj));
 # Convert genind object to a genclone object.
-obj2 <- as.genclone(gind);
+start_time <- time_start("Converting the genind object to a genclone object");
+genind_clone <- as.genclone(genind_obj);
+time_elapsed(start_time);
 # Calculate the bitwise distance between individuals.
-xdis <- bitwise.dist(obj2);
+start_time <- time_start("Calculating the bitwise distance between individuals");
+bitwise_distance <- bitwise.dist(genind_clone);
+time_elapsed(start_time);
 # Multilocus genotypes (threshold of 3.2%).
-# threshold doubled because of how the data is formatted in genind compared to genlight
+mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032;
-mlg.filter(obj2, distance=xdis) <- 0.032;
+m <- mlg.table(genind_clone, background=TRUE, color=TRUE);
-m <- mlg.table(obj2, background=TRUE, color=TRUE);
+# Create list of MLGs.
-# Create table of MLGs.
+mlg_ids <- mlg.id(genind_clone);
-id <- mlg.id(obj2);
-#dt <- data.table(id, keep.rownames=TRUE);
-#setnames(dt, c("id"), c("affy_id"));
 # Read user's Affymetrix 96 well plate tabular file.
-pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA"));
+affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA"));
-colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
+colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
 "region", "latitude", "longitude", "geographic_origin", "sample_location",
 "latitude_outplant", "longitude_outplant", "depth", "disease_resist",
 "bleach_resist", "mortality","tle", "spawning", "collector_last_name",
 "collector_first_name", "organization", "collection_date", "email", "seq_facility",
 "array_version", "public", "public_after_date", "sperm_motility", "healing_time",
 "dna_extraction_method", "dna_concentration", "registry_id");
-pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
+affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id);
-pinfo2 <- as.character(pinfo$user_specimen_id);
+user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id);
-pi <- data.table(pinfo2, pinfo$field_call);
+# The specimen_id_field_call_data_table looks like this:
-setnames(pi, c("pinfo2"), c("user_specimen_id"));
+# user_specimen_ids V2
-setnames(pi, c("V2"), c("field_call"));
+# test_002          prolifera
+# test_003          prolifera
+specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call);
+# Rename the user_specimen_ids column.
+setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id"));
+# Rename the V2 column.
+setnames(specimen_id_field_call_data_table, c("V2"), c("field_call"));
 # Connect to database.
 conn <- get_database_connection(opt$database_connection_string);
 # Import the sample table.
 sample_table <- tbl(conn, "sample");
 # Import the genotype table.
 genotype_table <- tbl(conn, "genotype");
 # Select columns from the sample table and the
 # genotype table joined by genotype_id.
 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id);
 smlg <- sample_table_columns %>%
 left_join(genotype_table %>%
 select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"),
-by=c("genotype_id" = "id"));
+by=c("genotype_id"="id"));
 # Convert to dataframe.
-sm <- data.frame(smlg);
+smlg_data_frame <- data.frame(smlg);
 # Name the columns.
-colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id");
+colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id");
 # Missing GT in samples submitted.
+start_time <- time_start("Discovering missing GT in samples");
 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
-myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
+missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
-myMiss <- (myMiss / nrow(vcf)) * 100;
+missing_gt <- (missing_gt / nrow(vcf)) * 100;
-miss <- data.frame(myMiss);
+missing_gt_data_frame <- data.frame(missing_gt);
+# The missing_gt_data_table looks like this:
-# Convert missing data into data table.
+# rn                                 missing_gt
-mi <-setDT(miss, keep.rownames=TRUE)[];
+# a100000-4368120-060520-256_I07.CEL 0.06092608
-setnames(mi, c("rn"), c("affy_id"));
+# a100000-4368120-060520-256_K07.CEL 0.05077173
-setnames(mi, c("myMiss"), c("percent_missing_data_coral"));
+missing_gt_data_table <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[];
-# Round missing data to two digits.
+# Rename the rn column.
-mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2);
+setnames(missing_gt_data_table, c("rn"), c("affy_id"));
+# Rename the missing_gt column.
-#heterozygous alleles
+setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral"));
-hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} );
+# Round data to two digits.
-hets <- (hets / nrow(vcf)) * 100;
+missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2);
-ht <- data.frame(hets);
+time_elapsed(start_time);
-# Convert heterozygosity data into data table.
+# Heterozygous alleles.
-ht <-setDT(ht, keep.rownames=TRUE)[];
+start_time <- time_start("Discovering heterozygous alleles");
-setnames(ht, c("rn"), c("affy_id"));
+heterozygous_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))});
-setnames(ht, c("hets"), c("percent_mixed_coral"));
+heterozygous_alleles <- (heterozygous_alleles / nrow(vcf)) * 100;
-# Round missing data to two digits.
+heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles);
-ht$percent_mixed<-round(ht$percent_mixed, digits=2);
+# The heterozygous_alleles_data_table looks like this:
+# rn                                 heterozygous_alleles
-#reference alleles
+# a100000-4368120-060520-256_I07.CEL 73.94903
-refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} );
+# a100000-4368120-060520-256_K07.CEL 74.40089
-refA <- (refA / nrow(vcf)) * 100;
+heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[];
-rA <- data.frame(refA);
+# Rename the rn column.
+setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id"));
-# Convert refA data into data.table.
+# Rename the heterozygous_alleles column.
-rA <-setDT(rA, keep.rownames=TRUE)[];
+setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral"));
-setnames(rA, c("rn"), c("affy_id"));
+# Round data to two digits.
-setnames(rA, c("refA"), c("percent_reference_coral"));
+heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2);
-# round missing data to two digits.
+time_elapsed(start_time);
-rA$percent_reference<-round(rA$percent_reference, digits=2);
+# Reference alleles.
-#alternative alleles
+start_time <- time_start("Discovering reference alleles");
-altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} );
+reference_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))});
-altB <- (altB / nrow(vcf)) * 100;
+reference_alleles <- (reference_alleles / nrow(vcf)) * 100;
-aB <- data.frame(altB);
+reference_alleles_data_frame <- data.frame(reference_alleles);
+# The reference_alleles_data_table looks like this:
-# Convert altB data into data table.
+# rn                                 reference_alleles
-aB <-setDT(aB, keep.rownames=TRUE)[];
+# a100000-4368120-060520-256_I07.CEL 11.60642
-setnames(aB, c("rn"), c("affy_id"));
+# a100000-4368120-060520-256_K07.CEL 11.45918
-setnames(aB, c("altB"), c("percent_alternative_coral"));
+reference_alleles_data_table  <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[];
-# Round missing data to two digits.
+# Rename the rn column.
-aB$percent_alternative<-round(aB$percent_alternative, digits=2);
+setnames(reference_alleles_data_table, c("rn"), c("affy_id"));
+# Rename the reference_alleles column.
-#convert mlg id to data.table format
+setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral"));
-dt <- data.table(id, keep.rownames=TRUE);
+# Round data to two digits.
-setnames(dt, c("id"), c("affy_id"));
+reference_alleles_data_table$percent_reference <- round(reference_alleles_data_table$percent_reference, digits=2);
+time_elapsed(start_time);
-# Transform.
-df3 <- dt %>%
+# Alternative alleles
+start_time <- time_start("Discovering alternative alleles");
+alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))});
+alternative_alleles <- (alternative_alleles / nrow(vcf)) * 100;
+alternative_alleles_data_frame <- data.frame(alternative_alleles);
+# The alternative_alleles_data_table looks like this:
+# rn                                 alternative_alleles
+# a100000-4368120-060520-256_I07.CEL 14.38363
+# a100000-4368120-060520-256_K07.CEL 14.08916
+alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[];
+# Rename the rn column.
+setnames(alternative_alleles_data_table, c("rn"), c("affy_id"));
+# Rename the alternative_alleles column.
+setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral"));
+# Round data to two digits.
+alternative_alleles_data_table$percent_alternative <- round(alternative_alleles_data_table$percent_alternative, digits=2);
+time_elapsed(start_time);
+# The mlg_ids_data_table looks like this:
+# mlg_ids
+# a550962-4368120-060520-500_M23.CEL
+# a550962-4368120-060520-256_A19.CEL
+mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE);
+# Rename the mlg_ids column.
+setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id"));
+# sample_mlg_tibble looks like this:
+# A tibble: 262 x 3
+# Groups:   group [?]
+# group affy_id                            coral_mlg_clonal_id
+# <int> <chr>                              <chr>
+# 1     a550962-4368120-060520-500_M23.CEL NA
+# 2     a550962-4368120-060520-256_A19.CEL HG0006
+sample_mlg_tibble <- mlg_ids_data_table %>%
 group_by(row_number()) %>%
 dplyr::rename(group="row_number()") %>%
 unnest (affy_id) %>%
 # Join with mlg table.
-left_join(sm %>%
+left_join(smlg_data_frame %>%
 select("affy_id","coral_mlg_clonal_id"),
 by="affy_id");
 # If found in database, group members on previous mlg id.
-uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]);
+uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]);
 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),];
-na.mlg <- which(is.na(df3$coral_mlg_clonal_id));
+na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id));
-na.group <- df3$group[na.mlg];
+na.group <- sample_mlg_tibble$group[na.mlg];
-df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
+sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
-# Determine if the sample mlg matched previous genotyped sample.
+# Find out if the sample mlg matched a previous genotyped sample.
-df4<- df3 %>%
+# sample_mlg_match_tibble looks like this:
+# A tibble: 262 x 4
+# Groups:   group [230]
+# group affy_id                            coral_mlg_clonal_id DB_match
+# <int> <chr>                              <chr>               <chr>
+# 1     a550962-4368120-060520-500_M23.CEL NA                  no_match
+# 2     a550962-4368120-060520-256_A19.CEL HG0006              match
+sample_mlg_match_tibble <- sample_mlg_tibble %>%
 group_by(group) %>%
 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match"));
-# Create new mlg id for samples that did not match those in the database.
+# Create new mlg id for samples with no matches in the database.
-none <- unique(df4[c("group", "coral_mlg_clonal_id")]);
+none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]);
 none <- none[is.na(none$coral_mlg_clonal_id),];
-na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id));
+na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id));
-n.g <- df4$group[na.mlg2];
+n.g <- sample_mlg_match_tibble$group[na.mlg2];
 ct <- length(unique(n.g));
 # List of new group ids, the sequence starts at the number of
-# ids present in df4$coral_mlg_clonal_ids plus 1.  Not sure if
+# ids present in sample_mlg_match_tibble$coral_mlg_clonal_id
-# the df4 file contains all ids.  If it doesn't then look below
+# plus 1.
-# to change the seq() function.
+# FIXME: Not sure if the sample_mlg_match_tibble file
-n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
+# contains all ids.  If it doesn't then look below to change
-# Pair group with new ids.
+# the seq() function.
-rat <- cbind(unique(n.g), n.g_ids);
+n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
 # Assign the new id iteratively for all that have NA.
 for (i in 1:length(na.mlg2)) {
-df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
+sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))];
 }
-# Subset poptab for all samples.
+# Subset population_info_data_table for all samples.
-subpop <- poptab[c(2, 3)];
+# affy_id_user_specimen_id_vector looks like this:
+# affy_id                            user_specimen_id
+# a100000-4368120-060520-256_I07.CEL 13704
+# a100000-4368120-060520-256_K07.CEL 13706
+affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)];
 # Merge data frames for final table.
-report_user <- pi %>%
+start_time <- time_start("Merging data frames");
-left_join(subpop %>%
+stag_db_report <- specimen_id_field_call_data_table %>%
+left_join(affy_id_user_specimen_id_vector %>%
 select("affy_id", "user_specimen_id"),
 by="user_specimen_id") %>%
-left_join(df4 %>%
+left_join(sample_mlg_match_tibble %>%
 select("affy_id", "coral_mlg_clonal_id", "DB_match"),
 by="affy_id") %>%
-left_join(mi %>%
+left_join(missing_gt_data_table %>%
 select("affy_id", "percent_missing_data_coral"),
 by="affy_id") %>%
-left_join(ht %>%
+left_join(heterozygous_alleles_data_table %>%
-select("affy_id", "percent_mixed_coral"),
+select("affy_id", "percent_heterozygous_coral"),
 by="affy_id") %>%
-left_join(rA %>%
+left_join(reference_alleles_data_table %>%
 select("affy_id", "percent_reference_coral"),
 by="affy_id") %>%
-left_join(aB %>%
+left_join(alternative_alleles_data_table %>%
 select("affy_id", "percent_alternative_coral"),
 by="affy_id") %>%
 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
-mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 40 & percent_alternative_coral<= 44.5,"A.palmata","other")) %>%
+mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.5, "A.palmata","other")) %>%
-mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral<= 50,"A.cervicornis",genetic_coral_species_call)) %>%
+mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral <= 50, "A.cervicornis", genetic_coral_species_call)) %>%
-mutate(genetic_coral_species_call=ifelse(percent_heterozygous_coral > 40,"A.prolifera",genetic_coral_species_call)) %>%
+mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>%
 ungroup() %>%
 select(-group);
+time_elapsed(start_time);
-write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE);
+start_time <- time_start("Writing csv output");
-# Database tables
+write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE);
-## Sample.table
+time_elapsed(start_time);
-sample_db <- pinfo %>%
-left_join(
+# Database sample table.
-report_user %>%
+sample_db <- affy_metadata_data_frame %>%
-select("user_specimen_id","affy_id",
+left_join(stag_db_report %>%
-"percent_missing_data_coral","percent_heterozygous_coral","percent_reference_coral",
+select("user_specimen_id","affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", "percent_reference_coral", "percent_alternative_coral"),
-"percent_alternative_coral"),
+by='user_specimen_id');
-by='user_specimen_id');
+# Representative clone for genotype table.
-###representative clone for genotype.table
+start_time <- time_start("Creating representative clone for genotype table");
-cc<-clonecorrect(obj2, strata= ~pop.gind.);
+no_dup_genotypes_genind <- clonecorrect(genind_clone, strata = ~pop.genind_obj.);
-id_rep<-mlg.id(cc);
+id_rep <- mlg.id(no_dup_genotypes_genind);
-dt_cc<-data.table(id_rep,keep.rownames = TRUE);
+id_data_table <- data.table(id_rep, keep.rownames=TRUE);
-setnames(dt_cc, c("id_rep"), c("affy_id"));
+# Rename the id_rep column.
+setnames(id_data_table, c("id_rep"), c("affy_id"));
-###transform mlg data.table
+time_elapsed(start_time);
-df_cc <- dt_cc %>%
-group_by(row_number()) %>%
+# Combine with previously genotyped samples in the database.
-rename(group='row_number()') %>%
+start_time <- time_start("Selecting from various database tables");
-unnest(affy_id) %>%
+representative_mlg_tibble <- id_data_table %>%
-left_join(report_user %>%
+group_by(row_number()) %>%
-select("coral_mlg_clonal_id","user_specimen_id","affy_id"),
+rename(group='row_number()') %>%
-by='affy_id') %>%
+unnest(affy_id) %>%
-mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id),"",affy_id)) %>%
+left_join(stag_db_report %>%
-ungroup() %>%
+select("coral_mlg_clonal_id", "user_specimen_id", "affy_id"),
-select(-group);
+by='affy_id') %>%
+mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id), "", affy_id)) %>%
-##geno.table
+ungroup() %>%
-geno_db <- df4 %>%
+select(-group);
-left_join(df_cc %>%
-select("affy_id","coral_mlg_rep_sample_id","user_specimen_id"),
+# Database genotype table.
+genotype_table_join <- sample_mlg_match_tibble %>%
+left_join(representative_mlg_tibble %>%
+select("affy_id", "coral_mlg_rep_sample_id", "user_specimen_id"),
 by='affy_id') %>%
 ungroup() %>%
 select(-group);
-##taxonomy.table
+# Database taxonomy table.
+taxonomy_table_join <- stag_db_report %>%
-tax_db <- report_user %>%
+select(genetic_coral_species_call, affy_id) %>%
-select(genetic_coral_species_call, affy_id)  %>%
+mutate(genus_name = ifelse(genetic_coral_species_call == genetic_coral_species_call[grep("^A.*", genetic_coral_species_call)], "Acropora", "other")) %>%
-mutate(genus_name =ifelse(genetic_coral_species_call==
+mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>%
-genetic_coral_species_call[grep("^A.*",genetic_coral_species_call)],"Acropora","other")) %>%
+mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>%
-mutate(species_name=ifelse(genetic_coral_species_call=="A.palmata","palmata","other"))%>%
+mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name));
-mutate(species_name=ifelse(genetic_coral_species_call =="A.cervicornis","cervicornis",species_name))%>%
+time_elapsed(start_time);
-mutate(species_name=ifelse(genetic_coral_species_call=="A.prolifera","prolifera", species_name));
+# Table of alleles for the new samples subset to new plate data.
+# Create vector indicating number of individuals desired from
+# affy_id column of stag_db_report data table.
-# Table of alleles for the new samples
+i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]);
-## subset to new plate data
+i <- i[!apply(i== "", 1, all),];
-### create vector indicating number of individuals desired
+sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE];
-### made from affy_id collumn from report_user data table
-i<-ifelse(is.na(report_user[1]),"",report_user[[1]]);
+# cols looks like this:
-i<-i[!apply(i == "", 1, all),];
+#       blue1         red       green        pink      orange       blue2
-sub96<-obj2[i, mlg.reset = FALSE, drop = FALSE];
+# "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF"
+#      green2      yellow   turquoise        poop
-# convert to data frame
+# "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF"
-at_96<-genind2df(sub96, sep="");
-at_96<- at_96 %>%
-select(-pop);
-# allele string for Allele.table in database
-uat_96<-unite(at_96, alleles, 1:19696, sep = " ", remove = TRUE);
-uat_96<-setDT(uat_96, keep.rownames = TRUE)[];
-setnames(uat_96, c("rn"), c("user_specimen_id"));
-# write.csv(uat_96,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE);
-# Create a phylogeny of samples based on distance matrices.
 cols <- piratepal("basel");
 set.seed(999);
-# Start PDF device driver.
-dev.new(width=10, height=7);
+if (!is.null(opt$output_nj_phylogeny_tree)) {
-file_path = get_file_path("nj_phylogeny.pdf");
+# Create a phylogeny tree of samples based on distance matrices.
-pdf(file=file_path, width=10, height=7);
+# Start PDF device driver.
-# Organize branches by clade.
+start_time <- time_start("Creating nj_phylogeny_tree.pdf");
-theTree <- sub96 %>%
+dev.new(width=10, height=7);
-aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>%
+file_path = get_file_path("nj_phylogeny_tree.pdf");
-ladderize();
+pdf(file=file_path, width=10, height=7);
-theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)];
+# Organize branches by clade.
-plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T);
+nj_phylogeny_tree <- sample_alleles_vector %>%
-# Add a scale bar showing 5% difference..
+aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>%
-add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3);
+ladderize();
-nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
+nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)];
-legend("topright", legend=c(levels(sub96$pop)), text.col=cols, xpd=T, cex=0.8);
+plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T);
-dev.off()
+# Add a scale bar showing 5% difference.
+add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3);
-write.tree(theTree, file =opt$nj_tree, quote=FALSE);
+nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
+legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8);
-# identity-by-state analysis
+dev.off()
-#if (!requireNamespace("BiocManager", quietly = TRUE))
+time_elapsed(start_time);
-#  install.packages("BiocManager")
+}
-#BiocManager::install("SNPRelate", version = "3.8")
+# Subset VCF to the user samples.
-#subset VCF to the user samples
+start_time <- time_start("Subsetting vcf to the user samples");
-l<-length(i);
+l <- length(i);
-n<-ncol(vcf@gt);
+n <- ncol(vcf@gt);
-s<-n-l;
+s <- n - l;
-svcf<-vcf[,s:n];
+svcf <- vcf[, s:n];
 write.vcf(svcf, "subset.vcf.gz");
 vcf.fn <- "subset.vcf.gz";
 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only");
+genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE);
-genofile <- snpgdsOpen(filename="test3.gds",  readonly=FALSE);
+gds_array <- read.gdsn(index.gdsn(genofile, "sample.id"));
-hd<-read.gdsn(index.gdsn(genofile, "sample.id"));
+# gds_array looks like this:
-hd<-data.frame(hd);
+# [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL"
-hd<-setDT(hd, keep.rownames = FALSE)[];
+# [3] "a550962-4368120-060520-500_A09.CEL" "a550962-4368120-060520-500_A11.CEL"
-setnames(hd, c("hd"), c("user_specimen_id"));
+gds_data_frame <- data.frame(gds_array);
+# gds_data_frame looks like this:
-subpop2<- poptab[c(2,4)];
+# gds_array
-poptab_sub <- hd %>%
+# a550962-4368120-060520-500_A03.CEL
-left_join(
+# a550962-4368120-060520-500_A05.CEL
-subpop2 %>%
+gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[];
-select("affy_id","region"),
+# Rename the gds_array column.
-by='affy_id')%>%
+setnames(gds_data_table, c("gds_array"), c("affy_id"));
-drop_na();
+# affy_id_region_list looks like this:
+# affy_id                            region
-samp.annot <- data.frame(pop.group = c(poptab_sub$region));
+# a100000-4368120-060520-256_I07.CEL USVI
+# a100000-4368120-060520-256_K07.CEL USVI
+affy_id_region_list <- population_info_data_table[c(2,4)];
+gds_data_table_join <- gds_data_table %>%
+left_join(affy_id_region_list %>%
+select("affy_id", "region"),
+by='affy_id')%>%
+drop_na();
+samp.annot <- data.frame(pop.group=c(gds_data_table_join$region));
 add.gdsn(genofile, "sample.annot", samp.annot);
+# population_code looks like this:
-pop_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"));
+# [1] 18.361733   18.361733   18.361733   18.361733   18.361733   18.361733
+# [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
+population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"));
 pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group")));
+# pop.group looks like this:
-# Identity-By-State Analysis - distance matrix calculation
+# [1] 18.361733   18.361733   18.361733   18.361733   18.361733   18.361733
+# [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
+time_elapsed(start_time);
+# Distance matrix calculation.
+start_time <- time_start("Calculating distance matrix");
 ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE);
+time_elapsed(start_time);
-# cluster analysis on the genome-wide IBS pairwise distance matrix
+# Cluster analysis on the genome-wide IBS pairwise distance matrix.
+start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix");
 set.seed(100);
 par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2);
 ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE));
+time_elapsed(start_time);
-# default clustering.
+# Default clustering.
+start_time <- time_start("Creating ibs_default.pdf");
+# Start PDF device driver.
 dev.new(width=10, height=7);
-file_path = get_file_path("IBS_default.pdf");
+file_path = get_file_path("ibs_default.pdf");
-pdf (file=file_path, width=10, height=7);
+pdf(file=file_path, width=10, height=7);
 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15);
-snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular",y.label=0.2);
+snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", y.label=0.2);
 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2);
 dev.off()
+time_elapsed(start_time);
-# color cluster by region.
+# Color cluster by region.
+start_time <- time_start("Creating ibs_region.pdf");
+# Start PDF device driver.
 dev.new(width=10, height=7);
-file_path = get_file_path("IBS_Region.pdf");
+file_path = get_file_path("ibs_region.pdf");
-pdf (file=file_path, width=10, height=7);
+pdf(file=file_path, width=10, height=7);
-race <- as.factor(pop_code);
+race <- as.factor(population_code);
-rv2 <- snpgdsCutTree(ibs.hc,samp.group=race,col.list=cols,pch.list=15);
+rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15);
-snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular",y.label=0.2);
+snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", y.label=0.2);
 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2);
 dev.off()
+time_elapsed(start_time);
-#close GDS file
+# Close GDS file.
 snpgdsClose(genofile);
 # Sample MLG on a map.
-world <- ne_countries(scale = "medium", returnclass = "sf");
+start_time <- time_start("Creating mlg_map.pdf");
-class(world);
+# Get the latitude and longitude boundaries for rendering
+# the map.  These boundaries will restrict the map to focus
-pinfo$mlg<-report_user$coral_mlg_clonal_id;
+# (i.e., zoom) on the region of the world map from which
-n <- nrow(pinfo);
+# the samples were taken.
+max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE);
-mxlat<-max(pinfo$latitude,na.rm = TRUE);
+min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE);
-mnlat<-min(pinfo$latitude,na.rm = TRUE);
+latitude_range_vector <- c(min_latitude-3, max_latitude+3);
-mxlong<-max(pinfo$longitude,na.rm = TRUE);
+max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE);
-mnlong<-min(pinfo$longitude,na.rm = TRUE);
+min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE);
+longitude_range_vector <- c(min_longitude-3, max_longitude+3);
-p5<-ggplot(data = world) +
+# Get the palette colors for rendering plots.
-geom_sf() +
+colors <- length(unique(stag_db_report$coral_mlg_clonal_id));
-coord_sf(xlim = c(mnlong-3, mxlong+3), ylim = c(mnlat-3,mxlat+3), expand = FALSE);
+# Get a color palette.
+palette <- colorRampPalette(piratepal("basel"));
-colourCount = length(unique(pinfo$mlg));
+# Start PDF device driver.
-getPalette = colorRampPalette(piratepal("basel"));
 dev.new(width=10, height=7);
 file_path = get_file_path("mlg_map.pdf");
-pdf (file=file_path, width=10, height=7);
+pdf(file=file_path, width=10, height=7);
-p6<-p5+ geom_point(data = pinfo,aes(x =longitude, y=latitude, group=mlg, color = mlg), alpha=.7, size=3)+
+world_data = map_data("world");
-scale_color_manual(values=getPalette(colourCount))+
+# Add the coral_mlg_clonal_id column from the stag_db_report
-theme(legend.position="bottom")+
+# data frame to the affy_metadata_data_frame.
-guides(color=guide_legend(nrow=8,byrow=F));
+affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id;
-p6;
+# Get the number of colors needed from the palette for plotting
+# the sample locations on the world map.
+num_colors = length(unique(affy_metadata_data_frame$mlg));
+# Get a color palette.
+palette = colorRampPalette(piratepal("basel"));
+ggplot() +
+geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") +
+coord_map(xlim=longitude_range_vector, ylim=latitude_range_vector) +
+geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, color=mlg), alpha=.7, size=3) +
+scale_color_manual(values=palette(num_colors)) +
+theme(legend.position="bottom") +
+guides(color=guide_legend(nrow=8, byrow=F));
 dev.off()
+time_elapsed(start_time);
 # Missing data barplot.
-poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)];
+start_time <- time_start("Creating missing_data.pdf");
-test2 <- which(!is.na(poptab$miss));
+population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)];
-miss96 <- poptab$miss[test2];
+test2 <- which(!is.na(population_info_data_table$miss));
-name96 <- poptab$user_specimen_id[test2];
+miss96 <- population_info_data_table$miss[test2];
+name96 <- population_info_data_table$user_specimen_id[test2];
+# Start PDF device driver.
 dev.new(width=10, height=7);
 file_path = get_file_path("missing_data.pdf");
-pdf (file=file_path, width=10, height=7);
+pdf(file=file_path, width=10, height=7);
 par(mar = c(8, 4, 4, 2));
 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n");
 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
 dev.off()
+time_elapsed(start_time);
 # Generate a pie chart for each sample with a genotype.
 # Store the numerical and user_specimen_id values from
-# report_user for the charts (user_specimen_id names
+# stag_db_report for the charts (user_specimen_id names
 # will be used to label each chart).
-dt1 <- data.table(report_user);
+start_time <- time_start("Creating percent_breakdown.pdf");
-dt1 <- report_user[c(-2, -3, -4)];
+stag_db_report_data_table <- stag_db_report[c(-2, -3, -4)];
-dt1 <- na.omit(dt1);
+# Remove NA and NaN values.
-# Translate to N (i.e., number of samples with a
+stag_db_report_data_table <- na.omit(stag_db_report_data_table);
-# genotype) columns and 5 rows.
+# Transpose to N (i.e., number of samples with a genotype)
-tdt1 <- t(dt1);
+# columns and 5 rows.
-# Make another data table and transpose it the same as dt1 to
+translated_stag_db_report_data_table <- t(stag_db_report_data_table);
-# get numerics. These will feed into the creation of N vectors.
+translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]);
-dt2 <- data.table(report_user);
+# Set the storage mode of the matrix to numeric.  In some
-dt2 <- report_user[c(-1, -2, -3, -4)];
+# cases this could result in the following:
-# Translate to N columns and 5 rows.
+# Warning message:
-tdt2 <- t(dt2);
+# In mde(x) : NAs introduced by coercion
-tdt1_matrix <- as.matrix(tdt1[-1,]);
+mode(translated_stag_db_report_matrix) <- "numeric";
-# The number of columns is the number of samples with genotypes.
+# Remove NA and NaN values that may have been introduced
-nc <- ncol(tdt1_matrix);
+# by coercion.
-mode(tdt1_matrix) <- "numeric";
+translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix);
-spy <- rowMeans(tdt1_matrix);
+tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE);
 dev.new(width=10, height=7);
 file_path = get_file_path("percent_breakdown.pdf");
 pdf(file=file_path, width=10, height=7);
 # Average pie of all samples.
-labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep="");
+labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(tsdbrm_row_means, 1), "%)", sep="");
 col <- c("GREY", "#006DDB", "#24FF24", "#920000");
 main <- "Average breakdown of SNP assignments across all samples";
-pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75);
+pie(tsdbrm_row_means, labels=labels, radius=0.60, col=col, main=main, cex.main=.75);
 par(mfrow=c(3, 2));
 col <- c("GREY", "#006DDB", "#24FF24", "#920000");
-for (i in 1:nc) {
+# Generate a pie chart for each sample with genotypes.
-tmp_labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep="");
+for (i in 1:ncol(translated_stag_db_report_matrix)) {
-main <- paste("Breakdown of SNP assignments for", tdt1[1, i]);
+tmp_labels <- paste(labels, " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep="");
-pie(tdt1_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
+main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]);
+pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
 }
 dev.off()
+time_elapsed(start_time);

Mercurial > repos > greg > multilocus_genotype

comparison multilocus_genotype.R @ 18:1190ee1456f6 draft default tip