Mercurial > repos > greg > multilocus_genotype
diff multilocus_genotype.R @ 18:1190ee1456f6 draft default tip
Uploaded
author | greg |
---|---|
date | Mon, 13 May 2019 08:37:37 -0400 |
parents | 85f8fc57eee4 |
children |
line wrap: on
line diff
--- a/multilocus_genotype.R Wed Apr 17 09:07:04 2019 -0400 +++ b/multilocus_genotype.R Mon May 13 08:37:37 2019 -0400 @@ -7,14 +7,13 @@ suppressPackageStartupMessages(library("dplyr")) suppressPackageStartupMessages(library("ggplot2")) suppressPackageStartupMessages(library("knitr")) +suppressPackageStartupMessages(library("maps")) +suppressPackageStartupMessages(library("mapproj")) suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("poppr")) suppressPackageStartupMessages(library("RColorBrewer")) -suppressPackageStartupMessages(library("rnaturalearth")) -suppressPackageStartupMessages(library("rnaturalearthdata")) suppressPackageStartupMessages(library("RPostgres")) -suppressPackageStartupMessages(library("sf")) -suppressPackageStartupMessages(library(SNPRelate)) +suppressPackageStartupMessages(library("SNPRelate")) suppressPackageStartupMessages(library("tidyr")) suppressPackageStartupMessages(library("vcfR")) suppressPackageStartupMessages(library("vegan")) @@ -26,8 +25,8 @@ make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), - make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file"), - make_option(c("--nj_tree"), action="store", dest="nj_tree", help="neighbor-joining tree output file") + make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"), + make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file") ) parser <- OptionParser(usage="%prog [options] file", option_list=option_list); @@ -59,430 +58,520 @@ return (conn); } +time_elapsed <- function(start_time) { + cat("Elapsed time: ", proc.time() - start_time, "\n\n"); +} + +time_start <- function(msg) { + start_time <- proc.time(); + cat(msg, "...\n"); + return(start_time); +} + # Read in VCF input file. +start_time <- time_start("Reading VCF input"); vcf <- read.vcfR(opt$input_vcf); +time_elapsed(start_time); # Convert VCF file into a genind for the Poppr package. -# TODO: probably should not hard-code 2 cores. -# changed to genind format for extracting alleles later -# trade-off is it is a bit slower to import data -# gl <- vcfR2genlight(vcf, n.cores=2) -# gind <- new("genind", (as.matrix(gl))) -gind <- vcfR2genind(vcf); +start_time <- time_start("Converting VCF data to a genind object"); +genind_obj <- vcfR2genind(vcf); +time_elapsed(start_time); # Add population information to the genind object. -poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); -colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); -gind@pop <- as.factor(poptab$region); -strata(gind)<-data.frame(pop(gind)); +population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); +colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region"); +genind_obj@pop <- as.factor(population_info_data_table$region); +strata(genind_obj) <- data.frame(pop(genind_obj)); # Convert genind object to a genclone object. -obj2 <- as.genclone(gind); +start_time <- time_start("Converting the genind object to a genclone object"); +genind_clone <- as.genclone(genind_obj); +time_elapsed(start_time); # Calculate the bitwise distance between individuals. -xdis <- bitwise.dist(obj2); +start_time <- time_start("Calculating the bitwise distance between individuals"); +bitwise_distance <- bitwise.dist(genind_clone); +time_elapsed(start_time); # Multilocus genotypes (threshold of 3.2%). -# threshold doubled because of how the data is formatted in genind compared to genlight -mlg.filter(obj2, distance=xdis) <- 0.032; -m <- mlg.table(obj2, background=TRUE, color=TRUE); +mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032; +m <- mlg.table(genind_clone, background=TRUE, color=TRUE); -# Create table of MLGs. -id <- mlg.id(obj2); -#dt <- data.table(id, keep.rownames=TRUE); -#setnames(dt, c("id"), c("affy_id")); +# Create list of MLGs. +mlg_ids <- mlg.id(genind_clone); # Read user's Affymetrix 96 well plate tabular file. -pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA")); -colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", - "region", "latitude", "longitude", "geographic_origin", "sample_location", - "latitude_outplant", "longitude_outplant", "depth", "disease_resist", - "bleach_resist", "mortality","tle", "spawning", "collector_last_name", - "collector_first_name", "organization", "collection_date", "email", "seq_facility", - "array_version", "public", "public_after_date", "sperm_motility", "healing_time", - "dna_extraction_method", "dna_concentration", "registry_id"); -pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); -pinfo2 <- as.character(pinfo$user_specimen_id); -pi <- data.table(pinfo2, pinfo$field_call); -setnames(pi, c("pinfo2"), c("user_specimen_id")); -setnames(pi, c("V2"), c("field_call")); +affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA")); +colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", + "region", "latitude", "longitude", "geographic_origin", "sample_location", + "latitude_outplant", "longitude_outplant", "depth", "disease_resist", + "bleach_resist", "mortality","tle", "spawning", "collector_last_name", + "collector_first_name", "organization", "collection_date", "email", "seq_facility", + "array_version", "public", "public_after_date", "sperm_motility", "healing_time", + "dna_extraction_method", "dna_concentration", "registry_id"); +affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id); +user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id); +# The specimen_id_field_call_data_table looks like this: +# user_specimen_ids V2 +# test_002 prolifera +# test_003 prolifera +specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call); +# Rename the user_specimen_ids column. +setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id")); +# Rename the V2 column. +setnames(specimen_id_field_call_data_table, c("V2"), c("field_call")); # Connect to database. conn <- get_database_connection(opt$database_connection_string); - # Import the sample table. sample_table <- tbl(conn, "sample"); - # Import the genotype table. genotype_table <- tbl(conn, "genotype"); - # Select columns from the sample table and the # genotype table joined by genotype_id. sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id); smlg <- sample_table_columns %>% left_join(genotype_table %>% select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"), - by=c("genotype_id" = "id")); - + by=c("genotype_id"="id")); # Convert to dataframe. -sm <- data.frame(smlg); +smlg_data_frame <- data.frame(smlg); # Name the columns. -colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"); +colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"); # Missing GT in samples submitted. +start_time <- time_start("Discovering missing GT in samples"); gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); -myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); -myMiss <- (myMiss / nrow(vcf)) * 100; -miss <- data.frame(myMiss); - -# Convert missing data into data table. -mi <-setDT(miss, keep.rownames=TRUE)[]; -setnames(mi, c("rn"), c("affy_id")); -setnames(mi, c("myMiss"), c("percent_missing_data_coral")); -# Round missing data to two digits. -mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); +missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); +missing_gt <- (missing_gt / nrow(vcf)) * 100; +missing_gt_data_frame <- data.frame(missing_gt); +# The specimen_id_field_call_data_table looks like this: +# rn missing_gt +# a100000-4368120-060520-256_I07.CEL 0.06092608 +# a100000-4368120-060520-256_K07.CEL 0.05077173 +missing_gt_data_table <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[]; +# Rename the rn column. +setnames(missing_gt_data_table, c("rn"), c("affy_id")); +# Rename the missing_gt column. +setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral")); +# Round data to two digits. +missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2); +time_elapsed(start_time); -#heterozygous alleles -hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); -hets <- (hets / nrow(vcf)) * 100; -ht <- data.frame(hets); - -# Convert heterozygosity data into data table. -ht <-setDT(ht, keep.rownames=TRUE)[]; -setnames(ht, c("rn"), c("affy_id")); -setnames(ht, c("hets"), c("percent_mixed_coral")); -# Round missing data to two digits. -ht$percent_mixed<-round(ht$percent_mixed, digits=2); +# Heterozygous alleles. +start_time <- time_start("Discovering heterozygous alleles"); +heterozygous_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))}); +heterozygous_alleles <- (heterozygous_alleles / nrow(vcf)) * 100; +heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles); +# The heterozygous_alleles_data_table looks like this: +# rn heterozygous_alleles +# a100000-4368120-060520-256_I07.CEL 73.94903 +# a100000-4368120-060520-256_K07.CEL 74.40089 +heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[]; +# Rename the rn column. +setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id")); +# Rename the heterozygous_alleles column. +setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral")); +# Round data to two digits. +heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2); +time_elapsed(start_time); -#reference alleles -refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); -refA <- (refA / nrow(vcf)) * 100; -rA <- data.frame(refA); - -# Convert refA data into data.table. -rA <-setDT(rA, keep.rownames=TRUE)[]; -setnames(rA, c("rn"), c("affy_id")); -setnames(rA, c("refA"), c("percent_reference_coral")); -# round missing data to two digits. -rA$percent_reference<-round(rA$percent_reference, digits=2); +# Reference alleles. +start_time <- time_start("Discovering reference alleles"); +reference_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))}); +reference_alleles <- (reference_alleles / nrow(vcf)) * 100; +reference_alleles_data_frame <- data.frame(reference_alleles); +# The reference_alleles_data_table looks like this: +# rn reference_alleles +# a100000-4368120-060520-256_I07.CEL 11.60642 +# a100000-4368120-060520-256_K07.CEL 11.45918 +reference_alleles_data_table <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[]; +# Rename the rn column. +setnames(reference_alleles_data_table, c("rn"), c("affy_id")); +# Rename the reference_alleles column. +setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral")); +# Round data to two digits. +reference_alleles_data_table$percent_reference <- round(reference_alleles_data_table$percent_reference, digits=2); +time_elapsed(start_time); -#alternative alleles -altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); -altB <- (altB / nrow(vcf)) * 100; -aB <- data.frame(altB); +# Alternative alleles +start_time <- time_start("Discovering alternative alleles"); +alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))}); +alternative_alleles <- (alternative_alleles / nrow(vcf)) * 100; +alternative_alleles_data_frame <- data.frame(alternative_alleles); +# The alternative_alleles_data_table looks like this: +# rn alternative_alleles +# a100000-4368120-060520-256_I07.CEL 14.38363 +# a100000-4368120-060520-256_K07.CEL 14.08916 +alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[]; +# Rename the rn column. +setnames(alternative_alleles_data_table, c("rn"), c("affy_id")); +# Rename the alternative_alleles column. +setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral")); +# Round data to two digits. +alternative_alleles_data_table$percent_alternative <- round(alternative_alleles_data_table$percent_alternative, digits=2); +time_elapsed(start_time); -# Convert altB data into data table. -aB <-setDT(aB, keep.rownames=TRUE)[]; -setnames(aB, c("rn"), c("affy_id")); -setnames(aB, c("altB"), c("percent_alternative_coral")); -# Round missing data to two digits. -aB$percent_alternative<-round(aB$percent_alternative, digits=2); +# The mlg_ids_data_table looks like this: +# mlg_ids +# a550962-4368120-060520-500_M23.CEL +# a550962-4368120-060520-256_A19.CEL +mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE); +# Rename the mlg_ids column. +setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id")); -#convert mlg id to data.table format -dt <- data.table(id, keep.rownames=TRUE); -setnames(dt, c("id"), c("affy_id")); - -# Transform. -df3 <- dt %>% +# sample_mlg_tibble looks like this: +# A tibble: 262 x 3 +# Groups: group [?] +# group affy_id coral_mlg_clonal_id +# <int> <chr> <chr> +# 1 a550962-4368120-060520-500_M23.CEL NA +# 2 a550962-4368120-060520-256_A19.CEL HG0006 +sample_mlg_tibble <- mlg_ids_data_table %>% group_by(row_number()) %>% dplyr::rename(group="row_number()") %>% unnest (affy_id) %>% # Join with mlg table. - left_join(sm %>% + left_join(smlg_data_frame %>% select("affy_id","coral_mlg_clonal_id"), by="affy_id"); # If found in database, group members on previous mlg id. -uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); +uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]); uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; -na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); -na.group <- df3$group[na.mlg]; -df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; +na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id)); +na.group <- sample_mlg_tibble$group[na.mlg]; +sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; -# Determine if the sample mlg matched previous genotyped sample. -df4<- df3 %>% +# Find out if the sample mlg matched a previous genotyped sample. +# sample_mlg_match_tibble looks like this: +# A tibble: 262 x 4 +# Groups: group [230] +# group affy_id coral_mlg_clonal_id DB_match +# <int> <chr> <chr> <chr> +# 1 a550962-4368120-060520-500_M23.CEL NA no_match +# 2 a550962-4368120-060520-256_A19.CEL HG0006 match +sample_mlg_match_tibble <- sample_mlg_tibble %>% group_by(group) %>% mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match")); -# Create new mlg id for samples that did not match those in the database. -none <- unique(df4[c("group", "coral_mlg_clonal_id")]); +# Create new mlg id for samples with no matches in the database. +none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]); none <- none[is.na(none$coral_mlg_clonal_id),]; -na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id)); -n.g <- df4$group[na.mlg2]; +na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id)); +n.g <- sample_mlg_match_tibble$group[na.mlg2]; ct <- length(unique(n.g)); # List of new group ids, the sequence starts at the number of -# ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if -# the df4 file contains all ids. If it doesn't then look below -# to change the seq() function. -n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); -# Pair group with new ids. -rat <- cbind(unique(n.g), n.g_ids); +# ids present in sample_mlg_match_tibble$coral_mlg_clonal_ids +# plus 1. +# FIXME: Not sure if # the sample_mlg_match_tibble file +# contains all ids. If it doesn't then look below to change +# the seq() function. +n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); + # Assign the new id iteratively for all that have NA. for (i in 1:length(na.mlg2)) { - df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; + sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))]; } -# Subset poptab for all samples. -subpop <- poptab[c(2, 3)]; +# Subset population_info_data_table for all samples. +# affy_id_user_specimen_id_vector looks like this: +# affy_id user_specimen_id +# a100000-4368120-060520-256_I07.CEL 13704 +# a100000-4368120-060520-256_K07.CEL 13706 +affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)]; # Merge data frames for final table. -report_user <- pi %>% - left_join(subpop %>% +start_time <- time_start("Merging data frames"); +stag_db_report <- specimen_id_field_call_data_table %>% + left_join(affy_id_user_specimen_id_vector %>% select("affy_id", "user_specimen_id"), by="user_specimen_id") %>% - left_join(df4 %>% + left_join(sample_mlg_match_tibble %>% select("affy_id", "coral_mlg_clonal_id", "DB_match"), by="affy_id") %>% - left_join(mi %>% + left_join(missing_gt_data_table %>% select("affy_id", "percent_missing_data_coral"), by="affy_id") %>% - left_join(ht %>% - select("affy_id", "percent_mixed_coral"), + left_join(heterozygous_alleles_data_table %>% + select("affy_id", "percent_heterozygous_coral"), by="affy_id") %>% - left_join(rA %>% + left_join(reference_alleles_data_table %>% select("affy_id", "percent_reference_coral"), by="affy_id") %>% - left_join(aB %>% + left_join(alternative_alleles_data_table %>% select("affy_id", "percent_alternative_coral"), by="affy_id") %>% mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% - mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 40 & percent_alternative_coral<= 44.5,"A.palmata","other")) %>% - mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral<= 50,"A.cervicornis",genetic_coral_species_call)) %>% - mutate(genetic_coral_species_call=ifelse(percent_heterozygous_coral > 40,"A.prolifera",genetic_coral_species_call)) %>% + mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.5, "A.palmata","other")) %>% + mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral <= 50, "A.cervicornis", genetic_coral_species_call)) %>% + mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>% + ungroup() %>% + select(-group); +time_elapsed(start_time); + +start_time <- time_start("Writing csv output"); +write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE); +time_elapsed(start_time); + +# Database sample table. +sample_db <- affy_metadata_data_frame %>% + left_join(stag_db_report %>% + select("user_specimen_id","affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", "percent_reference_coral", "percent_alternative_coral"), + by='user_specimen_id'); + +# Representative clone for genotype table. +start_time <- time_start("Creating representative clone for genotype table"); +no_dup_genotypes_genind <- clonecorrect(genind_clone, strata = ~pop.genind_obj.); +id_rep <- mlg.id(no_dup_genotypes_genind); +id_data_table <- data.table(id_rep, keep.rownames=TRUE); +# Rename the id_rep column. +setnames(id_data_table, c("id_rep"), c("affy_id")); +time_elapsed(start_time); + +# # Combine with previously genotyped samples in the database. +start_time <- time_start("Selecting from various database tables"); +representative_mlg_tibble <- id_data_table %>% + group_by(row_number()) %>% + rename(group='row_number()') %>% + unnest(affy_id) %>% + left_join(stag_db_report %>% + select("coral_mlg_clonal_id", "user_specimen_id", "affy_id"), + by='affy_id') %>% + mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id), "", affy_id)) %>% + ungroup() %>% + select(-group); + +# Database genotype table. +genotype_table_join <- sample_mlg_match_tibble %>% + left_join(representative_mlg_tibble %>% + select("affy_id", "coral_mlg_rep_sample_id", "user_specimen_id"), + by='affy_id') %>% ungroup() %>% select(-group); -write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE); - -# Database tables -## Sample.table -sample_db <- pinfo %>% - left_join( - report_user %>% - select("user_specimen_id","affy_id", - "percent_missing_data_coral","percent_heterozygous_coral","percent_reference_coral", - "percent_alternative_coral"), - by='user_specimen_id'); - -###representative clone for genotype.table -cc<-clonecorrect(obj2, strata= ~pop.gind.); -id_rep<-mlg.id(cc); -dt_cc<-data.table(id_rep,keep.rownames = TRUE); -setnames(dt_cc, c("id_rep"), c("affy_id")); - -###transform mlg data.table -df_cc <- dt_cc %>% - group_by(row_number()) %>% - rename(group='row_number()') %>% - unnest(affy_id) %>% - left_join(report_user %>% - select("coral_mlg_clonal_id","user_specimen_id","affy_id"), - by='affy_id') %>% - mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id),"",affy_id)) %>% - ungroup() %>% - select(-group); +# Database taxonomy table. +taxonomy_table_join <- stag_db_report %>% + select(genetic_coral_species_call, affy_id) %>% + mutate(genus_name = ifelse(genetic_coral_species_call == genetic_coral_species_call[grep("^A.*", genetic_coral_species_call)], "Acropora", "other")) %>% + mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>% + mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>% + mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name)); +time_elapsed(start_time); +# Table of alleles for the new samples subset to new plate data. +# Create vector indicating number of individuals desired from +# affy_id column of stag_db_report data table. +i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]); +i <- i[!apply(i== "", 1, all),]; +sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE]; -##geno.table -geno_db <- df4 %>% - left_join(df_cc %>% - select("affy_id","coral_mlg_rep_sample_id","user_specimen_id"), - by='affy_id') %>% - ungroup() %>% - select(-group); - -##taxonomy.table - -tax_db <- report_user %>% - select(genetic_coral_species_call, affy_id) %>% - mutate(genus_name =ifelse(genetic_coral_species_call== - genetic_coral_species_call[grep("^A.*",genetic_coral_species_call)],"Acropora","other")) %>% - mutate(species_name=ifelse(genetic_coral_species_call=="A.palmata","palmata","other"))%>% - mutate(species_name=ifelse(genetic_coral_species_call =="A.cervicornis","cervicornis",species_name))%>% - mutate(species_name=ifelse(genetic_coral_species_call=="A.prolifera","prolifera", species_name)); - - -# Table of alleles for the new samples -## subset to new plate data -### create vector indicating number of individuals desired -### made from affy_id collumn from report_user data table -i<-ifelse(is.na(report_user[1]),"",report_user[[1]]); -i<-i[!apply(i == "", 1, all),]; -sub96<-obj2[i, mlg.reset = FALSE, drop = FALSE]; - -# convert to data frame -at_96<-genind2df(sub96, sep=""); -at_96<- at_96 %>% - select(-pop); - -# allele string for Allele.table in database -uat_96<-unite(at_96, alleles, 1:19696, sep = " ", remove = TRUE); -uat_96<-setDT(uat_96, keep.rownames = TRUE)[]; -setnames(uat_96, c("rn"), c("user_specimen_id")); -# write.csv(uat_96,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE); - -# Create a phylogeny of samples based on distance matrices. +# cols looks like this: +# blue1 red green pink orange blue2 +# "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF" +# green2 yellow turquoise poop +# "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF" cols <- piratepal("basel"); set.seed(999); -# Start PDF device driver. -dev.new(width=10, height=7); -file_path = get_file_path("nj_phylogeny.pdf"); -pdf(file=file_path, width=10, height=7); -# Organize branches by clade. -theTree <- sub96 %>% - aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% - ladderize(); -theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)]; -plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); -# Add a scale bar showing 5% difference.. -add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); -nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); -legend("topright", legend=c(levels(sub96$pop)), text.col=cols, xpd=T, cex=0.8); -dev.off() -write.tree(theTree, file =opt$nj_tree, quote=FALSE); - -# identity-by-state analysis -#if (!requireNamespace("BiocManager", quietly = TRUE)) -# install.packages("BiocManager") -#BiocManager::install("SNPRelate", version = "3.8") +if (!is.null(opt$output_nj_phylogeny_tree)) { + # Create a phylogeny tree of samples based on distance matrices. + # Start PDF device driver. + start_time <- time_start("Creating nj_phylogeny_tree.pdf"); + dev.new(width=10, height=7); + file_path = get_file_path("nj_phylogeny_tree.pdf"); + pdf(file=file_path, width=10, height=7); + # Organize branches by clade. + nj_phylogeny_tree <- sample_alleles_vector %>% + aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% + ladderize(); + nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)]; + plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); + # Add a scale bar showing 5% difference. + add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); + nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); + legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8); + dev.off() + time_elapsed(start_time); +} -#subset VCF to the user samples -l<-length(i); -n<-ncol(vcf@gt); -s<-n-l; -svcf<-vcf[,s:n]; +# Subset VCF to the user samples. +start_time <- time_start("Subsetting vcf to the user samples"); +l <- length(i); +n <- ncol(vcf@gt); +s <- n - l; +svcf <- vcf[, s:n]; write.vcf(svcf, "subset.vcf.gz"); - vcf.fn <- "subset.vcf.gz"; snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only"); - -genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); -hd<-read.gdsn(index.gdsn(genofile, "sample.id")); -hd<-data.frame(hd); -hd<-setDT(hd, keep.rownames = FALSE)[]; -setnames(hd, c("hd"), c("user_specimen_id")); +genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); +gds_array <- read.gdsn(index.gdsn(genofile, "sample.id")); +# gds_array looks like this: +# [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL" +# [3] "a550962-4368120-060520-500_A09.CEL" "a550962-4368120-060520-500_A11.CEL" +gds_data_frame <- data.frame(gds_array); +# gds_data_frame looks like this: +# gds_array +# a550962-4368120-060520-500_A03.CEL +# a550962-4368120-060520-500_A05.CEL +gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[]; +# Rename the gds_array column. +setnames(gds_data_table, c("gds_array"), c("affy_id")); +# affy_id_region_list looks like this: +# affy_id region +# a100000-4368120-060520-256_I07.CEL USVI +# a100000-4368120-060520-256_K07.CEL USVI +affy_id_region_list <- population_info_data_table[c(2,4)]; +gds_data_table_join <- gds_data_table %>% + left_join(affy_id_region_list %>% + select("affy_id", "region"), + by='affy_id')%>% + drop_na(); +samp.annot <- data.frame(pop.group=c(gds_data_table_join$region)); +add.gdsn(genofile, "sample.annot", samp.annot); +# population_code looks like this: +# [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 +# [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 +population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); +pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); +# pop.group looks like this: +# [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 +# [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 +time_elapsed(start_time); -subpop2<- poptab[c(2,4)]; -poptab_sub <- hd %>% - left_join( - subpop2 %>% - select("affy_id","region"), - by='affy_id')%>% - drop_na(); +# Distance matrix calculation. +start_time <- time_start("Calculating distance matrix"); +ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); +time_elapsed(start_time); -samp.annot <- data.frame(pop.group = c(poptab_sub$region)); -add.gdsn(genofile, "sample.annot", samp.annot); - -pop_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); -pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); - -# Identity-By-State Analysis - distance matrix calculation -ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); - -# cluster analysis on the genome-wide IBS pairwise distance matrix +# Cluster analysis on the genome-wide IBS pairwise distance matrix. +start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix"); set.seed(100); par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE)); +time_elapsed(start_time); -# default clustering. +# Default clustering. +start_time <- time_start("Creating ibs_default.pdf"); +# Start PDF device driver. dev.new(width=10, height=7); -file_path = get_file_path("IBS_default.pdf"); -pdf (file=file_path, width=10, height=7); +file_path = get_file_path("ibs_default.pdf"); +pdf(file=file_path, width=10, height=7); rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); -snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular",y.label=0.2); +snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", y.label=0.2); legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); dev.off() +time_elapsed(start_time); -# color cluster by region. +# Color cluster by region. +start_time <- time_start("Creating ibs_region.pdf"); +# Start PDF device driver. dev.new(width=10, height=7); -file_path = get_file_path("IBS_Region.pdf"); -pdf (file=file_path, width=10, height=7); -race <- as.factor(pop_code); -rv2 <- snpgdsCutTree(ibs.hc,samp.group=race,col.list=cols,pch.list=15); -snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular",y.label=0.2); +file_path = get_file_path("ibs_region.pdf"); +pdf(file=file_path, width=10, height=7); +race <- as.factor(population_code); +rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15); +snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", y.label=0.2); legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); dev.off() +time_elapsed(start_time); -#close GDS file +# close GDS file. snpgdsClose(genofile); # Sample MLG on a map. -world <- ne_countries(scale = "medium", returnclass = "sf"); -class(world); - -pinfo$mlg<-report_user$coral_mlg_clonal_id; -n <- nrow(pinfo); - -mxlat<-max(pinfo$latitude,na.rm = TRUE); -mnlat<-min(pinfo$latitude,na.rm = TRUE); -mxlong<-max(pinfo$longitude,na.rm = TRUE); -mnlong<-min(pinfo$longitude,na.rm = TRUE); - -p5<-ggplot(data = world) + - geom_sf() + - coord_sf(xlim = c(mnlong-3, mxlong+3), ylim = c(mnlat-3,mxlat+3), expand = FALSE); - -colourCount = length(unique(pinfo$mlg)); -getPalette = colorRampPalette(piratepal("basel")); +start_time <- time_start("Creating mlg_map.pdf"); +# Get the lattitude and longitude boundaries for rendering +# the map. Tese boundaries will restrict the map to focus +# (i.e., zoom) on the region of the world map from which +# the samples were taken. +max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE); +min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE); +latitude_range_vector <- c(min_latitude-3, max_latitude+3); +max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE); +min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE); +longitude_range_vector <- c(min_longitude-3, max_longitude+3); +# Get the palette colors for rendering plots. +colors <- length(unique(stag_db_report$coral_mlg_clonal_id)); +# Get a color palette. +palette <- colorRampPalette(piratepal("basel")); +# Start PDF device driver. dev.new(width=10, height=7); file_path = get_file_path("mlg_map.pdf"); -pdf (file=file_path, width=10, height=7); -p6<-p5+ geom_point(data = pinfo,aes(x =longitude, y=latitude, group=mlg, color = mlg), alpha=.7, size=3)+ - scale_color_manual(values=getPalette(colourCount))+ - theme(legend.position="bottom")+ - guides(color=guide_legend(nrow=8,byrow=F)); -p6; +pdf(file=file_path, width=10, height=7); +world_data = map_data("world"); +# Add the coral_mlg_clonal_id column from the stag_db_report +# data fram to the affy_metadata_data_frame. +affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id; +# Get the number of colors needed from the palette for plotting +# the sample locations on the world map. +num_colors = length(unique(affy_metadata_data_frame$mlg)); +# Get a color palette. +palette = colorRampPalette(piratepal("basel")); +ggplot() + +geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") + +coord_map(xlim=longitude_range_vector, ylim=latitude_range_vector) + +geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, color=mlg), alpha=.7, size=3) + +scale_color_manual(values=palette(num_colors)) + +theme(legend.position="bottom") + +guides(color=guide_legend(nrow=8, byrow=F)); dev.off() +time_elapsed(start_time); # Missing data barplot. -poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)]; -test2 <- which(!is.na(poptab$miss)); -miss96 <- poptab$miss[test2]; -name96 <- poptab$user_specimen_id[test2]; +start_time <- time_start("Creating missing_data.pdf"); +population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)]; +test2 <- which(!is.na(population_info_data_table$miss)); +miss96 <- population_info_data_table$miss[test2]; +name96 <- population_info_data_table$user_specimen_id[test2]; +# Start PDF device driver. dev.new(width=10, height=7); file_path = get_file_path("missing_data.pdf"); -pdf (file=file_path, width=10, height=7); +pdf(file=file_path, width=10, height=7); par(mar = c(8, 4, 4, 2)); x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); dev.off() +time_elapsed(start_time); # Generate a pie chart for each sample with a genotype. # Store the numerical and user_specimen_id values from -# report_user for the charts (user_specimen_id names +# stag_db_report for the charts (user_specimen_id names # will be used to label each chart). -dt1 <- data.table(report_user); -dt1 <- report_user[c(-2, -3, -4)]; -dt1 <- na.omit(dt1); -# Translate to N (i.e., number of samples with a -# genotype) columns and 5 rows. -tdt1 <- t(dt1); -# Make another data table and transpose it the same as dt1 to -# get numerics. These will feed into the creation of N vectors. -dt2 <- data.table(report_user); -dt2 <- report_user[c(-1, -2, -3, -4)]; -# Translate to N columns and 5 rows. -tdt2 <- t(dt2); -tdt1_matrix <- as.matrix(tdt1[-1,]); -# The number of columns is the number of samples with genotypes. -nc <- ncol(tdt1_matrix); -mode(tdt1_matrix) <- "numeric"; -spy <- rowMeans(tdt1_matrix); +start_time <- time_start("Creating percent_breakdown.pdf"); +stag_db_report_data_table <- stag_db_report[c(-2, -3, -4)]; +# Remove NA and NaN values. +stag_db_report_data_table <- na.omit(stag_db_report_data_table); +# Translate to N (i.e., number of samples with a genotype) +# columns and 5 rows. +translated_stag_db_report_data_table <- t(stag_db_report_data_table); +translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]); +# Set the storage mode of the matrix to numeric. In some +# cases this could result in the following: +# Warning message: +# In mde(x) : NAs introduced by coercion +mode(translated_stag_db_report_matrix) <- "numeric"; +# Remove NA and NaN values that may have been introduced +# by coercion. +translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix); +tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE); dev.new(width=10, height=7); file_path = get_file_path("percent_breakdown.pdf"); pdf(file=file_path, width=10, height=7); # Average pie of all samples. -labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep=""); +labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(tsdbrm_row_means, 1), "%)", sep=""); col <- c("GREY", "#006DDB", "#24FF24", "#920000"); main <- "Average breakdown of SNP assignments across all samples"; -pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); +pie(tsdbrm_row_means, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); par(mfrow=c(3, 2)); col <- c("GREY", "#006DDB", "#24FF24", "#920000"); -for (i in 1:nc) { - tmp_labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep=""); - main <- paste("Breakdown of SNP assignments for", tdt1[1, i]); - pie(tdt1_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); +# Generate a pie chart for each sample with genotypes. +for (i in 1:ncol(translated_stag_db_report_matrix)) { + tmp_labels <- paste(labels, " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep=""); + main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]); + pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); } dev.off() +time_elapsed(start_time);