# HG changeset patch # User greg # Date 1557751057 14400 # Node ID 1190ee1456f641f18f40ad30b124040c2908042f # Parent 85f8fc57eee4368569f0d3eebeb4b038d35cf2dc Uploaded diff -r 85f8fc57eee4 -r 1190ee1456f6 multilocus_genotype.R --- a/multilocus_genotype.R Wed Apr 17 09:07:04 2019 -0400 +++ b/multilocus_genotype.R Mon May 13 08:37:37 2019 -0400 @@ -7,14 +7,13 @@ suppressPackageStartupMessages(library("dplyr")) suppressPackageStartupMessages(library("ggplot2")) suppressPackageStartupMessages(library("knitr")) +suppressPackageStartupMessages(library("maps")) +suppressPackageStartupMessages(library("mapproj")) suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("poppr")) suppressPackageStartupMessages(library("RColorBrewer")) -suppressPackageStartupMessages(library("rnaturalearth")) -suppressPackageStartupMessages(library("rnaturalearthdata")) suppressPackageStartupMessages(library("RPostgres")) -suppressPackageStartupMessages(library("sf")) -suppressPackageStartupMessages(library(SNPRelate)) +suppressPackageStartupMessages(library("SNPRelate")) suppressPackageStartupMessages(library("tidyr")) suppressPackageStartupMessages(library("vcfR")) suppressPackageStartupMessages(library("vegan")) @@ -26,8 +25,8 @@ make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), - make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file"), - make_option(c("--nj_tree"), action="store", dest="nj_tree", help="neighbor-joining tree output file") + make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"), + make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file") ) parser <- OptionParser(usage="%prog [options] file", option_list=option_list); @@ -59,430 +58,520 @@ return (conn); } +time_elapsed <- function(start_time) { + cat("Elapsed time: ", proc.time() - start_time, "\n\n"); +} + +time_start <- function(msg) { + start_time <- proc.time(); + cat(msg, "...\n"); + return(start_time); +} + # Read in VCF input file. +start_time <- time_start("Reading VCF input"); vcf <- read.vcfR(opt$input_vcf); +time_elapsed(start_time); # Convert VCF file into a genind for the Poppr package. -# TODO: probably should not hard-code 2 cores. -# changed to genind format for extracting alleles later -# trade-off is it is a bit slower to import data -# gl <- vcfR2genlight(vcf, n.cores=2) -# gind <- new("genind", (as.matrix(gl))) -gind <- vcfR2genind(vcf); +start_time <- time_start("Converting VCF data to a genind object"); +genind_obj <- vcfR2genind(vcf); +time_elapsed(start_time); # Add population information to the genind object. -poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); -colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); -gind@pop <- as.factor(poptab$region); -strata(gind)<-data.frame(pop(gind)); +population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); +colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region"); +genind_obj@pop <- as.factor(population_info_data_table$region); +strata(genind_obj) <- data.frame(pop(genind_obj)); # Convert genind object to a genclone object. -obj2 <- as.genclone(gind); +start_time <- time_start("Converting the genind object to a genclone object"); +genind_clone <- as.genclone(genind_obj); +time_elapsed(start_time); # Calculate the bitwise distance between individuals. -xdis <- bitwise.dist(obj2); +start_time <- time_start("Calculating the bitwise distance between individuals"); +bitwise_distance <- bitwise.dist(genind_clone); +time_elapsed(start_time); # Multilocus genotypes (threshold of 3.2%). -# threshold doubled because of how the data is formatted in genind compared to genlight -mlg.filter(obj2, distance=xdis) <- 0.032; -m <- mlg.table(obj2, background=TRUE, color=TRUE); +mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032; +m <- mlg.table(genind_clone, background=TRUE, color=TRUE); -# Create table of MLGs. -id <- mlg.id(obj2); -#dt <- data.table(id, keep.rownames=TRUE); -#setnames(dt, c("id"), c("affy_id")); +# Create list of MLGs. +mlg_ids <- mlg.id(genind_clone); # Read user's Affymetrix 96 well plate tabular file. -pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA")); -colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", - "region", "latitude", "longitude", "geographic_origin", "sample_location", - "latitude_outplant", "longitude_outplant", "depth", "disease_resist", - "bleach_resist", "mortality","tle", "spawning", "collector_last_name", - "collector_first_name", "organization", "collection_date", "email", "seq_facility", - "array_version", "public", "public_after_date", "sperm_motility", "healing_time", - "dna_extraction_method", "dna_concentration", "registry_id"); -pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); -pinfo2 <- as.character(pinfo$user_specimen_id); -pi <- data.table(pinfo2, pinfo$field_call); -setnames(pi, c("pinfo2"), c("user_specimen_id")); -setnames(pi, c("V2"), c("field_call")); +affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA")); +colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", + "region", "latitude", "longitude", "geographic_origin", "sample_location", + "latitude_outplant", "longitude_outplant", "depth", "disease_resist", + "bleach_resist", "mortality","tle", "spawning", "collector_last_name", + "collector_first_name", "organization", "collection_date", "email", "seq_facility", + "array_version", "public", "public_after_date", "sperm_motility", "healing_time", + "dna_extraction_method", "dna_concentration", "registry_id"); +affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id); +user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id); +# The specimen_id_field_call_data_table looks like this: +# user_specimen_ids V2 +# test_002 prolifera +# test_003 prolifera +specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call); +# Rename the user_specimen_ids column. +setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id")); +# Rename the V2 column. +setnames(specimen_id_field_call_data_table, c("V2"), c("field_call")); # Connect to database. conn <- get_database_connection(opt$database_connection_string); - # Import the sample table. sample_table <- tbl(conn, "sample"); - # Import the genotype table. genotype_table <- tbl(conn, "genotype"); - # Select columns from the sample table and the # genotype table joined by genotype_id. sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id); smlg <- sample_table_columns %>% left_join(genotype_table %>% select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"), - by=c("genotype_id" = "id")); - + by=c("genotype_id"="id")); # Convert to dataframe. -sm <- data.frame(smlg); +smlg_data_frame <- data.frame(smlg); # Name the columns. -colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"); +colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"); # Missing GT in samples submitted. +start_time <- time_start("Discovering missing GT in samples"); gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); -myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); -myMiss <- (myMiss / nrow(vcf)) * 100; -miss <- data.frame(myMiss); - -# Convert missing data into data table. -mi <-setDT(miss, keep.rownames=TRUE)[]; -setnames(mi, c("rn"), c("affy_id")); -setnames(mi, c("myMiss"), c("percent_missing_data_coral")); -# Round missing data to two digits. -mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); +missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); +missing_gt <- (missing_gt / nrow(vcf)) * 100; +missing_gt_data_frame <- data.frame(missing_gt); +# The specimen_id_field_call_data_table looks like this: +# rn missing_gt +# a100000-4368120-060520-256_I07.CEL 0.06092608 +# a100000-4368120-060520-256_K07.CEL 0.05077173 +missing_gt_data_table <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[]; +# Rename the rn column. +setnames(missing_gt_data_table, c("rn"), c("affy_id")); +# Rename the missing_gt column. +setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral")); +# Round data to two digits. +missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2); +time_elapsed(start_time); -#heterozygous alleles -hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); -hets <- (hets / nrow(vcf)) * 100; -ht <- data.frame(hets); - -# Convert heterozygosity data into data table. -ht <-setDT(ht, keep.rownames=TRUE)[]; -setnames(ht, c("rn"), c("affy_id")); -setnames(ht, c("hets"), c("percent_mixed_coral")); -# Round missing data to two digits. -ht$percent_mixed<-round(ht$percent_mixed, digits=2); +# Heterozygous alleles. +start_time <- time_start("Discovering heterozygous alleles"); +heterozygous_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))}); +heterozygous_alleles <- (heterozygous_alleles / nrow(vcf)) * 100; +heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles); +# The heterozygous_alleles_data_table looks like this: +# rn heterozygous_alleles +# a100000-4368120-060520-256_I07.CEL 73.94903 +# a100000-4368120-060520-256_K07.CEL 74.40089 +heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[]; +# Rename the rn column. +setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id")); +# Rename the heterozygous_alleles column. +setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral")); +# Round data to two digits. +heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2); +time_elapsed(start_time); -#reference alleles -refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); -refA <- (refA / nrow(vcf)) * 100; -rA <- data.frame(refA); - -# Convert refA data into data.table. -rA <-setDT(rA, keep.rownames=TRUE)[]; -setnames(rA, c("rn"), c("affy_id")); -setnames(rA, c("refA"), c("percent_reference_coral")); -# round missing data to two digits. -rA$percent_reference<-round(rA$percent_reference, digits=2); +# Reference alleles. +start_time <- time_start("Discovering reference alleles"); +reference_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))}); +reference_alleles <- (reference_alleles / nrow(vcf)) * 100; +reference_alleles_data_frame <- data.frame(reference_alleles); +# The reference_alleles_data_table looks like this: +# rn reference_alleles +# a100000-4368120-060520-256_I07.CEL 11.60642 +# a100000-4368120-060520-256_K07.CEL 11.45918 +reference_alleles_data_table <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[]; +# Rename the rn column. +setnames(reference_alleles_data_table, c("rn"), c("affy_id")); +# Rename the reference_alleles column. +setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral")); +# Round data to two digits. +reference_alleles_data_table$percent_reference <- round(reference_alleles_data_table$percent_reference, digits=2); +time_elapsed(start_time); -#alternative alleles -altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); -altB <- (altB / nrow(vcf)) * 100; -aB <- data.frame(altB); +# Alternative alleles +start_time <- time_start("Discovering alternative alleles"); +alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))}); +alternative_alleles <- (alternative_alleles / nrow(vcf)) * 100; +alternative_alleles_data_frame <- data.frame(alternative_alleles); +# The alternative_alleles_data_table looks like this: +# rn alternative_alleles +# a100000-4368120-060520-256_I07.CEL 14.38363 +# a100000-4368120-060520-256_K07.CEL 14.08916 +alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[]; +# Rename the rn column. +setnames(alternative_alleles_data_table, c("rn"), c("affy_id")); +# Rename the alternative_alleles column. +setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral")); +# Round data to two digits. +alternative_alleles_data_table$percent_alternative <- round(alternative_alleles_data_table$percent_alternative, digits=2); +time_elapsed(start_time); -# Convert altB data into data table. -aB <-setDT(aB, keep.rownames=TRUE)[]; -setnames(aB, c("rn"), c("affy_id")); -setnames(aB, c("altB"), c("percent_alternative_coral")); -# Round missing data to two digits. -aB$percent_alternative<-round(aB$percent_alternative, digits=2); +# The mlg_ids_data_table looks like this: +# mlg_ids +# a550962-4368120-060520-500_M23.CEL +# a550962-4368120-060520-256_A19.CEL +mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE); +# Rename the mlg_ids column. +setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id")); -#convert mlg id to data.table format -dt <- data.table(id, keep.rownames=TRUE); -setnames(dt, c("id"), c("affy_id")); - -# Transform. -df3 <- dt %>% +# sample_mlg_tibble looks like this: +# A tibble: 262 x 3 +# Groups: group [?] +# group affy_id coral_mlg_clonal_id +# +# 1 a550962-4368120-060520-500_M23.CEL NA +# 2 a550962-4368120-060520-256_A19.CEL HG0006 +sample_mlg_tibble <- mlg_ids_data_table %>% group_by(row_number()) %>% dplyr::rename(group="row_number()") %>% unnest (affy_id) %>% # Join with mlg table. - left_join(sm %>% + left_join(smlg_data_frame %>% select("affy_id","coral_mlg_clonal_id"), by="affy_id"); # If found in database, group members on previous mlg id. -uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); +uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]); uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; -na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); -na.group <- df3$group[na.mlg]; -df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; +na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id)); +na.group <- sample_mlg_tibble$group[na.mlg]; +sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; -# Determine if the sample mlg matched previous genotyped sample. -df4<- df3 %>% +# Find out if the sample mlg matched a previous genotyped sample. +# sample_mlg_match_tibble looks like this: +# A tibble: 262 x 4 +# Groups: group [230] +# group affy_id coral_mlg_clonal_id DB_match +# +# 1 a550962-4368120-060520-500_M23.CEL NA no_match +# 2 a550962-4368120-060520-256_A19.CEL HG0006 match +sample_mlg_match_tibble <- sample_mlg_tibble %>% group_by(group) %>% mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match")); -# Create new mlg id for samples that did not match those in the database. -none <- unique(df4[c("group", "coral_mlg_clonal_id")]); +# Create new mlg id for samples with no matches in the database. +none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]); none <- none[is.na(none$coral_mlg_clonal_id),]; -na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id)); -n.g <- df4$group[na.mlg2]; +na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id)); +n.g <- sample_mlg_match_tibble$group[na.mlg2]; ct <- length(unique(n.g)); # List of new group ids, the sequence starts at the number of -# ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if -# the df4 file contains all ids. If it doesn't then look below -# to change the seq() function. -n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); -# Pair group with new ids. -rat <- cbind(unique(n.g), n.g_ids); +# ids present in sample_mlg_match_tibble$coral_mlg_clonal_ids +# plus 1. +# FIXME: Not sure if # the sample_mlg_match_tibble file +# contains all ids. If it doesn't then look below to change +# the seq() function. +n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); + # Assign the new id iteratively for all that have NA. for (i in 1:length(na.mlg2)) { - df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; + sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))]; } -# Subset poptab for all samples. -subpop <- poptab[c(2, 3)]; +# Subset population_info_data_table for all samples. +# affy_id_user_specimen_id_vector looks like this: +# affy_id user_specimen_id +# a100000-4368120-060520-256_I07.CEL 13704 +# a100000-4368120-060520-256_K07.CEL 13706 +affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)]; # Merge data frames for final table. -report_user <- pi %>% - left_join(subpop %>% +start_time <- time_start("Merging data frames"); +stag_db_report <- specimen_id_field_call_data_table %>% + left_join(affy_id_user_specimen_id_vector %>% select("affy_id", "user_specimen_id"), by="user_specimen_id") %>% - left_join(df4 %>% + left_join(sample_mlg_match_tibble %>% select("affy_id", "coral_mlg_clonal_id", "DB_match"), by="affy_id") %>% - left_join(mi %>% + left_join(missing_gt_data_table %>% select("affy_id", "percent_missing_data_coral"), by="affy_id") %>% - left_join(ht %>% - select("affy_id", "percent_mixed_coral"), + left_join(heterozygous_alleles_data_table %>% + select("affy_id", "percent_heterozygous_coral"), by="affy_id") %>% - left_join(rA %>% + left_join(reference_alleles_data_table %>% select("affy_id", "percent_reference_coral"), by="affy_id") %>% - left_join(aB %>% + left_join(alternative_alleles_data_table %>% select("affy_id", "percent_alternative_coral"), by="affy_id") %>% mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% - mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 40 & percent_alternative_coral<= 44.5,"A.palmata","other")) %>% - mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral<= 50,"A.cervicornis",genetic_coral_species_call)) %>% - mutate(genetic_coral_species_call=ifelse(percent_heterozygous_coral > 40,"A.prolifera",genetic_coral_species_call)) %>% + mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.5, "A.palmata","other")) %>% + mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral <= 50, "A.cervicornis", genetic_coral_species_call)) %>% + mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>% + ungroup() %>% + select(-group); +time_elapsed(start_time); + +start_time <- time_start("Writing csv output"); +write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE); +time_elapsed(start_time); + +# Database sample table. +sample_db <- affy_metadata_data_frame %>% + left_join(stag_db_report %>% + select("user_specimen_id","affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", "percent_reference_coral", "percent_alternative_coral"), + by='user_specimen_id'); + +# Representative clone for genotype table. +start_time <- time_start("Creating representative clone for genotype table"); +no_dup_genotypes_genind <- clonecorrect(genind_clone, strata = ~pop.genind_obj.); +id_rep <- mlg.id(no_dup_genotypes_genind); +id_data_table <- data.table(id_rep, keep.rownames=TRUE); +# Rename the id_rep column. +setnames(id_data_table, c("id_rep"), c("affy_id")); +time_elapsed(start_time); + +# # Combine with previously genotyped samples in the database. +start_time <- time_start("Selecting from various database tables"); +representative_mlg_tibble <- id_data_table %>% + group_by(row_number()) %>% + rename(group='row_number()') %>% + unnest(affy_id) %>% + left_join(stag_db_report %>% + select("coral_mlg_clonal_id", "user_specimen_id", "affy_id"), + by='affy_id') %>% + mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id), "", affy_id)) %>% + ungroup() %>% + select(-group); + +# Database genotype table. +genotype_table_join <- sample_mlg_match_tibble %>% + left_join(representative_mlg_tibble %>% + select("affy_id", "coral_mlg_rep_sample_id", "user_specimen_id"), + by='affy_id') %>% ungroup() %>% select(-group); -write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE); - -# Database tables -## Sample.table -sample_db <- pinfo %>% - left_join( - report_user %>% - select("user_specimen_id","affy_id", - "percent_missing_data_coral","percent_heterozygous_coral","percent_reference_coral", - "percent_alternative_coral"), - by='user_specimen_id'); - -###representative clone for genotype.table -cc<-clonecorrect(obj2, strata= ~pop.gind.); -id_rep<-mlg.id(cc); -dt_cc<-data.table(id_rep,keep.rownames = TRUE); -setnames(dt_cc, c("id_rep"), c("affy_id")); - -###transform mlg data.table -df_cc <- dt_cc %>% - group_by(row_number()) %>% - rename(group='row_number()') %>% - unnest(affy_id) %>% - left_join(report_user %>% - select("coral_mlg_clonal_id","user_specimen_id","affy_id"), - by='affy_id') %>% - mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id),"",affy_id)) %>% - ungroup() %>% - select(-group); +# Database taxonomy table. +taxonomy_table_join <- stag_db_report %>% + select(genetic_coral_species_call, affy_id) %>% + mutate(genus_name = ifelse(genetic_coral_species_call == genetic_coral_species_call[grep("^A.*", genetic_coral_species_call)], "Acropora", "other")) %>% + mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>% + mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>% + mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name)); +time_elapsed(start_time); +# Table of alleles for the new samples subset to new plate data. +# Create vector indicating number of individuals desired from +# affy_id column of stag_db_report data table. +i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]); +i <- i[!apply(i== "", 1, all),]; +sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE]; -##geno.table -geno_db <- df4 %>% - left_join(df_cc %>% - select("affy_id","coral_mlg_rep_sample_id","user_specimen_id"), - by='affy_id') %>% - ungroup() %>% - select(-group); - -##taxonomy.table - -tax_db <- report_user %>% - select(genetic_coral_species_call, affy_id) %>% - mutate(genus_name =ifelse(genetic_coral_species_call== - genetic_coral_species_call[grep("^A.*",genetic_coral_species_call)],"Acropora","other")) %>% - mutate(species_name=ifelse(genetic_coral_species_call=="A.palmata","palmata","other"))%>% - mutate(species_name=ifelse(genetic_coral_species_call =="A.cervicornis","cervicornis",species_name))%>% - mutate(species_name=ifelse(genetic_coral_species_call=="A.prolifera","prolifera", species_name)); - - -# Table of alleles for the new samples -## subset to new plate data -### create vector indicating number of individuals desired -### made from affy_id collumn from report_user data table -i<-ifelse(is.na(report_user[1]),"",report_user[[1]]); -i<-i[!apply(i == "", 1, all),]; -sub96<-obj2[i, mlg.reset = FALSE, drop = FALSE]; - -# convert to data frame -at_96<-genind2df(sub96, sep=""); -at_96<- at_96 %>% - select(-pop); - -# allele string for Allele.table in database -uat_96<-unite(at_96, alleles, 1:19696, sep = " ", remove = TRUE); -uat_96<-setDT(uat_96, keep.rownames = TRUE)[]; -setnames(uat_96, c("rn"), c("user_specimen_id")); -# write.csv(uat_96,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE); - -# Create a phylogeny of samples based on distance matrices. +# cols looks like this: +# blue1 red green pink orange blue2 +# "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF" +# green2 yellow turquoise poop +# "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF" cols <- piratepal("basel"); set.seed(999); -# Start PDF device driver. -dev.new(width=10, height=7); -file_path = get_file_path("nj_phylogeny.pdf"); -pdf(file=file_path, width=10, height=7); -# Organize branches by clade. -theTree <- sub96 %>% - aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% - ladderize(); -theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)]; -plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); -# Add a scale bar showing 5% difference.. -add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); -nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); -legend("topright", legend=c(levels(sub96$pop)), text.col=cols, xpd=T, cex=0.8); -dev.off() -write.tree(theTree, file =opt$nj_tree, quote=FALSE); - -# identity-by-state analysis -#if (!requireNamespace("BiocManager", quietly = TRUE)) -# install.packages("BiocManager") -#BiocManager::install("SNPRelate", version = "3.8") +if (!is.null(opt$output_nj_phylogeny_tree)) { + # Create a phylogeny tree of samples based on distance matrices. + # Start PDF device driver. + start_time <- time_start("Creating nj_phylogeny_tree.pdf"); + dev.new(width=10, height=7); + file_path = get_file_path("nj_phylogeny_tree.pdf"); + pdf(file=file_path, width=10, height=7); + # Organize branches by clade. + nj_phylogeny_tree <- sample_alleles_vector %>% + aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% + ladderize(); + nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)]; + plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); + # Add a scale bar showing 5% difference. + add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); + nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); + legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8); + dev.off() + time_elapsed(start_time); +} -#subset VCF to the user samples -l<-length(i); -n<-ncol(vcf@gt); -s<-n-l; -svcf<-vcf[,s:n]; +# Subset VCF to the user samples. +start_time <- time_start("Subsetting vcf to the user samples"); +l <- length(i); +n <- ncol(vcf@gt); +s <- n - l; +svcf <- vcf[, s:n]; write.vcf(svcf, "subset.vcf.gz"); - vcf.fn <- "subset.vcf.gz"; snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only"); - -genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); -hd<-read.gdsn(index.gdsn(genofile, "sample.id")); -hd<-data.frame(hd); -hd<-setDT(hd, keep.rownames = FALSE)[]; -setnames(hd, c("hd"), c("user_specimen_id")); +genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); +gds_array <- read.gdsn(index.gdsn(genofile, "sample.id")); +# gds_array looks like this: +# [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL" +# [3] "a550962-4368120-060520-500_A09.CEL" "a550962-4368120-060520-500_A11.CEL" +gds_data_frame <- data.frame(gds_array); +# gds_data_frame looks like this: +# gds_array +# a550962-4368120-060520-500_A03.CEL +# a550962-4368120-060520-500_A05.CEL +gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[]; +# Rename the gds_array column. +setnames(gds_data_table, c("gds_array"), c("affy_id")); +# affy_id_region_list looks like this: +# affy_id region +# a100000-4368120-060520-256_I07.CEL USVI +# a100000-4368120-060520-256_K07.CEL USVI +affy_id_region_list <- population_info_data_table[c(2,4)]; +gds_data_table_join <- gds_data_table %>% + left_join(affy_id_region_list %>% + select("affy_id", "region"), + by='affy_id')%>% + drop_na(); +samp.annot <- data.frame(pop.group=c(gds_data_table_join$region)); +add.gdsn(genofile, "sample.annot", samp.annot); +# population_code looks like this: +# [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 +# [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 +population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); +pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); +# pop.group looks like this: +# [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 +# [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 +time_elapsed(start_time); -subpop2<- poptab[c(2,4)]; -poptab_sub <- hd %>% - left_join( - subpop2 %>% - select("affy_id","region"), - by='affy_id')%>% - drop_na(); +# Distance matrix calculation. +start_time <- time_start("Calculating distance matrix"); +ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); +time_elapsed(start_time); -samp.annot <- data.frame(pop.group = c(poptab_sub$region)); -add.gdsn(genofile, "sample.annot", samp.annot); - -pop_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); -pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); - -# Identity-By-State Analysis - distance matrix calculation -ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); - -# cluster analysis on the genome-wide IBS pairwise distance matrix +# Cluster analysis on the genome-wide IBS pairwise distance matrix. +start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix"); set.seed(100); par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE)); +time_elapsed(start_time); -# default clustering. +# Default clustering. +start_time <- time_start("Creating ibs_default.pdf"); +# Start PDF device driver. dev.new(width=10, height=7); -file_path = get_file_path("IBS_default.pdf"); -pdf (file=file_path, width=10, height=7); +file_path = get_file_path("ibs_default.pdf"); +pdf(file=file_path, width=10, height=7); rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); -snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular",y.label=0.2); +snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", y.label=0.2); legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); dev.off() +time_elapsed(start_time); -# color cluster by region. +# Color cluster by region. +start_time <- time_start("Creating ibs_region.pdf"); +# Start PDF device driver. dev.new(width=10, height=7); -file_path = get_file_path("IBS_Region.pdf"); -pdf (file=file_path, width=10, height=7); -race <- as.factor(pop_code); -rv2 <- snpgdsCutTree(ibs.hc,samp.group=race,col.list=cols,pch.list=15); -snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular",y.label=0.2); +file_path = get_file_path("ibs_region.pdf"); +pdf(file=file_path, width=10, height=7); +race <- as.factor(population_code); +rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15); +snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", y.label=0.2); legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); dev.off() +time_elapsed(start_time); -#close GDS file +# close GDS file. snpgdsClose(genofile); # Sample MLG on a map. -world <- ne_countries(scale = "medium", returnclass = "sf"); -class(world); - -pinfo$mlg<-report_user$coral_mlg_clonal_id; -n <- nrow(pinfo); - -mxlat<-max(pinfo$latitude,na.rm = TRUE); -mnlat<-min(pinfo$latitude,na.rm = TRUE); -mxlong<-max(pinfo$longitude,na.rm = TRUE); -mnlong<-min(pinfo$longitude,na.rm = TRUE); - -p5<-ggplot(data = world) + - geom_sf() + - coord_sf(xlim = c(mnlong-3, mxlong+3), ylim = c(mnlat-3,mxlat+3), expand = FALSE); - -colourCount = length(unique(pinfo$mlg)); -getPalette = colorRampPalette(piratepal("basel")); +start_time <- time_start("Creating mlg_map.pdf"); +# Get the lattitude and longitude boundaries for rendering +# the map. Tese boundaries will restrict the map to focus +# (i.e., zoom) on the region of the world map from which +# the samples were taken. +max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE); +min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE); +latitude_range_vector <- c(min_latitude-3, max_latitude+3); +max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE); +min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE); +longitude_range_vector <- c(min_longitude-3, max_longitude+3); +# Get the palette colors for rendering plots. +colors <- length(unique(stag_db_report$coral_mlg_clonal_id)); +# Get a color palette. +palette <- colorRampPalette(piratepal("basel")); +# Start PDF device driver. dev.new(width=10, height=7); file_path = get_file_path("mlg_map.pdf"); -pdf (file=file_path, width=10, height=7); -p6<-p5+ geom_point(data = pinfo,aes(x =longitude, y=latitude, group=mlg, color = mlg), alpha=.7, size=3)+ - scale_color_manual(values=getPalette(colourCount))+ - theme(legend.position="bottom")+ - guides(color=guide_legend(nrow=8,byrow=F)); -p6; +pdf(file=file_path, width=10, height=7); +world_data = map_data("world"); +# Add the coral_mlg_clonal_id column from the stag_db_report +# data fram to the affy_metadata_data_frame. +affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id; +# Get the number of colors needed from the palette for plotting +# the sample locations on the world map. +num_colors = length(unique(affy_metadata_data_frame$mlg)); +# Get a color palette. +palette = colorRampPalette(piratepal("basel")); +ggplot() + +geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") + +coord_map(xlim=longitude_range_vector, ylim=latitude_range_vector) + +geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, color=mlg), alpha=.7, size=3) + +scale_color_manual(values=palette(num_colors)) + +theme(legend.position="bottom") + +guides(color=guide_legend(nrow=8, byrow=F)); dev.off() +time_elapsed(start_time); # Missing data barplot. -poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)]; -test2 <- which(!is.na(poptab$miss)); -miss96 <- poptab$miss[test2]; -name96 <- poptab$user_specimen_id[test2]; +start_time <- time_start("Creating missing_data.pdf"); +population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)]; +test2 <- which(!is.na(population_info_data_table$miss)); +miss96 <- population_info_data_table$miss[test2]; +name96 <- population_info_data_table$user_specimen_id[test2]; +# Start PDF device driver. dev.new(width=10, height=7); file_path = get_file_path("missing_data.pdf"); -pdf (file=file_path, width=10, height=7); +pdf(file=file_path, width=10, height=7); par(mar = c(8, 4, 4, 2)); x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); dev.off() +time_elapsed(start_time); # Generate a pie chart for each sample with a genotype. # Store the numerical and user_specimen_id values from -# report_user for the charts (user_specimen_id names +# stag_db_report for the charts (user_specimen_id names # will be used to label each chart). -dt1 <- data.table(report_user); -dt1 <- report_user[c(-2, -3, -4)]; -dt1 <- na.omit(dt1); -# Translate to N (i.e., number of samples with a -# genotype) columns and 5 rows. -tdt1 <- t(dt1); -# Make another data table and transpose it the same as dt1 to -# get numerics. These will feed into the creation of N vectors. -dt2 <- data.table(report_user); -dt2 <- report_user[c(-1, -2, -3, -4)]; -# Translate to N columns and 5 rows. -tdt2 <- t(dt2); -tdt1_matrix <- as.matrix(tdt1[-1,]); -# The number of columns is the number of samples with genotypes. -nc <- ncol(tdt1_matrix); -mode(tdt1_matrix) <- "numeric"; -spy <- rowMeans(tdt1_matrix); +start_time <- time_start("Creating percent_breakdown.pdf"); +stag_db_report_data_table <- stag_db_report[c(-2, -3, -4)]; +# Remove NA and NaN values. +stag_db_report_data_table <- na.omit(stag_db_report_data_table); +# Translate to N (i.e., number of samples with a genotype) +# columns and 5 rows. +translated_stag_db_report_data_table <- t(stag_db_report_data_table); +translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]); +# Set the storage mode of the matrix to numeric. In some +# cases this could result in the following: +# Warning message: +# In mde(x) : NAs introduced by coercion +mode(translated_stag_db_report_matrix) <- "numeric"; +# Remove NA and NaN values that may have been introduced +# by coercion. +translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix); +tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE); dev.new(width=10, height=7); file_path = get_file_path("percent_breakdown.pdf"); pdf(file=file_path, width=10, height=7); # Average pie of all samples. -labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep=""); +labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(tsdbrm_row_means, 1), "%)", sep=""); col <- c("GREY", "#006DDB", "#24FF24", "#920000"); main <- "Average breakdown of SNP assignments across all samples"; -pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); +pie(tsdbrm_row_means, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); par(mfrow=c(3, 2)); col <- c("GREY", "#006DDB", "#24FF24", "#920000"); -for (i in 1:nc) { - tmp_labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep=""); - main <- paste("Breakdown of SNP assignments for", tdt1[1, i]); - pie(tdt1_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); +# Generate a pie chart for each sample with genotypes. +for (i in 1:ncol(translated_stag_db_report_matrix)) { + tmp_labels <- paste(labels, " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep=""); + main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]); + pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); } dev.off() +time_elapsed(start_time); diff -r 85f8fc57eee4 -r 1190ee1456f6 multilocus_genotype.xml --- a/multilocus_genotype.xml Wed Apr 17 09:07:04 2019 -0400 +++ b/multilocus_genotype.xml Mon May 13 08:37:37 2019 -0400 @@ -1,6 +1,7 @@ unique combination of alleles for loci + bioconductor-snprelate r-adegenet r-ape r-data.table @@ -8,13 +9,16 @@ r-dplyr r-ggplot2 r-knitr + r-maps + r-mapproj r-optparse r-poppr r-rcolorbrewer r-rpostgres r-tidyr + r-vcfr r-vegan - r-vcfr + r-yarrr multilocus_genotype.log; -if [[ $? -ne 0 ]]; then - cp multilocus_genotype.log '$output_stag_db_report'; - exit 1; -fi]]> +&> '$output_log']]> + + + + - + + @@ -60,6 +68,7 @@ Renders the unique combination of the alleles for two or more loci for each individual. The multilocus genotypes are critically important for tracking dispersal and population structure of organisms, especially those that reproduce clonally (plants, sponges, cnidarians, flatworms, annelids, sea stars, and many more). + ----- **Required options**