Mercurial > repos > greg > multilocus_genotype
view multilocus_genotype.R @ 18:1190ee1456f6 draft default tip
Uploaded
author | greg |
---|---|
date | Mon, 13 May 2019 08:37:37 -0400 |
parents | 85f8fc57eee4 |
children |
line wrap: on
line source
#!/usr/bin/env Rscript suppressPackageStartupMessages(library("adegenet")) suppressPackageStartupMessages(library("ape")) suppressPackageStartupMessages(library("data.table")) suppressPackageStartupMessages(library("dbplyr")) suppressPackageStartupMessages(library("dplyr")) suppressPackageStartupMessages(library("ggplot2")) suppressPackageStartupMessages(library("knitr")) suppressPackageStartupMessages(library("maps")) suppressPackageStartupMessages(library("mapproj")) suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("poppr")) suppressPackageStartupMessages(library("RColorBrewer")) suppressPackageStartupMessages(library("RPostgres")) suppressPackageStartupMessages(library("SNPRelate")) suppressPackageStartupMessages(library("tidyr")) suppressPackageStartupMessages(library("vcfR")) suppressPackageStartupMessages(library("vegan")) suppressPackageStartupMessages(library("yarrr")) theme_set(theme_bw()) option_list <- list( make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"), make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file") ) parser <- OptionParser(usage="%prog [options] file", option_list=option_list); args <- parse_args(parser, positional_arguments=TRUE); opt <- args$options; get_file_path = function(file_name) { file_path = paste("output_plots_dir", file_name, sep="/"); return(file_path); } get_database_connection <- function(db_conn_string) { # Instantiate database connection. # The connection string has this format: # postgresql://user:password@host/dbname conn_items <- strsplit(db_conn_string, "://")[[1]]; string_needed <- conn_items[2]; items_needed <- strsplit(string_needed, "@")[[1]]; user_pass_string <- items_needed[1]; host_dbname_string <- items_needed[2]; user_pass_items <- strsplit(user_pass_string, ":")[[1]]; host_dbname_items <- strsplit(host_dbname_string, "/")[[1]]; user <- user_pass_items[1]; pass <- user_pass_items[2]; host <- host_dbname_items[1]; dbname <- host_dbname_items[2]; # FIXME: is there a way to not hard-code the port? conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass); return (conn); } time_elapsed <- function(start_time) { cat("Elapsed time: ", proc.time() - start_time, "\n\n"); } time_start <- function(msg) { start_time <- proc.time(); cat(msg, "...\n"); return(start_time); } # Read in VCF input file. start_time <- time_start("Reading VCF input"); vcf <- read.vcfR(opt$input_vcf); time_elapsed(start_time); # Convert VCF file into a genind for the Poppr package. start_time <- time_start("Converting VCF data to a genind object"); genind_obj <- vcfR2genind(vcf); time_elapsed(start_time); # Add population information to the genind object. population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region"); genind_obj@pop <- as.factor(population_info_data_table$region); strata(genind_obj) <- data.frame(pop(genind_obj)); # Convert genind object to a genclone object. start_time <- time_start("Converting the genind object to a genclone object"); genind_clone <- as.genclone(genind_obj); time_elapsed(start_time); # Calculate the bitwise distance between individuals. start_time <- time_start("Calculating the bitwise distance between individuals"); bitwise_distance <- bitwise.dist(genind_clone); time_elapsed(start_time); # Multilocus genotypes (threshold of 3.2%). mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032; m <- mlg.table(genind_clone, background=TRUE, color=TRUE); # Create list of MLGs. mlg_ids <- mlg.id(genind_clone); # Read user's Affymetrix 96 well plate tabular file. affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA")); colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", "region", "latitude", "longitude", "geographic_origin", "sample_location", "latitude_outplant", "longitude_outplant", "depth", "disease_resist", "bleach_resist", "mortality","tle", "spawning", "collector_last_name", "collector_first_name", "organization", "collection_date", "email", "seq_facility", "array_version", "public", "public_after_date", "sperm_motility", "healing_time", "dna_extraction_method", "dna_concentration", "registry_id"); affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id); user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id); # The specimen_id_field_call_data_table looks like this: # user_specimen_ids V2 # test_002 prolifera # test_003 prolifera specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call); # Rename the user_specimen_ids column. setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id")); # Rename the V2 column. setnames(specimen_id_field_call_data_table, c("V2"), c("field_call")); # Connect to database. conn <- get_database_connection(opt$database_connection_string); # Import the sample table. sample_table <- tbl(conn, "sample"); # Import the genotype table. genotype_table <- tbl(conn, "genotype"); # Select columns from the sample table and the # genotype table joined by genotype_id. sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id); smlg <- sample_table_columns %>% left_join(genotype_table %>% select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"), by=c("genotype_id"="id")); # Convert to dataframe. smlg_data_frame <- data.frame(smlg); # Name the columns. colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"); # Missing GT in samples submitted. start_time <- time_start("Discovering missing GT in samples"); gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); missing_gt <- (missing_gt / nrow(vcf)) * 100; missing_gt_data_frame <- data.frame(missing_gt); # The specimen_id_field_call_data_table looks like this: # rn missing_gt # a100000-4368120-060520-256_I07.CEL 0.06092608 # a100000-4368120-060520-256_K07.CEL 0.05077173 missing_gt_data_table <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[]; # Rename the rn column. setnames(missing_gt_data_table, c("rn"), c("affy_id")); # Rename the missing_gt column. setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral")); # Round data to two digits. missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2); time_elapsed(start_time); # Heterozygous alleles. start_time <- time_start("Discovering heterozygous alleles"); heterozygous_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))}); heterozygous_alleles <- (heterozygous_alleles / nrow(vcf)) * 100; heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles); # The heterozygous_alleles_data_table looks like this: # rn heterozygous_alleles # a100000-4368120-060520-256_I07.CEL 73.94903 # a100000-4368120-060520-256_K07.CEL 74.40089 heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[]; # Rename the rn column. setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id")); # Rename the heterozygous_alleles column. setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral")); # Round data to two digits. heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2); time_elapsed(start_time); # Reference alleles. start_time <- time_start("Discovering reference alleles"); reference_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))}); reference_alleles <- (reference_alleles / nrow(vcf)) * 100; reference_alleles_data_frame <- data.frame(reference_alleles); # The reference_alleles_data_table looks like this: # rn reference_alleles # a100000-4368120-060520-256_I07.CEL 11.60642 # a100000-4368120-060520-256_K07.CEL 11.45918 reference_alleles_data_table <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[]; # Rename the rn column. setnames(reference_alleles_data_table, c("rn"), c("affy_id")); # Rename the reference_alleles column. setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral")); # Round data to two digits. reference_alleles_data_table$percent_reference <- round(reference_alleles_data_table$percent_reference, digits=2); time_elapsed(start_time); # Alternative alleles start_time <- time_start("Discovering alternative alleles"); alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))}); alternative_alleles <- (alternative_alleles / nrow(vcf)) * 100; alternative_alleles_data_frame <- data.frame(alternative_alleles); # The alternative_alleles_data_table looks like this: # rn alternative_alleles # a100000-4368120-060520-256_I07.CEL 14.38363 # a100000-4368120-060520-256_K07.CEL 14.08916 alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[]; # Rename the rn column. setnames(alternative_alleles_data_table, c("rn"), c("affy_id")); # Rename the alternative_alleles column. setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral")); # Round data to two digits. alternative_alleles_data_table$percent_alternative <- round(alternative_alleles_data_table$percent_alternative, digits=2); time_elapsed(start_time); # The mlg_ids_data_table looks like this: # mlg_ids # a550962-4368120-060520-500_M23.CEL # a550962-4368120-060520-256_A19.CEL mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE); # Rename the mlg_ids column. setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id")); # sample_mlg_tibble looks like this: # A tibble: 262 x 3 # Groups: group [?] # group affy_id coral_mlg_clonal_id # <int> <chr> <chr> # 1 a550962-4368120-060520-500_M23.CEL NA # 2 a550962-4368120-060520-256_A19.CEL HG0006 sample_mlg_tibble <- mlg_ids_data_table %>% group_by(row_number()) %>% dplyr::rename(group="row_number()") %>% unnest (affy_id) %>% # Join with mlg table. left_join(smlg_data_frame %>% select("affy_id","coral_mlg_clonal_id"), by="affy_id"); # If found in database, group members on previous mlg id. uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]); uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id)); na.group <- sample_mlg_tibble$group[na.mlg]; sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; # Find out if the sample mlg matched a previous genotyped sample. # sample_mlg_match_tibble looks like this: # A tibble: 262 x 4 # Groups: group [230] # group affy_id coral_mlg_clonal_id DB_match # <int> <chr> <chr> <chr> # 1 a550962-4368120-060520-500_M23.CEL NA no_match # 2 a550962-4368120-060520-256_A19.CEL HG0006 match sample_mlg_match_tibble <- sample_mlg_tibble %>% group_by(group) %>% mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match")); # Create new mlg id for samples with no matches in the database. none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]); none <- none[is.na(none$coral_mlg_clonal_id),]; na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id)); n.g <- sample_mlg_match_tibble$group[na.mlg2]; ct <- length(unique(n.g)); # List of new group ids, the sequence starts at the number of # ids present in sample_mlg_match_tibble$coral_mlg_clonal_ids # plus 1. # FIXME: Not sure if # the sample_mlg_match_tibble file # contains all ids. If it doesn't then look below to change # the seq() function. n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); # Assign the new id iteratively for all that have NA. for (i in 1:length(na.mlg2)) { sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))]; } # Subset population_info_data_table for all samples. # affy_id_user_specimen_id_vector looks like this: # affy_id user_specimen_id # a100000-4368120-060520-256_I07.CEL 13704 # a100000-4368120-060520-256_K07.CEL 13706 affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)]; # Merge data frames for final table. start_time <- time_start("Merging data frames"); stag_db_report <- specimen_id_field_call_data_table %>% left_join(affy_id_user_specimen_id_vector %>% select("affy_id", "user_specimen_id"), by="user_specimen_id") %>% left_join(sample_mlg_match_tibble %>% select("affy_id", "coral_mlg_clonal_id", "DB_match"), by="affy_id") %>% left_join(missing_gt_data_table %>% select("affy_id", "percent_missing_data_coral"), by="affy_id") %>% left_join(heterozygous_alleles_data_table %>% select("affy_id", "percent_heterozygous_coral"), by="affy_id") %>% left_join(reference_alleles_data_table %>% select("affy_id", "percent_reference_coral"), by="affy_id") %>% left_join(alternative_alleles_data_table %>% select("affy_id", "percent_alternative_coral"), by="affy_id") %>% mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.5, "A.palmata","other")) %>% mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral <= 50, "A.cervicornis", genetic_coral_species_call)) %>% mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>% ungroup() %>% select(-group); time_elapsed(start_time); start_time <- time_start("Writing csv output"); write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE); time_elapsed(start_time); # Database sample table. sample_db <- affy_metadata_data_frame %>% left_join(stag_db_report %>% select("user_specimen_id","affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", "percent_reference_coral", "percent_alternative_coral"), by='user_specimen_id'); # Representative clone for genotype table. start_time <- time_start("Creating representative clone for genotype table"); no_dup_genotypes_genind <- clonecorrect(genind_clone, strata = ~pop.genind_obj.); id_rep <- mlg.id(no_dup_genotypes_genind); id_data_table <- data.table(id_rep, keep.rownames=TRUE); # Rename the id_rep column. setnames(id_data_table, c("id_rep"), c("affy_id")); time_elapsed(start_time); # # Combine with previously genotyped samples in the database. start_time <- time_start("Selecting from various database tables"); representative_mlg_tibble <- id_data_table %>% group_by(row_number()) %>% rename(group='row_number()') %>% unnest(affy_id) %>% left_join(stag_db_report %>% select("coral_mlg_clonal_id", "user_specimen_id", "affy_id"), by='affy_id') %>% mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id), "", affy_id)) %>% ungroup() %>% select(-group); # Database genotype table. genotype_table_join <- sample_mlg_match_tibble %>% left_join(representative_mlg_tibble %>% select("affy_id", "coral_mlg_rep_sample_id", "user_specimen_id"), by='affy_id') %>% ungroup() %>% select(-group); # Database taxonomy table. taxonomy_table_join <- stag_db_report %>% select(genetic_coral_species_call, affy_id) %>% mutate(genus_name = ifelse(genetic_coral_species_call == genetic_coral_species_call[grep("^A.*", genetic_coral_species_call)], "Acropora", "other")) %>% mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>% mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>% mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name)); time_elapsed(start_time); # Table of alleles for the new samples subset to new plate data. # Create vector indicating number of individuals desired from # affy_id column of stag_db_report data table. i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]); i <- i[!apply(i== "", 1, all),]; sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE]; # cols looks like this: # blue1 red green pink orange blue2 # "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF" # green2 yellow turquoise poop # "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF" cols <- piratepal("basel"); set.seed(999); if (!is.null(opt$output_nj_phylogeny_tree)) { # Create a phylogeny tree of samples based on distance matrices. # Start PDF device driver. start_time <- time_start("Creating nj_phylogeny_tree.pdf"); dev.new(width=10, height=7); file_path = get_file_path("nj_phylogeny_tree.pdf"); pdf(file=file_path, width=10, height=7); # Organize branches by clade. nj_phylogeny_tree <- sample_alleles_vector %>% aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% ladderize(); nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)]; plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); # Add a scale bar showing 5% difference. add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8); dev.off() time_elapsed(start_time); } # Subset VCF to the user samples. start_time <- time_start("Subsetting vcf to the user samples"); l <- length(i); n <- ncol(vcf@gt); s <- n - l; svcf <- vcf[, s:n]; write.vcf(svcf, "subset.vcf.gz"); vcf.fn <- "subset.vcf.gz"; snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only"); genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); gds_array <- read.gdsn(index.gdsn(genofile, "sample.id")); # gds_array looks like this: # [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL" # [3] "a550962-4368120-060520-500_A09.CEL" "a550962-4368120-060520-500_A11.CEL" gds_data_frame <- data.frame(gds_array); # gds_data_frame looks like this: # gds_array # a550962-4368120-060520-500_A03.CEL # a550962-4368120-060520-500_A05.CEL gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[]; # Rename the gds_array column. setnames(gds_data_table, c("gds_array"), c("affy_id")); # affy_id_region_list looks like this: # affy_id region # a100000-4368120-060520-256_I07.CEL USVI # a100000-4368120-060520-256_K07.CEL USVI affy_id_region_list <- population_info_data_table[c(2,4)]; gds_data_table_join <- gds_data_table %>% left_join(affy_id_region_list %>% select("affy_id", "region"), by='affy_id')%>% drop_na(); samp.annot <- data.frame(pop.group=c(gds_data_table_join$region)); add.gdsn(genofile, "sample.annot", samp.annot); # population_code looks like this: # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); # pop.group looks like this: # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 time_elapsed(start_time); # Distance matrix calculation. start_time <- time_start("Calculating distance matrix"); ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); time_elapsed(start_time); # Cluster analysis on the genome-wide IBS pairwise distance matrix. start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix"); set.seed(100); par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE)); time_elapsed(start_time); # Default clustering. start_time <- time_start("Creating ibs_default.pdf"); # Start PDF device driver. dev.new(width=10, height=7); file_path = get_file_path("ibs_default.pdf"); pdf(file=file_path, width=10, height=7); rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", y.label=0.2); legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); dev.off() time_elapsed(start_time); # Color cluster by region. start_time <- time_start("Creating ibs_region.pdf"); # Start PDF device driver. dev.new(width=10, height=7); file_path = get_file_path("ibs_region.pdf"); pdf(file=file_path, width=10, height=7); race <- as.factor(population_code); rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15); snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", y.label=0.2); legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); dev.off() time_elapsed(start_time); # close GDS file. snpgdsClose(genofile); # Sample MLG on a map. start_time <- time_start("Creating mlg_map.pdf"); # Get the lattitude and longitude boundaries for rendering # the map. Tese boundaries will restrict the map to focus # (i.e., zoom) on the region of the world map from which # the samples were taken. max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE); min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE); latitude_range_vector <- c(min_latitude-3, max_latitude+3); max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE); min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE); longitude_range_vector <- c(min_longitude-3, max_longitude+3); # Get the palette colors for rendering plots. colors <- length(unique(stag_db_report$coral_mlg_clonal_id)); # Get a color palette. palette <- colorRampPalette(piratepal("basel")); # Start PDF device driver. dev.new(width=10, height=7); file_path = get_file_path("mlg_map.pdf"); pdf(file=file_path, width=10, height=7); world_data = map_data("world"); # Add the coral_mlg_clonal_id column from the stag_db_report # data fram to the affy_metadata_data_frame. affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id; # Get the number of colors needed from the palette for plotting # the sample locations on the world map. num_colors = length(unique(affy_metadata_data_frame$mlg)); # Get a color palette. palette = colorRampPalette(piratepal("basel")); ggplot() + geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") + coord_map(xlim=longitude_range_vector, ylim=latitude_range_vector) + geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, color=mlg), alpha=.7, size=3) + scale_color_manual(values=palette(num_colors)) + theme(legend.position="bottom") + guides(color=guide_legend(nrow=8, byrow=F)); dev.off() time_elapsed(start_time); # Missing data barplot. start_time <- time_start("Creating missing_data.pdf"); population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)]; test2 <- which(!is.na(population_info_data_table$miss)); miss96 <- population_info_data_table$miss[test2]; name96 <- population_info_data_table$user_specimen_id[test2]; # Start PDF device driver. dev.new(width=10, height=7); file_path = get_file_path("missing_data.pdf"); pdf(file=file_path, width=10, height=7); par(mar = c(8, 4, 4, 2)); x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); dev.off() time_elapsed(start_time); # Generate a pie chart for each sample with a genotype. # Store the numerical and user_specimen_id values from # stag_db_report for the charts (user_specimen_id names # will be used to label each chart). start_time <- time_start("Creating percent_breakdown.pdf"); stag_db_report_data_table <- stag_db_report[c(-2, -3, -4)]; # Remove NA and NaN values. stag_db_report_data_table <- na.omit(stag_db_report_data_table); # Translate to N (i.e., number of samples with a genotype) # columns and 5 rows. translated_stag_db_report_data_table <- t(stag_db_report_data_table); translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]); # Set the storage mode of the matrix to numeric. In some # cases this could result in the following: # Warning message: # In mde(x) : NAs introduced by coercion mode(translated_stag_db_report_matrix) <- "numeric"; # Remove NA and NaN values that may have been introduced # by coercion. translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix); tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE); dev.new(width=10, height=7); file_path = get_file_path("percent_breakdown.pdf"); pdf(file=file_path, width=10, height=7); # Average pie of all samples. labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(tsdbrm_row_means, 1), "%)", sep=""); col <- c("GREY", "#006DDB", "#24FF24", "#920000"); main <- "Average breakdown of SNP assignments across all samples"; pie(tsdbrm_row_means, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); par(mfrow=c(3, 2)); col <- c("GREY", "#006DDB", "#24FF24", "#920000"); # Generate a pie chart for each sample with genotypes. for (i in 1:ncol(translated_stag_db_report_matrix)) { tmp_labels <- paste(labels, " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep=""); main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]); pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); } dev.off() time_elapsed(start_time);