comparison multilocus_genotype.R @ 18:1190ee1456f6 draft default tip

Uploaded
author greg
date Mon, 13 May 2019 08:37:37 -0400
parents 85f8fc57eee4
children
comparison
equal deleted inserted replaced
17:85f8fc57eee4 18:1190ee1456f6
5 suppressPackageStartupMessages(library("data.table")) 5 suppressPackageStartupMessages(library("data.table"))
6 suppressPackageStartupMessages(library("dbplyr")) 6 suppressPackageStartupMessages(library("dbplyr"))
7 suppressPackageStartupMessages(library("dplyr")) 7 suppressPackageStartupMessages(library("dplyr"))
8 suppressPackageStartupMessages(library("ggplot2")) 8 suppressPackageStartupMessages(library("ggplot2"))
9 suppressPackageStartupMessages(library("knitr")) 9 suppressPackageStartupMessages(library("knitr"))
10 suppressPackageStartupMessages(library("maps"))
11 suppressPackageStartupMessages(library("mapproj"))
10 suppressPackageStartupMessages(library("optparse")) 12 suppressPackageStartupMessages(library("optparse"))
11 suppressPackageStartupMessages(library("poppr")) 13 suppressPackageStartupMessages(library("poppr"))
12 suppressPackageStartupMessages(library("RColorBrewer")) 14 suppressPackageStartupMessages(library("RColorBrewer"))
13 suppressPackageStartupMessages(library("rnaturalearth"))
14 suppressPackageStartupMessages(library("rnaturalearthdata"))
15 suppressPackageStartupMessages(library("RPostgres")) 15 suppressPackageStartupMessages(library("RPostgres"))
16 suppressPackageStartupMessages(library("sf")) 16 suppressPackageStartupMessages(library("SNPRelate"))
17 suppressPackageStartupMessages(library(SNPRelate))
18 suppressPackageStartupMessages(library("tidyr")) 17 suppressPackageStartupMessages(library("tidyr"))
19 suppressPackageStartupMessages(library("vcfR")) 18 suppressPackageStartupMessages(library("vcfR"))
20 suppressPackageStartupMessages(library("vegan")) 19 suppressPackageStartupMessages(library("vegan"))
21 suppressPackageStartupMessages(library("yarrr")) 20 suppressPackageStartupMessages(library("yarrr"))
22 theme_set(theme_bw()) 21 theme_set(theme_bw())
24 option_list <- list( 23 option_list <- list(
25 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), 24 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"),
26 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), 25 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"),
27 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), 26 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
28 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), 27 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
29 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file"), 28 make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"),
30 make_option(c("--nj_tree"), action="store", dest="nj_tree", help="neighbor-joining tree output file") 29 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file")
31 ) 30 )
32 31
33 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); 32 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
34 args <- parse_args(parser, positional_arguments=TRUE); 33 args <- parse_args(parser, positional_arguments=TRUE);
35 opt <- args$options; 34 opt <- args$options;
57 # FIXME: is there a way to not hard-code the port? 56 # FIXME: is there a way to not hard-code the port?
58 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass); 57 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass);
59 return (conn); 58 return (conn);
60 } 59 }
61 60
61 time_elapsed <- function(start_time) {
62 cat("Elapsed time: ", proc.time() - start_time, "\n\n");
63 }
64
65 time_start <- function(msg) {
66 start_time <- proc.time();
67 cat(msg, "...\n");
68 return(start_time);
69 }
70
62 # Read in VCF input file. 71 # Read in VCF input file.
72 start_time <- time_start("Reading VCF input");
63 vcf <- read.vcfR(opt$input_vcf); 73 vcf <- read.vcfR(opt$input_vcf);
74 time_elapsed(start_time);
64 75
65 # Convert VCF file into a genind for the Poppr package. 76 # Convert VCF file into a genind for the Poppr package.
66 # TODO: probably should not hard-code 2 cores. 77 start_time <- time_start("Converting VCF data to a genind object");
67 # changed to genind format for extracting alleles later 78 genind_obj <- vcfR2genind(vcf);
68 # trade-off is it is a bit slower to import data 79 time_elapsed(start_time);
69 # gl <- vcfR2genlight(vcf, n.cores=2)
70 # gind <- new("genind", (as.matrix(gl)))
71 gind <- vcfR2genind(vcf);
72 80
73 # Add population information to the genind object. 81 # Add population information to the genind object.
74 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); 82 population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t");
75 colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); 83 colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region");
76 gind@pop <- as.factor(poptab$region); 84 genind_obj@pop <- as.factor(population_info_data_table$region);
77 strata(gind)<-data.frame(pop(gind)); 85 strata(genind_obj) <- data.frame(pop(genind_obj));
78 86
79 # Convert genind object to a genclone object. 87 # Convert genind object to a genclone object.
80 obj2 <- as.genclone(gind); 88 start_time <- time_start("Converting the genind object to a genclone object");
89 genind_clone <- as.genclone(genind_obj);
90 time_elapsed(start_time);
81 91
82 # Calculate the bitwise distance between individuals. 92 # Calculate the bitwise distance between individuals.
83 xdis <- bitwise.dist(obj2); 93 start_time <- time_start("Calculating the bitwise distance between individuals");
94 bitwise_distance <- bitwise.dist(genind_clone);
95 time_elapsed(start_time);
84 96
85 # Multilocus genotypes (threshold of 3.2%). 97 # Multilocus genotypes (threshold of 3.2%).
86 # threshold doubled because of how the data is formatted in genind compared to genlight 98 mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032;
87 mlg.filter(obj2, distance=xdis) <- 0.032; 99 m <- mlg.table(genind_clone, background=TRUE, color=TRUE);
88 m <- mlg.table(obj2, background=TRUE, color=TRUE); 100
89 101 # Create list of MLGs.
90 # Create table of MLGs. 102 mlg_ids <- mlg.id(genind_clone);
91 id <- mlg.id(obj2);
92 #dt <- data.table(id, keep.rownames=TRUE);
93 #setnames(dt, c("id"), c("affy_id"));
94 103
95 # Read user's Affymetrix 96 well plate tabular file. 104 # Read user's Affymetrix 96 well plate tabular file.
96 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA")); 105 affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA"));
97 colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", 106 colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
98 "region", "latitude", "longitude", "geographic_origin", "sample_location", 107 "region", "latitude", "longitude", "geographic_origin", "sample_location",
99 "latitude_outplant", "longitude_outplant", "depth", "disease_resist", 108 "latitude_outplant", "longitude_outplant", "depth", "disease_resist",
100 "bleach_resist", "mortality","tle", "spawning", "collector_last_name", 109 "bleach_resist", "mortality","tle", "spawning", "collector_last_name",
101 "collector_first_name", "organization", "collection_date", "email", "seq_facility", 110 "collector_first_name", "organization", "collection_date", "email", "seq_facility",
102 "array_version", "public", "public_after_date", "sperm_motility", "healing_time", 111 "array_version", "public", "public_after_date", "sperm_motility", "healing_time",
103 "dna_extraction_method", "dna_concentration", "registry_id"); 112 "dna_extraction_method", "dna_concentration", "registry_id");
104 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); 113 affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id);
105 pinfo2 <- as.character(pinfo$user_specimen_id); 114 user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id);
106 pi <- data.table(pinfo2, pinfo$field_call); 115 # The specimen_id_field_call_data_table looks like this:
107 setnames(pi, c("pinfo2"), c("user_specimen_id")); 116 # user_specimen_ids V2
108 setnames(pi, c("V2"), c("field_call")); 117 # test_002 prolifera
118 # test_003 prolifera
119 specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call);
120 # Rename the user_specimen_ids column.
121 setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id"));
122 # Rename the V2 column.
123 setnames(specimen_id_field_call_data_table, c("V2"), c("field_call"));
109 124
110 # Connect to database. 125 # Connect to database.
111 conn <- get_database_connection(opt$database_connection_string); 126 conn <- get_database_connection(opt$database_connection_string);
112
113 # Import the sample table. 127 # Import the sample table.
114 sample_table <- tbl(conn, "sample"); 128 sample_table <- tbl(conn, "sample");
115
116 # Import the genotype table. 129 # Import the genotype table.
117 genotype_table <- tbl(conn, "genotype"); 130 genotype_table <- tbl(conn, "genotype");
118
119 # Select columns from the sample table and the 131 # Select columns from the sample table and the
120 # genotype table joined by genotype_id. 132 # genotype table joined by genotype_id.
121 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id); 133 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id);
122 smlg <- sample_table_columns %>% 134 smlg <- sample_table_columns %>%
123 left_join(genotype_table %>% 135 left_join(genotype_table %>%
124 select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"), 136 select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"),
125 by=c("genotype_id" = "id")); 137 by=c("genotype_id"="id"));
126
127 # Convert to dataframe. 138 # Convert to dataframe.
128 sm <- data.frame(smlg); 139 smlg_data_frame <- data.frame(smlg);
129 # Name the columns. 140 # Name the columns.
130 colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"); 141 colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id");
131 142
132 # Missing GT in samples submitted. 143 # Missing GT in samples submitted.
144 start_time <- time_start("Discovering missing GT in samples");
133 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); 145 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
134 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); 146 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
135 myMiss <- (myMiss / nrow(vcf)) * 100; 147 missing_gt <- (missing_gt / nrow(vcf)) * 100;
136 miss <- data.frame(myMiss); 148 missing_gt_data_frame <- data.frame(missing_gt);
137 149 # The specimen_id_field_call_data_table looks like this:
138 # Convert missing data into data table. 150 # rn missing_gt
139 mi <-setDT(miss, keep.rownames=TRUE)[]; 151 # a100000-4368120-060520-256_I07.CEL 0.06092608
140 setnames(mi, c("rn"), c("affy_id")); 152 # a100000-4368120-060520-256_K07.CEL 0.05077173
141 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); 153 missing_gt_data_table <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[];
142 # Round missing data to two digits. 154 # Rename the rn column.
143 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); 155 setnames(missing_gt_data_table, c("rn"), c("affy_id"));
144 156 # Rename the missing_gt column.
145 #heterozygous alleles 157 setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral"));
146 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); 158 # Round data to two digits.
147 hets <- (hets / nrow(vcf)) * 100; 159 missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2);
148 ht <- data.frame(hets); 160 time_elapsed(start_time);
149 161
150 # Convert heterozygosity data into data table. 162 # Heterozygous alleles.
151 ht <-setDT(ht, keep.rownames=TRUE)[]; 163 start_time <- time_start("Discovering heterozygous alleles");
152 setnames(ht, c("rn"), c("affy_id")); 164 heterozygous_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))});
153 setnames(ht, c("hets"), c("percent_mixed_coral")); 165 heterozygous_alleles <- (heterozygous_alleles / nrow(vcf)) * 100;
154 # Round missing data to two digits. 166 heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles);
155 ht$percent_mixed<-round(ht$percent_mixed, digits=2); 167 # The heterozygous_alleles_data_table looks like this:
156 168 # rn heterozygous_alleles
157 #reference alleles 169 # a100000-4368120-060520-256_I07.CEL 73.94903
158 refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); 170 # a100000-4368120-060520-256_K07.CEL 74.40089
159 refA <- (refA / nrow(vcf)) * 100; 171 heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[];
160 rA <- data.frame(refA); 172 # Rename the rn column.
161 173 setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id"));
162 # Convert refA data into data.table. 174 # Rename the heterozygous_alleles column.
163 rA <-setDT(rA, keep.rownames=TRUE)[]; 175 setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral"));
164 setnames(rA, c("rn"), c("affy_id")); 176 # Round data to two digits.
165 setnames(rA, c("refA"), c("percent_reference_coral")); 177 heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2);
166 # round missing data to two digits. 178 time_elapsed(start_time);
167 rA$percent_reference<-round(rA$percent_reference, digits=2); 179
168 180 # Reference alleles.
169 #alternative alleles 181 start_time <- time_start("Discovering reference alleles");
170 altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); 182 reference_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))});
171 altB <- (altB / nrow(vcf)) * 100; 183 reference_alleles <- (reference_alleles / nrow(vcf)) * 100;
172 aB <- data.frame(altB); 184 reference_alleles_data_frame <- data.frame(reference_alleles);
173 185 # The reference_alleles_data_table looks like this:
174 # Convert altB data into data table. 186 # rn reference_alleles
175 aB <-setDT(aB, keep.rownames=TRUE)[]; 187 # a100000-4368120-060520-256_I07.CEL 11.60642
176 setnames(aB, c("rn"), c("affy_id")); 188 # a100000-4368120-060520-256_K07.CEL 11.45918
177 setnames(aB, c("altB"), c("percent_alternative_coral")); 189 reference_alleles_data_table <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[];
178 # Round missing data to two digits. 190 # Rename the rn column.
179 aB$percent_alternative<-round(aB$percent_alternative, digits=2); 191 setnames(reference_alleles_data_table, c("rn"), c("affy_id"));
180 192 # Rename the reference_alleles column.
181 #convert mlg id to data.table format 193 setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral"));
182 dt <- data.table(id, keep.rownames=TRUE); 194 # Round data to two digits.
183 setnames(dt, c("id"), c("affy_id")); 195 reference_alleles_data_table$percent_reference <- round(reference_alleles_data_table$percent_reference, digits=2);
184 196 time_elapsed(start_time);
185 # Transform. 197
186 df3 <- dt %>% 198 # Alternative alleles
199 start_time <- time_start("Discovering alternative alleles");
200 alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))});
201 alternative_alleles <- (alternative_alleles / nrow(vcf)) * 100;
202 alternative_alleles_data_frame <- data.frame(alternative_alleles);
203 # The alternative_alleles_data_table looks like this:
204 # rn alternative_alleles
205 # a100000-4368120-060520-256_I07.CEL 14.38363
206 # a100000-4368120-060520-256_K07.CEL 14.08916
207 alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[];
208 # Rename the rn column.
209 setnames(alternative_alleles_data_table, c("rn"), c("affy_id"));
210 # Rename the alternative_alleles column.
211 setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral"));
212 # Round data to two digits.
213 alternative_alleles_data_table$percent_alternative <- round(alternative_alleles_data_table$percent_alternative, digits=2);
214 time_elapsed(start_time);
215
216 # The mlg_ids_data_table looks like this:
217 # mlg_ids
218 # a550962-4368120-060520-500_M23.CEL
219 # a550962-4368120-060520-256_A19.CEL
220 mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE);
221 # Rename the mlg_ids column.
222 setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id"));
223
224 # sample_mlg_tibble looks like this:
225 # A tibble: 262 x 3
226 # Groups: group [?]
227 # group affy_id coral_mlg_clonal_id
228 # <int> <chr> <chr>
229 # 1 a550962-4368120-060520-500_M23.CEL NA
230 # 2 a550962-4368120-060520-256_A19.CEL HG0006
231 sample_mlg_tibble <- mlg_ids_data_table %>%
187 group_by(row_number()) %>% 232 group_by(row_number()) %>%
188 dplyr::rename(group="row_number()") %>% 233 dplyr::rename(group="row_number()") %>%
189 unnest (affy_id) %>% 234 unnest (affy_id) %>%
190 # Join with mlg table. 235 # Join with mlg table.
191 left_join(sm %>% 236 left_join(smlg_data_frame %>%
192 select("affy_id","coral_mlg_clonal_id"), 237 select("affy_id","coral_mlg_clonal_id"),
193 by="affy_id"); 238 by="affy_id");
194 239
195 # If found in database, group members on previous mlg id. 240 # If found in database, group members on previous mlg id.
196 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); 241 uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]);
197 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; 242 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),];
198 na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); 243 na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id));
199 na.group <- df3$group[na.mlg]; 244 na.group <- sample_mlg_tibble$group[na.mlg];
200 df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; 245 sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
201 246
202 # Determine if the sample mlg matched previous genotyped sample. 247 # Find out if the sample mlg matched a previous genotyped sample.
203 df4<- df3 %>% 248 # sample_mlg_match_tibble looks like this:
249 # A tibble: 262 x 4
250 # Groups: group [230]
251 # group affy_id coral_mlg_clonal_id DB_match
252 # <int> <chr> <chr> <chr>
253 # 1 a550962-4368120-060520-500_M23.CEL NA no_match
254 # 2 a550962-4368120-060520-256_A19.CEL HG0006 match
255 sample_mlg_match_tibble <- sample_mlg_tibble %>%
204 group_by(group) %>% 256 group_by(group) %>%
205 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match")); 257 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match"));
206 258
207 # Create new mlg id for samples that did not match those in the database. 259 # Create new mlg id for samples with no matches in the database.
208 none <- unique(df4[c("group", "coral_mlg_clonal_id")]); 260 none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]);
209 none <- none[is.na(none$coral_mlg_clonal_id),]; 261 none <- none[is.na(none$coral_mlg_clonal_id),];
210 na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id)); 262 na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id));
211 n.g <- df4$group[na.mlg2]; 263 n.g <- sample_mlg_match_tibble$group[na.mlg2];
212 ct <- length(unique(n.g)); 264 ct <- length(unique(n.g));
213 265
214 # List of new group ids, the sequence starts at the number of 266 # List of new group ids, the sequence starts at the number of
215 # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if 267 # ids present in sample_mlg_match_tibble$coral_mlg_clonal_ids
216 # the df4 file contains all ids. If it doesn't then look below 268 # plus 1.
217 # to change the seq() function. 269 # FIXME: Not sure if # the sample_mlg_match_tibble file
218 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); 270 # contains all ids. If it doesn't then look below to change
219 # Pair group with new ids. 271 # the seq() function.
220 rat <- cbind(unique(n.g), n.g_ids); 272 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
273
221 # Assign the new id iteratively for all that have NA. 274 # Assign the new id iteratively for all that have NA.
222 for (i in 1:length(na.mlg2)) { 275 for (i in 1:length(na.mlg2)) {
223 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; 276 sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))];
224 } 277 }
225 278
226 # Subset poptab for all samples. 279 # Subset population_info_data_table for all samples.
227 subpop <- poptab[c(2, 3)]; 280 # affy_id_user_specimen_id_vector looks like this:
281 # affy_id user_specimen_id
282 # a100000-4368120-060520-256_I07.CEL 13704
283 # a100000-4368120-060520-256_K07.CEL 13706
284 affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)];
228 285
229 # Merge data frames for final table. 286 # Merge data frames for final table.
230 report_user <- pi %>% 287 start_time <- time_start("Merging data frames");
231 left_join(subpop %>% 288 stag_db_report <- specimen_id_field_call_data_table %>%
289 left_join(affy_id_user_specimen_id_vector %>%
232 select("affy_id", "user_specimen_id"), 290 select("affy_id", "user_specimen_id"),
233 by="user_specimen_id") %>% 291 by="user_specimen_id") %>%
234 left_join(df4 %>% 292 left_join(sample_mlg_match_tibble %>%
235 select("affy_id", "coral_mlg_clonal_id", "DB_match"), 293 select("affy_id", "coral_mlg_clonal_id", "DB_match"),
236 by="affy_id") %>% 294 by="affy_id") %>%
237 left_join(mi %>% 295 left_join(missing_gt_data_table %>%
238 select("affy_id", "percent_missing_data_coral"), 296 select("affy_id", "percent_missing_data_coral"),
239 by="affy_id") %>% 297 by="affy_id") %>%
240 left_join(ht %>% 298 left_join(heterozygous_alleles_data_table %>%
241 select("affy_id", "percent_mixed_coral"), 299 select("affy_id", "percent_heterozygous_coral"),
242 by="affy_id") %>% 300 by="affy_id") %>%
243 left_join(rA %>% 301 left_join(reference_alleles_data_table %>%
244 select("affy_id", "percent_reference_coral"), 302 select("affy_id", "percent_reference_coral"),
245 by="affy_id") %>% 303 by="affy_id") %>%
246 left_join(aB %>% 304 left_join(alternative_alleles_data_table %>%
247 select("affy_id", "percent_alternative_coral"), 305 select("affy_id", "percent_alternative_coral"),
248 by="affy_id") %>% 306 by="affy_id") %>%
249 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% 307 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
250 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% 308 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
251 mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 40 & percent_alternative_coral<= 44.5,"A.palmata","other")) %>% 309 mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.5, "A.palmata","other")) %>%
252 mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral<= 50,"A.cervicornis",genetic_coral_species_call)) %>% 310 mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral <= 50, "A.cervicornis", genetic_coral_species_call)) %>%
253 mutate(genetic_coral_species_call=ifelse(percent_heterozygous_coral > 40,"A.prolifera",genetic_coral_species_call)) %>% 311 mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>%
254 ungroup() %>% 312 ungroup() %>%
255 select(-group); 313 select(-group);
256 314 time_elapsed(start_time);
257 write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE); 315
258 316 start_time <- time_start("Writing csv output");
259 # Database tables 317 write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE);
260 ## Sample.table 318 time_elapsed(start_time);
261 sample_db <- pinfo %>% 319
262 left_join( 320 # Database sample table.
263 report_user %>% 321 sample_db <- affy_metadata_data_frame %>%
264 select("user_specimen_id","affy_id", 322 left_join(stag_db_report %>%
265 "percent_missing_data_coral","percent_heterozygous_coral","percent_reference_coral", 323 select("user_specimen_id","affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", "percent_reference_coral", "percent_alternative_coral"),
266 "percent_alternative_coral"), 324 by='user_specimen_id');
267 by='user_specimen_id'); 325
268 326 # Representative clone for genotype table.
269 ###representative clone for genotype.table 327 start_time <- time_start("Creating representative clone for genotype table");
270 cc<-clonecorrect(obj2, strata= ~pop.gind.); 328 no_dup_genotypes_genind <- clonecorrect(genind_clone, strata = ~pop.genind_obj.);
271 id_rep<-mlg.id(cc); 329 id_rep <- mlg.id(no_dup_genotypes_genind);
272 dt_cc<-data.table(id_rep,keep.rownames = TRUE); 330 id_data_table <- data.table(id_rep, keep.rownames=TRUE);
273 setnames(dt_cc, c("id_rep"), c("affy_id")); 331 # Rename the id_rep column.
274 332 setnames(id_data_table, c("id_rep"), c("affy_id"));
275 ###transform mlg data.table 333 time_elapsed(start_time);
276 df_cc <- dt_cc %>% 334
277 group_by(row_number()) %>% 335 # # Combine with previously genotyped samples in the database.
278 rename(group='row_number()') %>% 336 start_time <- time_start("Selecting from various database tables");
279 unnest(affy_id) %>% 337 representative_mlg_tibble <- id_data_table %>%
280 left_join(report_user %>% 338 group_by(row_number()) %>%
281 select("coral_mlg_clonal_id","user_specimen_id","affy_id"), 339 rename(group='row_number()') %>%
282 by='affy_id') %>% 340 unnest(affy_id) %>%
283 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id),"",affy_id)) %>% 341 left_join(stag_db_report %>%
284 ungroup() %>% 342 select("coral_mlg_clonal_id", "user_specimen_id", "affy_id"),
285 select(-group); 343 by='affy_id') %>%
286 344 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id), "", affy_id)) %>%
287 ##geno.table 345 ungroup() %>%
288 geno_db <- df4 %>% 346 select(-group);
289 left_join(df_cc %>% 347
290 select("affy_id","coral_mlg_rep_sample_id","user_specimen_id"), 348 # Database genotype table.
349 genotype_table_join <- sample_mlg_match_tibble %>%
350 left_join(representative_mlg_tibble %>%
351 select("affy_id", "coral_mlg_rep_sample_id", "user_specimen_id"),
291 by='affy_id') %>% 352 by='affy_id') %>%
292 ungroup() %>% 353 ungroup() %>%
293 select(-group); 354 select(-group);
294 355
295 ##taxonomy.table 356 # Database taxonomy table.
296 357 taxonomy_table_join <- stag_db_report %>%
297 tax_db <- report_user %>% 358 select(genetic_coral_species_call, affy_id) %>%
298 select(genetic_coral_species_call, affy_id) %>% 359 mutate(genus_name = ifelse(genetic_coral_species_call == genetic_coral_species_call[grep("^A.*", genetic_coral_species_call)], "Acropora", "other")) %>%
299 mutate(genus_name =ifelse(genetic_coral_species_call== 360 mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>%
300 genetic_coral_species_call[grep("^A.*",genetic_coral_species_call)],"Acropora","other")) %>% 361 mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>%
301 mutate(species_name=ifelse(genetic_coral_species_call=="A.palmata","palmata","other"))%>% 362 mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name));
302 mutate(species_name=ifelse(genetic_coral_species_call =="A.cervicornis","cervicornis",species_name))%>% 363 time_elapsed(start_time);
303 mutate(species_name=ifelse(genetic_coral_species_call=="A.prolifera","prolifera", species_name)); 364 # Table of alleles for the new samples subset to new plate data.
304 365 # Create vector indicating number of individuals desired from
305 366 # affy_id column of stag_db_report data table.
306 # Table of alleles for the new samples 367 i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]);
307 ## subset to new plate data 368 i <- i[!apply(i== "", 1, all),];
308 ### create vector indicating number of individuals desired 369 sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE];
309 ### made from affy_id collumn from report_user data table 370
310 i<-ifelse(is.na(report_user[1]),"",report_user[[1]]); 371 # cols looks like this:
311 i<-i[!apply(i == "", 1, all),]; 372 # blue1 red green pink orange blue2
312 sub96<-obj2[i, mlg.reset = FALSE, drop = FALSE]; 373 # "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF"
313 374 # green2 yellow turquoise poop
314 # convert to data frame 375 # "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF"
315 at_96<-genind2df(sub96, sep="");
316 at_96<- at_96 %>%
317 select(-pop);
318
319 # allele string for Allele.table in database
320 uat_96<-unite(at_96, alleles, 1:19696, sep = " ", remove = TRUE);
321 uat_96<-setDT(uat_96, keep.rownames = TRUE)[];
322 setnames(uat_96, c("rn"), c("user_specimen_id"));
323 # write.csv(uat_96,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE);
324
325 # Create a phylogeny of samples based on distance matrices.
326 cols <- piratepal("basel"); 376 cols <- piratepal("basel");
327 set.seed(999); 377 set.seed(999);
328 # Start PDF device driver. 378
329 dev.new(width=10, height=7); 379 if (!is.null(opt$output_nj_phylogeny_tree)) {
330 file_path = get_file_path("nj_phylogeny.pdf"); 380 # Create a phylogeny tree of samples based on distance matrices.
331 pdf(file=file_path, width=10, height=7); 381 # Start PDF device driver.
332 # Organize branches by clade. 382 start_time <- time_start("Creating nj_phylogeny_tree.pdf");
333 theTree <- sub96 %>% 383 dev.new(width=10, height=7);
334 aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% 384 file_path = get_file_path("nj_phylogeny_tree.pdf");
335 ladderize(); 385 pdf(file=file_path, width=10, height=7);
336 theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)]; 386 # Organize branches by clade.
337 plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); 387 nj_phylogeny_tree <- sample_alleles_vector %>%
338 # Add a scale bar showing 5% difference.. 388 aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>%
339 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); 389 ladderize();
340 nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); 390 nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)];
341 legend("topright", legend=c(levels(sub96$pop)), text.col=cols, xpd=T, cex=0.8); 391 plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T);
342 dev.off() 392 # Add a scale bar showing 5% difference.
343 393 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3);
344 write.tree(theTree, file =opt$nj_tree, quote=FALSE); 394 nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
345 395 legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8);
346 # identity-by-state analysis 396 dev.off()
347 #if (!requireNamespace("BiocManager", quietly = TRUE)) 397 time_elapsed(start_time);
348 # install.packages("BiocManager") 398 }
349 #BiocManager::install("SNPRelate", version = "3.8") 399
350 400 # Subset VCF to the user samples.
351 #subset VCF to the user samples 401 start_time <- time_start("Subsetting vcf to the user samples");
352 l<-length(i); 402 l <- length(i);
353 n<-ncol(vcf@gt); 403 n <- ncol(vcf@gt);
354 s<-n-l; 404 s <- n - l;
355 svcf<-vcf[,s:n]; 405 svcf <- vcf[, s:n];
356 write.vcf(svcf, "subset.vcf.gz"); 406 write.vcf(svcf, "subset.vcf.gz");
357
358 vcf.fn <- "subset.vcf.gz"; 407 vcf.fn <- "subset.vcf.gz";
359 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only"); 408 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only");
360 409 genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE);
361 genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); 410 gds_array <- read.gdsn(index.gdsn(genofile, "sample.id"));
362 hd<-read.gdsn(index.gdsn(genofile, "sample.id")); 411 # gds_array looks like this:
363 hd<-data.frame(hd); 412 # [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL"
364 hd<-setDT(hd, keep.rownames = FALSE)[]; 413 # [3] "a550962-4368120-060520-500_A09.CEL" "a550962-4368120-060520-500_A11.CEL"
365 setnames(hd, c("hd"), c("user_specimen_id")); 414 gds_data_frame <- data.frame(gds_array);
366 415 # gds_data_frame looks like this:
367 subpop2<- poptab[c(2,4)]; 416 # gds_array
368 poptab_sub <- hd %>% 417 # a550962-4368120-060520-500_A03.CEL
369 left_join( 418 # a550962-4368120-060520-500_A05.CEL
370 subpop2 %>% 419 gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[];
371 select("affy_id","region"), 420 # Rename the gds_array column.
372 by='affy_id')%>% 421 setnames(gds_data_table, c("gds_array"), c("affy_id"));
373 drop_na(); 422 # affy_id_region_list looks like this:
374 423 # affy_id region
375 samp.annot <- data.frame(pop.group = c(poptab_sub$region)); 424 # a100000-4368120-060520-256_I07.CEL USVI
425 # a100000-4368120-060520-256_K07.CEL USVI
426 affy_id_region_list <- population_info_data_table[c(2,4)];
427 gds_data_table_join <- gds_data_table %>%
428 left_join(affy_id_region_list %>%
429 select("affy_id", "region"),
430 by='affy_id')%>%
431 drop_na();
432 samp.annot <- data.frame(pop.group=c(gds_data_table_join$region));
376 add.gdsn(genofile, "sample.annot", samp.annot); 433 add.gdsn(genofile, "sample.annot", samp.annot);
377 434 # population_code looks like this:
378 pop_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); 435 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733
436 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
437 population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"));
379 pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); 438 pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group")));
380 439 # pop.group looks like this:
381 # Identity-By-State Analysis - distance matrix calculation 440 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733
441 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
442 time_elapsed(start_time);
443
444 # Distance matrix calculation.
445 start_time <- time_start("Calculating distance matrix");
382 ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); 446 ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE);
383 447 time_elapsed(start_time);
384 # cluster analysis on the genome-wide IBS pairwise distance matrix 448
449 # Cluster analysis on the genome-wide IBS pairwise distance matrix.
450 start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix");
385 set.seed(100); 451 set.seed(100);
386 par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); 452 par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2);
387 ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE)); 453 ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE));
388 454 time_elapsed(start_time);
389 # default clustering. 455
456 # Default clustering.
457 start_time <- time_start("Creating ibs_default.pdf");
458 # Start PDF device driver.
390 dev.new(width=10, height=7); 459 dev.new(width=10, height=7);
391 file_path = get_file_path("IBS_default.pdf"); 460 file_path = get_file_path("ibs_default.pdf");
392 pdf (file=file_path, width=10, height=7); 461 pdf(file=file_path, width=10, height=7);
393 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); 462 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15);
394 snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular",y.label=0.2); 463 snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", y.label=0.2);
395 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); 464 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2);
396 dev.off() 465 dev.off()
397 466 time_elapsed(start_time);
398 # color cluster by region. 467
468 # Color cluster by region.
469 start_time <- time_start("Creating ibs_region.pdf");
470 # Start PDF device driver.
399 dev.new(width=10, height=7); 471 dev.new(width=10, height=7);
400 file_path = get_file_path("IBS_Region.pdf"); 472 file_path = get_file_path("ibs_region.pdf");
401 pdf (file=file_path, width=10, height=7); 473 pdf(file=file_path, width=10, height=7);
402 race <- as.factor(pop_code); 474 race <- as.factor(population_code);
403 rv2 <- snpgdsCutTree(ibs.hc,samp.group=race,col.list=cols,pch.list=15); 475 rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15);
404 snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular",y.label=0.2); 476 snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", y.label=0.2);
405 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); 477 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2);
406 dev.off() 478 dev.off()
407 479 time_elapsed(start_time);
408 #close GDS file 480
481 # close GDS file.
409 snpgdsClose(genofile); 482 snpgdsClose(genofile);
410 483
411 # Sample MLG on a map. 484 # Sample MLG on a map.
412 world <- ne_countries(scale = "medium", returnclass = "sf"); 485 start_time <- time_start("Creating mlg_map.pdf");
413 class(world); 486 # Get the lattitude and longitude boundaries for rendering
414 487 # the map. Tese boundaries will restrict the map to focus
415 pinfo$mlg<-report_user$coral_mlg_clonal_id; 488 # (i.e., zoom) on the region of the world map from which
416 n <- nrow(pinfo); 489 # the samples were taken.
417 490 max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE);
418 mxlat<-max(pinfo$latitude,na.rm = TRUE); 491 min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE);
419 mnlat<-min(pinfo$latitude,na.rm = TRUE); 492 latitude_range_vector <- c(min_latitude-3, max_latitude+3);
420 mxlong<-max(pinfo$longitude,na.rm = TRUE); 493 max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE);
421 mnlong<-min(pinfo$longitude,na.rm = TRUE); 494 min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE);
422 495 longitude_range_vector <- c(min_longitude-3, max_longitude+3);
423 p5<-ggplot(data = world) + 496 # Get the palette colors for rendering plots.
424 geom_sf() + 497 colors <- length(unique(stag_db_report$coral_mlg_clonal_id));
425 coord_sf(xlim = c(mnlong-3, mxlong+3), ylim = c(mnlat-3,mxlat+3), expand = FALSE); 498 # Get a color palette.
426 499 palette <- colorRampPalette(piratepal("basel"));
427 colourCount = length(unique(pinfo$mlg)); 500 # Start PDF device driver.
428 getPalette = colorRampPalette(piratepal("basel"));
429 dev.new(width=10, height=7); 501 dev.new(width=10, height=7);
430 file_path = get_file_path("mlg_map.pdf"); 502 file_path = get_file_path("mlg_map.pdf");
431 pdf (file=file_path, width=10, height=7); 503 pdf(file=file_path, width=10, height=7);
432 p6<-p5+ geom_point(data = pinfo,aes(x =longitude, y=latitude, group=mlg, color = mlg), alpha=.7, size=3)+ 504 world_data = map_data("world");
433 scale_color_manual(values=getPalette(colourCount))+ 505 # Add the coral_mlg_clonal_id column from the stag_db_report
434 theme(legend.position="bottom")+ 506 # data fram to the affy_metadata_data_frame.
435 guides(color=guide_legend(nrow=8,byrow=F)); 507 affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id;
436 p6; 508 # Get the number of colors needed from the palette for plotting
509 # the sample locations on the world map.
510 num_colors = length(unique(affy_metadata_data_frame$mlg));
511 # Get a color palette.
512 palette = colorRampPalette(piratepal("basel"));
513 ggplot() +
514 geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") +
515 coord_map(xlim=longitude_range_vector, ylim=latitude_range_vector) +
516 geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, color=mlg), alpha=.7, size=3) +
517 scale_color_manual(values=palette(num_colors)) +
518 theme(legend.position="bottom") +
519 guides(color=guide_legend(nrow=8, byrow=F));
437 dev.off() 520 dev.off()
521 time_elapsed(start_time);
438 522
439 # Missing data barplot. 523 # Missing data barplot.
440 poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)]; 524 start_time <- time_start("Creating missing_data.pdf");
441 test2 <- which(!is.na(poptab$miss)); 525 population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)];
442 miss96 <- poptab$miss[test2]; 526 test2 <- which(!is.na(population_info_data_table$miss));
443 name96 <- poptab$user_specimen_id[test2]; 527 miss96 <- population_info_data_table$miss[test2];
528 name96 <- population_info_data_table$user_specimen_id[test2];
529 # Start PDF device driver.
444 dev.new(width=10, height=7); 530 dev.new(width=10, height=7);
445 file_path = get_file_path("missing_data.pdf"); 531 file_path = get_file_path("missing_data.pdf");
446 pdf (file=file_path, width=10, height=7); 532 pdf(file=file_path, width=10, height=7);
447 par(mar = c(8, 4, 4, 2)); 533 par(mar = c(8, 4, 4, 2));
448 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); 534 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n");
449 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); 535 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
450 dev.off() 536 dev.off()
537 time_elapsed(start_time);
451 538
452 # Generate a pie chart for each sample with a genotype. 539 # Generate a pie chart for each sample with a genotype.
453 # Store the numerical and user_specimen_id values from 540 # Store the numerical and user_specimen_id values from
454 # report_user for the charts (user_specimen_id names 541 # stag_db_report for the charts (user_specimen_id names
455 # will be used to label each chart). 542 # will be used to label each chart).
456 dt1 <- data.table(report_user); 543 start_time <- time_start("Creating percent_breakdown.pdf");
457 dt1 <- report_user[c(-2, -3, -4)]; 544 stag_db_report_data_table <- stag_db_report[c(-2, -3, -4)];
458 dt1 <- na.omit(dt1); 545 # Remove NA and NaN values.
459 # Translate to N (i.e., number of samples with a 546 stag_db_report_data_table <- na.omit(stag_db_report_data_table);
460 # genotype) columns and 5 rows. 547 # Translate to N (i.e., number of samples with a genotype)
461 tdt1 <- t(dt1); 548 # columns and 5 rows.
462 # Make another data table and transpose it the same as dt1 to 549 translated_stag_db_report_data_table <- t(stag_db_report_data_table);
463 # get numerics. These will feed into the creation of N vectors. 550 translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]);
464 dt2 <- data.table(report_user); 551 # Set the storage mode of the matrix to numeric. In some
465 dt2 <- report_user[c(-1, -2, -3, -4)]; 552 # cases this could result in the following:
466 # Translate to N columns and 5 rows. 553 # Warning message:
467 tdt2 <- t(dt2); 554 # In mde(x) : NAs introduced by coercion
468 tdt1_matrix <- as.matrix(tdt1[-1,]); 555 mode(translated_stag_db_report_matrix) <- "numeric";
469 # The number of columns is the number of samples with genotypes. 556 # Remove NA and NaN values that may have been introduced
470 nc <- ncol(tdt1_matrix); 557 # by coercion.
471 mode(tdt1_matrix) <- "numeric"; 558 translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix);
472 spy <- rowMeans(tdt1_matrix); 559 tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE);
473 dev.new(width=10, height=7); 560 dev.new(width=10, height=7);
474 file_path = get_file_path("percent_breakdown.pdf"); 561 file_path = get_file_path("percent_breakdown.pdf");
475 pdf(file=file_path, width=10, height=7); 562 pdf(file=file_path, width=10, height=7);
476 # Average pie of all samples. 563 # Average pie of all samples.
477 labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep=""); 564 labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(tsdbrm_row_means, 1), "%)", sep="");
478 col <- c("GREY", "#006DDB", "#24FF24", "#920000"); 565 col <- c("GREY", "#006DDB", "#24FF24", "#920000");
479 main <- "Average breakdown of SNP assignments across all samples"; 566 main <- "Average breakdown of SNP assignments across all samples";
480 pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); 567 pie(tsdbrm_row_means, labels=labels, radius=0.60, col=col, main=main, cex.main=.75);
481 par(mfrow=c(3, 2)); 568 par(mfrow=c(3, 2));
482 col <- c("GREY", "#006DDB", "#24FF24", "#920000"); 569 col <- c("GREY", "#006DDB", "#24FF24", "#920000");
483 for (i in 1:nc) { 570 # Generate a pie chart for each sample with genotypes.
484 tmp_labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep=""); 571 for (i in 1:ncol(translated_stag_db_report_matrix)) {
485 main <- paste("Breakdown of SNP assignments for", tdt1[1, i]); 572 tmp_labels <- paste(labels, " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep="");
486 pie(tdt1_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); 573 main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]);
574 pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
487 } 575 }
488 dev.off() 576 dev.off()
577 time_elapsed(start_time);