0
|
1 #!/usr/bin/env Rscript
|
|
2
|
|
3 suppressPackageStartupMessages(library("adegenet"))
|
|
4 suppressPackageStartupMessages(library("ape"))
|
7
|
5 suppressPackageStartupMessages(library("data.table"))
|
12
|
6 suppressPackageStartupMessages(library("dbplyr"))
|
7
|
7 suppressPackageStartupMessages(library("dplyr"))
|
0
|
8 suppressPackageStartupMessages(library("ggplot2"))
|
|
9 suppressPackageStartupMessages(library("knitr"))
|
18
|
10 suppressPackageStartupMessages(library("maps"))
|
|
11 suppressPackageStartupMessages(library("mapproj"))
|
4
|
12 suppressPackageStartupMessages(library("optparse"))
|
|
13 suppressPackageStartupMessages(library("poppr"))
|
|
14 suppressPackageStartupMessages(library("RColorBrewer"))
|
7
|
15 suppressPackageStartupMessages(library("RPostgres"))
|
18
|
16 suppressPackageStartupMessages(library("SNPRelate"))
|
12
|
17 suppressPackageStartupMessages(library("tidyr"))
|
4
|
18 suppressPackageStartupMessages(library("vcfR"))
|
|
19 suppressPackageStartupMessages(library("vegan"))
|
17
|
20 suppressPackageStartupMessages(library("yarrr"))
|
|
21 theme_set(theme_bw())
|
0
|
22
|
|
23 option_list <- list(
|
7
|
24 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"),
|
|
25 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"),
|
4
|
26 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
|
7
|
27 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
|
18
|
28 make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"),
|
|
29 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file")
|
0
|
30 )
|
|
31
|
|
32 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
|
|
33 args <- parse_args(parser, positional_arguments=TRUE);
|
|
34 opt <- args$options;
|
|
35
|
|
36 get_file_path = function(file_name) {
|
|
37 file_path = paste("output_plots_dir", file_name, sep="/");
|
|
38 return(file_path);
|
|
39 }
|
|
40
|
8
|
41 get_database_connection <- function(db_conn_string) {
|
|
42 # Instantiate database connection.
|
|
43 # The connection string has this format:
|
|
44 # postgresql://user:password@host/dbname
|
|
45 conn_items <- strsplit(db_conn_string, "://")[[1]];
|
|
46 string_needed <- conn_items[2];
|
|
47 items_needed <- strsplit(string_needed, "@")[[1]];
|
|
48 user_pass_string <- items_needed[1];
|
|
49 host_dbname_string <- items_needed[2];
|
|
50 user_pass_items <- strsplit(user_pass_string, ":")[[1]];
|
|
51 host_dbname_items <- strsplit(host_dbname_string, "/")[[1]];
|
|
52 user <- user_pass_items[1];
|
|
53 pass <- user_pass_items[2];
|
|
54 host <- host_dbname_items[1];
|
|
55 dbname <- host_dbname_items[2];
|
|
56 # FIXME: is there a way to not hard-code the port?
|
16
|
57 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass);
|
8
|
58 return (conn);
|
|
59 }
|
|
60
|
18
|
61 time_elapsed <- function(start_time) {
|
|
62 cat("Elapsed time: ", proc.time() - start_time, "\n\n");
|
|
63 }
|
|
64
|
|
65 time_start <- function(msg) {
|
|
66 start_time <- proc.time();
|
|
67 cat(msg, "...\n");
|
|
68 return(start_time);
|
|
69 }
|
|
70
|
3
|
71 # Read in VCF input file.
|
18
|
72 start_time <- time_start("Reading VCF input");
|
2
|
73 vcf <- read.vcfR(opt$input_vcf);
|
18
|
74 time_elapsed(start_time);
|
0
|
75
|
7
|
76 # Convert VCF file into a genind for the Poppr package.
|
18
|
77 start_time <- time_start("Converting VCF data to a genind object");
|
|
78 genind_obj <- vcfR2genind(vcf);
|
|
79 time_elapsed(start_time);
|
7
|
80
|
0
|
81 # Add population information to the genind object.
|
18
|
82 population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t");
|
|
83 colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region");
|
|
84 genind_obj@pop <- as.factor(population_info_data_table$region);
|
|
85 strata(genind_obj) <- data.frame(pop(genind_obj));
|
7
|
86
|
|
87 # Convert genind object to a genclone object.
|
18
|
88 start_time <- time_start("Converting the genind object to a genclone object");
|
|
89 genind_clone <- as.genclone(genind_obj);
|
|
90 time_elapsed(start_time);
|
7
|
91
|
|
92 # Calculate the bitwise distance between individuals.
|
18
|
93 start_time <- time_start("Calculating the bitwise distance between individuals");
|
|
94 bitwise_distance <- bitwise.dist(genind_clone);
|
|
95 time_elapsed(start_time);
|
0
|
96
|
17
|
97 # Multilocus genotypes (threshold of 3.2%).
|
18
|
98 mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032;
|
|
99 m <- mlg.table(genind_clone, background=TRUE, color=TRUE);
|
0
|
100
|
18
|
101 # Create list of MLGs.
|
|
102 mlg_ids <- mlg.id(genind_clone);
|
7
|
103
|
16
|
104 # Read user's Affymetrix 96 well plate tabular file.
|
18
|
105 affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA"));
|
|
106 colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
|
|
107 "region", "latitude", "longitude", "geographic_origin", "sample_location",
|
|
108 "latitude_outplant", "longitude_outplant", "depth", "disease_resist",
|
|
109 "bleach_resist", "mortality","tle", "spawning", "collector_last_name",
|
|
110 "collector_first_name", "organization", "collection_date", "email", "seq_facility",
|
|
111 "array_version", "public", "public_after_date", "sperm_motility", "healing_time",
|
|
112 "dna_extraction_method", "dna_concentration", "registry_id");
|
|
113 affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id);
|
|
114 user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id);
|
|
115 # The specimen_id_field_call_data_table looks like this:
|
|
116 # user_specimen_ids V2
|
|
117 # test_002 prolifera
|
|
118 # test_003 prolifera
|
|
119 specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call);
|
|
120 # Rename the user_specimen_ids column.
|
|
121 setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id"));
|
|
122 # Rename the V2 column.
|
|
123 setnames(specimen_id_field_call_data_table, c("V2"), c("field_call"));
|
7
|
124
|
8
|
125 # Connect to database.
|
|
126 conn <- get_database_connection(opt$database_connection_string);
|
7
|
127 # Import the sample table.
|
16
|
128 sample_table <- tbl(conn, "sample");
|
|
129 # Import the genotype table.
|
|
130 genotype_table <- tbl(conn, "genotype");
|
|
131 # Select columns from the sample table and the
|
|
132 # genotype table joined by genotype_id.
|
|
133 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id);
|
|
134 smlg <- sample_table_columns %>%
|
|
135 left_join(genotype_table %>%
|
|
136 select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"),
|
18
|
137 by=c("genotype_id"="id"));
|
7
|
138 # Convert to dataframe.
|
18
|
139 smlg_data_frame <- data.frame(smlg);
|
16
|
140 # Name the columns.
|
18
|
141 colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id");
|
7
|
142
|
12
|
143 # Missing GT in samples submitted.
|
18
|
144 start_time <- time_start("Discovering missing GT in samples");
|
12
|
145 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
|
18
|
146 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
|
|
147 missing_gt <- (missing_gt / nrow(vcf)) * 100;
|
|
148 missing_gt_data_frame <- data.frame(missing_gt);
|
|
149 # The specimen_id_field_call_data_table looks like this:
|
|
150 # rn missing_gt
|
|
151 # a100000-4368120-060520-256_I07.CEL 0.06092608
|
|
152 # a100000-4368120-060520-256_K07.CEL 0.05077173
|
|
153 missing_gt_data_table <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[];
|
|
154 # Rename the rn column.
|
|
155 setnames(missing_gt_data_table, c("rn"), c("affy_id"));
|
|
156 # Rename the missing_gt column.
|
|
157 setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral"));
|
|
158 # Round data to two digits.
|
|
159 missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2);
|
|
160 time_elapsed(start_time);
|
7
|
161
|
18
|
162 # Heterozygous alleles.
|
|
163 start_time <- time_start("Discovering heterozygous alleles");
|
|
164 heterozygous_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))});
|
|
165 heterozygous_alleles <- (heterozygous_alleles / nrow(vcf)) * 100;
|
|
166 heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles);
|
|
167 # The heterozygous_alleles_data_table looks like this:
|
|
168 # rn heterozygous_alleles
|
|
169 # a100000-4368120-060520-256_I07.CEL 73.94903
|
|
170 # a100000-4368120-060520-256_K07.CEL 74.40089
|
|
171 heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[];
|
|
172 # Rename the rn column.
|
|
173 setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id"));
|
|
174 # Rename the heterozygous_alleles column.
|
|
175 setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral"));
|
|
176 # Round data to two digits.
|
|
177 heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2);
|
|
178 time_elapsed(start_time);
|
7
|
179
|
18
|
180 # Reference alleles.
|
|
181 start_time <- time_start("Discovering reference alleles");
|
|
182 reference_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))});
|
|
183 reference_alleles <- (reference_alleles / nrow(vcf)) * 100;
|
|
184 reference_alleles_data_frame <- data.frame(reference_alleles);
|
|
185 # The reference_alleles_data_table looks like this:
|
|
186 # rn reference_alleles
|
|
187 # a100000-4368120-060520-256_I07.CEL 11.60642
|
|
188 # a100000-4368120-060520-256_K07.CEL 11.45918
|
|
189 reference_alleles_data_table <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[];
|
|
190 # Rename the rn column.
|
|
191 setnames(reference_alleles_data_table, c("rn"), c("affy_id"));
|
|
192 # Rename the reference_alleles column.
|
|
193 setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral"));
|
|
194 # Round data to two digits.
|
|
195 reference_alleles_data_table$percent_reference <- round(reference_alleles_data_table$percent_reference, digits=2);
|
|
196 time_elapsed(start_time);
|
7
|
197
|
18
|
198 # Alternative alleles
|
|
199 start_time <- time_start("Discovering alternative alleles");
|
|
200 alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))});
|
|
201 alternative_alleles <- (alternative_alleles / nrow(vcf)) * 100;
|
|
202 alternative_alleles_data_frame <- data.frame(alternative_alleles);
|
|
203 # The alternative_alleles_data_table looks like this:
|
|
204 # rn alternative_alleles
|
|
205 # a100000-4368120-060520-256_I07.CEL 14.38363
|
|
206 # a100000-4368120-060520-256_K07.CEL 14.08916
|
|
207 alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[];
|
|
208 # Rename the rn column.
|
|
209 setnames(alternative_alleles_data_table, c("rn"), c("affy_id"));
|
|
210 # Rename the alternative_alleles column.
|
|
211 setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral"));
|
|
212 # Round data to two digits.
|
|
213 alternative_alleles_data_table$percent_alternative <- round(alternative_alleles_data_table$percent_alternative, digits=2);
|
|
214 time_elapsed(start_time);
|
12
|
215
|
18
|
216 # The mlg_ids_data_table looks like this:
|
|
217 # mlg_ids
|
|
218 # a550962-4368120-060520-500_M23.CEL
|
|
219 # a550962-4368120-060520-256_A19.CEL
|
|
220 mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE);
|
|
221 # Rename the mlg_ids column.
|
|
222 setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id"));
|
7
|
223
|
18
|
224 # sample_mlg_tibble looks like this:
|
|
225 # A tibble: 262 x 3
|
|
226 # Groups: group [?]
|
|
227 # group affy_id coral_mlg_clonal_id
|
|
228 # <int> <chr> <chr>
|
|
229 # 1 a550962-4368120-060520-500_M23.CEL NA
|
|
230 # 2 a550962-4368120-060520-256_A19.CEL HG0006
|
|
231 sample_mlg_tibble <- mlg_ids_data_table %>%
|
7
|
232 group_by(row_number()) %>%
|
16
|
233 dplyr::rename(group="row_number()") %>%
|
12
|
234 unnest (affy_id) %>%
|
7
|
235 # Join with mlg table.
|
18
|
236 left_join(smlg_data_frame %>%
|
9
|
237 select("affy_id","coral_mlg_clonal_id"),
|
16
|
238 by="affy_id");
|
7
|
239
|
|
240 # If found in database, group members on previous mlg id.
|
18
|
241 uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]);
|
7
|
242 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),];
|
18
|
243 na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id));
|
|
244 na.group <- sample_mlg_tibble$group[na.mlg];
|
|
245 sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
|
7
|
246
|
18
|
247 # Find out if the sample mlg matched a previous genotyped sample.
|
|
248 # sample_mlg_match_tibble looks like this:
|
|
249 # A tibble: 262 x 4
|
|
250 # Groups: group [230]
|
|
251 # group affy_id coral_mlg_clonal_id DB_match
|
|
252 # <int> <chr> <chr> <chr>
|
|
253 # 1 a550962-4368120-060520-500_M23.CEL NA no_match
|
|
254 # 2 a550962-4368120-060520-256_A19.CEL HG0006 match
|
|
255 sample_mlg_match_tibble <- sample_mlg_tibble %>%
|
7
|
256 group_by(group) %>%
|
12
|
257 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match"));
|
7
|
258
|
18
|
259 # Create new mlg id for samples with no matches in the database.
|
|
260 none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]);
|
7
|
261 none <- none[is.na(none$coral_mlg_clonal_id),];
|
18
|
262 na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id));
|
|
263 n.g <- sample_mlg_match_tibble$group[na.mlg2];
|
7
|
264 ct <- length(unique(n.g));
|
|
265
|
|
266 # List of new group ids, the sequence starts at the number of
|
18
|
267 # ids present in sample_mlg_match_tibble$coral_mlg_clonal_ids
|
|
268 # plus 1.
|
|
269 # FIXME: Not sure if # the sample_mlg_match_tibble file
|
|
270 # contains all ids. If it doesn't then look below to change
|
|
271 # the seq() function.
|
|
272 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
|
|
273
|
16
|
274 # Assign the new id iteratively for all that have NA.
|
7
|
275 for (i in 1:length(na.mlg2)) {
|
18
|
276 sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))];
|
7
|
277 }
|
|
278
|
18
|
279 # Subset population_info_data_table for all samples.
|
|
280 # affy_id_user_specimen_id_vector looks like this:
|
|
281 # affy_id user_specimen_id
|
|
282 # a100000-4368120-060520-256_I07.CEL 13704
|
|
283 # a100000-4368120-060520-256_K07.CEL 13706
|
|
284 affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)];
|
9
|
285
|
7
|
286 # Merge data frames for final table.
|
18
|
287 start_time <- time_start("Merging data frames");
|
|
288 stag_db_report <- specimen_id_field_call_data_table %>%
|
|
289 left_join(affy_id_user_specimen_id_vector %>%
|
9
|
290 select("affy_id", "user_specimen_id"),
|
16
|
291 by="user_specimen_id") %>%
|
18
|
292 left_join(sample_mlg_match_tibble %>%
|
9
|
293 select("affy_id", "coral_mlg_clonal_id", "DB_match"),
|
16
|
294 by="affy_id") %>%
|
18
|
295 left_join(missing_gt_data_table %>%
|
9
|
296 select("affy_id", "percent_missing_data_coral"),
|
16
|
297 by="affy_id") %>%
|
18
|
298 left_join(heterozygous_alleles_data_table %>%
|
|
299 select("affy_id", "percent_heterozygous_coral"),
|
16
|
300 by="affy_id") %>%
|
18
|
301 left_join(reference_alleles_data_table %>%
|
9
|
302 select("affy_id", "percent_reference_coral"),
|
16
|
303 by="affy_id") %>%
|
18
|
304 left_join(alternative_alleles_data_table %>%
|
9
|
305 select("affy_id", "percent_alternative_coral"),
|
16
|
306 by="affy_id") %>%
|
7
|
307 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
|
12
|
308 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
|
18
|
309 mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.5, "A.palmata","other")) %>%
|
|
310 mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral <= 50, "A.cervicornis", genetic_coral_species_call)) %>%
|
|
311 mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>%
|
|
312 ungroup() %>%
|
|
313 select(-group);
|
|
314 time_elapsed(start_time);
|
|
315
|
|
316 start_time <- time_start("Writing csv output");
|
|
317 write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE);
|
|
318 time_elapsed(start_time);
|
|
319
|
|
320 # Database sample table.
|
|
321 sample_db <- affy_metadata_data_frame %>%
|
|
322 left_join(stag_db_report %>%
|
|
323 select("user_specimen_id","affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", "percent_reference_coral", "percent_alternative_coral"),
|
|
324 by='user_specimen_id');
|
|
325
|
|
326 # Representative clone for genotype table.
|
|
327 start_time <- time_start("Creating representative clone for genotype table");
|
|
328 no_dup_genotypes_genind <- clonecorrect(genind_clone, strata = ~pop.genind_obj.);
|
|
329 id_rep <- mlg.id(no_dup_genotypes_genind);
|
|
330 id_data_table <- data.table(id_rep, keep.rownames=TRUE);
|
|
331 # Rename the id_rep column.
|
|
332 setnames(id_data_table, c("id_rep"), c("affy_id"));
|
|
333 time_elapsed(start_time);
|
|
334
|
|
335 # # Combine with previously genotyped samples in the database.
|
|
336 start_time <- time_start("Selecting from various database tables");
|
|
337 representative_mlg_tibble <- id_data_table %>%
|
|
338 group_by(row_number()) %>%
|
|
339 rename(group='row_number()') %>%
|
|
340 unnest(affy_id) %>%
|
|
341 left_join(stag_db_report %>%
|
|
342 select("coral_mlg_clonal_id", "user_specimen_id", "affy_id"),
|
|
343 by='affy_id') %>%
|
|
344 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id), "", affy_id)) %>%
|
|
345 ungroup() %>%
|
|
346 select(-group);
|
|
347
|
|
348 # Database genotype table.
|
|
349 genotype_table_join <- sample_mlg_match_tibble %>%
|
|
350 left_join(representative_mlg_tibble %>%
|
|
351 select("affy_id", "coral_mlg_rep_sample_id", "user_specimen_id"),
|
|
352 by='affy_id') %>%
|
7
|
353 ungroup() %>%
|
|
354 select(-group);
|
|
355
|
18
|
356 # Database taxonomy table.
|
|
357 taxonomy_table_join <- stag_db_report %>%
|
|
358 select(genetic_coral_species_call, affy_id) %>%
|
|
359 mutate(genus_name = ifelse(genetic_coral_species_call == genetic_coral_species_call[grep("^A.*", genetic_coral_species_call)], "Acropora", "other")) %>%
|
|
360 mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>%
|
|
361 mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>%
|
|
362 mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name));
|
|
363 time_elapsed(start_time);
|
|
364 # Table of alleles for the new samples subset to new plate data.
|
|
365 # Create vector indicating number of individuals desired from
|
|
366 # affy_id column of stag_db_report data table.
|
|
367 i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]);
|
|
368 i <- i[!apply(i== "", 1, all),];
|
|
369 sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE];
|
7
|
370
|
18
|
371 # cols looks like this:
|
|
372 # blue1 red green pink orange blue2
|
|
373 # "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF"
|
|
374 # green2 yellow turquoise poop
|
|
375 # "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF"
|
17
|
376 cols <- piratepal("basel");
|
4
|
377 set.seed(999);
|
17
|
378
|
18
|
379 if (!is.null(opt$output_nj_phylogeny_tree)) {
|
|
380 # Create a phylogeny tree of samples based on distance matrices.
|
|
381 # Start PDF device driver.
|
|
382 start_time <- time_start("Creating nj_phylogeny_tree.pdf");
|
|
383 dev.new(width=10, height=7);
|
|
384 file_path = get_file_path("nj_phylogeny_tree.pdf");
|
|
385 pdf(file=file_path, width=10, height=7);
|
|
386 # Organize branches by clade.
|
|
387 nj_phylogeny_tree <- sample_alleles_vector %>%
|
|
388 aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>%
|
|
389 ladderize();
|
|
390 nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)];
|
|
391 plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T);
|
|
392 # Add a scale bar showing 5% difference.
|
|
393 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3);
|
|
394 nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
|
|
395 legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8);
|
|
396 dev.off()
|
|
397 time_elapsed(start_time);
|
|
398 }
|
17
|
399
|
18
|
400 # Subset VCF to the user samples.
|
|
401 start_time <- time_start("Subsetting vcf to the user samples");
|
|
402 l <- length(i);
|
|
403 n <- ncol(vcf@gt);
|
|
404 s <- n - l;
|
|
405 svcf <- vcf[, s:n];
|
17
|
406 write.vcf(svcf, "subset.vcf.gz");
|
|
407 vcf.fn <- "subset.vcf.gz";
|
|
408 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only");
|
18
|
409 genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE);
|
|
410 gds_array <- read.gdsn(index.gdsn(genofile, "sample.id"));
|
|
411 # gds_array looks like this:
|
|
412 # [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL"
|
|
413 # [3] "a550962-4368120-060520-500_A09.CEL" "a550962-4368120-060520-500_A11.CEL"
|
|
414 gds_data_frame <- data.frame(gds_array);
|
|
415 # gds_data_frame looks like this:
|
|
416 # gds_array
|
|
417 # a550962-4368120-060520-500_A03.CEL
|
|
418 # a550962-4368120-060520-500_A05.CEL
|
|
419 gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[];
|
|
420 # Rename the gds_array column.
|
|
421 setnames(gds_data_table, c("gds_array"), c("affy_id"));
|
|
422 # affy_id_region_list looks like this:
|
|
423 # affy_id region
|
|
424 # a100000-4368120-060520-256_I07.CEL USVI
|
|
425 # a100000-4368120-060520-256_K07.CEL USVI
|
|
426 affy_id_region_list <- population_info_data_table[c(2,4)];
|
|
427 gds_data_table_join <- gds_data_table %>%
|
|
428 left_join(affy_id_region_list %>%
|
|
429 select("affy_id", "region"),
|
|
430 by='affy_id')%>%
|
|
431 drop_na();
|
|
432 samp.annot <- data.frame(pop.group=c(gds_data_table_join$region));
|
|
433 add.gdsn(genofile, "sample.annot", samp.annot);
|
|
434 # population_code looks like this:
|
|
435 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733
|
|
436 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
|
|
437 population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"));
|
|
438 pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group")));
|
|
439 # pop.group looks like this:
|
|
440 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733
|
|
441 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
|
|
442 time_elapsed(start_time);
|
17
|
443
|
18
|
444 # Distance matrix calculation.
|
|
445 start_time <- time_start("Calculating distance matrix");
|
|
446 ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE);
|
|
447 time_elapsed(start_time);
|
17
|
448
|
18
|
449 # Cluster analysis on the genome-wide IBS pairwise distance matrix.
|
|
450 start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix");
|
17
|
451 set.seed(100);
|
|
452 par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2);
|
|
453 ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE));
|
18
|
454 time_elapsed(start_time);
|
17
|
455
|
18
|
456 # Default clustering.
|
|
457 start_time <- time_start("Creating ibs_default.pdf");
|
|
458 # Start PDF device driver.
|
17
|
459 dev.new(width=10, height=7);
|
18
|
460 file_path = get_file_path("ibs_default.pdf");
|
|
461 pdf(file=file_path, width=10, height=7);
|
17
|
462 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15);
|
18
|
463 snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", y.label=0.2);
|
17
|
464 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2);
|
|
465 dev.off()
|
18
|
466 time_elapsed(start_time);
|
17
|
467
|
18
|
468 # Color cluster by region.
|
|
469 start_time <- time_start("Creating ibs_region.pdf");
|
|
470 # Start PDF device driver.
|
17
|
471 dev.new(width=10, height=7);
|
18
|
472 file_path = get_file_path("ibs_region.pdf");
|
|
473 pdf(file=file_path, width=10, height=7);
|
|
474 race <- as.factor(population_code);
|
|
475 rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15);
|
|
476 snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", y.label=0.2);
|
17
|
477 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2);
|
|
478 dev.off()
|
18
|
479 time_elapsed(start_time);
|
17
|
480
|
18
|
481 # close GDS file.
|
17
|
482 snpgdsClose(genofile);
|
|
483
|
|
484 # Sample MLG on a map.
|
18
|
485 start_time <- time_start("Creating mlg_map.pdf");
|
|
486 # Get the lattitude and longitude boundaries for rendering
|
|
487 # the map. Tese boundaries will restrict the map to focus
|
|
488 # (i.e., zoom) on the region of the world map from which
|
|
489 # the samples were taken.
|
|
490 max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE);
|
|
491 min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE);
|
|
492 latitude_range_vector <- c(min_latitude-3, max_latitude+3);
|
|
493 max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE);
|
|
494 min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE);
|
|
495 longitude_range_vector <- c(min_longitude-3, max_longitude+3);
|
|
496 # Get the palette colors for rendering plots.
|
|
497 colors <- length(unique(stag_db_report$coral_mlg_clonal_id));
|
|
498 # Get a color palette.
|
|
499 palette <- colorRampPalette(piratepal("basel"));
|
|
500 # Start PDF device driver.
|
17
|
501 dev.new(width=10, height=7);
|
|
502 file_path = get_file_path("mlg_map.pdf");
|
18
|
503 pdf(file=file_path, width=10, height=7);
|
|
504 world_data = map_data("world");
|
|
505 # Add the coral_mlg_clonal_id column from the stag_db_report
|
|
506 # data fram to the affy_metadata_data_frame.
|
|
507 affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id;
|
|
508 # Get the number of colors needed from the palette for plotting
|
|
509 # the sample locations on the world map.
|
|
510 num_colors = length(unique(affy_metadata_data_frame$mlg));
|
|
511 # Get a color palette.
|
|
512 palette = colorRampPalette(piratepal("basel"));
|
|
513 ggplot() +
|
|
514 geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") +
|
|
515 coord_map(xlim=longitude_range_vector, ylim=latitude_range_vector) +
|
|
516 geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, color=mlg), alpha=.7, size=3) +
|
|
517 scale_color_manual(values=palette(num_colors)) +
|
|
518 theme(legend.position="bottom") +
|
|
519 guides(color=guide_legend(nrow=8, byrow=F));
|
17
|
520 dev.off()
|
18
|
521 time_elapsed(start_time);
|
0
|
522
|
9
|
523 # Missing data barplot.
|
18
|
524 start_time <- time_start("Creating missing_data.pdf");
|
|
525 population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)];
|
|
526 test2 <- which(!is.na(population_info_data_table$miss));
|
|
527 miss96 <- population_info_data_table$miss[test2];
|
|
528 name96 <- population_info_data_table$user_specimen_id[test2];
|
|
529 # Start PDF device driver.
|
9
|
530 dev.new(width=10, height=7);
|
|
531 file_path = get_file_path("missing_data.pdf");
|
18
|
532 pdf(file=file_path, width=10, height=7);
|
9
|
533 par(mar = c(8, 4, 4, 2));
|
|
534 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n");
|
|
535 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
|
|
536 dev.off()
|
18
|
537 time_elapsed(start_time);
|
9
|
538
|
15
|
539 # Generate a pie chart for each sample with a genotype.
|
|
540 # Store the numerical and user_specimen_id values from
|
18
|
541 # stag_db_report for the charts (user_specimen_id names
|
15
|
542 # will be used to label each chart).
|
18
|
543 start_time <- time_start("Creating percent_breakdown.pdf");
|
|
544 stag_db_report_data_table <- stag_db_report[c(-2, -3, -4)];
|
|
545 # Remove NA and NaN values.
|
|
546 stag_db_report_data_table <- na.omit(stag_db_report_data_table);
|
|
547 # Translate to N (i.e., number of samples with a genotype)
|
|
548 # columns and 5 rows.
|
|
549 translated_stag_db_report_data_table <- t(stag_db_report_data_table);
|
|
550 translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]);
|
|
551 # Set the storage mode of the matrix to numeric. In some
|
|
552 # cases this could result in the following:
|
|
553 # Warning message:
|
|
554 # In mde(x) : NAs introduced by coercion
|
|
555 mode(translated_stag_db_report_matrix) <- "numeric";
|
|
556 # Remove NA and NaN values that may have been introduced
|
|
557 # by coercion.
|
|
558 translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix);
|
|
559 tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE);
|
12
|
560 dev.new(width=10, height=7);
|
|
561 file_path = get_file_path("percent_breakdown.pdf");
|
|
562 pdf(file=file_path, width=10, height=7);
|
|
563 # Average pie of all samples.
|
18
|
564 labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(tsdbrm_row_means, 1), "%)", sep="");
|
12
|
565 col <- c("GREY", "#006DDB", "#24FF24", "#920000");
|
|
566 main <- "Average breakdown of SNP assignments across all samples";
|
18
|
567 pie(tsdbrm_row_means, labels=labels, radius=0.60, col=col, main=main, cex.main=.75);
|
12
|
568 par(mfrow=c(3, 2));
|
14
|
569 col <- c("GREY", "#006DDB", "#24FF24", "#920000");
|
18
|
570 # Generate a pie chart for each sample with genotypes.
|
|
571 for (i in 1:ncol(translated_stag_db_report_matrix)) {
|
|
572 tmp_labels <- paste(labels, " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep="");
|
|
573 main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]);
|
|
574 pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
|
12
|
575 }
|
|
576 dev.off()
|
18
|
577 time_elapsed(start_time);
|