0
|
1 #!/usr/bin/env Rscript
|
|
2
|
|
3 suppressPackageStartupMessages(library("adegenet"))
|
|
4 suppressPackageStartupMessages(library("ape"))
|
7
|
5 suppressPackageStartupMessages(library("data.table"))
|
12
|
6 suppressPackageStartupMessages(library("dbplyr"))
|
7
|
7 suppressPackageStartupMessages(library("dplyr"))
|
0
|
8 suppressPackageStartupMessages(library("ggplot2"))
|
|
9 suppressPackageStartupMessages(library("knitr"))
|
4
|
10 suppressPackageStartupMessages(library("optparse"))
|
|
11 suppressPackageStartupMessages(library("poppr"))
|
|
12 suppressPackageStartupMessages(library("RColorBrewer"))
|
7
|
13 suppressPackageStartupMessages(library("RPostgres"))
|
12
|
14 suppressPackageStartupMessages(library("tidyr"))
|
4
|
15 suppressPackageStartupMessages(library("vcfR"))
|
|
16 suppressPackageStartupMessages(library("vegan"))
|
0
|
17
|
|
# Command-line interface. Each option stores its value under a `dest`
# equal to the flag name without the leading dashes.
option_list <- list(
  make_option(
    "--database_connection_string",
    action = "store",
    dest = "database_connection_string",
    help = "Corals (stag) database connection string"
  ),
  make_option(
    "--input_affy_metadata",
    action = "store",
    dest = "input_affy_metadata",
    help = "Affymetrix 96 well plate input file"
  ),
  make_option(
    "--input_pop_info",
    action = "store",
    dest = "input_pop_info",
    help = "Population information input file"
  ),
  make_option(
    "--input_vcf",
    action = "store",
    dest = "input_vcf",
    help = "VCF input file"
  ),
  make_option(
    "--output_stag_db_report",
    action = "store",
    dest = "output_stag_db_report",
    help = "stag db report output file"
  )
)

# Positional arguments are allowed, so the parsed result is a list from
# which only the named options are kept in `opt`.
parser <- OptionParser(
  usage = "%prog [options] file",
  option_list = option_list
)
args <- parse_args(parser, positional_arguments = TRUE)
opt <- args$options
|
|
29
|
|
# Build the path of a plot file inside the "output_plots_dir" directory.
#
# file_name: name of the file to place in that directory.
# Returns the string "output_plots_dir/<file_name>".
get_file_path <- function(file_name) {
  paste0("output_plots_dir/", file_name)
}
|
|
34
|
8
|
# Open a connection to the PostgreSQL database described by a connection
# string of the form
#   postgresql://user:password@host/dbname
# or, with an explicit port,
#   postgresql://user:password@host:port/dbname
#
# db_conn_string: the connection string (a single character value).
# Returns the connection object created by DBI::dbConnect().
# Stops with an error when the string is missing or cannot be split into
# user, host and database name.
get_database_connection <- function(db_conn_string) {
  if (!is.character(db_conn_string) || length(db_conn_string) != 1L ||
      is.na(db_conn_string)) {
    stop("A database connection string is required.", call. = FALSE)
  }
  malformed <- function() {
    stop(
      "Malformed database connection string; expected ",
      "postgresql://user:password@host[:port]/dbname",
      call. = FALSE
    )
  }

  # Drop the "scheme://" prefix.
  scheme_at <- regexpr("://", db_conn_string, fixed = TRUE)
  if (scheme_at < 0) {
    malformed()
  }
  remainder <- substring(db_conn_string, scheme_at + 3L)

  # Credentials end at the LAST "@", so a password that itself contains
  # "@" is kept whole.
  at_positions <- gregexpr("@", remainder, fixed = TRUE)[[1]]
  last_at <- at_positions[length(at_positions)]
  if (last_at < 0) {
    malformed()
  }
  user_pass_string <- substring(remainder, 1L, last_at - 1L)
  host_dbname_string <- substring(remainder, last_at + 1L)

  # The user name ends at the FIRST ":", so a password that itself
  # contains ":" is kept whole. Without a ":" no password is passed.
  colon_at <- regexpr(":", user_pass_string, fixed = TRUE)
  if (colon_at > 0) {
    user <- substring(user_pass_string, 1L, colon_at - 1L)
    pass <- substring(user_pass_string, colon_at + 1L)
  } else {
    user <- user_pass_string
    pass <- NULL
  }

  # "host[:port]/dbname": the database name is the first path component.
  host_dbname_items <- strsplit(host_dbname_string, "/", fixed = TRUE)[[1]]
  host_port <- host_dbname_items[1]
  dbname <- host_dbname_items[2]
  if (is.na(host_port) || !nzchar(host_port) ||
      is.na(dbname) || !nzchar(dbname) || !nzchar(user)) {
    malformed()
  }

  # Use the port given after the host when there is one; otherwise fall
  # back to 5432, the value that was previously hard-coded.
  port <- "5432"
  host <- host_port
  port_parts <- regmatches(host_port, regexec("^(.+):([0-9]+)$", host_port))[[1]]
  if (length(port_parts) == 3L) {
    host <- port_parts[2]
    port <- port_parts[3]
  }

  DBI::dbConnect(
    RPostgres::Postgres(),
    host = host,
    port = port,
    dbname = dbname,
    user = user,
    password = pass
  )
}
|
|
54
|
3
|
# Load the genotype calls from the VCF input file.
vcf <- read.vcfR(opt$input_vcf)

# poppr works on genind/genclone objects: go from VCF to genlight and
# build a genind from the genlight matrix.
# TODO: probably should not hard-code 2 cores.
gl <- vcfR2genlight(vcf, n.cores = 2)
gind <- new("genind", as.matrix(gl))

# Attach the population (region) of every sample to the genind object.
# The population file is tab-separated with no header row; empty fields
# and "NA" are read as missing.
poptab <- read.table(
  opt$input_pop_info,
  header = FALSE,
  sep = "\t",
  na.strings = c("", "NA"),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region")
gind@pop <- as.factor(poptab$region)

# genclone is the class the MLG functions below operate on.
obj2 <- as.genclone(gind)

# Pairwise bitwise distance between individuals.
xdis <- bitwise.dist(obj2)

# Collapse multilocus genotypes closer than a distance of 0.016
# (i.e. 1.6% when the distance is expressed as a proportion).
mlg.filter(obj2, distance = xdis) <- 0.016
m <- mlg.table(obj2, background = TRUE, color = TRUE)

# Table of MLGs: one row per MLG, holding the ids of its members.
# NOTE: `dt` is built again from `id` further down, before its first use.
id <- mlg.id(obj2)
dt <- data.table(id, keep.rownames = TRUE)
setnames(dt, "id", "affy_id")
|
7
|
82
|
|
# Read the user's Affymetrix 96 well plate metadata: a tab-separated
# file without a header row.
pinfo <- read.table(
  opt$input_affy_metadata,
  header = FALSE,
  sep = "\t",
  stringsAsFactors = FALSE
)
colnames(pinfo) <- c(
  "date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id",
  "reef", "region", "latitude", "longitude", "geographic_origin",
  "sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore",
  "disease_resist", "bleach_resist", "mortality", "tle", "spawning",
  "collector_last_name", "collector_first_name", "org", "collection_date", "contact_email",
  "seq_facility", "array_version", "public", "public_after_date"
)

# Specimen ids are joined on as text, so force them to character.
pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id)
pinfo2 <- as.character(pinfo$user_specimen_id)

# One-column data.table of specimen ids; the final report is built on it.
# Note that this name masks the base constant `pi` from here on.
pi <- data.table(user_specimen_id = pinfo2)
|
7
|
95
|
8
|
# Connect to database.
conn <- get_database_connection(opt$database_connection_string)

# Reference the sample table (lazily; nothing is downloaded yet).
mD <- tbl(conn, "sample")

# Keep the specimen id, the two clonal-id columns and the affy id.
smlg <- mD %>%
  select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id)

# Materialise the query result as a local data frame.
sm <- data.frame(smlg)

# Everything needed from the database is now held in `sm`, so release
# the connection instead of keeping it open for the rest of the run.
# `mD` and `smlg` are lazy tables bound to this connection and must not
# be used after this point.
DBI::dbDisconnect(conn)

# Treat empty strings as missing values.
sm[sm == ""] <- NA
|
|
108
|
12
|
# Percentage of missing genotype (GT) calls for every submitted sample:
# one column of `gt` per sample, NA marks a missing call.
gt <- extract.gt(vcf, element = "GT", as.numeric = FALSE)
myMiss <- apply(gt, MARGIN = 2, function(calls) sum(is.na(calls)))
myMiss <- myMiss / nrow(vcf) * 100
miss <- data.frame(myMiss)

# setDT() converts `miss` by reference and moves its row names into the
# "rn" column. `miss` is kept as it is because later code reads
# miss$affy_id.
mi <- setDT(miss, keep.rownames = TRUE)[]
setnames(
  mi,
  c("rn", "myMiss"),
  c("affy_id", "percent_missing_data_coral")
)

# Report the percentage with two decimals.
mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits = 2)
|
7
|
121
|
12
|
# Percentage of "0/1" (heterozygous) genotype calls for every sample.
hets <- apply(gt, MARGIN = 2, function(x) {
  sum(lengths(regmatches(x, gregexpr("0/1", x))))
})
hets <- (hets / nrow(vcf)) * 100
ht <- data.frame(hets)

# Convert heterozygosity data into a data table with one row per sample;
# the row names (sample ids) become the affy_id column.
ht <- setDT(ht, keep.rownames = TRUE)[]
setnames(ht, c("rn", "hets"), c("affy_id", "percent_mixed_coral"))

# Round to two digits. The column must be addressed by its full name:
# `ht$percent_mixed <- ...` reads the renamed column through partial
# matching but writes a NEW column, leaving percent_mixed_coral (the one
# used in the report) unrounded.
ht$percent_mixed_coral <- round(ht$percent_mixed_coral, digits = 2)
|
|
132
|
12
|
# Percentage of "0/0" (homozygous reference) genotype calls per sample.
refA <- apply(gt, MARGIN = 2, function(x) {
  sum(lengths(regmatches(x, gregexpr("0/0", x))))
})
refA <- (refA / nrow(vcf)) * 100
rA <- data.frame(refA)

# Convert refA data into a data table with one row per sample; the row
# names (sample ids) become the affy_id column.
rA <- setDT(rA, keep.rownames = TRUE)[]
setnames(rA, c("rn", "refA"), c("affy_id", "percent_reference_coral"))

# Round to two digits. The column must be addressed by its full name:
# `rA$percent_reference <- ...` reads the renamed column through partial
# matching but writes a NEW column, leaving percent_reference_coral (the
# one used in the report) unrounded.
rA$percent_reference_coral <- round(rA$percent_reference_coral, digits = 2)
|
|
143
|
12
|
# Percentage of "1/1" (homozygous alternative) genotype calls per sample.
altB <- apply(gt, MARGIN = 2, function(x) {
  sum(lengths(regmatches(x, gregexpr("1/1", x))))
})
altB <- (altB / nrow(vcf)) * 100
aB <- data.frame(altB)

# Convert altB data into a data table with one row per sample; the row
# names (sample ids) become the affy_id column.
aB <- setDT(aB, keep.rownames = TRUE)[]
setnames(aB, c("rn", "altB"), c("affy_id", "percent_alternative_coral"))

# Round to two digits. The column must be addressed by its full name:
# `aB$percent_alternative <- ...` reads the renamed column through
# partial matching but writes a NEW column, leaving
# percent_alternative_coral (the one used in the report) unrounded.
aB$percent_alternative_coral <- round(aB$percent_alternative_coral, digits = 2)
|
|
154
|
|
# Put the MLG membership list into data.table form: one row per MLG,
# with the member sample ids held in the affy_id list column.
dt <- data.table(id, keep.rownames = TRUE)
setnames(dt, "id", "affy_id")

# Transform to one row per sample:
#   1. number the MLG rows and group on that number ("group"),
#   2. expand the list column so each member gets its own row,
#   3. look up the clonal id already stored in the database, if any.
df3 <- dt %>%
  group_by(group = row_number()) %>%
  unnest(affy_id) %>%
  left_join(
    select(sm, affy_id, coral_mlg_clonal_id),
    by = "affy_id"
  )
|
7
|
168
|
|
# Samples without a database id inherit the id of a member of the same
# group that was found in the database.
# `uniques` is the distinct (group, clonal id) pairs that have an id.
uniques <- unique(
  df3[!is.na(df3$coral_mlg_clonal_id), c("group", "coral_mlg_clonal_id")]
)

# Rows still lacking an id, and the group each of them belongs to.
na.mlg <- which(is.na(df3$coral_mlg_clonal_id))
na.group <- df3$group[na.mlg]

# match() returns NA for groups with no known id, so those rows stay NA.
df3$coral_mlg_clonal_id[na.mlg] <-
  uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]
|
|
175
|
|
# Flag every sample: "match" when its group carries a clonal id from the
# database, "no_match" when it does not.
df4 <- df3 %>%
  group_by(group) %>%
  mutate(
    DB_match = ifelse(!is.na(coral_mlg_clonal_id), "match", "no_match")
  )
|
7
|
180
|
|
# Create new mlg id for samples that did not match those in the database.
none <- unique(df4[c("group", "coral_mlg_clonal_id")])
none <- none[is.na(none$coral_mlg_clonal_id), ]

# Rows without an id, the group of each such row, and the number of
# distinct groups that need a new id.
na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id))
n.g <- df4$group[na.mlg2]
ct <- length(unique(n.g))

# List of new group ids, the sequence starts at the number of
# ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if
# the df4 file contains all ids. If it doesn't then the start of
# the seq() call below has to change.
first_new_id <- sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1
n.g_ids <- sprintf("HG%04d", seq(first_new_id, by = 1, length.out = ct))

# This is a key for pairing group with new ids.
rat <- cbind(unique(n.g), n.g_ids)

# Give every row without an id the new id of its group. Done in one
# vectorised assignment, which is also a no-op when nothing is missing
# (a `1:length(na.mlg2)` loop would iterate over c(1, 0) in that case).
df4$coral_mlg_clonal_id[na.mlg2] <- n.g_ids[match(n.g, unique(n.g))]
|
|
199
|
9
|
# Sample identifiers (affy_id and user_specimen_id) for all samples.
subpop <- poptab[c("affy_id", "user_specimen_id")]

# Assemble the user report. Start from the specimen ids on the plate,
# attach the affy id, then add the MLG assignment and the four
# per-sample percentages, all keyed on affy_id.
# df4 is grouped by `group`, so that column comes along with the select
# and is removed again at the end.
# Samples with no MLG information are reported as "failed".
report_user <- pi %>%
  left_join(
    select(subpop, affy_id, user_specimen_id),
    by = "user_specimen_id"
  ) %>%
  left_join(
    select(df4, affy_id, coral_mlg_clonal_id, DB_match),
    by = "affy_id"
  ) %>%
  left_join(
    select(mi, affy_id, percent_missing_data_coral),
    by = "affy_id"
  ) %>%
  left_join(
    select(ht, affy_id, percent_mixed_coral),
    by = "affy_id"
  ) %>%
  left_join(
    select(rA, affy_id, percent_reference_coral),
    by = "affy_id"
  ) %>%
  left_join(
    select(aB, affy_id, percent_alternative_coral),
    by = "affy_id"
  ) %>%
  mutate(
    DB_match = ifelse(is.na(DB_match), "failed", DB_match),
    coral_mlg_clonal_id = ifelse(
      is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id
    )
  ) %>%
  ungroup() %>%
  select(-group)

# Write the report for the user as unquoted CSV.
write.csv(report_user, file = opt$output_stag_db_report, quote = FALSE)
|
0
|
229
|
9
|
# Database-side record: the full plate metadata with the genotyping
# results from the user report attached by specimen id.
report_db <- left_join(
  pinfo,
  select(
    report_user,
    user_specimen_id, affy_id, coral_mlg_clonal_id, DB_match,
    percent_missing_data_coral, percent_mixed_coral,
    percent_reference_coral, percent_alternative_coral
  ),
  by = "user_specimen_id"
)
|
7
|
237
|
9
|
# Restrict the genclone object to the samples listed in the user report
# (second column of report_user, the affy ids), keeping the MLG
# assignments that were already computed.
i <- report_user[[2]]
sub96 <- obj2[i, mlg.reset = FALSE, drop = FALSE]

# Create a phylogeny of samples based on distance matrices.
# NOTE(review): palette(<new>) installs the Set3 colours but returns the
# palette that was active BEFORE the call, so `cols` does not hold the
# Set3 colours. Left unchanged to keep the plot colours as they are;
# confirm which colours were intended.
cols <- palette(brewer.pal(n = 12, name = "Set3"))
set.seed(999)

# Start PDF device driver. pdf() is the only device needed; opening an
# additional one with dev.new() first would leave that device open and
# produce a stray output file.
file_path <- get_file_path("nj_phylogeny.pdf")
pdf(file = file_path, width = 10, height = 7)

# Neighbour-joining tree on Provesti distance, with branches organised
# by clade (ladderize).
theTree <- sub96 %>%
  aboot(dist = provesti.dist, sample = 1, tree = "nj", cutoff = 50, quiet = TRUE) %>%
  ladderize()

# Label the tips with the user's specimen ids instead of affy ids.
theTree$tip.label <- report_user$user_specimen_id[
  match(theTree$tip.label, report_user$affy_id)
]
plot.phylo(
  theTree,
  tip.color = cols[sub96$pop],
  label.offset = 0.0125,
  cex = 0.3,
  font = 2,
  lwd = 4,
  align.tip.label = FALSE,
  no.margin = TRUE
)

# Add a scale bar showing 5% difference.
add.scale.bar(0, 0.95, length = 0.05, cex = 0.65, lwd = 3)
nodelabels(
  theTree$node.label,
  cex = 0.5,
  adj = c(1.5, -0.1),
  frame = "n",
  font = 3,
  xpd = TRUE
)

# The legend entries are a fixed list of regions coloured in order.
# NOTE(review): tips are coloured by the level order of sub96$pop, so
# this matches only if the data contain exactly these regions; verify.
legend(
  "topright",
  legend = c("Antigua", "Bahamas", "Belize", "Cuba", "Curacao", "Florida", "PuertoRico", "USVI"),
  text.col = cols,
  xpd = TRUE,
  cex = 0.8
)
dev.off()
|
0
|
261
|
9
|
# Missing data barplot.
# Look up the reported missing-data percentage of every genotyped sample.
# The sample ids are taken from `mi`, the table that was explicitly given
# an affy_id column (`miss` only has one through by-reference aliasing).
poptab$miss <- report_user$percent_missing_data_coral[
  match(mi$affy_id, report_user$affy_id)
]

# Keep the samples that have a value, with their specimen ids as labels.
test2 <- which(!is.na(poptab$miss))
miss96 <- poptab$miss[test2]
name96 <- poptab$user_specimen_id[test2]

# pdf() is the only device needed; opening an additional one with
# dev.new() first would leave that device open and produce a stray
# output file.
file_path <- get_file_path("missing_data.pdf")
pdf(file = file_path, width = 10, height = 7)

# Extra bottom margin for the rotated sample labels.
par(mar = c(8, 4, 4, 2))
x <- barplot(
  miss96,
  las = 2,
  col = cols,
  ylim = c(0, 3),
  cex.axis = 0.8,
  space = 0.8,
  ylab = "Missingness (%)",
  xaxt = "n"
)

# The x axis is suppressed above; draw the specimen ids under the bars,
# rotated by 60 degrees.
text(cex = 0.6, x = x - 0.25, y = -.05, name96, xpd = TRUE, srt = 60, adj = 1)
dev.off()
|
|
274
|
15
|
# Generate a pie chart for each sample with a genotype.
# Keep user_specimen_id (used to title each chart) and the four
# percentage columns of report_user; columns 2-4 (affy_id,
# coral_mlg_clonal_id, DB_match) are dropped. as.data.frame() makes sure
# the negative index selects COLUMNS even if report_user is a data.table.
dt1 <- as.data.frame(report_user)[c(-2, -3, -4)]

# Samples without percentages have no genotype and get no chart.
dt1 <- na.omit(dt1)

# Translate to N (i.e., number of samples with a genotype) columns and
# 5 rows: the specimen id followed by the four percentages.
tdt1 <- t(dt1)

# Numeric matrix of the four percentages, one column per sample.
tdt1_matrix <- as.matrix(tdt1[-1, ])
# The number of columns is the number of samples with genotypes.
nc <- ncol(tdt1_matrix)
mode(tdt1_matrix) <- "numeric"

# Average of each percentage over all samples.
spy <- rowMeans(tdt1_matrix)

# pdf() is the only device needed; opening an additional one with
# dev.new() first would leave that device open and produce a stray
# output file.
file_path <- get_file_path("percent_breakdown.pdf")
pdf(file = file_path, width = 10, height = 7)

# Category names, in the row order of tdt1_matrix.
categories <- c("missing data", "mixed", "reference", "alternative")
col <- c("GREY", "#006DDB", "#24FF24", "#920000")

# Average pie of all samples.
labels <- paste(categories, " (", round(spy, 1), "%)", sep = "")
main <- "Average breakdown of SNP assignments across all samples"
pie(spy, labels = labels, radius = 0.60, col = col, main = main, cex.main = .75)

# One pie per sample, six to a page.
par(mfrow = c(3, 2))
for (i in seq_len(nc)) {
  # Build the labels from the bare category names. `labels` already
  # contains the average percentages, so appending to it would show two
  # percentages on every slice.
  tmp_labels <- paste(categories, " (", round(tdt1_matrix[, i], 1), "%)", sep = "")
  main <- paste("Breakdown of SNP assignments for", tdt1[1, i])
  pie(tdt1_matrix[, i], labels = tmp_labels, radius = 0.90, col = col, main = main, cex.main = .85, cex = 0.75)
}
dev.off()
|
9
|
312
|