comparison multilocus_genotype.R @ 12:ea2914ddea50 draft

Uploaded
author greg
date Fri, 07 Dec 2018 09:13:02 -0500
parents 6c93244a36e2
children 96ee9122823e
comparison
equal deleted inserted replaced
11:c755744296ca 12:ea2914ddea50
1 #!/usr/bin/env Rscript 1 #!/usr/bin/env Rscript
2 2
3 suppressPackageStartupMessages(library("adegenet")) 3 suppressPackageStartupMessages(library("adegenet"))
4 suppressPackageStartupMessages(library("ape")) 4 suppressPackageStartupMessages(library("ape"))
5 suppressPackageStartupMessages(library("data.table")) 5 suppressPackageStartupMessages(library("data.table"))
6 suppressPackageStartupMessages(library("dbplyr"))
6 suppressPackageStartupMessages(library("dplyr")) 7 suppressPackageStartupMessages(library("dplyr"))
7 suppressPackageStartupMessages(library("ggplot2")) 8 suppressPackageStartupMessages(library("ggplot2"))
8 suppressPackageStartupMessages(library("knitr")) 9 suppressPackageStartupMessages(library("knitr"))
9 suppressPackageStartupMessages(library("optparse")) 10 suppressPackageStartupMessages(library("optparse"))
10 suppressPackageStartupMessages(library("poppr")) 11 suppressPackageStartupMessages(library("poppr"))
11 suppressPackageStartupMessages(library("RColorBrewer")) 12 suppressPackageStartupMessages(library("RColorBrewer"))
12 suppressPackageStartupMessages(library("RPostgres")) 13 suppressPackageStartupMessages(library("RPostgres"))
14 suppressPackageStartupMessages(library("tidyr"))
13 suppressPackageStartupMessages(library("vcfR")) 15 suppressPackageStartupMessages(library("vcfR"))
14 suppressPackageStartupMessages(library("vegan")) 16 suppressPackageStartupMessages(library("vegan"))
15 17
16 option_list <- list( 18 option_list <- list(
17 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), 19 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"),
51 } 53 }
52 54
53 # Read in VCF input file. 55 # Read in VCF input file.
54 vcf <- read.vcfR(opt$input_vcf); 56 vcf <- read.vcfR(opt$input_vcf);
55 57
56 #Missing GT in samples submitted 58 # Convert VCF file into a genind for the Poppr package.
59 # TODO: probably should not hard-code 2 cores.
60 gl <- vcfR2genlight(vcf, n.cores=2);
61 gind <- new("genind", (as.matrix(gl)));
62
63 # Add population information to the genind object.
64 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t");
65 colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region");
66 gind@pop <- as.factor(poptab$region);
67
68 # Convert genind object to a genclone object.
69 obj2 <- as.genclone(gind);
70
71 # Calculate the bitwise distance between individuals.
72 xdis <- bitwise.dist(obj2);
73
74 # Multilocus genotypes (threshold of 16%).
75 mlg.filter(obj2, distance=xdis) <- 0.016;
76 m <- mlg.table(obj2, background=TRUE, color=TRUE);
77
78 # Create table of MLGs.
79 id <- mlg.id(obj2);
80 dt <- data.table(id, keep.rownames=TRUE);
81 setnames(dt, c("id"), c("affy_id"));
82
83 # Read user's Affymetrix 96 well plate csv file.
84 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t");
85 colnames(pinfo) <- c("date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id",
86 "reef", "region", "latitude", "longitude", "geographic_origin",
87 "sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore",
88 "disease_resist", "bleach_resist", "mortality","tle", "spawning",
89 "collector_last_name", "collector_first_name", "org", "collection_date", "contact_email",
90 "seq_facility", "array_version", "public", "public_after_date");
91 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
92 pinfo2 <- as.character(pinfo$user_specimen_id);
93 pi <- data.table(pinfo2);
94 setnames(pi, c("pinfo2"), c("user_specimen_id"));
95
96 # Connect to database.
97 conn <- get_database_connection(opt$database_connection_string);
98
99 # Import the sample table.
100 mD <- tbl(conn, "sample");
101
102 # Select user_specimen_id and mlg columns.
103 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id);
104
105 # Convert to dataframe.
106 sm <- data.frame(smlg);
107 sm[sm==""] <- NA;
108
109 # Missing GT in samples submitted.
57 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); 110 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
58 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); 111 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
59 myMiss <- (myMiss / nrow(vcf)) * 100; 112 myMiss <- (myMiss / nrow(vcf)) * 100;
60 miss <- data.frame(myMiss); 113 miss <- data.frame(myMiss);
61
62 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} );
63 hets <- (hets / nrow(vcf)) * 100;
64 ht <- data.frame(hets);
65
66 refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} );
67 refA <- (refA / nrow(vcf)) * 100;
68 rA <- data.frame(refA);
69
70 altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} );
71 altB <- (altB / nrow(vcf)) * 100;
72 aB <- data.frame(altB);
73
74 # Convert VCF file into a genind for the Poppr package.
75 # TODO: probably should not hard-code 2 cores.
76 gl <- vcfR2genlight(vcf, n.cores=2);
77 gind <- new("genind", (as.matrix(gl)));
78
79 # Add population information to the genind object.
80 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t");
81 colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region");
82 gind@pop <- as.factor(poptab$region);
83
84 # Convert genind object to a genclone object.
85 obj2 <- as.genclone(gind);
86
87 # Calculate the bitwise distance between individuals.
88 xdis <- bitwise.dist(obj2);
89
90 # Multilocus genotypes (threshold of 16%).
91 mlg.filter(obj2, distance=xdis) <- 0.016;
92 m <- mlg.table(obj2, background=TRUE, color=TRUE);
93
94 # Create table of MLGs.
95 id <- mlg.id(obj2);
96 dt <- data.table(id, keep.rownames=TRUE);
97 setnames(dt, c("id"), c("affy_id"));
98
99 # Read user's Affymetrix 96 well plate csv file.
100 pinfo <- read.table(opt$input_affy_metadata, header=TRUE, stringsAsFactors=FALSE, sep="\t");
101 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
102 pinfo2 <- as.character(pinfo$user_specimen_id);
103 pi <- data.table(pinfo2);
104 setnames(pi, c("pinfo2"), c("user_specimen_id"));
105
106 # Connect to database.
107 conn <- get_database_connection(opt$database_connection_string);
108
109 # Import the sample table.
110 mD <- tbl(conn, "sample");
111
112 # Select user_specimen_id and mlg columns.
113 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id);
114
115 # Convert to dataframe.
116 sm <- data.frame(smlg);
117 sm[sm==""] <- NA;
118 114
119 # Convert missing data into data table. 115 # Convert missing data into data table.
120 mi <-setDT(miss, keep.rownames=TRUE)[]; 116 mi <-setDT(miss, keep.rownames=TRUE)[];
121 setnames(mi, c("rn"), c("affy_id")); 117 setnames(mi, c("rn"), c("affy_id"));
122 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); 118 setnames(mi, c("myMiss"), c("percent_missing_data_coral"));
123 # Round missing data to two digits. 119 # Round missing data to two digits.
124 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); 120 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2);
125 121
122 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} );
123 hets <- (hets / nrow(vcf)) * 100;
124 ht <- data.frame(hets);
125
126 # Convert heterozygosity data into data table. 126 # Convert heterozygosity data into data table.
127 ht <-setDT(ht, keep.rownames=TRUE)[]; 127 ht <-setDT(ht, keep.rownames=TRUE)[];
128 setnames(ht, c("rn"), c("affy_id")); 128 setnames(ht, c("rn"), c("affy_id"));
129 setnames(ht, c("hets"), c("percent_mixed_coral")); 129 setnames(ht, c("hets"), c("percent_mixed_coral"));
130 # Round missing data to two digits. 130 # Round missing data to two digits.
131 ht$percent_mixed<-round(ht$percent_mixed, digits=2); 131 ht$percent_mixed<-round(ht$percent_mixed, digits=2);
132 132
133 refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} );
134 refA <- (refA / nrow(vcf)) * 100;
135 rA <- data.frame(refA);
136
133 # Convert refA data into data.table. 137 # Convert refA data into data.table.
134 rA <-setDT(rA, keep.rownames=TRUE)[]; 138 rA <-setDT(rA, keep.rownames=TRUE)[];
135 setnames(rA, c("rn"), c("affy_id")); 139 setnames(rA, c("rn"), c("affy_id"));
136 setnames(rA, c("refA"), c("percent_reference_coral")); 140 setnames(rA, c("refA"), c("percent_reference_coral"));
137 # round missing data to two digits. 141 # round missing data to two digits.
138 rA$percent_reference<-round(rA$percent_reference, digits=2); 142 rA$percent_reference<-round(rA$percent_reference, digits=2);
139 143
144 altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} );
145 altB <- (altB / nrow(vcf)) * 100;
146 aB <- data.frame(altB);
147
140 # Convert altB data into data table. 148 # Convert altB data into data table.
141 aB <-setDT(aB, keep.rownames=TRUE)[]; 149 aB <-setDT(aB, keep.rownames=TRUE)[];
142 setnames(aB, c("rn"), c("affy_id")); 150 setnames(aB, c("rn"), c("affy_id"));
143 setnames(aB, c("altB"), c("percent_alternative_coral")); 151 setnames(aB, c("altB"), c("percent_alternative_coral"));
144 # Round missing data to two digits. 152 # Round missing data to two digits.
150 158
151 # Transform. 159 # Transform.
152 df3 <- dt %>% 160 df3 <- dt %>%
153 group_by(row_number()) %>% 161 group_by(row_number()) %>%
154 dplyr::rename(group='row_number()') %>% 162 dplyr::rename(group='row_number()') %>%
155 unnest (user_specimen_id) %>% 163 unnest (affy_id) %>%
156 # Join with mlg table. 164 # Join with mlg table.
157 left_join(sm %>% 165 left_join(sm %>%
158 select("affy_id","coral_mlg_clonal_id"), 166 select("affy_id","coral_mlg_clonal_id"),
159 by='affy_id'); 167 by='affy_id');
160 168
166 df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; 174 df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
167 175
168 # Determine if the sample mlg matched previous genotyped sample. 176 # Determine if the sample mlg matched previous genotyped sample.
169 df4<- df3 %>% 177 df4<- df3 %>%
170 group_by(group) %>% 178 group_by(group) %>%
171 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match","match")); 179 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match"));
172 180
173 # Create new mlg id for samples that did not match those in the database. 181 # Create new mlg id for samples that did not match those in the database.
174 none <- unique(df4[c("group", "coral_mlg_clonal_id")]); 182 none <- unique(df4[c("group", "coral_mlg_clonal_id")]);
175 none <- none[is.na(none$coral_mlg_clonal_id),]; 183 none <- none[is.na(none$coral_mlg_clonal_id),];
176 na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id)); 184 na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id));
182 # the df4 file contains all ids. If it doesn't then look below 190 # the df4 file contains all ids. If it doesn't then look below
183 # to change the seq() function. 191 # to change the seq() function.
184 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); 192 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
185 # This is a key for pairing group with new ids. 193 # This is a key for pairing group with new ids.
186 rat <- cbind(unique(n.g), n.g_ids); 194 rat <- cbind(unique(n.g), n.g_ids);
187 # this for loop assigns the new id iteratively for all that have NA. 195 # This for loop assigns the new id iteratively for all that have NA.
188 for (i in 1:length(na.mlg2)) { 196 for (i in 1:length(na.mlg2)) {
189 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; 197 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
190 } 198 }
191 199
192 # subset poptab for all samples. 200 # subset poptab for all samples.
211 by='affy_id') %>% 219 by='affy_id') %>%
212 left_join(aB %>% 220 left_join(aB %>%
213 select("affy_id", "percent_alternative_coral"), 221 select("affy_id", "percent_alternative_coral"),
214 by='affy_id') %>% 222 by='affy_id') %>%
215 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% 223 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
216 mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>% 224 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
217 ungroup() %>% 225 ungroup() %>%
218 select(-group); 226 select(-group);
219 227
220 write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE); 228 write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE);
221 229
222 # Combine sample information for database. 230 # Combine sample information for database.
223 report_db <- pinfo %>% 231 report_db <- pinfo %>%
224 left_join(report_user %>% 232 left_join(report_user %>%
225 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", 233 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match",
226 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", 234 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral",
227 "percent_alternative_coral"), 235 "percent_alternative_coral"),
228 by='user_specimen_id') 236 by='user_specimen_id');
229 237
230 # Create vector indicating number of individuals desired 238 # Create vector indicating number of individuals desired
231 # made from affy_id collumn of report_user data table. 239 # made from affy_id collumn of report_user data table.
232 i <- report_user[[2]]; 240 i <- report_user[[2]];
233 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; 241 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE];
262 par(mar = c(8, 4, 4, 2)); 270 par(mar = c(8, 4, 4, 2));
263 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); 271 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n");
264 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); 272 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
265 dev.off() 273 dev.off()
266 274
267 # Rarifaction curve. 275 # Generate 96 pie charts. Make a table to subset the numerical
268 # Start PDF device driver. 276 # and user_specimen_id values out of report_user for the 96 pies
269 #dev.new(width=10, height=7); 277 # (user_specimen_id names will be used to label each pie).
270 #file_path = get_file_path("geno_rarifaction_curve.pdf"); 278 dt1 <- data.table(report_user);
271 #pdf(file=file_path, width=10, height=7); 279 dt1 <- report_user[c(-2, -3, -4)];
272 #rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue"); 280 dt1 <- na.omit(dt1);
273 #dev.off(); 281 # Translate to 96 columns and 5 rows.
274 282 tdt1 <- t(dt1);
275 # Genotype accumulation curve, sample is number of 283 # Make another data table and transpose it the same as dt1 to
276 # loci randomly selected for to make the curve. 284 # just get numerics; these will feed into the creation of 96
277 #dev.new(width=10, height=7); 285 # vectors, "x" in the for loop below.
278 #file_path = get_file_path("geno_accumulation_curve.pdf"); 286 dt2 <- data.table(report_user);
279 #pdf(file=file_path, width=10, height=7); 287 dt2 <- report_user[c(-1, -2, -3, -4)];
280 #genotype_curve(gind, sample=5, quiet=TRUE); 288 # Translate to 96 columns and 5 rows.
281 #dev.off(); 289 tdt2 <- t(dt2);
282 290 # Create 96 vectors
291 x <- tdt2[1:96];
292 tdt1_matrix <- as.matrix(tdt1[-1,]);
293 mode(tdt1_matrix) <- "numeric";
294 spy <- rowMeans(tdt1_matrix);
295 dev.new(width=10, height=7);
296 file_path = get_file_path("percent_breakdown.pdf");
297 pdf(file=file_path, width=10, height=7);
298 # Average pie of all samples.
299 labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep="");
300 col <- c("GREY", "#006DDB", "#24FF24", "#920000");
301 main <- "Average breakdown of SNP assignments across all samples";
302 pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75);
303 par(mfrow=c(3, 2));
304 for (i in 1:96) {
305 labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep="");
306 col <- c("GREY", "#006DDB", "#24FF24", "#920000");
307 main <- paste("Breakdown of SNP assignments for", tdt1[1, i]);
308 pie(tdt1_matrix[,i], labels=labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
309 }
310 dev.off()
311