Mercurial > repos > greg > multilocus_genotype
comparison multilocus_genotype.R @ 12:ea2914ddea50 draft
Uploaded
author | greg |
---|---|
date | Fri, 07 Dec 2018 09:13:02 -0500 |
parents | 6c93244a36e2 |
children | 96ee9122823e |
comparison
equal
deleted
inserted
replaced
11:c755744296ca | 12:ea2914ddea50 |
---|---|
1 #!/usr/bin/env Rscript | 1 #!/usr/bin/env Rscript |
2 | 2 |
3 suppressPackageStartupMessages(library("adegenet")) | 3 suppressPackageStartupMessages(library("adegenet")) |
4 suppressPackageStartupMessages(library("ape")) | 4 suppressPackageStartupMessages(library("ape")) |
5 suppressPackageStartupMessages(library("data.table")) | 5 suppressPackageStartupMessages(library("data.table")) |
6 suppressPackageStartupMessages(library("dbplyr")) | |
6 suppressPackageStartupMessages(library("dplyr")) | 7 suppressPackageStartupMessages(library("dplyr")) |
7 suppressPackageStartupMessages(library("ggplot2")) | 8 suppressPackageStartupMessages(library("ggplot2")) |
8 suppressPackageStartupMessages(library("knitr")) | 9 suppressPackageStartupMessages(library("knitr")) |
9 suppressPackageStartupMessages(library("optparse")) | 10 suppressPackageStartupMessages(library("optparse")) |
10 suppressPackageStartupMessages(library("poppr")) | 11 suppressPackageStartupMessages(library("poppr")) |
11 suppressPackageStartupMessages(library("RColorBrewer")) | 12 suppressPackageStartupMessages(library("RColorBrewer")) |
12 suppressPackageStartupMessages(library("RPostgres")) | 13 suppressPackageStartupMessages(library("RPostgres")) |
14 suppressPackageStartupMessages(library("tidyr")) | |
13 suppressPackageStartupMessages(library("vcfR")) | 15 suppressPackageStartupMessages(library("vcfR")) |
14 suppressPackageStartupMessages(library("vegan")) | 16 suppressPackageStartupMessages(library("vegan")) |
15 | 17 |
16 option_list <- list( | 18 option_list <- list( |
17 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), | 19 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), |
51 } | 53 } |
52 | 54 |
53 # Read in VCF input file. | 55 # Read in VCF input file. |
54 vcf <- read.vcfR(opt$input_vcf); | 56 vcf <- read.vcfR(opt$input_vcf); |
55 | 57 |
56 #Missing GT in samples submitted | 58 # Convert VCF file into a genind for the Poppr package. |
59 # TODO: probably should not hard-code 2 cores. | |
60 gl <- vcfR2genlight(vcf, n.cores=2); | |
61 gind <- new("genind", (as.matrix(gl))); | |
62 | |
63 # Add population information to the genind object. | |
64 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); | |
65 colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); | |
66 gind@pop <- as.factor(poptab$region); | |
67 | |
68 # Convert genind object to a genclone object. | |
69 obj2 <- as.genclone(gind); | |
70 | |
71 # Calculate the bitwise distance between individuals. | |
72 xdis <- bitwise.dist(obj2); | |
73 | |
74 # Multilocus genotypes (threshold of 16%). | |
75 mlg.filter(obj2, distance=xdis) <- 0.016; | |
76 m <- mlg.table(obj2, background=TRUE, color=TRUE); | |
77 | |
78 # Create table of MLGs. | |
79 id <- mlg.id(obj2); | |
80 dt <- data.table(id, keep.rownames=TRUE); | |
81 setnames(dt, c("id"), c("affy_id")); | |
82 | |
83 # Read user's Affymetrix 96 well plate csv file. | |
84 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t"); | |
85 colnames(pinfo) <- c("date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", | |
86 "reef", "region", "latitude", "longitude", "geographic_origin", | |
87 "sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore", | |
88 "disease_resist", "bleach_resist", "mortality","tle", "spawning", | |
89 "collector_last_name", "collector_first_name", "org", "collection_date", "contact_email", | |
90 "seq_facility", "array_version", "public", "public_after_date"); | |
91 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); | |
92 pinfo2 <- as.character(pinfo$user_specimen_id); | |
93 pi <- data.table(pinfo2); | |
94 setnames(pi, c("pinfo2"), c("user_specimen_id")); | |
95 | |
96 # Connect to database. | |
97 conn <- get_database_connection(opt$database_connection_string); | |
98 | |
99 # Import the sample table. | |
100 mD <- tbl(conn, "sample"); | |
101 | |
102 # Select user_specimen_id and mlg columns. | |
103 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id); | |
104 | |
105 # Convert to dataframe. | |
106 sm <- data.frame(smlg); | |
107 sm[sm==""] <- NA; | |
108 | |
109 # Missing GT in samples submitted. | |
57 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); | 110 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); |
58 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); | 111 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); |
59 myMiss <- (myMiss / nrow(vcf)) * 100; | 112 myMiss <- (myMiss / nrow(vcf)) * 100; |
60 miss <- data.frame(myMiss); | 113 miss <- data.frame(myMiss); |
61 | |
62 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); | |
63 hets <- (hets / nrow(vcf)) * 100; | |
64 ht <- data.frame(hets); | |
65 | |
66 refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); | |
67 refA <- (refA / nrow(vcf)) * 100; | |
68 rA <- data.frame(refA); | |
69 | |
70 altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); | |
71 altB <- (altB / nrow(vcf)) * 100; | |
72 aB <- data.frame(altB); | |
73 | |
74 # Convert VCF file into a genind for the Poppr package. | |
75 # TODO: probably should not hard-code 2 cores. | |
76 gl <- vcfR2genlight(vcf, n.cores=2); | |
77 gind <- new("genind", (as.matrix(gl))); | |
78 | |
79 # Add population information to the genind object. | |
80 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); | |
81 colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); | |
82 gind@pop <- as.factor(poptab$region); | |
83 | |
84 # Convert genind object to a genclone object. | |
85 obj2 <- as.genclone(gind); | |
86 | |
87 # Calculate the bitwise distance between individuals. | |
88 xdis <- bitwise.dist(obj2); | |
89 | |
90 # Multilocus genotypes (threshold of 16%). | |
91 mlg.filter(obj2, distance=xdis) <- 0.016; | |
92 m <- mlg.table(obj2, background=TRUE, color=TRUE); | |
93 | |
94 # Create table of MLGs. | |
95 id <- mlg.id(obj2); | |
96 dt <- data.table(id, keep.rownames=TRUE); | |
97 setnames(dt, c("id"), c("affy_id")); | |
98 | |
99 # Read user's Affymetrix 96 well plate csv file. | |
100 pinfo <- read.table(opt$input_affy_metadata, header=TRUE, stringsAsFactors=FALSE, sep="\t"); | |
101 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); | |
102 pinfo2 <- as.character(pinfo$user_specimen_id); | |
103 pi <- data.table(pinfo2); | |
104 setnames(pi, c("pinfo2"), c("user_specimen_id")); | |
105 | |
106 # Connect to database. | |
107 conn <- get_database_connection(opt$database_connection_string); | |
108 | |
109 # Import the sample table. | |
110 mD <- tbl(conn, "sample"); | |
111 | |
112 # Select user_specimen_id and mlg columns. | |
113 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id); | |
114 | |
115 # Convert to dataframe. | |
116 sm <- data.frame(smlg); | |
117 sm[sm==""] <- NA; | |
118 | 114 |
119 # Convert missing data into data table. | 115 # Convert missing data into data table. |
120 mi <-setDT(miss, keep.rownames=TRUE)[]; | 116 mi <-setDT(miss, keep.rownames=TRUE)[]; |
121 setnames(mi, c("rn"), c("affy_id")); | 117 setnames(mi, c("rn"), c("affy_id")); |
122 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); | 118 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); |
123 # Round missing data to two digits. | 119 # Round missing data to two digits. |
124 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); | 120 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); |
125 | 121 |
122 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); | |
123 hets <- (hets / nrow(vcf)) * 100; | |
124 ht <- data.frame(hets); | |
125 | |
126 # Convert heterozygosity data into data table. | 126 # Convert heterozygosity data into data table. |
127 ht <-setDT(ht, keep.rownames=TRUE)[]; | 127 ht <-setDT(ht, keep.rownames=TRUE)[]; |
128 setnames(ht, c("rn"), c("affy_id")); | 128 setnames(ht, c("rn"), c("affy_id")); |
129 setnames(ht, c("hets"), c("percent_mixed_coral")); | 129 setnames(ht, c("hets"), c("percent_mixed_coral")); |
130 # Round missing data to two digits. | 130 # Round missing data to two digits. |
131 ht$percent_mixed<-round(ht$percent_mixed, digits=2); | 131 ht$percent_mixed<-round(ht$percent_mixed, digits=2); |
132 | 132 |
133 refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); | |
134 refA <- (refA / nrow(vcf)) * 100; | |
135 rA <- data.frame(refA); | |
136 | |
133 # Convert refA data into data.table. | 137 # Convert refA data into data.table. |
134 rA <-setDT(rA, keep.rownames=TRUE)[]; | 138 rA <-setDT(rA, keep.rownames=TRUE)[]; |
135 setnames(rA, c("rn"), c("affy_id")); | 139 setnames(rA, c("rn"), c("affy_id")); |
136 setnames(rA, c("refA"), c("percent_reference_coral")); | 140 setnames(rA, c("refA"), c("percent_reference_coral")); |
137 # round missing data to two digits. | 141 # round missing data to two digits. |
138 rA$percent_reference<-round(rA$percent_reference, digits=2); | 142 rA$percent_reference<-round(rA$percent_reference, digits=2); |
139 | 143 |
144 altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); | |
145 altB <- (altB / nrow(vcf)) * 100; | |
146 aB <- data.frame(altB); | |
147 | |
140 # Convert altB data into data table. | 148 # Convert altB data into data table. |
141 aB <-setDT(aB, keep.rownames=TRUE)[]; | 149 aB <-setDT(aB, keep.rownames=TRUE)[]; |
142 setnames(aB, c("rn"), c("affy_id")); | 150 setnames(aB, c("rn"), c("affy_id")); |
143 setnames(aB, c("altB"), c("percent_alternative_coral")); | 151 setnames(aB, c("altB"), c("percent_alternative_coral")); |
144 # Round missing data to two digits. | 152 # Round missing data to two digits. |
150 | 158 |
151 # Transform. | 159 # Transform. |
152 df3 <- dt %>% | 160 df3 <- dt %>% |
153 group_by(row_number()) %>% | 161 group_by(row_number()) %>% |
154 dplyr::rename(group='row_number()') %>% | 162 dplyr::rename(group='row_number()') %>% |
155 unnest (user_specimen_id) %>% | 163 unnest (affy_id) %>% |
156 # Join with mlg table. | 164 # Join with mlg table. |
157 left_join(sm %>% | 165 left_join(sm %>% |
158 select("affy_id","coral_mlg_clonal_id"), | 166 select("affy_id","coral_mlg_clonal_id"), |
159 by='affy_id'); | 167 by='affy_id'); |
160 | 168 |
166 df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; | 174 df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; |
167 | 175 |
168 # Determine if the sample mlg matched previous genotyped sample. | 176 # Determine if the sample mlg matched previous genotyped sample. |
169 df4<- df3 %>% | 177 df4<- df3 %>% |
170 group_by(group) %>% | 178 group_by(group) %>% |
171 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match","match")); | 179 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match")); |
172 | 180 |
173 # Create new mlg id for samples that did not match those in the database. | 181 # Create new mlg id for samples that did not match those in the database. |
174 none <- unique(df4[c("group", "coral_mlg_clonal_id")]); | 182 none <- unique(df4[c("group", "coral_mlg_clonal_id")]); |
175 none <- none[is.na(none$coral_mlg_clonal_id),]; | 183 none <- none[is.na(none$coral_mlg_clonal_id),]; |
176 na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id)); | 184 na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id)); |
182 # the df4 file contains all ids. If it doesn't then look below | 190 # the df4 file contains all ids. If it doesn't then look below |
183 # to change the seq() function. | 191 # to change the seq() function. |
184 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); | 192 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); |
185 # This is a key for pairing group with new ids. | 193 # This is a key for pairing group with new ids. |
186 rat <- cbind(unique(n.g), n.g_ids); | 194 rat <- cbind(unique(n.g), n.g_ids); |
187 # this for loop assigns the new id iteratively for all that have NA. | 195 # This for loop assigns the new id iteratively for all that have NA. |
188 for (i in 1:length(na.mlg2)) { | 196 for (i in 1:length(na.mlg2)) { |
189 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; | 197 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; |
190 } | 198 } |
191 | 199 |
192 # subset poptab for all samples. | 200 # subset poptab for all samples. |
211 by='affy_id') %>% | 219 by='affy_id') %>% |
212 left_join(aB %>% | 220 left_join(aB %>% |
213 select("affy_id", "percent_alternative_coral"), | 221 select("affy_id", "percent_alternative_coral"), |
214 by='affy_id') %>% | 222 by='affy_id') %>% |
215 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% | 223 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% |
216 mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>% | 224 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% |
217 ungroup() %>% | 225 ungroup() %>% |
218 select(-group); | 226 select(-group); |
219 | 227 |
220 write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE); | 228 write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE); |
221 | 229 |
222 # Combine sample information for database. | 230 # Combine sample information for database. |
223 report_db <- pinfo %>% | 231 report_db <- pinfo %>% |
224 left_join(report_user %>% | 232 left_join(report_user %>% |
225 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", | 233 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", |
226 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", | 234 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", |
227 "percent_alternative_coral"), | 235 "percent_alternative_coral"), |
228 by='user_specimen_id') | 236 by='user_specimen_id'); |
229 | 237 |
230 # Create vector indicating number of individuals desired | 238 # Create vector indicating number of individuals desired |
231 # made from affy_id collumn of report_user data table. | 239 # made from affy_id collumn of report_user data table. |
232 i <- report_user[[2]]; | 240 i <- report_user[[2]]; |
233 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; | 241 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; |
262 par(mar = c(8, 4, 4, 2)); | 270 par(mar = c(8, 4, 4, 2)); |
263 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); | 271 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); |
264 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); | 272 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); |
265 dev.off() | 273 dev.off() |
266 | 274 |
267 # Rarifaction curve. | 275 # Generate 96 pie charts. Make a table to subset the numerical |
268 # Start PDF device driver. | 276 # and user_specimen_id values out of report_user for the 96 pies |
269 #dev.new(width=10, height=7); | 277 # (user_specimen_id names will be used to label each pie). |
270 #file_path = get_file_path("geno_rarifaction_curve.pdf"); | 278 dt1 <- data.table(report_user); |
271 #pdf(file=file_path, width=10, height=7); | 279 dt1 <- report_user[c(-2, -3, -4)]; |
272 #rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue"); | 280 dt1 <- na.omit(dt1); |
273 #dev.off(); | 281 # Translate to 96 columns and 5 rows. |
274 | 282 tdt1 <- t(dt1); |
275 # Genotype accumulation curve, sample is number of | 283 # Make another data table and transpose it the same as dt1 to |
276 # loci randomly selected for to make the curve. | 284 # just get numerics; these will feed into the creation of 96 |
277 #dev.new(width=10, height=7); | 285 # vectors, "x" in the for loop below. |
278 #file_path = get_file_path("geno_accumulation_curve.pdf"); | 286 dt2 <- data.table(report_user); |
279 #pdf(file=file_path, width=10, height=7); | 287 dt2 <- report_user[c(-1, -2, -3, -4)]; |
280 #genotype_curve(gind, sample=5, quiet=TRUE); | 288 # Translate to 96 columns and 5 rows. |
281 #dev.off(); | 289 tdt2 <- t(dt2); |
282 | 290 # Create 96 vectors |
291 x <- tdt2[1:96]; | |
292 tdt1_matrix <- as.matrix(tdt1[-1,]); | |
293 mode(tdt1_matrix) <- "numeric"; | |
294 spy <- rowMeans(tdt1_matrix); | |
295 dev.new(width=10, height=7); | |
296 file_path = get_file_path("percent_breakdown.pdf"); | |
297 pdf(file=file_path, width=10, height=7); | |
298 # Average pie of all samples. | |
299 labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep=""); | |
300 col <- c("GREY", "#006DDB", "#24FF24", "#920000"); | |
301 main <- "Average breakdown of SNP assignments across all samples"; | |
302 pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); | |
303 par(mfrow=c(3, 2)); | |
304 for (i in 1:96) { | |
305 labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep=""); | |
306 col <- c("GREY", "#006DDB", "#24FF24", "#920000"); | |
307 main <- paste("Breakdown of SNP assignments for", tdt1[1, i]); | |
308 pie(tdt1_matrix[,i], labels=labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); | |
309 } | |
310 dev.off() | |
311 |