comparison multilocus_genotype.R @ 9:8f2f346a5e1c draft

Uploaded
author greg
date Tue, 04 Dec 2018 13:44:12 -0500
parents d2057e183772
children 6c93244a36e2
comparison
equal deleted inserted replaced
8:d2057e183772 9:8f2f346a5e1c
1 #!/usr/bin/env Rscript 1 #!/usr/bin/env Rscript
2 2
3 suppressPackageStartupMessages(library("adegenet")) 3 suppressPackageStartupMessages(library("adegenet"))
4 suppressPackageStartupMessages(library("ape")) 4 suppressPackageStartupMessages(library("ape"))
5 suppressPackageStartupMessages(library("data.table")) 5 suppressPackageStartupMessages(library("data.table"))
6 #suppressPackageStartupMessages(library("dbplyr"))
7 suppressPackageStartupMessages(library("dplyr")) 6 suppressPackageStartupMessages(library("dplyr"))
8 suppressPackageStartupMessages(library("ggplot2")) 7 suppressPackageStartupMessages(library("ggplot2"))
9 suppressPackageStartupMessages(library("knitr")) 8 suppressPackageStartupMessages(library("knitr"))
10 suppressPackageStartupMessages(library("optparse")) 9 suppressPackageStartupMessages(library("optparse"))
11 suppressPackageStartupMessages(library("poppr")) 10 suppressPackageStartupMessages(library("poppr"))
12 suppressPackageStartupMessages(library("RColorBrewer")) 11 suppressPackageStartupMessages(library("RColorBrewer"))
13 suppressPackageStartupMessages(library("RPostgres")) 12 suppressPackageStartupMessages(library("RPostgres"))
14 #suppressPackageStartupMessages(library("tidyr"))
15 suppressPackageStartupMessages(library("vcfR")) 13 suppressPackageStartupMessages(library("vcfR"))
16 suppressPackageStartupMessages(library("vegan")) 14 suppressPackageStartupMessages(library("vegan"))
17 15
18 option_list <- list( 16 option_list <- list(
19 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), 17 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"),
20 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), 18 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"),
21 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), 19 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
22 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), 20 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
23 make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data output file"),
24 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file") 21 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file")
25 ) 22 )
26 23
27 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); 24 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
28 args <- parse_args(parser, positional_arguments=TRUE); 25 args <- parse_args(parser, positional_arguments=TRUE);
78 # TODO: probably should not hard-code 2 cores. 75 # TODO: probably should not hard-code 2 cores.
79 gl <- vcfR2genlight(vcf, n.cores=2); 76 gl <- vcfR2genlight(vcf, n.cores=2);
80 gind <- new("genind", (as.matrix(gl))); 77 gind <- new("genind", (as.matrix(gl)));
81 78
82 # Add population information to the genind object. 79 # Add population information to the genind object.
83 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA")); 80 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors = FALSE);
81 colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region");
84 gind@pop <- as.factor(poptab$region); 82 gind@pop <- as.factor(poptab$region);
85 83
86 # Convert genind object to a genclone object. 84 # Convert genind object to a genclone object.
87 obj2 <- as.genclone(gind); 85 obj2 <- as.genclone(gind);
88 86
89 # Calculate the bitwise distance between individuals. 87 # Calculate the bitwise distance between individuals.
90 xdis <- bitwise.dist(obj2); 88 xdis <- bitwise.dist(obj2);
91 89
92 # Multilocus genotypes (threshold of 1%). 90 # Multilocus genotypes (threshold of 16%).
93 mlg.filter(obj2, distance=xdis) <- 0.01; 91 mlg.filter(obj2, distance=xdis) <- 0.016;
94 m <- mlg.table(obj2, background=TRUE, color=TRUE); 92 m <- mlg.table(obj2, background=TRUE, color=TRUE);
95 93
96 # Create table of MLGs. 94 # Create table of MLGs.
97 id <- mlg.id(obj2); 95 id <- mlg.id(obj2);
98 dt <- data.table(id, keep.rownames=TRUE); 96 dt <- data.table(id, keep.rownames=TRUE);
99 setnames(dt, c("id"), c("user_specimen_id")); 97 setnames(dt, c("id"), c("affy_id"));
100 98
101 # Read user's Affymetrix 96 well plate csv file. 99 # Read user's Affymetrix 96 well plate csv file.
102 pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE); 100 pinfo <- read.table(opt$input_affy_metadata, header=TRUE, stringsAsFactors=FALSE, sep="\t");
103 pinfo <- pinfo$user_specimen_id; 101 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
104 pi <- data.table(pinfo); 102 pinfo2 <- as.character(pinfo$user_specimen_id);
105 setnames(pi, c("pinfo"), c("user_specimen_id")); 103 pi <- data.table(pinfo2);
104 setnames(pi, c("pinfo2"), c("user_specimen_id"));
106 105
107 # Connect to database. 106 # Connect to database.
108 conn <- get_database_connection(opt$database_connection_string); 107 conn <- get_database_connection(opt$database_connection_string);
109 108
110 # Import the sample table. 109 # Import the sample table.
111 mD <- tbl(conn, "sample"); 110 mD <- tbl(conn, "sample");
112 111
113 # Select user_specimen_id and mlg columns. 112 # Select user_specimen_id and mlg columns.
114 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id); 113 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id);
115 114
116 # Convert to dataframe. 115 # Convert to dataframe.
117 sm <- data.frame(smlg); 116 sm <- data.frame(smlg);
118 sm[sm==""] <- NA; 117 sm[sm==""] <- NA;
119 118
120 # Convert missing data into data table. 119 # Convert missing data into data table.
121 mi <-setDT(miss, keep.rownames=TRUE)[]; 120 mi <-setDT(miss, keep.rownames=TRUE)[];
122 # Change names to match db. 121 setnames(mi, c("rn"), c("affy_id"));
123 setnames(mi, c("rn"), c("user_specimen_id"));
124 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); 122 setnames(mi, c("myMiss"), c("percent_missing_data_coral"));
125 # Round missing data to two digits. 123 # Round missing data to two digits.
126 mi$percent_missing <- round(mi$percent_missing, digits=2); 124 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2);
127 125
128 # Convert heterozygosity data into data table. 126 # Convert heterozygosity data into data table.
129 ht <-setDT(ht, keep.rownames=TRUE)[]; 127 ht <-setDT(ht, keep.rownames=TRUE)[];
130 # Change names to match db. 128 setnames(ht, c("rn"), c("affy_id"));
131 setnames(ht, c("rn"), c("user_specimen_id"));
132 setnames(ht, c("hets"), c("percent_mixed_coral")); 129 setnames(ht, c("hets"), c("percent_mixed_coral"));
133 # Round missing data to two digits. 130 # Round missing data to two digits.
134 ht$percent_mixed<-round(ht$percent_mixed, digits=2); 131 ht$percent_mixed<-round(ht$percent_mixed, digits=2);
135 132
136 # Convert refA data into data.table. 133 # Convert refA data into data.table.
137 rA <-setDT(rA, keep.rownames=TRUE)[]; 134 rA <-setDT(rA, keep.rownames=TRUE)[];
138 # Change names to match db. 135 setnames(rA, c("rn"), c("affy_id"));
139 setnames(rA, c("rn"), c("user_specimen_id"));
140 setnames(rA, c("refA"), c("percent_reference_coral")); 136 setnames(rA, c("refA"), c("percent_reference_coral"));
141 # round missing data to two digits. 137 # round missing data to two digits.
142 rA$percent_reference<-round(rA$percent_reference, digits=2); 138 rA$percent_reference<-round(rA$percent_reference, digits=2);
143 139
144 # Convert altB data into data table. 140 # Convert altB data into data table.
145 aB <-setDT(aB, keep.rownames=TRUE)[]; 141 aB <-setDT(aB, keep.rownames=TRUE)[];
146 # Change names to match db. 142 setnames(aB, c("rn"), c("affy_id"));
147 setnames(aB, c("rn"), c("user_specimen_id"));
148 setnames(aB, c("altB"), c("percent_alternative_coral")); 143 setnames(aB, c("altB"), c("percent_alternative_coral"));
149 # Round missing data to two digits. 144 # Round missing data to two digits.
150 aB$percent_alternative<-round(aB$percent_alternative, digits=2); 145 aB$percent_alternative<-round(aB$percent_alternative, digits=2);
151 146
152 #convert mlg id to data.table format 147 #convert mlg id to data.table format
153 dt <- data.table(id, keep.rownames=TRUE); 148 dt <- data.table(id, keep.rownames=TRUE);
154 # Change name to match db. 149 setnames(dt, c("id"), c("affy_id"));
155 setnames(dt, c("id"), c("user_specimen_id"));
156 150
157 # Transform. 151 # Transform.
158 df3 <- dt %>% 152 df3 <- dt %>%
159 group_by(row_number()) %>% 153 group_by(row_number()) %>%
160 dplyr::rename(group='row_number()') %>% 154 dplyr::rename(group='row_number()') %>%
161 unnest (user_specimen_id) %>% 155 unnest (user_specimen_id) %>%
162 # Join with mlg table. 156 # Join with mlg table.
163 left_join(sm %>% 157 left_join(sm %>%
164 select("user_specimen_id","coral_mlg_clonal_id"), by='user_specimen_id'); 158 select("affy_id","coral_mlg_clonal_id"),
159 by='affy_id');
165 160
166 # If found in database, group members on previous mlg id. 161 # If found in database, group members on previous mlg id.
167 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); 162 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]);
168 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; 163 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),];
169 na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); 164 na.mlg <- which(is.na(df3$coral_mlg_clonal_id));
192 # this for loop assigns the new id iteratively for all that have NA. 187 # this for loop assigns the new id iteratively for all that have NA.
193 for (i in 1:length(na.mlg2)) { 188 for (i in 1:length(na.mlg2)) {
194 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; 189 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
195 } 190 }
196 191
192 # subset poptab for all samples.
193 subpop <- poptab[c(2, 3)];
194
197 # Merge data frames for final table. 195 # Merge data frames for final table.
198 report_user <- pi %>% 196 report_user <- pi %>%
199 # Join with the second file (only the first and third column). 197 left_join(subpop %>%
198 select("affy_id", "user_specimen_id"),
199 by='user_specimen_id') %>%
200 left_join(df4 %>% 200 left_join(df4 %>%
201 select("user_specimen_id","coral_mlg_clonal_id","DB_match"), 201 select("affy_id", "coral_mlg_clonal_id", "DB_match"),
202 by='user_specimen_id') %>% 202 by='affy_id') %>%
203 # Join with the second file (only the first and third column).
204 left_join(mi %>% 203 left_join(mi %>%
205 select("user_specimen_id","percent_missing_coral"), 204 select("affy_id", "percent_missing_data_coral"),
206 by='user_specimen_id') %>% 205 by='affy_id') %>%
207 # Join with the second file (only the first and third column).
208 left_join(ht %>% 206 left_join(ht %>%
209 select("user_specimen_id","percent_mixed_coral"), 207 select("affy_id", "percent_mixed_coral"),
210 by='user_specimen_id') %>% 208 by='affy_id') %>%
211 # Join with the second file (only the first and third column);
212 left_join(rA %>% 209 left_join(rA %>%
213 select("user_specimen_id","percent_reference_coral"), 210 select("affy_id", "percent_reference_coral"),
214 by='user_specimen_id') %>% 211 by='affy_id') %>%
215 # Join with the second file (only the first and third column).
216 left_join(aB %>% 212 left_join(aB %>%
217 select("user_specimen_id","percent_alternative_coral"), 213 select("affy_id", "percent_alternative_coral"),
218 by='user_specimen_id') %>% 214 by='affy_id') %>%
219 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% 215 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
220 mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>% 216 mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>%
221 ungroup() %>% 217 ungroup() %>%
222 select(-group); 218 select(-group);
223 219
224 write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE); 220 write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE);
225 221
226 # Rarifaction curve. 222 # Combine sample information for database.
227 # Start PDF device driver. 223 report_db <- pinfo %>%
228 dev.new(width=10, height=7); 224 left_join(report_user %>%
229 file_path = get_file_path("geno_rarifaction_curve.pdf"); 225 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match",
230 pdf(file=file_path, width=10, height=7); 226 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral",
231 rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue"); 227 "percent_alternative_coral"),
232 dev.off(); 228 by='user_specimen_id')
233 229
234 # Genotype accumulation curve, sample is number of 230 # Create vector indicating number of individuals desired
235 # loci randomly selected for to make the curve. 231 # made from affy_id collumn of report_user data table.
236 dev.new(width=10, height=7); 232 i <- report_user[[2]];
237 file_path = get_file_path("geno_accumulation_curve.pdf"); 233 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE];
238 pdf(file=file_path, width=10, height=7);
239 genotype_curve(gind, sample=5, quiet=TRUE);
240 dev.off();
241 234
242 # Create a phylogeny of samples based on distance matrices. 235 # Create a phylogeny of samples based on distance matrices.
243 cols <- palette(brewer.pal(n=12, name='Set3')); 236 cols <- palette(brewer.pal(n=12, name='Set3'));
244 set.seed(999); 237 set.seed(999);
245 # Start PDF device driver. 238 # Start PDF device driver.
246 dev.new(width=10, height=7); 239 dev.new(width=10, height=7);
247 file_path = get_file_path("nj_phylogeny.pdf"); 240 file_path = get_file_path("nj_phylogeny.pdf");
248 pdf(file=file_path, width=10, height=7); 241 pdf(file=file_path, width=10, height=7);
249 # Organize branches by clade. 242 # Organize branches by clade.
250 tree <- obj2 %>% 243 theTree <- sub96 %>%
251 aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% 244 aboot(dist=provesti.dist, sample=1, tree="nj", cutoff=50, quiet=TRUE) %>%
252 ladderize(); 245 ladderize();
253 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); 246 theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)];
247 plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T);
254 # Add a scale bar showing 5% difference.. 248 # Add a scale bar showing 5% difference..
255 add.scale.bar(length=0.05, cex=0.65); 249 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3);
256 nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); 250 nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
251 legend("topright", legend=c("Antigua", "Bahamas", "Belize", "Cuba", "Curacao", "Florida", "PuertoRico", "USVI"), text.col=cols, xpd=T, cex=0.8);
257 dev.off(); 252 dev.off();
258 253
254 # Missing data barplot.
255 poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)];
256 test2 <- which(!is.na(poptab$miss));
257 miss96 <- poptab$miss[test2];
258 name96 <- poptab$user_specimen_id[test2];
259 dev.new(width=10, height=7);
260 file_path = get_file_path("missing_data.pdf");
261 pdf (file=file_path, width=10, height=7);
262 par(mar = c(8, 4, 4, 2));
263 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n");
264 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
265 dev.off()
266
267 # Rarifaction curve.
268 # Start PDF device driver.
269 #dev.new(width=10, height=7);
270 #file_path = get_file_path("geno_rarifaction_curve.pdf");
271 #pdf(file=file_path, width=10, height=7);
272 #rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue");
273 #dev.off();
274
275 # Genotype accumulation curve, sample is number of
276 # loci randomly selected for to make the curve.
277 #dev.new(width=10, height=7);
278 #file_path = get_file_path("geno_accumulation_curve.pdf");
279 #pdf(file=file_path, width=10, height=7);
280 #genotype_curve(gind, sample=5, quiet=TRUE);
281 #dev.off();
282