Mercurial > repos > greg > multilocus_genotype
comparison multilocus_genotype.R @ 7:18001e7cb199 draft
Uploaded
| author | greg |
|---|---|
| date | Wed, 28 Nov 2018 13:49:18 -0500 |
| parents | a7cce4091e80 |
| children | d2057e183772 |
comparison
equal
deleted
inserted
replaced
| 6:a71901fd5325 | 7:18001e7cb199 |
|---|---|
| 1 #!/usr/bin/env Rscript | 1 #!/usr/bin/env Rscript |
| 2 | 2 |
| 3 suppressPackageStartupMessages(library("adegenet")) | 3 suppressPackageStartupMessages(library("adegenet")) |
| 4 suppressPackageStartupMessages(library("ape")) | 4 suppressPackageStartupMessages(library("ape")) |
| 5 suppressPackageStartupMessages(library("data.table")) | |
| 6 #suppressPackageStartupMessages(library("dbplyr")) | |
| 7 suppressPackageStartupMessages(library("dplyr")) | |
| 5 suppressPackageStartupMessages(library("ggplot2")) | 8 suppressPackageStartupMessages(library("ggplot2")) |
| 6 suppressPackageStartupMessages(library("knitr")) | 9 suppressPackageStartupMessages(library("knitr")) |
| 7 suppressPackageStartupMessages(library("optparse")) | 10 suppressPackageStartupMessages(library("optparse")) |
| 8 suppressPackageStartupMessages(library("poppr")) | 11 suppressPackageStartupMessages(library("poppr")) |
| 9 suppressPackageStartupMessages(library("RColorBrewer")) | 12 suppressPackageStartupMessages(library("RColorBrewer")) |
| 13 suppressPackageStartupMessages(library("RPostgres")) | |
| 14 #suppressPackageStartupMessages(library("tidyr")) | |
| 10 suppressPackageStartupMessages(library("vcfR")) | 15 suppressPackageStartupMessages(library("vcfR")) |
| 11 suppressPackageStartupMessages(library("vegan")) | 16 suppressPackageStartupMessages(library("vegan")) |
| 12 | 17 |
| 13 option_list <- list( | 18 option_list <- list( |
| 19 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), | |
| 20 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), | |
| 21 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), | |
| 14 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), | 22 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), |
| 15 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), | 23 make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data output file"), |
| 16 make_option(c("--output_missing_data"), action="store", dest="output_missing_data", help="Missing data outputfile"), | 24 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file") |
| 17 make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data outputfile") | |
| 18 ) | 25 ) |
| 19 | 26 |
| 20 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); | 27 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); |
| 21 args <- parse_args(parser, positional_arguments=TRUE); | 28 args <- parse_args(parser, positional_arguments=TRUE); |
| 22 opt <- args$options; | 29 opt <- args$options; |
| 24 get_file_path = function(file_name) { | 31 get_file_path = function(file_name) { |
| 25 file_path = paste("output_plots_dir", file_name, sep="/"); | 32 file_path = paste("output_plots_dir", file_name, sep="/"); |
| 26 return(file_path); | 33 return(file_path); |
| 27 } | 34 } |
| 28 | 35 |
| 29 # Extract Provesti's distance from the distance matrix. | |
| 30 provesti_distance <- function(distance, selection) { | |
| 31 eval(parse(text=paste("as.matrix(distance)[", selection, "]"))); | |
| 32 } | |
| 33 | |
| 34 # Read in VCF input file. | 36 # Read in VCF input file. |
| 35 vcf <- read.vcfR(opt$input_vcf); | 37 vcf <- read.vcfR(opt$input_vcf); |
| 36 | 38 |
| 37 #Missing GT in samples submitted | 39 #Missing GT in samples submitted |
| 38 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); | 40 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); |
| 39 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); | 41 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); |
| 40 myMiss <- (myMiss / nrow(vcf)) * 100; | 42 missing_gt <- (missing_gt / nrow(vcf)) * 100; |
| 41 miss <- data.frame(myMiss); | 43 missing_gt_data_frame <- data.frame(missing_gt); |
| 42 write.table(miss, file=opt$output_missing_data, quote=FALSE); | 44 |
| 43 | 45 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); |
| 44 # Convert VCF file into formats compatiable with the Poppr package. | 46 hets <- (hets / nrow(vcf)) * 100; |
| 45 genind <- vcfR2genind(vcf); | 47 ht <- data.frame(hets); |
| 48 | |
| 49 refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); | |
| 50 refA <- (refA / nrow(vcf)) * 100; | |
| 51 rA <- data.frame(refA); | |
| 52 | |
| 53 altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); | |
| 54 altB <- (altB / nrow(vcf)) * 100; | |
| 55 aB <- data.frame(altB); | |
| 56 | |
| 57 # Convert VCF file into a genind for the Poppr package. | |
| 58 # TODO: probably should not hard-code 2 cores. | |
| 59 gl <- vcfR2genlight(vcf, n.cores=2); | |
| 60 genind <- new("genind", (as.matrix(gl))); | |
| 61 | |
| 46 # Add population information to the genind object. | 62 # Add population information to the genind object. |
| 47 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA")); | 63 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA")); |
| 48 genind@pop <- as.factor(poptab$region); | 64 genind@pop <- as.factor(poptab$region); |
| 49 # Convert genind to genclone object | 65 |
| 66 # Convert genind object to a genclone object. | |
| 50 gclo <- as.genclone(genind); | 67 gclo <- as.genclone(genind); |
| 51 # Calculate the bitwise distance between individuals, | 68 |
| 52 # the following is similar to Provesti's distance. | 69 # Calculate the bitwise distance between individuals. |
| 53 xdis <- bitwise.dist(gclo, missing_match=FALSE); | 70 xdis <- bitwise.dist(gclo); |
| 54 | 71 |
| 55 # Multilocus genotypes (threshold of 1.6%). | 72 # Multilocus genotypes (threshold of 1%). |
| 56 mlg.filter(gclo, distance=xdis) <- 0.016; | 73 mlg.filter(gclo, distance=xdis) <- 0.01; |
| 57 # Start PDF device driver. | |
| 58 dev.new(width=10, height=7); | |
| 59 file_path = get_file_path("mlg_table.pdf"); | |
| 60 pdf(file=file_path, width=10, height=7); | |
| 61 m <- mlg.table(gclo, background=TRUE, color=TRUE); | 74 m <- mlg.table(gclo, background=TRUE, color=TRUE); |
| 62 dev.off(); | |
| 63 | 75 |
| 64 # Create table of MLGs. | 76 # Create table of MLGs. |
| 65 id <- mlg.id(gclo); | 77 id <- mlg.id(gclo); |
| 66 df <- data.frame(matrix((id), byrow=T)); | 78 dt <- data.table(id, keep.rownames=TRUE); |
| 67 write.table(df, file=opt$output_mlg_id); | 79 setnames(dt, c("id"), c("user_specimen_id")); |
| 80 | |
| 81 # Read user's Affymetrix 96 well plate csv file. | |
| 82 pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE); | |
| 83 pinfo <- pinfo$user_specimen_id; | |
| 84 pi <- data.table(pinfo); | |
| 85 setnames(pi, c("pinfo"), c("user_specimen_id")); | |
| 86 | |
| 87 # Instantiate database connection. | |
| 88 # The connection string has this format: | |
| 89 # postgresql://user:password@host/dbname | |
| 90 conn_string <- opt$database_connection_string; | |
| 91 conn_items <- strsplit(conn_string, "://")[[1]]; | |
| 92 string_needed <- conn_items[1]; | |
| 93 items_needed <- strsplit(string_needed, "@")[[1]]; | |
| 94 user_pass_string <- items_needed[1]; | |
| 95 host_dbname_string <- items_needed[2]; | |
| 96 user_pass_items <- strsplit(user_pass_string, ":")[[1]]; | |
| 97 host_dbname_items <- strsplit(host_dbname_string, "/")[[1]]; | |
| 98 user <- user_pass_items[1]; | |
| 99 pass <- user_pass_items[2]; | |
| 100 host <- host_dbname_items[1]; | |
| 101 dbname <- host_dbname_items[2]; | |
| 102 # FIXME: is there a way to not hard-code the port? | |
| 103 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass); | |
| 104 | |
| 105 # Import the sample table. | |
| 106 sample_table <- tbl(conn, "sample"); | |
| 107 | |
| 108 # Select user_specimen_id and mlg columns. | |
| 109 smlg <- sample_table %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id); | |
| 110 | |
| 111 # Convert to dataframe. | |
| 112 sm <- data.frame(smlg); | |
| 113 sm[sm==""] <- NA; | |
| 114 | |
| 115 # Convert missing data into data table. | |
| 116 mi <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[]; | |
| 117 # Change names to match db. | |
| 118 setnames(mi, c("rn"), c("user_specimen_id")); | |
| 119 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); | |
| 120 # Round missing data to two digits. | |
| 121 mi$percent_missing <- round(mi$percent_missing, digits=2); | |
| 122 | |
| 123 # Convert heterozygosity data into data table. | |
| 124 ht <-setDT(ht, keep.rownames=TRUE)[]; | |
| 125 # Change names to match db. | |
| 126 setnames(ht, c("rn"), c("user_specimen_id")); | |
| 127 setnames(ht, c("hets"), c("percent_mixed_coral")); | |
| 128 # Round missing data to two digits. | |
| 129 ht$percent_mixed<-round(ht$percent_mixed, digits=2); | |
| 130 | |
| 131 # Convert refA data into data.table. | |
| 132 rA <-setDT(rA, keep.rownames=TRUE)[]; | |
| 133 # Change names to match db. | |
| 134 setnames(rA, c("rn"), c("user_specimen_id")); | |
| 135 setnames(rA, c("refA"), c("percent_reference_coral")); | |
| 136 # round missing data to two digits. | |
| 137 rA$percent_reference<-round(rA$percent_reference, digits=2); | |
| 138 | |
| 139 # Convert altB data into data table. | |
| 140 aB <-setDT(aB, keep.rownames=TRUE)[]; | |
| 141 # Change names to match db. | |
| 142 setnames(aB, c("rn"), c("user_specimen_id")); | |
| 143 setnames(aB, c("altB"), c("percent_alternative_coral")); | |
| 144 # Round missing data to two digits. | |
| 145 aB$percent_alternative<-round(aB$percent_alternative, digits=2); | |
| 146 | |
| 147 #convert mlg id to data.table format | |
| 148 dt <- data.table(id, keep.rownames=TRUE); | |
| 149 # Change name to match db. | |
| 150 setnames(dt, c("id"), c("user_specimen_id")); | |
| 151 | |
| 152 # Transform. | |
| 153 df3 <- dt %>% | |
| 154 group_by(row_number()) %>% | |
| 155 dplyr::rename(group='row_number()') %>% | |
| 156 unnest (user_specimen_id) %>% | |
| 157 # Join with mlg table. | |
| 158 left_join(sm %>% | |
| 159 select("user_specimen_id","coral_mlg_clonal_id"), by='user_specimen_id'); | |
| 160 | |
| 161 # If found in database, group members on previous mlg id. | |
| 162 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); | |
| 163 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; | |
| 164 na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); | |
| 165 na.group <- df3$group[na.mlg]; | |
| 166 df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; | |
| 167 | |
| 168 # Determine if the sample mlg matched previous genotyped sample. | |
| 169 df4<- df3 %>% | |
| 170 group_by(group) %>% | |
| 171 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match","match")); | |
| 172 | |
| 173 # Create new mlg id for samples that did not match those in the database. | |
| 174 none <- unique(df4[c("group", "coral_mlg_clonal_id")]); | |
| 175 none <- none[is.na(none$coral_mlg_clonal_id),]; | |
| 176 na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id)); | |
| 177 n.g <- df4$group[na.mlg2]; | |
| 178 ct <- length(unique(n.g)); | |
| 179 | |
| 180 # List of new group ids, the sequence starts at the number of | |
| 181 # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if | |
| 182 # the df4 file contains all ids. If it doesn't then look below | |
| 183 # to change the seq() function. | |
| 184 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); | |
| 185 # This is a key for pairing group with new ids. | |
| 186 rat <- cbind(unique(n.g), n.g_ids); | |
| 187 # this for loop assigns the new id iteratively for all that have NA. | |
| 188 for (i in 1:length(na.mlg2)) { | |
| 189 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; | |
| 190 } | |
| 191 | |
| 192 # Merge data frames for final table. | |
| 193 report_user <- pi %>% | |
| 194 # Join with the second file (only the first and third column). | |
| 195 left_join(df4 %>% | |
| 196 select("user_specimen_id","coral_mlg_clonal_id","DB_match"), | |
| 197 by='user_specimen_id') %>% | |
| 198 # Join with the second file (only the first and third column). | |
| 199 left_join(mi %>% | |
| 200 select("user_specimen_id","percent_missing_coral"), | |
| 201 by='user_specimen_id') %>% | |
| 202 # Join with the second file (only the first and third column). | |
| 203 left_join(ht %>% | |
| 204 select("user_specimen_id","percent_mixed_coral"), | |
| 205 by='user_specimen_id') %>% | |
| 206 # Join with the second file (only the first and third column); | |
| 207 left_join(rA %>% | |
| 208 select("user_specimen_id","percent_reference_coral"), | |
| 209 by='user_specimen_id') %>% | |
| 210 # Join with the second file (only the first and third column). | |
| 211 left_join(aB %>% | |
| 212 select("user_specimen_id","percent_alternative_coral"), | |
| 213 by='user_specimen_id') %>% | |
| 214 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% | |
| 215 mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>% | |
| 216 ungroup() %>% | |
| 217 select(-group); | |
| 218 | |
| 219 write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE); | |
| 68 | 220 |
| 69 # Rarifaction curve. | 221 # Rarifaction curve. |
| 70 H.obj <- mlg.table(gclo, plot=TRUE); | |
| 71 # Start PDF device driver. | 222 # Start PDF device driver. |
| 72 dev.new(width=10, height=7); | 223 dev.new(width=10, height=7); |
| 73 file_path = get_file_path("geno_rarifaction_curve.pdf"); | 224 file_path = get_file_path("geno_rarifaction_curve.pdf"); |
| 74 pdf(file=file_path, width=10, height=7); | 225 pdf(file=file_path, width=10, height=7); |
| 75 rarecurve(H.obj, ylab="Number of expected MLGs", sample=min(rowSums(H.obj)), border=NA, fill=NA, font=2, cex=1, col="blue"); | 226 rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue"); |
| 76 dev.off() | 227 dev.off(); |
| 228 | |
| 229 # Genotype accumulation curve, sample is number of | |
| 230 # loci randomly selected for to make the curve. | |
| 231 dev.new(width=10, height=7); | |
| 232 file_path = get_file_path("geno_accumulation_curve.pdf"); | |
| 233 pdf(file=file_path, width=10, height=7); | |
| 234 genotype_curve(gind, sample=5, quiet=TRUE); | |
| 235 dev.off(); | |
| 77 | 236 |
| 78 # Create a phylogeny of samples based on distance matrices. | 237 # Create a phylogeny of samples based on distance matrices. |
| 79 cols <- palette(brewer.pal(n=12, name='Set3')); | 238 cols <- palette(brewer.pal(n=12, name='Set3')); |
| 80 set.seed(999); | 239 set.seed(999); |
| 81 # Start PDF device driver. | 240 # Start PDF device driver. |
| 82 dev.new(width=10, height=7); | 241 dev.new(width=10, height=7); |
| 83 file_path = get_file_path("nj_phylogeny.pdf"); | 242 file_path = get_file_path("nj_phylogeny.pdf"); |
| 84 pdf(file=file_path, width=10, height=7); | 243 pdf(file=file_path, width=10, height=7); |
| 85 # Organize branches by clade. | 244 # Organize branches by clade. |
| 86 tree <- gclo %>% aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% ladderize(); | 245 tree <- gclo %>% |
| 246 aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% | |
| 247 ladderize(); | |
| 87 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); | 248 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); |
| 88 # Add a scale bar showing 5% difference.. | 249 # Add a scale bar showing 5% difference.. |
| 89 add.scale.bar(length=0.05, cex=0.65); | 250 add.scale.bar(length=0.05, cex=0.65); |
| 90 nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); | 251 nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); |
| 91 dev.off() | 252 dev.off(); |
| 92 | 253 |
