Mercurial > repos > greg > multilocus_genotype
changeset 7:18001e7cb199 draft
Uploaded
author | greg |
---|---|
date | Wed, 28 Nov 2018 13:49:18 -0500 |
parents | a71901fd5325 |
children | d2057e183772 |
files | multilocus_genotype.R multilocus_genotype.xml |
diffstat | 2 files changed, 204 insertions(+), 36 deletions(-) [+] |
line wrap: on
line diff
--- a/multilocus_genotype.R Wed Nov 21 10:04:47 2018 -0500 +++ b/multilocus_genotype.R Wed Nov 28 13:49:18 2018 -0500 @@ -2,19 +2,26 @@ suppressPackageStartupMessages(library("adegenet")) suppressPackageStartupMessages(library("ape")) +suppressPackageStartupMessages(library("data.table")) +#suppressPackageStartupMessages(library("dbplyr")) +suppressPackageStartupMessages(library("dplyr")) suppressPackageStartupMessages(library("ggplot2")) suppressPackageStartupMessages(library("knitr")) suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("poppr")) suppressPackageStartupMessages(library("RColorBrewer")) +suppressPackageStartupMessages(library("RPostgres")) +#suppressPackageStartupMessages(library("tidyr")) suppressPackageStartupMessages(library("vcfR")) suppressPackageStartupMessages(library("vegan")) option_list <- list( - make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), + make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), + make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), - make_option(c("--output_missing_data"), action="store", dest="output_missing_data", help="Missing data outputfile"), - make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data outputfile") + make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), + make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data output file"), + make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file") ) parser <- OptionParser(usage="%prog [options] file", option_list=option_list); @@ -26,54 +33,206 @@ return(file_path); } -# Extract Provesti's distance from the distance matrix. -provesti_distance <- function(distance, selection) { - eval(parse(text=paste("as.matrix(distance)[", selection, "]"))); -} - # Read in VCF input file. vcf <- read.vcfR(opt$input_vcf); #Missing GT in samples submitted gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); -myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); -myMiss <- (myMiss / nrow(vcf)) * 100; -miss <- data.frame(myMiss); -write.table(miss, file=opt$output_missing_data, quote=FALSE); +missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); +missing_gt <- (missing_gt / nrow(vcf)) * 100; +missing_gt_data_frame <- data.frame(missing_gt); + +hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); +hets <- (hets / nrow(vcf)) * 100; +ht <- data.frame(hets); + +refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); +refA <- (refA / nrow(vcf)) * 100; +rA <- data.frame(refA); -# Convert VCF file into formats compatiable with the Poppr package. -genind <- vcfR2genind(vcf); +altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); +altB <- (altB / nrow(vcf)) * 100; +aB <- data.frame(altB); + +# Convert VCF file into a genind for the Poppr package. +# TODO: probably should not hard-code 2 cores. +gl <- vcfR2genlight(vcf, n.cores=2); +genind <- new("genind", (as.matrix(gl))); + # Add population information to the genind object. -poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA")); +poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA")); genind@pop <- as.factor(poptab$region); -# Convert genind to genclone object + +# Convert genind object to a genclone object. gclo <- as.genclone(genind); -# Calculate the bitwise distance between individuals, -# the following is similar to Provesti's distance. -xdis <- bitwise.dist(gclo, missing_match=FALSE); + +# Calculate the bitwise distance between individuals. +xdis <- bitwise.dist(gclo); -# Multilocus genotypes (threshold of 1.6%). -mlg.filter(gclo, distance=xdis) <- 0.016; -# Start PDF device driver. -dev.new(width=10, height=7); -file_path = get_file_path("mlg_table.pdf"); -pdf(file=file_path, width=10, height=7); +# Multilocus genotypes (threshold of 1%). +mlg.filter(gclo, distance=xdis) <- 0.01; m <- mlg.table(gclo, background=TRUE, color=TRUE); -dev.off(); # Create table of MLGs. id <- mlg.id(gclo); -df <- data.frame(matrix((id), byrow=T)); -write.table(df, file=opt$output_mlg_id); +dt <- data.table(id, keep.rownames=TRUE); +setnames(dt, c("id"), c("user_specimen_id")); + +# Read user's Affymetrix 96 well plate csv file. +pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE); +pinfo <- pinfo$user_specimen_id; +pi <- data.table(pinfo); +setnames(pi, c("pinfo"), c("user_specimen_id")); + +# Instantiate database connection. +# The connection string has this format: +# postgresql://user:password@host/dbname +conn_string <- opt$database_connection_string; +conn_items <- strsplit(conn_string, "://")[[1]]; +string_needed <- conn_items[1]; +items_needed <- strsplit(string_needed, "@")[[1]]; +user_pass_string <- items_needed[1]; +host_dbname_string <- items_needed[2]; +user_pass_items <- strsplit(user_pass_string, ":")[[1]]; +host_dbname_items <- strsplit(host_dbname_string, "/")[[1]]; +user <- user_pass_items[1]; +pass <- user_pass_items[2]; +host <- host_dbname_items[1]; +dbname <- host_dbname_items[2]; +# FIXME: is there a way to not hard-code the port? +conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass); + +# Import the sample table. +sample_table <- tbl(conn, "sample"); + +# Select user_specimen_id and mlg columns. +smlg <- sample_table %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id); + +# Convert to dataframe. +sm <- data.frame(smlg); +sm[sm==""] <- NA; + +# Convert missing data into data table. +mi <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[]; +# Change names to match db. +setnames(mi, c("rn"), c("user_specimen_id")); +setnames(mi, c("myMiss"), c("percent_missing_data_coral")); +# Round missing data to two digits. +mi$percent_missing <- round(mi$percent_missing, digits=2); + +# Convert heterozygosity data into data table. +ht <-setDT(ht, keep.rownames=TRUE)[]; +# Change names to match db. +setnames(ht, c("rn"), c("user_specimen_id")); +setnames(ht, c("hets"), c("percent_mixed_coral")); +# Round missing data to two digits. +ht$percent_mixed<-round(ht$percent_mixed, digits=2); + +# Convert refA data into data.table. +rA <-setDT(rA, keep.rownames=TRUE)[]; +# Change names to match db. +setnames(rA, c("rn"), c("user_specimen_id")); +setnames(rA, c("refA"), c("percent_reference_coral")); +# round missing data to two digits. +rA$percent_reference<-round(rA$percent_reference, digits=2); + +# Convert altB data into data table. +aB <-setDT(aB, keep.rownames=TRUE)[]; +# Change names to match db. +setnames(aB, c("rn"), c("user_specimen_id")); +setnames(aB, c("altB"), c("percent_alternative_coral")); +# Round missing data to two digits. +aB$percent_alternative<-round(aB$percent_alternative, digits=2); + +#convert mlg id to data.table format +dt <- data.table(id, keep.rownames=TRUE); +# Change name to match db. +setnames(dt, c("id"), c("user_specimen_id")); + +# Transform. +df3 <- dt %>% + group_by(row_number()) %>% + dplyr::rename(group='row_number()') %>% + unnest (user_specimen_id) %>% + # Join with mlg table. + left_join(sm %>% + select("user_specimen_id","coral_mlg_clonal_id"), by='user_specimen_id'); + +# If found in database, group members on previous mlg id. +uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); +uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; +na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); +na.group <- df3$group[na.mlg]; +df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; + +# Determine if the sample mlg matched previous genotyped sample. +df4<- df3 %>% + group_by(group) %>% + mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match","match")); + +# Create new mlg id for samples that did not match those in the database. +none <- unique(df4[c("group", "coral_mlg_clonal_id")]); +none <- none[is.na(none$coral_mlg_clonal_id),]; +na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id)); +n.g <- df4$group[na.mlg2]; +ct <- length(unique(n.g)); + +# List of new group ids, the sequence starts at the number of +# ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if +# the df4 file contains all ids. If it doesn't then look below +# to change the seq() function. +n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); +# This is a key for pairing group with new ids. +rat <- cbind(unique(n.g), n.g_ids); +# this for loop assigns the new id iteratively for all that have NA. +for (i in 1:length(na.mlg2)) { + df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; +} + +# Merge data frames for final table. +report_user <- pi %>% + # Join with the second file (only the first and third column). + left_join(df4 %>% + select("user_specimen_id","coral_mlg_clonal_id","DB_match"), + by='user_specimen_id') %>% + # Join with the second file (only the first and third column). + left_join(mi %>% + select("user_specimen_id","percent_missing_coral"), + by='user_specimen_id') %>% + # Join with the second file (only the first and third column). + left_join(ht %>% + select("user_specimen_id","percent_mixed_coral"), + by='user_specimen_id') %>% + # Join with the second file (only the first and third column); + left_join(rA %>% + select("user_specimen_id","percent_reference_coral"), + by='user_specimen_id') %>% + # Join with the second file (only the first and third column). + left_join(aB %>% + select("user_specimen_id","percent_alternative_coral"), + by='user_specimen_id') %>% + mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% + mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>% + ungroup() %>% + select(-group); + +write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE); # Rarifaction curve. -H.obj <- mlg.table(gclo, plot=TRUE); # Start PDF device driver. dev.new(width=10, height=7); file_path = get_file_path("geno_rarifaction_curve.pdf"); pdf(file=file_path, width=10, height=7); -rarecurve(H.obj, ylab="Number of expected MLGs", sample=min(rowSums(H.obj)), border=NA, fill=NA, font=2, cex=1, col="blue"); -dev.off() +rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue"); +dev.off(); + +# Genotype accumulation curve, sample is number of +# loci randomly selected for to make the curve. +dev.new(width=10, height=7); +file_path = get_file_path("geno_accumulation_curve.pdf"); +pdf(file=file_path, width=10, height=7); +genotype_curve(gind, sample=5, quiet=TRUE); +dev.off(); # Create a phylogeny of samples based on distance matrices. cols <- palette(brewer.pal(n=12, name='Set3')); @@ -83,10 +242,12 @@ file_path = get_file_path("nj_phylogeny.pdf"); pdf(file=file_path, width=10, height=7); # Organize branches by clade. -tree <- gclo %>% aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% ladderize(); +tree <- gclo %>% + aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% + ladderize(); plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); # Add a scale bar showing 5% difference.. add.scale.bar(length=0.05, cex=0.65); nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); -dev.off() +dev.off();
--- a/multilocus_genotype.xml Wed Nov 21 10:04:47 2018 -0500 +++ b/multilocus_genotype.xml Wed Nov 28 13:49:18 2018 -0500 @@ -3,11 +3,14 @@ <requirements> <requirement type="package" version="2.1.1">r-adegenet</requirement> <requirement type="package" version="5.1">r-ape</requirement> + <requirement type="package" version="1.11.6">r-data.table</requirement> + <requirement type="package" version="0.7.6">r-dplyr</requirement> <requirement type="package" version="3.0.0">r-ggplot2</requirement> <requirement type="package" version="1.20">r-knitr</requirement> <requirement type="package" version="1.6.0">r-optparse</requirement> <requirement type="package" version="2.8.1">r-poppr</requirement> <requirement type="package" version="1.1.2">r-rcolorbrewer</requirement> + <requirement type="package" version="1.1.1">r-rpostgres</requirement> <requirement type="package" version="2.5_3">r-vegan</requirement> <requirement type="package" version="1.8.0">r-vcfr</requirement> </requirements> @@ -15,17 +18,20 @@ #set output_plots_dir = 'output_plots_dir' mkdir $output_plots_dir && Rscript '$__tool_directory__/multilocus_genotype.R' ---input_vcf '$input_vcf' +--database_connection_string '$__app__.config.corals_database_connection' +--input_affy_metadata '$input_affy_metadata' --input_pop_info '$input_pop_info' ---output_missing_data '$output_missing_data' +--input_vcf '$input_vcf' +--output_stag_db_report '$output_stag_db_report' --output_mlg_id '$output_mlg_id']]></command> <inputs> <param name="input_vcf" type="data" format="vcf" label="VCF file" /> + <param name="input_affy_metadata" type="data" format="csv" label="Affymetrix 96 well plate file" /> <param name="input_pop_info" type="data" format="txt" label="Population information file" /> </inputs> <outputs> + <data name="output_stag_db_report" format="csv" label="${tool.name} (stag db report) on ${on_string}"/> <data name="output_mlg_id" format="txt" label="${tool.name} (MLGs) on ${on_string}"/> - <data name="output_missing_data" format="txt" label="${tool.name} (missing GT) on ${on_string}"/> <collection name="output_plot_collection" type="list" label="${tool.name} (plots), on ${on_string}"> <discover_datasets pattern="__name__" directory="output_plots_dir" format="pdf"/> </collection> @@ -56,3 +62,4 @@ <citations> </citations> </tool> +