Mercurial > repos > greg > multilocus_genotype
changeset 9:8f2f346a5e1c draft
Uploaded
author | greg |
---|---|
date | Tue, 04 Dec 2018 13:44:12 -0500 |
parents | d2057e183772 |
children | 6c93244a36e2 |
files | multilocus_genotype.R multilocus_genotype.xml |
diffstat | 2 files changed, 89 insertions(+), 67 deletions(-) [+] |
line wrap: on
line diff
--- a/multilocus_genotype.R Thu Nov 29 11:18:00 2018 -0500 +++ b/multilocus_genotype.R Tue Dec 04 13:44:12 2018 -0500 @@ -3,7 +3,6 @@ suppressPackageStartupMessages(library("adegenet")) suppressPackageStartupMessages(library("ape")) suppressPackageStartupMessages(library("data.table")) -#suppressPackageStartupMessages(library("dbplyr")) suppressPackageStartupMessages(library("dplyr")) suppressPackageStartupMessages(library("ggplot2")) suppressPackageStartupMessages(library("knitr")) @@ -11,7 +10,6 @@ suppressPackageStartupMessages(library("poppr")) suppressPackageStartupMessages(library("RColorBrewer")) suppressPackageStartupMessages(library("RPostgres")) -#suppressPackageStartupMessages(library("tidyr")) suppressPackageStartupMessages(library("vcfR")) suppressPackageStartupMessages(library("vegan")) @@ -20,7 +18,6 @@ make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), - make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data output file"), make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file") ) @@ -80,7 +77,8 @@ gind <- new("genind", (as.matrix(gl))); # Add population information to the genind object. -poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA")); +poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors = FALSE); +colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); gind@pop <- as.factor(poptab$region); # Convert genind object to a genclone object. @@ -89,20 +87,21 @@ # Calculate the bitwise distance between individuals. xdis <- bitwise.dist(obj2); -# Multilocus genotypes (threshold of 1%). -mlg.filter(obj2, distance=xdis) <- 0.01; +# Multilocus genotypes (threshold of 16%). +mlg.filter(obj2, distance=xdis) <- 0.016; m <- mlg.table(obj2, background=TRUE, color=TRUE); # Create table of MLGs. id <- mlg.id(obj2); dt <- data.table(id, keep.rownames=TRUE); -setnames(dt, c("id"), c("user_specimen_id")); +setnames(dt, c("id"), c("affy_id")); # Read user's Affymetrix 96 well plate csv file. -pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE); -pinfo <- pinfo$user_specimen_id; -pi <- data.table(pinfo); -setnames(pi, c("pinfo"), c("user_specimen_id")); +pinfo <- read.table(opt$input_affy_metadata, header=TRUE, stringsAsFactors=FALSE, sep="\t"); +pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); +pinfo2 <- as.character(pinfo$user_specimen_id); +pi <- data.table(pinfo2); +setnames(pi, c("pinfo2"), c("user_specimen_id")); # Connect to database. conn <- get_database_connection(opt$database_connection_string); @@ -111,7 +110,7 @@ mD <- tbl(conn, "sample"); # Select user_specimen_id and mlg columns. -smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id); +smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id); # Convert to dataframe. sm <- data.frame(smlg); @@ -119,40 +118,35 @@ # Convert missing data into data table. mi <-setDT(miss, keep.rownames=TRUE)[]; -# Change names to match db. -setnames(mi, c("rn"), c("user_specimen_id")); +setnames(mi, c("rn"), c("affy_id")); setnames(mi, c("myMiss"), c("percent_missing_data_coral")); # Round missing data to two digits. -mi$percent_missing <- round(mi$percent_missing, digits=2); +mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); # Convert heterozygosity data into data table. ht <-setDT(ht, keep.rownames=TRUE)[]; -# Change names to match db. -setnames(ht, c("rn"), c("user_specimen_id")); +setnames(ht, c("rn"), c("affy_id")); setnames(ht, c("hets"), c("percent_mixed_coral")); # Round missing data to two digits. ht$percent_mixed<-round(ht$percent_mixed, digits=2); # Convert refA data into data.table. rA <-setDT(rA, keep.rownames=TRUE)[]; -# Change names to match db. -setnames(rA, c("rn"), c("user_specimen_id")); +setnames(rA, c("rn"), c("affy_id")); setnames(rA, c("refA"), c("percent_reference_coral")); # round missing data to two digits. rA$percent_reference<-round(rA$percent_reference, digits=2); # Convert altB data into data table. aB <-setDT(aB, keep.rownames=TRUE)[]; -# Change names to match db. -setnames(aB, c("rn"), c("user_specimen_id")); +setnames(aB, c("rn"), c("affy_id")); setnames(aB, c("altB"), c("percent_alternative_coral")); # Round missing data to two digits. aB$percent_alternative<-round(aB$percent_alternative, digits=2); #convert mlg id to data.table format dt <- data.table(id, keep.rownames=TRUE); -# Change name to match db. -setnames(dt, c("id"), c("user_specimen_id")); +setnames(dt, c("id"), c("affy_id")); # Transform. df3 <- dt %>% @@ -161,7 +155,8 @@ unnest (user_specimen_id) %>% # Join with mlg table. left_join(sm %>% - select("user_specimen_id","coral_mlg_clonal_id"), by='user_specimen_id'); + select("affy_id","coral_mlg_clonal_id"), + by='affy_id'); # If found in database, group members on previous mlg id. uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); @@ -194,28 +189,29 @@ df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; } +# subset poptab for all samples. +subpop <- poptab[c(2, 3)]; + # Merge data frames for final table. report_user <- pi %>% - # Join with the second file (only the first and third column). - left_join(df4 %>% - select("user_specimen_id","coral_mlg_clonal_id","DB_match"), + left_join(subpop %>% + select("affy_id", "user_specimen_id"), by='user_specimen_id') %>% - # Join with the second file (only the first and third column). + left_join(df4 %>% + select("affy_id", "coral_mlg_clonal_id", "DB_match"), + by='affy_id') %>% left_join(mi %>% - select("user_specimen_id","percent_missing_coral"), - by='user_specimen_id') %>% - # Join with the second file (only the first and third column). + select("affy_id", "percent_missing_data_coral"), + by='affy_id') %>% left_join(ht %>% - select("user_specimen_id","percent_mixed_coral"), - by='user_specimen_id') %>% - # Join with the second file (only the first and third column); + select("affy_id", "percent_mixed_coral"), + by='affy_id') %>% left_join(rA %>% - select("user_specimen_id","percent_reference_coral"), - by='user_specimen_id') %>% - # Join with the second file (only the first and third column). + select("affy_id", "percent_reference_coral"), + by='affy_id') %>% left_join(aB %>% - select("user_specimen_id","percent_alternative_coral"), - by='user_specimen_id') %>% + select("affy_id", "percent_alternative_coral"), + by='affy_id') %>% mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>% ungroup() %>% @@ -223,21 +219,18 @@ write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE); -# Rarifaction curve. -# Start PDF device driver. -dev.new(width=10, height=7); -file_path = get_file_path("geno_rarifaction_curve.pdf"); -pdf(file=file_path, width=10, height=7); -rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue"); -dev.off(); +# Combine sample information for database. +report_db <- pinfo %>% + left_join(report_user %>% + select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", + "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", + "percent_alternative_coral"), + by='user_specimen_id') -# Genotype accumulation curve, sample is number of -# loci randomly selected for to make the curve. -dev.new(width=10, height=7); -file_path = get_file_path("geno_accumulation_curve.pdf"); -pdf(file=file_path, width=10, height=7); -genotype_curve(gind, sample=5, quiet=TRUE); -dev.off(); +# Create vector indicating number of individuals desired +# made from affy_id collumn of report_user data table. +i <- report_user[[2]]; +sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; # Create a phylogeny of samples based on distance matrices. cols <- palette(brewer.pal(n=12, name='Set3')); @@ -247,12 +240,43 @@ file_path = get_file_path("nj_phylogeny.pdf"); pdf(file=file_path, width=10, height=7); # Organize branches by clade. -tree <- obj2 %>% - aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% +theTree <- sub96 %>% + aboot(dist=provesti.dist, sample=1, tree="nj", cutoff=50, quiet=TRUE) %>% ladderize(); -plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); +theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)]; +plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); # Add a scale bar showing 5% difference.. -add.scale.bar(length=0.05, cex=0.65); -nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); +add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); +nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); +legend("topright", legend=c("Antigua", "Bahamas", "Belize", "Cuba", "Curacao", "Florida", "PuertoRico", "USVI"), text.col=cols, xpd=T, cex=0.8); dev.off(); +# Missing data barplot. +poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)]; +test2 <- which(!is.na(poptab$miss)); +miss96 <- poptab$miss[test2]; +name96 <- poptab$user_specimen_id[test2]; +dev.new(width=10, height=7); +file_path = get_file_path("missing_data.pdf"); +pdf (file=file_path, width=10, height=7); +par(mar = c(8, 4, 4, 2)); +x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); +text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); +dev.off() + +# Rarifaction curve. +# Start PDF device driver. +#dev.new(width=10, height=7); +#file_path = get_file_path("geno_rarifaction_curve.pdf"); +#pdf(file=file_path, width=10, height=7); +#rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue"); +#dev.off(); + +# Genotype accumulation curve, sample is number of +# loci randomly selected for to make the curve. +#dev.new(width=10, height=7); +#file_path = get_file_path("geno_accumulation_curve.pdf"); +#pdf(file=file_path, width=10, height=7); +#genotype_curve(gind, sample=5, quiet=TRUE); +#dev.off(); +
--- a/multilocus_genotype.xml Thu Nov 29 11:18:00 2018 -0500 +++ b/multilocus_genotype.xml Tue Dec 04 13:44:12 2018 -0500 @@ -22,16 +22,14 @@ --input_affy_metadata '$input_affy_metadata' --input_pop_info '$input_pop_info' --input_vcf '$input_vcf' ---output_stag_db_report '$output_stag_db_report' ---output_mlg_id '$output_mlg_id']]></command> +--output_stag_db_report '$output_stag_db_report']]></command> <inputs> <param name="input_vcf" type="data" format="vcf" label="VCF file" /> - <param name="input_affy_metadata" type="data" format="csv" label="Affymetrix 96 well plate file" /> + <param name="input_affy_metadata" type="data" format="tabular" label="Affymetrix 96 well plate file" /> <param name="input_pop_info" type="data" format="txt" label="Population information file" /> </inputs> <outputs> <data name="output_stag_db_report" format="csv" label="${tool.name} (stag db report) on ${on_string}"/> - <data name="output_mlg_id" format="txt" label="${tool.name} (MLGs) on ${on_string}"/> <collection name="output_plot_collection" type="list" label="${tool.name} (plots), on ${on_string}"> <discover_datasets pattern="__name__" directory="output_plots_dir" format="pdf"/> </collection> @@ -39,13 +37,13 @@ <tests> <test> <param name="input_vcf" value="baitssnv.recode.vcf" ftype="vcf"/> + <param name="input_affy_metadata" value="affy_metadata.tabular" ftype="tabular"/> <param name="input_pop_info" value="pop_info.txt" ftype="txt"/> + <output name="output_stag_db_report" file="output_stag_db_report.csv" ftype="csv"/> <output_collection name="output_plot_collection" type="list"> - <element name="phylogeny_tree.pdf" file="phylogeny_tree.pdf" ftype="pdf" compare="contains"/> - <element name="dissimiliarity_distance_matrix.pdf" file="dissimiliarity_distance_matrix.pdf" ftype="pdf" compare="contains"/> - <element name="filter_stats.pdf" file="filter_stats.pdf" ftype="pdf" compare="contains"/> - <element name="genotype_accumulation_curve.pdf" file="genotype_accumulation_curve.pdf" ftype="pdf" compare="contains"/> - <element name="genotype_accumulation_curve_for_gind.pdf" file="genotype_accumulation_curve_for_gind.pdf" ftype="pdf" compare="contains"/> + <element name="geno_accumulation_curve.pdf" file="geno_accumulation_curve.pdf" ftype="pdf" compare="contains"/> + <element name="geno_rarifaction_curve.pdf" file="geno_rarifaction_curve.pdf" ftype="pdf" compare="contains"/> + <element name="nj_phylogeny.pdf" file="nj_phylogeny.pdf" ftype="pdf" compare="contains"/> </output_collection> </test> </tests>