Mercurial > repos > greg > multilocus_genotype

--- a/multilocus_genotype.R	Thu Nov 29 11:18:00 2018 -0500
+++ b/multilocus_genotype.R	Tue Dec 04 13:44:12 2018 -0500
@@ -3,7 +3,6 @@
 suppressPackageStartupMessages(library("adegenet"))
 suppressPackageStartupMessages(library("ape"))
 suppressPackageStartupMessages(library("data.table"))
-#suppressPackageStartupMessages(library("dbplyr"))
 suppressPackageStartupMessages(library("dplyr"))
 suppressPackageStartupMessages(library("ggplot2"))
 suppressPackageStartupMessages(library("knitr"))
@@ -11,7 +10,6 @@
 suppressPackageStartupMessages(library("poppr"))
 suppressPackageStartupMessages(library("RColorBrewer"))
 suppressPackageStartupMessages(library("RPostgres"))
-#suppressPackageStartupMessages(library("tidyr"))
 suppressPackageStartupMessages(library("vcfR"))
 suppressPackageStartupMessages(library("vegan"))

@@ -20,7 +18,6 @@
     make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"),
     make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
     make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
-    make_option(c("--output_mlg_id"), action="store", dest="output_mlg_id", help="Mlg Id data output file"),
     make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file")
 )

@@ -80,7 +77,8 @@
 gind <- new("genind", (as.matrix(gl)));

 # Add population information to the genind object.
-poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA"));
+poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors = FALSE);
+colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region");
 gind@pop <- as.factor(poptab$region);

 # Convert genind object to a genclone object.
@@ -89,20 +87,21 @@
 # Calculate the bitwise distance between individuals.
 xdis <- bitwise.dist(obj2);

-# Multilocus genotypes (threshold of 1%).
-mlg.filter(obj2, distance=xdis) <- 0.01;
+# Multilocus genotypes (threshold of 16%).
+mlg.filter(obj2, distance=xdis) <- 0.016;
 m <- mlg.table(obj2, background=TRUE, color=TRUE);

 # Create table of MLGs.
 id <- mlg.id(obj2);
 dt <- data.table(id, keep.rownames=TRUE);
-setnames(dt, c("id"), c("user_specimen_id"));
+setnames(dt, c("id"), c("affy_id"));

 # Read user's Affymetrix 96 well plate csv file.
-pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE);
-pinfo <- pinfo$user_specimen_id;
-pi <- data.table(pinfo);
-setnames(pi, c("pinfo"), c("user_specimen_id"));
+pinfo <- read.table(opt$input_affy_metadata, header=TRUE, stringsAsFactors=FALSE, sep="\t");
+pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
+pinfo2 <- as.character(pinfo$user_specimen_id);
+pi <- data.table(pinfo2);
+setnames(pi, c("pinfo2"), c("user_specimen_id"));

 # Connect to database.
 conn <- get_database_connection(opt$database_connection_string);
@@ -111,7 +110,7 @@
 mD <- tbl(conn, "sample");

 # Select user_specimen_id and mlg columns.
-smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id);
+smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id);

 # Convert to dataframe.
 sm <- data.frame(smlg);
@@ -119,40 +118,35 @@

 # Convert missing data into data table.
 mi <-setDT(miss, keep.rownames=TRUE)[];
-# Change names to match db.
-setnames(mi, c("rn"), c("user_specimen_id"));
+setnames(mi, c("rn"), c("affy_id"));
 setnames(mi, c("myMiss"), c("percent_missing_data_coral"));
 # Round missing data to two digits.
-mi$percent_missing <- round(mi$percent_missing, digits=2);
+mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2);

 # Convert heterozygosity data into data table.
 ht <-setDT(ht, keep.rownames=TRUE)[];
-# Change names to match db.
-setnames(ht, c("rn"), c("user_specimen_id"));
+setnames(ht, c("rn"), c("affy_id"));
 setnames(ht, c("hets"), c("percent_mixed_coral"));
 # Round missing data to two digits.
 ht$percent_mixed<-round(ht$percent_mixed, digits=2);

 # Convert refA data into data.table.
 rA <-setDT(rA, keep.rownames=TRUE)[];
-# Change names to match db.
-setnames(rA, c("rn"), c("user_specimen_id"));
+setnames(rA, c("rn"), c("affy_id"));
 setnames(rA, c("refA"), c("percent_reference_coral"));
 # round missing data to two digits.
 rA$percent_reference<-round(rA$percent_reference, digits=2);

 # Convert altB data into data table.
 aB <-setDT(aB, keep.rownames=TRUE)[];
-# Change names to match db.
-setnames(aB, c("rn"), c("user_specimen_id"));
+setnames(aB, c("rn"), c("affy_id"));
 setnames(aB, c("altB"), c("percent_alternative_coral"));
 # Round missing data to two digits.
 aB$percent_alternative<-round(aB$percent_alternative, digits=2);

 #convert mlg id to data.table format
 dt <- data.table(id, keep.rownames=TRUE);
-# Change name to match db.
-setnames(dt, c("id"), c("user_specimen_id"));
+setnames(dt, c("id"), c("affy_id"));

 # Transform.
 df3 <- dt %>%
@@ -161,7 +155,8 @@
     unnest (user_specimen_id) %>%
     # Join with mlg table.
     left_join(sm %>%
-              select("user_specimen_id","coral_mlg_clonal_id"), by='user_specimen_id');
+              select("affy_id","coral_mlg_clonal_id"),
+              by='affy_id');

 # If found in database, group members on previous mlg id.
 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]);
@@ -194,28 +189,29 @@
     df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
 }

+# subset poptab for all samples.
+subpop <- poptab[c(2, 3)];
+
 # Merge data frames for final table.
 report_user <- pi %>%
-    # Join with the second file (only the first and third column).
-    left_join(df4 %>%
-        select("user_specimen_id","coral_mlg_clonal_id","DB_match"),
+    left_join(subpop %>%
+        select("affy_id", "user_specimen_id"),
         by='user_specimen_id') %>%
-    # Join with the second file (only the first and third column).
+    left_join(df4 %>%
+        select("affy_id", "coral_mlg_clonal_id", "DB_match"),
+        by='affy_id') %>%
     left_join(mi %>%
-        select("user_specimen_id","percent_missing_coral"),
-        by='user_specimen_id') %>%
-    # Join with the second file (only the first and third column).
+        select("affy_id", "percent_missing_data_coral"),
+        by='affy_id') %>%
     left_join(ht %>%
-        select("user_specimen_id","percent_mixed_coral"),
-        by='user_specimen_id') %>%
-    # Join with the second file (only the first and third column);
+        select("affy_id", "percent_mixed_coral"),
+        by='affy_id') %>%
     left_join(rA %>%
-        select("user_specimen_id","percent_reference_coral"),
-        by='user_specimen_id') %>%
-    # Join with the second file (only the first and third column).
+        select("affy_id", "percent_reference_coral"),
+        by='affy_id') %>%
     left_join(aB %>%
-        select("user_specimen_id","percent_alternative_coral"),
-        by='user_specimen_id') %>%
+        select("affy_id", "percent_alternative_coral"),
+        by='affy_id') %>%
     mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
     mutate(coral_mlg_clonal_id=ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id))%>%
     ungroup() %>%
@@ -223,21 +219,18 @@

 write.csv(report_user, file=paste(opt$output_stag_db_report), quote=FALSE);

-# Rarifaction curve.
-# Start PDF device driver.
-dev.new(width=10, height=7);
-file_path = get_file_path("geno_rarifaction_curve.pdf");
-pdf(file=file_path, width=10, height=7);
-rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue");
-dev.off();
+# Combine sample information for database.
+report_db <- pinfo %>%
+    left_join(report_user %>%
+        select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match",
+               "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral",
+               "percent_alternative_coral"),
+        by='user_specimen_id')

-# Genotype accumulation curve, sample is number of
-# loci randomly selected for to make the curve.
-dev.new(width=10, height=7);
-file_path = get_file_path("geno_accumulation_curve.pdf");
-pdf(file=file_path, width=10, height=7);
-genotype_curve(gind, sample=5, quiet=TRUE);
-dev.off();
+# Create vector indicating number of individuals desired
+# made from affy_id collumn of report_user data table.
+i <- report_user[[2]];
+sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE];

 # Create a phylogeny of samples based on distance matrices.
 cols <- palette(brewer.pal(n=12, name='Set3'));
@@ -247,12 +240,43 @@
 file_path = get_file_path("nj_phylogeny.pdf");
 pdf(file=file_path, width=10, height=7);
 # Organize branches by clade.
-tree <- obj2 %>%
-    aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>%
+theTree <- sub96 %>%
+    aboot(dist=provesti.dist, sample=1, tree="nj", cutoff=50, quiet=TRUE) %>%
     ladderize();
-plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4);
+theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)];
+plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T);
 # Add a scale bar showing 5% difference..
-add.scale.bar(length=0.05, cex=0.65);
-nodelabels(tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
+add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3);
+nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
+legend("topright", legend=c("Antigua", "Bahamas", "Belize", "Cuba", "Curacao", "Florida", "PuertoRico", "USVI"), text.col=cols, xpd=T, cex=0.8);
 dev.off();

+# Missing data barplot.
+poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)];
+test2 <- which(!is.na(poptab$miss));
+miss96 <- poptab$miss[test2];
+name96 <- poptab$user_specimen_id[test2];
+dev.new(width=10, height=7);
+file_path = get_file_path("missing_data.pdf");
+pdf (file=file_path, width=10, height=7);
+par(mar = c(8, 4, 4, 2));
+x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n");
+text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
+dev.off()
+
+# Rarifaction curve.
+# Start PDF device driver.
+#dev.new(width=10, height=7);
+#file_path = get_file_path("geno_rarifaction_curve.pdf");
+#pdf(file=file_path, width=10, height=7);
+#rarecurve(m, ylab="Number of expected MLGs", sample=min(rowSums(m)), border=NA, fill=NA, font=2, cex=1, col="blue");
+#dev.off();
+
+# Genotype accumulation curve, sample is number of
+# loci randomly selected for to make the curve.
+#dev.new(width=10, height=7);
+#file_path = get_file_path("geno_accumulation_curve.pdf");
+#pdf(file=file_path, width=10, height=7);
+#genotype_curve(gind, sample=5, quiet=TRUE);
+#dev.off();
+
--- a/multilocus_genotype.xml	Thu Nov 29 11:18:00 2018 -0500
+++ b/multilocus_genotype.xml	Tue Dec 04 13:44:12 2018 -0500
@@ -22,16 +22,14 @@
 --input_affy_metadata '$input_affy_metadata'
 --input_pop_info '$input_pop_info'
 --input_vcf '$input_vcf'
---output_stag_db_report '$output_stag_db_report'
---output_mlg_id '$output_mlg_id']]></command>
+--output_stag_db_report '$output_stag_db_report']]></command>
     <inputs>
         <param name="input_vcf" type="data" format="vcf" label="VCF file" />
-        <param name="input_affy_metadata" type="data" format="csv" label="Affymetrix 96 well plate file" />
+        <param name="input_affy_metadata" type="data" format="tabular" label="Affymetrix 96 well plate file" />
         <param name="input_pop_info" type="data" format="txt" label="Population information file" />
     </inputs>
     <outputs>
         <data name="output_stag_db_report" format="csv" label="${tool.name} (stag db report) on ${on_string}"/>
-        <data name="output_mlg_id" format="txt" label="${tool.name} (MLGs) on ${on_string}"/>
         <collection name="output_plot_collection" type="list" label="${tool.name} (plots), on ${on_string}">
             <discover_datasets pattern="__name__" directory="output_plots_dir" format="pdf"/>
         </collection>
@@ -39,13 +37,13 @@
     <tests>
         <test>
             <param name="input_vcf" value="baitssnv.recode.vcf" ftype="vcf"/>
+            <param name="input_affy_metadata" value="affy_metadata.tabular" ftype="tabular"/>
             <param name="input_pop_info" value="pop_info.txt" ftype="txt"/>
+            <output name="output_stag_db_report" file="output_stag_db_report.csv" ftype="csv"/>
             <output_collection name="output_plot_collection" type="list">
-                <element name="phylogeny_tree.pdf" file="phylogeny_tree.pdf" ftype="pdf" compare="contains"/>
-                <element name="dissimiliarity_distance_matrix.pdf" file="dissimiliarity_distance_matrix.pdf" ftype="pdf" compare="contains"/>
-                <element name="filter_stats.pdf" file="filter_stats.pdf" ftype="pdf" compare="contains"/>
-                <element name="genotype_accumulation_curve.pdf" file="genotype_accumulation_curve.pdf" ftype="pdf" compare="contains"/>
-                <element name="genotype_accumulation_curve_for_gind.pdf" file="genotype_accumulation_curve_for_gind.pdf" ftype="pdf" compare="contains"/>
+                <element name="geno_accumulation_curve.pdf" file="geno_accumulation_curve.pdf" ftype="pdf" compare="contains"/>
+                <element name="geno_rarifaction_curve.pdf" file="geno_rarifaction_curve.pdf" ftype="pdf" compare="contains"/>
+                <element name="nj_phylogeny.pdf" file="nj_phylogeny.pdf" ftype="pdf" compare="contains"/>
             </output_collection>
         </test>
     </tests>