changeset 16:c4ec8727b50c draft

Uploaded
author greg
date Fri, 15 Feb 2019 10:25:04 -0500
parents 62ad61eac1ff
children 85f8fc57eee4
files multilocus_genotype.R
diffstat 1 files changed, 35 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/multilocus_genotype.R	Thu Dec 20 11:29:13 2018 -0500
+++ b/multilocus_genotype.R	Fri Feb 15 10:25:04 2019 -0500
@@ -48,7 +48,7 @@
     host <- host_dbname_items[1];
     dbname <- host_dbname_items[2];
     # FIXME: is there a way to not hard-code the port?
-    conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass);
+    conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass);
     return (conn);
 }
 
@@ -80,14 +80,15 @@
 dt <- data.table(id, keep.rownames=TRUE);
 setnames(dt, c("id"), c("affy_id"));
 
-# Read user's Affymetrix 96 well plate csv file.
+# Read user's Affymetrix 96 well plate tabular file.
 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t");
-colnames(pinfo) <- c("date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id",
-                     "reef", "region", "latitude", "longitude", "geographic_origin",
-                     "sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore",
-                     "disease_resist", "bleach_resist", "mortality","tle", "spawning",
-                     "collector_last_name", "collector_first_name", "org", "collection_date", "contact_email",
-                     "seq_facility", "array_version", "public", "public_after_date");
+colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
+                     "region", "latitude", "longitude", "geographic_origin", "sample_location",
+                     "latitude_outplant", "longitude_outplant", "depth", "dist_shore", "disease_resist",
+                     "bleach_resist", "mortality","tle", "spawning", "collector_last_name",
+                     "collector_first_name", "org", "collection_date", "contact_email", "seq_facility",
+                     "array_version", "public", "public_after_date", "sperm_motility", "healing_time",
+                     "dna_extraction_method", "dna_concentration");
 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
 pinfo2 <- as.character(pinfo$user_specimen_id);
 pi <- data.table(pinfo2);
@@ -97,14 +98,23 @@
 conn <- get_database_connection(opt$database_connection_string);
 
 # Import the sample table.
-mD <- tbl(conn, "sample");
+sample_table <- tbl(conn, "sample");
+
+# Import the genotype table.
+genotype_table <- tbl(conn, "genotype");
 
-# Select user_specimen_id and mlg columns.
-smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id);
+# Select columns from the sample table and the
+# genotype table joined by genotype_id.
+sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id);
+smlg <- sample_table_columns %>%
+    left_join(genotype_table %>%
+              select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"),
+              by=c("genotype_id" = "id"));
 
 # Convert to dataframe.
 sm <- data.frame(smlg);
-sm[sm==""] <- NA;
+# Name the columns.
+colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id");
 
 # Missing GT in samples submitted.
 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
@@ -159,12 +169,12 @@
 # Transform.
 df3 <- dt %>%
     group_by(row_number()) %>%
-    dplyr::rename(group='row_number()') %>%
+    dplyr::rename(group="row_number()") %>%
     unnest (affy_id) %>%
     # Join with mlg table.
     left_join(sm %>%
               select("affy_id","coral_mlg_clonal_id"),
-              by='affy_id');
+              by="affy_id");
 
 # If found in database, group members on previous mlg id.
 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]);
@@ -190,36 +200,36 @@
 # the df4 file contains all ids.  If it doesn't then look below
 # to change the seq() function. 
 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
-# This is a key for pairing group with new ids.
+# Pair group with new ids.
 rat <- cbind(unique(n.g), n.g_ids);
-# This for loop assigns the new id iteratively for all that have NA.
+# Assign the new id iteratively for all that have NA.
 for (i in 1:length(na.mlg2)) {
     df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
 }
 
-# subset poptab for all samples.
+# Subset poptab for all samples.
 subpop <- poptab[c(2, 3)];
 
 # Merge data frames for final table.
 report_user <- pi %>%
     left_join(subpop %>%
         select("affy_id", "user_specimen_id"),
-        by='user_specimen_id') %>%
+        by="user_specimen_id") %>%
     left_join(df4 %>%
         select("affy_id", "coral_mlg_clonal_id", "DB_match"),
-        by='affy_id') %>%
+        by="affy_id") %>%
     left_join(mi %>%
         select("affy_id", "percent_missing_data_coral"),
-        by='affy_id') %>%
+        by="affy_id") %>%
     left_join(ht %>%
         select("affy_id", "percent_mixed_coral"),
-        by='affy_id') %>%
+        by="affy_id") %>%
     left_join(rA %>%
         select("affy_id", "percent_reference_coral"),
-        by='affy_id') %>%
+        by="affy_id") %>%
     left_join(aB %>%
         select("affy_id", "percent_alternative_coral"),
-        by='affy_id') %>%
+        by="affy_id") %>%
     mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
     mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
     ungroup() %>%
@@ -233,7 +243,7 @@
         select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match",
                "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral",
                "percent_alternative_coral"),
-        by='user_specimen_id');
+        by="user_specimen_id");
 
 # Create vector indicating number of individuals desired
 # made from affy_id collumn of report_user data table.
@@ -241,7 +251,7 @@
 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE];
 
 # Create a phylogeny of samples based on distance matrices.
-cols <- palette(brewer.pal(n=12, name='Set3'));
+cols <- palette(brewer.pal(n=12, name="Set3"));
 set.seed(999);
 # Start PDF device driver.
 dev.new(width=10, height=7);