multilocus_genotype: multilocus_genotype.R comparison

comparison multilocus_genotype.R @ 16:c4ec8727b50c draft

Uploaded

author	greg
date	Fri, 15 Feb 2019 10:25:04 -0500
parents	62ad61eac1ff
children	85f8fc57eee4

comparison

equal deleted inserted replaced

-:62ad61eac1ff
+:c4ec8727b50c
 user <- user_pass_items[1];
 pass <- user_pass_items[2];
 host <- host_dbname_items[1];
 dbname <- host_dbname_items[2];
 # FIXME: is there a way to not hard-code the port?
-conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass);
+conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass);
 return (conn);
 }
 # Read in VCF input file.
 vcf <- read.vcfR(opt$input_vcf);
 # Create table of MLGs.
 id <- mlg.id(obj2);
 dt <- data.table(id, keep.rownames=TRUE);
 setnames(dt, c("id"), c("affy_id"));
-# Read user's Affymetrix 96 well plate csv file.
+# Read user's Affymetrix 96 well plate tabular file.
 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t");
-colnames(pinfo) <- c("date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id",
+colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
-"reef", "region", "latitude", "longitude", "geographic_origin",
+"region", "latitude", "longitude", "geographic_origin", "sample_location",
-"sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore",
+"latitude_outplant", "longitude_outplant", "depth", "dist_shore", "disease_resist",
-"disease_resist", "bleach_resist", "mortality","tle", "spawning",
+"bleach_resist", "mortality","tle", "spawning", "collector_last_name",
-"collector_last_name", "collector_first_name", "org", "collection_date", "contact_email",
+"collector_first_name", "org", "collection_date", "contact_email", "seq_facility",
-"seq_facility", "array_version", "public", "public_after_date");
+"array_version", "public", "public_after_date", "sperm_motility", "healing_time",
+"dna_extraction_method", "dna_concentration");
 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
 pinfo2 <- as.character(pinfo$user_specimen_id);
 pi <- data.table(pinfo2);
 setnames(pi, c("pinfo2"), c("user_specimen_id"));
 # Connect to database.
 conn <- get_database_connection(opt$database_connection_string);
 # Import the sample table.
-mD <- tbl(conn, "sample");
+sample_table <- tbl(conn, "sample");
-# Select user_specimen_id and mlg columns.
+# Import the genotype table.
-smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id);
+genotype_table <- tbl(conn, "genotype");
+# Select columns from the sample table and the
+# genotype table joined by genotype_id.
+sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id);
+smlg <- sample_table_columns %>%
+left_join(genotype_table %>%
+select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"),
+by=c("genotype_id" = "id"));
 # Convert to dataframe.
 sm <- data.frame(smlg);
-sm[sm==""] <- NA;
+# Name the columns.
+colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id");
 # Missing GT in samples submitted.
 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
 myMiss <- (myMiss / nrow(vcf)) * 100;
 setnames(dt, c("id"), c("affy_id"));
 # Transform.
 df3 <- dt %>%
 group_by(row_number()) %>%
-dplyr::rename(group='row_number()') %>%
+dplyr::rename(group="row_number()") %>%
 unnest (affy_id) %>%
 # Join with mlg table.
 left_join(sm %>%
 select("affy_id","coral_mlg_clonal_id"),
-by='affy_id');
+by="affy_id");
 # If found in database, group members on previous mlg id.
 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]);
 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),];
 na.mlg <- which(is.na(df3$coral_mlg_clonal_id));
 # List of new group ids, the sequence starts at the number of
 # ids present in df4$coral_mlg_clonal_ids plus 1.  Not sure if
 # the df4 file contains all ids.  If it doesn't then look below
 # to change the seq() function.
 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
-# This is a key for pairing group with new ids.
+# Pair group with new ids.
 rat <- cbind(unique(n.g), n.g_ids);
-# This for loop assigns the new id iteratively for all that have NA.
+# Assign the new id iteratively for all that have NA.
 for (i in 1:length(na.mlg2)) {
 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
 }
-# subset poptab for all samples.
+# Subset poptab for all samples.
 subpop <- poptab[c(2, 3)];
 # Merge data frames for final table.
 report_user <- pi %>%
 left_join(subpop %>%
 select("affy_id", "user_specimen_id"),
-by='user_specimen_id') %>%
+by="user_specimen_id") %>%
 left_join(df4 %>%
 select("affy_id", "coral_mlg_clonal_id", "DB_match"),
-by='affy_id') %>%
+by="affy_id") %>%
 left_join(mi %>%
 select("affy_id", "percent_missing_data_coral"),
-by='affy_id') %>%
+by="affy_id") %>%
 left_join(ht %>%
 select("affy_id", "percent_mixed_coral"),
-by='affy_id') %>%
+by="affy_id") %>%
 left_join(rA %>%
 select("affy_id", "percent_reference_coral"),
-by='affy_id') %>%
+by="affy_id") %>%
 left_join(aB %>%
 select("affy_id", "percent_alternative_coral"),
-by='affy_id') %>%
+by="affy_id") %>%
 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
 ungroup() %>%
 select(-group);
 report_db <- pinfo %>%
 left_join(report_user %>%
 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match",
 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral",
 "percent_alternative_coral"),
-by='user_specimen_id');
+by="user_specimen_id");
 # Create vector indicating number of individuals desired
 # made from affy_id collumn of report_user data table.
 i <- report_user[[2]];
 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE];
 # Create a phylogeny of samples based on distance matrices.
-cols <- palette(brewer.pal(n=12, name='Set3'));
+cols <- palette(brewer.pal(n=12, name="Set3"));
 set.seed(999);
 # Start PDF device driver.
 dev.new(width=10, height=7);
 file_path = get_file_path("nj_phylogeny.pdf");
 pdf(file=file_path, width=10, height=7);

Mercurial > repos > greg > multilocus_genotype

comparison multilocus_genotype.R @ 16:c4ec8727b50c draft