Mercurial > repos > greg > multilocus_genotype
changeset 16:c4ec8727b50c draft
Uploaded
author | greg |
---|---|
date | Fri, 15 Feb 2019 10:25:04 -0500 |
parents | 62ad61eac1ff |
children | 85f8fc57eee4 |
files | multilocus_genotype.R |
diffstat | 1 files changed, 35 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/multilocus_genotype.R Thu Dec 20 11:29:13 2018 -0500 +++ b/multilocus_genotype.R Fri Feb 15 10:25:04 2019 -0500 @@ -48,7 +48,7 @@ host <- host_dbname_items[1]; dbname <- host_dbname_items[2]; # FIXME: is there a way to not hard-code the port? - conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass); + conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass); return (conn); } @@ -80,14 +80,15 @@ dt <- data.table(id, keep.rownames=TRUE); setnames(dt, c("id"), c("affy_id")); -# Read user's Affymetrix 96 well plate csv file. +# Read user's Affymetrix 96 well plate tabular file. pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t"); -colnames(pinfo) <- c("date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", - "reef", "region", "latitude", "longitude", "geographic_origin", - "sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore", - "disease_resist", "bleach_resist", "mortality","tle", "spawning", - "collector_last_name", "collector_first_name", "org", "collection_date", "contact_email", - "seq_facility", "array_version", "public", "public_after_date"); +colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", + "region", "latitude", "longitude", "geographic_origin", "sample_location", + "latitude_outplant", "longitude_outplant", "depth", "dist_shore", "disease_resist", + "bleach_resist", "mortality","tle", "spawning", "collector_last_name", + "collector_first_name", "org", "collection_date", "contact_email", "seq_facility", + "array_version", "public", "public_after_date", "sperm_motility", "healing_time", + "dna_extraction_method", "dna_concentration"); pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); pinfo2 <- as.character(pinfo$user_specimen_id); pi <- data.table(pinfo2); @@ -97,14 +98,23 @@ conn <- get_database_connection(opt$database_connection_string); # Import the sample table. -mD <- tbl(conn, "sample"); +sample_table <- tbl(conn, "sample"); + +# Import the genotype table. +genotype_table <- tbl(conn, "genotype"); -# Select user_specimen_id and mlg columns. -smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id); +# Select columns from the sample table and the +# genotype table joined by genotype_id. +sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id); +smlg <- sample_table_columns %>% + left_join(genotype_table %>% + select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"), + by=c("genotype_id" = "id")); # Convert to dataframe. sm <- data.frame(smlg); -sm[sm==""] <- NA; +# Name the columns. +colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"); # Missing GT in samples submitted. gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); @@ -159,12 +169,12 @@ # Transform. df3 <- dt %>% group_by(row_number()) %>% - dplyr::rename(group='row_number()') %>% + dplyr::rename(group="row_number()") %>% unnest (affy_id) %>% # Join with mlg table. left_join(sm %>% select("affy_id","coral_mlg_clonal_id"), - by='affy_id'); + by="affy_id"); # If found in database, group members on previous mlg id. uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); @@ -190,36 +200,36 @@ # the df4 file contains all ids. If it doesn't then look below # to change the seq() function. n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); -# This is a key for pairing group with new ids. +# Pair group with new ids. rat <- cbind(unique(n.g), n.g_ids); -# This for loop assigns the new id iteratively for all that have NA. +# Assign the new id iteratively for all that have NA. for (i in 1:length(na.mlg2)) { df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; } -# subset poptab for all samples. +# Subset poptab for all samples. subpop <- poptab[c(2, 3)]; # Merge data frames for final table. report_user <- pi %>% left_join(subpop %>% select("affy_id", "user_specimen_id"), - by='user_specimen_id') %>% + by="user_specimen_id") %>% left_join(df4 %>% select("affy_id", "coral_mlg_clonal_id", "DB_match"), - by='affy_id') %>% + by="affy_id") %>% left_join(mi %>% select("affy_id", "percent_missing_data_coral"), - by='affy_id') %>% + by="affy_id") %>% left_join(ht %>% select("affy_id", "percent_mixed_coral"), - by='affy_id') %>% + by="affy_id") %>% left_join(rA %>% select("affy_id", "percent_reference_coral"), - by='affy_id') %>% + by="affy_id") %>% left_join(aB %>% select("affy_id", "percent_alternative_coral"), - by='affy_id') %>% + by="affy_id") %>% mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% ungroup() %>% @@ -233,7 +243,7 @@ select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", "percent_alternative_coral"), - by='user_specimen_id'); + by="user_specimen_id"); # Create vector indicating number of individuals desired # made from affy_id collumn of report_user data table. @@ -241,7 +251,7 @@ sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; # Create a phylogeny of samples based on distance matrices. -cols <- palette(brewer.pal(n=12, name='Set3')); +cols <- palette(brewer.pal(n=12, name="Set3")); set.seed(999); # Start PDF device driver. dev.new(width=10, height=7);