comparison multilocus_genotype.R @ 16:c4ec8727b50c draft

Uploaded
author greg
date Fri, 15 Feb 2019 10:25:04 -0500
parents 62ad61eac1ff
children 85f8fc57eee4
comparison
equal deleted inserted replaced
15:62ad61eac1ff 16:c4ec8727b50c
46 user <- user_pass_items[1]; 46 user <- user_pass_items[1];
47 pass <- user_pass_items[2]; 47 pass <- user_pass_items[2];
48 host <- host_dbname_items[1]; 48 host <- host_dbname_items[1];
49 dbname <- host_dbname_items[2]; 49 dbname <- host_dbname_items[2];
50 # FIXME: is there a way to not hard-code the port? 50 # FIXME: is there a way to not hard-code the port?
51 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass); 51 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass);
52 return (conn); 52 return (conn);
53 } 53 }
54 54
55 # Read in VCF input file. 55 # Read in VCF input file.
56 vcf <- read.vcfR(opt$input_vcf); 56 vcf <- read.vcfR(opt$input_vcf);
78 # Create table of MLGs. 78 # Create table of MLGs.
79 id <- mlg.id(obj2); 79 id <- mlg.id(obj2);
80 dt <- data.table(id, keep.rownames=TRUE); 80 dt <- data.table(id, keep.rownames=TRUE);
81 setnames(dt, c("id"), c("affy_id")); 81 setnames(dt, c("id"), c("affy_id"));
82 82
83 # Read user's Affymetrix 96 well plate csv file. 83 # Read user's Affymetrix 96 well plate tabular file.
84 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t"); 84 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t");
85 colnames(pinfo) <- c("date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", 85 colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
86 "reef", "region", "latitude", "longitude", "geographic_origin", 86 "region", "latitude", "longitude", "geographic_origin", "sample_location",
87 "sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore", 87 "latitude_outplant", "longitude_outplant", "depth", "dist_shore", "disease_resist",
88 "disease_resist", "bleach_resist", "mortality","tle", "spawning", 88 "bleach_resist", "mortality","tle", "spawning", "collector_last_name",
89 "collector_last_name", "collector_first_name", "org", "collection_date", "contact_email", 89 "collector_first_name", "org", "collection_date", "contact_email", "seq_facility",
90 "seq_facility", "array_version", "public", "public_after_date"); 90 "array_version", "public", "public_after_date", "sperm_motility", "healing_time",
91 "dna_extraction_method", "dna_concentration");
91 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); 92 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id);
92 pinfo2 <- as.character(pinfo$user_specimen_id); 93 pinfo2 <- as.character(pinfo$user_specimen_id);
93 pi <- data.table(pinfo2); 94 pi <- data.table(pinfo2);
94 setnames(pi, c("pinfo2"), c("user_specimen_id")); 95 setnames(pi, c("pinfo2"), c("user_specimen_id"));
95 96
96 # Connect to database. 97 # Connect to database.
97 conn <- get_database_connection(opt$database_connection_string); 98 conn <- get_database_connection(opt$database_connection_string);
98 99
99 # Import the sample table. 100 # Import the sample table.
100 mD <- tbl(conn, "sample"); 101 sample_table <- tbl(conn, "sample");
101 102
102 # Select user_specimen_id and mlg columns. 103 # Import the genotype table.
103 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id); 104 genotype_table <- tbl(conn, "genotype");
105
106 # Select columns from the sample table and the
107 # genotype table joined by genotype_id.
108 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id);
109 smlg <- sample_table_columns %>%
110 left_join(genotype_table %>%
111 select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"),
112 by=c("genotype_id" = "id"));
104 113
105 # Convert to dataframe. 114 # Convert to dataframe.
106 sm <- data.frame(smlg); 115 sm <- data.frame(smlg);
107 sm[sm==""] <- NA; 116 # Name the columns.
117 colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id");
108 118
109 # Missing GT in samples submitted. 119 # Missing GT in samples submitted.
110 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); 120 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
111 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); 121 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
112 myMiss <- (myMiss / nrow(vcf)) * 100; 122 myMiss <- (myMiss / nrow(vcf)) * 100;
157 setnames(dt, c("id"), c("affy_id")); 167 setnames(dt, c("id"), c("affy_id"));
158 168
159 # Transform. 169 # Transform.
160 df3 <- dt %>% 170 df3 <- dt %>%
161 group_by(row_number()) %>% 171 group_by(row_number()) %>%
162 dplyr::rename(group='row_number()') %>% 172 dplyr::rename(group="row_number()") %>%
163 unnest (affy_id) %>% 173 unnest (affy_id) %>%
164 # Join with mlg table. 174 # Join with mlg table.
165 left_join(sm %>% 175 left_join(sm %>%
166 select("affy_id","coral_mlg_clonal_id"), 176 select("affy_id","coral_mlg_clonal_id"),
167 by='affy_id'); 177 by="affy_id");
168 178
169 # If found in database, group members on previous mlg id. 179 # If found in database, group members on previous mlg id.
170 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); 180 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]);
171 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; 181 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),];
172 na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); 182 na.mlg <- which(is.na(df3$coral_mlg_clonal_id));
188 # List of new group ids, the sequence starts at the number of 198 # List of new group ids, the sequence starts at the number of
189 # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if 199 # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if
190 # the df4 file contains all ids. If it doesn't then look below 200 # the df4 file contains all ids. If it doesn't then look below
191 # to change the seq() function. 201 # to change the seq() function.
192 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); 202 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
193 # This is a key for pairing group with new ids. 203 # Pair group with new ids.
194 rat <- cbind(unique(n.g), n.g_ids); 204 rat <- cbind(unique(n.g), n.g_ids);
195 # This for loop assigns the new id iteratively for all that have NA. 205 # Assign the new id iteratively for all that have NA.
196 for (i in 1:length(na.mlg2)) { 206 for (i in 1:length(na.mlg2)) {
197 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; 207 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))];
198 } 208 }
199 209
200 # subset poptab for all samples. 210 # Subset poptab for all samples.
201 subpop <- poptab[c(2, 3)]; 211 subpop <- poptab[c(2, 3)];
202 212
203 # Merge data frames for final table. 213 # Merge data frames for final table.
204 report_user <- pi %>% 214 report_user <- pi %>%
205 left_join(subpop %>% 215 left_join(subpop %>%
206 select("affy_id", "user_specimen_id"), 216 select("affy_id", "user_specimen_id"),
207 by='user_specimen_id') %>% 217 by="user_specimen_id") %>%
208 left_join(df4 %>% 218 left_join(df4 %>%
209 select("affy_id", "coral_mlg_clonal_id", "DB_match"), 219 select("affy_id", "coral_mlg_clonal_id", "DB_match"),
210 by='affy_id') %>% 220 by="affy_id") %>%
211 left_join(mi %>% 221 left_join(mi %>%
212 select("affy_id", "percent_missing_data_coral"), 222 select("affy_id", "percent_missing_data_coral"),
213 by='affy_id') %>% 223 by="affy_id") %>%
214 left_join(ht %>% 224 left_join(ht %>%
215 select("affy_id", "percent_mixed_coral"), 225 select("affy_id", "percent_mixed_coral"),
216 by='affy_id') %>% 226 by="affy_id") %>%
217 left_join(rA %>% 227 left_join(rA %>%
218 select("affy_id", "percent_reference_coral"), 228 select("affy_id", "percent_reference_coral"),
219 by='affy_id') %>% 229 by="affy_id") %>%
220 left_join(aB %>% 230 left_join(aB %>%
221 select("affy_id", "percent_alternative_coral"), 231 select("affy_id", "percent_alternative_coral"),
222 by='affy_id') %>% 232 by="affy_id") %>%
223 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% 233 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>%
224 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% 234 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
225 ungroup() %>% 235 ungroup() %>%
226 select(-group); 236 select(-group);
227 237
231 report_db <- pinfo %>% 241 report_db <- pinfo %>%
232 left_join(report_user %>% 242 left_join(report_user %>%
233 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", 243 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match",
234 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", 244 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral",
235 "percent_alternative_coral"), 245 "percent_alternative_coral"),
236 by='user_specimen_id'); 246 by="user_specimen_id");
237 247
238 # Create vector indicating number of individuals desired 248 # Create vector indicating number of individuals desired
239 # made from affy_id collumn of report_user data table. 249 # made from affy_id collumn of report_user data table.
240 i <- report_user[[2]]; 250 i <- report_user[[2]];
241 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; 251 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE];
242 252
243 # Create a phylogeny of samples based on distance matrices. 253 # Create a phylogeny of samples based on distance matrices.
244 cols <- palette(brewer.pal(n=12, name='Set3')); 254 cols <- palette(brewer.pal(n=12, name="Set3"));
245 set.seed(999); 255 set.seed(999);
246 # Start PDF device driver. 256 # Start PDF device driver.
247 dev.new(width=10, height=7); 257 dev.new(width=10, height=7);
248 file_path = get_file_path("nj_phylogeny.pdf"); 258 file_path = get_file_path("nj_phylogeny.pdf");
249 pdf(file=file_path, width=10, height=7); 259 pdf(file=file_path, width=10, height=7);