Mercurial > repos > greg > multilocus_genotype
comparison multilocus_genotype.R @ 16:c4ec8727b50c draft
Uploaded
author | greg |
---|---|
date | Fri, 15 Feb 2019 10:25:04 -0500 |
parents | 62ad61eac1ff |
children | 85f8fc57eee4 |
comparison
equal
deleted
inserted
replaced
15:62ad61eac1ff | 16:c4ec8727b50c |
---|---|
46 user <- user_pass_items[1]; | 46 user <- user_pass_items[1]; |
47 pass <- user_pass_items[2]; | 47 pass <- user_pass_items[2]; |
48 host <- host_dbname_items[1]; | 48 host <- host_dbname_items[1]; |
49 dbname <- host_dbname_items[2]; | 49 dbname <- host_dbname_items[2]; |
50 # FIXME: is there a way to not hard-code the port? | 50 # FIXME: is there a way to not hard-code the port? |
51 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass); | 51 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass); |
52 return (conn); | 52 return (conn); |
53 } | 53 } |
54 | 54 |
55 # Read in VCF input file. | 55 # Read in VCF input file. |
56 vcf <- read.vcfR(opt$input_vcf); | 56 vcf <- read.vcfR(opt$input_vcf); |
78 # Create table of MLGs. | 78 # Create table of MLGs. |
79 id <- mlg.id(obj2); | 79 id <- mlg.id(obj2); |
80 dt <- data.table(id, keep.rownames=TRUE); | 80 dt <- data.table(id, keep.rownames=TRUE); |
81 setnames(dt, c("id"), c("affy_id")); | 81 setnames(dt, c("id"), c("affy_id")); |
82 | 82 |
83 # Read user's Affymetrix 96 well plate csv file. | 83 # Read user's Affymetrix 96 well plate tabular file. |
84 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t"); | 84 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t"); |
85 colnames(pinfo) <- c("date_entered_db", "user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", | 85 colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", |
86 "reef", "region", "latitude", "longitude", "geographic_origin", | 86 "region", "latitude", "longitude", "geographic_origin", "sample_location", |
87 "sample_location", "latitude_outplant", "longitude_outplant", "depth", "dist_shore", | 87 "latitude_outplant", "longitude_outplant", "depth", "dist_shore", "disease_resist", |
88 "disease_resist", "bleach_resist", "mortality","tle", "spawning", | 88 "bleach_resist", "mortality","tle", "spawning", "collector_last_name", |
89 "collector_last_name", "collector_first_name", "org", "collection_date", "contact_email", | 89 "collector_first_name", "org", "collection_date", "contact_email", "seq_facility", |
90 "seq_facility", "array_version", "public", "public_after_date"); | 90 "array_version", "public", "public_after_date", "sperm_motility", "healing_time", |
91 "dna_extraction_method", "dna_concentration"); | |
91 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); | 92 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); |
92 pinfo2 <- as.character(pinfo$user_specimen_id); | 93 pinfo2 <- as.character(pinfo$user_specimen_id); |
93 pi <- data.table(pinfo2); | 94 pi <- data.table(pinfo2); |
94 setnames(pi, c("pinfo2"), c("user_specimen_id")); | 95 setnames(pi, c("pinfo2"), c("user_specimen_id")); |
95 | 96 |
96 # Connect to database. | 97 # Connect to database. |
97 conn <- get_database_connection(opt$database_connection_string); | 98 conn <- get_database_connection(opt$database_connection_string); |
98 | 99 |
99 # Import the sample table. | 100 # Import the sample table. |
100 mD <- tbl(conn, "sample"); | 101 sample_table <- tbl(conn, "sample"); |
101 | 102 |
102 # Select user_specimen_id and mlg columns. | 103 # Import the genotype table. |
103 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id, affy_id); | 104 genotype_table <- tbl(conn, "genotype"); |
105 | |
106 # Select columns from the sample table and the | |
107 # genotype table joined by genotype_id. | |
108 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id); | |
109 smlg <- sample_table_columns %>% | |
110 left_join(genotype_table %>% | |
111 select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"), | |
112 by=c("genotype_id" = "id")); | |
104 | 113 |
105 # Convert to dataframe. | 114 # Convert to dataframe. |
106 sm <- data.frame(smlg); | 115 sm <- data.frame(smlg); |
107 sm[sm==""] <- NA; | 116 # Name the columns. |
117 colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"); | |
108 | 118 |
109 # Missing GT in samples submitted. | 119 # Missing GT in samples submitted. |
110 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); | 120 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); |
111 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); | 121 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); |
112 myMiss <- (myMiss / nrow(vcf)) * 100; | 122 myMiss <- (myMiss / nrow(vcf)) * 100; |
157 setnames(dt, c("id"), c("affy_id")); | 167 setnames(dt, c("id"), c("affy_id")); |
158 | 168 |
159 # Transform. | 169 # Transform. |
160 df3 <- dt %>% | 170 df3 <- dt %>% |
161 group_by(row_number()) %>% | 171 group_by(row_number()) %>% |
162 dplyr::rename(group='row_number()') %>% | 172 dplyr::rename(group="row_number()") %>% |
163 unnest (affy_id) %>% | 173 unnest (affy_id) %>% |
164 # Join with mlg table. | 174 # Join with mlg table. |
165 left_join(sm %>% | 175 left_join(sm %>% |
166 select("affy_id","coral_mlg_clonal_id"), | 176 select("affy_id","coral_mlg_clonal_id"), |
167 by='affy_id'); | 177 by="affy_id"); |
168 | 178 |
169 # If found in database, group members on previous mlg id. | 179 # If found in database, group members on previous mlg id. |
170 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); | 180 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); |
171 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; | 181 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; |
172 na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); | 182 na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); |
188 # List of new group ids, the sequence starts at the number of | 198 # List of new group ids, the sequence starts at the number of |
189 # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if | 199 # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if |
190 # the df4 file contains all ids. If it doesn't then look below | 200 # the df4 file contains all ids. If it doesn't then look below |
191 # to change the seq() function. | 201 # to change the seq() function. |
192 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); | 202 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); |
193 # This is a key for pairing group with new ids. | 203 # Pair group with new ids. |
194 rat <- cbind(unique(n.g), n.g_ids); | 204 rat <- cbind(unique(n.g), n.g_ids); |
195 # This for loop assigns the new id iteratively for all that have NA. | 205 # Assign the new id iteratively for all that have NA. |
196 for (i in 1:length(na.mlg2)) { | 206 for (i in 1:length(na.mlg2)) { |
197 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; | 207 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; |
198 } | 208 } |
199 | 209 |
200 # subset poptab for all samples. | 210 # Subset poptab for all samples. |
201 subpop <- poptab[c(2, 3)]; | 211 subpop <- poptab[c(2, 3)]; |
202 | 212 |
203 # Merge data frames for final table. | 213 # Merge data frames for final table. |
204 report_user <- pi %>% | 214 report_user <- pi %>% |
205 left_join(subpop %>% | 215 left_join(subpop %>% |
206 select("affy_id", "user_specimen_id"), | 216 select("affy_id", "user_specimen_id"), |
207 by='user_specimen_id') %>% | 217 by="user_specimen_id") %>% |
208 left_join(df4 %>% | 218 left_join(df4 %>% |
209 select("affy_id", "coral_mlg_clonal_id", "DB_match"), | 219 select("affy_id", "coral_mlg_clonal_id", "DB_match"), |
210 by='affy_id') %>% | 220 by="affy_id") %>% |
211 left_join(mi %>% | 221 left_join(mi %>% |
212 select("affy_id", "percent_missing_data_coral"), | 222 select("affy_id", "percent_missing_data_coral"), |
213 by='affy_id') %>% | 223 by="affy_id") %>% |
214 left_join(ht %>% | 224 left_join(ht %>% |
215 select("affy_id", "percent_mixed_coral"), | 225 select("affy_id", "percent_mixed_coral"), |
216 by='affy_id') %>% | 226 by="affy_id") %>% |
217 left_join(rA %>% | 227 left_join(rA %>% |
218 select("affy_id", "percent_reference_coral"), | 228 select("affy_id", "percent_reference_coral"), |
219 by='affy_id') %>% | 229 by="affy_id") %>% |
220 left_join(aB %>% | 230 left_join(aB %>% |
221 select("affy_id", "percent_alternative_coral"), | 231 select("affy_id", "percent_alternative_coral"), |
222 by='affy_id') %>% | 232 by="affy_id") %>% |
223 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% | 233 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% |
224 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% | 234 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% |
225 ungroup() %>% | 235 ungroup() %>% |
226 select(-group); | 236 select(-group); |
227 | 237 |
231 report_db <- pinfo %>% | 241 report_db <- pinfo %>% |
232 left_join(report_user %>% | 242 left_join(report_user %>% |
233 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", | 243 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", |
234 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", | 244 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", |
235 "percent_alternative_coral"), | 245 "percent_alternative_coral"), |
236 by='user_specimen_id'); | 246 by="user_specimen_id"); |
237 | 247 |
238 # Create vector indicating number of individuals desired | 248 # Create vector indicating number of individuals desired |
239 # made from affy_id collumn of report_user data table. | 249 # made from affy_id collumn of report_user data table. |
240 i <- report_user[[2]]; | 250 i <- report_user[[2]]; |
241 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; | 251 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; |
242 | 252 |
243 # Create a phylogeny of samples based on distance matrices. | 253 # Create a phylogeny of samples based on distance matrices. |
244 cols <- palette(brewer.pal(n=12, name='Set3')); | 254 cols <- palette(brewer.pal(n=12, name="Set3")); |
245 set.seed(999); | 255 set.seed(999); |
246 # Start PDF device driver. | 256 # Start PDF device driver. |
247 dev.new(width=10, height=7); | 257 dev.new(width=10, height=7); |
248 file_path = get_file_path("nj_phylogeny.pdf"); | 258 file_path = get_file_path("nj_phylogeny.pdf"); |
249 pdf(file=file_path, width=10, height=7); | 259 pdf(file=file_path, width=10, height=7); |