Mercurial > repos > greg > multilocus_genotype
comparison multilocus_genotype.R @ 17:85f8fc57eee4 draft
Uploaded
| author | greg |
|---|---|
| date | Wed, 17 Apr 2019 09:07:04 -0400 |
| parents | c4ec8727b50c |
| children | 1190ee1456f6 |
comparison
equal
deleted
inserted
replaced
| 16:c4ec8727b50c | 17:85f8fc57eee4 |
|---|---|
| 8 suppressPackageStartupMessages(library("ggplot2")) | 8 suppressPackageStartupMessages(library("ggplot2")) |
| 9 suppressPackageStartupMessages(library("knitr")) | 9 suppressPackageStartupMessages(library("knitr")) |
| 10 suppressPackageStartupMessages(library("optparse")) | 10 suppressPackageStartupMessages(library("optparse")) |
| 11 suppressPackageStartupMessages(library("poppr")) | 11 suppressPackageStartupMessages(library("poppr")) |
| 12 suppressPackageStartupMessages(library("RColorBrewer")) | 12 suppressPackageStartupMessages(library("RColorBrewer")) |
| 13 suppressPackageStartupMessages(library("rnaturalearth")) | |
| 14 suppressPackageStartupMessages(library("rnaturalearthdata")) | |
| 13 suppressPackageStartupMessages(library("RPostgres")) | 15 suppressPackageStartupMessages(library("RPostgres")) |
| 16 suppressPackageStartupMessages(library("sf")) | |
| 17 suppressPackageStartupMessages(library(SNPRelate)) | |
| 14 suppressPackageStartupMessages(library("tidyr")) | 18 suppressPackageStartupMessages(library("tidyr")) |
| 15 suppressPackageStartupMessages(library("vcfR")) | 19 suppressPackageStartupMessages(library("vcfR")) |
| 16 suppressPackageStartupMessages(library("vegan")) | 20 suppressPackageStartupMessages(library("vegan")) |
| 21 suppressPackageStartupMessages(library("yarrr")) | |
| 22 theme_set(theme_bw()) | |
| 17 | 23 |
| 18 option_list <- list( | 24 option_list <- list( |
| 19 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), | 25 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), |
| 20 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), | 26 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), |
| 21 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), | 27 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), |
| 22 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), | 28 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), |
| 23 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file") | 29 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file"), |
| 30 make_option(c("--nj_tree"), action="store", dest="nj_tree", help="neighbor-joining tree output file") | |
| 24 ) | 31 ) |
| 25 | 32 |
| 26 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); | 33 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); |
| 27 args <- parse_args(parser, positional_arguments=TRUE); | 34 args <- parse_args(parser, positional_arguments=TRUE); |
| 28 opt <- args$options; | 35 opt <- args$options; |
| 55 # Read in VCF input file. | 62 # Read in VCF input file. |
| 56 vcf <- read.vcfR(opt$input_vcf); | 63 vcf <- read.vcfR(opt$input_vcf); |
| 57 | 64 |
| 58 # Convert VCF file into a genind for the Poppr package. | 65 # Convert VCF file into a genind for the Poppr package. |
| 59 # TODO: probably should not hard-code 2 cores. | 66 # TODO: probably should not hard-code 2 cores. |
| 60 gl <- vcfR2genlight(vcf, n.cores=2); | 67 # changed to genind format for extracting alleles later |
| 61 gind <- new("genind", (as.matrix(gl))); | 68 # trade-off is it is a bit slower to import data |
| 69 # gl <- vcfR2genlight(vcf, n.cores=2) | |
| 70 # gind <- new("genind", (as.matrix(gl))) | |
| 71 gind <- vcfR2genind(vcf); | |
| 62 | 72 |
| 63 # Add population information to the genind object. | 73 # Add population information to the genind object. |
| 64 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); | 74 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); |
| 65 colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); | 75 colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); |
| 66 gind@pop <- as.factor(poptab$region); | 76 gind@pop <- as.factor(poptab$region); |
| 77 strata(gind)<-data.frame(pop(gind)); | |
| 67 | 78 |
| 68 # Convert genind object to a genclone object. | 79 # Convert genind object to a genclone object. |
| 69 obj2 <- as.genclone(gind); | 80 obj2 <- as.genclone(gind); |
| 70 | 81 |
| 71 # Calculate the bitwise distance between individuals. | 82 # Calculate the bitwise distance between individuals. |
| 72 xdis <- bitwise.dist(obj2); | 83 xdis <- bitwise.dist(obj2); |
| 73 | 84 |
| 74 # Multilocus genotypes (threshold of 16%). | 85 # Multilocus genotypes (threshold of 3.2%). |
| 75 mlg.filter(obj2, distance=xdis) <- 0.016; | 86 # threshold doubled because of how the data is formatted in genind compared to genlight |
| 87 mlg.filter(obj2, distance=xdis) <- 0.032; | |
| 76 m <- mlg.table(obj2, background=TRUE, color=TRUE); | 88 m <- mlg.table(obj2, background=TRUE, color=TRUE); |
| 77 | 89 |
| 78 # Create table of MLGs. | 90 # Create table of MLGs. |
| 79 id <- mlg.id(obj2); | 91 id <- mlg.id(obj2); |
| 80 dt <- data.table(id, keep.rownames=TRUE); | 92 #dt <- data.table(id, keep.rownames=TRUE); |
| 81 setnames(dt, c("id"), c("affy_id")); | 93 #setnames(dt, c("id"), c("affy_id")); |
| 82 | 94 |
| 83 # Read user's Affymetrix 96 well plate tabular file. | 95 # Read user's Affymetrix 96 well plate tabular file. |
| 84 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t"); | 96 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA")); |
| 85 colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", | 97 colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", |
| 86 "region", "latitude", "longitude", "geographic_origin", "sample_location", | 98 "region", "latitude", "longitude", "geographic_origin", "sample_location", |
| 87 "latitude_outplant", "longitude_outplant", "depth", "dist_shore", "disease_resist", | 99 "latitude_outplant", "longitude_outplant", "depth", "disease_resist", |
| 88 "bleach_resist", "mortality","tle", "spawning", "collector_last_name", | 100 "bleach_resist", "mortality","tle", "spawning", "collector_last_name", |
| 89 "collector_first_name", "org", "collection_date", "contact_email", "seq_facility", | 101 "collector_first_name", "organization", "collection_date", "email", "seq_facility", |
| 90 "array_version", "public", "public_after_date", "sperm_motility", "healing_time", | 102 "array_version", "public", "public_after_date", "sperm_motility", "healing_time", |
| 91 "dna_extraction_method", "dna_concentration"); | 103 "dna_extraction_method", "dna_concentration", "registry_id"); |
| 92 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); | 104 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); |
| 93 pinfo2 <- as.character(pinfo$user_specimen_id); | 105 pinfo2 <- as.character(pinfo$user_specimen_id); |
| 94 pi <- data.table(pinfo2); | 106 pi <- data.table(pinfo2, pinfo$field_call); |
| 95 setnames(pi, c("pinfo2"), c("user_specimen_id")); | 107 setnames(pi, c("pinfo2"), c("user_specimen_id")); |
| 108 setnames(pi, c("V2"), c("field_call")); | |
| 96 | 109 |
| 97 # Connect to database. | 110 # Connect to database. |
| 98 conn <- get_database_connection(opt$database_connection_string); | 111 conn <- get_database_connection(opt$database_connection_string); |
| 99 | 112 |
| 100 # Import the sample table. | 113 # Import the sample table. |
| 127 setnames(mi, c("rn"), c("affy_id")); | 140 setnames(mi, c("rn"), c("affy_id")); |
| 128 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); | 141 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); |
| 129 # Round missing data to two digits. | 142 # Round missing data to two digits. |
| 130 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); | 143 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); |
| 131 | 144 |
| 145 #heterozygous alleles | |
| 132 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); | 146 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); |
| 133 hets <- (hets / nrow(vcf)) * 100; | 147 hets <- (hets / nrow(vcf)) * 100; |
| 134 ht <- data.frame(hets); | 148 ht <- data.frame(hets); |
| 135 | 149 |
| 136 # Convert heterozygosity data into data table. | 150 # Convert heterozygosity data into data table. |
| 138 setnames(ht, c("rn"), c("affy_id")); | 152 setnames(ht, c("rn"), c("affy_id")); |
| 139 setnames(ht, c("hets"), c("percent_mixed_coral")); | 153 setnames(ht, c("hets"), c("percent_mixed_coral")); |
| 140 # Round missing data to two digits. | 154 # Round missing data to two digits. |
| 141 ht$percent_mixed<-round(ht$percent_mixed, digits=2); | 155 ht$percent_mixed<-round(ht$percent_mixed, digits=2); |
| 142 | 156 |
| 157 #reference alleles | |
| 143 refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); | 158 refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); |
| 144 refA <- (refA / nrow(vcf)) * 100; | 159 refA <- (refA / nrow(vcf)) * 100; |
| 145 rA <- data.frame(refA); | 160 rA <- data.frame(refA); |
| 146 | 161 |
| 147 # Convert refA data into data.table. | 162 # Convert refA data into data.table. |
| 149 setnames(rA, c("rn"), c("affy_id")); | 164 setnames(rA, c("rn"), c("affy_id")); |
| 150 setnames(rA, c("refA"), c("percent_reference_coral")); | 165 setnames(rA, c("refA"), c("percent_reference_coral")); |
| 151 # round missing data to two digits. | 166 # round missing data to two digits. |
| 152 rA$percent_reference<-round(rA$percent_reference, digits=2); | 167 rA$percent_reference<-round(rA$percent_reference, digits=2); |
| 153 | 168 |
| 169 #alternative alleles | |
| 154 altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); | 170 altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); |
| 155 altB <- (altB / nrow(vcf)) * 100; | 171 altB <- (altB / nrow(vcf)) * 100; |
| 156 aB <- data.frame(altB); | 172 aB <- data.frame(altB); |
| 157 | 173 |
| 158 # Convert altB data into data table. | 174 # Convert altB data into data table. |
| 196 ct <- length(unique(n.g)); | 212 ct <- length(unique(n.g)); |
| 197 | 213 |
| 198 # List of new group ids, the sequence starts at the number of | 214 # List of new group ids, the sequence starts at the number of |
| 199 # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if | 215 # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if |
| 200 # the df4 file contains all ids. If it doesn't then look below | 216 # the df4 file contains all ids. If it doesn't then look below |
| 201 # to change the seq() function. | 217 # to change the seq() function. |
| 202 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); | 218 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); |
| 203 # Pair group with new ids. | 219 # Pair group with new ids. |
| 204 rat <- cbind(unique(n.g), n.g_ids); | 220 rat <- cbind(unique(n.g), n.g_ids); |
| 205 # Assign the new id iteratively for all that have NA. | 221 # Assign the new id iteratively for all that have NA. |
| 206 for (i in 1:length(na.mlg2)) { | 222 for (i in 1:length(na.mlg2)) { |
| 230 left_join(aB %>% | 246 left_join(aB %>% |
| 231 select("affy_id", "percent_alternative_coral"), | 247 select("affy_id", "percent_alternative_coral"), |
| 232 by="affy_id") %>% | 248 by="affy_id") %>% |
| 233 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% | 249 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% |
| 234 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% | 250 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% |
| 251 mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 40 & percent_alternative_coral<= 44.5,"A.palmata","other")) %>% | |
| 252 mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral<= 50,"A.cervicornis",genetic_coral_species_call)) %>% | |
| 253 mutate(genetic_coral_species_call=ifelse(percent_heterozygous_coral > 40,"A.prolifera",genetic_coral_species_call)) %>% | |
| 235 ungroup() %>% | 254 ungroup() %>% |
| 236 select(-group); | 255 select(-group); |
| 237 | 256 |
| 238 write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE); | 257 write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE); |
| 239 | 258 |
| 240 # Combine sample information for database. | 259 # Database tables |
| 241 report_db <- pinfo %>% | 260 ## Sample.table |
| 242 left_join(report_user %>% | 261 sample_db <- pinfo %>% |
| 243 select("user_specimen_id", "affy_id", "coral_mlg_clonal_id", "DB_match", | 262 left_join( |
| 244 "percent_missing_data_coral", "percent_mixed_coral", "percent_reference_coral", | 263 report_user %>% |
| 245 "percent_alternative_coral"), | 264 select("user_specimen_id","affy_id", |
| 246 by="user_specimen_id"); | 265 "percent_missing_data_coral","percent_heterozygous_coral","percent_reference_coral", |
| 247 | 266 "percent_alternative_coral"), |
| 248 # Create vector indicating number of individuals desired | 267 by='user_specimen_id'); |
| 249 # made from affy_id collumn of report_user data table. | 268 |
| 250 i <- report_user[[2]]; | 269 ###representative clone for genotype.table |
| 251 sub96 <- obj2[i, mlg.reset=FALSE, drop=FALSE]; | 270 cc<-clonecorrect(obj2, strata= ~pop.gind.); |
| 271 id_rep<-mlg.id(cc); | |
| 272 dt_cc<-data.table(id_rep,keep.rownames = TRUE); | |
| 273 setnames(dt_cc, c("id_rep"), c("affy_id")); | |
| 274 | |
| 275 ###transform mlg data.table | |
| 276 df_cc <- dt_cc %>% | |
| 277 group_by(row_number()) %>% | |
| 278 rename(group='row_number()') %>% | |
| 279 unnest(affy_id) %>% | |
| 280 left_join(report_user %>% | |
| 281 select("coral_mlg_clonal_id","user_specimen_id","affy_id"), | |
| 282 by='affy_id') %>% | |
| 283 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id),"",affy_id)) %>% | |
| 284 ungroup() %>% | |
| 285 select(-group); | |
| 286 | |
| 287 ##geno.table | |
| 288 geno_db <- df4 %>% | |
| 289 left_join(df_cc %>% | |
| 290 select("affy_id","coral_mlg_rep_sample_id","user_specimen_id"), | |
| 291 by='affy_id') %>% | |
| 292 ungroup() %>% | |
| 293 select(-group); | |
| 294 | |
| 295 ##taxonomy.table | |
| 296 | |
| 297 tax_db <- report_user %>% | |
| 298 select(genetic_coral_species_call, affy_id) %>% | |
| 299 mutate(genus_name =ifelse(genetic_coral_species_call== | |
| 300 genetic_coral_species_call[grep("^A.*",genetic_coral_species_call)],"Acropora","other")) %>% | |
| 301 mutate(species_name=ifelse(genetic_coral_species_call=="A.palmata","palmata","other"))%>% | |
| 302 mutate(species_name=ifelse(genetic_coral_species_call =="A.cervicornis","cervicornis",species_name))%>% | |
| 303 mutate(species_name=ifelse(genetic_coral_species_call=="A.prolifera","prolifera", species_name)); | |
| 304 | |
| 305 | |
| 306 # Table of alleles for the new samples | |
| 307 ## subset to new plate data | |
| 308 ### create vector indicating number of individuals desired | |
| 309 ### made from affy_id collumn from report_user data table | |
| 310 i<-ifelse(is.na(report_user[1]),"",report_user[[1]]); | |
| 311 i<-i[!apply(i == "", 1, all),]; | |
| 312 sub96<-obj2[i, mlg.reset = FALSE, drop = FALSE]; | |
| 313 | |
| 314 # convert to data frame | |
| 315 at_96<-genind2df(sub96, sep=""); | |
| 316 at_96<- at_96 %>% | |
| 317 select(-pop); | |
| 318 | |
| 319 # allele string for Allele.table in database | |
| 320 uat_96<-unite(at_96, alleles, 1:19696, sep = " ", remove = TRUE); | |
| 321 uat_96<-setDT(uat_96, keep.rownames = TRUE)[]; | |
| 322 setnames(uat_96, c("rn"), c("user_specimen_id")); | |
| 323 # write.csv(uat_96,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE); | |
| 252 | 324 |
| 253 # Create a phylogeny of samples based on distance matrices. | 325 # Create a phylogeny of samples based on distance matrices. |
| 254 cols <- palette(brewer.pal(n=12, name="Set3")); | 326 cols <- piratepal("basel"); |
| 255 set.seed(999); | 327 set.seed(999); |
| 256 # Start PDF device driver. | 328 # Start PDF device driver. |
| 257 dev.new(width=10, height=7); | 329 dev.new(width=10, height=7); |
| 258 file_path = get_file_path("nj_phylogeny.pdf"); | 330 file_path = get_file_path("nj_phylogeny.pdf"); |
| 259 pdf(file=file_path, width=10, height=7); | 331 pdf(file=file_path, width=10, height=7); |
| 260 # Organize branches by clade. | 332 # Organize branches by clade. |
| 261 theTree <- sub96 %>% | 333 theTree <- sub96 %>% |
| 262 aboot(dist=provesti.dist, sample=1, tree="nj", cutoff=50, quiet=TRUE) %>% | 334 aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% |
| 263 ladderize(); | 335 ladderize(); |
| 264 theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)]; | 336 theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)]; |
| 265 plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); | 337 plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); |
| 266 # Add a scale bar showing 5% difference.. | 338 # Add a scale bar showing 5% difference.. |
| 267 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); | 339 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); |
| 268 nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); | 340 nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); |
| 269 legend("topright", legend=c("Antigua", "Bahamas", "Belize", "Cuba", "Curacao", "Florida", "PuertoRico", "USVI"), text.col=cols, xpd=T, cex=0.8); | 341 legend("topright", legend=c(levels(sub96$pop)), text.col=cols, xpd=T, cex=0.8); |
| 270 dev.off(); | 342 dev.off() |
| 343 | |
| 344 write.tree(theTree, file =opt$nj_tree, quote=FALSE); | |
| 345 | |
| 346 # identity-by-state analysis | |
| 347 #if (!requireNamespace("BiocManager", quietly = TRUE)) | |
| 348 # install.packages("BiocManager") | |
| 349 #BiocManager::install("SNPRelate", version = "3.8") | |
| 350 | |
| 351 #subset VCF to the user samples | |
| 352 l<-length(i); | |
| 353 n<-ncol(vcf@gt); | |
| 354 s<-n-l; | |
| 355 svcf<-vcf[,s:n]; | |
| 356 write.vcf(svcf, "subset.vcf.gz"); | |
| 357 | |
| 358 vcf.fn <- "subset.vcf.gz"; | |
| 359 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only"); | |
| 360 | |
| 361 genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); | |
| 362 hd<-read.gdsn(index.gdsn(genofile, "sample.id")); | |
| 363 hd<-data.frame(hd); | |
| 364 hd<-setDT(hd, keep.rownames = FALSE)[]; | |
| 365 setnames(hd, c("hd"), c("user_specimen_id")); | |
| 366 | |
| 367 subpop2<- poptab[c(2,4)]; | |
| 368 poptab_sub <- hd %>% | |
| 369 left_join( | |
| 370 subpop2 %>% | |
| 371 select("affy_id","region"), | |
| 372 by='affy_id')%>% | |
| 373 drop_na(); | |
| 374 | |
| 375 samp.annot <- data.frame(pop.group = c(poptab_sub$region)); | |
| 376 add.gdsn(genofile, "sample.annot", samp.annot); | |
| 377 | |
| 378 pop_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); | |
| 379 pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); | |
| 380 | |
| 381 # Identity-By-State Analysis - distance matrix calculation | |
| 382 ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); | |
| 383 | |
| 384 # cluster analysis on the genome-wide IBS pairwise distance matrix | |
| 385 set.seed(100); | |
| 386 par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); | |
| 387 ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE)); | |
| 388 | |
| 389 # default clustering. | |
| 390 dev.new(width=10, height=7); | |
| 391 file_path = get_file_path("IBS_default.pdf"); | |
| 392 pdf (file=file_path, width=10, height=7); | |
| 393 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); | |
| 394 snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular",y.label=0.2); | |
| 395 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); | |
| 396 dev.off() | |
| 397 | |
| 398 # color cluster by region. | |
| 399 dev.new(width=10, height=7); | |
| 400 file_path = get_file_path("IBS_Region.pdf"); | |
| 401 pdf (file=file_path, width=10, height=7); | |
| 402 race <- as.factor(pop_code); | |
| 403 rv2 <- snpgdsCutTree(ibs.hc,samp.group=race,col.list=cols,pch.list=15); | |
| 404 snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular",y.label=0.2); | |
| 405 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); | |
| 406 dev.off() | |
| 407 | |
| 408 #close GDS file | |
| 409 snpgdsClose(genofile); | |
| 410 | |
| 411 # Sample MLG on a map. | |
| 412 world <- ne_countries(scale = "medium", returnclass = "sf"); | |
| 413 class(world); | |
| 414 | |
| 415 pinfo$mlg<-report_user$coral_mlg_clonal_id; | |
| 416 n <- nrow(pinfo); | |
| 417 | |
| 418 mxlat<-max(pinfo$latitude,na.rm = TRUE); | |
| 419 mnlat<-min(pinfo$latitude,na.rm = TRUE); | |
| 420 mxlong<-max(pinfo$longitude,na.rm = TRUE); | |
| 421 mnlong<-min(pinfo$longitude,na.rm = TRUE); | |
| 422 | |
| 423 p5<-ggplot(data = world) + | |
| 424 geom_sf() + | |
| 425 coord_sf(xlim = c(mnlong-3, mxlong+3), ylim = c(mnlat-3,mxlat+3), expand = FALSE); | |
| 426 | |
| 427 colourCount = length(unique(pinfo$mlg)); | |
| 428 getPalette = colorRampPalette(piratepal("basel")); | |
| 429 dev.new(width=10, height=7); | |
| 430 file_path = get_file_path("mlg_map.pdf"); | |
| 431 pdf (file=file_path, width=10, height=7); | |
| 432 p6<-p5+ geom_point(data = pinfo,aes(x =longitude, y=latitude, group=mlg, color = mlg), alpha=.7, size=3)+ | |
| 433 scale_color_manual(values=getPalette(colourCount))+ | |
| 434 theme(legend.position="bottom")+ | |
| 435 guides(color=guide_legend(nrow=8,byrow=F)); | |
| 436 p6; | |
| 437 dev.off() | |
| 271 | 438 |
| 272 # Missing data barplot. | 439 # Missing data barplot. |
| 273 poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)]; | 440 poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)]; |
| 274 test2 <- which(!is.na(poptab$miss)); | 441 test2 <- which(!is.na(poptab$miss)); |
| 275 miss96 <- poptab$miss[test2]; | 442 miss96 <- poptab$miss[test2]; |
| 317 tmp_labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep=""); | 484 tmp_labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep=""); |
| 318 main <- paste("Breakdown of SNP assignments for", tdt1[1, i]); | 485 main <- paste("Breakdown of SNP assignments for", tdt1[1, i]); |
| 319 pie(tdt1_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); | 486 pie(tdt1_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); |
| 320 } | 487 } |
| 321 dev.off() | 488 dev.off() |
| 322 |
