Mercurial > repos > greg > multilocus_genotype
comparison multilocus_genotype.R @ 18:1190ee1456f6 draft default tip
Uploaded
author | greg |
---|---|
date | Mon, 13 May 2019 08:37:37 -0400 |
parents | 85f8fc57eee4 |
children |
comparison
equal
deleted
inserted
replaced
17:85f8fc57eee4 | 18:1190ee1456f6 |
---|---|
5 suppressPackageStartupMessages(library("data.table")) | 5 suppressPackageStartupMessages(library("data.table")) |
6 suppressPackageStartupMessages(library("dbplyr")) | 6 suppressPackageStartupMessages(library("dbplyr")) |
7 suppressPackageStartupMessages(library("dplyr")) | 7 suppressPackageStartupMessages(library("dplyr")) |
8 suppressPackageStartupMessages(library("ggplot2")) | 8 suppressPackageStartupMessages(library("ggplot2")) |
9 suppressPackageStartupMessages(library("knitr")) | 9 suppressPackageStartupMessages(library("knitr")) |
10 suppressPackageStartupMessages(library("maps")) | |
11 suppressPackageStartupMessages(library("mapproj")) | |
10 suppressPackageStartupMessages(library("optparse")) | 12 suppressPackageStartupMessages(library("optparse")) |
11 suppressPackageStartupMessages(library("poppr")) | 13 suppressPackageStartupMessages(library("poppr")) |
12 suppressPackageStartupMessages(library("RColorBrewer")) | 14 suppressPackageStartupMessages(library("RColorBrewer")) |
13 suppressPackageStartupMessages(library("rnaturalearth")) | |
14 suppressPackageStartupMessages(library("rnaturalearthdata")) | |
15 suppressPackageStartupMessages(library("RPostgres")) | 15 suppressPackageStartupMessages(library("RPostgres")) |
16 suppressPackageStartupMessages(library("sf")) | 16 suppressPackageStartupMessages(library("SNPRelate")) |
17 suppressPackageStartupMessages(library(SNPRelate)) | |
18 suppressPackageStartupMessages(library("tidyr")) | 17 suppressPackageStartupMessages(library("tidyr")) |
19 suppressPackageStartupMessages(library("vcfR")) | 18 suppressPackageStartupMessages(library("vcfR")) |
20 suppressPackageStartupMessages(library("vegan")) | 19 suppressPackageStartupMessages(library("vegan")) |
21 suppressPackageStartupMessages(library("yarrr")) | 20 suppressPackageStartupMessages(library("yarrr")) |
22 theme_set(theme_bw()) | 21 theme_set(theme_bw()) |
24 option_list <- list( | 23 option_list <- list( |
25 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), | 24 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), |
26 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), | 25 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), |
27 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), | 26 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), |
28 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), | 27 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), |
29 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="stag db report output file"), | 28 make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"), |
30 make_option(c("--nj_tree"), action="store", dest="nj_tree", help="neighbor-joining tree output file") | 29 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file") |
31 ) | 30 ) |
32 | 31 |
33 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); | 32 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); |
34 args <- parse_args(parser, positional_arguments=TRUE); | 33 args <- parse_args(parser, positional_arguments=TRUE); |
35 opt <- args$options; | 34 opt <- args$options; |
57 # FIXME: is there a way to not hard-code the port? | 56 # FIXME: is there a way to not hard-code the port? |
58 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass); | 57 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass); |
59 return (conn); | 58 return (conn); |
60 } | 59 } |
61 | 60 |
61 time_elapsed <- function(start_time) { | |
62 cat("Elapsed time: ", proc.time() - start_time, "\n\n"); | |
63 } | |
64 | |
65 time_start <- function(msg) { | |
66 start_time <- proc.time(); | |
67 cat(msg, "...\n"); | |
68 return(start_time); | |
69 } | |
70 | |
62 # Read in VCF input file. | 71 # Read in VCF input file. |
72 start_time <- time_start("Reading VCF input"); | |
63 vcf <- read.vcfR(opt$input_vcf); | 73 vcf <- read.vcfR(opt$input_vcf); |
74 time_elapsed(start_time); | |
64 | 75 |
65 # Convert VCF file into a genind for the Poppr package. | 76 # Convert VCF file into a genind for the Poppr package. |
66 # TODO: probably should not hard-code 2 cores. | 77 start_time <- time_start("Converting VCF data to a genind object"); |
67 # changed to genind format for extracting alleles later | 78 genind_obj <- vcfR2genind(vcf); |
68 # trade-off is it is a bit slower to import data | 79 time_elapsed(start_time); |
69 # gl <- vcfR2genlight(vcf, n.cores=2) | |
70 # gind <- new("genind", (as.matrix(gl))) | |
71 gind <- vcfR2genind(vcf); | |
72 | 80 |
73 # Add population information to the genind object. | 81 # Add population information to the genind object. |
74 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); | 82 population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t"); |
75 colnames(poptab) <- c("row_id", "affy_id", "user_specimen_id", "region"); | 83 colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region"); |
76 gind@pop <- as.factor(poptab$region); | 84 genind_obj@pop <- as.factor(population_info_data_table$region); |
77 strata(gind)<-data.frame(pop(gind)); | 85 strata(genind_obj) <- data.frame(pop(genind_obj)); |
78 | 86 |
79 # Convert genind object to a genclone object. | 87 # Convert genind object to a genclone object. |
80 obj2 <- as.genclone(gind); | 88 start_time <- time_start("Converting the genind object to a genclone object"); |
89 genind_clone <- as.genclone(genind_obj); | |
90 time_elapsed(start_time); | |
81 | 91 |
82 # Calculate the bitwise distance between individuals. | 92 # Calculate the bitwise distance between individuals. |
83 xdis <- bitwise.dist(obj2); | 93 start_time <- time_start("Calculating the bitwise distance between individuals"); |
94 bitwise_distance <- bitwise.dist(genind_clone); | |
95 time_elapsed(start_time); | |
84 | 96 |
85 # Multilocus genotypes (threshold of 3.2%). | 97 # Multilocus genotypes (threshold of 3.2%). |
86 # threshold doubled because of how the data is formatted in genind compared to genlight | 98 mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032; |
87 mlg.filter(obj2, distance=xdis) <- 0.032; | 99 m <- mlg.table(genind_clone, background=TRUE, color=TRUE); |
88 m <- mlg.table(obj2, background=TRUE, color=TRUE); | 100 |
89 | 101 # Create list of MLGs. |
90 # Create table of MLGs. | 102 mlg_ids <- mlg.id(genind_clone); |
91 id <- mlg.id(obj2); | |
92 #dt <- data.table(id, keep.rownames=TRUE); | |
93 #setnames(dt, c("id"), c("affy_id")); | |
94 | 103 |
95 # Read user's Affymetrix 96 well plate tabular file. | 104 # Read user's Affymetrix 96 well plate tabular file. |
96 pinfo <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA")); | 105 affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings = c("", "NA")); |
97 colnames(pinfo) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", | 106 colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", |
98 "region", "latitude", "longitude", "geographic_origin", "sample_location", | 107 "region", "latitude", "longitude", "geographic_origin", "sample_location", |
99 "latitude_outplant", "longitude_outplant", "depth", "disease_resist", | 108 "latitude_outplant", "longitude_outplant", "depth", "disease_resist", |
100 "bleach_resist", "mortality","tle", "spawning", "collector_last_name", | 109 "bleach_resist", "mortality","tle", "spawning", "collector_last_name", |
101 "collector_first_name", "organization", "collection_date", "email", "seq_facility", | 110 "collector_first_name", "organization", "collection_date", "email", "seq_facility", |
102 "array_version", "public", "public_after_date", "sperm_motility", "healing_time", | 111 "array_version", "public", "public_after_date", "sperm_motility", "healing_time", |
103 "dna_extraction_method", "dna_concentration", "registry_id"); | 112 "dna_extraction_method", "dna_concentration", "registry_id"); |
104 pinfo$user_specimen_id <- as.character(pinfo$user_specimen_id); | 113 affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id); |
105 pinfo2 <- as.character(pinfo$user_specimen_id); | 114 user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id); |
106 pi <- data.table(pinfo2, pinfo$field_call); | 115 # The specimen_id_field_call_data_table looks like this: |
107 setnames(pi, c("pinfo2"), c("user_specimen_id")); | 116 # user_specimen_ids V2 |
108 setnames(pi, c("V2"), c("field_call")); | 117 # test_002 prolifera |
118 # test_003 prolifera | |
119 specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call); | |
120 # Rename the user_specimen_ids column. | |
121 setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id")); | |
122 # Rename the V2 column. | |
123 setnames(specimen_id_field_call_data_table, c("V2"), c("field_call")); | |
109 | 124 |
110 # Connect to database. | 125 # Connect to database. |
111 conn <- get_database_connection(opt$database_connection_string); | 126 conn <- get_database_connection(opt$database_connection_string); |
112 | |
113 # Import the sample table. | 127 # Import the sample table. |
114 sample_table <- tbl(conn, "sample"); | 128 sample_table <- tbl(conn, "sample"); |
115 | |
116 # Import the genotype table. | 129 # Import the genotype table. |
117 genotype_table <- tbl(conn, "genotype"); | 130 genotype_table <- tbl(conn, "genotype"); |
118 | |
119 # Select columns from the sample table and the | 131 # Select columns from the sample table and the |
120 # genotype table joined by genotype_id. | 132 # genotype table joined by genotype_id. |
121 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id); | 133 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, genotype_id); |
122 smlg <- sample_table_columns %>% | 134 smlg <- sample_table_columns %>% |
123 left_join(genotype_table %>% | 135 left_join(genotype_table %>% |
124 select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"), | 136 select("id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"), |
125 by=c("genotype_id" = "id")); | 137 by=c("genotype_id"="id")); |
126 | |
127 # Convert to dataframe. | 138 # Convert to dataframe. |
128 sm <- data.frame(smlg); | 139 smlg_data_frame <- data.frame(smlg); |
129 # Name the columns. | 140 # Name the columns. |
130 colnames(sm) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"); | 141 colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "genotype_id", "coral_mlg_clonal_id", "symbio_mlg_clonal_id"); |
131 | 142 |
132 # Missing GT in samples submitted. | 143 # Missing GT in samples submitted. |
144 start_time <- time_start("Discovering missing GT in samples"); | |
133 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); | 145 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); |
134 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); | 146 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); |
135 myMiss <- (myMiss / nrow(vcf)) * 100; | 147 missing_gt <- (missing_gt / nrow(vcf)) * 100; |
136 miss <- data.frame(myMiss); | 148 missing_gt_data_frame <- data.frame(missing_gt); |
137 | 149 # The specimen_id_field_call_data_table looks like this: |
138 # Convert missing data into data table. | 150 # rn missing_gt |
139 mi <-setDT(miss, keep.rownames=TRUE)[]; | 151 # a100000-4368120-060520-256_I07.CEL 0.06092608 |
140 setnames(mi, c("rn"), c("affy_id")); | 152 # a100000-4368120-060520-256_K07.CEL 0.05077173 |
141 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); | 153 missing_gt_data_table <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[]; |
142 # Round missing data to two digits. | 154 # Rename the rn column. |
143 mi$percent_missing_data_coral <- round(mi$percent_missing_data_coral, digits=2); | 155 setnames(missing_gt_data_table, c("rn"), c("affy_id")); |
144 | 156 # Rename the missing_gt column. |
145 #heterozygous alleles | 157 setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral")); |
146 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); | 158 # Round data to two digits. |
147 hets <- (hets / nrow(vcf)) * 100; | 159 missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2); |
148 ht <- data.frame(hets); | 160 time_elapsed(start_time); |
149 | 161 |
150 # Convert heterozygosity data into data table. | 162 # Heterozygous alleles. |
151 ht <-setDT(ht, keep.rownames=TRUE)[]; | 163 start_time <- time_start("Discovering heterozygous alleles"); |
152 setnames(ht, c("rn"), c("affy_id")); | 164 heterozygous_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))}); |
153 setnames(ht, c("hets"), c("percent_mixed_coral")); | 165 heterozygous_alleles <- (heterozygous_alleles / nrow(vcf)) * 100; |
154 # Round missing data to two digits. | 166 heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles); |
155 ht$percent_mixed<-round(ht$percent_mixed, digits=2); | 167 # The heterozygous_alleles_data_table looks like this: |
156 | 168 # rn heterozygous_alleles |
157 #reference alleles | 169 # a100000-4368120-060520-256_I07.CEL 73.94903 |
158 refA <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))} ); | 170 # a100000-4368120-060520-256_K07.CEL 74.40089 |
159 refA <- (refA / nrow(vcf)) * 100; | 171 heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[]; |
160 rA <- data.frame(refA); | 172 # Rename the rn column. |
161 | 173 setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id")); |
162 # Convert refA data into data.table. | 174 # Rename the heterozygous_alleles column. |
163 rA <-setDT(rA, keep.rownames=TRUE)[]; | 175 setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral")); |
164 setnames(rA, c("rn"), c("affy_id")); | 176 # Round data to two digits. |
165 setnames(rA, c("refA"), c("percent_reference_coral")); | 177 heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2); |
166 # round missing data to two digits. | 178 time_elapsed(start_time); |
167 rA$percent_reference<-round(rA$percent_reference, digits=2); | 179 |
168 | 180 # Reference alleles. |
169 #alternative alleles | 181 start_time <- time_start("Discovering reference alleles"); |
170 altB <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))} ); | 182 reference_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))}); |
171 altB <- (altB / nrow(vcf)) * 100; | 183 reference_alleles <- (reference_alleles / nrow(vcf)) * 100; |
172 aB <- data.frame(altB); | 184 reference_alleles_data_frame <- data.frame(reference_alleles); |
173 | 185 # The reference_alleles_data_table looks like this: |
174 # Convert altB data into data table. | 186 # rn reference_alleles |
175 aB <-setDT(aB, keep.rownames=TRUE)[]; | 187 # a100000-4368120-060520-256_I07.CEL 11.60642 |
176 setnames(aB, c("rn"), c("affy_id")); | 188 # a100000-4368120-060520-256_K07.CEL 11.45918 |
177 setnames(aB, c("altB"), c("percent_alternative_coral")); | 189 reference_alleles_data_table <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[]; |
178 # Round missing data to two digits. | 190 # Rename the rn column. |
179 aB$percent_alternative<-round(aB$percent_alternative, digits=2); | 191 setnames(reference_alleles_data_table, c("rn"), c("affy_id")); |
180 | 192 # Rename the reference_alleles column. |
181 #convert mlg id to data.table format | 193 setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral")); |
182 dt <- data.table(id, keep.rownames=TRUE); | 194 # Round data to two digits. |
183 setnames(dt, c("id"), c("affy_id")); | 195 reference_alleles_data_table$percent_reference <- round(reference_alleles_data_table$percent_reference, digits=2); |
184 | 196 time_elapsed(start_time); |
185 # Transform. | 197 |
186 df3 <- dt %>% | 198 # Alternative alleles |
199 start_time <- time_start("Discovering alternative alleles"); | |
200 alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))}); | |
201 alternative_alleles <- (alternative_alleles / nrow(vcf)) * 100; | |
202 alternative_alleles_data_frame <- data.frame(alternative_alleles); | |
203 # The alternative_alleles_data_table looks like this: | |
204 # rn alternative_alleles | |
205 # a100000-4368120-060520-256_I07.CEL 14.38363 | |
206 # a100000-4368120-060520-256_K07.CEL 14.08916 | |
207 alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[]; | |
208 # Rename the rn column. | |
209 setnames(alternative_alleles_data_table, c("rn"), c("affy_id")); | |
210 # Rename the alternative_alleles column. | |
211 setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral")); | |
212 # Round data to two digits. | |
213 alternative_alleles_data_table$percent_alternative <- round(alternative_alleles_data_table$percent_alternative, digits=2); | |
214 time_elapsed(start_time); | |
215 | |
216 # The mlg_ids_data_table looks like this: | |
217 # mlg_ids | |
218 # a550962-4368120-060520-500_M23.CEL | |
219 # a550962-4368120-060520-256_A19.CEL | |
220 mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE); | |
221 # Rename the mlg_ids column. | |
222 setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id")); | |
223 | |
224 # sample_mlg_tibble looks like this: | |
225 # A tibble: 262 x 3 | |
226 # Groups: group [?] | |
227 # group affy_id coral_mlg_clonal_id | |
228 # <int> <chr> <chr> | |
229 # 1 a550962-4368120-060520-500_M23.CEL NA | |
230 # 2 a550962-4368120-060520-256_A19.CEL HG0006 | |
231 sample_mlg_tibble <- mlg_ids_data_table %>% | |
187 group_by(row_number()) %>% | 232 group_by(row_number()) %>% |
188 dplyr::rename(group="row_number()") %>% | 233 dplyr::rename(group="row_number()") %>% |
189 unnest (affy_id) %>% | 234 unnest (affy_id) %>% |
190 # Join with mlg table. | 235 # Join with mlg table. |
191 left_join(sm %>% | 236 left_join(smlg_data_frame %>% |
192 select("affy_id","coral_mlg_clonal_id"), | 237 select("affy_id","coral_mlg_clonal_id"), |
193 by="affy_id"); | 238 by="affy_id"); |
194 | 239 |
195 # If found in database, group members on previous mlg id. | 240 # If found in database, group members on previous mlg id. |
196 uniques <- unique(df3[c("group", "coral_mlg_clonal_id")]); | 241 uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]); |
197 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; | 242 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; |
198 na.mlg <- which(is.na(df3$coral_mlg_clonal_id)); | 243 na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id)); |
199 na.group <- df3$group[na.mlg]; | 244 na.group <- sample_mlg_tibble$group[na.mlg]; |
200 df3$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; | 245 sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; |
201 | 246 |
202 # Determine if the sample mlg matched previous genotyped sample. | 247 # Find out if the sample mlg matched a previous genotyped sample. |
203 df4<- df3 %>% | 248 # sample_mlg_match_tibble looks like this: |
249 # A tibble: 262 x 4 | |
250 # Groups: group [230] | |
251 # group affy_id coral_mlg_clonal_id DB_match | |
252 # <int> <chr> <chr> <chr> | |
253 # 1 a550962-4368120-060520-500_M23.CEL NA no_match | |
254 # 2 a550962-4368120-060520-256_A19.CEL HG0006 match | |
255 sample_mlg_match_tibble <- sample_mlg_tibble %>% | |
204 group_by(group) %>% | 256 group_by(group) %>% |
205 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match")); | 257 mutate(DB_match = ifelse(is.na(coral_mlg_clonal_id),"no_match", "match")); |
206 | 258 |
207 # Create new mlg id for samples that did not match those in the database. | 259 # Create new mlg id for samples with no matches in the database. |
208 none <- unique(df4[c("group", "coral_mlg_clonal_id")]); | 260 none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]); |
209 none <- none[is.na(none$coral_mlg_clonal_id),]; | 261 none <- none[is.na(none$coral_mlg_clonal_id),]; |
210 na.mlg2 <- which(is.na(df4$coral_mlg_clonal_id)); | 262 na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id)); |
211 n.g <- df4$group[na.mlg2]; | 263 n.g <- sample_mlg_match_tibble$group[na.mlg2]; |
212 ct <- length(unique(n.g)); | 264 ct <- length(unique(n.g)); |
213 | 265 |
214 # List of new group ids, the sequence starts at the number of | 266 # List of new group ids, the sequence starts at the number of |
215 # ids present in df4$coral_mlg_clonal_ids plus 1. Not sure if | 267 # ids present in sample_mlg_match_tibble$coral_mlg_clonal_ids |
216 # the df4 file contains all ids. If it doesn't then look below | 268 # plus 1. |
217 # to change the seq() function. | 269 # FIXME: Not sure if # the sample_mlg_match_tibble file |
218 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(df4["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); | 270 # contains all ids. If it doesn't then look below to change |
219 # Pair group with new ids. | 271 # the seq() function. |
220 rat <- cbind(unique(n.g), n.g_ids); | 272 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); |
273 | |
221 # Assign the new id iteratively for all that have NA. | 274 # Assign the new id iteratively for all that have NA. |
222 for (i in 1:length(na.mlg2)) { | 275 for (i in 1:length(na.mlg2)) { |
223 df4$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(df4$group[na.mlg2[i]], unique(n.g))]; | 276 sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))]; |
224 } | 277 } |
225 | 278 |
226 # Subset poptab for all samples. | 279 # Subset population_info_data_table for all samples. |
227 subpop <- poptab[c(2, 3)]; | 280 # affy_id_user_specimen_id_vector looks like this: |
281 # affy_id user_specimen_id | |
282 # a100000-4368120-060520-256_I07.CEL 13704 | |
283 # a100000-4368120-060520-256_K07.CEL 13706 | |
284 affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)]; | |
228 | 285 |
229 # Merge data frames for final table. | 286 # Merge data frames for final table. |
230 report_user <- pi %>% | 287 start_time <- time_start("Merging data frames"); |
231 left_join(subpop %>% | 288 stag_db_report <- specimen_id_field_call_data_table %>% |
289 left_join(affy_id_user_specimen_id_vector %>% | |
232 select("affy_id", "user_specimen_id"), | 290 select("affy_id", "user_specimen_id"), |
233 by="user_specimen_id") %>% | 291 by="user_specimen_id") %>% |
234 left_join(df4 %>% | 292 left_join(sample_mlg_match_tibble %>% |
235 select("affy_id", "coral_mlg_clonal_id", "DB_match"), | 293 select("affy_id", "coral_mlg_clonal_id", "DB_match"), |
236 by="affy_id") %>% | 294 by="affy_id") %>% |
237 left_join(mi %>% | 295 left_join(missing_gt_data_table %>% |
238 select("affy_id", "percent_missing_data_coral"), | 296 select("affy_id", "percent_missing_data_coral"), |
239 by="affy_id") %>% | 297 by="affy_id") %>% |
240 left_join(ht %>% | 298 left_join(heterozygous_alleles_data_table %>% |
241 select("affy_id", "percent_mixed_coral"), | 299 select("affy_id", "percent_heterozygous_coral"), |
242 by="affy_id") %>% | 300 by="affy_id") %>% |
243 left_join(rA %>% | 301 left_join(reference_alleles_data_table %>% |
244 select("affy_id", "percent_reference_coral"), | 302 select("affy_id", "percent_reference_coral"), |
245 by="affy_id") %>% | 303 by="affy_id") %>% |
246 left_join(aB %>% | 304 left_join(alternative_alleles_data_table %>% |
247 select("affy_id", "percent_alternative_coral"), | 305 select("affy_id", "percent_alternative_coral"), |
248 by="affy_id") %>% | 306 by="affy_id") %>% |
249 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% | 307 mutate(DB_match = ifelse(is.na(DB_match), "failed", DB_match))%>% |
250 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% | 308 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% |
251 mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 40 & percent_alternative_coral<= 44.5,"A.palmata","other")) %>% | 309 mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.5, "A.palmata","other")) %>% |
252 mutate(genetic_coral_species_call=ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral<= 50,"A.cervicornis",genetic_coral_species_call)) %>% | 310 mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45.5 & percent_alternative_coral <= 50, "A.cervicornis", genetic_coral_species_call)) %>% |
253 mutate(genetic_coral_species_call=ifelse(percent_heterozygous_coral > 40,"A.prolifera",genetic_coral_species_call)) %>% | 311 mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>% |
254 ungroup() %>% | 312 ungroup() %>% |
255 select(-group); | 313 select(-group); |
256 | 314 time_elapsed(start_time); |
257 write.csv(report_user, file=opt$output_stag_db_report, quote=FALSE); | 315 |
258 | 316 start_time <- time_start("Writing csv output"); |
259 # Database tables | 317 write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE); |
260 ## Sample.table | 318 time_elapsed(start_time); |
261 sample_db <- pinfo %>% | 319 |
262 left_join( | 320 # Database sample table. |
263 report_user %>% | 321 sample_db <- affy_metadata_data_frame %>% |
264 select("user_specimen_id","affy_id", | 322 left_join(stag_db_report %>% |
265 "percent_missing_data_coral","percent_heterozygous_coral","percent_reference_coral", | 323 select("user_specimen_id","affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", "percent_reference_coral", "percent_alternative_coral"), |
266 "percent_alternative_coral"), | 324 by='user_specimen_id'); |
267 by='user_specimen_id'); | 325 |
268 | 326 # Representative clone for genotype table. |
269 ###representative clone for genotype.table | 327 start_time <- time_start("Creating representative clone for genotype table"); |
270 cc<-clonecorrect(obj2, strata= ~pop.gind.); | 328 no_dup_genotypes_genind <- clonecorrect(genind_clone, strata = ~pop.genind_obj.); |
271 id_rep<-mlg.id(cc); | 329 id_rep <- mlg.id(no_dup_genotypes_genind); |
272 dt_cc<-data.table(id_rep,keep.rownames = TRUE); | 330 id_data_table <- data.table(id_rep, keep.rownames=TRUE); |
273 setnames(dt_cc, c("id_rep"), c("affy_id")); | 331 # Rename the id_rep column. |
274 | 332 setnames(id_data_table, c("id_rep"), c("affy_id")); |
275 ###transform mlg data.table | 333 time_elapsed(start_time); |
276 df_cc <- dt_cc %>% | 334 |
277 group_by(row_number()) %>% | 335 # # Combine with previously genotyped samples in the database. |
278 rename(group='row_number()') %>% | 336 start_time <- time_start("Selecting from various database tables"); |
279 unnest(affy_id) %>% | 337 representative_mlg_tibble <- id_data_table %>% |
280 left_join(report_user %>% | 338 group_by(row_number()) %>% |
281 select("coral_mlg_clonal_id","user_specimen_id","affy_id"), | 339 rename(group='row_number()') %>% |
282 by='affy_id') %>% | 340 unnest(affy_id) %>% |
283 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id),"",affy_id)) %>% | 341 left_join(stag_db_report %>% |
284 ungroup() %>% | 342 select("coral_mlg_clonal_id", "user_specimen_id", "affy_id"), |
285 select(-group); | 343 by='affy_id') %>% |
286 | 344 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_clonal_id), "", affy_id)) %>% |
287 ##geno.table | 345 ungroup() %>% |
288 geno_db <- df4 %>% | 346 select(-group); |
289 left_join(df_cc %>% | 347 |
290 select("affy_id","coral_mlg_rep_sample_id","user_specimen_id"), | 348 # Database genotype table. |
349 genotype_table_join <- sample_mlg_match_tibble %>% | |
350 left_join(representative_mlg_tibble %>% | |
351 select("affy_id", "coral_mlg_rep_sample_id", "user_specimen_id"), | |
291 by='affy_id') %>% | 352 by='affy_id') %>% |
292 ungroup() %>% | 353 ungroup() %>% |
293 select(-group); | 354 select(-group); |
294 | 355 |
295 ##taxonomy.table | 356 # Database taxonomy table. |
296 | 357 taxonomy_table_join <- stag_db_report %>% |
297 tax_db <- report_user %>% | 358 select(genetic_coral_species_call, affy_id) %>% |
298 select(genetic_coral_species_call, affy_id) %>% | 359 mutate(genus_name = ifelse(genetic_coral_species_call == genetic_coral_species_call[grep("^A.*", genetic_coral_species_call)], "Acropora", "other")) %>% |
299 mutate(genus_name =ifelse(genetic_coral_species_call== | 360 mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>% |
300 genetic_coral_species_call[grep("^A.*",genetic_coral_species_call)],"Acropora","other")) %>% | 361 mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>% |
301 mutate(species_name=ifelse(genetic_coral_species_call=="A.palmata","palmata","other"))%>% | 362 mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name)); |
302 mutate(species_name=ifelse(genetic_coral_species_call =="A.cervicornis","cervicornis",species_name))%>% | 363 time_elapsed(start_time); |
303 mutate(species_name=ifelse(genetic_coral_species_call=="A.prolifera","prolifera", species_name)); | 364 # Table of alleles for the new samples subset to new plate data. |
304 | 365 # Create vector indicating number of individuals desired from |
305 | 366 # affy_id column of stag_db_report data table. |
306 # Table of alleles for the new samples | 367 i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]); |
307 ## subset to new plate data | 368 i <- i[!apply(i== "", 1, all),]; |
308 ### create vector indicating number of individuals desired | 369 sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE]; |
309 ### made from affy_id collumn from report_user data table | 370 |
310 i<-ifelse(is.na(report_user[1]),"",report_user[[1]]); | 371 # cols looks like this: |
311 i<-i[!apply(i == "", 1, all),]; | 372 # blue1 red green pink orange blue2 |
312 sub96<-obj2[i, mlg.reset = FALSE, drop = FALSE]; | 373 # "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF" |
313 | 374 # green2 yellow turquoise poop |
314 # convert to data frame | 375 # "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF" |
315 at_96<-genind2df(sub96, sep=""); | |
316 at_96<- at_96 %>% | |
317 select(-pop); | |
318 | |
319 # allele string for Allele.table in database | |
320 uat_96<-unite(at_96, alleles, 1:19696, sep = " ", remove = TRUE); | |
321 uat_96<-setDT(uat_96, keep.rownames = TRUE)[]; | |
322 setnames(uat_96, c("rn"), c("user_specimen_id")); | |
323 # write.csv(uat_96,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE); | |
324 | |
325 # Create a phylogeny of samples based on distance matrices. | |
326 cols <- piratepal("basel"); | 376 cols <- piratepal("basel"); |
327 set.seed(999); | 377 set.seed(999); |
328 # Start PDF device driver. | 378 |
329 dev.new(width=10, height=7); | 379 if (!is.null(opt$output_nj_phylogeny_tree)) { |
330 file_path = get_file_path("nj_phylogeny.pdf"); | 380 # Create a phylogeny tree of samples based on distance matrices. |
331 pdf(file=file_path, width=10, height=7); | 381 # Start PDF device driver. |
332 # Organize branches by clade. | 382 start_time <- time_start("Creating nj_phylogeny_tree.pdf"); |
333 theTree <- sub96 %>% | 383 dev.new(width=10, height=7); |
334 aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% | 384 file_path = get_file_path("nj_phylogeny_tree.pdf"); |
335 ladderize(); | 385 pdf(file=file_path, width=10, height=7); |
336 theTree$tip.label <- report_user$user_specimen_id[match(theTree$tip.label, report_user$affy_id)]; | 386 # Organize branches by clade. |
337 plot.phylo(theTree, tip.color=cols[sub96$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); | 387 nj_phylogeny_tree <- sample_alleles_vector %>% |
338 # Add a scale bar showing 5% difference.. | 388 aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE) %>% |
339 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); | 389 ladderize(); |
340 nodelabels(theTree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); | 390 nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)]; |
341 legend("topright", legend=c(levels(sub96$pop)), text.col=cols, xpd=T, cex=0.8); | 391 plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0125, cex=0.3, font=2, lwd=4, align.tip.label=F, no.margin=T); |
342 dev.off() | 392 # Add a scale bar showing 5% difference. |
343 | 393 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=3); |
344 write.tree(theTree, file =opt$nj_tree, quote=FALSE); | 394 nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); |
345 | 395 legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8); |
346 # identity-by-state analysis | 396 dev.off() |
347 #if (!requireNamespace("BiocManager", quietly = TRUE)) | 397 time_elapsed(start_time); |
348 # install.packages("BiocManager") | 398 } |
349 #BiocManager::install("SNPRelate", version = "3.8") | 399 |
350 | 400 # Subset VCF to the user samples. |
351 #subset VCF to the user samples | 401 start_time <- time_start("Subsetting vcf to the user samples"); |
352 l<-length(i); | 402 l <- length(i); |
353 n<-ncol(vcf@gt); | 403 n <- ncol(vcf@gt); |
354 s<-n-l; | 404 s <- n - l; |
355 svcf<-vcf[,s:n]; | 405 svcf <- vcf[, s:n]; |
356 write.vcf(svcf, "subset.vcf.gz"); | 406 write.vcf(svcf, "subset.vcf.gz"); |
357 | |
358 vcf.fn <- "subset.vcf.gz"; | 407 vcf.fn <- "subset.vcf.gz"; |
359 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only"); | 408 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only"); |
360 | 409 genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); |
361 genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); | 410 gds_array <- read.gdsn(index.gdsn(genofile, "sample.id")); |
362 hd<-read.gdsn(index.gdsn(genofile, "sample.id")); | 411 # gds_array looks like this: |
363 hd<-data.frame(hd); | 412 # [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL" |
364 hd<-setDT(hd, keep.rownames = FALSE)[]; | 413 # [3] "a550962-4368120-060520-500_A09.CEL" "a550962-4368120-060520-500_A11.CEL" |
365 setnames(hd, c("hd"), c("user_specimen_id")); | 414 gds_data_frame <- data.frame(gds_array); |
366 | 415 # gds_data_frame looks like this: |
367 subpop2<- poptab[c(2,4)]; | 416 # gds_array |
368 poptab_sub <- hd %>% | 417 # a550962-4368120-060520-500_A03.CEL |
369 left_join( | 418 # a550962-4368120-060520-500_A05.CEL |
370 subpop2 %>% | 419 gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[]; |
371 select("affy_id","region"), | 420 # Rename the gds_array column. |
372 by='affy_id')%>% | 421 setnames(gds_data_table, c("gds_array"), c("affy_id")); |
373 drop_na(); | 422 # affy_id_region_list looks like this: |
374 | 423 # affy_id region |
375 samp.annot <- data.frame(pop.group = c(poptab_sub$region)); | 424 # a100000-4368120-060520-256_I07.CEL USVI |
425 # a100000-4368120-060520-256_K07.CEL USVI | |
426 affy_id_region_list <- population_info_data_table[c(2,4)]; | |
427 gds_data_table_join <- gds_data_table %>% | |
428 left_join(affy_id_region_list %>% | |
429 select("affy_id", "region"), | |
430 by='affy_id')%>% | |
431 drop_na(); | |
432 samp.annot <- data.frame(pop.group=c(gds_data_table_join$region)); | |
376 add.gdsn(genofile, "sample.annot", samp.annot); | 433 add.gdsn(genofile, "sample.annot", samp.annot); |
377 | 434 # population_code looks like this: |
378 pop_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); | 435 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 |
436 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 | |
437 population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); | |
379 pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); | 438 pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); |
380 | 439 # pop.group looks like this: |
381 # Identity-By-State Analysis - distance matrix calculation | 440 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 |
441 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 | |
442 time_elapsed(start_time); | |
443 | |
444 # Distance matrix calculation. | |
445 start_time <- time_start("Calculating distance matrix"); | |
382 ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); | 446 ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); |
383 | 447 time_elapsed(start_time); |
384 # cluster analysis on the genome-wide IBS pairwise distance matrix | 448 |
449 # Cluster analysis on the genome-wide IBS pairwise distance matrix. | |
450 start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix"); | |
385 set.seed(100); | 451 set.seed(100); |
386 par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); | 452 par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); |
387 ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE)); | 453 ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, autosome.only=FALSE)); |
388 | 454 time_elapsed(start_time); |
389 # default clustering. | 455 |
456 # Default clustering. | |
457 start_time <- time_start("Creating ibs_default.pdf"); | |
458 # Start PDF device driver. | |
390 dev.new(width=10, height=7); | 459 dev.new(width=10, height=7); |
391 file_path = get_file_path("IBS_default.pdf"); | 460 file_path = get_file_path("ibs_default.pdf"); |
392 pdf (file=file_path, width=10, height=7); | 461 pdf(file=file_path, width=10, height=7); |
393 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); | 462 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); |
394 snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular",y.label=0.2); | 463 snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", y.label=0.2); |
395 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); | 464 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); |
396 dev.off() | 465 dev.off() |
397 | 466 time_elapsed(start_time); |
398 # color cluster by region. | 467 |
468 # Color cluster by region. | |
469 start_time <- time_start("Creating ibs_region.pdf"); | |
470 # Start PDF device driver. | |
399 dev.new(width=10, height=7); | 471 dev.new(width=10, height=7); |
400 file_path = get_file_path("IBS_Region.pdf"); | 472 file_path = get_file_path("ibs_region.pdf"); |
401 pdf (file=file_path, width=10, height=7); | 473 pdf(file=file_path, width=10, height=7); |
402 race <- as.factor(pop_code); | 474 race <- as.factor(population_code); |
403 rv2 <- snpgdsCutTree(ibs.hc,samp.group=race,col.list=cols,pch.list=15); | 475 rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15); |
404 snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular",y.label=0.2); | 476 snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", y.label=0.2); |
405 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); | 477 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); |
406 dev.off() | 478 dev.off() |
407 | 479 time_elapsed(start_time); |
408 #close GDS file | 480 |
481 # close GDS file. | |
409 snpgdsClose(genofile); | 482 snpgdsClose(genofile); |
410 | 483 |
411 # Sample MLG on a map. | 484 # Sample MLG on a map. |
412 world <- ne_countries(scale = "medium", returnclass = "sf"); | 485 start_time <- time_start("Creating mlg_map.pdf"); |
413 class(world); | 486 # Get the lattitude and longitude boundaries for rendering |
414 | 487 # the map. Tese boundaries will restrict the map to focus |
415 pinfo$mlg<-report_user$coral_mlg_clonal_id; | 488 # (i.e., zoom) on the region of the world map from which |
416 n <- nrow(pinfo); | 489 # the samples were taken. |
417 | 490 max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE); |
418 mxlat<-max(pinfo$latitude,na.rm = TRUE); | 491 min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE); |
419 mnlat<-min(pinfo$latitude,na.rm = TRUE); | 492 latitude_range_vector <- c(min_latitude-3, max_latitude+3); |
420 mxlong<-max(pinfo$longitude,na.rm = TRUE); | 493 max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE); |
421 mnlong<-min(pinfo$longitude,na.rm = TRUE); | 494 min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE); |
422 | 495 longitude_range_vector <- c(min_longitude-3, max_longitude+3); |
423 p5<-ggplot(data = world) + | 496 # Get the palette colors for rendering plots. |
424 geom_sf() + | 497 colors <- length(unique(stag_db_report$coral_mlg_clonal_id)); |
425 coord_sf(xlim = c(mnlong-3, mxlong+3), ylim = c(mnlat-3,mxlat+3), expand = FALSE); | 498 # Get a color palette. |
426 | 499 palette <- colorRampPalette(piratepal("basel")); |
427 colourCount = length(unique(pinfo$mlg)); | 500 # Start PDF device driver. |
428 getPalette = colorRampPalette(piratepal("basel")); | |
429 dev.new(width=10, height=7); | 501 dev.new(width=10, height=7); |
430 file_path = get_file_path("mlg_map.pdf"); | 502 file_path = get_file_path("mlg_map.pdf"); |
431 pdf (file=file_path, width=10, height=7); | 503 pdf(file=file_path, width=10, height=7); |
432 p6<-p5+ geom_point(data = pinfo,aes(x =longitude, y=latitude, group=mlg, color = mlg), alpha=.7, size=3)+ | 504 world_data = map_data("world"); |
433 scale_color_manual(values=getPalette(colourCount))+ | 505 # Add the coral_mlg_clonal_id column from the stag_db_report |
434 theme(legend.position="bottom")+ | 506 # data fram to the affy_metadata_data_frame. |
435 guides(color=guide_legend(nrow=8,byrow=F)); | 507 affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id; |
436 p6; | 508 # Get the number of colors needed from the palette for plotting |
509 # the sample locations on the world map. | |
510 num_colors = length(unique(affy_metadata_data_frame$mlg)); | |
511 # Get a color palette. | |
512 palette = colorRampPalette(piratepal("basel")); | |
513 ggplot() + | |
514 geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") + | |
515 coord_map(xlim=longitude_range_vector, ylim=latitude_range_vector) + | |
516 geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, color=mlg), alpha=.7, size=3) + | |
517 scale_color_manual(values=palette(num_colors)) + | |
518 theme(legend.position="bottom") + | |
519 guides(color=guide_legend(nrow=8, byrow=F)); | |
437 dev.off() | 520 dev.off() |
521 time_elapsed(start_time); | |
438 | 522 |
439 # Missing data barplot. | 523 # Missing data barplot. |
440 poptab$miss <- report_user$percent_missing_data_coral[match(miss$affy_id, report_user$affy_id)]; | 524 start_time <- time_start("Creating missing_data.pdf"); |
441 test2 <- which(!is.na(poptab$miss)); | 525 population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)]; |
442 miss96 <- poptab$miss[test2]; | 526 test2 <- which(!is.na(population_info_data_table$miss)); |
443 name96 <- poptab$user_specimen_id[test2]; | 527 miss96 <- population_info_data_table$miss[test2]; |
528 name96 <- population_info_data_table$user_specimen_id[test2]; | |
529 # Start PDF device driver. | |
444 dev.new(width=10, height=7); | 530 dev.new(width=10, height=7); |
445 file_path = get_file_path("missing_data.pdf"); | 531 file_path = get_file_path("missing_data.pdf"); |
446 pdf (file=file_path, width=10, height=7); | 532 pdf(file=file_path, width=10, height=7); |
447 par(mar = c(8, 4, 4, 2)); | 533 par(mar = c(8, 4, 4, 2)); |
448 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); | 534 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); |
449 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); | 535 text(cex=0.6, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); |
450 dev.off() | 536 dev.off() |
537 time_elapsed(start_time); | |
451 | 538 |
452 # Generate a pie chart for each sample with a genotype. | 539 # Generate a pie chart for each sample with a genotype. |
453 # Store the numerical and user_specimen_id values from | 540 # Store the numerical and user_specimen_id values from |
454 # report_user for the charts (user_specimen_id names | 541 # stag_db_report for the charts (user_specimen_id names |
455 # will be used to label each chart). | 542 # will be used to label each chart). |
456 dt1 <- data.table(report_user); | 543 start_time <- time_start("Creating percent_breakdown.pdf"); |
457 dt1 <- report_user[c(-2, -3, -4)]; | 544 stag_db_report_data_table <- stag_db_report[c(-2, -3, -4)]; |
458 dt1 <- na.omit(dt1); | 545 # Remove NA and NaN values. |
459 # Translate to N (i.e., number of samples with a | 546 stag_db_report_data_table <- na.omit(stag_db_report_data_table); |
460 # genotype) columns and 5 rows. | 547 # Translate to N (i.e., number of samples with a genotype) |
461 tdt1 <- t(dt1); | 548 # columns and 5 rows. |
462 # Make another data table and transpose it the same as dt1 to | 549 translated_stag_db_report_data_table <- t(stag_db_report_data_table); |
463 # get numerics. These will feed into the creation of N vectors. | 550 translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]); |
464 dt2 <- data.table(report_user); | 551 # Set the storage mode of the matrix to numeric. In some |
465 dt2 <- report_user[c(-1, -2, -3, -4)]; | 552 # cases this could result in the following: |
466 # Translate to N columns and 5 rows. | 553 # Warning message: |
467 tdt2 <- t(dt2); | 554 # In mde(x) : NAs introduced by coercion |
468 tdt1_matrix <- as.matrix(tdt1[-1,]); | 555 mode(translated_stag_db_report_matrix) <- "numeric"; |
469 # The number of columns is the number of samples with genotypes. | 556 # Remove NA and NaN values that may have been introduced |
470 nc <- ncol(tdt1_matrix); | 557 # by coercion. |
471 mode(tdt1_matrix) <- "numeric"; | 558 translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix); |
472 spy <- rowMeans(tdt1_matrix); | 559 tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE); |
473 dev.new(width=10, height=7); | 560 dev.new(width=10, height=7); |
474 file_path = get_file_path("percent_breakdown.pdf"); | 561 file_path = get_file_path("percent_breakdown.pdf"); |
475 pdf(file=file_path, width=10, height=7); | 562 pdf(file=file_path, width=10, height=7); |
476 # Average pie of all samples. | 563 # Average pie of all samples. |
477 labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(spy, 1), "%)", sep=""); | 564 labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(tsdbrm_row_means, 1), "%)", sep=""); |
478 col <- c("GREY", "#006DDB", "#24FF24", "#920000"); | 565 col <- c("GREY", "#006DDB", "#24FF24", "#920000"); |
479 main <- "Average breakdown of SNP assignments across all samples"; | 566 main <- "Average breakdown of SNP assignments across all samples"; |
480 pie(spy, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); | 567 pie(tsdbrm_row_means, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); |
481 par(mfrow=c(3, 2)); | 568 par(mfrow=c(3, 2)); |
482 col <- c("GREY", "#006DDB", "#24FF24", "#920000"); | 569 col <- c("GREY", "#006DDB", "#24FF24", "#920000"); |
483 for (i in 1:nc) { | 570 # Generate a pie chart for each sample with genotypes. |
484 tmp_labels <- paste(labels, " (", round(tdt1_matrix[,i], 1), "%)", sep=""); | 571 for (i in 1:ncol(translated_stag_db_report_matrix)) { |
485 main <- paste("Breakdown of SNP assignments for", tdt1[1, i]); | 572 tmp_labels <- paste(labels, " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep=""); |
486 pie(tdt1_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); | 573 main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]); |
574 pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); | |
487 } | 575 } |
488 dev.off() | 576 dev.off() |
577 time_elapsed(start_time); |