Mercurial > repos > greg > multilocus_genotype
comparison multilocus_genotype.R @ 8:d2057e183772 draft
Uploaded
author | greg |
---|---|
date | Thu, 29 Nov 2018 11:18:00 -0500 |
parents | 18001e7cb199 |
children | 8f2f346a5e1c |
comparison
equal
deleted
inserted
replaced
7:18001e7cb199 | 8:d2057e183772 |
---|---|
31 get_file_path = function(file_name) { | 31 get_file_path = function(file_name) { |
32 file_path = paste("output_plots_dir", file_name, sep="/"); | 32 file_path = paste("output_plots_dir", file_name, sep="/"); |
33 return(file_path); | 33 return(file_path); |
34 } | 34 } |
35 | 35 |
36 get_database_connection <- function(db_conn_string) { | |
37 # Instantiate database connection. | |
38 # The connection string has this format: | |
39 # postgresql://user:password@host/dbname | |
40 conn_items <- strsplit(db_conn_string, "://")[[1]]; | |
41 string_needed <- conn_items[2]; | |
42 items_needed <- strsplit(string_needed, "@")[[1]]; | |
43 user_pass_string <- items_needed[1]; | |
44 host_dbname_string <- items_needed[2]; | |
45 user_pass_items <- strsplit(user_pass_string, ":")[[1]]; | |
46 host_dbname_items <- strsplit(host_dbname_string, "/")[[1]]; | |
47 user <- user_pass_items[1]; | |
48 pass <- user_pass_items[2]; | |
49 host <- host_dbname_items[1]; | |
50 dbname <- host_dbname_items[2]; | |
51 # FIXME: is there a way to not hard-code the port? | |
52 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass); | |
53 return (conn); | |
54 } | |
55 | |
36 # Read in VCF input file. | 56 # Read in VCF input file. |
37 vcf <- read.vcfR(opt$input_vcf); | 57 vcf <- read.vcfR(opt$input_vcf); |
38 | 58 |
39 #Missing GT in samples submitted | 59 #Missing GT in samples submitted |
40 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); | 60 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); |
41 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); | 61 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); |
42 missing_gt <- (missing_gt / nrow(vcf)) * 100; | 62 myMiss <- (myMiss / nrow(vcf)) * 100; |
43 missing_gt_data_frame <- data.frame(missing_gt); | 63 miss <- data.frame(myMiss); |
44 | 64 |
45 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); | 65 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); |
46 hets <- (hets / nrow(vcf)) * 100; | 66 hets <- (hets / nrow(vcf)) * 100; |
47 ht <- data.frame(hets); | 67 ht <- data.frame(hets); |
48 | 68 |
55 aB <- data.frame(altB); | 75 aB <- data.frame(altB); |
56 | 76 |
57 # Convert VCF file into a genind for the Poppr package. | 77 # Convert VCF file into a genind for the Poppr package. |
58 # TODO: probably should not hard-code 2 cores. | 78 # TODO: probably should not hard-code 2 cores. |
59 gl <- vcfR2genlight(vcf, n.cores=2); | 79 gl <- vcfR2genlight(vcf, n.cores=2); |
60 genind <- new("genind", (as.matrix(gl))); | 80 gind <- new("genind", (as.matrix(gl))); |
61 | 81 |
62 # Add population information to the genind object. | 82 # Add population information to the genind object. |
63 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA")); | 83 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA")); |
64 genind@pop <- as.factor(poptab$region); | 84 gind@pop <- as.factor(poptab$region); |
65 | 85 |
66 # Convert genind object to a genclone object. | 86 # Convert genind object to a genclone object. |
67 gclo <- as.genclone(genind); | 87 obj2 <- as.genclone(gind); |
68 | 88 |
69 # Calculate the bitwise distance between individuals. | 89 # Calculate the bitwise distance between individuals. |
70 xdis <- bitwise.dist(gclo); | 90 xdis <- bitwise.dist(obj2); |
71 | 91 |
72 # Multilocus genotypes (threshold of 1%). | 92 # Multilocus genotypes (threshold of 1%). |
73 mlg.filter(gclo, distance=xdis) <- 0.01; | 93 mlg.filter(obj2, distance=xdis) <- 0.01; |
74 m <- mlg.table(gclo, background=TRUE, color=TRUE); | 94 m <- mlg.table(obj2, background=TRUE, color=TRUE); |
75 | 95 |
76 # Create table of MLGs. | 96 # Create table of MLGs. |
77 id <- mlg.id(gclo); | 97 id <- mlg.id(obj2); |
78 dt <- data.table(id, keep.rownames=TRUE); | 98 dt <- data.table(id, keep.rownames=TRUE); |
79 setnames(dt, c("id"), c("user_specimen_id")); | 99 setnames(dt, c("id"), c("user_specimen_id")); |
80 | 100 |
81 # Read user's Affymetrix 96 well plate csv file. | 101 # Read user's Affymetrix 96 well plate csv file. |
82 pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE); | 102 pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE); |
83 pinfo <- pinfo$user_specimen_id; | 103 pinfo <- pinfo$user_specimen_id; |
84 pi <- data.table(pinfo); | 104 pi <- data.table(pinfo); |
85 setnames(pi, c("pinfo"), c("user_specimen_id")); | 105 setnames(pi, c("pinfo"), c("user_specimen_id")); |
86 | 106 |
87 # Instantiate database connection. | 107 # Connect to database. |
88 # The connection string has this format: | 108 conn <- get_database_connection(opt$database_connection_string); |
89 # postgresql://user:password@host/dbname | |
90 conn_string <- opt$database_connection_string; | |
91 conn_items <- strsplit(conn_string, "://")[[1]]; | |
92 string_needed <- conn_items[1]; | |
93 items_needed <- strsplit(string_needed, "@")[[1]]; | |
94 user_pass_string <- items_needed[1]; | |
95 host_dbname_string <- items_needed[2]; | |
96 user_pass_items <- strsplit(user_pass_string, ":")[[1]]; | |
97 host_dbname_items <- strsplit(host_dbname_string, "/")[[1]]; | |
98 user <- user_pass_items[1]; | |
99 pass <- user_pass_items[2]; | |
100 host <- host_dbname_items[1]; | |
101 dbname <- host_dbname_items[2]; | |
102 # FIXME: is there a way to not hard-code the port? | |
103 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass); | |
104 | 109 |
105 # Import the sample table. | 110 # Import the sample table. |
106 sample_table <- tbl(conn, "sample"); | 111 mD <- tbl(conn, "sample"); |
107 | 112 |
108 # Select user_specimen_id and mlg columns. | 113 # Select user_specimen_id and mlg columns. |
109 smlg <- sample_table %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id); | 114 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id); |
110 | 115 |
111 # Convert to dataframe. | 116 # Convert to dataframe. |
112 sm <- data.frame(smlg); | 117 sm <- data.frame(smlg); |
113 sm[sm==""] <- NA; | 118 sm[sm==""] <- NA; |
114 | 119 |
115 # Convert missing data into data table. | 120 # Convert missing data into data table. |
116 mi <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[]; | 121 mi <-setDT(miss, keep.rownames=TRUE)[]; |
117 # Change names to match db. | 122 # Change names to match db. |
118 setnames(mi, c("rn"), c("user_specimen_id")); | 123 setnames(mi, c("rn"), c("user_specimen_id")); |
119 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); | 124 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); |
120 # Round missing data to two digits. | 125 # Round missing data to two digits. |
121 mi$percent_missing <- round(mi$percent_missing, digits=2); | 126 mi$percent_missing <- round(mi$percent_missing, digits=2); |
240 # Start PDF device driver. | 245 # Start PDF device driver. |
241 dev.new(width=10, height=7); | 246 dev.new(width=10, height=7); |
242 file_path = get_file_path("nj_phylogeny.pdf"); | 247 file_path = get_file_path("nj_phylogeny.pdf"); |
243 pdf(file=file_path, width=10, height=7); | 248 pdf(file=file_path, width=10, height=7); |
244 # Organize branches by clade. | 249 # Organize branches by clade. |
245 tree <- gclo %>% | 250 tree <- obj2 %>% |
246 aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% | 251 aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% |
247 ladderize(); | 252 ladderize(); |
248 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); | 253 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); |
249 # Add a scale bar showing 5% difference.. | 254 # Add a scale bar showing 5% difference.. |
250 add.scale.bar(length=0.05, cex=0.65); | 255 add.scale.bar(length=0.05, cex=0.65); |