comparison multilocus_genotype.R @ 8:d2057e183772 draft

Uploaded
author greg
date Thu, 29 Nov 2018 11:18:00 -0500
parents 18001e7cb199
children 8f2f346a5e1c
comparison
equal deleted inserted replaced
7:18001e7cb199 8:d2057e183772
31 get_file_path = function(file_name) { 31 get_file_path = function(file_name) {
32 file_path = paste("output_plots_dir", file_name, sep="/"); 32 file_path = paste("output_plots_dir", file_name, sep="/");
33 return(file_path); 33 return(file_path);
34 } 34 }
35 35
36 get_database_connection <- function(db_conn_string) {
37 # Instantiate database connection.
38 # The connection string has this format:
39 # postgresql://user:password@host/dbname
40 conn_items <- strsplit(db_conn_string, "://")[[1]];
41 string_needed <- conn_items[2];
42 items_needed <- strsplit(string_needed, "@")[[1]];
43 user_pass_string <- items_needed[1];
44 host_dbname_string <- items_needed[2];
45 user_pass_items <- strsplit(user_pass_string, ":")[[1]];
46 host_dbname_items <- strsplit(host_dbname_string, "/")[[1]];
47 user <- user_pass_items[1];
48 pass <- user_pass_items[2];
49 host <- host_dbname_items[1];
50 dbname <- host_dbname_items[2];
51 # FIXME: is there a way to not hard-code the port?
52 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass);
53 return (conn);
54 }
55
36 # Read in VCF input file. 56 # Read in VCF input file.
37 vcf <- read.vcfR(opt$input_vcf); 57 vcf <- read.vcfR(opt$input_vcf);
38 58
39 #Missing GT in samples submitted 59 #Missing GT in samples submitted
40 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); 60 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
41 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); 61 myMiss <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
42 missing_gt <- (missing_gt / nrow(vcf)) * 100; 62 myMiss <- (myMiss / nrow(vcf)) * 100;
43 missing_gt_data_frame <- data.frame(missing_gt); 63 miss <- data.frame(myMiss);
44 64
45 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} ); 65 hets <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))} );
46 hets <- (hets / nrow(vcf)) * 100; 66 hets <- (hets / nrow(vcf)) * 100;
47 ht <- data.frame(hets); 67 ht <- data.frame(hets);
48 68
55 aB <- data.frame(altB); 75 aB <- data.frame(altB);
56 76
57 # Convert VCF file into a genind for the Poppr package. 77 # Convert VCF file into a genind for the Poppr package.
58 # TODO: probably should not hard-code 2 cores. 78 # TODO: probably should not hard-code 2 cores.
59 gl <- vcfR2genlight(vcf, n.cores=2); 79 gl <- vcfR2genlight(vcf, n.cores=2);
60 genind <- new("genind", (as.matrix(gl))); 80 gind <- new("genind", (as.matrix(gl)));
61 81
62 # Add population information to the genind object. 82 # Add population information to the genind object.
63 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA")); 83 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings=c("", "NA"));
64 genind@pop <- as.factor(poptab$region); 84 gind@pop <- as.factor(poptab$region);
65 85
66 # Convert genind object to a genclone object. 86 # Convert genind object to a genclone object.
67 gclo <- as.genclone(genind); 87 obj2 <- as.genclone(gind);
68 88
69 # Calculate the bitwise distance between individuals. 89 # Calculate the bitwise distance between individuals.
70 xdis <- bitwise.dist(gclo); 90 xdis <- bitwise.dist(obj2);
71 91
72 # Multilocus genotypes (threshold of 1%). 92 # Multilocus genotypes (threshold of 1%).
73 mlg.filter(gclo, distance=xdis) <- 0.01; 93 mlg.filter(obj2, distance=xdis) <- 0.01;
74 m <- mlg.table(gclo, background=TRUE, color=TRUE); 94 m <- mlg.table(obj2, background=TRUE, color=TRUE);
75 95
76 # Create table of MLGs. 96 # Create table of MLGs.
77 id <- mlg.id(gclo); 97 id <- mlg.id(obj2);
78 dt <- data.table(id, keep.rownames=TRUE); 98 dt <- data.table(id, keep.rownames=TRUE);
79 setnames(dt, c("id"), c("user_specimen_id")); 99 setnames(dt, c("id"), c("user_specimen_id"));
80 100
81 # Read user's Affymetrix 96 well plate csv file. 101 # Read user's Affymetrix 96 well plate csv file.
82 pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE); 102 pinfo <- read.csv(opt$input_affy_metadata, stringsAsFactors=FALSE);
83 pinfo <- pinfo$user_specimen_id; 103 pinfo <- pinfo$user_specimen_id;
84 pi <- data.table(pinfo); 104 pi <- data.table(pinfo);
85 setnames(pi, c("pinfo"), c("user_specimen_id")); 105 setnames(pi, c("pinfo"), c("user_specimen_id"));
86 106
87 # Instantiate database connection. 107 # Connect to database.
88 # The connection string has this format: 108 conn <- get_database_connection(opt$database_connection_string);
89 # postgresql://user:password@host/dbname
90 conn_string <- opt$database_connection_string;
91 conn_items <- strsplit(conn_string, "://")[[1]];
92 string_needed <- conn_items[1];
93 items_needed <- strsplit(string_needed, "@")[[1]];
94 user_pass_string <- items_needed[1];
95 host_dbname_string <- items_needed[2];
96 user_pass_items <- strsplit(user_pass_string, ":")[[1]];
97 host_dbname_items <- strsplit(host_dbname_string, "/")[[1]];
98 user <- user_pass_items[1];
99 pass <- user_pass_items[2];
100 host <- host_dbname_items[1];
101 dbname <- host_dbname_items[2];
102 # FIXME: is there a way to not hard-code the port?
103 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port='5432', dbname=dbname, user=user, password=pass);
104 109
105 # Import the sample table. 110 # Import the sample table.
106 sample_table <- tbl(conn, "sample"); 111 mD <- tbl(conn, "sample");
107 112
108 # Select user_specimen_id and mlg columns. 113 # Select user_specimen_id and mlg columns.
109 smlg <- sample_table %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id); 114 smlg <- mD %>% select(user_specimen_id, coral_mlg_clonal_id, symbio_mlg_clonal_id);
110 115
111 # Convert to dataframe. 116 # Convert to dataframe.
112 sm <- data.frame(smlg); 117 sm <- data.frame(smlg);
113 sm[sm==""] <- NA; 118 sm[sm==""] <- NA;
114 119
115 # Convert missing data into data table. 120 # Convert missing data into data table.
116 mi <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[]; 121 mi <-setDT(miss, keep.rownames=TRUE)[];
117 # Change names to match db. 122 # Change names to match db.
118 setnames(mi, c("rn"), c("user_specimen_id")); 123 setnames(mi, c("rn"), c("user_specimen_id"));
119 setnames(mi, c("myMiss"), c("percent_missing_data_coral")); 124 setnames(mi, c("myMiss"), c("percent_missing_data_coral"));
120 # Round missing data to two digits. 125 # Round missing data to two digits.
121 mi$percent_missing <- round(mi$percent_missing, digits=2); 126 mi$percent_missing <- round(mi$percent_missing, digits=2);
240 # Start PDF device driver. 245 # Start PDF device driver.
241 dev.new(width=10, height=7); 246 dev.new(width=10, height=7);
242 file_path = get_file_path("nj_phylogeny.pdf"); 247 file_path = get_file_path("nj_phylogeny.pdf");
243 pdf(file=file_path, width=10, height=7); 248 pdf(file=file_path, width=10, height=7);
244 # Organize branches by clade. 249 # Organize branches by clade.
245 tree <- gclo %>% 250 tree <- obj2 %>%
246 aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>% 251 aboot(dist=provesti.dist, sample=10, tree="nj", cutoff=50, quiet=TRUE) %>%
247 ladderize(); 252 ladderize();
248 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4); 253 plot.phylo(tree, tip.color=cols[obj2$pop],label.offset=0.0125, cex=0.7, font=2, lwd=4);
249 # Add a scale bar showing 5% difference.. 254 # Add a scale bar showing 5% difference..
250 add.scale.bar(length=0.05, cex=0.65); 255 add.scale.bar(length=0.05, cex=0.65);