comparison multilocus_genotype.R @ 3:1bc815d9c8c5 draft

Uploaded
author greg
date Thu, 25 Oct 2018 13:33:47 -0400
parents 86aaadf36a4f
children a7cce4091e80
comparison
equal deleted inserted replaced
2:86aaadf36a4f 3:1bc815d9c8c5
20 get_file_path = function(file_name) { 20 get_file_path = function(file_name) {
21 file_path = paste("output_plots_dir", file_name, sep="/"); 21 file_path = paste("output_plots_dir", file_name, sep="/");
22 return(file_path); 22 return(file_path);
23 } 23 }
24 24
25 #extract Provesti's distance from the distance matrix 25 # Extract Provesti's distance from the distance matrix.
26 provesti_distance <- function(distance, selection) { 26 provesti_distance <- function(distance, selection) {
27 eval(parse(text=paste("as.matrix(distance)[", selection, "]"))); 27 eval(parse(text=paste("as.matrix(distance)[", selection, "]")));
28 } 28 }
29 29
30 #Read in VCF input file. 30 # Read in VCF input file.
31 vcf <- read.vcfR(opt$input_vcf); 31 vcf <- read.vcfR(opt$input_vcf);
32 32
33 # Convert VCF file into formats compatiable with the Poppr package. 33 # Convert VCF file into formats compatiable with the Poppr package.
34 gind <- vcfR2genind(vcf); 34 genind <- vcfR2genind(vcf);
35 # Add population information to the genind object. 35 # Add population information to the genind object.
36 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA")); 36 poptab <- read.table(opt$input_pop_info, check.names=FALSE, header=T, na.strings = c("", "NA"));
37 gind@pop <- as.factor(poptab$region); 37 genind@pop <- as.factor(poptab$region);
38 # Convert genind to genclone object 38 # Convert genind to genclone object
39 gclo <- as.genclone(gind); 39 gclo <- as.genclone(genind);
40 # Calculate the bitwise distance between individuals, 40 # Calculate the bitwise distance between individuals,
41 # the following is similar to Provesti's distance. 41 # the following is similar to Provesti's distance.
42 xdis <- bitwise.dist(gclo); 42 xdis <- bitwise.dist(gclo);
43 # All alleles must match to make a unique multilocus 43 # All alleles must match to make a unique multilocus
44 # genotype (“original” naive approach). This is the 44 # genotype (“original” naive approach). This is the
50 # To determine the distance threshold, we will generate 50 # To determine the distance threshold, we will generate
51 # a neighbor-joining tree for all samples. 51 # a neighbor-joining tree for all samples.
52 52
53 # Start PDF device driver. 53 # Start PDF device driver.
54 dev.new(width=20, height=30); 54 dev.new(width=20, height=30);
55 file_path = get_file_path("phylogeny_tree.pdf") 55 file_path = get_file_path("phylogeny_tree.pdf");
56 pdf(file=file_path, width=20, height=30, bg="white"); 56 pdf(file=file_path, width=20, height=30, bg="white");
57 # Create a phylogeny of samples based on distance matrices 57 # Create a phylogeny of samples based on distance matrices
58 # colors. 58 # colors.
59 cols <- c("skyblue2","#C38D9E", '#E8A87C',"darkcyan","#e27d60"); 59 cols <- c("skyblue2","#C38D9E", '#E8A87C',"darkcyan","#e27d60");
60 set.seed(999); 60 set.seed(999);
70 # Turn off device driver to flush output. 70 # Turn off device driver to flush output.
71 dev.off(); 71 dev.off();
72 72
73 # Start PDF device driver. 73 # Start PDF device driver.
74 dev.new(width=20, height=30); 74 dev.new(width=20, height=30);
75 file_path = get_file_path("dissimiliarity_distance_matrix.pdf") 75 file_path = get_file_path("dissimiliarity_distance_matrix.pdf");
76 pdf(file=file_path, width=20, height=30, bg="white"); 76 pdf(file=file_path, width=20, height=30, bg="white");
77 # Use of mlg.filter() will create a dissimiliarity distance 77 # Use of mlg.filter() will create a dissimiliarity distance
78 # matrix from the data and then filter based off of that 78 # matrix from the data and then filter based off of that
79 # matrix. Here we will use the bitwise distance matrix 79 # matrix. Here we will use the bitwise distance matrix
80 # calculated above. 80 # calculated above.
85 # Turn off device driver to flush output. 85 # Turn off device driver to flush output.
86 dev.off(); 86 dev.off();
87 87
88 # Start PDF device driver. 88 # Start PDF device driver.
89 dev.new(width=20, height=30); 89 dev.new(width=20, height=30);
90 file_path = get_file_path("filter_stats.pdf") 90 file_path = get_file_path("filter_stats.pdf");
91 pdf(file=file_path, width=20, height=30, bg="white"); 91 pdf(file=file_path, width=20, height=30, bg="white");
92 # Different clustering methods for tie breakers used by 92 # Different clustering methods for tie breakers used by
93 # mlg.filter, default is farthest neighbor. 93 # mlg.filter, default is farthest neighbor.
94 gclo_filtered <- filter_stats(gclo, distance=xdis, plot=TRUE); 94 gclo_filtered <- filter_stats(gclo, distance=xdis, plot=TRUE);
95 # Turn off device driver to flush output. 95 # Turn off device driver to flush output.
99 id <- mlg.id(gclo); 99 id <- mlg.id(gclo);
100 df <- data.frame(matrix((id), byrow=T)); 100 df <- data.frame(matrix((id), byrow=T));
101 101
102 # Start PDF device driver. 102 # Start PDF device driver.
103 dev.new(width=20, height=30); 103 dev.new(width=20, height=30);
104 file_path = get_file_path("genotype_accumulation_curve.pdf") 104 file_path = get_file_path("genotype_accumulation_curve.pdf");
105 pdf(file=file_path, width=20, height=30, bg="white"); 105 pdf(file=file_path, width=20, height=30, bg="white");
106 # We can use the genotype_curve() function to create a 106 # We can use the genotype_curve() function to create a
107 # genotype accumulation curve to determine the minimum 107 # genotype accumulation curve to determine the minimum
108 # number of loci to identify unique MLGs. 108 # number of loci to identify unique MLGs.
109 gac <- genotype_curve(gind, sample=5, quiet=TRUE); 109 gac <- genotype_curve(genind, sample=5, quiet=TRUE);
110 # Turn off device driver to flush output. 110 # Turn off device driver to flush output.
111 dev.off(); 111 dev.off();
112 112
113 # Start PDF device driver. 113 # Start PDF device driver.
114 dev.new(width=20, height=30); 114 dev.new(width=20, height=30);
115 file_path = get_file_path("genotype_accumulation_curve_for_gind.pdf") 115 file_path = get_file_path("genotype_accumulation_curve_for_genind.pdf");
116 pdf(file=file_path, width=20, height=30, bg="white"); 116 pdf(file=file_path, width=20, height=30, bg="white");
117 p <- last_plot(); 117 p <- last_plot();
118 p + geom_smooth() + xlim(0, 100) + theme_bw(); 118 p + geom_smooth() + xlim(0, 100) + theme_bw();
119 119
120 # From the collapsed MLGs, we can calculate genotypic 120 # From the collapsed MLGs, we can calculate genotypic