Mercurial > repos > davidvanzessen > mutation_analysis
comparison sequence_overview.r @ 82:564c4f6da203 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 24 May 2016 03:52:14 -0400 |
parents | a778156dad3d |
children | 0011a0597685 |
comparison
equal
deleted
inserted
replaced
81:a778156dad3d | 82:564c4f6da203 |
---|---|
2 | 2 |
3 args <- commandArgs(trailingOnly = TRUE) | 3 args <- commandArgs(trailingOnly = TRUE) |
4 | 4 |
5 gene.matches = args[1] | 5 gene.matches = args[1] |
6 sequence.file = args[2] | 6 sequence.file = args[2] |
7 outputdir = args[3] | 7 merged.file = args[3] |
8 outputdir = args[4] | |
9 gene.classes = unlist(strsplit(args[5], ",")) | |
10 hotspot.analysis.sum.file = args[6] | |
8 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/") | 11 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/") |
9 NTsum.file = paste(outputdir, "ntsum.txt", sep="/") | 12 NTsum.file = paste(outputdir, "ntsum.txt", sep="/") |
10 main.html = "index.html" | 13 main.html = "index.html" |
11 | 14 |
12 setwd(outputdir) | 15 setwd(outputdir) |
13 | 16 |
14 genes = read.table(gene.matches, header=T, sep="\t", fill=T) | 17 genes = read.table(gene.matches, header=T, sep="\t", fill=T) |
15 sequences = read.table(sequence.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | 18 sequences = read.table(sequence.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") |
19 merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | |
20 hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="") | |
16 | 21 |
17 dat = merge(sequences, genes, by="Sequence.ID") | 22 dat = merge(sequences, genes, by="Sequence.ID") |
23 | |
24 dat = dat[dat$Sequence.ID %in% merged$Sequence.ID,] | |
18 | 25 |
19 dat$seq_conc = paste(dat$CDR1.IMGT, dat$CDR2.IMGT, dat$CDR3.IMGT, dat$FR2.IMGT, dat$FR3.IMGT) | 26 dat$seq_conc = paste(dat$CDR1.IMGT, dat$CDR2.IMGT, dat$CDR3.IMGT, dat$FR2.IMGT, dat$FR3.IMGT) |
20 | 27 |
21 IDs = dat[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")] | 28 IDs = dat[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")] |
22 IDs$best_match = as.character(IDs$best_match) | 29 IDs$best_match = as.character(IDs$best_match) |
23 | 30 |
24 #dat = data.frame(data.table(dat)[, list(freq=.N), by=c("best_match", "seq_conc")]) | 31 #dat = data.frame(data.table(dat)[, list(freq=.N), by=c("best_match", "seq_conc")]) |
25 | 32 |
26 dat = data.frame(table(dat$seq_conc)) | 33 dat = data.frame(table(dat$seq_conc, dat$Functionality)) |
27 | 34 |
28 dat = dat[dat$Freq > 1,] | 35 dat = dat[dat$Freq > 1,] |
29 | 36 |
30 names(dat) = c("seq_conc", "Freq") | 37 names(dat) = c("seq_conc", "Functionality", "Freq") |
31 | 38 |
32 dat$seq_conc = factor(dat$seq_conc) | 39 dat$seq_conc = factor(dat$seq_conc) |
33 | 40 |
34 dat = dat[order(nchar(as.character(dat$seq_conc))),] | 41 dat = dat[order(as.character(dat$seq_conc)),] |
35 | 42 |
36 #writing html from R... | 43 #writing html from R... |
37 td = function(val) { paste("<td>", val, "</td>", sep="") } | 44 td = function(val) { paste("<td>", val, "</td>", sep="") } |
38 tr = function(val) { capture.output(cat("<tr>", td(val), "</tr>", sep="")) } | 45 tr = function(val) { capture.output(cat("<tr>", td(val), "</tr>", sep="")) } |
39 make.link = function(id, clss, val) { paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="") } | 46 make.link = function(id, clss, val) { paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="") } |
52 cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg3",] | 59 cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg3",] |
53 cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg4",] | 60 cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg4",] |
54 | 61 |
55 cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cm",] | 62 cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cm",] |
56 | 63 |
64 classes = c(nrow(ca1), nrow(ca2), nrow(cg1), nrow(cg2), nrow(cg3), nrow(cg4), nrow(cm)) | |
65 | |
66 classes.sum = sum(classes) | |
67 | |
68 if(!any(classes != 0 & classes != classes.sum)){ | |
69 next | |
70 } | |
71 | |
57 id = as.numeric(dat[i,"seq_conc"]) | 72 id = as.numeric(dat[i,"seq_conc"]) |
58 | 73 |
59 functionality = dat[i,"Functionality"] | 74 functionality = dat[i,"Functionality"] |
60 | 75 |
61 if(nrow(ca1) > 0){ | 76 if(nrow(ca1) > 0){ |
111 NToverview = genes[,c("Sequence.ID", "best_match")] | 126 NToverview = genes[,c("Sequence.ID", "best_match")] |
112 sequences$seq = paste(sequences$CDR2.IMGT, sequences$CDR2.IMGT, sequences$FR2.IMGT, sequences$FR3.IMGT, sep="_") | 127 sequences$seq = paste(sequences$CDR2.IMGT, sequences$CDR2.IMGT, sequences$FR2.IMGT, sequences$FR3.IMGT, sep="_") |
113 | 128 |
114 NToverview = merge(NToverview, sequences[,c("Sequence.ID", "seq")], by="Sequence.ID") | 129 NToverview = merge(NToverview, sequences[,c("Sequence.ID", "seq")], by="Sequence.ID") |
115 | 130 |
131 NToverview = NToverview[NToverview$Sequence.ID %in% merged$Sequence.ID,] | |
132 | |
116 NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq)) | 133 NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq)) |
117 NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq)) | 134 NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq)) |
118 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq)) | 135 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq)) |
119 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq)) | 136 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq)) |
120 | 137 |
121 NTsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T)) | 138 #Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T)) |
122 | 139 |
123 print(names(NToverview)) | 140 #NToverview = rbind(NToverview, NTsum) |
124 print(names(NTsum)) | 141 |
125 | 142 NTresult = data.frame(nt=c("A", "C", "T", "G")) |
126 NToverview = rbind(NToverview, NTsum) | 143 |
144 for(clazz in gene.classes){ | |
145 NToverview.sub = NToverview[grepl(clazz, NToverview$best_match),] | |
146 new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G)) | |
147 new.col.y = sum(new.col.x) | |
148 new.col.z = round(new.col.x / new.col.y * 100, 2) | |
149 | |
150 tmp = names(NTresult) | |
151 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z)) | |
152 names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep="")) | |
153 } | |
154 | |
155 new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G)) | |
156 new.col.y = sum(new.col.x) | |
157 new.col.z = round(new.col.x / new.col.y * 100, 2) | |
158 | |
159 tmp = names(NTresult) | |
160 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z)) | |
161 names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep="")) | |
162 | |
163 names(hotspot.analysis.sum) = names(NTresult) | |
164 | |
165 hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult) | |
166 | |
167 print(hotspot.analysis.sum) | |
168 | |
169 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0") | |
170 | |
127 | 171 |
128 write.table(NToverview, NToverview.file, quote=F, sep="\t", row.names=F, col.names=T) | 172 write.table(NToverview, NToverview.file, quote=F, sep="\t", row.names=F, col.names=T) |
129 #write.table(NTsum, NTsum.file, quote=F, sep="\t", row.names=F, col.names=T) | 173 |
130 | 174 |
131 | 175 |
132 | 176 |
133 | 177 |
134 | 178 |
135 | 179 |
136 | 180 |
137 | 181 |
138 | 182 |
139 | 183 |
140 | 184 |
141 | 185 |
142 | 186 |
143 | 187 |
144 | 188 |
145 | 189 |
146 | 190 |
147 | 191 |
148 | 192 |
149 | 193 |
150 | 194 |
151 | 195 |
152 | 196 |
153 | 197 |
154 | 198 |
155 | 199 |
156 | 200 |
157 | 201 |
158 | 202 |
159 |