comparison sequence_overview.r @ 82:564c4f6da203 draft

Uploaded
author davidvanzessen
date Tue, 24 May 2016 03:52:14 -0400
parents a778156dad3d
children 0011a0597685
comparison
equal deleted inserted replaced
81:a778156dad3d 82:564c4f6da203
2 2
3 args <- commandArgs(trailingOnly = TRUE) 3 args <- commandArgs(trailingOnly = TRUE)
4 4
5 gene.matches = args[1] 5 gene.matches = args[1]
6 sequence.file = args[2] 6 sequence.file = args[2]
7 outputdir = args[3] 7 merged.file = args[3]
8 outputdir = args[4]
9 gene.classes = unlist(strsplit(args[5], ","))
10 hotspot.analysis.sum.file = args[6]
8 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/") 11 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")
9 NTsum.file = paste(outputdir, "ntsum.txt", sep="/") 12 NTsum.file = paste(outputdir, "ntsum.txt", sep="/")
10 main.html = "index.html" 13 main.html = "index.html"
11 14
12 setwd(outputdir) 15 setwd(outputdir)
13 16
14 genes = read.table(gene.matches, header=T, sep="\t", fill=T) 17 genes = read.table(gene.matches, header=T, sep="\t", fill=T)
15 sequences = read.table(sequence.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") 18 sequences = read.table(sequence.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
19 merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
20 hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")
16 21
17 dat = merge(sequences, genes, by="Sequence.ID") 22 dat = merge(sequences, genes, by="Sequence.ID")
23
24 dat = dat[dat$Sequence.ID %in% merged$Sequence.ID,]
18 25
19 dat$seq_conc = paste(dat$CDR1.IMGT, dat$CDR2.IMGT, dat$CDR3.IMGT, dat$FR2.IMGT, dat$FR3.IMGT) 26 dat$seq_conc = paste(dat$CDR1.IMGT, dat$CDR2.IMGT, dat$CDR3.IMGT, dat$FR2.IMGT, dat$FR3.IMGT)
20 27
21 IDs = dat[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")] 28 IDs = dat[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")]
22 IDs$best_match = as.character(IDs$best_match) 29 IDs$best_match = as.character(IDs$best_match)
23 30
24 #dat = data.frame(data.table(dat)[, list(freq=.N), by=c("best_match", "seq_conc")]) 31 #dat = data.frame(data.table(dat)[, list(freq=.N), by=c("best_match", "seq_conc")])
25 32
26 dat = data.frame(table(dat$seq_conc)) 33 dat = data.frame(table(dat$seq_conc, dat$Functionality))
27 34
28 dat = dat[dat$Freq > 1,] 35 dat = dat[dat$Freq > 1,]
29 36
30 names(dat) = c("seq_conc", "Freq") 37 names(dat) = c("seq_conc", "Functionality", "Freq")
31 38
32 dat$seq_conc = factor(dat$seq_conc) 39 dat$seq_conc = factor(dat$seq_conc)
33 40
34 dat = dat[order(nchar(as.character(dat$seq_conc))),] 41 dat = dat[order(as.character(dat$seq_conc)),]
35 42
36 #writing html from R... 43 #writing html from R...
37 td = function(val) { paste("<td>", val, "</td>", sep="") } 44 td = function(val) { paste("<td>", val, "</td>", sep="") }
38 tr = function(val) { capture.output(cat("<tr>", td(val), "</tr>", sep="")) } 45 tr = function(val) { capture.output(cat("<tr>", td(val), "</tr>", sep="")) }
39 make.link = function(id, clss, val) { paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="") } 46 make.link = function(id, clss, val) { paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="") }
52 cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg3",] 59 cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg3",]
53 cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg4",] 60 cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cg4",]
54 61
55 cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cm",] 62 cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & IDs$best_match == "cm",]
56 63
64 classes = c(nrow(ca1), nrow(ca2), nrow(cg1), nrow(cg2), nrow(cg3), nrow(cg4), nrow(cm))
65
66 classes.sum = sum(classes)
67
68 if(!any(classes != 0 & classes != classes.sum)){
69 next
70 }
71
57 id = as.numeric(dat[i,"seq_conc"]) 72 id = as.numeric(dat[i,"seq_conc"])
58 73
59 functionality = dat[i,"Functionality"] 74 functionality = dat[i,"Functionality"]
60 75
61 if(nrow(ca1) > 0){ 76 if(nrow(ca1) > 0){
111 NToverview = genes[,c("Sequence.ID", "best_match")] 126 NToverview = genes[,c("Sequence.ID", "best_match")]
112 sequences$seq = paste(sequences$CDR2.IMGT, sequences$CDR2.IMGT, sequences$FR2.IMGT, sequences$FR3.IMGT, sep="_") 127 sequences$seq = paste(sequences$CDR2.IMGT, sequences$CDR2.IMGT, sequences$FR2.IMGT, sequences$FR3.IMGT, sep="_")
113 128
114 NToverview = merge(NToverview, sequences[,c("Sequence.ID", "seq")], by="Sequence.ID") 129 NToverview = merge(NToverview, sequences[,c("Sequence.ID", "seq")], by="Sequence.ID")
115 130
131 NToverview = NToverview[NToverview$Sequence.ID %in% merged$Sequence.ID,]
132
116 NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq)) 133 NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))
117 NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq)) 134 NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))
118 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq)) 135 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))
119 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq)) 136 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))
120 137
121 NTsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T)) 138 #Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))
122 139
123 print(names(NToverview)) 140 #NToverview = rbind(NToverview, NTsum)
124 print(names(NTsum)) 141
125 142 NTresult = data.frame(nt=c("A", "C", "T", "G"))
126 NToverview = rbind(NToverview, NTsum) 143
144 for(clazz in gene.classes){
145 NToverview.sub = NToverview[grepl(clazz, NToverview$best_match),]
146 new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
147 new.col.y = sum(new.col.x)
148 new.col.z = round(new.col.x / new.col.y * 100, 2)
149
150 tmp = names(NTresult)
151 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
152 names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))
153 }
154
155 new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))
156 new.col.y = sum(new.col.x)
157 new.col.z = round(new.col.x / new.col.y * 100, 2)
158
159 tmp = names(NTresult)
160 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
161 names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep=""))
162
163 names(hotspot.analysis.sum) = names(NTresult)
164
165 hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)
166
167 print(hotspot.analysis.sum)
168
169 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
170
127 171
128 write.table(NToverview, NToverview.file, quote=F, sep="\t", row.names=F, col.names=T) 172 write.table(NToverview, NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
129 #write.table(NTsum, NTsum.file, quote=F, sep="\t", row.names=F, col.names=T) 173
130 174
131 175
132 176
133 177
134 178
135 179
136 180
137 181
138 182
139 183
140 184
141 185
142 186
143 187
144 188
145 189
146 190
147 191
148 192
149 193
150 194
151 195
152 196
153 197
154 198
155 199
156 200
157 201
158 202
159