comparison sequence_overview.r @ 99:86206431cbb0 draft

Uploaded
author davidvanzessen
date Thu, 16 Jun 2016 10:01:54 -0400
parents 5ffbf40cdd4b
children ff5be711382b
comparison
equal deleted inserted replaced
98:5ffbf40cdd4b 99:86206431cbb0
1 library(reshape2) 1 library(reshape2)
2 2
3 args <- commandArgs(trailingOnly = TRUE) 3 args <- commandArgs(trailingOnly = TRUE)
4 4
5 input.file = args[1] 5 input.file = args[1]
6 input.file = args[2] 6 outputdir = args[2]
7 outputdir = args[3] 7 gene.classes = unlist(strsplit(args[3], ","))
8 gene.classes = unlist(strsplit(args[4], ",")) 8 hotspot.analysis.sum.file = args[4]
9 hotspot.analysis.sum.file = args[5]
10 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/") 9 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")
11 NTsum.file = paste(outputdir, "ntsum.txt", sep="/") 10 NTsum.file = paste(outputdir, "ntsum.txt", sep="/")
12 main.html = "index.html" 11 main.html = "index.html"
13 12
14 setwd(outputdir) 13 setwd(outputdir)
39 td = function(val) { paste("<td>", val, "</td>", sep="") } 38 td = function(val) { paste("<td>", val, "</td>", sep="") }
40 tr = function(val) { capture.output(cat("<tr>", td(val), "</tr>", sep="")) } 39 tr = function(val) { capture.output(cat("<tr>", td(val), "</tr>", sep="")) }
41 make.link = function(id, clss, val) { paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="") } 40 make.link = function(id, clss, val) { paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="") }
42 tbl = function(df) { res = "<table border='1'>"; for(i in 1:nrow(df)){ res = paste(res, tr(df[i,]), sep=""); }; res = paste(res, "</table>"); } 41 tbl = function(df) { res = "<table border='1'>"; for(i in 1:nrow(df)){ res = paste(res, tr(df[i,]), sep=""); }; res = paste(res, "</table>"); }
43 42
44 print(paste("Number of unique sequences to be written to the sequence overview page", nrow(dat)))
45
46 cat("<table border='1'>", file=main.html, append=F) 43 cat("<table border='1'>", file=main.html, append=F)
47 cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T) 44 cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
48 cat("<tr><th>Sequence</th><th>Functionality</th><th>ca1</th><th>ca2</th><th>cg1</th><th>cg2</th><th>cg3</th><th>cg4</th><th>cm</th></tr>", file=main.html, append=T) 45 cat("<tr><th>Sequence</th><th>Functionality</th><th>ca1</th><th>ca2</th><th>cg1</th><th>cg2</th><th>cg3</th><th>cg4</th><th>cm</th></tr>", file=main.html, append=T)
46
47
48
49 single.sequences=0 #sequence only found once, skipped
50 in.multiple=0 #same sequence across multiple subclasses
51 multiple.in.one=0 #same sequence multiple times in one subclass
52 matched=0 #should be the same als matched sequences
49 53
50 for(i in 1:nrow(dat)){ 54 for(i in 1:nrow(dat)){
51 55
52 ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca1", IDs$best_match),] 56 ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca1", IDs$best_match),]
53 ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca2", IDs$best_match),] 57 ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca2", IDs$best_match),]
63 classes = c(nrow(ca1), nrow(ca2), nrow(cg1), nrow(cg2), nrow(cg3), nrow(cg4), nrow(cm)) 67 classes = c(nrow(ca1), nrow(ca2), nrow(cg1), nrow(cg2), nrow(cg3), nrow(cg4), nrow(cm))
64 68
65 classes.sum = sum(classes) 69 classes.sum = sum(classes)
66 70
67 if(classes.sum == 1){ 71 if(classes.sum == 1){
68 print(paste("next", i, classes.sum)) 72 single.sequences = single.sequences + 1
69 next 73 next
74 }
75
76 matched = matched + sum(classes > 0) #count in how many subclasses the sequence occurs.
77
78 if(any(classes == classes.sum)){
79 in.multiple = in.multiple + 1
70 } else { 80 } else {
71 print(i) 81 multiple.in.one = multiple.in.one + 1
72 } 82 }
73 83
74 id = as.numeric(dat[i,"seq_conc"]) 84 id = as.numeric(dat[i,"seq_conc"])
75 85
76 functionality = paste(unique(allc[,"Functionality"]), collapse=",") 86 functionality = paste(unique(allc[,"Functionality"]), collapse=",")
118 cat(tr(rw), file=main.html, append=T) 128 cat(tr(rw), file=main.html, append=T)
119 } 129 }
120 130
121 cat("</table>", file=main.html, append=T) 131 cat("</table>", file=main.html, append=T)
122 132
133 print(paste("Single sequences:", single.sequences))
134 print(paste("Sequences in multiple subclasses:", in.multiple))
135 print(paste("Multiple sequences in one subclass:", multiple.in.one))
136 print(paste("Count that should match 'matched' sequences:", matched))
123 137
124 #ACGT overview 138 #ACGT overview
125 139
126 NToverview = merged 140 NToverview = merged
127 NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq, sep="_") 141 NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq, sep="_")
128 142
129 NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq)) 143 NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))
130 NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq)) 144 NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))
131 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq)) 145 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))
132 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq)) 146 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))
133
134 print(sum(colSums(NToverview[,c("A", "C", "T", "G")])))
135 147
136 #Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T)) 148 #Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))
137 149
138 #NToverview = rbind(NToverview, NTsum) 150 #NToverview = rbind(NToverview, NTsum)
139 151
164 176
165 names(hotspot.analysis.sum) = names(NTresult) 177 names(hotspot.analysis.sum) = names(NTresult)
166 178
167 hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult) 179 hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)
168 180
169 print(hotspot.analysis.sum)
170
171 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0") 181 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
172 182
173 183
174 184
175 185