Mercurial > repos > davidvanzessen > mutation_analysis
changeset 98:5ffbf40cdd4b draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 16 Jun 2016 05:05:47 -0400 |
parents | 6e8dfbe164c6 |
children | 86206431cbb0 |
files | merge_and_filter.r mutation_analysis.py mutation_analysis.r sequence_overview.r wrapper.sh |
diffstat | 5 files changed, 33 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/merge_and_filter.r Wed Jun 15 04:48:41 2016 -0400 +++ b/merge_and_filter.r Thu Jun 16 05:05:47 2016 -0400 @@ -116,6 +116,11 @@ print(paste("Number of sequences in result after merging with sequences:", nrow(result))) +print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) +print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) +print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) +print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == ""))) + result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result))) @@ -170,6 +175,7 @@ } print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result))) +print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),]))) print(paste("Number of rows in result:", nrow(result))) print(paste("Number of rows in unmatched:", nrow(unmatched)))
--- a/mutation_analysis.py Wed Jun 15 04:48:41 2016 -0400 +++ b/mutation_analysis.py Thu Jun 16 05:05:47 2016 -0400 @@ -161,12 +161,12 @@ sys.exit() hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") -RGYWCount = {g: 0 for g in genes} -WRCYCount = {g: 0 for g in genes} -WACount = {g: 0 for g in genes} -TWCount = {g: 0 for g in genes} +RGYWCount = {} +WRCYCount = {} +WACount = {} +TWCount = {} -IDIndex = 0 +#IDIndex = 0 ataIndex = 0 tatIndex = 0 aggctatIndex = 0 @@ -185,6 +185,8 @@ linesplt = line.split("\t") gene = linesplt[best_matchIndex] ID = linesplt[IDIndex] + if ID == "ca2": + print linesplt RGYW = [(int(x), int(y), z) for (x, y, z) in [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] WRCY = [(int(x), int(y), z) for (x, y, z) in @@ -249,12 +251,14 @@ def get_xyz(lst, gene, f, fname): x = int(round(f(lst))) y = valuedic[gene + "_" + fname] - z = str(round(x / float(valuedic[gene + "_" + fname]) * 100, 1)) if valuedic[gene + "_" + fname] != 0 else "0" + z = str(round(x / float(y) * 100, 1)) if y != 0 else "0" return (str(x), str(y), z) dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} arr = ["RGYW", "WRCY", "WA", "TW"] +geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} + for fname in funcs.keys(): func = funcs[fname] foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" @@ -263,14 +267,14 @@ o.write(typ + " (%)") curr = dic[typ] for gene in genes: - geneMatcher = re.compile("^" + gene + ".*") + geneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop.... if valuedic[gene + "_" + fname] is 0: o.write(",0,0,0") else: x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) o.write("," + x + "," + y + "," + z) - # for total - x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) + + x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname) o.write("," + x + "," + y + "," + z + "\n")
--- a/mutation_analysis.r Wed Jun 15 04:48:41 2016 -0400 +++ b/mutation_analysis.r Thu Jun 16 05:05:47 2016 -0400 @@ -295,7 +295,7 @@ matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i]) } - matrx = calculate_result(i + 1, ".*", dat, matrx, func, fname, name="all") + matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all") result = data.frame(matrx) if(fname == "sum"){
--- a/sequence_overview.r Wed Jun 15 04:48:41 2016 -0400 +++ b/sequence_overview.r Thu Jun 16 05:05:47 2016 -0400 @@ -41,11 +41,14 @@ make.link = function(id, clss, val) { paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="") } tbl = function(df) { res = "<table border='1'>"; for(i in 1:nrow(df)){ res = paste(res, tr(df[i,]), sep=""); }; res = paste(res, "</table>"); } +print(paste("Number of unique sequences to be written to the sequence overview page", nrow(dat))) + cat("<table border='1'>", file=main.html, append=F) cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T) cat("<tr><th>Sequence</th><th>Functionality</th><th>ca1</th><th>ca2</th><th>cg1</th><th>cg2</th><th>cg3</th><th>cg4</th><th>cm</th></tr>", file=main.html, append=T) for(i in 1:nrow(dat)){ + ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca1", IDs$best_match),] ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca2", IDs$best_match),] @@ -62,7 +65,10 @@ classes.sum = sum(classes) if(classes.sum == 1){ + print(paste("next", i, classes.sum)) next + } else { + print(i) } id = as.numeric(dat[i,"seq_conc"]) @@ -144,6 +150,10 @@ names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep="")) } +write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\t", row.names=F, col.names=T) + +NToverview = NToverview[!grepl("unmatched", NToverview$best_match),] + new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G)) new.col.y = sum(new.col.x) new.col.z = round(new.col.x / new.col.y * 100, 2) @@ -160,8 +170,6 @@ write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0") -write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\t", row.names=F, col.names=T) -
--- a/wrapper.sh Wed Jun 15 04:48:41 2016 -0400 +++ b/wrapper.sh Thu Jun 16 05:05:47 2016 -0400 @@ -77,7 +77,7 @@ Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} 2>&1 -echo "---------------- creating new IMGT zip ----------------<br />" +echo "---------------- creating new IMGT zip ----------------" echo "---------------- creating new IMGT zip ----------------<br />" >> $output mkdir $outdir/new_IMGT @@ -150,7 +150,7 @@ #display the matched/unmatched for clearity -matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`" +matched_count="`cat $outdir/merged.txt | grep -v 'unmatched' | tail -n +2 | wc -l`" unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`" total_count=$((matched_count + unmatched_count)) perc_count=$((unmatched_count / total_count * 100)) @@ -169,7 +169,7 @@ cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt - echo "<table border='1' width='100%'><caption><h3>${func} table</h3></caption>" >> $output + echo "<table border='1' width='100%'><caption><h3><a href='data_${func}.txt'>${func} table</a></h3></caption>" >> $output echo "<tr><th>info</th>" >> $output for gene in ${genes[@]} do