changeset 31:c623690e3b81 draft

Uploaded
author davidvanzessen
date Thu, 09 Apr 2015 08:52:18 -0400
parents 7e44617c9ca4
children 2a7343e4be5a
files aa_histogram.r mutation_analysis.py mutation_analysis.r wrapper.sh
diffstat 2 files changed, 20 insertions(+), 42 deletions(-) [+]
line wrap: on
line diff
--- a/mutation_analysis.py	Thu Apr 09 07:59:06 2015 -0400
+++ b/mutation_analysis.py	Thu Apr 09 08:52:18 2015 -0400
@@ -4,12 +4,14 @@
 parser = argparse.ArgumentParser()
 parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
 parser.add_argument("--genes", help="The genes available in the 'best_match' column")
+parser.add_argument("--includefr1", help="The genes available in the 'best_match' column")
 parser.add_argument("--output", help="Output file")
 
 args = parser.parse_args()
 
 infile = args.input
 genes = str(args.genes).split(",")
+include_fr1 = True if args.includefr1 == "yes" else False
 outfile = args.output 
 
 genedic = dict()
@@ -46,7 +48,7 @@
 		linesplt = line.split("\t")
 		ID = linesplt[IDIndex]
 		genedic[ID] = linesplt[best_matchIndex]
-		mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x]
+		mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else []
 		mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x]
 		mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x]
 		mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]
@@ -108,46 +110,22 @@
 		WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
 		TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
 		RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0,0,0,0
-		for (x,y,z) in RGYW: #RGYWCount[ID] = sum([1 for (x,y,z) in RGYW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
-			if not z or z == "CDR3":
-				continue
-			mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])
-			in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW])
-			if mutations_in_motif > 0:
-				RGYWCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1
-				if in_other_motifs > 1:
-					print in_other_motifs, ID, "RGYW", x, y, ([(x,y,z) for (xother, yother, zother) in WRCY if ((x <= int(xother) <= y) and (x <= int(yother) <= y))] + [(x,y,z) for (xother, yother, zother) in WA if ((x <= int(xother) <= y) and (x <= int(yother) <= y))] + [(x,y,z) for (xother, yother, zother) in TW if ((x <= int(xother) <= y) and (x <= int(yother) <= y))])
 		
-		for (x,y,z) in WRCY: #WRCYCount[ID] = sum([1 for (x,y,z) in WRCY if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
-			if not z or z == "CDR3":
-				continue
-			mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])
-			in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW])
-			if mutations_in_motif > 0:
-				WRCYCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1
-				if in_other_motifs > 1:
-					print in_other_motifs, ID, "WRCY", x, y
-		
-		for (x,y,z) in WA: #WACount[ID] = sum([1 for (x,y,z) in WA if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
-			if not z or z == "CDR3":
-				continue
-			mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])
-			in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW])
-			if mutations_in_motif > 0:
-				WACount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1
-				if in_other_motifs > 1:
-					print in_other_motifs, ID, "WA", x, y
-		
-		for (x,y,z) in TW: #TWCount[ID] = sum([1 for (x,y,z) in TW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
-			if not z or z == "CDR3":
-				continue
-			mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])
-			in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA])
-			if mutations_in_motif > 0:
-				TWCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1
-				if in_other_motifs > 1:
-					print in_other_motifs, ID, "TW", x, y
-
+		mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+		for mutation in mutationList:
+			frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
+			mutation_in_RGYW = any([(start <= int(where) <= end) for (start,end,region) in RGYW])
+			mutation_in_WRCY = any([(start <= int(where) <= end) for (start,end,region) in WRCY])
+			mutation_in_WA = any([(start <= int(where) <= end) for (start,end,region) in WA])
+			mutation_in_TW = any([(start <= int(where) <= end) for (start,end,region) in TW])
+			
+			in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW])
+			
+			if in_how_many_motifs > 0:
+				RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs
+				WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs
+				WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs
+				TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs
 
 directory = outfile[:outfile.rfind("/") + 1]
 value = 0
@@ -189,4 +167,4 @@
 			o.write("ID\tRGYWC\tWRCY\tWA\tTW\n")
 			first = False
 			continue
-		o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n")
+		o.write(ID + "\t" + str(int(RGYWCount[ID])) + "\t" + str(int(WRCYCount[ID])) + "\t" + str(int(WACount[ID])) + "\t" + str(int(TWCount[ID])) + "\n")
--- a/wrapper.sh	Thu Apr 09 07:59:06 2015 -0400
+++ b/wrapper.sh	Thu Apr 09 08:52:18 2015 -0400
@@ -46,7 +46,7 @@
 echo "R mutation analysis"
 Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1
 echo "python mutation analysis"
-python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --output $outdir/hotspot_analysis.txt
+python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${inlcude_fr1}" --output $outdir/hotspot_analysis.txt
 echo "R AA histogram"
 Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1