Mercurial > repos > davidvanzessen > mutation_analysis
changeset 31:c623690e3b81 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 09 Apr 2015 08:52:18 -0400 |
parents | 7e44617c9ca4 |
children | 2a7343e4be5a |
files | aa_histogram.r mutation_analysis.py mutation_analysis.r wrapper.sh |
diffstat | 2 files changed, 20 insertions(+), 42 deletions(-) [+] |
line wrap: on
line diff
--- a/mutation_analysis.py Thu Apr 09 07:59:06 2015 -0400 +++ b/mutation_analysis.py Thu Apr 09 08:52:18 2015 -0400 @@ -4,12 +4,14 @@ parser = argparse.ArgumentParser() parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") parser.add_argument("--genes", help="The genes available in the 'best_match' column") +parser.add_argument("--includefr1", help="The genes available in the 'best_match' column") parser.add_argument("--output", help="Output file") args = parser.parse_args() infile = args.input genes = str(args.genes).split(",") +include_fr1 = True if args.includefr1 == "yes" else False outfile = args.output genedic = dict() @@ -46,7 +48,7 @@ linesplt = line.split("\t") ID = linesplt[IDIndex] genedic[ID] = linesplt[best_matchIndex] - mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] + mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] @@ -108,46 +110,22 @@ WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0,0,0,0 - for (x,y,z) in RGYW: #RGYWCount[ID] = sum([1 for (x,y,z) in RGYW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) - if not z or z == "CDR3": - continue - mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) - in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) - if mutations_in_motif > 0: - RGYWCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 - if in_other_motifs > 1: - print in_other_motifs, ID, "RGYW", x, y, ([(x,y,z) for (xother, yother, zother) in WRCY if ((x <= int(xother) <= y) and (x <= int(yother) <= y))] + [(x,y,z) for (xother, yother, zother) in WA if ((x <= int(xother) <= y) and (x <= int(yother) <= y))] + [(x,y,z) for (xother, yother, zother) in TW if ((x <= int(xother) <= y) and (x <= int(yother) <= y))]) - for (x,y,z) in WRCY: #WRCYCount[ID] = sum([1 for (x,y,z) in WRCY if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) - if not z or z == "CDR3": - continue - mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) - in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) - if mutations_in_motif > 0: - WRCYCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 - if in_other_motifs > 1: - print in_other_motifs, ID, "WRCY", x, y - - for (x,y,z) in WA: #WACount[ID] = sum([1 for (x,y,z) in WA if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) - if not z or z == "CDR3": - continue - mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) - in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in TW]) - if mutations_in_motif > 0: - WACount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 - if in_other_motifs > 1: - print in_other_motifs, ID, "WA", x, y - - for (x,y,z) in TW: #TWCount[ID] = sum([1 for (x,y,z) in TW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) - if not z or z == "CDR3": - continue - mutations_in_motif = sum([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]]) - in_other_motifs = sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in RGYW]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WRCY]) + sum([((x <= int(xother) <= y) and (x <= int(yother) <= y)) for (xother, yother, zother) in WA]) - if mutations_in_motif > 0: - TWCount[ID] += 1.0 / in_other_motifs if in_other_motifs > 0 else 1 - if in_other_motifs > 1: - print in_other_motifs, ID, "TW", x, y - + mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + for mutation in mutationList: + frm, where, to, AAfrm, AAwhere, AAto, junk = mutation + mutation_in_RGYW = any([(start <= int(where) <= end) for (start,end,region) in RGYW]) + mutation_in_WRCY = any([(start <= int(where) <= end) for (start,end,region) in WRCY]) + mutation_in_WA = any([(start <= int(where) <= end) for (start,end,region) in WA]) + mutation_in_TW = any([(start <= int(where) <= end) for (start,end,region) in TW]) + + in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) + + if in_how_many_motifs > 0: + RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs + WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs + WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs + TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs directory = outfile[:outfile.rfind("/") + 1] value = 0 @@ -189,4 +167,4 @@ o.write("ID\tRGYWC\tWRCY\tWA\tTW\n") first = False continue - o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n") + o.write(ID + "\t" + str(int(RGYWCount[ID])) + "\t" + str(int(WRCYCount[ID])) + "\t" + str(int(WACount[ID])) + "\t" + str(int(TWCount[ID])) + "\n")
--- a/wrapper.sh Thu Apr 09 07:59:06 2015 -0400 +++ b/wrapper.sh Thu Apr 09 08:52:18 2015 -0400 @@ -46,7 +46,7 @@ echo "R mutation analysis" Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir ${include_fr1} 2>&1 echo "python mutation analysis" -python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --output $outdir/hotspot_analysis.txt +python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${inlcude_fr1}" --output $outdir/hotspot_analysis.txt echo "R AA histogram" Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1