Mercurial > repos > davidvanzessen > mutation_analysis
changeset 4:069419cccba4 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 22 Sep 2014 10:19:36 -0400 |
parents | a0b27058dcac |
children | 71a12810eff3 |
files | HS12RSS.txt HS23RSS.txt filter.r gene_identification.py logo.pm merge_and_filter.r mutation_analysis.py mutation_analysis.r piechart.r seqlogo template.eps template.pm wrapper.sh |
diffstat | 13 files changed, 486 insertions(+), 3256 deletions(-) [+] |
line wrap: on
line diff
--- a/HS12RSS.txt Wed Sep 17 07:25:17 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ -Gene Sequence -IGKV2D-18 CACAGTGGTACAACCCTGAACAGAAACC -IGKV1D-39 CACAGTGTTACAAGTCATAACATAAACC -TRBJ2-5 CACGGCCCCCGAGCCCCGCACAAAAACC -TRBJ2-6 CACAGCCCGGGGACTCCCCGCAAAAACC -IGKV1-37 CACAGTGTTACAAGGCATAACATAAACC -TRBJ2-1 CACAGTGGGAAGGGGCTGCCCAGAATTC -TRBJ2-2 CACAGCCCTGGGGACCCTGGCGCAAACC -IGKV1-33 CACAGTGTAACAAGTCATAACATAAATC -IGKV3-11 CACAGTGATTCCACATGAAACAAAAACC -IGKV1D-33 CACAGTGTAACAAGTCATAACATAAATC -IGKV1-39 CACAGTGTTACAAGTCGTAACATAAACC -IGHD3-22 CACAGTGTCACAGAGTCCATCAAAAACT -TRAJ31 CACAGCACTTCCTGCCTTTACTGAAACC -TRAJ32 CACAGTCCTTCAGAGCCTTACACTAACC -TRAJ33 CACAGACACAAAAACCTTAACAAAAACC -TRAJ34 CACAGTGATACTGAGATCTACAAAAACC -TRAJ36 CACAGTGTTTCTGCCCAGTACAAAAACA -TRAJ37 CACTCTAATGCTGTACTTTACAAAAACT -TRAJ38 CACAGTCATAGAAAGCTTTACCAAAACC -TRAJ39 CACAGTGATCTTCGGCTGAGCAAAAACC -TRBJ1-2 CACATAAGAATATAGCCACTCTAAAAGG -IGHD2-2 CACAGTGACACAGCCCCATTCCCAAAGC -IGHD1-1 CACCGTGACTCGGGGCTGTTCAGAATCT -IGKV1-27 CACTGTGATACAAGCCCGAACATAAACC -TRBJ1-6 CACAGCTGCAGAGGCTTAGATAAAACCC -TRBJ1-5 CACAGTGCATCATGAGTGTGGCAAACCC -TRBJ1-4 CACAACATTAAAGACTGGAAGGAAAACC -TRBJ1-3 CACAGCCTCCCAGGGCCACTTCAAAACC -IGKV1-6 CACAGTGTTACACACCCGAACAGAAACC -TRBJ2-4 CACAGCCCCGAGACGCGGCACAGAAACT -IGKV1-5 CACAGTGTTACACACCCGAACATAAACC -TRAJ29 CACAGTGATTTCCTCCATAACAAAAACC -TRAJ28 CACAGAGTTTCCTTTCTTTGCAAAAACC -TRAJ27 CACAGTCTTTAGTGCTATTGCAATAACC -TRAJ26 CACAGTGTTTCTGGGCTCAGCAAAAACC -TRAJ24 CACTGTGACAAACACCTCTACAAAATGG -TRAJ23 CACTGTGTTACATATCCTGTCAAAAACA -TRAJ22 CACTATGATTTGCTCAACAACAAAACCA -TRAJ21 CACCATGTTTATTGGCATTACAAAAAAT -IGKV1-8 CACAGTGTTACACACCCAAACAAAAACC -TRBJ2-7 CACGGAGGTGCACCCCCGCATGCAAACC -IGKV1-9 CACAGTGTTACAAACCTGAACATAAACC -IGKV3D-11 CACAGTGATTCCACATGAAACAAAAACC -TRBJ2-3 CACAGCCTGGAGGCCCAGGACAAAAACC -IGKV1-17 CACAGTGTTACACACCCGAACATAAAAC -IGKV1D-17 CACAGTGTTACACACCCGAACATAAACC -IGKV1D-16 CACAGTGTTACACACCCAAACATAAACC -TRAJ58 CACTGTGCTGAGGGGCTTTGCAAAAACC -TRAJ59 CACAGGAGCTGCTGCCTTTACATAAACT -TRAJ52 CACTGCACTGGAGGCCTTTACAAGAACC -TRAJ53 CACAGCCACAGAAGGCTTTACAGAAACA -TRAJ50 CACAGCCATCCAAACCTTTACAATAACT -TRAJ56 CACAATGACACGAGGATCTACAAAAACT -TRAJ57 CACCCCCACAGACTGCTTTACAAATACT -TRAJ54 CACCACAGCACGAGGCTTTACAGAAACT -IGKV3-15 CACAGTGATTCAACATGAAACAAAAACC -TRDD2 CACAATGAAACACATCAGTATAAAAACC -IGKV2-40 CACAGTGGTACAGCCCTGAACAGAAACC -IGKV6D-21 CACTGTGTTACAACCCAGAACAAAAACT -IGKV3-20 CACAGTGATTCCACAGGAAACCAAACCT -TRDJ3 CACATTATGACAGTGCCTCACAGGTAAC -IGHD3-3 CACAGTGACACAGACCTCACCCCAAACC -TRAJ49 CACTGTGATAGGAAGCTCAACAAAAACC -TRAJ48 CACAGTGTTCTAAGTCATTGCAAAAACC -TRAJ41 CACAGTGCGTTCTCCCTAAACAAAAAAC -TRAJ40 CACAGTGTTATGTGTCTCTACATAAACC -TRAJ43 CACAGTAATACATGCTCTAACAAAAACC -TRAJ42 CACAGTCCTATGGGGCTTTACAATAATC -TRAJ45 CACTCTGGGCCAACCCTTTACATAAACT -TRAJ44 CACTGTGAGATGCTTCATAACAGAAACC -TRAJ47 CACAGCGTCAAACTCCTCTACAAAAACA -IGLJ2 CACTGTGACACAGGCTCATACAAAAACC -IGLJ1 CACAGTGACTGAGGCTCAGACCAAAACC -IGLJ7 CACAGTGACACAGCCCCACACACAAACC -IGLJ6 CACTGTGATATAACCCTGCACACAAACC -LCK CACACACACACACACACAAGCCAAAACC -IGKV4-1 CACAGTGCTTCAGCCTCGAACACAAACC -TRAJ4 CACAATCAGATGGTGCTTTACAAGAACT -HPRT_12 CACACACACACACACACACACAAATACA -IGKV3D-20 CACAGTGATTCAGCTTGAAACAAAAACC -TRAJ5 CACCCTGGTACATCACAGTACAAAATCC -TRAJ6 CACAGTGAGGACAGCCTTTGATAAAACC -TRAJ3 CACTGTGGGTAAGGTCTTTGAGATAACC -IGKV2D-29 CACAGAGGTACAGACCAATACAGAAACC -IGKV2D-28 CACAGTGGTACAACCCCTAACAGAAACC -TAL2 CACTGTGACTTACTGGAGATATAAAAAT -IGKV2D-26 CACAGTGCTACAGCCCTGAGCACAAACC -TRAJ61 CACAGGAGTGGGCACCTTTACAAAAACC -TRBD1 CACAATGTTACAGCTTTGTACAAAAACA -IGKV1D-8 CACAGTGTTACACACCCGAACAAAAACC -LMO2 CACAGTATTGTCTTACCCAGCAATAATT -IGKV2D-30 CACAGTGGTACAGCCCTGAACAAAAACC -IGKV1/OR2-108 CACAGTGTTACAAGCCATAACAAAAACC -TRBD2 CACAATGTTACACCATGATACAAAAATG -IGKV2-24 CACAGTGGTACAGCTCTGAACAAAAACC -IGKV2-28 CACAGTGGTACAACCCCTAACAGAAACC -IGKV2-29 CACAGTGGTACAGACCAATACAGAAACC -TRAJ16 CACAGTGATCTATTGTACCACAAAAACC -TRAJ17 CACAATGATTTAAGGCCCAGCAAAAACC -TRAJ15 CACAGTGAAATGAGGCCCTGCAAATACC -TRAJ12 CACAGTGTTTCTTAGTCAGTCAAAAACA -TRAJ13 CACTGTAATGCCTGCCTTTACAAAATGA -TRAJ10 CACAGTGTTTGATGCCTCACAATAAACT -TRAJ11 CACTATAGCAAATCCCCATACAAAAATG -TRAJ18 CACAATGCTGGTCCCCTTTACATGAACC -IGKV1D-43 CACAGTGTTACACACCCAAACAAAAACC -IGKV2-30 CACAGTGGTACAGCCCTGAACAAAAACC -TRAJ30 CACTGTGATTGGGACCATAACAAAAACT -IGHD3-16 CACAGTGACACAGACCTCACTTCAAACC -IGHD4-4 CACAGTGTCACAGAGTCCATCAAAAACC -IGHD3-10 CACAGTGACACAGACCTCATTCTAAACC -IGHD7-27 CACAGTGGTTCTCAGCTCAGCCAAAACC
--- a/HS23RSS.txt Wed Sep 17 07:25:17 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,174 +0,0 @@ -Gene Sequence -IGLV2-8 CACAGTGTTTTAAGTCAATGAGGAAGTAAGACCAAAACC -IGLV4-3 CACAGTGACACAGATGAAGGGGAAGTGAGGCCAAAACTC -TRBV21-1 CACAGTGCCGAATGTTAGCCCTTCTTAGAACACAAACTC -IGHV1-17 CACAGTGCGAAAACCCACATCCTGAGAGTGTCAGAAACC -IGHV1-14 CACAGTGTGAAAACCCACATCCTGAGAGAGTCAGAAATC -IGHV3-15 CACAGTGAGGGGAGGTCAGTGTGAGCCCGGACACAAACC -IGHV1-18 CACAGTGTGAAAACCCACATCCTGAGGGTTTCAGAAACC -IGHV3-11 CACAGTGAGGGGAAGTCAGTGTGAGCCCAGACACAAACC -IGLV2-5 CACAGTGGTCCAAGTTCTTGGGGAACTGAGACGAAAACC -IGHV3-13 CACAGTGAGGGGAAGTCAGTATGAGCCCAGACACAAACC -TRAV2 CACAGAGGCAGGGAACCCATGAAGAGCTGAACAGAAACA -TRAV3 CACACTGATAGGGGCTGCAGGGGGAGCAGAACACAAACT -TRAV4 CACAGTGAGACAGATGGGCCTGCACCTGTGCCGTTTTCC -TRAV5 CACATTGCTTCTCAGGCACCTGTATCCTGTACCCAAACC -TRBV6-4 CACAGTGCTGCACAGCCATCTCCTCTCTGTACATAAATG -IGHV3-19 CACTGTGAGAGGACGGAAGTGTGAGCCCAGACACAAACC -TRBV6-6 CACAGCGCTACAAGACCATCTCCTCTCTGCACATAAAGG -TRBV18 CACATTGATGCAGAGCCACATCCTCTCAGTCCACAAACA -TRBV6-1 CACAGCGCTGCATGGCCGTCTCCTCTCTGCACATAAAGG -TRBV6-2 CACAGTGCTGCACGGCTGTCTCCTCTCTGCACAGAAAGG -TRBV6-3 CACAGTGCTGCACGGCTGTCTCCTCTCTGCACAGAAAGG -TRBV12-4 CACAGCGCTGCAGAATCACCCCTTTCCTGTGCAGAAACC -TRAV27 CACAGTGCTCTTGAGGCACCTGCTGCCTGCACCCAAACC -TRBV5-4 CACAGCCCGGCAGAGTCTCTGACATTCTATACATAAACT -TRBV5-3 CACAGCCCTGCAGAGTCACTGGAACTCTGTGCACTAATC -TRGV3 CACAGTGATTCAGACCTGTCCTACACCACACTGAAAATC -TRBV5-1 CACAGCCCTACAAAGCCAACCACATTCTGTGCACAAACC -IGHV3-9 CACAGTGAGGGGAAGTCAGCGAGAGCCCAGACAAAAACC -TRGV8 CACAGTGATTCAGACCTGTGCTACACCACACTGAAAATC -TRBV5-8 CACAGCCCAGCAGAATCACTGACATTCTGTATATAAACT -TRDV2 CACCCTGCTGCAGCTCTACTTCTGAGCAGCTCAAAAACC -IGKJ1 CACAGTGAGAGCTCTCCATTGTCTTGCTGAACAGAAACC -IGKJ2 CACAATGGTTCCTCTTAACTTCCCTCCTATACAAAAACT -IGKJ4 CACAGTGAGGGATCTCACCCTTTCCCCTCAACAAAAACC -TRAV26 CACACTGGGACAGATGGGGCTGCACCTGTGCAATATCTC -TRAV25 CACAGTGCTCCCCAAGCACCTGCTGCCTGTCTCCAAATC -TRAV24 CACAGTGCTGTTCAGGCACCTGCAGCCCATACGCAAACC -IGHV1-24 CACAGTGTGAAAACCCACATCCTGAGAGCGTCAGAAACC -TRAV41 CACAGTGCTCCCCAGGCACCTGGAGCCCGTACCTAAACT -IGLV5-52 CACAGTGCTCCAGACCCATGAGGAAGTAAGACAAAACCC -IGLV10-54 CACAGTGCCTCAGGCCAGTGGGGAAGTGAGATAAAAACT -IGHV3-64 CACAGTGAGGAGAAGTTAATGTGGGACCATGCAGAAACC -IGHV3-66 CACAGTGAGGGGAGGCCATTGTGCGCCCAGACACAAACC -TRBV7-9 CACAGCATGGCACAGTCGCCTCCTTCCTGCTCACAAACC -TRBV7-8 CACAGCATGGCACAGTTGCCTCCTTCCTGTTCACAAACC -IGLV1-40 CACAGTGCTCAGGCGTGTGTGTGAACTGAGACAAGAACC -IGLV3-21 CACGGTGACACAGGCAGATGAGGAAGTGAGACAAAAACA -IGHV7-4-1 CACAGTGTGGAAACCCACATCCTGAGAGTGTCAAAAACC -IGLV3-27 CACAGTGACACAGGCAGATGGAGAAATGAGACACAAACC -TRBV4-1 CACAGCCTTGCAGAGTCACCGCTTTCCTGTGCAGAAACC -IGLV1-41 CACAGTGCTCCAGGCCAATGGGGAACTGAGACAAGAACC -TRAV13-1 CACATTGCTTCCCAGGCACCTGCTACCCGTACACAAACC -TRAV13-2 CACATTGCTTTCCAGGCATCTGTAACCATCACCCAAACC -TRBV5-6 CACAGCCCATCAGAGTCACTGACGTTCTGTATATAAACT -IGLV3-25 CACAGTGACACAAGCAGACAGGGAAGTGAGACATAAACC -TRBV10-2 CACAGTGCTGCACAGCTGCCTCCTCTCTGCACGGAAATG -TRBV10-3 CACAGTGCTGCATGGCTGCCTCCTCTCTGCACGTAAACA -TRBV10-1 CACAGTGCTGCACAGCTGCCTCCTCTCTGCACATAAAGG -IGLV6-57 CACAGTGCTCCAGACCCATGGGGAAGTGAGACAGAAACT -TRBV6-5 CACAGCGCTACAAGGCCGTCTCCTCTCTGCACATAAAGG -IGHV3-73 CACAGTGAGGGGAGGTCAGTGTGAGCCCGGACACAAACT -IGHV3-72 CACAGCGAGGGGAGGTCAGTGTGAGCCCGGACACAAACC -IGLV1-47 CACAGTGCTCCAGGCCCAGGGGGAACTGAGACAAGAACC -TRAV9-2 CACAGTGACAGGGACTGCAGGGGGAGCTGAGCACAAACT -IGLV7-46 CACAGTGACAGACCCATGAGAGGAACCAAGACATAAACC -TRBV6-7 CACAGCGCTGCAAGGCTGTCTCCTCTCTGCACATAAAGG -TRBV3-1 CACAGCCCTGCAAAGTCACTGCATCCCTGTGCACAAACC -IGLV7-43 CACAGTGACAGACTCATAAGAGGAACCAAGACATAAACC -IGHV1-8 CACAGTGTGAAAAACCACATCCTCAGAGAGTCAGAAACC -IGLV1-51 CACAGTGCTCCAGCCCAATGGGGAACTGAGACAAGAACC -IGLV1-50 CACAGTGCTCCAGGCCCAGGGGGAGGTGAGAGAAGAACC -IGHV1-3 CACAGTGTGAAAACCCACATCCTGAGAGTGTCAGAAACC -IGHV1-2 CACAGTGTGAAAACCCACATCCTGAGGGTGTCAGAAACC -IGHV4-39 CACAGTGTGGAAACCCACATCCCGAGGGAGTCAGAAACC -TRBV2 CACAGCCTTGCAAAGACAACTCCAGCCTGTGCAAAATCC -TRDD3 CACAGTGCTACAAAACCTACAGAGACCTGTACAAAAACT -TRDD2 CACACAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACA -TRAV29/DV5 CACAGTGCTCTCCAGGCACCTGCAGCCCGTACTCAAACC -IGLV3-10 CACAGTGACACTGGCAGATGGGGAAGTGAGACACAAACC -TRBV27 CACAGTGTTGCACAGCCAGCTGCTCTCTGCACAAAAACA -TRBV5-5 CACAGCCCGGCAGAATCACTGACATTCTGTATATAAACT -TRGV10 CACATACTAGAACTGTTGAAACAACATGCACAAAATCCC -TRGV11 CACAGTGTTCGAGTTGTCAAGATAAGGTACACAGAAACT -TRGV5 CACAGTGATTCAGACCTGTCCTACACCACACTGAAAATC -TRBV30 CACACTGAGCTGGGTGGGGCAGACATCTGTGCAAAAACC -IGHV3-48 CACAGTGAGGGGAGGTCAGTGTGACACCAGACACAAACC -IGHV3-49 CACAGTGAGGGGAGGTCAATGTGAGCCCAGACACAAACC -TRGV2 CACAGTGATTCAGATCCGCCCTACACCACACTGAAAATC -IGLV CACAGTGCTGCAGGCACATGGGGAACCGAGACAAAAACC -IGHV3-43 CACAGTGAGGGGAAGTCAGCGAGAGCCCAGACAAAAACC -IGHV3-41 CACAGTGAGAGGAAGTCCGTGTGAGCCCAGACACAAACC -IGHV3-7 CACAGTGAGGGGAAGTCAGTGTGAGCCCAGACACAAACC -HPRT_23 CACTGTAGGTTCATCTTTATGGAGATGCTCATGGCCTCA -TRGV9 CACAGCAGCAGACAGTTTGAGCCATCCCATTCAATAAAT -IGLV4-60 CACAGTGATACAGGCAGATGAGGAAGTGGGACAAAATCC -TRBV28 CACAGCGCAGCACAGCCGCATCCTCTCTGCACAAAAAGA -TRBV25-1 CACAGTGCTACATGGATACCGACACTCCGCACAGAAAGG -IGLV4-69 CACAGTGACACAGGCAGATGAGGAAGTGGGACAGAAACC -IGHV4-28 CACAGTGAGGGGAGGTGAGTGTGAGCCCAGACACAAACC -TRBV3-2 CACAGCCTTACAGAGCCACTGCATCCCTGTGCACAAACC -IGHV6-1 CACAGTGAGGGGAAGTCAGTGTGAGCCCAGACACAAACC -TRAV19 CACAGTGAGATGGGTGCCTGTGGGAGCCCTACAAAAACC -TRAV10 CACTGTGCTCCACAGGCACTTGAAGCCAGTATGCAAACC -TRAV17 CACAGTGTTCCCCAGGAACCTGCAGCCTCTACGCAAACC -TRBV4-2 CACAGCCTTGCAGAGTCACCGCTTTCCTGTGCAGAAACC -TRBV4-3 CACAGCCTTGCAGAGTCACCGCTTTCCTGTGCAGAAACC -IGHV3-53 CACAGTGAGGGGAGGTCAGTGTGAGCCCAGACACAAACC -IGHV3-52 CACAGTGAGAGGCACAGTGAGGGGAGGTCAGTGTGAGCC -TRBV7-3 CACAGCATGACACAATCGCCTCCTTCCTGCTCATAAACC -TRBV7-2 CACAGCATGGCACAGTCGCCTCCTTCCTGCTCATAAACC -TRBV7-7 CACAGCATGGCACAGTCGCCTCCTTCCTGTTCACAAACC -TRBV7-6 CACAGTGTGGCATAGTCGCCTCCTTCCTGTTCACAAACC -IGHV2-5 CACAAAGACACAGCCCAGGGCACCTCCTGTACAAAAACC -IGLV3-13 CACAGTGACACAGGCAGATGGGGAAGTGAGACACAAATC -IGLV3-16 CACAGTGACACAAGGAGACAGGGAAGTGAAACATAAACC -IGLV3-19 CACAGTGACACAGACAGATGGGGAAGTGAGACAGAAACC -IGHJ3 CACAGGGACACAGTCCGTTCCTAGACCCAGACACAAACC -IGHV1-45 CACAGTGTGAAAACCCACATCCTGAGACCGTCAGAAACC -IGHV1-46 CACAGTGTGAGAAACCACATCCTCAGAGTGTCAGAAACC -IGLV8-61 CACAGTGATTTAAACCTATGAGGAAGTGCAACTAAAACC -IGHV4-59 CACAGTGAGGGGAGGTGAGTGTGAGCCCAGACAAAAACC -TRAV1-1 CACAGTGACTATGAGGCCTCCTTAACTGTGCCAAAATTC -TRAV1-2 CACGGTGACTATGAGGCCTCTTTAGCTGCACCAAAATTC -TRGV7 CACAGTGATTCACACCTGCCCTACACCACACTGAAAATC -IGHV3-20 CACAGTGAGGGGAAGCCAGTGAGAGCCCAGACACAAACG -IGHV3-21 CACAGTGAGGGGAAGTCAGTGTGAGCCCAGACACAAACC -IGHV3-23 CACAGTGAGGGAAGTCATTGTGAGCCCAGACACAAACCT -IGLV2-23 CACAGTGGTCCAAGTTCATGGGGAACTGAGACCAAAACC -TRBD1 CACAATGATTCAACTCTACGGGAAACCTTTACAAAAACC -TRGV4 CACAGTGATTCAGATCCGCCCTACACCACACTGAAAACC -TRAV12-1 CACAGTGCTCCCCAGACACCTGCAGTCTGTACCCAAACC -TRAV12-3 CACAGTGCTCCCCAGACACTTGCAGTCTGTACCCAAACC -TRAV12-2 CACAGTGCTCCCCAGACACCTGCAGTCTGTACCCAAACC -IGLV3-6 CACAGTGACACAGGCAGATTAGGAAGTGAGACACAAACC -IGLV3-1 CACAGTGACACAGGCAGATGCGGAAGTGAGACAGAAACC -IGHV1-58 CACAGTGTGAAAACCCACATCCTGAGAGTGTCAGAAACG -TRBD2 CACGATGATTCAGGTAGAGGAGGTGCTTTTACAAAAAAC -IGLV3-9 CACAGTGACACAGGCAGATGAAGAAGTGAGACACAAACC -TRBV6-8 CACAGCGCTACAAGGTTGTCTCCTCTCTGCACATAAAGG -TRBV9 CACAGCCCTGCATGAGCATCAGCCTTCTGTGCAATAACA -IGHV3-35 CACTGTGAGAGGTCGGAAGTGTGAGCCCAGACACAAACC -IGHV3-33 CACAGTGAGGGGAGGTCATTGTGCGCCCAGACACAAACC -IGLV2-14 CACAGTGGTCCAAGTTCCTGGGGAACTGAGACCAAAACC -IGHV3-30 CACAGTGAGGGGAAGTCATTGTGCGCCCAGACACAAACC -TRAV8-1 CACAGTGTCTGGGACTGCAAAGGGAGCTGAACACAAACT -IGLV2-18 CACAGAGGTCCAAGTTCCTGGGGAACTGAGACCAAAACC -TRAV14/DV4 CACAGTGACAGAACTGTCGGAGGGAGGTGTACAAAAGCC -IGHV2-26 CACAGAGACACAGCCCAGGATGCCTCCTGTACAAGAACC -TRBV16 CACAGTGTTAAATATTAGCTAATCTTAGGACACAGACTC -TRBV15 CACAGAGCTGCAGTGCTTCCTGCTCTCTGTTCATAAACC -TRBV14 CACAGTGCTTCACAGTCGTGCCCTTGCTGTGCAAAACCA -TRBV13 CACAGCATGGCACAGTCGCCTCCTTCCTGCTCACAAACC -TRBV11-3 CACAGTGTAGCAGAGACACTTCCCTCCTGTGCAGAAAAC -TRBV19 CACAGTGAAGCACGGATGTCGCCTCTCTGTGCATAAATG -TRBV11-2 CACAGTGTAGCAGAGACACTTCCCTCCTGTGCAGAAAAC -TRAV38-1 CACAATGAGATGAGCAGCAGGTAGAGGCTTACAGAAATC -TRBV12-5 CACAGCGCTGCAGAATCACCTGCTCCCTGTGCAGAAACC -TRBV12-2 CACAGCGCTGCAGAATCTCCCCCTCCCTGTGCAGAAACT -TRBV12-3 CACAGCGCTGCAGAATCACCCCTTTCCTGTGCAGAAAAC -IGHV1-69 CACAGTGTGAAAACCCACATCCTGAGAGTGTCAGAAACC -LMO1_23 CACAGTGGCTCACCACCCCACACAGCCCTCACTCTGGCA -IGLV5-45 CACAGTGACACACACAGATGGGGAAGTGGGACAAAAACC -IGLV5-37 CACAGTGACACACACAGATGGGAAAGTGGGACAAAAACC -TRAV23/DV6 CACAGTGCTCCCCAGGCACCTGAAGCCTGTACCCAAACC -TRDV1 CACAGTGTTTGAAGTGATAGTAAAAGCAAAACAAAAACC -TRBV20-1 CACAGCGCCAGGAGGGGATCAGACACCGCGGCAAGAACC -IGHV2-10 CACAGAGACAGAGCCCAGGGTGCCTCTTGTACAAGACCC -TRBV29-1 CACAGTGCGGGGCACAGATCAAAGATCTGAGCAAGAACC -TRBV23-1 CACAGCACTGAAATGTCAGTTCCTCTTAGCACACAAACT -TRAV34 CACAGCGATCTTCAGGCCTCTATCAGCTGTCTCCAAACC -TRAV30 CACAGTGATACCCAGGCCTCCAAGACCTGTACTCAAACC -TAL1_23 CACAGCCTCGCGCATTTCTGTATATTGCGTAAGGAAAAG -TRAV39 CACAGTGCTCCCCTGACGCCACCAGTCTGTACCCAAACC
--- a/filter.r Wed Sep 17 07:25:17 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ -args <- commandArgs(trailingOnly = TRUE) - -input = args[1] -filterf = args[2] -output = args[3] - - -dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) -filterdat = read.table(filterf, header=T, sep="\t", fill=T, stringsAsFactors=F) - -dat = dat[dat$Sequence.ID %in% filterdat$ID,] - -write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)
--- a/gene_identification.py Wed Sep 17 07:25:17 2014 -0400 +++ b/gene_identification.py Mon Sep 22 10:19:36 2014 -0400 @@ -5,31 +5,38 @@ parser = argparse.ArgumentParser() parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file") -parser.add_argument("--outdir", help="Output directory, 7 output files will be written here") +parser.add_argument("--output", help="The annotated summary output file") args = parser.parse_args() infile = args.input #infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt" -outdir = args.outdir +output = args.output #outfile = "identified.txt" dic = dict() total = 0 + first = True +IDIndex = 0 +seqIndex = 0 + with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence - for line in f: - total += 1 - if first: - first = False - continue - linesplt = line.split("\t") - if linesplt[2] == "No results": - continue - ID = linesplt[1] - seq = linesplt[28] - dic[ID] = seq + for line in f: + total += 1 + if first: + linesplt = line.split("\t") + IDIndex = linesplt.index("Sequence ID") + seqIndex = linesplt.index("Sequence") + first = False + continue + linesplt = line.split("\t") + ID = linesplt[IDIndex] + if len(linesplt) < 28: #weird rows without a sequence + dic[ID] = "" + else: + dic[ID] = linesplt[seqIndex] #lambda/kappa reference sequence searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", @@ -128,7 +135,7 @@ continue #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] currentIDHits[key + "_hits"] += 1 - start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1]) + start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if len(start) > 0 and max(start) > 1]) #start_location[ID + "_" + key] = str(start.index(max(start))) @@ -141,120 +148,49 @@ varsInCM = 0 requiredVarPercentage = 0.7 -ca = 0 -ca1 = 0 -ca2 = 0 -cg = 0 -cg1 = 0 -cg2 = 0 -cg3 = 0 -cg4 = 0 -cm = 0 -try: - cafile = open(outdir + "/ca.txt", 'w') - ca1file = open(outdir + "/ca1.txt", 'w') - ca2file = open(outdir + "/ca2.txt", 'w') - cgfile = open(outdir + "/cg.txt", 'w') - cg1file = open(outdir + "/cg1.txt", 'w') - cg2file = open(outdir + "/cg2.txt", 'w') - cg3file = open(outdir + "/cg3.txt", 'w') - cg4file = open(outdir + "/cg4.txt", 'w') - cmfile = open(outdir + "/cm.txt", 'w') - unmatchedfile = open(outdir + "/unmatched.txt", 'w') - cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n") - for ID in hits.keys(): - currentIDHits = hits[ID] - possibleca = float(len(compiledregex["ca"])) - possiblecg = float(len(compiledregex["cg"])) - possiblecm = float(len(compiledregex["cm"])) - cahits = currentIDHits["ca_hits"] - cghits = currentIDHits["cg_hits"] - cmhits = currentIDHits["cm_hits"] - if cahits > cghits and cahits > cmhits: #its a ca gene - if cahits <= int(chunksInCA * requiredChunkPercentage): - unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n") + +first = True +with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence + with open(output, 'w') as o: + for line in f: + total += 1 + if first: + o.write(line.rstrip() + "\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + first = False continue - ca += 1 - ca1hits = currentIDHits["ca1"] - ca2hits = currentIDHits["ca2"] - cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") - if ca1hits > ca2hits: - #print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit" - if ca1hits <= int(varsInCA * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") - continue - ca1 += 1 - ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") - else: - #print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit" - if ca2hits <= int(varsInCA * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") - continue - ca2 += 1 - ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") - elif cghits > cahits and cghits > cmhits: #its a cg gene - if cghits <= int(chunksInCG * requiredChunkPercentage): - unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcg\n") - continue - cg += 1 - cg1hits = currentIDHits["cg1"] - cg2hits = currentIDHits["cg2"] - cg3hits = currentIDHits["cg3"] - cg4hits = currentIDHits["cg4"] - cgfile.write(ID + "\tNA\t" + str(int(cghits / possibleca * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene - if cg1hits <= int(varsInCG * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n") - continue - cg1 += 1 - cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene - if cg2hits <= int(varsInCG * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n") - continue - cg2 += 1 - cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene - if cg3hits <= int(varsInCG * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n") - continue - cg3 += 1 - cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - else: #cg4 gene - if cg4hits <= int(varsInCG * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n") - continue - cg4 += 1 - cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - else: #its a cm gene - if cmhits <= int(chunksInCM * requiredChunkPercentage): - unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcm\n") - continue - cm += 1 - cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n") -finally: - cafile.close() - ca1file.close() - ca2file.close() - cgfile.close() - cg1file.close() - cg2file.close() - cg3file.close() - cg4file.close() - cmfile.close() - unmatchedfile.close() - - -#print ca,cg,cm,(ca+cg+cm) + linesplt = line.split("\t") + if linesplt[2] == "No results": + pass + ID = linesplt[1] + currentIDHits = hits[ID] + possibleca = float(len(compiledregex["ca"])) + possiblecg = float(len(compiledregex["cg"])) + possiblecm = float(len(compiledregex["cm"])) + cahits = currentIDHits["ca_hits"] + cghits = currentIDHits["cg_hits"] + cmhits = currentIDHits["cm_hits"] + if cahits > cghits and cahits > cmhits: #its a ca gene + ca1hits = currentIDHits["ca1"] + ca2hits = currentIDHits["ca2"] + if ca1hits > ca2hits: + o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + else: + o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + elif cghits > cahits and cghits > cmhits: #its a cg gene + cg1hits = currentIDHits["cg1"] + cg2hits = currentIDHits["cg2"] + cg3hits = currentIDHits["cg3"] + cg4hits = currentIDHits["cg4"] + if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene + o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene + o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene + o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + else: #cg4 gene + o.write(line.rstrip() + "\tcg3\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + else: #its a cm gene + o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") print "Time: %i" % (int(time.time() * 1000) - starttime)
--- a/logo.pm Wed Sep 17 07:25:17 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,826 +0,0 @@ -#!/usr/local/bin/perl -w - -=head1 NAME - - logo.pm - organizes data in FASTA and CLUSTAL formats into height data. - -=head1 SYNOPSIS - - Perl module - -=head1 DESCRIPTION - - logo.pm: Takes in strings of aligned sequences and sorts them vertically - based on height as assigned by the following equations found in - Schneider and Stephens paper "Sequence Logos: A New Way to Display - Consensus Sequences": - - height = f(b,l) * R(l) (1) - - where f(b,l) is the frequency of base or amino acid "b" at position - "l". R(l) is amount of information present at position "l" and can - be quantified as follows: - - R(l) for amino acids = log(20) - (H(l) + e(n)) (2a) - R(l) for nucleic acids = 2 - (H(l) + e(n)) (2b) - - where log is taken base 2, H(l) is the uncertainty at position "l", - and e(n) is the error correction factor for small "n". H(l) is - computed as follows: - - H(l) = - (Sum f(b,l) * log[ f(b,l) ]) (3) - - where again, log is taken base 2. f(b,l) is the frequency of base - "b" at position "l". The sum is taken over all amino acids or - bases, depending on which the data is. - - Currently, logo.pm uses an approximation for e(n), given by: - - e(n) = (s-1) / (2 * ln2 * n) (4) - - Where s is 4 for nucleotides, 20 for amino acids ; n is the number - of sequences in the alignment. e(n) also gives the height of error - bars. - -=cut - -package logo; - -use strict; -use Carp; - -################################################################################ -###### SOME VARIABLES ###### -################################################################################ - -my $DEBUG = 0; - -my $AA = 0; -my $NA = 1; - -my %BASES = ("a" => "adenine", - "t" => "thymine", - "g" => "guanine", - "c" => "cytosine", - "u" => "uracil"); - -# does not include B or Z -my %AMINOACIDS = ("a" => "", "c" => "", "d" => "", "e" => "", "f" => "", - "g" => "", "h" => "", "i" => "", "k" => "", "l" => "", - "m" => "", "n" => "", "p" => "", "q" => "", "r" => "", - "s" => "", "t" => "", "v" => "", "w" => "", "y" => ""); - -my @data; -my $kind; -my ($seqs_r, $desc_r); - -my $CONFIDENCE_LIMIT = 0.90; - -################################################################################ -###### SOME FUNCTIONS ###### -################################################################################ - -=head1 APPENDIX - -=cut - -=head2 getHeightData() - - Usage : my ($height_data_r, $description_data_r, $kind) = - logo::getHeightData($input_data_r, $params); - Returns : * REFERENCE TO array of height data - * REFERENCE TO array of input description strings - * $AA if the data is amino acid, $NA otherise - Args : $input_data_r : input data in CLUSTAL or FASTA formats - : $params : hash of parameters - - getHeightData is the entry point into the logo.pm module. $input_data_r is a - reference to an array of strings containing FASTA or CLUSTAL data, where all - lines whose first character is "#" is considered a comment line. - - $params is a hash of parameters with the following keys: - * smallsampletoggle : 0 to turn off small sample correction, otherwise - small sample correction is on - * input_kind : 0 for amino acids, 1 for nucleic acids; if undefined, - logo.pm will attempt to automatically detect whether the - input consists of amino or nucleic acid data. If - $input_kind is defined, only those residues defined by - $input_kind will be in the output -- all other residues are - considered as spaces. For example, if $input_kind is $NA, - the residue "I" or "i" are considered spaces, since "I" and - "i" are not nucleic acid residues. - * stretch : stretch all characters so they are flush at the maximum number - of bits allowed - - Sample use: - - # get FASTA data - open (FASTA, "$fastafile"); - my @inputdata = <FASTA>; - close (FASTA); - - my %heightparams = ( - smallsamplecorrection => 0, - input_kind => 0, - stretch => 0 - ); - - # get height data - my ($heightdata_r, $desc_r, $kind) = logo::getHeightData(\@inputdata, \%heightparams); - -=cut - -# entry point into module -sub getHeightData { - - # $smallsampletoggle is toggle to turn off small sample correction (1 to turn off) - # $input_kind can be $AA or $NA or undef - my ($input_data_r, $params) = @_; - - # gary 040119: adjust for formats (Unix is \n, Mac is \r, Windows is \r\n) - $input_data_r = normalizeData($input_data_r); - - # gets sequences, sets $kind temporarily - my ($goodlength, $maxreslength, $badline, $validformat); - ($seqs_r, $desc_r, $maxreslength, $goodlength, $badline, $validformat) = - getSeqs($input_data_r, $params->{input_kind}); - -# for(my $i = 0; $i < scalar @$seqs_r ; $i++) { -# print STDERR ($desc_r->[$i] . "\n" . $seqs_r->[$i] . "\n"); -# } -# print STDERR "maxreslength = $maxreslength\n"; -# -# exit(0); - - if ($DEBUG) { print STDERR ("point 1\n");} - - # check for valid format - if ((defined $validformat) && ($validformat == 1)) { -# print("returning\n"); - return (undef, undef, undef, undef, undef, 1); - } - - if ($DEBUG) { print STDERR ("point 2\n");} - - # check for bad length - if (!$goodlength) { - return (undef, undef, undef, $goodlength, $badline); - } - - # reset $kind if in $input_kind - if (defined $params->{input_kind} && isLegalKind($params->{input_kind}) ) { - $kind = $params->{input_kind}; - } - - # build data - buildData(@$seqs_r, $params->{smallsampletoggle}, $params->{stretch}, $maxreslength); - - if ($DEBUG) { print STDERR ("point 3\n");} - -# print STDERR ("data size = ", scalar @data, "\n"); -# foreach (@data) { -# print STDERR ("$_\n"); -# } -# -# exit(0); -# -# print STDERR ("return at 2\n"); - return (\@data, $desc_r, $kind, $goodlength, $badline); -} - -sub isLegalKind { - return ($_[0] =~ /^[01]$/); -} - -################################################################################ -# -# sub normalizeData($data_r) returns $data_r, with Mac/Unix/Windows newline -# style normalized to standard Unix-style newline style -# -################################################################################ -sub normalizeData { - my ($data_r) = @_; - - # check args - if (not defined $data_r) { - die "data_r must be defined\n"; - } - - my @normalized = (); - foreach my $pseudo_line (@$data_r) { - my @split_line = split(/[\r\n]+/, $pseudo_line); - push(@normalized, @split_line); - } - - return \@normalized; -} - - -################################################################################ -# -# sub getSeqs($data_r, $kind) returns 5 values: -# -# * array reference to sequence strings -# * array reference to sequence names -# * length of sequence -# * 1 if all sequences have the same length, 0 else -# * line number L where sequenceLength(L) != sequenceLength(other lines), else -# undef -# -################################################################################ -sub getSeqs { - my ($input_data_r, $kind) = @_; - - unless( $input_data_r->[0] ){ - return (undef, undef, undef, undef, undef, 1); - } - - # skip all comment chars and lines of all spaces - while ( ($input_data_r->[0] =~ /^\s*\#/) || ($input_data_r->[0] =~ /^\s*$/) ) { - shift @$input_data_r; - if( !defined $input_data_r->[0]) {return (undef, undef, undef, undef, undef, 1);} - } - - if (isFormat_FASTA($input_data_r)) { - return getSeqs_FASTA($input_data_r, $kind); - - } elsif (isFormat_CLUSTAL($input_data_r)) { - return getSeqs_CLUSTAL($input_data_r, $kind); - - } elsif (isFormat_FLAT($input_data_r)) { - return getSeqs_FLAT($input_data_r, $kind); - - } else { - if ($DEBUG) {print STDERR ("format nothing\n");} - return (undef, undef, undef, undef, undef, 1); - } - -# if ($_[0] =~ />/) { -# return getSeqs_FASTA(@_); -# } else { -# return getSeqs_CLUSTAL(@_); -# } -} - -################################################################################ -# -# sub isFormat_FASTA($data_r) returns 1 if $data_r is in FASTA format -# -################################################################################s -sub isFormat_FASTA { - my ($input_data_r) = @_; - - # check args - if (not defined $input_data_r) { - Carp::confess("logo::isFormat_FASTA : input_data_r must be defined\n"); - } - - if ($input_data_r->[0] =~ />/) { - return 1; - } else { - return 0; - } -} - -################################################################################ -# -# sub isFormat_CLUSTAL($data_r) returns 1 if $data_r is in CLUSTAL format -# -################################################################################ -sub isFormat_CLUSTAL { - my ($input_data_r) = @_; - - # check args - if (not defined $input_data_r) { - Carp::confess("logo::isFormat_CLUSTAL : input_data_r must be " . - "defined\n"); - } - - my $i=0; - -# # skip spaces or just "*" and "." and ":" -# while ($input_data_r->[$i] =~ /^[\*\:\s]*$/) { -# $i++; -# } - - # if it looks like CLUSTAL W (version) ... , then it must be clustal - if ($input_data_r->[$i] =~ /^\s*CLUSTAL/) { - return 1; - } - - # CLUSTAL looks like: "name seq" - if ($input_data_r->[$i] =~ /^\s*(\S+)\s+(\S+)\s*$/) { - return 1; - } else { - return 0; - } -} - -################################################################################ -# -# sub isFormat_FLAT($data_r) returns 1 if $data_r is in FLAT format -# -################################################################################ -sub isFormat_FLAT { - my ($input_data_r) = @_; - - # check args - if (not defined $input_data_r) { - Carp::confess("logo::isFormat_FLAT : input_data_r must be defined\n"); - } - -# print("checking flat\n"); -# print("first element = -->", $input_data_r->[0], "<--\n"); - - if ($input_data_r->[0] =~ /^[a-zA-Z\-]+\s*$/) { - return 1; - } else { - return 0; - } -} - -################################################################################ -###### FORMATTING FUNCTIONS ###### -################################################################################ - -# the flat sequence format is as follows: -# sequence1 -# sequence2 -# sequence3 -# ... -# sequenceN -sub getSeqs_FLAT { - - if ($DEBUG) {print STDERR "DOING FLAT\n";} - - my ($input_data_r, $input_kind) = @_; - - my $linelength = 0; - my $seqCount = 0; - my $total_residues = 0; - my (@returnVal, @desc) = (); - my $prevlinelength = undef; - my $NA_count = 0; - - foreach my $seq (@$input_data_r) { -# chomp $seq; - $seq =~ s/\s+$//; - - my @chars = split(//,$seq); - - my $char; - foreach (@chars) { - $total_residues++; - $linelength++; - - # set $char - if (defined $input_kind) { - if ($input_kind == $AA) { - $char = (isAA($_)) ? $_ : "-"; - } else { # == $NA - $char = (isNA($_)) ? $_ : "-"; - } - } else { - $char = $_; - if (isNA($char)) { - $NA_count++; - } - } - - $returnVal[$seqCount] .= $char; - } - $desc[$seqCount] = "no name"; - - if ($seqCount == 0) { - $prevlinelength = $linelength; - } elsif ($prevlinelength != $linelength) { # different number of residues, so complain - return (undef, undef, undef, 0, $seq); # 0 for not same length, $seq is name - } - $linelength=0; - - $seqCount++; - } - - # determine whether to use $NA or $AA - if (!defined $input_kind) { - if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) { - $kind = $NA; - } else { - $kind = $AA; - } - } - - return (\@returnVal, \@desc, $prevlinelength, 1, undef); -} - -sub getSeqs_CLUSTAL { - - if ($DEBUG) {print STDERR "DOING CLUSTAL\n";} - - my ($input_data_r, $input_kind) = @_; - - my @returnVal; - my @desc; - my $seqCount=0; - my $reslength = 0; - my ($name, $seq); - -# my $input_kind = pop @_; -# my $CONFIDENCE_LIMIT = 0.90; - my $NA_count = 0; - my $total_residues = 0; - my ($prevlinelength, $linelength) = (0,0); - -# foreach (@_) { - foreach (@$input_data_r) { -# chomp; - - if ($DEBUG) {print STDERR ("line = $_\n")}; - - $_ =~ s/\s+$//; - - # skip if it is a comment character -- first character is "#" - next if (/^\s*\#/); - - # skil if it is a CLUSTAL W header line - next if (/^\s*CLUSTAL/); - - # if spaces or just "*" and "." and ":" - if (/^[\*\.\:\s]*$/) { - $seqCount=0; - $prevlinelength=0; - next; - } - - ($name,$seq) = (/^\s*(\S+)\s+(\S+)\s*$/); - - if ($DEBUG) { print STDERR ("name, seq = $name, $seq\n"); } - - # add new entry - if (!defined $desc[$seqCount]) { - $desc[$seqCount] = $name; - $returnVal[$seqCount] = ""; - } - - if(!defined $seq) {return (undef, undef, undef, undef, undef, 1);} # Something has gone terrible wrong - - my @chars = split(//,$seq); - my $char; - foreach (@chars) { - if ($seqCount == 0) { - $reslength++; # all sequences have same residue length, so only count first one - } - - $total_residues++; - $linelength++; - - # set $char - if (defined $input_kind) { - if ($input_kind == $AA) { - $char = (isAA($_)) ? $_ : "-"; - } else { # == $NA - $char = (isNA($_)) ? $_ : "-"; - } - } else { - $char = $_; - if (isNA($char)) { - $NA_count++; - } - } - - $returnVal[$seqCount] .= $char; - } - - if ($seqCount == 0) { - $prevlinelength = $linelength; - } elsif ($prevlinelength != $linelength) { # different number of residues, so complain - return (undef, undef, undef, 0, $name); - } - $linelength=0; - - $seqCount++; - } - - # determine whether to use $NA or $AA - if (!defined $input_kind ) { - if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) { - $kind = $NA; - } else { - $kind = $AA; - } - } - - return (\@returnVal, \@desc, $reslength, 1, undef); -} - -# if $input_kind is defined, residues that are not defined are set to space -sub getSeqs_FASTA { - - if ($DEBUG) {print STDERR "DOING FASTA\n";} - - my ($input_data_r, $input_kind) = @_; - - my @returnVal; - my @desc; - my $count=-1; - my $newElem=0; - -# my $input_kind = pop @_; - -# my $CONFIDENCE_LIMIT = 0.90; - my $NA_count = 0; - my $total_residues = 0; - my $reslength = 0; - my $maxreslength = 0; - - my ($goodlength, $currline, $prevline); - - -# # skip all lines that are all spaces -# while ($_[0] =~ /^\s*$/) { -# shift @_; -# } - -# foreach (@_) { - foreach (@$input_data_r) { - - $_ =~ s/\s+$//; - - # skip if it is a comment character -- first character is "#" - next if (/^\s*\#/); - - # skip all lines that are all spaces - next if (/^\s*$/); - - $_ =~ s/\s+$//; # cut trailing white space - $_ =~ s/^\s+//; # cut leading white space - if (/>/) { - $currline = $_; - ($desc[scalar @desc]) = ($_ =~ />\s*(.+)$/); - - if (not $newElem) { - $count++; - $newElem = 1; - } - } else { - if ($newElem) { - $maxreslength = $reslength if $maxreslength == 0; - if (($maxreslength != 0) && ($maxreslength != $reslength)) { - return (undef, undef, undef, 0, $prevline); - } - - $maxreslength = $reslength; - $reslength = 0; - } - - my @chars = split(//,$_); - my $char; - foreach (@chars) { - $reslength++; - $total_residues++; - - # set $char - if (defined $input_kind) { - if ($input_kind == $AA) { - $char = (isAA($_)) ? $_ : "-"; - } else { # == $NA - $char = (isNA($_)) ? $_ : "-"; - } - } else { - $char = ($_ =~ /[a-zA-Z]/) ? $_ : "-"; # if not alpha char, use space - if (isNA($char) && !isSpace($char)) { - $NA_count++; - } - } - - if ($newElem) { - $returnVal[$count] = $char; - } else { - $returnVal[$count] .= $char; - } - $newElem = 0; - } - $prevline = $currline if $currline =~ />/; - } - } - - # check if last is biggest - if (($maxreslength != 0) && ($maxreslength != $reslength)) { - return (undef, undef, undef, 0, $prevline); - } -# $maxreslength = ($reslength > $maxreslength) ? $reslength : $maxreslength; - - # determine whether to use $NA or $AA - if (!defined $input_kind) { - if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) { - $kind = $NA; - } else { - $kind = $AA; - } - } - - return (\@returnVal, \@desc, $maxreslength || $reslength, 1, undef); # 1 for good lengths -} - -sub isSpace { - return $_[0] =~ /[ \-]/; -} - -sub isAA { - return (defined $AMINOACIDS{lc $_[0]}); -} - -sub isNA { - return (defined $BASES{lc $_[0]}); -} - -################################################################################ -###### DATA BUILDING FUNCTIONS ###### -################################################################################ - - -# arguments: takes reference to array and lines aligned sequences of bases or -# amino acids -# returns: updated reference array to reflect contents of sequences sorted -# vertically by height as described by (1) -sub buildData { - - my $currentx = 0; - my $h; - my $count=0; - my $maxreslength = pop (@_); - my $stretch = pop(@_); - my $smallsampletoggle = pop (@_); - my $totalsize = $#_+1; - - while ($currentx < $maxreslength) { #(length $_[0])) { - my $allspaces = 1; - my $spaceCount=0; - - # get vertical sequence - my @vert=(); - foreach (@_) { # foreach sequence - my $currentchar; - - # set currentchar, set to " " if $_ is not long enough - if ($currentx >= (length $_)) { - $currentchar = " "; - } else { - $currentchar = substr($_,$currentx,1); - } - - # in all sequences, "-" is considered as a space - # don't count " " and "-" - if (($currentchar ne "-") && ($currentchar ne " ")) { - $vert[scalar @vert] = uc substr($_,$currentx,1); - $allspaces = 0; - } else { - $spaceCount++; - } - } - - if ($allspaces) { - # build @vert - @vert = (" 0", ">0"); - - # place in @data - $data[scalar @data] = \@vert; - - $currentx++; - next; - } - - my $error; - if ($smallsampletoggle) { - $error=getError($kind,$totalsize - $spaceCount); - } else { - $error = 0; - } - - # sort vertical sequence by amino acid or base - @vert = sort(@vert); - my $total = $#vert + 1; - - # find H(l) -- must be done before collapsing - $h = getH(@vert); - - # collect like terms - @vert = collapse(@vert); - - # get R(l) - my $r; - if (!defined $stretch || !$stretch) { - $r= getR($kind, $h, $error); - } else { - $r = ($kind == $NA) ? 2 : (log(20) / log(2)); - } - - # place heights - my $count=0; - my $height; - my $elem; - foreach $elem (@vert) { - $height = getHeight(substr($elem, 2) / $total,$r); - $vert[$count] = substr($elem,0,1) . $height; - $count++; - } - - # sort by height - @vert = sort height_sort @vert; - - # put in error bar size - $vert[$count] = ">$error"; - - # place in @data - $data[scalar @data] = \@vert; - - $currentx++; - } -} - -# uses error approximation given by: -# e := (s-1) / (2 * ln2 * ntrue); -sub getError { - return ((($_[0] == $NA) ? 4 : 20) - 1) / (2 * log(2) * $_[1]); -} - -sub height_sort { - my ($lettera, $heighta) = ($a =~ /^(.{1})(\S+)$/); #substr($a,1); - my ($letterb, $heightb) = ($b =~ /^(.{1})(\S+)$/); #substr($b,1); - - # compare by height first, then letter - if ($heighta <=> $heightb) { - return $heighta <=> $heightb; - } else { - return $letterb cmp $lettera; #reversed for some reason... - } -} - -sub collapse { - my @returnVal; - my $current = $_[0]; - my $count=0; - my $freq; - - foreach (@_) { - if ($current eq $_) { - $count++; - } else { - $returnVal[scalar @returnVal] = "$current $count"; - - $current = $_; - $count=1; - } - } - - # add last element - $returnVal[scalar @returnVal] = "$current $count"; - - return @returnVal; -} - -# arguments : $_[0] : list of bases or amino acids -sub getH { - my $h = 0; - my (@vert) = @_; # vertical sequence (comparing multiple sequences) - - my $current = $vert[0]; - my $count=0; - my $freq; - - foreach (@vert) { - if ($current eq $_) { - $count++; - } else { - $freq = $count / ($#vert + 1); - $h += $freq * (log ($freq) / log(2)); - - $current = $_; - $count=1; - } - } - - # add last element - $freq = $count / ($#vert + 1); - $h += $freq * (log ($freq) / log(2)); - - return -$h; -} - -# arguments : $_[0] : $AA or $NA -# $_[1] : uncertainty = H(l) -# $_[2] : error correction factor for small number of sequences -# = e(n) ; assummed to be 0 if not given -sub getR { - my $max = ($_[0] == $NA) ? 2 : (log(20) / log(2)); - my $e = (defined $_[2]) ? $_[2] : 0; - return ($max - ($_[1] + $e)); -} - -# arguments: $_[0] : frequency = f(b,l) -# $_[1] : R(l) -sub getHeight { - return $_[0] * $_[1] -} - -1;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/merge_and_filter.r Mon Sep 22 10:19:36 2014 -0400 @@ -0,0 +1,57 @@ +args <- commandArgs(trailingOnly = TRUE) + + +summaryfile = args[1] +mutationanalysisfile = args[2] +mutationstatsfile = args[3] +hotspotsfile = args[4] +output = args[5] +unmatchedfile = args[6] + +summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F) +mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F) +mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F) +hotspots = read.table(hotspotsfile, header=T, sep="\t", fill=T, stringsAsFactors=F) + + +summ = summ[summ$Functionality != "No results",] +tmp = summ[summ$chunk_hit_percentage >= 70 & summ$nt_hit_percentage >= 70,] +unmatched = summ[summ$chunk_hit_percentage < 70 & summ$nt_hit_percentage < 70,] +unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] +summ = tmp +rm(tmp) + +if(length(summ$Sequence.ID) == 0){ + stop("No data remaining after filter") +} + +result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-2])], by="Sequence.ID") +result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID") +result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID") + + +cleanup_columns = c("FR1.IMGT.Nb.of.mutations", + "CDR1.IMGT.Nb.of.mutations", + "FR2.IMGT.Nb.of.mutations", + "CDR2.IMGT.Nb.of.mutations", + "FR3.IMGT.Nb.of.mutations") + +for(col in cleanup_columns){ + result[,col] = gsub("\\(.*\\)", "", result[,col]) + result[,col] = as.numeric(result[,col]) + result[is.na(result[,col]),] = 0 +} + +result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele) +result$VGene = gsub("[*].*", "", result$VGene) +result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) +result$JGene = gsub("[*].*", "", result$JGene) + +result$past = paste(result$AA.JUNCTION, result$VGene, result$JGene, (result$FR1.IMGT.Nb.of.mutations + result$CDR1.IMGT.Nb.of.mutations + result$FR2.IMGT.Nb.of.mutations + result$CDR2.IMGT.Nb.of.mutations + result$FR3.IMGT.Nb.of.mutations), result$best_match) + +result = result[!duplicated(result$past), ] + +result = result[,!(names(result) %in% c("past"))] + +write.table(x=result, file=output, sep="\t",quote=F,row.names=F,col.names=T) +write.table(x=unmatched, file=unmatchedfile, sep="\t",quote=F,row.names=F,col.names=T)
--- a/mutation_analysis.py Wed Sep 17 07:25:17 2014 -0400 +++ b/mutation_analysis.py Mon Sep 22 10:19:36 2014 -0400 @@ -2,79 +2,125 @@ import argparse parser = argparse.ArgumentParser() -parser.add_argument("--mutationfile", help="The '7_V-REGION-mutation-and-AA-change-table' file from the IMGT output") -parser.add_argument("--hotspotfile", help="The '10_V-REGION-mutation-hotspots' file from the IMGT output") +parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") +parser.add_argument("--genes", help="The genes available in the 'best_match' column") parser.add_argument("--output", help="Output file") args = parser.parse_args() -mutationfile = args.mutationfile #"test_VH-Ca_Cg_25nt/7_V-REGION-mutation-and-AA-change-table_test_VH-Ca_Cg_25nt_241013.txt" -hotspotsfile = args.hotspotfile #"test_VH-Ca_Cg_25nt/10_V-REGION-mutation-hotspots_test_VH-Ca_Cg_25nt_241013.txt" -outfile = args.output #"out.txt" +infile = args.input +genes = str(args.genes).split(",") +outfile = args.output + +genedic = dict() mutationdic = dict() mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") linecount = 0 -with open(mutationfile, 'r') as i: - for line in i.readlines()[1:]: +IDIndex = 0 +best_matchIndex = 0 +fr1Index = 0 +cdr1Index = 0 +fr2Index = 0 +cdr2Index = 0 +fr3Index = 0 +first=True +with open(infile, 'r') as i: + for line in i: + if first: + linesplt = line.split("\t") + IDIndex = linesplt.index("Sequence.ID") + best_matchIndex = linesplt.index("best_match") + fr1Index = linesplt.index("FR1.IMGT") + cdr1Index = linesplt.index("CDR1.IMGT") + fr2Index = linesplt.index("FR2.IMGT") + cdr2Index = linesplt.index("CDR2.IMGT") + fr3Index = linesplt.index("FR3.IMGT") + first = False + continue linecount += 1 linesplt = line.split("\t") - if linesplt[2] != "productive": - continue - ID = linesplt[1] - mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[5].split("|") if x] - mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[6].split("|") if x] - mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[7].split("|") if x] - mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[8].split("|") if x] + ID = linesplt[IDIndex] + genedic[ID] = linesplt[best_matchIndex] + mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] + mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] + mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] + mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] - mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[9].split("|") if x] + mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linecount == 0: print "No data, exiting" with open(outfile, 'w') as o: - o.write("RGYW (%),0,0,NA\n") - o.write("WRCY (%),0,0,NA\n") - o.write("WA (%),0,0,NA\n") - o.write("TW (%),0,0,NA\n") + o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) + o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) + o.write("WA (%)," + ("0,0,0\n" * len(genes))) + o.write("TW (%)," + ("0,0,0\n" * len(genes))) import sys sys.exit() hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") -RGYWCount = 0 -WRCYCount = 0 -WACount = 0 -TWCount = 0 +RGYWCount = {g: 0 for g in genes} +WRCYCount = {g: 0 for g in genes} +WACount = {g: 0 for g in genes} +TWCount = {g: 0 for g in genes} -with open(hotspotsfile, 'r') as i: - for line in i.readlines()[1:]: - linesplt = line.split("\t") - if linesplt[2] != "productive": +IDIndex = 0 +ataIndex = 0 +tatIndex = 0 +aggctatIndex = 0 +atagcctIndex = 0 +first = True +with open(infile, 'r') as i: + for line in i: + if first: + linesplt = line.split("\t") + ataIndex = linesplt.index("X.a.t.a") + tatIndex = linesplt.index("t.a.t.") + aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.") + atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.") + first = False continue - ID = linesplt[1] - RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[6].split("|") if x]] - WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[7].split("|") if x]] - WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[4].split("|") if x]] - TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[5].split("|") if x]] - RGYWCount += sum([1 for (x,y,z) in RGYW if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) - WRCYCount += sum([1 for (x,y,z) in WRCY if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) - WACount += sum([1 for (x,y,z) in WA if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) - TWCount += sum([1 for (x,y,z) in TW if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) - -path = outfile[:outfile.rfind("/") + 1] + "mutations.txt" -value = 0; -with open(path, 'r') as f: - value = f.readlines()[0].split(",")[1] -with open(outfile, 'w') as o: - if value != "0": - o.write("RGYW (%)," + str(RGYWCount) + "," + value + "," + str(round(RGYWCount / float(value) * 100, 1)) + "\n") - o.write("WRCY (%)," + str(WRCYCount) + "," + value + "," + str(round(WRCYCount / float(value) * 100, 1)) + "\n") - o.write("WA (%)," + str(WACount) + "," + value + "," + str(round(WACount / float(value) * 100, 1)) + "\n") - o.write("TW (%)," + str(TWCount) + "," + value + "," + str(round(TWCount / float(value) * 100, 1)) + "\n") - else: - o.write("RGYW (%)," + str(RGYWCount) + ",0,NA\n") - o.write("WRCY (%)," + str(WRCYCount) + ",0,NA\n") - o.write("WA (%)," + str(WACount) + ",0,NA\n") - o.write("TW (%)," + str(TWCount) + ",0,NA\n") + linesplt = line.split("\t") + gene = linesplt[best_matchIndex] + ID = linesplt[IDIndex] + RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] + WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] + WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] + TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] + RGYWCount[ID] = sum([1 for (x,y,z) in RGYW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + WRCYCount[ID] = sum([1 for (x,y,z) in WRCY if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + WACount[ID] = sum([1 for (x,y,z) in WA if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + TWCount[ID] = sum([1 for (x,y,z) in TW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) +directory = outfile[:outfile.rfind("/") + 1] +value = 0; +valuedic = dict() +for gene in genes: + with open(directory + gene + "_value.txt", 'r') as v: + valuedic[gene] = int(v.readlines()[0].rstrip()) +with open(directory + "total_value.txt", 'r') as v: + valuedic["total"] = int(v.readlines()[0].rstrip()) + +dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} +arr = ["RGYW", "WRCY", "WA", "TW"] +with open(outfile, 'w') as o: + for typ in arr: + o.write(typ + " (%)") + curr = dic[typ] + for gene in genes: + geneMatcher = re.compile(".*" + gene + ".*") + if valuedic[gene] is 0: + o.write(",0,0,0") + else: + x = sum([curr[x] for x in [y for y,z in genedic.iteritems() if geneMatcher.match(z)]]) + y = valuedic[gene] + z = str(round(x / float(valuedic[gene]) * 100, 1)) + o.write("," + str(x) + "," + str(y) + "," + z) + #for total + x = sum([y for x,y in curr.iteritems()]) + y = valuedic["total"] + z = str(round(x / float(valuedic["total"]) * 100, 1)) + o.write("," + str(x) + "," + str(y) + "," + z + "\n")
--- a/mutation_analysis.r Wed Sep 17 07:25:17 2014 -0400 +++ b/mutation_analysis.r Mon Sep 22 10:19:36 2014 -0400 @@ -1,38 +1,29 @@ args <- commandArgs(trailingOnly = TRUE) input = args[1] -summaryinput = args[2] +genes = unlist(strsplit(args[2], ",")) outputdir = args[3] setwd(outputdir) -#dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F) dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) -datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F) -datSum = datSum[,c("Sequence.ID","J.GENE.and.allele", "AA.JUNCTION")] - -dat = merge(dat, datSum, by="Sequence.ID", all.x=T) - if(length(dat$Sequence.ID) == 0){ - setwd(outputdir) - result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) - row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") - write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) - transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) - row.names(transitionTable) = c("A", "C", "G", "T") - transitionTable["A","A"] = NA - transitionTable["C","C"] = NA - transitionTable["G","G"] = NA - transitionTable["T","T"] = NA - write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) - cat("0", file="n.txt") - stop("No data") + setwd(outputdir) + result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) + row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") + write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) + transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) + row.names(transitionTable) = c("A", "C", "G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) + cat("0", file="n.txt") + stop("No data") } -#print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left")) -result = data.frame(x = 1:5, y = 1:5, z = 1:5) -row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") cleanup_columns = c("FR1.IMGT.c.a", "FR2.IMGT.g.t", @@ -111,32 +102,19 @@ dat[is.na(dat[,col]),] = 0 } -dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele) -dat$VGene = gsub("[*].*", "", dat$VGene) -dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele) -dat$JGene = gsub("[*].*", "", dat$JGene) - -dat$past = paste(dat$AA.JUNCTION, dat$VGene, dat$JGene, (dat$FR1.IMGT.Nb.of.mutations + dat$CDR1.IMGT.Nb.of.mutations + dat$FR2.IMGT.Nb.of.mutations + dat$CDR2.IMGT.Nb.of.mutations + dat$FR3.IMGT.Nb.of.mutations)) - -dat = dat[!duplicated(dat$past), ] - -VRegionMutations = sum(dat$FR1.IMGT.Nb.of.mutations + +dat$VRegionMutations = dat$FR1.IMGT.Nb.of.mutations + dat$CDR1.IMGT.Nb.of.mutations + dat$FR2.IMGT.Nb.of.mutations + dat$CDR2.IMGT.Nb.of.mutations + - dat$FR3.IMGT.Nb.of.mutations) + dat$FR3.IMGT.Nb.of.mutations -VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides + +dat$VRegionNucleotides = dat$FR1.IMGT.Nb.of.nucleotides + dat$CDR1.IMGT.Nb.of.nucleotides + dat$FR2.IMGT.Nb.of.nucleotides + dat$CDR2.IMGT.Nb.of.nucleotides + - dat$FR3.IMGT.Nb.of.nucleotides) + dat$FR3.IMGT.Nb.of.nucleotides -result[1,"x"] = VRegionMutations -result[1,"y"] = VRegionNucleotides -result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1) - -transitionMutations = sum(dat$FR1.IMGT.a.g + +dat$transitionMutations = dat$FR1.IMGT.a.g + dat$FR1.IMGT.g.a + dat$FR1.IMGT.c.t + dat$FR1.IMGT.t.c + @@ -155,58 +133,51 @@ dat$FR3.IMGT.a.g + dat$FR3.IMGT.g.a + dat$FR3.IMGT.c.t + - dat$FR3.IMGT.t.c) - -result[2,"x"] = transitionMutations -result[2,"y"] = VRegionMutations -result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1) + dat$FR3.IMGT.t.c -transversionMutations = sum( dat$FR1.IMGT.a.c + - dat$FR1.IMGT.c.a + - dat$FR1.IMGT.a.t + - dat$FR1.IMGT.t.a + - dat$FR1.IMGT.g.c + - dat$FR1.IMGT.c.g + - dat$FR1.IMGT.g.t + - dat$FR1.IMGT.t.g + - dat$CDR1.IMGT.a.c + - dat$CDR1.IMGT.c.a + - dat$CDR1.IMGT.a.t + - dat$CDR1.IMGT.t.a + - dat$CDR1.IMGT.g.c + - dat$CDR1.IMGT.c.g + - dat$CDR1.IMGT.g.t + - dat$CDR1.IMGT.t.g + - dat$FR2.IMGT.a.c + - dat$FR2.IMGT.c.a + - dat$FR2.IMGT.a.t + - dat$FR2.IMGT.t.a + - dat$FR2.IMGT.g.c + - dat$FR2.IMGT.c.g + - dat$FR2.IMGT.g.t + - dat$FR2.IMGT.t.g + - dat$CDR2.IMGT.a.c + - dat$CDR2.IMGT.c.a + - dat$CDR2.IMGT.a.t + - dat$CDR2.IMGT.t.a + - dat$CDR2.IMGT.g.c + - dat$CDR2.IMGT.c.g + - dat$CDR2.IMGT.g.t + - dat$CDR2.IMGT.t.g + - dat$FR3.IMGT.a.c + - dat$FR3.IMGT.c.a + - dat$FR3.IMGT.a.t + - dat$FR3.IMGT.t.a + - dat$FR3.IMGT.g.c + - dat$FR3.IMGT.c.g + - dat$FR3.IMGT.g.t + - dat$FR3.IMGT.t.g) +dat$transversionMutations = dat$FR1.IMGT.a.c + + dat$FR1.IMGT.c.a + + dat$FR1.IMGT.a.t + + dat$FR1.IMGT.t.a + + dat$FR1.IMGT.g.c + + dat$FR1.IMGT.c.g + + dat$FR1.IMGT.g.t + + dat$FR1.IMGT.t.g + + dat$CDR1.IMGT.a.c + + dat$CDR1.IMGT.c.a + + dat$CDR1.IMGT.a.t + + dat$CDR1.IMGT.t.a + + dat$CDR1.IMGT.g.c + + dat$CDR1.IMGT.c.g + + dat$CDR1.IMGT.g.t + + dat$CDR1.IMGT.t.g + + dat$FR2.IMGT.a.c + + dat$FR2.IMGT.c.a + + dat$FR2.IMGT.a.t + + dat$FR2.IMGT.t.a + + dat$FR2.IMGT.g.c + + dat$FR2.IMGT.c.g + + dat$FR2.IMGT.g.t + + dat$FR2.IMGT.t.g + + dat$CDR2.IMGT.a.c + + dat$CDR2.IMGT.c.a + + dat$CDR2.IMGT.a.t + + dat$CDR2.IMGT.t.a + + dat$CDR2.IMGT.g.c + + dat$CDR2.IMGT.c.g + + dat$CDR2.IMGT.g.t + + dat$CDR2.IMGT.t.g + + dat$FR3.IMGT.a.c + + dat$FR3.IMGT.c.a + + dat$FR3.IMGT.a.t + + dat$FR3.IMGT.t.a + + dat$FR3.IMGT.g.c + + dat$FR3.IMGT.c.g + + dat$FR3.IMGT.g.t + + dat$FR3.IMGT.t.g -result[3,"x"] = transversionMutations -result[3,"y"] = VRegionMutations -result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1) -transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a + +dat$transitionMutationsAtGC = dat$FR1.IMGT.g.a + dat$FR1.IMGT.c.t + dat$CDR1.IMGT.g.a + dat$CDR1.IMGT.c.t + @@ -215,9 +186,9 @@ dat$CDR2.IMGT.g.a + dat$CDR2.IMGT.c.t + dat$FR3.IMGT.g.a + - dat$FR3.IMGT.c.t) + dat$FR3.IMGT.c.t -totalMutationsAtGC = sum(dat$FR1.IMGT.g.a + +dat$totalMutationsAtGC = dat$FR1.IMGT.g.a + dat$FR1.IMGT.c.t + dat$FR1.IMGT.c.a + dat$FR1.IMGT.g.c + @@ -246,18 +217,99 @@ dat$FR3.IMGT.c.a + dat$FR3.IMGT.g.c + dat$FR3.IMGT.c.g + - dat$FR3.IMGT.g.t) + dat$FR3.IMGT.g.t -result[4,"x"] = transitionMutationsAtGC -result[4,"y"] = totalMutationsAtGC -result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1) - -result[5,"x"] = totalMutationsAtGC -result[5,"y"] = VRegionMutations -result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1) -#transitiontable +setwd(outputdir) + +matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=5) +for(i in 1:length(genes)){ + gene = genes[i] + tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),] + if(gene == "."){ + tmp = dat + } + if(length(tmp) == 0){ + cat("0", file=paste(gene, "_value.txt" ,sep="")) + next + } + j = i - 1 + x = (j * 3) + 1 + y = (j * 3) + 2 + z = (j * 3) + 3 + matrx[1,x] = sum(tmp$VRegionMutations) + matrx[1,y] = sum(tmp$VRegionNucleotides) + matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) + matrx[2,x] = sum(tmp$transitionMutations) + matrx[2,y] = sum(tmp$VRegionMutations) + matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) + matrx[3,x] = sum(tmp$transversionMutations) + matrx[3,y] = sum(tmp$VRegionMutations) + matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) + matrx[4,x] = sum(tmp$transitionMutationsAtGC) + matrx[4,y] = sum(tmp$totalMutationsAtGC) + matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) + matrx[5,x] = sum(tmp$totalMutationsAtGC) + matrx[5,y] = sum(tmp$VRegionMutations) + matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) + + transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) + row.names(transitionTable) = c("A", "C", "G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + nts = c("a", "c", "g", "t") + + + for(nt1 in nts){ + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + transitionTable[NT1,NT2] = sum( tmp[,FR1] + + tmp[,CDR1] + + tmp[,FR2] + + tmp[,CDR2] + + tmp[,FR3]) + } + } + write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) + write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + + cat(matrx[1,x], file=paste(gene, "_value.txt" ,sep="")) + cat(length(tmp$Sequence.ID), file=paste(gene, "_n.txt" ,sep="")) +} + +#again for all of the data +tmp = dat +j = i +x = (j * 3) + 1 +y = (j * 3) + 2 +z = (j * 3) + 3 +matrx[1,x] = sum(tmp$VRegionMutations) +matrx[1,y] = sum(tmp$VRegionNucleotides) +matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1) +matrx[2,x] = sum(tmp$transitionMutations) +matrx[2,y] = sum(tmp$VRegionMutations) +matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) +matrx[3,x] = sum(tmp$transversionMutations) +matrx[3,y] = sum(tmp$VRegionMutations) +matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) +matrx[4,x] = sum(tmp$transitionMutationsAtGC) +matrx[4,y] = sum(tmp$totalMutationsAtGC) +matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) +matrx[5,x] = sum(tmp$totalMutationsAtGC) +matrx[5,y] = sum(tmp$VRegionMutations) +matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) row.names(transitionTable) = c("A", "C", "G", "T") @@ -269,40 +321,88 @@ for(nt1 in nts){ - for(nt2 in nts){ - if(nt1 == nt2){ - next - } - NT1 = LETTERS[letters == nt1] - NT2 = LETTERS[letters == nt2] - FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") - CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") - FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") - CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") - FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") - transitionTable[NT1,NT2] = sum( dat[,FR1] + - dat[,CDR1] + - dat[,FR2] + - dat[,CDR2] + - dat[,FR3]) - } + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + transitionTable[NT1,NT2] = sum( tmp[,FR1] + + tmp[,CDR1] + + tmp[,FR2] + + tmp[,CDR2] + + tmp[,FR3]) + } +} +write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) +write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file="matched_all.txt", sep="\t",quote=F,row.names=F,col.names=T) +cat(matrx[1,x], file="total_value.txt") +cat(length(tmp$Sequence.ID), file="total_n.txt") + + + +result = data.frame(matrx) +row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C.G (%)") + +write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) + + +if (!("ggplot2" %in% rownames(installed.packages()))) { + install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") +} +library(ggplot2) + +genesForPlot = gsub("[0-9]", "", dat$best_match) +genesForPlot = data.frame(table(genesForPlot)) +colnames(genesForPlot) = c("Gene","Freq") +genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) + +pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) +pc = pc + geom_bar(width = 1, stat = "identity") +pc = pc + coord_polar(theta="y") +pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")")) + +png(filename="all.png") +pc +dev.off() + + +#blegh +genesForPlot = dat[grepl("ca", dat$best_match),]$best_match +if(length(genesForPlot) > 0){ + genesForPlot = data.frame(table(genesForPlot)) + colnames(genesForPlot) = c("Gene","Freq") + genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) + + pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) + pc = pc + geom_bar(width = 1, stat = "identity") + pc = pc + coord_polar(theta="y") + pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")")) + + + png(filename="ca.png") + print(pc) + dev.off() } -setwd(outputdir) -write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) -write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) -cat(length(dat$Sequence.ID), file="n.txt") +genesForPlot = dat[grepl("cg", dat$best_match),]$best_match +if(length(genesForPlot) > 0){ + genesForPlot = data.frame(table(genesForPlot)) + colnames(genesForPlot) = c("Gene","Freq") + genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) + + pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) + pc = pc + geom_bar(width = 1, stat = "identity") + pc = pc + coord_polar(theta="y") + pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG", "( n =", sum(genesForPlot$Freq), ")")) -weblogo = dat[,c("Sequence.ID", "VGene")] -weblogo$VGene = gsub("\\*.*", "", weblogo$VGene) - -rs12 = read.table("HS12RSS.txt", sep="\t", header=TRUE) -rs23 = read.table("HS23RSS.txt", sep="\t", header=TRUE) - -result12 = merge(weblogo, rs12, by.x="VGene", by.y="Gene") -result23 = merge(weblogo, rs23, by.x="VGene", by.y="Gene") - -write.table(x=result12$Sequence, file="weblogo_in_rs12.txt", sep=",",quote=F,row.names=F,col.names=F) -write.table(x=result23$Sequence, file="weblogo_in_rs23.txt", sep=",",quote=F,row.names=F,col.names=F) - + png(filename="cg.png") + print(pc) + dev.off() +}
--- a/piechart.r Wed Sep 17 07:25:17 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -if (!("ggplot2" %in% rownames(installed.packages()))) { - install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") -} -library(ggplot2) - -args <- commandArgs(trailingOnly = TRUE) - -vls = args[1] -lbls = args[2] -name = args[3] -output = args[4] - -vls = as.numeric(unlist(strsplit(vls, ","))) -lbls = unlist(strsplit(lbls, ",")) - -pc = ggplot(data.frame(Type=lbls, vls=vls), aes(x = factor(1), y=vls, fill=Type)) -pc = pc + geom_bar(width = 1, stat = "identity") -pc = pc + coord_polar(theta="y") -pc = pc + xlab(" ") + ylab(" ") + ggtitle(name) - -png(filename=output) -#pie(vls, labels=lbls, col=rainbow(length(vls)), main=name) -pc -dev.off()
--- a/seqlogo Wed Sep 17 07:25:17 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,356 +0,0 @@ -#!/usr/bin/perl -w - -=head1 NAME - - seqlogo - runs the logo creation script - -=head1 SYNOPSIS - - seqlogo [OPTION]...-f [FILENAME] - -=head1 DESCRIPTION - - Creates a logo for the given input filename. - - Available options: - -B <bar bits> Number of bits in bar (real between 0, 1) - -T <tic bits> Number of bits between tic marks - -C <chars per line> Number of characters per line of logo - -d <box shrink factor> Shrink factor of characters if option c is toggled - -E <error bar fraction> Fraction of error bar to show (real between 0, 1 ) - -f <input file> Input filename - -F <format> Format of output (EPS, GIF, PDF, PNG), - for STDOUT - -h <logo height> Height of output logo (real > 0) - -k <kind of data> 0 for amino acid, 1 for nucleic acid ; if not - defined, a 90% certainty method is used to - determine whether the input data is amino acid or - nucleic acid - -l <sequence lower bound> Lower bound of sequence (integer) - -m <sequence upper bound> Upper bound of sequence (integer) - -o <output file> Name of output file - -s <sequence start> Sequence start number, defaults to 1 (int) - -t <titletext> Text of title, enclosed in "" if more than one word - -w <logo width> Width of output logo - -x <x-axis label> Label for x-axis - -y <y-axis label> Label for y-axis - - Available toggles (no values associated) - -a Toggle antialiasing - -b Toggle bar ends - -c Toggle color - -e Toggle error bar - -M Toggle small sample correction - -O Toggle outlining of characters - -n Toggle numbering of x-axis - -S Toggle stretching of logos to entire length - -X Toggle boxing of characters - -Y Toggle y-axis - -=head1 EXAMPLE - - The following command takes as input "input.fasta" and returns the logo in the - form "logo.eps". Antialiasing, bar ends, color, small sample correction, - numbering of x-axis, and y-axis labelling are turned on: - - seqlogo -f input.fasta -F EPS -o logo.eps -abcMnY - -=cut - -use vars qw($PATH); - -BEGIN { - - use FindBin qw($Bin); - use lib "$Bin"; - - $PATH = $Bin; - -## $PATH = "/h/gary/Seqlogo/Code/"; -# $PATH = "/n/weblogo/home/httpd/weblogo/pub/beta/Seqlogo/Code"; -# unshift(@INC, $PATH); -} - -use logo; -use template; -use Getopt::Std; -use FileHandle; - - -my $opts; -$opts = - $opt_a || # antialiasing - $opt_b || # bar ends (0 or 1) - $opt_c || # color - $opt_e || # show error bar (0 or 1) - $opt_n || # numbering (0 or 1) - $opt_M || # small sample correction (0 or 1) - $opt_O || # outline (0 or 1) - $opt_S || # stretch - $opt_X || # box (0 for no box, 1 for box) - $opt_Y || # y axis - - $opt_B || # bar bits (real) - $opt_T || # tics bits (real) - $opt_C || # chars per line - $opt_d || # box shrinking factor (<1) - $opt_E || # error bar fraction (real) - $opt_f || # input filename - $opt_F || # format (PNG, EPS, PDF, GIF) - $opt_h || # logo height (cm) - $opt_k || # 0 = amino acid, 1 = nucleic acid - $opt_l || # lower bound of sequence to put in logo - $opt_m || # max bound of sequence to put in logo - $opt_o || # output file - $opt_s || # start number for very beginning of sequence - $opt_t || # title text (string) - $opt_w || # logo width (cm) - $opt_x || # x axis label - $opt_y || # y axis label -$opts; - -################################################################################ -##### USAGE ##### -################################################################################ - -sub usage { - my $usage = <<END - -usage: seqlogo -f <input filename> [OPTIONs with values] -Creates a logo for the given input filename. - -Available options: - -B <bar bits> Number of bits in bar (real # > 0) - -T <tic bits> Number of bits between tic marks - -C <chars per line> Number of characters per line of logo - -d <box shrink factor> Shrink factor of characters if option c is toggled - -E <error bar fraction> Fraction of error bar to show (real # > 0) - -f <input filename> Input filename - -F <format> Format of output (EPS, GIF, PDF, PNG), - for STDOUT - -h <logo height> Height of output logo (real # > 0) - -k <kind of data> 0 for amino acid, 1 for nucleic acid - -l <sequence lower bound> Lower bound of sequence (integer) - -m <sequence upper bound> Upper bound of sequence (integer) - -o <output file> Name of output file - -s <sequence start> Sequence start number, defaults to 1 (int) - -t <titletext> Text of title, enclosed in "" if more than one word - -w <logo width> Width of output logo - -x <x-axis label> Label for x-axis - -y <y-axis label> Label for y-axis - -Available toggles (no values associated) bOenc - -a Toggle antialiasing - -b Toggle bar ends - -c Toggle color - -e Toggle error bar - -M Toggle small sample correction - -O Toggle outlining of characters - -p Toggle fineprint - -n Toggle numbering of x-axis - -S Toggle stretching of logos to entire length - -X Toggle boxing of characters - -Y Toggle y-axis - -END - ; - - return $usage; -} - -################################################################################ -##### MAIN FUNCTION ##### -################################################################################ - -# arguments : $_[0] : file name -MAIN: { - init(); - - # feed data from file to make height data array reference - my @input = <INPUTFILE>; - close (INPUTFILE); - my %heightparams = ( - smallsampletoggle => $opt_M, - input_kind => $opt_k, - stretch => $opt_S - ); - - my ($heightdata_r, $desc_r, $kind, $goodlength, $badline, $validformat) = - logo::getHeightData(\@input, \%heightparams); - - # check for errors - if ((defined $validformat) && ($validformat == 1)) { - die("Error: Invalid input format does not conform to FASTA, " . - "CLUSTAL, or Flat.\n"); - } - if (!$goodlength) { - die("Error: Number of characters in each logo line is not " . - "consistent, starting at: ", $badline, "\n"); - } - - my %input = ( - LOGO_HEIGHT => $opt_h, - LOGO_WIDTH => $opt_w, - COLORSCHEME => ($opt_c) ? "DEFAULT" : "BW", - - LOGOSTART => $opt_l, - LOGOEND => $opt_m, - START_NUM => $opt_s, - - TITLETEXT => $opt_t, - YAXIS_LABEL => $opt_y, - XAXIS_LABEL => $opt_x, - - BOXSHRINK => $opt_d, - CHARSPERLINE => $opt_C, - BARBITS => $opt_B, - TICBITS => $opt_T, - RES => "96", - "FORMAT" => (uc $opt_F), - - # toggles - ANTIALIAS => $opt_a, - ERRBAR => $opt_e, - FINEPRINT => $opt_p, - NUMBERING => $opt_n, - OUTLINE => $opt_O, - SHOWENDS => $opt_b, - SHOWINGBOX => $opt_X, - YAXIS => $opt_Y - ); - - template::create_template(\%input, $kind, $desc_r, $heightdata_r, $opt_o, $PATH); -} - - -################################################################################ -##### FUNCTINOS FOR INIT ##### -################################################################################ - -# all ints -sub isInt { - return ($_[0] =~ /^[\+\-]?\d+$/) ? 1 : 0; -} - -# all reals -sub isReal { - return ($_[0] =~ /^[\+\-]?\d*.\d*?$/) ? 1 : 0; -} - -sub isZeroOrOne { - return ($_[0] == 0 || $_[0] == 1) ? 1 : 0; -} - -sub init { - -# if (not defined $PATH) { -# die ("PATH must be defined\n"); -# } elsif (not -e $PATH) { -# die ("PATH ($PATH) must exist\n"); -# } elsif (not -d $PATH) { -# die ("PATH ($PATH) must be a directory\n"); -# } - - &getopts('T:B:C:d:E:f:F:h:k:l:m:o:s:t:w:x:y:abcenMOpSXY'); - - if (defined $opt_B && - (!isReal($opt_B) || $opt_B < 0) ) { - printf("\noption B must be a positive real, but is $opt_B, $!\n"); - die &usage(); - } - if (defined $opt_d && - ( !isReal($opt_d) || $opt_d < 0 || $opt_d > 1) ) { - print("\noption d must be a real between 0 and 1, but is $opt_d, $!\n"); - die &usage(); - } - if (defined $opt_E && - (!isReal($opt_E) || $opt_E < 0 || $opt_E > 1) ) { - print("\noption E must be a real between 0 and 1, but is $opt_E, $!\n"); - die &usage(); - } - if (defined $opt_f) { - open (INPUTFILE, "$opt_f") or die "Couldn't open input filename $opt_f: $!\n"; - } else { - print("\ninput file not specified, terminating...\n"); - die &usage(); - } - if (defined $opt_h && - (!isReal($opt_h) || $opt_h < 0) ) { - print("\noption h must be a positive real, but is $opt_h, $!\n"); - die &usage(); - } - if (defined $opt_w && - (!isReal($opt_w) || $opt_w < 0) ) { - print("\noption w must be a positive real, but is $opt_w, $!\n"); - die &usage(); - } - if (defined $opt_k && - (!isZeroOrOne($opt_k)) ) { - print("\noption k must be 0 or 1, but is $opt_k, $!\n"); - die &usage(); - } - - #toggles - if (!defined $opt_a) { - $opt_a = 0; - } - if (!defined $opt_b) { - $opt_b = 0; - } - if (!defined $opt_c) { - $opt_c = 0; - } - if (!defined $opt_e) { - $opt_e = 0; - } - if (!defined $opt_n) { - $opt_n = 0; - } - if (!defined $opt_M) { - $opt_M = 0; - } - if (!defined $opt_O) { - $opt_O = 0; - } - if (!defined $opt_p) { - $opt_p = 0; - } - if (!defined $opt_S) { - $opt_S = 0; - } - if (!defined $opt_X) { - $opt_X = 0; - }; - if (!defined $opt_Y) { - $opt_Y = 0; - }; - - if (!defined $opt_F) { - $opt_F = "EPS"; # default to EPS - } - if (!defined $opt_o) { - $opt_o = "-"; # for standard out - } else { -# $opt_o =~ s/\.\S*$//; # remove extension if there is one - $opt_o .= "." . (lc $opt_F); # make file name - } - - if (defined $opt_C && - (!isInt($opt_C) || $opt_C < 0) ) { - printf("\noption C must be a postive integer, but is $opt_C, $!\n"); - die &usage(); - } - - if (defined $opt_l && !isInt($opt_l)) { - printf("\noption l must be an integer, but is $opt_l, $!\n"); - die &usage(); - } - - if (defined $opt_m && !isInt($opt_m)) { - printf("\noption m must be an integer, but is $opt_m, $!\n"); - die &usage(); - } - - if (defined $opt_s && !isInt($opt_s)) { - printf("\noption s must be an integer, but is $opt_s, $!\n"); - die &usage(); - } -}
--- a/template.eps Wed Sep 17 07:25:17 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,674 +0,0 @@ -%!PS-Adobe-3.0 EPSF-3.0 -%%Title: Sequence Logo : {$TITLE} -%%Creator: {$CREATOR} -%%CreationDate: {$CREATIONDATE} -%%BoundingBox: 0 0 {$LOGOWIDTHPOINTS} {$LOGOHEIGHTPOINTS} -%%Pages: 0 -%%DocumentFonts: -%%EndComments - -{$DESC} - - -% ---- CONSTANTS ---- -/cmfactor 72 2.54 div def % defines points -> cm conversion -/cm {cmfactor mul} bind def % defines centimeters - - -% ---- VARIABLES ---- - -{$COLORDEF} - -/logoWidth {$LOGOWIDTH} cm def -/logoHeight {$LOGOLINEHEIGHT} cm def -/logoTitle ({$TITLE}) def - -/yaxis {$YAXIS} def -/yaxisLabel ({$YAXIS_LABEL}) def -/yaxisBits {$BARBITS} def % bits -/yaxisTicBits {$TICBITS} def - - -/xaxis {$NUMBERING} def -/xaxisLabel ({$XAXIS_LABEL}) def -/showEnds ({$SHOWENDS}) def % d: DNA, p: PROTEIN, -: none - -/showFineprint true def -/fineprint ({$FINEPRINT}) def - -/charsPerLine {$CHARSPERLINE} def -/logoLines {$LOGOLINES} def - -/showingBox ({$SHOWINGBOX}) def %n s f -/shrinking {$SHRINKBOOLEAN} def -/shrink {$SHRINKFACTOR} def -/outline {$OUTLINE} def - -/IbeamFraction {$ERRORBARFRACTION} def -/IbeamGray 0.50 def -/IbeamLineWidth 0.5 def - -/fontsize 12 def -/titleFontsize 14 def -/smallFontsize 6 def - -/defaultColor {$DEFAULT_COLOR} def - -{$COLORDICT} - -% Standard DNA/RNA color scheme -% /colorDict << -% (G) orange -% (T) red -% (C) blue -% (A) green -% (U) red -% >> def - -% Standard Amino Acid colors -%/colorDict << -% (G) green -% (S) green -% (T) green -% (Y) green -% (C) green -% (N) purple -% (Q) purple -% (K) blue -% (R) blue -% (H) blue -% (D) red -% (E) red -% (P) black -% (A) black -% (W) black -% (F) black -% (L) black -% (I) black -% (M) black -% (V) black -%>> def - - - -% ---- DERIVED PARAMETERS ---- - -/leftMargin - fontsize 3.5 mul - -def - -/bottomMargin - fontsize 0.75 mul - - % Add extra room for axis - xaxis {fontsize 1.75 mul add } if - xaxisLabel () eq {} {fontsize 0.75 mul add} ifelse -def - - -/topMargin - logoTitle () eq { 10 }{titleFontsize 4 add} ifelse -def - -/rightMargin - %Add extra room if showing ends - showEnds (-) eq { fontsize}{fontsize 1.5 mul} ifelse -def - -/yaxisHeight - logoHeight - bottomMargin sub - topMargin sub -def - -/ticWidth fontsize 2 div def - -/pointsPerBit yaxisHeight yaxisBits div def - -/isBoxed - showingBox (s) eq - showingBox (f) eq or { - true - } { - false - } ifelse -def - -/stackMargin 1 def - -% Do not add space aroung characters if characters are boxed -/charRightMargin - isBoxed { 0.0 } {stackMargin} ifelse -def - -/charTopMargin - isBoxed { 0.0 } {stackMargin} ifelse -def - -/charWidth - logoWidth - leftMargin sub - rightMargin sub - charsPerLine div - charRightMargin sub -def - -/charWidth4 charWidth 4 div def -/charWidth2 charWidth 2 div def - -/stackWidth - charWidth charRightMargin add -def - -/numberFontsize - fontsize charWidth lt {fontsize}{charWidth} ifelse -def - -% movements to place 5'/N and 3'/C symbols -/leftEndDeltaX fontsize neg def -/leftEndDeltaY fontsize 1.5 mul neg def -/rightEndDeltaX fontsize 0.25 mul def -/rightEndDeltaY leftEndDeltaY def - -% Outline width is proporional to charWidth, -% but no less that 1 point -/outlinewidth - charWidth 32 div dup 1 gt {}{pop 1} ifelse -def - - -% ---- PROCEDURES ---- - -/StartLogo { - % Save state - save - gsave - - % Print Logo Title, top center - gsave - SetTitleFont - - logoWidth 2 div - logoTitle - stringwidth pop 2 div sub - logoHeight logoLines mul - titleFontsize sub - moveto - - logoTitle - show - grestore - - % Print X-axis label, bottom center - gsave - SetStringFont - - logoWidth 2 div - xaxisLabel stringwidth pop 2 div sub - fontsize 3 div - moveto - - xaxisLabel - show - grestore - - % Show Fine Print - showFineprint { - gsave - SetSmallFont - logoWidth - fineprint stringwidth pop sub - smallFontsize sub - smallFontsize 3 div - moveto - - fineprint show - grestore - } if - - % Move to lower left corner of last line, first stack - leftMargin bottomMargin translate - - % Move above first line ready for StartLine - 0 logoLines logoHeight mul translate - - SetLogoFont -} bind def - -/EndLogo { - grestore - showpage - restore -} bind def - - -/StartLine{ - % move down to the bottom of the line: - 0 logoHeight neg translate - - gsave - yaxis { MakeYaxis } if - xaxis { ShowLeftEnd } if -} bind def - -/EndLine{ - xaxis { ShowRightEnd } if - grestore -} bind def - - -/MakeYaxis { - gsave - stackMargin neg 0 translate - ShowYaxisBar - ShowYaxisLabel - grestore -} bind def - - -/ShowYaxisBar { - gsave - SetStringFont - - /str 10 string def % string to hold number - /smallgap stackMargin 2 div def - - % Draw first tic and bar - gsave - ticWidth neg 0 moveto - ticWidth 0 rlineto - 0 yaxisHeight rlineto - stroke - grestore - - - % Draw the tics - % initial increment limit proc for - 0 yaxisTicBits yaxisBits abs %cvi - {/loopnumber exch def - - % convert the number coming from the loop to a string - % and find its width - loopnumber 10 str cvrs - /stringnumber exch def % string representing the number - - stringnumber stringwidth pop - /numberwidth exch def % width of number to show - - /halfnumberheight - stringnumber CharBoxHeight 2 div - def - - numberwidth % move back width of number - neg loopnumber pointsPerBit mul % shift on y axis - halfnumberheight sub % down half the digit - - moveto % move back the width of the string - - ticWidth neg smallgap sub % Move back a bit more - 0 rmoveto % move back the width of the tic - - stringnumber show - smallgap 0 rmoveto % Make a small gap - - % now show the tic mark - 0 halfnumberheight rmoveto % shift up again - ticWidth 0 rlineto - stroke - } for - grestore -} bind def - -/ShowYaxisLabel { - gsave - SetStringFont - - % How far we move left depends on the size of - % the tic labels. - /str 10 string def % string to hold number - yaxisBits yaxisTicBits div cvi yaxisTicBits mul - str cvs stringwidth pop - ticWidth 1.5 mul add neg - - - yaxisHeight - yaxisLabel stringwidth pop - sub 2 div - - translate - 90 rotate - 0 0 moveto - yaxisLabel show - grestore -} bind def - - -/StartStack { % <stackNumber> startstack - xaxis {MakeNumber}{pop} ifelse - gsave -} bind def - -/EndStack { - grestore - stackWidth 0 translate -} bind def - - -% Draw a character whose height is proportional to symbol bits -/MakeSymbol{ % charbits character MakeSymbol - gsave - /char exch def - /bits exch def - - /bitsHeight - bits pointsPerBit mul - def - - /charHeight - bitsHeight charTopMargin sub - dup - 0.0 gt {}{pop 0.0} ifelse % if neg replace with zero - def - - charHeight 0.0 gt { - char SetColor - charWidth charHeight char ShowChar - - showingBox (s) eq { % Unfilled box - 0 0 charWidth charHeight false ShowBox - } if - - showingBox (f) eq { % Filled box - 0 0 charWidth charHeight true ShowBox - } if - - } if - - grestore - - 0 bitsHeight translate -} bind def - - -/ShowChar { % <width> <height> <char> ShowChar - gsave - /tc exch def % The character - /ysize exch def % the y size of the character - /xsize exch def % the x size of the character - - /xmulfactor 1 def - /ymulfactor 1 def - - - % if ysize is negative, make everything upside down! - ysize 0 lt { - % put ysize normal in this orientation - /ysize ysize abs def - xsize ysize translate - 180 rotate - } if - - shrinking { - xsize 1 shrink sub 2 div mul - ysize 1 shrink sub 2 div mul translate - - shrink shrink scale - } if - - % Calculate the font scaling factors - % Loop twice to catch small correction due to first scaling - 2 { - gsave - xmulfactor ymulfactor scale - - ysize % desired size of character in points - tc CharBoxHeight - dup 0.0 ne { - div % factor by which to scale up the character - /ymulfactor exch def - } % end if - {pop pop} - ifelse - - xsize % desired size of character in points - tc CharBoxWidth - dup 0.0 ne { - div % factor by which to scale up the character - /xmulfactor exch def - } % end if - {pop pop} - ifelse - grestore - } repeat - - % Adjust horizontal position if the symbol is an I - tc (I) eq { - charWidth 2 div % half of requested character width - tc CharBoxWidth 2 div % half of the actual character - sub 0 translate - % Avoid x scaling for I - /xmulfactor 1 def - } if - - - % ---- Finally, draw the character - - newpath - xmulfactor ymulfactor scale - - % Move lower left corner of character to start point - tc CharBox pop pop % llx lly : Lower left corner - exch neg exch neg - moveto - - outline { % outline characters: - outlinewidth setlinewidth - tc true charpath - gsave 1 setgray fill grestore - clip stroke - } { % regular characters - tc show - } ifelse - - grestore -} bind def - - -/ShowBox { % x1 y1 x2 y2 filled ShowBox - gsave - /filled exch def - /y2 exch def - /x2 exch def - /y1 exch def - /x1 exch def - newpath - x1 y1 moveto - x2 y1 lineto - x2 y2 lineto - x1 y2 lineto - closepath - - clip - - filled { - fill - }{ - 0 setgray stroke - } ifelse - - grestore -} bind def - - -/MakeNumber { % number MakeNumber - gsave - SetNumberFont - stackWidth 0 translate - 90 rotate % rotate so the number fits - dup stringwidth pop % find the length of the number - neg % prepare for move - stackMargin sub % Move back a bit - charWidth (0) CharBoxHeight % height of numbers - sub 2 div % - moveto % move back to provide space - show - grestore -} bind def - - -/Ibeam{ % heightInBits Ibeam - gsave - % Make an Ibeam of twice the given height in bits - /height exch pointsPerBit mul def - /heightDRAW height IbeamFraction mul def - - IbeamLineWidth setlinewidth - IbeamGray setgray - - charWidth2 height neg translate - ShowIbar - newpath - 0 0 moveto - 0 heightDRAW rlineto - stroke - newpath - 0 height moveto - 0 height rmoveto - currentpoint translate - ShowIbar - newpath - 0 0 moveto - 0 heightDRAW neg rlineto - currentpoint translate - stroke - grestore -} bind def - - -/ShowIbar { % make a horizontal bar - gsave - newpath - charWidth4 neg 0 moveto - charWidth4 0 lineto - stroke - grestore -} bind def - - -/ShowLeftEnd { - gsave - SetStringFont - leftEndDeltaX leftEndDeltaY moveto - showEnds (d) eq {(5) show ShowPrime} if - showEnds (p) eq {(N) show} if - grestore -} bind def - - -/ShowRightEnd { - gsave - SetStringFont - rightEndDeltaX rightEndDeltaY moveto - showEnds (d) eq {(3) show ShowPrime} if - showEnds (p) eq {(C) show} if - grestore -} bind def - - -/ShowPrime { - gsave - SetPrimeFont - (\242) show - grestore -} bind def - - -/SetColor{ % <char> SetColor - dup colorDict exch known { - colorDict exch get aload pop setrgbcolor - } { - pop - defaultColor aload pop setrgbcolor - } ifelse -} bind def - -% define fonts -/SetTitleFont {/Times-Bold findfont titleFontsize scalefont setfont} bind def -/SetLogoFont {/Helvetica-Narrow-Bold findfont charWidth scalefont setfont} bind def -/SetStringFont{/Helvetica-Bold findfont fontsize scalefont setfont} bind def -/SetPrimeFont {/Symbol findfont fontsize scalefont setfont} bind def -/SetSmallFont {/Helvetica findfont smallFontsize scalefont setfont} bind def - -/SetNumberFont { - /Helvetica-Bold findfont - numberFontsize - scalefont - setfont -} bind def - -%Take a single character and return the bounding box -/CharBox { % <char> CharBox <lx> <ly> <ux> <uy> - gsave - newpath - 0 0 moveto - % take the character off the stack and use it here: - true charpath - flattenpath - pathbbox % compute bounding box of 1 pt. char => lx ly ux uy - % the path is here, but toss it away ... - grestore -} bind def - - -% The height of a characters bounding box -/CharBoxHeight { % <char> CharBoxHeight <num> - CharBox - exch pop sub neg exch pop -} bind def - - -% The width of a characters bounding box -/CharBoxWidth { % <char> CharBoxHeight <num> - CharBox - pop exch pop sub neg -} bind def - - -% Deprecated names -/startstack {StartStack} bind def -/endstack {EndStack} bind def -/makenumber {MakeNumber} bind def -/numchar { MakeSymbol } bind def - -%%EndProlog - -%%Page: 1 1 -StartLogo -StartLine % line number 1 - -{$DATA} - -EndLine -EndLogo - -%%EOF - - - - - - - - - - - - - - - - - - -
--- a/template.pm Wed Sep 17 07:25:17 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,672 +0,0 @@ -=head1 NAME - - template.pm - creates logo output in various formats - -=head1 SYNOPSIS - - Perl module - -=head1 DESCRIPTION - - logo.cgi and run.pl collect the logo data. They can then enter - template::create_template to create logo output in the following formats: - - * EPS - * GIF - * PDF - * PNG - - - - If the configuration file "logo.conf" exists in the working directory, then - it will be parsed for the locations of GhostScript (gs) and convert. The - following is an - example of the configuration file ("#" at beginning of line indicates - comment): - - # Make configuration changes here. Rename this file to logo.conf - # gs version 5.5 does not work, 6.5 does - # set the PATHS - gs=/usr/local/bin/gs - convert=/usr/X11R6/bin/convert - -=cut - -package template; -use strict; - - -################################################################################ -######## STATE FOR FILLINGS ########## -################################################################################ - -# BARBITS number of bits in vertical y-axis bar -# CHARWIDTH width of characters in logo -# COLORSCHEME "a" for amino acid; -# "n" for nucleic acid; -# "b" for black (no color scheme) -# color scheme -# DATA string of heights of characters in cm -# DESC description of aligned sequences -# FINEPRINT enable adverts/credits -# LOGOLINES number of lines of logo -# LOGOHEIGHT height of EACH LINE in logo (in cm) -# LOGOWIDTH width of final logo in cm -# LOGOHEIGHTPOINTS height of final logo in points -# LOGOWIDTHPOINTS height of final logo in points -# LOGOSTART logo output will begin at residue LOGOSTART -# LOGOEND logo output will end at residue LOGOEND -# ERRBAR 1 to include error bar, 0 to exclude -# ERRORBARFRACTION percent of error bar to show in range [0,1] -# KIND $AA for amino acid, $NA for nucleic acid -# NUMBERING 1 to show residue numbers, 0 to exclude -# OUTLINE 1 to print characters in outline form; -# 0 to print characters in solid form -# SHOWENDS "d" to show 5' and 3' ends; -# "p" to show N and C termini; -# "-" to exclude end markers -# SHOWINGBOX "n" to have No boxes around characters; -# "s" to have boxes around characters, with Shrinking; -# "f" to have Filled boxes around characters -# SHRINKBOOLEAN 1 to shrink characters; 0 to exclude shrinking -# SHRINKFACTOR amount to shrink in range from 1(no shrinking) to -# 0(full shrinking) -# START_NUM start number for very beginning of sequence -# TITLE title of logo -# YAXIS 1 to turn on y axis and its labels - -################################################################################ -##### VARIABLES AND DEFAULT VALUES ##### -################################################################################ - -my %defaults = ( - LOGOHEIGHT => 5, - LOGOWIDTH => 8, - - YAXIS => "false", - TITLE => "", - YAXIS_LABEL => "bits", - XAXIS_LABEL => "", - - BARENDS => "false", - OUTLINE => "false", - SHOWINGBOX => "n", - NUMBERING => "false", - #FINEPRINT => "Created by: G. E. Crooks, G. Hon, J.-M. Chandonia & S. E. Brenner, (2002) <weblogo.berkeley.edu>", - FINEPRINT => "weblogo.berkeley.edu", - - ERRORBARFRACTION => "1", - ERRBAR => "0", - SHRINKFACTOR => "1", - START_NUM => "1", - - DEFAULT_COLOR => "black", - - black => "[0 0 0]", - red => "[0.8 0 0]", - green => "[0 0.8 0]", - blue => "[0 0 0.8]", - yellow => "[1 0.7 1.0]", - purple => "[0.8 0 0.8]", - orange => "[1 0.7 0]" - ); - -my $AA = 0; -my $NA = 1; -my $PATH; - -################################################################################ -##### SOME FUNCTIONS ##### -################################################################################ - - -sub create_template { - my ($input, $kind, $desc_r, $data_r, $outfile, $path) = @_; - - # set path - $PATH = $path; - - #Create EPS file - my %fillings; - - # put parameters in fillings - makeFillings(\%fillings, $input, $kind, $desc_r, $defaults{FINEPRINT}); - - # set default data if not filled - setDefaults(\%fillings, \%defaults, scalar @$data_r); - - # put color in fillings - setColors(\%fillings, \%defaults); - - # put data in fillings - setData(\%fillings, $data_r); - - # make eps output - my $eps = fillTemplate("$PATH/template.eps", \%fillings); - my $format = $input->{FORMAT}; - - # convert - my ($gsprog, $convertprog) = getProgs(); - -# print STDERR "(gsprog, convertprog) = ($gsprog, $convertprog)\n"; - - my $width = $fillings{LOGOWIDTHPOINTS}; - my $height = $fillings{LOGOHEIGHTPOINTS}; # height of entire logo - my $res = $input->{RES}; - my $antialias = (defined $input->{ANTIALIAS} && $input->{ANTIALIAS}) ? "-dTextAlphaBits=4" : ""; - - my $r = $width . "x" . $height; - - if( $format eq "EPS" ) { - if ($outfile eq "-") { # if standard out - print $eps; - } else { - open (WRITEME, ">$outfile"); - print WRITEME $eps; - close (WRITEME); - } - - } elsif ($format eq "PDF"){ -# print("outfile = $outfile\n"); - my $program = "| $gsprog -sOutputFile=$outfile -sDEVICE=pdfwrite -dPDFSETTINGS=/printer -q -r$res -dDEVICEWIDTHPOINTS=$width -dDEVICEHEIGHTPOINTS=$height -dEmbedAllFonts=true $antialias -dSAFER -dBATCH -dNOPAUSE -_"; - open(WRITEME, $program); - print WRITEME "$eps"; - close (WRITEME); - - } elsif ( $format eq "PNG" ) { - my $program = "| $gsprog -sOutputFile=$outfile -sDEVICE=png16m -q -r$res -dDEVICEWIDTHPOINTS=$width -dDEVICEHEIGHTPOINTS=$height $antialias -dSAFER -dBATCH -dNOPAUSE -_"; - #print ("$program"); - open(WRITEME, $program); - print WRITEME "$eps"; - close (WRITEME); - - } elsif ($format eq "GIF") { - # convert to EPS first, then GIF - die "Please check logo.conf: convert program does not exist" - if (!defined $convertprog || !(-e $convertprog)); - - my $program = "| $gsprog -sOutputFile=- -sDEVICE=png16m -q -r$res -dDEVICEWIDTHPOINTS=$width -dDEVICEHEIGHTPOINTS=$height $antialias -dSAFER -dBATCH -dNOPAUSE -_"; - - if ($outfile eq "-") { - $program .= " | $convertprog png:- gif:-"; - } else { - $program .= " | $convertprog png:- $outfile"; - } - - open(WRITEME, $program); - print WRITEME "$eps"; - close (WRITEME); - } -} - -#deprecated -sub c { - return create_template( @_); -} - -sub getProgs { - my ($gsprog, $convertprog) = ("gs", "convert"); - - - # No configuration file, then return defaults. - return ($gsprog, $convertprog) if (! (-e "$PATH/logo.conf")); - open (CONF, "$PATH/logo.conf"); - - while (<CONF>) { - next if (/^\#/); # skip lines beginning with "#" - if (m/^gs/i) { # if looks like gs (case insensitive) - ($gsprog) = ($_ =~ /^\S+\=(.+)$/); - } - if (m/^convert/i) { # if looks like convert (case insensitive) - ($convertprog) = ($_ =~ /^\S+\=(.+)$/); - } - } - - # Do these fings exist? - my ($gsprogname) = ($gsprog =~ /^(\S+)/); - - die "Please check $PATH/logo.conf: gs program ($gsprogname) does not exist" if (!defined $gsprogname || !(-e $gsprogname)); - #die "Please check logo.conf: convert program does not exist" if (!defined $convertprog || !(-e $convertprog)); - - return ($gsprog, $convertprog); -} - -sub get_eps { - my ($input, $kind, $desc_r, $data_r) = @_; - my %fillings; - - # put parameters in fillings - makeFillings(\%fillings, $input, $kind, $desc_r); - - # set default data if not filled - setDefaults(\%fillings, \%defaults, $#$data_r); - - # put data in fillings - setData(\%fillings, $data_r); - - # make output - return fillTemplate("$PATH/template.eps", \%fillings); -} - -sub fillTemplate { - my ($filename, $fillings) = @_; - - if (not -e $filename) { - die "filename ($filename) must exist\n"; - } - - my $text; - local $/; # slurp mode (undef) - local *F; # create local filehandle - open(F, "< $filename\0") || return; - $text = <F>; - close(F); - - #replace {$KEYWORDS} with value in %$fillings hash - $text =~ s{ \{\$ (.*?) \} } - { exists( $fillings->{$1}) - ? $fillings->{$1} - : "" - }gsex; - return $text; -} - - -################################################################################ -##### FILL THE FILLINGS HERE ##### -################################################################################ - -sub isChecked { - return 0 if (!defined $_[0]); - return $_[0]; -} - -# negative/positive ints -sub isInt { - return ($_[0] =~ /^[-\+]?\d+$/) ? 1 : 0; -} - -sub makeFillings { - -# my ($fillings, $input, $kind, $desc_r, $data_r, $fineprint) = @_; - my ($fillings, $input, $kind, $desc_r, $fineprint) = @_; - $fillings->{KIND} = $kind; - $fillings->{LOGOHEIGHT} = $input->{LOGO_HEIGHT}; - $fillings->{LOGOWIDTH} = $input->{LOGO_WIDTH}; - $fillings->{OUTLINE} = (isChecked($input->{OUTLINE})) ? "true" : "false"; - $fillings->{NUMBERING} = (isChecked($input->{NUMBERING})) ? "true" : "false"; - $fillings->{FINEPRINT} = (isChecked($input->{FINEPRINT})) ? $fineprint : ""; - - $fillings->{LOGOSTART} = $input->{LOGOSTART}; - $fillings->{LOGOEND} = $input->{LOGOEND}; - $fillings->{START_NUM} = $input->{START_NUM}; - - $fillings->{YAXIS} = (isChecked($input->{YAXIS}) && !isChecked($input->{STRETCH})) ? "true" : "false"; - - - - $fillings->{TITLE} = $input->{TITLETEXT}; - $fillings->{YAXIS_LABEL} = $input->{YAXIS_LABEL}; - - $fillings->{XAXIS_LABEL} = $input->{XAXIS_LABEL}; - $fillings->{ERRBAR} = $input->{ERRBAR}; - $fillings->{SHOWINGBOX} = (isChecked($input->{SHOWINGBOX})) ? "s" : "n"; - $fillings->{SHRINKBOOLEAN} = ($fillings->{SHOWINGBOX} eq "s") ? "true" : "false"; - $fillings->{SHRINKFACTOR} = $input->{BOXSHRINK}; - - if ((defined $input->{CHARSPERLINE}) && - isInt($input->{CHARSPERLINE}) && - ($input->{CHARSPERLINE} > 0)) { - $fillings->{CHARSPERLINE} = $input->{CHARSPERLINE}; - } - - if (defined $input->{BARBITS}) { - $fillings->{BARBITS} = $input->{BARBITS}; - } else { - $fillings->{BARBITS} = ($fillings->{KIND} == $AA) ? 4.3 : 2; - } - - if (defined $input->{TICBITS}) { - $fillings->{TICBITS} = $input->{TICBITS}; - } else { - $fillings->{TICBITS} = 1; - } - - - - -# if (isChecked($input->{NOCOLOR})) { -# $fillings->{COLORSCHEME} = "b"; -# } else { -# $fillings->{COLORSCHEME} = ($kind == $AA) ? "a" : "n"; -# } - - #color - if (defined $input->{DEFAULT_COLOR}) { - $fillings->{DEFAULT_COLOR} = (isHexColor( $input->{DEFAULT_COLOR})) ? "c" . $input->{DEFAULT_COLOR} : - $input->{DEFAULT_COLOR}; - } - - if (isChecked($input->{SHOWENDS})) { - $fillings->{SHOWENDS} = ($fillings->{KIND} == $AA) ? "p" : "d"; - } else { - $fillings->{SHOWENDS} = "-"; - } - - $fillings->{DESC} = getDescription($desc_r, $fillings->{KIND}); - - $fillings->{ERRORBARFRACTION} = $input->{ERRORBARFRACTION}; - $fillings->{COLORSCHEME} = $input->{COLORSCHEME}; - $fillings->{COLORS} = $input->{COLORS}; -} - -sub getDescription { - my $returnVal = ""; - - foreach (@{$_[0]}) { - if(defined($_)) { - $returnVal .= "% * $_\n"; - } else { - $returnVal .= "% * \n"; - } - } - - if ($_[1] == $AA) { - $returnVal .= "% * PROTEIN ALIGNMENT"; - } else { - $returnVal .= "% * NUCLEOTIDE ALIGNMENT"; - } - - return $returnVal; -} - - -################################################################################ -##### SETTING DEFAULTS ##### -################################################################################ - -sub setDefaults { - my ($fillings, $defaults, $numchars) = @_; - - $fillings->{LOGOHEIGHT} = $defaults->{LOGOHEIGHT} if !defined $fillings->{LOGOHEIGHT}; - $fillings->{LOGOWIDTH} = $defaults->{LOGOWIDTH} if !defined $fillings->{LOGOWIDTH}; - - $fillings->{START_NUM} = $defaults->{START_NUM} if !defined $fillings->{START_NUM}; - $fillings->{LOGOSTART} = $fillings->{START_NUM} if !defined $fillings->{LOGOSTART}; - $fillings->{LOGOEND} = $numchars + $fillings->{LOGOSTART} - 1 if !defined $fillings->{LOGOEND}; - - $fillings->{YAXIS} = $defaults->{YAXIS} if !defined $fillings->{YAXIS}; - $fillings->{TITLE} = $defaults->{TITLE} if !defined $fillings->{TITLE} || $fillings->{TITLE} eq ""; - #$fillings->{YAXIS_LABEL} = $defaults->{YAXIS_LABEL} if !defined $fillings->{YAXIS_LABEL} || $fillings->{YAXIS_LABEL} eq ""; - $fillings->{YAXIS_LABEL} = $defaults->{YAXIS_LABEL} if !defined $fillings->{YAXIS_LABEL} ; - $fillings->{XAXIS_LABEL} = $defaults->{XAXIS_LABEL} if !defined $fillings->{XAXIS_LABEL} || $fillings->{XAXIS_LABEL} eq ""; - - $fillings->{BARENDS} = $defaults->{BARENDS} if !defined $fillings->{BARENDS}; - $fillings->{OUTLINE} = $defaults->{OUTLINE} if !defined $fillings->{OUTLINE}; - $fillings->{SHOWINGBOX} = $defaults->{SHOWINGBOX} if !defined $fillings->{SHOWINGBOX}; - $fillings->{NUMBERING} = $defaults->{NUMBERING} if !defined $fillings->{NUMBERING}; - - $fillings->{ERRORBARFRACTION} = $defaults->{ERRORBARFRACTION} if !defined $fillings->{ERRORBARFRACTION}; - $fillings->{SHRINKFACTOR} = $defaults->{SHRINKFACTOR} if !defined $fillings->{SHRINKFACTOR}; - $fillings->{ERRBAR} = $defaults->{ERRBAR} if !defined $fillings->{ERRBAR}; - -# printf("logostart = %d, start num = %d, numchars = $numchars, logoend = %d\n", $fillings->{LOGOSTART}, $fillings->{START_NUM}, -# $fillings->{LOGOEND}); - - my $givenrange = $fillings->{LOGOEND} - $fillings->{LOGOSTART} + 1; - my $possiblerange = $numchars - ($fillings->{LOGOSTART} - $fillings->{START_NUM}); - - if (!defined $fillings->{CHARSPERLINE} && ($givenrange > $possiblerange)) { - $fillings->{CHARSPERLINE} = $numchars - ($fillings->{LOGOSTART} - $fillings->{START_NUM}); - } elsif (!defined $fillings->{CHARSPERLINE}) { - $fillings->{CHARSPERLINE} = $fillings->{LOGOEND} - $fillings->{LOGOSTART} + 1; - } - - $fillings->{DEFAULT_COLOR} = $defaults->{DEFAULT_COLOR} if !defined $fillings->{DEFAULT_COLOR} || - $fillings->{DEFAULT_COLOR} eq ""; - -# printf("chars per line = %s\n",$fillings->{CHARSPERLINE}); -# print("givenrange = $givenrange, possiblerange = $possiblerange\n"); - - if ($givenrange > $possiblerange) { - $fillings->{LOGOLINES} = roundup($possiblerange / $fillings->{CHARSPERLINE}); - } else { - $fillings->{LOGOLINES} = roundup($givenrange / $fillings->{CHARSPERLINE}); - } - - $fillings->{CHARWIDTH} = ($fillings->{LOGOWIDTH} - 1.5) / $fillings->{CHARSPERLINE}; - -# # change height if more than 1 line -# $fillings->{LOGOHEIGHTPOINTS} = int($fillings->{LOGOHEIGHT} * (72 / 2.54)) * $fillings->{LOGOLINES}; - - # LOGOHEIGHTPOITNS is the height input by the user -# $fillings->{LOGOHEIGHTPOINTS} = int($fillings->{LOGOHEIGHT} * (72 / 2.54)); # user specifies height of entire logo - $fillings->{LOGOHEIGHTPOINTS} = int($fillings->{LOGOHEIGHT} * (72 / 2.54)) * $fillings->{LOGOLINES}; # user specifies height of logo line - $fillings->{LOGOWIDTHPOINTS} = int($fillings->{LOGOWIDTH} * (72 / 2.54)); - - # LOGOLINEHEIGHT is the height of each logo line, in cm -# $fillings->{LOGOLINEHEIGHT} = $fillings->{LOGOHEIGHT} / $fillings->{LOGOLINES}; # user specifies height of entire logo - $fillings->{LOGOLINEHEIGHT} = $fillings->{LOGOHEIGHT}; # user specifies height of logo line -} - -sub roundup { - return ($_[0] - int($_[0]) > 0) ? int($_[0] + 1) : $_[0]; -} - - -################################################################################ -##### COLORS ##### -################################################################################ - -sub getDefaultColors { - my ($defaults) = @_; - my $returnVal = ""; - $returnVal .= "/black " . $defaults->{black} . " def\n"; - $returnVal .= "/red " . $defaults->{red} . " def\n"; - $returnVal .= "/green " . $defaults->{green} . " def\n"; - $returnVal .= "/blue " . $defaults->{blue} . " def\n"; - $returnVal .= "/yellow " . $defaults->{yellow} . " def\n"; - $returnVal .= "/purple " . $defaults->{purple} . " def\n"; - $returnVal .= "/orange " . $defaults->{orange} . " def\n"; - - return $returnVal; -} - -sub getNAColors { - my $returnVal = <<END -% Standard DNA/RNA color scheme -/colorDict << - (G) orange - (T) red - (C) blue - (A) green - (U) red -END - ; - - return $returnVal; -} - -sub getAAColors { - my $returnVal = <<END -% Standard Amino Acid colors -/colorDict << - (G) green - (S) green - (T) green - (Y) green - (C) green - (N) purple - (Q) purple - (K) blue - (R) blue - (H) blue - (D) red - (E) red - (P) black - (A) black - (W) black - (F) black - (L) black - (I) black - (M) black - (V) black -END - ; - - return $returnVal; -} - -sub setColors { - my ($fillings, $defaults, $input) = @_; - my $colordef = getDefaultColors($defaults); - my $colordict = "/colorDict <<\n"; - - if ($fillings->{COLORSCHEME} eq "DEFAULT") { - $colordef = getDefaultColors($defaults); - - if ($fillings->{KIND} eq $AA) { - $colordict = getAAColors(); - } else { - $colordict = getNAColors(); - } - } elsif ($fillings->{COLORSCHEME} eq "BW") { - # do nothing for dict - } else { - - my %colorhash = %{ $fillings->{COLORS} }; - my $colorName = ""; - - foreach (keys %colorhash) { # keys are strings of residues, value = color name or color code (FF0000) - # add color to definitions - $colorName = $colorhash{$_}; - -# print("color = $_\n"); - - addColorDef(\$colordef, $colorName ) if (isHexColor($colorName)); - - # add have each residue use the color - foreach (split(//, $_)) { - # add color to dictionary - if (isHexColor($colorName)) { - $colordict .= " ($_) c$colorName\n" if !($_ =~ /^\s*$/); - } else { - $colordict .= " ($_) $colorName\n" if !($_ =~ /^\s*$/); - } - } - } - } - - $colordict .= "\n>> def"; - - # add to fillings - $fillings->{COLORDEF} = $colordef; - $fillings->{COLORDICT} = $colordict; -} - -sub addColorDef { -# print("adding to color def\n"); - my ($colordef_r, $color) = @_; - my $PSColor = getPSColor($color); - $$colordef_r .= "/c$color $PSColor def\n"; -} - -sub isHexColor { - return ($_[0] =~ /^[0-9a-fA-F]+$/) && (length $_[0] == 6); -} - -# know that it is hex color -sub getPSColor { - return "[" . hex(substr($_[0],0,2)) / 255 . " " . - hex(substr($_[0],2,2)) / 255 . " " . - hex(substr($_[0],4,2)) / 255 . "]"; -} - - -################################################################################ -##### SETTING DATA FIELD ##### -################################################################################ - -sub setData { - my ($fillings, $data_r) = @_; - - my @data = @$data_r; - my ($height, $letter); - my @slice; - my $data; - my $start_num = $fillings->{START_NUM}; - - my $start = $fillings->{LOGOSTART} - $start_num; # where in @data to start - my $end = $fillings->{LOGOEND} - $start_num; # where in @data to end - my $charsperline = $fillings->{CHARSPERLINE}; - - my $numlabel = $fillings->{LOGOSTART}; - - $end = ($end >= scalar @data) ? (scalar @data - 1) : $end; - - for (my $i=$start ; $i<=$end ; $i++) { - - # if add new lines -# if ((($i - $start) % $charsperline == 0) && -# ($i != $start) && # not first one -# ($i != $end)) { # not last one - if ((($i - $start) % $charsperline == 0) && - ($i != $start)) { # not first one - $data .= <<END -EndLine -StartLine - -END - ; - } - - @slice = @{$data[$i]}; - $data .= <<END -($numlabel) startstack -END - ; - - $numlabel++; - - foreach (@slice) { - ($letter,$height) = ($_ =~ /^(.{1})(\S+)/); - - # is space, so leave - if ($letter eq " ") { - last; - } - - # look for ">", which is symbol for error bar, then quit - if ($letter eq ">") { - last; - } - -# # look for negative heights -# if ($height < 0) { -# next; -# } - - $letter = (uc $letter); # always uppercase - $height = ($height < 0) ? 0 : $height; - $data .= " $height ($letter) numchar\n"; - } - - # put in error bars -- size is in $height as read in before - if ($fillings->{ERRBAR} && $letter ne " " && $height != 0) { - $data .= " $height Ibeam\n"; - } - - $data .= <<END -endstack - -END - ; - - } - - $fillings->{DATA} = $data; -} - -################################################################################ - -1;
--- a/wrapper.sh Wed Sep 17 07:25:17 2014 -0400 +++ b/wrapper.sh Mon Sep 22 10:19:36 2014 -0400 @@ -1,4 +1,5 @@ #!/bin/bash +set -e dir="$(cd "$(dirname "$0")" && pwd)" input=$1 output=$2 @@ -12,108 +13,50 @@ cat $PWD/files/*/8_* > $PWD/mutationstats.txt cat $PWD/files/*/10_* > $PWD/hotspots.txt -cp $dir/HS12RSS.txt $outdir/ -cp $dir/HS23RSS.txt $outdir/ + +echo "identification" +python $dir/gene_identification.py --input $PWD/summary.txt --output $PWD/annotatedsummary.txt +echo "merging" +Rscript $dir/merge_and_filter.r $PWD/annotatedsummary.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/merged.txt $outdir/unmatched.txt -mkdir $outdir/identification/ -python $dir/gene_identification.py --input $PWD/summary.txt --outdir $outdir/identification/ +genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm" +echo "R mutation analysis" +Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir 2>&1 +echo "python mutation analysis" +python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --output $outdir/hotspot_analysis.txt + +cat $outdir/mutations.txt $outdir/hotspot_analysis.txt > $outdir/result.txt + genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm) -tmp=$PWD/tmp -tmp2=$PWD/tmp2 -hotspottmp=$PWD/hotspottmp -mutationtmp=$PWD/mutationtmp -touch $outdir/mutationandhotspot.txt -for gene in ${genes[@]} -do - echo "Running $gene <br />" >> $output - mkdir $outdir/$gene - cp $dir/HS12RSS.txt $outdir/$gene/ - cp $dir/HS23RSS.txt $outdir/$gene/ - echo "Filtering input..." >> $output - Rscript $dir/filter.r $PWD/summary.txt $outdir/identification/${gene}.txt $outdir/$gene/summary.txt - Rscript $dir/filter.r $PWD/mutationanalysis.txt $outdir/identification/${gene}.txt $outdir/$gene/mutationanalysis.txt - Rscript $dir/filter.r $PWD/mutationstats.txt $outdir/identification/${gene}.txt $outdir/$gene/mutationstats.txt - Rscript $dir/filter.r $PWD/hotspots.txt $outdir/identification/${gene}.txt $outdir/$gene/hotspots.txt - echo "done <br />" >> $output - - echo "Running R script on $gene..." >> $output - Rscript --verbose $dir/mutation_analysis.r $outdir/$gene/mutationstats.txt $outdir/$gene/summary.txt $outdir/$gene/ 2>&1 - echo "done <br />" >> $output - - echo "Running Python script..." >> $output - python $dir/mutation_analysis.py --mutationfile $outdir/$gene/mutationanalysis.txt --hotspotfile $outdir/$gene/hotspots.txt --output $outdir/$gene/hotspot_analysis.txt - echo "done <br />" >> $output - echo "Done with $gene <br />" >> $output - cut $outdir/$gene/mutations.txt -d, -f2,3,4 > $mutationtmp - cut $outdir/$gene/hotspot_analysis.txt -d, -f2,3,4 > $hotspottmp - cat $mutationtmp $hotspottmp > $tmp - paste $outdir/mutationandhotspot.txt -d, $tmp > $tmp2 - cat $tmp2 > $outdir/mutationandhotspot.txt - rm $outdir/$gene/HS12RSS.txt - rm $outdir/$gene/HS23RSS.txt -done - -Rscript --verbose $dir/mutation_analysis.r $PWD/mutationstats.txt $PWD/summary.txt $outdir/ 2>&1 -python $dir/mutation_analysis.py --mutationfile $PWD/mutationanalysis.txt --hotspotfile $PWD/hotspots.txt --output $outdir/hotspot_analysis.txt - -cut $outdir/mutations.txt -d, -f2,3,4 > $mutationtmp -cut $outdir/hotspot_analysis.txt -d, -f2,3,4 > $hotspottmp -cat $mutationtmp $hotspottmp > $tmp -paste $outdir/mutationandhotspot.txt -d, $tmp > $tmp2 -cat $tmp2 > $outdir/mutationandhotspot.txt - -cut $outdir/ca1/mutations.txt -d, -f1 > $mutationtmp -cut $outdir/ca1/hotspot_analysis.txt -d, -f1 > $hotspottmp -cat $mutationtmp $hotspottmp > $tmp -paste $tmp $outdir/mutationandhotspot.txt -d, > $tmp2 -cat $tmp2 | tr -s "," > $outdir/mutationandhotspot.txt - -ca_n=`cat $outdir/ca/n.txt` -ca1_n=`cat $outdir/ca1/n.txt` -ca2_n=`cat $outdir/ca2/n.txt` -cg_n=`cat $outdir/cg/n.txt` -cg1_n=`cat $outdir/cg1/n.txt` -cg2_n=`cat $outdir/cg2/n.txt` -cg3_n=`cat $outdir/cg3/n.txt` -cg4_n=`cat $outdir/cg4/n.txt` -cm_n=`cat $outdir/cm/n.txt` -#all_n=$((ca_n + cg_n + cm_n)) -all_n=`cat $outdir/n.txt` echo "<html><center><h1>$title</h1></center><table border='1'>" > $output echo "<tr><th>info</th>" >> $output -echo "<th><a href='identification/ca.txt'>ca (N = $ca_n)</a></th>" >> $output -echo "<th><a href='identification/ca1.txt'>ca1 (N = $ca1_n)</a></th>" >> $output -echo "<th><a href='identification/ca2.txt'>ca2 (N = $ca2_n)</a></th>" >> $output -echo "<th><a href='identification/cg.txt'>cg (N = $cg_n)</a></th>" >> $output -echo "<th><a href='identification/cg1.txt'>cg1 (N = $cg1_n)</a></th>" >> $output -echo "<th><a href='identification/cg2.txt'>cg2 (N = $cg2_n)</a></th>" >> $output -echo "<th><a href='identification/cg3.txt'>cg3 (N = $cg3_n)</a></th>" >> $output -echo "<th><a href='identification/cg4.txt'>cg4 (N = $cg4_n)</a></th>" >> $output -echo "<th><a href='identification/cm.txt'>cm (N = $cm_n)</a></th>" >> $output -echo "<th>all (N = $all_n)</th></tr>" >> $output +for gene in ${genes[@]} +do + tmp=`cat $outdir/${gene}_n.txt` + echo "<th><a href='matched_${gene}.txt'>${gene} (N = $tmp)</a></th>" >> $output +done +tmp=`cat $outdir/total_n.txt` +echo "<th><a href='matched_all.txt'>all (N = $tmp)</a></th>" >> $output + while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz do echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output -done < $outdir/mutationandhotspot.txt +done < $outdir/result.txt echo "</table>" >> $output -echo "<a href='identification/unmatched.txt'>umatched</a><br />" >> $output - -Rscript $dir/piechart.r "${ca_n},${cg_n},${cm_n}" "IgA - ${ca_n},IgG - ${cg_n},IgM? - ${cm_n}" "Ig* (N = $all_n)" $outdir/all.png 2>&1 -Rscript $dir/piechart.r "${ca1_n},${ca2_n}" "IgA1 - ${ca1_n},IgA2 - ${ca2_n}" "IgA (N = $ca_n)" $outdir/ca.png 2>&1 -Rscript $dir/piechart.r "${cg1_n},${cg2_n},${cg3_n},${cg4_n}" "IgG1 - ${cg1_n},IgG2 - ${cg2_n},IgG3 - ${cg3_n},IgG4 - ${cg4_n}" "IgG (N = $cg_n)" $outdir/cg.png 2>&1 - -$dir/seqlogo -t "HS12RSS" -w 20 -h 5 -p -a -c -n -F PNG -f $outdir/weblogo_in_rs12.txt > $outdir/HS12.png 2>&1 -$dir/seqlogo -t "HS23RSS" -w 20 -h 5 -p -a -c -n -F PNG -f $outdir/weblogo_in_rs23.txt > $outdir/HS23.png 2>&1 +echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output -echo "<img src='all.png'/>" >> $output -echo "<img src='ca.png'/>" >> $output -echo "<img src='cg.png'/>" >> $output - -echo "<img src='HS12.png'/>" >> $output -echo "<img src='HS23.png'/>" >> $output +echo "<img src='all.png'/><br />" >> $output +if [ -a $outdir/ca.png ] +then + echo "<img src='ca.png'/><br />" >> $output +fi +if [ -a $outdir/cg.png ] +then + echo "<img src='cg.png'/><br />" >> $output +fi for gene in ${genes[@]} do @@ -121,7 +64,7 @@ while IFS=, read from a c g t do echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output - done < $outdir/$gene/transitions.txt + done < $outdir/transitions_${gene}.txt echo "</table>" >> $output done