changeset 4:069419cccba4 draft

Uploaded
author davidvanzessen
date Mon, 22 Sep 2014 10:19:36 -0400
parents a0b27058dcac
children 71a12810eff3
files HS12RSS.txt HS23RSS.txt filter.r gene_identification.py logo.pm merge_and_filter.r mutation_analysis.py mutation_analysis.r piechart.r seqlogo template.eps template.pm wrapper.sh
diffstat 13 files changed, 486 insertions(+), 3256 deletions(-) [+]
line wrap: on
line diff
--- a/HS12RSS.txt	Wed Sep 17 07:25:17 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,113 +0,0 @@
-Gene	Sequence
-IGKV2D-18	CACAGTGGTACAACCCTGAACAGAAACC
-IGKV1D-39	CACAGTGTTACAAGTCATAACATAAACC
-TRBJ2-5	CACGGCCCCCGAGCCCCGCACAAAAACC
-TRBJ2-6	CACAGCCCGGGGACTCCCCGCAAAAACC
-IGKV1-37	CACAGTGTTACAAGGCATAACATAAACC
-TRBJ2-1	CACAGTGGGAAGGGGCTGCCCAGAATTC
-TRBJ2-2	CACAGCCCTGGGGACCCTGGCGCAAACC
-IGKV1-33	CACAGTGTAACAAGTCATAACATAAATC
-IGKV3-11	CACAGTGATTCCACATGAAACAAAAACC
-IGKV1D-33	CACAGTGTAACAAGTCATAACATAAATC
-IGKV1-39	CACAGTGTTACAAGTCGTAACATAAACC
-IGHD3-22	CACAGTGTCACAGAGTCCATCAAAAACT
-TRAJ31	CACAGCACTTCCTGCCTTTACTGAAACC
-TRAJ32	CACAGTCCTTCAGAGCCTTACACTAACC
-TRAJ33	CACAGACACAAAAACCTTAACAAAAACC
-TRAJ34	CACAGTGATACTGAGATCTACAAAAACC
-TRAJ36	CACAGTGTTTCTGCCCAGTACAAAAACA
-TRAJ37	CACTCTAATGCTGTACTTTACAAAAACT
-TRAJ38	CACAGTCATAGAAAGCTTTACCAAAACC
-TRAJ39	CACAGTGATCTTCGGCTGAGCAAAAACC
-TRBJ1-2	CACATAAGAATATAGCCACTCTAAAAGG
-IGHD2-2	CACAGTGACACAGCCCCATTCCCAAAGC
-IGHD1-1	CACCGTGACTCGGGGCTGTTCAGAATCT
-IGKV1-27	CACTGTGATACAAGCCCGAACATAAACC
-TRBJ1-6	CACAGCTGCAGAGGCTTAGATAAAACCC
-TRBJ1-5	CACAGTGCATCATGAGTGTGGCAAACCC
-TRBJ1-4	CACAACATTAAAGACTGGAAGGAAAACC
-TRBJ1-3	CACAGCCTCCCAGGGCCACTTCAAAACC
-IGKV1-6	CACAGTGTTACACACCCGAACAGAAACC
-TRBJ2-4	CACAGCCCCGAGACGCGGCACAGAAACT
-IGKV1-5	CACAGTGTTACACACCCGAACATAAACC
-TRAJ29	CACAGTGATTTCCTCCATAACAAAAACC
-TRAJ28	CACAGAGTTTCCTTTCTTTGCAAAAACC
-TRAJ27	CACAGTCTTTAGTGCTATTGCAATAACC
-TRAJ26	CACAGTGTTTCTGGGCTCAGCAAAAACC
-TRAJ24	CACTGTGACAAACACCTCTACAAAATGG
-TRAJ23	CACTGTGTTACATATCCTGTCAAAAACA
-TRAJ22	CACTATGATTTGCTCAACAACAAAACCA
-TRAJ21	CACCATGTTTATTGGCATTACAAAAAAT
-IGKV1-8	CACAGTGTTACACACCCAAACAAAAACC
-TRBJ2-7	CACGGAGGTGCACCCCCGCATGCAAACC
-IGKV1-9	CACAGTGTTACAAACCTGAACATAAACC
-IGKV3D-11	CACAGTGATTCCACATGAAACAAAAACC
-TRBJ2-3	CACAGCCTGGAGGCCCAGGACAAAAACC
-IGKV1-17	CACAGTGTTACACACCCGAACATAAAAC
-IGKV1D-17	CACAGTGTTACACACCCGAACATAAACC
-IGKV1D-16	CACAGTGTTACACACCCAAACATAAACC
-TRAJ58	CACTGTGCTGAGGGGCTTTGCAAAAACC
-TRAJ59	CACAGGAGCTGCTGCCTTTACATAAACT
-TRAJ52	CACTGCACTGGAGGCCTTTACAAGAACC
-TRAJ53	CACAGCCACAGAAGGCTTTACAGAAACA
-TRAJ50	CACAGCCATCCAAACCTTTACAATAACT
-TRAJ56	CACAATGACACGAGGATCTACAAAAACT
-TRAJ57	CACCCCCACAGACTGCTTTACAAATACT
-TRAJ54	CACCACAGCACGAGGCTTTACAGAAACT
-IGKV3-15	CACAGTGATTCAACATGAAACAAAAACC
-TRDD2	CACAATGAAACACATCAGTATAAAAACC
-IGKV2-40	CACAGTGGTACAGCCCTGAACAGAAACC
-IGKV6D-21	CACTGTGTTACAACCCAGAACAAAAACT
-IGKV3-20	CACAGTGATTCCACAGGAAACCAAACCT
-TRDJ3	CACATTATGACAGTGCCTCACAGGTAAC
-IGHD3-3	CACAGTGACACAGACCTCACCCCAAACC
-TRAJ49	CACTGTGATAGGAAGCTCAACAAAAACC
-TRAJ48	CACAGTGTTCTAAGTCATTGCAAAAACC
-TRAJ41	CACAGTGCGTTCTCCCTAAACAAAAAAC
-TRAJ40	CACAGTGTTATGTGTCTCTACATAAACC
-TRAJ43	CACAGTAATACATGCTCTAACAAAAACC
-TRAJ42	CACAGTCCTATGGGGCTTTACAATAATC
-TRAJ45	CACTCTGGGCCAACCCTTTACATAAACT
-TRAJ44	CACTGTGAGATGCTTCATAACAGAAACC
-TRAJ47	CACAGCGTCAAACTCCTCTACAAAAACA
-IGLJ2	CACTGTGACACAGGCTCATACAAAAACC
-IGLJ1	CACAGTGACTGAGGCTCAGACCAAAACC
-IGLJ7	CACAGTGACACAGCCCCACACACAAACC
-IGLJ6	CACTGTGATATAACCCTGCACACAAACC
-LCK	CACACACACACACACACAAGCCAAAACC
-IGKV4-1	CACAGTGCTTCAGCCTCGAACACAAACC
-TRAJ4	CACAATCAGATGGTGCTTTACAAGAACT
-HPRT_12	CACACACACACACACACACACAAATACA
-IGKV3D-20	CACAGTGATTCAGCTTGAAACAAAAACC
-TRAJ5	CACCCTGGTACATCACAGTACAAAATCC
-TRAJ6	CACAGTGAGGACAGCCTTTGATAAAACC
-TRAJ3	CACTGTGGGTAAGGTCTTTGAGATAACC
-IGKV2D-29	CACAGAGGTACAGACCAATACAGAAACC
-IGKV2D-28	CACAGTGGTACAACCCCTAACAGAAACC
-TAL2	CACTGTGACTTACTGGAGATATAAAAAT
-IGKV2D-26	CACAGTGCTACAGCCCTGAGCACAAACC
-TRAJ61	CACAGGAGTGGGCACCTTTACAAAAACC
-TRBD1	CACAATGTTACAGCTTTGTACAAAAACA
-IGKV1D-8	CACAGTGTTACACACCCGAACAAAAACC
-LMO2	CACAGTATTGTCTTACCCAGCAATAATT
-IGKV2D-30	CACAGTGGTACAGCCCTGAACAAAAACC
-IGKV1/OR2-108	CACAGTGTTACAAGCCATAACAAAAACC
-TRBD2 	CACAATGTTACACCATGATACAAAAATG
-IGKV2-24	CACAGTGGTACAGCTCTGAACAAAAACC
-IGKV2-28	CACAGTGGTACAACCCCTAACAGAAACC
-IGKV2-29	CACAGTGGTACAGACCAATACAGAAACC
-TRAJ16	CACAGTGATCTATTGTACCACAAAAACC
-TRAJ17	CACAATGATTTAAGGCCCAGCAAAAACC
-TRAJ15	CACAGTGAAATGAGGCCCTGCAAATACC
-TRAJ12	CACAGTGTTTCTTAGTCAGTCAAAAACA
-TRAJ13	CACTGTAATGCCTGCCTTTACAAAATGA
-TRAJ10	CACAGTGTTTGATGCCTCACAATAAACT
-TRAJ11	CACTATAGCAAATCCCCATACAAAAATG
-TRAJ18	CACAATGCTGGTCCCCTTTACATGAACC
-IGKV1D-43	CACAGTGTTACACACCCAAACAAAAACC
-IGKV2-30	CACAGTGGTACAGCCCTGAACAAAAACC
-TRAJ30	CACTGTGATTGGGACCATAACAAAAACT
-IGHD3-16	CACAGTGACACAGACCTCACTTCAAACC
-IGHD4-4	CACAGTGTCACAGAGTCCATCAAAAACC
-IGHD3-10	CACAGTGACACAGACCTCATTCTAAACC
-IGHD7-27	CACAGTGGTTCTCAGCTCAGCCAAAACC
--- a/HS23RSS.txt	Wed Sep 17 07:25:17 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,174 +0,0 @@
-Gene	Sequence
-IGLV2-8	CACAGTGTTTTAAGTCAATGAGGAAGTAAGACCAAAACC
-IGLV4-3	CACAGTGACACAGATGAAGGGGAAGTGAGGCCAAAACTC
-TRBV21-1	CACAGTGCCGAATGTTAGCCCTTCTTAGAACACAAACTC
-IGHV1-17	CACAGTGCGAAAACCCACATCCTGAGAGTGTCAGAAACC
-IGHV1-14	CACAGTGTGAAAACCCACATCCTGAGAGAGTCAGAAATC
-IGHV3-15	CACAGTGAGGGGAGGTCAGTGTGAGCCCGGACACAAACC
-IGHV1-18	CACAGTGTGAAAACCCACATCCTGAGGGTTTCAGAAACC
-IGHV3-11	CACAGTGAGGGGAAGTCAGTGTGAGCCCAGACACAAACC
-IGLV2-5	CACAGTGGTCCAAGTTCTTGGGGAACTGAGACGAAAACC
-IGHV3-13	CACAGTGAGGGGAAGTCAGTATGAGCCCAGACACAAACC
-TRAV2	CACAGAGGCAGGGAACCCATGAAGAGCTGAACAGAAACA
-TRAV3	CACACTGATAGGGGCTGCAGGGGGAGCAGAACACAAACT
-TRAV4	CACAGTGAGACAGATGGGCCTGCACCTGTGCCGTTTTCC
-TRAV5	CACATTGCTTCTCAGGCACCTGTATCCTGTACCCAAACC
-TRBV6-4	CACAGTGCTGCACAGCCATCTCCTCTCTGTACATAAATG
-IGHV3-19	CACTGTGAGAGGACGGAAGTGTGAGCCCAGACACAAACC
-TRBV6-6	CACAGCGCTACAAGACCATCTCCTCTCTGCACATAAAGG
-TRBV18	CACATTGATGCAGAGCCACATCCTCTCAGTCCACAAACA
-TRBV6-1	CACAGCGCTGCATGGCCGTCTCCTCTCTGCACATAAAGG
-TRBV6-2	CACAGTGCTGCACGGCTGTCTCCTCTCTGCACAGAAAGG
-TRBV6-3	CACAGTGCTGCACGGCTGTCTCCTCTCTGCACAGAAAGG
-TRBV12-4	CACAGCGCTGCAGAATCACCCCTTTCCTGTGCAGAAACC
-TRAV27	CACAGTGCTCTTGAGGCACCTGCTGCCTGCACCCAAACC
-TRBV5-4	CACAGCCCGGCAGAGTCTCTGACATTCTATACATAAACT
-TRBV5-3	CACAGCCCTGCAGAGTCACTGGAACTCTGTGCACTAATC
-TRGV3	CACAGTGATTCAGACCTGTCCTACACCACACTGAAAATC
-TRBV5-1	CACAGCCCTACAAAGCCAACCACATTCTGTGCACAAACC
-IGHV3-9	CACAGTGAGGGGAAGTCAGCGAGAGCCCAGACAAAAACC
-TRGV8	CACAGTGATTCAGACCTGTGCTACACCACACTGAAAATC
-TRBV5-8	CACAGCCCAGCAGAATCACTGACATTCTGTATATAAACT
-TRDV2	CACCCTGCTGCAGCTCTACTTCTGAGCAGCTCAAAAACC
-IGKJ1	CACAGTGAGAGCTCTCCATTGTCTTGCTGAACAGAAACC
-IGKJ2	CACAATGGTTCCTCTTAACTTCCCTCCTATACAAAAACT
-IGKJ4	CACAGTGAGGGATCTCACCCTTTCCCCTCAACAAAAACC
-TRAV26	CACACTGGGACAGATGGGGCTGCACCTGTGCAATATCTC
-TRAV25	CACAGTGCTCCCCAAGCACCTGCTGCCTGTCTCCAAATC
-TRAV24	CACAGTGCTGTTCAGGCACCTGCAGCCCATACGCAAACC
-IGHV1-24	CACAGTGTGAAAACCCACATCCTGAGAGCGTCAGAAACC
-TRAV41	CACAGTGCTCCCCAGGCACCTGGAGCCCGTACCTAAACT
-IGLV5-52	CACAGTGCTCCAGACCCATGAGGAAGTAAGACAAAACCC
-IGLV10-54	CACAGTGCCTCAGGCCAGTGGGGAAGTGAGATAAAAACT
-IGHV3-64	CACAGTGAGGAGAAGTTAATGTGGGACCATGCAGAAACC
-IGHV3-66	CACAGTGAGGGGAGGCCATTGTGCGCCCAGACACAAACC
-TRBV7-9	CACAGCATGGCACAGTCGCCTCCTTCCTGCTCACAAACC
-TRBV7-8	CACAGCATGGCACAGTTGCCTCCTTCCTGTTCACAAACC
-IGLV1-40	CACAGTGCTCAGGCGTGTGTGTGAACTGAGACAAGAACC
-IGLV3-21	CACGGTGACACAGGCAGATGAGGAAGTGAGACAAAAACA
-IGHV7-4-1	CACAGTGTGGAAACCCACATCCTGAGAGTGTCAAAAACC
-IGLV3-27	CACAGTGACACAGGCAGATGGAGAAATGAGACACAAACC
-TRBV4-1	CACAGCCTTGCAGAGTCACCGCTTTCCTGTGCAGAAACC
-IGLV1-41	CACAGTGCTCCAGGCCAATGGGGAACTGAGACAAGAACC
-TRAV13-1	CACATTGCTTCCCAGGCACCTGCTACCCGTACACAAACC
-TRAV13-2	CACATTGCTTTCCAGGCATCTGTAACCATCACCCAAACC
-TRBV5-6	CACAGCCCATCAGAGTCACTGACGTTCTGTATATAAACT
-IGLV3-25	CACAGTGACACAAGCAGACAGGGAAGTGAGACATAAACC
-TRBV10-2	CACAGTGCTGCACAGCTGCCTCCTCTCTGCACGGAAATG
-TRBV10-3	CACAGTGCTGCATGGCTGCCTCCTCTCTGCACGTAAACA
-TRBV10-1	CACAGTGCTGCACAGCTGCCTCCTCTCTGCACATAAAGG
-IGLV6-57	CACAGTGCTCCAGACCCATGGGGAAGTGAGACAGAAACT
-TRBV6-5	CACAGCGCTACAAGGCCGTCTCCTCTCTGCACATAAAGG
-IGHV3-73	CACAGTGAGGGGAGGTCAGTGTGAGCCCGGACACAAACT
-IGHV3-72	CACAGCGAGGGGAGGTCAGTGTGAGCCCGGACACAAACC
-IGLV1-47	CACAGTGCTCCAGGCCCAGGGGGAACTGAGACAAGAACC
-TRAV9-2	CACAGTGACAGGGACTGCAGGGGGAGCTGAGCACAAACT
-IGLV7-46	CACAGTGACAGACCCATGAGAGGAACCAAGACATAAACC
-TRBV6-7	CACAGCGCTGCAAGGCTGTCTCCTCTCTGCACATAAAGG
-TRBV3-1	CACAGCCCTGCAAAGTCACTGCATCCCTGTGCACAAACC
-IGLV7-43	CACAGTGACAGACTCATAAGAGGAACCAAGACATAAACC
-IGHV1-8	CACAGTGTGAAAAACCACATCCTCAGAGAGTCAGAAACC
-IGLV1-51	CACAGTGCTCCAGCCCAATGGGGAACTGAGACAAGAACC
-IGLV1-50	CACAGTGCTCCAGGCCCAGGGGGAGGTGAGAGAAGAACC
-IGHV1-3	CACAGTGTGAAAACCCACATCCTGAGAGTGTCAGAAACC
-IGHV1-2	CACAGTGTGAAAACCCACATCCTGAGGGTGTCAGAAACC
-IGHV4-39	CACAGTGTGGAAACCCACATCCCGAGGGAGTCAGAAACC
-TRBV2	CACAGCCTTGCAAAGACAACTCCAGCCTGTGCAAAATCC
-TRDD3	CACAGTGCTACAAAACCTACAGAGACCTGTACAAAAACT
-TRDD2	CACACAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACA
-TRAV29/DV5	CACAGTGCTCTCCAGGCACCTGCAGCCCGTACTCAAACC
-IGLV3-10	CACAGTGACACTGGCAGATGGGGAAGTGAGACACAAACC
-TRBV27	CACAGTGTTGCACAGCCAGCTGCTCTCTGCACAAAAACA
-TRBV5-5	CACAGCCCGGCAGAATCACTGACATTCTGTATATAAACT
-TRGV10	CACATACTAGAACTGTTGAAACAACATGCACAAAATCCC
-TRGV11	CACAGTGTTCGAGTTGTCAAGATAAGGTACACAGAAACT
-TRGV5	CACAGTGATTCAGACCTGTCCTACACCACACTGAAAATC
-TRBV30	CACACTGAGCTGGGTGGGGCAGACATCTGTGCAAAAACC
-IGHV3-48	CACAGTGAGGGGAGGTCAGTGTGACACCAGACACAAACC
-IGHV3-49	CACAGTGAGGGGAGGTCAATGTGAGCCCAGACACAAACC
-TRGV2	CACAGTGATTCAGATCCGCCCTACACCACACTGAAAATC
-IGLV	CACAGTGCTGCAGGCACATGGGGAACCGAGACAAAAACC
-IGHV3-43	CACAGTGAGGGGAAGTCAGCGAGAGCCCAGACAAAAACC
-IGHV3-41	CACAGTGAGAGGAAGTCCGTGTGAGCCCAGACACAAACC
-IGHV3-7	CACAGTGAGGGGAAGTCAGTGTGAGCCCAGACACAAACC
-HPRT_23	CACTGTAGGTTCATCTTTATGGAGATGCTCATGGCCTCA
-TRGV9	CACAGCAGCAGACAGTTTGAGCCATCCCATTCAATAAAT
-IGLV4-60	CACAGTGATACAGGCAGATGAGGAAGTGGGACAAAATCC
-TRBV28	CACAGCGCAGCACAGCCGCATCCTCTCTGCACAAAAAGA
-TRBV25-1	CACAGTGCTACATGGATACCGACACTCCGCACAGAAAGG
-IGLV4-69	CACAGTGACACAGGCAGATGAGGAAGTGGGACAGAAACC
-IGHV4-28	CACAGTGAGGGGAGGTGAGTGTGAGCCCAGACACAAACC
-TRBV3-2	CACAGCCTTACAGAGCCACTGCATCCCTGTGCACAAACC
-IGHV6-1	CACAGTGAGGGGAAGTCAGTGTGAGCCCAGACACAAACC
-TRAV19	CACAGTGAGATGGGTGCCTGTGGGAGCCCTACAAAAACC
-TRAV10	CACTGTGCTCCACAGGCACTTGAAGCCAGTATGCAAACC
-TRAV17	CACAGTGTTCCCCAGGAACCTGCAGCCTCTACGCAAACC
-TRBV4-2	CACAGCCTTGCAGAGTCACCGCTTTCCTGTGCAGAAACC
-TRBV4-3	CACAGCCTTGCAGAGTCACCGCTTTCCTGTGCAGAAACC
-IGHV3-53	CACAGTGAGGGGAGGTCAGTGTGAGCCCAGACACAAACC
-IGHV3-52	CACAGTGAGAGGCACAGTGAGGGGAGGTCAGTGTGAGCC
-TRBV7-3	CACAGCATGACACAATCGCCTCCTTCCTGCTCATAAACC
-TRBV7-2	CACAGCATGGCACAGTCGCCTCCTTCCTGCTCATAAACC
-TRBV7-7	CACAGCATGGCACAGTCGCCTCCTTCCTGTTCACAAACC
-TRBV7-6	CACAGTGTGGCATAGTCGCCTCCTTCCTGTTCACAAACC
-IGHV2-5	CACAAAGACACAGCCCAGGGCACCTCCTGTACAAAAACC
-IGLV3-13	CACAGTGACACAGGCAGATGGGGAAGTGAGACACAAATC
-IGLV3-16	CACAGTGACACAAGGAGACAGGGAAGTGAAACATAAACC
-IGLV3-19	CACAGTGACACAGACAGATGGGGAAGTGAGACAGAAACC
-IGHJ3	CACAGGGACACAGTCCGTTCCTAGACCCAGACACAAACC
-IGHV1-45	CACAGTGTGAAAACCCACATCCTGAGACCGTCAGAAACC
-IGHV1-46	CACAGTGTGAGAAACCACATCCTCAGAGTGTCAGAAACC
-IGLV8-61	CACAGTGATTTAAACCTATGAGGAAGTGCAACTAAAACC
-IGHV4-59	CACAGTGAGGGGAGGTGAGTGTGAGCCCAGACAAAAACC
-TRAV1-1	CACAGTGACTATGAGGCCTCCTTAACTGTGCCAAAATTC
-TRAV1-2	CACGGTGACTATGAGGCCTCTTTAGCTGCACCAAAATTC
-TRGV7	CACAGTGATTCACACCTGCCCTACACCACACTGAAAATC
-IGHV3-20	CACAGTGAGGGGAAGCCAGTGAGAGCCCAGACACAAACG
-IGHV3-21	CACAGTGAGGGGAAGTCAGTGTGAGCCCAGACACAAACC
-IGHV3-23	CACAGTGAGGGAAGTCATTGTGAGCCCAGACACAAACCT
-IGLV2-23	CACAGTGGTCCAAGTTCATGGGGAACTGAGACCAAAACC
-TRBD1	CACAATGATTCAACTCTACGGGAAACCTTTACAAAAACC
-TRGV4	CACAGTGATTCAGATCCGCCCTACACCACACTGAAAACC
-TRAV12-1	CACAGTGCTCCCCAGACACCTGCAGTCTGTACCCAAACC
-TRAV12-3	CACAGTGCTCCCCAGACACTTGCAGTCTGTACCCAAACC
-TRAV12-2	CACAGTGCTCCCCAGACACCTGCAGTCTGTACCCAAACC
-IGLV3-6	CACAGTGACACAGGCAGATTAGGAAGTGAGACACAAACC
-IGLV3-1	CACAGTGACACAGGCAGATGCGGAAGTGAGACAGAAACC
-IGHV1-58	CACAGTGTGAAAACCCACATCCTGAGAGTGTCAGAAACG
-TRBD2 	CACGATGATTCAGGTAGAGGAGGTGCTTTTACAAAAAAC
-IGLV3-9	CACAGTGACACAGGCAGATGAAGAAGTGAGACACAAACC
-TRBV6-8	CACAGCGCTACAAGGTTGTCTCCTCTCTGCACATAAAGG
-TRBV9	CACAGCCCTGCATGAGCATCAGCCTTCTGTGCAATAACA
-IGHV3-35	CACTGTGAGAGGTCGGAAGTGTGAGCCCAGACACAAACC
-IGHV3-33	CACAGTGAGGGGAGGTCATTGTGCGCCCAGACACAAACC
-IGLV2-14	CACAGTGGTCCAAGTTCCTGGGGAACTGAGACCAAAACC
-IGHV3-30	CACAGTGAGGGGAAGTCATTGTGCGCCCAGACACAAACC
-TRAV8-1	CACAGTGTCTGGGACTGCAAAGGGAGCTGAACACAAACT
-IGLV2-18	CACAGAGGTCCAAGTTCCTGGGGAACTGAGACCAAAACC
-TRAV14/DV4	CACAGTGACAGAACTGTCGGAGGGAGGTGTACAAAAGCC
-IGHV2-26	CACAGAGACACAGCCCAGGATGCCTCCTGTACAAGAACC
-TRBV16	CACAGTGTTAAATATTAGCTAATCTTAGGACACAGACTC
-TRBV15	CACAGAGCTGCAGTGCTTCCTGCTCTCTGTTCATAAACC
-TRBV14	CACAGTGCTTCACAGTCGTGCCCTTGCTGTGCAAAACCA
-TRBV13	CACAGCATGGCACAGTCGCCTCCTTCCTGCTCACAAACC
-TRBV11-3	CACAGTGTAGCAGAGACACTTCCCTCCTGTGCAGAAAAC
-TRBV19	CACAGTGAAGCACGGATGTCGCCTCTCTGTGCATAAATG
-TRBV11-2	CACAGTGTAGCAGAGACACTTCCCTCCTGTGCAGAAAAC
-TRAV38-1	CACAATGAGATGAGCAGCAGGTAGAGGCTTACAGAAATC
-TRBV12-5	CACAGCGCTGCAGAATCACCTGCTCCCTGTGCAGAAACC
-TRBV12-2	CACAGCGCTGCAGAATCTCCCCCTCCCTGTGCAGAAACT
-TRBV12-3	CACAGCGCTGCAGAATCACCCCTTTCCTGTGCAGAAAAC
-IGHV1-69	CACAGTGTGAAAACCCACATCCTGAGAGTGTCAGAAACC
-LMO1_23	CACAGTGGCTCACCACCCCACACAGCCCTCACTCTGGCA
-IGLV5-45	CACAGTGACACACACAGATGGGGAAGTGGGACAAAAACC
-IGLV5-37	CACAGTGACACACACAGATGGGAAAGTGGGACAAAAACC
-TRAV23/DV6	CACAGTGCTCCCCAGGCACCTGAAGCCTGTACCCAAACC
-TRDV1	CACAGTGTTTGAAGTGATAGTAAAAGCAAAACAAAAACC
-TRBV20-1	CACAGCGCCAGGAGGGGATCAGACACCGCGGCAAGAACC
-IGHV2-10	CACAGAGACAGAGCCCAGGGTGCCTCTTGTACAAGACCC
-TRBV29-1	CACAGTGCGGGGCACAGATCAAAGATCTGAGCAAGAACC
-TRBV23-1	CACAGCACTGAAATGTCAGTTCCTCTTAGCACACAAACT
-TRAV34	CACAGCGATCTTCAGGCCTCTATCAGCTGTCTCCAAACC
-TRAV30	CACAGTGATACCCAGGCCTCCAAGACCTGTACTCAAACC
-TAL1_23	CACAGCCTCGCGCATTTCTGTATATTGCGTAAGGAAAAG
-TRAV39	CACAGTGCTCCCCTGACGCCACCAGTCTGTACCCAAACC
--- a/filter.r	Wed Sep 17 07:25:17 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-args <- commandArgs(trailingOnly = TRUE)
-
-input = args[1]
-filterf = args[2]
-output = args[3]
-
-
-dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
-filterdat = read.table(filterf, header=T, sep="\t", fill=T, stringsAsFactors=F)
-
-dat = dat[dat$Sequence.ID %in% filterdat$ID,]
-
-write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)
--- a/gene_identification.py	Wed Sep 17 07:25:17 2014 -0400
+++ b/gene_identification.py	Mon Sep 22 10:19:36 2014 -0400
@@ -5,31 +5,38 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")
-parser.add_argument("--outdir", help="Output directory, 7 output files will be written here")
+parser.add_argument("--output", help="The annotated summary output file")
 
 args = parser.parse_args()
 
 infile = args.input
 #infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"
-outdir = args.outdir
+output = args.output
 #outfile = "identified.txt"
 
 dic = dict()
 total = 0
 
+
 first = True
+IDIndex = 0
+seqIndex = 0
+
 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
-  for line in f:
-    total += 1
-    if first:
-      first = False
-      continue
-    linesplt = line.split("\t")
-    if linesplt[2] == "No results":
-      continue
-    ID = linesplt[1]
-    seq = linesplt[28]
-    dic[ID] = seq
+	for line in f:
+		total += 1
+		if first:
+			linesplt = line.split("\t")
+			IDIndex = linesplt.index("Sequence ID")
+			seqIndex = linesplt.index("Sequence")
+			first = False
+			continue
+		linesplt = line.split("\t")
+		ID = linesplt[IDIndex]
+		if len(linesplt) < 28: #weird rows without a sequence
+			dic[ID] = ""
+		else:
+			dic[ID] = linesplt[seqIndex]
 
 #lambda/kappa reference sequence
 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag",
@@ -128,7 +135,7 @@
 				continue
 			#print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i]
 			currentIDHits[key + "_hits"] += 1
-		start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1])
+		start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if len(start) > 0 and max(start) > 1])
 		#start_location[ID + "_" + key] = str(start.index(max(start)))
 
 
@@ -141,120 +148,49 @@
 varsInCM = 0
 requiredVarPercentage = 0.7
 
-ca = 0
-ca1 = 0
-ca2 = 0
-cg = 0
-cg1 = 0
-cg2 = 0
-cg3 = 0
-cg4 = 0
-cm = 0
-try:
-	cafile = open(outdir + "/ca.txt", 'w')
-	ca1file = open(outdir + "/ca1.txt", 'w')
-	ca2file = open(outdir + "/ca2.txt", 'w')
-	cgfile = open(outdir + "/cg.txt", 'w')
-	cg1file = open(outdir + "/cg1.txt", 'w')
-	cg2file = open(outdir + "/cg2.txt", 'w')
-	cg3file = open(outdir + "/cg3.txt", 'w')
-	cg4file = open(outdir + "/cg4.txt", 'w')
-	cmfile = open(outdir + "/cm.txt", 'w')
-	unmatchedfile = open(outdir + "/unmatched.txt", 'w')
-	cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-	unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n")
-	for ID in hits.keys():
-		currentIDHits = hits[ID]
-		possibleca = float(len(compiledregex["ca"]))
-		possiblecg = float(len(compiledregex["cg"]))
-		possiblecm = float(len(compiledregex["cm"]))
-		cahits = currentIDHits["ca_hits"]
-		cghits = currentIDHits["cg_hits"]
-		cmhits = currentIDHits["cm_hits"]
-		if cahits > cghits and cahits > cmhits: #its a ca gene
-			if cahits <= int(chunksInCA * requiredChunkPercentage):
-				unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n")
+
+first = True
+with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
+	with open(output, 'w') as o:
+		for line in f:
+			total += 1
+			if first:
+				o.write(line.rstrip() + "\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+				first = False
 				continue
-			ca += 1
-			ca1hits = currentIDHits["ca1"]
-			ca2hits = currentIDHits["ca2"]
-			cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
-			if ca1hits > ca2hits:
-				#print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit"
-				if ca1hits <= int(varsInCA * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n")
-					continue
-				ca1 += 1
-				ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
-			else:
-				#print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit"
-				if ca2hits <= int(varsInCA * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n")
-					continue
-				ca2 += 1
-				ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
-		elif cghits > cahits and cghits > cmhits: #its a cg gene
-			if cghits <= int(chunksInCG * requiredChunkPercentage):
-				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcg\n")
-				continue
-			cg += 1
-			cg1hits = currentIDHits["cg1"]
-			cg2hits = currentIDHits["cg2"]
-			cg3hits = currentIDHits["cg3"]
-			cg4hits = currentIDHits["cg4"]
-			cgfile.write(ID + "\tNA\t" + str(int(cghits / possibleca * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene
-				if cg1hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n")
-					continue
-				cg1 += 1
-				cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene
-				if cg2hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n")
-					continue
-				cg2 += 1
-				cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene
-				if cg3hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n")
-					continue
-				cg3 += 1
-				cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-			else: #cg4 gene
-				if cg4hits <= int(varsInCG * requiredVarPercentage):
-					unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n")
-					continue
-				cg4 += 1
-				cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-		else: #its a cm gene
-			if cmhits <= int(chunksInCM * requiredChunkPercentage):
-				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcm\n")
-				continue
-			cm += 1
-			cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
-finally:
-  cafile.close()
-  ca1file.close()
-  ca2file.close()
-  cgfile.close()
-  cg1file.close()
-  cg2file.close()
-  cg3file.close()
-  cg4file.close()
-  cmfile.close()
-  unmatchedfile.close()
-  
-
-#print ca,cg,cm,(ca+cg+cm)
+			linesplt = line.split("\t")
+			if linesplt[2] == "No results":
+				pass
+			ID = linesplt[1]
+			currentIDHits = hits[ID]
+			possibleca = float(len(compiledregex["ca"]))
+			possiblecg = float(len(compiledregex["cg"]))
+			possiblecm = float(len(compiledregex["cm"]))
+			cahits = currentIDHits["ca_hits"]
+			cghits = currentIDHits["cg_hits"]
+			cmhits = currentIDHits["cm_hits"]
+			if cahits > cghits and cahits > cmhits: #its a ca gene
+				ca1hits = currentIDHits["ca1"]
+				ca2hits = currentIDHits["ca2"]
+				if ca1hits > ca2hits:
+					o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+				else:
+					o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+			elif cghits > cahits and cghits > cmhits: #its a cg gene
+				cg1hits = currentIDHits["cg1"]
+				cg2hits = currentIDHits["cg2"]
+				cg3hits = currentIDHits["cg3"]
+				cg4hits = currentIDHits["cg4"]
+				if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene
+					o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+				elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene
+					o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+				elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene
+					o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+				else: #cg4 gene
+					o.write(line.rstrip() + "\tcg3\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+			else: #its a cm gene
+				o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
 
 print "Time: %i" % (int(time.time() * 1000) - starttime)
 
--- a/logo.pm	Wed Sep 17 07:25:17 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,826 +0,0 @@
-#!/usr/local/bin/perl -w
-
-=head1 NAME
-
-    logo.pm - organizes data in FASTA and CLUSTAL formats into height data.
-
-=head1 SYNOPSIS
-
-    Perl module
-
-=head1 DESCRIPTION
-
-    logo.pm: Takes in strings of aligned sequences and sorts them vertically
-             based on height as assigned by the following equations found in
-             Schneider and Stephens paper "Sequence Logos: A New Way to Display
-             Consensus Sequences":
-    
-                 height = f(b,l) * R(l)                            (1)
-
-             where f(b,l) is the frequency of base or amino acid "b" at position
-             "l". R(l) is amount of information present at position "l" and can
-             be quantified as follows:
-
-                 R(l) for amino acids   = log(20) - (H(l) + e(n))    (2a)
-                 R(l) for nucleic acids =    2    - (H(l) + e(n))    (2b)
-
-             where log is taken base 2, H(l) is the uncertainty at position "l", 
-             and e(n) is the error correction factor for small "n". H(l) is
-             computed as follows:
-
-                 H(l) = - (Sum f(b,l) * log[ f(b,l) ])             (3)
-
-             where again, log is taken base 2. f(b,l) is the frequency of base
-             "b" at position "l". The sum is taken over all amino acids or
-             bases, depending on which the data is.
-
-             Currently, logo.pm uses an approximation for e(n), given by:
-
-                 e(n) = (s-1) / (2 * ln2 * n)                      (4)
-
-             Where s is 4 for nucleotides, 20 for amino acids ; n is the number
-             of sequences in the alignment. e(n) also  gives the height of error
-             bars.
-
-=cut
-
-package logo;
-
-use strict;
-use Carp;
-
-################################################################################
-######                             SOME VARIABLES                         ######
-################################################################################
-
-my $DEBUG = 0;
-
-my $AA = 0;
-my $NA = 1;
-
-my %BASES = ("a" => "adenine",
-	     "t" => "thymine",
-	     "g" => "guanine",
-	     "c" => "cytosine",
-	     "u" => "uracil");
-
-# does not include B or Z
-my %AMINOACIDS = ("a" => "", "c" => "", "d" => "", "e" => "", "f" => "",
-		  "g" => "", "h" => "", "i" => "", "k" => "", "l" => "",
-		  "m" => "", "n" => "", "p" => "", "q" => "", "r" => "",
-		  "s" => "", "t" => "", "v" => "", "w" => "", "y" => "");
-
-my @data;
-my $kind;
-my ($seqs_r, $desc_r);
-
-my $CONFIDENCE_LIMIT = 0.90;
-
-################################################################################
-######                             SOME FUNCTIONS                         ######
-################################################################################
-
-=head1 APPENDIX
-
-=cut
-
-=head2 getHeightData()
-
- Usage   : my ($height_data_r, $description_data_r, $kind) =
-              logo::getHeightData($input_data_r, $params);
- Returns : * REFERENCE TO array of height data
-           * REFERENCE TO array of input description strings
-           * $AA if the data is amino acid, $NA otherise
- Args    : $input_data_r : input data in CLUSTAL or FASTA formats
-         : $params       : hash of parameters
-
- getHeightData is the entry point into the logo.pm module. $input_data_r is a
- reference to  an array of strings containing FASTA or CLUSTAL data, where all
- lines whose first character is "#" is considered a comment line.
-
- $params is a hash of parameters with the following keys:
-   * smallsampletoggle : 0 to turn off small sample correction, otherwise
-                         small sample correction is on
-   * input_kind : 0 for amino acids, 1 for nucleic acids; if undefined,
-                  logo.pm will attempt to automatically detect whether the
-                  input consists of amino or nucleic acid data. If
-                  $input_kind is defined, only those residues  defined by
-                  $input_kind will be in the output -- all other residues are
-                  considered as spaces. For example, if $input_kind is $NA,
-                  the residue "I" or "i" are considered spaces, since "I" and
-                  "i" are not nucleic acid residues.
-   * stretch : stretch all characters so they are flush at the maximum number
-               of bits allowed
-
- Sample use:
-
-  # get FASTA data
-  open (FASTA, "$fastafile");
-  my @inputdata = <FASTA>;
-  close (FASTA);
-
-   my %heightparams = (
-		       smallsamplecorrection => 0,
-		       input_kind => 0,
-		       stretch => 0
-		       );
-
-  # get height data
-  my ($heightdata_r, $desc_r, $kind) = logo::getHeightData(\@inputdata, \%heightparams);
-
-=cut
-
-# entry point into module
-sub getHeightData {
-
-    # $smallsampletoggle is toggle to turn off small sample correction (1 to turn off)
-    # $input_kind can be $AA or $NA or undef
-    my ($input_data_r, $params) = @_;
-
-    # gary 040119: adjust for formats (Unix is \n, Mac is \r, Windows is \r\n)
-    $input_data_r = normalizeData($input_data_r);
-
-    # gets sequences, sets $kind temporarily
-    my ($goodlength, $maxreslength, $badline, $validformat);
-    ($seqs_r, $desc_r, $maxreslength, $goodlength, $badline, $validformat) =
-	getSeqs($input_data_r, $params->{input_kind});
-
-#    for(my $i = 0; $i < scalar @$seqs_r ; $i++) {
-#	print STDERR ($desc_r->[$i] . "\n" . $seqs_r->[$i] . "\n");
-#    }
-#    print STDERR "maxreslength = $maxreslength\n";
-#
-#    exit(0);
-
-    if ($DEBUG) { print STDERR ("point 1\n");}
-
-    # check for valid format
-    if ((defined $validformat) && ($validformat == 1)) {
-#	print("returning\n");
-	return (undef, undef, undef, undef, undef, 1);
-    }
-
-    if ($DEBUG) { print STDERR ("point 2\n");}
-
-    # check for bad length
-    if (!$goodlength) {
-	return (undef, undef, undef, $goodlength, $badline);
-    }
-
-    # reset $kind if in $input_kind
-    if (defined $params->{input_kind} && isLegalKind($params->{input_kind}) ) {
-	$kind = $params->{input_kind};
-    }
-
-    # build data
-    buildData(@$seqs_r, $params->{smallsampletoggle}, $params->{stretch}, $maxreslength);
-
-    if ($DEBUG) { print STDERR ("point 3\n");}
-
-#    print STDERR ("data size = ", scalar @data, "\n");
-#    foreach (@data) {
-#	print STDERR ("$_\n");
-#    }
-#
-#    exit(0);
-#
-#    print STDERR ("return at 2\n");
-    return (\@data, $desc_r, $kind, $goodlength, $badline);
-}
-
-sub isLegalKind {
-    return ($_[0] =~ /^[01]$/);
-}
-
-################################################################################
-#
-# sub normalizeData($data_r) returns $data_r, with Mac/Unix/Windows newline
-# style normalized to standard Unix-style newline style
-#
-################################################################################
-sub normalizeData {
-    my ($data_r) = @_;
-
-    # check args
-    if (not defined $data_r) {
-	die "data_r must be defined\n";
-    }
-
-    my @normalized = ();
-    foreach my $pseudo_line (@$data_r) {
-	my @split_line = split(/[\r\n]+/, $pseudo_line);
-	push(@normalized, @split_line);
-    }
-
-    return \@normalized;
-}
-
-
-################################################################################
-#
-# sub getSeqs($data_r, $kind) returns 5 values:
-#
-# * array reference to sequence strings
-# * array reference to sequence names
-# * length of sequence
-# * 1 if all sequences have the same length, 0 else
-# * line number L where sequenceLength(L) != sequenceLength(other lines), else
-#   undef
-#
-################################################################################
-sub getSeqs {
-    my ($input_data_r, $kind) = @_;
-
-    unless( $input_data_r->[0] ){
-        return (undef, undef, undef, undef, undef, 1);
-    }
-
-    # skip all comment chars and lines of all spaces
-    while ( ($input_data_r->[0] =~ /^\s*\#/) || ($input_data_r->[0] =~ /^\s*$/) ) {
-	shift @$input_data_r;
-	if( !defined $input_data_r->[0])  {return (undef, undef, undef, undef, undef, 1);}
-    }
-
-    if (isFormat_FASTA($input_data_r)) {
-	return getSeqs_FASTA($input_data_r, $kind);
-
-    } elsif (isFormat_CLUSTAL($input_data_r)) {
-	return getSeqs_CLUSTAL($input_data_r, $kind);
-
-    } elsif (isFormat_FLAT($input_data_r)) {
-	return getSeqs_FLAT($input_data_r, $kind);
-
-    } else {
-	if ($DEBUG) {print STDERR ("format nothing\n");}
-	return (undef, undef, undef, undef, undef, 1);
-    }
-
-#    if ($_[0] =~ />/) {
-#	return getSeqs_FASTA(@_);
-#    } else {
-#	return getSeqs_CLUSTAL(@_);
-#    }
-}
-
-################################################################################
-#
-# sub isFormat_FASTA($data_r) returns 1 if $data_r is in FASTA format
-#
-################################################################################s
-sub isFormat_FASTA {
-    my ($input_data_r) = @_;
-
-    # check args
-    if (not defined $input_data_r) {
-	Carp::confess("logo::isFormat_FASTA : input_data_r must be defined\n");
-    }
-
-    if ($input_data_r->[0] =~ />/) {
-	return 1;
-    } else {
-	return 0;
-    }
-}
-
-################################################################################
-#
-# sub isFormat_CLUSTAL($data_r) returns 1 if $data_r is in CLUSTAL format
-#
-################################################################################
-sub isFormat_CLUSTAL {
-    my ($input_data_r) = @_;
-
-    # check args
-    if (not defined $input_data_r) {
-	Carp::confess("logo::isFormat_CLUSTAL : input_data_r must be " .
-		      "defined\n");
-    }
-
-    my $i=0;
-
-#    # skip spaces or just "*" and "." and ":"
-#    while ($input_data_r->[$i] =~ /^[\*\:\s]*$/) {
-#	$i++;
-#    }
-
-    # if it looks like CLUSTAL W (version) ... , then it must be clustal
-    if ($input_data_r->[$i] =~ /^\s*CLUSTAL/) {
-	return 1;
-    }
-
-    # CLUSTAL looks like: "name        seq"
-    if ($input_data_r->[$i] =~ /^\s*(\S+)\s+(\S+)\s*$/) {
-	return 1;
-    } else {
-	return 0;
-    }
-}
-
-################################################################################
-#
-# sub isFormat_FLAT($data_r) returns 1 if $data_r is in FLAT format
-#
-################################################################################
-sub isFormat_FLAT {
-    my ($input_data_r) = @_;
-
-    # check args
-    if (not defined $input_data_r) {
-	Carp::confess("logo::isFormat_FLAT : input_data_r must be defined\n");
-    }
-
-#    print("checking flat\n");
-#    print("first element = -->", $input_data_r->[0], "<--\n");
-
-    if ($input_data_r->[0] =~ /^[a-zA-Z\-]+\s*$/) {
-	return 1;
-    } else {
-	return 0;
-    }
-}
-
-################################################################################
-######                          FORMATTING FUNCTIONS                      ######
-################################################################################
-
-# the flat sequence format is as follows:
-# sequence1
-# sequence2
-# sequence3
-# ...
-# sequenceN
-sub getSeqs_FLAT {
-
-    if ($DEBUG) {print STDERR "DOING FLAT\n";}
-
-    my ($input_data_r, $input_kind) = @_;
-
-    my $linelength = 0;
-    my $seqCount = 0;
-    my $total_residues = 0;
-    my (@returnVal, @desc) = ();
-    my $prevlinelength = undef;
-    my $NA_count = 0;
-
-    foreach my $seq (@$input_data_r) {
-#	chomp $seq;
-	$seq =~ s/\s+$//;
-
-	my @chars = split(//,$seq);
-
-	my $char;
-	foreach (@chars) {
-	    $total_residues++;
-	    $linelength++;
-
-	    # set $char
-	    if (defined $input_kind) {
-		if ($input_kind == $AA) {
-		    $char = (isAA($_)) ? $_ : "-";
-		} else { # == $NA
-		    $char = (isNA($_)) ? $_ : "-";
-		}
-	    } else {
-		$char = $_;
-		if (isNA($char)) {
-		    $NA_count++;
-		}
-	    }
-
-	    $returnVal[$seqCount] .= $char;
-	}
-	$desc[$seqCount] = "no name";
-
-	if ($seqCount == 0) {
-	    $prevlinelength = $linelength;
-	} elsif ($prevlinelength != $linelength) {  # different number of residues, so complain
-	    return (undef, undef, undef, 0, $seq);  # 0 for not same length, $seq is name
-	}
-	$linelength=0;
-
-	$seqCount++;
-    }
-
-    # determine whether to use $NA or $AA
-    if (!defined $input_kind) {
-	if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) {
-	    $kind = $NA;
-	} else { 
-	    $kind = $AA;
-	}
-    }
-
-    return (\@returnVal, \@desc, $prevlinelength, 1, undef);
-}
-
-sub getSeqs_CLUSTAL {
-
-    if ($DEBUG) {print STDERR "DOING CLUSTAL\n";}
-
-    my ($input_data_r, $input_kind) = @_;
-
-    my @returnVal;
-    my @desc;
-    my $seqCount=0;
-    my $reslength = 0;
-    my ($name, $seq);
-
-#    my $input_kind = pop @_;
-#    my $CONFIDENCE_LIMIT = 0.90;
-    my $NA_count = 0;
-    my $total_residues = 0;
-    my ($prevlinelength, $linelength) = (0,0);
-
-#    foreach (@_) {
-    foreach (@$input_data_r) {
-#	chomp;
-
-	if ($DEBUG) {print STDERR ("line = $_\n")};
-
-	$_ =~ s/\s+$//;
-
-	# skip if it is a comment character -- first character is "#"
-	next if (/^\s*\#/);
-
-	# skil if it is a CLUSTAL W header line
-	next if (/^\s*CLUSTAL/);
-
-	# if spaces or just "*" and "." and ":"
-	if (/^[\*\.\:\s]*$/) {
-	    $seqCount=0;
-	    $prevlinelength=0;
-	    next;
-	}
-
-	($name,$seq) = (/^\s*(\S+)\s+(\S+)\s*$/);
-
-	if ($DEBUG) { print STDERR ("name, seq = $name, $seq\n"); }
-
-	# add new entry
-	if (!defined $desc[$seqCount]) {
-	    $desc[$seqCount] = $name;
-	    $returnVal[$seqCount] = "";
-	}
-    
-    if(!defined $seq) {return (undef, undef, undef, undef, undef, 1);} # Something has gone terrible wrong
-    
-	my @chars = split(//,$seq);
-	my $char;
-	foreach (@chars) {
-	    if ($seqCount == 0) {
-		$reslength++;     # all sequences have same residue length, so only count first one
-	    }
-
-	    $total_residues++;
-	    $linelength++;
-
-	    # set $char
-	    if (defined $input_kind) {
-		if ($input_kind == $AA) {
-		    $char = (isAA($_)) ? $_ : "-";
-		} else { # == $NA
-		    $char = (isNA($_)) ? $_ : "-";
-		}
-	    } else {
-		$char = $_;
-		if (isNA($char)) {
-		    $NA_count++;
-		}
-	    }
-	    
-	    $returnVal[$seqCount] .= $char;
-	}
-
-	if ($seqCount == 0) {
-	    $prevlinelength = $linelength;
-	} elsif ($prevlinelength != $linelength) {  # different number of residues, so complain
-	    return (undef, undef, undef, 0, $name);
-	}
-	$linelength=0;
-
-	$seqCount++;
-    }
-
-    # determine whether to use $NA or $AA
-    if (!defined $input_kind ) {
-	if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) {
-	    $kind = $NA;
-	} else { 
-	    $kind = $AA;
-	}
-    }
-
-    return (\@returnVal, \@desc, $reslength, 1, undef);
-}
-
-# if $input_kind is defined, residues that are not defined are set to space
-sub getSeqs_FASTA {
-
-    if ($DEBUG) {print STDERR "DOING FASTA\n";}
-
-    my ($input_data_r, $input_kind) = @_;
-
-    my @returnVal;
-    my @desc;
-    my $count=-1;
-    my $newElem=0;
-
-#    my $input_kind = pop @_;
-
-#    my $CONFIDENCE_LIMIT = 0.90;
-    my $NA_count = 0;
-    my $total_residues = 0;
-    my $reslength = 0;
-    my $maxreslength = 0;
-    
-    my ($goodlength, $currline, $prevline);
-
-
-#    # skip all lines that are all spaces
-#    while ($_[0] =~ /^\s*$/) {
-#	shift @_;
-#    }
-
-#    foreach (@_) {
-    foreach (@$input_data_r) {
-
-	$_ =~ s/\s+$//;
-
-	# skip if it is a comment character -- first character is "#"
-	next if (/^\s*\#/);
-
-	# skip all lines that are all spaces
-	next if (/^\s*$/);
-
-	$_ =~ s/\s+$//;  # cut trailing white space
-	$_ =~ s/^\s+//;  # cut leading white space
-	if (/>/) {
-	    $currline = $_;
-	    ($desc[scalar @desc]) = ($_ =~ />\s*(.+)$/);
-
-	    if (not $newElem) {		
-		$count++;
-		$newElem = 1;
-	    }
-	} else {
-	    if ($newElem) {
-		$maxreslength = $reslength if $maxreslength == 0;
-		if (($maxreslength != 0) && ($maxreslength != $reslength)) {
-		    return (undef, undef, undef, 0, $prevline);
-		}
-
-		$maxreslength = $reslength;
-		$reslength = 0;
-	    }
-
-	    my @chars = split(//,$_);
-	    my $char;
-	    foreach (@chars) {
-		$reslength++;
-		$total_residues++;
-
-		# set $char
-		if (defined $input_kind) {
-		    if ($input_kind == $AA) {
-			$char = (isAA($_)) ? $_ : "-";
-		    } else { # == $NA
-			$char = (isNA($_)) ? $_ : "-";
-		    }
-		} else {
-		    $char = ($_ =~ /[a-zA-Z]/) ? $_ : "-";  # if not alpha char, use space
-		    if (isNA($char) && !isSpace($char)) {
-			$NA_count++;
-		    }
-		}
-
-		if ($newElem) {
-		    $returnVal[$count] = $char;
-		} else {
-		    $returnVal[$count] .= $char;
-		}
-		$newElem = 0;
-	    }
-	    $prevline = $currline if $currline =~ />/;
-	}
-    }
-
-    # check if last is biggest
-    if (($maxreslength != 0) && ($maxreslength != $reslength)) {
-	return (undef, undef, undef, 0, $prevline);
-    }
-#    $maxreslength = ($reslength > $maxreslength) ? $reslength : $maxreslength;
-
-    # determine whether to use $NA or $AA
-    if (!defined $input_kind) {
-	if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) {
-	    $kind = $NA;
-	} else { 
-	    $kind = $AA;
-	}
-    }
-
-    return (\@returnVal, \@desc, $maxreslength || $reslength, 1, undef);  # 1 for good lengths
-}
-
-sub isSpace {
-    return $_[0] =~ /[ \-]/;
-}
-
-sub isAA {
-    return (defined $AMINOACIDS{lc $_[0]});
-}
-
-sub isNA {
-    return (defined $BASES{lc $_[0]});
-}
-
-################################################################################
-######                       DATA BUILDING FUNCTIONS                      ######
-################################################################################
-
-
-# arguments: takes reference to array and lines aligned sequences of bases or
-#            amino acids
-# returns: updated reference array to reflect contents of sequences sorted
-#          vertically by height as described by (1)
-sub buildData {
-    
-    my $currentx = 0;
-    my $h;
-    my $count=0;
-    my $maxreslength = pop (@_);
-    my $stretch = pop(@_);
-    my $smallsampletoggle = pop (@_);
-    my $totalsize = $#_+1;
-
-    while ($currentx < $maxreslength) {       #(length $_[0])) {
-	my $allspaces = 1;
-	my $spaceCount=0;
-
-	# get vertical sequence
-	my @vert=();
-	foreach (@_) {  # foreach sequence
-	    my $currentchar;
-
-	    # set currentchar, set to " " if $_ is not long enough
-	    if ($currentx >= (length $_)) {
-		$currentchar = " ";
-	    } else {
-		$currentchar = substr($_,$currentx,1);
-	    }
-
-	    # in all sequences, "-" is considered as a space
-	    # don't count " " and "-"
-	    if (($currentchar ne "-") && ($currentchar ne " ")) {
-		$vert[scalar @vert] = uc substr($_,$currentx,1);
-		$allspaces = 0;
-	    } else {
-		$spaceCount++;
-	    }
-	}
-
-	if ($allspaces) {
-	    # build @vert
-	    @vert = (" 0", ">0");
-
-	    # place in @data
-	    $data[scalar @data] = \@vert;
-
-	    $currentx++;
-	    next;
-	}
-
-	my $error;
-	if ($smallsampletoggle) {
-	    $error=getError($kind,$totalsize - $spaceCount);
-	} else {
-	    $error = 0;
-	}
-
-	# sort vertical sequence by amino acid or base
-	@vert = sort(@vert);
-	my $total = $#vert + 1;
-
-	# find H(l) -- must be done before collapsing
-	$h = getH(@vert);
-
-	# collect like terms
-	@vert = collapse(@vert);
-
-	# get R(l)
-	my $r;
-	if (!defined $stretch || !$stretch) {
-	    $r= getR($kind, $h, $error);
-	} else {
-	    $r = ($kind == $NA) ? 2 : (log(20) / log(2));
-	}
-
-	# place heights
-	my $count=0;
-	my $height;
-	my $elem;
-	foreach $elem (@vert) {
-	    $height = getHeight(substr($elem, 2) / $total,$r);
-	    $vert[$count] = substr($elem,0,1) . $height;
-	    $count++;
-	}
-
-	# sort by height
-	@vert = sort height_sort @vert;
-
-	# put in error bar size
-	$vert[$count] = ">$error";
-
-	# place in @data
-	$data[scalar @data] = \@vert;
-
-	$currentx++;
-    }
-}
-
-# uses error approximation given by:
-#             e :=  (s-1) / (2 * ln2 * ntrue);
-sub getError {
-    return ((($_[0] == $NA) ? 4 : 20) - 1) / (2 * log(2) * $_[1]);
-}
-
-sub height_sort {
-    my ($lettera, $heighta) = ($a =~ /^(.{1})(\S+)$/); #substr($a,1);
-    my ($letterb, $heightb) = ($b =~ /^(.{1})(\S+)$/); #substr($b,1);
-    
-    # compare by height first, then letter
-    if ($heighta <=> $heightb) {
-	return $heighta <=> $heightb;
-    } else {
-	return $letterb cmp $lettera;  #reversed for some reason...
-    }
-}
-
-sub collapse {
-    my @returnVal;
-    my $current = $_[0];
-    my $count=0;
-    my $freq;
-
-    foreach (@_) {
-        if ($current eq $_) {
-            $count++;
-        } else {
-	    $returnVal[scalar @returnVal] = "$current $count";
-
-            $current = $_;
-            $count=1;
-        }
-    }
-
-    # add last element
-    $returnVal[scalar @returnVal] = "$current $count";
-
-    return @returnVal;
-}
-
-# arguments : $_[0] : list of bases or amino acids
-sub getH {
-    my $h = 0;
-    my (@vert) = @_;  # vertical sequence (comparing multiple sequences)
-
-    my $current = $vert[0];
-    my $count=0;
-    my $freq;
-
-    foreach (@vert) {
-	if ($current eq $_) {
-	    $count++;
-	} else {
-	    $freq = $count / ($#vert + 1);
-	    $h += $freq * (log ($freq) / log(2));
-
-	    $current = $_;
-	    $count=1;
-	}
-    }
-
-    # add last element
-    $freq = $count / ($#vert + 1);
-    $h += $freq * (log ($freq) / log(2));
-
-    return -$h;
-}
-
-# arguments : $_[0] : $AA or $NA
-#             $_[1] : uncertainty = H(l)
-#             $_[2] : error correction factor for small number of sequences
-#                     = e(n) ; assummed to be 0 if not given
-sub getR {
-    my $max = ($_[0] == $NA) ? 2 : (log(20) / log(2));
-    my $e = (defined $_[2]) ? $_[2] : 0;
-    return ($max - ($_[1] + $e));
-}
-
-# arguments: $_[0] : frequency = f(b,l)
-#            $_[1] : R(l)
-sub getHeight {
-    return $_[0] * $_[1]
-}
-
-1;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_and_filter.r	Mon Sep 22 10:19:36 2014 -0400
@@ -0,0 +1,57 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+
+summaryfile = args[1]
+mutationanalysisfile = args[2]
+mutationstatsfile = args[3]
+hotspotsfile = args[4]
+output = args[5]
+unmatchedfile = args[6]
+
+summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
+mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
+mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
+hotspots = read.table(hotspotsfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
+
+
+summ = summ[summ$Functionality != "No results",]
+tmp = summ[summ$chunk_hit_percentage >= 70 & summ$nt_hit_percentage >= 70,]
+unmatched = summ[summ$chunk_hit_percentage < 70 & summ$nt_hit_percentage < 70,]
+unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
+summ = tmp
+rm(tmp)
+
+if(length(summ$Sequence.ID) == 0){
+	stop("No data remaining after filter")
+}
+
+result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-2])], by="Sequence.ID")
+result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID")
+result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID")
+
+
+cleanup_columns = c("FR1.IMGT.Nb.of.mutations", 
+                    "CDR1.IMGT.Nb.of.mutations", 
+                    "FR2.IMGT.Nb.of.mutations", 
+                    "CDR2.IMGT.Nb.of.mutations", 
+                    "FR3.IMGT.Nb.of.mutations")
+
+for(col in cleanup_columns){
+  result[,col] = gsub("\\(.*\\)", "", result[,col])
+  result[,col] = as.numeric(result[,col])
+  result[is.na(result[,col]),] = 0
+}
+
+result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele)
+result$VGene = gsub("[*].*", "", result$VGene)
+result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
+result$JGene = gsub("[*].*", "", result$JGene)
+
+result$past = paste(result$AA.JUNCTION, result$VGene, result$JGene, (result$FR1.IMGT.Nb.of.mutations + result$CDR1.IMGT.Nb.of.mutations + result$FR2.IMGT.Nb.of.mutations + result$CDR2.IMGT.Nb.of.mutations + result$FR3.IMGT.Nb.of.mutations), result$best_match)
+
+result = result[!duplicated(result$past), ]
+
+result = result[,!(names(result) %in% c("past"))]
+
+write.table(x=result, file=output, sep="\t",quote=F,row.names=F,col.names=T)
+write.table(x=unmatched, file=unmatchedfile, sep="\t",quote=F,row.names=F,col.names=T)
--- a/mutation_analysis.py	Wed Sep 17 07:25:17 2014 -0400
+++ b/mutation_analysis.py	Mon Sep 22 10:19:36 2014 -0400
@@ -2,79 +2,125 @@
 import argparse
 
 parser = argparse.ArgumentParser()
-parser.add_argument("--mutationfile", help="The '7_V-REGION-mutation-and-AA-change-table' file from the IMGT output")
-parser.add_argument("--hotspotfile", help="The '10_V-REGION-mutation-hotspots' file from the IMGT output")
+parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
+parser.add_argument("--genes", help="The genes available in the 'best_match' column")
 parser.add_argument("--output", help="Output file")
 
 args = parser.parse_args()
 
-mutationfile = args.mutationfile #"test_VH-Ca_Cg_25nt/7_V-REGION-mutation-and-AA-change-table_test_VH-Ca_Cg_25nt_241013.txt"
-hotspotsfile = args.hotspotfile #"test_VH-Ca_Cg_25nt/10_V-REGION-mutation-hotspots_test_VH-Ca_Cg_25nt_241013.txt"
-outfile = args.output #"out.txt"
+infile = args.input
+genes = str(args.genes).split(",")
+outfile = args.output 
+
+genedic = dict()
 
 mutationdic = dict()
 mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
 linecount = 0
 
-with open(mutationfile, 'r') as i:
-  for line in i.readlines()[1:]:
+IDIndex = 0
+best_matchIndex = 0
+fr1Index = 0
+cdr1Index = 0
+fr2Index = 0
+cdr2Index = 0
+fr3Index = 0
+first=True
+with open(infile, 'r') as i:
+  for line in i:
+		if first:
+			linesplt = line.split("\t")
+			IDIndex = linesplt.index("Sequence.ID")
+			best_matchIndex = linesplt.index("best_match")
+			fr1Index = linesplt.index("FR1.IMGT")
+			cdr1Index = linesplt.index("CDR1.IMGT")
+			fr2Index = linesplt.index("FR2.IMGT")
+			cdr2Index = linesplt.index("CDR2.IMGT")
+			fr3Index = linesplt.index("FR3.IMGT")
+			first = False
+			continue
 		linecount += 1
 		linesplt = line.split("\t")
-		if linesplt[2] != "productive":
-			continue
-		ID = linesplt[1]
-		mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[5].split("|") if x]
-		mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[6].split("|") if x]
-		mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[7].split("|") if x]
-		mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[8].split("|") if x]
+		ID = linesplt[IDIndex]
+		genedic[ID] = linesplt[best_matchIndex]
+		mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x]
+		mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x]
+		mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x]
+		mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]
 		mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
-		mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[9].split("|") if x]
+		mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]
     
 if linecount == 0:
 	print "No data, exiting"
 	with open(outfile, 'w') as o:
-		o.write("RGYW (%),0,0,NA\n")
-		o.write("WRCY (%),0,0,NA\n")
-		o.write("WA (%),0,0,NA\n")
-		o.write("TW (%),0,0,NA\n")
+		o.write("RGYW (%)," + ("0,0,0\n" * len(genes)))
+		o.write("WRCY (%)," + ("0,0,0\n" * len(genes)))
+		o.write("WA (%)," + ("0,0,0\n" * len(genes)))
+		o.write("TW (%)," + ("0,0,0\n" * len(genes)))
 	import sys
 	sys.exit()
 
 hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)")
-RGYWCount = 0
-WRCYCount = 0
-WACount = 0
-TWCount = 0
+RGYWCount = {g: 0 for g in genes}
+WRCYCount = {g: 0 for g in genes}
+WACount = {g: 0 for g in genes}
+TWCount = {g: 0 for g in genes}
 
-with open(hotspotsfile, 'r') as i:
-	for line in i.readlines()[1:]:
-		linesplt = line.split("\t")
-		if linesplt[2] != "productive":
+IDIndex = 0
+ataIndex = 0
+tatIndex = 0
+aggctatIndex = 0
+atagcctIndex = 0
+first = True
+with open(infile, 'r') as i:
+	for line in i:
+		if first:
+			linesplt = line.split("\t")
+			ataIndex = linesplt.index("X.a.t.a")
+			tatIndex = linesplt.index("t.a.t.")
+			aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.")
+			atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.")
+			first = False
 			continue
-		ID = linesplt[1]
-		RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[6].split("|") if x]]
-		WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[7].split("|") if x]]
-		WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[4].split("|") if x]]
-		TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[5].split("|") if x]]
-		RGYWCount += sum([1 for (x,y,z) in RGYW if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
-		WRCYCount += sum([1 for (x,y,z) in WRCY if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
-		WACount += sum([1 for (x,y,z) in WA if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
-		TWCount += sum([1 for (x,y,z) in TW if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
-
-path = outfile[:outfile.rfind("/") + 1] + "mutations.txt"
-value = 0;
-with open(path, 'r') as f:
-	value = f.readlines()[0].split(",")[1]
-with open(outfile, 'w') as o:
-	if value != "0":
-		o.write("RGYW (%)," + str(RGYWCount) + "," + value + "," + str(round(RGYWCount / float(value) * 100, 1)) + "\n")
-		o.write("WRCY (%)," + str(WRCYCount) + "," + value + "," + str(round(WRCYCount / float(value) * 100, 1)) + "\n")
-		o.write("WA (%)," + str(WACount) + "," + value + "," + str(round(WACount / float(value) * 100, 1)) + "\n")
-		o.write("TW (%)," + str(TWCount) + "," + value + "," + str(round(TWCount / float(value) * 100, 1)) + "\n")
-	else:
-		o.write("RGYW (%)," + str(RGYWCount) + ",0,NA\n")
-		o.write("WRCY (%)," + str(WRCYCount) + ",0,NA\n")
-		o.write("WA (%)," + str(WACount) + ",0,NA\n")
-		o.write("TW (%)," + str(TWCount) + ",0,NA\n")
+		linesplt = line.split("\t")
+		gene = linesplt[best_matchIndex]
+		ID = linesplt[IDIndex]
+		RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
+		WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]]
+		WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
+		TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
+		RGYWCount[ID] = sum([1 for (x,y,z) in RGYW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
+		WRCYCount[ID] = sum([1 for (x,y,z) in WRCY if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
+		WACount[ID] = sum([1 for (x,y,z) in WA if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
+		TWCount[ID] = sum([1 for (x,y,z) in TW if z and z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])])
 
 
+directory = outfile[:outfile.rfind("/") + 1]
+value = 0;
+valuedic = dict()
+for gene in genes:
+	with open(directory + gene + "_value.txt", 'r') as v:
+		valuedic[gene] = int(v.readlines()[0].rstrip())
+with open(directory + "total_value.txt", 'r') as v:
+	valuedic["total"] = int(v.readlines()[0].rstrip())
+
+dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
+arr = ["RGYW", "WRCY", "WA", "TW"]
+with open(outfile, 'w') as o:	
+	for typ in arr:
+		o.write(typ + " (%)")
+		curr = dic[typ]
+		for gene in genes:
+			geneMatcher = re.compile(".*" + gene + ".*")
+			if valuedic[gene] is 0:
+				o.write(",0,0,0")
+			else:
+				x = sum([curr[x] for x in [y for y,z in genedic.iteritems() if geneMatcher.match(z)]])
+				y = valuedic[gene]
+				z = str(round(x / float(valuedic[gene]) * 100, 1))
+				o.write("," + str(x) + "," + str(y) + "," + z)
+		#for total
+		x = sum([y for x,y in curr.iteritems()])
+		y = valuedic["total"]
+		z = str(round(x / float(valuedic["total"]) * 100, 1))
+		o.write("," + str(x) + "," + str(y) + "," + z + "\n")
--- a/mutation_analysis.r	Wed Sep 17 07:25:17 2014 -0400
+++ b/mutation_analysis.r	Mon Sep 22 10:19:36 2014 -0400
@@ -1,38 +1,29 @@
 args <- commandArgs(trailingOnly = TRUE)
 
 input = args[1]
-summaryinput = args[2]
+genes = unlist(strsplit(args[2], ","))
 outputdir = args[3]
 setwd(outputdir)
 
-#dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F)
 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
 
-datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F)
-datSum = datSum[,c("Sequence.ID","J.GENE.and.allele", "AA.JUNCTION")]
-
-dat = merge(dat, datSum, by="Sequence.ID", all.x=T)
-
 if(length(dat$Sequence.ID) == 0){
-	setwd(outputdir)
-	result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
-	row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
-	write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
-	transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
-	row.names(transitionTable) = c("A", "C", "G", "T")
-	transitionTable["A","A"] = NA
-	transitionTable["C","C"] = NA
-	transitionTable["G","G"] = NA
-	transitionTable["T","T"] = NA
-	write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
-	cat("0", file="n.txt")
-	stop("No data")
+  setwd(outputdir)
+  result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
+  row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
+  write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
+  transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
+  row.names(transitionTable) = c("A", "C", "G", "T")
+  transitionTable["A","A"] = NA
+  transitionTable["C","C"] = NA
+  transitionTable["G","G"] = NA
+  transitionTable["T","T"] = NA
+  write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
+  cat("0", file="n.txt")
+  stop("No data")
 }
 
-#print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left"))
 
-result = data.frame(x = 1:5, y = 1:5, z = 1:5)
-row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
 
 cleanup_columns = c("FR1.IMGT.c.a",
                     "FR2.IMGT.g.t",
@@ -111,32 +102,19 @@
   dat[is.na(dat[,col]),] = 0
 }
 
-dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
-dat$VGene = gsub("[*].*", "", dat$VGene)
-dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
-dat$JGene = gsub("[*].*", "", dat$JGene)
-
-dat$past = paste(dat$AA.JUNCTION, dat$VGene, dat$JGene, (dat$FR1.IMGT.Nb.of.mutations + dat$CDR1.IMGT.Nb.of.mutations + dat$FR2.IMGT.Nb.of.mutations + dat$CDR2.IMGT.Nb.of.mutations + dat$FR3.IMGT.Nb.of.mutations))
-
-dat = dat[!duplicated(dat$past), ]
-
-VRegionMutations =  sum(dat$FR1.IMGT.Nb.of.mutations + 
+dat$VRegionMutations =  dat$FR1.IMGT.Nb.of.mutations + 
                         dat$CDR1.IMGT.Nb.of.mutations + 
                         dat$FR2.IMGT.Nb.of.mutations + 
                         dat$CDR2.IMGT.Nb.of.mutations +
-                        dat$FR3.IMGT.Nb.of.mutations)
+                        dat$FR3.IMGT.Nb.of.mutations
 
-VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides +
+dat$VRegionNucleotides =  dat$FR1.IMGT.Nb.of.nucleotides +
                           dat$CDR1.IMGT.Nb.of.nucleotides +
                           dat$FR2.IMGT.Nb.of.nucleotides +
                           dat$CDR2.IMGT.Nb.of.nucleotides +
-                          dat$FR3.IMGT.Nb.of.nucleotides)
+                          dat$FR3.IMGT.Nb.of.nucleotides
 
-result[1,"x"] = VRegionMutations
-result[1,"y"] = VRegionNucleotides
-result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1)
-
-transitionMutations = sum(dat$FR1.IMGT.a.g +
+dat$transitionMutations = dat$FR1.IMGT.a.g +
                           dat$FR1.IMGT.g.a +
                           dat$FR1.IMGT.c.t +
                           dat$FR1.IMGT.t.c +
@@ -155,58 +133,51 @@
                           dat$FR3.IMGT.a.g +
                           dat$FR3.IMGT.g.a +
                           dat$FR3.IMGT.c.t +
-                          dat$FR3.IMGT.t.c)
-
-result[2,"x"] = transitionMutations
-result[2,"y"] = VRegionMutations
-result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1)
+                          dat$FR3.IMGT.t.c
 
-transversionMutations = sum(  dat$FR1.IMGT.a.c +
-                              dat$FR1.IMGT.c.a +
-                              dat$FR1.IMGT.a.t +
-                              dat$FR1.IMGT.t.a +
-                              dat$FR1.IMGT.g.c +
-                              dat$FR1.IMGT.c.g +
-                              dat$FR1.IMGT.g.t +
-                              dat$FR1.IMGT.t.g +
-                              dat$CDR1.IMGT.a.c +
-                              dat$CDR1.IMGT.c.a +
-                              dat$CDR1.IMGT.a.t +
-                              dat$CDR1.IMGT.t.a +
-                              dat$CDR1.IMGT.g.c +
-                              dat$CDR1.IMGT.c.g +
-                              dat$CDR1.IMGT.g.t +
-                              dat$CDR1.IMGT.t.g +
-                              dat$FR2.IMGT.a.c +
-                              dat$FR2.IMGT.c.a +
-                              dat$FR2.IMGT.a.t +
-                              dat$FR2.IMGT.t.a +
-                              dat$FR2.IMGT.g.c +
-                              dat$FR2.IMGT.c.g +
-                              dat$FR2.IMGT.g.t +
-                              dat$FR2.IMGT.t.g +
-                              dat$CDR2.IMGT.a.c +
-                              dat$CDR2.IMGT.c.a +
-                              dat$CDR2.IMGT.a.t +
-                              dat$CDR2.IMGT.t.a +
-                              dat$CDR2.IMGT.g.c +
-                              dat$CDR2.IMGT.c.g +
-                              dat$CDR2.IMGT.g.t +
-                              dat$CDR2.IMGT.t.g +
-                              dat$FR3.IMGT.a.c +
-                              dat$FR3.IMGT.c.a +
-                              dat$FR3.IMGT.a.t +
-                              dat$FR3.IMGT.t.a +
-                              dat$FR3.IMGT.g.c +
-                              dat$FR3.IMGT.c.g +
-                              dat$FR3.IMGT.g.t +
-                              dat$FR3.IMGT.t.g)
+dat$transversionMutations = dat$FR1.IMGT.a.c +
+                            dat$FR1.IMGT.c.a +
+                            dat$FR1.IMGT.a.t +
+                            dat$FR1.IMGT.t.a +
+                            dat$FR1.IMGT.g.c +
+                            dat$FR1.IMGT.c.g +
+                            dat$FR1.IMGT.g.t +
+                            dat$FR1.IMGT.t.g +
+                            dat$CDR1.IMGT.a.c +
+                            dat$CDR1.IMGT.c.a +
+                            dat$CDR1.IMGT.a.t +
+                            dat$CDR1.IMGT.t.a +
+                            dat$CDR1.IMGT.g.c +
+                            dat$CDR1.IMGT.c.g +
+                            dat$CDR1.IMGT.g.t +
+                            dat$CDR1.IMGT.t.g +
+                            dat$FR2.IMGT.a.c +
+                            dat$FR2.IMGT.c.a +
+                            dat$FR2.IMGT.a.t +
+                            dat$FR2.IMGT.t.a +
+                            dat$FR2.IMGT.g.c +
+                            dat$FR2.IMGT.c.g +
+                            dat$FR2.IMGT.g.t +
+                            dat$FR2.IMGT.t.g +
+                            dat$CDR2.IMGT.a.c +
+                            dat$CDR2.IMGT.c.a +
+                            dat$CDR2.IMGT.a.t +
+                            dat$CDR2.IMGT.t.a +
+                            dat$CDR2.IMGT.g.c +
+                            dat$CDR2.IMGT.c.g +
+                            dat$CDR2.IMGT.g.t +
+                            dat$CDR2.IMGT.t.g +
+                            dat$FR3.IMGT.a.c +
+                            dat$FR3.IMGT.c.a +
+                            dat$FR3.IMGT.a.t +
+                            dat$FR3.IMGT.t.a +
+                            dat$FR3.IMGT.g.c +
+                            dat$FR3.IMGT.c.g +
+                            dat$FR3.IMGT.g.t +
+                            dat$FR3.IMGT.t.g
 
-result[3,"x"] = transversionMutations
-result[3,"y"] = VRegionMutations
-result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1)
 
-transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a +
+dat$transitionMutationsAtGC = dat$FR1.IMGT.g.a +
                               dat$FR1.IMGT.c.t +
                               dat$CDR1.IMGT.g.a +
                               dat$CDR1.IMGT.c.t +
@@ -215,9 +186,9 @@
                               dat$CDR2.IMGT.g.a +
                               dat$CDR2.IMGT.c.t +
                               dat$FR3.IMGT.g.a +
-                              dat$FR3.IMGT.c.t)
+                              dat$FR3.IMGT.c.t
 
-totalMutationsAtGC = sum(dat$FR1.IMGT.g.a +
+dat$totalMutationsAtGC = dat$FR1.IMGT.g.a +
                          dat$FR1.IMGT.c.t +
                          dat$FR1.IMGT.c.a +
                          dat$FR1.IMGT.g.c +
@@ -246,18 +217,99 @@
                          dat$FR3.IMGT.c.a +
                          dat$FR3.IMGT.g.c +
                          dat$FR3.IMGT.c.g +
-                         dat$FR3.IMGT.g.t)
+                         dat$FR3.IMGT.g.t
 
-result[4,"x"] = transitionMutationsAtGC
-result[4,"y"] = totalMutationsAtGC
-result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1)
-
-result[5,"x"] = totalMutationsAtGC
-result[5,"y"] = VRegionMutations
-result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1)
 
 
-#transitiontable
+setwd(outputdir)
+
+matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=5)
+for(i in 1:length(genes)){
+  gene = genes[i]
+  tmp = dat[grepl(paste(".*", gene, ".*", sep=""), dat$best_match),]
+  if(gene == "."){
+    tmp = dat
+  }
+  if(length(tmp) == 0){
+		cat("0", file=paste(gene, "_value.txt" ,sep=""))
+    next
+  }
+  j = i - 1
+  x = (j * 3) + 1
+  y = (j * 3) + 2
+  z = (j * 3) + 3
+  matrx[1,x] = sum(tmp$VRegionMutations)
+  matrx[1,y] = sum(tmp$VRegionNucleotides)
+  matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1)
+  matrx[2,x] = sum(tmp$transitionMutations)
+  matrx[2,y] = sum(tmp$VRegionMutations)
+  matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
+  matrx[3,x] = sum(tmp$transversionMutations)
+  matrx[3,y] = sum(tmp$VRegionMutations)
+  matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
+  matrx[4,x] = sum(tmp$transitionMutationsAtGC)
+  matrx[4,y] = sum(tmp$totalMutationsAtGC)
+  matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
+  matrx[5,x] = sum(tmp$totalMutationsAtGC)
+  matrx[5,y] = sum(tmp$VRegionMutations)
+  matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
+  
+  transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
+  row.names(transitionTable) = c("A", "C", "G", "T")
+  transitionTable["A","A"] = NA
+  transitionTable["C","C"] = NA
+  transitionTable["G","G"] = NA
+  transitionTable["T","T"] = NA
+  nts = c("a", "c", "g", "t")
+  
+  
+  for(nt1 in nts){
+    for(nt2 in nts){
+      if(nt1 == nt2){
+        next
+      }
+      NT1 = LETTERS[letters == nt1]
+      NT2 = LETTERS[letters == nt2]
+      FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
+      CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
+      FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
+      CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
+      FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
+      transitionTable[NT1,NT2] = sum( tmp[,FR1] + 
+                                        tmp[,CDR1] +
+                                        tmp[,FR2] +
+                                        tmp[,CDR2] +
+                                        tmp[,FR3])
+    }
+  }
+  write.table(x=transitionTable, file=paste("transitions_", gene ,".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA)
+  write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", gene ,".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+  
+  cat(matrx[1,x], file=paste(gene, "_value.txt" ,sep=""))
+  cat(length(tmp$Sequence.ID), file=paste(gene, "_n.txt" ,sep=""))
+}
+
+#again for all of the data
+tmp = dat
+j = i
+x = (j * 3) + 1
+y = (j * 3) + 2
+z = (j * 3) + 3
+matrx[1,x] = sum(tmp$VRegionMutations)
+matrx[1,y] = sum(tmp$VRegionNucleotides)
+matrx[1,z] = round(matrx[1,x] / matrx[1,y] * 100, digits=1)
+matrx[2,x] = sum(tmp$transitionMutations)
+matrx[2,y] = sum(tmp$VRegionMutations)
+matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
+matrx[3,x] = sum(tmp$transversionMutations)
+matrx[3,y] = sum(tmp$VRegionMutations)
+matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1)
+matrx[4,x] = sum(tmp$transitionMutationsAtGC)
+matrx[4,y] = sum(tmp$totalMutationsAtGC)
+matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1)
+matrx[5,x] = sum(tmp$totalMutationsAtGC)
+matrx[5,y] = sum(tmp$VRegionMutations)
+matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1)
 
 transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4)
 row.names(transitionTable) = c("A", "C", "G", "T")
@@ -269,40 +321,88 @@
 
 
 for(nt1 in nts){
-  for(nt2 in nts){
-    if(nt1 == nt2){
-      next
-    }
-    NT1 = LETTERS[letters == nt1]
-    NT2 = LETTERS[letters == nt2]
-    FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
-    CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
-    FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
-    CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
-    FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
-    transitionTable[NT1,NT2] = sum( dat[,FR1] + 
-                                    dat[,CDR1] +
-                                    dat[,FR2] +
-                                    dat[,CDR2] +
-                                    dat[,FR3])
-  }
+	for(nt2 in nts){
+		if(nt1 == nt2){
+			next
+		}
+		NT1 = LETTERS[letters == nt1]
+		NT2 = LETTERS[letters == nt2]
+		FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
+		CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
+		FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
+		CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
+		FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
+		transitionTable[NT1,NT2] = sum( tmp[,FR1] + 
+																			tmp[,CDR1] +
+																			tmp[,FR2] +
+																			tmp[,CDR2] +
+																			tmp[,FR3])
+	}
+}
+write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
+write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file="matched_all.txt", sep="\t",quote=F,row.names=F,col.names=T)
+cat(matrx[1,x], file="total_value.txt")
+cat(length(tmp$Sequence.ID), file="total_n.txt")
+
+
+
+result = data.frame(matrx)
+row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C.G (%)")
+
+write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
+
+
+if (!("ggplot2" %in% rownames(installed.packages()))) {
+	install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
+}
+library(ggplot2)
+
+genesForPlot = gsub("[0-9]", "", dat$best_match)
+genesForPlot = data.frame(table(genesForPlot))
+colnames(genesForPlot) = c("Gene","Freq")
+genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
+
+pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
+pc = pc + geom_bar(width = 1, stat = "identity")
+pc = pc + coord_polar(theta="y")
+pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")"))
+
+png(filename="all.png")
+pc
+dev.off()
+
+
+#blegh
+genesForPlot = dat[grepl("ca", dat$best_match),]$best_match
+if(length(genesForPlot) > 0){
+	genesForPlot = data.frame(table(genesForPlot))
+	colnames(genesForPlot) = c("Gene","Freq")
+	genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
+
+	pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
+	pc = pc + geom_bar(width = 1, stat = "identity")
+	pc = pc + coord_polar(theta="y")
+	pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA", "( n =", sum(genesForPlot$Freq), ")"))
+
+
+	png(filename="ca.png")
+	print(pc)
+	dev.off()
 }
 
-setwd(outputdir)
-write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
-write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
-cat(length(dat$Sequence.ID), file="n.txt")
+genesForPlot = dat[grepl("cg", dat$best_match),]$best_match
+if(length(genesForPlot) > 0){
+	genesForPlot = data.frame(table(genesForPlot))
+	colnames(genesForPlot) = c("Gene","Freq")
+	genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
+
+	pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))
+	pc = pc + geom_bar(width = 1, stat = "identity")
+	pc = pc + coord_polar(theta="y")
+	pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG", "( n =", sum(genesForPlot$Freq), ")"))
 
 
-weblogo = dat[,c("Sequence.ID", "VGene")]
-weblogo$VGene = gsub("\\*.*", "", weblogo$VGene)
-
-rs12 = read.table("HS12RSS.txt", sep="\t", header=TRUE)
-rs23 = read.table("HS23RSS.txt", sep="\t", header=TRUE)
-
-result12 = merge(weblogo, rs12, by.x="VGene", by.y="Gene")
-result23 = merge(weblogo, rs23, by.x="VGene", by.y="Gene")
-
-write.table(x=result12$Sequence, file="weblogo_in_rs12.txt", sep=",",quote=F,row.names=F,col.names=F)
-write.table(x=result23$Sequence, file="weblogo_in_rs23.txt", sep=",",quote=F,row.names=F,col.names=F)
-
+	png(filename="cg.png")
+	print(pc)
+	dev.off()
+}
--- a/piechart.r	Wed Sep 17 07:25:17 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-if (!("ggplot2" %in% rownames(installed.packages()))) {
-	install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") 
-}
-library(ggplot2)
-
-args <- commandArgs(trailingOnly = TRUE)
-
-vls = args[1]
-lbls = args[2]
-name = args[3]
-output = args[4]
-
-vls = as.numeric(unlist(strsplit(vls, ",")))
-lbls = unlist(strsplit(lbls, ","))
-
-pc = ggplot(data.frame(Type=lbls, vls=vls), aes(x = factor(1), y=vls, fill=Type))
-pc = pc + geom_bar(width = 1, stat = "identity")
-pc = pc + coord_polar(theta="y")
-pc = pc + xlab(" ") + ylab(" ") + ggtitle(name)
-
-png(filename=output)
-#pie(vls, labels=lbls, col=rainbow(length(vls)), main=name)
-pc
-dev.off()
--- a/seqlogo	Wed Sep 17 07:25:17 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,356 +0,0 @@
-#!/usr/bin/perl -w
-
-=head1 NAME
-
-  seqlogo - runs the logo creation script 
-
-=head1 SYNOPSIS
-
-  seqlogo [OPTION]...-f [FILENAME]
-
-=head1 DESCRIPTION
-
-    Creates a logo for the given input filename.
-
-    Available options:
-    -B <bar bits>              Number of bits in bar (real between 0, 1)
-    -T <tic bits>              Number of bits between tic marks
-    -C <chars per line>        Number of characters per line of logo
-    -d <box shrink factor>     Shrink factor of characters if option c is toggled
-    -E <error bar fraction>    Fraction of error bar to show (real between 0, 1 )
-    -f <input file>            Input filename
-    -F <format>                Format of output (EPS, GIF, PDF, PNG), - for STDOUT
-    -h <logo height>           Height of output logo (real > 0)
-    -k <kind of data>          0 for amino acid, 1 for nucleic acid ; if not
-                                 defined, a 90% certainty method is used to
-                                 determine whether the input data is amino acid or
-                                 nucleic acid
-    -l <sequence lower bound>  Lower bound of sequence (integer)
-    -m <sequence upper bound>  Upper bound of sequence (integer)
-    -o <output file>           Name of output file
-    -s <sequence start>        Sequence start number, defaults to 1 (int)
-    -t <titletext>             Text of title, enclosed in "" if more than one word
-    -w <logo width>            Width of output logo
-    -x <x-axis label>          Label for x-axis
-    -y <y-axis label>          Label for y-axis
-
-    Available toggles (no values associated)
-    -a       Toggle antialiasing
-    -b       Toggle bar ends
-    -c       Toggle color
-    -e       Toggle error bar
-    -M       Toggle small sample correction
-    -O       Toggle outlining of characters
-    -n       Toggle numbering of x-axis
-    -S       Toggle stretching of logos to entire length
-    -X       Toggle boxing of characters
-    -Y       Toggle y-axis
-
-=head1 EXAMPLE
-
-  The following command takes as input "input.fasta" and returns the logo in the
-  form "logo.eps". Antialiasing, bar ends, color, small sample correction,
-  numbering of x-axis, and y-axis labelling are turned on:
-
-    seqlogo -f input.fasta -F EPS -o logo.eps -abcMnY
-
-=cut
-
-use vars qw($PATH);
-
-BEGIN {
-
-    use FindBin qw($Bin);
-    use lib "$Bin";
-
-    $PATH = $Bin;
-
-##    $PATH = "/h/gary/Seqlogo/Code/";
-#    $PATH = "/n/weblogo/home/httpd/weblogo/pub/beta/Seqlogo/Code";
-#    unshift(@INC, $PATH);
-}
-
-use logo;
-use template;
-use Getopt::Std;
-use FileHandle;
-
-
-my $opts;
-$opts =
-  $opt_a || # antialiasing
-  $opt_b || # bar ends (0 or 1)
-  $opt_c || # color
-  $opt_e || # show error bar (0 or 1)
-  $opt_n || # numbering (0 or 1)
-  $opt_M || # small sample correction (0 or 1)
-  $opt_O || # outline (0 or 1)
-  $opt_S || # stretch
-  $opt_X || # box (0 for no box, 1 for box)
-  $opt_Y || # y axis
-
-  $opt_B || # bar bits (real)
-  $opt_T || # tics bits (real)
-  $opt_C || # chars per line
-  $opt_d || # box shrinking factor (<1)
-  $opt_E || # error bar fraction (real)
-  $opt_f || # input filename
-  $opt_F || # format (PNG, EPS, PDF, GIF)
-  $opt_h || # logo height (cm)
-  $opt_k || # 0 = amino acid, 1 = nucleic acid
-  $opt_l || # lower bound of sequence to put in logo
-  $opt_m || # max bound of sequence to put in logo
-  $opt_o || # output file
-  $opt_s || # start number for very beginning of sequence
-  $opt_t || # title text (string)
-  $opt_w || # logo width (cm)
-  $opt_x || # x axis label
-  $opt_y || # y axis label
-$opts;
-
-################################################################################
-#####                              USAGE                                   #####
-################################################################################
-
-sub usage {
-    my $usage = <<END
-
-usage: seqlogo -f <input filename> [OPTIONs with values]
-Creates a logo for the given input filename.
-
-Available options:
-  -B <bar bits>              Number of bits in bar (real # > 0)
-  -T <tic bits>              Number of bits between tic marks
-  -C <chars per line>        Number of characters per line of logo
-  -d <box shrink factor>     Shrink factor of characters if option c is toggled
-  -E <error bar fraction>    Fraction of error bar to show (real # > 0)
-  -f <input filename>        Input filename
-  -F <format>                Format of output (EPS, GIF, PDF, PNG), - for STDOUT
-  -h <logo height>           Height of output logo (real # > 0)
-  -k <kind of data>          0 for amino acid, 1 for nucleic acid
-  -l <sequence lower bound>  Lower bound of sequence (integer)
-  -m <sequence upper bound>  Upper bound of sequence (integer)
-  -o <output file>           Name of output file
-  -s <sequence start>        Sequence start number, defaults to 1 (int)
-  -t <titletext>             Text of title, enclosed in "" if more than one word
-  -w <logo width>            Width of output logo
-  -x <x-axis label>          Label for x-axis
-  -y <y-axis label>          Label for y-axis
-
-Available toggles (no values associated) bOenc
-  -a       Toggle antialiasing
-  -b       Toggle bar ends
-  -c       Toggle color
-  -e       Toggle error bar
-  -M       Toggle small sample correction
-  -O       Toggle outlining of characters
-  -p       Toggle fineprint
-  -n       Toggle numbering of x-axis
-  -S       Toggle stretching of logos to entire length
-  -X       Toggle boxing of characters
-  -Y       Toggle y-axis
-
-END
-    ;
-
-    return $usage;
-}
-
-################################################################################
-#####                         MAIN FUNCTION                                #####
-################################################################################
-
-# arguments : $_[0] : file name
-MAIN: {
-    init();
-
-    # feed data from file to make height data array reference
-    my @input = <INPUTFILE>;
-    close (INPUTFILE);
-    my %heightparams = (
-			smallsampletoggle => $opt_M,
-			input_kind => $opt_k,
-			stretch => $opt_S
-			);
-
-    my ($heightdata_r, $desc_r, $kind, $goodlength, $badline, $validformat) =
-	logo::getHeightData(\@input, \%heightparams);
-
-    # check for errors
-    if ((defined $validformat) && ($validformat == 1)) {
-	die("Error: Invalid input format does not conform to FASTA, " .
-	    "CLUSTAL, or Flat.\n");
-    }
-    if (!$goodlength) {
-	die("Error: Number of characters in each logo line is not " .
-	    "consistent, starting at: ", $badline, "\n");
-    }
-
-    my %input = (
-		 LOGO_HEIGHT => $opt_h,
-		 LOGO_WIDTH => $opt_w,
-		 COLORSCHEME => ($opt_c) ? "DEFAULT" : "BW",
-
-		 LOGOSTART => $opt_l,
-		 LOGOEND => $opt_m,
-		 START_NUM => $opt_s,
-
-		 TITLETEXT => $opt_t,
-		 YAXIS_LABEL => $opt_y,
-		 XAXIS_LABEL => $opt_x,
-
-		 BOXSHRINK => $opt_d,
-		 CHARSPERLINE => $opt_C,
-		 BARBITS => $opt_B,
-		 TICBITS => $opt_T,
-		 RES => "96",
-		 "FORMAT" => (uc $opt_F),
-
-                 # toggles
-		 ANTIALIAS => $opt_a,
-	         ERRBAR => $opt_e,
-		 FINEPRINT => $opt_p,
-		 NUMBERING => $opt_n,
-		 OUTLINE => $opt_O,
-		 SHOWENDS => $opt_b,
-		 SHOWINGBOX => $opt_X,
-                 YAXIS => $opt_Y
-		 );
-
-    template::create_template(\%input, $kind, $desc_r, $heightdata_r, $opt_o, $PATH);
-}
-
-
-################################################################################
-#####                      FUNCTINOS FOR INIT                              #####
-################################################################################
-
-# all ints
-sub isInt {
-    return ($_[0] =~ /^[\+\-]?\d+$/) ? 1 : 0;
-}
-
-# all reals
-sub isReal {
-    return ($_[0] =~ /^[\+\-]?\d*.\d*?$/) ? 1 : 0;
-}
-
-sub isZeroOrOne {
-    return ($_[0] == 0 || $_[0] == 1) ? 1 : 0;
-}
-
-sub init {
-
-#  if (not defined $PATH) {
-#      die ("PATH must be defined\n");
-#  } elsif (not -e $PATH) {
-#      die ("PATH ($PATH) must exist\n");
-#  } elsif (not -d $PATH) {
-#      die ("PATH ($PATH) must be a directory\n");
-#  }
-
-  &getopts('T:B:C:d:E:f:F:h:k:l:m:o:s:t:w:x:y:abcenMOpSXY');
-
-    if (defined $opt_B &&
-	(!isReal($opt_B) || $opt_B < 0) ) {
-	printf("\noption B must be a positive real, but is $opt_B, $!\n");
-	die &usage();
-    }
-    if (defined $opt_d && 
-	( !isReal($opt_d) || $opt_d < 0 || $opt_d > 1) ) {
-        print("\noption d must be a real between 0 and 1, but is $opt_d, $!\n");
-        die &usage();
-    }
-    if (defined $opt_E &&
-	(!isReal($opt_E) || $opt_E < 0 || $opt_E > 1) ) {
-        print("\noption E must be a real between 0 and 1, but is $opt_E, $!\n");
-        die &usage();
-    }
-    if (defined $opt_f) {
-        open (INPUTFILE, "$opt_f") or die "Couldn't open input filename $opt_f: $!\n";
-    } else {
-	print("\ninput file not specified, terminating...\n");
-        die &usage();
-    }
-    if (defined $opt_h && 
-	(!isReal($opt_h) || $opt_h < 0) ) {
-        print("\noption h must be a positive real, but is $opt_h, $!\n");
-        die &usage();
-    }
-    if (defined $opt_w && 
-	(!isReal($opt_w) || $opt_w < 0) ) {
-        print("\noption w must be a positive real, but is $opt_w, $!\n");
-        die &usage();
-    }
-    if (defined $opt_k &&
-	(!isZeroOrOne($opt_k)) ) {
-	print("\noption k must be 0 or 1, but is $opt_k, $!\n");
-	die &usage();
-    }
-
-    #toggles
-    if (!defined $opt_a) {
-	$opt_a = 0;
-    }
-    if (!defined $opt_b) {
-	$opt_b = 0;
-    }
-    if (!defined $opt_c) {
-	$opt_c = 0;
-    }
-    if (!defined $opt_e) {
-	$opt_e = 0;
-    }
-    if (!defined $opt_n) {
-	$opt_n = 0;
-    }
-    if (!defined $opt_M) {
-	$opt_M = 0;
-    }
-    if (!defined $opt_O) {
-	$opt_O = 0;
-    }
-    if (!defined $opt_p) {
-	$opt_p = 0;
-    }
-    if (!defined $opt_S) {
-	$opt_S = 0;
-    }
-    if (!defined $opt_X) {
-        $opt_X = 0;
-    };
-    if (!defined $opt_Y) {
-        $opt_Y = 0;
-    };
-
-    if (!defined $opt_F) {
-	$opt_F = "EPS";  # default to EPS
-    }
-    if (!defined $opt_o) {
-	$opt_o = "-";    # for standard out
-    } else {
-#	$opt_o =~ s/\.\S*$//;         # remove extension if there is one
-	$opt_o .= "." . (lc $opt_F);  # make file name
-    }
-
-    if (defined $opt_C &&
-	(!isInt($opt_C) || $opt_C < 0) ) {
-        printf("\noption C must be a postive integer, but is $opt_C, $!\n");
-        die &usage();
-    }
-
-    if (defined $opt_l && !isInt($opt_l)) {
-	printf("\noption l must be an integer, but is $opt_l, $!\n");
-	die &usage();
-    }
-
-    if (defined $opt_m && !isInt($opt_m)) {
-	printf("\noption m must be an integer, but is $opt_m, $!\n");
-	die &usage();
-    }
-
-    if (defined $opt_s && !isInt($opt_s)) {
-        printf("\noption s must be an integer, but is $opt_s, $!\n");
-        die &usage();
-    }
-}
--- a/template.eps	Wed Sep 17 07:25:17 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,674 +0,0 @@
-%!PS-Adobe-3.0 EPSF-3.0
-%%Title: Sequence Logo : {$TITLE}
-%%Creator: {$CREATOR}
-%%CreationDate: {$CREATIONDATE}
-%%BoundingBox:   0  0  {$LOGOWIDTHPOINTS}  {$LOGOHEIGHTPOINTS} 
-%%Pages: 0
-%%DocumentFonts: 
-%%EndComments
-
-{$DESC}
-
-
-% ---- CONSTANTS ----
-/cmfactor 72 2.54 div def % defines points -> cm conversion
-/cm {cmfactor mul} bind def % defines centimeters
-
-
-% ---- VARIABLES ----
-
-{$COLORDEF}
-
-/logoWidth {$LOGOWIDTH} cm def
-/logoHeight {$LOGOLINEHEIGHT} cm def
-/logoTitle ({$TITLE}) def
-
-/yaxis {$YAXIS} def
-/yaxisLabel ({$YAXIS_LABEL}) def
-/yaxisBits  {$BARBITS} def % bits
-/yaxisTicBits {$TICBITS} def
-
-
-/xaxis {$NUMBERING} def
-/xaxisLabel ({$XAXIS_LABEL}) def
-/showEnds ({$SHOWENDS}) def % d: DNA, p: PROTEIN, -: none
-
-/showFineprint true def
-/fineprint ({$FINEPRINT}) def
-
-/charsPerLine {$CHARSPERLINE} def
-/logoLines {$LOGOLINES} def
-
-/showingBox ({$SHOWINGBOX}) def    %n s f
-/shrinking {$SHRINKBOOLEAN} def
-/shrink  {$SHRINKFACTOR} def
-/outline {$OUTLINE} def
-
-/IbeamFraction  {$ERRORBARFRACTION} def
-/IbeamGray      0.50 def
-/IbeamLineWidth 0.5 def
-
-/fontsize       12 def
-/titleFontsize  14 def
-/smallFontsize   6 def
-
-/defaultColor {$DEFAULT_COLOR} def 
-
-{$COLORDICT}
-
-% Standard DNA/RNA color scheme
-% /colorDict << 
-%   (G)  orange
-%   (T)  red   
-%   (C)  blue  
-%   (A)  green 
-%   (U)  red   
-% >> def
-
-% Standard Amino Acid colors
-%/colorDict << 
-%  (G)  green  
-%  (S)  green  
-%  (T)  green  
-%  (Y)  green  
-%  (C)  green  
-%  (N)  purple 
-%  (Q)  purple 
-%  (K)  blue   
-%  (R)  blue   
-%  (H)  blue   
-%  (D)  red    
-%  (E)  red    
-%  (P)  black  
-%  (A)  black  
-%  (W)  black  
-%  (F)  black  
-%  (L)  black  
-%  (I)  black  
-%  (M)  black  
-%  (V)  black  
-%>> def
-
-
-
-% ---- DERIVED PARAMETERS ----
-
-/leftMargin
-  fontsize 3.5 mul
-
-def 
-
-/bottomMargin
-  fontsize 0.75 mul
-
-  % Add extra room for axis
-  xaxis {fontsize 1.75 mul add } if
-  xaxisLabel () eq {} {fontsize 0.75 mul add} ifelse
-def
-
-
-/topMargin 
-  logoTitle () eq { 10 }{titleFontsize 4 add} ifelse
-def
-
-/rightMargin 
-  %Add extra room if showing ends
-  showEnds (-) eq { fontsize}{fontsize 1.5 mul} ifelse
-def
-
-/yaxisHeight 
-  logoHeight 
-  bottomMargin sub  
-  topMargin sub
-def
-
-/ticWidth fontsize 2 div def
-
-/pointsPerBit yaxisHeight yaxisBits div  def
-
-/isBoxed 
-  showingBox (s) eq
-  showingBox (f) eq or { 
-    true
-  } {
-    false
-  } ifelse
-def
-
-/stackMargin 1 def
-
-% Do not add space aroung characters if characters are boxed
-/charRightMargin 
-  isBoxed { 0.0 } {stackMargin} ifelse
-def
-
-/charTopMargin 
-  isBoxed { 0.0 } {stackMargin} ifelse
-def
-
-/charWidth
-  logoWidth
-  leftMargin sub
-  rightMargin sub
-  charsPerLine div
-  charRightMargin sub
-def
-
-/charWidth4 charWidth 4 div def
-/charWidth2 charWidth 2 div def
-
-/stackWidth 
-  charWidth charRightMargin add
-def
- 
-/numberFontsize 
-  fontsize charWidth lt {fontsize}{charWidth} ifelse
-def
-
-% movements to place 5'/N and 3'/C symbols
-/leftEndDeltaX  fontsize neg         def
-/leftEndDeltaY  fontsize 1.5 mul neg def
-/rightEndDeltaX fontsize 0.25 mul     def
-/rightEndDeltaY leftEndDeltaY        def
-
-% Outline width is proporional to charWidth, 
-% but no less that 1 point
-/outlinewidth 
-  charWidth 32 div dup 1 gt  {}{pop 1} ifelse
-def
-
-
-% ---- PROCEDURES ----
-
-/StartLogo { 
-  % Save state
-  save 
-  gsave 
-
-  % Print Logo Title, top center 
-  gsave 
-    SetTitleFont
-
-    logoWidth 2 div
-    logoTitle
-    stringwidth pop 2 div sub
-    logoHeight logoLines mul  
-    titleFontsize sub
-    moveto
-
-    logoTitle
-    show
-  grestore
-
-  % Print X-axis label, bottom center
-  gsave
-    SetStringFont
-
-    logoWidth 2 div
-    xaxisLabel stringwidth pop 2 div sub
-    fontsize 3 div
-    moveto
-
-    xaxisLabel
-    show
-  grestore
-
-  % Show Fine Print
-  showFineprint {
-    gsave
-      SetSmallFont
-      logoWidth
-        fineprint stringwidth pop sub
-        smallFontsize sub
-          smallFontsize 3 div
-      moveto
-    
-      fineprint show
-    grestore
-  } if
-
-  % Move to lower left corner of last line, first stack
-  leftMargin bottomMargin translate
-
-  % Move above first line ready for StartLine 
-  0 logoLines logoHeight mul translate
-
-  SetLogoFont
-} bind def
-
-/EndLogo { 
-  grestore 
-  showpage 
-  restore 
-} bind def
-
-
-/StartLine{ 
-  % move down to the bottom of the line:
-  0 logoHeight neg translate
-  
-  gsave 
-    yaxis { MakeYaxis } if
-    xaxis { ShowLeftEnd } if
-} bind def
-
-/EndLine{ 
-    xaxis { ShowRightEnd } if
-  grestore 
-} bind def
-
-
-/MakeYaxis {
-  gsave    
-    stackMargin neg 0 translate
-    ShowYaxisBar
-    ShowYaxisLabel
-  grestore
-} bind def
-
-
-/ShowYaxisBar { 
-  gsave  
-    SetStringFont
-
-    /str 10 string def % string to hold number  
-    /smallgap stackMargin 2 div def
-
-    % Draw first tic and bar
-    gsave    
-      ticWidth neg 0 moveto 
-      ticWidth 0 rlineto 
-      0 yaxisHeight rlineto
-      stroke
-    grestore
-
-   
-    % Draw the tics
-    % initial increment limit proc for
-    0 yaxisTicBits yaxisBits abs %cvi
-    {/loopnumber exch def
-
-      % convert the number coming from the loop to a string
-      % and find its width
-      loopnumber 10 str cvrs
-      /stringnumber exch def % string representing the number
-
-      stringnumber stringwidth pop
-      /numberwidth exch def % width of number to show
-
-      /halfnumberheight
-         stringnumber CharBoxHeight 2 div
-      def
-
-      numberwidth % move back width of number
-      neg loopnumber pointsPerBit mul % shift on y axis
-      halfnumberheight sub % down half the digit
-
-      moveto % move back the width of the string
-
-      ticWidth neg smallgap sub % Move back a bit more  
-      0 rmoveto % move back the width of the tic  
-
-      stringnumber show
-      smallgap 0 rmoveto % Make a small gap  
-
-      % now show the tic mark
-      0 halfnumberheight rmoveto % shift up again
-      ticWidth 0 rlineto
-      stroke
-    } for
-  grestore
-} bind def
-
-/ShowYaxisLabel {
-  gsave
-    SetStringFont
-
-    % How far we move left depends on the size of
-    % the tic labels.
-    /str 10 string def % string to hold number  
-    yaxisBits yaxisTicBits div cvi yaxisTicBits mul 
-    str cvs stringwidth pop
-    ticWidth 1.5 mul  add neg  
-
-
-    yaxisHeight
-    yaxisLabel stringwidth pop
-    sub 2 div
-
-    translate
-    90 rotate
-    0 0 moveto
-    yaxisLabel show
-  grestore
-} bind def
-
-
-/StartStack {  % <stackNumber> startstack
-  xaxis {MakeNumber}{pop} ifelse
-  gsave
-} bind def
-
-/EndStack {
-  grestore
-  stackWidth 0 translate
-} bind def
-
-
-% Draw a character whose height is proportional to symbol bits
-/MakeSymbol{ % charbits character MakeSymbol
-  gsave
-    /char exch def
-    /bits exch def
-
-    /bitsHeight 
-       bits pointsPerBit mul 
-    def
-
-    /charHeight 
-       bitsHeight charTopMargin sub
-       dup 
-       0.0 gt {}{pop 0.0} ifelse % if neg replace with zero 
-    def 
- 
-    charHeight 0.0 gt {
-      char SetColor
-      charWidth charHeight char ShowChar
-
-      showingBox (s) eq { % Unfilled box
-        0 0 charWidth charHeight false ShowBox
-      } if
-
-      showingBox (f) eq { % Filled box
-        0 0 charWidth charHeight true ShowBox
-      } if
-
-    } if
-
-  grestore
-
-  0 bitsHeight translate 
-} bind def
-
-
-/ShowChar { % <width> <height> <char> ShowChar
-  gsave
-    /tc exch def    % The character
-    /ysize exch def % the y size of the character
-    /xsize exch def % the x size of the character
-
-    /xmulfactor 1 def 
-    /ymulfactor 1 def
-
-  
-    % if ysize is negative, make everything upside down!
-    ysize 0 lt {
-      % put ysize normal in this orientation
-      /ysize ysize abs def
-      xsize ysize translate
-      180 rotate
-    } if
-
-    shrinking {
-      xsize 1 shrink sub 2 div mul
-        ysize 1 shrink sub 2 div mul translate 
-
-      shrink shrink scale
-    } if
-
-    % Calculate the font scaling factors
-    % Loop twice to catch small correction due to first scaling
-    2 {
-      gsave
-        xmulfactor ymulfactor scale
-      
-        ysize % desired size of character in points
-        tc CharBoxHeight 
-        dup 0.0 ne {
-          div % factor by which to scale up the character
-          /ymulfactor exch def
-        } % end if
-        {pop pop}
-        ifelse
-
-        xsize % desired size of character in points
-        tc CharBoxWidth  
-        dup 0.0 ne {
-          div % factor by which to scale up the character
-          /xmulfactor exch def
-        } % end if
-        {pop pop}
-        ifelse
-      grestore
-    } repeat
-
-    % Adjust horizontal position if the symbol is an I
-    tc (I) eq {
-      charWidth 2 div % half of requested character width
-      tc CharBoxWidth 2 div % half of the actual character
-      sub 0 translate
-      % Avoid x scaling for I 
-      /xmulfactor 1 def 
-    } if
-
-
-    % ---- Finally, draw the character
-  
-    newpath
-    xmulfactor ymulfactor scale
-
-    % Move lower left corner of character to start point
-    tc CharBox pop pop % llx lly : Lower left corner
-    exch neg exch neg
-    moveto
-
-    outline {  % outline characters:
-      outlinewidth setlinewidth
-      tc true charpath
-      gsave 1 setgray fill grestore
-      clip stroke
-    } { % regular characters
-      tc show
-    } ifelse
-
-  grestore
-} bind def
-
-
-/ShowBox { % x1 y1 x2 y2 filled ShowBox
-  gsave
-    /filled exch def 
-    /y2 exch def
-    /x2 exch def
-    /y1 exch def
-    /x1 exch def
-    newpath
-    x1 y1 moveto
-    x2 y1 lineto
-    x2 y2 lineto
-    x1 y2 lineto
-    closepath
-
-    clip
-    
-    filled {
-      fill
-    }{ 
-      0 setgray stroke   
-    } ifelse
-
-  grestore
-} bind def
-
-
-/MakeNumber { % number MakeNumber
-  gsave
-    SetNumberFont
-    stackWidth 0 translate
-    90 rotate % rotate so the number fits
-    dup stringwidth pop % find the length of the number
-    neg % prepare for move
-    stackMargin sub % Move back a bit
-    charWidth (0) CharBoxHeight % height of numbers
-    sub 2 div %
-    moveto % move back to provide space
-    show
-  grestore
-} bind def
-
-
-/Ibeam{ % heightInBits Ibeam
-  gsave
-    % Make an Ibeam of twice the given height in bits
-    /height exch  pointsPerBit mul def 
-    /heightDRAW height IbeamFraction mul def
-
-    IbeamLineWidth setlinewidth
-    IbeamGray setgray 
-
-    charWidth2 height neg translate
-    ShowIbar
-    newpath
-      0 0 moveto
-      0 heightDRAW rlineto
-    stroke
-    newpath
-      0 height moveto
-      0 height rmoveto
-      currentpoint translate
-    ShowIbar
-    newpath
-    0 0 moveto
-    0 heightDRAW neg rlineto
-    currentpoint translate
-    stroke
-  grestore
-} bind def
-
-
-/ShowIbar { % make a horizontal bar
-  gsave
-    newpath
-      charWidth4 neg 0 moveto
-      charWidth4 0 lineto
-    stroke
-  grestore
-} bind def
-
-
-/ShowLeftEnd {
-  gsave
-    SetStringFont
-    leftEndDeltaX leftEndDeltaY moveto
-    showEnds (d) eq {(5) show ShowPrime} if
-    showEnds (p) eq {(N) show} if
-  grestore
-} bind def
-
-
-/ShowRightEnd { 
-  gsave
-    SetStringFont
-    rightEndDeltaX rightEndDeltaY moveto
-    showEnds (d) eq {(3) show ShowPrime} if
-    showEnds (p) eq {(C) show} if
-  grestore
-} bind def
-
-
-/ShowPrime {
-  gsave
-    SetPrimeFont
-    (\242) show 
-  grestore
-} bind def
-
- 
-/SetColor{ % <char> SetColor
-  dup colorDict exch known {
-    colorDict exch get aload pop setrgbcolor
-  } {
-    pop
-    defaultColor aload pop setrgbcolor
-  } ifelse 
-} bind def
-
-% define fonts
-/SetTitleFont {/Times-Bold findfont titleFontsize scalefont setfont} bind def
-/SetLogoFont  {/Helvetica-Narrow-Bold findfont charWidth  scalefont setfont} bind def
-/SetStringFont{/Helvetica-Bold findfont fontsize scalefont setfont} bind def
-/SetPrimeFont {/Symbol findfont fontsize scalefont setfont} bind def
-/SetSmallFont {/Helvetica findfont smallFontsize scalefont setfont} bind def
-
-/SetNumberFont {
-    /Helvetica-Bold findfont 
-    numberFontsize
-    scalefont
-    setfont
-} bind def
-
-%Take a single character and return the bounding box
-/CharBox { % <char> CharBox <lx> <ly> <ux> <uy>
-  gsave
-    newpath
-    0 0 moveto
-    % take the character off the stack and use it here:
-    true charpath 
-    flattenpath 
-    pathbbox % compute bounding box of 1 pt. char => lx ly ux uy
-    % the path is here, but toss it away ...
-  grestore
-} bind def
-
-
-% The height of a characters bounding box
-/CharBoxHeight { % <char> CharBoxHeight <num>
-  CharBox
-  exch pop sub neg exch pop
-} bind def
-
-
-% The width of a characters bounding box
-/CharBoxWidth { % <char> CharBoxHeight <num>
-  CharBox
-  pop exch pop sub neg 
-} bind def
-
-
-% Deprecated names
-/startstack {StartStack} bind  def
-/endstack {EndStack}     bind def
-/makenumber {MakeNumber} bind def
-/numchar { MakeSymbol }  bind def
-
-%%EndProlog
-
-%%Page: 1 1
-StartLogo
-StartLine % line number 1
-
-{$DATA}
-
-EndLine
-EndLogo
-
-%%EOF
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
--- a/template.pm	Wed Sep 17 07:25:17 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,672 +0,0 @@
-=head1 NAME
-
-  template.pm - creates logo output in various formats
-
-=head1 SYNOPSIS
-
-  Perl module 
-
-=head1 DESCRIPTION
-
-  logo.cgi and run.pl collect the logo data. They can then enter
-  template::create_template to create logo output in the following formats:
-
-   * EPS
-   * GIF
-   * PDF
-   * PNG
-
-  
-
-  If the configuration file "logo.conf" exists in the working directory, then
-  it will be parsed for the locations of GhostScript (gs) and convert. The
-  following is an
-  example of the configuration file ("#" at beginning of line indicates
-  comment):
-
-    # Make configuration changes here. Rename this file to logo.conf
-    # gs version 5.5 does not work, 6.5 does
-    # set the PATHS
-    gs=/usr/local/bin/gs
-    convert=/usr/X11R6/bin/convert
-
-=cut
-
-package template;
-use strict;
-
-
-################################################################################
-########                   STATE FOR FILLINGS                         ##########
-################################################################################
-
-#        BARBITS            number of bits in vertical y-axis bar
-#        CHARWIDTH          width of characters in logo
-#        COLORSCHEME        "a" for amino acid;
-#                           "n" for nucleic acid;
-#                           "b" for black (no color scheme)
-#                           color scheme
-#        DATA               string of heights of characters in cm
-#        DESC               description of aligned sequences
-#        FINEPRINT          enable adverts/credits
-#        LOGOLINES          number of lines of logo
-#        LOGOHEIGHT         height of EACH LINE in logo (in cm)
-#        LOGOWIDTH          width of final logo in cm
-#        LOGOHEIGHTPOINTS   height of final logo in points
-#        LOGOWIDTHPOINTS    height of final logo in points
-#        LOGOSTART          logo output will begin at residue LOGOSTART
-#        LOGOEND            logo output will end at residue LOGOEND
-#        ERRBAR             1 to include error bar, 0 to exclude
-#        ERRORBARFRACTION   percent of error bar to show in range [0,1]
-#        KIND               $AA for amino acid, $NA for nucleic acid
-#        NUMBERING          1 to show residue numbers, 0 to exclude
-#        OUTLINE            1 to print characters in outline form;
-#                           0 to print characters in solid form
-#        SHOWENDS           "d" to show 5' and 3' ends;
-#                           "p" to show N and C termini;
-#                           "-" to exclude end markers
-#        SHOWINGBOX         "n" to have No boxes around characters;
-#                           "s" to have boxes around characters, with Shrinking;
-#                           "f" to have Filled boxes around characters
-#        SHRINKBOOLEAN      1 to shrink characters; 0 to exclude shrinking
-#        SHRINKFACTOR       amount to shrink in range from 1(no shrinking) to
-#                           0(full shrinking)
-#        START_NUM          start number for very beginning of sequence
-#        TITLE              title of logo
-#        YAXIS              1 to turn on y axis and its labels
-
-################################################################################
-#####                   VARIABLES AND DEFAULT VALUES                       #####
-################################################################################
-
-my %defaults = (
-		LOGOHEIGHT => 5,
-		LOGOWIDTH => 8,
-
-		YAXIS => "false",
-		TITLE => "",
-		YAXIS_LABEL => "bits",
-		XAXIS_LABEL => "",
-
-		BARENDS => "false",
-		OUTLINE => "false",
-		SHOWINGBOX => "n",
-		NUMBERING => "false",
-        #FINEPRINT => "Created by: G. E. Crooks, G. Hon, J.-M. Chandonia & S. E. Brenner, (2002) <weblogo.berkeley.edu>",
-        FINEPRINT => "weblogo.berkeley.edu",
-
-		ERRORBARFRACTION => "1",
-		ERRBAR => "0",
-		SHRINKFACTOR => "1",
-		START_NUM => "1",
-
-		DEFAULT_COLOR => "black",
-
-		black  => "[0 0 0]",
-		red    =>  "[0.8 0 0]",
-		green  =>  "[0 0.8 0]",
-		blue   =>  "[0 0 0.8]",
-		yellow =>  "[1 0.7 1.0]",
-		purple =>  "[0.8 0 0.8]",
-		orange =>  "[1 0.7 0]"
-		);
-
-my $AA = 0;
-my $NA = 1;
-my $PATH;
-
-################################################################################
-#####                          SOME FUNCTIONS                              #####
-################################################################################
-
-
-sub create_template {
-    my ($input, $kind, $desc_r, $data_r, $outfile, $path) = @_;
-
-    # set path
-    $PATH = $path;
-
-    #Create EPS file
-    my %fillings;
-
-    # put parameters in fillings
-    makeFillings(\%fillings, $input, $kind, $desc_r, $defaults{FINEPRINT});
-
-    # set default data if not filled
-    setDefaults(\%fillings, \%defaults, scalar @$data_r);
-
-    # put color in fillings
-    setColors(\%fillings, \%defaults);
-
-    # put data in fillings
-    setData(\%fillings, $data_r);
-
-    # make eps output
-    my $eps = fillTemplate("$PATH/template.eps", \%fillings);
-    my $format = $input->{FORMAT};
-
-    # convert
-    my ($gsprog, $convertprog) = getProgs();
-
-#    print STDERR "(gsprog, convertprog) = ($gsprog, $convertprog)\n";
-
-    my $width = $fillings{LOGOWIDTHPOINTS};
-    my $height = $fillings{LOGOHEIGHTPOINTS};       # height of entire logo
-    my $res = $input->{RES};
-    my $antialias = (defined $input->{ANTIALIAS} && $input->{ANTIALIAS}) ? "-dTextAlphaBits=4" : "";
-
-    my $r = $width . "x" . $height;
-
-    if( $format eq "EPS" ) {
-	if ($outfile eq "-") { # if standard out
-	    print $eps;
-	} else {
-	    open (WRITEME, ">$outfile");
-	    print WRITEME $eps;
-	    close (WRITEME);
-	}
-
-    } elsif ($format eq "PDF"){
-#	print("outfile = $outfile\n");
-	my $program = "| $gsprog -sOutputFile=$outfile -sDEVICE=pdfwrite -dPDFSETTINGS=/printer -q -r$res -dDEVICEWIDTHPOINTS=$width -dDEVICEHEIGHTPOINTS=$height -dEmbedAllFonts=true $antialias -dSAFER -dBATCH  -dNOPAUSE -_";
-	open(WRITEME, $program);
-	print WRITEME "$eps";
-	close (WRITEME);
-
-    } elsif ( $format eq "PNG" ) {
-	my $program = "| $gsprog -sOutputFile=$outfile -sDEVICE=png16m -q -r$res -dDEVICEWIDTHPOINTS=$width -dDEVICEHEIGHTPOINTS=$height $antialias -dSAFER -dBATCH  -dNOPAUSE -_";
-    #print ("$program");
-	open(WRITEME, $program);
-	print WRITEME "$eps";
-        close (WRITEME);
-
-    } elsif ($format eq "GIF") {
-	# convert to EPS first, then GIF
-        die "Please check logo.conf: convert program does not exist" 
-            if (!defined $convertprog || !(-e $convertprog));
-
-	my $program = "| $gsprog -sOutputFile=- -sDEVICE=png16m -q -r$res -dDEVICEWIDTHPOINTS=$width -dDEVICEHEIGHTPOINTS=$height $antialias -dSAFER -dBATCH  -dNOPAUSE -_";
-
-	if ($outfile eq "-") {
-	    $program .= " | $convertprog png:- gif:-";
-	} else {
-	    $program .= " | $convertprog png:- $outfile";
-	}
-
-	open(WRITEME, $program);
-	print WRITEME "$eps";
-        close (WRITEME);
-    }
-}
-
-#deprecated
-sub c {
-    return create_template( @_);
-}
-
-sub getProgs {
-    my ($gsprog, $convertprog) = ("gs", "convert");     
-
-
-    # No configuration file, then return defaults.
-    return ($gsprog, $convertprog) if (! (-e "$PATH/logo.conf"));
-    open (CONF, "$PATH/logo.conf");
-    
-    while (<CONF>) {
-	next if (/^\#/);  # skip lines beginning with "#"
-	if (m/^gs/i) {    # if looks like gs (case insensitive)
-	    ($gsprog) = ($_ =~ /^\S+\=(.+)$/);
-	}
-	if (m/^convert/i) { # if looks like convert (case insensitive)
-	    ($convertprog) = ($_ =~ /^\S+\=(.+)$/);
-	}
-    }
-
-    # Do these fings exist?
-    my ($gsprogname) = ($gsprog =~ /^(\S+)/);
-
-    die "Please check $PATH/logo.conf: gs program ($gsprogname) does not exist" if (!defined $gsprogname || !(-e $gsprogname));
-    #die "Please check logo.conf: convert program does not exist" if (!defined $convertprog || !(-e $convertprog));
-
-    return ($gsprog, $convertprog);
-}
-
-sub get_eps { 
-    my ($input, $kind, $desc_r, $data_r) = @_;
-    my %fillings;
-
-    # put parameters in fillings
-    makeFillings(\%fillings, $input, $kind, $desc_r);
-
-    # set default data if not filled
-    setDefaults(\%fillings, \%defaults, $#$data_r);
-
-    # put data in fillings
-    setData(\%fillings, $data_r);
-
-    # make output
-    return fillTemplate("$PATH/template.eps", \%fillings);
-}
-
-sub fillTemplate {
-    my ($filename, $fillings) = @_;
-
-    if (not -e $filename) {
-	die "filename ($filename) must exist\n";
-    }
-
-    my $text;
-    local $/; # slurp mode (undef)
-    local *F; # create local filehandle
-    open(F, "< $filename\0") || return;
-    $text = <F>;
-    close(F);
-
-    #replace {$KEYWORDS} with value in %$fillings hash
-    $text =~ s{ \{\$ (.*?) \} }
-    { exists( $fillings->{$1})
-	  ? $fillings->{$1}
-                     : ""
-		     }gsex;
-    return $text;
-}
-
-
-################################################################################
-#####                    FILL THE FILLINGS HERE                            #####
-################################################################################
-
-sub isChecked {
-    return 0 if (!defined $_[0]);
-    return $_[0];
-}
-
-# negative/positive ints
-sub isInt {
-    return ($_[0] =~ /^[-\+]?\d+$/) ? 1 : 0;
-}
-
-sub makeFillings {
-
-#    my ($fillings, $input, $kind, $desc_r, $data_r, $fineprint) = @_;
-    my ($fillings, $input, $kind, $desc_r, $fineprint) = @_;
-    $fillings->{KIND} = $kind;
-    $fillings->{LOGOHEIGHT} = $input->{LOGO_HEIGHT};
-    $fillings->{LOGOWIDTH} = $input->{LOGO_WIDTH};
-    $fillings->{OUTLINE} = (isChecked($input->{OUTLINE})) ? "true" : "false";
-    $fillings->{NUMBERING} = (isChecked($input->{NUMBERING})) ? "true" : "false";
-    $fillings->{FINEPRINT} = (isChecked($input->{FINEPRINT})) ? $fineprint : "";
-
-    $fillings->{LOGOSTART} = $input->{LOGOSTART};
-    $fillings->{LOGOEND} = $input->{LOGOEND};
-    $fillings->{START_NUM} = $input->{START_NUM};
-
-    $fillings->{YAXIS} = (isChecked($input->{YAXIS}) && !isChecked($input->{STRETCH})) ? "true" : "false";
-    
-    
-    
-    $fillings->{TITLE} = $input->{TITLETEXT};
-    $fillings->{YAXIS_LABEL} = $input->{YAXIS_LABEL};
-
-    $fillings->{XAXIS_LABEL} = $input->{XAXIS_LABEL};
-    $fillings->{ERRBAR} = $input->{ERRBAR};
-    $fillings->{SHOWINGBOX} = (isChecked($input->{SHOWINGBOX})) ? "s" : "n";
-    $fillings->{SHRINKBOOLEAN} = ($fillings->{SHOWINGBOX} eq "s") ? "true" : "false";
-    $fillings->{SHRINKFACTOR} = $input->{BOXSHRINK};
-
-    if ((defined $input->{CHARSPERLINE}) && 
-	isInt($input->{CHARSPERLINE}) && 
-	($input->{CHARSPERLINE} > 0)) {
-	$fillings->{CHARSPERLINE} = $input->{CHARSPERLINE};
-    }
-
-    if (defined $input->{BARBITS}) {
-	$fillings->{BARBITS} = $input->{BARBITS};
-    } else {
-	$fillings->{BARBITS} = ($fillings->{KIND} == $AA) ? 4.3 : 2;
-    }
-
-    if (defined $input->{TICBITS}) {
-	$fillings->{TICBITS} = $input->{TICBITS};
-    } else {
-	$fillings->{TICBITS} = 1;
-    }
-
-
-
-
-#    if (isChecked($input->{NOCOLOR})) {
-#        $fillings->{COLORSCHEME} = "b";
-#    } else {
-#        $fillings->{COLORSCHEME} = ($kind == $AA) ? "a" : "n";
-#    }
-
-    #color
-    if (defined $input->{DEFAULT_COLOR}) {
-	$fillings->{DEFAULT_COLOR} = (isHexColor( $input->{DEFAULT_COLOR})) ? "c" . $input->{DEFAULT_COLOR} :
-	                              $input->{DEFAULT_COLOR};
-    }
-
-    if (isChecked($input->{SHOWENDS})) {
-	$fillings->{SHOWENDS} = ($fillings->{KIND} == $AA) ? "p" : "d";
-    } else {
-        $fillings->{SHOWENDS} = "-";
-    }
-
-    $fillings->{DESC} = getDescription($desc_r, $fillings->{KIND});
-
-    $fillings->{ERRORBARFRACTION} = $input->{ERRORBARFRACTION};
-    $fillings->{COLORSCHEME} = $input->{COLORSCHEME};
-    $fillings->{COLORS} = $input->{COLORS};
-}
-
-sub getDescription {
-    my $returnVal = "";
-
-    foreach (@{$_[0]}) {
-        if(defined($_)) {
-            $returnVal .= "% * $_\n";
-        } else {
-            $returnVal .= "% * \n";
-        }
-    }
-
-    if ($_[1] == $AA) {
-        $returnVal .= "% * PROTEIN ALIGNMENT";
-    } else {
-        $returnVal .= "% * NUCLEOTIDE ALIGNMENT";
-    }
-
-    return $returnVal;
-}
-
-
-################################################################################
-#####                        SETTING DEFAULTS                              #####
-################################################################################
-
-sub setDefaults {
-    my ($fillings, $defaults, $numchars) = @_;
-
-    $fillings->{LOGOHEIGHT} = $defaults->{LOGOHEIGHT} if !defined $fillings->{LOGOHEIGHT};
-    $fillings->{LOGOWIDTH} = $defaults->{LOGOWIDTH} if !defined $fillings->{LOGOWIDTH};
-
-    $fillings->{START_NUM} = $defaults->{START_NUM} if !defined $fillings->{START_NUM};    
-    $fillings->{LOGOSTART} = $fillings->{START_NUM} if !defined $fillings->{LOGOSTART};
-    $fillings->{LOGOEND} = $numchars + $fillings->{LOGOSTART} - 1 if !defined $fillings->{LOGOEND};
-
-    $fillings->{YAXIS} = $defaults->{YAXIS} if !defined $fillings->{YAXIS};
-    $fillings->{TITLE} = $defaults->{TITLE} if !defined $fillings->{TITLE} || $fillings->{TITLE} eq "";
-    #$fillings->{YAXIS_LABEL} = $defaults->{YAXIS_LABEL} if !defined $fillings->{YAXIS_LABEL} || $fillings->{YAXIS_LABEL} eq "";
-    $fillings->{YAXIS_LABEL} = $defaults->{YAXIS_LABEL} if !defined $fillings->{YAXIS_LABEL} ;
-    $fillings->{XAXIS_LABEL} = $defaults->{XAXIS_LABEL} if !defined $fillings->{XAXIS_LABEL} || $fillings->{XAXIS_LABEL} eq "";
-
-    $fillings->{BARENDS} = $defaults->{BARENDS} if !defined $fillings->{BARENDS};
-    $fillings->{OUTLINE} = $defaults->{OUTLINE} if !defined $fillings->{OUTLINE};
-    $fillings->{SHOWINGBOX} = $defaults->{SHOWINGBOX} if !defined $fillings->{SHOWINGBOX};
-    $fillings->{NUMBERING} = $defaults->{NUMBERING} if !defined $fillings->{NUMBERING};
-
-    $fillings->{ERRORBARFRACTION} = $defaults->{ERRORBARFRACTION} if !defined $fillings->{ERRORBARFRACTION};
-    $fillings->{SHRINKFACTOR} = $defaults->{SHRINKFACTOR} if !defined $fillings->{SHRINKFACTOR};
-    $fillings->{ERRBAR} = $defaults->{ERRBAR} if !defined $fillings->{ERRBAR};
-
-#    printf("logostart = %d, start num = %d, numchars = $numchars, logoend = %d\n", $fillings->{LOGOSTART}, $fillings->{START_NUM},
-#	   $fillings->{LOGOEND});
-
-    my $givenrange = $fillings->{LOGOEND} - $fillings->{LOGOSTART} + 1;
-    my $possiblerange = $numchars - ($fillings->{LOGOSTART} - $fillings->{START_NUM});
-
-    if (!defined $fillings->{CHARSPERLINE} && ($givenrange > $possiblerange)) {
-	$fillings->{CHARSPERLINE} = $numchars - ($fillings->{LOGOSTART} - $fillings->{START_NUM});
-    } elsif (!defined $fillings->{CHARSPERLINE}) {
-	$fillings->{CHARSPERLINE} = $fillings->{LOGOEND} - $fillings->{LOGOSTART} + 1;
-    }
-
-    $fillings->{DEFAULT_COLOR} = $defaults->{DEFAULT_COLOR} if !defined $fillings->{DEFAULT_COLOR} ||
-	                                                       $fillings->{DEFAULT_COLOR} eq "";
-
-#    printf("chars per line = %s\n",$fillings->{CHARSPERLINE});    
-#    print("givenrange = $givenrange, possiblerange = $possiblerange\n");
-
-    if ($givenrange > $possiblerange) {
-	$fillings->{LOGOLINES} = roundup($possiblerange / $fillings->{CHARSPERLINE});
-    } else {
-	$fillings->{LOGOLINES} = roundup($givenrange / $fillings->{CHARSPERLINE});
-    }
-
-    $fillings->{CHARWIDTH} = ($fillings->{LOGOWIDTH} - 1.5) / $fillings->{CHARSPERLINE};
-
-#    # change height if more than 1 line
-#    $fillings->{LOGOHEIGHTPOINTS} = int($fillings->{LOGOHEIGHT} * (72 / 2.54)) * $fillings->{LOGOLINES};
-
-    # LOGOHEIGHTPOITNS is the height input by the user
-#    $fillings->{LOGOHEIGHTPOINTS} = int($fillings->{LOGOHEIGHT} * (72 / 2.54));                          # user specifies height of entire logo
-    $fillings->{LOGOHEIGHTPOINTS} = int($fillings->{LOGOHEIGHT} * (72 / 2.54)) * $fillings->{LOGOLINES};  # user specifies height of logo line
-    $fillings->{LOGOWIDTHPOINTS} =  int($fillings->{LOGOWIDTH}  * (72 / 2.54));
-
-    # LOGOLINEHEIGHT is the height of each logo line, in cm
-#    $fillings->{LOGOLINEHEIGHT} = $fillings->{LOGOHEIGHT} / $fillings->{LOGOLINES};    # user specifies height of entire logo
-    $fillings->{LOGOLINEHEIGHT} = $fillings->{LOGOHEIGHT};                              # user specifies height of logo line
-}
-
-sub roundup {
-    return ($_[0] - int($_[0]) > 0) ? int($_[0] + 1) : $_[0];
-}
-
-
-################################################################################
-#####                         COLORS                                       #####
-################################################################################
-
-sub getDefaultColors {
-    my ($defaults) = @_;
-    my $returnVal = "";
-    $returnVal .= "/black " . $defaults->{black} . " def\n";
-    $returnVal .= "/red " . $defaults->{red} . " def\n";
-    $returnVal .= "/green " . $defaults->{green} . " def\n";
-    $returnVal .= "/blue " . $defaults->{blue} . " def\n";
-    $returnVal .= "/yellow " . $defaults->{yellow} . " def\n";
-    $returnVal .= "/purple " . $defaults->{purple} . " def\n";
-    $returnVal .= "/orange " . $defaults->{orange} . " def\n";
-
-    return $returnVal;
-}
-
-sub getNAColors {
-    my $returnVal = <<END
-% Standard DNA/RNA color scheme
-/colorDict << 
-   (G)  orange
-   (T)  red   
-   (C)  blue  
-   (A)  green 
-   (U)  red   
-END
-    ;
-
-    return $returnVal;
-}
-    
-sub getAAColors {
-    my $returnVal = <<END
-% Standard Amino Acid colors
-/colorDict << 
-  (G)  green  
-  (S)  green  
-  (T)  green  
-  (Y)  green  
-  (C)  green  
-  (N)  purple 
-  (Q)  purple 
-  (K)  blue   
-  (R)  blue   
-  (H)  blue   
-  (D)  red    
-  (E)  red    
-  (P)  black  
-  (A)  black  
-  (W)  black  
-  (F)  black  
-  (L)  black  
-  (I)  black  
-  (M)  black  
-  (V)  black
-END
-    ;
-
-    return $returnVal;
-}
-
-sub setColors {
-    my ($fillings, $defaults, $input) = @_;
-    my $colordef = getDefaultColors($defaults);
-    my $colordict = "/colorDict <<\n";
-
-    if ($fillings->{COLORSCHEME} eq "DEFAULT") {
-	$colordef = getDefaultColors($defaults);
-
-	if ($fillings->{KIND} eq $AA) {
-	    $colordict = getAAColors();
-	} else {
-	    $colordict = getNAColors();
-	}
-    } elsif ($fillings->{COLORSCHEME} eq "BW") {
-	# do nothing for dict
-    } else {
-
-	my %colorhash = %{ $fillings->{COLORS} };
-	my $colorName = "";
-
-	foreach (keys %colorhash) {  # keys are strings of residues, value = color name or color code (FF0000)
-	    # add color to definitions
-	    $colorName = $colorhash{$_};
-
-#	    print("color = $_\n");
-
-	    addColorDef(\$colordef, $colorName ) if (isHexColor($colorName));
-	    
-	    # add have each residue use the color
-	    foreach (split(//, $_)) {
-		# add color to dictionary
-		if (isHexColor($colorName)) {
-		    $colordict .= "  ($_)  c$colorName\n" if !($_ =~ /^\s*$/);
-		} else {
-		    $colordict .= "  ($_)  $colorName\n" if !($_ =~ /^\s*$/);
-		}
-	    }
-	}
-    }
-
-    $colordict .= "\n>> def";
-
-    # add to fillings
-    $fillings->{COLORDEF} = $colordef;
-    $fillings->{COLORDICT} = $colordict;
-}
-
-sub addColorDef {
-#    print("adding to color def\n");
-    my ($colordef_r, $color) = @_;
-    my $PSColor = getPSColor($color);
-    $$colordef_r .= "/c$color $PSColor def\n";
-}
-
-sub isHexColor {
-    return ($_[0] =~ /^[0-9a-fA-F]+$/) && (length $_[0] == 6);
-}
-
-# know that it is hex color
-sub getPSColor {
-    return "[" . hex(substr($_[0],0,2)) / 255 . "  " .
-          	 hex(substr($_[0],2,2)) / 255 . "  " .
-		 hex(substr($_[0],4,2)) / 255 . "]";
-}
-
-
-################################################################################
-#####                         SETTING DATA FIELD                           #####
-################################################################################
-
-sub setData {
-    my ($fillings, $data_r) = @_;
-
-    my @data = @$data_r;
-    my ($height, $letter);
-    my @slice;
-    my $data;
-    my $start_num = $fillings->{START_NUM};
-
-    my $start = $fillings->{LOGOSTART} - $start_num;    # where in @data to start
-    my $end = $fillings->{LOGOEND} - $start_num;        # where in @data to end
-    my $charsperline = $fillings->{CHARSPERLINE};
-
-    my $numlabel = $fillings->{LOGOSTART};
-
-    $end = ($end >= scalar @data) ? (scalar @data - 1) : $end;
-
-    for (my $i=$start ; $i<=$end ; $i++) {
-
-	# if add new lines
-#	if ((($i - $start) % $charsperline == 0) &&
-#	    ($i != $start) &&    # not first one
-#	    ($i != $end)) {      # not last one
-	if ((($i - $start) % $charsperline == 0) &&
-	    ($i != $start)) {    # not first one
-	    $data .= <<END
-EndLine
-StartLine
-
-END
-    ;
-	}
-
-        @slice = @{$data[$i]};
-        $data .= <<END
-($numlabel) startstack
-END
-    ;
-
-        $numlabel++;
-
-        foreach (@slice) {
-            ($letter,$height) = ($_ =~ /^(.{1})(\S+)/);
-
-	    # is space, so leave
-	    if ($letter eq " ") {
-		last;
-	    }
-	    
-	    # look for ">", which is symbol for error bar, then quit
-	    if ($letter eq ">") {
-		last;
-	    }
-
-#	    # look for negative heights
-#	    if ($height < 0) {
-#		next;
-#	    }
-
-            $letter =  (uc $letter);  # always uppercase
-	    $height = ($height < 0) ? 0 : $height;
-            $data .= " $height ($letter) numchar\n";
-        }
-
-        # put in error bars -- size is in $height as read in before
-        if ($fillings->{ERRBAR} && $letter ne " " && $height != 0) {
-	    $data .= " $height Ibeam\n";
-	}
-
-        $data .= <<END
-endstack
-
-END
-    ;
-
-    }
-
-    $fillings->{DATA} = $data;
-}
-
-################################################################################
-
-1;
--- a/wrapper.sh	Wed Sep 17 07:25:17 2014 -0400
+++ b/wrapper.sh	Mon Sep 22 10:19:36 2014 -0400
@@ -1,4 +1,5 @@
 #!/bin/bash
+set -e
 dir="$(cd "$(dirname "$0")" && pwd)"
 input=$1
 output=$2
@@ -12,108 +13,50 @@
 cat $PWD/files/*/8_* > $PWD/mutationstats.txt
 cat $PWD/files/*/10_* > $PWD/hotspots.txt
 
-cp $dir/HS12RSS.txt $outdir/
-cp $dir/HS23RSS.txt $outdir/
+
+echo "identification"
+python $dir/gene_identification.py --input $PWD/summary.txt --output $PWD/annotatedsummary.txt
+echo "merging"
+Rscript $dir/merge_and_filter.r $PWD/annotatedsummary.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/merged.txt $outdir/unmatched.txt
 
-mkdir $outdir/identification/
-python $dir/gene_identification.py --input $PWD/summary.txt --outdir $outdir/identification/
+genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm"
+echo "R mutation analysis"
+Rscript $dir/mutation_analysis.r $outdir/merged.txt $genes $outdir 2>&1
+echo "python mutation analysis"
+python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --output $outdir/hotspot_analysis.txt
+
+cat $outdir/mutations.txt $outdir/hotspot_analysis.txt > $outdir/result.txt
+
 genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm)
-tmp=$PWD/tmp
-tmp2=$PWD/tmp2
-hotspottmp=$PWD/hotspottmp
-mutationtmp=$PWD/mutationtmp
-touch $outdir/mutationandhotspot.txt
-for gene in ${genes[@]}
-do
-	echo "Running $gene <br />" >> $output
-	mkdir $outdir/$gene
-	cp $dir/HS12RSS.txt $outdir/$gene/
-	cp $dir/HS23RSS.txt $outdir/$gene/
-	echo "Filtering input..." >> $output
-	Rscript $dir/filter.r $PWD/summary.txt $outdir/identification/${gene}.txt $outdir/$gene/summary.txt
-	Rscript $dir/filter.r $PWD/mutationanalysis.txt $outdir/identification/${gene}.txt $outdir/$gene/mutationanalysis.txt
-	Rscript $dir/filter.r $PWD/mutationstats.txt $outdir/identification/${gene}.txt $outdir/$gene/mutationstats.txt
-	Rscript $dir/filter.r $PWD/hotspots.txt $outdir/identification/${gene}.txt $outdir/$gene/hotspots.txt
-	echo "done <br />" >> $output
-	
-	echo "Running R script on $gene..." >> $output
-	Rscript --verbose $dir/mutation_analysis.r $outdir/$gene/mutationstats.txt $outdir/$gene/summary.txt $outdir/$gene/ 2>&1
-	echo "done <br />" >> $output
-	
-	echo "Running Python script..." >> $output
-	python $dir/mutation_analysis.py --mutationfile $outdir/$gene/mutationanalysis.txt --hotspotfile $outdir/$gene/hotspots.txt --output $outdir/$gene/hotspot_analysis.txt
-	echo "done <br />" >> $output
-	echo "Done with $gene <br />" >> $output
-	cut $outdir/$gene/mutations.txt -d, -f2,3,4 > $mutationtmp
-	cut $outdir/$gene/hotspot_analysis.txt -d, -f2,3,4 > $hotspottmp
-	cat $mutationtmp $hotspottmp > $tmp
-	paste $outdir/mutationandhotspot.txt -d, $tmp > $tmp2
-	cat $tmp2 > $outdir/mutationandhotspot.txt
-	rm $outdir/$gene/HS12RSS.txt
-	rm $outdir/$gene/HS23RSS.txt
-done
-
-Rscript --verbose $dir/mutation_analysis.r $PWD/mutationstats.txt $PWD/summary.txt $outdir/ 2>&1
-python $dir/mutation_analysis.py --mutationfile $PWD/mutationanalysis.txt --hotspotfile $PWD/hotspots.txt --output $outdir/hotspot_analysis.txt
-
-cut $outdir/mutations.txt -d, -f2,3,4 > $mutationtmp
-cut $outdir/hotspot_analysis.txt -d, -f2,3,4 > $hotspottmp
-cat $mutationtmp $hotspottmp > $tmp
-paste $outdir/mutationandhotspot.txt -d, $tmp > $tmp2
-cat $tmp2 > $outdir/mutationandhotspot.txt
-
-cut $outdir/ca1/mutations.txt -d, -f1 > $mutationtmp
-cut $outdir/ca1/hotspot_analysis.txt -d, -f1 > $hotspottmp
-cat $mutationtmp $hotspottmp > $tmp
-paste $tmp $outdir/mutationandhotspot.txt -d, > $tmp2
-cat $tmp2 | tr -s "," > $outdir/mutationandhotspot.txt
-
-ca_n=`cat $outdir/ca/n.txt`
-ca1_n=`cat $outdir/ca1/n.txt`
-ca2_n=`cat $outdir/ca2/n.txt`
-cg_n=`cat $outdir/cg/n.txt`
-cg1_n=`cat $outdir/cg1/n.txt`
-cg2_n=`cat $outdir/cg2/n.txt`
-cg3_n=`cat $outdir/cg3/n.txt`
-cg4_n=`cat $outdir/cg4/n.txt`
-cm_n=`cat $outdir/cm/n.txt`
-#all_n=$((ca_n + cg_n + cm_n))
-all_n=`cat $outdir/n.txt`
 
 
 echo "<html><center><h1>$title</h1></center><table border='1'>" > $output
 echo "<tr><th>info</th>" >> $output
-echo "<th><a href='identification/ca.txt'>ca (N = $ca_n)</a></th>" >> $output
-echo "<th><a href='identification/ca1.txt'>ca1 (N = $ca1_n)</a></th>" >> $output
-echo "<th><a href='identification/ca2.txt'>ca2 (N = $ca2_n)</a></th>" >> $output
-echo "<th><a href='identification/cg.txt'>cg (N = $cg_n)</a></th>" >> $output
-echo "<th><a href='identification/cg1.txt'>cg1 (N = $cg1_n)</a></th>" >> $output
-echo "<th><a href='identification/cg2.txt'>cg2 (N = $cg2_n)</a></th>" >> $output
-echo "<th><a href='identification/cg3.txt'>cg3 (N = $cg3_n)</a></th>" >> $output
-echo "<th><a href='identification/cg4.txt'>cg4 (N = $cg4_n)</a></th>" >> $output
-echo "<th><a href='identification/cm.txt'>cm (N = $cm_n)</a></th>" >> $output
-echo "<th>all (N = $all_n)</th></tr>" >> $output
+for gene in ${genes[@]}
+do
+	tmp=`cat $outdir/${gene}_n.txt`
+	echo "<th><a href='matched_${gene}.txt'>${gene} (N = $tmp)</a></th>" >> $output
+done
+tmp=`cat $outdir/total_n.txt`
+echo "<th><a href='matched_all.txt'>all (N = $tmp)</a></th>" >> $output
+
 while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz
 	do
 		echo "<tr><td>$name</td><td>${cax}/${cay} (${caz}%)</td><td>${ca1x}/${ca1y} (${ca1z}%)</td><td>${ca2x}/${ca2y} (${ca2z}%)</td><td>${cgx}/${cgy} (${cgz}%)</td><td>${cg1x}/${cg1y} (${cg1z}%)</td><td>${cg2x}/${cg2y} (${cg2z}%)</td><td>${cg3x}/${cg3y} (${cg3z}%)</td><td>${cg4x}/${cg4y} (${cg4z}%)</td><td>${cmx}/${cmy} (${cmz}%)</td><td>${allx}/${ally} (${allz}%)</td></tr>" >> $output
-done < $outdir/mutationandhotspot.txt
+done < $outdir/result.txt
 echo "</table>" >> $output
-echo "<a href='identification/unmatched.txt'>umatched</a><br />" >> $output
-
-Rscript $dir/piechart.r "${ca_n},${cg_n},${cm_n}" "IgA - ${ca_n},IgG - ${cg_n},IgM? - ${cm_n}" "Ig* (N = $all_n)" $outdir/all.png 2>&1
-Rscript $dir/piechart.r "${ca1_n},${ca2_n}" "IgA1 - ${ca1_n},IgA2 - ${ca2_n}" "IgA (N = $ca_n)" $outdir/ca.png 2>&1
-Rscript $dir/piechart.r "${cg1_n},${cg2_n},${cg3_n},${cg4_n}" "IgG1 - ${cg1_n},IgG2 - ${cg2_n},IgG3 - ${cg3_n},IgG4 - ${cg4_n}" "IgG (N = $cg_n)" $outdir/cg.png 2>&1
-
-$dir/seqlogo -t "HS12RSS" -w 20 -h 5 -p -a -c -n -F PNG -f $outdir/weblogo_in_rs12.txt > $outdir/HS12.png 2>&1
-$dir/seqlogo -t "HS23RSS" -w 20 -h 5 -p -a -c -n -F PNG -f $outdir/weblogo_in_rs23.txt > $outdir/HS23.png 2>&1
+echo "<a href='unmatched.txt'>unmatched</a><br />" >> $output
 
 
-echo "<img src='all.png'/>" >> $output
-echo "<img src='ca.png'/>" >> $output
-echo "<img src='cg.png'/>" >> $output
-
-echo "<img src='HS12.png'/>" >> $output
-echo "<img src='HS23.png'/>" >> $output
+echo "<img src='all.png'/><br />" >> $output
+if [ -a $outdir/ca.png ] 
+then
+	echo "<img src='ca.png'/><br />" >> $output
+fi
+if [ -a $outdir/cg.png ]
+then
+	echo "<img src='cg.png'/><br />" >> $output
+fi
 
 for gene in ${genes[@]}
 do
@@ -121,7 +64,7 @@
 	while IFS=, read from a c g t
 		do
 			echo "<tr><td>$from</td><td>$a</td><td>$c</td><td>$g</td><td>$t</td></tr>" >> $output
-	done < $outdir/$gene/transitions.txt
+	done < $outdir/transitions_${gene}.txt
 	echo "</table>" >> $output
 done