Mercurial > repos > davidvanzessen > mutation_analysis
diff gene_identification.py @ 62:4262e880472d draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 25 Mar 2016 04:39:18 -0400 |
parents | 64e6a7803e07 |
children | 0fdd90f7c654 |
line wrap: on
line diff
--- a/gene_identification.py Fri Mar 18 08:17:08 2016 -0400
+++ b/gene_identification.py Fri Mar 25 04:39:18 2016 -0400
@@ -41,10 +41,11 @@
 print "Number of input sequences:", len(dic)
 
 #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc
+#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag
 
 #lambda/kappa reference sequence
 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",
-                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag",
+                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",
                  "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence
 
 compiledregex = {"ca": [],
@@ -59,6 +60,12 @@
 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'}
 
+#remove last snp for shorter cg sequence --- note, also change varsInCG
+del cg1[132]
+del cg2[132]
+del cg3[132]
+del cg4[132]
+
 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap
 chunklength = 8
 
@@ -152,7 +159,7 @@
 chunksInCM = len(compiledregex["cm"])
 requiredChunkPercentage = 0.7
 varsInCA = float(len(ca1.keys()) * 2)
-varsInCG = float(len(cg1.keys()) * 2) + 1
+varsInCG = float(len(cg1.keys()) * 2) - 1 # -1 because the sliding window doesn't hit the first nt twice
 varsInCM = 0
 requiredVarPercentage = 0.7
 