Mercurial > repos > davidvanzessen > mutation_analysis
comparison gene_identification.py @ 62:4262e880472d draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Fri, 25 Mar 2016 04:39:18 -0400 |
| parents | 64e6a7803e07 |
| children | 0fdd90f7c654 |
comparison
equal
deleted
inserted
replaced
| 61:64e6a7803e07 | 62:4262e880472d |
|---|---|
| 39 dic[ID] = linesplt[seqIndex] | 39 dic[ID] = linesplt[seqIndex] |
| 40 | 40 |
| 41 print "Number of input sequences:", len(dic) | 41 print "Number of input sequences:", len(dic) |
| 42 | 42 |
| 43 #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc | 43 #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc |
| | 44 #old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag |
| 44 | 45 |
| 45 #lambda/kappa reference sequence | 46 #lambda/kappa reference sequence |
| 46 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg", | 47 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg", |
| 47 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag", | 48 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc", |
| 48 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence | 49 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence |
| 49 | 50 |
| 50 compiledregex = {"ca": [], | 51 compiledregex = {"ca": [], |
| 51 "cg": [], | 52 "cg": [], |
| 52 "cm": []} | 53 "cm": []} |
| 56 ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'} | 57 ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'} |
| 57 cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} | 58 cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} |
| 58 cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'} | 59 cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'} |
| 59 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} | 60 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} |
| 60 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'} | 61 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'} |
| | 62 |
| | 63 #remove last snp for shorter cg sequence --- note, also change varsInCG |
| | 64 del cg1[132] |
| | 65 del cg2[132] |
| | 66 del cg3[132] |
| | 67 del cg4[132] |
| 61 | 68 |
| 62 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap | 69 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap |
| 63 chunklength = 8 | 70 chunklength = 8 |
| 64 | 71 |
| 65 #create the chunks of the reference sequence with regular expressions for the variable nucleotides | 72 #create the chunks of the reference sequence with regular expressions for the variable nucleotides |
| 150 chunksInCA = len(compiledregex["ca"]) | 157 chunksInCA = len(compiledregex["ca"]) |
| 151 chunksInCG = len(compiledregex["cg"]) | 158 chunksInCG = len(compiledregex["cg"]) |
| 152 chunksInCM = len(compiledregex["cm"]) | 159 chunksInCM = len(compiledregex["cm"]) |
| 153 requiredChunkPercentage = 0.7 | 160 requiredChunkPercentage = 0.7 |
| 154 varsInCA = float(len(ca1.keys()) * 2) | 161 varsInCA = float(len(ca1.keys()) * 2) |
| 155 varsInCG = float(len(cg1.keys()) * 2) + 1 | 162 varsInCG = float(len(cg1.keys()) * 2) - 1 # -1 because the sliding window doesn't hit the first nt twice |
| 156 varsInCM = 0 | 163 varsInCM = 0 |
| 157 requiredVarPercentage = 0.7 | 164 requiredVarPercentage = 0.7 |
| 158 | 165 |
| 159 | 166 |
| 160 first = True | 167 first = True |
