Mercurial > repos > davidvanzessen > mutation_analysis
comparison gene_identification.py @ 62:4262e880472d draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 25 Mar 2016 04:39:18 -0400 |
parents | 64e6a7803e07 |
children | 0fdd90f7c654 |
comparison
legend: equal | deleted | inserted | replaced
61:64e6a7803e07 | 62:4262e880472d |
---|---|
39 dic[ID] = linesplt[seqIndex] | 39 dic[ID] = linesplt[seqIndex] |
40 | 40 |
41 print "Number of input sequences:", len(dic) | 41 print "Number of input sequences:", len(dic) |
42 | 42 |
43 #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc | 43 #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc |
44 #old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag | |
44 | 45 |
45 #lambda/kappa reference sequence | 46 #lambda/kappa reference sequence |
46 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg", | 47 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg", |
47 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag", | 48 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc", |
48 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence | 49 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence |
49 | 50 |
50 compiledregex = {"ca": [], | 51 compiledregex = {"ca": [], |
51 "cg": [], | 52 "cg": [], |
52 "cm": []} | 53 "cm": []} |
56 ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'} | 57 ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'} |
57 cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} | 58 cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} |
58 cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'} | 59 cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'} |
59 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} | 60 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} |
60 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'} | 61 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'} |
62 | |
63 #remove last snp for shorter cg sequence --- note, also change varsInCG | |
64 del cg1[132] | |
65 del cg2[132] | |
66 del cg3[132] | |
67 del cg4[132] | |
61 | 68 |
62 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap | 69 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap |
63 chunklength = 8 | 70 chunklength = 8 |
64 | 71 |
65 #create the chunks of the reference sequence with regular expressions for the variable nucleotides | 72 #create the chunks of the reference sequence with regular expressions for the variable nucleotides |
150 chunksInCA = len(compiledregex["ca"]) | 157 chunksInCA = len(compiledregex["ca"]) |
151 chunksInCG = len(compiledregex["cg"]) | 158 chunksInCG = len(compiledregex["cg"]) |
152 chunksInCM = len(compiledregex["cm"]) | 159 chunksInCM = len(compiledregex["cm"]) |
153 requiredChunkPercentage = 0.7 | 160 requiredChunkPercentage = 0.7 |
154 varsInCA = float(len(ca1.keys()) * 2) | 161 varsInCA = float(len(ca1.keys()) * 2) |
155 varsInCG = float(len(cg1.keys()) * 2) + 1 | 162 varsInCG = float(len(cg1.keys()) * 2) - 1 # -1 because the sliding window doesn't hit the first nt twice |
156 varsInCM = 0 | 163 varsInCM = 0 |
157 requiredVarPercentage = 0.7 | 164 requiredVarPercentage = 0.7 |
158 | 165 |
159 | 166 |
160 first = True | 167 first = True |