comparison gene_identification.py @ 62:4262e880472d draft

Uploaded
author davidvanzessen
date Fri, 25 Mar 2016 04:39:18 -0400
parents 64e6a7803e07
children 0fdd90f7c654
comparison
equal deleted inserted replaced
61:64e6a7803e07 62:4262e880472d
39 dic[ID] = linesplt[seqIndex] 39 dic[ID] = linesplt[seqIndex]
40 40
41 print "Number of input sequences:", len(dic) 41 print "Number of input sequences:", len(dic)
42 42
43 #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc 43 #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc
44 #old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag
44 45
45 #lambda/kappa reference sequence 46 #lambda/kappa reference sequence
46 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg", 47 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",
47 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag", 48 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",
48 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence 49 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence
49 50
50 compiledregex = {"ca": [], 51 compiledregex = {"ca": [],
51 "cg": [], 52 "cg": [],
52 "cm": []} 53 "cm": []}
56 ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'} 57 ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'}
57 cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} 58 cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
58 cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'} 59 cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'}
59 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} 60 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
60 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'} 61 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'}
62
63 #remove the last SNP (position 132) from each cg variant, since that position lies beyond the end of the shorter cg sequence --- note, also change varsInCG
64 del cg1[132]
65 del cg2[132]
66 del cg3[132]
67 del cg4[132]
61 68
62 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap 69 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap
63 chunklength = 8 70 chunklength = 8
64 71
65 #create the chunks of the reference sequence with regular expressions for the variable nucleotides 72 #create the chunks of the reference sequence with regular expressions for the variable nucleotides
150 chunksInCA = len(compiledregex["ca"]) 157 chunksInCA = len(compiledregex["ca"])
151 chunksInCG = len(compiledregex["cg"]) 158 chunksInCG = len(compiledregex["cg"])
152 chunksInCM = len(compiledregex["cm"]) 159 chunksInCM = len(compiledregex["cm"])
153 requiredChunkPercentage = 0.7 160 requiredChunkPercentage = 0.7
154 varsInCA = float(len(ca1.keys()) * 2) 161 varsInCA = float(len(ca1.keys()) * 2)
155 varsInCG = float(len(cg1.keys()) * 2) + 1 162 varsInCG = float(len(cg1.keys()) * 2) - 1 # -1 because the sliding window doesn't hit the first nt twice
156 varsInCM = 0 163 varsInCM = 0
157 requiredVarPercentage = 0.7 164 requiredVarPercentage = 0.7
158 165
159 166
160 first = True 167 first = True