diff gene_identification.py @ 62:4262e880472d draft

Uploaded
author davidvanzessen
date Fri, 25 Mar 2016 04:39:18 -0400
parents 64e6a7803e07
children 0fdd90f7c654
line wrap: on
line diff
--- a/gene_identification.py	Fri Mar 18 08:17:08 2016 -0400
+++ b/gene_identification.py	Fri Mar 25 04:39:18 2016 -0400
@@ -41,10 +41,11 @@
 print "Number of input sequences:", len(dic)
 
 #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc
+#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag
 
 #lambda/kappa reference sequence
 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",
-                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag",
+                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",
                  "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence
 
 compiledregex = {"ca": [],
@@ -59,6 +60,12 @@
 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'}
 
+#remove the last SNP (position 132) from every cg subclass: that position no longer exists in the shorter cg sequence --- note: varsInCG must be changed to match
+del cg1[132]
+del cg2[132]
+del cg3[132]
+del cg4[132]
+
 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap
 chunklength = 8
 
@@ -152,7 +159,7 @@
 chunksInCM = len(compiledregex["cm"])
 requiredChunkPercentage = 0.7
 varsInCA = float(len(ca1.keys()) * 2)
-varsInCG = float(len(cg1.keys()) * 2) + 1
+varsInCG = float(len(cg1.keys()) * 2) - 1 # -1 because the sliding window covers the first nucleotide (position 0) only once instead of twice
 varsInCM = 0
 requiredVarPercentage = 0.7