mutation_analysis: gene_identification.py comparison

comparison gene_identification.py @ 62:4262e880472d draft

Uploaded

author	davidvanzessen
date	Fri, 25 Mar 2016 04:39:18 -0400
parents	64e6a7803e07
children	0fdd90f7c654

comparison

Legend: equal | deleted | inserted | replaced

-:64e6a7803e07
+:4262e880472d
 			dic[ID] = linesplt[seqIndex]
 print "Number of input sequences:", len(dic)
 #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc
+#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag
 #lambda/kappa reference sequence
 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",
-"cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag",
+"cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",
 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence
 compiledregex = {"ca": [],
 "cg": [],
 "cm": []}
 ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'}
 cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
 cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'}
 cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
 cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'}
+#remove last snp for shorter cg sequence --- note, also change varsInCG
+del cg1[132]
+del cg2[132]
+del cg3[132]
+del cg4[132]
 #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap
 chunklength = 8
 #create the chunks of the reference sequence with regular expressions for the variable nucleotides
 chunksInCA = len(compiledregex["ca"])
 chunksInCG = len(compiledregex["cg"])
 chunksInCM = len(compiledregex["cm"])
 requiredChunkPercentage = 0.7
 varsInCA = float(len(ca1.keys()) * 2)
-varsInCG = float(len(cg1.keys()) * 2) + 1
+varsInCG = float(len(cg1.keys()) * 2) - 1 # -1 because the sliding window doesn't hit the first nt twice
 varsInCM = 0
 requiredVarPercentage = 0.7
 first = True

Mercurial > repos > davidvanzessen > mutation_analysis

comparison gene_identification.py @ 62:4262e880472d draft