Mercurial > repos > davidvanzessen > mutation_analysis
changeset 10:4b83265b2686 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 26 Mar 2015 10:00:24 -0400 |
parents | 4c4149fa0bcb |
children | b0242cd1da34 |
files | gene_identification.py |
diffstat | 1 files changed, 14 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
--- a/gene_identification.py Thu Mar 26 08:53:41 2015 -0400 +++ b/gene_identification.py Thu Mar 26 10:00:24 2015 -0400 @@ -37,10 +37,12 @@ dic[ID] = "" else: dic[ID] = linesplt[seqIndex] + +print "Number of input sequences:", len(dic) #lambda/kappa reference sequence -searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", - "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc", +searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctg", + "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgacca", "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"} compiledregex = {"ca": [], @@ -119,15 +121,6 @@ matches = regex.finditer(seq[lastindex:]) for match in matches: #for every match with the current regex, only uses the first hit lastindex += match.start() - print ID - print lastindex - print chunklength - print i - print seq[lastindex:] - print start - print len(seq) - print relativeStartLocation - print "-------------------" start[relativeStartLocation] += 1 if hasVar: #if the regex has a variable nt in it chunkstart = chunklength / 2 * i #where in the reference does this chunk start @@ -162,6 +155,7 @@ first = True +seq_write_count=0 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence with open(output, 'w') as o: for line in f: @@ -181,34 +175,35 @@ cahits = currentIDHits["ca_hits"] cghits = currentIDHits["cg_hits"] cmhits = currentIDHits["cm_hits"] - if cahits > cghits and cahits > cmhits: #its a ca gene + if cahits >= cghits and cahits >= cmhits: #its a ca gene ca1hits = currentIDHits["ca1"] ca2hits = currentIDHits["ca2"] - if ca1hits > ca2hits: + if ca1hits >= ca2hits: o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") else: o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") - elif cghits > cahits and cghits > cmhits: #its a cg gene + elif cghits >= cahits and cghits >= cmhits: #its a cg gene cg1hits = currentIDHits["cg1"] cg2hits = currentIDHits["cg2"] cg3hits = currentIDHits["cg3"] cg4hits = currentIDHits["cg4"] - if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene + if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene + elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene + elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") else: #cg4 gene o.write(line.rstrip() + "\tcg4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") else: #its a cm gene o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + seq_write_count += 1 print "Time: %i" % (int(time.time() * 1000) - starttime) +print "Number of sequences written to file:", seq_write_count + - -