Mercurial > repos > davidvanzessen > mutation_analysis
diff gene_identification.py @ 19:c518cf0d4adb draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 01 Apr 2015 05:09:59 -0400 |
parents | b0242cd1da34 |
children | c9f9623f1f76 |
line wrap: on
line diff
--- a/gene_identification.py Mon Mar 30 07:58:36 2015 -0400 +++ b/gene_identification.py Wed Apr 01 05:09:59 2015 -0400 @@ -5,7 +5,7 @@ parser = argparse.ArgumentParser() parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file") -parser.add_argument("--output", help="The annotated summary output file") +parser.add_argument("--output", help="The annotated output file to be merged back with the summary file") args = parser.parse_args() @@ -112,16 +112,17 @@ currentIDHits = hits[ID] seq = dic[ID] lastindex = 0 - start = [0] * len(seq) + start_zero = len(searchstrings[key]) #allows the reference sequence to start before search sequence (start_locations of < 0) + start = [0] * (len(seq) + start_zero) for i, regexp in enumerate(regularexpressions): #for every regular expression relativeStartLocation = lastindex - (chunklength / 2) * i - if relativeStartLocation < 0 or relativeStartLocation >= len(seq): + if relativeStartLocation >= len(seq): break regex, hasVar = regexp matches = regex.finditer(seq[lastindex:]) for match in matches: #for every match with the current regex, only uses the first hit lastindex += match.start() - start[relativeStartLocation] += 1 + start[relativeStartLocation + start_zero] += 1 if hasVar: #if the regex has a variable nt in it chunkstart = chunklength / 2 * i #where in the reference does this chunk start chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end @@ -140,7 +141,7 @@ continue #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] currentIDHits[key + "_hits"] += 1 - start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if len(start) > 0 and max(start) > 1]) + start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1 - start_zero) for x in range(5) if len(start) > 0 and max(start) > 1]) #start_location[ID + "_" + key] = str(start.index(max(start))) @@ -161,7 +162,7 @@ for line in f: total += 1 if first: - o.write(line.rstrip() + "\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + o.write("Sequence ID\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") first = False continue linesplt = line.split("\t") @@ -169,6 +170,7 @@ pass ID = linesplt[1] currentIDHits = hits[ID] + print currentIDHits possibleca = float(len(compiledregex["ca"])) possiblecg = float(len(compiledregex["cg"])) possiblecm = float(len(compiledregex["cm"])) @@ -179,24 +181,24 @@ ca1hits = currentIDHits["ca1"] ca2hits = currentIDHits["ca2"] if ca1hits >= ca2hits: - o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + o.write(ID + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") else: - o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + o.write(ID + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") elif cghits >= cahits and cghits >= cmhits: #its a cg gene cg1hits = currentIDHits["cg1"] cg2hits = currentIDHits["cg2"] cg3hits = currentIDHits["cg3"] cg4hits = currentIDHits["cg4"] if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene - o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + o.write(ID + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene - o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + o.write(ID + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene - o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + o.write(ID + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") else: #cg4 gene - o.write(line.rstrip() + "\tcg4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + o.write(ID + "\tcg4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") else: #its a cm gene - o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + o.write(ID + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") seq_write_count += 1 print "Time: %i" % (int(time.time() * 1000) - starttime)