Mercurial > repos > davidvanzessen > mutation_analysis
diff gene_identification.py @ 4:069419cccba4 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 22 Sep 2014 10:19:36 -0400 |
parents | 2f4298673519 |
children | 71a12810eff3 |
line wrap: on
line diff
--- a/gene_identification.py Wed Sep 17 07:25:17 2014 -0400 +++ b/gene_identification.py Mon Sep 22 10:19:36 2014 -0400 @@ -5,31 +5,38 @@ parser = argparse.ArgumentParser() parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file") -parser.add_argument("--outdir", help="Output directory, 7 output files will be written here") +parser.add_argument("--output", help="The annotated summary output file") args = parser.parse_args() infile = args.input #infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt" -outdir = args.outdir +output = args.output #outfile = "identified.txt" dic = dict() total = 0 + first = True +IDIndex = 0 +seqIndex = 0 + with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence - for line in f: - total += 1 - if first: - first = False - continue - linesplt = line.split("\t") - if linesplt[2] == "No results": - continue - ID = linesplt[1] - seq = linesplt[28] - dic[ID] = seq + for line in f: + total += 1 + if first: + linesplt = line.split("\t") + IDIndex = linesplt.index("Sequence ID") + seqIndex = linesplt.index("Sequence") + first = False + continue + linesplt = line.split("\t") + ID = linesplt[IDIndex] + if len(linesplt) < 28: #weird rows without a sequence + dic[ID] = "" + else: + dic[ID] = linesplt[seqIndex] #lambda/kappa reference sequence searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", @@ -128,7 +135,7 @@ continue #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] currentIDHits[key + "_hits"] += 1 - start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1]) + start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if len(start) > 0 and max(start) > 1]) #start_location[ID + "_" + key] = str(start.index(max(start))) @@ -141,120 +148,49 @@ varsInCM = 0 requiredVarPercentage = 0.7 -ca = 0 -ca1 = 0 -ca2 = 0 -cg = 0 -cg1 = 0 -cg2 = 0 -cg3 = 0 -cg4 = 0 -cm = 0 -try: - cafile = open(outdir + "/ca.txt", 'w') - ca1file = open(outdir + "/ca1.txt", 'w') - ca2file = open(outdir + "/ca2.txt", 'w') - cgfile = open(outdir + "/cg.txt", 'w') - cg1file = open(outdir + "/cg1.txt", 'w') - cg2file = open(outdir + "/cg2.txt", 'w') - cg3file = open(outdir + "/cg3.txt", 'w') - cg4file = open(outdir + "/cg4.txt", 'w') - cmfile = open(outdir + "/cm.txt", 'w') - unmatchedfile = open(outdir + "/unmatched.txt", 'w') - cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") - unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n") - for ID in hits.keys(): - currentIDHits = hits[ID] - possibleca = float(len(compiledregex["ca"])) - possiblecg = float(len(compiledregex["cg"])) - possiblecm = float(len(compiledregex["cm"])) - cahits = currentIDHits["ca_hits"] - cghits = currentIDHits["cg_hits"] - cmhits = currentIDHits["cm_hits"] - if cahits > cghits and cahits > cmhits: #its a ca gene - if cahits <= int(chunksInCA * requiredChunkPercentage): - unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n") + +first = True +with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence + with open(output, 'w') as o: + for line in f: + total += 1 + if first: + o.write(line.rstrip() + "\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + first = False continue - ca += 1 - ca1hits = currentIDHits["ca1"] - ca2hits = currentIDHits["ca2"] - cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") - if ca1hits > ca2hits: - #print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit" - if ca1hits <= int(varsInCA * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") - continue - ca1 += 1 - ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") - else: - #print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit" - if ca2hits <= int(varsInCA * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n") - continue - ca2 += 1 - ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") - elif cghits > cahits and cghits > cmhits: #its a cg gene - if cghits <= int(chunksInCG * requiredChunkPercentage): - unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcg\n") - continue - cg += 1 - cg1hits = currentIDHits["cg1"] - cg2hits = currentIDHits["cg2"] - cg3hits = currentIDHits["cg3"] - cg4hits = currentIDHits["cg4"] - cgfile.write(ID + "\tNA\t" + str(int(cghits / possibleca * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene - if cg1hits <= int(varsInCG * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n") - continue - cg1 += 1 - cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene - if cg2hits <= int(varsInCG * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n") - continue - cg2 += 1 - cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene - if cg3hits <= int(varsInCG * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n") - continue - cg3 += 1 - cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - else: #cg4 gene - if cg4hits <= int(varsInCG * requiredVarPercentage): - unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n") - continue - cg4 += 1 - cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - else: #its a cm gene - if cmhits <= int(chunksInCM * requiredChunkPercentage): - unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_ca"] + "\tcm\n") - continue - cm += 1 - cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n") -finally: - cafile.close() - ca1file.close() - ca2file.close() - cgfile.close() - cg1file.close() - cg2file.close() - cg3file.close() - cg4file.close() - cmfile.close() - unmatchedfile.close() - - -#print ca,cg,cm,(ca+cg+cm) + linesplt = line.split("\t") + if linesplt[2] == "No results": + pass + ID = linesplt[1] + currentIDHits = hits[ID] + possibleca = float(len(compiledregex["ca"])) + possiblecg = float(len(compiledregex["cg"])) + possiblecm = float(len(compiledregex["cm"])) + cahits = currentIDHits["ca_hits"] + cghits = currentIDHits["cg_hits"] + cmhits = currentIDHits["cm_hits"] + if cahits > cghits and cahits > cmhits: #its a ca gene + ca1hits = currentIDHits["ca1"] + ca2hits = currentIDHits["ca2"] + if ca1hits > ca2hits: + o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + else: + o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + elif cghits > cahits and cghits > cmhits: #its a cg gene + cg1hits = currentIDHits["cg1"] + cg2hits = currentIDHits["cg2"] + cg3hits = currentIDHits["cg3"] + cg4hits = currentIDHits["cg4"] + if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene + o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene + o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene + o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + else: #cg4 gene + o.write(line.rstrip() + "\tcg3\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + else: #its a cm gene + o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") print "Time: %i" % (int(time.time() * 1000) - starttime)