Mercurial > repos > davidvanzessen > mutation_analysis
comparison baseline/script_xlsx.py @ 114:e7b550d52eb7 draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Tue, 09 Aug 2016 07:20:41 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 113:b84477f57318 | 114:e7b550d52eb7 |
|---|---|
"""Group sequences from an Excel workbook by gene and pair them with references.

For every sheet whose gene column (row 2) mentions "IGHV", the rows are grouped
by gene annotation and appended to the output file in this layout:

    >>>sheet name
    >>gene
    reference sequence for that gene
    >sequence id
    sequence
    ...

Genes that do not occur in the reference FASTA file are reported on stdout and
left out of the output.
"""
import argparse

# Zero-based column indexes inside each sheet.
gene_column = 6
id_column = 7
seq_column = 8
LETTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")


def parse_reference(lines):
    """Parse FASTA-formatted lines into a dict of header -> sequence.

    The key is the full header line without the leading ">" and without
    trailing whitespace. Sequence lines belonging to one header are
    concatenated. A record is kept only when both its header and its sequence
    are non-empty, so an empty input (or sequence data that precedes the first
    header) yields no entry instead of a bogus "" key.

    :param lines: iterable of text lines (an open file works).
    :return: dict mapping header text to sequence.
    """
    references = {}
    header = ""
    chunks = []

    def store():
        sequence = "".join(chunks)
        if header != "" and sequence != "":
            references[header[1:]] = sequence

    for line in lines:
        # Compare by value; identity checks against string literals only work
        # by accident of interpreter-level string interning.
        if line.startswith(">"):
            store()
            header = line.rstrip()
            chunks = []
        else:
            chunks.append(line.rstrip())
    store()
    return references


def has_gene_annotation(value):
    """Return True when a cell value is text that contains "IGHV"."""
    try:
        return "IGHV" in value
    except TypeError:
        # Numeric, boolean or otherwise non-text cell.
        return False


def group_rows(rows):
    """Group (gene, sequence id, sequence) triples by gene.

    Every ">" is removed from the gene and from the sequence id.

    :param rows: iterable of (gene, sequence id, sequence) triples.
    :return: dict mapping gene to a list of (sequence id, sequence) tuples in
        the order the rows were seen.
    """
    groups = {}
    for gene, seq_id, sequence in rows:
        entry = (seq_id.replace(">", ""), sequence)
        groups.setdefault(gene.replace(">", ""), []).append(entry)
    return groups


def render_groups(groups, references):
    """Build the output lines for one sheet.

    Genes missing from ``references`` are reported on stdout and skipped.

    :param groups: dict as returned by :func:`group_rows`.
    :param references: dict as returned by :func:`parse_reference`.
    :return: list of newline-terminated output lines.
    """
    lines = []
    for gene in groups:
        if gene not in references:
            print(gene + " not in reference, skipping " + gene)
            continue
        lines.append(">>" + gene + "\n")
        lines.append(references[gene] + "\n")
        for seq_id, sequence in groups[gene]:
            lines.append(">" + seq_id + "\n")
            lines.append(sequence + "\n")
    return lines


def main():
    """Parse the command line and convert the workbook."""
    # Imported here so the pure helpers above stay usable without xlrd.
    import xlrd

    parser = argparse.ArgumentParser()
    # The column letters are derived from the constants so the help text
    # cannot drift away from the columns that are actually read.
    parser.add_argument(
        "--input",
        help="Excel input file containing one or more sheets where column "
             "{0} has the gene annotation, {1} has the sequence id and {2} "
             "has the sequence".format(LETTERS[gene_column],
                                       LETTERS[id_column],
                                       LETTERS[seq_column]))
    parser.add_argument("--ref", help="Reference file")
    parser.add_argument("--output", help="Output file")
    args = parser.parse_args()

    with open(args.ref, 'r') as handle:
        refdic = parse_reference(handle)

    last_column = max(gene_column, id_column, seq_column)

    # open_workbook's second positional parameter is the log file, not a file
    # mode, so only the path is passed.
    with xlrd.open_workbook(args.input) as wb:
        with open(args.output, 'a') as o:
            for sheet in wb.sheets():
                # Sheets without a data row or without enough columns cannot
                # carry the annotation; skip them instead of raising.
                usable = (
                    sheet.nrows > 1
                    and sheet.ncols > last_column
                    and has_gene_annotation(sheet.cell(1, gene_column).value)
                )
                if not usable:
                    print("Genes not in column " + LETTERS[gene_column]
                          + ", skipping sheet " + sheet.name)
                    continue
                o.write(">>>" + sheet.name + "\n")
                # Row 0 is skipped (presumably a header row - confirm
                # against the input files).
                rows = (
                    (sheet.cell(rowindex, gene_column).value,
                     sheet.cell(rowindex, id_column).value,
                     sheet.cell(rowindex, seq_column).value)
                    for rowindex in range(1, sheet.nrows)
                )
                o.writelines(render_groups(group_rows(rows), refdic))


if __name__ == "__main__":
    main()
