Mercurial > repos > davidvanzessen > experimental_design_igg
comparison igblastmerge.py @ 1:07a23652bc2a draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 25 Mar 2014 06:59:26 -0400 |
parents | 03dbb4601b15 |
children |
comparison
equal
deleted
inserted
replaced
0:03dbb4601b15 | 1:07a23652bc2a |
---|---|
1 import sys | 1 import sys |
2 # error | 2 import pandas as pd |
3 def stop_err( msg ): | |
4 sys.stderr.write( "%s\n" % msg ) | |
5 sys.exit() | |
6 | 3 |
7 # main | |
8 def main(): | 4 def main(): |
9 args = sys.argv[1:-2] | 5 patients = {} |
6 files = [] | |
7 sample_id = sys.argv[1] | |
8 imgt_files = 0 | |
9 blast_files = 0 | |
10 #organize files | |
11 for arg in sys.argv[2:-2]: | |
12 if arg.find("/") is -1: | |
13 patients[sample_id] = files | |
14 files = [] | |
15 sample_id = arg | |
16 else: | |
17 df = pd.read_csv(arg, sep="\t") | |
18 if "Functionality" in list(df.columns.values): | |
19 df["VDJ Frame"][df["Functionality"] != "productive"] = "In-frame with stop codon" | |
20 imgt_files += 1 | |
21 else: | |
22 blast_files += 1 | |
23 files.append(df) | |
24 patients[sample_id] = files | |
25 columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Functionality', 'V-REGION identity %', 'V-REGION identity nt', 'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb', '5J-REGION trimmed-nt nb', u'Sample', u'Replicate'] | |
26 if blast_files is not 0: | |
27 print "Has a parsed blastn file, using limited columns." | |
28 columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Sample', u'Replicate'] | |
10 | 29 |
11 try: | 30 result = None |
12 o = open(sys.argv[-1], 'w') | 31 for patient_id, samples in patients.iteritems(): |
13 i = open(args[1], 'r') | 32 count = 1 |
14 separator = "\t" | 33 for sample in samples: |
15 newline = "\n" | 34 sample['Sample'] = patient_id |
16 line = i.readline() | 35 sample['Replicate'] = str(count) |
17 #write the header | 36 count += 1 |
18 o.write(line[:line.rfind(newline)] + separator + "Sample" + separator + "Replicate" + newline) | 37 if result is None: |
19 i.close() | 38 result = sample[columns] |
20 | 39 else: |
21 current = 1 | 40 result = result.append(sample[columns]) |
22 sampleID = args[0] | 41 result.to_csv(sys.argv[-1], sep="\t", index=False, index_label="index") |
23 count = 1 | |
24 | |
25 while True: | |
26 print str(o) | |
27 f = open(args[current], 'r') | |
28 line = f.readline() | |
29 line = f.readline() | |
30 while line: | |
31 o.write(line[:line.rfind(newline)] + separator + sampleID + separator + str(count) + newline) | |
32 line = f.readline() | |
33 f.close() | |
34 | |
35 if current >= (len(args) - 1): | |
36 break | |
37 if args[current + 1].find("/") is -1: | |
38 sampleID = args[current + 1] | |
39 current += 1 | |
40 count = 1 | |
41 else: | |
42 count += 1 | |
43 current += 1 | |
44 o.close() | |
45 | |
46 except Exception, ex: | |
47 stop_err('Error running new_column.py\n' + str(ex)) | |
48 sys.exit(0) | |
49 | 42 |
50 if __name__ == "__main__": | 43 if __name__ == "__main__": |
51 print sys.argv | |
52 main() | 44 main() |