annotate experimental_design.py @ 3:a558fed92bcf draft default tip

Uploaded
author davidvanzessen
date Thu, 10 Jul 2014 07:25:10 -0400
parents f2c4c7151016
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
f2c4c7151016 Uploaded
davidvanzessen
parents:
diff changeset
1 import sys
f2c4c7151016 Uploaded
davidvanzessen
parents:
diff changeset
2 import pandas as pd
f2c4c7151016 Uploaded
davidvanzessen
parents:
diff changeset
3
f2c4c7151016 Uploaded
davidvanzessen
parents:
diff changeset
def main():
    """Merge per-sample tab-separated result files into a single table.

    Arguments are taken from ``sys.argv``:

    * ``argv[1]`` is the ID of the first sample.
    * ``argv[2:-2]`` is a sequence of file paths and sample IDs. An argument
      containing ``/`` is read as a tab-separated file that belongs to the
      current sample; any other argument closes the current sample and
      starts a new one with that argument as its ID.
    * ``argv[-2]`` is never read.
      NOTE(review): presumably an option flag emitted by the calling
      wrapper in front of the output path -- confirm against the caller.
    * ``argv[-1]`` is the path the merged table is written to.

    Files that have a ``Functionality`` column get their ``VDJ Frame`` value
    replaced by ``"In-frame with stop codon"`` on every row whose
    ``Functionality`` is not ``"productive"``. As soon as one input file
    lacks that column, only the limited column set is written for all files.

    Every row is tagged with ``Sample`` (the sample ID) and ``Replicate``
    (1-based position of its file within the sample, as a string).

    Raises:
        SystemExit: if no input file was given, so there is nothing to write.
        KeyError: if an input file lacks one of the selected output columns.
    """
    base_columns = [
        'ID', 'VDJ Frame', 'Top V Gene', 'Top D Gene', 'Top J Gene',
        'CDR1 Seq', 'CDR1 Length', 'CDR2 Seq', 'CDR2 Length',
        'CDR3 Seq', 'CDR3 Length', 'CDR3 Seq DNA', 'CDR3 Length DNA',
        'Strand', 'CDR3 Found How',
    ]
    # Only available in files that carry a "Functionality" column.
    extended_columns = [
        'Functionality', 'V-REGION identity %', 'V-REGION identity nt',
        'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment',
        'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT',
        'JUNCTION', 'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb',
        'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb',
        '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb',
        '3D-REGION trimmed-nt nb', '5J-REGION trimmed-nt nb',
    ]
    tag_columns = ['Sample', 'Replicate']

    patients = {}
    files = []
    sample_id = sys.argv[1]
    blast_files = 0

    # Organize files: group the data frames under the sample ID preceding them.
    for arg in sys.argv[2:-2]:
        if "/" not in arg:
            # Not a path, so it is the ID of the next sample.
            patients[sample_id] = files
            files = []
            sample_id = arg
            continue
        df = pd.read_csv(arg, sep="\t", dtype=object)
        if "Functionality" in df.columns:
            # Single .loc assignment: the chained form df[col][mask] = value
            # may write to a temporary copy instead of df.
            unproductive = df["Functionality"] != "productive"
            df.loc[unproductive, "VDJ Frame"] = "In-frame with stop codon"
        else:
            blast_files += 1
        files.append(df)
    patients[sample_id] = files

    if blast_files != 0:
        print("Has a parsed blastn file, using limited columns.")
        columns = base_columns + tag_columns
    else:
        columns = base_columns + extended_columns + tag_columns

    frames = []
    for patient_id, samples in patients.items():
        for count, sample in enumerate(samples, 1):
            sample['Sample'] = patient_id
            sample['Replicate'] = str(count)
            frames.append(sample[columns])

    if not frames:
        sys.exit("No input files were given; nothing to merge.")

    # One concat instead of repeated DataFrame.append, which copied the
    # accumulated result on every iteration.
    result = pd.concat(frames)
    result.to_csv(sys.argv[-1], sep="\t", index=False, index_label="index")
f2c4c7151016 Uploaded
davidvanzessen
parents:
diff changeset
42
f2c4c7151016 Uploaded
davidvanzessen
parents:
diff changeset
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()