# HG changeset patch # User davidvanzessen # Date 1404740710 14400 # Node ID f2c4c7151016c248062893fa58776ecbdfff5721 # Parent 07a23652bc2a9ebc7c490b4e48741779176e1d5e Uploaded diff -r 07a23652bc2a -r f2c4c7151016 experimental_design.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/experimental_design.py Mon Jul 07 09:45:10 2014 -0400 @@ -0,0 +1,55 @@ +import sys +import pandas as pd + +def main(): + """Merge parsed report tables into one file tagged with Sample and Replicate. + + sys.argv[1] is the first sample ID. Each later argument without a "/" + starts a new sample ID; each argument containing a "/" is read as a + tab-separated report for the current ID. The last two arguments are + left out of that loop (the tool wrapper passes "--output <file>") and + sys.argv[-1] is the path the merged table is written to. + """ + patients = {} + files = [] + sample_id = sys.argv[1] + imgt_files = 0 + blast_files = 0 + #organize files + for arg in sys.argv[2:-2]: + #an argument without a "/" is a sample ID, anything else is a file path + if "/" not in arg: + patients[sample_id] = files + files = [] + sample_id = arg + else: + df = pd.read_csv(arg, sep="\t") + if "Functionality" in list(df.columns.values): + #one .loc assignment; chained indexing can write to a temporary copy + df.loc[df["Functionality"] != "productive", "VDJ Frame"] = "In-frame with stop codon" + imgt_files += 1 + else: + blast_files += 1 + files.append(df) + patients[sample_id] = files + columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Functionality', 'V-REGION identity %', 'V-REGION identity nt', 'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb', '5J-REGION trimmed-nt nb', u'Sample', u'Replicate'] + #any report lacking a Functionality column forces the shorter column list + if blast_files != 0: + print "Has a parsed blastn file, using limited columns." 
+ columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Sample', u'Replicate'] + + result = None + for patient_id, samples in patients.iteritems(): + count = 1 + for sample in samples: + sample['Sample'] = patient_id + sample['Replicate'] = str(count) + count += 1 + if result is None: + result = sample[columns] + else: + result = result.append(sample[columns]) + result.to_csv(sys.argv[-1], sep="\t", index=False, index_label="index") + +if __name__ == "__main__": + main() diff -r 07a23652bc2a -r f2c4c7151016 experimental_design.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/experimental_design.xml Mon Jul 07 09:45:10 2014 -0400 @@ -0,0 +1,29 @@ + + + + experimental_design.py + #for $i, $f in enumerate($patients) + "$f.id" + #for $j, $g in enumerate($f.samples) + ${g.sample} + #end for + + #end for + --output $out_file + + + + + + + + + + + + + + Step 3 of the Immune Repertoire tools, merges the parsed reports generated in step 2 into one file with a Sample ID. 
+ + + diff -r 07a23652bc2a -r f2c4c7151016 igblastmerge.py --- a/igblastmerge.py Tue Mar 25 06:59:26 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -import sys -import pandas as pd - -def main(): - patients = {} - files = [] - sample_id = sys.argv[1] - imgt_files = 0 - blast_files = 0 - #organize files - for arg in sys.argv[2:-2]: - if arg.find("/") is -1: - patients[sample_id] = files - files = [] - sample_id = arg - else: - df = pd.read_csv(arg, sep="\t") - if "Functionality" in list(df.columns.values): - df["VDJ Frame"][df["Functionality"] != "productive"] = "In-frame with stop codon" - imgt_files += 1 - else: - blast_files += 1 - files.append(df) - patients[sample_id] = files - columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Functionality', 'V-REGION identity %', 'V-REGION identity nt', 'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb', '5J-REGION trimmed-nt nb', u'Sample', u'Replicate'] - if blast_files is not 0: - print "Has a parsed blastn file, using limited columns." 
- columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Sample', u'Replicate'] - - result = None - for patient_id, samples in patients.iteritems(): - count = 1 - for sample in samples: - sample['Sample'] = patient_id - sample['Replicate'] = str(count) - count += 1 - if result is None: - result = sample[columns] - else: - result = result.append(sample[columns]) - result.to_csv(sys.argv[-1], sep="\t", index=False, index_label="index") - -if __name__ == "__main__": - main() diff -r 07a23652bc2a -r f2c4c7151016 igblastmerge.xml --- a/igblastmerge.xml Tue Mar 25 06:59:26 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ - - - - igblastmerge.py - #for $i, $f in enumerate($patients) - "$f.id" - #for $j, $g in enumerate($f.samples) - ${g.sample} - #end for - - #end for - --output $out_file - - - - - - - - - - - - - - Step 3 of the Immune Repertoire tools, merges the parsed reports generated in step 2 into one file with an Sample ID. - - -