annotate tbprofiler_json_to_tabular.py @ 2:2da090ebb942 draft

Uploaded
author dfornika
date Mon, 24 Oct 2022 22:47:24 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
2da090ebb942 Uploaded
dfornika
parents:
diff changeset
1 #!/usr/bin/env python
2da090ebb942 Uploaded
dfornika
parents:
diff changeset
2
2da090ebb942 Uploaded
dfornika
parents:
diff changeset
3 import argparse
2da090ebb942 Uploaded
dfornika
parents:
diff changeset
4 import csv
2da090ebb942 Uploaded
dfornika
parents:
diff changeset
5 import json
2da090ebb942 Uploaded
dfornika
parents:
diff changeset
6
2da090ebb942 Uploaded
dfornika
parents:
diff changeset
7
2da090ebb942 Uploaded
dfornika
parents:
diff changeset
def _write_tsv(path, fieldnames, rows):
    """Write an iterable of row dicts to ``path`` as a tab-separated table.

    A header line made of ``fieldnames`` is written first, followed by one
    line per row. Nothing is written (and ``rows`` is not consumed) when
    ``path`` is ``None``, i.e. when the corresponding command-line option
    was not supplied.
    """
    if path is None:
        return
    # newline='' hands line-ending control to the csv module; without it the
    # dialect's '\r\n' terminator can be translated a second time on write.
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, dialect='excel-tab', quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)


def main(args):
    """Convert a JSON report into up to six tab-separated tables.

    ``args`` must provide the attributes ``input`` (path of the JSON report)
    and ``qc``, ``gene_coverage``, ``missing_positions``,
    ``resistance_variants``, ``other_variants`` and ``analysis_metadata``
    (output paths). An output whose path is ``None`` is skipped, and the
    report section it would have read is not accessed.

    The loaded report is never modified: list-valued drug fields are joined
    into strings on freshly built row dicts.

    Raises:
        KeyError: if a requested output needs a report key that is absent.
        ValueError: from ``csv.DictWriter`` if a ``gene_coverage`` or
            ``missing_positions`` entry carries a key outside the column list.
    """
    with open(args.input, 'r') as f:
        report = json.load(f)

    # --- QC summary: a single row of scalar values from report['qc'] ---
    qc_fieldnames = [
        'pct_reads_mapped',
        'num_reads_mapped',
        'median_coverage',
    ]
    if args.qc is not None:
        _write_tsv(
            args.qc,
            qc_fieldnames,
            ({k: report['qc'][k] for k in qc_fieldnames} for _ in range(1)),
        )

    # --- Per-gene coverage: entries are written as they appear ---
    gene_coverage_fieldnames = [
        'locus_tag',
        'gene',
        'fraction',
        'cutoff',
    ]
    if args.gene_coverage is not None:
        _write_tsv(args.gene_coverage, gene_coverage_fieldnames, report['qc']['gene_coverage'])

    # --- Missing positions: entries are written as they appear ---
    missing_positions_fieldnames = [
        'locus_tag',
        'gene',
        'position',
        'variants',
        'drugs'
    ]
    if args.missing_positions is not None:
        _write_tsv(args.missing_positions, missing_positions_fieldnames, report['qc']['missing_positions'])

    # --- Resistance variants: 'drugs' is flattened to "drug:confers, ..." ---
    resistance_variants_fieldnames = [
        'chrom',
        'genome_pos',
        'locus_tag',
        'feature_id',
        'gene',
        'type',
        'ref',
        'alt',
        'freq',
        'nucleotide_change',
        'protein_change',
        'change',
        'drugs',
    ]

    def resistance_rows():
        for variant in report['dr_variants']:
            # Copy only the exported columns so the report stays untouched.
            row = {k: variant[k] for k in resistance_variants_fieldnames}
            row['drugs'] = ', '.join(
                drug['drug'] + ':' + drug['confers'] for drug in variant['drugs']
            )
            yield row

    if args.resistance_variants is not None:
        _write_tsv(args.resistance_variants, resistance_variants_fieldnames, resistance_rows())

    # --- Other variants: 'gene_associated_drugs' is joined with ", " ---
    other_variants_fieldnames = [
        'chrom',
        'genome_pos',
        'locus_tag',
        'feature_id',
        'gene',
        'type',
        'ref',
        'alt',
        'freq',
        'nucleotide_change',
        'protein_change',
        'change',
        'gene_associated_drugs',
    ]

    def other_rows():
        for variant in report['other_variants']:
            row = {k: variant[k] for k in other_variants_fieldnames}
            row['gene_associated_drugs'] = ', '.join(variant['gene_associated_drugs'])
            yield row

    if args.other_variants is not None:
        _write_tsv(args.other_variants, other_variants_fieldnames, other_rows())

    # --- Analysis metadata: one row assembled from several report sections ---
    analysis_metadata_fieldnames = [
        'timestamp',
        'tbprofiler_version',
        'mapping_program',
        'variant_calling_program',
        'db_name',
        'db_commit',
        'db_date',
    ]

    def metadata_rows():
        output = {
            'timestamp': report['timestamp'],
            'tbprofiler_version': report['tbprofiler_version'],
            'db_name': report['db_version']['name'],
            'db_commit': report['db_version']['commit'],
            'db_date': report['db_version']['Date'],
        }
        # Program names come from the pipeline entries; a column whose entry
        # is absent is left empty by DictWriter.
        for pipeline_entry in report['pipeline']:
            if pipeline_entry['Analysis'] == "Mapping":
                output['mapping_program'] = pipeline_entry['Program']
            elif pipeline_entry['Analysis'] == "Variant calling":
                output['variant_calling_program'] = pipeline_entry['Program']
        yield output

    if args.analysis_metadata is not None:
        _write_tsv(args.analysis_metadata, analysis_metadata_fieldnames, metadata_rows())
2da090ebb942 Uploaded
dfornika
parents:
diff changeset
126
2da090ebb942 Uploaded
dfornika
parents:
diff changeset
if __name__ == '__main__':
    # Command-line entry point: one positional JSON report path plus one
    # optional output-path flag per table, then hand the result to main().
    cli = argparse.ArgumentParser()
    cli.add_argument('input')
    for output_flag in (
        '--qc',
        '--gene-coverage',
        '--missing-positions',
        '--resistance-variants',
        '--other-variants',
        '--analysis-metadata',
    ):
        cli.add_argument(output_flag)
    main(cli.parse_args())