Mercurial > repos > dfornika > match_plasmid_to_reference
comparison match_plasmid_to_reference.py @ 4:826ddf832bef draft default tip
"planemo upload for repository https://github.com/dfornika/galaxy/tree/master/tools/match_plasmid_to_reference commit dcdac86bce5c44043516fbd472ab7c19d7bf4d50-dirty"
author | dfornika |
---|---|
date | Wed, 06 Nov 2019 13:52:40 -0500 |
parents | 3616b6eda1da |
children |
comparison
equal
deleted
inserted
replaced
3:d56b4f743779 | 4:826ddf832bef |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 from __future__ import print_function | 3 from __future__ import print_function, division |
4 | 4 |
5 import argparse | 5 import argparse |
6 import csv | 6 import csv |
7 import errno | 7 import errno |
8 import json | 8 import json |
54 reader = csv.DictReader(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) | 54 reader = csv.DictReader(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) |
55 for row in reader: | 55 for row in reader: |
56 mob_typer_report.append(row) | 56 mob_typer_report.append(row) |
57 return mob_typer_report | 57 return mob_typer_report |
58 | 58 |
59 def parse_genbank_accession(genbank_file_path): | 59 def parse_genbank_accession(genbank_path): |
60 with open(genbank_file_path, 'r') as f: | 60 with open(genbank_path, 'r') as f: |
61 while True: | 61 while True: |
62 line = f.readline() | 62 line = f.readline() |
63 # break while statement if it is not a comment line | |
64 # i.e. does not startwith # | |
65 if line.startswith('ACCESSION'): | 63 if line.startswith('ACCESSION'): |
66 return line.strip().split()[1] | 64 return line.strip().split()[1] |
67 | 65 |
66 def parse_fasta_accession(fasta_path): | |
67 with open(fasta_path, 'r') as f: | |
68 while True: | |
69 line = f.readline() | |
70 if line.startswith('>'): | |
71 return line.strip().split()[0][1:] | |
68 | 72 |
69 def count_contigs(plasmid_fasta_path): | 73 def count_fasta_contigs(fasta_path): |
70 contigs = 0 | 74 contigs = 0 |
71 with open(plasmid_fasta_path, 'r') as f: | 75 with open(fasta_path, 'r') as f: |
72 for line in f: | 76 for line in f: |
73 if line.startswith('>'): | 77 if line.startswith('>'): |
74 contigs += 1 | 78 contigs += 1 |
75 return contigs | 79 return contigs |
76 | 80 |
77 def count_bases(plasmid_fasta_path): | 81 def count_fasta_bases(fasta_path): |
78 bases = 0 | 82 bases = 0 |
79 with open(plasmid_fasta_path, 'r') as f: | 83 with open(fasta_path, 'r') as f: |
80 for line in f: | 84 for line in f: |
81 line = line.strip() | 85 line = line.strip() |
82 if not line.startswith('>'): | 86 if not line.startswith('>'): |
83 bases += len(line) | 87 bases += len(line) |
84 return bases | 88 return bases |
85 | 89 |
90 def compute_fasta_gc_percent(fasta_path): | |
91 gc_count = 0 | |
92 total_bases_count = 0 | |
93 with open(fasta_path, 'r') as f: | |
94 for line in f: | |
95 if not line.startswith('>'): | |
96 line = line.strip() | |
97 line_c_count = line.count('c') + line.count('C') | |
98 line_g_count = line.count('g') + line.count('G') | |
99 line_total_bases_count = len(line) | |
100 gc_count += line_c_count + line_g_count | |
101 total_bases_count += line_total_bases_count | |
102 return 100 * (gc_count / total_bases_count) | |
103 | |
86 def main(args): | 104 def main(args): |
105 | |
87 # create output directory | 106 # create output directory |
88 try: | 107 try: |
89 os.mkdir(args.outdir) | 108 os.mkdir(args.outdir) |
90 except OSError as exc: | 109 except OSError as exc: |
91 if exc.errno == errno.EEXIST and os.path.isdir(args.outdir): | 110 if exc.errno == errno.EEXIST and os.path.isdir(args.outdir): |
93 else: | 112 else: |
94 raise | 113 raise |
95 | 114 |
96 # parse mob_typer report | 115 # parse mob_typer report |
97 mob_typer_report = parse_mob_typer_report(args.mob_typer_report) | 116 mob_typer_report = parse_mob_typer_report(args.mob_typer_report) |
98 num_plasmid_contigs = count_contigs(args.plasmid) | 117 num_plasmid_contigs = count_fasta_contigs(args.plasmid) |
99 num_plasmid_bases = count_bases(args.plasmid) | 118 num_plasmid_bases = count_fasta_bases(args.plasmid) |
100 | 119 plasmid_gc_percent = compute_fasta_gc_percent(args.plasmid) |
120 | |
101 with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f: | 121 with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f: |
102 mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) | 122 mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) |
103 mob_typer_record_writer.writeheader() | 123 mob_typer_record_writer.writeheader() |
104 for record in mob_typer_report: | 124 for record in mob_typer_report: |
105 if num_plasmid_contigs == int(record['num_contigs']) and num_plasmid_bases == int(record['total_length']): | 125 # match the plasmid against three properties in the MOB-Typer report: |
106 for reference_plasmid in args.reference_plasmids: | 126 # 1. number of contigs |
127 # 2. total length of all contigs | |
128 # 3. G/C percent (within +/-0.1%) | |
129 if num_plasmid_contigs == int(record['num_contigs']) and \ | |
130 num_plasmid_bases == int(record['total_length']) and \ | |
131 abs(plasmid_gc_percent - float(record['gc'])) < 0.1: | |
132 for reference_plasmid in args.reference_plasmids_genbank: | |
107 if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']: | 133 if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']: |
108 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk")) | 134 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk")) |
109 mob_typer_record_writer.writerow(record) | 135 |
136 for reference_plasmid in args.reference_plasmids_fasta: | |
137 if re.match(record['mash_nearest_neighbor'], parse_fasta_accession(reference_plasmid)) is not None: | |
138 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.fasta")) | |
139 mob_typer_record_writer.writerow(record) | |
110 | 140 |
111 shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta")) | 141 shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta")) |
112 | 142 |
113 | 143 |
114 if __name__ == '__main__': | 144 if __name__ == '__main__': |
115 parser = argparse.ArgumentParser() | 145 parser = argparse.ArgumentParser() |
116 parser.add_argument("--plasmid", help="plasmid assembly (fasta)") | 146 parser.add_argument("--plasmid", help="plasmid assembly (fasta)") |
117 parser.add_argument("--reference_plasmids", nargs='+', help="reference plasmids (genbank)") | 147 parser.add_argument("--reference_plasmids_genbank", nargs='+', help="reference plasmids (genbank)") |
148 parser.add_argument("--reference_plasmids_fasta", nargs='+', help="reference plasmids (fasta)") | |
118 parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)") | 149 parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)") |
119 parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory") | 150 parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory") |
120 args = parser.parse_args() | 151 args = parser.parse_args() |
121 main(args) | 152 main(args) |