comparison match_plasmid_to_reference.py @ 4:826ddf832bef draft default tip

"planemo upload for repository https://github.com/dfornika/galaxy/tree/master/tools/match_plasmid_to_reference commit dcdac86bce5c44043516fbd472ab7c19d7bf4d50-dirty"
author dfornika
date Wed, 06 Nov 2019 13:52:40 -0500
parents 3616b6eda1da
children
comparison
equal deleted inserted replaced
3:d56b4f743779 4:826ddf832bef
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 from __future__ import print_function 3 from __future__ import print_function, division
4 4
5 import argparse 5 import argparse
6 import csv 6 import csv
7 import errno 7 import errno
8 import json 8 import json
54 reader = csv.DictReader(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) 54 reader = csv.DictReader(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES)
55 for row in reader: 55 for row in reader:
56 mob_typer_report.append(row) 56 mob_typer_report.append(row)
57 return mob_typer_report 57 return mob_typer_report
58 58
59 def parse_genbank_accession(genbank_file_path): 59 def parse_genbank_accession(genbank_path):
60 with open(genbank_file_path, 'r') as f: 60 with open(genbank_path, 'r') as f:
61 while True: 61 while True:
62 line = f.readline() 62 line = f.readline()
63 # break while statement if it is not a comment line
64 # i.e. does not startwith #
65 if line.startswith('ACCESSION'): 63 if line.startswith('ACCESSION'):
66 return line.strip().split()[1] 64 return line.strip().split()[1]
67 65
66 def parse_fasta_accession(fasta_path):
67 with open(fasta_path, 'r') as f:
68 while True:
69 line = f.readline()
70 if line.startswith('>'):
71 return line.strip().split()[0][1:]
68 72
69 def count_contigs(plasmid_fasta_path): 73 def count_fasta_contigs(fasta_path):
70 contigs = 0 74 contigs = 0
71 with open(plasmid_fasta_path, 'r') as f: 75 with open(fasta_path, 'r') as f:
72 for line in f: 76 for line in f:
73 if line.startswith('>'): 77 if line.startswith('>'):
74 contigs += 1 78 contigs += 1
75 return contigs 79 return contigs
76 80
77 def count_bases(plasmid_fasta_path): 81 def count_fasta_bases(fasta_path):
78 bases = 0 82 bases = 0
79 with open(plasmid_fasta_path, 'r') as f: 83 with open(fasta_path, 'r') as f:
80 for line in f: 84 for line in f:
81 line = line.strip() 85 line = line.strip()
82 if not line.startswith('>'): 86 if not line.startswith('>'):
83 bases += len(line) 87 bases += len(line)
84 return bases 88 return bases
85 89
90 def compute_fasta_gc_percent(fasta_path):
91 gc_count = 0
92 total_bases_count = 0
93 with open(fasta_path, 'r') as f:
94 for line in f:
95 if not line.startswith('>'):
96 line = line.strip()
97 line_c_count = line.count('c') + line.count('C')
98 line_g_count = line.count('g') + line.count('G')
99 line_total_bases_count = len(line)
100 gc_count += line_c_count + line_g_count
101 total_bases_count += line_total_bases_count
102 return 100 * (gc_count / total_bases_count)
103
86 def main(args): 104 def main(args):
105
87 # create output directory 106 # create output directory
88 try: 107 try:
89 os.mkdir(args.outdir) 108 os.mkdir(args.outdir)
90 except OSError as exc: 109 except OSError as exc:
91 if exc.errno == errno.EEXIST and os.path.isdir(args.outdir): 110 if exc.errno == errno.EEXIST and os.path.isdir(args.outdir):
93 else: 112 else:
94 raise 113 raise
95 114
96 # parse mob_typer report 115 # parse mob_typer report
97 mob_typer_report = parse_mob_typer_report(args.mob_typer_report) 116 mob_typer_report = parse_mob_typer_report(args.mob_typer_report)
98 num_plasmid_contigs = count_contigs(args.plasmid) 117 num_plasmid_contigs = count_fasta_contigs(args.plasmid)
99 num_plasmid_bases = count_bases(args.plasmid) 118 num_plasmid_bases = count_fasta_bases(args.plasmid)
100 119 plasmid_gc_percent = compute_fasta_gc_percent(args.plasmid)
120
101 with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f: 121 with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f:
102 mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) 122 mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES)
103 mob_typer_record_writer.writeheader() 123 mob_typer_record_writer.writeheader()
104 for record in mob_typer_report: 124 for record in mob_typer_report:
105 if num_plasmid_contigs == int(record['num_contigs']) and num_plasmid_bases == int(record['total_length']): 125 # match the plasmid against three properties in the MOB-Typer report:
106 for reference_plasmid in args.reference_plasmids: 126 # 1. number of contigs
127 # 2. total length of all contigs
128 # 3. G/C percent (within +/-0.1%)
129 if num_plasmid_contigs == int(record['num_contigs']) and \
130 num_plasmid_bases == int(record['total_length']) and \
131 abs(plasmid_gc_percent - float(record['gc'])) < 0.1:
132 for reference_plasmid in args.reference_plasmids_genbank:
107 if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']: 133 if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']:
108 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk")) 134 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk"))
109 mob_typer_record_writer.writerow(record) 135
136 for reference_plasmid in args.reference_plasmids_fasta:
137 if re.match(record['mash_nearest_neighbor'], parse_fasta_accession(reference_plasmid)) is not None:
138 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.fasta"))
139 mob_typer_record_writer.writerow(record)
110 140
111 shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta")) 141 shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta"))
112 142
113 143
114 if __name__ == '__main__': 144 if __name__ == '__main__':
115 parser = argparse.ArgumentParser() 145 parser = argparse.ArgumentParser()
116 parser.add_argument("--plasmid", help="plasmid assembly (fasta)") 146 parser.add_argument("--plasmid", help="plasmid assembly (fasta)")
117 parser.add_argument("--reference_plasmids", nargs='+', help="reference plasmids (genbank)") 147 parser.add_argument("--reference_plasmids_genbank", nargs='+', help="reference plasmids (genbank)")
148 parser.add_argument("--reference_plasmids_fasta", nargs='+', help="reference plasmids (fasta)")
118 parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)") 149 parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)")
119 parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory") 150 parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory")
120 args = parser.parse_args() 151 args = parser.parse_args()
121 main(args) 152 main(args)