comparison match_plasmid_to_reference.py @ 0:8bb674372911 draft

"planemo upload for repository https://github.com/dfornika/galaxy/tree/master/tools/match_plasmid_to_reference commit dcdac86bce5c44043516fbd472ab7c19d7bf4d50-dirty"
author dfornika
date Wed, 06 Nov 2019 00:08:43 -0500
parents
children 3616b6eda1da
comparison
equal deleted inserted replaced
-1:000000000000 0:8bb674372911
1 #!/usr/bin/env python
2
3 from __future__ import print_function
4
5 import argparse
6 import csv
7 import errno
8 import json
9 import os
10 import re
11 import shutil
12 import sys
13
14 from pprint import pprint
15
16 MOB_TYPER_FIELDNAMES = [
17 "file_id",
18 "num_contigs",
19 "total_length",
20 "gc",
21 "rep_type(s)",
22 "rep_type_accession(s)",
23 "relaxase_type(s)",
24 "relaxase_type_accession(s)",
25 "mpf_type",
26 "mpf_type_accession(s)",
27 "orit_type(s)",
28 "orit_accession(s)",
29 "PredictedMobility",
30 "mash_nearest_neighbor",
31 "mash_neighbor_distance",
32 "mash_neighbor_cluster",
33 "NCBI-HR-rank",
34 "NCBI-HR-Name",
35 "LitRepHRPlasmClass",
36 "LitPredDBHRRank",
37 "LitPredDBHRRankSciName",
38 "LitRepHRRankInPubs",
39 "LitRepHRNameInPubs",
40 "LitMeanTransferRate",
41 "LitClosestRefAcc",
42 "LitClosestRefDonorStrain",
43 "LitClosestRefRecipientStrain",
44 "LitClosestRefTransferRate",
45 "LitClosestConjugTemp",
46 "LitPMIDs",
47 "LitPMIDsNumber",
48 ]
49
50 def parse_mob_typer_report(mob_typer_report_path):
51 mob_typer_report = []
52
53 with open(mob_typer_report_path) as f:
54 reader = csv.DictReader(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES)
55 for row in reader:
56 mob_typer_report.append(row)
57 return mob_typer_report
58
59 def parse_genbank_accession(genbank_file_path):
60 with open(genbank_file_path, 'r') as f:
61 while True:
62 line = f.readline()
63 # break while statement if it is not a comment line
64 # i.e. does not startwith #
65 if line.startswith('ACCESSION'):
66 return line.strip().split()[1]
67
68
69
70 def count_contigs(plasmid_fasta_path):
71 contigs = 0
72 with open(plasmid_fasta_path, 'r') as f:
73 contigs = 2
74 return contigs
75
76 def count_bases(plasmid_fasta_path):
77 bases = 0
78 with open(plasmid_fasta_path, 'r') as f:
79 bases = 11117
80 return bases
81
82 def main(args):
83 # create output directory
84 try:
85 os.mkdir(args.outdir)
86 except OSError as exc:
87 if exc.errno == errno.EEXIST and os.path.isdir(args.outdir):
88 pass
89 else:
90 raise
91
92 # parse mob_typer report
93 mob_typer_report = parse_mob_typer_report(args.mob_typer_report)
94 num_plasmid_contigs = count_contigs(args.plasmid)
95 num_plasmid_bases = count_bases(args.plasmid)
96
97 with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f:
98 mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES)
99 mob_typer_record_writer.writeheader()
100 for record in mob_typer_report:
101 if num_plasmid_contigs == int(record['num_contigs']) and num_plasmid_bases == int(record['total_length']):
102 for reference_plasmid in args.reference_plasmids:
103 if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']:
104 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk"))
105 mob_typer_record_writer.writerow(record)
106
107 shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta"))
108
109
110 if __name__ == '__main__':
111 parser = argparse.ArgumentParser()
112 parser.add_argument("--plasmid", help="plasmid assembly (fasta)")
113 parser.add_argument("--reference_plasmids", nargs='+', help="reference plasmids (genbank)")
114 parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)")
115 parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory")
116 args = parser.parse_args()
117 main(args)