Mercurial > repos > dfornika > mob_suite
changeset 40:8ad65c3af781 draft
"planemo upload for repository https://github.com/phac-nml/mob-suite commit dcdac86bce5c44043516fbd472ab7c19d7bf4d50-dirty"
| author | dfornika |
|---|---|
| date | Tue, 05 Nov 2019 16:46:20 -0500 |
| parents | be0576669f71 |
| children | cd69d25ce451 |
| files | mob_recon.xml mob_typer.xml mob_typer_refseq_download.py mob_typer_refseq_download.xml |
| diffstat | 4 files changed, 263 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/mob_recon.xml Wed Oct 30 23:43:49 2019 -0400 +++ b/mob_recon.xml Tue Nov 05 16:46:20 2019 -0500 @@ -1,7 +1,7 @@ -<tool id="mob_recon" name="MOB-Recon" version="2.0.0_galaxy0"> +<tool id="mob_recon" name="MOB-Recon" version="2.0.1_galaxy0"> <description>Type contigs and extract plasmid sequences</description> <requirements> - <requirement type="package" version="2.0.0">mob_suite</requirement> + <requirement type="package" version="2.0.1">mob_suite</requirement> </requirements> <command detect_errors="exit_code"> <![CDATA[
<!-- File: mob_typer.xml (Galaxy wrapper for the mob_typer command of mob_suite) -->
<tool id="mob_typer" name="MOB-Typer" version="2.0.1_galaxy0">
  <description>Get the plasmid type and mobility given its sequence</description>
  <requirements>
    <requirement type="package" version="2.0.1">mob_suite</requirement>
  </requirements>
  <command detect_errors="exit_code">
  <![CDATA[
    ## Link the dataset under its display name, then run mob_typer on the link.
    ## The two steps are chained with && so that a failed link aborts the job
    ## instead of being masked by the exit status of the following command.
    ln -s "${input}" "${input.name}" &&
    mob_typer --num_threads \${GALAXY_SLOTS:-4} --infile "${input.name}"
    --min_rep_evalue '$adv_param.min_rep_evalue_value'
    --min_mob_evalue '$adv_param.min_mob_evalue_value'
    --min_con_evalue '$adv_param.min_con_evalue_value'
    --min_ori_evalue '$adv_param.min_ori_evalue_value'
    --min_mpf_evalue '$adv_param.min_mpf_evalue'
    --min_rep_ident '$adv_param.min_rep_ident'
    --min_mob_ident '$adv_param.min_mob_ident'
    --min_ori_ident '$adv_param.min_ori_ident'
    --min_mpf_ident '$adv_param.min_mpf_ident'
    --outdir '.'
  ]]>
  </command>
  <inputs>
    <param name="input" type="data" format="fasta" label="Input" help="FASTA file with contig(s)"/>
    <!-- Thresholds forwarded one-to-one to the matching mob_typer command line options. -->
    <section name="adv_param" title="Advanced parameters" expanded="False">
      <param name="min_rep_evalue_value" type="float" value="0.00001" min="0.00001" max="1" label="Minimum evalue threshold for replicon blastn"/>
      <param name="min_mob_evalue_value" type="float" value="0.00001" min="0.00001" max="1" label="Minimum evalue threshold for relaxase tblastn"/>
      <param name="min_con_evalue_value" type="float" value="0.00001" min="0.00001" max="1" label="Minimum evalue threshold for contig blastn"/>
      <param name="min_ori_evalue_value" type="float" value="0.00001" min="0.00001" max="1" label="Minimum evalue threshold for oriT elements blastn"/>
      <param name="min_mpf_evalue" type="float" value="0.00001" min="0.00001" max="1" label="Minimum evalue threshold for mpf elements blastn"/>
      <param name="min_rep_ident" label="Minimum sequence identity for replicons" type="integer" min="0" max="100" value="80"/>
      <param name="min_mob_ident" label="Minimum sequence identity for relaxases" type="integer" min="0" max="100" value="80"/>
      <param name="min_ori_ident" label="Minimum sequence identity for oriT elements" type="integer" min="0" max="100" value="90"/>
      <param name="min_mpf_ident" label="Minimum sequence identity for mpf elements" type="integer" min="0" max="100" value="80"/>
    </section>
  </inputs>
  <outputs>
    <!-- The report file name depends on the input name, so it is picked up by pattern. -->
    <data name="outfile1" label="${tool.name}: Plasmid report on ${input.name}" format="tabular">
      <discover_datasets pattern=".+_report\.txt" visible="true" format="tabular" assign_primary_output="true"/>
    </data>
  </outputs>
  <tests>
    <test>
      <param name="input" value="plasmid_476.fasta" ftype="fasta"/>
      <assert_stdout>
        <has_text text="JN253636"/>
      </assert_stdout>
    </test>
  </tests>
  <help>

**Syntax**

This tool provides *in-silico* predictions on plasmid typing including identification of replicon, relaxase and mate-pair formation protein types. MOB-typer also predicts mobility of a plasmid (Conjugative, Mobilizable, Non-mobilizable). Do not include multiple unrelated plasmids in the input FASTA file as they will be treated as a single plasmid.

For more information please visit https://github.com/phac-nml/mob-suite/.

-----

**Input:**

A FASTA file with a single contig or multiple contigs (e.g. plasmid.fasta).


**Output:**

Tab-delimited report listing identified plasmid(s) and their predicted mobility. Refer to https://github.com/phac-nml/mob-suite#mob-typer-report-file-format for the description of each column.


  </help>
  <citations>
    <citation type="bibtex">
  @misc{githubmob-suite,
  author = {Robertson, J. and Nash, J.},
  title = {MOB-Suite: Software tools for clustering, reconstruction and typing of plasmids from draft assemblies.},
  publisher = {GitHub},
  journal = {GitHub repository},
  doi = {10.1099/mgen.0.000206},
  url = {https://github.com/phac-nml/mob-suite}
  }</citation>
  </citations>
</tool>
#!/usr/bin/env python
"""Download the reference sequences listed in a mash screen report.

File: mob_typer_refseq_download.py

Each record of the report names a reference either as a RefSeq accession
(query IDs starting with ``ref|``, fetched through NCBI efetch) or as a RefSeq
genome assembly (query IDs starting with ``GCF``, fetched from the NCBI FTP
server). Failed downloads are logged and the remaining records are still
processed.
"""

import argparse
import csv
import logging
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request

# File name suffix that follows the assembly name in a genome query ID.
GENOME_SUFFIX = "_genomic.fna.gz"
NCBI_FTP_SERVER_BASE = "ftp://ftp.ncbi.nlm.nih.gov"
NCBI_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"


def parse_mash_result(path_to_mash_screen):
    """
    Parse a mash screen report into a list of typed records.

    Args:
        path_to_mash_screen (str): Path to the mash screen report file.
    Returns:
        list(dict): Parsed mash screen report
        For example:
        [
          { "identity": 0.996805,
            "shared_hashes": "935/1000",
            "median_multiplicity": 38,
            "p_value": 0.00,
            "query_id": "GCF_001022155.1_ASM102215v1_genomic.fna.gz",
            "query_comment": "[10 seqs] NZ_CP011612.1 Citrobacter freundii strain CAV1321, complete genome [...]"
          },
          { "identity": 0.914483,
            ...
          }
        ]
    Raises:
        ValueError, TypeError: If a numeric column is malformed or missing.
    See mash docs for more info on mash screen report file:
    https://mash.readthedocs.io/en/latest/tutorials.html#screening-a-read-set-for-containment-of-refseq-genomes
    """

    # Column name -> converter, in report column order. The report itself has
    # no header line and is tab-delimited.
    mash_screen_report_fields = {
        'identity': float,
        'shared_hashes': lambda x: x,
        'median_multiplicity': int,
        'p_value': float,
        'query_id': lambda x: x,
        'query_comment': lambda x: x,
    }

    # Example mash screen report record:
    # identity  shared_hashes  median_multiplicity  p_value  query_id  query_comment
    # 0.998697  973/1000       71                   0        GCF_000958965.1_matepair4_genomic.fna.gz  [59 seqs] NZ_LAFU01000001.1 Klebsiella pneumoniae [...]

    with open(path_to_mash_screen, newline='') as mashfile:
        reader = csv.DictReader(
            mashfile,
            delimiter='\t',
            fieldnames=list(mash_screen_report_fields),
        )
        return [
            {
                field_name: parse(row[field_name])
                for field_name, parse in mash_screen_report_fields.items()
            }
            for row in reader
        ]


def mash_query_id_to_ncbi_ftp_path(query_id):
    """
    Build the directory path of an assembly below genomes/all/ on the NCBI FTP server.

    Args:
        query_id (str): Mash query ID (column 5 of mash screen report)
    Returns:
        str: Slash-separated directory names used to locate reference genome
             on ftp://ftp.ncbi.nlm.nih.gov/genomes/all/
        For example:
            "GCF/001/022/155"
    Raises:
        IndexError: If query_id contains no underscore.
    """
    parts = query_id.split('_')
    prefix = parts[0]
    # Drop the assembly version (".1") and cut the number into 3-digit groups.
    digits = parts[1].split('.')[0]
    path_list = [prefix] + [digits[i:i + 3] for i in range(0, len(digits), 3)]

    return "/".join(path_list)


def _download(url, destination, log):
    """Retrieve url into destination; log the outcome and return True on success."""
    try:
        urllib.request.urlretrieve(url, destination)
    except urllib.error.URLError as e:
        # URLError is the base class of HTTPError and is also what failed
        # ftp:// transfers raise, so one handler covers both download routes.
        log.error("Download Failed: " + str(e))
        return False
    log.info("Downloaded: " + url)
    return True


def _download_accession(query_id, outdir, log):
    """Fetch the FASTA record of a 'ref|ACCESSION|' query ID through NCBI efetch."""
    match = re.search(r'ref\|(.*)\|', query_id)
    if match is None:
        log.error('query ID "%s" not recognized.', query_id)
        return
    accession = match.group(1)
    url = NCBI_EFETCH_URL + "?" + urllib.parse.urlencode([
        ("db", "nucleotide"),
        ("id", accession),
        ("rettype", "fasta"),
    ])
    # NCBI API is rate-limited to 3 requests per second, so pause between requests
    time.sleep(1)
    _download(url, os.path.join(outdir, accession + ".fasta"), log)


def _download_assembly(query_id, outdir, download_assembly_stats, log):
    """Fetch a RefSeq genome (and optionally its assembly stats) from the NCBI FTP server."""
    suffix_position = query_id.find(GENOME_SUFFIX)
    if suffix_position < 0:
        # Without the suffix the assembly directory name cannot be derived.
        log.error('query ID "%s" not recognized.', query_id)
        return
    assembly = query_id[:suffix_position]
    assembly_url = "/".join([
        NCBI_FTP_SERVER_BASE, "genomes", "all",
        mash_query_id_to_ncbi_ftp_path(query_id),
        assembly,
    ])
    # NCBI API is rate-limited to 3 requests per second, so pause between requests
    time.sleep(1)
    _download(assembly_url + "/" + query_id, os.path.join(outdir, query_id), log)

    if download_assembly_stats:
        stats_name = assembly + "_assembly_stats.txt"
        _download(assembly_url + "/" + stats_name, os.path.join(outdir, stats_name), log)


def main(args):
    """
    Download every reference listed in the mash screen report.

    Args:
        args (argparse.Namespace): Needs ``input_file`` (report path) and
            ``download_assembly_stats`` (bool). ``outdir`` is optional and
            defaults to the current directory; it is created when missing.
    """
    logging.basicConfig()
    log = logging.getLogger('mash_screen_refseq_download')
    log.setLevel(logging.INFO)

    # getattr keeps callers working that build the namespace without outdir.
    outdir = getattr(args, 'outdir', None) or '.'
    os.makedirs(outdir, exist_ok=True)

    for mash_result in parse_mash_result(args.input_file):
        raw_query_id = mash_result['query_id']
        query_id = raw_query_id or ""
        if query_id.startswith("ref|"):
            _download_accession(query_id, outdir, log)
        elif query_id.startswith("GCF"):
            _download_assembly(query_id, outdir, args.download_assembly_stats, log)
        else:
            log.error('query ID "%s" not recognized.', raw_query_id)


if __name__ == '__main__':
    script_name = os.path.basename(os.path.realpath(sys.argv[0]))
    parser = argparse.ArgumentParser(prog=script_name, description='')
    parser.add_argument("-i", "--input", dest="input_file",
                        help="Mash screen report file", required=True)
    parser.add_argument("-o", "--outdir", dest="outdir", default=".",
                        help="Directory to write downloaded files to (default: current directory)",
                        required=False)
    parser.add_argument("-a", "--assembly_stats", dest="download_assembly_stats", action='store_true',
                        help="For genome files, also download assembly_stats.txt file.", required=False)

    args = parser.parse_args()
    main(args)
<!-- File: mob_typer_refseq_download.xml (Galaxy wrapper for mob_typer_refseq_download.py) -->
<tool id="mob_typer_refseq_download" name="MOB-Typer RefSeq download" version="0.1.0">
    <description>Parses MOB-Typer report and downloads reference files from refseq.</description>
    <requirements>
        <requirement type="package" version="11.0">ncbi-acc-download</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        ## The script writes its downloads into the current working directory,
        ## so create and enter the directory scanned by discover_datasets first.
        mkdir -p outdir &&
        cd outdir &&
        python '$__tool_directory__/mob_typer_refseq_download.py'
            -i '$input'
    ]]></command>
    <inputs>
        <param name="input" format="tabular" type="data" label="Mash screen report" />
    </inputs>
    <outputs>
        <!-- One collection element per file found in outdir. -->
        <collection name="downloaded_references" type="list">
            <discover_datasets pattern="__name_and_ext__" directory="outdir" />
        </collection>
    </outputs>
    <tests>
        <test>
            <param name="input" value="input/mash_screen_genome_single.tabular" />
            <output_collection name="downloaded_references" type="list">
                <element name="GCF_001601135.1_ASM160113v1_genomic" file="output/GCF_001601135.1_ASM160113v1_genomic.fna.gz" />
            </output_collection>
        </test>
        <test>
            <param name="input" value="input/mash_screen_plasmid_single.tabular" />
            <!-- Must name the collection declared in <outputs>. -->
            <output_collection name="downloaded_references" type="list">
                <element name="NZ_CP019139.1" file="output/NZ_CP019139.1.fasta" />
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

Reads a tab-delimited mash screen report and downloads the reference sequence named in the query ID column of each record. Query IDs starting with ``ref|`` are fetched as FASTA records from NCBI; query IDs starting with ``GCF`` are fetched as genome assemblies from the NCBI FTP server.

**Output**

A list collection with one element per downloaded file.
    ]]></help>
    <citations>
    </citations>
</tool>
