changeset 40:8ad65c3af781 draft

"planemo upload for repository https://github.com/phac-nml/mob-suite commit dcdac86bce5c44043516fbd472ab7c19d7bf4d50-dirty"
author dfornika
date Tue, 05 Nov 2019 16:46:20 -0500
parents be0576669f71
children cd69d25ce451
files mob_recon.xml mob_typer.xml mob_typer_refseq_download.py mob_typer_refseq_download.xml
diffstat 4 files changed, 263 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/mob_recon.xml	Wed Oct 30 23:43:49 2019 -0400
+++ b/mob_recon.xml	Tue Nov 05 16:46:20 2019 -0500
@@ -1,7 +1,7 @@
-<tool id="mob_recon" name="MOB-Recon" version="2.0.0_galaxy0">
+<tool id="mob_recon" name="MOB-Recon" version="2.0.1_galaxy0">
   <description>Type contigs and extract plasmid sequences</description>
   <requirements>
-     <requirement type="package" version="2.0.0">mob_suite</requirement>
+     <requirement type="package" version="2.0.1">mob_suite</requirement>
   </requirements>   
   <command detect_errors="exit_code">
   <![CDATA[  
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mob_typer.xml	Tue Nov 05 16:46:20 2019 -0500
@@ -0,0 +1,81 @@
+<tool id="mob_typer" name="MOB-Typer" version="2.0.1_galaxy0">
+  <description>Get the plasmid type and mobility given its sequence</description>
+  <requirements>
+     <requirement type="package" version="2.0.1">mob_suite</requirement>
+  </requirements>   
+  <command detect_errors="exit_code">
+  <![CDATA[
+    ln -s "${input}" "${input.name}";
+    mob_typer  --num_threads \${GALAXY_SLOTS:-4} --infile "${input.name}" 
+   --min_rep_evalue '$adv_param.min_rep_evalue_value'
+   --min_mob_evalue '$adv_param.min_mob_evalue_value'
+   --min_con_evalue '$adv_param.min_con_evalue_value'
+   --min_ori_evalue '$adv_param.min_ori_evalue_value'
+   --min_mpf_evalue '$adv_param.min_mpf_evalue'
+   --min_rep_ident '$adv_param.min_rep_ident'
+   --min_mob_ident '$adv_param.min_mob_ident'
+   --min_ori_ident '$adv_param.min_ori_ident'
+   --min_mpf_ident '$adv_param.min_mpf_ident'
+    --outdir '.';
+  ]]>
+  </command>
+  <inputs>
+    <param name="input" type="data" format="fasta" label="Input" help="FASTA file with contig(s)"/>
+    <section name="adv_param" title="Advanced parameters" expanded="False">
+      <param name="min_rep_evalue_value" type="float" value="0.00001" min="0.00001" max="1" label="Minimum evalue threshold for replicon blastn"/> 
+      <param name="min_mob_evalue_value" type="float" value="0.00001" min="0.00001" max="1" label="Minimum evalue threshold for relaxase tblastn"/> 
+      <param name="min_con_evalue_value" type="float" value="0.00001" min="0.00001" max="1" label="Minimum evalue threshold for contig blastn"/> 
+      <param name="min_ori_evalue_value" type="float" value="0.00001" min="0.00001" max="1" label="Minimum evalue threshold for oriT elements blastn"/>
+      <param name="min_mpf_evalue" type="float" value="0.00001" min="0.00001" max="1" label="Minimum evalue threshold for mpf elements blastn"/>
+      <param name="min_rep_ident" label="Minimum sequence identity for replicons" type="integer"  min="0" max="100" value="80"/>
+      <param name="min_mob_ident" label="Minimum sequence identity for relaxases" type="integer"  min="0" max="100" value="80"/>
+      <param name="min_ori_ident" label="Minimum sequence identity for oriT elements" type="integer"  min="0" max="100" value="90"/>
+      <param name="min_mpf_ident" label="Minimum sequence identity for mpf elements" type="integer"  min="0" max="100" value="80"/>
+    </section>  
+  </inputs>
+  <outputs>
+    <data name="outfile1" label="${tool.name}: Plasmid report on ${input.name}" format="tabular">
+      <discover_datasets pattern=".+_report\.txt" visible="true" format="tabular" assign_primary_output="true"/>
+    </data>  
+  </outputs>
+  <tests>
+    <test>
+        <param name="input" value="plasmid_476.fasta" ftype="fasta"/>
+        <assert_stdout>
+            <has_text text="JN253636"/> 
+         </assert_stdout>
+    </test>
+  </tests>
+  <help>
+
+**Syntax**
+
+This tool provides *in-silico* predictions on plasmid typing including identification of replicon, relaxase and mate-pair formation protein types. MOB-typer also predicts mobility of a plasmid (Conjugative, Mobilizable, Non-mobilizable). Do not include multiple unrelated plasmids in the input FASTA file as they will be treated as a single plasmid.
+
+For more information please visit https://github.com/phac-nml/mob-suite/. 
+
+-----
+
+**Input:**
+
+A FASTA file with one or more contigs (e.g. plasmid.fasta).
+
+
+**Output:**
+
+Tab-delimited report listing identified plasmid(s) and their predicted mobility. Refer to https://github.com/phac-nml/mob-suite#mob-typer-report-file-format for the description of each column.
+
+
+  </help>
+  <citations>
+    <citation type="bibtex">
+  @misc{githubmob-suite,
+  author = {Robertson, J and Nash, J},
+  title = {MOB-Suite: Software tools for clustering, reconstruction and typing of plasmids from draft assemblies.},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  doi = {10.1099/mgen.0.000206},
+  url = {https://github.com/phac-nml/mob-suite}
+    }</citation>
+  </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mob_typer_refseq_download.py	Tue Nov 05 16:46:20 2019 -0500
@@ -0,0 +1,143 @@
+#!/bin/env python
+
+import argparse
+import csv
+import logging
+import os
+import re
+import sys
+import time
+import urllib.request
+
+def parse_mash_result(path_to_mash_screen):
+    """
+    Read a mash screen report and convert each record to a dict, parsing
+    the numeric columns (identity, median_multiplicity, p_value).
+
+    Args:
+        path_to_mash_screen (str): Path to the mash screen report file.
+    Returns:
+        list(dict): One dict per report line, in file order.
+        For example:
+        [
+          { "identity": 0.996805,
+            "shared_hashes": "935/1000",
+            "median_multiplicity": 38,
+            "p_value": 0.00,
+            "query_id": "GCF_001022155.1_ASM102215v1_genomic.fna.gz",
+            "query_comment": "[10 seqs] NZ_CP011612.1 Citrobacter freundii strain CAV1321, complete genome [...]"
+          },
+          { "identity": 0.914483,
+            ...
+          }
+        ]
+        See mash docs for more info on mash screen report file:
+        https://mash.readthedocs.io/en/latest/tutorials.html#screening-a-read-set-for-containment-of-refseq-genomes
+    """
+
+    # The report has no header row and is tab-delimited; an example record:
+    # identity    shared_hashes    median_multiplicity    p_value    query_id                                    query_comment
+    # 0.998697    973/1000         71                     0          GCF_000958965.1_matepair4_genomic.fna.gz    [59 seqs] NZ_LAFU01000001.1 Klebsiella pneumoniae strain CDPH5262 contig000001, whole genome shotgun sequence [...]
+
+    # (column name, converter) pairs, in the order the columns appear.
+    columns = (
+        ('identity', float),
+        ('shared_hashes', lambda text: text),
+        ('median_multiplicity', int),
+        ('p_value', float),
+        ('query_id', lambda text: text),
+        ('query_comment', lambda text: text),
+    )
+    names = [name for name, _ in columns]
+
+    with open(path_to_mash_screen) as report:
+        records = csv.DictReader(report, delimiter='\t', fieldnames=names)
+        return [{name: convert(record[name]) for name, convert in columns}
+                for record in records]
+
+def mash_query_id_to_ncbi_ftp_path(query_id):
+    """
+    Build the directory part of an NCBI genomes FTP path from a mash query ID.
+
+    Args:
+        query_id (str): Mash query ID (column 5 of mash screen report)
+    Returns:
+        str: "/"-joined directory names used to locate the reference genome
+             on ftp://ftp.ncbi.nlm.nih.gov/genomes/all/
+    For example:
+        "GCF_001022155.1_ASM102215v1_genomic.fna.gz" -> "GCF/001/022/155"
+    """
+    fields = query_id.split('_')
+    prefix, digits = fields[0], fields[1].split('.')[0]
+    return "/".join([prefix] + [digits[i:i + 3] for i in range(0, len(digits), 3)])
+
+def _download(url, destination, log):
+    """Fetch url into destination; log instead of raising on failure.
+
+    Returns True when the file was retrieved, False otherwise.
+    """
+    # NCBI API is rate-limited to 3 requests per second, so pause between requests
+    time.sleep(1)
+    try:
+        urllib.request.urlretrieve(url, destination)
+    except urllib.error.URLError as e:
+        # URLError is the parent of HTTPError and is also what failed ftp://
+        # transfers raise, so both download routes are covered here.
+        log.error("Download Failed: " + str(e))
+        return False
+    log.info("Downloaded: " + url)
+    return True
+
+
+def main(args):
+    """
+    Download the reference sequence named by each record of a mash screen report.
+
+    Query IDs of the form "ref|ACCESSION|..." are fetched as FASTA through NCBI
+    efetch; IDs starting with "GCF_" are fetched from the NCBI genomes FTP site,
+    optionally together with their assembly_stats.txt file. Files are written
+    to the current working directory. Failed downloads and unrecognized IDs
+    are logged and skipped, never raised.
+
+    Args:
+        args: Parsed command-line arguments; reads input_file (str) and
+              download_assembly_stats (bool).
+    """
+    logging.basicConfig()
+    log = logging.getLogger('mash_screen_refseq_download')
+    log.setLevel(logging.INFO)
+    genome_suffix = "_genomic.fna.gz"
+    ftp_base = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all"
+
+    for mash_result in parse_mash_result(args.input_file):
+        query_id = mash_result['query_id']
+        # Raw string: "\|" in a plain literal is an invalid escape sequence.
+        ref_match = re.match(r'ref\|(.*)\|', query_id)
+        if ref_match:
+            accession = ref_match.group(1)
+            url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" + \
+                "&".join(["db=nucleotide", "id=" + accession, "rettype=fasta"])
+            _download(url, accession + ".fasta", log)
+        elif query_id.startswith("GCF_"):
+            # split() keeps the whole ID when the suffix is absent, whereas
+            # slicing with str.find() == -1 would silently drop a character.
+            assembly = query_id.split(genome_suffix)[0]
+            remote_dir = "/".join([ftp_base, mash_query_id_to_ncbi_ftp_path(query_id), assembly])
+            _download(remote_dir + "/" + query_id, query_id, log)
+            if args.download_assembly_stats:
+                stats_file = assembly + "_assembly_stats.txt"
+                _download(remote_dir + "/" + stats_file, stats_file, log)
+        else:
+            log.error("query ID \"" + mash_result['query_id'] + "\" not recognized.")
+
+    
+if __name__ == '__main__':
+    # Name the program after the file actually executed (symlinks resolved).
+    prog_name = os.path.basename(os.path.realpath(sys.argv[0]))
+    cli = argparse.ArgumentParser(prog=prog_name, description='')
+    cli.add_argument("-i", "--input", dest="input_file", required=True,
+                     help="Mash screen report file")
+    cli.add_argument("-a", "--assembly_stats", dest="download_assembly_stats",
+                     action='store_true', required=False,
+                     help="For genome files, also download assembly_stats.txt file,")
+    main(cli.parse_args())
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mob_typer_refseq_download.xml	Tue Nov 05 16:46:20 2019 -0500
@@ -0,0 +1,37 @@
+<tool id="mob_typer_refseq_download" name="MOB-Typer RefSeq download" version="0.1.0">
+    <description>Parses MOB-Typer report and downloads reference files from refseq.</description>
+    <requirements>
+        <requirement type="package" version="11.0">ncbi-acc-download</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        mkdir -p outdir && cd outdir &&
+        python '$__tool_directory__/mob_typer_refseq_download.py'
+             -i '$input'
+    ]]></command> <!-- the script has no output-directory option and writes into its working directory, hence the cd into outdir -->
+    <inputs>
+      <param name="input" format="tabular" type="data" />
+    </inputs>
+    <outputs>
+      <collection name="downloaded_references" type="list">
+	<discover_datasets pattern="__name_and_ext__" directory="outdir" />
+      </collection>
+    </outputs>
+    <tests>
+      <test>
+	<param name="input" value="input/mash_screen_genome_single.tabular" />
+	<output_collection name="downloaded_references" type="list">
+	  <element name="GCF_001601135.1_ASM160113v1_genomic" file="output/GCF_001601135.1_ASM160113v1_genomic.fna.gz" />
+	</output_collection>
+      </test>
+      <test>
+	<param name="input" value="input/mash_screen_plasmid_single.tabular" />
+	<output_collection name="downloaded_references" type="list" >
+	  <element name="NZ_CP019139.1" file="output/NZ_CP019139.1.fasta" />
+	</output_collection>
+      </test>
+    </tests>
+    <help><![CDATA[
+    ]]></help>
+    <citations>
+    </citations>
+</tool>