Mercurial > repos > pimarin > recentrifuge

<?xml version="1.0" encoding="UTF-8"?>

<tool id="recentrifuge" name="Recentrifuge" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
  <description>Robust comparative analysis and contamination removal for metagenomics</description>
  <!-- macro.xml is imported here; the xrefs, requirements, version_command and
       citations macros expanded in this file are presumably defined there
       (macro.xml is not visible from this file). -->
  <macros>
    <import>macro.xml</import>
  </macros>
  <expand macro="xrefs"/>
  <expand macro="requirements"/>
  <expand macro="version_command"/>
  <command detect_errors="aggressive"><![CDATA[
    #import re
    ## ---- Taxonomy database (-n) ----
    ## cached : use the path column of the selected rcf_database data table entry.
    ## history: link every selected dataset into ./rcf_db under its element
    ##          identifier (presumably so rcf can find nodes.dmp / names.dmp by name).
    #if $database.db_type.db_select == "cached"
      #set $rcf_db = $database.db_type.cached_db.fields.path
    #else if $database.db_type.db_select == "history"
      mkdir rcf_db &&
      #for $db_file in $database.db_type.history_db
        ## Element identifiers are user-controlled text: keep only characters
        ## that are harmless inside a quoted shell word.
        #set $link_name = re.sub('[^A-Za-z0-9_.-]', '_', str($db_file.element_identifier))
        ln -s '$db_file' 'rcf_db/$link_name' &&
      #end for
      #set $rcf_db = "rcf_db"
    #end if
    rcf
    -n '$rcf_db'
    ## ---- Input: one classifier-specific flag, chosen by the filetype select ----
    #if $input_option.file_type.filetype == "centrifuge"
      -f '$input_option.input_file'
    #else if $input_option.file_type.filetype == "lmat"
      -l '$input_option.input_file'
    #else if $input_option.file_type.filetype == "clark"
      -r '$input_option.input_file'
    #else if $input_option.file_type.filetype == "kraken"
      -k '$input_option.input_file'
    #else if $input_option.file_type.filetype == "generic"
      ## The generic classifier additionally needs the column layout.
      -g '$input_option.input_file'
      --format '$input_option.file_type.format'
    #end if
    ## ---- Output: fixed prefix "output", matched by from_work_dir in the outputs ----
    -e '$output_option.extra'
    -o output
    $output_option.pickle
    $output_option.nohtml
    ## ---- Coarse tuning: each option is only emitted when the user opted in ----
    #if $advanced_option.control_select.controls_type == "add_neg"
      --controls '$advanced_option.control_select.controls'
    #end if
    #if $advanced_option.scoring != "DEFAULT"
      --scoring '$advanced_option.scoring'
    #end if
    #if $advanced_option.minscore_select.minscore == "specify_minscore"
      --minscore '$advanced_option.minscore_select.minscore_value'
    #end if
    #if $advanced_option.mintaxa_type.mintaxa_select == "specify_mintaxa"
      --mintaxa '$advanced_option.mintaxa_type.mintaxa'
    #end if
    #if $advanced_option.exclude_taxa_type.exclude_taxa_select == "yes_exclude"
      --exclude '$advanced_option.exclude_taxa_type.exclude_taxa_name'
    #end if
    #if $advanced_option.include_taxa_type.include_taxa_select == "yes_include"
      --include '$advanced_option.include_taxa_type.include_taxa_name'
    #end if
    $advanced_option.avoidcross
    ## ---- Fine tuning ----
    #if $more_advanced_option.minscore_type.minscore_select == "specify_minscore"
      --ctrlminscore '$more_advanced_option.minscore_type.ctrlminscore'
    #end if
    #if $more_advanced_option.ctrlmintaxa_type.ctrlmintaxa_select == "specify_ctrlmintaxa"
      --ctrlmintaxa '$more_advanced_option.ctrlmintaxa_type.ctrlmintaxa'
    #end if
    --summary '$more_advanced_option.summary'
    ## Boolean params expand to their flag when checked and to nothing otherwise.
    $more_advanced_option.takeoutroot
    $more_advanced_option.nokollapse
    $more_advanced_option.strain
    $more_advanced_option.sequential
    $more_advanced_option.debug
    $more_advanced_option.version
    ## ---- Log: stdout and stderr both go to the log dataset (portable form of "&>") ----
    > '$logfile' 2>&1
  ]]></command>
  <inputs>
    <!-- INPUT FILES -->
    <section name="input_option" title="Input options" expanded="true">
      <!-- Classifier result to analyse; "filetype" decides which rcf input flag receives it. -->
      <param name="input_file" type="data" format="tabular" label="Select taxonomy file tabular formatted"/>
      <conditional name="file_type">
        <param name="filetype" type="select" label="Type of input file (centrifuge, CLARK, Generic, kraken, LMAT)" help="(-f, -r, -g, -k, -l)">
          <option value="centrifuge">Centrifuge</option>
          <option value="clark">CLARK</option>
          <option value="generic">Generic</option>
          <option value="lmat">LMAT</option>
          <option value="kraken">Kraken</option>
        </param>
        <when value="centrifuge"/>
        <when value="lmat"/>
        <when value="clark"/>
        <when value="kraken"/>
        <when value="generic">
          <!-- Only the generic classifier needs a column layout; rcf requires
               the format flag together with -g, so an empty value is rejected. -->
          <param argument="--format" type="text"
                 label="Format of the output files from a generic classifier"
                 help="String like 'TYP:csv,TID:1,LEN:3,SCO:6,UNC:0' where valid file TYPes are csv/tsv/ssv, and the rest of the fields indicate the number of the column used (starting in 1) for the TaxIDs assigned, the LENgth of the read, the SCOre given to the assignment, and the taxid code used for UNClassified reads (--format)">
            <validator type="empty_field" message="A format string is required for generic classifier input"/>
          </param>
        </when>
      </conditional>
    </section>
    <!-- taxa databases -->
    <section name="database" title="Database type" expanded="true">
      <!-- The chosen location is handed to rcf through -n, which expects the
           NCBI taxonomy node files (nodes.dmp and names.dmp). -->
      <conditional name="db_type">
        <param name="db_select" type="select" label="Source of the NCBI taxonomy database (nodes.dmp and names.dmp)">
          <option value="cached" selected="true">Locally installed</option>
          <option value="history">From history</option>
        </param>
        <when value="cached">
          <!-- Entries come from the rcf_database data table; the command reads their "path" field. -->
          <param name="cached_db" type="select" label="Cached database with taxa ID">
            <options from_data_table="rcf_database">
              <validator message="No recentrifuge database is available" type="no_options"/>
            </options>
          </param>
        </when>
        <when value="history">
          <!-- Every selected dataset is linked into one working directory by the command. -->
          <param name="history_db" type="data" multiple="true" format="txt" label="Database from history"/>
        </when>
      </conditional>
    </section>
    <!-- output options -->
    <section name="output_option" title="Output options">
      <!-- The output filters of this tool key on the value selected here. -->
      <param argument="--extra"
             type="select"
             label="Type of extra output to be generated (default on CSV)"
             help="(--extra)">
        <option value="CSV" selected="true">CSV</option>
        <option value="DYNOMICS">DYNOMICS</option>
        <option value="FULL">FULL</option>
        <option value="MULTICSV">MULTICSV</option>
        <option value="TSV">TSV</option>
      </param>
      <!-- Flags: emitted verbatim when checked, empty otherwise. -->
      <param argument="--pickle"
             type="boolean"
             truevalue="--pickle"
             falsevalue=""
             label="Serialize statistics and data results in pandas DataFrames"
             help="(--pickle)"/>
      <param argument="--nohtml"
             type="boolean"
             truevalue="--nohtml"
             falsevalue=""
             label="Suppress saving the HTML output file"
             help="(--nohtml)"/>
    </section>
    <!-- ADVANCED OPTIONS -->
    <section name="advanced_option" title="Coarse tuning of algorithm parameters">
      <!-- Each conditional below only adds its option to the command line
           when the non-default branch is selected. -->
      <conditional name="control_select">
        <param name="controls_type" type="select" label="Number of first samples will be treated as negative controls (default is 0)" help="(--controls)">
          <option value="default">No control</option>
          <option value="add_neg">Add negative controls </option>
        </param>
        <when value="default"/>
        <when value="add_neg">
          <param name="controls" type="integer" min="0" value="0" label="Number of samples"/>
        </when>
      </conditional>
      <!-- "DEFAULT" means the scoring option is left off the command line. -->
      <param name="scoring" type="select" label="Type of scoring to be applied" help="(--scoring)">
        <option value="DEFAULT" selected="true">Default scoring</option>
        <option value="SHEL">SHEL</option>
        <option value="LENGTH">LENGTH</option>
        <option value="LOGLENGTH">LOGLENGTH</option>
        <option value="NORMA">NORMA</option>
        <option value="LMAT">LMAT</option>
        <option value="CLARK_C">CLARK_C</option>
        <option value="CLARK_G">CLARK_G</option>
        <option value="KRAKEN">KRAKEN</option>
        <option value="GENERIC">GENERIC</option>
      </param>
      <conditional name="minscore_select">
        <param name="minscore" type="select" label="minimum score/confidence of the classification of a read to pass the quality filter; all pass by default" help="(--minscore)">
          <option value="default" selected="true">Default all pass</option>
          <option value="specify_minscore">Specify value</option>
        </param>
        <when value="default"/>
        <when value="specify_minscore">
          <param name="minscore_value" type="integer" min="0" value="0" label="minimum score/confidence value"/>
        </when>
      </conditional>
      <conditional name="mintaxa_type">
        <param name="mintaxa_select" type="select" label="Minimum taxa to avoid collapsing one level into the parent (if not specified a value will be automatically assigned)" help="(--mintaxa)">
          <option value="default" selected="true">Automatically assigned</option>
          <option value="specify_mintaxa">Choose value</option>
        </param>
        <when value="default"/>
        <when value="specify_mintaxa">
          <param name="mintaxa" type="integer" min="0" value="0" label="Minimum taxa number"/>
        </when>
      </conditional>
      <conditional name="exclude_taxa_type">
        <param name="exclude_taxa_select" type="select" label="NCBI taxid code to exclude a taxon and all underneath (default, no exclude)" help="(--exclude)">
          <option value="no_exclude">No exclusion</option>
          <option value="yes_exclude">Specify excluded taxa</option>
        </param>
        <when value="no_exclude"/>
        <when value="yes_exclude">
          <param name="exclude_taxa_name" type="text" label="NCBI taxid code to exclude"/>
        </when>
      </conditional>
      <conditional name="include_taxa_type">
        <param name="include_taxa_select" type="select" label="NCBI taxid code to include a taxon and all underneath" help="(--include)">
          <option value="no_include">Default no taxa include</option>
          <option value="yes_include">Specify included taxa</option>
        </param>
        <when value="no_include"/>
        <when value="yes_include">
          <param name="include_taxa_name" type="text" label="NCBI taxid code to include"/>
        </when>
      </conditional>
      <param argument="--avoidcross" type="boolean" truevalue="--avoidcross" falsevalue="" label="Avoid cross analysis" help="(--avoidcross)"/>
    </section>
    <!-- Fine-tuning parameters -->
    <section name="more_advanced_option" title=" Fine tuning of algorithm parameters">
      <!-- Control-sample thresholds: only passed to rcf when "specify" is chosen.
           The lower bound of 0 mirrors minscore_value and mintaxa in the
           coarse tuning section. -->
      <conditional name="minscore_type">
        <param name="minscore_select" type="select" label="minimum score/confidence of the classification of a read in control samples to pass the quality filter; it defaults to minscore" help="(--ctrlminscore)">
          <option value="default_minscore">Default minscore</option>
          <option value="specify_minscore">Specify minscore</option>
        </param>
        <when value="default_minscore"/>
        <when value="specify_minscore">
          <param name="ctrlminscore" type="integer" min="0" value="0" label="minimum score/confidence"/>
        </when>
      </conditional>
      <conditional name="ctrlmintaxa_type">
        <param name="ctrlmintaxa_select" type="select" label="Minimum taxa to avoid collapsing one level into the parent" help="(--ctrlmintaxa)">
          <option value="default_ctrlmintaxa">Default value</option>
          <option value="specify_ctrlmintaxa">Specify minimum taxa number</option>
        </param>
        <when value="default_ctrlmintaxa"/>
        <when value="specify_ctrlmintaxa">
          <param name="ctrlmintaxa" type="integer" min="0" value="0" label="Minimum taxa number"/>
        </when>
      </conditional>
      <!-- Always passed to rcf as the summary option. -->
      <param name="summary" type="select" label="select to 'add' summary samples to other samples, or to 'only' show summary samples or to 'avoid' summaries at all" help="(--summary)">
        <option value="ADD" selected="true">ADD</option>
        <option value="ONLY">ONLY</option>
        <option value="AVOID">AVOID</option>
      </param>
      <!-- Plain flags: each expands to its option when checked, to nothing otherwise. -->
      <param argument="--takeoutroot" type="boolean" truevalue="--takeoutroot" falsevalue="" label="remove counts directly assigned to the root level" help="(--takeoutroot)"/>
      <param argument="--nokollapse"  type="boolean" truevalue="--nokollapse" falsevalue=""  label="show the cellular organisms taxon" help="(--nokollapse)"/>
      <param argument="--strain"      type="boolean" truevalue="--strain" falsevalue=""      label="Strain level instead of species as the resolution limit for the robust contamination removal algorithm; use with caution, this is an experimental feature" help="(--strain)" />
      <param argument="--sequential"  type="boolean" truevalue="--sequential" falsevalue=""  label="deactivate parallel processing" help="(--sequential)" />
      <param argument="--debug"       type="boolean" truevalue="--debug" falsevalue=""       label="increase output verbosity and perform additional checks" help="(--debug)" />
      <param argument="--version"     type="boolean" truevalue="--version" falsevalue=""     label=" show program's version number and exit" help="(--version)" />
    </section>
  </inputs>
  <!-- OUTPUT FILE, TYPE DEPENDING ON extra PARAMETER -->
  <outputs>
    <!-- All files are collected from the working directory, where rcf writes
         them under the fixed prefix "output". -->

    <!-- HTML report: present unless nohtml is checked. -->
    <data name="html_report" format="html" from_work_dir="output.rcf.html" label="${tool.name} on ${on_string}: html report">
      <filter>not output_option['nohtml']</filter>
    </data>
    <!-- Log: always present; the command redirects its output into it. -->
    <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file"/>

    <!-- Extra outputs, selected by the "extra" option. -->
    <data name="data_csv" format="csv" from_work_dir="output.rcf.data.csv" label="${tool.name} on ${on_string}: data.csv">
      <filter>output_option['extra'] == 'CSV'</filter>
    </data>
    <data name="stat_csv" format="csv" from_work_dir="output.rcf.stat.csv" label="${tool.name} on ${on_string}: stat csv">
      <filter>output_option['extra'] in ('CSV', 'MULTICSV')</filter>
    </data>
    <data name="data_tsv" format="tabular" from_work_dir="output.rcf.data.tsv" label="${tool.name} on ${on_string}: data tsv">
      <filter>output_option['extra'] == 'TSV'</filter>
    </data>
    <data name="stat_tsv" format="tabular" from_work_dir="output.rcf.stat.tsv" label="${tool.name} on ${on_string}: stat tsv">
      <filter>output_option['extra'] == 'TSV'</filter>
    </data>
    <data name="xls_report" format="xlsx" from_work_dir="output.rcf.xlsx" label="${tool.name} on ${on_string}: xlsx report">
      <filter>output_option['extra'] in ('FULL', 'DYNOMICS')</filter>
    </data>

    <!-- Pickled DataFrames: only when pickle is checked. -->
    <data name="stat_bz" format="bz2" from_work_dir="output.rcf.stat.pkl.bz2" label="${tool.name} on ${on_string}: stat.pkl.bz2">
      <filter>output_option['pickle']</filter>
    </data>
    <data name="data_bz" format="bz2" from_work_dir="output.rcf.data.pkl.bz2" label="${tool.name} on ${on_string}: data.pkl.bz2">
      <filter>output_option['pickle']</filter>
    </data>
  </outputs>
  <tests>
    <test> <!-- kraken input and CSV output TEST_1-->
      <!-- Conditional and selector names must match the inputs (db_type / db_select). -->
      <section name="database">
        <conditional name="db_type">
          <param name="db_select" value="cached"/>
          <param name="cached_db" value="test-db-2022"/>
        </conditional>
      </section>
      <section name="input_option">
        <param name="input_file" value="kraken_test/kraken.out"/>
        <conditional name="file_type">
          <param name="filetype" value="kraken"/>
        </conditional>
      </section>
      <!-- CSV is the default of "extra"; stated explicitly because the
           expected outputs below are the CSV ones. -->
      <section name="output_option">
        <param name="extra" value="CSV"/>
      </section>
      <section name="more_advanced_option">
        <param name="summary" value="AVOID"/>
      </section>
      <output name="data_csv" file="kraken_test/test1_csv.rcf.data.csv" lines_diff="2"/>
      <output name="stat_csv" file="kraken_test/test1_csv.rcf.stat.csv" lines_diff="2"/>
      <output name="html_report" file="kraken_test/test1_csv.rcf.html" lines_diff="2"/>
      <output name="logfile" file="kraken_test/test1_csv.log" lines_diff="7"/>
    </test>
    <test> <!-- centrifuge input and full options with imported database TEST_2 -->
      <!-- Conditional and selector names must match the inputs (db_type / db_select),
           otherwise the history branch is never selected. -->
      <section name="database">
        <conditional name="db_type">
          <param name="db_select" value="history"/>
          <param name="history_db" value="test-db/delnodes.dmp,test-db/division.dmp,test-db/gc.prt,test-db/gencode.dmp,test-db/merged.dmp,test-db/names.dmp,test-db/nodes.dmp,test-db/readme.txt"/>
        </conditional>
      </section>
      <section name="input_option">
        <param name="input_file" value="centrifuge_test/centrifuge.out"/>
        <conditional name="file_type">
          <param name="filetype" value="centrifuge"/>
        </conditional>
      </section>
      <section name="output_option">
        <param name="extra" value="MULTICSV"/>
        <param name="pickle" value="false"/>
        <param name="nohtml" value="true"/>
      </section>
      <section name="advanced_option">
        <conditional name="control_select">
          <param name="controls_type" value="add_neg"/>
          <param name="controls" value="0"/>
        </conditional>
        <param name="scoring" value="NORMA"/>
        <conditional name="minscore_select">
          <param name="minscore" value="specify_minscore"/>
          <param name="minscore_value" value="0"/>
        </conditional>
        <param name="avoidcross" value="true"/>
      </section>
      <section name="more_advanced_option">
        <conditional name="minscore_type">
          <param name="minscore_select" value="specify_minscore"/>
          <param name="ctrlminscore" value="0"/>
        </conditional>
        <param name="summary" value="AVOID"/>
      </section>
      <output name="stat_csv" file="centrifuge_test/test2_multicsv.rcf.stat.csv" lines_diff="2"/>
      <output name="logfile" file="centrifuge_test/test2_multicsv.log" lines_diff="20"/>
    </test>
    <test> <!-- TEST_3: kraken input, cached DB, TSV output, extra scoring/summary/strain options (pickle off, HTML suppressed) -->
      <section name="database">
        <conditional name="db_type">
          <param name="db_select" value="cached"/>
          <param name="cached_db" value="test-db-2022"/>
        </conditional>
      </section>
      <section name="input_option">
        <param name="input_file" value="kraken_test/kraken.out"/>
        <conditional name="file_type">
          <param name="filetype" value="kraken"/>
        </conditional>
      </section>
      <section name="output_option">
        <param name="extra" value="TSV"/>
        <param name="pickle" value="false"/>
        <param name="nohtml" value="true"/>
      </section>
      <section name="advanced_option">
        <param name="scoring" value="LOGLENGTH"/>
      </section>
      <section name="more_advanced_option">
        <param name="summary" value="ONLY"/>
        <param name="strain" value="true"/>
      </section>
      <output name="data_tsv" file="kraken_test/test3_rcf.data.tsv" lines_diff="2"/>
      <output name="stat_tsv" file="kraken_test/test3_rcf.stat.tsv" lines_diff="2"/>
      <output name="logfile" file="kraken_test/test3_tsv.log" lines_diff="20"/>
    </test>
  </tests>
  <help><![CDATA[
usage: rcf [-h] [-V] [-n PATH] [--format GENERIC_FORMAT]
         (-f FILE | -g FILE | -l FILE | -r FILE | -k FILE) [-o FILE]
         [-e OUTPUT_TYPE] [-p] [--nohtml] [-a | -c CONTROLS_NUMBER]
         [-s SCORING] [-y NUMBER] [-m INT] [-x TAXID] [-i TAXID] [-z NUMBER]
         [-w INT] [-u SUMMARY_BEHAVIOR] [-t] [--nokollapse] [-d] [--strain]
         [--sequential]

Robust comparative analysis and contamination removal for metagenomics

options:
-h, --help            show this help message and exit
-V, --version         show program's version number and exit

input:
Define Recentrifuge input files and formats

-n PATH, --nodespath PATH
                      path for the nodes information files (nodes.dmp and
                      names.dmp from NCBI)
--format GENERIC_FORMAT
                      format of the output files from a generic classifier
                      included with the option -g; It is a string like
                      "TYP:csv,TID:1,LEN:3,SCO:6,UNC:0" where valid file
                      TYPes are csv/tsv/ssv, and the rest of fields indicate
                      the number of column used (starting in 1) for the
                      TaxIDs assigned, the LENgth of the read, the SCOre
                      given to the assignment, and the taxid code used for
                      UNClassified reads
-f FILE, --file FILE  Centrifuge output files; if a single directory is
                      entered, every .out file inside will be taken as a
                      different sample; multiple -f is available to include
                      several Centrifuge samples
-g FILE, --generic FILE
                      output file from a generic classifier; it requires the
                      flag --format (see such option for details); multiple
                      -g is available to include several generic samples
-l FILE, --lmat FILE  LMAT output dir or file prefix; if just "." is
                      entered, every subdirectory under the current
                      directory will be taken as a sample and scanned
                      looking for LMAT output files; multiple -l is
                      available to include several samples
-r FILE, --clark FILE
                      CLARK full-mode output files; if a single directory is
                      entered, every .csv file inside will be taken as a
                      different sample; multiple -r is available to include
                      several CLARK, CLARK-l, and CLARK-S full-mode samples
-k FILE, --kraken FILE
                      Kraken output files; if a single directory is entered,
                      every .krk file inside will be taken as a different
                      sample; multiple -k is available to include several
                      Kraken (version 1 or 2) samples

output:
Related to the Recentrifuge output files

-o FILE, --outprefix FILE
                      output prefix; if not given, it will be inferred from
                      input files; an HTML filename is still accepted for
                      backwards compatibility with legacy --outhtml option
-e OUTPUT_TYPE, --extra OUTPUT_TYPE
                      type of extra output to be generated, and can be one
                      of ['FULL', 'CSV', 'MULTICSV', 'TSV', 'DYNOMICS']
-p, --pickle          pickle (serialize) statistics and data results in
                      pandas DataFrames (format affected by selection of
                      --extra)
--nohtml              suppress saving the HTML output file

tuning:
Coarse tuning of algorithm parameters

-a, --avoidcross      avoid cross analysis
-c CONTROLS_NUMBER, --controls CONTROLS_NUMBER
                      this number of first samples will be treated as
                      negative controls; default is no controls
-s SCORING, --scoring SCORING
                      type of scoring to be applied, and can be one of
                      ['SHEL', 'LENGTH', 'LOGLENGTH', 'NORMA', 'LMAT',
                      'CLARK_C', 'CLARK_G', 'KRAKEN', 'GENERIC']
-y NUMBER, --minscore NUMBER
                      minimum score/confidence of the classification of a
                      read to pass the quality filter; all pass by default
-m INT, --mintaxa INT
                      minimum taxa to avoid collapsing one level into the
                      parent (if not specified a value will be automatically
                      assigned)
-x TAXID, --exclude TAXID
                      NCBI taxid code to exclude a taxon and all underneath
                      (multiple -x is available to exclude several taxid)
-i TAXID, --include TAXID
                      NCBI taxid code to include a taxon and all underneath
                      (multiple -i is available to include several taxid);
                      by default, all the taxa are considered for inclusion

fine tuning:
Fine tuning of algorithm parameters

-z NUMBER, --ctrlminscore NUMBER
                      minimum score/confidence of the classification of a
                      read in control samples to pass the quality filter; it
                      defaults to "minscore"
-w INT, --ctrlmintaxa INT
                      minimum taxa to avoid collapsing one level into the
                      parent (if not specified a value will be automatically
                      assigned)
-u SUMMARY_BEHAVIOR, --summary SUMMARY_BEHAVIOR
                      choice for summary behaviour, and can be one of
                      ['ADD', 'ONLY', 'AVOID']
-t, --takeoutroot     remove counts directly assigned to the "root" level
--nokollapse          show the "cellular organisms" taxon

advanced:
Advanced modes of running

-d, --debug           increase output verbosity and perform additional
                      checks
--strain              set strain level instead of species as the resolution
                      limit for the robust contamination removal algorithm;
                      use with caution, this is an experimental feature
--sequential          deactivate parallel processing

rcf - Release 1.8.1 - Mar 2022

  Copyright (C) 2017–2022, Jose Manuel Martí Martínez

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU Affero General Public License as
  published by the Free Software Foundation, either version 3 of the
  License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU Affero General Public License for more details.

  You should have received a copy of the GNU Affero General Public License
  along with this program.  If not, see <https://www.gnu.org/licenses/>.


  ]]></help>
  <expand macro="citations"/>
</tool>
author	pimarin
date	Wed, 06 Apr 2022 14:54:52 +0000
parents	b135c5908e8c
children	512dc05a0e5a