Mercurial > repos > pimarin > recentrifuge

<?xml version="1.0" encoding="UTF-8"?>

<tool id="recentrifuge" name="Recentrifuge" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
  <description>Robust comparative analysis and contamination removal for metagenomics</description>
  <!-- The tokens used on the tool element and the macros expanded below are presumably defined in macro.xml; verify there -->
  <macros>
    <import>macro.xml</import>
  </macros>
  <expand macro="xrefs"/>
  <expand macro="requirements"/>
  <expand macro="version_command"/>
  <command detect_errors="aggressive"><![CDATA[
    ## Builds the rcf command line. Everything rcf prints (stdout and stderr)
    ## is redirected into the "logfile" output dataset at the end.
    ## ---- taxonomy database ----
    ## "cached": use the path column of the selected data table entry.
    ## "history": link every selected dataset into ./rcf_db under its
    ## element identifier and point rcf at that directory.
    #if $database.db_type.db_select == "cached"
      #set $rcf_db = $database.db_type.cached_db.fields.path
    #else if $database.db_type.db_select == "history"
      mkdir rcf_db &&
      #for $db_file in $database.db_type.history_db
        ln -s '$db_file' 'rcf_db/$db_file.element_identifier' &&
      #end for
      #set $rcf_db = "rcf_db"
    #end if
    rcf
    ## quoted so that a database path containing spaces stays a single argument
    -n '$rcf_db'
    ## ---- input type: one flag per supported classifier ----
    #if $input_option.file_type.filetype == "centrifuge"
      -f '$input_option.input_file'
    #else if $input_option.file_type.filetype == "lmat"
      -l '$input_option.input_file'
    #else if $input_option.file_type.filetype == "clark"
      -r '$input_option.input_file'
    #else if $input_option.file_type.filetype == "kraken"
      -k '$input_option.input_file'
    #else if $input_option.file_type.filetype == "generic"
      -g '$input_option.input_file'
      --format '$input_option.file_type.format'
    #end if
    ## ---- output options ----
    ## the fixed prefix "output" is what the from_work_dir attributes of the outputs rely on
    -e '$output_option.extra'
    -o output
    $output_option.pickle
    $output_option.nohtml
    ## ---- coarse tuning ----
    #if $advanced_option.control_select.controls_type == "add_neg"
      --controls '$advanced_option.control_select.controls'
    #end if
    #if $advanced_option.scoring != "DEFAULT"
      --scoring '$advanced_option.scoring'
    #end if
    #if $advanced_option.minscore_select.minscore == "specify_minscore"
      --minscore '$advanced_option.minscore_select.minscore_value'
    #end if
    #if $advanced_option.mintaxa_type.mintaxa_select == "specify_mintaxa"
      --mintaxa '$advanced_option.mintaxa_type.mintaxa'
    #end if
    #if $advanced_option.exclude_taxa_type.exclude_taxa_select == "yes_exclude"
      --exclude '$advanced_option.exclude_taxa_type.exclude_taxa_name'
    #end if
    #if $advanced_option.include_taxa_type.include_taxa_select == "yes_include"
      --include '$advanced_option.include_taxa_type.include_taxa_name'
    #end if
    $advanced_option.avoidcross
    ## ---- fine tuning ----
    #if $more_advanced_option.minscore_type.minscore_select == "specify_minscore"
      --ctrlminscore '$more_advanced_option.minscore_type.ctrlminscore'
    #end if
    #if $more_advanced_option.ctrlmintaxa_type.ctrlmintaxa_select == "specify_ctrlmintaxa"
      --ctrlmintaxa '$more_advanced_option.ctrlmintaxa_type.ctrlmintaxa'
    #end if
    --summary '$more_advanced_option.summary'
    $more_advanced_option.takeoutroot
    $more_advanced_option.nokollapse
    $more_advanced_option.strain
    $more_advanced_option.sequential
    $more_advanced_option.debug
    ## NOTE(review): per the rcf help, --version prints the version and exits,
    ## so the result files are presumably not produced when it is enabled
    $more_advanced_option.version
    ## ---- log file ----
    ## POSIX redirection (instead of the bash-only "&>") of stdout and stderr
    > '$logfile' 2>&1
  ]]></command>
  <inputs>
    <!-- Classifier output to analyse and the kind of classifier that produced it -->
    <section name="input_option" title="Input options" expanded="true">
      <param name="input_file" type="data" format="tabular" label="Select taxonomy file tabular formatted"/>
      <conditional name="file_type">
        <param name="filetype" type="select" label="Type of input file (centrifuge, CLARK, Generic, kraken, LMAT)" help="(-f, -r, -g, -k, -l)">
          <option value="centrifuge">Centrifuge</option>
          <option value="clark">CLARK</option>
          <option value="generic">Generic</option>
          <option value="lmat">LMAT</option>
          <option value="kraken">Kraken</option>
        </param>
        <when value="centrifuge"/>
        <when value="lmat"/>
        <when value="clark"/>
        <when value="kraken"/>
        <when value="generic">
          <!-- The value is placed on the command line as typed, so an empty string is rejected up front -->
          <param argument="--format" type="text" label="Format of the output files from a generic classifier" help="string like 'TYP:csv,TID:1,LEN:3,SCO:6,UNC:0' where valid file TYPes are csv/tsv/ssv, and the rest of fields indicate the number of column used (starting in 1) for the TaxIDs assigned, the LENgth of the read, the SCOre given to the assignment, and the taxid code used for UNClassified reads (--format)">
            <validator type="empty_field" message="A format string is required for generic classifier input"/>
          </param>
        </when>
      </conditional>
    </section>
    <!-- Taxonomy database: either an entry of the rcf_database data table or datasets from the history -->
    <section name="database" title="Database type" expanded="true">
      <conditional name="db_type">
        <param name="db_select" type="select" label="Source of the taxonomy database (NCBI nodes.dmp and names.dmp)">
          <option value="cached" selected="true">Locally installed</option>
          <option value="history">From history</option>
        </param>
        <when value="cached">
          <param name="cached_db" type="select" label="Cached database with taxa ID">
            <options from_data_table="rcf_database">
              <validator message="No recentrifuge database is available" type="no_options"/>
            </options>
          </param>
        </when>
        <when value="history">
          <param name="history_db" type="data" multiple="true" format="txt" label="Database from history"/>
        </when>
      </conditional>
    </section>
    <!-- Output selection; these values also drive the filters of the output datasets -->
    <section name="output_option" title="Output options">
      <param argument="--extra" type="select" label="Type of extra output to be generated (default on CSV)" help="(--extra)">
        <option value="CSV" selected="true">CSV</option>
        <option value="DYNOMICS">DYNOMICS</option>
        <option value="FULL">FULL</option>
        <option value="MULTICSV">MULTICSV</option>
        <option value="TSV">TSV</option>
      </param>
      <param argument="--pickle" type="boolean" truevalue="--pickle" falsevalue="" label="Serialize statistics and data results in pandas DataFrames" help="(--pickle)"/>
      <param argument="--nohtml" type="boolean" truevalue="--nohtml" falsevalue="" label="Suppress saving the HTML output file" help="(--nohtml)"/>
    </section>
    <!-- Coarse tuning: each conditional only adds its flag when a non-default choice is made -->
    <section name="advanced_option" title="Coarse tuning of algorithm parameters">
      <conditional name="control_select">
        <param name="controls_type" type="select" label="Number of first samples will be treated as negative controls (default is 0)" help="(--controls)">
          <option value="default">No control</option>
          <option value="add_neg">Add negative controls </option>
        </param>
        <when value="default"/>
        <when value="add_neg">
          <param name="controls" type="integer" min="0" value="0" label="Number of samples"/>
        </when>
      </conditional>
      <!-- DEFAULT is a wrapper-side sentinel: the command omits the scoring flag for it -->
      <param name="scoring" type="select" label="Type of scoring to be applied" help="(--scoring)">
        <option value="DEFAULT" selected="true">Default scoring</option>
        <option value="SHEL">SHEL</option>
        <option value="LENGTH">LENGTH</option>
        <option value="LOGLENGTH">LOGLENGTH</option>
        <option value="NORMA">NORMA</option>
        <option value="LMAT">LMAT</option>
        <option value="CLARK_C">CLARK_C</option>
        <option value="CLARK_G">CLARK_G</option>
        <option value="KRAKEN">KRAKEN</option>
        <option value="GENERIC">GENERIC</option>
      </param>
      <conditional name="minscore_select">
        <param name="minscore" type="select" label="minimum score/confidence of the classification of a read to pass the quality filter; all pass by default" help="(--minscore)">
          <option value="default" selected="true">Default all pass</option>
          <option value="specify_minscore">Specify value</option>
        </param>
        <when value="default"/>
        <when value="specify_minscore">
          <param name="minscore_value" type="integer" min="0" value="0" label="minimum score/confidence value"/>
        </when>
      </conditional>
      <conditional name="mintaxa_type">
        <param name="mintaxa_select" type="select" label="Minimum taxa to avoid collapsing one level into the parent (if not specified a value will be automatically assigned)" help="(--mintaxa)">
          <option value="default" selected="true">Automatically assigned</option>
          <option value="specify_mintaxa">Choose value</option>
        </param>
        <when value="default"/>
        <when value="specify_mintaxa">
          <param name="mintaxa" type="integer" min="0" value="0" label="Minimum taxa number"/>
        </when>
      </conditional>
      <conditional name="exclude_taxa_type">
        <param name="exclude_taxa_select" type="select" label="NCBI taxid code to exclude a taxon and all underneath (default, no exclude)" help="(--exclude)">
          <option value="no_exclude">No exclusion</option>
          <option value="yes_exclude">Specify excluded taxa</option>
        </param>
        <when value="yes_exclude">
          <!-- An empty value would reach the command line as an empty argument, hence the validator -->
          <param name="exclude_taxa_name" type="text" label="NCBI taxid code to exclude">
            <validator type="empty_field" message="Enter the NCBI taxid to exclude"/>
          </param>
        </when>
        <when value="no_exclude"/>
      </conditional>
      <conditional name="include_taxa_type">
        <param name="include_taxa_select" type="select" label="NCBI taxid code to include a taxon and all underneath" help="(--include)">
          <option value="no_include">Default no taxa include</option>
          <option value="yes_include">Specify included taxa</option>
        </param>
        <when value="yes_include">
          <!-- An empty value would reach the command line as an empty argument, hence the validator -->
          <param name="include_taxa_name" type="text" label="NCBI taxid code to include">
            <validator type="empty_field" message="Enter the NCBI taxid to include"/>
          </param>
        </when>
        <when value="no_include"/>
      </conditional>
      <param argument="--avoidcross" type="boolean" truevalue="--avoidcross" falsevalue="" label="Avoid cross analysis" help="(--avoidcross)"/>
    </section>
    <!-- Fine tuning: control-sample thresholds, summary behaviour and run-mode switches -->
    <section name="more_advanced_option" title=" Fine tuning of algorithm parameters">
      <conditional name="minscore_type">
        <param name="minscore_select" type="select" label="minimum score/confidence of the classification of a read in control samples to pass the quality filter; it defaults to minscore" help="(--ctrlminscore)">
          <option value="default_minscore">Default minscore</option>
          <option value="specify_minscore">Specify minscore</option>
        </param>
        <when value="default_minscore"/>
        <when value="specify_minscore">
          <param name="ctrlminscore" type="integer" value="0" label="minimum score/confidence"/>
        </when>
      </conditional>
      <conditional name="ctrlmintaxa_type">
        <param name="ctrlmintaxa_select" type="select" label="Minimum taxa to avoid collapsing one level into the parent" help="(--ctrlmintaxa)">
          <option value="default_ctrlmintaxa">Default value</option>
          <option value="specify_ctrlmintaxa">Specify minimum taxa number</option>
        </param>
        <when value="default_ctrlmintaxa"/>
        <when value="specify_ctrlmintaxa">
          <param name="ctrlmintaxa" type="integer" value="0" label="Minimum taxa number"/>
        </when>
      </conditional>
      <param name="summary" type="select" label="select to 'add' summary samples to other samples, or to 'only' show summary samples or to 'avoid' summaries at all" help="(--summary)">
        <option value="ADD" selected="true">ADD</option>
        <option value="ONLY">ONLY</option>
        <option value="AVOID">AVOID</option>
      </param>
      <param argument="--takeoutroot" type="boolean" truevalue="--takeoutroot" falsevalue="" label="remove counts directly assigned to the root level" help="(--takeoutroot)"/>
      <param argument="--nokollapse" type="boolean" truevalue="--nokollapse" falsevalue="" label="show the cellular organisms taxon" help="(--nokollapse)"/>
      <param argument="--strain" type="boolean" truevalue="--strain" falsevalue="" label="Strain level instead of species as the resolution limit for the robust contamination removal algorithm; use with caution, this is an experimental feature" help="(--strain)"/>
      <param argument="--sequential" type="boolean" truevalue="--sequential" falsevalue="" label="deactivate parallel processing" help="(--sequential)"/>
      <param argument="--debug" type="boolean" truevalue="--debug" falsevalue="" label="increase output verbosity and perform additional checks" help="(--debug)"/>
      <param argument="--version" type="boolean" truevalue="--version" falsevalue="" label=" show program's version number and exit" help="(--version)"/>
    </section>
  </inputs>
  <!-- Output datasets: which ones are created depends on the nohtml, extra and pickle parameters -->
  <outputs>
    <!-- HTML report, skipped when nohtml is set -->
    <data name="html_report" format="html" label="${tool.name} on ${on_string}: html report" from_work_dir="output.rcf.html">
      <filter>not output_option['nohtml']</filter>
    </data>
    <!-- The command redirects the rcf messages into this dataset; it has no filter -->
    <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file"/>
    <!-- Tables selected through the extra parameter -->
    <data name="data_csv" format="csv" label="${tool.name} on ${on_string}: data.csv" from_work_dir="output.rcf.data.csv">
      <filter>output_option['extra'] == 'CSV'</filter>
    </data>
    <data name="stat_csv" format="csv" label="${tool.name} on ${on_string}: stat csv" from_work_dir="output.rcf.stat.csv">
      <filter>output_option['extra'] in ('CSV', 'MULTICSV')</filter>
    </data>
    <data name="data_tsv" format="tabular" label="${tool.name} on ${on_string}: data tsv" from_work_dir="output.rcf.data.tsv">
      <filter>output_option['extra'] == 'TSV'</filter>
    </data>
    <data name="stat_tsv" format="tabular" label="${tool.name} on ${on_string}: stat tsv" from_work_dir="output.rcf.stat.tsv">
      <filter>output_option['extra'] == 'TSV'</filter>
    </data>
    <data name="xls_report" format="xlsx" label="${tool.name} on ${on_string}: xlsx report" from_work_dir="output.rcf.xlsx">
      <filter>output_option['extra'] in ('FULL', 'DYNOMICS')</filter>
    </data>
    <!-- Serialized results, only collected when pickle is enabled -->
    <data name="stat_bz" format="bz2" label="${tool.name} on ${on_string}: stat.pkl.bz2" from_work_dir="output.rcf.stat.pkl.bz2">
      <filter>output_option['pickle']</filter>
    </data>
    <data name="data_bz" format="bz2" label="${tool.name} on ${on_string}: data.pkl.bz2" from_work_dir="output.rcf.data.pkl.bz2">
      <filter>output_option['pickle']</filter>
    </data>
  </outputs>
  <tests>
    <!-- Conditional and parameter names below must match the inputs section (db_type / db_select, extra) -->
    <test> <!-- kraken input and CSV output TEST_1 -->
      <section name="database">
        <conditional name="db_type">
          <param name="db_select" value="cached"/>
          <param name="cached_db" value="test-db-2022"/>
        </conditional>
      </section>
      <section name="input_option">
        <param name="input_file" value="kraken_test/kraken.out"/>
        <conditional name="file_type">
          <param name="filetype" value="kraken"/>
        </conditional>
      </section>
      <section name="output_option">
        <param name="extra" value="CSV"/>
      </section>
      <section name="more_advanced_option">
        <param name="summary" value="AVOID"/>
      </section>
      <output name="data_csv" file="kraken_test/test1_csv.rcf.data.csv" lines_diff="2"/>
      <output name="stat_csv" file="kraken_test/test1_csv.rcf.stat.csv" lines_diff="2"/>
      <output name="html_report" file="kraken_test/test1_csv.rcf.html" lines_diff="2"/>
      <output name="logfile" file="kraken_test/test1_csv.log" lines_diff="7"/>
    </test>
    <!-- NOTE(review): TEST_2 now really selects the history database branch; the expected log may need to be regenerated -->
    <test> <!-- centrifuge input and full options with imported database TEST_2 -->
      <section name="database">
        <conditional name="db_type">
          <param name="db_select" value="history"/>
          <param name="history_db" value="test-db/delnodes.dmp,test-db/division.dmp,test-db/gc.prt,test-db/gencode.dmp,test-db/merged.dmp,test-db/names.dmp,test-db/nodes.dmp,test-db/readme.txt"/>
        </conditional>
      </section>
      <section name="input_option">
        <param name="input_file" value="centrifuge_test/centrifuge.out"/>
        <conditional name="file_type">
          <param name="filetype" value="centrifuge"/>
        </conditional>
      </section>
      <section name="output_option">
        <param name="extra" value="MULTICSV"/>
        <param name="pickle" value="false"/>
        <param name="nohtml" value="true"/>
      </section>
      <section name="advanced_option">
        <conditional name="control_select">
          <param name="controls_type" value="add_neg"/>
          <param name="controls" value="0"/>
        </conditional>
        <param name="scoring" value="NORMA"/>
        <conditional name="minscore_select">
          <param name="minscore" value="specify_minscore"/>
          <param name="minscore_value" value="0"/>
        </conditional>
        <param name="avoidcross" value="true"/>
      </section>
      <section name="more_advanced_option">
        <conditional name="minscore_type">
          <param name="minscore_select" value="specify_minscore"/>
          <param name="ctrlminscore" value="0"/>
        </conditional>
        <param name="summary" value="AVOID"/>
      </section>
      <output name="stat_csv" file="centrifuge_test/test2_multicsv.rcf.stat.csv" lines_diff="2"/>
      <output name="logfile" file="centrifuge_test/test2_multicsv.log" lines_diff="20"/>
    </test>
    <test> <!-- kraken input, cached DB, TSV output with several options added TEST_3 -->
      <section name="database">
        <conditional name="db_type">
          <param name="db_select" value="cached"/>
          <param name="cached_db" value="test-db-2022"/>
        </conditional>
      </section>
      <section name="input_option">
        <param name="input_file" value="kraken_test/kraken.out"/>
        <conditional name="file_type">
          <param name="filetype" value="kraken"/>
        </conditional>
      </section>
      <section name="output_option">
        <param name="extra" value="TSV"/>
        <param name="pickle" value="false"/>
        <param name="nohtml" value="true"/>
      </section>
      <section name="advanced_option">
        <param name="scoring" value="LOGLENGTH"/>
      </section>
      <section name="more_advanced_option">
        <param name="summary" value="ONLY"/>
        <param name="strain" value="true"/>
      </section>
      <output name="data_tsv" file="kraken_test/test3_rcf.data.tsv" lines_diff="2"/>
      <output name="stat_tsv" file="kraken_test/test3_rcf.stat.tsv" lines_diff="2"/>
      <output name="logfile" file="kraken_test/test3_tsv.log" lines_diff="20"/>
    </test>
  </tests>
  <help>
    <![CDATA[
    =-= rcf =-= v1.8.1 - Mar 2022 =-= by Jose Manuel Martí =-=
  usage: rcf [-h] [-V] [-n PATH] [--format GENERIC_FORMAT]
             (-f FILE | -g FILE | -l FILE | -r FILE | -k FILE) [-o FILE]
             [-e OUTPUT_TYPE] [-p] [--nohtml] [-a | -c CONTROLS_NUMBER]
             [-s SCORING] [-y NUMBER] [-m INT] [-x TAXID] [-i TAXID] [-z NUMBER]
             [-w INT] [-u SUMMARY_BEHAVIOR] [-t] [--nokollapse] [-d] [--strain]
             [--sequential]
  Robust comparative analysis and contamination removal for metagenomics
  options:
    -h, --help            show this help message and exit
    -V, --version         show program's version number and exit
  input:
    Define Recentrifuge input files and formats
    -n PATH, --nodespath PATH
                          path for the nodes information files (nodes.dmp and
                          names.dmp from NCBI)
    --format GENERIC_FORMAT
                          format of the output files from a generic classifier
                          included with the option -g; It is a string like
                          "TYP:csv,TID:1,LEN:3,SCO:6,UNC:0" where valid file
                          TYPes are csv/tsv/ssv, and the rest of fields indicate
                          the number of column used (starting in 1) for the
                          TaxIDs assigned, the LENgth of the read, the SCOre
                          given to the assignment, and the taxid code used for
                          UNClassified reads
    -f FILE, --file FILE  Centrifuge output files; if a single directory is
                          entered, every .out file inside will be taken as a
                          different sample; multiple -f is available to include
                          several Centrifuge samples
    -g FILE, --generic FILE
                          output file from a generic classifier; it requires the
                          flag --format (see such option for details); multiple
                          -g is available to include several generic samples
    -l FILE, --lmat FILE  LMAT output dir or file prefix; if just "." is
                          entered, every subdirectory under the current
                          directory will be taken as a sample and scanned
                          looking for LMAT output files; multiple -l is
                          available to include several samples
    -r FILE, --clark FILE
                          CLARK full-mode output files; if a single directory is
                          entered, every .csv file inside will be taken as a
                          different sample; multiple -r is available to include
                          several CLARK, CLARK-l, and CLARK-S full-mode samples
    -k FILE, --kraken FILE
                          Kraken output files; if a single directory is entered,
                          every .krk file inside will be taken as a different
                          sample; multiple -k is available to include several
                          Kraken (version 1 or 2) samples
  output:
    Related to the Recentrifuge output files
    -o FILE, --outprefix FILE
                          output prefix; if not given, it will be inferred from
                          input files; an HTML filename is still accepted for
                          backwards compatibility with legacy --outhtml option
    -e OUTPUT_TYPE, --extra OUTPUT_TYPE
                          type of extra output to be generated, and can be one
                          of ['FULL', 'CSV', 'MULTICSV', 'TSV', 'DYNOMICS']
    -p, --pickle          pickle (serialize) statistics and data results in
                          pandas DataFrames (format affected by selection of
                          --extra)
    --nohtml              suppress saving the HTML output file
  tuning:
    Coarse tuning of algorithm parameters
    -a, --avoidcross      avoid cross analysis
    -c CONTROLS_NUMBER, --controls CONTROLS_NUMBER
                          this number of first samples will be treated as
                          negative controls; default is no controls
    -s SCORING, --scoring SCORING
                          type of scoring to be applied, and can be one of
                          ['SHEL', 'LENGTH', 'LOGLENGTH', 'NORMA', 'LMAT',
                          'CLARK_C', 'CLARK_G', 'KRAKEN', 'GENERIC']
    -y NUMBER, --minscore NUMBER
                          minimum score/confidence of the classification of a
                          read to pass the quality filter; all pass by default
    -m INT, --mintaxa INT
                          minimum taxa to avoid collapsing one level into the
                          parent (if not specified a value will be automatically
                          assigned)
    -x TAXID, --exclude TAXID
                          NCBI taxid code to exclude a taxon and all underneath
                          (multiple -x is available to exclude several taxid)
    -i TAXID, --include TAXID
                          NCBI taxid code to include a taxon and all underneath
                          (multiple -i is available to include several taxid);
                          by default, all the taxa are considered for inclusion
  fine tuning:
    Fine tuning of algorithm parameters
    -z NUMBER, --ctrlminscore NUMBER
                          minimum score/confidence of the classification of a
                          read in control samples to pass the quality filter; it
                          defaults to "minscore"
    -w INT, --ctrlmintaxa INT
                          minimum taxa to avoid collapsing one level into the
                          parent (if not specified a value will be automatically
                          assigned)
    -u SUMMARY_BEHAVIOR, --summary SUMMARY_BEHAVIOR
                          choice for summary behaviour, and can be one of
                          ['ADD', 'ONLY', 'AVOID']
    -t, --takeoutroot     remove counts directly assigned to the "root" level
    --nokollapse          show the "cellular organisms" taxon
  advanced:
    Advanced modes of running
    -d, --debug           increase output verbosity and perform additional
                          checks
    --strain              set strain level instead of species as the resolution
                          limit for the robust contamination removal algorithm;
                          use with caution, this is an experimental feature
    --sequential          deactivate parallel processing
  rcf - Release 1.8.1 - Mar 2022
      Copyright (C) 2017–2022, Jose Manuel Martí Martínez
      This program is free software: you can redistribute it and/or modify
      it under the terms of the GNU Affero General Public License as
      published by the Free Software Foundation, either version 3 of the
      License, or (at your option) any later version.
      This program is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU Affero General Public License for more details.
      You should have received a copy of the GNU Affero General Public License
      along with this program.  If not, see <https://www.gnu.org/licenses/>.

    ]]>
  </help>
  <expand macro="citations"/>
</tool>
author	pimarin
date	Wed, 06 Apr 2022 13:52:48 +0000
parents	e5474449c35d
children	2890083b1a84