comparison kraken.xml @ 2:898ded2d4fff draft

planemo upload for repository https://github.com/galaxyproject/tools-devteam/blob/master/tool_collections/kraken/kraken/ commit cb6ebb843c71dcfc73aa05cc616f8e3229170108-dirty
author devteam
date Wed, 15 Jul 2015 14:59:31 -0400
parents 656215d2a793
children 2ad66362ed0f
comparison
equal deleted inserted replaced
1:656215d2a793 2:898ded2d4fff
1 <?xml version="1.0"?> 1 <?xml version="1.0"?>
2 <tool id="kraken" name="Kraken" version="1.0.0"> 2 <tool id="kraken" name="Kraken" version="1.1.0">
3 <description> 3 <description>
4 assign taxonomic labels to short DNA reads 4 assign taxonomic labels to sequencing reads
5 </description> 5 </description>
6 <macros> 6 <macros>
7 <import>macros.xml</import> 7 <import>macros.xml</import>
8 </macros> 8 </macros>
9 <command> 9 <command>
10 <![CDATA[ 10 <![CDATA[
11 @SET_DATABASE_PATH@ && 11 @SET_DATABASE_PATH@ &&
12 kraken --threads \${GALAXY_SLOTS:-1} @INPUT_DATABASE@ 12 kraken --threads \${GALAXY_SLOTS:-1} @INPUT_DATABASE@
13
14 #if $input_sequences.is_of_type( 'fastq' ):
15 --fastq-input
16 #else:
17 --fasta-input
18 #end if
19
20 ${only_classified_output}
21
22 #if str( $quick_operation.quick ) == "yes":
23 --quick
24 --min-hits ${quick_operation.min_hits}
25
26 #end if
27
13 "$input_sequences" 28 "$input_sequences"
29
14 #if $split_reads: 30 #if $split_reads:
15 --classified-out "${classified_out}" --unclassified-out "${unclassified_out}" 31 --classified-out "${classified_out}" --unclassified-out "${unclassified_out}"
16 #end if 32 #end if
17 --output "${output}" && 33 --output "${output}"
18 kraken-translate --db ${kraken_database.fields.name} "${output}" > "${translated}" 34 ##kraken-translate --db ${kraken_database.fields.name} "${output}" > "${translated}"
19 ]]> 35 ]]>
20 </command> 36 </command>
21 <inputs> 37 <inputs>
22 <param format="fasta,fastq,fastqsanger" label="Input sequences" name="input_sequences" type="data" /> 38 <param format="fasta,fastq" label="Input sequences" name="input_sequences" type="data" help="FASTA or FASTQ datasets"/>
23 <param label="Output classified and unclassified reads" name="split_reads" type="boolean" /> 39 <param label="Output classified and unclassified reads?" name="split_reads" type="boolean" help="Sets --unclassified-out and --classified-out"/>
40
41 <conditional name="quick_operation">
42 <param name="quick" type="select" label="Enable quick operation?" help="--quick; Rather than searching all k-mers in a sequence, stop classification after a specified number of database hits">
43 <option value="yes">Yes</option>
44 <option selected="True" value="no">No</option>
45 </param>
46 <when value="yes">
47 <param name="min_hits" type="integer" value="1" label="Number of hits required for classification" help="--min-hits; min-hits will allow you to require multiple hits before declaring a sequence classified, which can be especially useful with custom databases when testing to see if sequences either do or do not belong to a particular genome; default=1"/>
48 </when>
49 <when value="no">
50 <!-- Do absolutely nothing -->
51 </when>
52 </conditional>
53
54 <param name="only_classified_output" type="boolean" checked="False" truevalue="--only-classified-output" falsevalue="" label="Print no Kraken output for unclassified sequences" help="--only-classified-output"/>
55
24 <expand macro="input_database" /> 56 <expand macro="input_database" />
25 </inputs> 57 </inputs>
26 <outputs> 58 <outputs>
27 <data format="tabular" label="${tool.name} on ${on_string}: Classified reads" name="classified_out"> 59 <data format="tabular" label="${tool.name} on ${on_string}: Classified reads" name="classified_out">
28 <filter>(split_reads)</filter> 60 <filter>(split_reads)</filter>
29 </data> 61 </data>
30 <data format="tabular" label="${tool.name} on ${on_string}: Unclassified reads" name="unclassified_out"> 62 <data format="tabular" label="${tool.name} on ${on_string}: Unclassified reads" name="unclassified_out">
31 <filter>(split_reads)</filter> 63 <filter>(split_reads)</filter>
32 </data> 64 </data>
33 <data format="tabular" label="${tool.name} on ${on_string}: Histogram" name="histogram">
34 <filter>(draw_histogram)</filter>
35 </data>
36 <data format="tabular" label="${tool.name} on ${on_string}: Classification" name="output" /> 65 <data format="tabular" label="${tool.name} on ${on_string}: Classification" name="output" />
37 <data format="tabular" label="${tool.name} on ${on_string}: Translated classification" name="translated" /> 66 <!--<data format="tabular" label="${tool.name} on ${on_string}: Translated classification" name="translated" />-->
38 </outputs> 67 </outputs>
39 <help> 68 <help>
40 <![CDATA[ 69 <![CDATA[
41 **What it does** 70 **What it does**
42 71
43 Kraken is a taxonomic sequence classifier that assigns taxonomic labels to short DNA reads. It does this by examining the k-mers within a read and querying a database with those k-mers. This database contains a mapping of every k-mer in Kraken's genomic library to the lowest common ancestor (LCA) in a taxonomic tree of all genomes that contain that k-mer. The set of LCA taxa that correspond to the k-mers in a read are then analyzed to create a single taxonomic label for the read; this label can be any of the nodes in the taxonomic tree. Kraken is designed to be rapid, sensitive, and highly precise. Our tests on various real and simulated data have shown Kraken to have sensitivity slightly lower than Megablast with precision being slightly higher. On a set of simulated 100 bp reads, Kraken processed over 1.3 million reads per minute on a single core in normal operation, and over 4.1 million reads per minute in quick operation. 72 Kraken is a taxonomic sequence classifier that assigns taxonomic labels to short DNA reads. It does this by examining the k-mers within a read and querying a database with those k-mers. This database contains a mapping of every k-mer in Kraken's genomic library to the lowest common ancestor (LCA) in a taxonomic tree of all genomes that contain that k-mer. The set of LCA taxa that correspond to the k-mers in a read are then analyzed to create a single taxonomic label for the read; this label can be any of the nodes in the taxonomic tree. Kraken is designed to be rapid, sensitive, and highly precise.
44 73
45 **Usage** 74 -----
46 75
47 Kraken classifies a set of sequences (reads) with the commands below: 76 **Kraken options**
48 77
49 kraken --db $DBNAME sequences.fa > sequences.kraken 78 The Galaxy version of Kraken implements the following options::
50 79
51 or 80
81 --fasta-input Input is FASTA format
82 --fastq-input Input is FASTQ format
83 --quick Quick operation (use first hit or hits)
84 --min-hits NUM In quick op., number of hits req'd for classification
85 NOTE: this is ignored if --quick is not specified
86 --unclassified-out Print unclassified sequences to filename
87 --classified-out Print classified sequences to filename
52 88
53 kraken --db $DBNAME sequences.fq > sequences.kraken 89 --only-classified-output Print no Kraken output for unclassified sequences
90
91 ------
54 92
93 **Output Format**
55 94
56 -DBNAME is the name of the Kraken Database to be used. 95 Each sequence classified by Kraken results in a single line of output. Output lines contain five tab-delimited fields; from left to right, they are::
57 96
58 -sequences.fa or sequences.fq is the FASTA or FASTQ input file containing the desired sequences for classification. 97 1. "C"/"U": one letter code indicating that the sequence was either classified or unclassified.
59 98 2. The sequence ID, obtained from the FASTA/FASTQ header.
60 -sequences.kraken is the generated output. 99 3. The taxonomy ID Kraken used to label the sequence; this is 0 if the sequence is unclassified.
61 100 4. The length of the sequence in bp.
62 101 5. A space-delimited list indicating the LCA mapping of each k-mer in the sequence. For example, "562:13 561:4 A:31 0:1 562:3" would indicate that:
63
64 **Options**
65
66 The kraken program allows several different sequencing modifiers (parameters):
67
68 **Multithreading:** Use the --threads NUM switch to use multiple threads.
69
70 **Sequence filtering:** Classified or unclassified sequences can be sent to a file for later processing, using the --classified-out and --unclassified-out switches, respectively.
71
72
73
74 **Output Format**
75
76 Each sequence classified by Kraken results in a single line of output. Output lines contain five tab-delimited fields; from left to right, they are:
77
78 1. "C"/"U": one letter code indicating that the sequence was either classified or unclassified.
79 2. The sequence ID, obtained from the FASTA/FASTQ header.
80 3. The taxonomy ID Kraken used to label the sequence; this is 0 if the sequence is unclassified.
81 4. The length of the sequence in bp.
82
83 5. A space-delimited list indicating the LCA mapping of each k-mer in the sequence. For example, "562:13 561:4 A:31 0:1 562:3" would indicate that:
84 a) the first 13 k-mers mapped to taxonomy ID #562 102 a) the first 13 k-mers mapped to taxonomy ID #562
85 b) the next 4 k-mers mapped to taxonomy ID #561 103 b) the next 4 k-mers mapped to taxonomy ID #561
86 c) the next 31 k-mers contained an ambiguous nucleotide 104 c) the next 31 k-mers contained an ambiguous nucleotide
87 d) the next k-mer was not in the database 105 d) the next k-mer was not in the database
88 e) the last 3 k-mers mapped to taxonomy ID #562 106 e) the last 3 k-mers mapped to taxonomy ID #562