comparison seqkit_grep.xml @ 0:c5d75079615d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/seqkit commit 76c1a289f15cc9a9a7d9a49dc132af62cc1d5af2
author iuc
date Fri, 26 Sep 2025 16:47:08 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c5d75079615d
1 <tool id="seqkit_grep" name="SeqKit grep" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>grep-like tools for FASTA/Q files</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="bio_tools"/>
7 <expand macro="requirements"/>
<command detect_errors="exit_code"><![CDATA[
    #import re

    ## Symlink the dataset under a name derived from its element identifier:
    ## every character that is not whitespace, a word character or a hyphen
    ## is replaced by an underscore before the name is used on the command line.
    #set input_identifier = re.sub('[^\s\w\-]', '_', str($input.element_identifier))
    ln -s '${input}' '${input_identifier}' &&

    seqkit grep
        --threads "\${GALAXY_SLOTS:-4}"
        #if $conditional_pattern.mode == 'expression'
            ## The inner double quotes are handed to seqkit literally. Its pattern
            ## option is parsed as comma-separated values, so they keep a pattern
            ## that contains a comma in one piece.
            --pattern '"$conditional_pattern.pattern"'
            $conditional_pattern.use_regexp
        #else
            --pattern-file '$conditional_pattern.pattern_file'
        #end if
        ## Boolean params expand to their flag when checked and to nothing otherwise.
        $search_options.allow_duplicated_patterns
        $search_options.by_name
        $search_options.by_seq
        $search_options.circular
        $search_options.count
        $search_options.degenerate
        $search_options.delete_matched
        $search_options.ignore_case
        $search_options.invert_match
        ## The mismatch limit is only passed for sequence searches without
        ## degenerate bases.
        #if $search_options.by_seq and not $search_options.degenerate
            --max-mismatch $search_options.max_mismatch
        #end if
        $search_options.only_positive_strand
        ## The region is a free-text value, not a boolean flag, so the option
        ## name has to be written out here. It is skipped when the field is empty;
        ## otherwise the bare value would be read as an extra input file.
        #if str($search_options.region).strip()
            --region '$search_options.region'
        #end if
        '${input_identifier}'
        > '$output'
]]></command>
<inputs>
    <!-- Sequence file that is searched. -->
    <param name="input" type="data" format="fasta,fastq,fasta.gz,fastq.gz" label="Input FASTA/FASTQ file"/>
    <!-- The pattern is either typed in directly or read from a dataset. -->
    <conditional name="conditional_pattern">
        <param name="mode" type="select" label="Pattern mode" help="Specify pattern directly or upload a file with multiple patterns">
            <option value="expression">Pattern/motif sequence</option>
            <option value="file">FASTA file with the pattern/motif of interest</option>
        </param>
        <when value="expression">
            <param argument="--pattern" type="text" label="Search pattern" help="Pattern to search for. Do not add quotes: the pattern is quoted automatically, and any character other than letters, digits and ^ $ ( ) | ? * + { } \ [ ] . , : _ - is removed.">
                <!-- Whitelist sanitizer: every character not listed here is silently
                     dropped (invalid_char is empty). Underscore and hyphen are allowed
                     so that identifiers such as seq_1 or chr-2 are not altered. -->
                <sanitizer invalid_char="">
                    <valid initial="string.letters,string.digits">
                        <add value="^"/>
                        <add value="$"/>
                        <add value="("/>
                        <add value=")"/>
                        <add value="|"/>
                        <add value="?"/>
                        <add value="*"/>
                        <add value="+"/>
                        <add value="{"/>
                        <add value="}"/>
                        <add value="\"/>
                        <add value="["/>
                        <add value="]"/>
                        <add value="."/>
                        <add value=","/>
                        <add value=":"/>
                        <add value="_"/>
                        <add value="-"/>
                    </valid>
                </sanitizer>
                <!-- Rejects empty patterns and patterns whose last character is a backslash. -->
                <validator type="regex" message="Pattern must not end with backslash.">.*[^\\]$</validator>
            </param>
            <param argument="--use-regexp" type="boolean" truevalue="--use-regexp" falsevalue="" checked="false" label="Interpret pattern as regular expression" help="Enable regular expression matching"/>
        </when>
        <when value="file">
            <param argument="--pattern-file" type="data" format="fasta" label="Pattern/motif file"/>
        </when>
    </conditional>
    <!-- Each boolean below expands to its command line flag when checked. -->
    <section name="search_options" title="Search options">
        <param argument="--by-name" type="boolean" truevalue="--by-name" falsevalue="" checked="false" label="Search by sequence name" help="match by full name instead of just ID"/>
        <param argument="--by-seq" type="boolean" truevalue="--by-seq" falsevalue="" checked="false" label="Search by sequence content" help="search subseq on seq. Both positive and negative strand are searched by default, you might use only-positive-strand. Mismatch allowed using max-mismatch"/>
        <param argument="--ignore-case" type="boolean" truevalue="--ignore-case" falsevalue="" checked="false" label="Ignore case" help="ignore case"/>
        <param argument="--only-positive-strand" type="boolean" truevalue="--only-positive-strand" falsevalue="" checked="false" label="Only search positive strand" help="Only search on positive strand (only applies when searching by sequence)"/>
        <param argument="--max-mismatch" type="integer" min="0" value="0" label="Maximum mismatches" help="Maximum number of mismatches allowed (only for sequence search, 0 = exact match)"/>
        <param argument="--invert-match" type="boolean" truevalue="--invert-match" falsevalue="" checked="false" label="Invert match" help="invert the sense of matching, to select non-matching records"/>
        <param argument="--degenerate" type="boolean" truevalue="--degenerate" falsevalue="" checked="false" label="Pattern contains degenerate bases" help="Pattern contains degenerate bases"/>
        <param argument="--circular" type="boolean" truevalue="--circular" falsevalue="" checked="false" label="Circular genome" help="Treat sequences as circular for matching"/>
        <param argument="--count" type="boolean" truevalue="--count" falsevalue="" checked="false" label="Count" help="just print a count of matching records. with the -v/--invert-match flag, count non-matching records"/>
        <param argument="--delete-matched" type="boolean" truevalue="--delete-matched" falsevalue="" checked="false" label="Delete matched patterns" help="delete a pattern right after being matched, this keeps the firstly matched data and speedups when using regular expressions"/>
        <param argument="--allow-duplicated-patterns" type="boolean" truevalue="--allow-duplicated-patterns" falsevalue="" checked="false" label="Allow duplicated patterns" help="output records multiple times when duplicated patterns are given"/>
        <param argument="--region" type="text" value="" label="Sequence region" help="Specify region for searching (e.g., 1:30 for first 30 bases, -12:-1 for last 12).">
            <!-- Empty means no region. Otherwise both coordinates are required, so
                 inputs like ":" or "1:" are rejected in the form instead of failing
                 later in the job. -->
            <validator type="regex" message="Region must be empty or in the format 'start:end', e.g. 1:30 or -12:-1">^$|^-?[0-9]+:-?[0-9]+$</validator>
        </param>
    </section>
</inputs>
<outputs>
    <!-- The result normally inherits the datatype of the input file. When the
         Count option is checked only a number is printed, so the dataset is
         stored as plain text instead of being labelled as FASTA/FASTQ. -->
    <data name="output" format_source="input" label="${tool.name} on ${on_string}">
        <change_format>
            <when input="search_options.count" value="--count" format="txt"/>
        </change_format>
    </data>
</outputs>
<tests>
    <!-- 1: literal pattern searched in the sequences, exact match (0 mismatches) -->
    <test expect_num_outputs="1">
        <param name="input" ftype="fasta.gz" value="input1.fasta.gz"/>
        <conditional name="conditional_pattern">
            <param name="mode" value="expression"/>
            <param name="pattern" value="ATGC"/>
        </conditional>
        <section name="search_options">
            <param name="by_seq" value="true"/>
            <param name="max_mismatch" value="0"/>
        </section>
        <output name="output" ftype="fasta.gz" file="grep_output1.fasta.gz" decompress="true"/>
    </test>
    <!-- 2: patterns taken from a file, inverted match -->
    <test expect_num_outputs="1">
        <param name="input" ftype="fasta.gz" value="input1.fasta.gz"/>
        <conditional name="conditional_pattern">
            <param name="mode" value="file"/>
            <param name="pattern_file" value="grep_pattern.fasta"/>
        </conditional>
        <section name="search_options">
            <param name="invert_match" value="true"/>
        </section>
        <output name="output" ftype="fasta.gz" file="grep_output2.fasta.gz" decompress="true"/>
    </test>
    <!-- 3: FASTQ input, regular expression matched against the full name -->
    <test expect_num_outputs="1">
        <param name="input" ftype="fastq.gz" value="input1.fastq.gz"/>
        <conditional name="conditional_pattern">
            <param name="mode" value="expression"/>
            <param name="pattern" value="^5"/>
            <param name="use_regexp" value="true"/>
        </conditional>
        <section name="search_options">
            <param name="by_name" value="true"/>
        </section>
        <output name="output" ftype="fastq.gz" file="grep_output3.fastq.gz" decompress="true"/>
    </test>
    <!-- 4: sequence search with degenerate bases in the pattern -->
    <test expect_num_outputs="1">
        <param name="input" ftype="fasta.gz" value="input1.fasta.gz"/>
        <conditional name="conditional_pattern">
            <param name="mode" value="expression"/>
            <param name="pattern" value="NNNNATGC"/>
        </conditional>
        <section name="search_options">
            <param name="by_seq" value="true"/>
            <param name="degenerate" value="true"/>
        </section>
        <output name="output" ftype="fasta.gz" file="grep_output4.fasta.gz" decompress="true"/>
    </test>
</tests>
<help><![CDATA[
.. class:: infomark

**What it does**

Search sequences by ID/name/sequence/sequence motifs, mismatch allowed.

------

.. class:: infomark

**Attention**

0. By default, sequence IDs are matched against the patterns; use ``-n/--by-name``
   to match the full name instead of just the ID.
1. Unlike POSIX/GNU grep, the pattern is compared to the whole target
   (ID/full header) by default. Please switch ``-r/--use-regexp`` on
   for partial matching.
2. When searching by sequence, matching is partial, and both positive
   and negative strands are searched.
   Please switch on ``-P/--only-positive-strand`` if you would like to
   search only on the positive strand.
   Mismatches are allowed using the flag ``-m/--max-mismatch``; you can increase
   the value of ``-j/--threads`` to accelerate processing.
3. Degenerate bases/residues like "RYMM.." are also supported by the flag ``-d``.
   But do not use degenerate bases/residues in a regular expression; you need
   to convert them to a regular expression, e.g., change "N" or "X" to ".".
4. When providing search patterns (motifs) via the flag ``-p``,
   please use double quotation marks for patterns containing a comma,
   e.g., ``-p '"A{2,}"'`` or ``-p "\"A{2,}\""``, because the command line argument
   parser accepts comma-separated values (CSV) for multiple values (motifs).
   Patterns in a file do not follow this rule.
5. The order of sequences in the result is consistent with that in the original
   file, not the order of the query patterns.
   But for a FASTA file, you can use::

       seqkit faidx seqs.fasta --infile-list IDs.txt

6. For multiple patterns, you can either set ``-p`` multiple times, i.e.,
   ``-p pattern1 -p pattern2``, or give a file of patterns via ``-f/--pattern-file``.
]]></help>
184 <expand macro="citations"/>
185 </tool>