changeset 15:e41d3ce2ab9f draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/samtools/samtools_view commit e3de8bc1123bf4ce56818f2b7ad4b53080cb3bd8
author iuc
date Fri, 30 Aug 2024 10:24:13 +0000
parents e63aab0f18c6
children 17c2bd677389
files macros.xml samtools_view.xml test-data/test_1.bam test-data/test_11.sam test-data/test_12.sam test-data/test_15.cram test-data/test_17.bam test-data/test_19.bam test-data/test_2.bam test-data/test_20.bam test-data/test_21.sam test-data/test_22.sam test-data/test_23.sam test-data/test_24.bam test-data/test_25.sam test-data/test_26.bam test-data/test_27.bam test-data/test_28.bam test-data/test_29.bam test-data/test_3.bam test-data/test_30.bam test-data/test_31.bam test-data/test_32.bam test-data/test_33.bam test-data/test_4.bam test-data/test_5.bam test-data/test_7.bam test-data/test_8.bam
diffstat 28 files changed, 254 insertions(+), 81 deletions(-) [+]
line wrap: on
line diff
--- a/macros.xml	Mon Aug 15 09:13:14 2022 +0000
+++ b/macros.xml	Fri Aug 30 10:24:13 2024 +0000
@@ -5,7 +5,14 @@
             <yield/>
         </requirements>
     </xml>
+    <!-- NOTE: for some tools the TOOL_VERSION token controls only the version
+        of the requirement, not the tool's own version, because the tool's
+        version is already ahead of the requirement version. For those tools
+        please bump only the minor version so that the requirement version can
+        catch up eventually. To find these tools, run:
+        `grep "<tool" . -r | grep -v VERSION_SUFFIX | cut -d":" -f 1` -->
     <token name="@TOOL_VERSION@">1.15.1</token>
+    <token name="@VERSION_SUFFIX@">2</token>
     <token name="@PROFILE@">20.05</token>
     <token name="@FLAGS@"><![CDATA[
         #set $flags = 0
@@ -212,37 +219,7 @@
 
     <xml name="citations">
         <citations>
-            <citation type="bibtex">
-                @misc{SAM_def,
-                title={Definition of SAM/BAM format},
-                url = {https://samtools.github.io/hts-specs/},}
-            </citation>
-            <citation type="doi">10.1093/bioinformatics/btp352</citation>
-            <citation type="doi">10.1093/bioinformatics/btr076</citation>
-            <citation type="doi">10.1093/bioinformatics/btr509</citation>
-            <citation type="bibtex">
-                @misc{Danecek_et_al,
-                Author={Danecek, P., Schiffels, S., Durbin, R.},
-                title={Multiallelic calling model in bcftools (-m)},
-                url = {http://samtools.github.io/bcftools/call-m.pdf},}
-            </citation>
-            <citation type="bibtex">
-                @misc{Durbin_VCQC,
-                Author={Durbin, R.},
-                title={Segregation based metric for variant call QC},
-                url = {http://samtools.github.io/bcftools/rd-SegBias.pdf},}
-            </citation>
-            <citation type="bibtex">
-                @misc{Li_SamMath,
-                Author={Li, H.},
-                title={Mathematical Notes on SAMtools Algorithms},
-                url = {http://www.broadinstitute.org/gatk/media/docs/Samtools.pdf},}
-            </citation>
-            <citation type="bibtex">
-                @misc{SamTools_github,
-                title={SAMTools GitHub page},
-                url = {https://github.com/samtools/samtools},}
-            </citation>
+            <citation type="doi">10.1093/gigascience/giab008</citation>
         </citations>
     </xml>
     <xml name="version_command">
--- a/samtools_view.xml	Mon Aug 15 09:13:14 2022 +0000
+++ b/samtools_view.xml	Fri Aug 30 10:24:13 2024 +0000
@@ -1,4 +1,4 @@
-<tool id="samtools_view" name="Samtools view" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@">
+<tool id="samtools_view" name="Samtools view" version="@TOOL_VERSION@+galaxy3" profile="@PROFILE@">
     <description>- reformat, filter, or subsample SAM, BAM or CRAM</description>
     <macros>
         <import>macros.xml</import>
@@ -136,6 +136,9 @@
                 #if $mode.filter_config.qname_file:
                     #set std_filters = $std_filters + " --qname-file '%s'" % $mode.filter_config.qname_file
                 #end if
+                #if str($cond_expr.select_expr) == "yes":
+                    #set std_filters = $std_filters + " -e '%s'" % $cond_expr.expression
+                #end if
             #end if
 
             #if $with_subsampling:
@@ -154,11 +157,11 @@
                         ## not dealing with all of the reads in the indexed
                         ## file. We have to do an extra pass over the input to
                         ## count the reads to subsample.
-                        sample_fragment=`samtools view -c $std_filters infile $reg_filters | awk '{s=\$1} END {frac=s/${mode.subsample_config.subsampling_mode.target}; printf("%.8f\n", frac > 1 ? $seed+1/frac : ".0")}'` &&
+                        sample_fragment=`samtools view -c $std_filters infile $reg_filters | awk '{s=\$1} END {fac=s/${mode.subsample_config.subsampling_mode.target}; printf("%.8f\n", fac > 1 ? 1/fac : 1)}'` &&
                     #else:
                         ## We can get the count of reads to subsample using
                         ## an inexpensive call to idxstats.
-                        sample_fragment=`samtools idxstats infile | awk '{s+=\$4+\$3} END {frac=s/${mode.subsample_config.subsampling_mode.target}; printf("%.8f\n", frac > 1 ? $seed+1/frac : ".0")}'` &&
+                        sample_fragment=`samtools idxstats infile | awk '{s+=\$4+\$3} END {fac=s/${mode.subsample_config.subsampling_mode.target}; printf("%.8f\n", fac > 1 ? 1/fac : 1)}'` &&
                     #end if
                 #end if
             #end if
@@ -170,14 +173,14 @@
 
             ## filter options (except regions filter, which is the last parameter)
             $std_filters
-
             #if $with_subsampling:
+                --subsample-seed $seed
                 #if str($mode.subsample_config.subsampling_mode.select_subsample) == "target":
                     ##this is calculated at execution time before the main samtools command
-                    -s \${sample_fragment}
+                    --subsample \${sample_fragment}
                 #else:
-                    #set $fraction = $seed + 1 / float($mode.subsample_config.subsampling_mode.factor)
-                    -s $fraction
+                    #set $fraction = 1 / float($mode.subsample_config.subsampling_mode.factor)
+                    --subsample $fraction
                 #end if
             #end if
 
@@ -299,6 +302,24 @@
                             <param name="rgfile" type="data" format="tabular" argument="-R" label="Filter by read groups in file" help="Output alignments in read groups listed in FILE." />
                         </when>
                     </conditional>
+                    <conditional name="cond_expr">
+                        <param name="select_expr" type="select" label="Filter by expression">
+                            <option value="no" selected="True">No</option>
+                            <option value="yes">Filter using an expression (see manual)</option>
+                        </param>
+                        <when value="no"/>
+                        <when value="yes">
+                            <param name="expression" type="text" argument="-e" label="Filter by expression - for example sclen&gt;0 will filter all soft clipped reads" help="See Samtools manual for Filter expression syntax">
+                                <sanitizer invalid_char="">
+                                    <valid initial="string.printable">
+                                    <remove value=" "/>
+                                    <remove value="'"/>
+                                    <remove value='"'/>
+                                </valid>
+                                </sanitizer>
+                            </param>
+                        </when>
+                    </conditional>
                     <param name="quality" type="integer" argument="-q" optional="true" min="0" label="Filter by quality" help="Skip alignments with MAPQ smaller than INT." />
                     <param name="library" type="text" argument="-l" optional="true" label="Filter by library" help="Only output alignments in library STR" />
                     <param name="cigarcons" type="integer" argument="-m" optional="true" min="0" label="Filter by number of CIGAR bases consuming query sequence" help="Only output alignments with number of CIGAR bases consuming query sequence greater than or equal INT." />
@@ -398,13 +419,13 @@
         </data>
     </outputs>
     <tests>
-<!-- 1) sam to bam (copied from the sam_to_bam tool) -->
-        <test>
+        <!-- 1) sam to bam (copied from the sam_to_bam tool) -->
+        <test expect_num_outputs="1">
             <param name="input" ftype="sam" value="in_test_1.sam" />
             <output name="outputsam" ftype="bam" file="test_1.bam" lines_diff="4" />
         </test>
         <!-- 2) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" ftype="sam" dbkey="equCab2" value="in_test_1.sam" />
             <conditional name="addref_cond">
                 <param name="addref_select" value="cached" />
@@ -413,7 +434,7 @@
             <output name="outputsam" ftype="bam" file="test_2.bam" lines_diff="4" />
         </test>
         <!-- 3) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" ftype="sam" value="in_test_3.sam" />
             <conditional name="addref_cond">
                 <param name="addref_select" value="history" />
@@ -422,7 +443,7 @@
             <output name="outputsam" ftype="bam" file="test_3.bam" lines_diff="4" />
         </test>
         <!-- 4) cram to bam -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_4.cram" ftype="cram" />
             <conditional name="addref_cond">
                 <param name="addref_select" value="history" />
@@ -431,7 +452,7 @@
             <output name="outputsam" file="test_4.bam" ftype="bam" lines_diff="4" />
         </test>
         <!-- 5) within bam operations expected to result in sorting or not -->
-        <test >
+        <test expect_num_outputs="1">
             <!-- sorted bam should always result in unmodifed output -->
             <param name="input" ftype="bam" value="in_test_5.bam" />
             <assert_command>
@@ -440,7 +461,7 @@
             <output name="outputsam" ftype="bam" file="test_5.bam" lines_diff="2"/>
         </test>
         <!-- 6) -->
-        <test>
+        <test expect_num_outputs="1">
             <!-- sorted bam should always result in unmodifed output -->
             <param name="input" ftype="bam" value="in_test_5.bam" />
             <conditional name="mode">
@@ -456,7 +477,7 @@
             <output name="outputsam" ftype="bam" file="test_5.bam" lines_diff="2"/>
         </test>
         <!-- 7) -->
-        <test>
+        <test expect_num_outputs="1">
             <!-- qname_sorted.bam should get sorted during "conversion" to bam ... -->
             <param name="input" ftype="qname_sorted.bam" value="in_test_7.bam" />
             <assert_command>
@@ -465,7 +486,7 @@
             <output name="outputsam" ftype="bam" file="test_7.bam" lines_diff="4" />
         </test>
         <!-- 8) -->
-        <test>
+        <test expect_num_outputs="1">
             <!-- ... but should be emitted unmodifed when using input format -->
             <param name="input" ftype="qname_sorted.bam" value="in_test_7.bam" />
             <conditional name="mode">
@@ -481,7 +502,7 @@
             <output name="outputsam" ftype="qname_sorted.bam" file="test_8.bam" lines_diff="2"/>
         </test>
         <!-- 9) -->
-        <test>
+        <test expect_num_outputs="1">
             <!-- unsorted.bam should get sorted during "conversion" to bam ... -->
             <param name="input" ftype="unsorted.bam" value="in_test_7.bam" />
             <assert_command>
@@ -490,7 +511,7 @@
             <output name="outputsam" ftype="bam" file="test_7.bam" lines_diff="4" />
         </test>
         <!-- 10) -->
-        <test>
+        <test expect_num_outputs="1">
             <!-- ... ... but should be emitted unmodifed when using input format -->
             <param name="input" ftype="unsorted.bam" value="in_test_7.bam" />
             <conditional name="mode">
@@ -506,7 +527,7 @@
             <output name="outputsam" ftype="unsorted.bam" file="test_8.bam" lines_diff="2" />
         </test>
         <!-- 11) bam to sam + header options (adapted from bam_to_sam tool)-->
-        <test>
+        <test expect_num_outputs="1">
             <param ftype="bam" name="input" value="in_test_11.bam" />
             <conditional name="mode">
                 <conditional name="output_options">
@@ -519,7 +540,7 @@
             <output file="test_11.sam" ftype="sam" name="outputsam" lines_diff="2" />
         </test>
         <!-- 12) -->
-        <test>
+        <test expect_num_outputs="1">
             <param ftype="bam" name="input" value="in_test_11.bam" />
             <conditional name="mode">
                 <param name="outtype" value="header" />
@@ -532,7 +553,7 @@
             <output file="test_12.sam" ftype="sam" name="outputsam" lines_diff="2" />
         </test>
         <!-- 13) -->
-        <test>
+        <test expect_num_outputs="1">
             <param ftype="bam" name="input" value="in_test_11.bam" />
             <conditional name="mode">
                 <conditional name="output_options">
@@ -545,7 +566,7 @@
             <output file="test_13.sam" ftype="sam" name="outputsam" lines_diff="2" />
         </test>
         <!-- 14) count alignments -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_14.bam" ftype="bam" />
             <conditional name="mode">
                 <param name="outtype" value="all_reads" />
@@ -556,7 +577,7 @@
             <output name="outputcnt" file="test_14.tab" ftype="tabular" lines_diff="2" />
         </test>
         <!-- 15) region filters -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_15.sam" ftype="sam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -575,10 +596,10 @@
                 <param name="addref_select" value="history" />
                 <param name="ref" value="test.fa" />
             </conditional>
-            <output name="outputsam" file="test_15.cram" ftype="cram" compare="sim_size" delta="250" />
+            <output name="outputsam" file="test_15.cram" ftype="cram" compare="sim_size" delta="500" />
         </test>
         <!-- 16) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_14.bam" ftype="bam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -600,7 +621,7 @@
             <output name="outputsam" file="test_15.cram" ftype="cram" compare="sim_size" delta="250" />
         </test>
         <!-- 17) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_17.cram" dbkey="equCab2" ftype="cram" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -622,7 +643,7 @@
             <output name="outputsam" file="test_17.bam" ftype="bam" lines_diff="4" />
         </test>
         <!-- 18) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_14.bam" ftype="bam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -645,7 +666,7 @@
             <output name="outputsam" file="test_15.cram" ftype="cram" compare="sim_size" delta="250" />
         </test>
         <!-- 19) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test_15.cram" ftype="cram" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -668,7 +689,7 @@
             <output name="outputsam" file="test_19.bam" ftype="bam" lines_diff="4"/>
         </test>
         <!-- 20) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test_15.cram" ftype="cram" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -691,7 +712,7 @@
             <output name="outputsam" file="test_20.bam" ftype="bam" lines_diff="4" />
         </test>
         <!-- 21) sampling options target < total reads -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_15.sam" ftype="sam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -710,7 +731,7 @@
             <output name="outputsam" file="test_21.sam" ftype="sam" compare="diff" lines_diff="10" />
         </test>
         <!-- 22) target > total reads -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_15.sam" ftype="sam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -729,7 +750,7 @@
             <output name="outputsam" file="test_22.sam" ftype="sam" lines_diff="2"/>
         </test>
         <!-- 23) -->
-        <test>
+        <test expect_num_outputs="1">
             <!-- subsampling SAM input without reads -->
             <param name="input" value="in_test_23.sam" ftype="sam" />
             <conditional name="mode">
@@ -749,7 +770,7 @@
             <output name="outputsam" file="test_23.sam" ftype="sam" lines_diff="2"/>
         </test>
         <!-- 24) -->
-        <test>
+        <test expect_num_outputs="1">
             <!-- subsampling BAM input without reads -->
             <param name="input" value="in_test_24.bam" ftype="bam" />
             <conditional name="mode">
@@ -769,7 +790,7 @@
             <output name="outputsam" file="test_24.bam" ftype="bam" lines_diff="2" />
         </test>
         <!-- 25) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_15.sam" ftype="sam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -789,7 +810,7 @@
             <output name="outputsam" file="test_25.sam" ftype="sam" compare="diff" lines_diff="2" />
         </test>
         <!-- 26) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_14.bam" ftype="bam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -809,7 +830,7 @@
             <output name="outputsam" file="test_26.bam" ftype="bam" lines_diff="2" />
         </test>
         <!-- 27) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_14.bam" ftype="bam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -829,7 +850,7 @@
             <output name="outputsam" file="test_27.bam" ftype="bam" lines_diff="2"/>
         </test>
         <!-- 28) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_14.bam" ftype="bam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -849,7 +870,7 @@
             <output name="outputsam" file="test_28.bam" ftype="bam" lines_diff="2" />
         </test>
         <!-- 29) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_14.bam" ftype="bam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -870,7 +891,7 @@
             <output name="outputsam" file="test_29.bam" ftype="bam" lines_diff="2"/>
         </test>
         <!-- 30) testing tag filtering -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_30.bam" ftype="bam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -889,7 +910,7 @@
             <output name="outputsam" file="test_30.bam" ftype="bam" lines_diff="2" />
         </test>
         <!-- 31) testing readname filtering -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="in_test_30.bam" ftype="bam" />
             <conditional name="mode">
                 <param name="outtype" value="selected_reads" />
@@ -907,6 +928,50 @@
             </assert_command>
             <output name="outputsam" file="test_31.bam" ftype="bam" lines_diff="2" />
         </test>
+        <!-- 32) testing expression filters -->
+        <test expect_num_outputs="1">
+            <param name="input" value="in_test_30.bam" ftype="bam"/>
+            <conditional name="mode">
+                <param name="outtype" value="selected_reads" />
+                <section name="filter_config">
+                    <conditional name="cond_expr">
+                        <param name="select_expr" value="yes"/>
+                        <param name="expression" value="sclen>0"/>
+                    </conditional>
+                </section>
+                <conditional name="output_options">
+                    <conditional name="output_format">
+                        <param name="oformat" value="bam" />
+                    </conditional>
+                </conditional>
+            </conditional>
+            <assert_command>
+                <has_text text="-e 'sclen>0'"/>
+            </assert_command>
+            <output name="outputsam" file="test_32.bam" ftype="bam" lines_diff="2" />
+        </test>
+         <!-- 33) testing expression filters -->
+        <test expect_num_outputs="1">
+            <param name="input" value="in_test_30.bam" ftype="bam"/>
+            <conditional name="mode">
+                <param name="outtype" value="selected_reads" />
+                <section name="filter_config">
+                    <conditional name="cond_expr">
+                        <param name="select_expr" value="yes"/>
+                        <param name="expression" value='rname!="chr13"'/>
+                    </conditional>
+                </section>
+                <conditional name="output_options">
+                    <conditional name="output_format">
+                        <param name="oformat" value="bam" />
+                    </conditional>
+                </conditional>
+            </conditional>
+            <assert_command>
+                <has_text text="-e 'rname!="/>
+            </assert_command>
+            <output name="outputsam" file="test_33.bam" ftype="bam" lines_diff="2" />
+        </test>
     </tests>
     <help>
 **What it does**
@@ -990,12 +1055,143 @@
 
 This filters based on the MAPQ column of the SAM format which gives an estimate about the correct placement of the alignment. Note that aligners do not follow a consistent definition.
 
-## Filtering by Tag **
+**Filtering by Tag**
 
 This filter allows to select reads based on tool or user specific tags, e.g., XS:i:-18 the alignment score tag of bowtie.
 Thus to filter for a specific value of the tag you need the format STR1:STR2, e.g., XS:-18 to filter reads with an aligment score of -18.
 You can also just write STR1 without the value STR2 hence the filter selects all reads with the tag STR1, e.g., XS.
 
+**Filtering by Expression**
+
+
+Filter expressions are used for on-the-fly checking of incoming SAM, BAM or CRAM records, discarding records that do not match the specified expression.
+
+The language used is primarily C style, but with a few differences in the precedence rules for bit operators and the  inclusion  of  regular  expression
+matching.
+
+The operator precedence, from strongest binding to weakest, is
+
+::
+
+        Grouping        (, )             E.g. &quot;(1+2)&#42;3&quot;
+        Values:         literals, vars   Numbers, strings and variables
+        Unary ops:      +, -, !, ~       E.g. -10 +10, !10 (not), ~5 (bit not)
+        Math ops:       *, /, %          Multiplication, division and (integer) modulo
+        Math ops:       +, -             Addition / subtraction
+        Bit-wise:       &amp;                Integer AND
+        Bit-wise        ^                Integer XOR
+        Bit-wise        |                Integer OR
+        Conditionals:   &gt;, &gt;=, &lt;, &lt;=
+        Equality:       ==, !=, =~, !~   =~ and !~ match regular expressions
+        Boolean:        &amp;&amp;, ||           Logical AND / OR
+
+
+Expressions are computed using floating point mathematics, so &quot;10 / 4&quot; evaluates to 2.5 rather than 2.  Numbers may be written as integers in decimal or
+&quot;0x&quot; plus hexadecimal, and as floating point with or without exponents.  However, operations that require integers first do an implicit type conversion, so
+&quot;7.9 % 5&quot; is 2 and &quot;7.9 &amp; 4.1&quot; is equivalent to &quot;7 &amp; 4&quot;, which is 4.  Strings are always specified using double quotes.  To get a double quote in a
+string, use backslash.  Similarly a double backslash is used to get a literal backslash.  For example ``ab\&quot;c\\d`` is the string ``ab&quot;c\d``.
+
+Comparison operators are evaluated as a match being 1 and a mismatch being 0, thus &quot;(2 &gt; 1) + (3 &lt; 5)&quot; evaluates as 2.  All comparisons involving  undefined (null) values are deemed to be false.
+
+The  variables are where the file format specifics are accessed from the expression.  The variables correspond to SAM fields, for example to find paired
+alignments with high mapping quality and a very large insert size, we may use the expression &quot;mapq &gt;= 30 &amp;&amp; (tlen &gt;= 100000 || tlen &lt;= -100000)&quot;.  Valid
+variable names and their data types are:
+
+::
+
+    endpos               int            Alignment end position (1-based)
+    flag                 int            Combined FLAG field
+    flag.paired          int            Single bit, 0 or 1
+    flag.proper_pair     int            Single bit, 0 or 2
+    flag.unmap           int            Single bit, 0 or 4
+    flag.munmap          int            Single bit, 0 or 8
+    flag.reverse         int            Single bit, 0 or 16
+    flag.mreverse        int            Single bit, 0 or 32
+    flag.read1           int            Single bit, 0 or 64
+    flag.read2           int            Single bit, 0 or 128
+    flag.secondary       int            Single bit, 0 or 256
+    flag.qcfail          int            Single bit, 0 or 512
+    flag.dup             int            Single bit, 0 or 1024
+    flag.supplementary   int            Single bit, 0 or 2048
+    hclen                int            Number of hard-clipped bases
+    library              string         Library (LB header via RG)
+    mapq                 int            Mapping quality
+    mpos                 int            Synonym for pnext
+    mrefid               int            Mate reference number (0 based)
+    mrname               string         Synonym for rnext
+    ncigar               int            Number of cigar operations
+    pnext                int            Mate's alignment position (1-based)
+    pos                  int            Alignment position (1-based)
+    qlen                 int            Alignment length: no. query bases
+    qname                string         Query name
+    qual                 string         Quality values (raw, 0 based)
+    refid                int            Integer reference number (0 based)
+    rlen                 int            Alignment length: no. reference bases
+    rname                string         Reference name
+    rnext                string         Mate's reference name
+    sclen                int            Number of soft-clipped bases
+    seq                  string         Sequence
+    tlen                 int            Template length (insert size)
+    [XX]                 int / string   XX tag value
+
+
+Flags are returned either as the whole flag value or by checking for a single bit.  Hence the filter expression flag.dup is equivalent to flag &amp; 1024.
+
+&quot;qlen&quot; and &quot;rlen&quot; are measured using the CIGAR string to count the number of query (sequence) and reference bases consumed.  Note &quot;qlen&quot; may not exactly
+match the length of the &quot;seq&quot; field if the sequence is &quot;&#42;&quot;.
+
+&quot;sclen&quot; and &quot;hclen&quot; are the number of soft and hard-clipped bases respectively.  The formula &quot;qlen-sclen&quot; gives the number of sequence bases used in the
+alignment, distinguishing between global alignment and local alignment length.
+
+&quot;endpos&quot; is the (1-based inclusive) position of the rightmost mapped base of the read, as measured using the CIGAR  string,  and  for  mapped  reads  is
+equivalent to &quot;pos+rlen-1&quot;. For unmapped reads, it is the same as &quot;pos&quot;.
+
+Reference names may be matched either by their string forms (&quot;rname&quot; and &quot;mrname&quot;) or as the Nth @SQ line (counting from zero) as stored in BAM using
+&quot;refid&quot; and &quot;mrefid&quot; respectively.
+
+Auxiliary tags are described in square brackets and these expand to either integer or string as defined by the tag  itself  (XX:Z:string  or  XX:i:int).
+For example [NM]&gt;=10 can be used to look for alignments with many mismatches and [RG]=~&quot;grp[ABC]-&quot; will match the read-group string.
+
+If no comparison is used with an auxiliary tag it is taken simply to be a test for the existence of that tag.  So [NM] will return any record containing
+an  NM tag, even if that tag is zero (NM:i:0).  In htslib &lt;= 1.15 negating this with ![NM] gave misleading results as it was true if the tag did not exist
+or did exist but was zero.  Now this is strictly does-not-exist.  An explicit exists([NM]) and !exists([NM]) function has also been  added  to  make
+this intention clear.
+
+Similarly  in htslib &lt;= 1.15 using [NM]!=0 was true both when the tag existed and was not zero as well as when the tag did not exist.  From 1.16 onwards
+all comparison operators are only true for tags that exist, so [NM]!=0 works as expected.
+
+Some simple functions are available to operate on strings.  These treat the strings as arrays of bytes, permitting their length,  minimum,  maximum  and
+average values to be computed.  These are useful for processing Quality Scores.
+
+::
+
+    length(x)   Length of the string (excluding nul char)
+    min(x)      Minimum byte value in the string
+    max(x)      Maximum byte value in the string
+    avg(x)      Average byte value in the string
+
+
+Note  that  &quot;avg&quot; is a floating point value and it may be NAN for empty strings.  This means that &quot;avg(qual)&quot; does not produce an error for records that
+have both seq and qual of &quot;&#42;&quot;.  NAN values will fail any conditional checks, so e.g. &quot;avg(qual) &gt; 20&quot; works and will not report these records.  NAN also
+fails all equality, &lt; and &gt; comparisons, and returns zero when given as an argument to the exists function.  It can be negated with !x in which case  it
+becomes true.
+
+Functions that operate on both strings and numerics:
+
+:: 
+
+    exists(x)      True if the value exists (or is explicitly true).
+    default(x,d)   Value x if it exists or d if not.
+
+Functions that apply only to numeric values:
+
+::
+
+    sqrt(x)    Square root of x
+    log(x)     Natural logarithm of x
+    pow(x, y)  Power function, x to the power of y
+    exp(x)     Base-e exponential, equivalent to pow(e,x)
+
     </help>
     <expand macro="citations"/>
 </tool>
Binary file test-data/test_1.bam has changed
--- a/test-data/test_11.sam	Mon Aug 15 09:13:14 2022 +0000
+++ b/test-data/test_11.sam	Fri Aug 30 10:24:13 2024 +0000
@@ -4,7 +4,7 @@
 @SQ	SN:chr8	LN:202
 @RG	ID:0	SM:Hi,Mom!
 @PG	ID:1	PN:Hey!	VN:2.0
-@PG	ID:samtools	PN:samtools	PP:1	VN:1.12	CL:samtools view -@ 0 -h -o outfile infile
+@PG	ID:samtools	PN:samtools	PP:1	VN:1.15.1	CL:samtools view -@ 0 -h -o outfile infile
 both_reads_align_clip_marked	83	chr7	1	255	101M	=	302	201	CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN	)'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/&	RG:Z:0
 both_reads_present_only_first_aligns	89	chr7	1	255	101M	*	0	0	CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN	)'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/&	RG:Z:0
 read_2_too_many_gaps	83	chr7	1	255	101M	=	302	201	CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN	)'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/&	RG:Z:0
--- a/test-data/test_12.sam	Mon Aug 15 09:13:14 2022 +0000
+++ b/test-data/test_12.sam	Fri Aug 30 10:24:13 2024 +0000
@@ -4,4 +4,4 @@
 @SQ	SN:chr8	LN:202
 @RG	ID:0	SM:Hi,Mom!
 @PG	ID:1	PN:Hey!	VN:2.0
-@PG	ID:samtools	PN:samtools	PP:1	VN:1.12	CL:samtools view -H -o outfile infile
+@PG	ID:samtools	PN:samtools	PP:1	VN:1.15.1	CL:samtools view -H -o outfile infile
Binary file test-data/test_15.cram has changed
Binary file test-data/test_17.bam has changed
Binary file test-data/test_19.bam has changed
Binary file test-data/test_2.bam has changed
Binary file test-data/test_20.bam has changed
--- a/test-data/test_21.sam	Mon Aug 15 09:13:14 2022 +0000
+++ b/test-data/test_21.sam	Fri Aug 30 10:24:13 2024 +0000
@@ -3,6 +3,6 @@
 @RG	ID:UNKNOWN	SM:UNKNOWN
 @PG	ID:bowtie2	PN:bowtie2	VN:2.0.0-beta5
 @PG	ID:0	CL:aaaaa/aaa/aaaaa/aaaaaa/aaaaaaaaa/aaa/iuc/package_aaaaaaaaa_x_y/aaaaaaaaaaaa/bin/aaaaaaaaaaaaaaaaa aaaaaaaaaa /aaaa/aaaaa/aaa/aaaaaaaaaaaaaaaaaaa/tools/aaaaaaaaa/test-data/test.cram aa /aaaa/aaaaa/aaa/aaaaaaaaaaaaaaaaaaa/tools/aaaaaaaaa/test-data/test.fa -O test	PN:samtools	VN:1.2
-@PG	ID:samtools	PN:samtools	PP:0	VN:1.15.1	CL:samtools view -@ 1 -h -f 0 -F 0 -G 0 -s 5.20000000 -o outfile infile
-SRR065390.1871511	16	CHROMOSOME_I	3	1	100M	*	0	0	CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA	<?@<@A8>0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	RG:Z:UNKNOWN	XG:i:0	XM:i:0	XN:i:0	XO:i:0	AS:i:0	XS:i:0	YT:Z:UU
-SRR065390.6905811	16	CHROMOSOME_I	3	1	100M	*	0	0	CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA	#######################BB@>A<BC>@@BCCB@=BACBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	RG:Z:UNKNOWN	XG:i:0	XM:i:0	XN:i:0	XO:i:0	AS:i:0	XS:i:0	YT:Z:UU
+@PG	ID:samtools	PN:samtools	PP:0	VN:1.15.1	CL:samtools view -@ 0 -h -f 0 -F 0 -G 0 --subsample-seed 24733 --subsample 0.20000000 -o outfile infile
+SRR065390.3743423	16	CHROMOSOME_I	3	1	100M	*	0	0	CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA	##################?6@:7<=@3=@ABAAB>BDBBABADABDDDBDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	RG:Z:UNKNOWN	XG:i:0	XM:i:0	XN:i:0	XO:i:0	AS:i:0	XS:i:0	YT:Z:UU
+SRR065390.5238868	16	CHROMOSOME_I	3	1	100M	*	0	0	CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA	@,=@@D8D;?BBB>;?BBB==BB@D;>D>BBB>BBDDB<DABADCACDCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	RG:Z:UNKNOWN	XG:i:0	XM:i:0	XN:i:0	XO:i:0	AS:i:0	XS:i:0	YT:Z:UU
--- a/test-data/test_22.sam	Mon Aug 15 09:13:14 2022 +0000
+++ b/test-data/test_22.sam	Fri Aug 30 10:24:13 2024 +0000
@@ -3,7 +3,7 @@
 @RG	ID:UNKNOWN	SM:UNKNOWN
 @PG	ID:bowtie2	PN:bowtie2	VN:2.0.0-beta5
 @PG	ID:0	CL:aaaaa/aaa/aaaaa/aaaaaa/aaaaaaaaa/aaa/iuc/package_aaaaaaaaa_x_y/aaaaaaaaaaaa/bin/aaaaaaaaaaaaaaaaa aaaaaaaaaa /aaaa/aaaaa/aaa/aaaaaaaaaaaaaaaaaaa/tools/aaaaaaaaa/test-data/test.cram aa /aaaa/aaaaa/aaa/aaaaaaaaaaaaaaaaaaa/tools/aaaaaaaaa/test-data/test.fa -O test	PN:samtools	VN:1.2
-@PG	ID:samtools	PN:samtools	PP:0	VN:1.15.1	CL:samtools view -@ 1 -h -f 0 -F 0 -G 0 -s 0.00000000 -o outfile infile
+@PG	ID:samtools	PN:samtools	PP:0	VN:1.15.1	CL:samtools view -@ 0 -h -f 0 -F 0 -G 0 --subsample-seed 5206 --subsample 1.00000000 -o outfile infile
 SRR065390.14978392	16	CHROMOSOME_I	2	1	27M1D73M	*	0	0	CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA	#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	RG:Z:UNKNOWN	XG:i:1	XM:i:5	XN:i:0	XO:i:1	AS:i:-18	XS:i:-18	YT:Z:UU
 SRR065390.921023	16	CHROMOSOME_I	3	12	100M	*	0	0	CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA	###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000	RG:Z:UNKNOWN	XG:i:0	XM:i:3	XN:i:0	XO:i:0	AS:i:-6	XS:i:-13	YT:Z:UU
 SRR065390.1871511	16	CHROMOSOME_I	3	1	100M	*	0	0	CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA	<?@<@A8>0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	RG:Z:UNKNOWN	XG:i:0	XM:i:0	XN:i:0	XO:i:0	AS:i:0	XS:i:0	YT:Z:UU
--- a/test-data/test_23.sam	Mon Aug 15 09:13:14 2022 +0000
+++ b/test-data/test_23.sam	Fri Aug 30 10:24:13 2024 +0000
@@ -4,4 +4,4 @@
 @PG	ID:bowtie2	PN:bowtie2	VN:2.0.0-beta5
 @PG	ID:0	CL:aaaaa/aaa/aaaaa/aaaaaa/aaaaaaaaa/aaa/iuc/package_aaaaaaaaa_x_y/aaaaaaaaaaaa/bin/aaaaaaaaaaaaaaaaa aaaaaaaaaa /aaaa/aaaaa/aaa/aaaaaaaaaaaaaaaaaaa/tools/aaaaaaaaa/test-data/test.cram aa /aaaa/aaaaa/aaa/aaaaaaaaaaaaaaaaaaa/tools/aaaaaaaaa/test-data/test.fa -O test	PN:samtools	VN:1.2
 @PG	ID:samtools	PN:samtools	PP:0	VN:1.12	CL:samtools view -@ 0 -h -s .0 -o outfile infile
-@PG	ID:samtools.1	PN:samtools	PP:samtools	VN:1.12	CL:samtools view -@ 0 -h -s .0 -o outfile infile
+@PG	ID:samtools.1	PN:samtools	PP:samtools	VN:1.15.1	CL:samtools view -@ 0 -h -f 0 -F 0 -G 0 --subsample-seed 23916 --subsample 1.00000000 -o outfile infile
Binary file test-data/test_24.bam has changed
--- a/test-data/test_25.sam	Mon Aug 15 09:13:14 2022 +0000
+++ b/test-data/test_25.sam	Fri Aug 30 10:24:13 2024 +0000
@@ -3,7 +3,7 @@
 @RG	ID:UNKNOWN	SM:UNKNOWN
 @PG	ID:bowtie2	PN:bowtie2	VN:2.0.0-beta5
 @PG	ID:0	CL:aaaaa/aaa/aaaaa/aaaaaa/aaaaaaaaa/aaa/iuc/package_aaaaaaaaa_x_y/aaaaaaaaaaaa/bin/aaaaaaaaaaaaaaaaa aaaaaaaaaa /aaaa/aaaaa/aaa/aaaaaaaaaaaaaaaaaaa/tools/aaaaaaaaa/test-data/test.cram aa /aaaa/aaaaa/aaa/aaaaaaaaaaaaaaaaaaa/tools/aaaaaaaaa/test-data/test.fa -O test	PN:samtools	VN:1.2
-@PG	ID:samtools	PN:samtools	PP:0	VN:1.15.1	CL:samtools view -@ 1 -h -f 0 -F 0 -G 0 -s 7.20000000 -o outfile infile
+@PG	ID:samtools	PN:samtools	PP:0	VN:1.15.1	CL:samtools view -@ 0 -h -f 0 -F 0 -G 0 --subsample-seed 7 --subsample 0.20000000 -o outfile infile
 SRR065390.14978392	16	CHROMOSOME_I	2	1	27M1D73M	*	0	0	CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA	#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	RG:Z:UNKNOWN	XG:i:1	XM:i:5	XN:i:0	XO:i:1	AS:i:-18	XS:i:-18	YT:Z:UU
 SRR065390.921023	16	CHROMOSOME_I	3	12	100M	*	0	0	CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA	###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000	RG:Z:UNKNOWN	XG:i:0	XM:i:3	XN:i:0	XO:i:0	AS:i:-6	XS:i:-13	YT:Z:UU
 SRR065390.6023338	0	CHROMOSOME_I	3	1	100M	*	0	0	CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAAGCTAC	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@CCDDDBCCABB=DABBA?################	RG:Z:UNKNOWN	XG:i:0	XM:i:3	XN:i:0	XO:i:0	AS:i:-6	XS:i:-6	YT:Z:UU
Binary file test-data/test_26.bam has changed
Binary file test-data/test_27.bam has changed
Binary file test-data/test_28.bam has changed
Binary file test-data/test_29.bam has changed
Binary file test-data/test_3.bam has changed
Binary file test-data/test_30.bam has changed
Binary file test-data/test_31.bam has changed
Binary file test-data/test_32.bam has changed
Binary file test-data/test_33.bam has changed
Binary file test-data/test_4.bam has changed
Binary file test-data/test_5.bam has changed
Binary file test-data/test_7.bam has changed
--- a/test-data/test_8.bam	Mon Aug 15 09:13:14 2022 +0000
+++ b/test-data/test_8.bam	Fri Aug 30 10:24:13 2024 +0000
@@ -167,7 +167,7 @@
 @SQ	SN:GCCAAACCCCAAAAACAAGACTAAACAATGCACAATACTTCATGAAGCTT	LN:0
 @SQ	SN:GAACTTTCCCCCCGCCATTAATACCAACATGCTACTTTAATCAATAAAAT	LN:0
 @SQ	SN:TTCTTCCCCC	LN:0
-@PG	ID:samtools	PN:samtools	VN:1.12	CL:samtools view -@ 0 -h -o outfile infile
+@PG	ID:samtools	PN:samtools	VN:1.15.1	CL:samtools view -@ 0 -h -o outfile infile
 HWI-EAS91_1_30788AAXX:1:1:1218:141	16	*	14062	25	36M	*	0	0	ACAAAACTAACAACAAAAATAACACTCNNAATAAAC	I+IIII1IIIIIIIIIIIIIIIIIIII""IIIIIII	NM:i:1	X1:i:1	MD:Z:7N0N27
 HWI-EAS91_1_30788AAXX:1:1:1310:991	16	*	10002	25	36M	*	0	0	CTCCTATGCCTAGAAGGAATAATACTANNACTATTC	I:2IEI:IIDIIIIII4IIIIIIIIII""IIIIIII	NM:i:1	X1:i:1	MD:Z:7N0N27
 HWI-EAS91_1_30788AAXX:1:1:1398:854	16	*	3921	25	36M	*	0	0	CACCCTTCCCGTACTAATAAATCCCCTNNTCTTCAC	IIIII=AIIIIIIIIIIIIIIBIIIII""IIIIIII	NM:i:1	X1:i:1	MD:Z:7N0N27