changeset 1:16defe17fda9 draft

v0.0.2 Cope with pipes in reference name (e.g. NCBI style FASTA naming)
author peterjc
date Wed, 17 Feb 2016 11:40:27 -0500
parents 78898a40028b
children 56852ae15edd
files test-data/SRR639755_mito_pairs_vs_NC_010642_clc.bam test-data/SRR639755_mito_pairs_vs_NC_010642_clc.bam.bai test-data/SRR639755_mito_pairs_vs_NC_010642_clc.count-1695-1725.tabular tools/count_roi_variants/README.rst tools/count_roi_variants/count_roi_variants.py tools/count_roi_variants/count_roi_variants.xml
diffstat 6 files changed, 36 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
Binary file test-data/SRR639755_mito_pairs_vs_NC_010642_clc.bam has changed
Binary file test-data/SRR639755_mito_pairs_vs_NC_010642_clc.bam.bai has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/SRR639755_mito_pairs_vs_NC_010642_clc.count-1695-1725.tabular	Wed Feb 17 11:40:27 2016 -0500
@@ -0,0 +1,4 @@
+Variant	Count	Percentage
+AGCCCATGAGATGGGAAGCAATGGGCTACA	14	87.50
+AGCCCATGAGATGGGAAGCAATGGGCTACG	1	6.25
+AGCGCATGAGATGGGAAGCAATGGGCTACG	1	6.25
--- a/tools/count_roi_variants/README.rst	Fri Feb 12 11:17:17 2016 -0500
+++ b/tools/count_roi_variants/README.rst	Wed Feb 17 11:40:27 2016 -0500
@@ -54,6 +54,7 @@
 Version Changes
 ------- ----------------------------------------------------------------------
 v0.0.1  - Initial public release
+v0.0.2  - Cope with pipes in reference name (e.g. NCBI style FASTA naming)
 ======= ======================================================================
 
 
@@ -77,7 +78,7 @@
 
 To just build and check the tar ball, use::
 
-    $ planemo shed_upload --tar_only  ~/repositories/pico_galaxy/tools/count_roi_variants/
+    $ planemo shed_build tools/count_roi_variants/
     ...
     $ tar -tzf shed_upload.tar.gz
     tools/count_roi_variants/README.rst
--- a/tools/count_roi_variants/count_roi_variants.py	Fri Feb 12 11:17:17 2016 -0500
+++ b/tools/count_roi_variants/count_roi_variants.py	Wed Feb 17 11:40:27 2016 -0500
@@ -19,7 +19,7 @@
 
 if "-v" in sys.argv or "--version" in sys.argv:
     # Galaxy seems to invert the order of the two lines
-    print("BAM coverage statistics v0.0.1")
+    print("BAM coverage statistics v0.0.2 (using samtools)")
     cmd = "samtools 2>&1 | grep -i ^Version"
     sys.exit(os.system(cmd))
 
@@ -128,11 +128,9 @@
 assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [(0, "A"), (1, "C"), (1.5, "t"), (2, "G"), (3, "T")]
 assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [(-0.5, 't'), (0, 'A'), (1, 'C'), (2, 'G'), (3, 'T')]
 assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [(0, 'A'), (1, 'C'), (2, 'G'), (3, 'T'), (3.5, 't')]
-assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G\
-'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
+assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
 assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
-assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G\
-'), (6, 'G'), (7, 'G'), (7.5, "c"), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
+assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, "c"), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
 assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, 'c'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
 
 def get_roi(seq, cigar_ops, start, end):
@@ -167,7 +165,7 @@
 
     # Call samtools view, don't need header so no -h added.
     # Only want mapped reads, thus flag filter -F 4.
-    child = subprocess.Popen(["samtools", "view", "-F", "4", bam_file, region],
+    child = subprocess.Popen(["/mnt/galaxy/bin/samtools_1.1", "view", "-F", "4", bam_file, region],
                              stdout=subprocess.PIPE)
     for line in child.stdout:
         assert line[0] != "@", "Got unexpected SAM header line: %s" % line
--- a/tools/count_roi_variants/count_roi_variants.xml	Fri Feb 12 11:17:17 2016 -0500
+++ b/tools/count_roi_variants/count_roi_variants.xml	Wed Feb 17 11:40:27 2016 -0500
@@ -1,4 +1,4 @@
-<tool id="count_roi_variants" name="Count sequence variants in region of interest" version="0.0.1">
+<tool id="count_roi_variants" name="Count sequence variants in region of interest" version="0.0.2">
     <description>using samtools view</description>
     <requirements>
         <requirement type="binary">samtools</requirement>
@@ -13,7 +13,22 @@
     <command interpreter="python">count_roi_variants.py "$input_bam" "${input_bam.metadata.bam_index}" "$out_tabular" "$region"</command>
     <inputs>
         <param name="input_bam" type="data" format="bam" label="Input BAM file" />
-        <param name="region" type="text" label="Region of interest" help="Use the reference:start-end syntax as in samtools." />
+        <param name="region" type="text" label="Region of interest" help="Use the reference:start-end syntax as in samtools.">
+            <sanitizer>
+                <!--
+                    SAM/BAM spec says the name must match regex [!-)+-<>-~][!-~]*
+                    but will focus on ASCII 33 ("!") to ASCII 126 ("~"), i.e.
+                    !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+                    of which some are going to be potential trouble like !, ", $, &, ', \, `, {, |, }
+                    which is how I came to this hopefully safe set..
+                -->
+                <valid initial="string.letters,string.digits">
+                    <add value="#%+,-./:;=@^_|~" />
+                    <remove value=" " />
+                </valid>
+                <mapping initial="none" />
+            </sanitizer>
+        </param>
     </inputs>
     <outputs>
         <data name="out_tabular" format="tabular" label="$input_bam.name $region variants" />
@@ -35,6 +50,15 @@
                 <has_line line="Counted 1 variants from 1 reads spanning ref:10-15" />
             </assert_stdout>
         </test>
+        <!-- This test is a tricky one due to the NCBI's love of pipe characters in identifiers -->
+        <test>
+            <param name="input_bam" value="SRR639755_mito_pairs_vs_NC_010642_clc.bam" ftype="bam" />
+            <param name="region" value="gi|187250362|ref|NC_010642.1|:1695-1725" />
+            <output name="out_tabular" file="SRR639755_mito_pairs_vs_NC_010642_clc.count-1695-1725.tabular" ftype="tabular" />
+            <assert_stdout>
+                <has_line line="Counted 3 variants from 16 reads spanning gi|187250362|ref|NC_010642.1|:1695-1725" />
+            </assert_stdout>
+        </test>
     </tests>
     <help>
 **What it does**