changeset 4:f7901a2a9d0c draft

v0.0.4 - Improved usage text and README for use outside of Galaxy.
author peterjc
date Wed, 01 Feb 2017 07:07:57 -0500
parents 7cd370b7a952
children 2ad9d74f97a8
files tools/count_roi_variants/README.rst tools/count_roi_variants/count_roi_variants.py
diffstat 2 files changed, 58 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/tools/count_roi_variants/README.rst	Thu Feb 18 06:41:09 2016 -0500
+++ b/tools/count_roi_variants/README.rst	Wed Feb 01 07:07:57 2017 -0500
@@ -15,15 +15,38 @@
 http://toolshed.g2.bx.psu.edu/view/peterjc/count_roi_variants
 
 
-Automated Installation
-======================
+Use outside of Galaxy
+=====================
+
+You just need the ``count_roi_variants.py`` script and to have samtools
+on the ``$PATH``.  If you move/copy the script somewhere on your ``$PATH``
+and then you can run it like this::
+
+    $ count_roi_variants.py --help
+
+Or, call the script at an explicit path::
+
+    $ /path/to/my/stuff/count_roi_variants.py --help
+
+Run like this it will use the current default Python. This was written and
+tested under Python 2.7, but should also work under Python 2.6. e.g.::
+
+    $ python2.6 /path/to/my/stuff/count_roi_variants.py --help
+
+This does not yet support Python 3.
+
+The sample data and tests are designed to be run via Galaxy.
+
+
+Automated Galaxy Installation
+=============================
 
 This should be straightforward, Galaxy should automatically download and install
 samtools if required.
 
 
-Manual Installation
-===================
+Manual Galaxy Installation
+==========================
 
 This expects samtools to be on the ``$PATH``, and was tested using v0.1.3
 
@@ -56,6 +79,7 @@
 v0.0.1  - Initial public release
 v0.0.2  - Cope with pipes in reference name (e.g. NCBI style FASTA naming)
 v0.0.3  - Include a functional test for using an unrecognised reference.
+v0.0.4  - Improved usage text and README for use outside of Galaxy.
 ======= ======================================================================
 
 
--- a/tools/count_roi_variants/count_roi_variants.py	Thu Feb 18 06:41:09 2016 -0500
+++ b/tools/count_roi_variants/count_roi_variants.py	Wed Feb 01 07:07:57 2017 -0500
@@ -19,15 +19,28 @@
 
 if "-v" in sys.argv or "--version" in sys.argv:
     # Galaxy seems to invert the order of the two lines
-    print("BAM coverage statistics v0.0.3 (using samtools)")
+    print("BAM coverage statistics v0.0.4 (using samtools)")
     cmd = "samtools 2>&1 | grep -i ^Version"
     sys.exit(os.system(cmd))
 
 # TODO - Proper command line API
+usage = """Requires 4 arguments: BAM, BAI, tabular filename, samtools-style region
+
+For ease of use, you can use a minus sign as the BAI filename which will use the BAM
+filename with the suffix .bai added to it. Example using one of the test-data files:
+
+$ count_roi_variants.py "SRR639755_mito_pairs_vs_NC_010642_clc.bam" "-" "counts.txt" "gi|187250362|ref|NC_010642.1|:1695-1725"
+Counted 3 variants from 16 reads spanning gi|187250362|ref|NC_010642.1|:1695-1725
+$ cat counts.txt
+Variant	Count	Percentage
+AGCCCATGAGATGGGAAGCAATGGGCTACA	14	87.50
+AGCCCATGAGATGGGAAGCAATGGGCTACG	1	6.25
+AGCGCATGAGATGGGAAGCAATGGGCTACG	1	6.25
+"""
 if len(sys.argv) == 5:
     bam_filename, bai_filename, tabular_filename, region = sys.argv[1:]
 else:
-    sys.exit("Require 4 arguments: BAM, BAI, tabular filename, samtools-style region")
+    sys.exit(usage)
 
 if not os.path.isfile(bam_filename):
     sys.exit("Input BAM file not found: %s" % bam_filename)
@@ -41,9 +54,9 @@
     sys.exit("Input BAI file not found: %s" % bai_filename)
 
 try:
-    #Sanity check we have "ref:start-end" to give clear error message
-    #Note can have semi-colon in the reference name
-    #Note can have thousand separator commas in the start/end
+    # Sanity check we have "ref:start-end" to give clear error message
+    # Note can have semi-colon in the reference name
+    # Note can have thousand separator commas in the start/end
     ref, start_end = region.rsplit(":", 1)
     start, end = start_end.split("-")
     start = int(start.replace(",", ""))
@@ -75,7 +88,7 @@
     answer = []
     for letter in cigar:
         if letter.isdigit():
-            count += letter #string addition
+            count += letter  # string addition
         elif letter in "MIDNSHP=X":
             answer.append((int(count), letter))
             count = ""
@@ -83,26 +96,29 @@
             raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
     return answer
 
-assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14,'S'),(15,'M'),(1,'P'),(1,'D'),(3,'P'),(54,'M'),(1,'D'),(34,'M'),(5,'S')]
+
+assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14, 'S'), (15, 'M'), (1, 'P'), (1, 'D'), (3, 'P'), (54, 'M'), (1, 'D'), (34, 'M'), (5, 'S')]
+
 
 def align_len(cigar_ops):
     """Sums the CIGAR M/=/X/D/N operators."""
     return sum(count for count, op in cigar_ops if op in "M=XDN")
 
+
 def expand_cigar(seq, cigar_ops):
     """Yields (ref_offset, seq_base) pairs."""
     ref_offset = 0
     seq_offset = 0
     for count, op in cigar_ops:
         if op in "MX=":
-            for (i, base) in enumerate(seq[seq_offset:seq_offset+count]):
+            for (i, base) in enumerate(seq[seq_offset:seq_offset + count]):
                 yield ref_offset + i, base
             ref_offset += count
             seq_offset += count
         elif op == "I":
             # Give them all an in-between reference position
             # (Python lets us mix integers and floats, wouldn't work in C)
-            for (i, base) in enumerate(seq[seq_offset:seq_offset+count]):
+            for (i, base) in enumerate(seq[seq_offset:seq_offset + count]):
                 yield ref_offset - 0.5, base
             # Does not change ref_offset
             seq_offset += count
@@ -133,6 +149,7 @@
 assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, "c"), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
 assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, 'c'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
 
+
 def get_roi(seq, cigar_ops, start, end):
     """Extract region of seq mapping to the ROI.
 
@@ -147,8 +164,8 @@
         assert cigar_ops[0][0] == len(seq)
         return seq[start:end]
     # Would use "start <= i < end" if they were all integers, but
-    # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8. 
-    return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end-1)
+    # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8.
+    return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1)
 
 assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8)
 assert "GGGG" == get_roi("AAAAcGGGGTTTT", decode_cigar("4M1I8M"), 4, 8)
@@ -157,6 +174,7 @@
 assert "GGaGG" == get_roi("AAAAGGaGGTTTT", decode_cigar("6M1I6M"), 4, 8)
 assert "GGGgA" == get_roi("AAAAGGGgATTTT", decode_cigar("7M1I5M"), 4, 8)
 
+
 def count_region():
     # Could recreate the region string (with no commas in start/end)?
     # region = "%s:%i-%i" % (ref, start, end)
@@ -197,6 +215,7 @@
 
     return tally
 
+
 def record_counts():
 
     tally = count_region()