Mercurial > repos > peterjc > count_roi_variants
changeset 4:f7901a2a9d0c draft
v0.0.4 - Improved usage text and README for use outside of Galaxy.
author | peterjc |
---|---|
date | Wed, 01 Feb 2017 07:07:57 -0500 |
parents | 7cd370b7a952 |
children | 2ad9d74f97a8 |
files | tools/count_roi_variants/README.rst tools/count_roi_variants/count_roi_variants.py |
diffstat | 2 files changed, 58 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/count_roi_variants/README.rst Thu Feb 18 06:41:09 2016 -0500 +++ b/tools/count_roi_variants/README.rst Wed Feb 01 07:07:57 2017 -0500 @@ -15,15 +15,38 @@ http://toolshed.g2.bx.psu.edu/view/peterjc/count_roi_variants -Automated Installation -====================== +Use outside of Galaxy +===================== + +You just need the ``count_roi_variants.py`` script and to have samtools +on the ``$PATH``. If you move/copy the script somewhere on your ``$PATH`` +and then you can run it like this:: + + $ count_roi_variants.py --help + +Or, call the script at an explicit path:: + + $ /path/to/my/stuff/count_roi_variants.py --help + +Run like this it will use the current default Python. This was written and +tested under Python 2.7, but should also work under Python 2.6. e.g.:: + + $ python2.6 /path/to/my/stuff/count_roi_variants.py --help + +This does not yet support Python 3. + +The sample data and tests are designed to be run via Galaxy. + + +Automated Galaxy Installation +============================= This should be straightforward, Galaxy should automatically download and install samtools if required. -Manual Installation -=================== +Manual Galaxy Installation +========================== This expects samtools to be on the ``$PATH``, and was tested using v0.1.3 @@ -56,6 +79,7 @@ v0.0.1 - Initial public release v0.0.2 - Cope with pipes in reference name (e.g. NCBI style FASTA naming) v0.0.3 - Include a functional test for using an unrecognised reference. +v0.0.4 - Improved usage text and README for use outside of Galaxy. ======= ======================================================================
--- a/tools/count_roi_variants/count_roi_variants.py Thu Feb 18 06:41:09 2016 -0500 +++ b/tools/count_roi_variants/count_roi_variants.py Wed Feb 01 07:07:57 2017 -0500 @@ -19,15 +19,28 @@ if "-v" in sys.argv or "--version" in sys.argv: # Galaxy seems to invert the order of the two lines - print("BAM coverage statistics v0.0.3 (using samtools)") + print("BAM coverage statistics v0.0.4 (using samtools)") cmd = "samtools 2>&1 | grep -i ^Version" sys.exit(os.system(cmd)) # TODO - Proper command line API +usage = """Requires 4 arguments: BAM, BAI, tabular filename, samtools-style region + +For ease of use, you can use a minus sign as the BAI filename which will use the BAM +filename with the suffix .bai added to it. Example using one of the test-data files: + +$ count_roi_variants.py "SRR639755_mito_pairs_vs_NC_010642_clc.bam" "-" "counts.txt" "gi|187250362|ref|NC_010642.1|:1695-1725" +Counted 3 variants from 16 reads spanning gi|187250362|ref|NC_010642.1|:1695-1725 +$ cat counts.txt +Variant Count Percentage +AGCCCATGAGATGGGAAGCAATGGGCTACA 14 87.50 +AGCCCATGAGATGGGAAGCAATGGGCTACG 1 6.25 +AGCGCATGAGATGGGAAGCAATGGGCTACG 1 6.25 +""" if len(sys.argv) == 5: bam_filename, bai_filename, tabular_filename, region = sys.argv[1:] else: - sys.exit("Require 4 arguments: BAM, BAI, tabular filename, samtools-style region") + sys.exit(usage) if not os.path.isfile(bam_filename): sys.exit("Input BAM file not found: %s" % bam_filename) @@ -41,9 +54,9 @@ sys.exit("Input BAI file not found: %s" % bai_filename) try: - #Sanity check we have "ref:start-end" to give clear error message - #Note can have semi-colon in the reference name - #Note can have thousand separator commas in the start/end + # Sanity check we have "ref:start-end" to give clear error message + # Note can have semi-colon in the reference name + # Note can have thousand separator commas in the start/end ref, start_end = region.rsplit(":", 1) start, end = start_end.split("-") start = int(start.replace(",", "")) @@ -75,7 +88,7 @@ answer = [] for letter in cigar: if letter.isdigit(): - count += letter #string addition + count += letter # string addition elif letter in "MIDNSHP=X": answer.append((int(count), letter)) count = "" @@ -83,26 +96,29 @@ raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar)) return answer -assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14,'S'),(15,'M'),(1,'P'),(1,'D'),(3,'P'),(54,'M'),(1,'D'),(34,'M'),(5,'S')] + +assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14, 'S'), (15, 'M'), (1, 'P'), (1, 'D'), (3, 'P'), (54, 'M'), (1, 'D'), (34, 'M'), (5, 'S')] + def align_len(cigar_ops): """Sums the CIGAR M/=/X/D/N operators.""" return sum(count for count, op in cigar_ops if op in "M=XDN") + def expand_cigar(seq, cigar_ops): """Yields (ref_offset, seq_base) pairs.""" ref_offset = 0 seq_offset = 0 for count, op in cigar_ops: if op in "MX=": - for (i, base) in enumerate(seq[seq_offset:seq_offset+count]): + for (i, base) in enumerate(seq[seq_offset:seq_offset + count]): yield ref_offset + i, base ref_offset += count seq_offset += count elif op == "I": # Give them all an in-between reference position # (Python lets us mix integers and floats, wouldn't work in C) - for (i, base) in enumerate(seq[seq_offset:seq_offset+count]): + for (i, base) in enumerate(seq[seq_offset:seq_offset + count]): yield ref_offset - 0.5, base # Does not change ref_offset seq_offset += count @@ -133,6 +149,7 @@ assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, "c"), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')] assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, 'c'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')] + def get_roi(seq, cigar_ops, start, end): """Extract region of seq mapping to the ROI. @@ -147,8 +164,8 @@ assert cigar_ops[0][0] == len(seq) return seq[start:end] # Would use "start <= i < end" if they were all integers, but - # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8. - return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end-1) + # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8. + return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1) assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8) assert "GGGG" == get_roi("AAAAcGGGGTTTT", decode_cigar("4M1I8M"), 4, 8) @@ -157,6 +174,7 @@ assert "GGaGG" == get_roi("AAAAGGaGGTTTT", decode_cigar("6M1I6M"), 4, 8) assert "GGGgA" == get_roi("AAAAGGGgATTTT", decode_cigar("7M1I5M"), 4, 8) + def count_region(): # Could recreate the region string (with no commas in start/end)? # region = "%s:%i-%i" % (ref, start, end) @@ -197,6 +215,7 @@ return tally + def record_counts(): tally = count_region()