Mercurial > repos > jdv > albacore

--- a/albacore_1D.py	Wed Aug 30 02:47:27 2017 -0400
+++ b/albacore_1D.py	Wed Sep 06 12:12:52 2017 -0400
@@ -7,15 +7,20 @@
 import shutil
 import h5py
 import numpy as np
+import tarfile
+from distutils.util import strtobool

 def main():
     tar_file = sys.argv[1]
     out_file = sys.argv[2]
-    threads  = sys.argv[3]
+    out_fmt  = sys.argv[3]
+    demux    = strtobool( sys.argv[4] )
+    threads  = sys.argv[5]

     (flowcell, kit) = parse_meta(tar_file)

-    subprocess.call(["read_fast5_basecaller.py",
+    subprocess.call(
+        ["read_fast5_basecaller.py",
         "--input", "in_dir",
         "--worker_threads", threads,
         "--save_path", "out_dir",
@@ -23,15 +28,63 @@
         "--kit", kit,
         "--recursive",
         "--files_per_batch_folder", "0",
-        "--output_format", "fastq",
-        "--reads_per_fastq_batch", "999999999" ])
+        "--output_format", out_fmt,
+        "--reads_per_fastq_batch", "999999999" ] +
+        ["--barcoding"] * demux )
+
+    if demux:
+        #check for demuxed albacore output and copy to Galaxy output
+        final_dir = "final"
+        if not os.path.exists(final_dir):
+            os.makedirs(final_dir)
+        dirs = glob.glob("out_dir/workspace/*")
+        for d in dirs:
+
+            if out_fmt == 'fastq':
+                bc = os.path.basename( os.path.normpath( d ) ) + ".fastq"
+                print(d)
+                print(bc)
+                out = os.path.join( final_dir, bc )
+                files = glob.glob( os.path.join( d, "*.fastq") )
+                if len(files) != 1:
+                    raise ValueError('No or multiple FASTQ output files found')
+                found_file = files[0]
+                shutil.copy(found_file, out)

-    #check for single albacore output and copy to Galaxy output
-    files = glob.glob("out_dir/workspace/*.fastq")
-    if len(files) != 1:
-        raise ValueError('No or multiple FASTQ output files found')
-    found_file = files[0]
-    shutil.copy(found_file, out_file)
+            elif out_fmt == 'fast5':
+                bc = os.path.basename( os.path.normpath( d ) ) + ".fast5.tar.gz"
+                print(d)
+                print(bc)
+                out = os.path.join( final_dir, bc )
+                files = glob.glob( os.path.join( d, "**", "*.fast5"), recursive=True)
+                if len(files) < 1:
+                    raise ValueError('No FAST5 output files found')
+                tar = tarfile.open(out, 'w:gz')
+                tar.add( d )
+                tar.close()
+
+            else:
+                raise ValueError('Bad output format specified')
+
+    else:
+        if out_fmt == 'fastq':
+            #check for single albacore output and copy to Galaxy output
+            files = glob.glob("out_dir/workspace/*.fastq")
+            if len(files) != 1:
+                raise ValueError('No or multiple FASTQ output files found')
+            found_file = files[0]
+            shutil.copy(found_file, out_file)
+        elif out_fmt == 'fast5':
+            #check for single albacore output and copy to Galaxy output
+            files = glob.glob("out_dir/workspace/**/*.fast5", recursive=True)
+            if len(files) < 1:
+                raise ValueError('No FAST5 output files found')
+            tar = tarfile.open(out_file, 'w:gz')
+            tar.add("out_dir/workspace")
+            tar.close()
+        else:
+            raise ValueError('Bad output format specified')
+

 def parse_meta(fn):
--- a/albacore_1D.xml	Wed Aug 30 02:47:27 2017 -0400
+++ b/albacore_1D.xml	Wed Sep 06 12:12:52 2017 -0400
@@ -19,7 +19,7 @@
     <command detect_errors="aggressive">
     <![CDATA[

-    python3 $__tool_directory__/albacore_1D.py $input $output \${GALAXY_SLOTS:-1}
+    python3 $__tool_directory__/albacore_1D.py $input $output $out_format $demux \${GALAXY_SLOTS:-1}

     ]]>
     </command>
@@ -29,6 +29,11 @@
     <inputs>

         <param name="input" type="data" format="fast5_archive" label="Input reads" />
+        <param name="out_format" type="select" label="Output format">
+            <option value="fastq" selected="true">fastq</option>
+            <option value="fast5">fast5</option>
+        </param>
+        <param name="demux" type="boolean" checked="false" label="Demultiplex" />

     </inputs>

@@ -36,7 +41,21 @@

     <outputs>

-        <data name="output" format="fastq" label="${tool.name} on ${on_string} (called.fastq)" />
+        <data name="output" format="fastq" label="${tool.name} on ${on_string} (reads)">
+            <filter>demux is False</filter>
+            <change_format>
+                <when input="out_format" value="fast5" format="fast5_archive" />
+            </change_format>
+        </data>
+        <collection type="list" name="output_collection_fastq" label="${tool.name} on ${on_string} (reads)">
+            <filter>demux is True and out_format == 'fastq'</filter>
+            <discover_datasets pattern="(?P&lt;name&gt;.*)" directory="final" format="fastqsanger" />
+        </collection>
+        <collection type="list" name="output_collection_fast5" label="${tool.name} on ${on_string} (reads)">
+            <filter>demux is True and out_format == 'fast5'</filter>
+            <discover_datasets pattern="(?P&lt;name&gt;.*)" directory="final" format="fast5_archive" />
+        </collection>
+        <data name="table" format="tabular" from_work_dir="out_dir/sequencing_summary.txt" label="${tool.name} on ${on_string} (table)" />

     </outputs>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/albacore_denoise.xml	Wed Sep 06 12:12:52 2017 -0400
@@ -0,0 +1,112 @@
+<tool id="albacore_denoise" name="Albacore de-noise" version="0.001">
+
+    <description>Filter noise from barcode bins</description>
+
+    <!-- ***************************************************************** -->
+
+    <!--
+    <requirements>
+        <requirement type="package" version="1.2.6">albacore</requirement>
+    </requirements>
+    -->
+
+    <!-- ***************************************************************** -->
+
+    <version_command>echo "0.001"</version_command>
+
+    <!-- ***************************************************************** -->
+
+    <command detect_errors="aggressive">
+    <![CDATA[
+
+    perl $__tool_directory__/denoise.pl
+
+        --table $table
+
+        #if $filter.type == 'topN'
+          --n_keep ${filter.n_keep}
+        #else
+          --min_score ${filter.min_score}
+          --min_frac  ${filter.min_frac}
+        #end if
+
+        $remove_unclassified
+
+        #for $input in $inputs
+            --input ${input}
+            --name  ${input.name}
+        #end for
+
+        --summary $summary
+
+    ]]>
+    </command>
+
+    <!-- ***************************************************************** -->
+
+    <inputs>
+
+        <param name="inputs" type="data_collection" collection_type="list" format="fast5_archive" label="Input reads" multiple="true" />
+        <param name="table" type="data" format="tabular" label="Read table" />
+        <conditional name="filter">
+            <param name="type" type="select" label="Filtering type">
+                <option value="cutoffs" selected="true">By cutoff</option>
+                <option value="topN">Top N bins</option>
+            </param>
+            <when value="cutoffs">
+                <param name="min_score" value="70" type="float" min="0" max="100" label="Minimum average score (0-100)" />
+                <param name="min_frac" value="0.05" type="float" min="0" label="Minimum fraction of average count" />
+            </when>
+            <when value="topN">
+                <param name="n_keep" value="1" type="integer" min="1" label="Number of top bins to keep" />
+            </when>
+        </conditional>
+        <param name="remove_unclassified" type="boolean" checked="true" truevalue="--remove_unclassified" falsevalue="" label="Remove unclassified reads" />
+
+    </inputs>
+
+    <!-- ***************************************************************** -->
+
+    <outputs>
+
+        <collection type="list" name="outputs" label="${tool.name} on ${on_string} (reads)">
+            <discover_datasets pattern="(?P&lt;name&gt;.*)\.fast5\.tar\.gz$" directory="outputs" format="fast5_archive" />
+        </collection>
+
+        <data name="summary" format="tabular" label="${tool.name} on ${on_string} (summary)" />
+
+    </outputs>
+
+    <!-- ***************************************************************** -->
+
+    <!--
+    <tests>
+        <test>
+            <param name="input" value="test_data.fast5.tar.gz" ftype="fast5_archive" />
+            <output name="output" file="test_data.fastq" compare="diff" />
+        </test>
+    </tests>
+    -->
+
+    <!-- ***************************************************************** -->
+
+    <help>
+    <![CDATA[
+
+**Description**
+
+This script will filter "noise" bins from the barcoded output of Albacore
+based on read counts and mean quality scores for each barcode bin. It can
+either filter the top N bins (if you know the number of barcodes in your
+sample) or filter based on minimum read count (as ratio to average value over
+all bin) and minimum average score.
+
+    ]]>
+    </help>
+
+    <!-- ***************************************************************** -->
+
+    <citations>
+    </citations>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/denoise.pl	Wed Sep 06 12:12:52 2017 -0400
@@ -0,0 +1,136 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use 5.012;
+
+use File::Basename qw/basename/;
+use File::Copy qw/copy/;
+use Getopt::Long;
+use List::Util qw/sum/;
+
+my $fn_table;
+my @reads;
+my @names;
+my $min_score = 80;
+my $min_frac  = 0.05;
+my $rm_unclass = 0;
+my $n_keep;
+my $fn_summary;
+
+use constant BARCODE => 14;
+use constant SCORE   => 15;
+
+my $unclass_tag = 'unclassified';
+
+GetOptions(
+    'input=s' => \@reads,
+    'name=s'  => \@names,
+    'table=s' => \$fn_table,
+    'min_score=f' => \$min_score,
+    'min_frac=f'  => \$min_frac,
+    'remove_unclassified' => \$rm_unclass,
+    'n_keep=i'    => \$n_keep,
+    'summary=s'   => \$fn_summary,
+);
+
+die "Table not found\n"
+    if (! -r $fn_table);
+
+open my $tsv, '<', $fn_table;
+my $head = <$tsv>;
+chomp $head;
+my @fields = split "\t", $head;
+
+die "unexpected field order"
+    if ($fields[BARCODE] ne 'barcode_arrangement');
+die "unexpected field order"
+    if ($fields[SCORE] ne 'barcode_score');
+
+my %counts;
+my %sums;
+
+while (my $line = <$tsv>) {
+    chomp $line;
+    my @fields = split "\t", $line;
+    my $bin   = $fields[BARCODE];
+    my $score = $fields[SCORE];
+    $counts{$bin} += 1;
+    $sums{$bin}   += $score;
+}
+
+if ($rm_unclass) {
+    delete $counts{$unclass_tag};
+    delete $sums{$unclass_tag};
+}
+
+my @keys = sort {$counts{$b} <=> $counts{$a}} keys %counts;
+
+my %scores;
+
+my %status;
+
+@status{ @keys } = ('discarded') x scalar(@keys);
+
+for (@keys) {
+    $scores{$_} = $sums{$_}/$counts{$_};
+}
+my $sum_count  = sum(values %counts);
+my $mean_count = $sum_count / scalar(values %counts);
+
+if (defined $n_keep) {
+
+    my %rank_scores;
+    @rank_scores{ @keys } = map {$scores{$_} * $counts{$_}/$sum_count} @keys;
+
+    @keys = sort {$rank_scores{$b} <=> $rank_scores{$a}} keys %rank_scores;
+
+    @keys = @keys[0..$n_keep-1];
+
+}
+
+else {
+
+    @keys = grep {$scores{$_} >= $min_score} @keys;
+    @keys = grep {$counts{$_} >= $mean_count*$min_frac} @keys;
+
+}
+
+@status{ @keys } = ('kept') x scalar(@keys);
+
+@keys = sort {$counts{$b} <=> $counts{$a}} keys %status;
+
+# print summary table
+open my $summary, '>', $fn_summary
+    or die "Failed to open summary: $!\n";
+say {$summary} join "\t",
+    '#bin',
+    'n_reads',
+    'avg_score',
+    'status',
+;
+for (@keys) {
+    say {$summary} join "\t",
+        $_,
+        $counts{$_},
+        sprintf("%0.3f",$scores{$_}),
+        $status{$_},
+    ;
+}
+close $summary;
+
+mkdir('outputs');
+
+for my $i (0..$#reads) {
+
+    my $fn = $reads[$i];
+    my $name = $names[$i];
+
+    my $base = basename($name);
+    my $bin = $base;
+    $bin =~ s/\.(fastq|fq|fast5\.tar\.gz)$//i;
+    if (defined $status{$bin} && $status{$bin} eq 'kept') {
+        copy( $fn, "outputs/$base" );
+    }
+
+}