# HG changeset patch
# User jdv
# Date 1504714372 14400
# Node ID 0a4f83207e53100984dd46d339629dab69bce92d
# Parent f8e25d69167d9f7b700ceb2d3c3afaeccb15f20b
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/albacore commit 4aa7a76a7b29c425dd89a020979e835d785d3c95-dirty
diff -r f8e25d69167d -r 0a4f83207e53 albacore_1D.py
--- a/albacore_1D.py Wed Aug 30 02:47:27 2017 -0400
+++ b/albacore_1D.py Wed Sep 06 12:12:52 2017 -0400
@@ -7,15 +7,20 @@
import shutil
import h5py
import numpy as np
+import tarfile
+from distutils.util import strtobool
def main():
tar_file = sys.argv[1]
out_file = sys.argv[2]
- threads = sys.argv[3]
+ out_fmt = sys.argv[3]
+ demux = strtobool( sys.argv[4] )
+ threads = sys.argv[5]
(flowcell, kit) = parse_meta(tar_file)
- subprocess.call(["read_fast5_basecaller.py",
+ subprocess.call(
+ ["read_fast5_basecaller.py",
"--input", "in_dir",
"--worker_threads", threads,
"--save_path", "out_dir",
@@ -23,15 +28,63 @@
"--kit", kit,
"--recursive",
"--files_per_batch_folder", "0",
- "--output_format", "fastq",
- "--reads_per_fastq_batch", "999999999" ])
+ "--output_format", out_fmt,
+ "--reads_per_fastq_batch", "999999999" ] +
+ ["--barcoding"] * demux )
+
+ if demux:
+ #check for demuxed albacore output and copy to Galaxy output
+ final_dir = "final"
+ if not os.path.exists(final_dir):
+ os.makedirs(final_dir)
+ dirs = glob.glob("out_dir/workspace/*")
+ for d in dirs:
+
+ if out_fmt == 'fastq':
+ bc = os.path.basename( os.path.normpath( d ) ) + ".fastq"
+ print(d)
+ print(bc)
+ out = os.path.join( final_dir, bc )
+ files = glob.glob( os.path.join( d, "*.fastq") )
+ if len(files) != 1:
+ raise ValueError('No or multiple FASTQ output files found')
+ found_file = files[0]
+ shutil.copy(found_file, out)
- #check for single albacore output and copy to Galaxy output
- files = glob.glob("out_dir/workspace/*.fastq")
- if len(files) != 1:
- raise ValueError('No or multiple FASTQ output files found')
- found_file = files[0]
- shutil.copy(found_file, out_file)
+ elif out_fmt == 'fast5':
+ bc = os.path.basename( os.path.normpath( d ) ) + ".fast5.tar.gz"
+ print(d)
+ print(bc)
+ out = os.path.join( final_dir, bc )
+ files = glob.glob( os.path.join( d, "**", "*.fast5"), recursive=True)
+ if len(files) < 1:
+ raise ValueError('No FAST5 output files found')
+ tar = tarfile.open(out, 'w:gz')
+ tar.add( d )
+ tar.close()
+
+ else:
+ raise ValueError('Bad output format specified')
+
+ else:
+ if out_fmt == 'fastq':
+ #check for single albacore output and copy to Galaxy output
+ files = glob.glob("out_dir/workspace/*.fastq")
+ if len(files) != 1:
+ raise ValueError('No or multiple FASTQ output files found')
+ found_file = files[0]
+ shutil.copy(found_file, out_file)
+ elif out_fmt == 'fast5':
+ #check for single albacore output and copy to Galaxy output
+ files = glob.glob("out_dir/workspace/**/*.fast5", recursive=True)
+ if len(files) < 1:
+ raise ValueError('No FAST5 output files found')
+ tar = tarfile.open(out_file, 'w:gz')
+ tar.add("out_dir/workspace")
+ tar.close()
+ else:
+ raise ValueError('Bad output format specified')
+
def parse_meta(fn):
diff -r f8e25d69167d -r 0a4f83207e53 albacore_1D.xml
--- a/albacore_1D.xml Wed Aug 30 02:47:27 2017 -0400
+++ b/albacore_1D.xml Wed Sep 06 12:12:52 2017 -0400
@@ -19,7 +19,7 @@
@@ -29,6 +29,11 @@
+
+
+
+
+
@@ -36,7 +41,21 @@
-
+
+ demux is False
+
+
+
+
+
+ demux is True and out_format == 'fastq'
+
+
+
+ demux is True and out_format == 'fast5'
+
+
+
diff -r f8e25d69167d -r 0a4f83207e53 albacore_denoise.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/albacore_denoise.xml Wed Sep 06 12:12:52 2017 -0400
@@ -0,0 +1,112 @@
+
+
+ Filter noise from barcode bins
+
+
+
+
+
+
+
+ echo "0.001"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r f8e25d69167d -r 0a4f83207e53 denoise.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/denoise.pl Wed Sep 06 12:12:52 2017 -0400
@@ -0,0 +1,136 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use 5.012;
+
+use File::Basename qw/basename/;
+use File::Copy qw/copy/;
+use Getopt::Long;
+use List::Util qw/sum/;
+
+my $fn_table;
+my @reads;
+my @names;
+my $min_score = 80;
+my $min_frac = 0.05;
+my $rm_unclass = 0;
+my $n_keep;
+my $fn_summary;
+
+use constant BARCODE => 14;
+use constant SCORE => 15;
+
+my $unclass_tag = 'unclassified';
+
+GetOptions(
+ 'input=s' => \@reads,
+ 'name=s' => \@names,
+ 'table=s' => \$fn_table,
+ 'min_score=f' => \$min_score,
+ 'min_frac=f' => \$min_frac,
+ 'remove_unclassified' => \$rm_unclass,
+ 'n_keep=i' => \$n_keep,
+ 'summary=s' => \$fn_summary,
+);
+
+die "Table not found\n"
+ if (! -r $fn_table);
+
+open my $tsv, '<', $fn_table;
+my $head = <$tsv>;
+chomp $head;
+my @fields = split "\t", $head;
+
+die "unexpected field order"
+ if ($fields[BARCODE] ne 'barcode_arrangement');
+die "unexpected field order"
+ if ($fields[SCORE] ne 'barcode_score');
+
+my %counts;
+my %sums;
+
+while (my $line = <$tsv>) {
+ chomp $line;
+ my @fields = split "\t", $line;
+ my $bin = $fields[BARCODE];
+ my $score = $fields[SCORE];
+ $counts{$bin} += 1;
+ $sums{$bin} += $score;
+}
+
+if ($rm_unclass) {
+ delete $counts{$unclass_tag};
+ delete $sums{$unclass_tag};
+}
+
+my @keys = sort {$counts{$b} <=> $counts{$a}} keys %counts;
+
+my %scores;
+
+my %status;
+
+@status{ @keys } = ('discarded') x scalar(@keys);
+
+for (@keys) {
+ $scores{$_} = $sums{$_}/$counts{$_};
+}
+my $sum_count = sum(values %counts);
+my $mean_count = $sum_count / scalar(values %counts);
+
+if (defined $n_keep) {
+
+ my %rank_scores;
+ @rank_scores{ @keys } = map {$scores{$_} * $counts{$_}/$sum_count} @keys;
+
+ @keys = sort {$rank_scores{$b} <=> $rank_scores{$a}} keys %rank_scores;
+
+ @keys = @keys[0..$n_keep-1];
+
+}
+
+else {
+
+ @keys = grep {$scores{$_} >= $min_score} @keys;
+ @keys = grep {$counts{$_} >= $mean_count*$min_frac} @keys;
+
+}
+
+@status{ @keys } = ('kept') x scalar(@keys);
+
+@keys = sort {$counts{$b} <=> $counts{$a}} keys %status;
+
+# print summary table
+open my $summary, '>', $fn_summary
+ or die "Failed to open summary: $!\n";
+say {$summary} join "\t",
+ '#bin',
+ 'n_reads',
+ 'avg_score',
+ 'status',
+;
+for (@keys) {
+ say {$summary} join "\t",
+ $_,
+ $counts{$_},
+ sprintf("%0.3f",$scores{$_}),
+ $status{$_},
+ ;
+}
+close $summary;
+
+mkdir('outputs');
+
+for my $i (0..$#reads) {
+
+ my $fn = $reads[$i];
+ my $name = $names[$i];
+
+ my $base = basename($name);
+ my $bin = $base;
+ $bin =~ s/\.(fastq|fq|fast5\.tar\.gz)$//i;
+ if (defined $status{$bin} && $status{$bin} eq 'kept') {
+ copy( $fn, "outputs/$base" );
+ }
+
+}