# HG changeset patch # User jdv # Date 1504714372 14400 # Node ID 0a4f83207e53100984dd46d339629dab69bce92d # Parent f8e25d69167d9f7b700ceb2d3c3afaeccb15f20b planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/albacore commit 4aa7a76a7b29c425dd89a020979e835d785d3c95-dirty diff -r f8e25d69167d -r 0a4f83207e53 albacore_1D.py --- a/albacore_1D.py Wed Aug 30 02:47:27 2017 -0400 +++ b/albacore_1D.py Wed Sep 06 12:12:52 2017 -0400 @@ -7,15 +7,20 @@ import shutil import h5py import numpy as np +import tarfile +from distutils.util import strtobool def main(): tar_file = sys.argv[1] out_file = sys.argv[2] - threads = sys.argv[3] + out_fmt = sys.argv[3] + demux = strtobool( sys.argv[4] ) + threads = sys.argv[5] (flowcell, kit) = parse_meta(tar_file) - subprocess.call(["read_fast5_basecaller.py", + subprocess.call( + ["read_fast5_basecaller.py", "--input", "in_dir", "--worker_threads", threads, "--save_path", "out_dir", @@ -23,15 +28,63 @@ "--kit", kit, "--recursive", "--files_per_batch_folder", "0", - "--output_format", "fastq", - "--reads_per_fastq_batch", "999999999" ]) + "--output_format", out_fmt, + "--reads_per_fastq_batch", "999999999" ] + + ["--barcoding"] * demux ) + + if demux: + #check for demuxed albacore output and copy to Galaxy output + final_dir = "final" + if not os.path.exists(final_dir): + os.makedirs(final_dir) + dirs = glob.glob("out_dir/workspace/*") + for d in dirs: + + if out_fmt == 'fastq': + bc = os.path.basename( os.path.normpath( d ) ) + ".fastq" + print(d) + print(bc) + out = os.path.join( final_dir, bc ) + files = glob.glob( os.path.join( d, "*.fastq") ) + if len(files) != 1: + raise ValueError('No or multiple FASTQ output files found') + found_file = files[0] + shutil.copy(found_file, out) - #check for single albacore output and copy to Galaxy output - files = glob.glob("out_dir/workspace/*.fastq") - if len(files) != 1: - raise ValueError('No or multiple FASTQ output files found') - found_file = files[0] - shutil.copy(found_file, out_file) + elif out_fmt == 'fast5': + bc = os.path.basename( os.path.normpath( d ) ) + ".fast5.tar.gz" + print(d) + print(bc) + out = os.path.join( final_dir, bc ) + files = glob.glob( os.path.join( d, "**", "*.fast5"), recursive=True) + if len(files) < 1: + raise ValueError('No FAST5 output files found') + tar = tarfile.open(out, 'w:gz') + tar.add( d ) + tar.close() + + else: + raise ValueError('Bad output format specified') + + else: + if out_fmt == 'fastq': + #check for single albacore output and copy to Galaxy output + files = glob.glob("out_dir/workspace/*.fastq") + if len(files) != 1: + raise ValueError('No or multiple FASTQ output files found') + found_file = files[0] + shutil.copy(found_file, out_file) + elif out_fmt == 'fast5': + #check for single albacore output and copy to Galaxy output + files = glob.glob("out_dir/workspace/**/*.fast5", recursive=True) + if len(files) < 1: + raise ValueError('No FAST5 output files found') + tar = tarfile.open(out_file, 'w:gz') + tar.add("out_dir/workspace") + tar.close() + else: + raise ValueError('Bad output format specified') + def parse_meta(fn): diff -r f8e25d69167d -r 0a4f83207e53 albacore_1D.xml --- a/albacore_1D.xml Wed Aug 30 02:47:27 2017 -0400 +++ b/albacore_1D.xml Wed Sep 06 12:12:52 2017 -0400 @@ -19,7 +19,7 @@ @@ -29,6 +29,11 @@ + + + + + @@ -36,7 +41,21 @@ - + + demux is False + + + + + + demux is True and out_format == 'fastq' + + + + demux is True and out_format == 'fast5' + + + diff -r f8e25d69167d -r 0a4f83207e53 albacore_denoise.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/albacore_denoise.xml Wed Sep 06 12:12:52 2017 -0400 @@ -0,0 +1,112 @@ + + + Filter noise from barcode bins + + + + + + + + echo "0.001" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r f8e25d69167d -r 0a4f83207e53 denoise.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/denoise.pl Wed Sep 06 12:12:52 2017 -0400 @@ -0,0 +1,136 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use 5.012; + +use File::Basename qw/basename/; +use File::Copy qw/copy/; +use Getopt::Long; +use List::Util qw/sum/; + +my $fn_table; +my @reads; +my @names; +my $min_score = 80; +my $min_frac = 0.05; +my $rm_unclass = 0; +my $n_keep; +my $fn_summary; + +use constant BARCODE => 14; +use constant SCORE => 15; + +my $unclass_tag = 'unclassified'; + +GetOptions( + 'input=s' => \@reads, + 'name=s' => \@names, + 'table=s' => \$fn_table, + 'min_score=f' => \$min_score, + 'min_frac=f' => \$min_frac, + 'remove_unclassified' => \$rm_unclass, + 'n_keep=i' => \$n_keep, + 'summary=s' => \$fn_summary, +); + +die "Table not found\n" + if (! -r $fn_table); + +open my $tsv, '<', $fn_table; +my $head = <$tsv>; +chomp $head; +my @fields = split "\t", $head; + +die "unexpected field order" + if ($fields[BARCODE] ne 'barcode_arrangement'); +die "unexpected field order" + if ($fields[SCORE] ne 'barcode_score'); + +my %counts; +my %sums; + +while (my $line = <$tsv>) { + chomp $line; + my @fields = split "\t", $line; + my $bin = $fields[BARCODE]; + my $score = $fields[SCORE]; + $counts{$bin} += 1; + $sums{$bin} += $score; +} + +if ($rm_unclass) { + delete $counts{$unclass_tag}; + delete $sums{$unclass_tag}; +} + +my @keys = sort {$counts{$b} <=> $counts{$a}} keys %counts; + +my %scores; + +my %status; + +@status{ @keys } = ('discarded') x scalar(@keys); + +for (@keys) { + $scores{$_} = $sums{$_}/$counts{$_}; +} +my $sum_count = sum(values %counts); +my $mean_count = $sum_count / scalar(values %counts); + +if (defined $n_keep) { + + my %rank_scores; + @rank_scores{ @keys } = map {$scores{$_} * $counts{$_}/$sum_count} @keys; + + @keys = sort {$rank_scores{$b} <=> $rank_scores{$a}} keys %rank_scores; + + @keys = @keys[0..$n_keep-1]; + +} + +else { + + @keys = grep {$scores{$_} >= $min_score} @keys; + @keys = grep {$counts{$_} >= $mean_count*$min_frac} @keys; + +} + +@status{ @keys } = ('kept') x scalar(@keys); + +@keys = sort {$counts{$b} <=> $counts{$a}} keys %status; + +# print summary table +open my $summary, '>', $fn_summary + or die "Failed to open summary: $!\n"; +say {$summary} join "\t", + '#bin', + 'n_reads', + 'avg_score', + 'status', +; +for (@keys) { + say {$summary} join "\t", + $_, + $counts{$_}, + sprintf("%0.3f",$scores{$_}), + $status{$_}, + ; +} +close $summary; + +mkdir('outputs'); + +for my $i (0..$#reads) { + + my $fn = $reads[$i]; + my $name = $names[$i]; + + my $base = basename($name); + my $bin = $base; + $bin =~ s/\.(fastq|fq|fast5\.tar\.gz)$//i; + if (defined $status{$bin} && $status{$bin} eq 'kept') { + copy( $fn, "outputs/$base" ); + } + +}