Mercurial > repos > jdv > albacore
changeset 1:0a4f83207e53 draft
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/albacore commit 4aa7a76a7b29c425dd89a020979e835d785d3c95-dirty
author | jdv |
---|---|
date | Wed, 06 Sep 2017 12:12:52 -0400 |
parents | f8e25d69167d |
children | b658298e65d8 |
files | albacore_1D.py albacore_1D.xml albacore_denoise.xml denoise.pl |
diffstat | 4 files changed, 332 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
--- a/albacore_1D.py Wed Aug 30 02:47:27 2017 -0400 +++ b/albacore_1D.py Wed Sep 06 12:12:52 2017 -0400 @@ -7,15 +7,20 @@ import shutil import h5py import numpy as np +import tarfile +from distutils.util import strtobool def main(): tar_file = sys.argv[1] out_file = sys.argv[2] - threads = sys.argv[3] + out_fmt = sys.argv[3] + demux = strtobool( sys.argv[4] ) + threads = sys.argv[5] (flowcell, kit) = parse_meta(tar_file) - subprocess.call(["read_fast5_basecaller.py", + subprocess.call( + ["read_fast5_basecaller.py", "--input", "in_dir", "--worker_threads", threads, "--save_path", "out_dir", @@ -23,15 +28,63 @@ "--kit", kit, "--recursive", "--files_per_batch_folder", "0", - "--output_format", "fastq", - "--reads_per_fastq_batch", "999999999" ]) + "--output_format", out_fmt, + "--reads_per_fastq_batch", "999999999" ] + + ["--barcoding"] * demux ) + + if demux: + #check for demuxed albacore output and copy to Galaxy output + final_dir = "final" + if not os.path.exists(final_dir): + os.makedirs(final_dir) + dirs = glob.glob("out_dir/workspace/*") + for d in dirs: + + if out_fmt == 'fastq': + bc = os.path.basename( os.path.normpath( d ) ) + ".fastq" + print(d) + print(bc) + out = os.path.join( final_dir, bc ) + files = glob.glob( os.path.join( d, "*.fastq") ) + if len(files) != 1: + raise ValueError('No or multiple FASTQ output files found') + found_file = files[0] + shutil.copy(found_file, out) - #check for single albacore output and copy to Galaxy output - files = glob.glob("out_dir/workspace/*.fastq") - if len(files) != 1: - raise ValueError('No or multiple FASTQ output files found') - found_file = files[0] - shutil.copy(found_file, out_file) + elif out_fmt == 'fast5': + bc = os.path.basename( os.path.normpath( d ) ) + ".fast5.tar.gz" + print(d) + print(bc) + out = os.path.join( final_dir, bc ) + files = glob.glob( os.path.join( d, "**", "*.fast5"), recursive=True) + if len(files) < 1: + raise ValueError('No FAST5 output files found') + tar = tarfile.open(out, 'w:gz') + tar.add( d ) + tar.close() + + else: + raise ValueError('Bad output format specified') + + else: + if out_fmt == 'fastq': + #check for single albacore output and copy to Galaxy output + files = glob.glob("out_dir/workspace/*.fastq") + if len(files) != 1: + raise ValueError('No or multiple FASTQ output files found') + found_file = files[0] + shutil.copy(found_file, out_file) + elif out_fmt == 'fast5': + #check for single albacore output and copy to Galaxy output + files = glob.glob("out_dir/workspace/**/*.fast5", recursive=True) + if len(files) < 1: + raise ValueError('No FAST5 output files found') + tar = tarfile.open(out_file, 'w:gz') + tar.add("out_dir/workspace") + tar.close() + else: + raise ValueError('Bad output format specified') + def parse_meta(fn):
--- a/albacore_1D.xml Wed Aug 30 02:47:27 2017 -0400 +++ b/albacore_1D.xml Wed Sep 06 12:12:52 2017 -0400 @@ -19,7 +19,7 @@ <command detect_errors="aggressive"> <![CDATA[ - python3 $__tool_directory__/albacore_1D.py $input $output \${GALAXY_SLOTS:-1} + python3 $__tool_directory__/albacore_1D.py $input $output $out_format $demux \${GALAXY_SLOTS:-1} ]]> </command> @@ -29,6 +29,11 @@ <inputs> <param name="input" type="data" format="fast5_archive" label="Input reads" /> + <param name="out_format" type="select" label="Output format"> + <option value="fastq" selected="true">fastq</option> + <option value="fast5">fast5</option> + </param> + <param name="demux" type="boolean" checked="false" label="Demultiplex" /> </inputs> @@ -36,7 +41,21 @@ <outputs> - <data name="output" format="fastq" label="${tool.name} on ${on_string} (called.fastq)" /> + <data name="output" format="fastq" label="${tool.name} on ${on_string} (reads)"> + <filter>demux is False</filter> + <change_format> + <when input="out_format" value="fast5" format="fast5_archive" /> + </change_format> + </data> + <collection type="list" name="output_collection_fastq" label="${tool.name} on ${on_string} (reads)"> + <filter>demux is True and out_format == 'fastq'</filter> + <discover_datasets pattern="(?P<name>.*)" directory="final" format="fastqsanger" /> + </collection> + <collection type="list" name="output_collection_fast5" label="${tool.name} on ${on_string} (reads)"> + <filter>demux is True and out_format == 'fast5'</filter> + <discover_datasets pattern="(?P<name>.*)" directory="final" format="fast5_archive" /> + </collection> + <data name="table" format="tabular" from_work_dir="out_dir/sequencing_summary.txt" label="${tool.name} on ${on_string} (table)" /> </outputs>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/albacore_denoise.xml Wed Sep 06 12:12:52 2017 -0400 @@ -0,0 +1,112 @@ +<tool id="albacore_denoise" name="Albacore de-noise" version="0.001"> + + <description>Filter noise from barcode bins</description> + + <!-- ***************************************************************** --> + + <!-- + <requirements> + <requirement type="package" version="1.2.6">albacore</requirement> + </requirements> + --> + + <!-- ***************************************************************** --> + + <version_command>echo "0.001"</version_command> + + <!-- ***************************************************************** --> + + <command detect_errors="aggressive"> + <![CDATA[ + + perl $__tool_directory__/denoise.pl + + --table $table + + #if $filter.type == 'topN' + --n_keep ${filter.n_keep} + #else + --min_score ${filter.min_score} + --min_frac ${filter.min_frac} + #end if + + $remove_unclassified + + #for $input in $inputs + --input ${input} + --name ${input.name} + #end for + + --summary $summary + + ]]> + </command> + + <!-- ***************************************************************** --> + + <inputs> + + <param name="inputs" type="data_collection" collection_type="list" format="fast5_archive" label="Input reads" multiple="true" /> + <param name="table" type="data" format="tabular" label="Read table" /> + <conditional name="filter"> + <param name="type" type="select" label="Filtering type"> + <option value="cutoffs" selected="true">By cutoff</option> + <option value="topN">Top N bins</option> + </param> + <when value="cutoffs"> + <param name="min_score" value="70" type="float" min="0" max="100" label="Minimum average score (0-100)" /> + <param name="min_frac" value="0.05" type="float" min="0" label="Minimum fraction of average count" /> + </when> + <when value="topN"> + <param name="n_keep" value="1" type="integer" min="1" label="Number of top bins to keep" /> + </when> + </conditional> + <param name="remove_unclassified" type="boolean" checked="true" truevalue="--remove_unclassified" falsevalue="" label="Remove unclassified reads" /> + + </inputs> + + <!-- ***************************************************************** --> + + <outputs> + + <collection type="list" name="outputs" label="${tool.name} on ${on_string} (reads)"> + <discover_datasets pattern="(?P<name>.*)\.fast5\.tar\.gz$" directory="outputs" format="fast5_archive" /> + </collection> + + <data name="summary" format="tabular" label="${tool.name} on ${on_string} (summary)" /> + + </outputs> + + <!-- ***************************************************************** --> + + <!-- + <tests> + <test> + <param name="input" value="test_data.fast5.tar.gz" ftype="fast5_archive" /> + <output name="output" file="test_data.fastq" compare="diff" /> + </test> + </tests> + --> + + <!-- ***************************************************************** --> + + <help> + <![CDATA[ + +**Description** + +This script will filter "noise" bins from the barcoded output of Albacore +based on read counts and mean quality scores for each barcode bin. It can +either filter the top N bins (if you know the number of barcodes in your +sample) or filter based on minimum read count (as ratio to average value over +all bin) and minimum average score. + + ]]> + </help> + + <!-- ***************************************************************** --> + + <citations> + </citations> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/denoise.pl Wed Sep 06 12:12:52 2017 -0400 @@ -0,0 +1,136 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use 5.012; + +use File::Basename qw/basename/; +use File::Copy qw/copy/; +use Getopt::Long; +use List::Util qw/sum/; + +my $fn_table; +my @reads; +my @names; +my $min_score = 80; +my $min_frac = 0.05; +my $rm_unclass = 0; +my $n_keep; +my $fn_summary; + +use constant BARCODE => 14; +use constant SCORE => 15; + +my $unclass_tag = 'unclassified'; + +GetOptions( + 'input=s' => \@reads, + 'name=s' => \@names, + 'table=s' => \$fn_table, + 'min_score=f' => \$min_score, + 'min_frac=f' => \$min_frac, + 'remove_unclassified' => \$rm_unclass, + 'n_keep=i' => \$n_keep, + 'summary=s' => \$fn_summary, +); + +die "Table not found\n" + if (! -r $fn_table); + +open my $tsv, '<', $fn_table; +my $head = <$tsv>; +chomp $head; +my @fields = split "\t", $head; + +die "unexpected field order" + if ($fields[BARCODE] ne 'barcode_arrangement'); +die "unexpected field order" + if ($fields[SCORE] ne 'barcode_score'); + +my %counts; +my %sums; + +while (my $line = <$tsv>) { + chomp $line; + my @fields = split "\t", $line; + my $bin = $fields[BARCODE]; + my $score = $fields[SCORE]; + $counts{$bin} += 1; + $sums{$bin} += $score; +} + +if ($rm_unclass) { + delete $counts{$unclass_tag}; + delete $sums{$unclass_tag}; +} + +my @keys = sort {$counts{$b} <=> $counts{$a}} keys %counts; + +my %scores; + +my %status; + +@status{ @keys } = ('discarded') x scalar(@keys); + +for (@keys) { + $scores{$_} = $sums{$_}/$counts{$_}; +} +my $sum_count = sum(values %counts); +my $mean_count = $sum_count / scalar(values %counts); + +if (defined $n_keep) { + + my %rank_scores; + @rank_scores{ @keys } = map {$scores{$_} * $counts{$_}/$sum_count} @keys; + + @keys = sort {$rank_scores{$b} <=> $rank_scores{$a}} keys %rank_scores; + + @keys = @keys[0..$n_keep-1]; + +} + +else { + + @keys = grep {$scores{$_} >= $min_score} @keys; + @keys = grep {$counts{$_} >= $mean_count*$min_frac} @keys; + +} + +@status{ @keys } = ('kept') x scalar(@keys); + +@keys = sort {$counts{$b} <=> $counts{$a}} keys %status; + +# print summary table +open my $summary, '>', $fn_summary + or die "Failed to open summary: $!\n"; +say {$summary} join "\t", + '#bin', + 'n_reads', + 'avg_score', + 'status', +; +for (@keys) { + say {$summary} join "\t", + $_, + $counts{$_}, + sprintf("%0.3f",$scores{$_}), + $status{$_}, + ; +} +close $summary; + +mkdir('outputs'); + +for my $i (0..$#reads) { + + my $fn = $reads[$i]; + my $name = $names[$i]; + + my $base = basename($name); + my $bin = $base; + $bin =~ s/\.(fastq|fq|fast5\.tar\.gz)$//i; + if (defined $status{$bin} && $status{$bin} eq 'kept') { + copy( $fn, "outputs/$base" ); + } + +}