# HG changeset patch # User abossers # Date 1307482920 14400 # Node ID 31f689e1d88b320e9eac0179483c3892ace1a40c Migrated tool version 0.1.Alx from old tool shed archive to new tool shed repository diff -r 000000000000 -r 31f689e1d88b TopHit_namefilter/TopHit_README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TopHit_namefilter/TopHit_README Tue Jun 07 17:42:00 2011 -0400 @@ -0,0 +1,47 @@ +# Created May 2011 +# +# Alex Bossers +# Central Veterinary Institute +# Wageningen University and Research centre +# Lelystad, The Netherlands +# +# Comments/improvements/bugs: Alex (dot) Bossers (at) wur (dot) nl + + +# WHAT IT DOES +TopHit_namefilter is a SIMPLE filter to keep just the TOPHIT / first [N] occurrence(s) of some +identifier. This is useful for keeping only the first N tophits of for instance BLAST when +multiple hits were returned (and you don't want to rerun the BLAST analysis). Of course it is NOT +restricted to BLAST and can basically filter ANY tabular data for uniqueness. + +Please be aware that NO additional filtering or checking is done on for instance E values of BLAST hits. +Tophit = FIRST hit...not necessarily the best.. If multiple hits are selected to be returned +they will NOT be sorted (see below example of a number of 2 hits occurring somewhere else in the +input and therefore in the output file). + + +# REQUIREMENTS +Perl +Galaxy :) + + +# SETUP +Just unpack the tool xml and perl script somewhere appropriate and plug the tool in the tool_config.xml +of your galaxy instance. Refresh the tools or restart the galaxy server. + + +# LICENSE +Copyright (c) 2011 Central Veterinary Institute of Wageningen UR, Lelystad, The Netherlands. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +When distributing the tools please include this original reference. 
+ +Use this tool at your own risk. Even though we tried to build tools and wrappers that are free of errors, +check your output since it might be erroneous. We will not be liable for any failure this may have caused. + +If you like these scripts, please acknowledge our work. + diff -r 000000000000 -r 31f689e1d88b TopHit_namefilter/TopHit_namefilter.test.table --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TopHit_namefilter/TopHit_namefilter.test.table Tue Jun 07 17:42:00 2011 -0400 @@ -0,0 +1,16 @@ +Q3262-21 gi|71066702|gb|AE016828.2| tjahier wat +Q3262-23 gi|71066702|gb|AE016828.2| okay +Q3262-21 gi|71066702|gb|AE016828.2| and here can contain multiple columns :) +Q3262-24 gi|71066702|gb|AE016828.2| nothing there +Q3262-26 gi|71066702|gb|AE016828.2| or still +Q3262-21 gi|71066702|gb|AE016828.2| +Q3262-21 gi|71066702|gb|AE016828.2| +Q3262-21 gi|71066702|gb|AE016828.2| +Q3262-21 gi|71066702|gb|AE016828.2| +Q3262-21 gi|145004|gb|M80806.1|COXTRANSPO +Q3262-21 gi|144996|gb|M20482.1|COXHSPAB +Q3262-21 gi|161761570|gb|CP000890.1| +Q3262-30 gi|161761570|gb|CP000890.1| +Q3262-21 gi|161761570|gb|CP000890.1| +Q3262-21 gi|161761570|gb|CP000890.1| +Q3262-21 gi|161761570|gb|CP000890.1| diff -r 000000000000 -r 31f689e1d88b TopHit_namefilter/TopHit_namefilter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TopHit_namefilter/TopHit_namefilter.xml Tue Jun 07 17:42:00 2011 -0400 @@ -0,0 +1,112 @@ + + Simple filter to keep N occurrences of lines in a file + + TopHit_namefilter_galaxy.pl + $input + $column + "$splitter" + $hits + $output_file + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +TopHit_namefilter is a SIMPLE filter to keep just the TOPHIT / first [N] occurrence(s) of some identifier +useful for keeping only the first N tophits in blast when multiple hits were returned (and you don't want to rerun the BLAST analysis). + +Please be aware that NO additional filtering or checking is done on for instance E values of BLAST hits. 
+Tophit = FIRST hit...not necessarily the best.. If multiple hits are selected to be returned +they will NOT be sorted (see below example of a number of 2 hits occurring somewhere else in the input +and therefore in the output file). + +**Comments/feedback** on the Perl script or GALAXY wrapper: alex.bossers@wur.nl + +----- + +**Note!** Beware the special use of splitters! Especially if you want to use special characters that have a "perl" split +meaning. They need to be escaped by a leading \\. + +Examples of splitters before filtering (end result will remain the ORIGINAL unsplit line!): + +:: + + Splitter Meaning Example line to split Split result for filtering only! + -------- ------------------------------- ----------------------- -------------------------------- + \t Single tab Foo<tab>Bar<tab>here ---> Foo Bar here + \| Single pipe Foo<tab>Bar|here ---> Foo<tab>Bar here + - Single dash Foo-Bar ---> Foo Bar + -|\| Combined splits on dash OR pipe Foo-Bar|here ---> Foo Bar here + + +----- + +**EXAMPLE** + +Parameters: Column = 1, **hits = 2** and splitter = \\t + +**Input** + +Any text/tabular file: + +:: + + Q3262-21 gi|71066702|gb|AE016828.2| tja..here something extra + Q3262-23 gi|71066702|gb|AE016828.2| okay + Q3262-24 gi|71066702|gb|AE016828.2| nothing there + Q3262-21 gi|71066702|gb|AE016828.2| enhier was zonder space :) + Q3262-26 gi|71066702|gb|AE016828.2| or still + Q3262-21 gi|71066702|gb|AE016828.2| + Q3262-21 gi|71066702|gb|AE016828.2| + Q3262-21 gi|71066702|gb|AE016828.2| + Q3262-21 gi|71066702|gb|AE016828.2| + Q3262-21 gi|145004|gb|M80806.1|COXTRANSPO + Q3262-21 gi|144996|gb|M20482.1|COXHSPAB + Q3262-21 gi|161761570|gb|CP000890.1| + Q3262-30 gi|161761570|gb|CP000890.1| + Q3262-21 gi|161761570|gb|CP000890.1| + Q3262-21 gi|161761570|gb|CP000890.1| + Q3262-21 gi|161761570|gb|CP000890.1| + + +**Outputs** + +:: + + Q3262-21 gi|71066702|gb|AE016828.2| tja..here something extra + Q3262-23 gi|71066702|gb|AE016828.2| okay + Q3262-21 
gi|71066702|gb|AE016828.2| enhier was zonder space :) + Q3262-24 gi|71066702|gb|AE016828.2| nothing there + Q3262-26 gi|71066702|gb|AE016828.2| or still + Q3262-30 gi|161761570|gb|CP000890.1| + +----- + +Please acknowledge our work when you find it useful! + +| + + + + + diff -r 000000000000 -r 31f689e1d88b TopHit_namefilter/TopHit_namefilter_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TopHit_namefilter/TopHit_namefilter_galaxy.pl Tue Jun 07 17:42:00 2011 -0400 @@ -0,0 +1,135 @@
+#!/usr/bin/perl -w
+
+# Simple filter to keep just the TOPHIT / first occurrence(s) of some identifier.
+# Useful for keeping only the first tophit(s) in BLAST output when multiple hits
+# are returned.
+#
+# Please be aware that NO additional filtering or checking is done on for instance
+# E values of BLAST hits. Tophit = FIRST hit...not necessarily the best..
+#
+# Command line arguments (all five are required, in this order):
+#   infile    list/table having some groupable identifier
+#   column    the column number to filter on (column number starts at 1)
+#   splitter  pattern used to split each line into columns; it is handed to
+#             split() and therefore treated as a regular expression:
+#               splitting on tab:  \t
+#               splitting on pipe: \|
+#               combined splits:   -|\|  (splits on '-' OR '|')
+#   hits      number of occurrences to keep per identifier
+#             note that the hits are written in order of occurrence
+#             and NOT sorted on the given column!
+#   outfile   the same table holding only the first 'hits' occurrences of each
+#             identifier; kept lines are written unsplit.
+#
+# alex.bossers@wur.nl
+#
+
+my $version = "v0.13.alx 19-5-2011";
+# Version history
+# 0.13 19-05-2011 added extra cmdline opt hits to keep -> first galaxy version
+# 0.12 19-05-2011 mods to fit initial needs. Not distributed.
+# 0.1 xx-xx-2010 template
+
+use strict;
+use warnings;
+
+# cmd line options
+# Count the arguments rather than testing the truth of $ARGV[4]: an output
+# file called "0" is false in Perl but is still a supplied argument.
+if (@ARGV < 5) {
+    warn "Error: not enough arguments\n";
+    usage();
+}
+
+# File names must consist of letters, digits and _ . - / only; anything else
+# leaves the variable undefined, which is rejected below.
+my ($input)  = $ARGV[0] =~ m/^([A-Z0-9_.\-\/]+)$/i;
+my $column   = $ARGV[1];    # column numbers start at 1!
+my $splitter = $ARGV[2];    # splitter for fields to use (might need enclosing "")
+my $hits     = $ARGV[3];    # number of occurrences to keep
+my ($output) = $ARGV[4] =~ m/^([A-Z0-9_.\-\/]+)$/i;
+
+if (!defined $input || !defined $output) {
+    warn "Invalid input/output file name\n";
+    usage();
+}
+
+# Require plain positive integers so that a non-numeric value is reported as
+# a usage error instead of triggering "isn't numeric" warnings.
+if ($column !~ /^[0-9]+$/ || $hits !~ /^[0-9]+$/ || $column < 1 || $hits < 1) {
+    warn "Invalid column/hits number\n";
+    usage();
+}
+
+# The splitter is used as a regular expression: fail early with a usage
+# message when it does not compile (e.g. a lone "(").
+if (!defined eval { qr/$splitter/ }) {
+    warn "Invalid splitter pattern\n";
+    usage();
+}
+
+# keeping track
+my $entrycounter = 0;    # lines read
+my $filter_count = 0;    # lines written
+
+# open the files (3-arg open: the file name can never be read as a mode)
+open(my $in_fh,  '<', $input)  or die "Input file error: $!\n";
+open(my $out_fh, '>', $output) or die "Output file error: $!\n";
+
+# Number of lines seen so far for each value of the selected column.
+my %seen_count;
+while (my $line = <$in_fh>) {
+    chomp $line;
+    $entrycounter++;
+
+    # Only the key is taken from the split result.
+    my @fields = split($splitter, $line);
+    my $key    = $fields[$column - 1];
+
+    # A line with fewer columns than requested has no key; such lines are
+    # grouped under the empty string.
+    $key = '' if !defined $key;
+
+    # Keep the first $hits lines of every key, in order of occurrence.
+    next if ++$seen_count{$key} > $hits;
+    print {$out_fh} "$line\n";
+    $filter_count++;
+}
+
+# end and close; buffered write errors only surface when the handle is closed
+close($in_fh);
+close($out_fh) or die "Output file error: $!\n";
+
+print "\nVersion : $version\nComments/bugs : alex.bossers\@wur.nl\n";
+print "Processed : $entrycounter entries\n";
+print "Filtered : $filter_count entries remain\n";
+
+# Print version, contact and usage text to STDERR and exit via die().
+# NOTE(review): the "usage:" line is reconstructed from the argument order
+# read above; confirm its wording against the released script.
+sub usage {
+    warn "\nVersion: $version\nContact/bugs: alex.bossers\@wur.nl\n";
+    my ($cmd) = $0 =~ m/([A-Z0-9_.-]+)$/i;
+    $cmd = $0 if !defined $cmd;
+    # Backslashes are doubled because the here-document interpolates: a
+    # single "\t" would be printed as a real tab and "\|" as a bare "|".
+    die <<"EOF";
+
+usage: $cmd <infile> <column> <splitter> <hits> <outfile>
+
+  INPUT:   infile    Input original tabular/text
+
+           column    Input column number to use (>= 1)
+
+           splitter  Splitter char to use (i.e. \\t for tab)
+                     For splitting on pipe use escaping: \\|
+                     Combined splits possible: -|\\| splits both on - as |
+
+           hits      Number of hits to keep (in chronological order)
+                     The results are NOT sorted!
+
+  OUTPUT:  outfile   Output filename of filtered table.
+
+EOF
+}
+#end script
\ No newline at end of file