# HG changeset patch
# User abossers
# Date 1307482920 14400
# Node ID 31f689e1d88b320e9eac0179483c3892ace1a40c
Migrated tool version 0.1.Alx from old tool shed archive to new tool shed repository
diff -r 000000000000 -r 31f689e1d88b TopHit_namefilter/TopHit_README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TopHit_namefilter/TopHit_README Tue Jun 07 17:42:00 2011 -0400
@@ -0,0 +1,47 @@
+# Created May 2011
+#
+# Alex Bossers
+# Central Veterinary Institute
+# Wageningen University and Research centre
+# Lelystad, The Netherlands
+#
+# Comments/improvements/bugs: Alex (dot) Bossers (at) wur (dot) nl
+
+
+# WHAT IT DOES
+TopHit_namefilter is a SIMPLE filter to keep just the TOPHIT / first [N] occurrence(s) of some
+identifier. This is useful for keeping only the first N tophits of for instance BLAST when
+multiple hits were returned (and you don't want to rerun the BLAST analysis). Of course it is NOT
+restricted to BLAST and can basically filter ANY tabular data for uniqueness.
+
+Please be aware that NO additional filtering or checking is done on for instance E values of BLAST hits.
+Tophit = FIRST hit...not necessarily the best.. If multiple hits are selected to be returned
+they will NOT be sorted (see below example of a number of 2 hits occurring somewhere else in the
+input and therefore in the output file).
+
+
+# REQUIREMENTS
+Perl
+Galaxy :)
+
+
+# SETUP
+Just unpack the tool xml and perl script somewhere appropriate and plug the tool in the tool_config.xml
+of your galaxy instance. Refresh the tools or restart the galaxy server.
+
+
+# LICENSE
+Copyright (c) 2011 Central Veterinary Institute of Wageningen UR, Lelystad, The Netherlands.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+When distributing the tools please include this original reference.
+
+Use this tool at your own risk. Even though we tried to build tools and wrappers that are free of errors,
+check your output since it might be erroneous. We cannot be held liable for any failure this may have caused.
+
+If you like these scripts, please acknowledge our work.
+
diff -r 000000000000 -r 31f689e1d88b TopHit_namefilter/TopHit_namefilter.test.table
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TopHit_namefilter/TopHit_namefilter.test.table Tue Jun 07 17:42:00 2011 -0400
@@ -0,0 +1,16 @@
+Q3262-21 gi|71066702|gb|AE016828.2| tjahier wat
+Q3262-23 gi|71066702|gb|AE016828.2| okay
+Q3262-21 gi|71066702|gb|AE016828.2| and here can contain multiple columns :)
+Q3262-24 gi|71066702|gb|AE016828.2| nothing there
+Q3262-26 gi|71066702|gb|AE016828.2| or still
+Q3262-21 gi|71066702|gb|AE016828.2|
+Q3262-21 gi|71066702|gb|AE016828.2|
+Q3262-21 gi|71066702|gb|AE016828.2|
+Q3262-21 gi|71066702|gb|AE016828.2|
+Q3262-21 gi|145004|gb|M80806.1|COXTRANSPO
+Q3262-21 gi|144996|gb|M20482.1|COXHSPAB
+Q3262-21 gi|161761570|gb|CP000890.1|
+Q3262-30 gi|161761570|gb|CP000890.1|
+Q3262-21 gi|161761570|gb|CP000890.1|
+Q3262-21 gi|161761570|gb|CP000890.1|
+Q3262-21 gi|161761570|gb|CP000890.1|
diff -r 000000000000 -r 31f689e1d88b TopHit_namefilter/TopHit_namefilter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TopHit_namefilter/TopHit_namefilter.xml Tue Jun 07 17:42:00 2011 -0400
@@ -0,0 +1,112 @@
+
+ Simple filter to keep N occurrences of lines in a file
+
+ TopHit_namefilter_galaxy.pl
+ $input
+ $column
+ "$splitter"
+ $hits
+ $output_file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+TopHit_namefilter is a SIMPLE filter to keep just the TOPHIT / first [N] occurrence(s) of some identifier.
+This is useful for keeping only the first N tophits in BLAST output when multiple hits were returned (and you don't want to rerun the BLAST analysis).
+
+Please be aware that NO additional filtering or checking is done on for instance E values of BLAST hits.
+Tophit = FIRST hit...not necessarily the best.. If multiple hits are selected to be returned
+they will NOT be sorted (see below example of a number of 2 hits occurring somewhere else in the input
+and therefore in the output file).
+
+**Comments/feedback** on the Perl script or GALAXY wrapper: alex.bossers@wur.nl
+
+-----
+
+**Note!** Beware the special use of splitters! The splitter is used as a Perl regular expression, so characters that have a
+special meaning in a regex (such as | or .) need to be escaped by a leading \\.
+
+Examples of splitters before filtering (end result will remain the ORIGINAL unsplit line!):
+
+::
+
+ Splitter Meaning Example line to split Split result for filtering only!
+ -------- ------------------------------- ----------------------- --------------------------------
+ \t Single tab Foo<tab>Bar<tab>here ---> Foo Bar here
+ \| Single pipe Foo<tab>Bar|here ---> Foo<tab>Bar here
+ - Single dash Foo-Bar ---> Foo Bar
+ -|\| Combined splits on dash OR pipe Foo-Bar|here ---> Foo Bar here
+
+
+-----
+
+**EXAMPLE**
+
+Parameters: Column = 1, **hits = 2** and splitter = \\t
+
+**Input**
+
+Any text/tabular file:
+
+::
+
+ Q3262-21 gi|71066702|gb|AE016828.2| tja..here something extra
+ Q3262-23 gi|71066702|gb|AE016828.2| okay
+ Q3262-24 gi|71066702|gb|AE016828.2| nothing there
+ Q3262-21 gi|71066702|gb|AE016828.2| enhier was zonder space :)
+ Q3262-26 gi|71066702|gb|AE016828.2| or still
+ Q3262-21 gi|71066702|gb|AE016828.2|
+ Q3262-21 gi|71066702|gb|AE016828.2|
+ Q3262-21 gi|71066702|gb|AE016828.2|
+ Q3262-21 gi|71066702|gb|AE016828.2|
+ Q3262-21 gi|145004|gb|M80806.1|COXTRANSPO
+ Q3262-21 gi|144996|gb|M20482.1|COXHSPAB
+ Q3262-21 gi|161761570|gb|CP000890.1|
+ Q3262-30 gi|161761570|gb|CP000890.1|
+ Q3262-21 gi|161761570|gb|CP000890.1|
+ Q3262-21 gi|161761570|gb|CP000890.1|
+ Q3262-21 gi|161761570|gb|CP000890.1|
+
+
+**Outputs**
+
+::
+
+ Q3262-21 gi|71066702|gb|AE016828.2| tja..here something extra
+ Q3262-23 gi|71066702|gb|AE016828.2| okay
+ Q3262-24 gi|71066702|gb|AE016828.2| nothing there
+ Q3262-21 gi|71066702|gb|AE016828.2| enhier was zonder space :)
+ Q3262-26 gi|71066702|gb|AE016828.2| or still
+ Q3262-30 gi|161761570|gb|CP000890.1|
+
+-----
+
+Please acknowledge our work when you find it useful!
+
+|
+
+
+
+
+
diff -r 000000000000 -r 31f689e1d88b TopHit_namefilter/TopHit_namefilter_galaxy.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TopHit_namefilter/TopHit_namefilter_galaxy.pl Tue Jun 07 17:42:00 2011 -0400
@@ -0,0 +1,108 @@
+#!/usr/bin/perl -w
+
+# TopHit_namefilter: keep only the first N lines ("top hits") seen for each
+# distinct value of one column of a delimited text table, e.g. to keep the
+# first N BLAST hits per query without rerunning the BLAST analysis.
+#
+# NO checking is done on e.g. BLAST E values: "top hit" simply means the FIRST
+# occurrence in the input, which is not necessarily the best one. Kept lines
+# are written in input order; they are NOT sorted or grouped by identifier.
+#
+# Arguments (all five are required, in this order):
+#   infile    input table; the path may only contain A-Z a-z 0-9 _ . - /
+#   column    1-based number of the column holding the identifier
+#   splitter  Perl regex used to split each line into columns:
+#               \t    splits on tab
+#               \|    splits on pipe
+#               -|\|  splits on '-' OR '|'
+#   hits      number of occurrences to keep per identifier (>= 1)
+#   outfile   filtered table; same path restrictions as infile
+#
+# alex.bossers@wur.nl
+
+my $version = "v0.13.alx 19-5-2011";
+# Version history
+# 0.13 19-05-2011 added extra cmdline opt hits to keep -> first galaxy version
+# 0.12 19-05-2011 mods to fit initial needs. Not distributed.
+# 0.1 xx-xx-2010 template
+
+use strict;
+use warnings;
+
+# --- command line ---
+# Count the arguments instead of testing the truth of $ARGV[4], so that an
+# output file literally named "0" is not mistaken for a missing argument.
+if (@ARGV < 5) {
+    warn "Error: not enough arguments\n";
+    usage();
+}
+my ($input) = $ARGV[0] =~ m/^([A-Z0-9_.\-\/]+)$/ig;
+my $column = $ARGV[1]; # column numbers start at 1!
+my $splitter = $ARGV[2]; # regex to split fields on (might need enclosing "")
+my $hits = $ARGV[3]; # number of occurrences to keep
+my ($output) = $ARGV[4] =~ m/^([A-Z0-9_.\-\/]+)$/ig;
+
+# A file name that fails the whitelist above leaves its variable undefined;
+# report that here instead of letting open() fail on an undefined name.
+if (!defined $input || !defined $output) {
+    warn "Invalid characters in input/output file name\n";
+    usage();
+}
+# Require plain positive integers so that a typo is reported as a usage error
+# rather than as an "isn't numeric" warning.
+if ($column !~ /^\d+$/ || $hits !~ /^\d+$/ || $column < 1 || $hits < 1) {
+    warn "Invalid column/hits number\n";
+    usage();
+}
+
+# --- filtering ---
+my $entrycounter = 0; # lines read
+my $filter_count = 0; # lines written
+my %seen; # identifier => number of lines seen so far with that identifier
+
+open(my $in, '<', $input) or die "Input file error: $!\n";
+open(my $out, '>', $output) or die "Output file error: $!\n";
+
+while (my $line = <$in>) {
+    chomp $line;
+    $entrycounter++;
+    # The splitter is deliberately applied as a caller-supplied regex.
+    my @fields = split($splitter, $line);
+    # Lines with fewer than $column fields all share the empty identifier.
+    my $key = defined $fields[$column - 1] ? $fields[$column - 1] : '';
+    # Only the first $hits lines per identifier are written; rest are skipped.
+    next if ++$seen{$key} > $hits;
+    print {$out} "$line\n";
+    $filter_count++;
+}
+
+close($in);
+# Buffered write errors only surface at close, so check it for the output.
+close($out) or die "Output file error: $!\n";
+
+print "\nVersion : $version\nComments/bugs : alex.bossers\@wur.nl\n";
+print "Processed : $entrycounter entries\n";
+print "Filtered : $filter_count entries remain\n";
+
+sub usage {    # Print version banner and usage help on STDERR, then die (never returns).
+    warn "\nVersion: $version\nContact/bugs: alex.bossers\@wur.nl\n";
+    my $cmd = $0 =~ m/([A-Z0-9_.-]+)$/i ? $1 : $0;    # script basename, else $0 as-is
+    die <<EOF;    # backslashes are doubled because this heredoc interpolates
+usage: $cmd <infile> <column> <splitter> <hits> <outfile>
+
+ INPUT: infile Input original tabular/text
+
+ column Input column number to use (>= 1)
+
+ splitter Splitter char to use (i.e. \\t for tab)
+ For splitting on pipe use escaping: \\|
+ Combined splits possible: -|\\| splits both on - as |
+
+ hits Number of hits to keep (in chronological order)
+ The results are NOT sorted!
+
+ OUTPUT: outfile Output filename of filtered table.
+
+EOF
+}
+#end script
\ No newline at end of file