| 
4
 | 
     1 #!/usr/bin/env perl
 | 
| 
 | 
     2 ##
 | 
| 
 | 
     3 ## Sort-header - wrapper for GNU sort with header-line support
 | 
| 
 | 
     4 ##
 | 
| 
 | 
     5 ## Copyright(C) A. Gordon
 | 
| 
 | 
     6 ## license AGPLv3+
 | 
| 
 | 
     7 ##
 | 
| 
 | 
     8 use strict;
 | 
| 
 | 
     9 use warnings;
 | 
| 
 | 
    10 use Data::Dumper;
 | 
| 
 | 
    11 use IO::Handle;
 | 
| 
 | 
    12 use Getopt::Long qw(:config bundling no_ignore_case_always);
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 ## Forward declarations
 | 
| 
 | 
    15 sub add_standard_sort_param(@);
 | 
| 
 | 
    16 sub add_standard_sort_param_value(@);
 | 
| 
 | 
    17 sub forbidden_sort_param(@);
 | 
| 
 | 
    18 sub show_help();
 | 
| 
 | 
    19 sub show_version();
 | 
| 
 | 
    20 sub show_examples();
 | 
| 
 | 
    21 sub parse_commandline_options();
 | 
| 
 | 
    22 sub reassign_input_output();
 | 
| 
 | 
    23 sub process_header_lines();
 | 
| 
 | 
    24 sub run_sort();
 | 
| 
 | 
    25 sub read_line_non_buffered();
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 ##
 | 
| 
 | 
    29 ## Runtime options
 | 
| 
 | 
    30 ##
 | 
| 
 | 
    31 my $PROGRAM="sort-header";
 | 
| 
 | 
    32 my $VERSION=0.4;
 | 
| 
 | 
    33 
 | 
| 
 | 
    34 my $check_only=undef;
 | 
| 
 | 
    35 my $input_file=undef;
 | 
| 
 | 
    36 my $output_file=undef;
 | 
| 
 | 
    37 my $field_separator=undef;
 | 
| 
 | 
    38 my $header_lines =1 ;
 | 
| 
 | 
    39 my $debug=undef;
 | 
| 
 | 
    40 my $sort_exit_code=1; #by default, assume some error
 | 
| 
 | 
    41 
 | 
| 
 | 
    42 my @sort_options;
 | 
| 
 | 
    43 
 | 
| 
 | 
    44 ##
 | 
| 
 | 
    45 ## Program Start
 | 
| 
 | 
    46 ##
 | 
| 
 | 
    47 parse_commandline_options();
 | 
| 
 | 
    48 reassign_input_output();
 | 
| 
 | 
    49 process_header_lines();
 | 
| 
 | 
    50 run_sort();
 | 
| 
 | 
    51 exit($sort_exit_code);
 | 
| 
 | 
    52 ##
 | 
| 
 | 
    53 ## Program End
 | 
| 
 | 
    54 ##
 | 
| 
 | 
    55 
 | 
| 
 | 
    56 sub show_examples()
 | 
| 
 | 
    57 {
 | 
| 
 | 
    58 print<<EOF;
 | 
| 
 | 
    59 Sorting a file with a header line:
 | 
| 
 | 
    60 
 | 
| 
 | 
    61 \$ cat input.txt
 | 
| 
 | 
    62 Fruit	Color	Price
 | 
| 
 | 
    63 Banana	Yellow	4.1
 | 
| 
 | 
    64 Avocado	Green	8.0
 | 
| 
 | 
    65 Apple	Red	3.0
 | 
| 
 | 
    66 Melon	Green	6.1
 | 
| 
 | 
    67 
 | 
| 
 | 
    68 # By default, 'sort-header' assumes 1 header line
 | 
| 
 | 
    69 # (no need to use --header in this case).
 | 
| 
 | 
    70 
 | 
| 
 | 
    71 \$ sort-header -k3,3nr input.txt
 | 
| 
 | 
    72 Fruit	Color	Price
 | 
| 
 | 
    73 Avocado	Green	8.0
 | 
| 
 | 
    74 Melon	Green	6.1
 | 
| 
 | 
    75 Banana	Yellow	4.1
 | 
| 
 | 
    76 Apple	Red	3.0
 | 
| 
 | 
    77 
 | 
| 
 | 
    78 EOF
 | 
| 
 | 
    79 	exit(0);
 | 
| 
 | 
    80 }
 | 
| 
 | 
    81 
 | 
| 
 | 
    82 sub show_help()
 | 
| 
 | 
    83 {
 | 
| 
 | 
    84 print<<EOF;
 | 
| 
 | 
    85 ${PROGRAM}: Wrapper for GNU sort, allowing sorting files with header lines.
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 Usage: $PROGRAM [HEADER-OPTIONS] [GNU sort Options] [INPUT-FILE]
 | 
| 
 | 
    88 
 | 
| 
 | 
    89 HEADER-OPTIONS: the following options are supported by '${PROGRAM}':
 | 
| 
 | 
    90 
 | 
| 
 | 
    91    --header N    =  Treat the first N lines as header lines.
 | 
| 
 | 
    92                     These line will NOT be sorted. They will be passed
 | 
| 
 | 
    93 		    directly to the output file. (default: 1)
 | 
| 
 | 
    94 
 | 
| 
 | 
    95    --version     =  Print ${PROGRAM}'s version.
 | 
| 
 | 
    96 
 | 
| 
 | 
    97    --debugheader =  Print debug messages (relating to ${PROGRAM}'s operation).
 | 
| 
 | 
    98 
 | 
| 
 | 
    99    --help        =  Show this help screen.
 | 
| 
 | 
   100 
 | 
| 
 | 
   101    --examples    =  Show usage examples.
 | 
| 
 | 
   102 
 | 
| 
 | 
   103 GNU sort options:
 | 
| 
 | 
   104    Most of the standard GNU sort options are supported and passed to GNU sort.
 | 
| 
 | 
   105    The following options can not be used with '${PROGRAM}':
 | 
| 
 | 
   106 
 | 
| 
 | 
   107    -m --merge           => ${PROGRAM} can only sort one file, not merge multiple files.
 | 
| 
 | 
   108    -c -C --check        => Currently not supported
 | 
| 
 | 
   109    --files0-from        => Currently not supported
 | 
| 
 | 
   110    -z --zero-terminated => Currently not supported
 | 
| 
 | 
   111 
 | 
| 
 | 
   112 INPUT-FILE:
 | 
| 
 | 
   113    If INPUT-FILE is not specified, $PROGRAM will use STDIN (just like GNU sort).
 | 
| 
 | 
   114 
 | 
| 
 | 
   115 EOF
 | 
| 
 | 
   116 	exit(0);
 | 
| 
 | 
   117 }
 | 
| 
 | 
   118 
 | 
| 
 | 
   119 sub show_version()
 | 
| 
 | 
   120 {
 | 
| 
 | 
   121 print<<EOF;
 | 
| 
 | 
   122 $PROGRAM $VERSION
 | 
| 
 | 
   123 Copyright (C) 2010 A. Gordon (gordon\@cshl.edu)
 | 
| 
 | 
   124 License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 To see the GNU's sort version, run:
 | 
| 
 | 
   127 	sort --version
 | 
| 
 | 
   128 EOF
 | 
| 
 | 
   129 	exit(0);
 | 
| 
 | 
   130 }
 | 
| 
 | 
   131 
 | 
| 
 | 
   132 sub parse_commandline_options()
 | 
| 
 | 
   133 {
 | 
| 
 | 
   134 	my $rc = GetOptions(
 | 
| 
 | 
   135 		"ignore-leading-blanks|b" => \&add_standard_sort_param,
 | 
| 
 | 
   136 		"dictionary-order|d" => \&add_standard_sort_param,
 | 
| 
 | 
   137 		"ignore-case|f" => \&add_standard_sort_param,
 | 
| 
 | 
   138 		"general-numeric-sort|g" => \&add_standard_sort_param,
 | 
| 
 | 
   139 		"ignore-nonprinting|i" => \&add_standard_sort_param,
 | 
| 
 | 
   140 		"month-sort|M" => \&add_standard_sort_param,
 | 
| 
 | 
   141 		"human-numeric-sort|h" => \&add_standard_sort_param,
 | 
| 
 | 
   142 		"numeric-sort|n" => \&add_standard_sort_param,
 | 
| 
 | 
   143 		"random-source=s" => \&add_standard_sort_param_value,
 | 
| 
 | 
   144 		"random-sort|R" => \&add_standard_sort_param,
 | 
| 
 | 
   145 		"reverse|r" => \&add_standard_sort_param,
 | 
| 
 | 
   146 		"sort=s" => \&add_standard_sort_param_value,
 | 
| 
 | 
   147 		"version-sort|V" => \&add_standard_sort_param,
 | 
| 
 | 
   148 
 | 
| 
 | 
   149 		"check|c" => \&forbidden_sort_param,
 | 
| 
 | 
   150 		"C" => \&forbidden_sort_param,
 | 
| 
 | 
   151 		"compress-program=s" => \&add_standard_sort_param_value,
 | 
| 
 | 
   152 		"debug" => \&add_standard_sort_param,
 | 
| 
 | 
   153 
 | 
| 
 | 
   154 		"files0-from=s" => \&forbidden_sort_param,
 | 
| 
 | 
   155 
 | 
| 
 | 
   156 		"key|k=s" => \&add_standard_sort_param_value,
 | 
| 
 | 
   157 		"merge|m" => \&forbidden_sort_param,
 | 
| 
 | 
   158 		"batch-size=i" => \&forbidden_sort_param,
 | 
| 
 | 
   159 
 | 
| 
 | 
   160 		"parallel=i" => \&add_standard_sort_param_value,
 | 
| 
 | 
   161 
 | 
| 
 | 
   162 		"output|o=s" => \$output_file,
 | 
| 
 | 
   163 
 | 
| 
 | 
   164 		"stable|s" => \&add_standard_sort_param,
 | 
| 
 | 
   165 		"buffer-size|S=s" => \&add_standard_sort_param_value,
 | 
| 
 | 
   166 
 | 
| 
 | 
   167 		"field-separator|t=s" => \&add_standard_sort_param_value,
 | 
| 
 | 
   168 		"temporary-directory|T=s" => \&add_standard_sort_param_value,
 | 
| 
 | 
   169 		"unique|u" => \&add_standard_sort_param,
 | 
| 
 | 
   170 
 | 
| 
 | 
   171 		"zero-terminated|z" => \&forbidden_sort_param,
 | 
| 
 | 
   172 
 | 
| 
 | 
   173 		"help" => \&show_help,
 | 
| 
 | 
   174 		"version" => \&show_version,
 | 
| 
 | 
   175 		"examples" => \&show_examples,
 | 
| 
 | 
   176 
 | 
| 
 | 
   177 		"header=i" => \$header_lines,
 | 
| 
 | 
   178 		"debugheader" => \$debug,
 | 
| 
 | 
   179 		);
 | 
| 
 | 
   180 
 | 
| 
 | 
   181 	exit 1 unless $rc;
 | 
| 
 | 
   182 
 | 
| 
 | 
   183 	my @INPUT_FILES = @ARGV;
 | 
| 
 | 
   184 
 | 
| 
 | 
   185 	die "$PROGRAM: error: invalid number of header lines ($header_lines)\n" unless $header_lines>=0;
 | 
| 
 | 
   186 	die "$PROGRAM: error: Multiple input files specified. This program can sort only a signle file.\n" if (scalar(@INPUT_FILES)>1);
 | 
| 
 | 
   187 	$input_file = shift @INPUT_FILES if scalar(@INPUT_FILES)==1;
 | 
| 
 | 
   188 
 | 
| 
 | 
   189 	if ($debug) {
 | 
| 
 | 
   190 		warn "$PROGRAM: number of header lines = $header_lines\n";
 | 
| 
 | 
   191 		warn "$PROGRAM: PASS-to-Sort options:\n", Dumper(\@sort_options), "\n";
 | 
| 
 | 
   192 	}
 | 
| 
 | 
   193 }
 | 
| 
 | 
   194 
 | 
| 
 | 
   195 sub reassign_input_output()
 | 
| 
 | 
   196 {
 | 
| 
 | 
   197 	if ($output_file) {
 | 
| 
 | 
   198 		warn "$PROGRAM: Re-assigning STDOUT to '$output_file'\n" if $debug;
 | 
| 
 | 
   199 		open OUTPUT, '>', $output_file or die "$PROGRAM: Error: failed to create output file '$output_file': $!\n";
 | 
| 
 | 
   200 		STDOUT->fdopen(\*OUTPUT, 'w') or die "$PROGRAM: Error: failed to reassign STDOUT to '$output_file': $!\n";
 | 
| 
 | 
   201 	}
 | 
| 
 | 
   202 
 | 
| 
 | 
   203 
 | 
| 
 | 
   204 	if ($input_file) {
 | 
| 
 | 
   205 		warn "$PROGRAM: Re-assigning STDIN to '$input_file'\n" if $debug;
 | 
| 
 | 
   206 		open INPUT, '<', $input_file or die "$PROGRAM: Error: failed to open input file '$input_file': $!\n";
 | 
| 
 | 
   207 		STDIN->fdopen(\*INPUT, 'r') or die "$PROGRAM: Error: failed to reassign STDIN to '$input_file': $!\n";
 | 
| 
 | 
   208 	}
 | 
| 
 | 
   209 }
 | 
| 
 | 
   210 
 | 
| 
 | 
   211 sub process_header_lines()
 | 
| 
 | 
   212 {
 | 
| 
 | 
   213 	warn "$PROGRAM: Reading $header_lines header lines...\n" if $debug;
 | 
| 
 | 
   214 	for (my $i=0; $i<$header_lines; $i++) {
 | 
| 
 | 
   215 		my $line = read_line_non_buffered();
 | 
| 
 | 
   216 		exit unless defined $line;
 | 
| 
 | 
   217 		print $line;
 | 
| 
 | 
   218 	}
 | 
| 
 | 
   219 }
 | 
| 
 | 
   220 
 | 
| 
 | 
   221 sub run_sort()
 | 
| 
 | 
   222 {
 | 
| 
 | 
   223 	warn "$PROGRAM: Running GNU sort...\n" if $debug;
 | 
| 
 | 
   224 	system('sort', @sort_options);
 | 
| 
 | 
   225 	if ($? == -1) {
 | 
| 
 | 
   226 		die "$PROGRAM: Error: failed to execute 'sort': $!\n";
 | 
| 
 | 
   227 	}
 | 
| 
 | 
   228 	elsif ($? & 127) {
 | 
| 
 | 
   229 		my $signal = ($? & 127);
 | 
| 
 | 
   230 		kill 2, $$ if $signal == 2; ##if sort was interrupted (CTRL-C) - just pass it on and commit suicide
 | 
| 
 | 
   231 		die "$PROGRAM: Error: 'sort' child-process died with signal $signal\n";
 | 
| 
 | 
   232 	}
 | 
| 
 | 
   233 	else {
 | 
| 
 | 
   234 		$sort_exit_code = ($? >> 8);
 | 
| 
 | 
   235 	}
 | 
| 
 | 
   236 }
 | 
| 
 | 
   237 
 | 
| 
 | 
   238 
 | 
| 
 | 
   239 sub add_standard_sort_param(@)
 | 
| 
 | 
   240 {
 | 
| 
 | 
   241 	my ($obj)=  @_;
 | 
| 
 | 
   242 	add_standard_sort_param_value($obj, undef);
 | 
| 
 | 
   243 }
 | 
| 
 | 
   244 
 | 
| 
 | 
   245 sub add_standard_sort_param_value(@)
 | 
| 
 | 
   246 {
 | 
| 
 | 
   247 	my ($obj,$value)=  @_;
 | 
| 
 | 
   248 
 | 
| 
 | 
   249 	my $option = "" . $obj ; #stringify the optino object, get the option name.
 | 
| 
 | 
   250 
 | 
| 
 | 
   251 	if (length($option)==1) {
 | 
| 
 | 
   252 		$option = "-" . $option ;
 | 
| 
 | 
   253 	} else {
 | 
| 
 | 
   254 		$option = "--" . $option ;
 | 
| 
 | 
   255 	}
 | 
| 
 | 
   256 	push @sort_options, $option ;
 | 
| 
 | 
   257 	push @sort_options, $value if $value;
 | 
| 
 | 
   258 }
 | 
| 
 | 
   259 
 | 
| 
 | 
   260 sub forbidden_sort_param(@)
 | 
| 
 | 
   261 {
 | 
| 
 | 
   262 	my ($obj,$value)=  @_;
 | 
| 
 | 
   263 	my $option = "" . $obj ; #stringify the optino object, get the option name.
 | 
| 
 | 
   264 
 | 
| 
 | 
   265 	die "$PROGRAM: Error: option '$option' can not be used with this program. If you must use it, run GNU sort directly. see --help for more details.\n";
 | 
| 
 | 
   266 }
 | 
| 
 | 
   267 
 | 
| 
 | 
   268 sub read_line_non_buffered()
 | 
| 
 | 
   269 {
 | 
| 
 | 
   270 	my $line = '';
 | 
| 
 | 
   271 	while ( 1 ) {
 | 
| 
 | 
   272 		my $c;
 | 
| 
 | 
   273 		my $rc = sysread STDIN, $c, 1;
 | 
| 
 | 
   274 		die "$PROGRAM: STDIN Read error: $!" unless defined $rc;
 | 
| 
 | 
   275 		return $line if $rc==0 && $line;
 | 
| 
 | 
   276 		return undef if $rc==0 && (!$line);
 | 
| 
 | 
   277 		$line .= $c ;
 | 
| 
 | 
   278 		return $line if ( $c eq "\n");
 | 
| 
 | 
   279 	}
 | 
| 
 | 
   280 }
 | 
| 
 | 
   281 
 |