#!/usr/bin/perl -w # $Id: split-exec,v 1.1 2003/05/02 11:20:33 suter Exp $ # # Copyright (c) 2001 Mark Suter # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # [MJS 3 Mar 2002] Based on discussions with David Conran # [MJS 23 Apr 2002] Added --separation option # [MJS 1 May 2002] Added the stuff about the environment =head1 NAME B - Fork the command for its split of input =head1 SYNOPSIS B [ options ] command ... B --help B --man =head1 DESCRIPTION B will fork a new child for each chunk of input. The assumption is that the child will take a "long time" before it produces any output but that once it starts producing output, it will complete "quickly". The potential performance gain comes from the ability to overlap this "long time", for example, input-output tasks like "count the number of lines in all files in the given directory". Both the command and the environment are B and no checks are made to ensure they will not result in something undesired. For example, $PATH is propagated B. =head1 OPTIONS =over 8 =item B<--help> Print this brief help message and exit. =item B<--man> Display the manual page. =item B<--children=i> The maximum number of children to have at any one time. This only includes the children this program creates --- the provided command may fork multiple children. The default is 4. =item B<--lines=i> How many sequential lines to pass to each child. The last child will not get this number of lines if the number of input lines is not a whole multiple of this number. The default is 1. To avoid the obvious deadlock, all the lines are read in and then child is executed. =item B<--separation=i> The minimum number of wallclock seconds separation between successive forks to create children. The default is 0. =item B<--timeout=i> Children that have executed longer than this many wall-clock seconds may be terminated. This is a minimum and children may survive longer before being killed. The default, 0, means no timeout is used. =back =head1 EXAMPLES Here are some contrived examples, mainly to demonstrate the syntax and how this is meant to work. Hopefully, you have real work to do and this script will be of some use to you. perl -e 'print join "\n", 1..20, "\n";' | split-exec factor perl -e 'print "dummy\n" x 20' | split-exec --children=5 -- perl -pe 'sleep rand 10' perl -e 'print "dummy\n" x 20' | split-exec --timeout=5 -- perl -pe 'sleep rand 10' =head1 EXIT STATUS This script will exit with zero upon successful completion, non-zero on any error. All debugging or diagnostic output is written to STDERR. Please do not ignore STDERR. =head1 VERSION $Id: split-exec,v 1.1 2003/05/02 11:20:33 suter Exp $ =head1 COPYRIGHT Copyright (c) 2002 Mark Suter This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut ## The perl packages and their Debian packages use strict; # perl-base use IO::Socket; # perl-base use IO::Select; # perl-base use Getopt::Long; # perl-base use Pod::Usage; # perl-modules ## Fork off a child and return the details as [ $fh, $pid, time() ] sub create_child(\@@) { my ($command, @lines) = @_; my ($rdr, $wtr) = IO::Socket->socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC) or die "pair: $!\n"; if (my $pid = fork) { print $rdr @lines; shutdown($rdr, 1); return [ $rdr, $pid, time() ]; } else { defined $pid or die "fork: $!\n"; open(STDIN, "<&=" . fileno $wtr) or die "open: $!\n"; open(STDOUT, ">&=" . fileno $wtr) or die "open: $!\n"; exec { $command->[0] } @{$command} or die "exec: $!\n"; } } ## Reap any children ready to talk or any too old. sub reap_children($$) { my ($sel, $timeout) = @_; ## Get *all* output if any is ready foreach ($sel->can_read()) { $sel->remove($_); print $$_[0]->getlines(); close $$_[0] or warn "child $$_[1] status: ", $? >> 8, "\n"; waitpid($$_[1], 0) == $$_[1] or warn "waitpid != $$_[1]\n"; } ## Kill old children if necessary return unless $timeout > 0; foreach (grep { $$_[2] < time() - $timeout } $sel->handles()) { $sel->remove($_); kill 'HUP', $$_[1]; # CHECK: may not be enough warn "Child $$_[1] timeout - killed.\n"; } } # # The main program # ## Process options and exit with error message if needed. my %opt = (man => 0, help => 0, children => 4, separation => 0, timeout => 0, lines => 1); GetOptions(\%opt, "man", "help", "children=i", "separation=i", "timeout=i", "lines=i") or pod2usage(-exitval => 1, -verbose => 0); $opt{man} and pod2usage(-exitval => 1, -verbose => 2); $opt{help} and pod2usage(-exitval => 1, -verbose => 1); scalar @ARGV >=1 or pod2usage(-exitval => 1, -verbose => 1); ## Explicitly trust our command and environment (we're not suid or guid) my @command = map { /(.+)/; $1 } @ARGV; # %ENV = map { $ENV{$_} =~ /(.+)/; $_, $1 } keys %ENV; ## Autoflush our output to avoid duplicate output on some systems STDOUT->autoflush(1); ## Our select object my $sel = IO::Select->new(); ## Time last child started (fake the zeroth) my $last = time() - $opt{separation}; ## Split our input into chunks of $opt{lines} lines. my @lines = (); while () { push @lines, $_; if (scalar @lines == $opt{lines} or eof STDIN) { ## Create an additional child sleep $opt{separation} - (time() - $last); $sel->add(create_child @command, @lines); $last = time(); @lines = (); ## Reap a child if we're at the limit while ($sel->count() >= $opt{children}) { reap_children $sel, $opt{timeout}; } } } ## Get any remaining output while ($sel->count()) { reap_children $sel, $opt{timeout}; }