package RsyncSnapshot;

# $Id: RsyncSnapshot.pm 1038 2007-10-07 22:53:42Z jhealy $ #

=head1 NAME

  RsyncSnapshot

=head1 SYNOPSIS

  use RsyncSnapshot ':vars', ':subs';

=head1 ABSTRACT

Utility package containing methods for backing up a system using rsync and
hard links.  Both local and remote backups are supported.

End users should not invoke methods in this package directly; instead they
should consider one of the pre-written wrapper scripts:

=over 2

=item L<rsync_snapshot_local>

=item L<rsync_snapshot_sender>

=item L<rsync_snapshot_receiver>

=back

=head1 DESCRIPTION

Full documentation for this package is available online:

L<http://web.suffieldacademy.org/ils/netadmin/docs/software/rsync_snapshot/>

This package (and the associated scripts that use it) perform an
automated backup of a set of directories.  It uses rsync and hard
links to make a "full" backup where only the incremental changes
actually take up disk space.

See L<http://www.mikerubel.org/computers/rsync_snapshots/> for more
information on this snapshotting technique with rsync.

It allows for the exclusion of files/directories over a certain size
(and the automatic notification of users who are over this limit).
Additionally, it allows for patterned exclusion of files that should
not be backed up.

To use, simply create a config file that contains declarations for the
various global variables (see below).  Then, invoke one of the wrapper
scripts (listed above), passing the path to the config file(s) as
arguments.  The script will parse each file in turn and execute a
backup based on the parameters they specify.

=cut

# Report errors well
use strict;
use warnings;

# utility function to find base of file names
use File::Basename;

# Use LDAP to look up user e-mail addresses
# also used to confirm user quotas
#use Net::LDAP;
#use Net::LDAP::Constant qw(LDAP_SUCCESS);

# Use SMTP to deliver the warning messages via e-mail
use Net::SMTP;

# Use time formatting strings
use POSIX qw(strftime);

# Allow direct access to Syslog
use Unix::Syslog qw(:macros :subs);

our ($DEBUG, $SENDER, $RECEIVER, $USER_MAX_BYTES, $BACKUP_ROOT,
     $PREVIOUS_LINK, $CURRENT_LINK, @SOURCES, $DEST, $LINKDEST,
     $LDAP_SERVER, $SMTP_SERVER, $SMTP_FROM, %RSYNC_OPTIONS,
     @RSYNC_EXCLUDES, $RSYNC_COMMAND, $CONFIG_DESCRIPTION, $SPLIT,
     $SSH_HOST, $SSH_KEY, $SSH_USER, @LISTENERS, @listenerPIDs);

=head2 EXPORT

None by default.  Use :all to get constants and methods.

=cut

use base qw(Exporter);

our %EXPORT_TAGS = (vars =>
		    [ qw(
			 $DEBUG $SENDER $RECEIVER $USER_MAX_BYTES
			 $BACKUP_ROOT $PREVIOUS_LINK $CURRENT_LINK
			 @SOURCES $DEST $LINKDEST $LDAP_SERVER
			 $SMTP_SERVER $SMTP_FROM %RSYNC_OPTIONS
			 @RSYNC_EXCLUDES $RSYNC_COMMAND
			 $CONFIG_DESCRIPTION $SPLIT $SSH_HOST $SSH_KEY
			 $SSH_USER @LISTENERS
			) ],
		   subs =>
		    [ qw (
			  initGlobals debug logWarn logDie excluded du
			  duSummary rsyncRegexps rsyncUserExcludes
			  keysBySize kilobytes megabytes gigabytes
			  trunc autoSuffix getLdapUserInfo
			  createUserExcludes sendQuotaEmail
			  notifyUsers parseRsyncOutput rsyncSender
			  rsyncReceiver findLeafDirs splitRsync specificRsync
			  rotateDirectories parseConfig printResults
			 ) ] );

Exporter::export_ok_tags('vars', 'subs');


=head1 GLOBAL VARIABLES

The following global variables may be specified in a config file.  Most have
pre-defined defaults, so you do not need to specify every variable in your
config files.  However, certain variables must appear in each config file
(such as source, destination, and description).

=over 4

=cut

# all the variables in this function may be overridden by a config file
# provided on the command line
sub initGlobals() {

=item C<$CONFIG_DESCRIPTION> B<MANDATORY>

A short description for the type of backup this config file defines.

=cut
$CONFIG_DESCRIPTION = undef;

=item C<$BACKUP_ROOT> B<MANDATORY>

Backup destination directory (must be absolute; no trailing slash).  This
directory is the top-level directory where all the incremental directories
go.  For example, if you make this directory "/tmp", then your backups
will be placed in "/tmp/YYYY-MM-DD" directories by date.

=cut
$BACKUP_ROOT = "";

=item C<@SOURCES>

An array of source directories to back up (must be absolute; no
trailing slash).  All files under each this directory (except those
that are excluded) will be backed up.

=cut
@SOURCES = ();

=item C<$DEST>

Destination directory to copy to (must be relative to backup root; no
trailing slash).  By default, this variable becomes a timestamp following
this format:

  YYYY-MM-DD.HH-MM-SS

If you need a different format, create and evaluate it in your config file,
and store the finalized version in this variable.

=cut
$DEST = strftime("%Y-%m-%d.%H-%M-%S", localtime);

=item C<$LINKDEST>

Normally, the script will automatically detect a previous backup, and use
this directory as a hard-link destination to save space.

If you'd prefer to specify an absolute location for hardlinks, you may
do so with this parameter.  Otherwise, leave it undefined for autodetecting
behavior.

Finally, if you don't want to use hard links at all (in other words, a
regular rsync backup which overwrites the current one), specify the
empty string ("") as the LINKDEST parameter.  The script will detect this
and ignore hard links.

=cut
$LINKDEST = undef;

=item C<@LISTENERS>

A two-dimensional array (using standard perl semantics of ([],
... []);) containing exec() arrays of commands to run whenever a sync
is started.  Each row in the array will be passed directly to exec(),
unless the first column contains the string 'piggyback'.  In this
case, the internal piggybackRsync() method shall be called instead,
with the second column of the array used for the configuration file.

For regular exec() arrays, the path to a temporary file of pathnames
to sync will be pushed onto the end of the array before execution.

The commands B<must> adhere to the following specifications:

=over 2

=item *

The command will receive the temporary file to read from as its final argument.

=item *

The command must handle SIGUSR1, which will be sent whenever the
next batch of data are available for processing (alternately, the
command may ignore SIGUSR1 and simply "tail" the log file).

=item *

The command must handle SIGUSR2, which will be sent when there
are no more data left to process in the temporary file.

=back

The command should be written as a daemon, and should therefore not
expect STDIN or STDOUT to be connected to a controlling terminal.

Here's a sample LISTENERS array with two commands; one is the built-in
piggyback method, and the other is a (ficticious) logger.

  @LISTENERS = (
    ['piggyback', '/path/to/config-file.conf'],
    ['/usr/local/bin/logger', 'arg1', 'arg2']
  );

=cut
@LISTENERS = ();

=item C<$SSH_HOST>

If backups to a remote machine are desired, the rsync connection may be
tunneled over SSH.  Specify a remote hostname or IP address in this
field to signal the use of a remote connection.

Right now, the script only allows passwordless logins using SSH keys.  You
must specify a keyfile in the C<$SSH_KEY> parameter below for SSH tunnelling
to work properly.

=cut
$SSH_HOST = undef;

=item C<$SSH_KEY>

To prevent the script from asking for password to the remote machine,
passwordless SSH auth keys should be used.  You can create such keys
with the C<ssh-keygen> command.

=cut
$SSH_KEY = undef;

=item C<$SSH_USER>

This optional parameter specifies the remote username to use when tunnelling
over SSH.  If omitted, the user C<root> is used.  Note that root access
my be required on the remote machine to preserve timestamps, permissions,
and special files.

=cut
$SSH_USER = 'root';

=item C<$RSYNC_COMMAND>

Path (full path, if necessary) to the rsync binary, along with any required
options.

Note that the splitRsync() function B<requires> the use of the C<-R> flag
(for relative directory transfers).

We currently use rsyncX to transfer resource forks; vanilla rsync
users should remove full path, change name to 'rsync', and remove the
'--eahfs' flag

Mac OS X 10.4should have metadata support in the native version of
rsync (use the -E flag).  Unfortunately, as of 10.4.2, support for
ACLs and hard links seems a little broken (files don't get
trasferred), and there is a known bug with Apple where the resource
forks *always* get transferred, even if they haven't changed (wastes a
lot of wire bandwidth).  RsyncX doesn't do ACLs, but it gets the rest
right, so that's what we're going with for now.

The current default for this command is:

  /usr/bin/rsync -auR --delete

=cut
#$RSYNC_COMMAND = '/usr/local/bin/rsyncX --eahfs -auR --delete';
$RSYNC_COMMAND = '/usr/bin/rsync -auR --delete';

=item C<%RSYNC_OPTIONS>

Options to pass to rsync, as key/value pairs.  See the rsync() method below
for examples.

=cut
%RSYNC_OPTIONS = ();

=item C<@RSYNC_EXCLUDES>

Array of exclude patterns to pass to rsync.  The patterns are in rsync
format; see the L<rsync/"EXCLUDE PATTERNS"> section of the L<rsync(1)>
man page for more information.

Note: excludes may match an absolute path, rather than just the syncronized
portion of the path.  For example, if you're syncing /usr/local/src/, your
patterns may reference /usr/local/src (presumably to anchor the paths),
instead of just the directories contained in .../src/.

=cut
@RSYNC_EXCLUDES = ();

=item C<$SMTP_FROM>

Email address for notification messages to come from.

Note that the address is only used when checking user directories
and notifying them when they're over quota. If you don't use the
notification features of this script, you don't need to set this.

=cut
$SMTP_FROM = 'Network Administrator <netadmin@suffieldacademy.org>';

=item C<$SMTP_SERVER>

Outgoing SMTP server to use for sending e-mails.

Note that the SMTP server is only used when checking user directories
and notifying them when they're over quota. If you don't use the
notification features of this script, you don't need to set this.

=cut
$SMTP_SERVER = 'smtp.suffieldacademy.org';

=item C<$LDAP_SERVER>

LDAP server to look up user information.  Currently, we expect an Apple
OS X OpenDirectory LDAP server; you will need to modify the code if you
have a different server.

Note that the LDAP server is only used when checking user directories for
quota information and e-mail addresses.  If you don't use the quota-checking
features of this script, you don't need to set this.

=cut
$LDAP_SERVER = 'ldap.suffieldacademy.org';

=item C<$PREVIOUS_LINK>

Name of the symbolic link to the backup before the current backup
(must be relative to backup root).  Defaults to 'previous-backup'.

Note that there are two symlinks tracked by this application; this is
necessary because we want to create hard links to the last full
backup.  Because the "current" backup may still be in progress, we
need a link to the last guaranteed complete backup.

=cut
$PREVIOUS_LINK = "previous-backup";

=item C<$CURRENT_LINK>

Name of the symbolic link to the most current backup (must be relative to
backup root).  Defaults to 'current-backup'.

=cut
$CURRENT_LINK = "current-backup";

=item C<$USER_MAX_BYTES>

Maximum size of a user home directory, in bytes.  Defaults to 1GB.

=cut
$USER_MAX_BYTES = (1024 * 1024 * 1024);

=item C<$SPLIT>

Rsync has trouble syncing very large file trees.  We attempt to get around
this limitation by syncing only to a specific depth (the "split"), and then
iterating over all of the subdirectory leaves and rsyncing each in turn.

Default split is 1, as that works well with our directory structure.  You
may need to experiment to find a value that works for you.  0 is the minimum
value.

See the splitRsync() function for more information.

=cut
$SPLIT = 1;

=item C<$DEBUG>

Set to 1 for normal operation.  Set to 15 for maximum debugging messages.
Set to 0 for quieter-than-normal operation.

=cut
$DEBUG = 1;

}

# initialize globals right away (we'll do it again before every config file)
initGlobals();

=back

=cut


# A list of hash key tags, and regular expressions for parsing the output
# generated by rsync.
my %RSYNC_OUTPUT_TAGS = (
			 'FILES_TOTAL'   => qr/^Number of files: (\d+)$/,
			 'FILES_SENT'    => qr/^Number of files transferred: (\d+)$/,
			 'SIZE_TOTAL'    => qr/^Total file size: (\d+) bytes$/,
			 'SIZE_SENT'     => qr/^Total transferred file size: (\d+) bytes$/,
			 'DATA_LITERAL'  => qr/^Literal data: (\d+) bytes$/,
			 'DATA_MATCHED'  => qr/^Matched data: (\d+) bytes$/,
			 'FILE_LIST'     => qr/^File list size: (\d+)$/,
			 'BYTES_WRITTEN' => qr/^Total bytes [sw][er][ni]t[a-z]{0,3}: (\d+)$/,
			 'BYTES_READ'    => qr/^Total bytes re[ca][ed][a-z]{0,4}: (\d+)$/
			);


# Whether or not to log to the console or syslog
our $LOG_STDOUT = 1;
our $LOG_SYSLOG = 1;

# Whether this script has been called on the "sending" or "receiving" end
$SENDER = 0;
$RECEIVER = 0;


###################################
## End of customizable variables ##
###################################


=head1 METHODS

Below, we list the signatures and descriptions for the various methods
used by this script.  In general, you should be able to modify the
script's behavior through the config files.  However, certain actions
(such as reporting users over quota) require running the methods below,
and you may wish to customize their contents to the needs of your site.


=head2 debug(I<$message>, I<$level>, [I<$timestamp>]) [returns I<void>]

Prints the specified message to the console (with a time stamp), but only
if the debug level is strictly greater than the given debug level.

Optionally, a timestamp may be provided.  If it is undefined, the current
time is used.  Otherwise, the provided value (which may be empty) will
be prepended to the message.

=cut
sub debug($$;$) {

  my $message = shift(@_);
  my $level = shift(@_);
  my $timestamp = shift(@_);

  if ($DEBUG > $level) {

    if (!defined($timestamp)) {
      $timestamp = strftime("%a %b %d %H:%M:%S", localtime) . "\t";
    }

    if ($LOG_STDOUT) {
      print $timestamp . $message;
    }

    if ($LOG_SYSLOG) {

      # Extra logging: print important messages to syslog
      openlog "rsync_snapshot", LOG_PID, LOG_DAEMON;

      if ($level < 2) {
	syslog LOG_NOTICE, "%s", $message;
      }
      elsif ($level < 6) {
	syslog LOG_INFO, "%s", $message;
      }
      elsif ($level < 16) {
	syslog LOG_DEBUG, "%s", $message;
      }

      closelog;
    }
  }

}				# end sub debug


=head2 logWarn(I<$message>) [returns I<void>]

Similar to the perl builtin C<warn> statement, but does not
(necessarily) print to the console.  This is useful for daemonized
applications that have no STDERR to print to.

Calls C<debug> to print the message to the screen and/or to the system log.

=cut
sub logWarn($) {

  debug($_[0], -1);

} # end sub logWarn


=head2 logDie(I<$message>) [returns I<void>]

Similar to the perl builtin C<die> statement, but does not
(necessarily) print to the console.  This is useful for daemonized
applications that have no STDERR to print to.

Calls C<debug> to print the message to the screen and/or to the system log.
Afterwords, the program dies normally.

=cut
sub logDie($) {

  debug($_[0], -1);
  die($_[0]);

} # end sub logDie


=head2 excluded(I<$entry>, I<\@excludes>) [returns I<$boolean>]

Checks the given entry against a list of compiled regular expressions.

Returns 1 (true) if the file should be excluded according to the patterns.
Returns 0 (false) otherwise.

=cut
sub excluded($$;) {

  my $entry = shift(@_);
  my $excludes = shift(@_);

  foreach my $exclude (@{$excludes}) {

    debug("Checking '$entry' against '$exclude'    ", 10);

    if ($entry =~ $exclude) {
      debug("Matched '$entry' against '$exclude'\n", 8);
      return 1;
    }

    debug("\n", 10, "");
  }

  return 0;
}				# end sub excluded


=head2 du(I<$path>, I<\@excludes>) [returns I<$bytes>]

Given a filesystem path and a list of regular-expression excludes, this
function returns the number of bytes occupied by the filesystem path and
any enclosed subdirectories, while excluding any files matching the regular
expressions provided.

The path should be a fully-qualified filesystem path (beginning with a forward
slash).  The list of excludes may be empty, but must be defined.  Each exclude
must be a compiled regular expression pattern (created through the use of
the perl qr// operator).

This file contains numerous functions for generating compiled regular
expression lists from other well-known formats (such as rsync exclude
lists).  You may find it helpful to use these functions to generate
the regexp lists for you.

The function traverses the directory starting at root path provided, and
follows all non-excluded child directories.  Symbolic links and special
files are NOT counted or followed.

The total number of non-excluded bytes for the files and directories
is returned from the function.  Byte total is an exact count of all
files, which may be lower than the actual amount of disk space used
(due to block rounding).

=cut
sub du($$;) {

  my $root = shift(@_);
  my $excludes = shift(@_);

  my $size = 0;

  # by calling "stat" now, perl stores the stat call in the special
  # variable "_", so we don't have to make repeated stat calls for
  # each file test.

  # note that we must call "lstat" to check if something is a symbolic link
  lstat($root);

  if (-l $root) {
    # symlink; safe to ignore (or we might end up double-counting)
  }
  elsif (-f _) {
    unless (excluded("$root", $excludes)) {
      $size = -s _;
    }
  }
  elsif (-d _) {

    unless (excluded("$root/", $excludes)) {

      $size = -s _;

      opendir(DIR, $root) or warn "Can't opendir '$root': $!\n";

      foreach my $entry (readdir(DIR)) {

	# skip special entries '.' and '..'
	if ('.' eq $entry || '..' eq $entry) {
	  # just short-circuit if it's a special dir
	  next;
	}

	# use non-prototype call to prevent warnings on recursive call
	$size += &du("$root/$entry", $excludes);
      }

      closedir(DIR);
    }
  }
  else {
    warn "Ignoring special file '$root'\n";
  }

  return $size;

}				# end sub du


=head2 duSummary(I<$path>, I<\@excludes>, I<$size>) [returns I<%entries>]

Given a directory, returns a hash containing the entries of the directory
(key is the entry name, value is the entry size).  Only the cumulative size
of the immediate entries are returned; subdirectories are only included
through the cumulative totals.

A list of compiled regular expressions is taken as the second parameter.  This
list follows the same format expected by the du() function above.

The final parameter instructs the method to return only entries larger
than or smaller than a specific size.  If the size is 0 (the default), all
entries are returned.  If size is a positive number, only entries larger than
the given size will be returned.  If size is a negative number, only entries
smaller than the given size will be returned.

=cut
sub duSummary($$;$) {

  my $root = shift(@_);
  my $excludes = shift(@_);
  my $size = shift(@_) || 0;

  my %entries = ();

  opendir(DIR, $root) or warn "Can't opendir '$root': $!\n";

  foreach my $entry (readdir(DIR)) {

    # skip special entries '.' and '..'
    if ('.' eq $entry || '..' eq $entry) {
      # just short-circuit if it's a special dir
      next;
    }

    my $bytes = du("$root/$entry", $excludes);

    if (
	($size == 0) ||
	( ($size > 0) && ($bytes > $size) ) ||
	( ($size < 0) && ($bytes < -$size) )
       ) {

      $entries{$entry} = du("$root/$entry", $excludes);

    }

  }

  closedir(DIR);

  return %entries;

}				# end sub duRsyncSummary


=head2 rsyncRegexps(I<@excludes>) [returns I<\@regexps>]

Converts a list of rsync-style exclude patterns (see
L<rsync/"EXCLUDE PATTERNS"> section of the L<rsync(1)> man page)
into a list of compiled perl regular expressions suitable for feeding
to the du() function.

=cut
sub rsyncRegexps(@) {

  my @patterns = @_;

  my @excludes = ();

  foreach my $rsyncPattern (@patterns) {
    # convert the rsync exclude pattern into a regular expression
    my $exclude = $rsyncPattern;

    # strip off any leading + or - character
    $exclude =~ s{^[+-] (.*)$}{$1};

    # the '.' metachar has no special meaning in rsync patterns
    $exclude =~ s{\.}{\\.}g;

    # the ** wildcard matches anything (including slashes)
    $exclude =~ s{\*\*}{.*}g;

    # the * wildcard matches anything except a slash
    # (look-behind assertion keeps us from matching the . from ** above)
    $exclude =~ s{(?<!\.)\*}{[^/]*}g;

    # the ? wildcard matches a single char
    $exclude =~ s{\?}{.}g;

    # patterns without a / or ** should match the end of the last
    # component of the path only
    if ( index($exclude, '.*') == index($exclude, '/') ) {
      $exclude = $exclude . "\$";
    }

    # if the pattern ends with a / then it must match a directory
    if ($exclude =~ m{/$}) {
      $exclude = $exclude . "\$";
    }

    # if a pattern starts with a / then it matches the beginning of the
    # name
    if ($exclude =~ m{^/}) {
      $exclude = "^" . $exclude;
    }

    # compile the regexp and store it
    debug("$rsyncPattern -> $exclude\n", 6);
    push(@excludes, qr/$exclude/);

  }

  return \@excludes;

}				# end sub rsyncRegexps


=head2 rsyncUserExcludes(I<\%userExcludes>, I<$baseDir>) [returns I<@excludes>]

Given a list of user excludes (as generated by the createUserExcludes
function), this function generates an array of exclude parameters in rsync
format.

The second argument gives the base path to the user home directories.

=cut
sub rsyncUserExcludes(\%$;) {

  my %userExcludes = %{shift(@_)};
  my $baseDir = shift(@_);

  my @excludes = ();

  foreach my $user (sort keys %userExcludes) {

    my %entries = %{$userExcludes{$user}};

    foreach my $entry (keys %entries) {
      if ( -d "$baseDir/$user/$entry" ) {
	# tack a slash on to entry if its a directory
	$entry .= "/";
      }

      debug("Excluding user item /$user/$entry\n", 6);
      push(@excludes, "- $baseDir/$user/$entry");
    }

  }

  return @excludes;

}				# end sub rsyncUserExcludes


=head2 keysBySize(I<%hash>) [returns I<@keys>]

A sort comparator that orders hash keys based on the size of their values.
Use this in place of:

  sort keys %hash

This function returns the keys of the hash ordered by their values, with
the largest values first.

=cut
sub keysBySize(\%) {

  my $hash = shift(@_);

  return sort {$hash->{$b} <=> $hash->{$a}} keys %$hash;

}				# end sub keysBySize


=head2 kilobytes(I<$bytes>) [returns I<$kilobytes>]

Converts the number of bytes in the first argument into kilobytes (2^10 b).

=cut
sub kilobytes($;) {

  return ($_[0] / 1024);

}				# end sub kilobytes


=head2 megabytes(I<$bytes>) [returns I<$megabytes>]

Converts the number of bytes in the first argument into megabytes (2^20 b).

=cut
sub megabytes($;) {

  return ($_[0] / (1024 * 1024));

}				# end sub megabytes


=head2 gigabytes(I<$bytes>) [returns I<$gigabytes>]

Converts the number of bytes in the first argument into gigabytes (2^30 b).

=cut
sub gigabytes($;) {

  return ($_[0] / (1024 * 1024 * 1024));

}				# end sub gigabytes


=head2 trunc(I<$value>, I<$precision>) [returns I<$truncated>]

Truncates the given number to the specified number of digits.

=cut
sub trunc($$;) {

  my $value = shift(@_);
  my $precision = shift(@_);
  my $factor = 10 ** $precision;

  return int($value * $factor) / $factor;

}				# end sub trunc


=head2 autoSuffix(I<$bytes>, [I<$precision>]) [returns I<$string>]

Given a number of bytes, this function will convert it into a string
representing the number of KB, MB, or GB it represents (using whichever
suffix is most appropriate).

It also truncates to the specified number of digits (2 by default).

=cut
sub autoSuffix($;$) {

  my $bytes = shift(@_);
  my $precision = shift(@_) || 2;

  if ($bytes < (1024 * 1024)) {
    return trunc(kilobytes($bytes), $precision) . " KB";
  }
  elsif ($bytes < (1024 * 1024 * 1024)) {
    return trunc(megabytes($bytes), $precision) . " MB";
  }
  else {
    return trunc(gigabytes($bytes), $precision) . " GB";
  }

}				# end sub autoSuffix


=head2 getLdapUserInfo(I<$uid>, I<\@attrNames>, [I<$ldapHandle>]) [returns I<%attributes>]

Given a username and a list of LDAP attributes, this function looks up the
user entry on an LDAP server and returns the requested attributes.

A pre-bound LDAP handle may be passed as the last argument.  If undefined,
the function will set up, use, and tear down a connection to the default
LDAP server (defined by the global var LDAP_SERVER).

=cut
sub getLdapUserInfo($$;$) {

  my $uid = shift(@_);
  my $attrs = shift(@_);
  my $ldap = shift(@_);
  my $localLdap = 0;

  my %values = ();

  if (!defined($ldap)) {
    $localLdap = 1;
    # get a connection to the LDAP server
    $ldap = Net::LDAP->new($LDAP_SERVER)
      or logDie("Couldn't connect to LDAP server: $!");
    $ldap->bind();
  }

  # look up the user's info in LDAP
  my $results = $ldap->search(
			      base   => "cn=users,dc=suffieldacademy,dc=org",
			      scope  => "one",
			      filter => "(uid=$uid)",
			      attrs  => $attrs
			     );


  if ($results->count() < 1) {
    warn "No user information available for $uid; skipping this user.\n";
  }
  elsif ($results->count() > 1) {
    warn "Multiple results found for $uid; skipping this user\n";
  }
  else {
    # get the one (and only) user entry
    my $user = $results->entry(0);

    foreach my $attr ($user->attributes) {
      $values{$attr} = $user->get_value($attr);
    }
  }

  return %values;

}				# end sub getLdapUserInfo


=head2 createUserExcludes(I<$baseDir>, [$<sizeSpec>]) [returns I<%userExcludes>]

Iterates over all user directories and finds those that are over quota.
Uses the parameter baseDir as the base dir to check.  Also uses the
global var USER_MAX_BYTES to determine who is over quota.  You may specify
these parameters optionally to the function to override the defaults.

As an extra sanity check, the function consults with the LDAP server to
see if a per-user quota has been set.  If it is, and if it's higher than
the default size provided, then the larger number is used instead.

For each over-quota user, build a list of excludes of their subdirectories
until the total size of the directory is back under quota.

The function returns a hash of hashes, keyed by username.  Each username
references a hash of directories to exclude and their size.  For example:

  (
    jbogus => {Documents => 12305971, Movies => 129048712345},
    croot  => {Music => 10329487123094871}
  )

Note that the usernames and paths are relative, not absolute.  In order to
correctly exclude these directories, you'll need to prepend the user base
dir to these values.  This was done to make reporting the dirs easier (so
the names aren't obscured by leading path information).

=cut
sub createUserExcludes($;$) {

  my $baseDir = shift(@_);
  my $sizeSpec = shift(@_) || $USER_MAX_BYTES;

  # find all users over quota
  my %diskHogs = duSummary($baseDir, rsyncRegexps(@RSYNC_EXCLUDES), $sizeSpec);

  # fire up a connection to the LDAP server to check user quotas
  my $ldap = Net::LDAP->new($LDAP_SERVER)
    or logDie("Couldn't connect to LDAP server: $!");
  my $results = $ldap->bind();

  my %userExcludes = ();

  # output over-quota users to a file for later analysis
  open(HOGS, ">/tmp/users_over_quota")
    or logWarn("Couldn't write over-quota file: $!\n");

  # sort in reverse order by space used
  foreach my $over (sort { $diskHogs{$b} <=> $diskHogs{$a} } keys %diskHogs) {
    printf HOGS ("%9s  %s\n", autoSuffix($diskHogs{$over}), $over);
  }

  close(HOGS);

  foreach my $hog (sort(keys(%diskHogs))) {

    my %user = getLdapUserInfo($hog, ["apple-user-homequota"], $ldap);

    my $size = $user{'apple-user-homequota'};

    if (!defined($size) || $size < $sizeSpec) {
      $size = $sizeSpec
    }

    debug("LDAP quota for $hog is '$size'\n", 2);

    # try to prune directories until the users are under quota
    $hog =~ m%([^/]+)$%;

    debug(("$hog is over quota (" . autoSuffix($diskHogs{$hog})
	   . "); purging...\n"), 2);

    # keep track of the current amount of space used
    my $total = $diskHogs{$hog};

    my %entries = duSummary("$baseDir/$hog", rsyncRegexps(@RSYNC_EXCLUDES), 0);

    my %excludes = ();

    # visit each item in the directory, starting with the largest
    foreach my $entry (keysBySize(%entries)) {

      # if we've fallen below quota, stop excluding items
      last if ($total < $size);

      # exlude the item
      $excludes{$entry} = $entries{$entry};

      debug(("Excluded '" . $entry . "' for $hog\n"), 2);

      $total -= $entries{$entry};

    }

    $userExcludes{$hog} = \%excludes;

  }

  # tear down LDAP session
  $results = $ldap->unbind();

  return %userExcludes;

}				# end sub createUserExcludes



=head2 sendQuotaEmail(I<$email>, I<$name>, I<$max>, I<$dirlist>, I<$smtp>) [returns I<void>]

Sends a message to the given user warning them that their files exceed their
quota space and will not be backed up.

=cut
sub sendQuotaEmail($$$$$) {

  my $email = shift(@_);
  my $name = shift(@_);
  my $max = autoSuffix(shift(@_));
  my $dirlist = shift(@_);
  my $smtp = shift(@_);

  my $message = <<EOMAIL
Dear $name,

Our reports show that your folder on the file server exceeds the
maximum backup limit of $max.

Exceeding the limit is not an immediate problem; we have enough space
on the server to hold your files.  However, we do not have enough
room to back up all of the files in your account.  In the unlikely
event of a server malfunction, we might be unable to recover all of
your files.

Therefore, please be aware that the following files and folders in your
account will NOT be backed up automatically:

$dirlist

What to do about large files/folders:
=====================================

If you are unsure why a particular folder is too large, please log in
to the file server and examine its contents.  Large folders frequently
contain backups of old files, multimedia projects (such as iDVD or
iMovie files), and music.

If you no longer need these files, please delete them from your
account.  If you are still working with the files, you are welcome
to keep them, but please be aware that they will not be backed up.


If you have further questions:
==============================

If you are not sure why you are taking up so much space, or do not
understand why you have received this e-mail message, please feel free
to contact the CRC staff.  You may reply directly to this message and
we will reply with an answer to your questions.

Thank you,

Jason Healy
Network Administrator
EOMAIL
    ;		     # ' <- stupid comment to help syntax highlighting

  $smtp->mail($SMTP_FROM);
  $smtp->to($email);
  #$smtp->to($SMTP_FROM);

  $smtp->data();

  $smtp->datasend("To: $name <$email>\n");
  #$smtp->datasend("To: $SMTP_FROM\n");

  $smtp->datasend("From: $SMTP_FROM\n");
  $smtp->datasend("Subject: Fileserver Backup Warning\n");
  $smtp->datasend("\n");

  $smtp->datasend($message);

  $smtp->dataend();

}				# end sub sendQuotaEmail


=head2 notifyUsers(I<%userExcludes>, [I<$sizeSpec>]) [returns I<void>]

Given a hash of excluded user dirs (as generated by the createUserExcludes()
function), this function parses each user block and sends an e-mail notifying
them that certain folders have not been backed up.

User e-mail addresses are looked up via LDAP, and messages are sent directly
using SMTP.

Optionally, the maximum size to report to the users may also be provided.  If
it is omitted, the default USER_MAX_BYTES is used.

=cut
sub notifyUsers(\%;$) {

  my %userExcludes = %{shift(@_)};
  my $max = shift(@_) || $USER_MAX_BYTES;
  # convert to human-readable number
  $max = autoSuffix($max);

  # fire up a connection to the LDAP server
  my $ldap = Net::LDAP->new($LDAP_SERVER)
    or logDie("Couldn't connect to LDAP server: $!");
  my $results = $ldap->bind();

  # get all user information
  my @users = ();

  foreach my $uid (sort keys %userExcludes) {

    my %user = getLdapUserInfo($uid,
			       ["givenName", "cn", "mail",
				"apple-user-homequota"],
			       $ldap);

    # make sure the names contain something
    my $name = $user{'givenName'} || $user{'cn'} || $uid;

    my $email = $user{'mail'};
    if (!defined($email)) {
      warn "No e-mail address for $uid ($name)\n";
      next;
    }

    my $size = $user{'apple-user-homequota'};

    if (!defined($size) || $size < $USER_MAX_BYTES) {
      $size = $USER_MAX_BYTES;
    }

    my %subdirs = %{$userExcludes{$uid}};

    my $dirlist = '';

    foreach my $subdir (sort keys %subdirs) {
      $dirlist .= sprintf("        %-20s    (%s)\n",
			  $subdir, autoSuffix($subdirs{$subdir}));
    }

    push(@users, [$email, $name, $size, $dirlist]);

  }

  # tear down LDAP session
  $results = $ldap->unbind();

  # we now have e-mail info -- ready to send messages

  # get an SMTP session for sending e-mail
  my $smtp = Net::SMTP->new($SMTP_SERVER);

  for my $i (0 .. $#users) {
    sendQuotaEmail($users[$i][0], $users[$i][1],
		   $users[$i][2], $users[$i][3],
		   $smtp);
  }

  # tear down SMTP session
  $smtp->quit();

}				# end sub notifyUsers


=head2 parseRsyncOutput() [returns I<returntype>]

Given the textual output of the rsync() command, this function attempts to
parse out the file transfer statistics (as given by the --stat flag) and
build them into a hash for easier manipulation.

The function expects a newline-delimited string containing the output, as
well as a reference to a hash.  If the hash already contains values from
another run of rsync, the values are added to the ones that already exist.

=cut
sub parseRsyncOutput($$) {

  my $output = shift(@_);
  my $values = shift(@_);

  foreach my $line (split(/\n/, $output)) {
    my $key = undef;
    my $val = undef;

    # try each output key search
    foreach my $k (keys %RSYNC_OUTPUT_TAGS) {
      if ($line =~ $RSYNC_OUTPUT_TAGS{$k}) {
	$val = $1;
	$key = $k;
	last;
      }
    }

    # if we found one, add the value to the result hash
    if (defined($key) && defined($val)) {
      $values->{$key} = 0 unless(defined($values->{$key}));

      $values->{$key} += $val;
    }
  }

  # keep track of how many output results we've racked up
  $values->{'RESULT_COUNT'} = 0 unless(defined($values->{'RESULT_COUNT'}));
  $values->{'RESULT_COUNT'}++;

}				# end sub parseRsyncOutput


=head2 shellQuote() [returns I<returntype>]

Takes the given string and returns a double-quoted version with shell
quoting characters escaped.

=cut
sub shellQuote(*;) {

  my $orig = shift(@_);
  my $word = $orig;

  # According to the sh manpage, here's how we do it:

  # 1) Escape any backslashes
  $word =~ s'\\'\\\\'g;

  # 2) Escape any backticks
  $word =~ s'`'\\`'g;

  # 3) Escape any dollar signs
  $word =~ s'\$'\\$'g;

  # 4) Finally, escape any existing double quotes
  $word =~ s'"'\\"'g;

  # Return the whole argument wrapped in quotes
  $word = qq{"$word"};

  debug("Shell-escaped >>$orig<< to >>$word<<\n",6);

  return $word;

}				# end sub shellQuote



=head2 rsyncSender(I<$source>, I<$dest>, I<$previous>, I<\%options>, I<\%output>, [I<@excludes>]) [returns I<$rsyncOutput>]

Calls the 'rsync' binary, passing the source, destination, options, and
excludes specified.  The results of the rsync transaction are returned as
a string.  This is the "initiator" version of an rsync call.  This means
that this rsync call is the "source" of a backup.  For local transfers, it
is the only call you need.  For networked transfers, there must be a version
of rsync running in "receiver" mode (e.g., with the C<--server> flag) to
complete the transaction.

Note that for networked transfers, several flags (destination, link-dir,
etc) may be ignored.  It is up to the receiving side to make these sorts
of decisions.

Source and destination should be fully-qualified filesystem paths.  They will
be quoted when passed to the shell, so you should not escape spaces.  The
destination path may (optionally) include a user@hostname specification.

The "previous" parameter should contain a path to compare against when
making the backup.  Files that are the same as those in the previous
directory will be left out of the backup to save space.  You do not have
to specify a previous directory; undef is OK.  However, if you do so, each
backup will be a full backup, instead of incremental.

The options hash is in key-value form, where the key and value should be
separated on the command line by a space.  Quoting is NOT performed on these
options, so you must escape any special values.  For flags that have no value,
or where the value is not separated from the flag, use a value of undef.

A sample rsync options hashref might look like this:

  {
    '-e' => 'ssh'                    # a key/value pair separated by space
    '-au' => undef,                  # a flag with no argument
    '--include=/tmp/foo/' => undef   # a non-separated key/value
  }

The output hash is where the result of the rsync command is stored.  This
function calls parseRsyncOutput() (see above) and passes this hash reference
in for the function to use to store the values.

=cut
sub rsyncSender($$$$$;@) {

  my $source = shift(@_);
  my $dest = shift(@_);
  my $previous = shift(@_);
  my %options = %{shift(@_)};
  my $output = shift(@_);
  my @excludes = @_;

  # location and mandatory args for rsync binary

  my $command = $RSYNC_COMMAND;

  # Add --stats to rsync call to get statistic information
  $options{'--stats'} = undef;

  # Add verbose item for tracking changed files (if necessary)
  if (@LISTENERS) {
    $options{'-v'} = undef;
  }

  # add remote host information, if necessary
  if (defined($SSH_HOST)) {
    $dest = $SSH_USER . '@' . $SSH_HOST . ':' . $dest;

    $SSH_KEY =~ s{'}{\\'}g;
    $options{'-e'} = shellQuote("ssh -i $SSH_KEY");
  }

  # build list of exclude arguments
  foreach my $opt (keys %options) {
    $command .= " " . $opt;
    $command .= " " . $options{$opt} if (defined($options{$opt}));
  }

  # add source, destination, and previous
  if (defined($previous)) {
    $previous =~ s{'}{\\'}g;
    $command .= " --link-dest=" . shellQuote($previous);
  }

  # tell rsync that include/exclude patterns will be piped to STDIN
  $command .= " --exclude-from=-";

  $command .= " " . shellQuote($source);

  $command .= " " . shellQuote($dest);

  # create a tempfile name:
  my $tempfile =
    '/tmp/rsync_snapshot_'. strftime("%s", localtime) . '_' . $$;

  debug("Executing sender command '$command' (results to '$tempfile')\n", 1);

  # run it, and return output
  open (RSYNC, "| $command >$tempfile 2>&1")
    or logDie("Could not open pipe to rsync command:\n$command\nReason: $!\n");

  # pipe in the includes and excludes
  foreach my $exclude (@excludes) {
    if ($exclude =~ /^[+-] /) {
      print RSYNC "$exclude\n";
    }
    else {
      logWarn("Bad exclude: '$exclude'\n");
    }
  }

  my $status = 0;

  unless (close(RSYNC)) {
    if ($!) {
      logWarn("$command (@excludes) syscall error: $!");
    }
    elsif ($? > 5000) {
      # RsyncX returns errors of around 5888 (and Apple's of 6000-ish)
      # for chown errors on symlinks.  This isn't serious (you can't
      # own a link), so we ignore these.
    }
    else {
      $status = $?;
    }
  }

  open(RESULTS, $tempfile)
    or logWarn("Could not read rsync tempfile '$tempfile': $!\n");

  my $results = "";

  while (my $result = <RESULTS>) {
    # Ignore lines that don't start with a '/source/path'
    # Ignore lines that end with a '._filename' (resource fork) [rsync-ea]
    if (@LISTENERS && $result =~ /^$source/ && $result !~ m~\._[^/]+$~) {
      print LISTENLOG $result;
    }
    elsif ($result =~ /: \d/) {
      $results .= $result;
    }
  }

  # if there are listeners, signal them now to read their data
  if (@listenerPIDs) {
    kill('USR1', @listenerPIDs);
  }

  close(RESULTS);

  unlink $tempfile;

  # Now that we have command output, print any errors of executtion
  if ($status != 0) {
    logWarn("$command (@excludes) exited with non-zero status '$?'\n");
    logWarn("Output: $results\n");
  }

  parseRsyncOutput($results, $output);

  return $results;

}				# end sub rsyncSender


=head2 rsyncReceiver(I<$dest>, I<$previous>, I<\%options>) [returns I<void>]

Runs the rsync binary in "server" mode, which allows it to receive transfers
initiated from another host.  Normally, rsync uses a remote shell to set
up the receving end automatically.  However, because we want to rotate
directories and set up hard links, we must "intercept" this process, perform
any housekeeping, and then reconstruct the correct rsync call ourselves.

Because this version simply receives an existing connection, many options
are not specified (for example, the source).  Also, no output is produced,
as this will interfere with the communication with the sending side.  This
version is simply supposed to run a dumb version of rsync that receives and
sends info via STDIN/OUT.  Thus, it does not produce any readble output.

Finally, this version uses a system() call to invoke the program, so the
input and output file descripters get hooked up correctly.

If an error occurs, the result is logged.

=cut
sub rsyncReceiver($$$) {

  my $dest = shift(@_);
  my $previous = shift(@_);
  my %options = %{shift(@_)};

  # location and mandatory args for rsync binary
  my $command = $RSYNC_COMMAND;

  # add arguments necessary for the receiving (server) form of rsync
  $options{'--server'} = undef;

  # build list of exclude arguments
  foreach my $opt (keys %options) {
    $command .= " " . $opt;
    $command .= " " . $options{$opt} if (defined($options{$opt}));
  }

  # add source, destination, and previous
  if (defined($previous)) {
    $previous =~ s{'}{\\'}g;
    $command .= " --link-dest=" . shellQuote($previous);
  }

  # The source for a --server side transfer is always "."
  $command .= " .";

  $command .= " " . shellQuote($dest);

  debug("Executing receiver command '$command'\n", 1);

  # run it, and return output (do NOT redirect output; if you do,
  #                            be sure to use a construct other than 'system')
  my $result = system("$command");
  my $signal = $? & 127;
  my $status = $? >> 8;

  if ($result == -1) {
    logWarn("Could not run rsync server: $!");
  }
  elsif ($signal) {
    logWarn("Rsync receiver died with signal $signal\n");
  }
  elsif ($status) {
    logWarn("Rsync receiver exited with value $status\n");
  }

}				# end sub rsyncReceiver


=head2 findLeafDirs(I<$depth>, I<$current>, I<$root>, I<$path>, I<\@list>, I<\@excludes>]) [returns I<void>]

Finds all of the directories at a particular depth, starting from the
given root.  Use by the splitRsync() function to divide up the
transfer among different directories.  The returned paths will be relative
to (but not including) the root path.

The $depth is the level you consider to contain the leaves.  $current
is the level you're at right now (should be zero to start).  This method
is recursive, so it uses the $current value to keep track of how deep
it is right now.

$root is the path to start searching at.  $current contains the current
path relative to the root that we're investigating (should be "" to
start).

The list is a refernce to an array of leaf directories.  This gets passed
along on the recursive calls (hence the reference instead of a full list).
The function returns nothing, as it modifies the list in-place.

The excludes directory should be a list of compiled regular
expressions (perhaps generated by rsyncRegexps() above) to exclude
from consideration.

=cut
sub findLeafDirs($$$$$$;) {

  my $depth = shift(@_);
  my $current = shift(@_);
  my $root = shift(@_);
  my $path = shift(@_);
  my $list = shift(@_);
  my $excludes = shift(@_);


  debug("Searching for leaves in '$root$path'\n", 6);

  opendir(DIR, "$root$path") or logWarn("Can't opendir '$root$path': $!\n");

  foreach my $entry (readdir(DIR)) {

    # skip special entries '.' and '..'
    if ('.' eq $entry || '..' eq $entry) {
      # just short-circuit if it's a special dir
      next;
    }

    if ( -d "$root$path/$entry" &&
	 !excluded("$root$path/$entry/", $excludes)) {

      if ($current < $depth) {
	# not at the right level yet; find dirs and call recursively
	# to get to the next level

	# old-school function call to prevent prototype check
	# by perl on the recursive call
	&findLeafDirs($depth, ($current + 1),
		      "$root", "$path/$entry",
		      $list, $excludes);
      }
      else {
	# we're at the correct level, so save this directory
	debug("Adding leaf dir '$path/$entry' at level $current\n", 6);

	push(@$list, "$path/$entry");
      }
    }
  }

  closedir(DIR);

}				# end sub findLeafDirs


=head2 specificRsync(I<$files>, I<$dest>, I<\%options>, I<\%output>) [returns I<$rsyncOutput>]

Similar to "rsyncSender" above, but takes an explicit list of files to
sync instead of recursing through subdirectories.

See "rsyncSender" comments for information about options and output.

=cut
sub specificRsync($$$$) {

  my $files = shift(@_);
  my $dest = shift(@_);
  my %options = %{shift(@_)};
  my $output = shift(@_);

  # location and mandatory args for rsync binary
  my $command = $RSYNC_COMMAND;

  # Add --stats to rsync call to get statistic information
  $options{'--stats'} = undef;

  # add remote host information, if necessary
  if (defined($SSH_HOST)) {
    $dest = $SSH_USER . '@' . $SSH_HOST . ':' . $dest;

    $SSH_KEY =~ s{'}{\\'}g;
    $options{'-e'} = shellQuote("ssh -i $SSH_KEY");
  }

  # build command-line version of options
  foreach my $opt (keys %options) {
    $command .= " " . $opt;
    $command .= " " . $options{$opt} if (defined($options{$opt}));
  }

  # tell rsync that files to sync will be provided on STDIN
  $command .= " --files-from=-";

  # use "/" as source; we'll be providing full pathnames on STDIN
  $command .= " /";

  $command .= " " . shellQuote($dest);

  # create a tempfile name:
  my $tempfile =
    '/tmp/rsync_snapshot_'. strftime("%s", localtime) . '_' . $$;

  # we write our own explicit fork here, so that the main thread can
  # catch signals, and the child can execute rsync in a long-running process
  # without worrying about being interrupted.
  my $pid = fork;
  if ($pid == 0) { # child of fork
    # run it, and return output
    open (RSYNC, "| $command >$tempfile 2>&1") 
     or logDie("Could not open pipe to rsync command:\n$command\nReason: $!\n");

    # dump in the requested file names
    print RSYNC "$files";

    my $status = 0;

    unless (close(RSYNC)) {
      if ($!) {
	logWarn("$command syscall error: $!");
      }
      elsif ($? > 5000) {
	# RsyncX returns errors of around 5888 (and Apple's of 6000-ish)
	# for chown errors on symlinks.  This isn't serious (you can't
	# own a link), so we ignore these.
      }
      else {
	$status = $?;
      }

      # Now that we have command output, print any errors of executtion
      if ($status != 0) {
	logWarn("$command exited with non-zero status '$?'\n");
#	logWarn("Output: $results\n");
      }

    }
    exit; # terminate this thread once the job is done
  }
  else { # parent of fork
    waitpid($pid, 0);
  }

  open(RESULTS, $tempfile)
    or logWarn("Could not read rsync tempfile '$tempfile': $!\n");

  my $results = join("", <RESULTS>);

  close(RESULTS);

  unlink $tempfile;

  parseRsyncOutput($results, $output);

  return $results;

}				# end sub specificRsync


=head2 piggybackRsync(I<$conffile>, I<$filelist>) [returns I<void>]

Implements a listener which "piggybacks" on top of a running backup,
backing up changed files to a separate location.  A configuration file
for the alternate location must be provided.

This method is usually invoked by creating a LISTENER with the first
array value of "piggyback" and the second array value containing the path
to the configuration file.

As this is written as a LISTENER, it expects the following signals:

=over 2

=item SIGUSR1

Received when the list of files to sync has been appended to.

=item SIGUSR2

Received when the list of files has closed, and there are no more files
to consider (in other words, time for one last sync and exit).

=back

=cut
sub piggybackRsync(;$$) {

  my $conffile = shift(@_);
  my $filelist = shift(@_);

  # loop termination occurs on a signal
  my $running = 1;
  my $pending = 0;

  # install the signal handlers
  $SIG{USR1} = sub { $pending++; };
  $SIG{USR2} = sub { $running = 0; };

  logDie("Couldn't parse $conffile") unless (parseConfig($conffile));

  debug("Starting piggyback backup for $CONFIG_DESCRIPTION\n", 0);

  # run the rsync command
  my %results = ();

  open(FILES, $filelist) or logDie("Could not open $filelist: $!\n");

  do {

    # wait around for a signal to arrive
    while ($pending == 0 && $running == 1) {
      my $time = time;
      sleep 60;
      $time = time - $time;
    }

    # reset the pending count
    $pending = 0;

    # reset the filehandle to try to read more
    seek(FILES, 0, 1);

    # read more data and act on it
    my $files = join("", <FILES>);
    if ($files ne '') {
      specificRsync($files, "$BACKUP_ROOT/$DEST", \%RSYNC_OPTIONS, \%results);
    }

    # keep looping until we're signalled to stop
  } until ($running == 0 && $pending == 0);

  close(FILES);

  debug("Finished piggyback backup for $CONFIG_DESCRIPTION\n", 0);

  printResults(%results);

}


=head2 splitRsync(I<$levels>, I<\@sources>, I<$dest>, I<$previous>, I<\%options>, I<\%output>, [I<@excludes>]) [returns I<void>]

A special version of the rsync() function.  The result of the call (and most
of the parameters) are exactly the same as the rsync() method described
above.

This version of rsync attempts to work around the problem that rsync has
with extremely large file trees.  Currently, rsync holds its entire file
list in memory, which can get a little ugly for large trees.  Additionally,
the memory situation only gets worse when you use features like hard links
(which we do).

This function takes an additional parameter as its first parameter ($levels)
which breaks up the rsync process into two phases.  The first phase
syncs the root level down through $levels in the hierarchy.  The method
then visits each leaf directory that is $level deep in the filesystem and
calls rsync on it individually.

The split happens at $level; if $level is 0 then nothing happens in phase 1,
and everything gets synched in phase 2.  If $level is 2, then the first two
levels (0 and 1) get synced in phase 1, and everything else gets synced
in phase 2.

If there is a uniform distribution of directories at each level, an
order of magnitude reduction in the size of the file list is achieved
for each level deeper you sync.

We recommend using a level of 2, though you should tweak this value to match
the layout of your filesystem.

=cut
sub splitRsync($$$$$$;@) {

  my $level = shift(@_);
  my $source = shift(@_);
  my $dest = shift(@_);
  my $previous = shift(@_);
  my $options = shift(@_);
  my $output = shift(@_);

  # create a special exclude to stop syncing beyond the depth
  # level requested
  my $levelExclude = '- ' . $source . ('/*' x $level) . '/*/';

  # log changed files to an external file if requested
  if (@LISTENERS) {
    my $listenerLog = `mktemp /tmp/rsync_listeners.XXXXXXXXXXXX`;
    chomp($listenerLog);
    open(LISTENLOG, ">$listenerLog")
      or logWarn("Could not create listener log file '$listenerLog': $!\n");

    @listenerPIDs = ();

    # initialize listener children
    foreach my $i (0 .. $#LISTENERS) {
      my $pid = fork;
      if (defined($pid)) {
	if ($pid == 0) { # child of fork

	  my @exec = @{$LISTENERS[$i]};

	  # push on the path to the file list
	  push(@exec, $listenerLog);

	  if ($exec[0] eq 'piggyback') {
	    piggybackRsync($exec[1], $exec[2]);
	  }
	  else {
	    # disconnect from controlling TTY to make for a better daemon
#	    chdir '/' or logWarn("Can't chdir to /: $!");
#	    open STDIN, '/dev/null' or logWarn("Can't read /dev/null: $!");
#	    open STDOUT, '>/dev/null' or logWarn("Can't write to /dev/null: $!");
#	    setpgrp(0,0) or logWarn("Can't start a new session: $!");
#	    open STDERR, '>&STDOUT' or logWarn("Can't dup stdout: $!");

	    exec(@exec);
	  }

	  exit; # terminate this thread once the job is done
	}
	else { # parent of fork
	  push(@listenerPIDs, $pid);
	}
      }
      else {
	logDie("Fork does not appear to work on this system!\n");
      }
    }
  }

  # now syncronize just to that level
  debug("Calling root rsync on $source, excluding at $levelExclude\n", 2);
  rsyncSender($source, $dest, $previous, $options, $output, ($levelExclude, @_));

  # if $source is a directory, recurse into the subdirectories and sync
  if (-d "$source") {

    # get an array of leaf directories at the given level
    my @leaves = ();
    findLeafDirs($level, 0, $source, "", \@leaves, rsyncRegexps(@_));

    # finally, for each leaf dir, run rsync with the new root
    foreach my $leaf (@leaves) {

      debug("Calling rsync on $source$leaf\n", 2);

      # note that we order the stock excludes in front of
      # the targeted ones.  This way, the stock excludes
      # will "win" over the targeted ones correctly.
      rsyncSender("$source$leaf", "$dest", $previous, $options, $output, @_);

    }
  }

  # close listener file if necessary
  if (@LISTENERS) {
    close(LISTENLOG);

    debug("Waiting for listeners to finish...\n", 0);

    sleep 1;

    # signal the children that it's time to stop
    kill('USR2', @listenerPIDs);

    # wait for them to die off nicely
    while (@listenerPIDs) {
      my $listener = shift(@listenerPIDs);
      waitpid($listener, 0);
    }

    debug("Listeners complete\n", 0);

  }

}				# end sub splitRsync


=head2 rotateDirectories() [returns I<previousBackup>]

Creates the destination directory for the current backup, and points the
"current" symlink to the new directory (if necessary).

If the current symlink points to a different backup than the current one,
it is updated to point to the current directory, and the "previous" symlink
is pointed to the old "current".  In this way, we can always track the last
full backup path for the purposes of making hard links.

In every case, the value to hard link against (the previous backup) is
returned.  Calling functions can use this as the target of rsync's hard
link parameters.

=cut
sub rotateDirectories() {

  my $previous = readlink("$BACKUP_ROOT/$PREVIOUS_LINK");
  my $current = readlink("$BACKUP_ROOT/$CURRENT_LINK");
  my $now = "$BACKUP_ROOT/$DEST";

  # if there are no symlinks, go ahead and create everything from scratch
  if (!defined($current)) {
    logWarn("No link to existing backups; STARTING FROM SCRATCH.\n");
    mkdir($now) or logWarn("Can't create new backup directory '$now':\n\t$!\n");
    symlink("$now", "$BACKUP_ROOT/$CURRENT_LINK");
    $current = readlink("$BACKUP_ROOT/$CURRENT_LINK");
  }

  if (!defined($previous)) {
    # Warn the user if there's no previous copy to symlink against
    if ($current eq $now) {
      logWarn("No previous backup defined; UPDATING CURRENT COPY\n");
    }
    symlink("$current", "$BACKUP_ROOT/$PREVIOUS_LINK");
    $previous = readlink("$BACKUP_ROOT/$PREVIOUS_LINK");
  }

  # if "current" agrees with our desired destination, we're all set;
  # just return previous
  # otherwise, "current" is out of date, so repoint the symlinks
  if ($current ne $now) {

    logWarn("Creating new snapshot directory: $now\n");
    mkdir($now) or logWarn("Can't create new backup directory '$now':\n\t$!\n");

    # point "previous" to "current"
    unlink("$BACKUP_ROOT/$PREVIOUS_LINK");
    symlink("$current", "$BACKUP_ROOT/$PREVIOUS_LINK");
    $previous = readlink("$BACKUP_ROOT/$PREVIOUS_LINK");

    # point "current" to "now"
    unlink("$BACKUP_ROOT/$CURRENT_LINK");
    symlink("$now", "$BACKUP_ROOT/$CURRENT_LINK");
    $current = readlink("$BACKUP_ROOT/$CURRENT_LINK");

  }

  return $previous;

}				# end sub rotateDirectories


=head2 parseConfig(I<$file>) [returns I<$failure>]

Attempts to parse the given configuration file.  On success, the routine
returns 1, otherwise 0 is returned.

=cut
sub parseConfig(;$) {

  my $config = shift(@_);

  unless ( -f $config && -R $config ) {
    logWarn("Could not read config file '$config'\n");
    return 0;
  }

  debug("Starting parse of config file $config.\n", 2);

  my $conf = "";

  # read the file in
  if (open(FILE, $config)) {
    $conf = join("", <FILE>);
    close(FILE);
  }
  else {
    logWarn("Could not read config file $config: $!\n");
    return 0;
  }

  # re-initialize the globals
  initGlobals();

  # and eval() to get any settings
  unless(defined(eval($conf))) {
    logWarn("Could not parse config file '$config':\n\t$@\n");
    return 0;
  }

  unless(defined($CONFIG_DESCRIPTION)) {
    logWarn("Cannot use config file '$config'; it did not override\n"
      . "the \$CONFIG_DESCRIPTION variable.  Please edit the config\n"
	. "file and correct this.\n");
    return 0;
  }

  debug("Finished parse of config file $config.\n", 2);

  return 1;

} # end sub parseConfig


=head2 printResults(I<%results>) [returns I<void>]

Prints the cumulative results of the rsync calls in a format mimicing
rsync's own statistics output.

=cut
sub printResults(%) {

  my %results = @_;

  print "\nResults for $CONFIG_DESCRIPTION:\n\n";

  print "\t      Files Considered: " . $results{'FILES_TOTAL'} . "\n";
  print "\t     Files Transferred: " . $results{'FILES_SENT'} . "\n";
  print "\t      Bytes Considered: " . autoSuffix($results{'SIZE_TOTAL'}) . "\n";
  print "\t     Bytes Transferred: " . autoSuffix($results{'SIZE_SENT'}) . "\n";
  print "\t  Total file list size: " .
    $results{'FILE_LIST'} . " (avgerage " .
      int($results{'FILE_LIST'} / $results{'RESULT_COUNT'}) . ")\n";
  print "\t Number of rsync calls: " . $results{'RESULT_COUNT'} . "\n";

  $results{'SIZE_SENT'}++ if ($results{'SIZE_SENT'} < 1);

  print "\tTransfer Speedup Ratio: " .
    ($results{'SIZE_TOTAL'} / $results{'SIZE_SENT'}) . "\n";
  print "\n";

} # end sub printResults


# Always return true at the end of package files
1;

=head1 SEE ALSO

Full documentation for this package is located on the web:

L<http://web.suffieldacademy.org/ils/netadmin/docs/software/rsync_snapshot/>

=head1 AUTHOR

Jason Healy, Suffield Academy

=head1 COPYRIGHT AND LICENSE

Copyright 2006 by Jason Healy

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut
