#!/usr/bin/env perl

=head1 NAME

users_over_quota

=head1 SYNOPSIS

  users_over_quota [-w] </homedirs/base> [/homedirs/more] ... [/homedirs/last]

=head1 ABSTRACT

Takes a base directory containing user home directories on a system
and produces a list of rsync exclude filters as output to prevent
backing up files over quota.  Optionally, it will e-mail users to warn
them if they are over quota.

=head1 DESCRIPTION

The C<-w> flag specifies whether to warn users via e-mail if they are
over quota.  Otherwise, all other arguments are interpreted as base
dirs containing user home directories.  For example, C</Users> and
C</Volumes/raid/Users> might be used as the paths that contain user
home directories.

As output, the script produces a list of rsync filters that exclude
any files that put the user over quota.  For example, if user "jbogus"
has a "Movies" folder in his home directory that puts him over quota,
the script would output

  - /path/to/homes/jbogus/Movies/

So that rsync will ignore this directory.  The script works on
I<first-level> directories inside the home directory only, so if a
single large file is embedded deep inside a folder, the entire
folder's contents will be excluded.

Directories are excluded starting with the largest, then the next
largest, etc. until the total size of the non-excluded directories is
under quota.

Apple's "dscl" utility is used to look up home directories to find the
user quota information.  If a home directory is not found in the
directory system, no excludes are generated (that is, the directory
will be backed up in full).  If you still wish to enforce a maximum
size for directories that do not have an owner, set the
"DEFAULT_QUOTA" variable.

=cut

# Report errors well
use strict;
use warnings;

# deliver warnings via e-mail
use Net::SMTP;


=head1 CONFIGURATION VARIABLES

=cut

my $WARN = 0;

my $SMTP = Net::SMTP->new('smtp.suffieldacademy.org');

my $EMAIL_FROM = 'netadmin@suffieldacademy.org';
my $EMAIL_FROM_NAME = 'Network Administrator';

my $EMAIL_TO_DEFAULT = 'netadmin@suffieldacademy.org';

# 1GiB
my $DEFAULT_QUOTA = 1073741824;

my $NFS_PREFIX = '/Network/Servers/tessa.suffieldacademy.org';

# Any path matching any of the regular expressions in this array will
# NOT be counted towards a user's quota
my @IGNORE =
  (
   qr'^/raid/all/Users/[^/]+/Library/Caches',
   qr'^/raid/all/Users/[^/]+/.Trash'
);

# Any directory containing a hidden file with this name will be ignored
my $DOT_EXCLUDE = '.rsync_snapshot_exclude';


=head1 METHODS


=head2 getUserInfo(C<$homes>, C<$dir>) [returns I<\%userinfo>]

Given the base home directory path and the relative user directory,
attempts to find the owner of the directory and then queries the
directory service for information about the user (such as quota size
and e-mail address).  If information cannot be obtained, a hash with
the special key "ERROR" will be returned, and a reason (if any)
specified as the value to that key.

If the C<NFS_PREFIX> variable is specified, this method attempts to
find the owner by searching the directory service for the user's home
directory key.  Otherwise, we assume that the name of the directory
exactly matches the name of the user, and we query the directory
service with this value.

=over 2

=item C<homes> (I<string> [B<required])

The base directory that contains the given relative directory.

=item C<dir> (I<string> [B<required])

The relative directory name of the user's home directory.

=back

=cut
sub getUserInfo($$;) {

  my $homes = shift(@_);
  my $dir = shift(@_);
  my %attr = ();
  my $user = undef;

  if (defined($NFS_PREFIX)) {
    # try to query the DS for the given directory
    if (open(DSCL, '-|', '/usr/bin/dscl', '-q', '/Search', '-search',
	     '/Users', 'homeDirectory', "$NFS_PREFIX$homes/$dir")) {

      my $result = <DSCL>;

      close(DSCL) or $attr{ERROR} = "Could not close pipe to dscl: $!";

      if (defined($result) && $result =~ /^(\S*)/) {
	$user = $1;
      }
      else {
	$attr{ERROR} = "Unable to find user with home directory $NFS_PREFIX$homes/$dir: $!";
      }

    }
    else {
      $attr{ERROR} = "Unable to open dscl: $!";
    }
  }
  else {
    # assume the username is the same as the dir name
    $user = $dir;
  }

  return \%attr if ($attr{ERROR});

  if (open(DSCL, '-|', '/usr/bin/dscl', '-q', '/Search', '-read', "/Users/$user",
	   'FirstName', 'EMailAddress', 'HomeDirectoryQuota')) {

    my $k = undef;
    my $v = undef;

    while (my $line = <DSCL>) {
      chomp $line;

      if ($line =~ /^(\S+): (.+)$/) {
	$attr{$1} = $2;
	$k = undef;
      }
      elsif ($line =~ /^(\S+):$/) {
	$k = $1;
	$v = undef;
      }
      elsif (defined($k) && $line =~ /^ (.*)$/) {
	$v = $attr{$k} . "\n" if (exists($attr{$k}));
	$v .= $1;
	$attr{$k} = $v;
	$v = undef;
      }
      elsif ($line =~ /^No such key: (.*)$/) {
	my $key = $1;
	if ($key eq 'FirstName') {
#	  print STDERR "No $key key for $user: substituting default\n";
	  $attr{FirstName} = "Unknown";
	}
	elsif ($key eq 'HomeDirectoryQuota') {
#	  print STDERR "No $key key for $user: substituting default\n";
	  $attr{HomeDirectoryQuota} = $DEFAULT_QUOTA;
	}
	elsif ($key eq 'EMailAddress') {
#	  print STDERR "No $key key for $user: substituting default\n";
	  $attr{EMailAddress} = $EMAIL_TO_DEFAULT;
	}
      }
      else {
	$attr{ERROR} = "Could not decode dscl line '$line'";
      }

    }

    close(DSCL) or $attr{ERROR} = "Could not close pipe to dscl: $!";
  }
  else {
    $attr{ERROR} = "Could not open pipe to dscl: $!";
  }

  return \%attr;

} # end sub getUserInfo


=head2 du(I<$path>) [returns I<$totalBytes>]

Given a filesystem path, this function returns the total number of bytes
occupied by the filesystem path and any enclosed subdirectories.

The path should be a fully-qualified filesystem path (beginning with a
forward slash).

The function traverses the directory starting at root path provided,
and follows child directories.  Symbolic links and special files are
NOT counted or followed.

Any paths that match patterns in the @IGNORE array are not counted,
nor are they recursively searched.

Any directories that contain a file named $DOT_EXCLUDE are also not
counted, nor are they recursively searched.  Additionally, an rsync
exclude pattern for that directory is emitted by the program.

The total number of bytes for the files and directories is returned
from the function.  Byte total is an exact count of all files, which
may be lower than the actual amount of disk space used (due to block
rounding).

=cut
sub du($$;$) {

  my $base = shift(@_);
  my $path = shift(@_);

  my $emit = shift(@_);
  $emit = 0 unless defined($emit);

  # shortcut so we don't have to write this a dozen times
  my $root = "$base/$path";

  my $total = 0;

  my @stat = stat($root);

  if (-f _ || -l $root) {
    $total = -s _;
  }
  elsif (-d _) {

    $total = -s _;

    # Ignore if this directory shouldn't be counted
    for my $ignore (@IGNORE) {
      if ($root =~ $ignore) {
#	print "du Ignoring '$path'\n";
	return 0;
      }
    }

    # Exclude if this directory is marked to not be backed up
    if (-f "$root/$DOT_EXCLUDE") {
#	print "du excluding '$path'\n";
      print "- /$path\n" if $emit;
      return 0;
    }

    # recurse into subdirs and count totals
    # (note local indirect fh to prevent namespace collisions in recursion)
    opendir(my $subdir, $root) or warn "Can't opendir '$root': $!\n";

    for my $entry (readdir($subdir)) {

      # skip special entries '.' and '..'
      if ('.' eq $entry || '..' eq $entry) {
	# just short-circuit if it's a special dir
	next;
      }

      # use non-prototype call to prevent warnings on recursive call
      $total += &du($base, "$path/$entry", $emit);
    }

    closedir($subdir);

  }

  return $total;

} # end sub du


=head2 duDepth(I<$path>, I<$levels>) [returns I<\@sizes>]

Given a filesystem path, this function returns the total number of bytes
for each file or directory that is I<$level> deep in the filesystem.  Files
that are deeper than this are counted in the totals of their parents.

For example, given the following directory structure:

  root/
    SomeFile.txt
    Movies/
      MyBigMovie.mov
    Pictures/
      Vacation/
        One.jpg
        Two.jpg

Assuming the path "/root" is passed to this function:

A level of zero would give the total for the entire "root"

A level of 1 would give totals for "SomeFile.txt", "Movies" (and all
its child files), and "Pictures" (and all it's child files).

A level of 2 would give totals for "SomeFile.txt" (even though it's
only one level deep, it has no children and so is counted),
"MyBigMovie.mov", and "Vacation"

For an infinite depth, use -1.  Note that this may consume a lot of
memory, as a record will be kept for every file in the directory
structure!

The function returns a 2-dimensional array, sorted by file size
(largest is at index zero).  The dimensions are the file size, and
then the path of the file.

=cut
sub duDepth($$$;$) {

  my $base = shift(@_);
  my $path = shift(@_);
  my $level = shift(@_);
  my $sizes = shift(@_) || [];

  my $root = "$base/$path";

  # Ignore if this directory shouldn't be counted
  for my $ignore (@IGNORE) {
    if ($root =~ $ignore) {
#      print "Depth Ignoring '$path'\n";
      return $sizes;
    }
  }

  # Exclude if this directory is marked to not be backed up
  if (-d $root && -f "$root/$DOT_EXCLUDE") {
#    print "Depth Excluding '$path'\n";
    return $sizes;
  }
  
  if ($level == 0 || -l "$root" || ! -d "$root") {
    # leaf dir, or plain file, so add the accumulated size here
    my $size = du($base, $path, 0);

    # perform an insertion at the proper place in the list
    my $insert = $#{$sizes} + 1;
    while ($insert > 0) {
      if ($size > $sizes->[$insert-1][0]) {
	# my size is bigger than the one at this spot in the list,
	# so ripple it down
	$sizes->[$insert] = $sizes->[$insert-1];
      }
      else {
	# this spot is where I belong; the next spot in the list
	# has a larger item than me
	last;
      }
      $insert--;
    }

    $sizes->[$insert] = [$size, $path];
  }
  else {
    # set up for the recursive call
    $level-- unless $level < 0;

    opendir(my $subdir, "$root") or warn "Can't opendir '$root': $!\n";

    for my $entry (readdir($subdir)) {

      # skip special entries '.' and '..'
      if ('.' eq $entry || '..' eq $entry) {
	# just short-circuit if it's a special dir
	next;
      }

      # use non-prototype call to prevent warnings on recursive call
      &duDepth($base, "$path/$entry", $level, $sizes);
    }

    closedir($subdir);
  }

  return $sizes;

} # end sub duDepth



=head2 format_bytes(C<$bytes>, [C<$fmt>]) [returns I<string>]

Format the given number of bytes into a more human-comprehensible number.

=over 2

=item C<$bytes> (I<integer> [B<required>])

Should be a non-negative integer value.  Decimal values will be
truncated, and other values interpreted as zero.

=item C<$fmt> (I<string> [B<required>])

Alternate formatting string to use.  Must have two placeholders;
the first is for the number, and the second is for the suffix.

=back

=cut
my @BYTE_SUFFIX = qw(B kB MB GB TB PB EB ZB YB);
sub format_bytes($;$) {

  my $b = int(shift(@_) || 0);
  my $fmt = shift(@_) || '%3.2f %-2s';
  my $s = '';

  for my $i (0 .. $#BYTE_SUFFIX) {
    if ($b < 1024) {
      $s = $BYTE_SUFFIX[$i];
      last;
    }
    else {
      $b = $b/1024;
    }
  }

  return sprintf($fmt, $b, $s);
} # end sub format_bytes


=head2 sendWarning(C<>) [returns I<void>]

Sends an e-mail message to the specified user if the C<-w> flag has been set.

=over 2

=item C<$to> (I<string> [B<required>])

Address to send the e-mail to

=item C<$name> (I<string> [B<required>])

Name of the person to send the e-mail to

=item C<$quota> (I<int> [B<required>])

Number of bytes in this user's quota.

=item C<$over> (I<int> [B<required>])

Number of bytes over the quota that the user has consumed.

=item C<\@sizes> (I<array of sizes and paths> [B<required>])

2D array of paths and their sizes, sorted from largest to smallest, as
produced by the C<duDepth()> function.

=item C<$overidx> (I<int> [B<required>])

The index of the last item in the sizes array that is over quota.  This
function will report on all items in sizes from 0 .. $overidx (inclusive).

=back

=cut
sub sendWarning($$$$$$;) {

  my $to = shift(@_);
  my $name = shift(@_);
  my $quota = format_bytes(shift(@_));
  my $over = format_bytes(shift(@_));
  my $sizes = shift(@_);
  my $overidx = shift(@_);

  # don't send e-mail unless the -w flag was specified
  return unless ($WARN > 0);

  my $message = <<EOINTRO
Dear $name ($to),

You are receiving this message because you have a large amount of data
stored in your file server account.  We will store and back up $quota
of data, but you have exceeded this amount by $over.

At this time, we have not done anything with your files on the server.
However, please be aware that we have not backed up files that take you
over your limit of $quota (see below for a full list).  Should anything
happen to the original files on the server, they will not be backed up
by us.

Please review the list of files below.  If they are not school-related
(TV shows, music, personal photos, etc.), please remove them from the
server.  If they are for a school project or function, you may continue
to keep them on the server.  If you anticipate needing to store them for
a long time, please reply to this message and let us know so we can
increase your disk space quota.

If you have any questions, please reply to this message.

Thank you,

The Technology Department


The following files are NOT being backed up by our automatic backup
procedure.  However, they will remain on the server until you remove them.
The largest files and folders are listed first.


EOINTRO
;

  for my $i (0 .. $overidx) {
    $message .= sprintf('  %9s  %-65s%s',
			format_bytes($sizes->[$i][0]),
			substr($sizes->[$i][1], 0, 65),
		       "\n");
  }

  $message .= <<EOCLOSE

Again, if you have any questions, please reply directly to this message.
EOCLOSE
;

  $SMTP->mail($EMAIL_FROM);
#  $SMTP->to($to);
  $SMTP->to('jhealy@suffieldacademy.org');

  $SMTP->data();

#  $SMTP->datasend("To: $name <$to>\n");
  $SMTP->datasend("To: Jason Healy <jhealy\@suffieldacademy.org>\n");

  $SMTP->datasend("From: $EMAIL_FROM_NAME <$EMAIL_FROM>\n");
  $SMTP->datasend("Subject: Fileserver Usage Warning\n");
  $SMTP->datasend("\n");

  $SMTP->datasend($message);

  $SMTP->dataend();


} # end sub sendWarning



############################################################################
###                             MAIN                                     ###
############################################################################

if ($#ARGV > -1 && $ARGV[0] eq '-w') {
  shift(@ARGV);
  $WARN = 1;
}

if ($#ARGV < 0) {
  print STDERR "Usage: users_over_quota [-w] </path/to/home/dirs>\n\n";
  exit 1;
}

for my $HOMES (@ARGV) {

# strip trailing slash(es), if any
  $HOMES =~ s#/*$##;

  opendir(HOMES, $HOMES) or die "Could not open homes base dir '$HOMES': $!";

  # ignore "hidden" dirs and plain files
  my @dirs = grep { /^[^.]/ && -d "$HOMES/$_" } readdir(HOMES);
#  my @dirs = grep { /^jhealy/ } readdir(HOMES);

  closedir(HOMES) or die "Could not close home dir handle: $!";

  for my $dir (@dirs) {

   my %attr = %{getUserInfo($HOMES, $dir)};

    if ($attr{ERROR}) {
      die "Could not get user info for dir '$dir': $attr{ERROR}\n";
    }
    else {

      my $over = du($HOMES, $dir, 1) - $attr{HomeDirectoryQuota};

      if ($over > 0) {
	# user is over quota; find largest items
	my $sizes = duDepth($HOMES, $dir, 2);

	my $overleft = $over;
	my $overidx = -1;

	while ($overleft > 0) {
	  $overidx++;
	  $overleft -= $sizes->[$overidx][0];
	}

	# at this point, everything in $sizes from 0 to $overidx (inclusive)
	# should be excluded

#	print "# $dir is $over (" . format_bytes($over) . ") over quota\n";

	for my $i (0 .. $overidx) {
	  print "- /" . $sizes->[$i][1] . "\n";
	}

	sendWarning($attr{EMailAddress}, $attr{FirstName},
		    $attr{HomeDirectoryQuota}, $over, $sizes, $overidx);

      }
      else {
#	print "# $dir is $over (-" . format_bytes(-$over) . ") under quota\n";
      }

    }

  }
}


=head1 SEE ALSO

L<http:E<sol>E<sol>web.suffieldacademy.orgE<sol>ilsE<sol>netadminE<sol>softwareE<sol>rsync_snapshotE<sol>>

=head1 AUTHOR

Jason Healy, Suffield Academy

=head1 COPYRIGHT AND LICENSE

Copyright 2009 by Jason Healy

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut
