#!/bin/bash

###
### # $Id: rsync_snapshot_fcns 1756 2013-03-15 17:41:10Z jhealy $
###
### This script runs from the OpenVZ master Hardware Node.  It sends a
### command to FirstClass running in the VZ container to "hold" the server
### (flush to disk), and then uses LVM2 to snapshot the logical volume
### holding the FCNS.
### 
### Once snapshotted, the script mounts the snapshot volume, backs it up
### using rsync, and then destroys the snapshot when finished.
###


# Password for the rsync snapshot module.  It is exported so that the rsync
# child process can pick it up from the RSYNC_PASSWORD environment variable.
# The value below is a placeholder and must be replaced before use.
RSYNC_PASSWORD='FIXME: put real password here'
export RSYNC_PASSWORD

# OpenVZ container ID of the machine running FirstClass
CTID='30115'

# Volume group device where the mailstore (and its snapshot) live
VG='/dev/vg_mailstore'

# Name of the logical volume to snapshot
MASTER='lv_mailstore'

# Name of the snapshot logical volume to create
SNAPSHOT='rsync_snapshot_mailstore'

# Amount of space to reserve for the snapshot, i.e. how much change the
# snapshot must be able to absorb before the backup finishes.  Despite the
# name, this is a size and is handed to "lvcreate -L".
SNAPSHOT_EXTENTS='10G'

# FirstClass accepts a "hold" command that makes it flush everything to
# disk and then wait, so that a consistent snapshot can be taken.  The
# command takes a duration in seconds, after which the hold expires by
# itself.  The duration has to be short enough that users do not notice
# the disruption, yet long enough for the command to return and for the
# snapshot to be created.
#
# Kanon (Ben) said the hold command can take up to 17 seconds to return,
# and that delay counts against the total hold time.  We have not seen
# this ourselves, but it is built in as a safety margin.
#
# lvcreate also needs a little time to create the snapshot, so a few more
# seconds are added for that.
#
# The script asks FC to pause for HOLD_TIME seconds.  Once the hold command
# returns, the script makes sure at least HOLD_BUFFER seconds of the hold
# are still left on the wall clock before starting the snapshot.  If not,
# it reports an error and exits.
#
# HOLD_TIME = 20 (Kanon's minimum of 17, rounded up) + 2 * buffer (10)
HOLD_TIME='30'
# Minimum time (seconds) that must remain on the hold for the snapshot to
# be attempted
HOLD_BUFFER='5'


########################################################################
##                  END OF USER-SPECIFIED VARIABLES                   ##
########################################################################

# Name of this script without its directory part.  It prefixes log messages
# and is the template for the temporary mountpoint name.  Parameter
# expansion is used instead of an unquoted "basename $0" so that a path
# containing whitespace, or a $0 starting with a dash, is handled correctly.
SELF="${0##*/}"

###
### Functions
###


# Abort the script with an error.
# Arguments: the message; all arguments are joined with spaces.  Backslash
#            escapes in the message (such as \n) are expanded when printing.
# Outputs:   the message on stderr.
# Exits:     always, with status 1 (any EXIT trap still runs).
die() {
    printf '%b\n' "$*" >&2
    exit 1
}

# Set this to the path of a mountpoint directory if you'd like it removed
# automatically by cleanup().  Leave it empty to keep cleanup() from
# touching anything.
CLEANMOUNT=""

# Remove the temporary mountpoint named by CLEANMOUNT, if any.
# Globals:   SELF (read), CLEANMOUNT (read)
# Outputs:   a progress line on stdout
# Does nothing beyond printing when CLEANMOUNT is empty or is not an
# existing directory.
cleanup() {
    printf '%s\n' "$SELF: cleaning up..."

    # [[ ]] with && replaces the deprecated and ambiguous "-a" of [ ];
    # "--" stops a path that begins with a dash being parsed as rm options.
    if [[ -n "$CLEANMOUNT" && -d "$CLEANMOUNT" ]]; then
        rm -rf -- "$CLEANMOUNT"
    fi
}


# When the script receives HUP, INT, QUIT or TERM, report the interruption
# (with a timestamp) through die().  The snapshot is deliberately left alone
# in this case, as the message says.
trap 'die "$SELF was interrupted at $(date)\nNOT CLEANING UP SNAPSHOT"' HUP INT QUIT TERM

# Run cleanup() on every exit, whether normal or caused by die()
trap cleanup EXIT


###
### Preliminary Tests
###

# Sanity check: do not run unless the volume group and logical volume names
# are non-empty, since the device paths used below are built from them.
if [[ -z "$VG" ]]; then
    die "No volume group specified; exiting"
fi

if [[ -z "$MASTER" ]]; then
    die "No master logical volume specified; exiting"
fi

if [[ -z "$SNAPSHOT" ]]; then
    die "No snapshot logical volume name specified; exiting"
fi


# Sanity check: verify that there isn't already an existing snapshot.
# The whole message is one quoted string; the original passed its second
# half as unquoted words and was followed by a leftover heredoc "EOF" line.
if [[ -b "$VG/$SNAPSHOT" ]]; then
    die "Snapshot device '$VG/$SNAPSHOT' already exists.
Assuming another backup is running, so terminating this one."
fi


###
### Main Script
###


# Note the wall-clock time (seconds since the epoch) before asking for the hold
HOLD_START=$(date +%s)

# Ask the FirstClass server inside the container to flush to disk and hold
# for HOLD_TIME seconds.  The command's exit status is kept in VZEXEC.
VZEXEC=0
/usr/sbin/vzctl exec2 "$CTID" "/opt/fcsd/fcputil hold $HOLD_TIME" 2>&1 || VZEXEC=$?

if (( VZEXEC != 0 )); then
    die "fcputil returned unexpected value: $VZEXEC.\nGiving up and exiting."
fi

# Note the wall-clock time again, now that the hold command has returned
HOLD_STOP=$(date +%s)

# Elapsed time plus the required buffer, minus the hold length.  Despite its
# name, HOLD_REMAINING is therefore the shortfall: a value of zero or more
# means the required HOLD_BUFFER margin is no longer assured, so the
# snapshot is not attempted.
HOLD_REMAINING=$(( HOLD_STOP - HOLD_START + HOLD_BUFFER - HOLD_TIME ))

if (( HOLD_REMAINING >= 0 )); then
    die "FirstClass hold of $HOLD_TIME did not complete in time to make the
snapshot (would have needed $HOLD_REMAINING more seconds).
Giving up and exiting."
fi


# Once FC is frozen, snapshot the partition.  The result is checked so that
# a failed lvcreate is reported as such instead of surfacing later as a
# confusing mount error.
if ! /sbin/lvcreate "-L$SNAPSHOT_EXTENTS" -s -n "$SNAPSHOT" "$VG/$MASTER"; then
    die "Unable to create snapshot '$VG/$SNAPSHOT' of '$VG/$MASTER'.
Giving up and exiting."
fi

# FC will automatically unfreeze

# Now create a temporary directory to mount the snapshot on.
# Note: the snapshot volume is not removed on the failure paths below.
if ! MOUNT=$(mktemp -d -t "$SELF.XXXXXXXXXX"); then
    # If mktemp reported a path anyway, let cleanup() remove it
    CLEANMOUNT="$MOUNT"
    die "Unable to create temporary mountpoint for snapshot volume ('$MOUNT').
Giving up and exiting."
fi

# Mount the snapshot read-only; nouuid lets XFS mount it alongside the
# original filesystem, which carries the same UUID.
if ! /bin/mount -t xfs -o ro,nouuid "$VG/$SNAPSHOT" "$MOUNT"; then
    # Nothing got mounted, so the empty mountpoint can be removed by
    # cleanup() rather than being leaked.
    CLEANMOUNT="$MOUNT"
    die "Unable to mount snapshot to '$MOUNT'.
Giving up and exiting."
fi


# Backup the snapshot.  The exit status is remembered rather than acted on
# immediately, so that the snapshot is still unmounted and destroyed when
# rsync fails; the failure is reported once teardown is done.
RSYNC_STATUS=0
/usr/bin/rsync -aHX --delete --inplace --exclude=/stats "$MOUNT/fcns-master/" \
    fcns@rsync-snapshot.suffieldacademy.org::fcns/transfer/ || RSYNC_STATUS=$?

if [[ "$RSYNC_STATUS" -ne 0 ]]; then
    printf '%s\n' "$SELF: rsync exited with status $RSYNC_STATUS" >&2
fi


# Unmount the snapshot
if ! /bin/umount "$MOUNT"; then
    # Still mounted: make sure cleanup() does not touch the mountpoint
    CLEANMOUNT=""
    die "Unable to unmount snapshot from '$MOUNT'.
Leaving temporary mountpoint and snapshot!"
fi
# Unmounted successfully, so the mountpoint may now be removed on exit
CLEANMOUNT="$MOUNT"


# Destroy the snapshot
if ! /sbin/lvremove -f "$VG/$SNAPSHOT"; then
    die "Unable to destroy snapshot.  Leaving in place!"
fi


# Teardown succeeded; now fail the run if the backup itself did not succeed,
# so that the caller (e.g. cron) sees a non-zero exit status.
if [[ "$RSYNC_STATUS" -ne 0 ]]; then
    die "Snapshot was removed, but the rsync backup failed (status $RSYNC_STATUS)."
fi

