#!/afs/athena/contrib/perl/perl
#
# a tool for moving all volumes from an AFS partition or an entire
# server to another partition
#
# by bert Dvornik <bert@mit.edu>, Watchmaker Zone Night Patrol
#
# $Id: volmove.pl,v 1.7 96/05/15 03:44:57 bert Exp Locker: bert $

# Put this file's RCS identification at the front of @RCSID; usage() prints
# the whole list after the help text.
@RCSID = ('$Id: volmove.pl,v 1.7 96/05/15 03:44:57 bert Exp Locker: bert $', @RCSID);

### use other relevant files

unshift(@INC, '/afs/athena/user/b/e/bert/project/volmove');
require('agetopt.pl');
require('tokens.pl');

### defaults

# Both values are only prefixes; the rest of each name is appended at run
# time (the PID for the logfile, the source and destination for the stopfile).
($DEFlogfile, $DEFstopfile) = (
    '/usr/tmp/vos_move.log.',
    '/usr/tmp/STOP-MOVE-'
);

### sighandler...

# Signal handler (installed by the main body for SIGINT and SIGQUIT).
#
# Argument: the name of the signal, without the "SIG" prefix.
#
# Announces the pending stop on STDOUT and in LOG, bumps the global $QUIT
# counter that the main loops test between operations, and then sets the
# disposition of that signal to IGNORE.
sub handler {
    local($signame) = @_;
    local($notice) = "CAUGHT SIG$signame: QUITTING AFTER THIS OPERATION COMPLETES";

    print "*** $notice ***\n";
    print LOG "($notice)\n";

    $QUIT++;

    # Probably not needed at all, but it can't hurt.
    $SIG{$signame} = 'IGNORE';
}

### exec a process so that it doesn't see SIGINT even if it installs a handler

# Replace the current process with the given command.  The callers in this
# file invoke it on the child side of a forking open().
#
# Arguments: the command, in any form exec() accepts.
#
# Does not return on success; dies if the exec fails.
sub protect {
    local(@command) = @_;

    # mostly irrelevant
    $SIG{'QUIT'} = 'IGNORE';
    $SIG{'INT'}  = 'IGNORE';

    # error output travels down the same channel as regular output
    open(STDERR, ">&STDOUT");

    # protect the child from console-generated SIGINT by giving it a
    # process group of its own
    setpgrp(0, $$);

    exec(@command) || die "can't exec '@command'";
}

### run an AFS command and log the output

# Run one AFS command, logging (and optionally echoing) its output.
#
# Argument:
#   $cmd - the complete command line, as a single string.
#
# Globals read: $dontdoit (dry run: only report the command), $verbose
#               (echo every output line instead of just the last non-blank
#               one), and the LOG filehandle.
# Globals set:  $afsop is incremented on every call, dry runs included;
#               $QUIT is incremented if no child process could be forked,
#               so that the caller's next $QUIT check winds the run down.
#
# Returns a true value in a dry run or when the command exited with status
# 0, and a false value otherwise.  A failure is also reported on STDOUT and
# in LOG.
sub run_cmd {
    local($cmd) = @_;
    local($last, $pid, $ok, $why);

    $afsop++;

    print LOG "\n%%% $cmd\n";
    if ($dontdoit) {
        print "WOULD DO: $cmd\n";
        return 1;
    }

    print "\n%%% $cmd\n";

    # Forking open: yields the child's PID in the parent, 0 in the child and
    # undef when the fork failed.  The failure must be told apart from the
    # child case, otherwise the parent itself would exec the command.
    $pid = open(VCMD, "-|");
    if (!defined($pid)) {
        print "Warning: can't fork to run '$cmd' ($!)\n";
        print LOG "!!! can't fork to run '$cmd' ($!)\n";
        $QUIT++;
        return 0;
    }
    &protect($cmd) if !$pid;        # child: setpgrp and exec, never returns

    while (<VCMD>) {
        print LOG;
        print if $verbose;
        $last = $_ if (!/^\s+$/);   # remember the last non-blank line
    }
    print $last if !$verbose;

    # close() on a piped handle waits for the child and is false when the
    # command did not exit with status 0; the wait status is left in $?.
    $ok = close(VCMD);
    if (!$ok) {
        if ($? & 127) {
            $why = "was killed by signal " . ($? & 127);
        } else {
            $why = "exited with status " . ($? >> 8);
        }
        print "Warning: '$cmd' $why\n";
        print LOG "!!! '$cmd' $why\n";
    }
    $ok;
}

### usage info

# Print the usage summary, preceded by an error line if one is given and
# followed by the collected RCS ids, then exit with status 1.
#
# Argument:
#   $err - optional error text; when present it is shown as
#          "<program name>: <text>" ahead of the usage summary.
#
# Never returns.
sub usage {
    local($err) = @_;
    local($prog) = $0;
    $prog =~ s@.*/@@g;          # keep only the last path component of $0
    $err = "\n$prog: $err\n" if $err;

    print <<"EndOfUsage";
$err
Usage: $prog [options] from-host[.part] to-host.part [cell]
Options:

  -backupbyname  automatically back up all volumes that don't start in "n."
                 or end in ".nb"
                 [default: back up those which are currently backed up]

  -unreplicated  don't automatically replicate and release volumes on new site
  [-r]           [default: do replicate and release]

  -keepro        do not remove old read-only volumes
  [-o]           [default: remove old read-only volumes]

  -zephyr c[,i]  send messages to the specified zephyr class,instance
                 (or class,cellname if instance is omitted)

  -logfile lf    use the file "lf" as the logfile.
                 [default: $DEFlogfile<pid>]
  -stopfile sf   use the file "sf" as the stopfile.
                 [default: $DEFstopfile<src>-<dst>]

  -noaction      just say what we WOULD do
  -verbose       verbose
  -debug         debugging output (probably useless to you)

Omitting the "from" partition means *all* partitions on that host.
Not specifying the cell may lead to authentication problems (if it's
something other than athena).

Ctrl-C will abort the operation *after* the vos operation in progress
is finished.  Thus it should be safe to Ctrl-C out of $prog while it
is performing vos operations you normally can't safely Ctrl-C out of,
like "vos move".  However, some manual cleanup may be required.

Removing the stopfile will abort the operation after the operation in
progress and all related operations are finished.  Thus, removing the
stopfile will stop the move cleanly.

EndOfUsage
    print join("\n", @RCSID), "\n";
    exit 1;
}

### main body

# Ctrl-C exits in a more controlled fashion: SIGINT and SIGQUIT are both
# routed to handler(), which only records that a stop was requested.
$SIG{'QUIT'} = 'handler';
$SIG{'INT'}  = 'handler';

# arg parsing
#
# A true return value from agetopt() is treated as an error message and
# handed to usage(), which exits.
# NOTE(review): the meaning of the "\000" separators and of the trailing ':'
# in the option specs is defined by agetopt() -- presumably aliases and
# "takes a value"; confirm there.

$err = &agetopt('-backupbyname', "-unreplicated\000-r", "-keepro\000-o",
                '-noaction', '-verbose', '-debug',
                '-from:', '-to:', '-cell:', '-logfile:', '-stopfile:',
                "\000-zephyr:");
&usage($err) if $err;

# Copy the parsed options out of %aopt (presumably filled in by agetopt()).

# diagnostics / dry-run switches
($debug, $verbose, $dontdoit) =
    ($aopt{'-debug'}, $aopt{'-verbose'}, $aopt{'-noaction'});

# behaviour switches
($autobackup, $keepro, $norelease) =
    ($aopt{'-backupbyname'}, $aopt{'-keepro'}, $aopt{'-unreplicated'});

# source: a host, optionally followed by ".<one-character partition>"
$shost = $aopt{'-from'};
&usage("Missing required field `-from'") unless $shost;
($shost, $spart) = ($1, $2) if $shost =~ /^(\S+)\.(.)$/;

# destination: the partition is mandatory here
$dhost = $aopt{'-to'};
&usage("Missing required field `-to'") unless $dhost;
&usage("Missing or garbled partition in required field `-to'")
    unless $dhost =~ /^(\S+)\.(.)$/;
($dhost, $dpart) = ($1, $2);

# optional cell; $cell is the ready-made switch appended to vos commands
$afscell = $aopt{'-cell'};
$cell = "-cell $afscell" if $afscell;

# optional Zephyr target "class[,instance]"; the instance falls back to the
# cell name
$zephyr = $aopt{'-zephyr'};
if ($zephyr) {
    local($zclass, $zinst) = split(/,/, $zephyr);
    @zwrite = ('zwrite', '-c', $zclass, '-i', $zinst ? $zinst : $afscell);
}

# file names, with defaults built from the prefixes set near the top
$logfile = $aopt{'-logfile'};
$logfile = $DEFlogfile . $$ unless $logfile;

$stopfile = $aopt{'-stopfile'};
$stopfile = "$DEFstopfile$shost." . ($spart ? $spart : "all") . "-$dhost.$dpart"
    unless $stopfile;

# open the logfile (an existing file of that name is overwritten)

open(LOG, ">$logfile") || die "can't open $logfile ($!)";

# switch LOG to unbuffered output, then make the previously selected
# filehandle the default again
$oldfh = select(LOG);
$| = 1;
select($oldfh);

print LOG "(not actually doing anything)\n" if $dontdoit;
$afsop = 0;     # number of commands handed to run_cmd() so far

# complain if our tokens are about to expire.
# NOTE(review): the threshold passed below is 60*60, while the comment that
# used to stand here said "< 2 hours" -- confirm which one is intended.  The
# units look like seconds, judging by the log message.

@life = &token_expiration_alert(60*60, 0, $afscell);
if (@life) {
    print LOG "(token expiration in $life[0] seconds -- user notified)\n";
}

# parse the list of volumes
#
# Reads the output of "vos listvol" for the source host (and partition, if
# one was given) and fills in:
#   @partitions - "host part" for every partition header seen
#   @move       - "vol host part" for every volume that is to be moved
#   %backup     - NAME => count, for every NAME.backup volume seen
#   %replicated - NAME => "host part," (one per site), for every
#                 NAME.readonly volume seen
# Any output that does not have the expected layout is fatal.

# This does a fork, setpgrp and exec.  The forking open yields the child's
# PID in the parent, 0 in the child and undef when the fork failed; the
# failure must be told apart from the child case, otherwise the parent
# itself would exec vos.
$volpid = open(VOL, "-|");
defined($volpid) || die "can't fork to run vos listvol ($!)";
$volpid || &protect("vos listvol $shost $spart $cell");

print "Reading volume list for $shost $spart\n" if $verbose;

while (<VOL>) {
    print ">> $_" if ($verbose>1);
    chop;

    # parse the first line

    if (! m@^Total number of volumes on server (\S+) partition /vicep(.): (.*)$@) {
        print STDERR "Oops, I can't parse vos output:\n";
        print STDERR "'$_'\n";
        die 'quitting';
    }
    ($host,$part,$nn) = ($1, $2, $3);
    push(@partitions, "$host $part");

    # parse the volume names

  VOLUME:
    for ($i=0; $i<$nn; $i++) {
        ($_ = <VOL>) || die "early end of output from listvol";
        print ">> $_" if ($verbose>1);
        ($vol,$id,$type) = split(/\s+/,$_,4);

        # ignore disk.*
        if ($vol =~ /^disk\./) {
            print "Will ignore $vol.\n" if $debug;
            next VOLUME;
        }

        # ignore **... (the line is echoed so that the user sees it)
        if ($vol =~ /^\*\*+/) {
            print;
            next VOLUME;
        }

        # preserve back-up volumes
        if ($vol =~ /^(.*)\.backup$/) {
            print "Warning: $vol is listed as '$type', not 'BK'!\n" if ($type ne 'BK');
            print LOG "!!! $vol is listed as '$type', not 'BK'!\n" if ($type ne 'BK');
            $rvol = $1;
            print "Will back up $rvol.\n" if $debug;
            $backup{$rvol}++;
            next VOLUME;
        }
        if ($type eq 'BK') {
            # the "!!!" copy belongs in the log, like every other warning
            print "Warning: oops, $vol is a backup of *what*? (ignoring)\n";
            print LOG "!!! oops, $vol is a backup of *what*? (ignoring)\n";
            next VOLUME;
        }

        # nuke and re-create read-only replication sites
        if ($vol =~ /^(.*)\.readonly$/) {
            print "Warning: $vol is listed as '$type', not 'RO'!\n" if ($type ne 'RO');
            print LOG "!!! $vol is listed as '$type', not 'RO'!\n" if ($type ne 'RO');
            $rvol = $1;
            print "Will re-replicate and blow away $rvol.\n" if $debug;
            $replicated{$rvol} .= "$host $part,";
            next VOLUME;
        }
        if ($type eq 'RO') {
            # the "!!!" copy belongs in the log, like every other warning
            print "Warning: oops, $vol is a replication site for *what*? (ignoring)\n";
            print LOG "!!! oops, $vol is a replication site for *what*? (ignoring)\n";
            next VOLUME;
        }

        push(@move,"$vol $host $part");
    }

    # now we get a newline, a "Total onLine/offLine/busy" line, and another newline
    $_ = <VOL>;
    if (!/^\s*$/) {
        chop;
        print STDERR "Oops, I was expecting a newline and I got:\n'$_'\n";
        die 'quitting';
    }
    $_ = <VOL>;
    if (!/^Total volumes/) {
        chop;
        print STDERR "Oops, I was expecting a on-line/off-line/busy summary:\n'$_'\n";
        die 'quitting';
    }
    $_ = <VOL>;
    if (!/^\s*$/) {
        chop;
        print STDERR "Oops, I was expecting a newline and I got:\n";
        print STDERR "'$_'\n";
        die 'quitting';
    }
}

# close() on a piped handle waits for the child and is false when the
# command did not exit with status 0; say so instead of silently carrying
# on with a possibly incomplete list.
if (!close(VOL)) {
    print "Warning: vos listvol did not exit cleanly (wait status $?)\n";
    print LOG "!!! vos listvol did not exit cleanly (wait status $?)\n";
}

# done parsing the list of volumes

# touch the stopfile: the main loops keep going only while it exists

unless (open(STOP, ">$stopfile")) {
    die "can't touch $stopfile ($!)";
}
close(STOP);
print "\n[Remove $stopfile to stop the move cleanly.]\n\n";

# send out Zephyr notification (not in -noaction mode, and only when a
# zwrite command line has been set up)

if (!$dontdoit && @zwrite) {
    # a source without a partition is shown as "*"
    local($spart) = $spart || '*';
    local($announce) = "Moving volumes from $shost.$spart to $dhost.$dpart...";
    system(@zwrite, '-m', $announce);
}

# do work
#
# Every entry of @move is "volume host partition".  For each one the volume
# is moved to $dhost/$dpart and the follow-up steps that apply to it are
# carried out: backup, new replication site plus release, and removal of the
# old read-only sites.  $QUIT is tested before every single step; the
# stopfile is tested only before a move.  @todo always holds the steps still
# outstanding for the current volume, so that wrapup() can list them if the
# run is cut short.

while ($info = shift(@move)) {
    ($vol,$hostport) = split(/\s+/, $info, 2);

    # progressively complain if our tokens will last < 15 minutes.

    if (@life = &token_expiration_alert(5*60, 5*60, $afscell)) {
        print LOG "\n(token expiration in $life[0] seconds-- user notified)\n";
        $alert15 = 1;
    }
    if (!$alert15 && (@life= &token_expiration_alert(15*60, 5*60, $afscell))) {
        print LOG "(token expiration in $life[0] seconds -- user notified)\n";
        $alert15 = 1;
    }

    # create a list of what we need to do.

    local(@todo, $need_bk, $need_ro, @need_rm);

    if ((!$autobackup && $backup{$vol}) ||
         ($autobackup && ($vol !~ /^n\./) && ($vol !~ /\.nb$/))) {
        $need_bk = 1;
        push(@todo, "$vol needs to be backed up.");
    }

    if (!$norelease && $replicated{$vol}) {
        $need_ro = 1;
        push(@todo, "New replication site for $vol needs to be added.");
        push(@todo, "$vol needs to be released.");
    }

    if (!$keepro && $replicated{$vol}) {
        local($oldsites) = $replicated{$vol};
        chop($oldsites);        # comma-separated list of old replication sites

        @need_rm = split(/,/, $oldsites);
        for $site (@need_rm) {
            push(@todo, "RO site for $vol on $site needs to be removed.");
        }
    }

    # The plan above has taken everything it needs from %replicated.  Drop
    # the entry now, whatever the option settings, so that the later pass
    # over the entries left in %replicated does not handle this volume a
    # second time.  (Previously the entry was dropped only when a new
    # replication site was being added, so with -unreplicated but without
    # -keepro every old site got a second "vos remove".)
    delete $replicated{$vol};

    # move the volume
    # (this is a good place to stop, if a clean stop is requested)

    ($QUIT || ( ! -e $stopfile ))
        && &wrapup("Some volumes have not been moved.");
    ($debug>2) && &prstatus("Some volumes have not been moved.");

    &run_cmd("vos move $info $dhost $dpart $cell -verbose");

    # back it up if needed

    if ($need_bk) {
        $QUIT && &wrapup(@todo);
        ($debug>1) && &prstatus(@todo);

        shift(@todo);
        delete $backup{$vol};

        &run_cmd("vos backup $vol $cell -verbose");
    }

    # replicate and release if needed.
    if ($need_ro) {
        $QUIT && &wrapup(@todo);
        ($debug>1) && &prstatus(@todo);

        shift(@todo);

        &run_cmd("vos addsite $dhost $dpart $vol $cell -verbose");

        $QUIT && &wrapup(@todo);
        ($debug>1) && &prstatus(@todo);

        shift(@todo);

        &run_cmd("vos release $vol $cell -verbose");
    }

    # remove old RO sites.
    while (@need_rm) {
        local($site) = shift(@need_rm);
        $QUIT && &wrapup(@todo);
        ($debug>1) && &prstatus(@todo);

        shift(@todo);
        &run_cmd("vos remove $site $vol.readonly $cell -verbose");
    }
}

# warn about "lost" back-up volumes: whatever is still recorded in %backup
# at this point was not backed up again by the move loop

for $vol (keys %backup) {
    local($complaint) = $autobackup
        ? "$vol used to be backed up, but no longer is."
        : "couldn't find $vol to back up (consistency error?)";

    print "Warning: $complaint\n";
    print LOG "!!! $complaint\n";
}

# deal with replication sites from different partitions
#
# Handles every volume still recorded in %replicated: unless -unreplicated
# was given, a replication site is added on $dhost/$dpart and the volume is
# released; unless -keepro was given, the old read-only sites are removed.
# @todo holds the steps still outstanding, for wrapup() to list if the run
# is cut short.

for $vol (keys %replicated) {
    local(@todo, @need_rm);

    # (this is a good place to stop, if a clean stop is requested)

    unless (-e $stopfile) {
        &wrapup("Some RO volumes (with RW's elsewhere) "
                . "have not been processed.");
    }

    # create a list of what we need to do.

    unless ($norelease) {
        push(@todo,
             "New replication site for $vol needs to be added.",
             "$vol needs to be released.");
    }

    unless ($keepro) {
        # comma-separated list of old replication sites, minus the
        # trailing comma
        local($oldsites) = $replicated{$vol};
        chop($oldsites);

        @need_rm = split(/,/, $oldsites);
        for $site (@need_rm) {
            push(@todo, "RO site for $vol on $site needs to be removed.");
        }
    }

    # replicate and release if needed.
    unless ($norelease) {
        &wrapup(@todo) if $QUIT;
        &prstatus(@todo) if $debug > 1;

        shift(@todo);
        delete $replicated{$vol};

        &run_cmd("vos addsite $dhost $dpart $vol $cell -verbose");

        &wrapup(@todo) if $QUIT;
        &prstatus(@todo) if $debug > 1;

        shift(@todo);

        &run_cmd("vos release $vol $cell -verbose");
    }

    # remove old RO sites.
    while (@need_rm) {
        local($site) = shift(@need_rm);

        &wrapup(@todo) if $QUIT;
        &prstatus(@todo) if $debug > 1;

        shift(@todo);
        &run_cmd("vos remove $site $vol.readonly $cell -verbose");
    }
}

# wrap-up: called with no arguments, which wrapup() treats as a normal
# finish; it does not return

&wrapup();

# Finish the run and exit.
#
# Arguments: a list of messages describing what was left undone.  An empty
# list means the run finished normally; a non-empty list means it was cut
# short, and the messages are shown to the user and written to LOG.
#
# In both cases a closing Zephyr message is sent (when a zwrite command line
# is set up and this is not a -noaction run), LOG is closed, the stopfile is
# removed and the script exits with status 0.
sub wrapup {
    local(@oops) = @_;
    local($ending);             # how the Zephyr message ends

    if (@oops) {
        print <<"EndOfWarning";

-----------------------------------------------------
Warning: the move was interrupted before it finished.

This shouldn't be catastrophic (I waited on vos to end), but please
make sure everything is OK and happy.  In particular, you may want to
check the following list of potential problems:
-----
EndOfWarning

        print join("\n", @oops, "-----\n");

        print LOG "*** ended before everything was wrapped up.\n";
        print LOG "*** potential problem topics:\n";
        print LOG join("\n", @oops, '');

        $ending = ": POSSIBLE PROBLEMS!";
    } else {
        print "\nDone!\n";
        $ending = ".";
    }

    if (@zwrite && !$dontdoit) {
        # a source without a partition is shown as "*"
        local($spart) = $spart || '*';
        system(@zwrite, '-m',
               "Done moving volumes from $shost.$spart to $dhost.$dpart"
               . $ending);
    }

    close(LOG);
    print "\nLog of $afsop performed operations stored in $logfile.\n"
        unless $dontdoit;

    unlink($stopfile);
    exit(0);
}

# display the array of status messages

# Print the given status messages on STDOUT, one per line, each prefixed
# with "! ", framed by a bare "! " line before and after.
sub prstatus {
    local(@pieces) = ('! ', @_, "\n");
    print join("\n! ", @pieces);
}
