#!/usr/athena/bin/perl

# written by grant emery, february 1999

# make stdout unbuffered
$| = 1;

# ctrl-c will really fuck things up, so we ignore it.  there's no
# conceivable reason to interrupt this script, everything should time
# out eventually.
$SIG{INT} = 'IGNORE';

unshift (@INC, '/afs/sipb.mit.edu/project/www/src/db');
require 'dbparse.pl';
require '/afs/sipb.mit.edu/project/www/bin/homelib.pl';

use Socket;

$usage = <<USAGE;
Usage: deacpurge [options] doomfiles...

Reads a doom list and removes database entries and /people symlinks as
is appropriate.

Options:
-n           don't make any changes, just print information
-dw          delete database entries which are classified as "errors or
             other weirdness".  use this with caution.
-dt          delete database entries with remote urls that timeout on
             connection.  use this with caution.  it may be better to
             just remove these by hand, ans there likely aren't many.
-u [on/off]  turns on and off the check for valid remote urls.  this is
             OFF by default
-v [on/off]  turns on and off verbose status reporting.  this is ON by
             default
-s [on/off]  turns on and off the generation of a summary report at the
             end of processing.  this is ON by default
-l <file>    sends a duplicate of all the output to the named file.
	     verbose output and a summary will always be sent to this
	     file regardless of the above settings
             this is set to /tmp/purge.log by default
-l off       do not generate a log file
-h           show this screen

Specifying an option without an argument will turn that option on.
USAGE

$master_db_file = $DB'default_home;

# set up defaults for each of the options
$nomod = 0;
$urlcheck = 0;
$verbose = 1;
$summary = 1;
$delete_weird = 0;
$delete_timeout = 0;
$logfile = "/tmp/purge.log";
undef @doomfiles;

# process arguments: anything starting with a - is taken as an option,
# everything else is taken as a doom file
while ($arg = shift @ARGV) {
   if($arg eq "-n") {
      $nomod = 1;
   } elsif($arg eq "-dw") {
      $delete_weird = 1;
   } elsif($arg eq "-dt") {
      $delete_timeout = 1;
   } elsif($arg eq "-u") {
      if($arg = shift @ARGV) {
	 if($arg eq "on") {
	    $urlcheck = 1;
	 } elsif($arg eq "off") {
	    $urlcheck = 0;
	 } else {
            $urlcheck = 1;
            redo;                           # guess this isn't for us
	 }
      } else {
         $urlcheck = 1;
      }
   } elsif($arg eq "-v") {
      if($arg = shift @ARGV) {
	 if($arg eq "on") {
	    $verbose = 1;
	 } elsif($arg eq "off") {
	    $verbose = 0;
	 } else {
            $verbose = 1;
            redo;                           # guess this isn't for us
	 }
      } else {
         $verbose = 1;
      }
   } elsif($arg eq "-s") {
      if($arg = shift @ARGV) {
	 if($arg eq "on") {
	    $summary = 1;
	 } elsif($arg eq "off") {
	    $summary = 0;
	 } else {
            $summary = 1;
            redo;
	 }
      } else {
         $summary = 1;
      }
   } elsif($arg eq "-l") {
      if($arg = shift @ARGV) {
         if(substr($arg, 0, 1) eq "-") {
	    $logfile = "/tmp/purge.log";
	    redo;
	 } elsif($arg eq "off") {
	    undef $logfile;
	 } else {
	    $logfile = $arg;
	 }
      } else {
         $logfile = "/tmp/purge.log";
      }
   } elsif(substr($arg, 0, 1) eq "-") {
      die($usage);
   } else {
      push(@doomfiles, $arg);
   }
}

die("No doomfiles specified\n$usage") unless @doomfiles;

print "Not making changes\n" if $nomod;
print "Checking off-site urls\n" if $urlcheck;
print "Verbose message reporting\n" if $verbose;
print "Generating summary\n" if $summary;
print "Duplicating output to $logfile\n" if defined($logfile);

print "Is this what you want? ";

exit unless &accept();

# set up file handle for status messages
if($logfile) {
   open(LOGFILE, ">$logfile") or die "can't open $logfile: $!\n";
}

&rcs_check_out_file($master_db_file); print "\n";

# read in the database file
if(&DB'ParseHome($master_db_file) == 0) {
   print("Can't open $master_db_file.  Aborting.\n");
   &rcs_check_in_file($master_db_file);
   exit(1);
}

# these hashes are for keeping track of what will happen to each
# database entry.  the key is the username and the value is some message
# explaining why that entry is there.
# %errors: stores entries for which errors or other weirdnesses occur
# @timeout: stores entires for remote urls to which connections timed
#           out
# %local_nuke: entries with local symlinks, the value is the name of the
#              symlink
# %remote_nuke: entries with urls that aren't in /people
# %kept: entries which will be retained in the database
# @nothing: just a list of usernames which didn't have database entries

undef %errors, %local_nuke, %remote_nuke, %kept, @nothing, @timeout;
$total_victims = 0;

foreach $file (@doomfiles) {
   unless(open(DOOM, $file))  {
      warn("can't open $file: $!\n");
      &status_msg(0, "can't open $file: $!\n");
      next;
   }

   &status_msg(1, "Processing $file...\n");

   VICTIM: while(<DOOM>) {
      next VICTIM if /^#/;        #  punt accounts which never registered
      chomp;

      $total_victims++;

      &status_msg($verbose, "$_: ");

# retrieve database entry for user
      ($url, $name, $sort_index, $symlink, $date) = &DB'FetchUser($_);

# this means the user has no database entry
      if($url eq "0") {
         &status_msg($verbose, "no database entry\n");
         push(@nothing, $_);
         next VICTIM;
      }
   
# determine if the url starts with /people.  this means it should
# be a symlink on our server
      if($url =~ m|^/people/([^/]+)|) {
         $lfile = $1;
         &status_msg($verbose, "should have symlink...");

# try to determine where the symlink points
         unless(defined($linkval = readlink("$root_people_dir/$lfile"))) {
            &status_msg($verbose, "error reading symlink\n");
            $errors{$_} = "error reading symlink $lfile: $!";
            next VICTIM;
         }

# just an existence test, rather than checking readability, since -r
# isn't reliable in afs, and might not be what we want anyway
         if(-e $linkval) {
            &status_msg($verbose, "symlink is good\n");
            $kept{$_} = "symlink '$lfile' still exists";
            next VICTIM;
         }
      
         &status_msg($verbose, "will be deleted\n");
         $local_nuke{$_} = $lfile;
      } else {

# otherwise the url is on a remote server amd must be dealt with accordingly
         if($urlcheck) {
            &status_msg($verbose, "checking url...");
            ($code, $msg) = &url_check($url);
            if($code == -1) {
               $errors{$_} = $msg;
               &status_msg($verbose, "$msg\n");
            } elsif($code == -2) {
               $remote_nuke{$_} = $msg;
               &status_msg($verbose, "$msg\n");
            } elsif($code == -3) {
               push(@timeout, $_);
               &status_msg($verbose, "connection timed out\n");
            } elsif(substr($code, 0, 1) eq "2") {
               $kept{$_} = "$code $msg";
               &status_msg($verbose, "good\n");
            } elsif(substr($code, 0, 1) eq "3") {
               $kept{$_} = "$code $msg ($url)";
               &status_msg($verbose, "got redirect\n");
            } elsif(substr($code, 0, 1) eq "4") {
               $remote_nuke{$_} = "$code $msg";
               &status_msg($verbose, "not found\n");
            } elsif(substr($code, 0, 1) eq "5") {
               $remote_nuke{$_} = "$code $msg";
               &status_msg($verbose, "server error\n");
            } else {
               $errors{$_} = "$code $msg ($url)";
               &status_msg($verbose, "nonstandard response ($code)\n");
            }
         } else {
            $kept{$_} = "url not checked";
            &status_msg($verbose, "url not checked\n");
         }
      }
   }

   close(DOOM);
}

&status_msg(1, "\n");

$link_nuke_cnt = 0;
$db_nuke_cnt = 0;

# make the changes unless we were told not to
unless($nomod) {

   &status_msg(1, "Purging ", scalar(keys %local_nuke), " local entries");

   LINK: foreach $person (sort keys %local_nuke) {
      unless(unlink("$root_people_dir/$local_nuke{$person}")) {
         &status_msg(1, "$root_people_dir/$local_nuke{$person}: $!\n");
         $errors{$person} = "unlink symlink: $!";
         delete($local_nuke{$person});
         next LINK;
      }
      &DB'DelUser($person);
      $db_nuke_cnt++;
   }

   &status_msg(1, "\nPurging ", scalar(keys %remote_nuke), " non-local entries");
   foreach $person (sort keys %remote_nuke) {
      &DB'DelUser($person);
      $db_nuke_cnt++;
   }
   
   &status_msg(1, "\n\n");

   if($delete_weird) {
      &status_msg(1, "\nPurging ", scalar(keys %errors), " error/weird entries");
      foreach $person (sort keys %errors) {
         &DB'DelUser($person);
	 $db_nuke_cnt++;
      }
   }

   if($delete_timeout) {
      &status_msg(1, "\nPurging ", scalar(@timeout), " timeout entries");
      foreach $person (sort @timeout) {
         &DB'DelUser($person);
	 $db_nuke_cnt++;
      }
   }

# we're done removing entries from the database, write it to disk
   &DB'SpewHome($master_db_file);

   $rcs_message = "deacpurge removed $db_nuke_cnt deactivated accounts";
} else {
   $rcs_message = "no changes, deacpurge.pl run in readonly mode";
}

&rcs_check_in_file($master_db_file, $rcs_message);

# if a summary was requested, these lines will print it to STDOUT,
# otherwise it'll just be printed to the log file.  we also conjugate
# the form of "be" to cause these messages to actually make sense

$verb = $nomod ? "would be" : "were";

&status_msg($summary, "\nSymlinks and database entries $verb removed for the following users:\n");
foreach $person (sort keys %local_nuke) {
   &status_msg($summary, "$person\n");
}
&status_msg($summary, "(none)\n") unless scalar(keys %local_nuke);

&status_msg($summary, "\nDatabase entries $verb removed for the following users:\n");
foreach $person (sort keys %remote_nuke) {
   &status_msg($summary, "$person: $remote_nuke{$person}\n");
}
&status_msg($summary, "(none)\n") unless scalar(keys %remote_nuke);

&status_msg($summary, "\nDatabase entries/symlinks $verb retained for the following users:\n");
foreach $person (sort keys %kept) {
   &status_msg($summary, "$person: $kept{$person}\n");
}
&status_msg($summary, "(none)\n") unless scalar(keys %kept);

&status_msg($summary, "\nErrors or other weirdness occurred for the following users:\n");
foreach $person (sort keys %errors) {
   &status_msg($summary, "$person: $errors{$person}\n");
}
&status_msg($summary, "(none)\n") unless scalar(keys %errors);

&status_msg($summary, "\nRemote URL checks timed out for the following
users:\n");
foreach $person (sort @timeout) {
   &status_msg($summary, "$person: connection timed out\n");
}
&status_msg($summary, "(none)\n") unless scalar(@timeout);

# end summary generation

# now we have the even more concise information.  this information
# always reflects what changes were made

&status_msg(1, "\nTotal entries in doomlists: $total_victims\n");
&status_msg(1, "Entries not in database: ", scalar(@nothing), "\n");
&status_msg(1, "Entries retained: ", scalar(keys %kept), "\n");
&status_msg(1, "Symlinks removed: $link_nuke_cnt\n");
&status_msg(1, "Database entries removed: $db_nuke_cnt\n");
&status_msg(1, "Entries which encountered errors: ", scalar(keys %errors), "\n");

close(LOGFILE);

exit 0;

# makes an http connection to a web server and returns an array
# consisting of the HTTP response code and the free-form string returned 
# by the server for that code
# -1: some error in parsing, entry should be marked as 'error'
# -2: some error connecting, entry should be deleted
# -3: connection timed out, entry should be marked as 'error'
# 2*: ok
# 3*: redirect
# 4*: doesn't exist/permission denied
# 5*: server error

# this routine should probably time out on its own, but right now you
# can terminate a url check that seems to be hanging by hitting ctrl-c

sub url_check {
   local($url) = @_;
   local($service, $host, $port, $path, $null, $code, $message, $pid);
   local(@data);
   local($oldfh);

# parse out the url for service, host, port, and path
   $url =~ m|([\w]+)://([a-zA-Z0-9.\-]+)(:(\d+))?(/.*)| or return (-1, "url parse error");
   ($service, $host, $port, $path) = ($1, $2, $4, $5);

   return (-2, "$service not allowed") unless $service eq "http";

   $port = 80 if $port eq "";

# incantation for summoning a socket
   $iaddr = inet_aton($host) or return (-2, "unable to resolve $host");
   $paddr=sockaddr_in($port, $iaddr);
   $proto=getprotobyname('tcp');
   socket(S, PF_INET, SOCK_STREAM, $proto) or return (-2, $!);

# this little block grabs ^C and just terminates the connection attempt
# if one is received
   eval {
      local $SIG{INT} = sub { die "SIGINT\n" };
      local $SIG{ALRM} = sub { die "TIMEOUT\n" };

# time out after 20 seconds
      alarm(20);
      &status_msg($verbose, "connecting...");
# errno 145 is ETIMEDOUT afaict
      connect(S, $paddr) or die($! = 145 ? "TIMEOUT\n" : "$host: $!\n");
      &status_msg($verbose, "retrieving...");

# make the socket unbuffered.  trust me.
      $oldfh = select(S); $| = 1; select($oldfh);

# make a HEAD request to save on retrieval time
      print S "HEAD $path HTTP/1.0\nHost: $host\n\n";
      @data = <S>;
      close(S);
   };

   if($@) {
      close(S);
      if($@ =~ /SIGINT/) {
         return (-1, "got SIGINT during connection");
      } elsif($@ =~ /TIMEOUT/) {
         return (-3, "connection timed out");
      } else {
         chomp $@;
         return (-2, $@);
      }
   }

   $data[0] =~ tr/\r\n//d;   # hack to get rid of ^M and ^J
   ($null, $code, $message) = split(/\s+/, $data[0], 3);

   return ($code, $message);
}

# routine to deal with printing a message to a filehandle and standard
# output.  the first argument is a flag to determine whether or not the
# message should be duplicated to STDOUT as it's written to the log
sub status_msg {
   local($flag, @message) = @_;

   print @message if $flag;
   print LOGFILE @message if $logfile;
}
