#!/usr/athena/bin/perl

#
# Scans an index file of the FTP archive (by default,
# $PPDIR/find.cache) and removes (moves to .#) all files with headers 
# matching ANY of the indicated values (not case-sensitive). 
#
# For example, to remove all files with a Newsgroups: header containing 
# "alt.god.grubor" OR a Path: header containing "SpamSite", use
# remove-spam.pl -header newsgroups alt.god.grubor -header path spamsite
#
# Make sure it's run on an up-to-date copy of find.cache.
#
# pgreene 02/09/99
#

push(@INC, '/afs/sipb.mit.edu/project/periodic-postings/faq_server');

umask(002);

require "header.pl";

$archive_index = "/afs/sipb.mit.edu/project/periodic-postings/data/find.cache";
%matchlist = undef;
$delete = 1;

($whoami = $0) =~ s,.*/,,;

$usage = "Usage: $whoami [ -nodelete] [ -archive archive_list] {[-header header_name header_match]}+ [ -help]";

while ($_ = $ARGV[0], /^-/) {
    shift;
    if (/^-nod(|elete)$/) {
	$delete = undef;
    }
    elsif (/^-a(|rchive)$/) {
	($archive_index = shift) ||
	    die "Missing argument to $_ option.\n$usage\n";
    }
    elsif (/^-h(|eader)$/) {
	($matchlist{shift}=shift) ||
	    die "Missing argument(s) to $_ option.\n$usage\n";
    }
    elsif (/^-help$/) {
	print "$usage\n";
	exit 0;
    }
    else {
	die "Unknown option \"$_\".\n$usage\n";
    }
}

open(ARI, $archive_index) || die "Opening $archive_index: $!";
print "The following *.answers archive files have been identified as spam ". ($delete ? "and removed \n(renamed to .#)" : "but NOT removed") . ":\n";

$i = 0;

file: while (<ARI>) {
    chop;
    $file = $_;
    
    print "$i\n" if ($i % 100 == 0);
    $i++;

    if (! open(ARTICLE, "<$file")) {
	warn "Opening \"$file\": $!.\n";
	next;
    }
    $fhandle = "ARTICLE";

    ($next_line, %headers) = &header'parse(undef, "main'$fhandle");
    ($next_line, %auxheaders) = &header'parse($next_line, "main'$fhandle");
    %headers = (%headers, %auxheaders);
    close($fhandle);

    foreach $key (keys(%matchlist)) {

        $headerval = &header'field_value("$key",%headers);

        if ($headerval =~ /$matchlist{"$key"}/i) {
	    print "$file\n";
	    &delete("$file");
	    next file;
	}
    }
}


sub delete {
    local($file) = @_;
    local($dir, $name);

    return if (! $delete);

    ($dir, $name) = ($file =~ m,(.*)/([^/]+)$,);

    rename($file, "$dir/.#$name") ||
        warn "Renaming $file to $dir/.#$name: $!.\n";
}

exit;
