#!/mit/perldev/arch/@sys/bin/perl -w -037
# NB: -037 is the -0 switch with octal 037: it sets the input record
# separator ($/) to ^_ [end-of-message], so each read returns one message.

use FileHandle;
use strict;

# Make STDERR the default handle for bare print calls and turn on autoflush
# for it, then strip any leading directory part from the program name in $0.
select(STDERR);
STDERR->autoflush(1);
$0 =~ s{.*/}{}g;

use vars qw( @msgs @headers @bodies @bidx $i );

# Read all of the input (files named on the command line, or standard input)
# into @msgs in one go; each element is one record as delimited by the
# current input record separator.  Then report how many were read.
print "Reading data...\n";
@msgs = <>;
my $msg_count = @msgs;
print $msg_count, " messages found\n";

# Split every message at its first blank line (two consecutive newlines).
# The text before it, plus one newline, becomes $headers[n]; one newline plus
# the text after it becomes $bodies[n].  A message with no blank line is kept
# whole as the header and gets an empty body, with a warning.
print "Extracting headers and bodies...\n";
for (0 .. $#msgs) {
  my $message  = $msgs[$_];
  my $blank_at = index($message, "\n\n");

  if ($blank_at < 0) {
    warn "no body found for message #$_";
    ($headers[$_], $bodies[$_]) = ($message, '');
    next;
  }

  $headers[$_] = substr($message, 0, $blank_at) . "\n";
  $bodies[$_]  = "\n" . substr($message, $blank_at + 2);
}


# Build @bidx: the message indices 0..$#msgs, ordered so that the bodies they
# refer to are in string (cmp) order.  Identical bodies therefore sit next to
# each other in @bidx.
print "Indexing bodies...\n";
@bidx = (0 .. $#msgs);
@bidx = sort { $bodies[$a] cmp $bodies[$b] } @bidx;

# _duplicate_runs(\@bodies, \@order)
#
# Find the groups of messages that share an identical body.
#
#   \@bodies  body text, indexed by message number
#   \@order   message numbers arranged so that equal bodies are adjacent
#             (e.g. sorted by body)
#
# Returns a list of array refs, one per group of two or more messages with
# the same body.  Each group holds message numbers in ascending order, so its
# first element is the earliest message with that body.
sub _duplicate_runs {
  my ($bodies, $order) = @_;
  my $last = $#{$order};
  my @runs;

  for (my $start = 0; $start < $last; $start++) {
    my $end = $start;

    # Extend the run while the next entry has the same body.  The bound
    # check must come first: reading past the end of @$order yields undef,
    # which as an index means message 0, and that made the scan spin forever
    # whenever message 0 was part of the final run.
    $end++
      while $end < $last
         && $bodies->[ $order->[$start] ] eq $bodies->[ $order->[$end + 1] ];

    next if $end == $start;

    push @runs, [ sort { $a <=> $b } @{$order}[$start .. $end] ];
    $start = $end;    # resume after this run
  }

  return @runs;
}

# For every group of identical bodies keep the lowest-numbered message and
# blank out the rest in @msgs, @headers and @bodies.
#
# NOTE(review): only bodies are compared, so messages whose headers differ
# are still dropped when their bodies match.  That includes every message
# for which no body was found, since those all carry an empty body.
# Confirm that this is intended.
print "Checking for duplicate bodies...\n";
for my $run (_duplicate_runs(\@bodies, \@bidx)) {
  my ($msg, @punt) = @$run;

  print scalar(@punt)." duplicates of message $msg will be removed.\n";
  @msgs[@punt]    = ((undef) x scalar @punt);
  @headers[@punt] = ((undef) x scalar @punt);
  @bodies[@punt]  = ((undef) x scalar @punt);
}

# Write every message that was not blanked out to standard output, in its
# original order.  This is the script's only data output, so a failed write
# must not pass silently: check the print itself and the flush of whatever
# it left buffered.
my @kept = grep { defined } @msgs;
print STDOUT @kept
  or die "$0: error writing messages to standard output: $!\n";
STDOUT->flush
  or die "$0: error flushing standard output: $!\n";
