#!/afs/sipb/project/perldev/p -w

use IO::File;
use Socket;
use Getopt::Long 2.0;  # known to work with 2.10, but may with earlier
use strict;

#use lib qw(/mit/bert/PERL /mit/bert/project/perl5);
#quse Wrath;

### set defaults

select STDERR;
autoflush STDERR 1;
$0 =~ s@.*/@@g;

use vars qw(@message $header %headers @headers
	    @to @pm_hosts %suppress %mixcase %spam $offending);
%suppress = ('xref' => 1, 'lines' => 1);  # suppress headers added by gnus

%spam = ('earthlink.net' => 'spam@earthlink.net',
	 'uu.net' => 'fraud@uu.net',
	);

use vars qw($me $here @compose $debug $outside $opt_n $opt_N
	    $chainletter $abuse_only $msn_is_evil $no_window);
$me = '"Albert Dvornik" <bert@mit.edu>';
$here = 'MIT.EDU';
@compose = ('comp', '-form');

### parse command line options

# Getopt::Long::config("bundling_override");
Getopt::Long::config("bundling"); # bugs suck
GetOptions("h|here=s"       => \$here,
	   "m|outside"      => \$outside,
	   "c|chainletter"  => \$chainletter,
	   "a|abuse"        => \$abuse_only,
	     "msn_is_evil"  => \$msn_is_evil,
	   "n|display_only" => \$opt_n,
	     "nw|tty"	    => \$no_window,
	   "N|dryrun"       => \$opt_N,
	   "d|debug"	    => \$debug
	  )
  or usage();

(@ARGV > 0)
  or die("Error: '$0 < file' will confuse 'comp' (or 'less')!\n");
(@ARGV > 1)
  and usage("Error: too many arguments.");

($opt_n && $opt_N)
  and usage("Error: --display_only and --dryrun are mutually exclusive");
$opt_n and ( @compose=($ENV{'PAGER'}||'less') );
$opt_N and ( @compose=() );

# kludge: we may have no way of passing '-nw' to Emacs directly.
$no_window && delete $ENV{'DISPLAY'};

$here =~ s/^\.+//;
$here = ".$here" unless $outside;

### Read in offending message.

@message = <>;
@message || die "Empty input message";

### Build headers list.

$header='';

HEADER:
  while ($_ = shift(@message)) {
    /^\s+\S/ && ($header .= $_, next HEADER);	# deal with continued headers

    deal_with_header($header) if length($header);
    $header = $_;
    /^\s*$/ && last HEADER;			# check for end of headers
  }

### Deal with 'From' and 'Received' fields

dissect_from();
$offending = dissect_recv();

open (OUTPUT, ">/tmp/wrath.$$");
print_header();
print_original();
close(OUTPUT);
system(@compose, "/tmp/wrath.$$") if @compose;
unlink("/tmp/wrath.$$");

# Add data from a header line to @headers and %headers (and %mixcase).
sub deal_with_header {
  my ($header) = @_;

  unless ($header =~ /^([-\w\d]+):\s*/) {
    warn "Un-intelligible header:\n-----\n$header-----\nfound";
    return undef;
  }

  my $key = "\L$1";
  $mixcase{$key} = $1;
  push(@headers, $key);
  $headers{$key} = [] unless exists($headers{$key});
  push(@{$headers{$key}}, $');
  $';
}

# Parse "sender", "from", "apparently-from" and "reply-to" fields
# (from @headers), and stuff results into @to [destinations] and
# @pm_hosts [hosts involved].
sub dissect_from {
  local $_;
  my ($sender) = $headers{'sender'} || [];
  my ($from) = $headers{'from'} || [];
  my ($appfrom) = $headers{'apparently-from'} || [];
  my ($replyto) = $headers{'reply-to'} || [];
  $from = join(',', @$replyto, @$sender, @$from);

  $from =~ s/(^|[^\\])\"(|(\\\"|[^\"])*[^\\])\"/$1/g;
  $from =~ s/\s+/ /g;
  $from =~ s/^ | $//g;

  for (split(/\s*,\s*/, $from)) {
    s/^.*<([^<>]+)>.*$/$1/g;
    push(@to, $_);
    ## obsoleted:
    #s/^.*@//;
    #push(@pm_hosts, $_);
  }
}

# Add host data from the last host line (like 'received: ... by HOST ...')
# to @pm_hosts
sub dissect_recv_basic {
  local $_;
  my ($recv) = $headers{'received'} || return undef;
  my ($orig) = $recv->[$#$recv];
  my ($host);

 RECV_BY:
  while ($orig =~ /\sBY\s+(\S+)/gi) {
    $host = $1;
    $host =~ s/.*@//g;
    $host=~s/\[?(\d+\.\d+\.\d+\.\d+)\]?/gethostbyaddr(inet_aton($1),AF_INET)/e;
    if ($host !~ /^[-_\w\d\.]+$/) {
      warn "Broken host name '$host' encountered";
      next RECV_BY;
    }

    warn "would have added $host\n";
    ## obsoleted:
    #push(@pm_hosts, $host);
  }
}

# If host data contains any IP addresses that reverse-resolve, return
# the resulting hostname.  Otherwise, try resolving the first fragment
# (as a hostname) and reverse-resolving it, and report inconsistencies.
# Return a best possible guess for the IP address and hostname of the sender.
sub recv_fromhost {
  my ($descr) = @_;
  my (@frags) = split(/\s*[][)(]\s*/, $descr);
  my ($name, $addr, $frag, @candidate);
  my (@ip) = grep /^\d+\.\d+\.\d+\.\d+$/, @frags;

  warn "Several IP addresses in Recieved fragment '$descr'\n" if (@ip > 1);

  # See if any IP addresses reverse-resolve.
  for $addr (@ip) {
    $name = gethostbyaddr(inet_aton($addr), AF_INET);
    if ($name) {
      if ($descr =~ /\Q$name\E/i) {
	warn "using: $name; IP addr reverse-resolved to it, listed.\n";
      } else {
	#$name = "$addr [$name]";
	warn "using: $name; IP addr reverse-resolved to it.\n";
      }
      return (inet_aton($addr), $name);
    }
  }

  # OK, now see if any hostnames at least resolve to address.
 FRAG:
  for $frag (@frags) {
    (($frag =~ /\./) && ($frag !~ /^\d+\.\d+\.\d+\.\d+$/)) or next FRAG;
    $addr = gethostbyname("$frag.")
      or next FRAG;
    $name = gethostbyaddr($addr, AF_INET);

    if ($name) {
      # resolves both ways
      warn "broken resolution: $frag -> ".inet_ntoa($addr)." -> $name\n"
	if lc($frag) ne lc($name);

      if (!@ip) {
	# no IP addrs listed; probably the valid address.
	warn "using: $name; no IP addr specified\n";
	return ($addr, $frag);
      } elsif (scalar grep($addr eq inet_aton($_), @ip)) {
	# matches an IP addr; why didn't the addr resolve already?
	die "reverse resolve succeeded *and* failed (???)";
      } else {
	# doesn't match an IP addr; probably intended to deceive.
	warn "ignoring: $frag; faked-but-existing host name\n";
	next FRAG;
      }
    } else {
      # resolves forward but not backward
      if (!@ip) {
	# no IP addrs listed; maybe the valid address, but don't be sure.
	@candidate = ($addr, $frag);
	next FRAG;
      } elsif (scalar grep($addr eq inet_aton($_), @ip)) {
	# matches an IP addr; that's a match, loud and (mostly) clear.
	$name = $frag;
	#$name = "$frag [".inet_ntoa($addr)."]";
	warn "using: $name; resolves forward but not backward\n";
	return ($addr, $name);
      } else {
	# doesn't match an IP addr; probably just extraneous data.
	next FRAG;
      }
    }
  }

  # Still here?  Well, return @candidate (or empty list, if none).
  @candidate;
}

# Extract first available bit of host data (eg 'host [ip]' -> 'host')
sub recv_byhost {
  my ($descr, $haddr, $hint) = @_;
  my (@frags) = split(/\s*[][)(]\s*/, $descr);
  my (@dotfrags) = grep(/\./ && !/^\d+\.\d+\.\d+\.\d+$/, @frags);
  my (@hint_matches);

  @hint_matches = grep($hint =~ /(^|\.)\Q$_\E$/i, @dotfrags)
    if ($hint);
  return ($haddr, $hint_matches[0]) if (@hint_matches);

  my ($name) = @dotfrags ? $dotfrags[0] : $frags[0];
  (inet_aton($name), $name);
}

# add an appropriate host to pm_hosts
sub pm_host {
  my ($host) = @_;
  my (@domain) = split /\./, $host;
  my ($howmany) = scalar(@domain)-1;
  $howmany = 2 if ($domain[$#domain] =~ /^(?:edu|com|org|net)$/);
  @domain = @domain[($#domain-$howmany+1)..$#domain];
  push(@pm_hosts, join('.', @domain));
}

# more involved parsing of Received: lines.  XXX: work on this!
sub dissect_recv {
  #dissect_recv_basic() if $debug;

  my (@recv) = @{$headers{'received'} or return undef};
  my ($rhead) = shift(@recv);
  my (%chunk, $faddr, $from, $baddr, $by);
  %chunk = ('^', split(/\s*\b(from|by|with|id|for|;)\b\s*/, lc $rhead));
  ($baddr, $by)   = recv_byhost($chunk{'by'});
  ($faddr, $from) = recv_fromhost($chunk{'from'});

  for $rhead (@recv) {
    my ($xfrom) = $from;
    %chunk = ('^', split(/\s*\b(from|by|with|id|for|;)\b\s*/, lc $rhead));
    ($baddr, $by) =   recv_byhost($chunk{'by'}, $faddr, $from);

    if ($baddr ne $faddr) {
      warn "mismatch: 'from $from' doesn't match 'by $by'\n";
      pm_host($xfrom);
      return $xfrom;
    }

    ($faddr, $from) = recv_fromhost($chunk{'from'})
      or do {
	warn "no host for $chunk{'from'}\n";
	pm_host($xfrom); return $xfrom;
      };

    print STDERR "$by <- $from\n";
  }

  # if we're here, everything checks out OK, and the last address is
  # the offending one.
  pm_host($from);
  $from;
}

# Remove duplicates from a sorted list of non-empty data.
sub uniq {
  my ($p) = '';
  grep( ((length($_) && ($_ ne $p)), $p=$_)[0], @_ );
}

# Output headers *and main body* for a mail message
sub print_header {
  my ($subdata) = $headers{'subject'};
  my ($subject) = ($subdata ? ('JUNK MAIL: ' . $subdata->[0]) : "JUNK MAIL\n");

  my (@notify) = uniq(sort @pm_hosts);
  my ($abuse) =      join(', ',
			  map(exists $spam{lc($_)}? $spam{lc($_)}: "abuse\@$_",
			      @notify));
  my ($rr) = $me;
  $rr =~ s/^.*<([^<>]+)>.*$/$1/g;

  if ($abuse_only) {
    if ($msn_is_evil) {
      printf OUTPUT
	<<'EndOfMsnNote', $rr, $me, $abuse, $subject;
Return-Receipt-To: %s
From: %s
To: %s
Subject: %s--------

Dear Person Who Has To Deal With Abuse Complaints,

I believe the following piece of unsolicited junk mail originated from
the Microsoft Network.  UUnet has so far not responded to my requests
for an appropriate complaint address for MS.UU.NET hosts, so I'm
sending this to you and you can forward it to the MSN people.

Thank you for your time.

--Albert Dvornik
  bert@mit.edu

EndOfMsnNote
#';
    } else {
      my $offhost = '';
      $offhost = " (host $offending)" if defined $offending;
      printf OUTPUT
	<<'EndOfAbuseNote', $rr, $me, $abuse, $subject, $offhost;
Return-Receipt-To: %s
From: %s
To: %s
Subject: %s--------

Dear Person Who Has To Deal With Abuse Complaints,

I hope that your organization does not condone sending unsolicited bulk
E-mail.  I believe the following piece of unsolicited junk mail originated
from your site%s.
If it did, I hope you will deal with its sender in a manner you consider
appropriate.

If the mail in question merely passed through your system, but the real
sender was not recorded in the mail headers, you may wish to alter your
system to provide this information.  If the mail did not pass through your
system, please let me know so I can figure out where it did come from.

Thank you for your time.

--Albert Dvornik
  bert@mit.edu

EndOfAbuseNote
    }
  } else {
    printf OUTPUT
      <<'EndOfFormat', $rr, $me, join(', ',@to), $abuse, $subject;
Return-Receipt-To: %s
From: %s
To: %s
Cc: %s
Subject: %s--------

Dear Junk Mailer/Mailess,

I earnestly hope that your access provider does not condone sending
unsolicited E-mail, and has yanked your account for spamming activities.
However, if they have not, this automated response script has several
things to tell you.

EndOfFormat

    if ($chainletter) {
      print OUTPUT <<'EndOfChain';
(1) Your letter is illegal in the United States and many other
    countries.  If in doubt, ask your lawyer about Title 18, United
    States Code, Section 1302 (the Postal Lottery Statute), or look at

        http://www.usps.gov/websites/depart/inspect/chainlet.htm

    In a further display of brilliance, you have listed your mailing
    address and your alleged earnings, so any recipient could forward
    them to your local law enforcement and tax collection agencies.  Thanks!
EndOfChain
    } else {
      print OUTPUT <<'EndOfBusiness';
(1) Sending me junk E-mail is an excellent way of making sure I never do
    business with your company.  Congratulations!
EndOfBusiness
    }

  print OUTPUT <<'EndOfRest';

(2) I demand to be taken off your junk mail list, since I do not welcome
    your solicitation.

(3) I would appreciate if you let me know what source you obtained the
    list from, since I would like to stop others from mistakenly assuming
    I enjoy reading their unsolicited advertising or other drivel.

The original mail has been appended below for postmasters' convenience.
Have a nice day.

--Albert Dvornik
  bert@mit.edu

EndOfRest
#'
  }
}

use vars '$quoted_here';

# Return true if specified header shouldn't be output in the written message.
sub hide_received {
  my ($header_name, $header_body) = @_;
  unless ($quoted_here) {
    my $domain = ($here =~ /^\./);
    $quoted_here = "\Q$here";		# quote regexp metacharacters
    $quoted_here = '(^|[\\s\\.])' . $quoted_here   # insure token boundary
      unless $domain;
  }
  ($header_name eq 'received') && ($header_body =~ /$quoted_here(\s|$)/io);
}

# Quote an original message body
sub print_original {
  my ($k, $hdr, $xhdr);

  print OUTPUT "------- Original message\n\n";

  for $k (@headers) {
    $hdr = shift(@{$headers{$k}});
    #($xhdr = $hdr) =~ s/\s+/ /;

    print OUTPUT $mixcase{$k}, ': ', $hdr
      unless ($suppress{$k} || hide_received($k, $hdr, $headers{$k}));
  }

  print OUTPUT "\n", @message;

  print OUTPUT "\n------- End of original message\n";
}

# Beat user soundly about the head and shoulders when [s]he's bad.
sub usage {
  my ($errmsg) = @_;
  $errmsg .= "\n" if defined($errmsg) and ($errmsg !~ /\n$/s);
  $errmsg = '' unless defined($errmsg);
  die $errmsg, <<"EndOfUsage";
usage:  $0 [options] message-file
options:
   -h , --here DOMAIN
	Set the description of "here" used for stripping local Received-To
	headers.  [default: 'MIT.EDU']
   -m , --outside
	Remove "Received: ... DOMAIN ..." as well as "... HOST.DOMAIN ..."
        headers from the message.
   -c , --chain
	The offending mail is a chain letter, not a business solicitation.
   -a , --abuse
	No user-serviceable addresses inside; only contact abuse+postmaster.
      --msn
        Bug UUnet about MSN addresses.
   -n , --display_only
        Generate a message, but only display it, do not send.
   -N , --dryrun
        Generate a message, then discard it without displaying or sending.
   -d , --debug
        Turn on debugging.
EndOfUsage
}
