#!/usr/athena/bin/perl -w
#
# $Id: rcvanal.pl,v 1.41 2006/08/06 03:50:16 jhawk Exp $
#
# script for analyzing Received: headers in a message.
# Keywords: perl received header analysis tool rfc2822 rfc822 smtp
#
# todo:
# Honor 2822 quotation syntax
#
#
use strict;

# Look for Date::Parse in the directory were run from.
use FindBin;
use lib "$FindBin::Bin/lib/perl5/site_perl";

use Date::Parse;
use POSIX;

# From quickie strict-ification. Needs semantic sorting. XXX
my ($diff, $time, $i, $j, $prevwith, $prevtime, $msgdate,$msgid, $r,
    $firsttime);
my ($approvaldate, $line, $type, $datetime, $approvaltime, $from, $format);
my (@msgdel, @msgrcv, @rcvby, @rcvfrom, @rcvtime, @rcvwith);
my (@formatwidths);
my (%hostcache);

# Increase human-readability of hostames; add mit-specific annotations.
#
# Parenthetical names generated from mit.edu MXs, full list at
# /afs/net/admin/hosts/etc/master/mit.edu.header (now fascist).
#
sub shortenhost {
        ($_, my $hint) = @_;
        # print "shortenhost $_ hint $hint\n"; # debug
	# Try to limit the substitutions to FOO.mit.edu in case
	# some other site has a manawatu-mail-centre (year right).
	# Not good enough for "mit.edu" alone, though.
	if (s/\.mit\.edu\b//g || /^mit\.edu\b/) {
          # print "shortenhost2 $_\n"; # debug

          s/\(\[unix socket]\)/(unix)/i;
    
          s/^manawatu-mail-centre\b/mmc(outgoing)/i;
          s/^melbourne-city-street\b/mcs(outgoing)/i;
          s/^central-city-carrier-station\b/cccs(mailhub)/i;
          s/^grand-central-station\b/gcs(mailhub)/i;
          s/^fort-point-station/fps(dmz)/i;
          s/^pacific-carrier-annex\b/pca(dmz)/i;
          s/^biscayne-one-station\b/bIs(dmza)/i;
          s/^south-station-annex\b/ssa(dmz-outgoing)/i;
          s/^pch\b$/pch(mailman)/i;
# shortenhost mit.edu (W92-130-BARRACUDA-2.MIT.EDU [18.7.21.223])
          s/^mit.edu\s\((\S+)-\S+-(barracuda-\d)\S+\s.*\)/lc "mit($1-$2)"/ei;
# shortenhost mit.edu (Spam Firewall)
          #s/^mit.edu\s\(Spam Firewall\)/mit.edu(barracuda-?)/;
	  if (/^mit.edu\s\(Spam Firewall\)/) {
	      # The barracuda doesn't say which one it is, but we
	      # probably know from the previous header (hint).
	      if ($hint =~ /^mit\(\S+barracuda-\d\)$/) {
		  $_ = $hint;
		  s/(\d)\)$/$1?)/; # add a questionmark
	      } else {
		  $_="mit.edu(barracuda-?)";
	      }
	  }
# shortenhost po12.mit.edu ([unix socket])
# shortenhost po12.mit.edu (Cyrus v2.1.5)
# shortenhost fort-point-station
# shortenhost po12.mit.edu (8.13.6/4.7)
          s/^(po\d+)\s\((\S+).*\)/$1($2)/i;
        } # end mit.edu block.

# shortenhost [198.116.61.253] (juno.usno.navy.mil [198.116.61.253]) hint
# shortenhost [198.116.61.253] hint [198.116.61.253]
	# If we have an IP address but also a domain name in parents, favor
	# the domain name. Also, save it for later use.
        if (m/^(\[\d+\.\d+\.\d+\.\d+])\s\((\S+)\s(\[\d+\.\d+\.\d+\.\d+])\)$/
           && ($1 eq $3)) {
	      $hostcache{$1} = $2;
	      $_ = $2;
	}

	# Received: from noao.edu ([140.252.1.54]) by ...
	# Save to cache
	if (m/^(\S+)\s\((\[\d+\.\d+\.\d+\.\d+])\)/) {
	    # print "adding $2 as $1\n"; #debug
	    $hostcache{$2} = $1;
	}

	# If we have only an IP address, check and see if we've previously
	# cached it.
        if (m/^(\[\d+\.\d+\.\d+\.\d+]).*/ && $hostcache{$1}) {
	  $_ = $hostcache{$1};
        }

        # Received: from [10.0.1.22] (really [68.231.133.145])
        s/^(\[10\.\d+\.\d+\.\d+])\s\(really\s(\[\d+\.\d+\.\d+\.\d+])\)$/$2/;

# Received: from [10.0.0.11]
#   (host96-136-static.25-87-b.business.telecomitalia.it [87.25.136.96])
        s/^(\[10\.\d+\.\d+\.\d+])\s\((\S+)\s(\[\d+\.\d+\.\d+\.\d+])\)$/$2/;

	# Received: from localhost (HELO Charlotte.LoveWebWiz.Biz) (127.0.0.1)
	s/^localhost\s\(HELO\s(\S+)\).*/$1/;
	s/^unknown\s\(HELO\s(\S+)\).*/$1/;

        # Remove comments preceded by whitespace (i.e. those not added
        # by us).
	s/\s\(.*?\)//g;

        $_[0] = $_;
 }

# Parse the header types.
# X-Mailman-Approved-At: Fri, 04 Aug 2006 08:48:42 -0400
sub proc {
    my ($line, $type) = @_;

  SWITCH: for ($type) {
# XXX we got the last resent-, not the first.
 ($_ eq "resent-date:")                   && do { $msgdate=$line; last; };
 ($_ eq "date:" && !$msgdate)             && do { $msgdate=$line; last; };
 ($_ eq "delivered-to:")                  && do { $msgdel[$r]=$line; last; };
 ($_ eq "resent-message-id:")  		  && do { $msgid=$line; last; };
 ($_ eq "message-id:" && !$msgid)         && do { $msgid=$line; last; };
 ($_ eq "received:")                      && do { $msgrcv[$r++]=$line; last; };
# wtf? pine does this?!
 ($_ eq "x-received:")                    && do { $msgrcv[$r++]=$line; last; };
 ($_ eq "x-mailman-approved-at:")	  && do { $approvaldate=$line; last; };
  }
}

# Read in the headers
$r=0;
while (<>) {
    chop;
    do { proc($line, $type) if ($line);
	 $line=$_;
	 @_ = split(' ');
	 $type = lc $_[0]; }			if (/^[\041-\071\073-\176]/);
    $line=$line . $_ 				if (/^\s/);
    do { proc($line, $type) if ($line); last }	if (/^$/);
}

if (defined($msgdate)) {
  @_ = split(/\s/, $msgdate, 2);
  $datetime = str2time($_[1]);
}

# xxx code duplication w/ above
if (defined($approvaldate)) {
  @_ = split(/\s/, $approvaldate, 2);
  $approvaltime = str2time($_[1]);
}
  

#   Received: from mail.netbsd.org (mail.isc.netbsd.org [204.152.185.212])
#           by pacific-carrier-annex.mit.edu (8.12.4/8.9.2) with SMTP id
#           hA27x8I1003735
#           for <jhawk@mit.edu>; Sun, 2 Nov 2003 02:59:08 -0500 (EST)
#   Received: (qmail 3013 invoked by uid 1218); 2 Nov 2003 07:58:59 -0000
#   Received: from cvs.isc.netbsd.org (HELO cvs.netbsd.org) (204.152.185.213)
#     by mail.netbsd.org with SMTP; 2 Nov 2003 07:58:52 -0000   
#
for ($i=0; $i <= $#msgrcv; $i++) {
    my $x;

# Parse Received headers, assuming the date is at the end, after a semicolon,
# per RFC2822.
     $_ = $msgrcv[$i];
     @_ = split(/;/);
     if ($#_) { $_=$_[-1]; } else {
       # We did not successfully split! Alas! Perhaps:
       #Received: by Knoppix
       #        via sendmail from stdin
       #        id <m1G90uI-0000ZAC@Knoppix> (Debian Smail3.2.0.115)
       #        Fri, 4 Aug 2006 10:44:10 -0400 (EDT)
       s/\(\S+\)\s*$//;  	# Remove a terminal parenthtical expression
       @_=split(/[\(\)]/);	# try everything after the remaining ) instead.
       $_=$_[-1];		# xxx overwrites $_ in failure
   }

     # printf "1:$msgrcv[$i]\n2:$_\n";  # debug
     $rcvtime[$i]=str2time($_);
     # printf "3:$rcvtime[$i]\n"; # debug

# Parse source/dst name-val pairs.
     $_=$msgrcv[$i];
#     print "b: $_\n"; #debug
     s/(\S+)\s+(\([^)]+\))/$x="$1 $2"; ($x =~ s;[	 ]+;XXXSPACEXXX;g); $x/ge;
     # If we start with a comma, remove the space.
     s/^(\S+:)XXXSPACEXXX/$1 /;
  

     # my $debug=1 if /qmail/;
     # print "a: $_\n" if /qmail/; #debug
     s/^Received:\s+(.*);/$1/i;	# Punt headername
     s/by Google Production Router with/by Google_Production_Router with/;
     s/by Google Production with/by Google_Production with/;
     @_ = split(/\s+/);
     foreach (@_) {
       s/XXXSPACEXXX/ /g;
     }
     s/XXXSPACEXXX/ /g;
     $rcvfrom[$i] = $rcvby[$i] = $rcvwith[$i] = "";	# initialize
     # XXX: perl idiom
     for ($j=0; $j <= $#_; $j++) {
	 # print "dbg: $j: $_[$j]\n" if ($debug);
	 if ($_[$j] =~ /^from$/i) {
	     $rcvfrom[$i]=$_[$j+1];
	     shortenhost($rcvfrom[$i]);
	 } elsif ($_[$j] =~ /^by$/i) {
	     $rcvby[$i]=$_[$j+1];
	     shortenhost($rcvby[$i],
	       defined($rcvfrom[$i-1])?$rcvfrom[$i-1]:"" # hint
	     );
	 } elsif ($_[$j] =~ /^with$/i) {
	     # print "with $_[$j+1]\n"; # debug
	     $rcvwith[$i]=$_[$j+1];
	     $rcvwith[$i] =~ s/\(.+\)//g; # Remove comments.
	     }
     }

     ## Received: (qmail 4662 invoked from network); 6 Aug 2006 02:09:00
     #s/^\(([^)]*)\)/$1/;

     if (($rcvfrom[$i] eq "") && ($rcvby[$i] eq "") && ($rcvwith[$i] eq "")) {
 	 ($rcvfrom[$i] = $_); # msgrcv[$i]);
     }
 }


# Summarize
print $msgid, "\n" if defined($msgid);

# Calculate field widths. Tricksy because of times.
# This is apatheticaly ugly code. There is significant
# code duplication from the main loop (subsequent),
# and it just all come sout unfriendly. Quite possibly
# this entire revision (1.27) should be reverted.
#
# FURTHERMORE, it's not even clear if changing the
# field widths based on the data is a good idea -- after
# all, perhaps someone will want to parse the output
# format without expecting such unreasaonble variation
# in the output! XXX!
@formatwidths = (1, 10, 6, 25, 25);
$prevtime=$datetime;
for ($i=$#msgrcv; $i>=0; $i--) {
    my $x;
    if ($prevtime && $rcvtime[$i]) {
	$diff=$rcvtime[$i]-$prevtime;
    } else {
	$diff=-1 }	# XXX use undef

    next if (abs($diff) < 60);

    if ($rcvtime[$i]) {
	$time=$rcvtime[$i];
    } else {
	$time=0;
    }

    $x=ceil(log(abs($diff)/60)/log(10));
    if ($x > $formatwidths[2]-3) {
	$formatwidths[2] += ($x - ($formatwidths[2]-3));
    }
    if (($diff<0) && ($x == $formatwidths[2]-3)) {
      $formatwidths[2]++;
    }

    $prevtime=$time;
}
# Is this use of my even a good idea?
{my @fw=@formatwidths; $format = "%$fw[0]s %$fw[1]s %$fw[2]s %-$fw[3]s %-$fw[4]s";}


printf "$format%s\n",
    "#", "Raw time", "mmm:ss", "fromhost", "byhost", "with";

if (defined($datetime)) {
    printf "$format\n",
        "D", $datetime, "---:--", "--", "Date header";
}
$prevtime=$datetime;

for ($i=$#msgrcv; $i>=0; $i--) {
    if ($prevtime && $rcvtime[$i]) {
	$diff=$rcvtime[$i]-$prevtime;
    } else {
	$diff=undef }	# XXX use undef

    if (!defined($firsttime)) {
	if (defined($datetime)) { 
	    $firsttime = $datetime;
	} else {
	    $firsttime = $rcvtime[$i];
	}
    }

    if (defined($rcvfrom[$i])) {
	$from=$rcvfrom[$i];
    } elsif ($msgdel[$i]) {
	($from=$msgdel[$i]) =~ s/Delivered-To: /(del.to): /;
    } else {
	$from="-";
    }

    if ($rcvtime[$i]) {
	$time=$rcvtime[$i];
    } else {
	$time=0;
    }

    
    # This is a bit kludgey, but print a note if there was a
    # moderation approval, and print some of the times. Perhaps
    # this should get shoved into @msgrcv or something? Currently
    # it only handles mailman.
    if (defined($approvaltime)) {
       if (($approvaltime > $prevtime) && ($approvaltime < $time)) {
           my ($time1, $time2);
           $time1 = $approvaltime - $prevtime;
           $time2 = $time - $approvaltime;

           printf "%$formatwidths[0]s %$formatwidths[1]d " .
                  "%$formatwidths[2]s MODERATION APPROVAL " .
                  "[%d:%02d (%s%d:%02d:%02d) + %d:%02d] %s\n",
             "M", $approvaltime, "---:--",
             $time1/60, fmod($time1,60), # XXX fmod but fix signs!
             $time1<1?"-":"",
             abs($time1)/3600, abs($time1)%3600/60, $time1%60,
             $time2/60, fmod($time2,60),
              "--------";
       }
    }

    printf "%$formatwidths[0]x %$formatwidths[1]d " .
    (!defined($diff)?
      "%$formatwidths[2].$formatwidths[2]s%0.0s ":
      "%" . ($formatwidths[2]-3) . "s:%02d ") .
    ($rcvby[$i]?"%-25.25s %-25.25s":(
      $msgdel[$i]?"[%23s] %s":"%-25.25s %-25.25s")) .
    "%-7.7s\n",
    $i, $time,
    !defined($diff)?
      "n/a":
      sprintf("%s%d", $diff<0?"-":"", abs($diff)/60),
    !defined($diff)?"":abs($diff)%60,
    $from,
    $rcvby[$i],
    $rcvwith[$i];

    $prevtime=$time;
}

$diff= $time - $firsttime;
printf "\nElapsed time %d:%02d (%s%d:%02d:%02d).\n",
    $diff/60, $diff%60,
    # All this abs() crap is to avoid the wrong thing
    # happening when we get negative times that are more
    # than 60 seconds. Surely there's a better way? (??)
    $diff<1?"-":"",
    abs($diff)/3600, abs($diff)%3600/60, $diff%60;
