#!/usr/athena/bin/perl -w
#
# $Id: rcvanal.pl,v 1.57 2017/10/24 10:54:11 jhawk Exp $
#
# Written by John Hawkinson <jhawk@MIT.EDU>, 2 November 2003
# This is in the Public Domain.
#
#
# script for analyzing Received: headers in a message.
# Keywords: perl received header analysis tool rfc2822 rfc822 smtp
#
# todo:
# Honor 2822 quotation syntax
#
#
use strict;

# Look for Date::Parse in the directory were run from.
use FindBin;
use lib "$FindBin::Bin/lib/perl5/site_perl";

use Date::Parse;
use POSIX;

my $debug = 0;

# From quickie strict-ification. Needs semantic sorting. XXX
my ($msgapprovaldate, $msgdate, $msgid, $rcvcount);
my (@msgdel, @msgrcv, @rcvby, @rcvfrom, @rcvtime, @rcvwith);
my (%hostcache);

# Increase human-readability of hostames; add mit-specific annotations.
#
# Parenthetical names generated from mit.edu MXs, full list at
# /afs/net/admin/hosts/etc/master/mit.edu.header (now fascist).
#
sub shortenhost {
        ($_, my $hint) = @_;

        do
	  { my $dbg= "shortenhost $_" .
            (defined($hint)?"hint $hint":"") . "\n";
	    print $dbg;
	} if (0); # debug
        # Try to limit the substitutions to FOO.mit.edu in case
        # some other site has a manawatu-mail-centre (year right).
        # Not good enough for "mit.edu" alone, though.
        if (s/\.mit\.edu\b//g || /^mit\.edu\b/ || /^barracuda.edu\b/) {
          # print "shortenhost2 $_\n"; # debug

          s/\(\[unix socket]\)/(unix)/i;
    
          s/^manawatu-mail-centre\b/mmc(outgoing)/i;
          s/^melbourne-city-street\b/mcs(outgoing)/i;
          s/^central-city-carrier-station\b/cccs(mailhub)/i;
          s/^grand-central-station\b/gcs(mailhub)/i;
          s/^fort-point-station/fps(dmz)/i;
          s/^pacific-carrier-annex\b/pca(dmz)/i;
          s/^biscayne-one-station\b/bIs(dmza)/i;
          s/^south-station-annex\b/ssa(dmz-outgoing)/i;
          s/^pch\b$/pch(mailman)/i;
# shortenhost mit.edu (W92-130-BARRACUDA-2.MIT.EDU [18.7.21.223])
          s/^(mit|barracuda).edu\s\((\S+)-\S+-(barracuda-\d)\S+\s.*\)/lc "mit($2-$3)"/ei;
# shortenhost mit.edu (Spam Firewall)
          #s/^mit.edu\s\(Spam Firewall\)/mit.edu(barracuda-?)/;
          if (/^(mit|barracuda).edu\s\(Spam Firewall\)/) {
              # The barracuda doesn't say which one it is, but we
              # probably know from the previous header (hint).
              if ($hint =~ /^(mit)\(\S+barracuda-\d\)$/) {
                  $_ = $hint;
                  s/(\d)\)$/$1?)/; # add a questionmark
              } else {
                  $_="mit.edu(barracuda-?)";
              }
          }
# shortenhost po12.mit.edu ([unix socket])
# shortenhost po12.mit.edu (Cyrus v2.1.5)
# shortenhost fort-point-station
# shortenhost po12.mit.edu (8.13.6/4.7)
          s/^(po\d+)\s\((\S+).*\)/$1($2)/i;
        } # end mit.edu block.

# shortenhost [198.116.61.253] (juno.usno.navy.mil [198.116.61.253]) hint
# shortenhost [198.116.61.253] hint [198.116.61.253]
        # If we have an IP address but also a domain name in parents, favor
        # the domain name. Also, save it for later use.
        if (m/^(\[\d+\.\d+\.\d+\.\d+])\s\((\S+)\s(\[\d+\.\d+\.\d+\.\d+])\)$/
           && ($1 eq $3)) {
              $hostcache{$1} = $2;
              $_ = $2;
        }

        # Received: from noao.edu ([140.252.1.54]) by ...
        # Save to cache
        if (m/^(\S+)\s\((\[\d+\.\d+\.\d+\.\d+])\)/) {
            # print "adding $2 as $1\n"; #debug
            $hostcache{$2} = $1;
        }

        # If we have only an IP address, check and see if we've previously
        # cached it.
        if (m/^(\[\d+\.\d+\.\d+\.\d+]).*/ && $hostcache{$1}) {
          $_ = $hostcache{$1};
        }

        # Received: from [10.0.1.22] (really [68.231.133.145])
        s/^(\[10\.\d+\.\d+\.\d+])\s\(really\s(\[\d+\.\d+\.\d+\.\d+])\)$/$2/;

# Received: from [10.0.0.11]
#   (host96-136-static.25-87-b.business.telecomitalia.it [87.25.136.96])
        s/^(\[10\.\d+\.\d+\.\d+])\s\((\S+)\s(\[\d+\.\d+\.\d+\.\d+])\)$/$2/;

        # Received: from localhost (HELO Charlotte.LoveWebWiz.Biz) (127.0.0.1)
        s/^localhost\s\(HELO\s(\S+)\).*/$1/;
        s/^unknown\s\(HELO\s(\S+)\).*/$1/;

        # Remove comments preceded by whitespace (i.e. those not added
        # by us).
        s/\s\(.*?\)//g;

        $_[0] = $_;
 }

# Parse the header types.
# X-Mailman-Approved-At: Fri, 04 Aug 2006 08:48:42 -0400
sub process {
    my ($line, $type) = @_;

  SWITCH: for ($type) {
# XXX we got the last resent-, not the first.
 ($_ eq "resent-date:")                   && do { $msgdate=$line; last; };
 ($_ eq "date:" && !$msgdate)             && do { $msgdate=$line; last; };
 ($_ eq "delivered-to:")                  && do { $msgdel[$rcvcount]=$line; last; };
 ($_ eq "resent-message-id:")             && do { $msgid=$line; last; };
 ($_ eq "message-id:" && !$msgid)         && do { $msgid=$line; last; };
 ($_ eq "received:")                      && do { $msgrcv[$rcvcount++]=$line; last; };
# wtf? pine does this?!
 ($_ eq "x-received:")                    && do { $msgrcv[$rcvcount++]=$line; last; };
 ($_ eq "x-mailman-approved-at:")         && do { $msgapprovaldate=$line; last; };
  }
}

sub printstats {
  my ($datetime, $approvaltime, $firsttime, $prevtime);
  my ($diff, $time);
  my ($i, $j);
  my ($from, $format);
  my (@formatwidths);

  if (defined($msgdate)) {
    @_ = split(/\s/, $msgdate, 2);
    $datetime = str2time($_[1]);
  }
  
  # xxx code duplication w/ above
  if (defined($msgapprovaldate)) {
    @_ = split(/\s/, $msgapprovaldate, 2);
    $approvaltime = str2time($_[1]);
  }
    
  
  #   Received: from mail.netbsd.org (mail.isc.netbsd.org [204.152.185.212])
  #           by pacific-carrier-annex.mit.edu (8.12.4/8.9.2) with SMTP id
  #           hA27x8I1003735
  #           for <jhawk@mit.edu>; Sun, 2 Nov 2003 02:59:08 -0500 (EST)
  #   Received: (qmail 3013 invoked by uid 1218); 2 Nov 2003 07:58:59 -0000
  #   Received: from cvs.isc.netbsd.org (HELO cvs.netbsd.org) (204.152.185.213)
  #     by mail.netbsd.org with SMTP; 2 Nov 2003 07:58:52 -0000   
  #
  for ($i=0; $i <= $#msgrcv; $i++) {
    my $x;
    my ($rawrcvdata, $rawrcvtime);
  
  # Parse Received headers, assuming the date is at the end, after a semicolon,
  # per RFC2822.
    $_ = $msgrcv[$i];
    @_ = split(/;/);
    if ($#_) {
      $rawrcvtime=pop;
      $rawrcvdata=join(";", @_);
    } else {
      # We did not successfully split! Alas! Perhaps:
      #Received: by Knoppix
      #        via sendmail from stdin
      #        id <m1G90uI-0000ZAC@Knoppix> (Debian Smail3.2.0.115)
      #        Fri, 4 Aug 2006 10:44:10 -0400 (EDT)
      s/\(\S+\)\s*$//;       # Remove a terminal parenthtical expression
      @_=split(/[\(\)]/);    # try everything after the remaining ) instead.
      if ($#_) { # is this the best test for success?
        $rawrcvtime=pop;               # xxx overwrites $_ in failure
        $rawrcvdata=$_;
      } else {
# Received: by filter0021p3las1.sendgrid.net with SMTP id
#  filter0021p3las1-32196-59E4675E-43        2017-10-16 08:01:34.935056102 +0000
#  UTC
# Received: from MTc5ODU4Nw (unknown [104.236.130.165])	by
#  ismtpd0012p1las1.sendgrid.net (SG) with HTTP id zu4FyZjOS7ipaJClvIt9WQ	Mon,
#  16 Oct 2017 08:01:34.908 +0000 (UTC)
	  ($rawrcvtime = $_) =~ s/.* with ...P id\s+\S+\s+//;
	  $rawrcvdata=$_;
      }
    }
 
    printf "1:$msgrcv[$i]\n" if $debug;
    printf "A:$_\n" if $debug; # why was this interesting?
    printf "2:$rawrcvtime\n" if $debug;
    $rcvtime[$i]=str2time($rawrcvtime);
    printf "3:$rcvtime[$i]\n" if $debug;
  

    # Parse source/dst name-val pairs.
    $_ = $rawrcvdata;

     # print "b: $_\n"; #debug
     s/(\S+)\s+(\([^)]+\))/$x="$1 $2"; ($x =~ s;[ \t]+;XXXSPACEXXX;g); $x/ge;

     # print "c: $_\n"; #debug

     # If we start with a comma, remove the space.
     s/^(\S+:)XXXSPACEXXX/$1 /;
    
     # my $debug=1 if /qmail/;
     # print "a: $_\n" if /qmail/; #debug
     s/^Received:\s+(.*);/$1/i;       # Punt headername
     s/by Google Production Router with/by Google_Production_Router with/;
     s/by Google Production with/by Google_Production with/;
     @_ = split(/\s+/);
     foreach (@_) {
       s/XXXSPACEXXX/ /g;
     }
     s/XXXSPACEXXX/ /g;
     $rcvfrom[$i] = $rcvby[$i] = $rcvwith[$i] = "";   # initialize
     # XXX: perl idiom
     for ($j=0; $j <= $#_; $j++) {
       # print "dbg: $j: $_[$j]\n"; # if ($debug);
       if ($_[$j] =~ /^from$/i && defined($_[$j+1])) {
         $rcvfrom[$i]=$_[$j+1];
         shortenhost($rcvfrom[$i]);
       } elsif ($_[$j] =~ /^by$/i && defined($_[$j+1])) {
         $rcvby[$i]=$_[$j+1];
         shortenhost($rcvby[$i],
           defined($rcvfrom[$i-1])?$rcvfrom[$i-1]:"" # hint
         );
       } elsif ($_[$j] =~ /^with$/i) {
         # print "with $_[$j+1]\n"; # debug
         $rcvwith[$i]=$_[$j+1];
         $rcvwith[$i] =~ s/\(.+\)//g; # Remove comments.
         }
     }
  
     ## Received: (qmail 4662 invoked from network); 6 Aug 2006 02:09:00
     #s/^\(([^)]*)\)/$1/;
  
     if (($rcvfrom[$i] eq "") && ($rcvby[$i] eq "") && ($rcvwith[$i] eq "")) {
       ($rcvfrom[$i] = $_); # msgrcv[$i]);
     }
   }
  
  
  # Summarize
  print $msgid, "\n" if defined($msgid);
  
  # Calculate field widths. Tricksy because of times.
  # This is apatheticaly ugly code. There is significant
  # code duplication from the main loop (subsequent),
  # and it just all come sout unfriendly. Quite possibly
  # this entire revision (1.27) should be reverted.
  #
  # FURTHERMORE, it's not even clear if changing the
  # field widths based on the data is a good idea -- after
  # all, perhaps someone will want to parse the output
  # format without expecting such unreasaonble variation
  # in the output! XXX!
  @formatwidths = (1, 10, 6, 25, 25);
  $prevtime=$datetime;
  for ($i=$#msgrcv; $i>=0; $i--) {
      my $x;
      if ($prevtime && $rcvtime[$i]) {
        $diff=$rcvtime[$i]-$prevtime;
      } else {
        $diff=-1;      # XXX use undef
      }
  
      next if (abs($diff) < 60);
  
      if ($rcvtime[$i]) {
        $time=$rcvtime[$i];
      } else {
        $time=0;
      }
  
      $x=ceil(log(abs($diff)/60)/log(10));
      if ($x > $formatwidths[2]-3) {
        $formatwidths[2] += ($x - ($formatwidths[2]-3));
      }
      if (($diff<0) && ($x == $formatwidths[2]-3)) {
        $formatwidths[2]++;
      }
  
      $prevtime=$time;
  }
  # Is this use of my even a good idea?
  {my @fw=@formatwidths; $format = "%$fw[0]s %$fw[1]s %$fw[2]s %-$fw[3]s %-$fw[4]s";}
  
  
  printf "$format%s\n",
      "#", "Raw time", "mmm:ss", "fromhost", "byhost", "with";
  
  if (defined($datetime)) {
      printf "$format\n",
          "D", $datetime, "---:--", "--", "Date header";
  }
  $prevtime=$datetime;
  
  for ($i=$#msgrcv; $i>=0; $i--) {
      if ($prevtime && $rcvtime[$i]) {
        $diff=$rcvtime[$i]-$prevtime;
      } else {
        $diff=undef }   # XXX use undef
  
      if (!defined($firsttime)) {
        if (defined($datetime)) { 
            $firsttime = $datetime;
        } else {
            $firsttime = $rcvtime[$i];
        }
      }
  
      if (defined($rcvfrom[$i])) {
        $from=$rcvfrom[$i];
      } elsif ($msgdel[$i]) {
        ($from=$msgdel[$i]) =~ s/Delivered-To: /(del.to): /;
      } else {
        $from="-";
      }
  
      if ($rcvtime[$i]) {
        $time=$rcvtime[$i];
      } else {
        $time=0;
      }
  
      
      # This is a bit kludgey, but print a note if there was a
      # moderation approval, and print some of the times. Perhaps
      # this should get shoved into @msgrcv or something? Currently
      # it only handles mailman.
      if (defined($approvaltime)) {
         if (($approvaltime > $prevtime) && ($approvaltime < $time)) {
             my ($time1, $time2);
             $time1 = $approvaltime - $prevtime;
             $time2 = $time - $approvaltime;
  
             printf "%$formatwidths[0]s %$formatwidths[1]d " .
                    "%$formatwidths[2]s MODERATION APPROVAL " .
                    "[%d:%02d (%s%d:%02d:%02d) + %d:%02d] %s\n",
               "M", $approvaltime, "---:--",
               $time1/60, fmod($time1,60), # XXX fmod but fix signs!
               $time1<1?"-":"",
               abs($time1)/3600, abs($time1)%3600/60, $time1%60,
               $time2/60, fmod($time2,60),
                "--------";
         }
      }
  
      printf "%$formatwidths[0]x %$formatwidths[1]d " .
      (!defined($diff)?
        "%$formatwidths[2].$formatwidths[2]s%0.0s ":
        "%" . ($formatwidths[2]-3) . "s:%02d ") .
      ($rcvby[$i]?"%-25.25s %-25.25s":(
        $msgdel[$i]?"[%23s] %s":"%-25.25s %-25.25s")) .
      "%-7.7s\n",
      $i, $time,
      !defined($diff)?
        "n/a":
        sprintf("%s%d", $diff<0?"-":"", abs($diff)/60),
      !defined($diff)?"":abs($diff)%60,
      $from,
      $rcvby[$i],
      $rcvwith[$i];
  
      $prevtime=$time;
  }
  
  $diff= $time - $firsttime;
# printf "  $diff= $time - $firsttime;\n";
  my $days = int(abs($diff)/3600/24);
  printf "Elapsed time %s%d:%02d (%s%s%d%s%02d%s%02d%s)%s.\n\n",
      $diff<0?"-":"",
      # All this abs() crap is to avoid the wrong thing
      # happening when we get negative times such that the modulus
      # operator returns a positive number. E.g. -6%60 => 54, not -6.
      # Is there a better way??
      abs($diff)/60, abs($diff)%60,
      $diff<1?"-":"",          ($days>0)?"$days day".($days>1?"s":"").": ":"",
      abs($diff)%86400/3600,   ($days>0)?"h ":":",
      abs($diff)%3600/60,      ($days>0)?"m ":":",
      abs($diff)%60,                ($days>0)?"s" :"",
      defined($approvaltime)?" includes mailman moderation":"";
}

sub resetstate {
    $rcvcount = 0;
    undef $msgdate; undef $msgid; undef $msgapprovaldate;
    undef @msgdel; undef @msgrcv; undef @rcvby; undef @rcvfrom; undef @rcvtime; undef @rcvwith;
    # undef %hostcache;
}


sub main {
  my ($inheaders, $line, $type);

  # instrumentation code...for mutt screen weirdness.
  #printf "Startup...\nline 2\n";
  #system("stty sane < /dev/tty");
  
  # Read in the message
  $rcvcount=0; $inheaders = 1;
  while (<>) {
    chop;
  
    # 2 states: 1) In headers 2) In body and looking for headers.
  
    # Look for the start of headers
    if (!$inheaders && /^[A-Z][\041-\071\073-\176]+:\s/) {
      $inheaders = 1;
      &resetstate;
    }
  
    # print "$inheaders $rcvcount $_\n"; # debug
  
    if ($inheaders) {
      # If we start with a non-colon printable...
  
      # RFC2822 Page 7
      # 2.2. Header Fields
      # 
      #    Header fields are lines composed of a field name, followed by a colon
      #    (":"), followed by a field body, and terminated by CRLF.  A field
      #    name MUST be composed of printable US-ASCII characters (i.e.,
      #    characters that have values between 33 and 126, inclusive), except
      #    colon.  A field body may be composed of any US-ASCII characters,
      #    except for CR and LF.  However, a field body may contain CRLF when
      #    used in header "folding" and  "unfolding" as described in section
      #    2.2.3.  All field bodies MUST conform to the syntax described in
      #    sections 3 and 4 of this standard.
      # 
      if (/^[\041-\071\073-\176]/) {
           # Process the saved line, if there is one.
           process($line, $type) if ($line);
           # Save the line for next time...
           $line=$_;
           @_ = split(' ');
           $type = lc $_[0];      # break out the type.
      }                   
      # Append continuation lines.        
      $line=$line . $_ if (/^\s+\S/);
      
      # Stop at a blank.
      if (/^$/) {
          process($line, $type) if ($line);
          &printstats() if $rcvcount;
          $inheaders = 0;
      }   
    }      
  }
}

&main();
