# log.pl -- tool for displaying stats from the server log
# Albert Dvornik (bert@athena.mit.edu)
#
# $Id: log-parser.pl,v 1.3 93/06/10 07:56:31 bert Exp Locker: bert $
#

# Register the 'log-hosts' entry in %map.  The value is a single-quoted
# string of Perl source, so $rest and $query are NOT interpolated here.
# NOTE(review): presumably the server evals this string to route a request
# to &do_log -- confirm against the code that consumes %map.
$map{'log-hosts'} = '&do_log("$rest?$query")';

require 'ctime.pl';
require 'timelocal.pl';

# do_log($args) -- entry point for the server-log statistics page.
#
# $args is the request string; the text after the first '?' selects the
# query handled by the dispatch table below.  All working state is declared
# with local() here so that the helper subs defined further down in this
# file can reach it through dynamic scoping.
#
# The parser is currently switched off: the sub prints an apology page and
# returns before the dispatch table is reached.  The table is kept in place
# so the feature can be re-enabled by deleting the early return.
sub do_log {
    local($args) = @_;
    local($hmin,$lmin,$mmin,$imin,$fmin,$dmin) = (1,1,1,1,10,1); # minimum values for an entry to be displayed

    local($time_out)=30;        # watchdog limit, in seconds

    local(*LOG);                # a hack to localize filehandles
    local(%hstat,%lhstat,%fstat,%weird_cmds,%ipstat,%mitstat,%date,@wday,@daytime,@servlog);
    local($restart,$k,$count,$clist,$query,$lno,$logtime,$itime);

    # month / weekday abbreviation -> zero-based index
    local(%unmonth) = ("Jan",0,"Feb",1,"Mar",2,"Apr",3,"May",4,"Jun",5,"Jul",6,
                       "Aug",7,"Sep",8,"Oct",9,"Nov",10,"Dec",11);

    local(%unday) = ("Sun",0,"Mon",1,"Tue",2,"Wed",3,"Thu",4,"Fri",5,"Sat",6);

    local($gmtdiff) = (4);      # hours subtracted from GMT log times
    $ENV{"TZ"}="EDT";

    alarm($time_out);

    print <<"EndOfCrock";
<H1>Sorry...</H1>
The logfile parser is, sadly, unavailable until we switch over to
Plexus 3.0, which will hopefully be soon.<p>
(No, it's not Matt's fault any more.  Sorry, Matt... =)<p>
<address>--bert</address>
EndOfCrock

    # Disarm the watchdog before leaving.  Without this the alarm armed
    # above stays pending after we return, because the alarm(0) at the end
    # of the sub is never reached on this path.
    alarm(0);
    return;

    # ---- everything below is unreachable while the early return is in ----
    # Dispatch table: each line tests $query and, on a match, runs the
    # counting pass, the page header, the formatter and the footer, then
    # uses `next' to leave this bare block.
    {
        ($args =~ m,\?(.+),) || ($query="info", &head, &info, next);
        $query=$1;

        # optional "+n" suffix: minimum count for list-style queries
        ($query =~ s/\+(.*)$//) &&
            ((($count=$1) <= 0) &&
             &error('bad_request',"log?$query+$count: argument after '+' must be a positive number"));

        ($query eq "all") && &error('not_implemented','This query has been discontinued.  Use individual queries.');
        ($query eq "hosts") &&  ($hmin=($count?$count:$hmin), &count_hosts,  &head, &hosts,  &foot, next);
        ($query eq "long") &&   ($lmin=($count?$count:$lmin), &count_long,   &head, &long,   &foot, next);
        ($query eq "files") &&  ($fmin=($count?$count:$fmin), &count_files,  &head, &files,  &foot, next);
        ($query eq "net") &&    ($imin=($count?$count:$imin), &count_net,    &head, &net,    &foot, next);
        ($query eq "mitnet") && ($mmin=($count?$count:$mmin), &count_mitnet, &head, &mitnet, &foot, next);
        ($query eq "date") &&   ($dmin=($count?$count:$dmin), &count_days,   &head, &date,   &foot, next);

        # the remaining query types do not accept a "+n" suffix
        $count && &error('bad_request',"log?$query+$count: unknown query type");

        ($query eq "info") &&                                (&head, &info, next);
        ($query eq "weird") &&                               (&count_files,  &head, &weird,  &foot, next);
        ($query eq "status") &&                              (&count_status, &head, &status, &foot, next);
        ($query eq "week") &&                                (&count_week,   &head, &week,   &foot, next);

        ($query =~ /^\//) &&                                 (&count_files,  &head, &file_query,   &foot, next);
        ($query =~ /^18\.[0-9]+\.[0-9]+\.\*$/) &&            (&count_mitnet, &head, &mitnet_query, &foot, next);
        ($query =~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/) &&    (&count_long,   &head, &ip_query,     &foot, next);
        ($query =~ /^[0-9]+\.[0-9]+\.[0-9]+\.\*$/) &&        (&count_net,    &head, &net_query,    &foot, next);
        ($query =~ /^[0-9]+\.[0-9]+\.\*\.\*$/) &&            (&count_net,    &head, &net_query,    &foot, next);
        ($query =~ /^[0-9]+\.\*\.\*\.\*$/) &&                (&count_net,    &head, &net_query,    &foot, next);
        ($query =~ /^[a-zA-Z0-9\.\-\_]+$/) &&                (&count_long,   &head, &host_query,   &foot, next);

        &error('bad_request',"log?$query: unknown query type");
    }

    alarm(0);



    # servtime($stamp) -- convert a log timestamp to epoch seconds.
    #
    # $stamp is split on whitespace into six fields: weekday, month
    # abbreviation, day of month, hh:mm:ss, zone and four-digit year.
    # The month abbreviation is looked up in %unmonth.  A zone of "GMT" is
    # converted with &timegm, anything else with &timelocal.
    # Returns the value produced by that conversion routine.
    sub servtime {
        # $_[0], not the one-element slice @_[0]: a single scalar is wanted
        local($wdays,$mons,$mday,$hms,$zone,$year) = split(/\s+/,$_[0]);
        local($hour,$min,$sec) = split(/:/,$hms);

        return ($zone eq "GMT")
            ? &timegm($sec,$min,$hour,$mday,$unmonth{$mons},$year-1900)
            : &timelocal($sec,$min,$hour,$mday,$unmonth{$mons},$year-1900);
    }

    # daytime -- fill @daytime with the number of hours of each weekday
    # (index 0 = Sunday .. 6 = Saturday) that fall between the start of the
    # log and now.
    #
    # Reads:  $logtime (timestamp of the first log line), $itime (the same
    #         instant as epoch seconds), %unday, $gmtdiff.
    # Writes: @daytime.
    sub daytime {
        local($wdays,$mons,$mday,$hms,$zone,$year) = split(/\s+/,$logtime);
        local($hour,$min,$sec) = split(/:/,$hms);
        local($wd) = $unday{$wdays};

        # Shift GMT stamps by the same offset count_week uses ($gmtdiff)
        # instead of a second hard-coded copy of that number.
        if ($zone eq "GMT") {$hour -= $gmtdiff;}
        # the shift moved us into the previous day
        if ($hour < 0) {$hour += 24; $wd = ($wd+6)%7;}

        local($dtime) = 86400 - (3600*$hour + 60*$min + $sec); # seconds left in the first day
        local($ctime) = time - $itime;                         # seconds covered by the log

        @daytime=(0,0,0,0,0,0,0);
        $daytime[$wd] = $dtime/3600;

        # credit whole days until we have walked past "now" ...
        while ($dtime < $ctime) {
            $wd = ($wd+1)%7;
            $daytime[$wd] += 24;
            $dtime += 86400;
        }
        # ... then take back the part of the last day that has not happened yet
        $daytime[$wd] -= ($dtime - $ctime)/3600;
    }

    # initcount -- open the server log on LOG and consume its first line.
    #
    # The first line must be a "----Server #N on port P started at <time>"
    # banner.  On success this sets $lno to 1, stores the banner as the
    # only element of @servlog, sets both $restart and $logtime to the
    # banner's timestamp text and $itime to &servtime of that text.
    # Problems are reported through &error with the 'internal_error' code,
    # the same code every other log-format check in this file uses.
    sub initcount {
        open(LOG, $http_log) || &error("internal_error","Can't find $http_log!");
        $lno=1;
        @servlog=($_=<LOG>);
        /^----Server #[0-9]+ on port [0-9]+ started at (.+)$/
            || &error("internal_error","line $lno in $http_log is misformatted:<p>'$_'");
        $restart=$logtime=$1;
        $itime = &servtime($logtime);
    }

#
# counts are done separately to prevent lossage and high loads
#

    # count_hosts -- tally requests per client into %hstat.
    # Hosts named <label>.MIT.EDU (any case) share the single key
    # "*.MIT.EDU"; every other host is keyed as "hostname (ip)" with the
    # hostname lower-cased.  "----" lines are collected in @servlog and a
    # server-start banner updates $restart.
    sub count_hosts {
        local($host,$ip);

        &initcount;
        while (<LOG>) {
            $lno++;
            if (/^----/) {
                if (/^----Server #[0-9]+ on port [0-9]+ started at (.+)$/) {
                    $restart = $1;
                }
                push(@servlog, $_);
                next;
            }
            unless (/^(\S*)\s+\((\S+)\s*\)/) {
                &error("internal_error","line $lno in $http_log is misformatted:<p>'$_'");
            }
            $host = $1;
            $ip = $2;
            $host =~ tr/A-Z/a-z/;
            if ($host =~ /^[^\.]+\.MIT\.EDU$/i) {
                $hstat{"*.MIT.EDU"}++;
            }
            else {
                $hstat{"$host ($ip)"}++;
            }
        }
        close(LOG);
    }

    # count_long -- tally requests per client into %lhstat, one key per
    # "hostname (ip)" pair (hostname lower-cased, no *.MIT.EDU folding).
    # "----" lines go to @servlog; a server-start banner updates $restart.
    sub count_long {
        local($host,$ip);

        &initcount;
        while (<LOG>) {
            $lno++;
            if (/^----/) {
                $restart = $1
                    if (/^----Server #[0-9]+ on port [0-9]+ started at (.+)$/);
                push(@servlog, $_);
                next;
            }
            &error("internal_error","line $lno in $http_log is misformatted:<p>'$_'")
                unless (/^(\S*)\s+\((\S+)\s*\)/);
            ($host = $1) =~ tr/A-Z/a-z/;
            $ip = $2;
            $lhstat{"$host ($ip)"}++;
        }
        close(LOG);
    }

    # count_files -- tally requested paths into %fstat and non-GET
    # commands into %weird_cmds.
    #
    # The text after the closing ')' of each request line is split on
    # whitespace; fields 6 and 7 are taken as the command and the path.
    # For GET requests everything from the FIRST '?' onward is dropped, so
    # "/path?query" (and "/path?a?b") is counted under "/path".
    # "----" lines go to @servlog; a server-start banner updates $restart.
    sub count_files {
        local($cmd,$file);

        &initcount;
        while (<LOG>) {
            $lno++;
            if (/^----/) {
                /^----Server #[0-9]+ on port [0-9]+ started at (.+)$/ && ($restart=$1);
                push(@servlog,$_); next;
            }
            /\)\s+(\S.+)$/
                || &error("internal_error","line $lno in $http_log is misformatted:<p>'$_'");
            ($cmd,$file) = (split(/\s+/,$1))[6,7];
            if ($cmd eq "GET") {
                # cut at the first '?', not the last one
                $file =~ s/\?.*$//;
                $fstat{$file}++;
            } else {
                $weird_cmds{$cmd}++;
            }
        }
        close(LOG);
    }

    # count_net -- tally requests per client IP network into %ipstat.
    # The parenthesised address on each request line is bucketed by its
    # first octet: below 128 as "a.*.*.*", below 192 as "a.b.*.*",
    # everything else as "a.b.c.*".
    # "----" lines go to @servlog; a server-start banner updates $restart.
    sub count_net {
        local($ip,@tmp);

        &initcount;
        while (<LOG>) {
            $lno++;
            if (/^----/) {
                if (/^----Server #[0-9]+ on port [0-9]+ started at (.+)$/) {
                    $restart = $1;
                }
                push(@servlog, $_);
                next;
            }
            unless (/\((\S+)\s*\)/) {
                &error("internal_error","line $lno in $http_log is misformatted:<p>'$_'");
            }
            $ip = $1;
            @tmp = split(/\./, $ip);
            if ($tmp[0] < 128) {                        # class A networks
                $ipstat{"$tmp[0].*.*.*"}++;
            }
            elsif ($tmp[0] < 192) {                     # class B networks
                $ipstat{"$tmp[0].$tmp[1].*.*"}++;
            }
            else {                                      # class C (and lower) networks
                $ipstat{"$tmp[0].$tmp[1].$tmp[2].*"}++;
            }
        }
        close(LOG);
    }

    # count_mitnet -- tally requests per "18.b.c.*" subnet into %mitstat.
    # Only addresses whose first octet equals 18 are counted.
    # "----" lines go to @servlog; a server-start banner updates $restart.
    sub count_mitnet {
        local($ip,@tmp);

        &initcount;
        while (<LOG>) {
            $lno++;
            if (/^----/) {
                $restart = $1
                    if (/^----Server #[0-9]+ on port [0-9]+ started at (.+)$/);
                push(@servlog, $_);
                next;
            }
            &error("internal_error","line $lno in $http_log is misformatted:<p>'$_'")
                unless (/\((\S+)\s*\)/);
            $ip = $1;
            @tmp = split(/\./, $ip);
            if ($tmp[0] == 18) {
                $mitstat{"$tmp[0].$tmp[1].$tmp[2].*"}++;
            }
        }
        close(LOG);
    }

    # count_status -- walk the whole log collecting only the "----" status
    # lines into @servlog (a server-start banner also updates $restart).
    # Request lines are merely counted in $lno.
    sub count_status {
        &initcount;
        while (<LOG>) {
            $lno++;
            next unless (/^----/);
            if (/^----Server #[0-9]+ on port [0-9]+ started at (.+)$/) {
                $restart = $1;
            }
            push(@servlog, $_);
        }
        close(LOG);
    }

    # count_week -- tally requests per weekday into @wday
    # (index 0 = Sunday .. 6 = Saturday, via %unday).
    # A request stamped GMT whose time field compares numerically below
    # $gmtdiff is credited to the previous weekday.
    # "----" lines go to @servlog; a server-start banner updates $restart.
    sub count_week {
        local($slot);

        &initcount;
        @wday = (0,0,0,0,0,0,0);
        while (<LOG>) {
            $lno++;
            if (/^----/) {
                if (/^----Server #[0-9]+ on port [0-9]+ started at (.+)$/) {
                    $restart = $1;
                }
                push(@servlog, $_);
                next;
            }
            unless (/\)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s/) {
                &error("internal_error","line $lno in $http_log is misformatted:<p>'$_'");
            }
            if (($5 eq "GMT") && ($4 < $gmtdiff)) {
                $slot = ($unday{$1}+6)%7;       # belongs to the day before
            }
            else {
                $slot = $unday{$1};
            }
            $wday[$slot]++;
        }
        close(LOG);
    }

    # count_days -- tally requests per calendar date into %date.
    # The key is "<weekday> <month> <day> <year>" exactly as those fields
    # appear in the log line, so dates are in whatever zone the server
    # logged them in.
    # "----" lines go to @servlog; a server-start banner updates $restart.
    #
    # This pass does not use any hour/minute/second values and does not
    # touch @wday (which belongs to count_week), so neither is declared
    # or reset here.
    sub count_days {
        &initcount;
        while (<LOG>) {
            $lno++;
            if (/^----/) {
                /^----Server #[0-9]+ on port [0-9]+ started at (.+)$/ && ($restart=$1);
                push(@servlog,$_); next;
            }
            /\)\s+(\S+\s+\S+\s+\S+)\s+\S+\s+\S+\s+(\S+)\s/
                || &error("internal_error","line $lno in $http_log is misformatted:<p>'$_'");
            $date{"$1 $2"}++;
        }
        close(LOG);
    }

#
# global beginning, ending, and stat page
#

    # head -- print the page title and main heading; for every query
    # except "info" also print the time span covered by the log
    # ($logtime up to the current time as formatted by &ctime).
    sub head {
        local($title) = "WWW.MIT.EDU server usage information ($query) ";

        print "<ISINDEX><TITLE>" . $title . "</TITLE>\n";
        print "<H1>WWW.MIT.EDU Server Usage</H1>\n";
        if ($query ne "info") {
            print "<H3>logfile span: $logtime - " . &ctime(time) . "</H3>\n";
        }
    }

    # foot -- page footer; it consists solely of the server statistics
    # block printed by &srvstat.
    sub foot {
        &srvstat;
    }

    # srvstat -- print overall request statistics: the total number of
    # log lines ($lno), the average request rate since the log started
    # ($itime) and the time of the last server restart ($restart).
    #
    # The elapsed time is clamped to at least one second so that a log
    # whose first line is stamped "now" (or slightly in the future)
    # neither divides by zero nor reports negative rates.
    sub srvstat {
        local($elapsed) = time - $itime;
        $elapsed = 1 if ($elapsed < 1);
        local($dens) = $lno/$elapsed;           # requests per second
        print "<H2>Statistics since ".&ctime($itime)."</H2><P>\n";
        print "Total: $lno requests<P>\n";
        printf "Average %5.2f requests/hour (%4.2f per minute, %6.4f per second... =)<P>\n",3600*$dens,60*$dens,$dens;
        print "Server last restarted: ".&ctime(&servtime($restart))."<P>\n";
    }

#
# various queries on the counted data
#

    # file_query -- report how often the path in $query was requested
    # (looked up in %fstat).  When the count is missing or zero, say so
    # and fall back to the general &info page.
    sub file_query {
        $k = $fstat{$query};
        if (!$k) {
            print "No matching files found.<P>\n";
            &info;
        }
        else {
            print "<H2>file $query</H2>\n";
            print "file has been requested $k times.<P>\n";
        }
    }

    # net_query -- report the request total for the network pattern in
    # $query (looked up in %ipstat).  When the count is missing or zero,
    # say so and fall back to the general &info page.
    sub net_query {
        $k = $ipstat{$query};
        if (!$k) {
            print "No matching networks found.<P>\n";
            &info;
        }
        else {
            print "<H2>net $query</H2>\n";
            print "hosts on this net have originated $k requests.<P>\n";
        }
    }

    # mitnet_query -- report the request total for the "18.b.c.*" subnet
    # in $query (looked up in %mitstat).  When the count is missing or
    # zero, say so and fall back to the general &info page.
    sub mitnet_query {
        $k = $mitstat{$query};
        if (!$k) {
            print "No matching subnets found.<P>\n";
            &info;
        }
        else {
            print "<H2>subnet $query</H2>\n";
            print "hosts on this subnet have originated $k requests.<P>\n";
        }
    }

    # host_query -- report request totals for the host named in $query.
    #
    # $query is lower-cased in place, then matched against the
    # "hostname (ip)" keys of %lhstat: a key matches when its hostname
    # part is exactly $query or exactly "$query.mit.edu".  If nothing
    # matches, say so and fall back to the general &info page.
    #
    # The query text is backslash-escaped before it goes into the
    # pattern, and the dots of ".mit.edu" are escaped too, so a '.' in a
    # hostname matches only a literal dot and not any character.
    sub host_query {
        local($disp,$pat);
        $query =~ tr/A-Z/a-z/;
        ($pat = $query) =~ s/(\W)/\\$1/g;       # quote regex metacharacters
        for $k (grep((/^$pat / || /^$pat\.mit\.edu /), keys %lhstat)) {
            print "<H2>host $k</H2>\n";
            print "site has originated ",$lhstat{$k}," requests.<P>\n";
            $disp++;
        }
        unless ($disp) {
            print "No matching hosts found.<P>\n";
            &info;
        }
    }

    # ip_query -- report request totals for the IP address in $query.
    #
    # Every "hostname (ip)" key of %lhstat whose parenthesised part is
    # exactly $query is listed.  If nothing matches, say so and fall back
    # to the general &info page.
    #
    # The address is backslash-escaped before it goes into the pattern so
    # that its dots match only literal dots.
    sub ip_query {
        local($disp,$pat);
        ($pat = $query) =~ s/(\W)/\\$1/g;       # quote regex metacharacters
        for $k (grep(/\($pat\)/, keys %lhstat)) {
            print "<H2>host $k</H2>\n";
            print "site has originated ",$lhstat{$k}," requests.<P>\n";
            $disp++;
        }
        unless ($disp) {
            print "No matching hosts found.<P>\n";
            &info;
        }
    }

#
# other formatting filters
#

    # hosts -- print the condensed client-host table from %hstat, busiest
    # first, listing only entries with at least $hmin requests.  The
    # summary line gives listed-vs-total entry counts ($clist, $count),
    # followed by a link to the long list unless $show{'long'} is set.
    sub hosts {
        local($ending);

        print "<H2>Client Hosts (condensed)</H2><P>\n";
        print "<PRE>#requests  hostname\n-----------------------\n";
        $clist = 0;
        $count = 0;
        for $k (sort {$hstat{$b} <=> $hstat{$a}} keys %hstat) {
            $count++;
            next if ($hstat{$k} < $hmin);
            printf "%9d  %s\n", $hstat{$k}, $k;
            $clist++;
        }
        if ($clist == 1) {
            $ending = "y";
        }
        else {
            $ending = "ies";
        }
        print "\ntotal: $clist entr" . $ending . " listed (out of $count)</PRE><P>\n";
        if (!$show{'long'}) {
            print "Click here for <A HREF=\"log?long+$hmin\">long</A> list.<P>\n";
        }
    }

    # long -- print the full client-host table from %lhstat, busiest
    # first, listing only hosts with at least $lmin requests, then a
    # summary of listed-vs-total host counts ($clist, $count).
    sub long {
        local(@ranked,$plural);

        print "<H2>Client Hosts (long list)</H2><P>\n";
        print "<PRE>#requests  hostname\n-----------------------\n";
        @ranked = sort {$lhstat{$b} <=> $lhstat{$a}} keys %lhstat;
        $count = @ranked;               # every key is visited once
        $clist = 0;
        for $k (@ranked) {
            next unless ($lhstat{$k} >= $lmin);
            printf "%9d  %s\n", $lhstat{$k}, $k;
            $clist++;
        }
        $plural = ($clist == 1) ? "" : "s";
        print "\ntotal: $clist host" . $plural . " listed (out of $count)</PRE><P>\n";
    }

    # net -- print the client-network table from %ipstat, busiest first,
    # listing only networks with at least $imin requests, then a summary
    # of listed-vs-total network counts ($clist, $count).
    sub net {
        local($plural);

        print "<H2>Client IP Networks</H2><P>\n";
        print "<PRE>#requests  network\n-----------------------\n";
        $clist = 0;
        $count = 0;
        for $k (sort {$ipstat{$b} <=> $ipstat{$a}} keys %ipstat) {
            $count++;
            next if ($ipstat{$k} < $imin);
            printf "%9d  %s\n", $ipstat{$k}, $k;
            $clist++;
        }
        $plural = "s";
        $plural = "" if ($clist == 1);
        print "\ntotal: $clist net" . $plural . " listed (out of $count)</PRE><P>\n";
    }

    # mitnet -- print the MIT subnet table from %mitstat, busiest first,
    # listing only subnets with at least $mmin requests, then a summary
    # of listed-vs-total subnet counts ($clist, $count).
    sub mitnet {
        local($plural);

        print "<H2>Client Subnets on MITnet</H2><P>\n";
        print "<PRE>#requests  MIT subnet\n-----------------------\n";
        ($count, $clist) = (0, 0);
        for $k (sort {$mitstat{$b} <=> $mitstat{$a}} keys %mitstat) {
            $count++;
            if ($mitstat{$k} < $mmin) {
                next;
            }
            printf "%9d  %s\n", $mitstat{$k}, $k;
            $clist++;
        }
        if ($clist == 1) {
            $plural = "";
        }
        else {
            $plural = "s";
        }
        print "\ntotal: $clist subnet" . $plural . " listed (out of $count)</PRE><P>\n";
    }

    # files -- print the requested-path table from %fstat, busiest first,
    # listing only paths with at least $fmin requests, then a summary of
    # listed-vs-total path counts ($clist, $count).
    sub files {
        print "<H2>File transfers</H2><P>\n";
        print "<PRE>#requests  pathname\n-----------------------\n";
        # Reset BOTH counters, like the other table printers do; with
        # $clist left unset the summary read "total:  files listed" when
        # nothing reached the threshold.
        $count=0; $clist=0;
        for $k (sort {$fstat{$b} <=> $fstat{$a}} keys %fstat) {
            if ($fstat{$k} >= $fmin) {
                printf "%9d  %s\n",$fstat{$k},$k;
                $clist++;
            }
            $count++;
        }
        print "\ntotal: $clist file".(($clist==1)?"":"s")." listed (out of $count)</PRE><P>\n";
    }

    # weird -- print every non-GET command seen in the log together with
    # its count, taken from %weird_cmds (in hash order).  $clist is
    # bumped once per command printed.
    sub weird {
        print "<H2>Weird server commands</H2><P>\n";
        print "<PRE>";
        for $k (keys %weird_cmds) {
            $clist++;
            printf "%4d '%s'\n", $weird_cmds{$k}, $k;
        }
        print "</PRE><P>\n";
    }

    # status -- print the collected server status lines (@servlog)
    # verbatim inside a <PRE> block.
    sub status {
        local($body) = join("", @servlog);

        print "<H2>Server status log</H2><P>\n";
        print "<PRE>" . $body . "</PRE><P>\n";
    }

    # date -- print the per-date request table from %date, busiest first,
    # listing only dates with at least $dmin requests, then a summary of
    # listed-vs-total date counts ($clist, $count).
    sub date {
        print "<H2>Top transfer days (GMT)</H2><P>\n";
        print "<PRE>#requests  date\n-----------------------\n";
        # Reset BOTH counters, like the other table printers do; with
        # $clist left unset the summary read "total:  dates listed" when
        # nothing reached the threshold.
        $count=0; $clist=0;
        for $k (sort {$date{$b} <=> $date{$a}} keys %date) {
            if ($date{$k} >= $dmin) {
                printf "%9d  %s\n",$date{$k},$k;
                $clist++;
            }
            $count++;
        }
        print "\ntotal: $clist date".(($clist==1)?"":"s")." listed (out of $count)</PRE><P>\n";
    }

    # week -- print the per-weekday request table, Monday first and
    # Sunday last.  For each day it shows the request count from @wday,
    # the requests-per-hour rate (0 when the day has no logged hours) and
    # the hours of that weekday covered by the log, as computed into
    # @daytime by &daytime.
    sub week {
        local($i,$rate);
        local(@label) = ("Sunday","Monday","Tuesday","Wednesday",
                         "Thursday","Friday","Saturday");

        &daytime;
        print "<H2>Transfers by weekday</H2><P>\n";
        print "<PRE>#requests  req/hr  weekday\n---------------------------------\n";
        for $i (1,2,3,4,5,6,0) {
            $rate = $daytime[$i] ? $wday[$i]/$daytime[$i] : 0;
            # day name is left-justified in a 9-character column
            printf "%9d  %6.2f  %-9s  (total %5.1f hours)\n",
                $wday[$i], $rate, $label[$i], $daytime[$i];
        }
        print "\n</PRE><P>\n";
    }

    # info -- print the help page describing every query the log parser
    # understands.  The current default thresholds ($hmin, $lmin, $imin,
    # $mmin, $fmin) are interpolated into the text.
    #
    # The here-document interpolates, so the '@' of the contact address
    # is backslash-escaped; unescaped, "@mit" is taken as an array and
    # the address is printed without it.
    sub info {
        print <<"EndOfInfo";
<H2>Version Information</H2>
This is an *old* version of the log-parser.  It requires less maintenance than
the current one. =)  Don't abuse it too much-- there should be a <em>And Now
Something Completely Different</em> version soon, and this is only here
because I don't expect it to get much load while the server is in beta.<p>
<H2>Log Parser Information</H2>
Welcome to the SIPB WWW server log.  You can look up various server stats from here.<P>
This node has been set up as a <STRONG>searchable index</STRONG>.  Useful keywords you
can search for are:<P>
<UL>
<LI><EM>hostnames</EM><P>Searching for a specific hostname (or IP address) will give
information on the requests this server has gotten from that host.  Hostname should refer
to the host's primary name.  If it isn't fully qualified, .MIT.EDU domain is assumed.<P>
Searching for the keyword
<CODE>"<A HREF="log?hosts">hosts</A>+<EM>n</EM>"</CODE> will give a
compact list of the sites that have contacted the server at least
<CODE><EM>n</EM></CODE> times.  (The default is currently set to
$hmin.)  All the *.MIT.EDU sites will be listed in a single entry.<P>
Searching for the keyword
<CODE>"<A HREF="log?long">long</A>+<EM>n</EM>"</CODE> will give a
complete list of all the hosts that have contacted the server at least
<CODE><EM>n</EM></CODE> times.  (The default is currently set to
$lmin.)<P>
<LI><EM>network IP numbers</EM><P>Searching for net IP numbers (in the
form <CODE>"num.*.*.*"</CODE> for type A, <CODE>"num.num.*.*"</CODE> for
type B or <CODE>"num.num.num.*"</CODE> for type C networks) will show the
total usage for all hosts on that net.<P>
Searching for the keyword <CODE>"<A
HREF="log?net">net</A>+<EM>n</EM>"</CODE> will give a complete list of
all the IP networks that have totalled at least
<CODE><EM>n</EM></CODE> requests to the server.  (The default is
currently set to $imin.)<P>
<LI><EM>MIT subnet IP numbers</EM><P>Searching for subnet IP numbers
(in the form <CODE>"18.num.num.*"</CODE>) will show the total usage for
all hosts on that subnet.<P>
Searching for the
keyword <CODE>"<A HREF="log?mitnet">mitnet</A>+<EM>n</EM>"</CODE> will
give a complete list of all the MIT subnets that have totalled at
least <CODE><EM>n</EM></CODE> requests to the server.  (The default is
currently set to $mmin.)<P>
<LI><EM>filenames</EM><P>Searching for a specific pathname will display total number of
times the file has been requested from this server.  The name should omit the initial
<CODE>http://www.mit.edu:8001</CODE> string and start with a '/'.  Queries of the form
<EM>/path?query</EM> get logged under <EM>/path</EM>.<P>
Searching for the keyword
<CODE>"<A HREF="log?files">files</A>+<EM>n</EM>"</CODE> will give a
list of all the files that have been requested at least
<CODE><EM>n</EM></CODE> times.  (The default is currently set to
$fmin.)  Index queries of the form <EM>/path?query</EM> get logged
under <EM>/path</EM>.<P>
<LI><EM>Server status</EM><P>Searching for the keyword
<CODE>"<A HREF="log?status">status</A>"</CODE> will show the list of the server
startups and restarts.<P>
Searching for the keyword
<CODE>"<A HREF="log?date">date</A>"</CODE> will show the requests
sorted by Greenwich Mean date.  (The server currently logs times in GMT, and it would
be a real kludge to fix it.)<P>
Searching for the keyword
<CODE>"<A HREF="log?week">week</A>"</CODE> will show the data on
requests sorted by the (localtime) weekday.  (This data is interesting because it
reveals more of patterns in the mysterious secret life of
<A HREF="http://iicm.tu-graz.ac.at:80/Cjargon">hackers</A>.  Hey, maybe
I can figure out a way to use the log to explore their mating habits... =)<P>
</UL>
Please send any <A HREF="/comment">comments</A> or suggestions to
<CODE>webmaster\@mit.edu</CODE><P>
<address><a href="/people/bert.html">bert Dvornik</a></address>
EndOfInfo

    }
}
