#!/afs/athena/contrib/perl5/perl
# searchspy - figure out what search engine queries are leading to your
#   web pages by looking at referrer information.
# Usage: ./searchspy < access_log | sort
# Nelson Minar <nelson@media.mit.edu> http://www.media.mit.edu/~nelson/
#   written on the way to LA Wed Feb 11 17:19:32 EST 1998
# $Id: searchspy,v 1.3 1998/02/25 00:17:37 nelson Exp $
# Copyright (C) 1998 Nelson Minar all rights reserved.

# Configurable variables:
#   $searchURL - a pattern matching the URLs you care about. Only requests
#     whose URL matches it are reported. ('^' matches every URL.)
$searchURL = '^';
# Autoflush the currently selected output handle (STDOUT unless something
# else has been select()ed) so each result is written as soon as it is
# printed rather than sitting in a buffer.
$|=1;
# Example: only report hits on pages below /people/nelson.
# $searchURL = '^/people/nelson';

#   $debug - when true, diagnostic messages (for example referrers from
#     search engines the script does not recognise) are printed to STDERR.
$debug = 0;

# How it works:
# Read through a standard access_log (tested with Apache 1.0.5) and
#   look for requests whose referrer contains a ?. Such referrers come
#   from a CGI script, and are almost always search engines.
# If the referrer contains a ?, its query string is parsed into the %a hash
#   and then passed through the table %engineMap to extract the particular
#   field that holds the search term. The term is then passed through a
#   filter that turns its encoding into something readable,
#   dealing with +s and %xx.
# All this parsing is quick and dirty. Please mail improvements
#   (especially patterns for more search engines) to the author.


# Applytable, a handy function: dispatch on a string through a table
#   of regex => handler.
# Usage: applytable(string, reference-to-hashtable, arglist)
#   Each key of the hash is a regular expression and each value a code
#   reference. The first key that matches string (case-insensitively)
#   selects the handler, which is called as handler(arglist); whatever it
#   returns is returned. arglist is passed through unmodified.
#   Returns undef when no key matches.
#   Keys are tried longest first (ties broken alphabetically), so if
#   several patterns match, the most specific one wins and the result is
#   the same on every run instead of depending on hash ordering.
#   Side effect: on a match the global $ename is set to the matching key
#   with every "\." turned into ".", i.e. a printable name for the key.
#   Note: inside the handler, $& holds the part of string that matched.
# Examples
#   using an existing hashtable, with a two element arglist
#     applytable($_, \%functionmap, 'lala', 'mama');
#   using an anonymous hash, with one anonymous sub, no arglist
#     applytable("Foobar", {'xyzzy' => \&subNotUsed,
#                           'bar'   => sub { print "Bar detected\n" }});

sub applytable {
    # Work through the reference; the table is never copied.
    my ($inputString, $fmap, @arglist) = @_;

    my @patterns = sort { length($b) <=> length($a) || $a cmp $b }
                   keys %$fmap;

    foreach my $k (@patterns) {
        # Build the printable name *before* matching the key, so that the
        # key match is the last successful match when the handler runs and
        # $& really is the matched part of the input string.
        my $printable = $k;
        $printable =~ s/\\\./\./g;

        if ($inputString =~ /$k/i) {
            $ename = $printable;
            return $fmap->{$k}->(@arglist);
        }
    }

    # Explicit undef (not a bare return) so list-context callers still get
    # a one-element list, as they always have.
    return undef;
}

# Search engine table: how to dig the search phrase out of a referrer.
# Each key is a pattern that matches the URL of one search engine; the
#   order of the entries makes no difference, so keep the patterns
#   unambiguous.
# Each value is a subroutine returning the raw (still encoded) search
#   query. It reads the CGI fields of the referrer from the global hash
#   %a, where $a{FIELD} is the value of the field named FIELD. See the
#   call to applytable() for the full context.
%engineMap = ();

{
    # Engines that keep the whole query in one CGI field:
    #   pattern => name of that field.
    my %queryField = (
        'metacrawler'                 => 'general',
        'mckinley\.com'               => 'search',
        'lycos'                       => 'query',
        'www\.goo\.ne\.jp'            => 'MT',
        'www\.fireball\.de'           => 'q',
        'hotbot\.com'                 => 'MT',
        'planetsearch\.com'           => 'text',
        'yahoo'                       => 'p',
        'snap\.com'                   => 'keyword',
        'designlab\.ukans\.edu'       => 'queryTerm',
        'search\.metafind\.com'       => 'q',
        'lokace'                      => 'MOTCLEF',
        'looksmart\.com'              => 'key',
        'dogpile\.com'                => 'q',
        'www\.sear\.ch'               => 'q',
        'www\.nlsearch\.com'          => 'qr',
        'www\.naver\.com'             => 'query',
        'www\.mamma\.com'             => 'query',
        'search\.com/Infoseek'        => 'QUERY',
        'search\.com/AltaVista'       => 'query',
        'altavista\.digital\.com'     => 'q',
        'altavista\.telia\.com'       => 'q',
        'altavista\.magallanes'       => 'q',
        'altavista\.yellowpages\.com' => 'q',
        'infind\.inference\.com'      => 'query',
        'www\.northernlight\.com'     => 'qr',
    );

    foreach my $pattern (keys %queryField) {
        # A fresh lexical per iteration, so every closure keeps its own field.
        my $field = $queryField{$pattern};
        $engineMap{$pattern} = sub { $a{$field} };
    }

    # Engines that spread the query over two fields, joined with a "+".
    $engineMap{'infoseek\.com'}             = sub { $a{qt} . '+' . $a{oq} };
    $engineMap{'ahoy\.cs\.washington\.edu'} = sub { $a{first} . '+' . $a{last} };

    # Engines that use one of several field names: the first true one wins.
    my $searchOrS = sub { $a{search} || $a{s} };
    $engineMap{'excite'}            = $searchOrS;
    $engineMap{'netfind\.aol\.com'} = $searchOrS;
    $engineMap{'webcrawler\.com'}   =
        sub { $a{searchText} || $a{search} || $a{text} };
}

$engineSubs = \%engineMap;

$parsedQueries = 0;
$unparsedQueries = 0;

# parseQueryString(query) - split a CGI query string into name/value pairs.
#   Pairs are separated by "&" and each pair is split at its first "="
#   only, so a value may itself contain "=". A field with no "=" (or with
#   nothing after it) gets the empty string as its value; this keeps
#   valueless fields (looksmart sends them, e.g. "?l&key=foo") from
#   shifting every later name/value pair by one.
#   Returns a flat name => value list suitable for assigning to a hash;
#   when a name occurs more than once the last occurrence wins.
sub parseQueryString {
    my ($query) = @_;
    my %field;

    foreach my $pair (split /&/, $query) {
        next if $pair eq '';
        my ($name, $value) = split /=/, $pair, 2;
        $field{$name} = defined $value ? $value : '';
    }
    return %field;
}

# decodeSearchTerm(term) - turn a URL-encoded search term into readable text.
#   "+" becomes a space and %XX (two hex digits) becomes the corresponding
#   character. Anything else, including a "%" that is not followed by two
#   hex digits, is left untouched. Returns the decoded string.
sub decodeSearchTerm {
    my ($term) = @_;

    # The +s must go first, otherwise a decoded %2B would turn into a space.
    $term =~ tr/+/ /;
    $term =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
    return $term;
}

while (my $line = <>) {
    # First parse the relevant fields out of the logfile line: common log
    # format followed by the referrer (and possibly more, e.g. the agent).
    my ($request, $ref) = ($line =~ m{
        ^ \S+ [ ] \S+ [ ] \S+ [ ]     # host, ident, authenticated user
        \[ [^\]]+ \] [ ]              # timestamp
        " (.+?) " [ ]                 # request line (non-greedy on purpose)
        \S+ [ ] \S+ [ ]               # status code, bytes sent
        (\S+)                         # referrer, possibly in quotes
    }x);
    next unless defined $ref;         # not a log line we understand

    my ($url) = ($request =~ /^\w+ (\S+)/);
    $url = '' unless defined $url;

    # The combined log format puts the referrer in quotes; drop them, but
    # only as a pair so a quote that belongs to the query itself survives.
    if ($ref =~ /^"(.*)"$/) {
        $ref = $1;
    }

    # Check if it's a URL we're interested in and then see if the referrer
    # is a search. Any referrer with a ? is a candidate.
    next unless $url =~ /$searchURL/ && $ref =~ /\?/;

    # Ok, we have a candidate. The query string starts at the FIRST ?, so
    # split there; a ? typed into the search box stays part of the query.
    my ($engine, $query) = ($ref =~ /^([^?]*)\?(.*)/);

    # Now unpack the request into the %a hash, which the subs in the
    # engine table read ...
    %a = parseQueryString($query);

    # ... and pass it through the table to extract the search term.
    my $search = applytable($engine, $engineSubs);

    if (defined $search && $search ne '') {
        $search = decodeSearchTerm($search);
        # and print out the exciting information!
        print "$url $ename $search\n";
        $parsedQueries++;
    } else {
        if ($debug) {
            print STDERR ("Unknown engine $engine -> $query\n");
        }
        $unparsedQueries++;
    }
}

print "$parsedQueries queries successfully parsed, $unparsedQueries weren't understood.\n";
