#!/afs/athena/contrib/perl/.bin.next_mach20/perl

require '/mit/mkgray/perl/chat2.pl';

# Route the usual "please stop" signals to &report, which prints a
# short notice and exits.
for $signame ('INT', 'HUP', 'QUIT') {
    $SIG{$signame} = 'report';
}

# Paths of the files the wanderer keeps its state in:
#   $sites     - one "host:port" line per site seen so far
#   $finished  - lines appended through the FIN handle
#   $diskqueue - on-disk queue of "URL depth" lines still to be fetched
#   $tmpqueue  - scratch file used while rewriting $diskqueue
#   $logfile   - one line per fetched page; also consulted to detect
#                URLs that were already visited
($sites, $finished, $diskqueue, $tmpqueue, $logfile) = (
    "/usr/local/w4/sites",
    "/usr/local/w4/finished",
    "/usr/local/w4/dq",
    "/usr/local/w4/tmpq",
    "/usr/local/w4/log",
);

#
#  Load the sites file
#
# Read the list of already-known sites into %sites (key = one line of
# the file with its last character chopped off, value = 1).  The open
# is deliberately unchecked: when it fails the loop simply reads
# nothing and %sites starts out empty.
open(SITES, $sites);
print("Loading all.sites\n");
while (<SITES>) {
    chop($known = $_);
    $sites{$known} = 1;
}
close(SITES);


#
#  Open and set the buffering on some files
#
# All three files are opened for append.  A failure here is fatal:
# visited() relies on the log file to recognise pages that were already
# fetched, so crawling on with unopened handles would silently lose the
# crawl state.
open(FIN, ">>$finished")  || die "Cannot append to $finished: $!\n";
open(SITES, ">>$sites")   || die "Cannot append to $sites: $!\n";
open(LOG, ">>$logfile")   || die "Cannot append to $logfile: $!\n";

# $| only affects the currently selected handle, hence the select()
# dance.  Autoflush matters because fg() re-reads $logfile through a
# separate handle, so every LOG line has to reach the file right away.
select(SITES); $| = 1;
select(FIN);   $| = 1;
select(LOG);   $| = 1;
select(STDOUT);

#
#
#   Main
#
#
# Resume from the on-disk queue when one exists, otherwise start from
# the seed URL.
if (-e $diskqueue) {
    &queuecleanup();

    # Drop leading entries parseurl cannot handle (anything that is not
    # http), keeping the URL queue and the depth queue in step.
    while ($#queue >= 0 && $queue[0] !~ m,^http:/,) {
        shift(@queue);
        shift(@dqueue);
    }
}

if ($#queue >= 0) {
    $d = shift(@dqueue);
    ($site, $path, $port) = &parseurl(shift(@queue));
    &wwwfollow($site, '/'.$path, $port, $d+1);
}
else {
    # No disk queue, or it held nothing usable: begin at the seed
    # instead of handing an undefined URL to parseurl.
    &wwwfollow("www.mit.edu", "/", "8001", 0);
}

#
#   End
#




#
#  The important routine
#
# wwwfollow(site, path, port, depth)
#
# Crawl starting at http://site:port/path and keep going until neither
# the in-memory queue (@queue/@dqueue) nor the disk queue yields another
# URL.  Pages are processed one after another in a loop; the sub does
# not call itself, so the call stack no longer grows with every page
# fetched, and a page that cannot be fetched is skipped instead of
# ending the crawl.
#
#   site, path, port - where to start (an empty port means 80)
#   depth            - depth recorded for links found on the start page
sub wwwfollow {
    local($mysite, $mypath, $myport, $depth) = @_;
    local(@next);

    for (;;) {
        &wwwvisit($mysite, $mypath, $myport, $depth);

        @next = &wwwdequeue();
        if ($#next < 0) {
            # The in-memory queue ran dry: note it in the "finished"
            # file, flush/refill from disk and try once more.
            # NOTE(review): $site/$port/$path are the globals left by
            # the most recently parsed URL -- confirm that is the value
            # this file is meant to record.
            print(FIN "$site:$port/$path\n");
            &queuecleanup();
            @next = &wwwdequeue();
            last if $#next < 0;
        }
        ($mysite, $mypath, $myport, $depth) = @next;
    }
}

# Take the next fetchable URL off @queue, shifting @dqueue in step so
# the depth always belongs to the URL it was stored with.  Returns
# (site, path, port, depth) with a leading '/' on path and depth already
# incremented, or an empty list when @queue holds nothing usable.
sub wwwdequeue {
    local($href, $qdepth);

    while ($#queue >= 0) {
        $href   = shift(@queue);
        $qdepth = shift(@dqueue);

        # parseurl only understands http URLs; skip anything else
        # (ftp:, gopher:, ...) here rather than passing it on.
        next unless $href =~ m,^http:/,;

        ($site, $path, $port) = &parseurl($href);
        next if $site eq '';
        return ($site, '/'.$path, $port, $qdepth + 1);
    }
    return ();
}

# Fetch one page, queue the links found on it that have not been seen
# yet, and write one line about the page to LOG.  Returns early, without
# logging, when the connection cannot be opened.
sub wwwvisit {
    local($mysite, $mypath, $myport, $depth) = @_;
    local($handle, $doc, $links, $title, $myspath, @list);

    print("Yeah, i made it into the subroutine!\n");
    print("Doing traversal of $mysite, $mypath on $myport\n");

    # Check if a queue cleanup is needed at this point
    if ($#queue < 2 || $#addtodq > 150) {
        print("Doing queue cleanup due to $#queue left on the queue and $#addtodq to go on\n");
        &queuecleanup();
    }

    # Fix port number to default
    if (!$myport) { $myport = 80; }

    # Make connection and get the document.  The "|| do" relies on
    # &chat'open_port returning a false value on failure (chat2.pl is
    # not visible here -- TODO confirm).
    print("Traversing $mysite:$myport$mypath (q: $#queue, dq: $#addtodq)\n");
    $handle = &chat'open_port($mysite, $myport) || do {
        print("Failed following $mysite:$myport\n");
        return;
    };
    &chat'print("GET $mypath WWWWanderer\n\n");
    $doc = &listen(30);
    &chat'close($handle);

    # Is it a new site?
    print(SITES "$mysite:$myport\n") unless $sites{"$mysite:$myport"};
    $sites{"$mysite:$myport"} = 1;

    # Initialize statistics counters
    $links = 0;
    $title = '';

    # Primitive HTML parse: collect the value of every double-quoted
    # HREF that is immediately followed by the closing '>' of an <A> tag.
    print("Parsing doc (len: ", length($doc), ")\n");
    @list = ();
    while ($doc =~ /<[Aa]\s+[^>]*[Hh][Rr][Ee][Ff]=\"([^\n\"]+)\">/g) {
        print("$1...\n");
        push(@list, $1);
    }

    # Directory part of the current path, used to resolve relative links.
    $myspath = substr($mypath, 0, rindex($mypath, '/'));

    for $href (@list) {
        # Anything that neither starts with "http" nor contains "//" is
        # treated as relative to the current site.
        if (substr($href, 0, 4) ne 'http' && $href !~ m,//,) {
            if (substr($href, 0, 1) eq '/') {
                $href = "http://$mysite:$myport$href";
            }
            else {
                $href = "http://$mysite:$myport$myspath/".$href;
            }
        }

        if (!&visited($href)) {
            push(@addtodq, $href);
            push(@addtoddq, $depth);
        }
        $links++;
    }

    # Fetch the title; matched case-insensitively, like the anchor tags.
    if ($doc =~ /<title>([^<]*)<\/title>/i) {
        $title = $1;
    }

    print(LOG "http://$mysite:$myport$mypath $links <<$title>> ==$depth==\n");
    &queuecleanup() if ($#queue < 1);
}


# visited(href)
#
# Returns 1 when href is already known -- waiting in @queue, pending in
# @addtodq, stored in the disk queue file or recorded in the log file --
# and 0 otherwise.
#
# URLs are compared with exact string equality, the same test fg()
# applies to the files.  Using the URL as a regular expression would let
# characters such as '.', '?' or '(' change the result (or abort on an
# invalid pattern) and would also accept mere substrings.  Every entry
# of @addtodq is checked, including the first one.
sub visited {
    local($href) = @_;

    for $seen (@queue, @addtodq) {
        return 1 if $seen eq $href;
    }
    return 1 if &fg($href, $diskqueue);
    return 1 if &fg($href, $logfile);
    return 0;
}


# queuecleanup()
#
# Synchronises the in-memory queues with the disk queue file:
#   1. appends every pending URL in @addtodq (with its depth from
#      @addtoddq) to $diskqueue, skipping empty entries and URLs that
#      contain a '+';
#   2. reads lines from the front of $diskqueue into @queue/@dqueue,
#      at most (12 - $#queue) of them;
#   3. copies the remaining lines to $tmpqueue and moves that file into
#      place as the new $diskqueue.
#
# Every open is checked, and the old disk queue is only replaced after
# the new copy was written and closed successfully, so a failure along
# the way leaves the existing queue file untouched.
sub queuecleanup {
    local($qitem, $dpth, $ditem, $item, $dp, $i);

    print("--------------Doing Queue Cleanup-------\n");

    # Step 1: flush pending URLs to disk.
    if (open(DQ, ">>$diskqueue")) {
        for $qitem (@addtodq) {
            $dpth = shift(@addtoddq);
            next if $qitem =~ /\+/;
            next if $qitem eq '';
            if (&fg($qitem, $diskqueue)) {
                print("Hey, why am I adding $qitem again???????????\n");
            }
            print(DQ $qitem." $dpth\n");
        }
        close(DQ);
        @addtodq  = ();
        @addtoddq = ();
    }
    else {
        # Keep the pending entries in memory for the next attempt.
        print("Cannot append to $diskqueue: $!\n");
    }

    # Step 2: top up the in-memory queue from the front of the file.
    if (!open(DQ, "<$diskqueue")) {
        print("Cannot read $diskqueue: $!\n");
        return;
    }
    for $i (1..(12-$#queue)) {
        $ditem = <DQ>;
        last unless defined($ditem);
        # split(' ') also discards the trailing newline.
        ($item, $dp) = split(' ', $ditem);
        next if $item eq '';
        push(@queue, $item);
        push(@dqueue, $dp);
    }

    # Step 3: write the unread remainder to a fresh scratch file.  ">"
    # rather than ">>" so leftovers from an interrupted run are not
    # carried into the queue.  If this fails the disk queue is left as
    # it is, which means the entries just loaded will be read again
    # later -- preferable to losing the rest of the queue.
    if (!open(TMP, ">$tmpqueue")) {
        print("Cannot write $tmpqueue: $!\n");
        close(DQ);
        return;
    }
    while (defined($ditem = <DQ>)) {
        print(TMP $ditem);
    }
    close(DQ);

    # Buffered write errors surface at close; only swap the files when
    # the copy is known to be complete.  rename() replaces the target in
    # one step, so no separate unlink or external "mv" is needed.
    if (close(TMP)) {
        rename($tmpqueue, $diskqueue)
            || print("Cannot replace $diskqueue: $!\n");
    }
    else {
        print("Error writing $tmpqueue: $!\n");
    }
    print("-----Done----\n");
}


# Signal handler: print a short notice and leave with exit status 0.
sub report {
    print "Ack!\n";
    exit 0;
}

# listen(secs)
#
# Accumulates whatever &chat'expect hands back for the catch-all pattern
# '(.|\n)+' and returns the concatenation.  Collection stops as soon as
# &chat'expect returns something of zero length, or once more than
# 100000 characters have been gathered.  secs is passed straight through
# as the first argument of &chat'expect (presumably a timeout in
# seconds -- chat2.pl is not visible here).  Each chunk is echoed when
# the global $trace is true.
sub listen {
    local($secs) = @_;
    local($return) = "";
    local($tmp);

    for (;;) {
        $tmp = &chat'expect($secs, '(.|\n)+', '$&');
        last unless length($tmp);

        print $tmp if $trace;
        $return .= $tmp;
        last if length($return) > 100000;
    }
    return $return;
}

# fg(expr, file)
#
# Returns 1 if some line of file has expr as its first
# whitespace-separated field (exact string comparison), 0 otherwise.
# A file that cannot be opened counts as "no match".
#
# The fields are split into a named array on purpose: a bare "split;"
# only fills @_ on old perls.  From perl 5.12 on it leaves @_ alone, so
# $_[0] would still be the caller's expr and every non-empty file would
# appear to match.
sub fg {
    local($expr, $file) = @_;
    local($line, @fields);

    open(FILE, "<$file") || return 0;
    while (defined($line = <FILE>)) {
        @fields = split(' ', $line);
        if ($fields[0] eq $expr) {
            print("Aha! in file $file, $expr matches!\n");
            close(FILE);
            return 1;
        }
    }
    close(FILE);
    return 0;
}

# parseurl(url)
#
# Splits an http URL into its parts and returns (site, path, port):
#   site - host name
#   path - path without the leading '/', with '.' and '..' segments
#          resolved; a trailing '/' is kept
#   port - port as written in the URL, undefined when none was given
#
# Returns an empty list for anything that is not an http URL (including
# an undefined or empty argument), so callers can test the result
# instead of relying on a loop-control jump out of this sub.
sub parseurl {
    local($url) = @_;
    # Limit -1 keeps a trailing empty field, i.e. the final '/' of a
    # directory URL such as http://host/dir/.
    local($scheme, $empty, $hostport, @segments) = split('/', $url, -1);
    local($host, $portnum, $seg, @clean);

    return () if $scheme ne 'http:';

    ($host, $portnum) = split(':', $hostport);

    # Build the cleaned path in a second array rather than splicing the
    # one being iterated: '.' is dropped, '..' removes the segment before
    # it (and nothing else), and '..' at the top is ignored.
    for $seg (@segments) {
        next if $seg eq '.';
        if ($seg eq '..') {
            pop(@clean);
            next;
        }
        push(@clean, $seg);
    }

    return ($host, join('/', @clean), $portnum);
}




