#!/afs/athena/contrib/perl/p

# Project helpers.  get_url and get_links are presumably supplied by
# analyze.pl and the url package by url.pl -- confirm against those files.
require 'analyze.pl';
require 'url.pl';

# require dies on failure instead of returning false, so a plain
# "require ... || print" can never reach the print.  Trap the failure so the
# message is shown and the hard-coded constants below can take over.
eval("require 'sys/socket.ph'; 1") || print "no socket thing\n";

# Hard-coded constants.  These evals run after sys/socket.ph has been loaded,
# so they replace any same-named subs that file defined.
# NOTE(review): SOCK_STREAM == 2 and FNDELAY == 0x04 look like values for one
# specific platform; verify them before running this anywhere else.
eval 'sub AF_INET {2;}';
eval 'sub NC_TPI_COTS {2;}';
eval 'sub SOCK_STREAM { &NC_TPI_COTS ;}';
eval 'sub F_GETFL {3;}';
eval 'sub FNDELAY {0x04;}';
require "sys/file.ph";
    # Repeated so the hard-coded value also wins over sys/file.ph.
    eval 'sub FNDELAY {0x04;}';

require "fcntl.ph";
$| = 1; $debug = 0;


# Fetch the index document and pull the list of home page links out of it.
$homepages =  ("http://www.mit.edu:8001/home-pages.html");
print("Getting home pages doc\n");
($homehead, $homebody) = &get_url("GET", $homepages);
print("extracting links...\n");
# The first three links returned are kept apart and are not fetched below.
($inst, $webm, $alpha, @home_pages) = &get_links($homebody);
print("correcting links...\n");
# Rewrite each link via url'ConstructURL, passing the index URL as the base.
for $n (0..$#home_pages){
    $home_pages[$n] = &url'ConstructURL($homepages, $home_pages[$n]);
}

print("Retrieving homepages...\n");
# Scores are appended, so repeated runs accumulate in the same file.  Fail
# loudly if it cannot be opened: otherwise every score would be silently
# thrown away.
open(OUT, ">>/tmp/pagescores") ||
    die "Cannot open /tmp/pagescores for appending: $!\n";
# Unbuffer OUT so each score line reaches the file as soon as it is printed.
select(OUT); $|=1; select(STDOUT);
&do_urls(@home_pages);

# do_doc($url, $head, $body)
#
# Appends one space-separated score line for a fetched document to the OUT
# filehandle (which the caller must already have opened):
#
#   url  links  image-tags  size  unique-images  last-modified-date
#
#   $url   address of the document, printed verbatim
#   $head  raw response header text
#   $body  raw document body
#
# "size" is the Content-Length header value when one is present, otherwise
# the length of $body.  The date column is the second whitespace-separated
# word of the Last-Modified header value, and is left empty when the header
# is absent.  Nothing useful is returned.
sub do_doc {
    local($url, $head, $body) = @_;

    print(OUT "$url ");

    # s///g yields the number of replacements, but an empty string when there
    # were none; "|| 0" keeps the column from going blank and shifting every
    # later column one place to the left.
    $nlinks = ($body =~ s/(<a[ \t\n])/$1/gi) || 0;
    print(OUT "$nlinks ");
    $nimages = ($body =~ s/<img/<img/gi) || 0;
    print(OUT "$nimages ");

    # Capture digits only, so the "\r" of a CRLF-terminated header line does
    # not end up in the output.
    if ($head =~ /content-length:[ \t]*(\d+)/i) {
        print(OUT "$1 ");
    }
    else {
        # This column belongs in the score file like all the others, followed
        # by the usual separator.
        print(OUT length($body), " ");
    }

    # Count distinct image sources.  Surrounding quotes and the closing ">"
    # are left out of the captured name so that the same image written in
    # different ways is only counted once.
    %uimages = (); $uimgs = 0;
    while ($body =~ /<img[^>]+src[^=]*=[ \t\n]*["']?([^"' \t\n>]+)/gi) {
        $imagename = $1;
        if (!$uimages{$imagename}) {
            $uimages{$imagename} = 1;
            $uimgs++;
        }
    }
    print OUT "$uimgs ";

    # Only trust $1 when the match succeeded; after a failed match it still
    # holds whatever an earlier pattern captured.
    ($weekday, $date, $time, $zone) = ('', '', '', '');
    if ($head =~ /last-modified:[ \t]*([^\r\n]+)/i) {
        ($weekday, $date, $time, $zone) = split(' ', $1);
    }
    print(OUT "$date ");

    print(OUT "\n");
}

# do_urls(@urls)
#
# Fetches every URL in @urls with an HTTP/1.0 GET, keeping at most
# $allsockets non-blocking connections in flight at once.  Each connection is
# tracked by its file descriptor number: bits in $win mark sockets that still
# have to be sent their request, bits in $rin mark sockets being read, and
# the %sockfileno / %urlsock / %hostsock / %portsock / %pathsock / %wrote /
# %output tables hold the per-connection state.  A finished (or failed)
# connection is handed to close_connection, which scores the document and
# advances $donecount.
#
# The run stops when every URL has been accounted for, or after
# 30 + 3 * (number of URLs) seconds, whichever comes first.  Sockets that are
# still open when the time limit hits are not closed here.
sub do_urls {
    local(@urls)=@_;
    $donenum = $#urls+1;
    # sockaddr_in layout handed to pack(): family, port, 4-byte address, pad.
    $sockaddr = 'S n a4 x8';
    $allsockets = 40;
    $donecount = 0;
    $usedsockets = 0;
    # Filehandle names are produced by string-incrementing this seed.
    $sockname = "SOCKAAA";

    print("Doing run of $donenum urls\n");
    $starttime = time;
    $endtime = 30+$starttime+$donenum*3;
    while(($donecount < $donenum) && time()<$endtime){
        $loopc++;
        # $loopx is set at most once every couple of seconds and gates the
        # "Still waiting for" progress report below.
        if(time > $lastt+2){
            $lastt = time;
            $loopx = 1;
        }
        else{
            $loopx=0;
        }

        # First, make sure all available sockets are open
        while(($usedsockets < $allsockets) && ($#urls >=0)){
            $url = shift(@urls);
            ($host, $path, $port) = &url'ParseURL($url);
            ($host) = (gethostbyname($host))[4];
            if(!defined($host)){
                # Without an address pack() would build 0.0.0.0; skip the URL
                # but still count it so the run can finish.
                warn("Cannot resolve host for $url\n");
                $donecount++;
                next;
            }
            &generate_socket($sockname);
            $urlsock{$sockname} = $url;
            $hostsock{$sockname} = $host;
            $portsock{$sockname} = $port;
            $pathsock{$sockname} = $path;
            print("Connecting $sockname\n");
            # The socket is non-blocking, so connect() normally returns
            # before the connection is established; completion shows up as
            # the socket becoming writable.
            $result=connect($sockname, pack($sockaddr, &AF_INET, $port, $host));
            print("Connected $sockname\n");
            warn("Connect: $result ($!)\n") if $debug;
            $fn = fileno($sockname);
            $sockfileno{$fn} = $sockname;
            print("Socket ($sockname)[$fn] for $url generated\n");
            vec($win, $fn, 1) = 1;
            vec($rin, $fn, 1) = 1;

            $sockname++;
            $usedsockets++;
        }

        # Wait up to one second for any socket to need attention.  A single
        # select with a real timeout replaces two zero-timeout polls that
        # kept the process spinning at full CPU.
        $nfound = select($rout = $rin, $wout = $win, undef, 1);
        if(!defined($nfound) || $nfound < 1){
            # Timed out or failed: treat it as "nothing is ready".
            $rout = '';
            $wout = '';
        }

        # See who is ready for writing and if we have to tell them anything
        $woutbits = unpack("b*", $wout);
        $winbits = unpack("b*", $win);
        $idx = -1;
        while(($idx = index($woutbits, "1", $idx+1)) !=-1){
            $fh = $sockfileno{$idx};
            if(!getpeername($fh)){
                # Writable but not connected: the connect attempt failed.
                warn("Getpeername undefined\n") if $debug;
                $err = getsockopt($fh, &SOL_SOCKET, &SO_ERROR);
                warn("$!\n");
                # getsockopt returns a packed integer, not a number.
                $! = unpack("i", $err) if defined($err);
                warn(0+$!, "$!") if $debug;
                # Retire the socket now, and drop it from this pass's read
                # set so it is not closed a second time below.
                &close_connection($idx);
                vec($rout, $idx, 1) = 0;
            }
            elsif(!$wrote{$idx}){
                warn("Ready to write: $woutbits ($winbits)\n") if $debug;
                print("Getting $pathsock{$sockfileno{$idx}}\n");
                $sent = send($fh,"GET $pathsock{$sockfileno{$idx}} HTTP/1.0\r\nUser-Agent: Wander by Matthew Gray <mkgray@netgen.com>  quickwww/1.0\r\n\r\n",0);
                warn("Sent: $sent ($!)\n") if $debug;
                $wrote{$idx}=1;
                warn("Sent request on $fh ($idx);\n") if $debug;
                # The request is out; stop asking select about writability,
                # otherwise it would return immediately on every pass.
                vec($win, $idx, 1) = 0;
            }
        }

        # Now, read anything there is to read.
        $rinbits = unpack("b*", $rin);
        if($loopx){
            print("Still waiting for: ");
            $idx = -1;
            while(($idx = index($rinbits, "1", $idx+1)) !=-1){
                print("$urlsock{$sockfileno{$idx}} | ");
            }
            print("\n");
        }
        $routbits = unpack("b*", $rout);
        $idx = -1;
        while(($idx = index($routbits, "1", $idx+1)) !=-1){
            warn("Reading ($routbits) [$rinbits]\n") if $debug;
            $fh = $sockfileno{$idx};
            # sysread, not read: buffered reads must not be combined with
            # select, because data held in the I/O buffer is invisible to it.
            $outtemp = '';
            $nread = sysread($fh, $outtemp, 512);
            if($nread){
                $output{$idx} .= $outtemp;
            }
            else{
                # End of file or a read error: the document is finished.
                &close_connection($idx);
            }
        }
    }
    if(time()>$endtime){
        print("Timeout on run of $donenum URLs...\n");
    }
}
# close_connection($fno)
#
# Retires the connection whose file descriptor number is $fno: frees its
# slot ($usedsockets), removes its entries from the per-socket tables and
# from the $win / $rin select masks, splits the collected response in
# $output{$fno} into header and body, passes both to do_doc for scoring,
# advances $donecount and finally closes the socket.
sub close_connection {
    local($fno) = @_;

    $fh = $sockfileno{$fno};
    $usedsockets--;
    $url = $urlsock{$fh};

    # delete rather than undef, so the tables do not keep one dead key for
    # every connection ever made.
    delete $hostsock{$fh};
    delete $portsock{$fh};
    delete $pathsock{$fh};
    delete $urlsock{$fh};
    delete $wrote{$fno};
    vec($win, $fno, 1) = 0;
    vec($rin, $fno, 1) = 0;
    delete $sockfileno{$fno};

    # The header ends at the first blank line.  HTTP servers terminate lines
    # with CRLF, so accept both "\r\n\r\n" and a bare "\n\n"; the limit of 2
    # leaves the body untouched, including any blank lines inside it.
    ($head, $body) = split(/\r?\n\r?\n/, $output{$fno}, 2);
    $head = '' unless defined($head);
    $body = '' unless defined($body);
#    print("==========\n$head\n==========\n");
    delete $output{$fno};
    &do_doc($url, $head, $body);
    $donecount++;
    print("Done with $donecount documents (out of $donenum)\n");
    close($fh);
}

# generate_socket($fhandle)
#
# Creates an AF_INET stream socket on the filehandle named by $fhandle and
# switches it to non-blocking mode by adding FNDELAY to its file status
# flags.  Dies if the socket cannot be created or its flags cannot be read
# or written.
sub generate_socket {
    local($fhandle)=@_;
    local($flags);

    # $proto is not assigned anywhere in this part of the file; an unset
    # value reaches socket() as protocol 0.
    socket($fhandle, &AF_INET, &SOCK_STREAM, $proto) ||
        die "socket: $!\n";

    # F_GETFL reports the current flags as fcntl's return value (the string
    # "0 but true" stands for zero), not through its third argument.
    $flags = fcntl($fhandle, &F_GETFL, 0);
    defined($flags) ||
        die "fcntl(\&F_GETFL) failed: $!";

    # OR the new flag into the existing ones instead of overwriting them.
    fcntl($fhandle, &F_SETFL, ($flags + 0) | &FNDELAY) ||
        die "fcntl(\&F_SETFL) failed: $!";
}
