#!/usr/bin/perl

$debug = 0;
#require 'sys/socket.ph';
#require "fcntl.ph";

sub  AF_INET {2; }
sub  SOCK_STREAM { 1;}

require 'should.pl';            # Should and auxilliary routines
require 'log.pl';               # Logging routines & badlink
require 'analyze.pl';           # Analyze document.  get_* & is_real...
require 'queue.pl';             # Qeueuing routines
require 'url.pl';               # get_url and auxilliary stuff
$proto = ((getprotobyname("tcp"))[2] || 0);
$|=1;
$addhowmanypages = 25000;

@mylist = ();
$myn =$addn= 0; $| =1;
$howmany = $ARGV[1];
mkdir("/usr/tmp/quick.$$", 0777);
mkdir("/usr/tmp/quick.$$/html", 0777);
print("Writing to /usr/tmp/quick.$$\n");

open(URL, ">/usr/tmp/quick.$$/0");
print URL "NOPARENT\n$ARGV[0]\n";
close(URL);

$limit = $ARGV[1];
&logger'init;

open(REPLOG, ">>/usr/tmp/quick.$$/contactlog");
print REPLOG "<dl>\n";
close(REPLOG);

while($totgot < $howmany){
    print("MAINLOOP: Added $addn pages, proceeding\n");
    if($#mylist >= 0){
	print("MAINLOOP: Starting cleanup of list (Currently $#mylist items)\n");
	@newalist = grep(!`grep $_ /usr/tmp/quick.$$/memory`, @mylist);
	@newlist = grep(!/\#/, @newalist);
	print("MAINLOOP: Done with list cleanup (Now $#newlist items)\n");
	&do_urls(@newlist) if $#newlist >=0;
	print("MAINLOOP: Done fetching and logging URLs\n");
    }
    print("MAINLOOP: Fetching list $myn\n");
    &load_list($myn++);
    print("MAINLOOP: Done fetching list $myn (Done with $addn pages)\n");
}

open(REPLOG, ">>/usr/tmp/quick.$$/contactlog");
print REPLOG "</dl>\n";
close(REPLOG);
 
#&generate_reports;
exit;

sub do_doc {
    local($url, $hdrs, $doc) = @_;
    $addn++;
    open(Q, ">/usr/tmp/quick.$$/$addn");
    print Q $url, "\n";
    close(Q);
    if($nkids > 6){
	for $i (1..3){
#	    print("Waiting for a child to exit...\n");
	    wait();
#	    print("...done\n");
	    $nkids--;
	}
    }
    
#    print("Forking...\n");
    $pid = fork();
    if(!$pid){
	&process_doc($url, $hdrs, $doc);
#	print("Done processing $url\n");
	exit;
    }
    $nkids++;
#    print("Leaving do_doc\n");
}

sub load_list {
    local($file) = @_;
#    print "Waiting to load $file\n";
#    print "Loading $file\n";
    @mylist = ();
    sleep(2);
    if(-e "/usr/tmp/quick.$$/$file"){
	open(FILE, "/usr/tmp/quick.$$/$file");
	chop($parent = (<FILE>));
	while(<FILE>){
	    chop;
#            print("Putting $_ on list\n");
	    push(@mylist, $_);
	}
#        print("Done.\n");
    }
    else{
#        print "Waiting...\n";
	sleep(1);
	&load_list($file);
    }
}

sub process_doc {
    local($url, $hdrs, $doc) = @_;

#    print("Processing $url\n");

    @local_links = &get_links($doc);
    $ll = $#local_links+1;
    print("Found $ll links in $url\n");
    if($ll == 0) {
	print("Uh oh!\n$hdrs\n\n$doc\n");
    }
#    @local_links = (@local_links, &get_images($doc));
#    $nimages = ($#local_links+1)-$ll;
    $title = &get_title($doc); $headers = '';
#    $headers = &get_headers($doc);
#    $beginning = &get_beginning($doc, $beg_def{$ancestors[0]});
#    $types = &get_types($doc);
#    $markup = &get_markup($doc);
    &log_visit($url, $title, $headers, $beginning, $types,
	       $#local_links, $#ancestors, $markup);

    $r1 = $resp = "";
    ($r1, @rest) = split("\n", $hdrs);
    ($http10, $resp, @english) = split(" ", $r1);
    $resp = "bad" if !$resp;

    $size = ($hdrs=~/Content-length: (\d+)/) ? $1:length($doc);
    
    $ppid = getppid();
    open(REPLOG, ">>/usr/tmp/quick.$ppid/contactlog");
    $*=1;
    while($doc=~/([^\@\s\<\>\"\,]+\@[^\s\<\>\"\,]+)/g){
	$before = $`; $after = $'; $email = $1;
	$before=~ s/\<[^\>]*\>//g;
	$after=~ s/\<[^\>]*\>//g;
	$befcon = substr($before, length($before)-200);
	$afcon = substr($after, 0, 200);
	print(REPLOG "\n<dt>At <a href=\"$url\">$title</a><br><a href=\"$url\">$url</a><br><strong>$email</strong>\n<dd>$befcon<strong>$email</strong>$afcon\n");
    }
    while($doc=~/([\(\. ]+\d\d\d[\)\. \-]+\d\d\d[\. \-]*\d\d\d\d)/g){
	$before = $`; $after = $'; $email = $1;
	$before=~ s/\<[^>]*\>//g;
	$after=~ s/\<[^>]*\>//g;
	$befcon = substr($before, length($before)-200);
	$afcon = substr($after, 0, 200);
	print(REPLOG "\n<dt>At <a href=\"$url\">$title</a><br><a href=\"$url\">$url</a><br><strong>$email</strong>\n<dd>$befcon<strong>$email</strong>$afcon\n");
    }

#    print("Replog: $url $parent $resp $ll $nimages $size\n");
    close(REPLOG);

#    print("$hdrs\n$doc\n");

#    print("Writing links ($#local_links)\n");
    open(Q, ">>/usr/tmp/quick.$ppid/$addn") || warn "Hey, can't write quick.$$/$addn";
    foreach $ll (@local_links){
#	last if (($parent!~/$limit/) && $parent ne "NOPARENT");
	$fullurl = &url'ConstructURL($url, $ll);
#	$fullurl =~ y/A-Z/a-z/;
	print Q $fullurl."\n";
    }
    close(Q);
}

sub generate_reports {
    print("Generating report...\n");
    sleep(5);

    open(REPLOG, "/usr/tmp/quick.$$/reportlog");
    while(<REPLOG>){
	($url, $anc, $status, $links, $images, $size) = split;
	$ancest{$url} = $anc;
	$status{$url} = $status;
	$links{$url} = $links;
	$images{$url} = $images;
	$size{$url} = $size;

	$totlinks+=$links;
	$totimages+=$images;
	$totsize+=$size;
	$filesize+=$size unless $url=~/\.gif$/i;
	$imagesize+=$size if $url=~/\.gif$/i;
	$count++;
	$rcount++ unless $url=~/\.gif$/i;

	$type = substr($status, 0, 1);
	if($type eq "3"){
	    push(@moved, $url);
	}
	elsif($type eq "b"){
	    push(@broken, $url);
	}
	elsif($type eq "4"){
	    push(@notfound, $url);
	}
	elsif($type eq "5"){
	    push(@servererror, $url);
	}
	elsif($type eq "7"){
	    push(@timeout, $url);
	}
	elsif($type ne "2"){
	    push(@uhoh, $url);
	}
    }
    close(REPLOG);

    open(OUT, ">/usr/tmp/quick.$$/html/report.html");
    select(OUT);
    print("<h1>net.Quality Report</h1>\n");
    print("A total of $rcount documents and ",$count-$rcount," images were retrieved, starting at <a href=\"$ARGV[0]\">$ARGV[0]</a>.\n");
    print("<p>The following statistics were gathered:<ul>\n");
    print("<li>Average number of links: ", int($totlinks/$rcount), "\n");
    print("<li>Average number of images: ", int($totimages/$rcount), "\n");
    print("<li>Average size, bytes (no images): ", int($filesize/$rcount), "\n");
    print("<li>Average size, bytes (images): ", int($imagesize/($count-$rcount)), "\n");
    print("<li>Average size, bytes (all): ", int($totsize/$count), "\n");
    print("<li>Moved Documents: ", $#moved+1, "\n");
    print("<li>Total Errors: ", $#broken+$#notfound+$#servererror+$#uhoh+$#timeout+5, "\n");
    print("</ul>\n");
    print("<h2>Error Reports</h2>");
    print("<ul>\n");
    if(($#moved+1) >0){
	print("<li><a href=\"moved.html\">Moved Documents</a> (not <em>real</em> errors, but useful to know about)\n");

	&error_report("moved.html", "Moved Documents", @moved);
	$errors=1;
    }
    if(($#notfound+1) >0){
	print("<li><a href=\"notfound.html\">Not Found Documents</a> (server couldn't find them)\n");

	&error_report("notfound.html", "Not Found Documents", @notfound);
	$errors=1;
    }
    if(($#broken+1) >0){
	print("<li><a href=\"broken.html\">Very Broken Links</a> (total failures)\n");

	&error_report("broken.html", "Very Broken Documents", @broken);
	$errors=1;
    }
    if(($#servererror+1) >0){
	print("<li><a href=\"servererror.html\">Server Errors</a> (often broken CGI's)\n");


	&error_report("servererror.html", "Server Errors", @servererror);
	$errors=1;
    }
    if(($#uhoh+1) > 0){
	print("<li><a href=\"uhoh.html\">Confusing Errors</a> (who knows, they're confusing)\n");

	&error_report("uhoh.html", "Confusing Errors", @uhoh);
	$errors=1;
    }
    if(($#timeout+1)>0){
	print("<li><a href=\"timeout.html\">Timeout Errors</a> (server is too slow)\n");

	&error_report("timeout.html", "Timeout Errors", @timeout);
	$errors=1;
    }

    if(!$errors){
	print("<li>Congrats, No errors!\n");
    }

    print("</ul>\n");
    close(OUT);
    select(STDOUT);

}

sub error_report {
    local($file, $title, @errors) = @_;

    open(ERR, ">/usr/tmp/quick.$$/html/$file");
    print(ERR "<h1>$title</h1>\n");
    print("<ul>\n");
    for $err (@errors){
	$an = $ancest{$err};
	$an =~ s,http://[^/]+/?,,i;
	print(ERR "<li><a href=\"$err\">$err</a> is pointed to in document <a href=\"$ancest{$err}\">$an</a>\n");
    }  
    print("</ul>\n");
    close(ERR);
}



sub generate_socket {
    local($fhandle)=@_;
    socket($fhandle, &AF_INET, &SOCK_STREAM, $proto) ||
	die "socket: $!\n";
#    fcntl($fhandle, &F_GETFL, $tmp='') ||
#	die "fcntl(\&F_GETFL) failed: $!";
#    fcntl($fhandle, &F_SETFL, $tmp | &FNDELAY) ||
#	die "fcntl(\&F_SETFL) failed: $!";
}
sub do_urls {
    local(@urls)=@_;
    $donenum = $#urls+1;
    $sockaddr = 'S n a4 x8';
    $allsockets = 40;
    $usedsockets = 0;
    $donecount = 0;
    $sockname = "SOCKAAA";

    print("Doing run of $donenum urls ($donecount)\n");
    $startt=time();
    $endtime = 15+$donenum;
    %set = ();
    while(($donecount < $donenum)  && ((time()-$startt)<($endtime))){
	$diff = time()-$startt;
	print "Time: ", $diff, " (max: ", $endtime, ") (got $donecount out of $donenum so far)\n" if !$set{$diff};
	$set{$diff}=1;

	# First, make sure all available sockets are open
	while(($usedsockets < $allsockets) && ($#urls >=0)){
	    $url = shift(@urls);
	    ($host, $path, $port) = &url'ParseURL($url);
	    ($host) = (gethostbyname($host))[4];
	    &generate_socket($sockname);
	    $urlsock{$sockname} = $url;
	    $hostsock{$sockname} = $host;
	    $portsock{$sockname} = $port;
	    $pathsock{$sockname} = $path;
#	    print("Connecting $sockname\n");
	    $result=connect($sockname, pack($sockaddr, &AF_INET, $port, $host));
#	    print("Connected $sockname\n");
#	    warn("Connect: $result ($!)\n") if $debug;
	    $sockfileno{fileno($sockname)} = $sockname;
	    $fn = fileno($sockname);
#	    warn("Socket ($sockname)[$fn] for $url generated\n");
	    vec($win, fileno($sockname), 1) = 1;
	    vec($rin, fileno($sockname), 1) = 1;
	    
	    $sockname++;
	    $usedsockets++;
	}
	
	# See who is ready for writing and if we have to tell them anything
#	print("Checking for writability...\n");
	if(select(undef, $wout=$win, undef, 0)){
	    $woutbits = unpack("b*", $wout);
	    $winbits = unpack("b*", $win);
	    while(($idx = index($woutbits, "1", $idx+1)) !=-1){
		# Write to it
		$fh = $sockfileno{$idx};
		if(!getpeername($fh)){
		    warn("Getpeername undefined\n") if $debug;
#		    $err = getsockopt($fh, &SOL_SOCKET, &SO_ERROR), "\n";
#		    warn("$!\n");
#		    $! = $err;
		    warn(0+$!, "$!") if $debug;
		}
		else{
		    if(!$wrote{$idx}){
			warn("Ready to write: $woutbits ($winbits)\n") if $debug;
			print("Sending: GET $pathsock{$sockfileno{$idx}} HTTP/1.0\n");
			$sent = send($fh,"GET $pathsock{$sockfileno{$idx}} HTTP/1.0\r\nUser-Agent: c00ld00d/v0 by Matthew Gray <mkgray@netgen.com>  quickwww/1.0\r\n\r\n",0);
#			warn("Sent: $sent ($!)\n");
			$wrote{$idx}=1;
			warn("Sent request on $fh ($idx);\n") if $debug;
		    }
		}
	    }
	}
	
	# Now, read anything there is to read.
#	print("Checking for readability\n");
	$idx = $route = 0;
	$routbits = "";
	if(select($rout=$rin, undef, undef, 0)){
	    $routbits = unpack("b*", $rout);
	    $rinbits = unpack("b*", $rin);
	    while(($idx = index($routbits, "1", $idx+1)) !=-1){
#		if($wrote{$idx}){
		    
#		    warn("Reading ($routbits) [$rinbits]\n") if $debug;
#		print("Reading ($routbits) [$rinbits]\n");
		    $fh = $sockfileno{$idx};
		    $nread = read($fh, $outtemp,32);
		    $output{$idx} .= $outtemp;
		    if(!$nread){
			&close_connection($idx);
#			print("Closed\n");
#		    }
		}
	    }
	}
    }
    $rinbits = unpack("b*", $rin);
    while(($idx = index($rinbits, "1", $idx+1))!=-1){
	$output{$idx} = "HTTP/1.0 700 Timeout\n\nTimeout\n";
	&close_connection($idx);
    }
}


# Things to reset on close:
# hostsock, portsock, pathsock, sockfileno, wrote, rin, win, output

sub close_connection {
    local($fno) = @_;

    $totgot++;
    print("Got $totgot documents\n");
    $fh = $sockfileno{$fno};
    $usedsockets--;
    $host = $hostsock{$fh};
    $port = $portsock{$fh};
    $path = $pathsock{$fh};
    $url = $urlsock{$fh};
    undef $hostsock{$fh};
    undef $portsock{$fh};
    undef $pathsock{$fh};
    undef $urlsock{$fh};
    undef $wrote{$fno};
    vec($win, $fno, 1) = 0;
    vec($rin, $fno, 1) = 0;
    undef $sockfileno{$fno};
#    print("Undef'd it all\n");
    ($head, @body) = split("\n\n", $output{$fno});
    $body= join("\n\n", @body);
    undef $output{$fno};
    &do_doc($url, $head, $body);
    $donecount++;
#    print("Done with $donecount documents (out of $donenum)\n");
#    print("Trying to close $fh ($fno)\n");
    close($fh);
#    print("Leaving close_connection\n");
}
