#!/usr/bin/perl

unshift(@INC, "/netgen/www/comprehensive/quick250");
require 'should.pl';            # Should and auxiliary routines
require 'log.pl';               # Logging routines & badlink
require 'analyze.pl';           # Analyze document.  get_* & is_real...
require 'queue.pl';             # Queuing routines
require 'url.pl';               # get_url and auxiliary stuff

@mylist = ();
$myn =$addn= 0; $| =1;
$howmany = $ARGV[2];
mkdir("/usr/tmp/quick.$$", 0777);
mkdir("/usr/tmp/quick.$$/html", 0777);
print("Writing reports to /usr/tmp/quick.$$/html/report.hml\n");

open(URL, ">/usr/tmp/quick.$$/0");
print URL "NOPARENT\n$ARGV[0]\n";
close(URL);

$limit = $ARGV[1];
&logger'init;

while($totgot < $howmany){
    if($#mylist >= 0){
	$url = shift(@mylist);
	($host, $path, $port) = &url'ParseURL($url);
	$url = "http://$host:$port$path";
#	print("checking memory for $url\n");
	next if (`grep  $url /usr/tmp/quick.$$/memory` ne "");
#	print("Is it a hash?\n");
	next if $url =~ /[\#\$]/;
	next if $url =~ /\.gz$/;
	next if $url =~ /\.Z$/;
	next if $url =~ /\.ps$/;
        next unless ($url=~/$limit/ || $parent=~/$limit/);
	$addn++;
	open(Q, ">>/usr/tmp/quick.$$/$addn");
	print(Q $url, "\n");
	close(Q);
#	print("Getting $url\n");
	$hdrs = ""; $doc = "";
	($hdrs, $doc) = &get_url($url);
	$totgot++;
	print("Done $totgot documents\n");

	if($nkids > 6){
	    for $i (1..3){
#		print("Waiting for a child to exit...\n");
		wait();
#		print("...done\n");
		$nkids--;
	    }
	}

#	print("Forking...\n");
	$pid = fork();
	if(!$pid){
	    &process_doc($url, $hdrs, $doc);
#	    print("Done processing $url\n");
	    exit;
	}
	$nkids++;
    }
    else{
        $waited=0;
	&load_list($myn++,0);
    }
}

&generate_reports;
exit;

sub load_list {
    local($file, $waited) = @_;
#    print "Loading $file\n";
    if(-e "/usr/tmp/quick.$$/$file"){
	sleep(1);
	open(FILE, "/usr/tmp/quick.$$/$file");
	chop($parent = (<FILE>));
	while(<FILE>){
	    chop;
#	    print("Putting $_ on list\n");
	    push(@mylist, $_);
	}
#	print("Done.\n");
    }
    else{
#	print "Waiting...(for the $waited th time) to load $file\n";
	sleep(5);
	if($waited > 5){
	    $waited=0;
	    $myn--;
	    $file--;
	}
	&load_list($file, $waited++);
    }
}

sub process_doc {
    local($url, $hdrs, $doc) = @_;

#    print("Processing $url\n");

    @local_links = &get_links($doc);
    $ll = $#local_links+1;
    @local_links = (@local_links, &get_images($doc));
    $nimages = ($#local_links+1)-$ll;
    $title = &get_title($doc); $headers = '';
    $headers = &get_headers($doc);
#    $beginning = &get_beginning($doc, $beg_def{$ancestors[0]});
#    $types = &get_types($doc);
#    $markup = &get_markup($doc);
    &log_visit($url, $title, $headers, $beginning, $types,
	       $#local_links, $#ancestors, $markup);

    $r1 = $resp = "";
    ($r1, @rest) = split("\n", $hdrs);
    ($http10, $resp, @english) = split(" ", $r1);
    $resp = "bad" if !$resp;

    $size = ($hdrs=~/Content-length: (\d+)/) ? $1:length($doc);
    
    $ppid = getppid();
    open(REPLOG, ">>/usr/tmp/quick.$ppid/reportlog");
    print(REPLOG "$url $parent $resp $ll $nimages $size\n");
    close(REPLOG);


#    print("Writing links ($#local_links)\n");
    open(Q, ">>/usr/tmp/quick.$ppid/$addn") || warn "Hey, can't write quick.$$/$addn";
    foreach $ll (@local_links){
	$fullurl = &url'ConstructURL($url, $ll);
#	$fullurl =~ y/A-Z/a-z/;
	print Q $fullurl."\n";
    }
    close(Q);
}

sub generate_reports {
    print("Generating report...\n");
    sleep(5);

    open(REPLOG, "/usr/tmp/quick.$$/reportlog");
    while(<REPLOG>){
	($url, $anc, $status, $links, $images, $size) = split;
	$ancest{$url} = $anc;
	$status{$url} = $status;
	$links{$url} = $links;
	$images{$url} = $images;
	$size{$url} = $size;

	$totlinks+=$links;
	$totimages+=$images;
	$totsize+=$size;
	$filesize+=$size unless $url=~/\.gif$/i;
	$imagesize+=$size if $url=~/\.gif$/i;
	$count++;
	$rcount++ unless $url=~/\.gif$/i;

	$type = substr($status, 0, 1);
	if($type eq "3"){
	    push(@moved, $url);
	}
	elsif($type eq "b"){
	    push(@broken, $url);
	}
	elsif($type eq "4"){
	    push(@notfound, $url);
	}
	elsif($type eq "5"){
	    push(@servererror, $url);
	}
	elsif($type ne "2"){
	    push(@uhoh, $url);
	}
    }
    close(REPLOG);

    open(OUT, ">/usr/tmp/quick.$$/html/report.html");
    select(OUT);
    print("<h1>net.Quality Report</h1>\n");
    print("A total of $rcount documents and ",$count-$rcount," images were retrieved, starting at <a href=\"$ARGV[0]\">$ARGV[0]</a>.\n");
    print("<p>The following statistics were gathered:<ul>\n");
    print("<li>Average number of links: ", int($totlinks/$rcount), "\n");
    print("<li>Average number of images: ", int($totimages/$rcount), "\n");
    print("<li>Average size, bytes (no images): ", int($filesize/$rcount), "\n");
    print("<li>Average size, bytes (images): ", int($imagesize/($count-$rcount)), "\n");
    print("<li>Average size, bytes (all): ", int($totsize/$count), "\n");
    print("<li>Moved Documents: ", $#moved+1, "\n");
    print("<li>Total Errors: ", $#broken+$#notfound+$#servererror+$#uhoh+4, "\n");
    print("</ul>\n");
    print("<h2>Error Reports</h2>");
    print("<ul>\n");
    if(($#moved+1) >0){
	print("<li><a href=\"moved.html\">Moved Documents</a> (not <em>real</em> errors, but useful to know about)\n");

	&error_report("moved.html", "Moved Documents", @moved);
	$errors=1;
    }
    if(($#notfound+1) >0){
	print("<li><a href=\"notfound.html\">Not Found Documents</a> (server couldn't find them)\n");

	&error_report("notfound.html", "Not Found Documents", @notfound);
	$errors=1;
    }
    if(($#broken+1) >0){
	print("<li><a href=\"broken.html\">Very Broken Links</a> (total failures)\n");

	&error_report("broken.html", "Very Broken Documents", @broken);
	$errors=1;
    }
    if(($#servererror+1) >0){
	print("<li><a href=\"servererror.html\">Server Errors</a> (often broken CGI's)\n");


	&error_report("servererror.html", "Server Errors", @servererror);
	$errors=1;
    }
    if(($#uhoh+1) > 0){
	print("<li><a href=\"uhoh.html\">Confusing Errors</a> (who knows what's wrong)\n");

	&error_report("uhoh.html", "Confusing Errors", @uhoh);
	$errors=1;
    }
    if(!$errors){
	print("<li>Congrats, No errors!\n");
    }

    print("</ul>\n");
    close(OUT);
    select(STDOUT);

}

sub error_report {
    local($file, $title, @errors) = @_;

    open(ERR, ">/usr/tmp/quick.$$/html/$file");
    print(ERR "<h1>$title</h1>\n");
    print("<ul>\n");
    for $err (@errors){
	$an = $ancest{$err};
	$an =~ s,http://[^/]+/?,,i;
	print(ERR "<li><a href=\"$err\">$err</a> is pointed to in document <a href=\"$ancest{$err}\">$an</a>\n");
    }  
    print("</ul>\n");
    close(ERR);
}
     
