#!/usr/bin/perl

#unshift(@INC, "/netgen/www/comprehensive/quick250");
require 'should.pl';            # Should and auxiliary routines
require 'indexlog.pl';               # Logging routines & badlink
require 'analyze.pl';           # Analyze document.  get_* & is_real...
require 'queue.pl';             # Queueing routines
require 'url.pl';               # get_url and auxiliary stuff

# ---------------------------------------------------------------------------
# Main driver: breadth-first crawl starting from the URLs on the command line.
#
# Usage:  script [-f] URL [URL ...]
#
#   -f   remove any existing "memory", "index" and "errors" files first;
#        without it the script refuses to run when one of them exists.
#
# @mylist is the work queue and @reflist the parallel queue of referrers
# (one entry per queued URL).  The string "DEPTH" is a sentinel in @mylist
# marking the end of one crawl level; it has no counterpart in @reflist.
# Only URLs containing the host of the first start URL are followed.
# ---------------------------------------------------------------------------
@mylist = ();
$myn = $addn = 0;
$ct = 0;                        # number of documents processed so far
$| = 1;                         # unbuffered STDOUT so progress shows up live

if (@ARGV && $ARGV[0] eq "-f") {
    shift(@ARGV);
    unlink("memory");
    unlink("index");
    unlink("errors");
}
elsif (-e "memory" || -e "index" || -e "errors") {
    print("Either a memory, an index or an errors file already exists.  Please move them.\n");
    exit(1);
}

# Without a start URL the queue would hold nothing but the DEPTH sentinel.
unless (@ARGV) {
    print STDERR "Usage: $0 [-f] URL [URL ...]\n";
    exit(1);
}

($thehost, $path, $port) = &url::ParseURL($ARGV[0]);

# An empty host would make the same-host filter below accept every URL.
unless (defined($thehost) && length($thehost)) {
    print STDERR "Cannot determine host from start URL '$ARGV[0]'\n";
    exit(1);
}

@mylist  = (@ARGV, "DEPTH");
# One referrer per start URL keeps @reflist in step with @mylist.
@reflist = ("Starting point") x @ARGV;

&logger::init;
$depth = 0;
while (@mylist) {
    $url = shift(@mylist);
    if ($url eq "DEPTH") {
        print("Processed a total of $ct documents through depth $depth\n");
        # Nothing left behind the sentinel: the crawl is complete.  Pushing
        # the sentinel back here would spin forever on an empty queue.
        last unless @mylist;
        $depth++;
        push(@mylist, "DEPTH");
        next;
    }

    # Consume the referrer before any filtering so both queues stay aligned.
    $refer = shift(@reflist);

    # Normalise the URL so the visited-set lookup is consistent.
    ($host, $path, $port) = &url::ParseURL($url);
    $url = "http://$host:$port$path";

    next if $beenthere{$url};
    next if $url =~ /[\#\$]/;               # fragments / odd characters
    next if $url =~ /\.(?:gz|Z|ps)$/;       # compressed files and PostScript
    next if $url =~ /mailto:/;
    next if $url =~ /-thread/;
    # Literal substring test: the host must not be treated as a regex
    # (dots would match any character).
    next unless index($url, $thehost) >= 0;

    print("Depth $depth: $url\n");
    ($hdrs, $doc) = &get_url($url);
    &process_doc($url, $hdrs, $doc);
    $beenthere{$url} = 1;
    $ct++;
}


# process_doc($url, $hdrs, $doc)
#
# Handle one fetched document:
#   * if the URL ends in .Z/.gz, run the body through zcat first;
#   * if the headers contain "Not Found", append "referrer -> url" to the
#     "errors" file (the referrer is read from the global $refer) and return;
#   * otherwise extract links, title, headers and beginning text, record
#     the visit via log_visit, and queue every link not yet in %beenthere
#     on @mylist, with $url as its referrer on @reflist.
#
# Arguments:
#   $url   URL the document was fetched from
#   $hdrs  response headers as a single string
#   $doc   document body
#
# Returns nothing meaningful; works through the globals @mylist, @reflist
# and the files written by the logging routines.
sub process_doc {
    # local (not my) is kept deliberately: it gives dynamic scope, and the
    # helper routines in the required files may read these names as globals
    # -- NOTE(review): confirm before converting to lexicals.
    local($url, $hdrs, $doc) = @_;

    if ($url =~ /\.Z$/i || $url =~ /\.gz$/) {
        my $tmpfile = "/tmp/uncompressme.$$.gz";
        # Truncate rather than append, so a stale file left by an earlier
        # process with the same pid cannot corrupt the data.
        if (open(TMP, ">$tmpfile")) {
            binmode(TMP);                   # compressed data is binary
            print TMP $doc;
            close(TMP);
            $doc = `zcat $tmpfile`;
            unlink($tmpfile);
        }
        else {
            print STDERR "Cannot write $tmpfile: $!\n";
        }
    }

    if ($hdrs =~ /Not Found/i) {
        # Broken link: record who pointed at it and stop.
        if (open(ERRS, ">>errors")) {
            print(ERRS "$refer -> $url\n");
            close(ERRS);
        }
        else {
            print STDERR "Cannot append to errors: $!\n";
        }
        return;
    }

    @local_links = &get_links($doc);
    # scalar(@array) is the element count; $#array is the last index and
    # would under-report by one (and print -1 for no links).
    print("Found " . scalar(@local_links) . " in $url\n");
    $title     = &get_title($doc);
    $headers   = &get_headers($doc);
    $beginning = &get_beginning($doc, $beg_def{$ancestors[0]});
    &log_visit($url, $title, $headers, $beginning);

    # NOTE(review): $ppid is not used in this routine; kept in case one of
    # the required libraries reads the global.
    $ppid = getppid();
    foreach $ll (@local_links) {
        $fullurl = &url::ConstructURL($url, $ll);
        # Queue the link and its referrer together so the two lists stay
        # index-aligned.
        unless ($beenthere{$fullurl}) {
            push(@mylist, $fullurl);
            push(@reflist, $url);
        }
        # NOTE(review): result is unused here; it overwrites the globals
        # $site, $path and $port -- presumably a leftover, verify and remove.
        ($site, $path, $port) = &url::ParseURL($fullurl);
    }
    # NOTE(review): Q is never opened in this routine; presumably it belongs
    # to the queueing library -- confirm.
    close(Q);
}
