require 'html.pl';

# get_title($d)
#
# Return the text between the first <title> tag and the nearest
# following </title> in $d. Tags are matched case-insensitively and
# newlines in the document are treated as spaces. Returns undef when
# the document contains no non-empty title element.
#
# The result is also stored in the package variable $title, exactly as
# the previous implementation did.
# NOTE(review): $title is left global in case callers read it directly;
# make it lexical once the callers have been checked.
sub get_title {
    my ($d) = @_;
    $d =~ s/\n/ /g;

    # Only trust $1 after a successful match: a failed match leaves the
    # capture of some earlier, unrelated match in place, which used to
    # be returned as the "title". The non-greedy quantifier stops at the
    # first closing tag instead of the last one in the document.
    if ($d =~ /<title>(.+?)<\/title>/i) {
        $title = $1;
    }
    else {
        $title = undef;
    }

    return $title;
}

# get_headers($d)
#
# Replace every newline in $d with a space, hand the result to
# html::headers (from html.pl) together with the arguments 1, 2 and 3,
# and return the list it yields joined by single spaces.
# NOTE(review): 1, 2, 3 are presumably the heading levels (h1-h3) to
# extract -- confirm against html.pl.
sub get_headers {
    my ($d) = @_;

    $d =~ s/\n/ /g;

    my @found = html::headers($d, 1, 2, 3);
    return join(' ', @found);
}

# get_links($doc)
#
# Return the href values of all <a ...> tags in $doc, in document order.
# In scalar context the number of links is returned, since the sub
# returns an array. Tag and attribute names are matched
# case-insensitively; newlines in the document are treated as spaces.
#
# Both double-quoted and single-quoted attribute values are recognised.
# Unquoted values (href=foo.html) are still not matched.
#
# The list is also left in the package variable @anchs, exactly as the
# previous implementation did.
# NOTE(review): @anchs is left global in case callers read it directly.
sub get_links {
    my ($doc) = @_;

    @anchs = ();
    $doc =~ s/\n/ /g;

    # <a must be followed by whitespace (space or tab). "href" must start
    # an attribute, i.e. follow whitespace or the closing quote of the
    # previous attribute, so that "data-href" or "location.href" inside
    # another attribute is not mistaken for the link target.
    while ($doc =~ /<a\s[^>]*(?<=[\s"'])href\s*=\s*(?:"([^"]+)"|'([^']+)')[^>]*>/ig) {
        push(@anchs, defined $1 ? $1 : $2);
    }

    return @anchs;
}

# get_beginning($doc)
#
# Return a leading excerpt of $doc that ends just before a structural
# tag (see _get_beg_break), so the excerpt stops at a natural break.
# Newlines are replaced by spaces first. The tag is looked for in three
# successively earlier windows of character offsets: 600-1200, then
# 300-600, then 100-300. If no window contains a usable tag, the first
# 600 characters are returned.
#
# Previously the later windows were never tried: get_beg falls back to
# a 600-character prefix, which is true for any non-empty document, so
# the first attempt always "succeeded".
#
# The window bounds are passed through the package variables $lowmark
# and $highmark, and the result is also left in $out, as before.
# NOTE(review): these are left global in case other code reads them.
sub get_beginning {
    my ($doc) = @_;
    $doc =~ s/\n/ /g;

    foreach my $window ([600, 1200], [300, 600], [100, 300]) {
        ($lowmark, $highmark) = @$window;
        $out = _get_beg_break($doc);
        return $out if defined $out;
    }

    $out = substr($doc, 0, 600);
    return $out;
}

# get_beg($doc)
#
# Return the prefix of $doc that ends before the first structural tag
# located strictly between the offsets $lowmark and $highmark (package
# variables set by the caller), or the first 600 characters of $doc if
# there is no such tag. Newlines are not normalised here; get_beginning
# does that before searching.
sub get_beg {
    my ($doc) = @_;

    my $cut = _get_beg_break($doc);
    return defined $cut ? $cut : substr($doc, 0, 600);
}

# _get_beg_break($doc)
#
# Private helper. Search $doc for "<h"/"<H", then "<p>"/"<P>", then
# "<ul>"/"<UL>", in that priority order. For the first tag kind that
# occurs at an offset strictly between $lowmark and $highmark, return
# the text preceding that occurrence. Returns undef (empty list in list
# context) when no tag falls inside the window.
#
# The scan position is kept in the package variable $ind, as before.
sub _get_beg_break {
    my ($doc) = @_;

    foreach my $tag ('<h', '<H', '<p>', '<P>', '<ul>', '<UL>') {
        $ind = 0;
        while (($ind = index($doc, $tag, $ind + 1)) != -1) {
            # Occurrences only move forward, so nothing past the upper
            # bound can qualify for this tag.
            last if $ind >= $highmark;

            # NOTE(review): a length of $ind - 1 also drops the character
            # immediately before the tag. Kept as in the original --
            # confirm whether that is intentional.
            return substr($doc, 0, $ind - 1) if $ind > $lowmark;
        }
    }

    return;
}


# get_types()
#
# Return the fixed string "Type_descriptor". Arguments are ignored.
sub get_types {
    return 'Type_descriptor';
}

# get_markup($doc)
#
# Return length($doc) divided by the length of $doc with every
# <...> tag removed. When nothing is left after removing the tags,
# the divisor is 100000 instead, which also avoids dividing by zero.
#
# The stripped text and the divisor are left in the package variables
# $ndoc and $denom.
sub get_markup {
    my ($doc) = @_;

    # Copy first so the caller's text is measured with its tags intact.
    $ndoc = $doc;
    $ndoc =~ s/<[^>]*>//g;

    $denom = length($ndoc) ? length($ndoc) : 100000;

    return length($doc) / $denom;
}

# is_real_document($head, $bod)
#
# Split $head on whitespace and return its second field (undef if there
# is none). $bod is accepted but not used.
# NOTE(review): presumably $head starts with an HTTP status line such as
# "HTTP/1.0 200 OK", making the result the status code. Despite the
# name, the value is not a boolean -- confirm how callers test it.
#
# The first field, the returned field and the remaining fields are left
# in the package variables $prot, $ret and @crap.
sub is_real_document {
    my ($head, $bod) = @_;

    ($prot, $ret, @crap) = split(' ', $head);

    return $ret;
}
1;
