#===========================================================================

package Sitescooper::Robot;

require Exporter;
use Carp;
use Sitescooper::StripTablesFilter;

@ISA = qw(Exporter);
@EXPORT= qw();
$VERSION = "0.1";
sub Version { $VERSION; }

sub new {
  my $class = shift; $class = ref($class) || $class;

  my $url = shift;
  my $outfile = shift;
  my $outidxfile = shift;

  my $self = {
    'url'		=> $url,
    'outfile'		=> $outfile,
    'outtmp'		=> $outfile.".tmp",	# default value
    'outidxfile'	=> $outidxfile,
    'prctitle'		=> undef,
    'sitename'		=> undef,
    'site'		=> undef,

    'cachedir'		=> undef,
    'newcachedir'	=> undef,
    'filekey'		=> undef,

    'convert_now'	=> 0,

    'sitekey'		=> undef,

    'warn_about_ext_links' => 0,	# turned on where necessary

    'page_to_tmpfile'	=> { },
    'output_file'	=> { }
  };

  bless ($self, $class);
  $self->clear_page_tmpfiles();
  $self;
}
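
# Typical usage (a sketch only -- the real caller lives in the main
# sitescooper script, so the field assignments below are illustrative):
#
#   my $robot = Sitescooper::Robot->new ($url, $outfile, $outidxfile);
#   $robot->{sitename} = $name;       # hypothetical values
#   $robot->{prctitle} = $title;
#   $robot->scoop_site ();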

sub dont_convert {
  my $self = shift;

  if (defined $self->{filekey}) {
    undef $Scoop::key2syncfile{$self->{filekey}};
  } else {
    carp "cannot block conversion of site without filekey!";
  }
}

sub scoop_site {
  local ($_);
  my $self = shift;
  my $url = $self->{url};

  if (!defined $self->{sitename}) { $self->{sitename} = $url; }

  # This apparently is needed on MacOS. Sounds unlikely, but there it
  # is...
  if (&Scoop::MyOS eq 'Mac') {
    my $parentdir = Scoop::dirname($self->{outfile});
    if (!-d $parentdir) { mkdir ($parentdir, 0755); }
  }

  $_ = $SCF::site_defined_at{$url}; /^(.*):/; my $site_file_name = $1;
  &Scoop::verbose ("SITE START: now scooping site \"$site_file_name\".");

  &Scoop::dbg ("tmp dir: $self->{outtmp}, output dir: $self->{outfile}");

  (-d $self->{outtmp}) && &File::Path::rmtree ($self->{outtmp});
  mkdir ($self->{outtmp}, 0755) || die "cannot mkdir $self->{outtmp}\n";

  $self->clear_page_tmpfiles();

  my $output_filename = $self->{outtmp}.$CF::slash.$self->{outidxfile};

  # evaluate perl code for this site.
  my $proc = Scoop::get_layout_param ('eval_code', $url, $url);
  if (defined $proc) {
    my $skip_site = 0;
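    # the site's EvaluatePerl snippet runs here; it can set $skip_site to a
    # true value to have the whole site skipped.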
    if (!eval $proc."; 1;") {
      &Scoop::sitewarn("EvaluatePerl failed: $@");
    } else {
      if ($skip_site) {
	&Scoop::dbg ("skip_site set, skipping this site.");
	return;			# skip this site entirely
      }
    }
  }

  %{$self->{output_file}} = ();
  ${$self->{output_file}}{'MAIN'} = '';
  %{$self->{output_links_snarfed}} = ();
  %{$self->{new_already_seen_age_cache_data}} = ();
  @{$self->{output_story_urls}} = ();

  my $upindex = $self->{current_story_index} = 0;
  my $hdr;

  if ($CF::writeheader) {
    if ($CF::outstyle == $CF::OUT_HTML) {
      $hdr = "<html><head><title>$self->{prctitle}</title></head>".
	      "<body><h1>$self->{prctitle}</h1>\n";
    } else {
      $hdr = "$self->{prctitle}\n\n\n";
    }

  } else {
    if ($CF::outstyle == $CF::OUT_HTML) {
      $hdr = "<html><head></head><body>\n";
    } else {
      $hdr = "";
    }
  }
  ${$self->{output_file}}{'MAIN'} .= $hdr;

  $self->{stories_found} = 0;
  $self->{file_size} = 0;
  $self->{hit_file_size_limit} = 0;

  &Scoop::set_got_intr_behaviour ('setflag');
  my $u;

  foreach $u ($url, split (' ', $SCF::extra_urls{$url})) {
    # if we were interrupted, clear the flag and go on
    if ($Scoop::got_intr_flag) { &Scoop::set_got_intr_behaviour ('setflag'); }
    $self->add_page_tmpfile ($self->{outtmp}.$CF::slash.$self->{outidxfile}, $u);

    if ($SCF::levels{$url} >= 0) {
      $self->download_front_page ($u, $SCF::levels{$url}, $upindex);
    } else {
      $self->download_story_page ($u, 1, $upindex);
    }
  }

  # Now go through any additional URLs at the different levels, starting
  # at the highest level and working down.
  #
  my $lev;
  for ($lev = $SCF::levels{$url}; $lev >= -1; $lev--)	# -1 = story level
  {
    next unless (defined $Scoop::extra_urls{"$lev $url"});

    foreach $u (split (' ', $Scoop::extra_urls{"$lev $url"}))
    {
      # if we were interrupted, clear the flag and go on
      if ($Scoop::got_intr_flag) { &Scoop::set_got_intr_behaviour ('setflag'); }
      $self->add_page_tmpfile ($self->{outtmp}.$CF::slash.$self->{outidxfile}, $u);

      if ($lev >= 0) {
	$self->download_front_page ($u, $lev, $upindex);
      } else {
	$self->download_story_page ($u, 1, $upindex);
      }
    }
  }

  &Scoop::set_got_intr_behaviour ('exit');

  # kld: addition for image-only sites
  # jm -- TODO:  needs checking, this.
  if (defined $SCF::image_only_site{$url} &&
	$SCF::image_only_site{$url} == 1 &&
	$self->{stories_found} == 0)
  {
    &Scoop::verbose ("$self->{prctitle}: forcing conversion for image-only site");
    $self->{stories_found} = 1;
  }

  if ($self->{stories_found} > 0) {
    &Scoop::verbose ("$self->{prctitle}: $self->{stories_found} ".
	    "stories downloaded (".
	    sprintf ("%3.1f", $self->{file_size}/1024).
	    " K uncompressed).");

    my $ind = $self->{current_story_index};
    foreach $ofkey (keys %{$self->{output_file}}) {
      # convert sitescooper navigation links: [<<][^][>>]
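      # (each story was written with placeholder hrefs of the form
      # "__SITESCOOPER_STORY_<n>"; below, each placeholder is replaced by
      # the real URL of story <n>, and links that point before the first
      # story or past the last one are dropped entirely.)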
      my $story = ${$self->{output_file}}{$ofkey};

      # trim off the first and last ones anyway
      $story =~ s/\[<a href=\"__SITESCOOPER_STORY_(-1|${ind})\">.*?<\/a>\]//g;

      # and run through the rest
      for ($i = 0; $i < $ind; $i++) {
	next unless (defined ${$self->{output_story_urls}}[$i]);
	$story =~
	s/\"__SITESCOOPER_STORY_${i}\"/\"${$self->{output_story_urls}}[$i]\"/g;
      }

      # remove stray links
      ${$self->{output_file}}{$ofkey} = $self->remove_external_links ($story);
    }

    # if we're in single-page mode, rewrite the <a name> anchors to be
    # much shorter, by numbering them. Off for the time being. TODO

    my $turned_off_for_now = 0;
    if ($turned_off_for_now && !$CF::fileperpage) {
      $_ = ${$self->{output_file}}{'MAIN'};

      my %ankmap = ();
      my $curank = 0;

      s{<a\s+name\s*=\s*\'([^\']+?)\'}{	#'
	if (!defined ($ankmap{$1})) { $ankmap{$1} = $curank++; }
	"<a name=\"".$ankmap{$1}."\"";
      }gies;

      s{<a\s+name\s*=\s*\"([^\"]+?)\"}{
	if (!defined ($ankmap{$1})) { $ankmap{$1} = $curank++; }
	"<a name=\"".$ankmap{$1}."\"";
      }gies;

      s{<a\s+name\s*=\s*([^\"\'][^\s]+?)}{
	if (!defined ($ankmap{$1})) { $ankmap{$1} = $curank++; }
	"<a name=\"".$ankmap{$1}."\"";
      }gies;

      s/(href\s*=\s*\'\#)([^\']+?)\'/$1${ankmap{$2}}\'/gis;
      s/(href\s*=\s*\"\#)([^\"]+?)\"/$1${ankmap{$2}}\"/gis;
      s/(href\s*=\s*\#)(\S+?)/$1${ankmap{$2}}/gis;
    }

    my $blurb1 = "(End of snarf - copyright retained by original ".
      "providers. ";
    my $blurb2 = "";
    if (defined $SCF::rights{$url}) { $blurb2 = $SCF::rights{$url}; }
    my $blurb3 = "Downloaded and converted by sitescooper; see ".
      "$Scoop::home_url )";

    if ($CF::writefooter) {
      if ($CF::outstyle == $CF::OUT_HTML) {
	${$self->{output_file}}{'MAIN'} .= "\n\n<p><hr>".
			"<font size=1><i>$blurb1 $blurb2 $blurb3</i></font>\n";
      } elsif ($CF::outstyle == $CF::OUT_DOC) {
	${$self->{output_file}}{'MAIN'} .= "$blurb1 $blurb2 $blurb3\n";
      } else {
	${$self->{output_file}}{'MAIN'} .= "$blurb1\n$blurb2\n$blurb3\n";
      }
    }

    if ($CF::outstyle == $CF::OUT_HTML) {
      ${$self->{output_file}}{'MAIN'} .= "</body></html>\n";
    } elsif ($CF::outstyle == $CF::OUT_DOC) {
      ${$self->{output_file}}{'MAIN'} .= "<$Scoop::bookmark_char>\n";
    }

    open (OUTFILE, "> $output_filename")
		    or die "Failed to create $output_filename\n";
    print OUTFILE ${$self->{output_file}}{'MAIN'};
    close OUTFILE or warn "Failed to write to $output_filename";

    if (!$CF::fileperpage) {
      if ($CF::dump) {
	open (IN, "<".$self->{outtmp}.$CF::slash.$self->{outidxfile});
	while (<IN>) { print STDOUT; }
	close IN; &File::Path::rmtree ($self->{outtmp});

	# ensure we do not try to convert it later
	$self->dont_convert();

      } else {
	&File::Path::rmtree ($self->{outfile});
	rename ($self->{outtmp}, $self->{outfile});
	if ($self->{convert_now}) { &Scoop::convert_output($self->{filekey}, $url); }
      }

    } else {
      foreach $_ (keys %{$self->{output_file}}) {
	next if ($_ eq 'MAIN');
	open (OUTFILE, "> $_")
			or die "Failed to create $_\n";
	print OUTFILE ${$self->{output_file}}{$_};
	close OUTFILE or warn "Failed to write to $_";
      }

      &File::Path::rmtree ($self->{outfile});
      rename ($self->{outtmp}, $self->{outfile});

      if ($CF::dump) {
	# print the path to the created directory containing the pages
	print $self->{outfile}."\n";
	# ensure we do not try to convert it later
	$self->dont_convert();

      } else {
	if ($self->{convert_now}) { &Scoop::convert_output($self->{filekey}, $url); }
      }
    }

    &Scoop::dbg ("output dir: $self->{outfile}");
    &Scoop::dbg ("output index: ".
    		$self->{outfile}.$CF::slash.$self->{outidxfile});

    my ($from, $to);
    while (($from,$to) = each %{$self->{new_already_seen_age_cache_data}}) {
      &Scoop::dbg ("Saving new already_seen age cache entry: $from => $to ".
	      "(".&Scoop::time2datestr($to).")");
      $Scoop::oldest_already_seen{$from} = $to;
    }

  } else {
    close OUTFILE;
    &Scoop::verbose ("$self->{prctitle}: no new stories, ignoring.");
    &Scoop::dbg ("(Not setting already_seen age cache since no links were followed)");
    $self->dont_convert();
    &File::Path::rmtree ($self->{outtmp});
  }

  &Scoop::verbose ("SITE END: done scooping site \"$site_file_name\".");
  delete ${$self->{output_file}}{'MAIN'};
}

# ---------------------------------------------------------------------------
# Note on levels: a 2-level site has a contents page and stories off that;
# 3-level has issue links page, per-issue contents page and stories.
# 1-level has only the story page, no links.
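#
# So for a typical 2-level news site the site URL itself is the contents
# page (handled by download_front_page with $level == 0), and every story
# link found on it is handed to download_story_page.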

sub download_front_page {
  my $self = shift;
  my $url = shift;
  my $level = shift;
  my $upindex = shift;
  my ($cachefile, $page);

  my $baseurl = $self->{url};
  my $key = "$level $baseurl";
  my $human_level = $level + 2;

  $Scoop::sitewarn_current_site_line = $SCF::site_defined_at{$baseurl};

  if ($Scoop::got_intr_flag) { return; }
  if ($self->{hit_file_size_limit}) { return; }

  my $pat = $SCF::links_limit_to{$key};
  if (defined $pat) {
    if (!Scoop::match_url ($url, $pat)) {
      &Scoop::dbg ("front page URL $url does not match $pat, ignoring.");
      return;
    }
  }

  $pat = Scoop::get_layout_param ('links_skip', $key, $url);
  if (defined $pat) {
    if ($url =~ m#^${pat}$#) {
      &Scoop::verbose ("Skipping: $url"); return;
    }
  }

  my $origurl = $url;
  $url = $self->apply_url_postproc ($url);
  if (!defined $url) {
    &Scoop::dbg ("URLProcess says URL should be ignored: $origurl"); return;
  }

  my $fullurl = $url; $url = &Scoop::URLWithoutAnchor ($url);
  return if (defined $self->{already_seen_this_session}{$url});
  $self->{already_seen_this_session}{$url} = 1;
  study $url;

  &Scoop::verbose ("Reading level-".($human_level)." front page: $fullurl");
  &Scoop::set_got_intr_behaviour ('setflag');

  my $is_dynamic_html;
  if (defined $SCF::cacheable{$key}) {
    $is_dynamic_html = ($SCF::cacheable{$key} == 0);
  } elsif (defined $SCF::links_diff{$key} && $SCF::links_diff{$key} != 0) {
    $is_dynamic_html = 1;	# pages that need diff'ing are dynamic
  } else {
    $is_dynamic_html = 1;	# index pages are usually dynamic
  }

  push (@Scoop::seen_this_time, $url);
  $Scoop::already_seen {$url} = 1;
  $self->check_for_oldest ($url);	# we came across the link, so keep it around

  &Sitescooper::ParProxy::check_for_url ($url);
  $page = $self->get_page ($url, $is_dynamic_html);
  if (!defined $page) {
    &Scoop::verbose ("Skipping (get_page returned nothing): $fullurl");
    return;
  }
  if ($Scoop::got_intr_flag) { goto interrupted; }

  if ($CF::linkslimit > 0 && $self->{stories_found} >= $CF::linkslimit) {
    &Scoop::verbose ("over links limit, stopping this site.");
    $self->{hit_file_size_limit} = 1;
    return;
  }

  if ($Scoop::useragent->redirect_occurred()) {
    $url = $Scoop::useragent->get_last_redirect();
    &Scoop::dbg ("turn-over links will use redirect as base URL: $url");
  }

  if ($page =~ /<head>.*<base\s+href\s*=\s*[\"\']*(\S+?)[\"\']*\s*>.*<\/head>/is)
  {
    $url = $1;
    &Scoop::dbg ("BASE HREF tag found, setting new base URL: $url");
    $self->add_page_tmpfile ($self->{outtmp}.$CF::slash.$self->{outidxfile}, $url);
  }

  my $life = Scoop::get_layout_param ('story_lifetime', $baseurl, $url);
  if (defined $Scoop::last_modtime{$url} &&
    			$Scoop::last_modtime{$url} < $life * 24 * 60 * 60)
  {
    &Scoop::verbose ("Skipping (contents are older than ".$life." days): $fullurl");
    return;
  }

  my $origpage = $page;
  &Scoop::journal ("pre_strip_level".($human_level), $page);
  $page = $self->strip_front_page ($url, $key, $page);
  &Scoop::journal ("post_strip_level".($human_level), $page);

  # fetch the links_print setting now; it is needed both for the diff
  # decision here and for the front-page printing logic further down.
  my $lprint = Scoop::get_layout_param ('links_print', $key, $url);

  my $cachedpage;
  if ((defined $SCF::links_diff{$key} && $SCF::links_diff{$key} != 0) ||
     ((defined $lprint && $lprint != 0) || &Scoop::writing_html))
  {
    $cachedpage = $self->strip_front_page ($url, $key,
				$self->get_cached_page_for_diff ($url));
  }

  if (defined $SCF::links_diff{$key} && $SCF::links_diff{$key} != 0) {
    $page = $self->get_new_bits ($cachedpage, $page);
  }

  $self->cache_page_later ($url, $origpage);

  my $proc = Scoop::get_layout_param ('links_preproc', $key, $url);
  if (defined $proc) {
    $_ = $page;
    my $site_level = $human_level;
    if (!eval $proc."; 1;") {
      &Scoop::sitewarn("level-".($human_level)." HTMLPreProc failed: $@");
      # and keep the original $page
    } else {
      $page = $_;
    }
  }

  if (defined fileno Scoop::JOURNAL) {
    # always write a text-mode version for the journal
    &Scoop::journal ("to_text_level".($human_level),
    	$self->html_to_text ($url, $page, $CF::OUT_TEXT));
  }

  my $printed_this_front_page = 0;
  my $has_not_changed = 0;

  if ((defined $lprint && $lprint != 0) || &Scoop::writing_html) {
    $self->{warn_about_ext_links} = 1;
    my $txtpage = $self->html_to_text ($url, $page, $CF::outstyle);
    $self->{warn_about_ext_links} = 0;

    if (defined $cachedpage && !$CF::refresh) {
      # ensure that the cleaned-up HTML doesn't match the cleaned-up cached
      # HTML. Sometimes the ad banners will be the only things that have
      # changed between retrieves, and html_to_text will have stripped those
      # out.
      my $cachedtxt = $self->html_to_text ($url,
      				$cachedpage, $CF::outstyle);
      if (&text_equals ($txtpage, $cachedtxt)) {
	$has_not_changed = 1;
      }
    }

    # if we're only printing the links because we're writing HTML,
    # then use links_trim. (off for the time being!)

    #if (!(defined $lprint && $lprint != 0) && &Scoop::writing_html) {
      #my $ltrim = Scoop::get_layout_param ('links_trim', $key, $url);
      #if ($ltrim) {
	## trim that number of chars after the last link in the page.
	#my $eval = '
	  #$txtpage =~ s/(href=.*?<\/a>.{'.$ltrim.'}\S*).*?$/$1
	  #<i>[irrelevant links trimmed]<\/i>/i;';
	#eval "$eval; 1;" or warn ("trim code eval failed: $@\n$eval\n");
      #}
    #}

    &Scoop::verbose ("Printing: $fullurl");
    $self->write_as_story (1, $url, $txtpage, undef, $upindex);

    if ($has_not_changed && !$CF::refresh) {
      # don't count a front page as a story if:
      # 1. we're just outputting it because we're writing HTML
      # 2. and the page had not changed since previous GET
      # 3. and we're not refreshing

      &Scoop::dbg ("text has not changed, not counting this page as a story");
      $self->{stories_found}--;
    }
  }

  # this is a front page. Pages followed from this page should use this as
  # the "up a level" link.
  $upindex = $self->{current_story_index} - 1;

  # see if there's any links to extra contents pages
  my @turnoverlinks = $self->get_contents_turnover_links ($url, $key, $page);

  my @links = ();
  my $wrote_sep = 0;

  # This was all getting a bit tricky, so I've redone it a bit.
  # It now does not try to strip closing tags, as it doesn't have to.
  while (1) {
    if ($Scoop::got_intr_flag) { goto interrupted; }
    if ($self->{hit_file_size_limit}) { last; }

    if (
      $page =~ s/<a\s+[^>]*href=\s*\"([^\">]+)\"//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*\'([^\'>]+)\'//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*([^\s>]+)//is
      )
    {
      my $link = $1;
      push (@links, $link);
      next;
    }

    # support for frames
    if (
      $page =~ s/<frame\s+[^>]*src=\"([^\">]+)\"//is
       ||
      $page =~ s/<frame\s+[^>]*src=\'([^\'>]+)\'//is
       ||
      $page =~ s/<frame\s+[^>]*src=([^\s>]+)//is
      )
    {
      my $link = $1;
      if (&Scoop::writing_html) {
	if ($wrote_sep == 0) {
	  ${$self->{output_file}}{'MAIN'} .= "<p><hr>\n"; $wrote_sep = 1;
	}
	${$self->{output_file}}{'MAIN'} .=
		$self->translate_link ($fullurl, $link, $link). "<br>\n";
      }
      push (@links, $link);
      next;
    }

    # rudimentary support for My-Netscape-style RDF files
    if ($page =~ s/<item>(.*?)<link\s*[^>]*>(.+?)<\/link>(.*?)<\/item>//is)
    {
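      # a typical item looks like
      #   <item><title>Headline</title><link>http://host/story</link></item>;
      # some feeds nest the actual URL inside <url>...</url> within <link>,
      # which is handled just below.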
      my ($title, $link, $title2) = ($1, $2, $3);

      # <link> tags in RSS can contain other crap. Ditch it; we want the link!
      $link =~ s/^.*<url>(.*?)<\/url>.*$/$1/gis;

      $link = &Scoop::AbsoluteURL ($url, $link);
      if ($title =~ /<title>(.*?)<\/title>/is
	   || $title =~ /<text>(.*?)<\/text>/is
	   || $title2 =~ /<title>(.*?)<\/title>/is
	   || $title2 =~ /<text>(.*?)<\/text>/is)
      {
	$SCF::url_title{$link} = $1;
      }

      push (@links, $link);
      next;
    }

    last;		# no more links available
  }

  if ($#links >= 0) {
    &Scoop::verbose ("Found ".($#links+1)." links, examining them.");
  }

  # now traverse the links and get the stories
  &Scoop::journal ("links_level".($human_level), join ("\n", @links));
  my $followed_a_link = 0;

  foreach $_ (@links) {
    if ($self->{hit_file_size_limit}) {
      my $msg = "File size limit exceeded,".
			  " skipped some stories from this site.";
      &Scoop::verbose ($msg);
      if (&Scoop::writing_html) {
	${$self->{output_file}}{'MAIN'} .= "<hr><i>$msg</i><br>\n";
      } else {
	${$self->{output_file}}{'MAIN'} .= "\n($msg)\n";
      }
      last;
    }

    $self->follow_front_link ($url, $level, $_, $upindex)
				and ($followed_a_link = 1);

    if ($Scoop::got_intr_flag) { goto interrupted; }
  }

  # if there's more contents pages, process them as well.
  &Scoop::journal ("turnover_links_level".($human_level), join ("\n", @turnoverlinks));
  if ($#turnoverlinks >= 0) {
    my $link;
    for $link (@turnoverlinks) {
      if ($Scoop::got_intr_flag) { goto interrupted; }
      $link = &Scoop::AbsoluteURL ($url, $link);
      $self->download_front_page ($link, $level, $upindex)
				and ($followed_a_link = 1);
    }
  }

  &Scoop::dbg ("stories found so far: ".$self->{stories_found});

interrupted:
  &Scoop::set_got_intr_behaviour ('exit');

  ($printed_this_front_page || $followed_a_link || !$has_not_changed);
}

# ---------------------------------------------------------------------------

sub follow_front_link {
  my $self = shift;
  my $baseurl = $self->{url};
  my ($url, $level, $nextpage, $upindex) = @_;

  $nextpage = &Scoop::AbsoluteURL ($url, $nextpage);
  return if ($nextpage !~ /^(http|file):/i);	# only supported links

  &Scoop::dbg ("Link found on $url: $nextpage");

  # should we download the next front page?
  if ($level > 0) {
    return $self->download_front_page ($nextpage, $level-1, $upindex);
  }
  if ($Scoop::got_intr_flag) { return; }

  # nope, we're onto the stories already
  $nextpage = $self->make_printable ($nextpage, 1);

  $self->download_story_page ($nextpage, 0, $upindex);
}

sub make_printable {
  my $self = shift;
  my $nextpage = shift;
  my $warn_if_fail = shift;

  my $baseurl = $self->{url};
  my $sub = Scoop::get_layout_param ('printable_sub', $baseurl, $nextpage);
  if (defined $sub) {
    my $new = $nextpage;
    $sub =~ s/\\(\d+)/\$$1/g;	# avoid warnings

    eval '$new =~ '.$sub.'; 1;'
      or &Scoop::sitewarn ("Printable substitution failed! ($!)\n");

    if ($nextpage ne $new) {
      # &Scoop::verbose ("Using printable version instead: $new");
      my $limitto = $SCF::story_limit_to{$baseurl};
      if (defined $limitto && !Scoop::match_url ($new, $limitto)) {
	if ($warn_if_fail) {
	  &Scoop::sitewarn ("Printable version does not match StoryURL".
		"pattern, reverting from $new to $nextpage\n");
	}
      } else {
	$nextpage = $new;
      }
    }
  }
  $nextpage;
}

# ---------------------------------------------------------------------------

sub download_story_page {
  my $self = shift;
  my $url = shift;
  my $is_dynamic_html = shift;
  my $upindex = shift;

  my $baseurl = $self->{url};
  my $fullurl = $url; $url = &Scoop::URLWithoutAnchor ($url);
  study $url;

  $Scoop::sitewarn_current_site_line = $SCF::site_defined_at{$baseurl};

  my $cacheflag = $SCF::cacheable{"s $baseurl"};
  if (defined $cacheflag) {
    # user setting overrides our heuristics
    $is_dynamic_html = ($cacheflag==0);
  }
  if (defined $SCF::story_diff{$baseurl} && $SCF::story_diff{$baseurl}) {
    $is_dynamic_html = 1;	# diff pages are always dynamic
  }

  my $limitto = $SCF::story_limit_to{$baseurl};
  if (defined $limitto) {
    if (!defined $self->{output_storyurl_dbg}{$baseurl}) {
      &Scoop::dbg ("StoryURL for $baseurl: $limitto");
      $self->{output_storyurl_dbg}{$baseurl} = 1;
    }

    if (!Scoop::match_url ($url, $limitto)) {
      &Scoop::dbg ("Non-story URL ignored: $fullurl");
      return;
    }
  }

  if ($url =~ m,^(ftp|mailto|https|gopher|pnm):,) {
    &Scoop::dbg ("Non-story URL ignored (bad protocol): $fullurl");
    return;
  }

  $url = $self->apply_url_postproc($url);
  if (!defined $url) {
    &Scoop::dbg ("URLProcess says URL should be ignored: $fullurl"); return;
  }

  my $pat = Scoop::get_layout_param ('story_skip', $baseurl, $url);
  if (defined $pat) {
    if ($url =~ m#^${pat}$#) {
      &Scoop::verbose ("Skipping: $fullurl"); return;
    }
  }

  $self->check_for_oldest ($url);

  if (!$is_dynamic_html && $Scoop::already_seen {$url}) {
    &Scoop::dbg ("skipping, already seen: $fullurl");
    return;
  }

  push (@Scoop::seen_this_time, $url);
  $Scoop::already_seen {$url} = 1;

  if ($self->{hit_file_size_limit}) { return; }
  $self->get_story_page ($url, $is_dynamic_html, $upindex);
  1;
}

# ---------------------------------------------------------------------------

sub get_story_page {
  my $self = shift;
  my $url = shift;
  my $baseurl = $self->{url};
  my $is_dynamic_html = shift;
  my $upindex = shift;
  my @turnoverlinks;
  my $headline;

  &Scoop::verbose ("Reading: $url");
  $self->check_for_oldest ($url);	# we came across the link, so keep it around

  my $cachedpage = undef;
  if (defined $SCF::story_diff{$baseurl} && $SCF::story_diff{$baseurl}) {
    $cachedpage = $self->get_cached_page_for_diff ($url);
  } elsif ($is_dynamic_html) {
    $cachedpage = $self->get_cached_page ($url);
  }
  if (defined $cachedpage) {
    $cachedpage = $self->strip_story ($url, $cachedpage, " (cached)");
  }

  &Sitescooper::ParProxy::check_for_url ($url);
  my $origpage = $self->get_page ($url, $is_dynamic_html);
  return unless defined $origpage;
  if ($Scoop::got_intr_flag) { return; }

  if ($CF::linkslimit > 0 && $self->{stories_found} >= $CF::linkslimit) {
    &Scoop::verbose ("over links limit, stopping this site.");
    $self->{hit_file_size_limit} = 1;
    return;
  }

  if ($Scoop::useragent->redirect_occurred()) {
    $url = $Scoop::useragent->get_last_redirect();
    &Scoop::dbg ("turn-over links will use redirect as base URL: $url");
  }

  # get headline before stripping StoryStart and StoryEnd
  $headline = $self->get_headline ($url, $origpage);
  &Scoop::journal ("pre_strip_story", $origpage);
  my $page = $self->strip_story ($url, $origpage, "");
  &Scoop::journal ("post_strip_story", $page);

  my $bits = Scoop::get_layout_param ('story_html_header', $baseurl, $url);
  if (defined $bits) { $page = $bits . $page; }
  $bits = Scoop::get_layout_param ('story_html_footer', $baseurl, $url);
  if (defined $bits) { $page .= $bits; }

  my $proc = Scoop::get_layout_param ('story_preproc', $baseurl, $url);
  if (defined $proc) {
    $_ = $page;
    my $site_level = 1;
    if (!eval $proc."; 1;") {
      &Scoop::sitewarn("StoryHTMLPreProc failed: $@");
      # and keep the original $page
    } else {
      $page = $_;
      &Scoop::journal ("post_story_preproc", $page);
    }
  }

  if (defined $SCF::story_diff{$baseurl} && $SCF::story_diff{$baseurl}) {
    $page = $self->get_new_bits ($cachedpage, $page);
    $self->cache_page_later ($url, $origpage);
  } else {
    $self->cache_page ($url, $origpage);
  }

  if ($Scoop::got_intr_flag) { return; }

  if (defined fileno Scoop::JOURNAL) {
    # always write a text-mode version for the journal
    &Scoop::journal ("to_text_story",
    	$self->html_to_text ($url, $page, $CF::OUT_TEXT));
  }

  # get turn-over links after stripping StoryStart and StoryEnd
  @turnoverlinks = $self->get_story_turnover_links ($url, $page);
  $self->{warn_about_ext_links} = 1;
  $page = $self->html_to_text ($url, $page, $CF::outstyle);
  $self->{warn_about_ext_links} = 0;

  if ($is_dynamic_html && defined $cachedpage && !$CF::refresh) {
    # ensure that the cleaned-up HTML doesn't match the cleaned-up cached
    # HTML. Sometimes the ad banners will be the only things that have
    # changed between retrieves, and html_to_text will have stripped those
    # out.
    $cachedpage = $self->html_to_text ($url, $cachedpage, $CF::outstyle);
    if (&text_equals ($page, $cachedpage)) {
      &Scoop::verbose ("Skipping (text has not changed): $url");
      return;
    }
  }

  my $life = Scoop::get_layout_param ('story_lifetime', $baseurl, $url);
  if (defined $Scoop::last_modtime{$url} &&
    		$Scoop::last_modtime{$url} < $life * 24 * 60 * 60)
  {
    &Scoop::verbose ("Skipping (story is older than ".$life." days): $url");
    return;
  }

  # ensure there's some alphanumerics in the output text. No alnums means
  # no output. HTML needs to be checked to ensure we don't just pick
  # up tags which will not be displayed. Added kld's check for image-only
  # sites.
  {
    my $gottext = 1;

    if (defined $SCF::image_only_site{$baseurl} && $SCF::image_only_site{$baseurl} == 1) {
      &Scoop::dbg ("image-only site, not checking if text is present");

    } elsif (&Scoop::writing_html) {
      if ($page !~ /[A-Za-z0-9"']\s*</
		&& $page !~ />\s*[A-Za-z0-9"']/
		&& $page !~ /^\s*[A-Za-z0-9"']/)
      { $gottext = 0; }

    } else {
      if ($page !~ /[A-Za-z0-9"']/) { $gottext = 0; }
    }

    if ($gottext == 0) {
      &Scoop::verbose ("Skipping (no text to write): $url");
      return;
    }
  }

  if ($SCF::levels{$baseurl} < 0) {
    # this is a one-level site: therefore the story should be treated
    # as the "front page". Thx Carsten for this one.
    $self->write_as_story (1, $url, $page, $headline, $upindex);
  } else {
    $self->write_as_story (0, $url, $page, $headline, $upindex);
  }

  &Scoop::journal ("turnover_links_story", join ("\n", @turnoverlinks));
  if ($#turnoverlinks >= 0) {
    my $link;
    for $link (@turnoverlinks) {
      if ($Scoop::got_intr_flag) { return; }
      $link = &Scoop::AbsoluteURL ($url, $link);
      $self->download_story_page ($link, 0, $upindex);	# right now
    }
  }
}

# ---------------------------------------------------------------------------

sub apply_url_postproc {
  my $self = shift;
  local ($_) = shift;
  my $baseurl = $self->{url};

  my $proc = Scoop::get_layout_param ('url_preproc', $baseurl, $_);
  if (defined $proc) {
    if (!eval $proc."; 1;") {
      &Scoop::sitewarn("URLProcess failed: $@");
      undef $_;
    }
  }
  $_;
}

# ---------------------------------------------------------------------------

sub clean_pre_tags_for_diff {
  my $self = shift;

  my $file = shift;
  my $pre_nl_tag = shift;
  my $pre_pre_tag = shift;
  my $pre_slashpre_tag = shift;
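
  # Replace the newlines inside the <pre> text (and the pre tags
  # themselves) with placeholder tags, so that the newline canonicalisation
  # done for diffing in get_new_bits does not mangle preformatted text; the
  # placeholders are turned back into real newlines and tags once the page
  # has been re-split into diffable lines.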

  my $start = '';
  my $end = '';

  ($file =~ s/^(.*)<pre>//i) and $start = $1;
  ($file =~ s/<\/pre>(.*)$//i) and $end = $1;
  $file =~ s/\n/${pre_nl_tag}/gs;

  $start.$pre_pre_tag.$file.$pre_slashpre_tag.$end;
}

sub get_new_bits {
  my $self = shift;

  local ($_);
  my ($oldfile, $newfile) = @_;

  if ($CF::refresh) {
    &Scoop::verbose ("-refresh is on, not looking for differences");
    return $newfile;
  }

  if (!defined $oldfile || $oldfile =~ /^\s*$/) {
    if (!$CF::debugdiffs) { return $newfile; }
    $oldfile = '';
  }

  &Scoop::verbose ("Finding differences between current page and cached version");

  # it's important to keep these names 8.3 for Windows-95 compatibility,
  # as some Windoze diffs may not be able to handle them otherwise!
  # This also requires that we are chdir'd into the temporary directory
  # to avoid hassles with long filenames in the args when we run the
  # diff command. What a pain!
  #
  my $oldf = "a$$.tmp";		# we are already chdir'ed
  my $newf = "b$$.tmp";

  if ($CF::debugdiffs) {
    $oldf = "diff_old.tmp";
    $newf = "diff_new.tmp";
  }

  # Split the file lines at probable story-header endpoints.
  # This makes them more amenable to diffing, hopefully without
  # losing bits we don't want to lose, or gaining bits we don't
  # want to gain. Also try to keep cross-line-split HTML tags
  # together.

  # preserve newlines in <pre> text
  my $cleaned_pre_nls = 0;
  my $pre_nl_tag = "<!!!n>";
  my $pre_pre_tag = "<!!!pre>";
  my $pre_slashpre_tag = "<!!!/pre>";

  while ($oldfile =~ /<pre>/i) {
    $oldfile = $self->clean_pre_tags_for_diff ($oldfile,
    			$pre_nl_tag, $pre_pre_tag, $pre_slashpre_tag);
    $cleaned_pre_nls = 1;
  }

  while ($newfile =~ /<pre>/i) {
    $newfile = $self->clean_pre_tags_for_diff ($newfile,
    			$pre_nl_tag, $pre_pre_tag, $pre_slashpre_tag);
    $cleaned_pre_nls = 1;
  }

  # canonicalise all other newlines (we control the vertical!)
  $oldfile =~ s/\s*[\r\n]+\s*/ /gs;
  $newfile =~ s/\s*[\r\n]+\s*/ /gs;

  # remove extraneous whitespace from inside tags
  $oldfile =~ s/<\s*([^>]+?)\s*>/ $_=$1; s,\s+, ,gs; "<$_>"; /gies;
  $newfile =~ s/<\s*([^>]+?)\s*>/ $_=$1; s,\s+, ,gs; "<$_>"; /gies;

  # handle the two types of <p> tags -- <p>...</p>, and just ...<p>
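  # (matched paragraphs are temporarily renamed to "<!!!p" so the bare-<p>
  # substitution below cannot match them a second time; the placeholder is
  # turned back into "<p" once both passes are done.)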
  $oldfile =~ s/<p( *[^>]*>.*?<\/p *[^>]*>)/\n<!!!p$1\n/gi;
  $newfile =~ s/<p( *[^>]*>.*?<\/p *[^>]*>)/\n<!!!p$1\n/gi;

  $oldfile =~ s/(<p *[^>]*>)/$1\n/gi;
  $newfile =~ s/(<p *[^>]*>)/$1\n/gi;

  $oldfile =~ s/<!!!p/<p/gi;
  $newfile =~ s/<!!!p/<p/gi;

  # put newline before these tags (thx Carsten Clasohm, again!)
  $oldfile =~ s/(<(?:table|tr|td|div|item) *[^>]*>)/\n$1/gi;
  $newfile =~ s/(<(?:table|tr|td|div|item) *[^>]*>)/\n$1/gi;
  # after these ones
  $oldfile =~ s/(<(?:br|hr|table|\/td|\/table|\/tr|\/div) *[^>]*>)/$1\n/gi;
  $newfile =~ s/(<(?:br|hr|table|\/td|\/table|\/tr|\/div) *[^>]*>)/$1\n/gi;

  # remove newlines inside <a href> tags. Thx to Carsten Clasohm.
  1 while $oldfile =~ s/(<a href=[^>]+>([^\n<]|<(?!\/a>))*)\n+/$1 /gis;
  1 while $newfile =~ s/(<a href=[^>]+>([^\n<]|<(?!\/a>))*)\n+/$1 /gis;

  if ($cleaned_pre_nls) {
    $oldfile =~ s/${pre_nl_tag}/\n/g; $oldfile =~ s/${pre_pre_tag}/<pre>/g;
    $oldfile =~ s/${pre_slashpre_tag}/<\/pre>/g;
    $newfile =~ s/${pre_nl_tag}/\n/g; $newfile =~ s/${pre_pre_tag}/<pre>/g;
    $newfile =~ s/${pre_slashpre_tag}/<\/pre>/g;
  }

  my $page = '';
  my $created_newf = 0;

  if ($CF::diff eq '') {
    # use the perl module implementation of diff instead!
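    # (the code is wrapped in a string eval so that Algorithm::Diff is only
    # loaded when no external diff command is configured; only the "+"
    # lines -- text added since the cached copy -- are kept for output.)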
    eval '
      use Algorithm::Diff qw(diff);

      my @chunk;
      my ($sign, $lineno, $text);
      my @f1 = split "\n", $oldfile;
      my @f2 = split "\n", $newfile;

      my $diffs = diff(\@f1, \@f2);

      if (@$diffs) {
	foreach $chunk (@$diffs) {
	  foreach $line (@$chunk) {
	    ($sign, $lineno, $text) = @$line;
	    if ($sign =~ /\+/) {
	      $page .= $text . "\n";
	    }
	  }
	}
      }
    1;' or die ("diff code eval failed: $@");

  } else {
    open (F1, "> $oldf") || warn "cannot write to $oldf\n";
    print F1 $oldfile; close F1;
    open (F2, "> $newf") || warn "cannot write to $newf\n";
    print F2 $newfile; close F2;
    $created_newf = 1;

    if ($CF::diff ne '' && open (DIFF, "$CF::diff $oldf $newf |")) {
      while (<DIFF>) {
	/^>/ || next;
	$page .= $';
      }
      close DIFF;		# ignore exit status -- exit 1 only means no diffs.

    } else {
      warn "cannot run Diff command \"$CF::diff\", using entire page instead.\n";
      $page = $newfile;
    }
  }

  if ($CF::debugdiffs) {
    open (F1, "> diff_out.tmp"); print F1 $page; close F1;
    warn "$CF::diff $oldf $newf, breaking for debug"; &cleanexit;
  }

  if ($created_newf) {
    unlink $oldf; unlink $newf;
  }

  $page;
}

# ---------------------------------------------------------------------------

sub text_equals {
  my $t1 = shift;
  my $t2 = shift;
  $t1 =~ s/[\s\r\n]+/ /gs; $t1 =~ s/^\s+//; $t1 =~ s/\s+$//;
  $t2 =~ s/[\s\r\n]+/ /gs; $t2 =~ s/^\s+//; $t2 =~ s/\s+$//;
  ($t1 eq $t2);
}

# ---------------------------------------------------------------------------
# Strip a story page from StoryStart to StoryEnd.
# In addition, strip out non-story sidebar table items
# and carriage returns (they confuse plenty of regexps later).
#
sub strip_story {
  my $self = shift;
  my $url = shift;
  my $baseurl = $self->{url};
  my $page = shift;
  my $comment = shift;

  if (!defined $page) { return undef; }

  # ok, now strip the headers and footers
  my $pat = Scoop::get_layout_param ('story_start', $baseurl, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &Scoop::sitewarn("StoryStart pattern \"$pat\" found multiple times in page $url$comment\n");
    }
    if ($page =~ s#^.*?${pat}##gs) {
      $page =~ s#^[^<]*?>##gs;		# strip superfluous ends of tags
      if (defined fileno Scoop::JOURNAL) { &Scoop::journal ("pre_stripped", $&); }
    } else {
      &Scoop::sitewarn("StoryStart pattern \"$pat\" not found in page $url$comment\n");
    }
  }

  $pat = Scoop::get_layout_param ('story_end', $baseurl, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &Scoop::sitewarn("StoryEnd pattern \"$pat\" found multiple times in page $url$comment\n");
    }
    if ($page =~ s#${pat}.*?$##gs) {
      $page =~ s#<[^>]*?$##gs;		# strip superfluous starts of tags
      if (defined fileno Scoop::JOURNAL) { &Scoop::journal ("post_stripped", $&); }
    } else {
      &Scoop::sitewarn("StoryEnd pattern \"$pat\" not found in page $url$comment\n");
    }
  }

  # smart_clean_table only operates on table items with size specifications.
  # TODO -- work out table sizes using images if possible.
  #
  my $smarts = Scoop::get_layout_param ('use_table_smarts', $baseurl, $url);
  if (!defined $smarts || $smarts != 0) {
    my $filter = Sitescooper::StripTablesFilter->new();
    $filter->parse ($page);
    $page = $filter->filtered_html();
  }

  $page =~ s/\r/ /g;	# strip CRs
  $page;
}

sub strip_front_page {
  my $self = shift;
  my $url = shift;
  my $key = shift;
  my $page = shift;

  if (!defined $page) { return undef; }

  my $baseurl = $self->{url};
  my $pat = Scoop::get_layout_param ('links_start', $key, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &Scoop::sitewarn("ContentsStart pattern \"$pat\" found multiple times in page $url\n");
    }
    ($page =~ s#^.*?${pat}##gs) ||
	&Scoop::sitewarn("ContentsStart pattern \"$pat\" not found in page $url\n");
    $page =~ s#^[^<]*?>##gs;		# strip cut-in-half tags
  }

  $pat = Scoop::get_layout_param ('links_end', $key, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &Scoop::sitewarn("ContentsEnd pattern \"$pat\" found multiple times in page $url\n");
    }
    ($page =~ s#${pat}.*?$##gs) ||
	&Scoop::sitewarn("ContentsEnd pattern \"$pat\" not found in page $url\n");
    $page =~ s#<[^>]*?$##gs;		# strip cut-in-half tags
  }

  my $smarts = Scoop::get_layout_param ('use_table_smarts', $baseurl, $url);
  if (!defined $smarts || $smarts != 0) {
    my $filter = Sitescooper::StripTablesFilter->new();
    $filter->parse ($page);
    $page = $filter->filtered_html();
  }

  $page =~ s/\r/ /g;	# strip CRs
  $page;
}

# ---------------------------------------------------------------------------

sub get_headline {
  my $self = shift;
  my $url = shift;
  my $page = shift;

  my $headline;
  my $baseurl = $self->{url};

  if (defined $SCF::url_title{$url}) {
    $headline = $self->html_to_text ($url,
    		$SCF::url_title{$url}, $CF::OUT_TEXT);
    &Scoop::dbg ("StoryHeadline: (from RDF): $headline");

  } else {
    my $pat = Scoop::get_layout_param ('head_pat', $baseurl, $url);
    if (defined $pat) {
      if ($page !~ m#${pat}#m) {
	&Scoop::sitewarn("StoryHeadline pattern \"$pat\" not found in page $url\n");
      } elsif (defined $1) {
	$headline = $self->html_to_text ($url, $1, $CF::OUT_TEXT);
	# &Scoop::dbg ("StoryHeadline: $headline"); # logged later on anyway
      } else {
	&Scoop::sitewarn("StoryHeadline pattern \"$pat\" contains no brackets!\n");
      }

    } elsif ($page =~ m#<meta name="PCTITLE" content="(.*)">#mi) {
      # try a fallback: search for PointCast headline tags
      $headline = $self->html_to_text ($url, $1, $CF::OUT_TEXT);
      &Scoop::dbg ("StoryHeadline (default, PointCast): $headline");
    }
  }

  $headline;
}

# ---------------------------------------------------------------------------

sub get_story_turnover_links {
  my $self = shift;
  my $url = shift;
  my $page = shift;
  my $baseurl = $self->{url};

  my @turnoverlinks = ();
  my $followlinks = Scoop::get_layout_param ('story_follow_links',
  							$baseurl, $url);

  while (1) {
    if ($self->{hit_file_size_limit}) { last; }

    if (
      $page =~ s/<a\s+[^>]*href=\s*\"([^\">]+)\"[^>]*>(.+?)<\/a>//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*\'([^\'>]+)\'[^>]*>(.+?)<\/a>//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*([^\s>]+)[^>]*>(.+?)<\/a>//is
      )
    {
      my $link = $1;
      my $txt = $2;

      $link =~ s/^(?:\"|\'|%22)*//; $link =~ s/(?:\"|\'|%22)*$//;
      if ($followlinks) {
	push (@turnoverlinks, $link);

      } elsif ($txt =~ m,(more|next|\d+ of \d+|&gt;&gt;),i) {
	my $urlguts = '.';
	($baseurl =~ /^http:\/\/\S+\.([^\.\/]+\.[^\.\/]+\/).*$/) and
	    ($urlguts = $1);

	if (($txt !~ /[a-z0-9] [a-z0-9]+ [a-z0-9]+ [a-z0-9]/i) # 5 or more words
	    && (length ($txt) < 15)
	    && $link =~ m/$urlguts/)
	{
	  push (@turnoverlinks, $link);
	  $txt =~ s/[\n\r]+/ /g;
	  &Scoop::verbose ("(Following 'next page' link: \"$txt\")");
	}
      }
      next;
    }

    last;		# no more links available
  }

  @turnoverlinks;
}

# ---------------------------------------------------------------------------

sub get_contents_turnover_links {
  my $self = shift;

  my $url = shift;
  my $key = shift;
  my $page = shift;

  my $followlinks = Scoop::get_layout_param ('links_follow_links', $key, $url);
  if (!$followlinks) {
    return ();
  }

  my @turnoverlinks = ();

  while ($page =~ s,<a\s+[^>]*href=\s*(?:\"|\'|%22)?([^>]+)(?:\"|\'|%22)?>(.+?)</a>,,is)
  {
    my $link = $1;
    my $txt = $2;

    push (@turnoverlinks, $link);
    # we don't do the automatic "more/next/page x of y" stuff
    # that we do with the story pages
  }

  @turnoverlinks;
}

# ---------------------------------------------------------------------------

sub remove_an_ext_link {
  my $self = shift;

  my ($link, $text, $ahref, $posthref) = @_;

  if (!&Scoop::writing_html) {
    return $text;
  }

  if (defined (${$self->{output_links_snarfed}}{$link})
   || ($link =~ /__HASH__/ && defined (${$self->{output_links_snarfed}}{$`}))
              || $CF::nolinkrewrite)
  {
    $ahref.$link.$posthref.$text."</a>";
  } else {
    &Scoop::dbg ("Removing non-snarfed link: $link (\"$text\")");
    "<u>".$text."</u>";		# without <a href=...> </a>
  }
}

sub remove_external_links {
  my $self = shift;

  local ($_) = $_[0];

  #&Scoop::dbg (join(' ', sort keys %{$self->{output_links_snarfed}}));

  s/(<a\s+[^>]*href=\s*[\"\'])([^\"\']+)([\"\'][^>]*?>)(.*?)<\/a>/
	  $self->remove_an_ext_link ($2, $4, $1, $3);
      /gies;

  # fix REAL external links so they're now active and valid
  s/HREF_EXTERNAL/href/gs;

  $_;
}

# We could do this smarter, but it looks really gross when converted to
# DOC format -- and this tool is primarily for that conversion. Sorry!
# This also works well for iSilo, because iSilo's rendering of <pre> text
# is pretty rotten.
#
sub clean_preformatted_text {
  my $self = shift;

  my $txt = shift;
  $txt =~ s/[ \t]+\n/\n/g;
  $txt =~ s/<(|\/)(pre|code)>//g;	# strip extra <pre> tags!

  # convert blank lines to a paragraph separator.
  $txt =~ s/\n{1,}\n/<p>\n\n/g;

  # The idea with this one is to add a <br> at the end of lines shorter
  # than 50 columns, and conversely to allow lines longer than 50 cols to
  # run into the next line as if they were part of a paragraph.  I'm not
  # sure about it, but a lot of <pre> sites are just copies of emails, so
  # it can make them look a lot better, since the Palm's screen is a
  # lot narrower than 80 columns (which is what most <pre> pages aim for).
  # REVISIT - Make this a .site file parameter?
  $txt =~ s/\n\s*(.+[<>].+)\s*\n/<br>\n$1<br>\n/g;
  $txt =~ s/\n\s*([^\n]{1,50})\s*\n/\n$1<br>\n/g;

  $txt =~ s/[ \t]+/ /g;
  $txt;
}

sub translate_link {
  my $self = shift;
  my $baseurl = $self->{url};
  my ($url, $link, $text, $ahref, $posthref) = @_;

  if (!&Scoop::writing_html) {
    return $text;
  }
  if (!defined $ahref) { $ahref = "<a href="; }
  if (!defined $posthref) { $posthref = ">"; }

  my $nolink;
  if ($text =~ /\S/) {
    $nolink = "<u>".$text."</u>";
  } else {
    $nolink = " ";		# sometimes there's no text!
  }

  $link = &Scoop::AbsoluteURL ($url, $link);
  return $nolink if ($link !~ /^(http|file):/i);	# only supported links

  if ($CF::nolinkrewrite) {
    return $ahref."\"".$link."\"".$posthref.$text."</a>";
  }

  # translate to printable version first, in case the StoryURL pattern
  # only covers the printable style.
  $link = $self->make_printable ($link, 0);

  # Is the link one that we will be downloading? If not, just de-linkify
  # it. 1-level sites never have active links so we can just assume
  # the links should not be links.
  my $limitto = $SCF::story_limit_to{$baseurl};

  if (!Scoop::match_url ($link, $limitto)
    && &Scoop::URLWithoutAnchor ($link) ne &Scoop::URLWithoutAnchor ($url))
  {
    # check the contents/issue levels as well.
    my $ok = 0;

    for ($lev = $SCF::levels{$baseurl}; $lev >= 0; $lev--) {
      my $key = "$lev $baseurl";
      $limitto = $SCF::links_limit_to{$key};
      if (defined $limitto) {
	if (Scoop::match_url ($link, $limitto)) { $ok = 1; last; }
      }
    }

    if ($ok == 0) {
      if ($self->{warn_about_ext_links}) {
	&Scoop::dbg ("External link not translated: $link");
      }
      return $nolink;

      # REVISIT -- provide links at end of stories
    }
  }
  #&Scoop::dbg ("Translating link: $link");
 
  # Note that we always put in quotes around the URL.
  # remove_external_links, which is run later, requires this (and anyway
  # it makes for better HTML).
  #
  if ($CF::fileperpage) {
    my ($fname, $relative) = $self->href_to_multipage_anchor ($link);
    $ahref."\"".$relative."\"".$posthref.$text."</a>";
  } else {
    my $anchor = $self->href_to_singlepage_anchor ($link);
    $ahref."\"#".$anchor."\"".$posthref.$text."</a>";
  }
}

# try to preserve images used as capital letters starting a story. NYTimes
# does this.
sub clean_inline_images {
  my $self = shift;
  my $url = shift;
  my $tag = shift;
  my $baseurl = $self->{url};

  my $usealt = Scoop::get_layout_param ('use_alt_tags', $baseurl, $url);
  if (defined ($usealt) && $tag =~ /(?:^|\s)src=\"?([^\"> ]+)\"?(?:$|\s)/is)
  {
    my $src = $1;

    if ($tag =~ /(?:^|\s)alt=\"?([^\"> ]+)\"?(?:$|\s)/is) {
      my $alt = $1;

      $src = &Scoop::AbsoluteURL($url,$src);
      if (Scoop::match_url ($src, $usealt)) {
	&Scoop::dbg ("using alt tag \"$alt\" for img: $src");
	return $alt;
      } else {
	# &Scoop::dbg ("not using alt tag \"$alt\" for img: $src");
      }
    }
  }

  if ($tag =~ /(?:^|\s)alt=\"?([A-Za-z0-9])\"?(?:$|\s)/is) {
    &Scoop::dbg ("converting one-letter img to letter: $1");
    return $1;
  }

  my $imgurl = Scoop::get_layout_param ('imageurl', $baseurl, $url);
  if ($CF::allowimgs && defined ($imgurl)) {
    if ($tag =~ /(?:^|\s)src=\"?([^\"> ]+)\"?(?:$|\s)/is) {
      my $src = $1;

      $src = &Scoop::AbsoluteURL($url,$src);
      if (Scoop::match_url ($src, $imgurl)) {
	&Scoop::dbg ("keeping img: $src");
	$tag =~ s/(?:^|\s)src=\"?[^\"> ]+\"?(?:$|\s)/ /gis;
	$relative = $self->download_img ($src, $tag);

	return "<img src=\"".$relative."\" $tag>";
      }
    }
  }

  " ";
}

sub download_img {
  my $self = shift;
  my $url = shift;
  my $tag = shift;
  my $baseurl = $self->{url};

  my $type = '.gif';
  if ($url =~ /\.jp[eg]+/i) { $type = '.jpg'; }
  my ($fname, $relative) = $self->href_to_multipage_anchor ($url, $type);

  if (!${$self->{output_links_snarfed}}{$url}) {
    &Scoop::verbose ("Image: $url");

    my $img = $self->get_img ($url, 0);
    if (!defined $img) {
      &Scoop::verbose ("Skipping (get_img returned nothing): $url");
      return;
    }
    if ($Scoop::got_intr_flag) { return ""; }

    $self->cache_page ($url, $img);
    open (OUT, "> $fname") or warn
	  (warn ("failed to write to $fname!\n"), return "");
    binmode OUT;
    print OUT $img;
    close OUT or warn ("failed to write to $fname!\n");

    $self->add_snarfed_link ($url);
    $self->up_file_size ($url, (-s $fname), "image");
  }

  $relative;
}

sub html_to_text {
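  # Render $page into the requested output $format: unwanted tags (scripts,
  # frames, forms, RDF bookkeeping and so on) are stripped, links and
  # <a name> anchors are rewritten for HTML output, and for DOC or plain
  # text output the remaining markup is flattened into text with sensible
  # line breaks.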
  my $self = shift;
  my $url = shift;
  my $page = shift;
  my $format = shift;
  my $baseurl = $self->{url};

  if ((defined $SCF::site_format{$baseurl}) && ($SCF::site_format{$baseurl} eq 'rss')) {
    # Convert the RSS formatting into a nice display, for the index page.
    $page =~ s,<channel>(.*?)<title>(.*?)<\/title>(.*?)<\/channel>,<h2>$2<\/h2> $1 $3,gis;

    my $link;
    $page =~ s/<link>(.*?)<\/link>/
      $link = $1; $link =~ s,^.*<url>(.*)<\/url>.*$,$1,g;
      $link = &Scoop::AbsoluteURL($url,$link);
      '(<a href='.$link.'>'.$link.'<\/a>)';
    /gies;

    $page =~ s,<title>(.*?)<\/title>,<b>$1<\/b> ,gis;
    $page =~ s,<item>,<p>,gis; $page =~ s,<\/item>,<\/p>,gis;

    # the description is converted for RSS 0.91 sites -- the "fat" format
    $page =~ s,<description>(.*?)<\/description>,$1 ,gis;
  }


  # strip tags we know we don't want
  # modified by Albert K T Hui <avatar /at/ deva.net>: allow text in
  # <head> tag, netscape will display it so so should we.
  #
  $page =~ s/<head(?:\s+[^>]+|)>(.*?)<\/head>/$1<br>/gis;
  $page =~ s/<(?:html|body)(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<\/(?:html|body)>/ /gis;
  $page =~ s/<iframe(?:\s+[^>]+|)>.*?<\/iframe>/ /gis;
  $page =~ s/<ilayer(?:\s+[^>]+|)>.*?<\/ilayer>/ /gis;
  $page =~ s/<layer(?:\s+[^>]+|)>.*?<\/layer>/ /gis;
  $page =~ s/<\/?frame(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<\/?frameset(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<script(?:\s+[^>]+|)>.*?<\/script>/ /gis;
  $page =~ s/<style(?:\s+[^>]+|)>.*?<\/style>/ /gis;	# not yet
  $page =~ s/<!--.*?-->/ /gis;			# MSIE-style comments
  $page =~ s/<!--[^>]+>/ /gis;			# Netscape-style comments
  $page =~ s/<form(?:\s+[^>]+|)>.*?<\/form>/ /gis;
  $page =~ s/<image(?:\s+[^>]+|)>.*?<\/image>/ /gis;	# RDF tag
  $page =~ s/<channel(?:\s+[^>]+|)>.*?<\/channel>/ /gis;	# RDF tag
  $page =~ s/<map(?:\s+[^>]+|)>.*?<\/map>/ /gis;
  $page =~ s/<applet(?:\s+[^>]+|)>.*?<\/applet>/ /gis;
  $page =~ s/<item(?:\s+[^>]+|)>.*?<\/item>/ /gis;	# some RDF items
  $page =~ s/<link(?:\s+[^>]+|)>.*?<\/link>/ /gis;	# some RDF items
  $page =~ s/<title(?:\s+[^>]+|)>.*?<\/title>/ /gis;	# some RDF items
  #$page =~ s/<(?:table|td|tr)(?:\s+[^>]+|)>/ /gis;	# TO INVESTIGATE
  #$page =~ s/<\/(?:table|td|tr)>/ /gis;	# TO INVESTIGATE
  $page =~ s/<meta\s+[^>]+>/ /gis;
  $page =~ s/<link\s+[^>]+>/ /gis;	# reported by Olivier Lamer

  # Handle inline images.
  $page =~ s/<img\s+([^>]*)>/$self->clean_inline_images($url, $1)/gies;

  # try to add closing tags, since we may have stripped off the original
  # ones. This allows us to return formatting to the baseline before
  # going on to the next page in a multi-page site.
  #
  if ($Scoop::add_closing_tags) {
    my $tag;
    foreach $tag (qw(blockquote b h1 h2 h3 h4 h5 h6 div em
			  i u code small big strong pre li ul ol font a td
			  tr table))
    {
      if ($page =~ m#^.*<\s*${tag}(?:\s+[^>]+|)\s*>#is &&
			$' !~ m#<\s*/\s*${tag}\s*>#is)
      {
	&Scoop::dbg ("re-adding stripped closing tag: </$tag>");
	$page .= "</".$tag.">";
      }
    }
  }

  # convert <pre> text to proper HTML, it displays better.
  $page =~ s/<pre>(.*?)<\/pre>/$self->clean_preformatted_text($1);/gies;
  $page =~ s/<code>(.*?)<\/code>/$self->clean_preformatted_text($1);/gies;

  # strip all existing line breaks, they will just confuse matters
  # when we convert to text or HTML. It's also easier to do proper diffs
  # when we control the placement of newlines.
  $page =~ s/[\r\n]+/ /gs;

  if ($format == $CF::OUT_DOC) {
    # Create DOC bookmarks at <a name> tags
    # From Brian Lalor <blalor@hcirisc.cs.binghamton.edu>
    # via Christopher Heschong's <chris@screwdriver.net>
    # webpage-to-prc converter. Nice one lads, good trick!
    $page =~ s/<a\s+name.*?>/$Scoop::bookmark_char /gis;
  }

  if ($format == $CF::OUT_HTML) {
    if (!$CF::fileperpage) {
      $one_page_anchor = $self->href_to_singlepage_anchor ($url);
      $one_page_anchor =~ s/[^-_A-Za-z0-9]/_/g;

      # the substitutions on the anchor name itself mimic what AbsoluteURL
      # will do to it.
      my ($pre, $post, $ank);
      $page =~ s{<a\s([^>]*)name=[\"\']([^\"\'>]+)[\"\'](|\s[^>]*)>}{
		  $pre = $1; $post = $3; $ank = $2;
		  $ank =~ s/ /_20/g; $ank =~ s/[^-_A-Za-z0-9]/_/g;
		  "<!!!a $pre name=\"${one_page_anchor}__HASH__$ank\" $post>";
		}gies;
      $page =~ s{<a\s([^>]*)name=([^ >]+?)(|\s[^>]*)>}{
		  $pre = $1; $post = $3; $ank = $2;
		  $ank =~ s/ /_20/g; $ank =~ s/[^-_A-Za-z0-9]/_/g;
		  "<!!!a $pre name=\"${one_page_anchor}__HASH__$ank\" $post>";
		}gies;
      $page =~ s{<!!!a}{<a}gs;
    }

    # note the conversion of href= to href!!!=. This stops the second
    # substitution from screwing up the output of the first one!
    $page =~ s/(<a\s+[^>]*href)=\s*(?:\"|%22)([^\">]+)(?:\"|%22)([^>]*?>)(.*?)<\/a>/
	    $self->translate_link ($url, $2, $4, $1.'!!!=', $3);
    	/gies;
    $page =~ s/(<a\s+[^>]*href)=\s*([^>\s\n]+)([^>]*>)(.*?)<\/a>/
	    $self->translate_link ($url, $2, $4, $1.'!!!=', $3);
    	/gies;
    $page =~ s/href!!!=/href=/gis;	# back to normal

    # This works around a bug (I think) in iSilo that makes Wired News
    # indexes look rotten. Shouldn't be harmful anyway.
    $page =~ s/<br>\s*\&nbsp;\s*<br>/<br><br>/gis;

    # clean up useless tags and whitespace at the start and end of the text.
    1 while $page =~ s,^\s*<(?:br|hr|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>,,gis;
    1 while $page =~ s,<(?:br|hr|td|table|p|tr|h\d|div)\s*[^>]*>\s*$,,gis;

    # remove now-empty table items, text markup, paragraphs etc.  the
    # ordering of the tags in the foreach loop is important; strip the
    # "smallest" ones first. (actually, don't do td's, they can
    # affect the formatting quite a lot.)
    #
    # TODO - this is currently offline - some HTML will cause an infinite
    # loop in perl's regular expression implementation.
    #
    if ($Scoop::strip_empty_tag_sets) {
      foreach $tag (qw(b i u em font small big strong code div li ul ol
	  blockquote h1 h2 h3 h4 h5 h6 pre table))
      {
	$page =~ s{<\s*${tag}(?:\s+[^>]*|\s*)>(?:\s+|<\s*br\s*>|\&nbsp;)*<\s*\/\s*${tag}\s*>}{
	  &Scoop::dbg ("stripping now-empty tag set: $&");
	}gies;
      }
    }

    # since we're rendering to HTML, line breaks are OK. Put them back in!
    $page =~
      s,(<(?:br|p|hr|table|td|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>),$1\n,gis;

    # strip colors.
    $page =~ s,(<\S+\s*[^>]*\s)(?:bg|fg|border|)color=[\"']?[-_\#0-9a-z]+[\"']?,$1,gis;
  }

  if ($format == $CF::OUT_DOC || $format == $CF::OUT_TEXT) {
    # We're converting to DOC or text format, so we need to do a lot
    # more work here.

    # a sidebar enclosed by a table? separate it from the rest of the text.
    $page =~ s/<\/tr>/\n\n/gis;
    $page =~ s/<\/table>/\n\n/gis;	# end of <table>
    $page =~ s/<\/pre>/\n\n/gis;	# end of <pre> text
    $page =~ s/<(\/h\d|h\d)(\s+[^>]+|)>/\n\n/gis;	# headings
    $page =~ s/<\/?blockquote(\s+[^>]+|)>/\n\n/gis;	# quotes
    $page =~ s/<hr(\s+[^>]+?|)>/\n\n/gis;	# horiz lines
    $page =~ s/<br(\s+[^>]+?|)>/\n/gis;	# end-of-line markers
    $page =~ s/<li(\s+[^>]+?|)>/\n/gis;	# list items

    $page =~ s/<\/?p(\s+[^>]+?|)>/\n\n/gis;
    # don't worry, multiple blank lines are sorted later

    $page =~ s/<\/td>/\n/gis;		# end-of-table-item

    1 while ($page =~ s/<[^>]+?>//gs);	# trim all other tags

    HTML::Entities::decode_entities($page);

    # trim multiple (blank) bookmarks
    $page =~ s/($Scoop::bookmark_char\s+){2,}/$Scoop::bookmark_char /gs;
  }

  # Convert some HTML entities that the viewers can't handle.
  $page =~ s/\&apos;/\'/gi;	# confuses iSilo
  $page =~ s/\&\#150;/-/gi;	# bad Industry Standard - no cookie!

  $page =~ s/[ \t]+/ /g;	# canonicalise down to one space
  $page =~ s/\n /\n/gs;		# leading w/s on each line
  $page =~ s/\n{3,}/\n\n/gs;	# too many blank lines
  $page =~ s/^\s+//gs;		# blank space at start of story
  $page =~ s/\s+$//gs;		# blank space at end of story

  $page;
}

# ---------------------------------------------------------------------------

sub check_for_oldest {
  my $self = shift;
  my $url = shift;

  my $fullurl = $url; $url = &Scoop::URLWithoutAnchor ($url);
  $url =~ m,http://(\S+?)/,i; my $urlhost = $1;
  return unless defined ($urlhost);

  #&Scoop::dbg ("checking to see if $url is oldest at its site: modtime=".
  	#(defined $Scoop::last_modtime{$url} ? $Scoop::last_modtime{$url} : "unknown)"));

  if (defined $Scoop::last_modtime{$url}) {
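    # keep the smallest (i.e. oldest) Last-Modified time seen for this
    # host; the values are written into the already_seen age cache once
    # the site has been scooped (see scoop_site).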
    if (defined(${$self->{new_already_seen_age_cache_data}}{$urlhost})
	? ${$self->{new_already_seen_age_cache_data}}{$urlhost} > $Scoop::last_modtime{$url} : 1)
    {
      &Scoop::dbg ("oldest link seen at $urlhost $url: modtime=".$Scoop::last_modtime{$url}.
	            " (".&Scoop::time2datestr($Scoop::last_modtime{$url}).")");
      ${$self->{new_already_seen_age_cache_data}}{$urlhost} = $Scoop::last_modtime{$url};
    }
  }
}

# ---------------------------------------------------------------------------

sub pre20_generic_cache_fname {
  my $dir = shift;
  my $url = shift;

  if (!defined $dir) { return undef; }

  $url = &Scoop::URLWithoutAnchor ($url);
  $url =~ s/[^-_A-Za-z0-9]/_/g;

  if ($CF::use_hashes_for_cache_filenames) {
    # try to limit the filename by trimming the start and adding the
    # hash value at the beginning instead. Let's hope this is not
    # too prone to collisions...
    if ($url =~ /^(.+)(.{16})$/) {
      my $hash = unpack ("%16C*", $1);
      $url = sprintf ("%4x_%s", $hash, $2);
    }
  }

  $url = $dir.$CF::slash.$url;	# put it in the directory/folder
  $url;
}

sub upgrade_cache_directory_if_needed {
  local ($_);
  my $olddir = shift;
  $upg_cache_newdir = $olddir;		# in place
  my $ver = 0;

  if (open (IN, "<".$olddir.$CF::slash."cache.cf")) {
    while (<IN>) { /^version (\d+)/ && ($ver = $1+0); }
  }

  if ($ver < 2) {
    warn "Upgrading cache directory to version 2 format...\n";
    $dompat = "(?:com|org|net|gov|mil|int|edu|ie|uk|hu|hr|fr|".
		 "us|de|il|mx|br|nl|se|pl|no|fi|in|gr|be|za)";

    sub upg_cache_file_to_2 {
      # skip dirs, they're part of the new structure.
      if (/_/ && -d $_) { $File::Find::prune = 1; return; }

      return unless (-f _);
      if ( (/^([a-z0-9]+)___([-_a-z0-9]+?_${dompat}(?:_\d+|))_(.*)$/i)
	|| (/^([a-z0-9]+)___(\d+_\d+_\d+_\d+(?:_\d+|))_(.*)$/i)
	|| (/^(file)___()(.*)$/i))
      {
	my ($proto, $site, $path) = ($1, $2, $3);
	$site =~ s/_$//;
	my $url = $proto."://".$site."/".$path;
	my $newname = &generic_cache_fname ($upg_cache_newdir, $url);

	rename ($_, $newname)
		or warn "rename \"$_\" -> \"$newname\" failed!\n";

      } else {
	warn "Didn't know how to upgrade cache file, ignored: $_\n";
      }
    }

    File::Find::find (\&upg_cache_file_to_2, $olddir);

    open (OUT, ">".$olddir.$CF::slash."cache.cf");
    print OUT "version 2\n"; close OUT;
  }
}

sub generic_cache_fname {
  my $dir = shift;
  my $url = shift;

  if (!defined $dir) { return undef; }

  $url = &Scoop::URLWithoutAnchor ($url);		# trim #anchors
  
  my $site;
  my $path;
  if ($url =~ m,^([\w]+://[^/]+)\/(.+)$,) {
    $site = $1; $path = $2;
  } else {
    $site = $url; $path = '_'; $site =~ s/\/$//;
  }
  $site =~ s,^http://,,i; $site =~ s,^www\.,,i;	# trim common stuff
  $site =~ s/[^-_A-Za-z0-9]/_/g;
  $path =~ s/[^-_A-Za-z0-9]/_/g;

  if ($CF::use_hashes_for_cache_filenames) {
    # try to limit the filename by trimming the start and adding the
    # hash value at the beginning instead. Let's hope this is not
    # too prone to collisions...
    if ($site =~ /^(.+)(.{16})$/) {
      $site = sprintf ("%4x_%s", unpack ("%16C*", $1), $2);
    }
    if ($path =~ /^(.+)(.{16})$/) {
      $path = sprintf ("%4x_%s", unpack ("%16C*", $1), $2);
    }
  }

  $site = $dir.$CF::slash.$site;
  if (!-d $site) {
    mkdir ($site, 0777) or die "failed to mkdir '$site'\n";
  }
  $site .= $CF::slash.$path;
  $site;
}
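
# Illustrative sketch of the version-2 cache layout produced above (paths are
# made up; assumes $CF::slash is "/"):
#
#   generic_cache_fname ("/home/user/.sitescooper/cache",
#			 "http://www.example.com/news/today.html#story3")
#     -> "/home/user/.sitescooper/cache/example_com/news_today_html"
#
# The "#story3" anchor and leading "http://www." are dropped, remaining
# non-alphanumerics become "_", and the per-site directory "example_com" is
# created on demand.  Both pieces here are short enough that the hash
# shortening above does not kick in.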

sub cachefilename {
  my $self = shift; &generic_cache_fname ($self->{cachedir}, $_[0]);
}
sub sharedcachefilename {
  my $self = shift; &generic_cache_fname ($self->{sharedcachedir}, $_[0]);
}
sub newcachefilename {
  my $self = shift; &generic_cache_fname ($self->{newcachedir}, $_[0]);
}

sub get_cached_page {
  my $self = shift;
  my $url = shift;
  my $is_diff_page = shift;
  my $cachefile = $self->cachefilename ($url);

  if (!defined $cachefile) { return undef; }

  # if -refresh is on, do not return any cached pages.
  # if it and -fromcache are both on, *do* return them.
  if ($CF::refresh && !$CF::use_only_cache) { return undef; }

  if (open (IN, "< $cachefile")) {
    binmode IN; my $cachedpage = join ('', <IN>); close IN; $cachedpage;
  } else {
    undef;
  }
}

sub get_cached_page_for_diff {
  my $self = shift; $self->get_cached_page (@_, 1);
}

sub get_page {
  my $self = shift;
  $self->http_get (@_, 0);	# only text content types
}

sub get_img {
  my $self = shift;
  $self->http_get (@_, 1);	# allow binary files
}

sub http_get {
  my $self = shift;
  my $url = shift;
  my $is_dynamic_html = shift;
  my $allow_binary = shift;
  my $baseurl = $self->{url};

  my $page = '';

  $url = &Scoop::URLWithoutAnchor ($url);
  my $cachefile = $self->cachefilename ($url);
  my $cachedpage = $self->get_cached_page ($url);
  $self->check_for_oldest ($url);
  my $lastmod;

  $Scoop::useragent->clear_redirect ();

  if (defined $cachefile && defined $cachedpage) {
    if ($is_dynamic_html == 0) {
      &Scoop::dbg("cached version exists");
      return $cachedpage;

    } elsif (defined (-M $cachefile)
    	&& -M _ < $CF::cached_front_page_lifetime
	&& -M _ > 0)		# just make sure the clock is sane
    {
      &Scoop::dbg("cached version is new enough: ".(-M $cachefile)." days");
      return $cachedpage;

    } elsif ($CF::use_only_cache) {
      &Scoop::dbg("-fromcache switch is on, using cached version");
      return $cachedpage;
    }
  }

  # see if we have it in the shared cache
  if (defined $CF::sharedcache) {
    $cachedpage = undef;
    $cachefile = $self->sharedcachefilename ($url);
    if (defined $cachefile && (open (IN, "< $cachefile"))) {
      binmode IN; $cachedpage = join ("", <IN>); close IN;
      if ($cachedpage =~ s/^<!-- lastmod: (\d+) -->//) {
	$lastmod = $1+0;
      }
    }

    if (defined $cachefile && defined $cachedpage) {
      if ($is_dynamic_html == 0) {
	&Scoop::dbg("shared-cache version exists: $cachefile");
	if (defined $lastmod)
	      { $Scoop::last_modtime{$url} = $lastmod; $self->check_for_oldest ($url); }
	return $cachedpage;

      } elsif (defined (-M $cachefile)
	  && -M _ < $CF::cached_front_page_lifetime && -M _ > 0)
      {
	&Scoop::dbg("shared-cache version is new enough: ".(-M $cachefile)." days");
	if (defined $lastmod)
	      { $Scoop::last_modtime{$url} = $lastmod; $self->check_for_oldest ($url); }
	return $cachedpage;

      } elsif ($CF::use_only_cache) {
	&Scoop::dbg("-fromcache switch is on, using shared-cache version");
	if (defined $lastmod)
	      { $Scoop::last_modtime{$url} = $lastmod; $self->check_for_oldest ($url); }
	return $cachedpage;
      }
    }

    undef $cachedpage;	# if it didn't pass those tests, don't keep it!
  }
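
  # Illustrative sketch of a shared-cache file as parsed above: the page body
  # is stored verbatim, with one optional header line prepended by
  # cache_page() (the epoch timestamp shown is made up):
  #
  #   <!-- lastmod: 951868800 -->
  #   <html><head><title>...</title></head><body>...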

  if ($CF::use_only_cache) {
    &Scoop::dbg("-fromcache switch is on, not doing HTTP request");
    return undef;
  }

  if (!$allow_binary && ($url =~ /\.(ra|ram|wav|jpeg|jpg|gif|mov|zip|rar)$/i
    	|| $url =~ /\.(tar|tgz|gz|tbz|bz2|rpm|swf|mpeg|mpg)$/i))
  {
    &Scoop::dbg("not retrieving non-HTML content: $url");
    return undef;
  }

  my $resp;
  my $retries;

  for ($retries = 0; $retries < 4; $retries++) {
    if ($Scoop::got_intr_flag) { return undef; }

    my $parresp = Sitescooper::ParProxy::have_response ($url);
    if (defined $parresp) {
      &Scoop::dbg ("preload: using preloaded response");
      $resp = $parresp; $parresp = undef;

    } else {
      # REVISIT - support POST
      my $req = new HTTP::Request ('GET', $url);

      $req->header ("Accept-Language" => "en",
	    "Accept-Charset" => "iso-8859-1,*,utf-8");

      # cookie_jar will assume that it's an HTTP request. Reasonable enough
      # I suppose...
      if ($url =~ /^http:/i) {
	$Scoop::cookie_jar->add_cookie_header($req);
      }

      $resp = undef;
      $cmd = '$resp = $Scoop::useragent->request ($req);';

      my $timeout = 10;	# minutes
      undef $Sitescooper::UserAgent::last_auth_realm;

      # REVISIT -- implement timeout for Win32 perl
      if (&Scoop::MyOS eq 'UNIX') {
	eval '
	  local $SIG{"ALRM"} = sub { die "alarm\n" };
	  alarm $timeout*60; { ' . $cmd. ' } alarm 0;
	';
      } else {
	eval $cmd;
      }

      die if $@ && $@ ne "alarm\n";
      if ($@) {
	&Scoop::sitewarn ("HTTP GET timed out, $timeout minutes without a response.");
	&got_intr;
      }
    }
    if ($Scoop::got_intr_flag) { return undef; }

    if (!$resp->is_success) {
      &Scoop::sitewarn ("HTTP GET failed: ".$resp->status_line." ($url)");
      if ($resp->status_line =~ /^401/ && defined ($Sitescooper::UserAgent::last_auth_realm))
      {
	if (defined ($Sitescooper::UserAgent::site_logins{$Sitescooper::UserAgent::last_auth_realm}))
	{
	  &Scoop::verbose ("Deleting incorrect username and password for this realm.");
	  delete $Sitescooper::UserAgent::site_logins{$Sitescooper::UserAgent::last_auth_realm};
	  delete $Sitescooper::UserAgent::site_passes{$Sitescooper::UserAgent::last_auth_realm};
	  next;		# re-request page
	} else {
	  &Scoop::verbose ("Cannot read page, it requires a username and password.");
	}
      }
      return undef;
    }

    if (!$allow_binary && (defined($resp->content_type) &&
      	$resp->content_type ne '' && $resp->content_type !~ /^(text\/|multipart\/)/))
    {
      &Scoop::verbose ("Non-text content: Content-Type: ".$resp->content_type.".");
      return undef;
    }

    if (defined $SCF::need_login_url{$baseurl}) {
      if (Scoop::match_url ($resp->base, $SCF::need_login_url{$baseurl})) {
	&Scoop::verbose ("Page requires a username and password, requesting...");
	&get_basic_credentials ($Scoop::useragent, $baseurl, $url);
	next;		# re-request page
      }
    }

    $page = $resp->content;
    # handle (ugh) meta-tag "refresh" redirects
    if ($page =~ /meta\s+http-equiv=\"refresh\"\s+content=\"([^\"]*)url=([^\"\;]+)[\"\;]/is)
    {
      my $timeout = $1;
      $url = $2;

      if ($timeout =~ /(\d+)/ && $1+0 > 60) {
	&Scoop::dbg ("Ignored slow meta-tag refresh: \"$timeout url=$url\"");
      } else {
	&Scoop::verbose ("Redirected by META tag to: $url");
	$Scoop::useragent->note_redirect ($url);
	next;
      }
    }
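
    # Illustrative example (made-up URL) of a meta-tag redirect the pattern
    # above catches; "5" is taken as the refresh timeout, so this one would
    # be followed:
    #
    #   <meta http-equiv="refresh" content="5; url=http://www.example.com/today/">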

    last;		# break out of for loop
  }

  if (defined $resp->last_modified) {
    $lastmod = $resp->last_modified;
    &Scoop::dbg ("last-modified time for $url: $lastmod (".&Scoop::time2datestr($lastmod).")");

    if (defined $Scoop::last_modtime{$url} && defined($lastmod)
      && $lastmod <= $Scoop::last_modtime{$url} && !$CF::refresh
      && !$is_dynamic_html && !$allow_binary)
    {
      &Scoop::verbose ("Skipping (no mod since last download): $url");
      return undef;
    }

  } else {
    &Scoop::dbg ("last-modified time for $url: not provided");
    $lastmod = time;
  }
  $Scoop::last_modtime{$url} = $lastmod; $self->check_for_oldest ($url);

  if (!$is_dynamic_html && defined $cachedpage && $cachedpage eq $page
    	&& !$CF::refresh && !$allow_binary)
  {
    &Scoop::verbose ("Skipping (HTML has not changed): $url");
    return undef;
  }
  $page;
}

# ---------------------------------------------------------------------------
 
sub cache_page {
  my $self = shift;
  my ($url, $page, $cachelater) = @_;

  $url = &Scoop::URLWithoutAnchor ($url);
  my $cachefile = $self->newcachefilename ($url);

  # if this page is the latest version of a diffed page, don't cache it
  # immediately, as it will mean lost stories if we're interrupted.
  # Instead save the filename for renaming when the run finishes.
  if (defined $cachelater && $cachelater == 1) {
    my $tmpname = $cachefile.'.tmp';
    $CF::caches_to_rename{$tmpname} = $cachefile;
    $cachefile = $tmpname;
  }

  if (open (C1OUT, "> $cachefile")) {
    binmode C1OUT; print C1OUT $page; close C1OUT;
  } else {
    warn "cannot write cache file \"$cachefile\": $!\n";
  }

  if (defined $CF::sharedcache) {
    $cachefile = $self->sharedcachefilename ($url);
    if (open (C2OUT, "> $cachefile")) {
      binmode C2OUT;
      if (defined $Scoop::last_modtime{$url}) {
	# cache the last-modified time of this page as well.
	print C2OUT "<!-- lastmod: ",$Scoop::last_modtime{$url}," -->\n";
      }
      print C2OUT $page; close C2OUT;
    } else {
      warn "cannot write shared cache file \"$cachefile\": $!\n";
    }
  }
  $page;
}
 
sub cache_page_later { &cache_page ($_[0], $_[1], $_[2], 1); }
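
# The deferred renames themselves happen elsewhere, at the end of the run; a
# minimal sketch of what that step presumably does with %CF::caches_to_rename
# (illustrative only -- the real end-of-run code is not in this module):
#
#   while (my ($tmpname, $cachefile) = each %CF::caches_to_rename) {
#     rename ($tmpname, $cachefile)
#	or warn "rename \"$tmpname\" -> \"$cachefile\" failed!\n";
#   }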

# ---------------------------------------------------------------------------

sub write_as_story {
  my $self = shift;
  local ($_);
  my ($is_front, $url, $page, $headline, $upindex) = @_;
  my $baseurl = $self->{url};

  my $fullurl = $url;
  $url = &Scoop::URLWithoutAnchor ($url);

  my $sitename = $self->{sitename};
  if (!defined $sitename) { $sitename = $url; }

  my $proc = Scoop::get_layout_param ('table_render', $baseurl, $url);
  if (!defined $proc || $proc eq 'keep') {
    # do nothing

  } elsif ($proc eq 'list') {
    require Exten::Table;
    my $parser = Exten::Table->new();
    $parser->parse($page);
    $page = $parser->get_result();
    
  } else {
    &Scoop::sitewarn("TableProcess \"$proc\" invalid");
  }

  $proc = Scoop::get_layout_param ('story_postproc', $baseurl, $url);
  if (defined $proc) {
    my $bookmark_char = $Scoop::bookmark_char;	# convenience for PostProc
    $_ = $page;
    if (!eval $proc."; 1;") {
      &Scoop::sitewarn("StoryPostProc failed: $@");
      # and keep the original $page
    } elsif (!defined $_) {
      &Scoop::sitewarn("StoryPostProc failed (\$_ is undefined): $@");
      # and keep the original $page
    } else {
      $page = $_;
    }
  }
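
  # Illustrative example of the kind of per-site StoryPostProc snippet the
  # eval above runs: the story text arrives in $_, and whatever is left in $_
  # afterwards becomes the new page.  The markup matched here is made up:
  #
  #   s/<div class="advert">.*?<\/div>//gis;	# strip inline ad blocks
  #   s/^.*?<!-- story starts -->//is;		# drop everything before the story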

  my $outtext = '';
  my $one_page_anchor;

  if (&Scoop::writing_html) {
    $one_page_anchor = $self->href_to_singlepage_anchor ($url);

    if ($CF::writeheader) {
      $outtext .=
	"\n\n<hr><font size=1><i>$sitename: <a HREF_EXTERNAL=$url>$url</a></i></font><br>"
	. "<a name=\"$one_page_anchor\">\n"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($self->{current_story_index}-1)."\">&lt;&lt;</a>]"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($upindex)."\">^</a>]"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($self->{current_story_index}+1)."\">&gt;&gt;</a>]<br>\n\n";

    } else {
      $outtext .= "<a name=\"$one_page_anchor\">\n";
    }
    $outtext .= $page;

  } else {
    $outtext .= "------------\n";
    if ($CF::writeheader) {
      $outtext .= "$sitename: $url\n\n";
    }
    if (&Scoop::writing_doc) {
      if (defined $headline) {
	&Scoop::verbose ("(Headline: $headline)");
	$outtext .= "$Scoop::bookmark_char $headline\n";
      } else {
	# use the first line in the story instead
	$outtext .= "$Scoop::bookmark_char ";
      }
    }

    foreach $_ (split (/\n/, $page)) {
      if (&Scoop::writing_text) {
	# wrap each line after 70 columns
	while (s/^(.{70}\S*)\s+//) {
	  $outtext .= $1."\n";
	}
      }
      $outtext .= $_."\n";
    }

    $outtext .= "\n\n\n";
  }

  if ($CF::fileperpage) {
    my ($fname, $relative) = $self->href_to_multipage_anchor ($url);

    if ($is_front && $fullurl eq $baseurl) {
      # this is the front page, just append it to the index file
      ${$self->{output_file}}{'MAIN'} .= $outtext;

    } else {
      if (&Scoop::writing_html) {
	${$self->{output_file}}{$fname} = "<html><head></head><body>".
			  $outtext."</body></html>";
      }
    }

    if (&Scoop::writing_html) {
      $self->add_snarfed_link ($relative);
      ${$self->{output_story_urls}}[$self->{current_story_index}] = $relative;

      if ($fullurl ne $url) {
	my ($ffname, $frelative) = $self->href_to_multipage_anchor ($fullurl);
	$self->add_snarfed_link ($ffname);
	$self->add_snarfed_link ($frelative);
      }

    } else {
      ${$self->{output_file}}{$fname} .= $outtext;
    }

  } else {
    # this is single-page output mode.
    ${$self->{output_file}}{'MAIN'} .= $outtext;

    if (&Scoop::writing_html) {
      $self->add_snarfed_link ('#'.$one_page_anchor);
      ${$self->{output_story_urls}}[$self->{current_story_index}]
      						= '#'.$one_page_anchor;

      if ($one_page_anchor =~ /__HASH__/) {
	$self->add_snarfed_link ('#'.$`);
      }
    }
  }

  $self->{current_story_index}++;
  $self->add_snarfed_link ($url);
  if ($fullurl ne $url) { $self->add_snarfed_link ($fullurl); }
  $self->up_file_size ($url, length($outtext), "story");
  $self->{stories_found}++;

  if ($CF::storylimit > 0 && $self->{stories_found} >= $CF::storylimit) {
    &Scoop::verbose ("over story limit, stopping this site.");
    $self->{hit_file_size_limit} = 1;
  }
}

# ---------------------------------------------------------------------------

sub up_file_size {
  my $self = shift;
  my $url = shift;
  my $siz = shift;
  my $typetxt = shift;

  $self->{file_size} += $siz;
  &Scoop::dbg ("$typetxt written, ".
  	($self->{file_size}/1024)." K, limit ".
	$CF::filesizelimit." K, site limit ".
	(defined ($SCF::size_limit{$url}) ?
			$SCF::size_limit{$url}." K" : "unset"));

  if ($self->{file_size}/1024 >= $CF::filesizelimit) {
    $self->{hit_file_size_limit} = 1;
  }
  if (defined $SCF::size_limit{$url} && $SCF::size_limit{$url} != 0) {
    if ($self->{file_size}/1024 >= $SCF::size_limit{$url}) {
      $self->{hit_file_size_limit} = 1;
    }
  }
}

# ---------------------------------------------------------------------------

sub add_snarfed_link {
  my $self = shift;
  my $url = shift;
  #&Scoop::dbg ("Tracking snarfed link: $url");		# J M D
  ${$self->{output_links_snarfed}}{$url} = 1;
}

# ---------------------------------------------------------------------------

sub href_to_multipage_anchor {
  my $self = shift;
  my $url = shift;
  my $type = shift;

  if (!defined $type) { $type = '.html'; }

  my $anchor = &Scoop::URLAnchor ($url);
  $url = &Scoop::URLWithoutAnchor ($url);

  if (!defined $self->{page_to_tmpfile}{$url}) {
    # try to limit the filename by trimming the start and adding the
    # hash value at the beginning instead. Let's hope this is not
    # too prone to collisions...

    my $hash = unpack ("%16C*", $url);
    my $h = sprintf ("%04x", $hash);
    my $acc = 'aaa';
    my $name = $self->{outtmp}.$CF::slash.$h.$acc.$type;

    while (defined $tmpfile_assigned{$name})
    		{ $name = $self->{outtmp}.$CF::slash.$h.(++$acc).$type; }

    $self->add_page_tmpfile ($name, $url);
    &Scoop::dbg ("page file for $url: $self->{page_to_tmpfile}{$url}");
  }

  $self->{page_to_tmpfile}{$url} =~ /[\\\/\:]([-_\.A-Za-z0-9]+)$/;
  ($self->{page_to_tmpfile}{$url}, $1.$anchor);
}
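
# Illustrative sketch of the per-page temp-file names generated above,
# assuming $self->{outtmp} is "/tmp/site.tmp" (the checksum value is made up):
#
#   first URL  -> /tmp/site.tmp/1a2baaa.html	(16-bit checksum + "aaa")
#   a second URL with the same checksum gets /tmp/site.tmp/1a2baab.html,
#   and so on, the "aaa" suffix string-incrementing until an unused name
#   is found.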

sub clear_page_tmpfiles {
  my $self = shift;
  %{$self->{page_to_tmpfile}} = ();
}

sub add_page_tmpfile {
  my $self = shift;
  my $fname = shift;
  my $url = shift;
  $self->{page_to_tmpfile}{$url} = $fname;
  $tmpfile_assigned{$fname} = 1;
}

sub href_to_singlepage_anchor {
  my $self = shift;
  my $anchor = shift;
  $anchor =~ s/#/__HASH__/g;
  $anchor =~ s/[^-_A-Za-z0-9]/_/g;
  $anchor;
}
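
# Illustrative example of the anchor mangling above (URL is made up):
#
#   href_to_singlepage_anchor ("http://www.example.com/news/1.html#part2")
#     -> "http___www_example_com_news_1_html__HASH__part2"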

1;

