# Scoop.pm -- main sitescooper logic.

package Scoop;

use Carp;
use File::Find;
use File::Path;
use File::Copy;
use File::Basename;
use Cwd;
use FindBin;

# Init -- establish every global default before the command line is parsed.
#
# Takes no arguments. It:
#   * records the program version and probes (via a string eval) whether
#     LWP::Parallel::RobotUA can be loaded;
#   * defines the $CF::OUT_* output-style constants and resets the $CF::*
#     option defaults and the $Scoop::* state variables;
#   * selects platform-dependent defaults (output style, path separator,
#     cache-filename hashing) according to &Scoop::MyOS;
#   * records the script directory ($FindBin::Bin) and the current working
#     directory;
#   * prints the version banner through &Scoop::verbose.
#
sub Init {
  $Scoop::VERSION = "2.2.7";

  # string-eval so that a missing module is not a fatal error.
  if (eval 'require LWP::Parallel::RobotUA; 1;') {
    $CF::lwp_par_available = 1;
  } else {
    $CF::lwp_par_available = 0;
  }

  # output style constants, compared against $CF::outstyle.
  $CF::OUT_TEXT = 0;
  $CF::OUT_DOC = 1;
  $CF::OUT_HTML = 2;

  $Scoop::home_url = "http://sitescooper.cx";
  $Scoop::bookmark_char = "\x8D";	# yes, same as Chris' one, cheers!
  $Scoop::cgimode = 0;			# for possible future CGI mode

  $CF::refresh = 0;
  $CF::debug = 0;
  $CF::debugdiffs = 0;			# set to 1 to break after diffing
  $CF::use_lwp_par = 0;

  #$CF::just_caching = 0;
  $CF::cached_front_page_lifetime = 60;	# in minutes
  $CF::dump = 0;
  $CF::dumpprc = 0;

  $CF::verbose = 1;
  $CF::nowrite = 0;
  undef $CF::prcdir;
  undef $CF::pilotinstdir;
  undef $CF::pilotinstapp;

  # Text is the default on MacOS.
  if (&Scoop::MyOS eq 'Mac') {
    $CF::outstyle = $CF::OUT_TEXT;
    $CF::outputfilter = '__cat__';
    $CF::use_convert_tool = 0;

  } else {
    $CF::outstyle = $CF::OUT_HTML;
    $CF::outputfilter = 'isilo';
    $CF::use_convert_tool = 1;
  }

  $CF::fileperpage = 0;
  $CF::nolinkrewrite = 0;
  $CF::filesizelimit = 300;		# limit of Kb per file (uncompressed)
  $CF::storylimit = 0;
  $CF::linkslimit = 0;

  $CF::writeheader = 1;
  $CF::writefooter = 1;
  $CF::allowimgs = 1;

  $CF::use_only_cache = 0;
#CGI $Scoop::cgi = undef;
  $CF::admincmd = undef;

  @CF::sites_grep = ();
  @CF::site_files_to_read = ();
  @CF::site_choices = ();
  @CF::layout_site_files = ();
  @CF::cmdline_urls = ();
  %CF::caches_to_rename = ();
  $CF::keep_tmps = 0;
  $CF::disconnect_after_scoop = 0;

  # This is the array that AddCmdlineSiteParam pushes onto; it must be
  # reset as an array (a scalar of the same name is a different variable).
  @Scoop::argv_settings = ();

  $Scoop::add_closing_tags = 1;		# close tags when cleaning HTML
  $Scoop::strip_empty_tag_sets = 0; 	# strip empty tag sets (broken atm)

  $CF::filename_template = "YYYY_MM_DD_Site";
  $CF::prc_title = "YYYY-Mon-DD: Site";

  # Likewise an array: ReadConfig and ReadSites fill @Scoop::conf.
  @Scoop::conf = ();

# --------------------------------------------------------------------------

#if (defined $ENV{'REQUEST_METHOD'}) {
    # we're running from a CGI script, use CGI mode
    #$Scoop::cgimode = 1;
    #$Scoop::cgi = new CGI;
#}

# This is the placeholder for development debug flags.
# Add debugging stuff here, tagged with J M D (without the spaces ;).


# --------------------------------------------------------------------------

# Andrew Fletcher <fletch@computer.org>:
# A relative path on Mac seems to need a ":" before it. I've called
# this $CF::colon. (jm note: currently unused but we may need it again)

  if (&Scoop::MyOS eq 'UNIX') {
    $CF::colon = '';
    $CF::slash = '/';
    $CF::use_hashes_for_cache_filenames = 0;

  } elsif (&Scoop::MyOS eq 'Win32') {
    $CF::colon = '';
    $CF::slash = '\\';
    $CF::use_hashes_for_cache_filenames = 0;

  } elsif (&Scoop::MyOS eq 'Mac') {
    $CF::slash = ':'; $CF::colon = ':';
    # because of the Mac's 32-char filename limitation, we need to include
    # a hash of the URL in cache filenames to avoid clashes. This may be
    # handy for other OSes too, but leave it Mac-only for now.
    $CF::use_hashes_for_cache_filenames = 1;
  }

  $CF::sitescooperdir = $FindBin::Bin;
  $Scoop::cwd = getcwd;

# $CF::sitescooperdir =~ s,^\.[\\\/\:],,g;	# './blah'
# $CF::sitescooperdir =~ s,^\.$,${Scoop::cwd},ge;	# '.'
# $CF::sitescooperdir =~ s,^([^\\\/\:]),${Scoop::cwd}.$CF::slash.$1,ge;

  &Scoop::verbose ("Sitescooper version ".$Scoop::VERSION
				  .", Copyright (c) 1999-2000 Justin Mason\n"
	  ."Sitescooper comes with ABSOLUTELY NO WARRANTY; for details\n"
	  ."see http://jmason.org/software/sitescooper/doc/gpl.html .\n");

}

# AddCmdlineSiteParam -- remember one "Key: value" site-file line supplied
# on the command line. The line is appended to @Scoop::argv_settings; the
# return value is whatever push() returns (the new element count).
sub AddCmdlineSiteParam {
  my ($setting_line) = @_;
  push @Scoop::argv_settings, $setting_line;
}

# --------------------------------------------------------------------------

# ParseArgs -- interpret the command line (@ARGV) and set the matching
# $CF::* / $Scoop::* globals.
#
# Takes no arguments and works directly on @ARGV:
#   * any switches in $Scoop::MAC_ARGS are prepended to @ARGV first;
#   * switches are consumed from the front until the first non-switch word;
#     most patterns are unanchored prefixes, so the ORDER of the tests
#     matters whenever one switch name is a prefix of another;
#   * an unrecognised switch calls &main::usage;
#   * whatever is left in @ARGV becomes @CF::cmdline_urls;
#   * each readable -site/-sites file is appended to
#     @CF::site_files_to_read and @CF::sites_grep; unreadable ones only
#     produce a warning.
#
# Nothing is parsed when $Scoop::cgimode is set (the CGI code is commented
# out).
#
sub ParseArgs {
  local ($_);
  my @sites_grep = ();

  if ($Scoop::cgimode == 0) {
    # $Scoop::MAC_ARGS holds extra switches as one whitespace-separated
    # string. A backslash-escaped space is hidden as \001 while splitting
    # and restored afterwards, so that a single argument may contain
    # spaces.
    # NOTE(review): the previous code turned *every* space into \001 and
    # then replaced the list with the substitution counts, so no argument
    # survived; the backslash-space escape is the presumed intent -- confirm.
    if (defined $Scoop::MAC_ARGS) {
      my $macargs = $Scoop::MAC_ARGS;
      $macargs =~ s/\\ /\001/g;
      my @macargs = split (' ', $macargs);
      foreach my $arg (@macargs) { $arg =~ s/\001/ /g; }
      unshift (@ARGV, @macargs);
    }

    while ($#ARGV >= 0) {
      $_ = shift @ARGV;

      if (/^-debug$/) {
	$CF::debug = 1;

      } elsif (/^-quiet$/) {
	$CF::verbose = 0;

      } elsif (/^-refresh/) {
	$CF::cached_front_page_lifetime = 0;
	$CF::refresh = 1;

      #} elsif (/^-cache/) {
	#$CF::just_caching = 1;	# used for future parallelism

      } elsif (/^-dumpprc/) {
	# must be tested before -dump, which is a prefix of it.
	$CF::dumpprc = 1;
	$CF::use_convert_tool = 1;

      } elsif (/^-dump/) {
	$CF::dump = 1;
	$CF::outstyle = $CF::OUT_TEXT;
	$CF::use_convert_tool = 0;
	$CF::allowimgs = 0;

      } elsif (/^-doc/) {
	$CF::outstyle = $CF::OUT_DOC;
	$CF::fileperpage = 0;
	$CF::outputfilter = 'makedoc';
	$CF::use_convert_tool = 1;
	$CF::allowimgs = 0;

      } elsif (/^-isilo/) {
	$CF::outstyle = $CF::OUT_HTML;
	$CF::fileperpage = 0;
	$CF::outputfilter = 'isilo';
	$CF::use_convert_tool = 1;
	$CF::allowimgs = 1;

      } elsif (/^-misilo/) {
	$CF::outstyle = $CF::OUT_HTML;
	$CF::fileperpage = 1;
	$CF::outputfilter = 'isilo';
	$CF::use_convert_tool = 1;
	$CF::allowimgs = 1;

      } elsif (/^-richreader/) {
	$CF::outstyle = $CF::OUT_HTML;
	$CF::fileperpage = 0;
	$CF::outputfilter = 'richreader';
	$CF::use_convert_tool = 1;
	$CF::allowimgs = 1;		# TODO - not sure

      } elsif (/^-text/) {
	$CF::outstyle = $CF::OUT_TEXT;
	$CF::fileperpage = 0;
	$CF::outputfilter = '__cat__';
	$CF::use_convert_tool = 1;
	$CF::allowimgs = 0;

      } elsif (/^-html/) {
	$CF::outstyle = $CF::OUT_HTML;
	$CF::fileperpage = 0;
	$CF::outputfilter = '__path__';
	$CF::use_convert_tool = 1;
	$CF::allowimgs = 1;

      } elsif (/^-mhtml/) {
	$CF::outstyle = $CF::OUT_HTML;
	$CF::fileperpage = 1;
	$CF::outputfilter = '__path__';
	$CF::use_convert_tool = 1;
	$CF::allowimgs = 1;

      } elsif (/^-pipe/) {
	# -pipe FORMAT COMMAND: FORMAT is one of text, html or mhtml.
	my $fmt = shift @ARGV;
	my $cmd = shift @ARGV;

	if ($fmt eq 'text') {
	  $CF::outstyle = $CF::OUT_TEXT;
	  $CF::fileperpage = 0;

	} elsif ($fmt eq 'html') {
	  $CF::outstyle = $CF::OUT_HTML;
	  $CF::fileperpage = 0;

	} elsif ($fmt eq 'mhtml') {
	  $CF::outstyle = $CF::OUT_HTML;
	  $CF::fileperpage = 1;
	} else {
	  &main::usage;
	}
	$CF::outputfilter = 'cmd: '.$cmd;
	$CF::use_convert_tool = 1;
	$CF::allowimgs = 1;		# probably ;)

      } elsif (/^-admin$/) {
	$CF::admincmd = shift @ARGV;
	# import-cookies takes one further argument: the cookie file.
	if ($CF::admincmd eq 'import-cookies')
				  { $Scoop::importcookies = shift @ARGV; }

      } elsif (/^-nolinkrewrite/) {
	$CF::nolinkrewrite = 1;
      } elsif (/^-fromcache/) {
	$CF::use_only_cache = 1;
      } elsif (/^-parallel/) {
	# only honoured if Init found LWP::Parallel::RobotUA.
	$CF::use_lwp_par = $CF::lwp_par_available;
      } elsif (/^-limit/) {
	$CF::filesizelimit = shift(@ARGV)+0;
      } elsif (/^-maxlinks/) {
	$CF::linkslimit = shift(@ARGV)+0;
      } elsif (/^-maxstories/) {
	$CF::storylimit = shift(@ARGV)+0;
      } elsif (/^-nodates/) {
	$CF::filename_template = 'Site';
	$CF::prc_title = 'Site';
      } elsif (/^-nowrite/) {
	$CF::nowrite = 1;
      } elsif (/^-config/) {
	$CF::config = shift @ARGV;
      } elsif (/^-install/) {
	$CF::pilotinstdir = shift @ARGV;
      } elsif (/^-instapp/) {
	$CF::pilotinstapp = shift @ARGV;
      } elsif (/^-site$/) {
	push (@sites_grep, shift @ARGV);
      } elsif (/^-sites/) {
	# everything remaining on the command line is a site file.
	push (@sites_grep, @ARGV); @ARGV = (); last;
      } elsif (/^-name/) {
	$Scoop::argv_name = shift (@ARGV);
      } elsif (/^-levels/) {
	AddCmdlineSiteParam ("Levels: ". shift @ARGV);
      } elsif (/^-storyurl/) {
	AddCmdlineSiteParam ("StoryURL: ". shift @ARGV);
      } elsif (/^-set/) {
	# -set NAME VALUE becomes the site-file line "NAME: VALUE".
	AddCmdlineSiteParam (shift (@ARGV) .": ". shift @ARGV);
      } elsif (/^-keep-tmps/) {
	$CF::keep_tmps = 1;
      } elsif (/^-disc/) {
	$CF::disconnect_after_scoop = 1;

      } elsif (/^-noheaders/) {
	$CF::writeheader = 0;
      } elsif (/^-nofooters/) {
	$CF::writefooter = 0;

      } elsif (/^-filename/) {
	$CF::filename_template = shift @ARGV;
      } elsif (/^-prctitle/) {
	$CF::prc_title = shift @ARGV;

      } elsif (/^-stdout-to/) {
	# append all further standard output to the named file.
	$_ = shift @ARGV; close (STDOUT);
	open (STDOUT, ">> ".$_) or die "failed to redirect STDOUT to $_\n";

      } elsif (/^-/) {
	&main::usage;
      } else {
	# first non-switch word: put it back, the rest are URLs.
	unshift @ARGV, $_; last;
      }
    }
    @CF::cmdline_urls = @ARGV;
    $Scoop::userid = $<;

  } else {
    # load some things from CGI parameters
    #CGI@CF::cmdline_urls = ($Scoop::cgi->param ('url'));

    #CGI@CF::sites_grep = $Scoop::cgi->param ('sites');

    #CGI$CF::debug = $Scoop::cgi->param ('debug');
    #CGI$CF::outstyle = $Scoop::cgi->param ('outstyle');
    #CGI$CF::nowrite = $Scoop::cgi->param ('nowrite');
    #CGI$CF::refresh = $Scoop::cgi->param ('refresh');
    #CGI$Scoop::userid = $Scoop::cgi->param ('userid');
    #CGI&ScoopCGI::get_cookie;
    #CGI $Scoop::password = $Scoop::cgi->param ('password');
    # REVISIT -- use a cookie to store userid and password

    #CGI$Scoop::pilotinstdir = undef;
  }

  if (!$Scoop::cgimode) {
    my $key;
    foreach $key (@sites_grep) {
      if (!-r $key) {
	warn "Failed to read -site argument \"$key\".\n";
	next;
      }

      # it's a site file.
      push (@CF::site_files_to_read, $key);
      push (@CF::sites_grep, $key);
      # TODO: add support for -sitesmatching pattern or similar...
    }
  }
}

# --------------------------------------------------------------------------

# ReadConfig -- locate and parse the main configuration file, then set up
# everything that depends on it.
#
# Takes no arguments; works entirely through package globals. In order:
#   1. picks the configuration file: $CF::config if already set (e.g. by
#      -config), else ~/.sitescooper/sitescooper.cf on UNIX if readable,
#      else /etc/sitescooper.cf if readable, else
#      {sitescooperdir}/sitescooper.cf; dies if the chosen file is not
#      readable;
#   2. on UNIX, copies that file to ~/.sitescooper/sitescooper.cf when no
#      readable copy exists there, and switches to the copy;
#   3. slurps the file into @Scoop::conf, with a parallel "file:line" list
#      in @Scoop::conflines;
#   4. resets the per-site %SCF::* tables and the tool defaults;
#   5. installs the SIGINT/SIGTERM handler;
#   6. runs &ParseConfig over the configuration lines;
#   7. works out the Pilot install method and the sites directory;
#   8. loads the LWP/HTML/Sitescooper modules and creates the user agent
#      and cookie jar.
#
# Two named helper subs, set_got_intr_behaviour and got_intr, are defined
# inside this sub's braces.
#
sub ReadConfig {
  @Scoop::conf = ();
  @Scoop::conflines = ();

  my $default_config = $CF::sitescooperdir.$CF::slash."sitescooper.cf";
  if (-r "/etc/sitescooper.cf") { $default_config = "/etc/sitescooper.cf"; }

# UNIX platforms: use ~/.sitescooper/sitescooper.cf if it exists.
# Otherwise use /etc/sitescooper.cf if that exists.
# All platforms use {sitescooperdir}/sitescooper.cf if that exists.
#
  if (&Scoop::MyOS eq 'UNIX') {
    my $homecf = $ENV{'HOME'}."/.sitescooper/sitescooper.cf";;
    if (!defined $CF::config && -r $homecf) { $CF::config ||= $homecf; }
  }

  # UNIX platforms: create a default configuration file. This makes it
  # easier to support rpm installation.
  #
  $CF::config ||= $default_config;
  if (!-r $CF::config) {
    die "Cannot find default sitescooper.cf file, please use the '-config' argument!\n";
  }

  if (&Scoop::MyOS eq 'UNIX') {
    $CF::tmpdir ||= $ENV{'HOME'}."/.sitescooper";
    my $homecf = $ENV{'HOME'}."/.sitescooper/sitescooper.cf";;

    if (!-r $homecf) {
      (-d $CF::tmpdir) or
	  mkdir ($CF::tmpdir, 0777) or die "failed to mkdir '$CF::tmpdir'\n";

      warn "Copying default config to \"$homecf\".\n".
	"Edit this if you need to change any configuration settings.\n\n";

      copy ($CF::config, $homecf) or die "cannot copy $CF::config to $homecf\n";
      $CF::config = $homecf;
    }
  }

  if (!defined $CF::config || !-r $CF::config) {
    my $cfg = $CF::config; $cfg ||= "(unset)";
    die ("Cannot find configuration file \"$cfg\"; please use the '-config' argument!\n");
  }

  &Scoop::verbose ("Reading configuration from \"$CF::config\".");
  {
    open (IN, "<$CF::config") || die "cannot read $CF::config\n";
    @Scoop::conf = (<IN>); close IN;

    # one "file:lineno" entry per configuration line, for error messages.
    my $i;
    for ($i=0; $i < scalar (@Scoop::conf); $i++) {
      push (@Scoop::conflines, $CF::config.":".($i+1));
    }
  }

  if ($CF::debugdiffs) {
    &Scoop::dbg ("debugging, will exit after diff");
  }

# --------------------------------------------------------------------------

  $CF::outdir = '';
  $CF::cached_front_page_lifetime /= (24*60);	# convert to days
  $CF::expiry_days = 7.0;
  $CF::sharedcache = undef;

  # per-site settings, filled in by ParseConfig.
  %SCF::active = ();
  %SCF::levels = ();
  %SCF::extra_urls = ();
  %SCF::sizelimit = ();
  %SCF::name = ();
  %SCF::site_defined_at = ();
  %SCF::site_format = ();
  %SCF::links_start = %SCF::links_end = ();
  %SCF::story_start = %SCF::story_end = ();
  %SCF::links_limit_to = %SCF::story_limit_to = ();
  %SCF::links_print = ();
  %SCF::links_trim = ();
  %SCF::story_skip = %SCF::links_skip = ();
  %SCF::story_diff = %SCF::links_diff = ();
  %SCF::links_follow_links = %SCF::story_follow_links = ();
  %SCF::story_lifetime = ();
  %SCF::story_postproc = ();
  %SCF::story_preproc = ();
  %SCF::url_postproc = ();
  %SCF::cacheable = ();	# 0 = static, 1 = dynamic, undef = use heuristics
  %SCF::printable_sub = ();
  %SCF::use_alt_tags = ();
  %SCF::head_pat = ();
  %SCF::use_table_smarts = ();
  %SCF::story_html_header = ();
  %SCF::story_html_footer = ();
  %SCF::rights = ();
  %SCF::need_login_url = ();
  %SCF::eval_code = ();
  %SCF::image_only_site = ();
  %SCF::image_max_width = ();
  %SCF::table_render = ();

  @SCF::sites = ();
  @SCF::layouts = ();
  @SCF::exceptions = ();
  $SCF::have_layouts = 0;
  $SCF::have_exceptions = 0;

  %SCF::url_title = ();

  @TmpGlobal::unsorted_layouts = ();
  @TmpGlobal::unsorted_exceptions = ();

  # default temporary directory; a TmpDir: line in the configuration
  # overrides it when ParseConfig runs below.
  undef $CF::tmpdir;
  if (&Scoop::MyOS eq 'UNIX') {
    $CF::tmpdir = $ENV{'HOME'}."/.sitescooper";
  }
  $CF::tmpdir ||= $ENV{'TMPDIR'};
  $CF::tmpdir ||= $ENV{'TEMP'};
  if (!defined $CF::tmpdir && &Scoop::MyOS eq 'Win32' && defined $ENV{'WINDIR'})
  {
    $CF::tmpdir = $ENV{'WINDIR'}."\\Temp";
  }

  # default names of the external helper programs.
  $CF::diff = 'diff';
  if (&Scoop::MyOS eq 'Win32') { $CF::diff = "diff.exe"; }
  if (&Scoop::MyOS eq 'Mac') { $CF::diff = ""; }	# use Algorithm::Diff
  $CF::checked_for_diff = 0;

  $CF::makedoc = 'makedoc';
  if (&Scoop::MyOS eq 'Win32') { $CF::makedoc = "makedocw.exe"; }

  $CF::isilo = 'iSilo386'; $CF::isiloargs = '-y'; $CF::isilomultipageargs = '-d9';
  if ($CF::allowimgs) { $CF::isiloargs .= ' -Is__IMAGE_MAX_WIDTH__ -Ic -Id'; }

  if (&Scoop::MyOS eq 'Win32') { $CF::isilo = "iSiloC32.exe"; }

  $CF::richreader = 'HTML2Doc'; $CF::richargs = '';
  if (&Scoop::MyOS eq 'Win32')
	  { $CF::richreader = "HTML2Doc.exe"; $CF::richargs = '-i'; }
# Note that currently there is no HTML2Doc for UNIX platforms; it's
# supported here anyway for future-proofing.

# ---------------------------------------------------------------------------

  # set_got_intr_behaviour (BEHAVIOUR) -- choose what got_intr does on
  # the next signal ('exit', or anything else for "skip site") and clear
  # the interrupted flag.
  sub set_got_intr_behaviour {
    $Scoop::got_intr_behaviour = shift;
    $Scoop::got_intr_flag = 0;
  }

  # got_intr (SIGNAME) -- signal handler installed for INT and TERM.
  # On UNIX it first runs "stty echo". It always dies; in "skip site"
  # mode it also raises $Scoop::got_intr_flag.
  sub got_intr {
    my $signame = shift;
    (&Scoop::MyOS eq 'UNIX') and system ("stty echo");

    if ($Scoop::got_intr_behaviour eq 'exit') {
      die "got signal SIG$signame, exiting.\n";
    } else {
      # The flag must be raised before die(), otherwise the assignment
      # is unreachable and the flag can never become true.
      $Scoop::got_intr_flag = 1;
      die "got signal SIG$signame, skipping site...\n";
    }
  }

  &Scoop::set_got_intr_behaviour ('exit');
  $SIG{'INT'} = \&Scoop::got_intr;
  $SIG{'TERM'} = \&Scoop::got_intr;

# ---------------------------------------------------------------------------

  $CF::proxyhost = undef;
  $CF::proxyport = 80;

  # parse the basic config file first.
  &ParseConfig (\@Scoop::conf, \@Scoop::conflines);
  undef @Scoop::conf;		# save memory
  undef @Scoop::conflines;

# ---------------------------------------------------------------------------

  if (!defined $CF::tmpdir) {
    warn "Warning: cannot work out TmpDir, please set it manually\n".
	  "in the configuration section of the script.\n";
  }
  if (!defined $CF::sitescooperdir) {
    warn "Warning: cannot work out SitescooperDir, please set it manually\n".
	  "in the configuration section of the script.\n";
  }
  &make_basic_dirs;	# make tmpdir and user_tmpdir first off

# ---------------------------------------------------------------------------

  # choose how converted files get installed when no install directory
  # was configured.
  if (!defined $CF::pilotinstdir && !$Scoop::cgimode)
  {
    if (&Scoop::MyOS eq 'Win32' && !defined $CF::pilotinstapp) {
      $CF::pilotinstapp = "***USE_MODULE***";

    } elsif (&Scoop::MyOS eq 'UNIX') {
      my $pilot_mgr_dir = $ENV{'HOME'}."/.pilotmgr/Installer";
      my $jpilot_file = $ENV{'HOME'}."/.jpilot/jpilot_to_install";

      if (defined $CF::pilotinstapp) {
	# see if one of the built-in support for UNIX pilot desktops is
	# being used.
	if ($CF::pilotinstapp =~ /pilot.*manager/i)
	  { $CF::pilotinstdir = $pilot_mgr_dir; undef $CF::pilotinstapp; }
	elsif ($CF::pilotinstapp =~ /gnome.*pilot/i)
	  { $CF::pilotinstapp = "gpilot-install-file --later"; }
	elsif ($CF::pilotinstapp =~ /jpilot/i)
	  { $CF::pilotinstapp = "***ADD_TO_MANIFEST*** ".$jpilot_file; }

      } else {
	$CF::pilotinstapp = "***USE_MODULE***";		# use the module
      }
    }
  }

# ---------------------------------------------------------------------------

  # we may not have read the sites directory by this stage.
  if (!defined $CF::sitesdir) {
    &Scoop::dbg ("SitesDir was not specified, trying to guess it...");

    my $possible = "sites";
    if (!defined $CF::sitesdir && -d $possible) { $CF::sitesdir = $possible; }

    if (&Scoop::MyOS eq 'UNIX') {
      $possible = $ENV{'HOME'}."/sites";
      if (!defined $CF::sitesdir && -d $possible) { $CF::sitesdir = $possible; }

      # default for UNIX platforms
      $CF::sitesdir ||= $CF::user_tmpdir.$CF::slash."sites";

    } else {
      # default for non-UNIX platforms
      $CF::sitesdir ||= $CF::sitescooperdir.$CF::slash."sites";
    }
  }

# ---------------------------------------------------------------------------

  # ensure we can find our external modules. Also ensure they override any
  # others already in the path, as some distros and versions of perl include
  # old versions of LWP etc.
  #
  unshift (@INC, $CF::sitescooperdir.$CF::slash."lib");

  require LWP::UserAgent;
  require URI::URL;
  require HTTP::Date;
  require HTTP::Cookies;
  require HTTP::Request::Common;
  require HTML::Entities;
  require HTML::Parser;
  require HTML::Filter;

  require Sitescooper::Robot;
  require Sitescooper::StripTablesFilter;
  require Sitescooper::UserAgent;
  require Sitescooper::ParProxy;

  require PDA::PilotInstall;

# ---------------------------------------------------------------------------

  $Scoop::useragent = new Sitescooper::UserAgent;
  $Scoop::useragent->env_proxy;
  $Scoop::useragent->agent ("sitescooper/$Scoop::VERSION ($Scoop::home_url) ".
		  $Scoop::useragent->agent);
  $Scoop::useragent->max_size (1024*1024*2);	# 2-meg file limit

  $Scoop::cookie_jar = HTTP::Cookies::Netscape->new;
  $Scoop::pua = undef;

  if (!defined $CF::pilotinstdir && !$Scoop::cgimode
    	&& !defined $CF::pilotinstapp)
  {
    # write PRCs to cwd if nothing is set.
    &Scoop::verbose ("Warning: since no PilotInstallDir was specified".
    " in the configuration,\nPRC files will be saved to current directory.\n");
    $CF::pilotinstdir = $Scoop::cwd;

  } elsif (defined $CF::pilotinstdir) {
    # just write them direct to the install dir.
    $CF::prcdir = $CF::pilotinstdir;
  }
}

# ---------------------------------------------------------------------------

# ReadSites -- load the site definitions and prepare for a scooping run.
#
# Takes no arguments; works through package globals. In order:
#   1. calls &ReadSiteChoices, &ScanSitesDir and &ReadSiteFiles, then
#      parses @Scoop::conf with &ParseConfig and calls &PolishConfig;
#   2. if URLs were given on the command line, empties @SCF::sites and
#      builds a small synthetic site definition for each URL instead;
#   3. expands the date/time tokens in $CF::filename_template and
#      $CF::prc_title;
#   4. resets the "already seen" bookkeeping, creates directories and
#      output filenames;
#   5. handles the -admin commands dump-sites (prints and exits),
#      journal and import-cookies; any other command calls &main::usage;
#   6. creates the PRC / install directories if a convert tool is in use;
#   7. expires old cache files, loads logins and state, sets up the
#      parallel user agent, and creates the PDA::PilotInstall object when
#      $CF::pilotinstapp is '***USE_MODULE***'.
#
sub ReadSites {
  @Scoop::conf = ();
  @Scoop::conflines = ();

  &ReadSiteChoices;
  &ScanSitesDir;
  &ReadSiteFiles;

  # parse the site files. We've already read the config file BTW.
  &ParseConfig (\@Scoop::conf, \@Scoop::conflines);
  undef @Scoop::conf;		# save memory
  undef @Scoop::conflines;

  &PolishConfig;

# ---------------------------------------------------------------------------

  # URLs on the command line replace the list of sites read above.
  if ($#CF::cmdline_urls > -1) {
    @SCF::sites = ();

    # NOTE(review): @conf is declared outside the loop and never cleared,
    # so with several URLs each later ParseConfig call also re-parses the
    # lines generated for the earlier URLs -- confirm this is intended.
    my ($url, $confline, @conf, @conflines, $i);
    foreach $url (@CF::cmdline_urls)
    {
      # if it's a local file URL, switch around the slashes (for windows)
      if (&Scoop::MyOS eq 'Win32' && $url =~ m,file:///,i) {
	$url =~ s/\\/\//g;
      }
      # REVISIT -- I don't know what to do in the same case for MacOS ;)

      # a readable local path is turned into a file:// URL; relative
      # paths are resolved against the directory recorded in $Scoop::cwd.
      if (-r $url) {
	if ($url =~ m,^/,) {
	  $url = 'file://'.$url;
	} else {
	  $url = "file://".$Scoop::cwd."/".$url;
	}
      }

      # a bare "scheme://host" gets a trailing slash.
      if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }
      push (@conf, "URL: ".$url);

      # the site name is -name if given, else the last path component.
      # NOTE(review): when $url ends in '/' the pattern below does not
      # match and $_ keeps whatever value it had before -- confirm.
      if (defined $Scoop::argv_name) {
	$_ = $Scoop::argv_name;
      } else {
	if ($url =~ m,/([^/]+)$,) {
	  $_ = $1;
	  if (length ($_) > 40) {
	    # trim out spare stuff to keep it short.
	    s,^([^:]+://[^/]+)/.*/([^/]+$),$1/.../$2,i; #/
	  }
	}
      }
      push (@conf, "Name: ".$_);
      push (@conf, "StoryLifetime: 0");	# never used cached stuff
      push (@conf, @Scoop::argv_settings);

      # ParseConfig consumes one @conflines entry per @conf line, so give
      # every line the pseudo-location "URL:0".
      for ($i=0; $i<$#conf+1; $i++) { push (@conflines, $url.":0"); }
      ParseConfig (\@conf, \@conflines);
      # NOTE(review): ParseConfig also pushes the URL onto @SCF::sites
      # when it sees the "URL:" line, so this may add it a second time.
      push (@SCF::sites, $url);
    }
  }

# ---------------------------------------------------------------------------

  # expand the date/time tokens (YYYY MM Mon DD Day hh mm) in the output
  # filename template and the PRC title.
  {
    my ($mday, $mon, $year, $monstr) = &get_date;
    my ($min, $hr, $wdaystr) = &get_extra_date;

    $CF::filename_template =~ s/YYYY/ sprintf ("%04d", $year); /eg;
    $CF::filename_template =~ s/MM/ sprintf ("%02d", $mon); /eg;
    $CF::filename_template =~ s/Mon/ sprintf ("%3s", $monstr); /eg;
    $CF::filename_template =~ s/DD/ sprintf ("%02d", $mday); /eg;
    $CF::filename_template =~ s/Day/ sprintf ("%3s", $wdaystr); /eg;
    $CF::filename_template =~ s/hh/ sprintf ("%02d", $hr); /eg;
    $CF::filename_template =~ s/mm/ sprintf ("%02d", $min); /eg;

    $CF::prc_title =~ s/YYYY/ sprintf ("%04d", $year); /eg;
    $CF::prc_title =~ s/MM/ sprintf ("%02d", $mon); /eg;
    $CF::prc_title =~ s/Mon/ sprintf ("%3s", $monstr); /eg;
    $CF::prc_title =~ s/DD/ sprintf ("%02d", $mday); /eg;
    $CF::prc_title =~ s/Day/ sprintf ("%3s", $wdaystr); /eg;
    $CF::prc_title =~ s/hh/ sprintf ("%02d", $hr); /eg;
    $CF::prc_title =~ s/mm/ sprintf ("%02d", $min); /eg;
  }

  # reset the per-run bookkeeping.
  %Scoop::already_seen = ();
  %Scoop::last_modtime = ();
  %Scoop::oldest_already_seen = ();
  @Scoop::seen_this_time = ();
  $Scoop::failed_to_cvt = 0;

  &make_dirs;
  &generate_output_filenames (@SCF::sites);

  # -admin commands.
  if (defined $CF::admincmd) {
    if ($CF::admincmd eq 'dump-sites') {
      my ($key, $outdir);

      # print one tab-separated line per site, then exit.
      while (($key,$outdir) = each %Scoop::key2outdir) {
	my $url = $Scoop::key2url{$key};
	my $title = $Scoop::key2title{$key};
	$title =~ s,\t, ,g; $title =~ s,^\d+-\S+-\d+: ,,g;
	my $base = $Scoop::key2tmp{$key}; $base =~ s,^.*${CF::slash}(\S+?)\.tmp$,$1,o;
	my $site = $SCF::site_defined_at{$url};
	$site =~ s/:\d+$//; $site =~ s/^.*${CF::slash}(\S+?)$/$1/o;

	# foobar.site	http://www.foobar.com/	Foo Bar	1999_01_01_Foo_Bar
	print "$site\t$url\t$title\t$base\n";
      }
      exit;

    } elsif ($CF::admincmd eq 'journal') {
      # the JOURNAL handle is left open; nothing in this sub writes to it.
      open (JOURNAL, "> ${CF::tmpdir}${CF::slash}journal")
	  or die "cannot write to ${CF::tmpdir}${CF::slash}journal!\n";

    } elsif ($CF::admincmd eq 'import-cookies') {
      warn "Importing Netscape-format cookie jar from \"$Scoop::importcookies\"...\n";
      $Scoop::cookie_jar->load ($Scoop::importcookies);
      warn "Cookie jar now looks like:\n".$Scoop::cookie_jar->as_string;
      # empty the work lists so that no site is scooped.
      @SCF::sites = ();
      @CF::cmdline_urls = ();
      @Scoop::filekeys = ();
      # and carry on to exit.

    } else { &main::usage; }
  }

  # make sure the output directories exist before converting anything.
  if ($CF::use_convert_tool) {
    if (defined $CF::prcdir && !-d $CF::prcdir) {
      mkdir ($CF::prcdir, 0755) || die "failed to mkdir '$CF::prcdir'\n";
    }
    if (defined $CF::pilotinstdir && !-d $CF::pilotinstdir) {
      mkdir ($CF::pilotinstdir, 0755) || die "failed to mkdir '$CF::pilotinstdir'\n";
    }
  }

  #if (defined $CF::sharedcache) {
    #&Sitescooper::Robot::upgrade_cache_directory_if_needed ($CF::sharedcache);
  #}
  #&Sitescooper::Robot::upgrade_cache_directory_if_needed ($CF::cachedir);

  &expire_old_cache_files;
  &Sitescooper::UserAgent::load_logins;
  &read_state;

  &Sitescooper::ParProxy::setup_pua;

  # '***USE_MODULE***' selects the PDA::PilotInstall module as installer;
  # its settings are read from inst.txt in the user's temporary directory.
  if (defined $CF::pilotinstapp && $CF::pilotinstapp eq '***USE_MODULE***')
  {
    $CF::installer = new PDA::PilotInstall;
    $CF::installer->read_config_file ($CF::user_tmpdir.$CF::slash."inst.txt");
  }
}

# ---------------------------------------------------------------------------

# Run -- scoop all the sites by handing over to get_all_sites.
#
# get_all_sites is called with the single argument 1 and its result is
# passed straight back to the caller.
sub Run {
  # Alternative kept for reference -- scoop everything first and run the
  # conversions in one batch afterwards:
  #   &get_all_sites;
  #   foreach $filekey (@Scoop::filekeys) {
  #     &Scoop::convert_output($filekey, $Scoop::key2url{$filekey});
  #   }

  # Current mode: convert as each site is scooped.
  return &get_all_sites (1);
}

# ---------------------------------------------------------------------------

# Finish -- persist everything at the end of a run.
#
# Writes the state, the saved logins and the cookie jar; writes the
# installer configuration when the PilotInstall module is in use;
# optionally disconnects; then reports completion via &Scoop::verbose.
sub Finish {
  &write_state;
  &Sitescooper::UserAgent::save_logins;

  my $cookie_file = $CF::user_tmpdir.$CF::slash."cookies";
  $Scoop::cookie_jar->save ($cookie_file);

  # the installer object only exists in '***USE_MODULE***' mode.
  my $using_module = (defined $CF::pilotinstapp
			&& $CF::pilotinstapp eq '***USE_MODULE***');
  $CF::installer->write_config_file ($CF::user_tmpdir.$CF::slash."inst.txt")
	if $using_module;

  &disconnect if $CF::disconnect_after_scoop;

  &Scoop::verbose ("Finished!");
}

# ---------------------------------------------------------------------------

sub ParseConfig (\@\@) {
  local ($_);
  my ($conf, $conflines) = @_;

  my $postproc = undef;
  my $postproctype = undef;
  my $cf = undef;
  my $curkey;
  my $confline;

  foreach $_ (@$conf) {
    $confline = shift @$conflines;
    s/#.*$//; s/^\s+//; s/\s+$//g; next if (/^$/);
    if (!defined $confline) { &Scoop::dbg ("oops! confline not set for $_"); }

    # process environment variable references: ${ENVVARNAME}
    # &Scoop::dbg ("variable ref in site file: $1");
    s/\$\{(\S+?)\}/
	  defined($ENV{$1}) ? $ENV{$1} : "";
    /ge;
    s/\$HOME/$ENV{'HOME'}/ge;		# always supported

    if (defined $postproctype) {
      $postproc .= $_;
      # see if it's the end of the postproc statement scope
      my $x = $postproc; 1 while ($x =~ s/\{[^\{\}]*\}//gs);	#{
      if ($x =~ /\}\s*$/) {
	if ($postproctype eq 'Story') {				#{
	  $postproc =~ /^(.*)\}\s*$/; $SCF::story_postproc{$curkey} = $1;
	  $postproc = undef;
	  $postproctype = undef;
	}
	elsif ($postproctype eq 'StoryPre') {			#{
	  $postproc =~ /^(.*)\}\s*$/; $SCF::story_preproc{$curkey} = $1;
	  $postproc = undef;
	  $postproctype = undef;
	}
	elsif ($postproctype =~ /LinksPre (\d+)/) {		#{
	  my $lev = $1;
	  $postproc =~ /^(.*)\}\s*$/; $SCF::links_preproc{"$lev $curkey"} = $1;
	  $postproc = undef;
	  $postproctype = undef;
	}
	elsif ($postproctype eq 'URL') {			#{
	  $postproc =~ /^(.*)\}\s*$/; $SCF::url_postproc{$curkey} = $1;
	  $postproc = undef;
	  $postproctype = undef;
	}
	elsif ($postproctype eq 'Eval') {			#{
	  $postproc =~ /^(.*)\}\s*$/; $SCF::eval_code{$curkey} = $1;
	  $postproc = undef;
	  $postproctype = undef;
	}
      }
      next;
    }

    s/^(\S+:)\s+/$1 /;		# easier to read this way ;)
    /^ProxyHost: (.*)$/ and ($CF::proxyhost = $1), next;
    /^ProxyPort: (.*)$/ and ($CF::proxyport = $1+0), next;
    /^TmpDir: (.*)$/ and ($CF::tmpdir = $1), next;
    /^SitescooperDir: (.*)$/ and ($CF::sitescooperdir = $1), next;
    if (/^SitesDir: (.*)$/) { $CF::sitesdir = $1; next; }

    /^MakeDoc: (.*)$/ and ($CF::makedoc = $1), next;
    /^iSilo: (.*)$/ and ($CF::isilo = $1), next;
    /^HTML2Doc: (.*)$/ and ($CF::richreader = $1), next;
    /^Diff: (.*)$/ and ($CF::diff = $1), next;
    /^TextSaveDir: (.*)$/ and ($CF::outdir = $1), next;
    /^PilotInstallDir: (.*)$/ and ($CF::pilotinstdir = $1), next;
    /^PilotInstallApp: (.*)$/ and ($CF::pilotinstapp = $1), next;
    /^SharedCacheDir: (.*)$/ and ($CF::sharedcache = $1), next;
    /^ExpireCacheAfter: (.*)$/ and ($CF::expiry_days = $1+0.0), next;

    if (/^CachedPageLifetime: (.*)$/) {
      (!$CF::refresh) and
    		($CF::cached_front_page_lifetime = ($1+0) / (24*60));
      next;
    }

    if (/^URL: (.*)$/) {
      my $url = &expand_url_magic ($1);
      if (!defined $url) {
	&Scoop::sitewarn_file_line ($confline, "Bad URL in site file: $_\n");
      }

      if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; }
      if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }

      push (@SCF::sites, $url);
      &SetDefaultConfigForURL ($url, $confline);
      $SCF::site_defined_at{$url} = $confline;
      $curkey = $url;
      next;
    }

    # LayoutURL is similar to URL, but defines a layout for a specific
    # pattern. If an URL falls within this pattern, and parameters are
    # defined for this layout but not defined by the site file, the
    # layout parameters will be used.
    #
    if (/^LayoutURL: (.*)$/) {
      my $url = &expand_url_magic ($1);

      if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; }
      if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }

      if (!defined $SCF::site_defined_at{$url}) {
	# allow extra parameters to be added to an existing layout
	&SetDefaultConfigForURL ($url, $confline);
	push (@TmpGlobal::unsorted_layouts, $url);
      }

      $SCF::site_defined_at{$url} = $confline;
      $curkey = $url;
      next;
    }

    # ExceptionURL is like LayoutURL, but it takes priority over
    # both LayoutURL and the normal site file rules. This way you
    # can define bits of a site that uses different layouts, caching
    # rules etc. by matching pages' URLs against the ExceptionURL
    # regular expression.
    #
    if (/^ExceptionURL: (.*)$/) {
      my $url = &expand_url_magic ($1);

      if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; }
      if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }

      if (!defined $SCF::site_defined_at{$url}) {
	# allow extra parameters to be added to an existing exception
	&SetDefaultConfigForURL ($url, $confline);
	push (@TmpGlobal::unsorted_exceptions, $url);
      }

      $SCF::site_defined_at{$url} = $confline;
      $curkey = $url;
      next;
    }

    if (!defined $curkey || $curkey eq '') {
      my $line = $confline; $line =~ s/^(.*):(.*?)$/"$1" line $2/g;
      die "Configuration line invalid (outside URL scope?) in $line:\n  $_\n";
    }

    /^Name: (.*)$/ and ($SCF::name{$curkey} = $1), next;
    /^Description: (.*)$/ and next;	# we don't use it!
    /^AuthorName: (.*)$/ and next;	# we don't use it!
    /^AuthorEmail: (.*)$/ and next;	# we don't use it!
    /^Active: (.*)$/ and ($SCF::active{$curkey} = $1+0), next;
    /^SizeLimit: (\d+)\s*[Kk]*$/ and ($SCF::sizelimit{$curkey} = $1), next;
    /^Levels: (.*)$/ and ($SCF::levels{$curkey} = $1-2), next;
    /^AddURL: (.*)$/ and ($SCF::extra_urls{$curkey} .= ' '.&expand_url_magic($1)), next;
    /^RequireCookie: (.*)$/ and ($SCF::req_cookie{$curkey} = $1), next;
    /^Rights: (.*)$/ and ($SCF::rights{$curkey} = $1), next;
    /^TableRender: (.*)$/ and ($SCF::table_render{$curkey} = $1), next;

    /^Level(\d+)LinksStart: (.*)$/ and ($SCF::links_start{($1-2)." $curkey"} = $2), next;
    /^Level(\d+)LinksEnd: (.*)$/     and ($SCF::links_end{($1-2)." $curkey"} = $2), next;
    /^Level(\d+)Print: (.*)$/      and ($SCF::links_print{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)TrimAfterLinks: (.*)$/   and ($SCF::links_trim{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)Cache?able: (.*)$/     and ($SCF::cacheable{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)Diff: (.*)$/        and ($SCF::links_diff{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)UseTableSmarts: (.*)$/ and ($SCF::use_table_smarts{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)FollowLinks: (.*)$/ and ($SCF::links_follow_links{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)AddURL: (.*)$/ and
    		($SCF::extra_urls{($1-2)." $curkey"} .= ' '.&expand_url_magic($2)), next;

    if (/^Level(\d+)URL: (.*)$/) {
      my $lev = $1;
      my $pat = $2;
      $pat = &expand_url_magic($pat);
      $SCF::links_limit_to{($lev-2)." $curkey"} =
      		&AddRegexpToSet ($SCF::links_limit_to{($lev-2)." $curkey"}, $pat);
      next;
    }

    /^IssueLinksStart: (.*)$/ and ($SCF::links_start{"1 $curkey"} = $1), next;
    /^IssueLinksEnd: (.*)$/     and ($SCF::links_end{"1 $curkey"} = $1), next;
    /^IssuePrint: (.*)$/      and ($SCF::links_print{"1 $curkey"} = $1+0), next;
    /^IssueTrimAfterLinks: (.*)$/   and ($SCF::links_trim{"1 $curkey"} = $1+0), next;
    /^IssueCache?able: (.*)$/     and ($SCF::cacheable{"1 $curkey"} = $1+0), next;
    /^IssueDiff: (.*)$/        and ($SCF::links_diff{"1 $curkey"} = $1+0), next;
    /^IssueUseTableSmarts: (.*)$/ and ($SCF::use_table_smarts{"1 $curkey"} = $1+0), next;
    /^IssueFollowLinks: (.*)$/ and ($SCF::links_follow_links{"1 $curkey"} = $1+0), next;
    /^IssueAddURL: (.*)$/ and
    		($SCF::extra_urls{"1 $curkey"} .= ' '.&expand_url_magic($1)), next;

    # Normally Issue-level stuff is the highest level, so this would seem to
    # be irrelevant as we never would have to decide whether a URL is the
    # issues page since it's provided in the site file. However the
    # IssueFollowLinks parameter provides a need for this.
    if (/^IssueURL: (.*)$/) {
      my $pat = $1;
      $pat = &expand_url_magic($pat);
      $SCF::links_limit_to{"1 $curkey"} =
      		&AddRegexpToSet ($SCF::links_limit_to{"1 $curkey"}, $pat);
      next;
    }

    if (/^ContentsFormat: (.*)$/) {
      my $fmt = $1;
      if ($fmt eq 'rss') {
	# set up defaults for a Rich Site Summary site.
	# cf. http://my.netscape.com/publish/
	$SCF::site_format{$curkey} = 'rss';
	$SCF::links_start{"0 $curkey"} = '(<rdf:RDF|<rss version=|<scriptingNews)';
	$SCF::links_end{"0 $curkey"} = '(</rdf:RDF>|</rss>|</scriptingNews>)';
	$SCF::links_diff{"0 $curkey"} = 1;
	$SCF::levels{$curkey} = 0;

      } elsif ($fmt eq 'html') {
	# the default -- do nothing.

      } else {
	&Scoop::sitewarn_file_line ($confline, "Unrecognised ContentsFormat: $_\n");
      }
      next;
    }

    /^ContentsStart: (.*)$/   and ($SCF::links_start{"0 $curkey"} = $1), next;
    /^ContentsEnd: (.*)$/       and ($SCF::links_end{"0 $curkey"} = $1), next;
    /^ContentsPrint: (.*)$/   and ($SCF::links_print{"0 $curkey"} = $1+0), next;
    /^ContentsTrimAfterLinks: (.*)$/   and ($SCF::links_trim{"0 $curkey"} = $1+0), next;
    /^ContentsCache?able: (.*)$/  and ($SCF::cacheable{"0 $curkey"} = $1+0), next;
    /^ContentsSkipURL: (.*)$/  and ($SCF::links_skip{"0 $curkey"} = $1), next;
    /^ContentsDiff: (.*)$/     and ($SCF::links_diff{"0 $curkey"} = $1+0), next;
    /^ContentsUseTableSmarts: (.*)$/ and ($SCF::use_table_smarts{"0 $curkey"} = $1+0), next;
    /^ContentsFollowLinks: (.*)$/	and ($SCF::links_follow_links{"0 $curkey"} = $1+0), next;
    /^ContentsAddURL: (.*)$/ and
    		($SCF::extra_urls{"0 $curkey"} .= ' '.&expand_url_magic($1)), next;

    if (/^ContentsURL: (.*)$/) {
      my $pat = &AddHostToURL ($curkey, $1);
      $pat = &expand_url_magic($pat);
      $SCF::links_limit_to{"0 $curkey"} =
      		&AddRegexpToSet ($SCF::links_limit_to{"0 $curkey"}, $pat);
      next;
    }

    /^StoryStart: (.*)$/	and ($SCF::story_start{$curkey} = $1), next;
    /^StoryEnd: (.*)$/		and ($SCF::story_end{$curkey} = $1), next;
    /^StoryCache?able: (.*)$/	and ($SCF::cacheable{"s $curkey"} = $1+0), next;
    /^StoryDiff: (.*)$/		and ($SCF::story_diff{$curkey} = $1+0), next;
    /^StorySkipURL: (.*)$/	and ($SCF::story_skip{$curkey} = $1), next;
    /^StoryHeadline: (.*)$/	and ($SCF::head_pat{$curkey} = $1), next;
    /^StoryToPrintableSub: (.*)$/	and ($SCF::printable_sub{$curkey} = $1), next;
    /^(Story|)UseTableSmarts: (.*)$/ and ($SCF::use_table_smarts{$curkey} = $2+0), next;
    /^StoryFollowLinks: (.*)$/	and ($SCF::story_follow_links{$curkey} = $1+0), next;
    /^StoryLifetime: (.*)$/	and ($SCF::story_lifetime{$curkey} = $1+0), next;
    /^StoryHTMLHeader: (.*)$/ and ($SCF::story_html_header{$curkey} = $1), next;
    /^StoryHTMLFooter: (.*)$/ and ($SCF::story_html_footer{$curkey} = $1), next;
    /^StoryAddURL: (.*)$/ and
    		($SCF::extra_urls{"-1 $curkey"} .= ' '.&expand_url_magic($1)), next;

    if (/^UseAltTagForURL: (.*)$/) {
      my $pat = &AddHostToURL ($curkey, $1);
      $pat = &expand_url_magic($pat);
      $SCF::use_alt_tags{$curkey} = &AddRegexpToSet ($SCF::use_alt_tags{$curkey}, $pat);
      next;
    }

    if (/^NeedLoginURL: (.*)$/) {
      my $pat = &AddHostToURL ($curkey, $1);
      $pat = &expand_url_magic($pat);
      $SCF::need_login_url{$curkey} = &AddRegexpToSet ($SCF::need_login_url{$curkey}, $pat);
      next;
    }

    if (/^StoryURL: (.*)$/) {
      my $pat = &AddHostToURL ($curkey, $1);
      $pat = &expand_url_magic($pat);
      $SCF::story_limit_to{$curkey} = &AddRegexpToSet ($SCF::story_limit_to{$curkey}, $pat);
      next;
    }

    if (/^ImageURL: (.*)$/) {
      my $pat = &AddHostToURL ($curkey, $1);
      $pat = &expand_url_magic($pat);
      $SCF::imageurl{$curkey} = &AddRegexpToSet ($SCF::imageurl{$curkey}, $pat);
      next;
    }

    # used to get image only sites... (thx to kld)
    if (/^ImageOnlySite: (.*)$/) {
      $SCF::image_only_site{$curkey} = 1;
      next;
    }

    if (/^ImageScaleToMaxWidth: (.*)$/) {
      $SCF::image_max_width{$curkey} = $1+0;
      next;
    }

    if (/^(URL)Process: (.*)$/) {
      my $type = $1;
      my $val = $2;
      if ($val =~ s/^\{//) #}
      {
	$postproctype = $type;
	$postproc = $val;
      } else {
	if ($type eq 'URL') { $SCF::url_postproc{$curkey} = $val; }
      }
      next;
    }

    if (/^(Story)PostProcess: (.*)$/) {
      my $type = $1;
      my $val = $2;
      if ($val =~ s/^\{//) #}
      {
	$postproctype = $type;
	$postproc = $val;
      } else {
	if ($type eq 'Story') { $SCF::story_postproc{$curkey} = $val; }
      }
      next;
    }

    if (/^EvaluatePerl: (.*)$/) {
      my $val = $2;

      if ($val =~ s/^\{//) #}
      {
	$postproctype = "Eval";
	$postproc = $val;
      } else {
	$SCF::eval_code{$curkey} = $val;
      }
      next;
    }

    if (/^(Contents|Issue|Level\d+)HTMLPreProcess: (.*)$/) {
      my $type = $1;
      my $val = $2;

      my $lev;
      ($type eq 'Contents') &&	($lev = 0);
      ($type eq 'Issue') &&	($lev = 1);
      ($type =~ /Level(\d+)/) && ($lev = $1-2);

      if ($val =~ s/^\{//) #}
      {
	$postproctype = "LinksPre $lev";
	$postproc = $val;
      } else {
	$SCF::links_preproc{"$lev $curkey"} = $val;
      }
      next;
    }

    if (/^(Story)HTMLPreProcess: (.*)$/) {
      my $type = $1;
      my $val = $2;
      ($type eq 'Story') && ($type = 'StoryPre');
      if ($val =~ s/^\{//) #}
      {
	$postproctype = $type;
	$postproc = $val;
      } else {
	if ($type eq 'StoryPre') { $SCF::story_preproc{$curkey} = $val; }
      }
      next;
    }

    my $line = $confline; $line =~ s/^(.*):(.*?)$/"$1" line $2/g;
    &Scoop::sitewarn_file_line ($confline, "Unrecognised in $line:\n  $_\n");
  }

  if (defined $postproctype) {
    &Scoop::sitewarn_file_line ($confline,
	  "Fell off end of ${postproctype}PostProcess statement!\n");
  }
}

# ---------------------------------------------------------------------------

# Finalise the configuration once all config and site files have been parsed.
#
# - Points the shared user agent at the configured proxy, if one is set.
# - Moves the layout and exception patterns collected during parsing into
#   @SCF::layouts / @SCF::exceptions, shortest pattern first, and marks each
#   of them inactive so they are not scooped as sites themselves.
# - Decides how pages will be diffed: an external diff command ($CF::diff)
#   or the Algorithm::Diff module ($CF::diff eq '').  On Win32 the external
#   command is probed once by diffing a scratch file against itself.
#
# Dies if the Algorithm::Diff module is needed but cannot be loaded.
sub PolishConfig {
  if (defined $CF::proxyhost) {
    $Scoop::useragent->proxy
	  (['http', 'ftp'], "http://${CF::proxyhost}:${CF::proxyport}/");
  }

  @SCF::layouts = ();
  my $pat;
  foreach $pat (sort { length($a) <=> length($b) } @TmpGlobal::unsorted_layouts) {
    $SCF::active{$pat} = 0;		# ensure they arent treated like sites
    push (@SCF::layouts, $pat);
    $SCF::have_layouts = 1;
  }
  undef @TmpGlobal::unsorted_layouts;

  &Scoop::dbg ("site layouts defined: ", join(' ', @SCF::layouts));

  @SCF::exceptions = ();
  foreach $pat (sort { length($a) <=> length($b) } @TmpGlobal::unsorted_exceptions) {
    $SCF::active{$pat} = 0;		# ensure they arent treated like sites
    push (@SCF::exceptions, $pat);
    $SCF::have_exceptions = 1;
  }
  undef @TmpGlobal::unsorted_exceptions;

  # CRG: Added $CF::checked_for_diff to prevent diff check that follows
  # which would fail.
  if ($CF::diff eq 'MODULE') { $CF::diff = ''; $CF::checked_for_diff = 1; }

  if (&Scoop::MyOS eq 'Win32' && $CF::checked_for_diff == 0) {
    my $file = $CF::user_tmpdir.$CF::slash."tstdiff.txt";

    # create an empty scratch file to diff against itself
    open (OUT, "> $file"); close OUT;
    warn "Checking for availability of the \"diff.exe\" command...\n";

    # Only trust $? if the pipe was really opened: when the command cannot
    # be spawned at all, close() never sets $? and a stale exit status would
    # otherwise be mistaken for a successful diff run.
    my $diff_works = 0;
    if (open (IN, $CF::diff." \"$file\" \"$file\" |")) {
      1 while (<IN>);
      close IN;
      $diff_works = (($? >> 8) == 0);
    }

    if (!$diff_works) {
      warn "\n".
	"$CF::diff could not be found and run. Using perl module\n".
	"Algorithm::Diff instead.\n";

      $CF::diff = "";		# use module instead
    }
    unlink $file; $CF::checked_for_diff = 1;
  }

  if ($CF::diff eq '') {
    &Scoop::dbg ("using Algorithm::Diff module to diff pages.");
    eval 'use Algorithm::Diff qw(diff); 1;'
		  or die "Cannot use built-in diff support, perl module\n".
			  "Algorithm::Diff not found: $@\n";
  }
}

# ---------------------------------------------------------------------------

# File::Find "wanted" callback: remember every plain file whose name ends in
# ".site" or ".sit" (case-insensitive) in @TmpGlobal::site_files_found.
sub find_sites {
  if (-f $_ && /\.site?$/i) {
    push (@TmpGlobal::site_files_found, $File::Find::name);
  }
}

# Build the list of site files to read (@CF::site_files_to_read).
#
# Sources, in order:
#   1. the ticked entries from site_choices (@CF::site_choices), but only
#      when no -site restriction (@CF::sites_grep) is in effect;
#   2. the layout site files (@CF::layout_site_files);
#   3. every *.site / *.sit file found under $CF::sitesdir, filtered by
#      @CF::sites_grep when that is non-empty ("layouts.site" files always
#      pass the filter).
#
# If URLs were given on the command line (@CF::cmdline_urls) the sites
# directory is not scanned at all.
sub ScanSitesDir {
  my ($file, $key);
  my %sites_grep = ();

  foreach $key (@CF::sites_grep) { $sites_grep{$key} = 1; }

  if ($#CF::sites_grep >= 0) {
    &Scoop::verbose ("Restricting to sites: ".join (' ', @CF::sites_grep));

  } elsif ($#CF::site_choices >= 0) {
    # only scoop sites from the site_choices list if the -site argument
    # was not used.
    &Scoop::verbose ("Adding site: ".join (' ', @CF::site_choices));
    foreach $key (@CF::site_choices) {
      if (-r $key) { push (@CF::site_files_to_read, $key); }
    }
  }

  foreach $key (@CF::layout_site_files) {
    &Scoop::dbg ("Using layout: $key");
    if (-r $key) { push (@CF::site_files_to_read, $key); }
  }

  if ($#CF::cmdline_urls >= 0) {
    # we're only snarfing the command-line URLs, skip the predefined sites
    return;
  }

  if (defined $CF::sitesdir && -d $CF::sitesdir) {
    @TmpGlobal::site_files_found = (); find(\&find_sites, $CF::sitesdir);

    foreach $file (@TmpGlobal::site_files_found) {
      # file name without its directory part
      my $base = $file; $base =~ s,^.*[\/\\:]([^\/\\:]+)$,$1,g;

      # Skip backups, core dumps etc.  This is tested against the file name
      # only: matching the whole path would throw away every site file
      # living below a directory whose name merely contains "core", and
      # "^#" could never match.
      next if ($base =~ /(\.swp$|^core$|\.bak$|\~$|^#)/);
      next if (-d $file);		# skip directories

      if ($#CF::sites_grep >= 0) {
	&Scoop::dbg ("checking if site file is wanted: $file");
	next unless (defined $sites_grep{$base}
		|| defined $sites_grep{$file}
		|| $file =~ /layouts\.site/i);
      }
      push (@CF::site_files_to_read, $file);
    }
    undef @TmpGlobal::site_files_found;
  }
}

# Slurp every file listed in @CF::site_files_to_read into @Scoop::conf, one
# element per line, recording the origin of each line as "file:index" in the
# parallel array @Scoop::conflines (the index starts at 0 for the first
# line).  Files already read, as tracked in the package hash %read_sites,
# are skipped; unreadable files are reported via sitewarn_file_line.
sub ReadSiteFiles {
  unless ($#CF::site_files_to_read >= 0) {
    warn "\n".
      "No sites were read -- the site_choices.txt file is empty, or the\n".
      "\"sites\" directory could not be found.\n\n";
  }

  foreach my $path (@CF::site_files_to_read) {
    # don't read the same file twice
    next if (defined $read_sites{$path});
    $read_sites{$path} = 1;

    unless (open (IN, "< $path")) {
      &Scoop::sitewarn_file_line ("$path:0", "Cannot read $path\n");
      next;
    }

    my $lineno = 0;
    while (<IN>) {
      push (@Scoop::conf, $_);
      push (@Scoop::conflines, "$path:$lineno");
      $lineno++;
    }
    close IN;
    &Scoop::verbose ("Scooping site from file \"$path\".");
  }
}

# ---------------------------------------------------------------------------

# Read "site_choices.txt" from $CF::tmpdir and collect the ticked entries.
#
# An entry is ticked when its line starts with "[x]" (any case, optional
# whitespace); the first following "Filename:" line names the site file.
# A leading "[samples]" marker in that name is expanded to the site_samples
# directory.  Files matching "layouts.site" go to @CF::layout_site_files,
# everything else to @CF::site_choices.
#
# When the choices file is not readable, EditSiteChoices is called first to
# create it.  Returns silently if the file still cannot be opened.
sub ReadSiteChoices {
  my $choicefile = $CF::tmpdir.$CF::slash."site_choices.txt";

  # or create it in this case
  &EditSiteChoices ($choicefile) unless (-r $choicefile);

  &Scoop::verbose ("Using site choices from \"$choicefile\".");
  open (IN, "<$choicefile") or return;

  my $samplesdir = $CF::sitescooperdir.$CF::slash."site_samples";
  while (<IN>) {
    next unless (/^\s*\[\s*x\s*\]/i);

    # ticked entry: scan forward to its Filename: line
    while (<IN>) {
      next unless (/^\s*Filename:\s*(\S+)\s*$/);

      $_ = $1; s/\[samples\]/${samplesdir}/g;
      if (!/layouts\.site/) {
	&Scoop::dbg ("site choice: $_");
	push (@CF::site_choices, $_);
      } else {
	push (@CF::layout_site_files, $_);
      }
      last;
    }
  }
  close IN;
}

# ---------------------------------------------------------------------------

# (Re)create the site_choices file and let the user edit it.
#
# Parameters:
#   $choicefile - full path of the site_choices.txt file to write.
#
# The file lists every site file found below the "site_samples" directory
# with a tick box; the layouts file is always pre-ticked and sites already
# present in @CF::site_choices are re-ticked.  Afterwards an editor is
# started on the file ($VISUAL, $EDITOR, or a per-OS default) unless we are
# on UNIX without a terminal on STDOUT, or no editor is known for this
# platform, in which case the user is told to edit the file by hand.
#
# Returns early (after a warning) when the samples directory is missing,
# when -site was used, or when the file cannot be created.  Dies if the file
# cannot be written completely or the editor exits with a non-zero status.
sub EditSiteChoices {
  my $choicefile = shift;
  my $samplesdir = $CF::sitescooperdir.$CF::slash."site_samples";

  if (!-d $samplesdir) {
    warn "Cannot find 'site_samples' directory, not creating site_choices file.\n".
    	"(looked for '$samplesdir')\n";
    return;
  }

  if ($#CF::sites_grep >= 0) {
    warn "'-site' argument used, not creating site_choices file.\n";
    return;
  }

  warn "Creating/editing \"site_choices.txt\" file...\n";

  my %chosen_sites = ();
  foreach $_ (@CF::site_choices) { $chosen_sites{$_} = 1; }

  if (!open (CHOICE, ">$choicefile")) {
    warn "Cannot create \"site_choices.txt\" file $choicefile\n";
    return;
  }

  print CHOICE <<EOHDR;
Please pick the site files you wish to use here.  Put an X in the box
beside the sites you wish to scoop.

If you want to use the traditional 'sites' directory, or you have your own
site files not in this list, then do not put an X in any of the boxes.
Sitescooper will supplement what you have ticked here with the contents
of your 'sites' directory, if it exists.

EOHDR

  @TmpGlobal::site_files_found = (); find(\&find_sites, $samplesdir);

  # regexp-quoted samples directory, used to abbreviate it to "[samples]"
  my $samplespat = $samplesdir;
  $samplespat =~ s/([^-_:A-Za-z0-9])/\\$1/g;
  my $file;
  
  foreach $file (@TmpGlobal::site_files_found) {
    my $pretty = $file; $pretty =~ s,^${samplespat},\[samples\],g;

    if ($file =~ /layouts\.site/) {
      print CHOICE "    [x] (Site layouts for common sites)\n".
	      "\tFilename: $pretty\n\n";
      next;
    }

    my ($url, $name, $desc) = &ReadSiteForChoices($file);
    if (!defined $url) { next; }
    if (!defined $name) { $name = $url; }

    if (defined $desc) { $desc = "\t($desc)\n"; }
    else { $desc = ''; }

    # ReadSiteChoices stores choices with "[samples]" already expanded to
    # the real directory, so look the site up under its full path as well
    # as under the abbreviated name; otherwise earlier ticks are lost.
    my $chosen = ' ';
    if (defined $chosen_sites{$pretty} || defined $chosen_sites{$file}) {
      $chosen = 'x';
    }

    print CHOICE "    [$chosen] $name\n\tURL: $url\n".
    		"\tFilename: $pretty\n$desc\n";
  }
  undef @TmpGlobal::site_files_found;

  close CHOICE or die "failed to write to site_choices file";

  my $edit;
  if (defined $ENV{'VISUAL'}) {
    $edit = $ENV{'VISUAL'};
  } elsif (defined $ENV{'EDITOR'}) {
    $edit = $ENV{'EDITOR'};
  } elsif (&Scoop::MyOS eq 'UNIX' && -x '/usr/bin/editor') {
    $edit = 'editor';		# Debian std, thanks to michael d. ivey
  } elsif (&Scoop::MyOS eq 'UNIX') {
    $edit = 'vi';
  } elsif (&Scoop::MyOS eq 'Win32') {
    $edit = 'notepad.exe';
  }

  if (!defined $edit) {
    # No editor known for this platform.  The rest of this file identifies
    # the Macintosh as 'Mac', so the former test for 'MacOS' never matched
    # there and an undefined command was handed to system().
    # REVISIT -- don't know what to do here ;)
    warn "\nIf you wish to choose which sites to scoop from a list\n".
    	"of pre-defined sites, stop this script now, edit the file\n".
	"$choicefile\n".
	"and re-run it.\n\n";
    return;
  }

  if (&Scoop::MyOS eq 'UNIX' && !-t STDOUT) {
    warn "\nNot running editor for site_choices.txt file, as sitescooper is\n".
      "not running interactively. Please edit this file at your\n".
      "convenience: $choicefile\n\n";

  } else {
    warn "Running editor for site_choices.txt file using command $edit...\n";
    # NOTE(review): list-form system() treats $edit as one program name, so
    # an $EDITOR/$VISUAL value carrying arguments will not run -- confirm
    # whether that matters for supported setups.
    system ($edit, $choicefile);

    if (($? >> 8) != 0) {
      die "The command failed. Please edit $choicefile\n".
	  "by hand and re-run sitescooper.\n\n";
    }
  }
}

# ---------------------------------------------------------------------------

# Extract the URL, Name and Description of a site file for display in the
# site_choices list.
#
# Parameters:
#   $file - path of the site file to inspect.
#
# Returns the list ($url, $sitename, $desc); any element may be undef if the
# file does not define it.  Returns the empty list for backup/core files,
# directories and files that cannot be opened.  When a keyword occurs more
# than once the last occurrence wins.
sub ReadSiteForChoices {
  my $file = shift;

  # Test the file name only: matching the whole path would reject every
  # file below a directory whose name merely contains "core".
  my $base = $file; $base =~ s,^.*[\/\\:]([^\/\\:]+)$,$1,g;
  return if ($base =~ /(\.swp$|^core$|\.bak$|\~$|^#)/);
  return if (-d $file);

  # "return", not "next": there is no loop in this sub to continue.
  open (IN, "<$file") || return;
  my ($url, $sitename, $desc);
  $url = $sitename = $desc = undef;
  while (<IN>) {
    # strip DOS carriage returns and trailing comments
    s/\r*$//g; s/#.*$//g;
    /^\s*Name:\s*(.*)$/ and ($sitename = $1), next;
    /^\s*Description:\s*(.*)$/ and ($desc = $1), next;
    /^\s*URL:\s*(.*)$/ and ($url = $1), next;
  }
  close IN;

  ($url, $sitename, $desc);
}

# ---------------------------------------------------------------------------
# Default configuration for a newly-specified URL.

# Install the default per-site settings for a newly declared site URL.
#
# Parameters:
#   $curkey   - the site's URL, used as the key into the %SCF::* tables.
#   $confline - "file:line" position of the declaration, used for warnings.
#
# By default stories are limited to URLs on the same host as the site
# itself; only http:// and file:// URLs are supported, anything else
# produces a site warning and leaves $SCF::story_limit_to unset.
sub SetDefaultConfigForURL {
  my $curkey = shift;
  my $confline = shift;

  $SCF::active{$curkey} = 1;		# active by default
  $SCF::name{$curkey} = $curkey;	# default name - the URL
  $SCF::use_table_smarts{$curkey} = undef;	# use smarts
  $SCF::levels{$curkey} = -1;		# 1-level site
  $SCF::extra_urls{$curkey} = '';	# no extra URLs
  $SCF::story_lifetime{$curkey} = 90;	# dont scoop stories older than 3 months
  $SCF::links_trim{"0 $curkey"} = 1024;	# trim after last href + 1024 chars (contents)
  $SCF::links_trim{"1 $curkey"} = 1024;	# trim after last href + 1024 chars (issue)
  $SCF::image_max_width{$curkey} = 300;
  $SCF::image_only_site{$curkey} = 0;

  # default limit to articles at the same site.  Test the match itself
  # rather than "defined $1": after a failed match $1 still holds whatever
  # the caller's last successful match captured.
  if ($curkey =~ m,^((http|file)://[^/]*/),i) {
    $SCF::story_limit_to{$curkey} = $1.'.*';
  } else {
    &Scoop::sitewarn_file_line ($confline,
    			"Unsupported URL protocol for URL '".$curkey."'.\n");
  }
}

# ---------------------------------------------------------------------------

# Add an alternative to a parenthesised regexp set and return the new set.
#
# Parameters:
#   1. the existing set, e.g. "(a|b)"; may be undef
#   2. the pattern to add
#
# An existing set ending in ")" is extended in place ("(a|b)" + "c" gives
# "(a|b|c)").  Anything else -- undef, or a value not ending in ")" -- is
# discarded and replaced by a fresh one-element set "(pattern)".
sub AddRegexpToSet {
  my ($set, $addition) = @_;

  if (defined ($set) && $set =~ /\)$/) {
    $set =~ s/\)$/|${addition})/g;
    return $set;
  }

  return "($addition)";
}

# ---------------------------------------------------------------------------

# Make sure the temporary directory ($CF::tmpdir) and the per-user directory
# below it exist, creating them if needed, and record the latter in
# $CF::user_tmpdir.  Dies if either directory cannot be created.
sub make_basic_dirs {
  unless (-d $CF::tmpdir) {
    mkdir ($CF::tmpdir, 0777) or die "failed to mkdir '$CF::tmpdir'\n";
  }

  $CF::user_tmpdir = join ($CF::slash, $CF::tmpdir, "sitescooper_$Scoop::userid");

  unless (-d $CF::user_tmpdir) {
    mkdir ($CF::user_tmpdir, 0777) or die "failed to mkdir '$CF::user_tmpdir'\n";
  }
}

# Prepare the working area below $CF::user_tmpdir for a scooping run:
#   - chdir to $CF::tmpdir;
#   - load saved cookies into $Scoop::cookie_jar if a cookies file exists;
#   - in debug mode open an unbuffered LOGFILE ("log.txt");
#   - create the output, cache and (optional) shared cache directories;
#   - set the cache / already-seen / prc path variables.  With $CF::nowrite
#     the "new" cache dir and already-seen file point at separate "new_*"
#     locations instead of the live ones.
# Dies if the chdir or any mkdir fails.
sub make_dirs {
  chdir ($CF::tmpdir) or die "cannot cd to $CF::tmpdir\n";

  my $cookiefile = "${CF::user_tmpdir}/cookies";
  $Scoop::cookie_jar->load ("${CF::user_tmpdir}/cookies") if (-f $cookiefile);

  if ($CF::debug) {
    open (LOGFILE, "> ${CF::user_tmpdir}${CF::slash}log.txt");
    # switch autoflush on for the log
    select LOGFILE; $| = 1; select STDOUT;
  }

  # output directory, defaulting to "txt" in the user's tmp dir
  $CF::outdir = "${CF::user_tmpdir}${CF::slash}txt" if ($CF::outdir eq '');
  unless (-d $CF::outdir) {
    mkdir ($CF::outdir, 0777) or die "failed to mkdir '$CF::outdir'\n";
  }

  # private cache; the "new" cache is the same directory unless -nowrite
  $CF::cachedir = $CF::user_tmpdir.$CF::slash."cache";
  $CF::newcachedir = $CF::cachedir;
  unless (-d $CF::cachedir) {
    mkdir ($CF::cachedir, 0777) or die "failed to mkdir '$CF::cachedir'\n";
  }

  if (defined $CF::sharedcache && !-d $CF::sharedcache) {
    mkdir ($CF::sharedcache, 0777) or die "failed to mkdir '$CF::sharedcache'\n";
  }

  $CF::alreadyseen = $CF::user_tmpdir.$CF::slash."already_seen.txt";
  $CF::newalreadyseen = $CF::alreadyseen;

  if ($CF::nowrite) {
    $CF::newcachedir = $CF::user_tmpdir.$CF::slash."new_cache";
    unless (-d $CF::newcachedir) {
      mkdir ($CF::newcachedir, 0777) or die "failed to mkdir '$CF::newcachedir'\n";
    }
    $CF::newalreadyseen = $CF::user_tmpdir.$CF::slash."new_already_seen.txt";
  }

  $CF::prcdir = $CF::user_tmpdir.$CF::slash."prc";

  # check for spaces on Win32 -- MakeDocW can't handle them!
  # Thx to wgoosey /at/ servtech.com for spotting this one.
  if ($CF::outputfilter eq 'makedoc' && &Scoop::MyOS eq 'Win32'
	&& $CF::outdir =~ / /)
  {
    warn "

Warning: Sitescooper is installed in a directory containing spaces in the
filename. The MakeDocW conversion tool does not support this, so you may
need to move Sitescooper to another directory, e.g. C:\\Sitescooper, for
this conversion to work!  (This is a bug in MakeDOCW.exe.)

";
  }
}

# ---------------------------------------------------------------------------

# File::Find callbacks: delete the current file if it is a plain file last
# modified more than $CF::expiry_days days ago.
sub expire_cache {
  if (-f $_ && -M $_ > $CF::expiry_days) { unlink; }
}

sub expire_shared_cache {
  if (-f $_ && -M $_ > $CF::expiry_days) { unlink; }
}

# Remove expired files from the private cache and, if configured, from the
# shared cache.  A cache is only swept when its "last_clean.stamp" file is
# missing or older than half of $CF::expiry_days.
sub expire_old_cache_files {
  my $stamp = $CF::cachedir.$CF::slash."last_clean.stamp";
  my $stamp_age = -M $stamp;
  if (!defined ($stamp_age) || $stamp_age > $CF::expiry_days/2.0) {
    find(\&expire_cache, $CF::cachedir);
  }

  return unless (defined $CF::sharedcache);

  $stamp = $CF::sharedcache.$CF::slash."last_clean.stamp";
  $stamp_age = -M $stamp;
  if (!defined ($stamp_age) || $stamp_age > $CF::expiry_days/2.0) {
    find(\&expire_shared_cache, $CF::sharedcache);
  }
}

# ---------------------------------------------------------------------------

# Load the list of already-seen URLs written by a previous run.
#
# Does nothing when $CF::refresh is set.  Otherwise each line of
# $CF::alreadyseen of the form "<url> lastmod=<epoch seconds>" fills
#   %Scoop::already_seen{url}          = 1
#   %Scoop::last_modtime{url}          = modification time
#   %Scoop::oldest_already_seen{host}  = smallest modification time seen for
#                                        that http:// host
# Lines without a trailing "lastmod=<digits>" are ignored.  A missing or
# unreadable file is only reported through Scoop::verbose.
sub read_state {
  if ($CF::refresh == 0) {
    if (!open (IN, "< $CF::alreadyseen")) {
      &Scoop::verbose ("Cannot read $CF::alreadyseen, creating a new one");

    } else {
      my $url;
      my $mod;
      my $urlhost;
      while (<IN>) {
	# Skip on match failure instead of testing "defined $1": after a
	# failed match the capture variables still hold the values of an
	# earlier successful match, which used to add bogus entries.
	next unless (/^(.*) lastmod=(\d+)$/);
	$url = $1; $mod = $2;
	$Scoop::already_seen{$url} = 1;
	$Scoop::last_modtime{$url} = $mod+0;

	if ($url =~ m,http://(\S+?)/,) {
	  $urlhost = $1;
	  # track the oldest modification time per host
	  if (defined($Scoop::oldest_already_seen{$urlhost})
	      ? $Scoop::oldest_already_seen{$urlhost} > $mod : 1)
	  {
	    $Scoop::oldest_already_seen{$urlhost} = $mod;
	  }
	}
      }
      close IN;
    }
  }
}

# ---------------------------------------------------------------------------

# Work out the output file names for every active site and register them
# under a numeric "file key".
#
# Parameters: the list of site URLs (keys into the %SCF::* tables).
#
# For each non-empty, active URL this fills the %Scoop::key2* tables
# (tmp file, output directory, index file name, title, site name, site,
# URL, sync file) and appends the key to @Scoop::filekeys.  A site that
# requires a cookie ($SCF::req_cookie, "host key") which is not present in
# $Scoop::cookie_jar is deactivated and skipped.  Sites whose output
# directory duplicates an earlier one are skipped as well.
sub generate_output_filenames {
  my @sites = @_;
  my %already_done = ();
  my $filekeyacc = 0;
  my $url;

  foreach $url (@sites) {
    next if ($url eq '');

    if (defined $SCF::req_cookie{$url}) {
      # the wanted host/key are passed to the scan callback below through
      # package globals
      ($TmpGlobal::req_cookie_host, $TmpGlobal::req_cookie_key)
      				= split (' ', $SCF::req_cookie{$url});
      $TmpGlobal::gotit = 0;

      # Callback for the cookie jar's scan method; sets $TmpGlobal::gotit
      # when arguments 4 and 1 equal the wanted host and key.
      # NOTE(review): presumably these are the cookie's domain and name as
      # passed by HTTP::Cookies::scan -- confirm the cookie jar class.
      # A named sub declared here is still an ordinary package-level sub.
      sub chk_for_reqd_cookie {
	if ($_[4] eq $TmpGlobal::req_cookie_host &&
	  	$_[1] eq $TmpGlobal::req_cookie_key)
	{
	  $TmpGlobal::gotit = 1;
	}
      }
      $Scoop::cookie_jar->scan (\&chk_for_reqd_cookie);

      if (!$TmpGlobal::gotit) {
	my $line = $SCF::site_defined_at{$url};
	# reduce "file:line" to the quoted file name
	$line =~ s/^(.*):(.*?)$/"$1"/g;
	&Scoop::verbose ("Cookie from $TmpGlobal::req_cookie_host is not imported, not scooping $line.");
	$SCF::active{$url} = 0;
      }

      undef $TmpGlobal::req_cookie_host;
      undef $TmpGlobal::req_cookie_key;
    }

    next unless ($SCF::active{$url} == 1);

    # note: the key counter advances even if the site is skipped as a
    # duplicate further down
    my $filekey = $filekeyacc++;

    my $sitename = $SCF::name{$url};
    if (!defined $sitename) { $sitename = $url; }

    # only the "Site" and "Section" placeholders of the template are
    # handled in this function
    my $filedesc = $CF::filename_template;
    $filedesc =~ s/Site/${sitename}/g;
    $filedesc =~ s/Section//g;		# backwards compat

    # trim out dangerous chars (modified to allow most 8-bit stuff, thx to
    # <avatar /at/ deva.net>.
    $filedesc =~ s/[\s\;\*\$\%\!\&\<\>\|\?\'\"\`\]\[\{\}\:\\\/\000-\037]+/_/g;

    # strip leading and trailing blanks/underscores
    $filedesc =~ s/^[ _]+//g; $filedesc =~ s/[ _]+$//g;

    if (&Scoop::MyOS eq 'Mac') {
      # try to limit the filename to 32 characters
      # (the name is cut to 26 characters here; suffixes are added later)
      $filedesc =~ s/^(.{26}).*$/$1/g;
    }

    my $outdir = $CF::outdir.$CF::slash.$filedesc;
    my $outidxfile;

    if (&Scoop::writing_html) {
      $outidxfile = $filedesc.'.html';
    } else {
      $outidxfile = $filedesc.'.txt';
    }

    # two sites mapping to the same output directory: keep the first only
    next if (defined $already_done{$outdir});
    $already_done{$outdir} = 1;

    my $outtmp = $CF::outdir.$CF::slash.$filedesc.'.tmp';

    my $prctitle = $CF::prc_title;
    $prctitle =~ s/Site/${sitename}/g;
    $prctitle =~ s/Section//g;		# backwards compat
    $prctitle =~ s/^[ _]+//g; $prctitle =~ s/[ _]+$//g;

    $Scoop::key2tmp{$filekey} = $outtmp;
    $Scoop::key2outdir{$filekey} = $outdir;
    $Scoop::key2outidxfile{$filekey} = $outidxfile;
    $Scoop::key2title{$filekey} = $prctitle;
    $Scoop::key2sitename{$filekey} = $sitename;
    $Scoop::key2site{$filekey} = $url;
    $Scoop::key2url{$filekey} = $url;
    push (@Scoop::filekeys, $filekey);

    if ($CF::dumpprc) {
      $Scoop::key2syncfile{$filekey} = $outtmp;		# reuse it!
    } else {
      $Scoop::key2syncfile{$filekey} =
		  $CF::prcdir.$CF::slash.$filedesc.'.prc';
    }
  }
}

# ---------------------------------------------------------------------------

# Scoop every registered site in turn.
#
# Parameters:
#   $convert_now - optional flag stored on each robot as {convert_now};
#                  defaults to 0.
#
# When $CF::use_lwp_par is set, every site's front page URL is first
# registered with Sitescooper::ParProxy so the pages can be preloaded in
# parallel.  Then a Sitescooper::Robot is built per file key, given its
# output and cache locations, and run via scoop_site().
sub get_all_sites {
  my $convert_now = shift;
  my $filekey;
  $convert_now = 0 unless (defined $convert_now);

  # optimise by pre-getting all front pages using parallel
  # requests, if LWP::Parallel is installed.
  #
  if (!$CF::use_lwp_par) {
    &Scoop::dbg ("LWP::Parallel module not available, not preloading");

  } else {
    &Scoop::dbg ("using LWP::Parallel to preload front pages");
    &Sitescooper::ParProxy::set_caches ();
    foreach $filekey (@Scoop::filekeys) {
      &Sitescooper::ParProxy::register ($Scoop::key2url{$filekey});
    }
  }

  foreach $filekey (@Scoop::filekeys) {
    &Sitescooper::ParProxy::check_for_responses;

    my $scooper = new Sitescooper::Robot ($Scoop::key2url{$filekey},
    		$Scoop::key2outdir{$filekey}, $Scoop::key2outidxfile{$filekey});

    # per-site naming details
    @{$scooper}{qw(outtmp site sitename prctitle filekey convert_now)} = (
	$Scoop::key2tmp{$filekey},
	$Scoop::key2site{$filekey},
	$Scoop::key2sitename{$filekey},
	$Scoop::key2title{$filekey},
	$filekey,
	$convert_now,
    );

    # cache locations
    @{$scooper}{qw(cachedir newcachedir sharedcachedir)} =
	($CF::cachedir, $CF::newcachedir, $CF::sharedcache);

    $scooper->scoop_site();
  }
}

# ---------------------------------------------------------------------------

# Run the configured output filter over one site's scooped output and
# deliver the result.
#
# Parameters:
#   $filekey - key into the %Scoop::key2* tables identifying the output.
#   $scfkey  - key into the %SCF::* tables, used here only for the
#              __IMAGE_MAX_WIDTH__ substitution.
#
# Does nothing unless $CF::use_convert_tool is set and a sync file is known
# for the key.  The pseudo-filters '__cat__' and '__path__' print the index
# file or its path and return.  Otherwise a conversion command is built for
# makedoc, isilo, richreader or a user-supplied "cmd: ..." template and run
# from within the output directory (on the Mac it is only printed).  The
# resulting file is then dumped to STDOUT, moved to an install directory,
# handed to an install application/module, or just announced.
# Sets $Scoop::failed_to_cvt when the conversion command fails.
sub convert_output {
  my $filekey = shift;
  my $scfkey = shift;

  return unless ($CF::use_convert_tool);

  my $syncfile = $Scoop::key2syncfile{$filekey};
  return unless defined $syncfile;

  my $outdir = $Scoop::key2outdir{$filekey};
  my $outidxfile = $Scoop::key2outidxfile{$filekey};
  my $prctitle = $Scoop::key2title{$filekey};
  # remove any stale conversion result first
  unlink $syncfile;

  my $idx = $outdir.$CF::slash.$outidxfile;

  if ($CF::outputfilter eq '__cat__') {
    # copy the index file to STDOUT, then delete it
    open (IN, "< ".$idx);
    while (<IN>) { print STDOUT; }
    close IN;
    unlink $idx;
    return;
  }
  if ($CF::outputfilter eq '__path__') {
    print STDOUT "$idx\n";
    return;
  }

  # NOTE(review): the commands below are built by string interpolation and
  # run through the shell; file names or titles containing quote characters
  # will break the quoting -- confirm whether inputs are trusted.
  my $cmd;
  if ($CF::outputfilter eq 'makedoc') {
    $cmd = "$CF::makedoc \"$idx\" \"".$syncfile."\" '".$prctitle."'";

  } elsif ($CF::outputfilter eq 'isilo') {
    if (&Scoop::MyOS eq 'Win32' && $CF::isilo =~ /isilow32/i) {
      # NOTE(review): this appends to the global on every call, so ' -u'
      # accumulates once per converted site -- looks harmless, confirm.
      $CF::isiloargs .= ' -u';		# doesn't support the other args AFAIK.
    }

    if ($CF::fileperpage) {
      $cmd = "$CF::isilo $CF::isiloargs ".
      			"$CF::isilomultipageargs \"".$idx."\"";
    } else {
      $cmd = "$CF::isilo $CF::isiloargs \"".$idx."\"";
    }

    # UNIX iSilo utils take the output filename as well; Win32
    # doesn't need it as it installs as it goes along.
    if (&Scoop::MyOS ne 'Win32') {
      $cmd .= " \"".$syncfile."\"";
    }

    # Win32 iSilo only takes the -u arg for the GUI version, not the
    # command line one. Strip the arg for the command-line converter.
    # Also add the output filename.
    if (&Scoop::MyOS eq 'Win32' && $cmd =~ /isiloc32/i) {
      $cmd =~ s/ -u / /g;
      $cmd .= " \"".$syncfile."\"";
    }

  } elsif ($CF::outputfilter eq 'richreader') {
    $cmd = "$CF::richreader $CF::richargs \"".$idx."\"";

  } elsif ($CF::outputfilter =~ /^cmd: /) {
    # user-defined command: everything after "cmd: " is the template
    $cmd = $';
    $cmd =~ s/__SCOOPFILE__/${idx}/g;
    $cmd =~ s/__SYNCFILE__/${syncfile}/g;
    $cmd =~ s/__TITLE__/${prctitle}/g;

  } else {
    die "bad output filter $CF::outputfilter\n";
  }

  # substitute in the parameters taken from the site file
  # these should be set in the SetDefaultConfig function to avoid undefs.
  $cmd =~ s/__IMAGE_MAX_WIDTH__/${SCF::image_max_width{$scfkey}}/g;

  my $keep_tmps = ($CF::debug || $CF::keep_tmps);

  if (&Scoop::MyOS ne 'Mac') {
    &add_cmd_dir_to_path ($cmd);
    &Scoop::verbose ("Running: $cmd");

    # cd to conversion dir for command
    my $realwd = getcwd; chdir $outdir;
    system $cmd;
    chdir $realwd;			# back again

    # output a newline, MakeDoc won't do it itself.
    if (&Scoop::MyOS eq 'UNIX') { &Scoop::verbose ("\n"); }

    # $? still holds the exit status of the system() call above, provided
    # nothing called in between has run another child process
    if (($? >> 8) != 0) {

      # work around a bug in iSilo converter on Win32 -- it
      # reports failure even when the conversion went fine.
      # (TODO: check if still the case)
      if (&Scoop::MyOS ne 'Win32' ||
	    $CF::outputfilter ne 'isilo')
      {
	warn "command failed: $cmd\n";
	$Scoop::failed_to_cvt = 1;
      }
    }

  } else {
    # system is broken on MacOS, so print the required command 
    #so it can be run easily from MPW shell
    if (!defined $Scoop::macos_system_warning_written) {
      warn "[Warning: not using the broken MacPerl system() call. ".
	    "You will need to\ncut and paste the command ".
	    "lines yourself!]\n\n";
      $Scoop::macos_system_warning_written = 1;
    }
    print $cmd, "\n";
    # the user still has to run the command by hand, so keep its input
    $keep_tmps = 1;
  }

  if (!$keep_tmps) {
    &File::Path::rmtree ($outdir);		# don't keep .txt files around
  }

  if ($CF::dumpprc)
  {
    # If we're dumping, read in the generated file and write it to
    # STDOUT.
    open (IN, "< ".$syncfile);
    while (<IN>) { print STDOUT; }
    close IN;
    unlink $syncfile;

  } elsif (defined $CF::pilotinstdir)
  {
    # if installing to a dir, install it and output the filename.
    #
    my $filedesc = basename ($Scoop::key2syncfile{$filekey});
    my $instfile = $CF::pilotinstdir.$CF::slash.$filedesc;
    if ($syncfile ne $instfile) {
      move ($syncfile, $instfile) or
      	warn "failed to move $syncfile to $instfile\n";
    }
    print "Created: ".$instfile."\n";

  } elsif (defined $CF::pilotinstapp)
  {
    # If installing using an app, run it as appropriate, or for an
    # ***ADD_TO_MANIFEST*** install method, write the filename to
    # that manifest file.
    #
    if ($CF::pilotinstapp =~ /^\*\*\*ADD_TO_MANIFEST\*\*\* (.*)$/) {
      if (!open (OUT, ">> $1")) {
	warn "cannot write to $1\n";
      } else {
	print OUT $syncfile."\n"; close OUT;
      }

    } elsif ($CF::pilotinstapp eq '***USE_MODULE***') {
      my $instret = $CF::installer->install_file ($syncfile);
      &Scoop::dbg ("PDA::PilotInstall install_file returned $instret");
      # a return value of 1 is treated as success: the file is removed
      if ($instret == 1) {
	unlink $syncfile;
      }

    } else {
      $cmd = "$CF::pilotinstapp $syncfile";
      &add_cmd_dir_to_path ($cmd);
      &Scoop::verbose ("Running: $cmd");
      system $cmd;

      if (($? >> 8) != 0) {
	warn "command failed: $cmd\n";
      } else {
	unlink $syncfile;
      }
    }
  } else
  {
    # just output the name of the file, in the sitescooper tmp directory,
    # for other apps that may want to collect these files and store
    # them somewhere.
    #
    print "Created: ".$syncfile."\n";
  }
}

# On Win32, make sure the directory of the program named in a command line
# is part of the PATH environment variable.
#
# Parameters:
#   $cmd - a command line; the program may be given bare, with a path, or as
#          a double-quoted path containing spaces.
#
# Nothing happens on other platforms, or when the program is given without
# any directory part.  The directory is appended to PATH only if it is not
# already one of its entries.
sub add_cmd_dir_to_path {
  local ($_);
  my $cmd = shift;

  # Perl on some Win32 platforms seems to require that the binary be
  # in the PATH.
  #
  if (&Scoop::MyOS eq 'Win32') {
    $_ = $cmd;
    if (!/[\\\/]/) { return; }	# foo arg ...
    if (/^\"([^\"]+)\"/) { $cmd = $1; }	# "C:\Program Files\foo.exe" arg ...
    elsif (/^(\S+)\s/) { $cmd = $1; }	# C:\windows\foo.exe arg ...
    else { $cmd = $_; }			# C:\windows\foo.exe

    $cmd =~ s,[\\/][^\\/]+\s*$,,g;		# trim the filename
    my $cmdpat = $cmd; $cmdpat =~ s,(\W),\\$1,g;	# escape funny chars

    my $path = $ENV{'PATH'} || $ENV{'Path'} || $ENV{'path'};

    # An entry may also be the very first one in PATH, with no ";" in front
    # of it; requiring a leading ";" made such a directory get appended
    # again on every call.
    if (!defined $path || $path !~ /(^|;)${cmdpat}(;|$)/) {
      &Scoop::dbg ("Adding directory to command path: $cmd");
      $path .= ";$cmd"; $ENV{'PATH'} = $path;
    }
  }
}

# ---------------------------------------------------------------------------

# Save the already-seen URL list and commit renamed cache files.
#
# Nothing is written if a conversion failed ($Scoop::failed_to_cvt), so the
# user does not lose stories because of a converter problem.
#
# Without $CF::refresh the whole of %Scoop::already_seen is rewritten to
# $CF::newalreadyseen, dropping entries that are both older than two months
# (60 days) and older than the oldest entry recorded for their host in
# %Scoop::oldest_already_seen.  With $CF::refresh only the URLs in
# @Scoop::seen_this_time are appended.  Finally every pending rename in
# %CF::caches_to_rename (from => to) is carried out.
sub write_state {
  if (!$Scoop::failed_to_cvt) {
    # only write alreadyseen if the files converted successfully, otherwise
    # the user may lose some recent news due to a makedoc screwup.
    #
    my $towrite = '';
    my $now = time;
    my $twomonthsago = $now - (24*60*60*30*2);
    my $mod;
    my $urlhost;

    # keep the already-seen list small by cutting out old entries.  We
    # define "old entries" as (a) older than 2 months and (b) older than
    # the oldest link we saw in today's scooping run.
    #
    if (!$CF::refresh) {
      &Scoop::dbg ("trying to cut old entries from already-seen URL cache");

      foreach $_ (keys %Scoop::already_seen) {
	# NOTE(review): the match result is not checked.  For a URL without
	# an "http://host/" part $1 keeps the value of an earlier successful
	# match, so such a URL is either dropped from the file ($1 undefined)
	# or judged against another URL's host -- confirm which behaviour is
	# intended for file:// URLs before changing this.
	m,http://(\S+?)/,; $urlhost = $1; next unless defined ($urlhost);
	if (defined $Scoop::last_modtime{$_} &&
	  			defined $Scoop::oldest_already_seen{$urlhost})
	{
	  $mod = $Scoop::last_modtime{$_};
	  if ($twomonthsago > $mod &&
	    		$Scoop::oldest_already_seen{$urlhost} > $mod)
	  {
	    &Scoop::dbg ("stripping old entry: $_ lastmod=$mod (".&Scoop::time2datestr($mod).")");
	    next;
	  }
	}
	# entries without a recorded modification time are stamped "now"
	$towrite .= $_." lastmod=".(defined $Scoop::last_modtime{$_}
		    ? $Scoop::last_modtime{$_} : $now)."\n";
      }

      if (open (OUT, "> $CF::newalreadyseen")) {
	print OUT $towrite;	# do it as one big atomic write, for safety
	close OUT || warn "Cannot rewrite $CF::newalreadyseen\n";
      } else {
	warn "Cannot rewrite $CF::newalreadyseen\n";
      }

    } else {
      # it's small enough -- so we can just append to it.
      &Scoop::dbg ("appending already-seen URLs to $CF::newalreadyseen");

      foreach $_ (@Scoop::seen_this_time) {
	$towrite .= $_." lastmod=".(defined $Scoop::last_modtime{$_}
		    ? $Scoop::last_modtime{$_} : $now)."\n";
      }

      if (open (OUT, ">> $CF::newalreadyseen")) {
	print OUT $towrite;	# do it as one big atomic write, for safety
	close OUT || warn "Cannot append to $CF::newalreadyseen\n";
      } else {
	warn "Cannot append to $CF::newalreadyseen\n";
      }
    }

    # move freshly written cache files to their final names
    my ($from, $to);
    while (($from,$to) = each %CF::caches_to_rename) {
      &Scoop::dbg ("Saving new cache file: $to");
      rename ($from, $to) or warn ("rename $from -> $to failed\n");
    }
  }
}

# ---------------------------------------------------------------------------

# Try to drop the PPP connection, using whatever mechanism is known for
# the platform reported by Scoop::MyOS:
#   Win32 -- not implemented, only a warning is printed;
#   UNIX  -- runs "killall pppd";
#   Mac   -- asks MacPerl to run the AppleScript "ppp disconnect", and
#            dies if that cannot be done.
# Any other platform is silently ignored.
sub disconnect {
  my $platform = &Scoop::MyOS;

  if ($platform eq 'Win32') {
    warn "disconnect on Win32 not implemented yet -- sorry.\n";

  } elsif ($platform eq 'UNIX') {
    warn "disconnect: trying to kill pppd.\n";
    system ("killall pppd");	# probably won't work.

  } elsif ($platform eq 'Mac') {
    # a string eval, so MacPerl is only compiled in when we get here.
    eval '
      use MacPerl;
      MacPerl::DoAppleScript("ppp disconnect");
    1' or die "Cannot disconnect, MacPerl module not found: $@\n";
  }
}

# ---------------------------------------------------------------------------

# Record a warning in the log (prefixed with "Warning: ", with one
# trailing newline trimmed), then hand the original, untrimmed arguments
# on to main::scoop_warn.
sub warn_log {
  my $text = join ('', @_);
  chomp ($text);
  &log ("Warning: ", $text);
  return &main::scoop_warn (@_);
}

# Record a fatal error in the log (prefixed with "Fatal: ", with one
# trailing newline trimmed), then hand the original, untrimmed arguments
# on to main::scoop_die.
sub die_log {
  my $text = join ('', @_);
  chomp ($text);
  &log ("Fatal: ", $text);
  return &main::scoop_die (@_);
}

# Print the arguments, followed by a newline, to the LOGFILE handle.
# Nothing is written when fileno reports no descriptor for LOGFILE
# (i.e. the handle is not open).
#
# NB: inside this package, call this as "&log (...)" -- a plain
# "log (...)" would be parsed as Perl's builtin logarithm function.
sub log {
  print LOGFILE @_, "\n" if (defined fileno LOGFILE);
}

# Write an entry to the Scoop::JOURNAL handle, if it is open.
#
# The first argument is a tag; the remaining arguments are concatenated,
# and the start of every line of the result is prefixed with
# "<tag>:<TAB>".  A newline is appended to the entry.
sub journal {
  if (defined fileno Scoop::JOURNAL) {
    my ($tag, @text) = @_;
    (my $entry = join ("", @text)) =~ s/^/$tag:\t/gm;
    print Scoop::JOURNAL $entry, "\n";
  }
}

# Debug output: only active when $CF::debug is non-zero.  The message is
# logged with a "debug: " prefix (one trailing newline trimmed), and the
# original arguments are then passed to main::dbg.
sub dbg {
  if ($CF::debug != 0) {
    my $line = join ('', "debug: ", @_);
    chomp ($line);
    &log ($line);
    &main::dbg (@_);
  }
}

# Progress output: only active when $CF::verbose is true.  The message
# is logged (one trailing newline trimmed), and the original arguments
# are then passed to main::verbose.
sub verbose {
  if ($CF::verbose) {
    my $line = join ('', @_);
    chomp ($line);
    &log ($line);
    &main::verbose (@_);
  }
}

# Warn about a problem in a site file.
#
# The first argument is a "path/to/file:NNN" position; it is cut down to
# the bare file name (directories and the trailing line number are
# dropped).  The remaining arguments make up the message.  The warning is
# printed with warn, and the trimmed name plus message are also passed to
# main::sitewarn.
sub sitewarn_file_line {
  my ($fname, @message) = @_;

  # strip everything up to the last "/", "\" or ":" separator, and the
  # ":NNN" line-number suffix.
  $fname =~ s,^.*[\/\\:]([^\/\\:]+?):\d+?$,$1,o;

  warn "Site \"$fname\": ".join('', @message)."\n";
  &main::sitewarn ($fname, @message);
}

# Convenience form of sitewarn_file_line: the position is taken from
# $sitewarn_current_site_line (presumably kept up to date by the
# site-file reader -- it is not set anywhere in this part of the file).
sub sitewarn {
  return &Scoop::sitewarn_file_line ($sitewarn_current_site_line, @_);
}

# Pass all arguments straight through to main::cleanexit, which is
# provided by the calling script.
sub cleanexit {
  return &main::cleanexit (@_);
}

# Turn a link into an absolute URL.
#
# Arguments: the URL of the page the link was found on, and the link
# itself.  The link is first tidied up (surrounding quotes, escaped
# quotes and "&amp;" escapes are removed).  A link carrying any scheme
# other than http: or file: is returned as it stands after that tidying;
# everything else is resolved against the page URL with URI::URL.
sub AbsoluteURL {
  my ($base, $link) = @_;

  $link =~ s/^\"//;  $link =~ s/\"$//;		# trim quotes if necessary
  $link =~ s/^\'//;  $link =~ s/\'$//;		# dodgy quotes
  $link =~ s/^%22//; $link =~ s/%22.*?$//;	# trim escaped quotes (!!)
  $link =~ s/&amp;/&/g;		# HTML escapes are not supposed to be in URLs

  # non-HTTP urls get ignored; don't get URI::URL involved, it'll crash
  if ($link =~ /^[^\/]+:/ && $link !~ /^(http|file):/) {
    return $link;
  }

  require URI::URL;
  return URI::URL->new ($link, $base)->abs->as_string;
}

# A simpler form of AbsoluteURL, used for StoryURL lines.  The real thing
# would escape metacharacters, which screws up regexp patterns.
#
# Arguments: the URL of the referring page, and the URL pattern.
# Surrounding double quotes are trimmed and the [[MM]]-style date
# keywords are expanded.  A pattern that starts with "/" then gets the
# "scheme://host" part of the referring page put in front of it; fully
# qualified patterns, and anything else, come back otherwise unchanged.
sub AddHostToURL {
  my ($base, $link) = @_;

  carp ("url not defined in AddHostToURL") unless defined $link;

  # trim quotes if necessary
  $link =~ s/^"//;
  $link =~ s/"$//;

  # allow [[MM]] etc. keywords in these patterns
  $link = &expand_url_magic ($link);

  # only host-relative patterns need work; fully-qualified ones are
  # left as they are.
  if ($link !~ m,^[^/]+://, && $link =~ m,^/,) {
    if ($base =~ m,^([^/]+://[^/]+)/,) {
      $link = $1.$link;
    }
  }
  return $link;
}

# Return the given URL with any "#anchor" part removed, i.e. everything
# before the first "#".  A URL without "#" is returned unchanged.
#
# An undefined URL produces a warning naming the caller, and undef is
# returned.
#
# This deliberately uses index/substr rather than a match and $`:
# mentioning $` anywhere slows down every regexp in the program on
# perls older than 5.20.
sub URLWithoutAnchor {
  my $url = shift;

  if (!defined $url) {
    warn ("undef in URLWithoutAnchor from: ".join(' ', caller()));
    return undef;		# nothing to trim; callers get undef back, as before
  }

  my $hash_at = index ($url, '#');
  return ($hash_at < 0) ? $url : substr ($url, 0, $hash_at);
}

# Return the anchor part of a URL: everything from the first "#" to the
# end of the string, including the "#" itself.  Returns the empty string
# if the URL has no anchor, or is undefined.
#
# This deliberately uses index/substr rather than a match and $':
# mentioning $' anywhere slows down every regexp in the program on
# perls older than 5.20.
sub URLAnchor {
  my $url = shift;
  return "" unless defined $url;

  my $hash_at = index ($url, '#');
  return ($hash_at < 0) ? "" : substr ($url, $hash_at);
}

# Map a month number (1..12, with or without a leading zero) to its
# three-letter English abbreviation.  Slot 0 holds the placeholder "x",
# so that month numbers can be used directly as the index.
sub mm_to_monthname {
  my ($mm) = @_;
  my @names = ('x',
	'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
	'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec');
  return $names[$mm];
}

# Return (minutes, hour, weekday abbreviation) in local time for the
# given epoch time, or for the current time if the argument is missing
# or undefined.
sub get_extra_date {
  my ($time) = @_;
  $time = time unless defined $time;

  # localtime fields: 1 = minutes, 2 = hour, 6 = day of week (0 = Sunday)
  my ($min, $hr, $wday) = (localtime ($time))[1, 2, 6];

  my @daynames = qw(Sun Mon Tue Wed Thu Fri Sat);
  return ($min, $hr, $daynames[$wday]);
}

# Return (day of month, month 1..12, four-digit year, month abbreviation)
# in local time for the given epoch time, or for the current time if the
# argument is missing or undefined.
sub get_date {
  my ($time) = @_;
  $time = time unless defined $time;

  my ($mday, $mon, $year) = (localtime ($time))[3, 4, 5];
  $mon  += 1;		# localtime counts months from 0
  $year += 1900;	# ... and years from 1900

  return ($mday, $mon, $year, &mm_to_monthname ($mon));
}

# Format an epoch time as "Mon D YYYY" (e.g. "Jan 5 2000") in local time.
# The day is not zero-padded.
sub time2datestr {
  my ($time) = @_;
  # get_date returns (day, month number, year, month name); the month
  # number is not needed here.
  my ($dd, $year, $monname) = (&get_date ($time))[0, 2, 3];
  return join (' ', $monname, $dd, $year);
}

# Test a URL against a URL pattern.
#
# The pattern is used as a regular expression; it has to match the URL
# from its start up to either the end of the URL or the start of a
# "#anchor".
#
# Returns what the underlying match returns: true/false in scalar
# context, and in list context the captured groups (or (1) if the
# pattern has none) on success, or the empty list on failure.
#
# If either argument is undefined, a warning naming the caller is issued
# and undef is returned.  If the pattern is not a valid regular
# expression, a warning is issued and the URL is treated as not matching,
# instead of the whole run dying inside the match.
sub match_url {
  my ($url, $pat) = @_;

  if (!defined $url || !defined $pat) {
    warn ("undef in match_url from: ".join(' ', caller()));
    return undef;
  }

  # Do the match in list context, so that we can give the caller exactly
  # what a direct match would have returned in either context: the list
  # is non-empty if, and only if, the match succeeded.
  my @matched;
  {
    local $@;			# don't clobber the caller's $@
    my $pattern_ok = eval {
      @matched = ($url =~ m{^${pat}(?:\#|$)});
      1;
    };
    if (!$pattern_ok) {
      warn ("bad URL pattern \"$pat\" in match_url: $@");
    }
  }

  wantarray ? @matched : (@matched ? 1 : '');
}

# Today's date, as cached by expand_url_magic.  These start out
# undefined; expand_url_magic fills them in from get_date the first time
# it needs them (it checks $match_url_yyyy) and reuses them after that.
BEGIN {
  $match_url_dd = $match_url_mm =
  	$match_url_yyyy = $match_url_Mstr = undef;
};

# Expand the date keywords that may be used in URLs and URL patterns,
# using today's date:
#
#   [[YYYY]]  four-digit year		[[YY]]   year without the century
#   [[MM]]    two-digit month		[[M]]    month, no leading zero
#   [[Mon]]   month name, e.g. "Jan"	[[mon]], [[MON]]  ditto, lower/upper case
#   [[DD]]    two-digit day		[[D]]    day, no leading zero
#
# The month keywords also take an offset, e.g. [[MM-1]] or [[Mon+2]]; the
# result wraps around within 1..12.
#
# A URL containing no "[[" is returned unchanged.  Today's date is
# worked out on the first call that needs it, and reused afterwards.
sub expand_url_magic {
  my ($url) = @_;
  return $url unless ($url =~ /\[\[/);

  unless (defined $match_url_yyyy) {
    ($match_url_dd, $match_url_mm, $match_url_yyyy, $match_url_Mstr)
				= &get_date(undef);
    ($match_url_yy = $match_url_yyyy) =~ s/^\d\d//;	# trim century
    $match_url_mm = sprintf ("%02d", $match_url_mm);
    $match_url_dd = sprintf ("%02d", $match_url_dd);
  }

  # years
  $url =~ s/\[\[YYYY\]\]/$match_url_yyyy/g;
  $url =~ s/\[\[YY\]\]/$match_url_yy/g;

  # numeric months; adding 0 gives the single-digit form where possible
  $url =~ s{\[\[MM([\+\-]\d+|)\]\]}{ &offset_month($match_url_mm, $1) }ge;
  $url =~ s{\[\[M([\+\-]\d+|)\]\]}{ &offset_month($match_url_mm, $1) + 0 }ge;

  # month names, in the three supported capitalisations
  $url =~ s{\[\[Mon([\+\-]\d+|)\]\]}{
    &mm_to_monthname (&offset_month($match_url_mm, $1))
  }ge;
  $url =~ s{\[\[mon([\+\-]\d+|)\]\]}{
    lc (&mm_to_monthname (&offset_month($match_url_mm, $1)))
  }ge;
  $url =~ s{\[\[MON([\+\-]\d+|)\]\]}{
    uc (&mm_to_monthname (&offset_month($match_url_mm, $1)))
  }ge;

  # days
  $url =~ s/\[\[DD\]\]/$match_url_dd/g;
  $url =~ s{\[\[D\]\]}{ $match_url_dd + 0 }ge;

  return $url;
}

# Look up the value of a site-file parameter for a given URL.
#
# Arguments:
#   $parmname -- name of the parameter; its values live in the hash
#                %SCF::<parmname>, which is reached by symbolic reference
#                (much faster than the string eval this code once used).
#   $key      -- the key of the current site in that hash.  It may start
#                with a "<digits><space>" prefix; if so, the same prefix
#                is put in front of the patterns tried below.
#   $url      -- the URL being processed; used to pick the ExceptionURL
#                and layout patterns that apply.
#
# The first defined value found wins, searching in this order:
#   1. patterns in @SCF::exceptions that match the URL;
#   2. the site's own entry, under $key;
#   3. patterns in @SCF::layouts that match the URL.
#
# Returns the value, or undef if there is none.  An undefined $key is
# reported with carp, and undef is returned.
sub get_layout_param {
  my ($parmname, $key, $url) = @_;

  if (!defined $key) {
    carp "get_layout_param with undefined key: $parmname $url";
    return undef;
  }

  # use a capture rather than $&, which would slow down every regexp in
  # the program on perls older than 5.20.
  my $keyprefix = ($key =~ /^(\d+ )/) ? $1 : '';

  if ($SCF::have_exceptions || $SCF::have_layouts) { study $url; }

  no strict 'refs';		# %SCF::<parmname> is looked up by name
  my $values = \%{"SCF::${parmname}"};

  # return the value for the first of the given patterns that both
  # matches the URL and has a value defined, or undef.
  my $first_by_pattern = sub {
    foreach my $pat (@_) {
      next unless (Scoop::match_url ($url, $pat));
      my $value = $values->{$keyprefix.$pat};
      return $value if (defined $value);
    }
    return undef;
  };

  # Highest priority, check for an ExceptionURL rule.
  if ($SCF::have_exceptions) {
    my $value = $first_by_pattern->(@SCF::exceptions);
    return $value if (defined $value);
  }

  # then a parameter defined in the site file for this site.
  if (defined $values->{$key}) {
    return $values->{$key};
  }

  # nope -- now check the layouts.
  if ($SCF::have_layouts) {
    my $value = $first_by_pattern->(@SCF::layouts);
    return $value if (defined $value);
  }

  # "return undef" rather than a bare return, so that callers using
  # list context keep getting a single undef, as they always have.
  return undef;
}

# Add an offset to a month number and return the result as a two-digit
# string ("01".."12").
#
# Arguments: the month (1..12, leading zero allowed) and the offset, a
# signed number such as "+1" or "-2", or the empty string for no offset.
# Results outside 1..12 wrap around, so month 1 with offset -1 gives
# "12".  Only the month is handled; there is no year to adjust here.
sub offset_month {
  my ($month, $delta) = @_;

  $month += $delta if ($delta ne '');

  unless ($month >= 1 && $month <= 12) {
    # shift to 0..11, wrap, shift back.  Perl's % never goes negative
    # for a positive right-hand side.
    $month = (($month + 11) % 12) + 1;
  }

  return ($month =~ /^..$/) ? $month : "0$month";
}

# True if the selected output style ($CF::outstyle) is $CF::OUT_DOC.
sub writing_doc {
  return ($CF::outstyle == $CF::OUT_DOC);
}

# True if the selected output style ($CF::outstyle) is $CF::OUT_HTML.
sub writing_html {
  return ($CF::outstyle == $CF::OUT_HTML);
}

# True if the selected output style ($CF::outstyle) is $CF::OUT_TEXT.
sub writing_text {
  return ($CF::outstyle == $CF::OUT_TEXT);
}

# This sub is never called -- hence the name ;)  It exists only to
# mention a handful of package variables a second time, which keeps -w
# from complaining that they are "used only once".
sub never_called {
  $Scoop::cgimode = join ('',
	$Scoop::bookmark_char,
	$Scoop::add_closing_tags,
	$Scoop::sitewarn_current_site_line,
	$Scoop::strip_empty_tag_sets,
	$Scoop::pua,
	$SCF::url_title);
}

# Simple wrapper around $^O and Config to provide win/mac/unix
# differentiation, without worrying about which variant of UNIX it is.
#
# Returns one of 'Win32', 'VMS', 'Mac', 'OS2' or 'UNIX'.  The answer is
# worked out once and cached in $Scoop::MY_OS.
#
# 'Mac' means classic MacOS, as run by MacPerl.  Mac OS X reports itself
# as "darwin" and is treated as 'UNIX' -- it has to be checked for
# before the Windows test, since "darwin" contains "win".  (Cygwin also
# contains "win", and is still classed as 'Win32', as it always was.)
#
sub MyOS {
  return $Scoop::MY_OS if (defined ($Scoop::MY_OS));

  # FIGURE OUT THE OS WE'RE RUNNING UNDER
  # Some systems support the $^O variable.  If not available then require()
  # the Config library.  [nicked from CGI.pm -- jmason]

  my $os = $^O;
  unless ($os) {
    require Config;
    $os = $Config::Config{'osname'};
  }
  $os = '' unless (defined $os);	# unknown: falls through to 'UNIX'

  if ($os =~ /darwin/i) {
    $os = 'UNIX';
  } elsif ($os =~ /win/i) {
    $os = 'Win32';
  } elsif ($os =~ /vms/i) {
    $os = 'VMS';
  } elsif ($os =~ /mac/i) {
    $os = 'Mac';
  } elsif ($os =~ /os2/i) {
    $os = 'OS2';
  } else {
    $os = 'UNIX';
  }

  return ($Scoop::MY_OS = $os);
}

1;

#===========================================================================

# TODO:
#
# add a story inclusion/exclusion "grep" mode
# URLs at end like [1] this
# finish CGI support
#
#---------------------------------------------------------------------------
# vim:sw=2:tw=74: