#===========================================================================

package Sitescooper::Robot;

require Exporter;
use Carp;
use Sitescooper::StripTablesFilter;

@ISA = qw(Exporter);
@EXPORT= qw();
$VERSION = "0.1";
sub Version { $VERSION; }

sub new {
  my $class = shift; $class = ref($class) || $class;

  my $url = shift;
  my $outfile = shift;
  my $outidxfile = shift;

  my $self = {
    'url'		=> $url,
    'outfile'		=> $outfile,
    'outtmp'		=> $outfile.".tmp",	# default value
    'outidxfile'	=> $outidxfile,
    'prctitle'		=> undef,
    'sitename'		=> undef,
    'site'		=> undef,

    'cachedir'		=> undef,
    'newcachedir'	=> undef,
    'filekey'		=> undef,

    'convert_now'	=> 0,

    'sitekey'		=> undef,

    'warn_about_ext_links' => 0,	# turned on where necessary

    'page_to_tmpfile'	=> { },
    'output_file'	=> { }
  };

  bless ($self, $class);
  $self->clear_page_tmpfiles();
  $self;
}
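
# Typical usage (a sketch only -- the real caller lives in the main
# sitescooper script, so the field assignments below are illustrative):
#
#   my $robot = Sitescooper::Robot->new ($url, $outfile, $outidxfile);
#   $robot->{sitename} = $name;       # hypothetical values
#   $robot->{prctitle} = $title;
#   $robot->scoop_site ();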

sub dont_convert {
  my $self = shift;

  if (defined $self->{filekey}) {
    undef $Scoop::key2syncfile{$self->{filekey}};
  } else {
    carp "cannot block conversion of site without filekey!";
  }
}

sub scoop_site {
  local ($_);
  my $self = shift;
  my $url = $self->{url};

  if (!defined $self->{sitename}) { $self->{sitename} = $url; }

  # This apparently is needed on MacOS. Sounds unlikely, but there it
  # is...
  if (&Scoop::MyOS eq 'Mac') {
    my $parentdir = Scoop::dirname($self->{outfile});
    if (!-d $parentdir) { mkdir ($parentdir, 0755); }
  }

  $_ = $SCF::site_defined_at{$url}; /^(.*):/; my $site_file_name = $1;
  &Scoop::verbose ("SITE START: now scooping site \"$site_file_name\".");

  &Scoop::dbg ("tmp dir: $self->{outtmp}, output dir: $self->{outfile}");

  (-d $self->{outtmp}) && &File::Path::rmtree ($self->{outtmp});
  mkdir ($self->{outtmp}, 0755) || die "cannot mkdir $self->{outtmp}\n";

  $self->clear_page_tmpfiles();

  my $output_filename = $self->{outtmp}.$CF::slash.$self->{outidxfile};

  # evaluate perl code for this site.
  my $proc = Scoop::get_layout_param ('eval_code', $url, $url);
  if (defined $proc) {
    my $skip_site = 0;
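    # the site's EvaluatePerl snippet runs here; it can set $skip_site to a
    # true value to have the whole site skipped.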
    if (!eval $proc."; 1;") {
      &Scoop::sitewarn("EvaluatePerl failed: $@");
    } else {
      if ($skip_site) {
	&Scoop::dbg ("skip_site set, skipping this site.");
	return;			# skip this site entirely
      }
    }
  }

  %{$self->{output_file}} = ();
  ${$self->{output_file}}{'MAIN'} = '';
  %{$self->{output_links_snarfed}} = ();
  %{$self->{new_already_seen_age_cache_data}} = ();
  @{$self->{output_story_urls}} = ();

  my $upindex = $self->{current_story_index} = 0;
  my $hdr;

  if ($CF::writeheader) {
    if ($CF::outstyle == $CF::OUT_HTML) {
      $hdr = "<html><head><title>$self->{prctitle}</title></head>".
	      "<body><h1>$self->{prctitle}</h1>\n";
    } else {
      $hdr = "$self->{prctitle}\n\n\n";
    }

  } else {
    if ($CF::outstyle == $CF::OUT_HTML) {
      $hdr = "<html><head></head><body>\n";
    } else {
      $hdr = "";
    }
  }
  ${$self->{output_file}}{'MAIN'} .= $hdr;

  $self->{stories_found} = 0;
  $self->{file_size} = 0;
  $self->{hit_file_size_limit} = 0;

  &Scoop::set_got_intr_behaviour ('setflag');
  my $u;

  foreach $u ($url, split (' ', $SCF::extra_urls{$url})) {
    # if we were interrupted, clear the flag and go on
    if ($Scoop::got_intr_flag) { &Scoop::set_got_intr_behaviour ('setflag'); }
    $self->add_page_tmpfile ($self->{outtmp}.$CF::slash.$self->{outidxfile}, $u);

    if ($SCF::levels{$url} >= 0) {
      $self->download_front_page ($u, $SCF::levels{$url}, $upindex);
    } else {
      $self->download_story_page ($u, 1, $upindex);
    }
  }

  # Now go through any additional URLs at the different levels, starting
  # at the highest level and working down.
  #
  my $lev;
  for ($lev = $SCF::levels{$url}; $lev >= -1; $lev--)	# -1 = story level
  {
    next unless (defined $Scoop::extra_urls{"$lev $url"});

    foreach $u (split (' ', $Scoop::extra_urls{"$lev $url"}))
    {
      # if we were interrupted, clear the flag and go on
      if ($Scoop::got_intr_flag) { &Scoop::set_got_intr_behaviour ('setflag'); }
      $self->add_page_tmpfile ($self->{outtmp}.$CF::slash.$self->{outidxfile}, $u);

      if ($lev >= 0) {
	$self->download_front_page ($u, $lev, $upindex);
      } else {
	$self->download_story_page ($u, 1, $upindex);
      }
    }
  }

  &Scoop::set_got_intr_behaviour ('exit');

  # kld: addition for image-only sites
  # jm -- TODO:  needs checking, this.
  if (defined $SCF::image_only_site{$url} &&
	$SCF::image_only_site{$url} == 1 &&
	$self->{stories_found} == 0)
  {
    &Scoop::verbose ("$self->{prctitle}: forcing conversion for image-only site");
    $self->{stories_found} = 1;
  }

  if ($self->{stories_found} > 0) {
    &Scoop::verbose ("$self->{prctitle}: $self->{stories_found} ".
	    "stories downloaded (".
	    sprintf ("%3.1f", $self->{file_size}/1024).
	    " K uncompressed).");

    my $ind = $self->{current_story_index};
    foreach $ofkey (keys %{$self->{output_file}}) {
      # convert sitescooper navigation links: [<<][^][>>]
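      # (each story was written with placeholder hrefs of the form
      # "__SITESCOOPER_STORY_<n>"; below, each placeholder is replaced by
      # the real URL of story <n>, and links that point before the first
      # story or past the last one are dropped entirely.)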
      my $story = ${$self->{output_file}}{$ofkey};

      # trim off the first and last ones anyway
      $story =~ s/\[<a href=\"__SITESCOOPER_STORY_(-1|${ind})\">.*?<\/a>\]//g;

      # and run through the rest
      for ($i = 0; $i < $ind; $i++) {
	next unless (defined ${$self->{output_story_urls}}[$i]);
	$story =~
	s/\"__SITESCOOPER_STORY_${i}\"/\"${$self->{output_story_urls}}[$i]\"/g;
      }

      # remove stray links
      ${$self->{output_file}}{$ofkey} = $self->remove_external_links ($story);
    }

    # if we're in single-page mode, rewrite the <a name> anchors to be
    # much shorter, by numbering them. Off for the time being. TODO

    my $turned_off_for_now = 0;
    if ($turned_off_for_now && !$CF::fileperpage) {
      $_ = ${$self->{output_file}}{'MAIN'};

      my %ankmap = ();
      my $curank = 0;

      s{<a\s+name\s*=\s*\'([^\']+?)\'}{	#'
	if (!defined ($ankmap{$1})) { $ankmap{$1} = $curank++; }
	"<a name=\"".$ankmap{$1}."\"";
      }gies;

      s{<a\s+name\s*=\s*\"([^\"]+?)\"}{
	if (!defined ($ankmap{$1})) { $ankmap{$1} = $curank++; }
	"<a name=\"".$ankmap{$1}."\"";
      }gies;

      s{<a\s+name\s*=\s*([^\"\'][^\s]+?)}{
	if (!defined ($ankmap{$1})) { $ankmap{$1} = $curank++; }
	"<a name=\"".$ankmap{$1}."\"";
      }gies;

      s/(href\s*=\s*\'\#)([^\']+?)\'/$1${ankmap{$2}}\'/gis;
      s/(href\s*=\s*\"\#)([^\"]+?)\"/$1${ankmap{$2}}\"/gis;
      s/(href\s*=\s*\#)(\S+?)/$1${ankmap{$2}}/gis;
    }

    my $blurb1 = "(End of snarf - copyright retained by original ".
      "providers. ";
    my $blurb2 = "";
    if (defined $SCF::rights{$url}) { $blurb2 = $SCF::rights{$url}; }
    my $blurb3 = "Downloaded and converted by sitescooper; see ".
      "$Scoop::home_url )";

    if ($CF::writefooter) {
      if ($CF::outstyle == $CF::OUT_HTML) {
	${$self->{output_file}}{'MAIN'} .= "\n\n<p><hr>".
			"<font size=1><i>$blurb1 $blurb2 $blurb3</i></font>\n";
      } elsif ($CF::outstyle == $CF::OUT_DOC) {
	${$self->{output_file}}{'MAIN'} .= "$blurb1 $blurb2 $blurb3\n";
      } else {
	${$self->{output_file}}{'MAIN'} .= "$blurb1\n$blurb2\n$blurb3\n";
      }
    }

    if ($CF::outstyle == $CF::OUT_HTML) {
      ${$self->{output_file}}{'MAIN'} .= "</body></html>\n";
    } elsif ($CF::outstyle == $CF::OUT_DOC) {
      ${$self->{output_file}}{'MAIN'} .= "<$Scoop::bookmark_char>\n";
    }

    open (OUTFILE, "> $output_filename")
		    or die "Failed to create $output_filename\n";
    print OUTFILE ${$self->{output_file}}{'MAIN'};
    close OUTFILE or warn "Failed to write to $output_filename";

    if (!$CF::fileperpage) {
      if ($CF::dump) {
	open (IN, "<".$self->{outtmp}.$CF::slash.$self->{outidxfile});
	while (<IN>) { print STDOUT; }
	close IN; &File::Path::rmtree ($self->{outtmp});

	# ensure we do not try to convert it later
	$self->dont_convert();

      } else {
	&File::Path::rmtree ($self->{outfile});
	rename ($self->{outtmp}, $self->{outfile});
	if ($self->{convert_now}) { &Scoop::convert_output($self->{filekey}, $url); }
      }

    } else {
      foreach $_ (keys %{$self->{output_file}}) {
	next if ($_ eq 'MAIN');
	open (OUTFILE, "> $_")
			or die "Failed to create $_\n";
	print OUTFILE ${$self->{output_file}}{$_};
	close OUTFILE or warn "Failed to write to $_";
      }

      &File::Path::rmtree ($self->{outfile});
      rename ($self->{outtmp}, $self->{outfile});

      if ($CF::dump) {
	# print the path to the created directory containing the pages
	print $self->{outfile}."\n";
	# ensure we do not try to convert it later
	$self->dont_convert();

      } else {
	if ($self->{convert_now}) { &Scoop::convert_output($self->{filekey}, $url); }
      }
    }

    &Scoop::dbg ("output dir: $self->{outfile}");
    &Scoop::dbg ("output index: ".
    		$self->{outfile}.$CF::slash.$self->{outidxfile});

    my ($from, $to);
    while (($from,$to) = each %{$self->{new_already_seen_age_cache_data}}) {
      &Scoop::dbg ("Saving new already_seen age cache entry: $from => $to ".
	      "(".&Scoop::time2datestr($to).")");
      $Scoop::oldest_already_seen{$from} = $to;
    }

  } else {
    close OUTFILE;
    &Scoop::verbose ("$self->{prctitle}: no new stories, ignoring.");
    &Scoop::dbg ("(Not setting already_seen age cache since no links were followed)");
    $self->dont_convert();
    &File::Path::rmtree ($self->{outtmp});
  }

  &Scoop::verbose ("SITE END: done scooping site \"$site_file_name\".");
  delete ${$self->{output_file}}{'MAIN'};
}

# ---------------------------------------------------------------------------
# Note on levels: a 2-level site has a contents page and stories off that;
# 3-level has issue links page, per-issue contents page and stories.
# 1-level has only the story page, no links.
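#
# So for a typical 2-level news site the site URL itself is the contents
# page (handled by download_front_page with $level == 0), and every story
# link found on it is handed to download_story_page.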

sub download_front_page {
  my $self = shift;
  my $url = shift;
  my $level = shift;
  my $upindex = shift;
  my ($cachefile, $page);

  my $baseurl = $self->{url};
  my $key = "$level $baseurl";
  my $human_level = $level + 2;

  $Scoop::sitewarn_current_site_line = $SCF::site_defined_at{$baseurl};

  if ($Scoop::got_intr_flag) { return; }
  if ($self->{hit_file_size_limit}) { return; }

  my $pat = $SCF::links_limit_to{$key};
  if (defined $pat) {
    if (!Scoop::match_url ($url, $pat)) {
      &Scoop::dbg ("front page URL $url does not match $pat, ignoring.");
      return;
    }
  }

  $pat = Scoop::get_layout_param ('links_skip', $key, $url);
  if (defined $pat) {
    if ($url =~ m#^${pat}$#) {
      &Scoop::verbose ("Skipping: $url"); return;
    }
  }

  my $origurl = $url;
  $url = $self->apply_url_postproc ($url);
  if (!defined $url) {
    &Scoop::dbg ("URLProcess says URL should be ignored: $origurl"); return;
  }

  my $fullurl = $url; $url = &Scoop::URLWithoutAnchor ($url);
  return if (defined $self->{already_seen_this_session}{$url});
  $self->{already_seen_this_session}{$url} = 1;
  study $url;

  &Scoop::verbose ("Reading level-".($human_level)." front page: $fullurl");
  &Scoop::set_got_intr_behaviour ('setflag');

  my $is_dynamic_html;
  if (defined $SCF::cacheable{$key}) {
    $is_dynamic_html = ($SCF::cacheable{$key} == 0);
  } elsif (defined $SCF::links_diff{$key} && $SCF::links_diff{$key} != 0) {
    $is_dynamic_html = 1;	# pages that need diff'ing are dynamic
  } else {
    $is_dynamic_html = 1;	# index pages are usually dynamic
  }

  push (@Scoop::seen_this_time, $url);
  $Scoop::already_seen {$url} = 1;
  $self->check_for_oldest ($url);	# we came across the link, so keep it around

  &Sitescooper::ParProxy::check_for_url ($url);
  $page = $self->get_page ($url, $is_dynamic_html);
  if (!defined $page) {
    &Scoop::verbose ("Skipping (get_page returned nothing): $fullurl");
    return;
  }
  if ($Scoop::got_intr_flag) { goto interrupted; }

  if ($CF::linkslimit > 0 && $self->{stories_found} >= $CF::linkslimit) {
    &Scoop::verbose ("over links limit, stopping this site.");
    $self->{hit_file_size_limit} = 1;
    return;
  }

  if ($Scoop::useragent->redirect_occurred()) {
    $url = $Scoop::useragent->get_last_redirect();
    &Scoop::dbg ("turn-over links will use redirect as base URL: $url");
  }

  if ($page =~ /<head>.*<base\s+href\s*=\s*[\"\']*(\S+?)[\"\']*\s*>.*<\/head>/is)
  {
    $url = $1;
    &Scoop::dbg ("BASE HREF tag found, setting new base URL: $url");
    $self->add_page_tmpfile ($self->{outtmp}.$CF::slash.$self->{outidxfile}, $url);
  }

  my $life = Scoop::get_layout_param ('story_lifetime', $baseurl, $url);
  if (defined $Scoop::last_modtime{$url} &&
    			$Scoop::last_modtime{$url} < $life * 24 * 60 * 60)
  {
    &Scoop::verbose ("Skipping (contents are older than ".$life." days): $fullurl");
    return;
  }

  my $origpage = $page;
  &Scoop::journal ("pre_strip_level".($human_level), $page);
  $page = $self->strip_front_page ($url, $key, $page);
  &Scoop::journal ("post_strip_level".($human_level), $page);

  # fetch the links_print setting now; it is needed both for the diff
  # decision here and for the front-page printing logic further down.
  my $lprint = Scoop::get_layout_param ('links_print', $key, $url);

  my $cachedpage;
  if ((defined $SCF::links_diff{$key} && $SCF::links_diff{$key} != 0) ||
     ((defined $lprint && $lprint != 0) || &Scoop::writing_html))
  {
    $cachedpage = $self->strip_front_page ($url, $key,
				$self->get_cached_page_for_diff ($url));
  }

  if (defined $SCF::links_diff{$key} && $SCF::links_diff{$key} != 0) {
    $page = $self->get_new_bits ($cachedpage, $page);
  }

  $self->cache_page_later ($url, $origpage);

  my $proc = Scoop::get_layout_param ('links_preproc', $key, $url);
  if (defined $proc) {
    $_ = $page;
    my $site_level = $human_level;
    if (!eval $proc."; 1;") {
      &Scoop::sitewarn("level-".($human_level)." HTMLPreProc failed: $@");
      # and keep the original $page
    } else {
      $page = $_;
    }
  }

  if (defined fileno Scoop::JOURNAL) {
    # always write a text-mode version for the journal
    &Scoop::journal ("to_text_level".($human_level),
    	$self->html_to_text ($url, $page, $CF::OUT_TEXT));
  }

  my $printed_this_front_page = 0;
  my $has_not_changed = 0;

  if ((defined $lprint && $lprint != 0) || &Scoop::writing_html) {
    $self->{warn_about_ext_links} = 1;
    my $txtpage = $self->html_to_text ($url, $page, $CF::outstyle);
    $self->{warn_about_ext_links} = 0;

    if (defined $cachedpage && !$CF::refresh) {
      # ensure that the cleaned-up HTML doesn't match the cleaned-up cached
      # HTML. Sometimes the ad banners will be the only things that have
      # changed between retrieves, and html_to_text will have stripped those
      # out.
      my $cachedtxt = $self->html_to_text ($url,
      				$cachedpage, $CF::outstyle);
      if (&text_equals ($txtpage, $cachedtxt)) {
	$has_not_changed = 1;
      }
    }

    # if we're only printing the links because we're writing HTML,
    # then use links_trim. (off for the time being!)

    #if (!(defined $lprint && $lprint != 0) && &Scoop::writing_html) {
      #my $ltrim = Scoop::get_layout_param ('links_trim', $key, $url);
      #if ($ltrim) {
	## trim that number of chars after the last link in the page.
	#my $eval = '
	  #$txtpage =~ s/(href=.*?<\/a>.{'.$ltrim.'}\S*).*?$/$1
	  #<i>[irrelevant links trimmed]<\/i>/i;';
	#eval "$eval; 1;" or warn ("trim code eval failed: $@\n$eval\n");
      #}
    #}

    &Scoop::verbose ("Printing: $fullurl");
    $self->write_as_story (1, $url, $txtpage, undef, $upindex);

    if ($has_not_changed && !$CF::refresh) {
      # don't count a front page as a story if:
      # 1. we're just outputting it because we're writing HTML
      # 2. and the page had not changed since previous GET
      # 3. and we're not refreshing

      &Scoop::dbg ("text has not changed, not counting this page as a story");
      $self->{stories_found}--;
    }
  }

  # this is a front page. Pages followed from this page should use this as
  # the "up a level" link.
  $upindex = $self->{current_story_index} - 1;

  # see if there's any links to extra contents pages
  my @turnoverlinks = $self->get_contents_turnover_links ($url, $key, $page);

  my @links = ();
  my $wrote_sep = 0;

  # This was all getting a bit tricky, so I've redone it a bit.
  # It now does not try to strip closing tags, as it doesn't have to.
  while (1) {
    if ($Scoop::got_intr_flag) { goto interrupted; }
    if ($self->{hit_file_size_limit}) { last; }

    if (
      $page =~ s/<a\s+[^>]*href=\s*\"([^\">]+)\"//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*\'([^\'>]+)\'//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*([^\s>]+)//is
      )
    {
      my $link = $1;
      push (@links, $link);
      next;
    }

    # support for frames
    if (
      $page =~ s/<frame\s+[^>]*src=\"([^\">]+)\"//is
       ||
      $page =~ s/<frame\s+[^>]*src=\'([^\'>]+)\'//is
       ||
      $page =~ s/<frame\s+[^>]*src=([^\s>]+)//is
      )
    {
      my $link = $1;
      if (&Scoop::writing_html) {
	if ($wrote_sep == 0) {
	  ${$self->{output_file}}{'MAIN'} .= "<p><hr>\n"; $wrote_sep = 1;
	}
	${$self->{output_file}}{'MAIN'} .=
		$self->translate_link ($fullurl, $link, $link). "<br>\n";
      }
      push (@links, $link);
      next;
    }

    # rudimentary support for My-Netscape-style RDF files
    if ($page =~ s/<item>(.*?)<link\s*[^>]*>(.+?)<\/link>(.*?)<\/item>//is)
    {
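      # a typical item looks like
      #   <item><title>Headline</title><link>http://host/story</link></item>;
      # some feeds nest the actual URL inside <url>...</url> within <link>,
      # which is handled just below.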
      my ($title, $link, $title2) = ($1, $2, $3);

      # <link> tags in RSS can contain other crap. Ditch it; we want the link!
      $link =~ s/^.*<url>(.*?)<\/url>.*$/$1/gis;

      $link = &Scoop::AbsoluteURL ($url, $link);
      if ($title =~ /<title>(.*?)<\/title>/is
	   || $title =~ /<text>(.*?)<\/text>/is
	   || $title2 =~ /<title>(.*?)<\/title>/is
	   || $title2 =~ /<text>(.*?)<\/text>/is)
      {
	$SCF::url_title{$link} = $1;
      }

      push (@links, $link);
      next;
    }

    last;		# no more links available
  }

  if ($#links >= 0) {
    &Scoop::verbose ("Found ".($#links+1)." links, examining them.");
  }

  # now traverse the links and get the stories
  &Scoop::journal ("links_level".($human_level), join ("\n", @links));
  my $followed_a_link = 0;

  foreach $_ (@links) {
    if ($self->{hit_file_size_limit}) {
      my $msg = "File size limit exceeded,".
			  " skipped some stories from this site.";
      &Scoop::verbose ($msg);
      if (&Scoop::writing_html) {
	${$self->{output_file}}{'MAIN'} .= "<hr><i>$msg</i><br>\n";
      } else {
	${$self->{output_file}}{'MAIN'} .= "\n($msg)\n";
      }
      last;
    }

    $self->follow_front_link ($url, $level, $_, $upindex)
				and ($followed_a_link = 1);

    if ($Scoop::got_intr_flag) { goto interrupted; }
  }

  # if there's more contents pages, process them as well.
  &Scoop::journal ("turnover_links_level".($human_level), join ("\n", @turnoverlinks));
  if ($#turnoverlinks >= 0) {
    my $link;
    for $link (@turnoverlinks) {
      if ($Scoop::got_intr_flag) { goto interrupted; }
      $link = &Scoop::AbsoluteURL ($url, $link);
      $self->download_front_page ($link, $level, $upindex)
				and ($followed_a_link = 1);
    }
  }

  &Scoop::dbg ("stories found so far: ".$self->{stories_found});

interrupted:
  &Scoop::set_got_intr_behaviour ('exit');

  ($printed_this_front_page || $followed_a_link || !$has_not_changed);
}

# ---------------------------------------------------------------------------

sub follow_front_link {
  my $self = shift;
  my $baseurl = $self->{url};
  my ($url, $level, $nextpage, $upindex) = @_;

  $nextpage = &Scoop::AbsoluteURL ($url, $nextpage);
  return if ($nextpage !~ /^(http|file):/i);	# only supported links

  &Scoop::dbg ("Link found on $url: $nextpage");

  # should we download the next front page?
  if ($level > 0) {
    return $self->download_front_page ($nextpage, $level-1, $upindex);
  }
  if ($Scoop::got_intr_flag) { return; }

  # nope, we're onto the stories already
  $nextpage = $self->make_printable ($nextpage, 1);

  $self->download_story_page ($nextpage, 0, $upindex);
}

sub make_printable {
  my $self = shift;
  my $nextpage = shift;
  my $warn_if_fail = shift;

  my $baseurl = $self->{url};
  my $sub = Scoop::get_layout_param ('printable_sub', $baseurl, $nextpage);
  if (defined $sub) {
    my $new = $nextpage;
    $sub =~ s/\\(\d+)/\$$1/g;	# avoid warnings

    eval '$new =~ '.$sub.'; 1;'
      or &Scoop::sitewarn ("Printable substitution failed! ($!)\n");

    if ($nextpage ne $new) {
      # &Scoop::verbose ("Using printable version instead: $new");
      my $limitto = $SCF::story_limit_to{$baseurl};
      if (defined $limitto && !Scoop::match_url ($new, $limitto)) {
	if ($warn_if_fail) {
	  &Scoop::sitewarn ("Printable version does not match StoryURL".
		"pattern, reverting from $new to $nextpage\n");
	}
      } else {
	$nextpage = $new;
      }
    }
  }
  $nextpage;
}

# ---------------------------------------------------------------------------

sub download_story_page {
  my $self = shift;
  my $url = shift;
  my $is_dynamic_html = shift;
  my $upindex = shift;

  my $baseurl = $self->{url};
  my $fullurl = $url; $url = &Scoop::URLWithoutAnchor ($url);
  study $url;

  $Scoop::sitewarn_current_site_line = $SCF::site_defined_at{$baseurl};

  my $cacheflag = $SCF::cacheable{"s $baseurl"};
  if (defined $cacheflag) {
    # user setting overrides our heuristics
    $is_dynamic_html = ($cacheflag==0);
  }
  if (defined $SCF::story_diff{$baseurl} && $SCF::story_diff{$baseurl}) {
    $is_dynamic_html = 1;	# diff pages are always dynamic
  }

  my $limitto = $SCF::story_limit_to{$baseurl};
  if (defined $limitto) {
    if (!defined $self->{output_storyurl_dbg}{$baseurl}) {
      &Scoop::dbg ("StoryURL for $baseurl: $limitto");
      $self->{output_storyurl_dbg}{$baseurl} = 1;
    }

    if (!Scoop::match_url ($url, $limitto)) {
      &Scoop::dbg ("Non-story URL ignored: $fullurl");
      return;
    }
  }

  if ($url =~ m,^(ftp|mailto|https|gopher|pnm):,) {
    &Scoop::dbg ("Non-story URL ignored (bad protocol): $fullurl");
    return;
  }

  $url = $self->apply_url_postproc($url);
  if (!defined $url) {
    &Scoop::dbg ("URLProcess says URL should be ignored: $fullurl"); return;
  }

  my $pat = Scoop::get_layout_param ('story_skip', $baseurl, $url);
  if (defined $pat) {
    if ($url =~ m#^${pat}$#) {
      &Scoop::verbose ("Skipping: $fullurl"); return;
    }
  }

  $self->check_for_oldest ($url);

  if (!$is_dynamic_html && $Scoop::already_seen {$url}) {
    &Scoop::dbg ("skipping, already seen: $fullurl");
    return;
  }

  push (@Scoop::seen_this_time, $url);
  $Scoop::already_seen {$url} = 1;

  if ($self->{hit_file_size_limit}) { return; }
  $self->get_story_page ($url, $is_dynamic_html, $upindex);
  1;
}

# ---------------------------------------------------------------------------

sub get_story_page {
  my $self = shift;
  my $url = shift;
  my $baseurl = $self->{url};
  my $is_dynamic_html = shift;
  my $upindex = shift;
  my @turnoverlinks;
  my $headline;

  &Scoop::verbose ("Reading: $url");
  $self->check_for_oldest ($url);	# we came across the link, so keep it around

  my $cachedpage = undef;
  if (defined $SCF::story_diff{$baseurl} && $SCF::story_diff{$baseurl}) {
    $cachedpage = $self->get_cached_page_for_diff ($url);
  } elsif ($is_dynamic_html) {
    $cachedpage = $self->get_cached_page ($url);
  }
  if (defined $cachedpage) {
    $cachedpage = $self->strip_story ($url, $cachedpage, " (cached)");
  }

  &Sitescooper::ParProxy::check_for_url ($url);
  my $origpage = $self->get_page ($url, $is_dynamic_html);
  return unless defined $origpage;
  if ($Scoop::got_intr_flag) { return; }

  if ($CF::linkslimit > 0 && $self->{stories_found} >= $CF::linkslimit) {
    &Scoop::verbose ("over links limit, stopping this site.");
    $self->{hit_file_size_limit} = 1;
    return;
  }

  if ($Scoop::useragent->redirect_occurred()) {
    $url = $Scoop::useragent->get_last_redirect();
    &Scoop::dbg ("turn-over links will use redirect as base URL: $url");
  }

  # get headline before stripping StoryStart and StoryEnd
  $headline = $self->get_headline ($url, $origpage);
  &Scoop::journal ("pre_strip_story", $origpage);
  my $page = $self->strip_story ($url, $origpage, "");
  &Scoop::journal ("post_strip_story", $page);

  my $bits = Scoop::get_layout_param ('story_html_header', $baseurl, $url);
  if (defined $bits) { $page = $bits . $page; }
  $bits = Scoop::get_layout_param ('story_html_footer', $baseurl, $url);
  if (defined $bits) { $page .= $bits; }

  my $proc = Scoop::get_layout_param ('story_preproc', $baseurl, $url);
  if (defined $proc) {
    $_ = $page;
    my $site_level = 1;
    if (!eval $proc."; 1;") {
      &Scoop::sitewarn("StoryHTMLPreProc failed: $@");
      # and keep the original $page
    } else {
      $page = $_;
      &Scoop::journal ("post_story_preproc", $page);
    }
  }

  if (defined $SCF::story_diff{$baseurl} && $SCF::story_diff{$baseurl}) {
    $page = $self->get_new_bits ($cachedpage, $page);
    $self->cache_page_later ($url, $origpage);
  } else {
    $self->cache_page ($url, $origpage);
  }

  if ($Scoop::got_intr_flag) { return; }

  if (defined fileno Scoop::JOURNAL) {
    # always write a text-mode version for the journal
    &Scoop::journal ("to_text_story",
    	$self->html_to_text ($url, $page, $CF::OUT_TEXT));
  }

  # get turn-over links after stripping StoryStart and StoryEnd
  @turnoverlinks = $self->get_story_turnover_links ($url, $page);
  $self->{warn_about_ext_links} = 1;
  $page = $self->html_to_text ($url, $page, $CF::outstyle);
  $self->{warn_about_ext_links} = 0;

  if ($is_dynamic_html && defined $cachedpage && !$CF::refresh) {
    # ensure that the cleaned-up HTML doesn't match the cleaned-up cached
    # HTML. Sometimes the ad banners will be the only things that have
    # changed between retrieves, and html_to_text will have stripped those
    # out.
    $cachedpage = $self->html_to_text ($url, $cachedpage, $CF::outstyle);
    if (&text_equals ($page, $cachedpage)) {
      &Scoop::verbose ("Skipping (text has not changed): $url");
      return;
    }
  }

  my $life = Scoop::get_layout_param ('story_lifetime', $baseurl, $url);
  if (defined $Scoop::last_modtime{$url} &&
    		$Scoop::last_modtime{$url} < $life * 24 * 60 * 60)
  {
    &Scoop::verbose ("Skipping (story is older than ".$life." days): $url");
    return;
  }

  # ensure there's some alphanumerics in the output text. No alnums means
  # no output. HTML needs to be checked to ensure we don't just pick
  # up tags which will not be displayed. Added kld's check for image-only
  # sites.
  {
    my $gottext = 1;

    if (defined $SCF::image_only_site{$baseurl} && $SCF::image_only_site{$baseurl} == 1) {
      &Scoop::dbg ("image-only site, not checking if text is present");

    } elsif (&Scoop::writing_html) {
      if ($page !~ /[A-Za-z0-9"']\s*</
		&& $page !~ />\s*[A-Za-z0-9"']/
		&& $page !~ /^\s*[A-Za-z0-9"']/)
      { $gottext = 0; }

    } else {
      if ($page !~ /[A-Za-z0-9"']/) { $gottext = 0; }
    }

    if ($gottext == 0) {
      &Scoop::verbose ("Skipping (no text to write): $url");
      return;
    }
  }

  if ($SCF::levels{$baseurl} < 0) {
    # this is a one-level site: therefore the story should be treated
    # as the "front page". Thx Carsten for this one.
    $self->write_as_story (1, $url, $page, $headline, $upindex);
  } else {
    $self->write_as_story (0, $url, $page, $headline, $upindex);
  }

  &Scoop::journal ("turnover_links_story", join ("\n", @turnoverlinks));
  if ($#turnoverlinks >= 0) {
    my $link;
    for $link (@turnoverlinks) {
      if ($Scoop::got_intr_flag) { return; }
      $link = &Scoop::AbsoluteURL ($url, $link);
      $self->download_story_page ($link, 0, $upindex);	# right now
    }
  }
}

# ---------------------------------------------------------------------------

sub apply_url_postproc {
  my $self = shift;
  local ($_) = shift;
  my $baseurl = $self->{url};

  my $proc = Scoop::get_layout_param ('url_preproc', $baseurl, $_);
  if (defined $proc) {
    if (!eval $proc."; 1;") {
      &Scoop::sitewarn("URLProcess failed: $@");
      undef $_;
    }
  }
  $_;
}

# ---------------------------------------------------------------------------

sub clean_pre_tags_for_diff {
  my $self = shift;

  my $file = shift;
  my $pre_nl_tag = shift;
  my $pre_pre_tag = shift;
  my $pre_slashpre_tag = shift;
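
  # Replace the newlines inside the <pre> text (and the pre tags
  # themselves) with placeholder tags, so that the newline canonicalisation
  # done for diffing in get_new_bits does not mangle preformatted text; the
  # placeholders are turned back into real newlines and tags once the page
  # has been re-split into diffable lines.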

  my $start = '';
  my $end = '';

  ($file =~ s/^(.*)<pre>//i) and $start = $1;
  ($file =~ s/<\/pre>(.*)$//i) and $end = $1;
  $file =~ s/\n/${pre_nl_tag}/gs;

  $start.$pre_pre_tag.$file.$pre_slashpre_tag.$end;
}

sub get_new_bits {
  my $self = shift;

  local ($_);
  my ($oldfile, $newfile) = @_;

  if ($CF::refresh) {
    &Scoop::verbose ("-refresh is on, not looking for differences");
    return $newfile;
  }

  if (!defined $oldfile || $oldfile =~ /^\s*$/) {
    if (!$CF::debugdiffs) { return $newfile; }
    $oldfile = '';
  }

  &Scoop::verbose ("Finding differences between current page and cached version");

  # it's important to keep these names 8.3 for Windows-95 compatibility,
  # as some Windoze diffs may not be able to handle them otherwise!
  # This also requires that we are chdir'd into the temporary directory
  # to avoid hassles with long filenames in the args when we run the
  # diff command. What a pain!
  #
  my $oldf = "a$$.tmp";		# we are already chdir'ed
  my $newf = "b$$.tmp";

  if ($CF::debugdiffs) {
    $oldf = "diff_old.tmp";
    $newf = "diff_new.tmp";
  }

  # Split the file lines at probable story-header endpoints.
  # This makes them more amenable to diffing, hopefully without
  # losing bits we don't want to lose, or gaining bits we don't
  # want to gain. Also try to keep cross-line-split HTML tags
  # together.

  # preserve newlines in <pre> text
  my $cleaned_pre_nls = 0;
  my $pre_nl_tag = "<!!!n>";
  my $pre_pre_tag = "<!!!pre>";
  my $pre_slashpre_tag = "<!!!/pre>";

  while ($oldfile =~ /<pre>/i) {
    $oldfile = $self->clean_pre_tags_for_diff ($oldfile,
    			$pre_nl_tag, $pre_pre_tag, $pre_slashpre_tag);
    $cleaned_pre_nls = 1;
  }

  while ($newfile =~ /<pre>/i) {
    $newfile = $self->clean_pre_tags_for_diff ($newfile,
    			$pre_nl_tag, $pre_pre_tag, $pre_slashpre_tag);
    $cleaned_pre_nls = 1;
  }

  # canonicalise all other newlines (we control the vertical!)
  $oldfile =~ s/\s*[\r\n]+\s*/ /gs;
  $newfile =~ s/\s*[\r\n]+\s*/ /gs;

  # remove extraneous whitespace from inside tags
  $oldfile =~ s/<\s*([^>]+?)\s*>/ $_=$1; s,\s+, ,gs; "<$_>"; /gies;
  $newfile =~ s/<\s*([^>]+?)\s*>/ $_=$1; s,\s+, ,gs; "<$_>"; /gies;

  # handle the two types of <p> tags -- <p>...</p>, and just ...<p>
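  # (matched paragraphs are temporarily renamed to "<!!!p" so the bare-<p>
  # substitution below cannot match them a second time; the placeholder is
  # turned back into "<p" once both passes are done.)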
  $oldfile =~ s/<p( *[^>]*>.*?<\/p *[^>]*>)/\n<!!!p$1\n/gi;
  $newfile =~ s/<p( *[^>]*>.*?<\/p *[^>]*>)/\n<!!!p$1\n/gi;

  $oldfile =~ s/(<p *[^>]*>)/$1\n/gi;
  $newfile =~ s/(<p *[^>]*>)/$1\n/gi;

  $oldfile =~ s/<!!!p/<p/gi;
  $newfile =~ s/<!!!p/<p/gi;

  # put newline before these tags (thx Carsten Clasohm, again!)
  $oldfile =~ s/(<(?:table|tr|td|div|item) *[^>]*>)/\n$1/gi;
  $newfile =~ s/(<(?:table|tr|td|div|item) *[^>]*>)/\n$1/gi;
  # after these ones
  $oldfile =~ s/(<(?:br|hr|table|\/td|\/table|\/tr|\/div) *[^>]*>)/$1\n/gi;
  $newfile =~ s/(<(?:br|hr|table|\/td|\/table|\/tr|\/div) *[^>]*>)/$1\n/gi;

  # remove newlines inside <a href> tags. Thx to Carsten Clasohm.
  1 while $oldfile =~ s/(<a href=[^>]+>([^\n<]|<(?!\/a>))*)\n+/$1 /gis;
  1 while $newfile =~ s/(<a href=[^>]+>([^\n<]|<(?!\/a>))*)\n+/$1 /gis;

  if ($cleaned_pre_nls) {
    $oldfile =~ s/${pre_nl_tag}/\n/g; $oldfile =~ s/${pre_pre_tag}/<pre>/g;
    $oldfile =~ s/${pre_slashpre_tag}/<\/pre>/g;
    $newfile =~ s/${pre_nl_tag}/\n/g; $newfile =~ s/${pre_pre_tag}/<pre>/g;
    $newfile =~ s/${pre_slashpre_tag}/<\/pre>/g;
  }

  my $page = '';
  my $created_newf = 0;

  if ($CF::diff eq '') {
    # use the perl module implementation of diff instead!
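    # (the code is wrapped in a string eval so that Algorithm::Diff is only
    # loaded when no external diff command is configured; only the "+"
    # lines -- text added since the cached copy -- are kept for output.)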
    eval '
      use Algorithm::Diff qw(diff);

      my @chunk;
      my ($sign, $lineno, $text);
      my @f1 = split "\n", $oldfile;
      my @f2 = split "\n", $newfile;

      my $diffs = diff(\@f1, \@f2);

      if (@$diffs) {
	foreach $chunk (@$diffs) {
	  foreach $line (@$chunk) {
	    ($sign, $lineno, $text) = @$line;
	    if ($sign =~ /\+/) {
	      $page .= $text . "\n";
	    }
	  }
	}
      }
    1;' or die ("diff code eval failed: $@");

  } else {
    open (F1, "> $oldf") || warn "cannot write to $oldf\n";
    print F1 $oldfile; close F1;
    open (F2, "> $newf") || warn "cannot write to $newf\n";
    print F2 $newfile; close F2;
    $created_newf = 1;

    if ($CF::diff ne '' && open (DIFF, "$CF::diff $oldf $newf |")) {
      while (<DIFF>) {
	/^>/ || next;
	$page .= $';
      }
      close DIFF;		# ignore exit status -- exit 1 only means no diffs.

    } else {
      warn "cannot run Diff command \"$CF::diff\", using entire page instead.\n";
      $page = $newfile;
    }
  }

  if ($CF::debugdiffs) {
    open (F1, "> diff_out.tmp"); print F1 $page; close F1;
    warn "$CF::diff $oldf $newf, breaking for debug"; &cleanexit;
  }

  if ($created_newf) {
    unlink $oldf; unlink $newf;
  }

  $page;
}

# ---------------------------------------------------------------------------

sub text_equals {
  my $t1 = shift;
  my $t2 = shift;
  $t1 =~ s/[\s\r\n]+/ /gs; $t1 =~ s/^\s+//; $t1 =~ s/\s+$//;
  $t2 =~ s/[\s\r\n]+/ /gs; $t2 =~ s/^\s+//; $t2 =~ s/\s+$//;
  ($t1 eq $t2);
}

# ---------------------------------------------------------------------------
# Strip a story page from StoryStart to StoryEnd.
# In addition, strip out non-story sidebar table items
# and carriage returns (they confuse plenty of regexps later).
#
sub strip_story {
  my $self = shift;
  my $url = shift;
  my $baseurl = $self->{url};
  my $page = shift;
  my $comment = shift;

  if (!defined $page) { return undef; }

  # ok, now strip the headers and footers
  my $pat = Scoop::get_layout_param ('story_start', $baseurl, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &Scoop::sitewarn("StoryStart pattern \"$pat\" found multiple times in page $url$comment\n");
    }
    if ($page =~ s#^.*?${pat}##gs) {
      $page =~ s#^[^<]*?>##gs;		# strip superfluous ends of tags
      if (defined fileno Scoop::JOURNAL) { &Scoop::journal ("pre_stripped", $&); }
    } else {
      &Scoop::sitewarn("StoryStart pattern \"$pat\" not found in page $url$comment\n");
    }
  }

  $pat = Scoop::get_layout_param ('story_end', $baseurl, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &Scoop::sitewarn("StoryEnd pattern \"$pat\" found multiple times in page $url$comment\n");
    }
    if ($page =~ s#${pat}.*?$##gs) {
      $page =~ s#<[^>]*?$##gs;		# strip superfluous starts of tags
      if (defined fileno Scoop::JOURNAL) { &Scoop::journal ("post_stripped", $&); }
    } else {
      &Scoop::sitewarn("StoryEnd pattern \"$pat\" not found in page $url$comment\n");
    }
  }

  # smart_clean_table only operates on table items with size specifications.
  # TODO -- work out table sizes using images if possible.
  #
  my $smarts = Scoop::get_layout_param ('use_table_smarts', $baseurl, $url);
  if (!defined $smarts || $smarts != 0) {
    my $filter = Sitescooper::StripTablesFilter->new();
    $filter->parse ($page);
    $page = $filter->filtered_html();
  }

  $page =~ s/\r/ /g;	# strip CRs
  $page;
}

sub strip_front_page {
  my $self = shift;
  my $url = shift;
  my $key = shift;
  my $page = shift;

  if (!defined $page) { return undef; }

  my $baseurl = $self->{url};
  my $pat = Scoop::get_layout_param ('links_start', $key, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &Scoop::sitewarn("ContentsStart pattern \"$pat\" found multiple times in page $url\n");
    }
    ($page =~ s#^.*?${pat}##gs) ||
	&Scoop::sitewarn("ContentsStart pattern \"$pat\" not found in page $url\n");
    $page =~ s#^[^<]*?>##gs;		# strip cut-in-half tags
  }

  $pat = Scoop::get_layout_param ('links_end', $key, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &Scoop::sitewarn("ContentsEnd pattern \"$pat\" found multiple times in page $url\n");
    }
    ($page =~ s#${pat}.*?$##gs) ||
	&Scoop::sitewarn("ContentsEnd pattern \"$pat\" not found in page $url\n");
    $page =~ s#<[^>]*?$##gs;		# strip cut-in-half tags
  }

  my $smarts = Scoop::get_layout_param ('use_table_smarts', $baseurl, $url);
  if (!defined $smarts || $smarts != 0) {
    my $filter = Sitescooper::StripTablesFilter->new();
    $filter->parse ($page);
    $page = $filter->filtered_html();
  }

  $page =~ s/\r/ /g;	# strip CRs
  $page;
}

# ---------------------------------------------------------------------------

sub get_headline {
  my $self = shift;
  my $url = shift;
  my $page = shift;

  my $headline;
  my $baseurl = $self->{url};

  if (defined $SCF::url_title{$url}) {
    $headline = $self->html_to_text ($url,
    		$SCF::url_title{$url}, $CF::OUT_TEXT);
    &Scoop::dbg ("StoryHeadline: (from RDF): $headline");

  } else {
    my $pat = Scoop::get_layout_param ('head_pat', $baseurl, $url);
    if (defined $pat) {
      if ($page !~ m#${pat}#m) {
	&Scoop::sitewarn("StoryHeadline pattern \"$pat\" not found in page $url\n");
      } elsif (defined $1) {
	$headline = $self->html_to_text ($url, $1, $CF::OUT_TEXT);
	# &Scoop::dbg ("StoryHeadline: $headline"); # logged later on anyway
      } else {
	&Scoop::sitewarn("StoryHeadline pattern \"$pat\" contains no brackets!\n");
      }

    } elsif ($page =~ m#<meta name="PCTITLE" content="(.*)">#mi) {
      # try a fallback: search for PointCast headline tags
      $headline = $self->html_to_text ($url, $1, $CF::OUT_TEXT);
      &Scoop::dbg ("StoryHeadline (default, PointCast): $headline");
    }
  }

  $headline;
}

# ---------------------------------------------------------------------------

sub get_story_turnover_links {
  my $self = shift;
  my $url = shift;
  my $page = shift;
  my $baseurl = $self->{url};

  my @turnoverlinks = ();
  my $followlinks = Scoop::get_layout_param ('story_follow_links',
  							$baseurl, $url);

  while (1) {
    if ($self->{hit_file_size_limit}) { last; }

    if (
      $page =~ s/<a\s+[^>]*href=\s*\"([^\">]+)\"[^>]*>(.+?)<\/a>//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*\'([^\'>]+)\'[^>]*>(.+?)<\/a>//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*([^\s>]+)[^>]*>(.+?)<\/a>//is
      )
    {
      my $link = $1;
      my $txt = $2;

      $link =~ s/^(?:\"|\'|%22)*//; $link =~ s/(?:\"|\'|%22)*$//;
      if ($followlinks) {
	push (@turnoverlinks, $link);

      } elsif ($txt =~ m,(more|next|\d+ of \d+|&gt;&gt;),i) {
	my $urlguts = '.';
	($baseurl =~ /^http:\/\/\S+\.([^\.\/]+\.[^\.\/]+\/).*$/) and
	    ($urlguts = $1);

	if (($txt !~ /[a-z0-9] [a-z0-9]+ [a-z0-9]+ [a-z0-9]/i) # 5 or more words
	    && (length ($txt) < 15)
	    && $link =~ m/$urlguts/)
	{
	  push (@turnoverlinks, $link);
	  $txt =~ s/[\n\r]+/ /g;
	  &Scoop::verbose ("(Following 'next page' link: \"$txt\")");
	}
      }
      next;
    }

    last;		# no more links available
  }

  @turnoverlinks;
}

# ---------------------------------------------------------------------------

sub get_contents_turnover_links {
  my $self = shift;

  my $url = shift;
  my $key = shift;
  my $page = shift;

  my $followlinks = Scoop::get_layout_param ('links_follow_links', $key, $url);
  if (!$followlinks) {
    return ();
  }

  my @turnoverlinks = ();

  while ($page =~ s,<a\s+[^>]*href=\s*(?:\"|\'|%22)?([^>]+)(?:\"|\'|%22)?>(.+?)</a>,,is)
  {
    my $link = $1;
    my $txt = $2;

    push (@turnoverlinks, $link);
    # we don't do the automatic "more/next/page x of y" stuff
    # that we do with the story pages
  }

  @turnoverlinks;
}

# ---------------------------------------------------------------------------

sub remove_an_ext_link {
  my $self = shift;

  my ($link, $text, $ahref, $posthref) = @_;

  if (!&Scoop::writing_html) {
    return $text;
  }

  if (defined (${$self->{output_links_snarfed}}{$link})
   || ($link =~ /__HASH__/ && defined (${$self->{output_links_snarfed}}{$`}))
              || $CF::nolinkrewrite)
  {
    $ahref.$link.$posthref.$text."</a>";
  } else {
    &Scoop::dbg ("Removing non-snarfed link: $link (\"$text\")");
    "<u>".$text."</u>";		# without <a href=...> </a>
  }
}

sub remove_external_links {
  my $self = shift;

  local ($_) = $_[0];

  #&Scoop::dbg (join(' ', sort keys %{$self->{output_links_snarfed}}));

  s/(<a\s+[^>]*href=\s*[\"\'])([^\"\']+)([\"\'][^>]*?>)(.*?)<\/a>/
	  $self->remove_an_ext_link ($2, $4, $1, $3);
      /gies;

  # fix REAL external links so they're now active and valid
  s/HREF_EXTERNAL/href/gs;

  $_;
}

# We could do this smarter, but it looks really gross when converted to
# DOC format -- and this tool is primarily for that conversion. Sorry!
# This also works well for iSilo, because iSilo's rendering of <pre> text
# is pretty rotten.
#
sub clean_preformatted_text {
  my $self = shift;

  my $txt = shift;
  $txt =~ s/[ \t]+\n/\n/g;
  $txt =~ s/<(|\/)(pre|code)>//g;	# strip extra <pre> tags!

  # convert blank lines to a paragraph separator.
  $txt =~ s/\n{1,}\n/<p>\n\n/g;

  # The idea with this one is to add a <br> at the end of lines shorter
  # than 50 columns, and conversely to allow lines longer than 50 cols to
  # run into the next line as if they were part of a paragraph.  I'm not
  # sure about it, but a lot of <pre> sites are just copies of emails, so
  # it can make them look a lot better, since the Palm's screen is a
  # lot narrower than 80 columns (which is what most <pre> pages aim for).
  # REVISIT - Make this a .site file parameter?
  $txt =~ s/\n\s*(.+[<>].+)\s*\n/<br>\n$1<br>\n/g;
  $txt =~ s/\n\s*([^\n]{1,50})\s*\n/\n$1<br>\n/g;

  $txt =~ s/[ \t]+/ /g;
  $txt;
}

sub translate_link {
  my $self = shift;
  my $baseurl = $self->{url};
  my ($url, $link, $text, $ahref, $posthref) = @_;

  if (!&Scoop::writing_html) {
    return $text;
  }
  if (!defined $ahref) { $ahref = "<a href="; }
  if (!defined $posthref) { $posthref = ">"; }

  my $nolink;
  if ($text =~ /\S/) {
    $nolink = "<u>".$text."</u>";
  } else {
    $nolink = " ";		# sometimes there's no text!
  }

  $link = &Scoop::AbsoluteURL ($url, $link);
  return $nolink if ($link !~ /^(http|file):/i);	# only supported links

  if ($CF::nolinkrewrite) {
    return $ahref."\"".$link."\"".$posthref.$text."</a>";
  }

  # translate to printable version first, in case the StoryURL pattern
  # only covers the printable style.
  $link = $self->make_printable ($link, 0);

  # Is the link one that we will be downloading? If not, just de-linkify
  # it. 1-level sites never have active links so we can just assume
  # the links should not be links.
  my $limitto = $SCF::story_limit_to{$baseurl};

  if (!Scoop::match_url ($link, $limitto)
    && &Scoop::URLWithoutAnchor ($link) ne &Scoop::URLWithoutAnchor ($url))
  {
    # check the contents/issue levels as well.
    my $ok = 0;

    for ($lev = $SCF::levels{$baseurl}; $lev >= 0; $lev--) {
      my $key = "$lev $baseurl";
      $limitto = $SCF::links_limit_to{$key};
      if (defined $limitto) {
	if (Scoop::match_url ($link, $limitto)) { $ok = 1; last; }
      }
    }

    if ($ok == 0) {
      if ($self->{warn_about_ext_links}) {
	&Scoop::dbg ("External link not translated: $link");
      }
      return $nolink;

      # REVISIT -- provide links at end of stories
    }
  }
  #&Scoop::dbg ("Translating link: $link");
 
  # Note that we always put in quotes around the URL.
  # remove_external_links, which is run later, requires this (and anyway
  # it makes for better HTML).
  #
  if ($CF::fileperpage) {
    my ($fname, $relative) = $self->href_to_multipage_anchor ($link);
    $ahref."\"".$relative."\"".$posthref.$text."</a>";
  } else {
    my $anchor = $self->href_to_singlepage_anchor ($link);
    $ahref."\"#".$anchor."\"".$posthref.$text."</a>";
  }
}

# try to preserve images used as capital letters starting a story. NYTimes
# does this.
sub clean_inline_images {
  my $self = shift;
  my $url = shift;
  my $tag = shift;
  my $baseurl = $self->{url};

  my $usealt = Scoop::get_layout_param ('use_alt_tags', $baseurl, $url);
  if (defined ($usealt) && $tag =~ /(?:^|\s)src=\"?([^\"> ]+)\"?(?:$|\s)/is)
  {
    my $src = $1;

    if ($tag =~ /(?:^|\s)alt=\"?([^\"> ]+)\"?(?:$|\s)/is) {
      my $alt = $1;

      $src = &Scoop::AbsoluteURL($url,$src);
      if (Scoop::match_url ($src, $usealt)) {
	&Scoop::dbg ("using alt tag \"$alt\" for img: $src");
	return $alt;
      } else {
	# &Scoop::dbg ("not using alt tag \"$alt\" for img: $src");
      }
    }
  }

  if ($tag =~ /(?:^|\s)alt=\"?([A-Za-z0-9])\"?(?:$|\s)/is) {
    &Scoop::dbg ("converting one-letter img to letter: $1");
    return $1;
  }

  my $imgurl = Scoop::get_layout_param ('imageurl', $baseurl, $url);
  if ($CF::allowimgs && defined ($imgurl)) {
    if ($tag =~ /(?:^|\s)src=\"?([^\"> ]+)\"?(?:$|\s)/is) {
      my $src = $1;

      $src = &Scoop::AbsoluteURL($url,$src);
      if (Scoop::match_url ($src, $imgurl)) {
	&Scoop::dbg ("keeping img: $src");
	$tag =~ s/(?:^|\s)src=\"?[^\"> ]+\"?(?:$|\s)/ /gis;
	$relative = $self->download_img ($src, $tag);

	return "<img src=\"".$relative."\" $tag>";
      }
    }
  }

  " ";
}

sub download_img {
  my $self = shift;
  my $url = shift;
  my $tag = shift;
  my $baseurl = $self->{url};

  my $type = '.gif';
  if ($url =~ /\.jp[eg]+/i) { $type = '.jpg'; }
  my ($fname, $relative) = $self->href_to_multipage_anchor ($url, $type);

  if (!${$self->{output_links_snarfed}}{$url}) {
    &Scoop::verbose ("Image: $url");

    my $img = $self->get_img ($url, 0);
    if (!defined $img) {
      &Scoop::verbose ("Skipping (get_img returned nothing): $url");
      return;
    }
    if ($Scoop::got_intr_flag) { return ""; }

    $self->cache_page ($url, $img);
    open (OUT, "> $fname") or warn
	  (warn ("failed to write to $fname!\n"), return "");
    binmode OUT;
    print OUT $img;
    close OUT or warn ("failed to write to $fname!\n");

    $self->add_snarfed_link ($url);
    $self->up_file_size ($url, (-s $fname), "image");
  }

  $relative;
}

sub html_to_text {
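  # Render $page into the requested output $format: unwanted tags (scripts,
  # frames, forms, RDF bookkeeping and so on) are stripped, links and
  # <a name> anchors are rewritten for HTML output, and for DOC or plain
  # text output the remaining markup is flattened into text with sensible
  # line breaks.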
  my $self = shift;
  my $url = shift;
  my $page = shift;
  my $format = shift;
  my $baseurl = $self->{url};

  if ((defined $SCF::site_format{$baseurl}) && ($SCF::site_format{$baseurl} eq 'rss')) {
    # Convert the RSS formatting into a nice display, for the index page.
    $page =~ s,<channel>(.*?)<title>(.*?)<\/title>(.*?)<\/channel>,<h2>$2<\/h2> $1 $3,gis;

    my $link;
    $page =~ s/<link>(.*?)<\/link>/
      $link = $1; $link =~ s,^.*<url>(.*)<\/url>.*$,$1,g;
      $link = &Scoop::AbsoluteURL($url,$link);
      '(<a href='.$link.'>'.$link.'<\/a>)';
    /gies;

    $page =~ s,<title>(.*?)<\/title>,<b>$1<\/b> ,gis;
    $page =~ s,<item>,<p>,gis; $page =~ s,<\/item>,<\/p>,gis;

    # the description is converted for RSS 0.91 sites -- the "fat" format
    $page =~ s,<description>(.*?)<\/description>,$1 ,gis;
  }


  # strip tags we know we don't want
  # modified by Albert K T Hui <avatar /at/ deva.net>: allow text in
  # <head> tag, netscape will display it so so should we.
  #
  $page =~ s/<head(?:\s+[^>]+|)>(.*?)<\/head>/$1<br>/gis;
  $page =~ s/<(?:html|body)(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<\/(?:html|body)>/ /gis;
  $page =~ s/<iframe(?:\s+[^>]+|)>.*?<\/iframe>/ /gis;
  $page =~ s/<ilayer(?:\s+[^>]+|)>.*?<\/ilayer>/ /gis;
  $page =~ s/<layer(?:\s+[^>]+|)>.*?<\/layer>/ /gis;
  $page =~ s/<\/?frame(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<\/?frameset(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<script(?:\s+[^>]+|)>.*?<\/script>/ /gis;
  $page =~ s/<style(?:\s+[^>]+|)>.*?<\/style>/ /gis;	# not yet
  $page =~ s/<!--.*?-->/ /gis;			# MSIE-style comments
  $page =~ s/<!--[^>]+>/ /gis;			# Netscape-style comments
  $page =~ s/<form(?:\s+[^>]+|)>.*?<\/form>/ /gis;
  $page =~ s/<image(?:\s+[^>]+|)>.*?<\/image>/ /gis;	# RDF tag
  $page =~ s/<channel(?:\s+[^>]+|)>.*?<\/channel>/ /gis;	# RDF tag
  $page =~ s/<map(?:\s+[^>]+|)>.*?<\/map>/ /gis;
  $page =~ s/<applet(?:\s+[^>]+|)>.*?<\/applet>/ /gis;
  $page =~ s/<item(?:\s+[^>]+|)>.*?<\/item>/ /gis;	# some RDF items
  $page =~ s/<link(?:\s+[^>]+|)>.*?<\/link>/ /gis;	# some RDF items
  $page =~ s/<title(?:\s+[^>]+|)>.*?<\/title>/ /gis;	# some RDF items
  #$page =~ s/<(?:table|td|tr)(?:\s+[^>]+|)>/ /gis;	# TO INVESTIGATE
  #$page =~ s/<\/(?:table|td|tr)>/ /gis;	# TO INVESTIGATE
  $page =~ s/<meta\s+[^>]+>/ /gis;
  $page =~ s/<link\s+[^>]+>/ /gis;	# reported by Olivier Lamer

  # Handle inline images.
  $page =~ s/<img\s+([^>]*)>/$self->clean_inline_images($url, $1)/gies;

  # try to add closing tags, since we may have stripped off the original
  # ones. This allows us to return formatting to the baseline before
  # going on to the next page in a multi-page site.
  #
  if ($Scoop::add_closing_tags) {
    my $tag;
    foreach $tag (qw(blockquote b h1 h2 h3 h4 h5 h6 div em
			  i u code small big strong pre li ul ol font a td
			  tr table))
    {
      if ($page =~ m#^.*<\s*${tag}(?:\s+[^>]+|)\s*>#is &&
			$' !~ m#<\s*/\s*${tag}\s*>#is)
      {
	&Scoop::dbg ("re-adding stripped closing tag: </$tag>");
	$page .= "</".$tag.">";
      }
    }
  }

  # convert <pre> text to proper HTML, it displays better.
  $page =~ s/<pre>(.*?)<\/pre>/$self->clean_preformatted_text($1);/gies;
  $page =~ s/<code>(.*?)<\/code>/$self->clean_preformatted_text($1);/gies;

  # strip all existing line breaks, they will just confuse matters
  # when we convert to text or HTML. It's also easier to do proper diffs
  # when we control the placement of newlines.
  $page =~ s/[\r\n]+/ /gs;

  if ($format == $CF::OUT_DOC) {
    # Create DOC bookmarks at <a name> tags
    # From Brian Lalor <blalor@hcirisc.cs.binghamton.edu>
    # via Christopher Heschong's <chris@screwdriver.net>
    # webpage-to-prc converter. Nice one lads, good trick!
    $page =~ s/<a\s+name.*?>/$Scoop::bookmark_char /gis;
  }

  if ($format == $CF::OUT_HTML) {
    if (!$CF::fileperpage) {
      $one_page_anchor = $self->href_to_singlepage_anchor ($url);
      $one_page_anchor =~ s/[^-_A-Za-z0-9]/_/g;

      # the substitutions on the anchor name itself mimic what AbsoluteURL
      # will do to it.
      my ($pre, $post, $ank);
      $page =~ s{<a\s([^>]*)name=[\"\']([^\"\'>]+)[\"\'](|\s[^>]*)>}{
		  $pre = $1; $post = $3; $ank = $2;
		  $ank =~ s/ /_20/g; $ank =~ s/[^-_A-Za-z0-9]/_/g;
		  "<!!!a $pre name=\"${one_page_anchor}__HASH__$ank\" $post>";
		}gies;
      $page =~ s{<a\s([^>]*)name=([^ >]+?)(|\s[^>]*)>}{
		  $pre = $1; $post = $3; $ank = $2;
		  $ank =~ s/ /_20/g; $ank =~ s/[^-_A-Za-z0-9]/_/g;
		  "<!!!a $pre name=\"${one_page_anchor}__HASH__$ank\" $post>";
		}gies;
      $page =~ s{<!!!a}{<a}gs;
    }

    # note the conversion of href= to href!!!=. This stops the second
    # substitution from screwing up the output of the first one!
    $page =~ s/(<a\s+[^>]*href)=\s*(?:\"|%22)([^\">]+)(?:\"|%22)([^>]*?>)(.*?)<\/a>/
	    $self->translate_link ($url, $2, $4, $1.'!!!=', $3);
    	/gies;
    $page =~ s/(<a\s+[^>]*href)=\s*([^>\s\n]+)([^>]*>)(.*?)<\/a>/
	    $self->translate_link ($url, $2, $4, $1.'!!!=', $3);
    	/gies;
    $page =~ s/href!!!=/href=/gis;	# back to normal

    # This works around a bug (I think) in iSilo that makes Wired News
    # indexes look rotten. Shouldn't be harmful anyway.
    $page =~ s/<br>\s*\&nbsp;\s*<br>/<br><br>/gis;

    # clean up useless tags and whitespace at the start and end of the text.
    1 while $page =~ s,^\s*<(?:br|hr|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>,,gis;
    1 while $page =~ s,<(?:br|hr|td|table|p|tr|h\d|div)\s*[^>]*>\s*$,,gis;

    # remove now-empty table items, text markup, paragraphs etc.  the
    # ordering of the tags in the foreach loop is important; strip the
    # "smallest" ones first. (actually, don't do td's, they can
    # affect the formatting quite a lot.)
    #
    # TODO - this is currently offline - some HTML will cause an infinite
    # loop in perl's regular expression implementation.
    #
    if ($Scoop::strip_empty_tag_sets) {
      foreach $tag (qw(b i u em font small big strong code div li ul ol
	  blockquote h1 h2 h3 h4 h5 h6 pre table))
      {
	$page =~ s{<\s*${tag}(?:\s+[^>]*|\s*)>(?:\s+|<\s*br\s*>|\&nbsp;)*<\s*\/\s*${tag}\s*>}{
	  &Scoop::dbg ("stripping now-empty tag set: $&");
	}gies;
      }
    }

    # since we're rendering to HTML, line breaks are OK. Put them back in!
    $page =~
      s,(<(?:br|p|hr|table|td|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>),$1\n,gis;

    # strip colors.
    $page =~ s,(<\S+\s*[^>]*\s)(?:bg|fg|border|)color=[\"']?[-_\#0-9a-z]+[\"']?,$1,gis;
  }

  if ($format == $CF::OUT_DOC || $format == $CF::OUT_TEXT) {
    # We're converting to DOC or text format, so we need to do a lot
    # more work here.

    # a sidebar enclosed by a table? separate it from the rest of the text.
    $page =~ s/<\/tr>/\n\n/gis;
    $page =~ s/<\/table>/\n\n/gis;	# end of <table>
    $page =~ s/<\/pre>/\n\n/gis;	# end of <pre> text
    $page =~ s/<(\/h\d|h\d)(\s+[^>]+|)>/\n\n/gis;	# headings
    $page =~ s/<\/?blockquote(\s+[^>]+|)>/\n\n/gis;	# quotes
    $page =~ s/<hr(\s+[^>]+?|)>/\n\n/gis;	# horiz lines
    $page =~ s/<br(\s+[^>]+?|)>/\n/gis;	# end-of-line markers
    $page =~ s/<li(\s+[^>]+?|)>/\n/gis;	# list items

    $page =~ s/<\/?p(\s+[^>]+?|)>/\n\n/gis;
    # don't worry, multiple blank lines are sorted later

    $page =~ s/<\/td>/\n/gis;		# end-of-table-item

    1 while ($page =~ s/<[^>]+?>//gs);	# trim all other tags

    HTML::Entities::decode_entities($page);

    # trim multiple (blank) bookmarks
    $page =~ s/($Scoop::bookmark_char\s+){2,}/$Scoop::bookmark_char /gs;
  }

  # Convert some HTML entities that the viewers can't handle.
  $page =~ s/\&apos;/\'/gi;	# confuses iSilo
  $page =~ s/\&\#150;/-/gi;	# bad Industry Standard - no cookie!

  $page =~ s/[ \t]+/ /g;	# canonicalise down to one space
  $page =~ s/\n /\n/gs;		# leading w/s on each line
  $page =~ s/\n{3,}/\n\n/gs;	# too many blank lines
  $page =~ s/^\s+//gs;		# blank space at start of story
  $page =~ s/\s+$//gs;		# blank space at end of story

  $page;
}

# ---------------------------------------------------------------------------

sub check_for_oldest {
  my $self = shift;
  my $url = shift;

  my $fullurl = $url; $url = &Scoop::URLWithoutAnchor ($url);
  $url =~ m,http://(\S+?)/,i; my $urlhost = $1;
  return unless defined ($urlhost);

  #&Scoop::dbg ("checking to see if $url is oldest at its site: modtime=".
  	#(defined $Scoop::last_modtime{$url} ? $Scoop::last_modtime{$url} : "unknown)"));

  if (defined $Scoop::last_modtime{$url}) {
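    # keep the smallest (i.e. oldest) Last-Modified time seen for this
    # host; the values are written into the already_seen age cache once
    # the site has been scooped (see scoop_site).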
    if (defined(${$self->{new_already_seen_age_cache_data}}{$urlhost})
	? ${$self->{new_already_seen_age_cache_data}}{$urlhost} > $Scoop::last_modtime{$url} : 1)
    {
      &Scoop::dbg ("oldest link seen at $urlhost $url: modtime=".$Scoop::last_modtime{$url}.
	            " (".&Scoop::time2datestr($Scoop::last_modtime{$url}).")");
      ${$self->{new_already_seen_age_cache_data}}{$urlhost} = $Scoop::last_modtime{$url};
    }
  }
}

# ---------------------------------------------------------------------------

sub pre20_generic_cache_fname {
  my $dir = shift;
  my $url = shift;

  if (!defined $dir) { return undef; }

  $url = &Scoop::URLWithoutAnchor ($url);
  $url =~ s/[^-_A-Za-z0-9]/_/g;

  if ($CF::use_hashes_for_cache_filenames) {
    # try to limit the filename by trimming the start and adding the
    # hash value at the beginning instead. Let's hope this is not
    # too prone to collisions...
    if ($url =~ /^(.+)(.{16})$/) {
      my $hash = unpack ("%16C*", $1);
      $url = sprintf ("%4x_%s", $hash, $2);
    }
  }

  $url = $dir.$CF::slash.$url;	# put it in the directory/folder
  $url;
}

sub upgrade_cache_directory_if_needed {
  local ($_);
  my $olddir = shift;
  $upg_cache_newdir = $olddir;		# in place
  my $ver = 0;

  if (open (IN, "<".$olddir.$CF::slash."cache.cf")) {
    while (<IN>) { /^version (\d+)/ && ($ver = $1+0); }
  }

  if ($ver < 2) {
    warn "Upgrading cache directory to version 2 format...\n";
    $dompat = "(?:com|org|net|gov|mil|int|edu|ie|uk|hu|hr|fr|".
		 "us|de|il|mx|br|nl|se|pl|no|fi|in|gr|be|za)";

    sub upg_cache_file_to_2 {
      # skip dirs, they're part of the new structure.
      if (/_/ && -d $_) { $File::Find::prune = 1; return; }

      return unless (-f _);
      if ( (/^([a-z0-9]+)___([-_a-z0-9]+?_${dompat}(?:_\d+|))_(.*)$/i)
	|| (/^([a-z0-9]+)___(\d+_\d+_\d+_\d+(?:_\d+|))_(.*)$/i)
	|| (/^(file)___()(.*)$/i))
      {
	my ($proto, $site, $path) = ($1, $2, $3);
	$site =~ s/_$//;
	my $url = $proto."://".$site."/".$path;
	my $newname = &generic_cache_fname ($upg_cache_newdir, $url);

	rename ($_, $newname)
		or warn "rename \"$_\" -> \"$newname\" failed!\n";

      } else {
	warn "Didn't know how to upgrade cache file, ignored: $_\n";
      }
    }

    File::Find::find (\&upg_cache_file_to_2, $olddir);

    open (OUT, ">".$olddir.$CF::slash."cache.cf");
    print OUT "version 2\n"; close OUT;
  }
}

sub generic_cache_fname {
  my $dir = shift;
  my $url = shift;

  if (!defined $dir) { return undef; }

  $url = &Scoop::URLWithoutAnchor ($url);		# trim #anchors
  
  my $site;
  my $path;
  if ($url =~ m,^([\w]+://[^/]+)\/(.+)$,) {
    $site = $1; $path = $2;
  } else {
    $site = $url; $path = '_'; $site =~ s/\/$//;
  }
  $site =~ s,^http://,,i; $site =~ s,^www\.,,i;	# trim common stuff
  $site =~ s/[^-_A-Za-z0-9]/_/g;
  $path =~ s/[^-_A-Za-z0-9]/_/g;

  if ($CF::use_hashes_for_cache_filenames) {
    # try to limit the filename by trimming the start and adding the
    # hash value at the beginning instead. Let's hope this is not
    # too prone to collisions...
    if ($site =~ /^(.+)(.{16})$/) {
      $site = sprintf ("%4x_%s", unpack ("%16C*", $1), $2);
    }
    if ($path =~ /^(.+)(.{16})$/) {
      $path = sprintf ("%4x_%s", unpack ("%16C*", $1), $2);
    }
  }

  $site = $dir.$CF::slash.$site;
  if (!-d $site) {
    mkdir ($site, 0777) or die "failed to mkdir '$site'\n";
  }
  $site .= $CF::slash.$path;
  $site;
}
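
# Illustrative sketch of the version-2 cache layout produced above (paths are
# made up; assumes $CF::slash is "/"):
#
#   generic_cache_fname ("/home/user/.sitescooper/cache",
#			 "http://www.example.com/news/today.html#story3")
#     -> "/home/user/.sitescooper/cache/example_com/news_today_html"
#
# The "#story3" anchor and leading "http://www." are dropped, remaining
# non-alphanumerics become "_", and the per-site directory "example_com" is
# created on demand.  Both pieces here are short enough that the hash
# shortening above does not kick in.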

sub cachefilename {
  my $self = shift; &generic_cache_fname ($self->{cachedir}, $_[0]);
}
sub sharedcachefilename {
  my $self = shift; &generic_cache_fname ($self->{sharedcachedir}, $_[0]);
}
sub newcachefilename {
  my $self = shift; &generic_cache_fname ($self->{newcachedir}, $_[0]);
}

sub get_cached_page {
  my $self = shift;
  my $url = shift;
  my $is_diff_page = shift;
  my $cachefile = $self->cachefilename ($url);

  if (!defined $cachefile) { return undef; }

  # if -refresh is on, do not return any cached pages.
  # if it and -fromcache are both on, *do* return them.
  if ($CF::refresh && !$CF::use_only_cache) { return undef; }

  if (open (IN, "< $cachefile")) {
    binmode IN; my $cachedpage = join ('', <IN>); close IN; $cachedpage;
  } else {
    undef;
  }
}

sub get_cached_page_for_diff {
  my $self = shift; $self->get_cached_page (@_, 1);
}

sub get_page {
  my $self = shift;
  $self->http_get (@_, 0);	# only text content types
}

sub get_img {
  my $self = shift;
  $self->http_get (@_, 1);	# allow binary files
}

sub http_get {
  my $self = shift;
  my $url = shift;
  my $is_dynamic_html = shift;
  my $allow_binary = shift;
  my $baseurl = $self->{url};

  my $page = '';

  $url = &Scoop::URLWithoutAnchor ($url);
  my $cachefile = $self->cachefilename ($url);
  my $cachedpage = $self->get_cached_page ($url);
  $self->check_for_oldest ($url);
  my $lastmod;

  $Scoop::useragent->clear_redirect ();

  if (defined $cachefile && defined $cachedpage) {
    if ($is_dynamic_html == 0) {
      &Scoop::dbg("cached version exists");
      return $cachedpage;

    } elsif (defined (-M $cachefile)
    	&& -M _ < $CF::cached_front_page_lifetime
	&& -M _ > 0)		# just make sure the clock is sane
    {
      &Scoop::dbg("cached version is new enough: ".(-M $cachefile)." days");
      return $cachedpage;

    } elsif ($CF::use_only_cache) {
      &Scoop::dbg("-fromcache switch is on, using cached version");
      return $cachedpage;
    }
  }

  # see if we have it in the shared cache
  if (defined $CF::sharedcache) {
    $cachedpage = undef;
    $cachefile = $self->sharedcachefilename ($url);
    if (defined $cachefile && (open (IN, "< $cachefile"))) {
      binmode IN; $cachedpage = join ("", <IN>); close IN;
      if ($cachedpage =~ s/^<!-- lastmod: (\d+) -->//) {
	$lastmod = $1+0;
      }
    }

    if (defined $cachefile && defined $cachedpage) {
      if ($is_dynamic_html == 0) {
	&Scoop::dbg("shared-cache version exists: $cachefile");
	if (defined $lastmod)
	      { $Scoop::last_modtime{$url} = $lastmod; $self->check_for_oldest ($url); }
	return $cachedpage;

      } elsif (defined (-M $cachefile)
	  && -M _ < $CF::cached_front_page_lifetime && -M _ > 0)
      {
	&Scoop::dbg("shared-cache version is new enough: ".(-M $cachefile)." days");
	if (defined $lastmod)
	      { $Scoop::last_modtime{$url} = $lastmod; $self->check_for_oldest ($url); }
	return $cachedpage;

      } elsif ($CF::use_only_cache) {
	&Scoop::dbg("-fromcache switch is on, using shared-cache version");
	if (defined $lastmod)
	      { $Scoop::last_modtime{$url} = $lastmod; $self->check_for_oldest ($url); }
	return $cachedpage;
      }
    }

    undef $cachedpage;	# if it didn't pass those tests, don't keep it!
  }
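
  # Illustrative sketch of a shared-cache file as parsed above: the page body
  # is stored verbatim, with one optional header line prepended by
  # cache_page() (the epoch timestamp shown is made up):
  #
  #   <!-- lastmod: 951868800 -->
  #   <html><head><title>...</title></head><body>...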

  if ($CF::use_only_cache) {
    &Scoop::dbg("-fromcache switch is on, not doing HTTP request");
    return undef;
  }

  if (!$allow_binary && ($url =~ /\.(ra|ram|wav|jpeg|jpg|gif|mov|zip|rar)$/i
    	|| $url =~ /\.(tar|tgz|gz|tbz|bz2|rpm|swf|mpeg|mpg)$/i))
  {
    &Scoop::dbg("not retrieving non-HTML content: $url");
    return undef;
  }

  my $resp;
  my $retries;

  for ($retries = 0; $retries < 4; $retries++) {
    if ($Scoop::got_intr_flag) { return undef; }

    my $parresp = Sitescooper::ParProxy::have_response ($url);
    if (defined $parresp) {
      &Scoop::dbg ("preload: using preloaded response");
      $resp = $parresp; $parresp = undef;

    } else {
      # REVISIT - support POST
      my $req = new HTTP::Request ('GET', $url);

      $req->header ("Accept-Language" => "en",
	    "Accept-Charset" => "iso-8859-1,*,utf-8");

      # cookie_jar will assume that it's an HTTP request. Reasonable enough
      # I suppose...
      if ($url =~ /^http:/i) {
	$Scoop::cookie_jar->add_cookie_header($req);
      }

      $resp = undef;
      $cmd = '$resp = $Scoop::useragent->request ($req);';

      my $timeout = 10;	# minutes
      undef $Sitescooper::UserAgent::last_auth_realm;

      # REVISIT -- implement timeout for Win32 perl
      if (&Scoop::MyOS eq 'UNIX') {
	eval '
	  local $SIG{"ALRM"} = sub { die "alarm\n" };
	  alarm $timeout*60; { ' . $cmd. ' } alarm 0;
	';
      } else {
	eval $cmd;
      }

      die if $@ && $@ ne "alarm\n";
      if ($@) {
	&Scoop::sitewarn ("HTTP GET timed out, $timeout minutes without a response.");
	&got_intr;
      }
    }
    if ($Scoop::got_intr_flag) { return undef; }

    if (!$resp->is_success) {
      &Scoop::sitewarn ("HTTP GET failed: ".$resp->status_line." ($url)");
      if ($resp->status_line =~ /^401/ && defined ($Sitescooper::UserAgent::last_auth_realm))
      {
	if (defined ($Sitescooper::UserAgent::site_logins{$Sitescooper::UserAgent::last_auth_realm}))
	{
	  &Scoop::verbose ("Deleting incorrect username and password for this realm.");
	  delete $Sitescooper::UserAgent::site_logins{$Sitescooper::UserAgent::last_auth_realm};
	  delete $Sitescooper::UserAgent::site_passes{$Sitescooper::UserAgent::last_auth_realm};
	  next;		# re-request page
	} else {
	  &Scoop::verbose ("Cannot read page, it requires a username and password.");
	}
      }
      return undef;
    }

    if (!$allow_binary && (defined($resp->content_type) &&
      	$resp->content_type ne '' && $resp->content_type !~ /^(text\/|multipart\/)/))
    {
      &Scoop::verbose ("Non-text content: Content-Type: ".$resp->content_type.".");
      return undef;
    }

    if (defined $SCF::need_login_url{$baseurl}) {
      if (Scoop::match_url ($resp->base, $SCF::need_login_url{$baseurl})) {
	&Scoop::verbose ("Page requires a username and password, requesting...");
	&get_basic_credentials ($Scoop::useragent, $baseurl, $url);
	next;		# re-request page
      }
    }

    $page = $resp->content;
    # handle (ugh) meta-tag "refresh" redirects
    if ($page =~ /meta\s+http-equiv=\"refresh\"\s+content=\"([^\"]*)url=([^\"\;]+)[\"\;]/is)
    {
      my $timeout = $1;
      $url = $2;

      if ($timeout =~ /(\d+)/ && $1+0 > 60) {
	&Scoop::dbg ("Ignored slow meta-tag refresh: \"$timeout url=$url\"");
      } else {
	&Scoop::verbose ("Redirected by META tag to: $url");
	$Scoop::useragent->note_redirect ($url);
	next;
      }
    }
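
    # Illustrative example (made-up URL) of a meta-tag redirect the pattern
    # above catches; "5" is taken as the refresh timeout, so this one would
    # be followed:
    #
    #   <meta http-equiv="refresh" content="5; url=http://www.example.com/today/">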

    last;		# break out of for loop
  }

  if (defined $resp->last_modified) {
    $lastmod = $resp->last_modified;
    &Scoop::dbg ("last-modified time for $url: $lastmod (".&Scoop::time2datestr($lastmod).")");

    if (defined $Scoop::last_modtime{$url} && defined($lastmod)
      && $lastmod <= $Scoop::last_modtime{$url} && !$CF::refresh
      && !$is_dynamic_html && !$allow_binary)
    {
      &Scoop::verbose ("Skipping (no mod since last download): $url");
      return undef;
    }

  } else {
    &Scoop::dbg ("last-modified time for $url: not provided");
    $lastmod = time;
  }
  $Scoop::last_modtime{$url} = $lastmod; $self->check_for_oldest ($url);

  if (!$is_dynamic_html && defined $cachedpage && $cachedpage eq $page
    	&& !$CF::refresh && !$allow_binary)
  {
    &Scoop::verbose ("Skipping (HTML has not changed): $url");
    return undef;
  }
  $page;
}

# ---------------------------------------------------------------------------
 
sub cache_page {
  my $self = shift;
  my ($url, $page, $cachelater) = @_;

  $url = &Scoop::URLWithoutAnchor ($url);
  my $cachefile = $self->newcachefilename ($url);

  # if this page is the latest version of a diffed page, don't cache it
  # immediately, as it will mean lost stories if we're interrupted.
  # Instead save the filename for renaming when the run finishes.
  if (defined $cachelater && $cachelater == 1) {
    my $tmpname = $cachefile.'.tmp';
    $CF::caches_to_rename{$tmpname} = $cachefile;
    $cachefile = $tmpname;
  }

  if (open (C1OUT, "> $cachefile")) {
    binmode C1OUT; print C1OUT $page; close C1OUT;
  } else {
    warn "cannot write cache file \"$cachefile\": $!\n";
  }

  if (defined $CF::sharedcache) {
    $cachefile = $self->sharedcachefilename ($url);
    if (open (C2OUT, "> $cachefile")) {
      binmode C2OUT;
      if (defined $Scoop::last_modtime{$url}) {
	# cache the last-modified time of this page as well.
	print C2OUT "<!-- lastmod: ",$Scoop::last_modtime{$url}," -->\n";
      }
      print C2OUT $page; close C2OUT;
    } else {
      warn "cannot write shared cache file \"$cachefile\": $!\n";
    }
  }
  $page;
}
 
sub cache_page_later { &cache_page ($_[0], $_[1], $_[2], 1); }
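
# The deferred renames themselves happen elsewhere, at the end of the run; a
# minimal sketch of what that step presumably does with %CF::caches_to_rename
# (illustrative only -- the real end-of-run code is not in this module):
#
#   while (my ($tmpname, $cachefile) = each %CF::caches_to_rename) {
#     rename ($tmpname, $cachefile)
#	or warn "rename \"$tmpname\" -> \"$cachefile\" failed!\n";
#   }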

# ---------------------------------------------------------------------------

sub write_as_story {
  my $self = shift;
  local ($_);
  my ($is_front, $url, $page, $headline, $upindex) = @_;
  my $baseurl = $self->{url};

  my $fullurl = $url;
  $url = &Scoop::URLWithoutAnchor ($url);

  my $sitename = $self->{sitename};
  if (!defined $sitename) { $sitename = $url; }

  my $proc = Scoop::get_layout_param ('table_render', $baseurl, $url);
  if (!defined $proc || $proc eq 'keep') {
    # do nothing

  } elsif ($proc eq 'list') {
    require Exten::Table;
    my $parser = Exten::Table->new();
    $parser->parse($page);
    $page = $parser->get_result();
    
  } else {
    &Scoop::sitewarn("TableProcess \"$proc\" invalid");
  }

  $proc = Scoop::get_layout_param ('story_postproc', $baseurl, $url);
  if (defined $proc) {
    my $bookmark_char = $Scoop::bookmark_char;	# convenience for PostProc
    $_ = $page;
    if (!eval $proc."; 1;") {
      &Scoop::sitewarn("StoryPostProc failed: $@");
      # and keep the original $page
    } elsif (!defined $_) {
      &Scoop::sitewarn("StoryPostProc failed (\$_ is undefined): $@");
      # and keep the original $page
    } else {
      $page = $_;
    }
  }
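
  # Illustrative example of the kind of per-site StoryPostProc snippet the
  # eval above runs: the story text arrives in $_, and whatever is left in $_
  # afterwards becomes the new page.  The markup matched here is made up:
  #
  #   s/<div class="advert">.*?<\/div>//gis;	# strip inline ad blocks
  #   s/^.*?<!-- story starts -->//is;		# drop everything before the story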

  my $outtext = '';
  my $one_page_anchor;

  if (&Scoop::writing_html) {
    $one_page_anchor = $self->href_to_singlepage_anchor ($url);

    if ($CF::writeheader) {
      $outtext .=
	"\n\n<hr><font size=1><i>$sitename: <a HREF_EXTERNAL=$url>$url</a></i></font><br>"
	. "<a name=\"$one_page_anchor\">\n"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($self->{current_story_index}-1)."\">&lt;&lt;</a>]"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($upindex)."\">^</a>]"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($self->{current_story_index}+1)."\">&gt;&gt;</a>]<br>\n\n";

    } else {
      $outtext .= "<a name=\"$one_page_anchor\">\n";
    }
    $outtext .= $page;

  } else {
    $outtext .= "------------\n";
    if ($CF::writeheader) {
      $outtext .= "$sitename: $url\n\n";
    }
    if (&Scoop::writing_doc) {
      if (defined $headline) {
	&Scoop::verbose ("(Headline: $headline)");
	$outtext .= "$Scoop::bookmark_char $headline\n";
      } else {
	# use the first line in the story instead
	$outtext .= "$Scoop::bookmark_char ";
      }
    }

    foreach $_ (split (/\n/, $page)) {
      if (&Scoop::writing_text) {
	# wrap each line after 70 columns
	while (s/^(.{70}\S*)\s+//) {
	  $outtext .= $1."\n";
	}
      }
      $outtext .= $_."\n";
    }

    $outtext .= "\n\n\n";
  }

  if ($CF::fileperpage) {
    my ($fname, $relative) = $self->href_to_multipage_anchor ($url);

    if ($is_front && $fullurl eq $baseurl) {
      # this is the front page, just append it to the index file
      ${$self->{output_file}}{'MAIN'} .= $outtext;

    } else {
      if (&Scoop::writing_html) {
	${$self->{output_file}}{$fname} = "<html><head></head><body>".
			  $outtext."</body></html>";
      }
    }

    if (&Scoop::writing_html) {
      $self->add_snarfed_link ($relative);
      ${$self->{output_story_urls}}[$self->{current_story_index}] = $relative;

      if ($fullurl ne $url) {
	my ($ffname, $frelative) = $self->href_to_multipage_anchor ($fullurl);
	$self->add_snarfed_link ($ffname);
	$self->add_snarfed_link ($frelative);
      }

    } else {
      ${$self->{output_file}}{$fname} .= $outtext;
    }

  } else {
    # this is single-page output mode.
    ${$self->{output_file}}{'MAIN'} .= $outtext;

    if (&Scoop::writing_html) {
      $self->add_snarfed_link ('#'.$one_page_anchor);
      ${$self->{output_story_urls}}[$self->{current_story_index}]
      						= '#'.$one_page_anchor;

      if ($one_page_anchor =~ /__HASH__/) {
	$self->add_snarfed_link ('#'.$`);
      }
    }
  }

  $self->{current_story_index}++;
  $self->add_snarfed_link ($url);
  if ($fullurl ne $url) { $self->add_snarfed_link ($fullurl); }
  $self->up_file_size ($url, length($outtext), "story");
  $self->{stories_found}++;

  if ($CF::storylimit > 0 && $self->{stories_found} >= $CF::storylimit) {
    &Scoop::verbose ("over story limit, stopping this site.");
    $self->{hit_file_size_limit} = 1;
  }
}

# ---------------------------------------------------------------------------

sub up_file_size {
  my $self = shift;
  my $url = shift;
  my $siz = shift;
  my $typetxt = shift;

  $self->{file_size} += $siz;
  &Scoop::dbg ("$typetxt written, ".
  	($self->{file_size}/1024)." K, limit ".
	$CF::filesizelimit." K, site limit ".
	(defined ($SCF::size_limit{$url}) ?
			$SCF::size_limit{$url}." K" : "unset"));

  if ($self->{file_size}/1024 >= $CF::filesizelimit) {
    $self->{hit_file_size_limit} = 1;
  }
  if (defined $SCF::size_limit{$url} && $SCF::size_limit{$url} != 0) {
    if ($self->{file_size}/1024 >= $SCF::size_limit{$url}) {
      $self->{hit_file_size_limit} = 1;
    }
  }
}

# ---------------------------------------------------------------------------

sub add_snarfed_link {
  my $self = shift;
  my $url = shift;
  #&Scoop::dbg ("Tracking snarfed link: $url");		# J M D
  ${$self->{output_links_snarfed}}{$url} = 1;
}

# ---------------------------------------------------------------------------

sub href_to_multipage_anchor {
  my $self = shift;
  my $url = shift;
  my $type = shift;

  if (!defined $type) { $type = '.html'; }

  my $anchor = &Scoop::URLAnchor ($url);
  $url = &Scoop::URLWithoutAnchor ($url);

  if (!defined $self->{page_to_tmpfile}{$url}) {
    # try to limit the filename by trimming the start and adding the
    # hash value at the beginning instead. Let's hope this is not
    # too prone to collisions...

    my $hash = unpack ("%16C*", $url);
    my $h = sprintf ("%04x", $hash);
    my $acc = 'aaa';
    my $name = $self->{outtmp}.$CF::slash.$h.$acc.$type;

    while (defined $tmpfile_assigned{$name})
    		{ $name = $self->{outtmp}.$CF::slash.$h.(++$acc).$type; }

    $self->add_page_tmpfile ($name, $url);
    &Scoop::dbg ("page file for $url: $self->{page_to_tmpfile}{$url}");
  }

  $self->{page_to_tmpfile}{$url} =~ /[\\\/\:]([-_\.A-Za-z0-9]+)$/;
  ($self->{page_to_tmpfile}{$url}, $1.$anchor);
}
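
# Illustrative sketch of the per-page temp-file names generated above,
# assuming $self->{outtmp} is "/tmp/site.tmp" (the checksum value is made up):
#
#   first URL  -> /tmp/site.tmp/1a2baaa.html	(16-bit checksum + "aaa")
#   a second URL with the same checksum gets /tmp/site.tmp/1a2baab.html,
#   and so on, the "aaa" suffix string-incrementing until an unused name
#   is found.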

sub clear_page_tmpfiles {
  my $self = shift;
  %{$self->{page_to_tmpfile}} = ();
}

sub add_page_tmpfile {
  my $self = shift;
  my $fname = shift;
  my $url = shift;
  $self->{page_to_tmpfile}{$url} = $fname;
  $tmpfile_assigned{$fname} = 1;
}

sub href_to_singlepage_anchor {
  my $self = shift;
  my $anchor = shift;
  $anchor =~ s/#/__HASH__/g;
  $anchor =~ s/[^-_A-Za-z0-9]/_/g;
  $anchor;
}
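
# Illustrative example of the anchor mangling above (URL is made up):
#
#   href_to_singlepage_anchor ("http://www.example.com/news/1.html#part2")
#     -> "http___www_example_com_news_1_html__HASH__part2"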

1;

