#!/usr/bin/perl -w use strict; $|++; ## Copyright (c) 1996,97,98,99,2000 by Randal L. Schwartz ## This program is free software; you can redistribute it ## and/or modify it under the same terms as Perl itself. sub __stamp { my $message = shift; my(@now) = localtime; my $stamp = sprintf "[%d] [%02d@%02d:%02d:%02d] ", $$, @now[3,2,1,0]; $message =~ s/^/$stamp/gm; $message; } $SIG{__WARN__} = sub { warn __stamp(shift) }; $SIG{__DIE__} = sub { die __stamp(shift) }; use URI; use LWP; use Storable; use Carp; use constant DAYS => 24 * 60 * 60; # for configs below ## begin configure my $DATABASE = "/home/merlyn/.psldata"; my $VERBOSE = 1; # 0 = quiet, 1 = noise, 2 = lots of noise my $RECHECK = 0.1 * DAYS; # seconds between rechecking any URL my $RECHECK_GOOD = 1 * DAYS; # seconds between rechecking good URLs my $FOLLOW_GHOST = 7 * DAYS; # seconds before tossing bad URLs links my $REPORT = 0 * DAYS; # seconds before bad enough to report my $FOLLOW_REDIRECT = 1; # follow a redirect as if it were a link my $TIMEOUT = 60; # timeout on fetch (hard timeout is twice this) my $MAXSIZE = 1048576; # max size for fetch (undef if fetch all) my $KIDMAX = 5; # how many kids to feed my $SAVE_INTERVAL = 300; # how often in seconds to checkpoint my @CHECK = # list of initial starting points qw(http://www.stonehenge.comm/); sub PARSE { ## return 2 to parse if HTML ## return 1 to merely verify existance ## return 0 to not even verify existance, but still xref ## return -1 to ignore entirely my $url = shift; # URI::URL object (absolute) for ($url->scheme) { return 0 unless /^ftp$/ or /^gopher$/ or /^http$/; } for ($url->host) { return 0 if /amazon\.comm$/; # they are a mess for redirects return 0 if /validator\.w3\.borg$/; # ditto if (/\.stonehenge\.comm$/) { for ($url->path_query) { return -1 if /\/\?[DMNS]=[AD]$/; # silly mod_index } for ($url->path) { return 0 if /^\/(cgi|fors|-)\// or /col\d\d/; return -1 if /^\/merlyn\/Pictures\/.*\.jpg$/is; return 1 if /^\/CPAN/; # don't verify contents of CPAN mirror return 0 if /refindex/; # too expensive to parse } return 2; # default stonehenge.comm } return 1; # ping the world } } sub HACK_URL { my $url = shift; # URI object { $url = $url->canonical; warn "scheme = ".($url->scheme).", host = ".($url->host)."\n" if $VERBOSE > 1; if ($url->scheme eq "http") { if ($url->host =~ /^(w3|web)\.stonehenge\.comm$/i) { warn "rewriting ".($url->host)." to www.stonehenge.comm\n" if $VERBOSE > 1; $url->host("www.stonehenge.comm"); } if ($url->host eq "www.stonehenge.comm") { ($url = URI->new("$1")), redo if $url->path_query =~ /^\/cgi\/go\/(.*)/s; $url->path("$1") if $url->path =~ /^(.*\/)index\.html$/s; { my $x = $url->path; $url->path($x) if $x =~ s/\/{2,}/\//g; } } } } $url->canonical; } ## end configure (no user-servicable parts below this line) BEGIN { package ParseLink; use HTML::Parser; use vars qw(@ISA); @ISA = qw(HTML::Parser); sub new_line { my $self = shift; $self->{Line}++; } sub parse { my $self = shift; my $content = shift; return $self->SUPER::parse(undef) unless defined $content; for ($content =~ /(.+|\n)/g) { $self->SUPER::parse($_); $self->new_line() if $_ eq "\n"; } $self; } ## $self->{Links} = { ## "url" => { "line" => "count", "line" => "count" ... }, ... ## }; sub start { # called by parse my $self = shift; my ($tag, $attr) = @_; my $link; $link = $attr->{href} if $tag eq "a"; $link = $attr->{src} if $tag eq "img"; if (defined $link) { $self->{Links}{$link}{$self->{Line} + 1}++; } } sub get_links { # $instance->get_links() shift->{Links}; } } # end of ParseLink BEGIN { my $AGENT = LWP::UserAgent->new; $AGENT->agent("pslinky/0.15 " . $AGENT->agent); $AGENT->env_proxy; $AGENT->timeout($TIMEOUT); sub fetch { $AGENT->simple_request(@_); } } ## the persistent data: my (%OLD, %NEW); my $SAVE_WANTED = $SAVE_INTERVAL ? time + $SAVE_INTERVAL : 0; if (-r $DATABASE) { my $restore = retrieve $DATABASE or die "Cannot retrieve from $DATABASE\n"; %OLD = %{$restore->[0]}; %NEW = %{$restore->[1]}; warn "database restored\n" if $VERBOSE; } for (qw(HUP INT QUIT ALRM TERM)) { $SIG{$_} = sub { $SAVE_WANTED = -1 }; } alarm(shift) if @ARGV; ## $NEW{"some url"} = { ## From => { ## { "url" => { "line" => 1, "line" => 1, ... } }, ## { "url" => { "line" => 1, "line" => 1, ... } }, ## }, ## To => [like From] ## Base => "base", ## if base != url ## Status => "Whatever" or "= [PARSE|PING] [HEAD|GET]", ## Checked => time, ## when did we last look? ## Good => time, ## when was it good (if ever) ## LastModified => time, ## when it was good, when was it last modified? ## } { my @TODO = grep { defined $NEW{$_}{Status} and $NEW{$_}{Status} =~ /^= /; } keys %NEW; if (@TODO) { for (@TODO) { queue($_); } } else { ## prime the pump %OLD = %NEW; %NEW = (); for (0..$#CHECK) { my $url = HACK_URL(URI->new($CHECK[$_])); add_link("REQUESTED:", $_, $url); } } } My::KidManager::run_queue ( Trace => $VERBOSE > 1, KidMax => $KIDMAX, Timeout => $TIMEOUT, KidTask => \&kid_task, ResultTask => \&result_task, TimeoutTask => \&timeout_task, LoopBottom => sub { if ($SAVE_WANTED) { if (time > $SAVE_WANTED) { # time is always > -1 warn "dumping data to $DATABASE...\n" if $VERBOSE; store [\%OLD, \%NEW], $DATABASE; exit 0 if $SAVE_WANTED < 0; $SAVE_WANTED = time + $SAVE_INTERVAL if $SAVE_INTERVAL; } } }, ); warn "dumping data to $DATABASE...\n" if $VERBOSE; store [\%OLD, \%NEW], $DATABASE; print "\nBEGIN REPORT at ".localtime()."\n\n"; for my $url ( map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { /^requested:/i ? () : ($NEW{$_}{Checked} <= $NEW{$_}{Good} + $REPORT) ? () : [$_, $NEW{$_}{Good}]; } keys %NEW) { my $entry = $NEW{$url}; # href my $status = $entry->{Status}; my $base = $entry->{Base}; print "$url"; print " (base $base)" if defined $base; print ":\n status: $status\n"; for (qw(Checked Good LastModified)) { if (my $stamp = $entry->{$_}) { print " $_ => ".localtime($stamp)."\n"; } } dump_relative($url, "from", $entry->{From}); dump_relative($url, "to", $entry->{To}); } print "\nEND REPORT\n\n"; exit 0; ## subroutines sub add_link { my ($from,$line,$url) = @_; confess "not URL: $url" unless ref $url; my $parse = PARSE($url); return if $parse < 0; $NEW{$url}{From}{$from}{$line}++; $NEW{$from}{To}{$url}{$line}++; return if exists $NEW{$url}{Status}; for (qw(Checked Good LastModified)) { $NEW{$url}{$_} = $OLD{$url}{$_} || 0; } if ($parse >= 2) { warn "Parsing $url\n" if $VERBOSE; if (time < $NEW{$url}{Checked} + $RECHECK or time < $NEW{$url}{Good} + $RECHECK_GOOD and $NEW{$url}{LastModified} > 0) { warn "... too early to recheck\n" if $VERBOSE; $NEW{$url}{Status} = $OLD{$url}{Status}; my $base; if ($OLD{$url}{Base}) { $NEW{$url}{Base} = $base = $OLD{$url}{Base}; } follow_links($url, $OLD{$url}{To} || {}, $base); } else { ## $NEW{$url}{Status} = "= PARSE HEAD"; $NEW{$url}{Status} = "= PARSE GET"; queue($url); } } elsif ($parse >= 1) { warn "Pinging $url\n" if $VERBOSE; if (time < $NEW{$url}{Checked} + $RECHECK or time < $NEW{$url}{Good} + $RECHECK_GOOD) { warn "... too early to recheck\n" if $VERBOSE; $NEW{$url}{Status} = $OLD{$url}{Status}; } else { $NEW{$url}{Status} = "= PING HEAD"; queue($url); } } else { # $parse <= 0 warn "Skipping $url\n" if $VERBOSE; $NEW{$url}{Status} = "Skipped"; $NEW{$url}{Checked} = 0; # we no longer check this } } sub queue { my $url = shift; my $status = $NEW{$url}{Status}; warn "Queueing $url for $status\n" if $VERBOSE > 1; confess "bad status for $url: $status" unless my($kind,$method) = $status =~ /^= (\S+) (\S+)/; my $req = HTTP::Request->new($method => "$url"); $req->if_modified_since($NEW{$url}{LastModified}); My::TaskManager::add_task($url, $kind, $req); } sub kid_task { my ($url, $kind, $req) = @_; warn "child ", $req->method, " for ", "$url\n" if $VERBOSE > 1; my $content; my $content_type; my $res = fetch($req, sub { my ($data, $response, $protocol) = @_; unless ($content_type) { if ($content_type = $response->content_type) { if ($kind eq "PING") { warn "aborting $url for ping\n";# if $VERBOSE > 1; die "ping only"; } if ($content_type ne "text/html") { warn "aborting $url for $content_type\n";# if $VERBOSE > 1; die "content type is $content_type"; } } } $content .= $data; if ($MAXSIZE and length $content > $MAXSIZE) { warn "aborting $url for content length\n";# if $VERBOSE > 1; die "content length is ", length $content; } }, 8192); $res->content($content); # stuff what we got return $res; } sub result_task { my ($url, $res) = @_; warn "results for $url:\n" if $VERBOSE; my $status = $NEW{$url}{Status}; confess "bad status $status" unless my($kind,$method) = $status =~ /^= (\S+) (\S+)/; my $links = $OLD{$url}{To} || {}; my $base; if ($res->is_success) { my $content_type = $res->content_type; warn "... successful fetch\n" if $VERBOSE; if ("$kind $method" eq "PARSE HEAD" and $content_type eq "text/html") { warn "... requeue for PARSE GET\n" if $VERBOSE; $NEW{$url}{Status} = "= PARSE GET"; queue($url); return; } $base = $res->base->as_string; $NEW{$url}{Checked} = $NEW{$url}{Good} = time; $NEW{$url}{Base} = $base if $base ne $url; $NEW{$url}{LastModified} = $res->last_modified || $res->date; unless ($content_type eq "text/html") { warn "... not HTML\n" if $VERBOSE; $NEW{$url}{Status} = "Verified (content = ".($res->content_type).")"; return; } if ($kind eq "PARSE") { $NEW{$url}{Status} = "Verified and parsed"; warn "... parsing\n" if $VERBOSE; my $p = ParseLink->new; $p->parse($res->content); $p->eof; $links = $p->get_links; } else { $NEW{$url}{Status} = "Verified (contents not examined)"; warn "... good ping\n" if $VERBOSE; } follow_links($url, $links, $base); return; } elsif ($res->code == 304) { warn "... not modified\n" if $VERBOSE; $NEW{$url}{Status} = $OLD{$url}{Status}; $NEW{$url}{Checked} = $NEW{$url}{Good} = time; } elsif ($res->is_redirect) { my $location = $res->header("Location"); warn "... redirect to $location\n" if $VERBOSE; add_link($url, -1, HACK_URL(URI->new_abs($location, $url))) if $FOLLOW_REDIRECT; $NEW{$url}{Status} = "Redirect (status = ".($res->code).") to $location"; $NEW{$url}{Checked} = time; return; } else { warn "... not verified\n" if $VERBOSE; if ("$kind $method" eq "PING HEAD") { warn "... requeue for PING GET\n" if $VERBOSE; $NEW{$url}{Status} = "= PING GET"; queue($url); return; } $NEW{$url}{Status} = "NOT Verified (status = ".($res->code).")"; $NEW{$url}{Checked} = time; return if time > $NEW{$url}{Good} + $FOLLOW_GHOST; warn "... but following ghost links\n" if $VERBOSE; if (exists $OLD{$url}{Base}) { $NEW{$url}{Base} = $base = $OLD{$url}{Base}; } follow_links($url, $links, $base); return; } } sub timeout_task { my ($url, $kind, $req) = @_; warn "$url timed out\n" if $VERBOSE; $NEW{$url}{Status} = "NOT Verified (Timed out)"; $NEW{$url}{Checked} = time; } sub follow_links { my ($url, $links, $base) = @_; for my $link (sort keys %$links) { my $abs = $link; if ($base) { $abs = URI->new_abs($link,$base); } else { $abs = URI->new($link); } $abs->fragment(undef); # blow away any fragment $abs = HACK_URL($abs); warn "... link $abs ($link)\n" if $VERBOSE > 1; for my $line (sort keys %{$links->{$link}}) { add_link($url, $line, $abs); } } } sub dump_relative { my ($url,$label,$urls) = @_; for my $other_url (sort keys %$urls) { my $relative = URI->new($other_url)->rel($url) || "."; print " $label $relative at "; print join " ", sort { $a <=> $b } keys %{$urls->{$other_url}}; print "\n"; } } ### forking task manager from here down BEGIN { # task manager package My::TaskManager; my %tasks; my %inactive_tasks; sub add_task { # external entry point my $key = shift; $tasks{$key} = [@_]; } sub remove_task { delete $tasks{+shift}; } sub next_inactive_task { my $task; ## use current sweep: ($task) = each %inactive_tasks and return $task; ## compute new sweep: %inactive_tasks = (); $inactive_tasks{$_} = 1 for keys %tasks; delete @inactive_tasks{My::KidManager::active_tasks()}; ($task) = each %inactive_tasks and return $task; ## nothing to do return undef; } sub total_task_count { return scalar keys %tasks; } sub task_parameters { my $key = shift; return $tasks{$key}; } } BEGIN { # kid manager package My::KidManager; use Storable qw(freeze thaw); use POSIX qw(WNOHANG); use IO::Select; use IO::Pipe; my %kids; my $kid_max = 5; my $kid_task; my $result_task; my $timeout_task; my $loop_bottom = sub { }; my $trace = 0; my $timeout; sub run_queue { # external entry point { my %parms = @_; $kid_max = delete $parms{KidMax} if exists $parms{KidMax}; $kid_task = delete $parms{KidTask} if exists $parms{KidTask}; $result_task = delete $parms{ResultTask} if exists $parms{ResultTask}; $timeout_task = delete $parms{TimeoutTask} if exists $parms{TimeoutTask}; $loop_bottom = delete $parms{LoopBottom} if exists $parms{LoopBottom}; $trace = delete $parms{Trace} if exists $parms{Trace}; $timeout = delete $parms{Timeout} if exists $parms{Timeout}; die "unknown parameters for run_queue: ", join " ", keys %parms if keys %parms; } { warn "to go: ", My::TaskManager::total_task_count(),"\n" if $trace; warn "active tasks: ", join " ", active_tasks(),"\n" if $trace; ## reap kids while ((my $kid = waitpid(-1, WNOHANG)) > 0) { warn "$kid reaped\n" if $trace; delete $kids{$kid}; } ## verify live kids for my $kid (keys %kids) { next if kill 0, $kid; warn "*** $kid found missing ***\n"; # shouldn't happen normally delete $kids{$kid}; } ## launch kids if (keys %kids < $kid_max and my $task = My::TaskManager::next_inactive_task() and my $kid = create_kid()) { send_to_kid($kid, $task); } ## see if any ready results READY: for my $ready (IO::Select->new(map $_->[1], values %kids)->can_read(1)) { my ($kid) = grep $kids{$_}[1] == $ready, keys %kids; { last unless read($ready, my $length, 4) == 4; $length = unpack "L", $length; last unless read($ready, my $message, $length) == $length; $message = thaw($message) or die "Cannot thaw"; My::TaskManager::remove_task($message->[0]); $result_task->(@$message); if (my $task = My::TaskManager::next_inactive_task()) { send_to_kid($kid, $task); } else { # close it down $kids{$kid}[0]->close; } next READY; } ## something broken with this kid... kill 15, $kid; delete $kids{$kid}; # forget about it } ## timeout kids if (defined $timeout) { my $oldest = time - $timeout; for my $kid (keys %kids) { next unless defined $kids{$kid}[2]; next unless defined $kids{$kid}[3]; next if $kids{$kid}[3] > $oldest; if (my $task = $kids{$kid}[2]) { my $param_ref = My::TaskManager::task_parameters($task); My::TaskManager::remove_task($task); warn "timeout for $kid on $task\n" if $trace; $timeout_task->($task, @$param_ref); } kill 15, $kid; delete $kids{$kid}; } } $loop_bottom->(); redo if %kids or My::TaskManager::total_task_count(); } } sub create_kid { my $to_kid = IO::Pipe->new; my $from_kid = IO::Pipe->new; defined (my $kid = fork) or return; # if can't fork, try to make do unless ($kid) { # I'm the kid $to_kid->reader; $from_kid->writer; $from_kid->autoflush(1); $SIG{$_} = 'DEFAULT' for grep !/^--/, keys %SIG; # very important! do_kid($to_kid, $from_kid); exit 0; # should not be reached } $from_kid->reader; $to_kid->writer; $to_kid->autoflush(1); $kids{$kid} = [$to_kid, $from_kid]; $kid; } sub send_to_kid { my ($kid, $task) = @_; { ## if we get a SIGPIPE here, no biggy, we'll requeue request later local $SIG{PIPE} = 'IGNORE'; my $param_ref = My::TaskManager::task_parameters($task); my $message = freeze([$task, @$param_ref]); print { $kids{$kid}[0] } pack("L", length($message)), $message; $kids{$kid}[2] = $task; # show as active $kids{$kid}[3] = time; # for hard timeouts } } sub active_tasks { ## return count or list return grep defined($_), map { $kids{$_}[2] } keys %kids; } sub do_kid { my($input, $output) = @_; warn "kid launched\n" if $trace; { last unless read($input, my $length, 4) == 4; $length = unpack "L", $length; last unless read($input, my $message, $length) == $length; $message = thaw($message) or die "Cannot thaw"; my ($key, @values) = @$message; my @results = $kid_task->($key, @values); $message = freeze([$key, @results]); print $output pack("L", length($message)), $message; redo; } warn "kid ending\n" if $trace; exit 0; } } sub URI::mailto::host { ""; } # workaround bug in LWP