#!/home/merlyn/bin/perl -w
use strict;
$| = 1;

## Copyright (c) 1996,97,98,99 by Randal L. Schwartz
## This program is free software; you can redistribute it
## and/or modify it under the same terms as Perl itself.

use URI;
use LWP;
use Storable;

use constant DAYS => 24 * 60 * 60;      # for configs below

## begin configure

# Where the crawl state (%OLD, %NEW, @TODO) is persisted between runs.
my $DATABASE = "/home/merlyn/.slinkydata";

# List of initial starting points for the crawl.
# BUGFIX: these previously read "www.stonehenge.comX/" and
# "www.effectiveperl.comX/" -- the stray trailing "X" on each host made
# the seed URLs unreachable, and meant they could never match the
# /\.stonehenge\.com$/ and /\.effectiveperl\.com$/ host checks in
# PARSE below, so the configured sites would only ever be pinged, not
# parsed.
my @CHECK =
  qw(http://www.stonehenge.com/
     http://www.effectiveperl.com/);

## PARSE($uri): classify an absolute URI for the crawler.
##   return  2 to parse if HTML
##   return  1 to merely verify existence
##   return  0 to not even verify existence, but still xref
##   return -1 to ignore entirely
sub PARSE {
  my $url = shift;                      # URI object (absolute)

  # Only ftp/gopher/http URLs are worth even pinging.
  for ($url->scheme) {
    return 0 unless /^ftp$/ or /^gopher$/ or /^http$/;
  }
  for ($url->host) {
    if (/\.stonehenge\.com$/) {
      for ($url->path_query) {
        return -1 if /\/\?[DNS]=[AD]$/; # silly mod_index sort links
      }
      for ($url->path) {
        # CGI areas and column archives: xref only, don't fetch.
        return 0 if /^\/(cgi|fors)\// or /col\d\d|refindex/;
        # Picture JPEGs: ignore entirely.
        return -1 if /^\/merlyn\/Pictures\/.*\.jpg$/s;
      }
      return 2;                         # default stonehenge.com: parse
    }
    return 2 if /\.effectiveperl\.com$/;
    return 1;                           # ping the world
  }
}

## HACK_URL($uri): canonicalize a URI and apply site-specific
## rewrites: fold w3.stonehenge.com into www.stonehenge.com, unwrap
## /cgi/go/ redirector links (restarting canonicalization on the
## target), and strip trailing index.html.  Returns the canonical URI.
sub HACK_URL {
  my $url = shift;                      # URI object
  {
    $url = $url->canonical;
    if ($url->scheme eq "http") {
      $url->host("www.stonehenge.com")
        if $url->host eq "w3.stonehenge.com";
      if ($url->host eq "www.stonehenge.com") {
        # /cgi/go/TARGET is a redirector: chase TARGET from scratch.
        ($url = URI->new("$1")), redo
          if $url->path_query =~ /^\/cgi\/go\/(.*)/s;
        # DIR/index.html is the same resource as DIR/.
        $url->path("$1")
          if $url->path =~ /^(.*\/)index\.html$/s;
      }
    }
  }
  $url->canonical;
}

my $VERBOSE = 2;              # 0 = quiet, 1 = noise, 2 = lots of noise
my $RECHECK = 10;             # seconds between rechecking any URL
my $RECHECK_GOOD = 1 * DAYS;  # seconds between rechecking good URLs
my $FOLLOW_GHOST = 14 * DAYS; # seconds before tossing bad URLs links
my $REPORT = 3 * DAYS;        # seconds before bad enough to report
my $TIMEOUT = 15;             # seconds to timeout on fetch
my $FOLLOW_REDIRECT = 1;      # follow redirects?
## end configure (no user-serviceable parts below this line)

BEGIN {
  ## ParseLink: an HTML::Parser subclass that collects every <a href>
  ## and <img src> link seen, keyed by the line number(s) on which
  ## each appears.  The driver feeds content a chunk at a time and
  ## calls new_line() once per newline so line numbers stay accurate.
  package ParseLink;
  use HTML::Parser;
  use vars qw(@ISA);
  @ISA = qw(HTML::Parser);

  # Advance the current line counter by one.
  sub new_line {
    my $self = shift;
    $self->{Line}++;
  }

  ## $self->{Links} = {
  ##   "url" => { "line" => "count", "line" => "count" ... }, ...
  ## };

  # Tag-open callback, called by parse: note href/src links with the
  # 1-based line number on which they occur.
  sub start {
    my $self = shift;
    my ($tag, $attr) = @_;
    my $link;
    $link = $attr->{href} if $tag eq "a";
    $link = $attr->{src} if $tag eq "img";
    if (defined $link) {
      $self->{Links}{$link}{$self->{Line} + 1}++;
    }
  }

  # $instance->get_links(): return the accumulated link hashref.
  sub get_links {
    my $self = shift;
    $self->{Links};
  }
}                               # end of ParseLink

BEGIN {
  ## One shared user agent, closed over by fetch() below.
  my $AGENT = LWP::UserAgent->new;
  $AGENT->agent("slinky/1.02 " . $AGENT->agent);
  $AGENT->env_proxy;
  $AGENT->timeout($TIMEOUT);

  # fetch($request): perform the request, following redirects only
  # when $FOLLOW_REDIRECT is set (request() follows, simple_request()
  # does not).  Returns the HTTP::Response.
  sub fetch {
    if ($FOLLOW_REDIRECT) {
      $AGENT->request(shift);
    } else {
      $AGENT->simple_request(shift);
    }
  }
}

## the persistent data:
my (%OLD, %NEW, @TODO);
my $SAVE_WANTED = 0;

# Restore prior crawl state, if any, from the Storable database.
if (-r $DATABASE) {
  my $restore = retrieve $DATABASE
    or die "Cannot retrieve from $DATABASE\n";
  %OLD = %{$restore->[0]};
  %NEW = %{$restore->[1]};
  @TODO = @{$restore->[2]};
  warn "database restored\n" if $VERBOSE;
}

# Any of these signals requests a graceful checkpoint-and-exit; the
# flag is polled at safe points in the main loop.
for (qw(HUP INT QUIT ALRM TERM)) {
  $SIG{$_} = sub { $SAVE_WANTED = 1 };
}
# Optional first argument: run for that many seconds, then ALRM
# triggers the save-and-exit path above.
alarm(shift) if @ARGV;

## $NEW{"some url"} = {
##   From => {
##     { "url" => { "line" => 1, "line" => 1, ... } },
##     { "url" => { "line" => 1, "line" => 1, ... } },
##   },
##   To => [like From]
##   Base => "base",           ## if base != url
##   Status => "Whatever",
##   Checked => time,          ## when did we last look?
##   Good => time,             ## when was it good (if ever)
##   LastModified => time,     ## when it was good, when was it last modified?
## }

URL: {
  unless (@TODO) {
    ## prime the pump: roll %NEW into %OLD and re-seed from @CHECK.
    %OLD = %NEW;
    %NEW = ();
    for (0..$#CHECK) {
      my $url = HACK_URL(URI->new($CHECK[$_]));
      add_link("REQUESTED:", $_, $url);
    }
  }
  warn @TODO." to go...\n" if $VERBOSE;
  # Pick a random pending URL so the crawl spreads load around.
  my $url = splice(@TODO, rand @TODO, 1); # the lucky winner is...
$url = URI->new($url);

  # Carry timing state for this URL forward from the previous pass.
  for (qw(Checked Good LastModified)) {
    $NEW{$url}{$_} = $OLD{$url}{$_} || 0;
  }

  my $parse = PARSE($url);
  if ($parse >= 2) {            # fetch, and parse for links if HTML
    warn "Parsing $url\n" if $VERBOSE;
    my $links = $OLD{$url}{To} || {}; # fall back to last known links
    my $base;
    # Skip the fetch if checked within $RECHECK, or if recently good
    # and we hold a Last-Modified stamp.  NOTE(review): "and" binds
    # tighter than "or", so this is
    #   recent-check OR (recently-good AND has-LastModified).
    if (time < $NEW{$url}{Checked} + $RECHECK
        or time < $NEW{$url}{Good} + $RECHECK_GOOD
        and $NEW{$url}{LastModified} > 0) {
      warn ".. too early to recheck\n" if $VERBOSE;
      $NEW{$url}{Status} = $OLD{$url}{Status};
    } else {
      # Conditional GET: a 304 below means unchanged since last good.
      my $req = HTTP::Request->new(GET => $url);
      $req->if_modified_since($NEW{$url}{LastModified});
      my $res = fetch($req);
      # Signal arrived mid-fetch: requeue this URL so nothing is lost,
      # and let the continue block checkpoint and exit.
      push(@TODO, $url->as_string), next if $SAVE_WANTED;
      if ($res->is_success) {
        warn ".. successful fetch\n" if $VERBOSE;
        $base = $res->base;
        $NEW{$url}{Checked} = $NEW{$url}{Good} = time;
        $NEW{$url}{Base} = $base if $base ne $url;
        $NEW{$url}{LastModified} = $res->last_modified || $res->date;
        unless ($res->content_type =~ /text\/html/i) {
          warn ".. not HTML\n" if $VERBOSE;
          $NEW{$url}{Status} = "Verified (content = ".($res->content_type).")";
          next;
        }
        $NEW{$url}{Status} = "Verified (and parsed)";
        # Feed content to ParseLink one run-of-text or newline at a
        # time, ticking the line counter on each newline so recorded
        # links carry line numbers.
        my $p = ParseLink->new;
        for ($res->content =~ /(.+|\n)/g) {
          $p->parse($_);
          $p->new_line() if $_ eq "\n";
        }
        $p->parse(undef);       # signal the end of parse
        $links = $p->get_links; # key is relative url, value is href
      } elsif ($res->code == 304) {
        warn ".. not modified\n" if $VERBOSE;
        $NEW{$url}{Status} = $OLD{$url}{Status};
        $NEW{$url}{Checked} = $NEW{$url}{Good} = time;
      } else {
        warn ".. not verified\n" if $VERBOSE;
        $NEW{$url}{Status} = "NOT Verified (status = ".($res->code).")";
        $NEW{$url}{Checked} = time;
        # Bad for longer than $FOLLOW_GHOST: stop chasing its links.
        next if time > $NEW{$url}{Good} + $FOLLOW_GHOST;
        warn ".. but following ghost links\n" if $VERBOSE;
      }
    }
    # Cross-reference every link found (or remembered) on this page.
    for my $link (sort keys %$links) {
      my $abs = $link;
      if ($base) {              # we fetched a page
        $abs = URI->new_abs($link,$base);
        $abs->fragment(undef);  # blow away any fragment
        $abs = HACK_URL($abs)->as_string;
      }
      # NOTE(review): this string literal contains a hard newline as
      # found in the file -- it looks like a line-wrapping artifact;
      # presumably it once read "... $abs ($link)\n" on one line.
      # Preserved byte-for-byte here.
      warn "... 
$abs ($link)\n" if $VERBOSE > 1;
      for my $line (sort keys %{$links->{$link}}) {
        add_link($url, $line, $abs) if PARSE(URI->new($abs)) >= 0;
      }
    }
    next;
  } elsif ($parse >= 1) {       # merely verify that it exists
    warn "Pinging $url\n" if $VERBOSE;
    if (time < $NEW{$url}{Checked} + $RECHECK
        or time < $NEW{$url}{Good} + $RECHECK_GOOD) {
      warn ".. too early to recheck\n" if $VERBOSE;
      $NEW{$url}{Status} = $OLD{$url}{Status};
    } else {
      my $res;
      # Cheap HEAD first, then GET for servers that refuse HEAD.
      for my $method (qw(HEAD GET)) {
        my $req = new HTTP::Request($method,$url);
        $res = fetch($req);
        # Signal arrived: requeue and jump straight to the continue
        # block of the labeled URL loop to checkpoint.
        push(@TODO, $url->as_string), next URL if $SAVE_WANTED;
        if ($res->is_success) {
          $NEW{$url}{Status} = "Verified (contents not examined)";
          $NEW{$url}{Checked} = $NEW{$url}{Good} = time;
          $NEW{$url}{LastModified} = 0;
          next URL;
        }
      }
      # Both methods failed; $res holds the final (GET) response.
      $NEW{$url}{Status} = "NOT Verified (status = ".($res->code).")";
      $NEW{$url}{Checked} = time;
      next;
    }
  } else {                      # $parse < 0
    warn "Skipping $url\n" if $VERBOSE;
    $NEW{$url}{Status} = "Skipped";
    $NEW{$url}{Checked} = 0;    # we no longer check this
    next;
  }
} continue {
  # Checkpoint-and-exit path taken when a signal set $SAVE_WANTED.
  if ($SAVE_WANTED) {
    warn "dumping data to $DATABASE...\n" if $VERBOSE;
    store [\%OLD, \%NEW, \@TODO], $DATABASE;
    exit 0;
  }
  redo if @TODO;                # more work: loop back to the URL block
  # Queue drained: save state, then report URLs that have been bad
  # for longer than $REPORT.
  warn "dumping data to $DATABASE...\n" if $VERBOSE;
  store [\%OLD, \%NEW, \@TODO], $DATABASE;
  print "\nBEGIN REPORT at ".localtime()."\n\n";
  for my $url (sort keys %NEW) {
    next if $url =~ /^requested:/i; # skip the synthetic seed entries
    my $entry = $NEW{$url};     # href
    # Only report if last check succeeded-ago exceeds the threshold.
    next unless $entry->{Checked} > $entry->{Good} + $REPORT;
    my $status = $entry->{Status};
    my $base = $entry->{Base};
    print "$url";
    print " (base $base)" if defined $base;
    print ":\n status: $status\n";
    for (qw(Checked Good LastModified)) {
      if (my $stamp = $entry->{$_}) {
        print " $_ => ".localtime($stamp)."\n";
      }
    }
    dump_relative($url, "from", $entry->{From});
    dump_relative($url, "to", $entry->{To});
  }
  print "\nEND REPORT\n\n";
  ## redo;
  exit 0;
}

## subroutines

# add_link($from, $line, $to): record that $from links to $to at
# $line, in both the From and To cross-reference maps, and queue $to
# for probing if it has never been seen before.
sub add_link {
  my ($from,$line,$to) = @_;
  $NEW{$to}{From}{$from}{$line}++;
  $NEW{$from}{To}{$to}{$line}++;
  unless (defined $NEW{$to}{Status}) {
    push @TODO, $to;
    $NEW{$to}{Status} =
"[to be probed]";
  }
}

# dump_relative($url, $label, $urls): for each URL in %$urls, print it
# relative to $url (labelled "from"/"to") followed by the numerically
# sorted line numbers on which the link occurs.
sub dump_relative {
  my ($url,$label,$urls) = @_;
  for my $other_url (sort keys %$urls) {
    # rel() yields an empty string for identical URLs; show "." then.
    my $relative = URI->new($other_url)->rel($url) || ".";
    print " $label $relative at ";
    print join " ", sort { $a <=> $b } keys %{$urls->{$other_url}};
    print "\n";
  }
}

sub URI::mailto::authority { ""; } # workaround bug