#!/home/merlyn/bin/perl -Tw use strict; use LWP::UserAgent; use HTML::Parser; use URI::URL; ## begin configure my @CHECK = # list of initial starting points qw(http://www.stonehenge.com/index.html); sub PARSE { # verify existence, parse for further URLs ## $_[0] is the absolute URL $_[0] =~ m!^http://www\.stonehenge\.com/! and not $_[0] =~ /refindex|col\d\d\.html|fors/; } sub PING { # verify existence, but don't parse ## $_[0] is the absolute URL $_[0] =~ m!^(http|ftp|gopher)://! and not $_[0] =~ m!perl\.com/CPAN/!; # presume all CPAN refs are good } ## end configure (no user-servicable parts below this line) BEGIN { package ParseLink; @ParseLink::ISA = qw(HTML::Parser); sub set_line { # $instance->set_line(nnn) my $self = shift; $self->{Line} = shift; } ## $self->{Links} = { ## "url" => { "line" => "count", "line" => "count" ... }, ... ## }; sub start { # called by parse my $self = shift; my ($tag, $attr) = @_; my $link; $link = $attr->{href} if $tag eq "a"; $link = $attr->{src} if $tag eq "img"; if (defined $link) { $self->{Links}{$link}{$self->{Line}}++; } } sub get_links { # $instance->get_links() my $self = shift; $self->{Links}; } } # end of ParseLink my $ua = new LWP::UserAgent; $ua->agent("hverify/2.0"); $ua->env_proxy; $| = 1; ## global database my %URL_LIST = (); ## format: ## $URL_LIST{"some url"} = { ## Source => { "where" => "count", "where" => "count", ... }, ## Dest => { "where" => "count", "where" => "count", ... }, ## Base => "base", ## if base != url ## Status => "Whatever", ## undef if not checked yet ## } ## prime the pump for (@CHECK) { $URL_LIST{$_}{Source}{"[requested]"}++; } ## now walk it { my @this_time = grep !defined $URL_LIST{$_}{Status}, keys %URL_LIST; last unless @this_time; URL: for my $url (@this_time) { if (PARSE $url) { ## print "Fetching $url\n"; my $request = new HTTP::Request('GET', $url); my $response = $ua->request($request); # fetch! unless ($response->is_success) { $URL_LIST{$url}{Status} = "NOT Verified (status = ".($response->code).")"; next URL; } unless ($response->content_type =~ /text\/html/i) { $URL_LIST{$url}{Status} = "Verified (content not HTML)"; next URL; } my $base = $response->base; $URL_LIST{$url}{Base} = $base if $base ne $url; my $p = ParseLink->new; { my @content = $response->content =~ /(.*\n?)/g; my $line = 1; { last unless @content; $p->set_line($line); # tell it the line number $p->parse(shift @content); # and parse it $line++; redo; } } $p->parse(undef); # signal the end of parse my $links = $p->get_links; # key is relative url, value is href for my $link (sort keys %$links) { my $abs = url($link, $base)->abs; ## requested url is used for forward relative xref links, ## but actual url after redirection is used for backwards links. my ($forward_rel, $backward_rel) = do { local ($^W) = 0; # workaround for buglet map { $_ || "." } url($abs, $url)->rel, url($base, $abs)->rel; }; my $where = $links->{$link}; # key is line number, val is count for my $line (sort keys %$where) { $URL_LIST{$abs}{Source}{"$backward_rel at line $line"} += $where->{$line}; $URL_LIST{$url}{Dest}{"$forward_rel at line $line"} += $where->{$line}; } } $URL_LIST{$url}{Status} = "Verified (and parsed)"; next URL; } if (PING $url) { ## print "Verifying $url\n"; my $response; for my $method (qw(HEAD GET)) { my $request = new HTTP::Request($method,$url); $response = $ua->request($request); # fetch! if ($response->is_success) { $URL_LIST{$url}{Status} = "Verified (contents not examined)"; next URL; } } $URL_LIST{$url}{Status} = "NOT Verified (status = ".($response->code).")"; next URL; } $URL_LIST{$url}{Status} = "Skipped"; next URL; } redo; } for my $url (sort keys %URL_LIST) { my $entry = $URL_LIST{$url}; # href my $status = $entry->{Status}; my $base = $entry->{Base}; print "$url"; print " (base $base)" if defined $base; print ":\n status: $status\n"; my $sources = $entry->{Source}; for my $source (sort keys %$sources) { print " from $source\n"; } my $dests = $entry->{Dest}; for my $dest (sort keys %$dests) { print " to $dest\n"; } }