#!/home/merlyn/bin/perl -Tw use strict; use URI::URL; use LWP::Parallel::UserAgent qw(:CALLBACK); ## begin configure my @CHECK = # list of initial starting points qw(http://XXwww.stonehenge.com/index.html); sub PARSE { # verify existence, parse for further URLs ## $_[0] is the absolute URL $_[0] =~ m!^http://XXwww\.( 5sigma| perltraining| effectiveperl| stonehenge)\.com(/|$)!x and not $_[0] =~ /stonehenge.*(col\d\d\.html|fors|refindex)/; } sub PING { # verify existence, but don't parse ## $_[0] is the absolute URL $_[0] =~ m!^(http|ftp)://!; } sub HACK_URL { local $_ = shift; s!^http://w3\.stonehenge\.com/!http://www.stonehenge.com/!; s!^(http://www\.stonehenge\.com/(.*/)?)index\.html$!$1!; s!^http://www\.stonehenge\.com/cgi/go/!!; $_; } my $VERBOSE = 1; # be (very) noisy my $MAX_CONTENT = 100_000; # maximum content parsed in a URL my $MAX_HOSTS = 5; # simultaneous host count my $MAX_REQ = 3; # simultaneous request count ## end configure (no user-servicable parts below this line) BEGIN { package ParseLink; use HTML::Parser; use vars qw(@ISA); @ISA = qw(HTML::Parser); sub new_line { my $self = shift; $self->{Line}++; } ## $self->{Links} = { ## "url" => { "line" => "count", "line" => "count" ... }, ... ## }; sub start { # called by parse my $self = shift; my ($tag, $attr) = @_; my $link; $link = $attr->{href} if $tag eq "a"; $link = $attr->{src} if $tag eq "img"; if (defined $link) { $self->{Links}{$link}{$self->{Line} + 1}++; } } sub get_links { # $instance->get_links() my $self = shift; $self->{Links}; } } # end of ParseLink my $AGENT = new LWP::Parallel::UserAgent; $AGENT->agent("pverify/1.2"); $AGENT->env_proxy; $AGENT->redirect(0); $AGENT->max_hosts($MAX_HOSTS); $AGENT->max_req($MAX_REQ); $| = 1; ## global database my %URL_LIST = (); ## format: ## $URL_LIST{"some url"} = { ## Source => { "where" => "count", "where" => "count", ... }, ## Dest => { "where" => "count", "where" => "count", ... }, ## Base => "base", ## if base != url ## Status => "Whatever", ## undef if not checked yet ## } ## prime the pump for (@CHECK) { my $url = add_url($_); $URL_LIST{$url}{Source}{"[requested]"}++; } my $ENTRIES = $AGENT->wait(15); print "-----\n" if $VERBOSE; for my $response (map {$ENTRIES->{$_}->response} keys %$ENTRIES) { my $url = $response->request->url; next if ($URL_LIST{$url}{Status} || "") =~ /^[^\[]/; # we got a good one print "patching up bad status for $url: ", $response->code, "\n" if $VERBOSE; $URL_LIST{$url}{Status} = "NOT Verified (status = ".($response->code).")"; } print "-----\n" if $VERBOSE; for my $url (sort keys %URL_LIST) { my $entry = $URL_LIST{$url}; # href my $status = $entry->{Status}; my $base = $entry->{Base}; print "$url"; print " (base $base)" if defined $base; print ":\n status: $status\n"; my $sources = $entry->{Source}; for my $source (sort keys %$sources) { print " from $source\n"; } my $dests = $entry->{Dest}; for my $dest (sort keys %$dests) { print " to $dest\n"; } } ## subroutines sub add_url { my $url = shift; $url = url(HACK_URL $url)->abs->as_string; return $url if defined $URL_LIST{$url}{Status}; if (PARSE $url) { print "Fetching $url\n" if $VERBOSE; $URL_LIST{$url}{Status} = "[PARSE]"; my $request = new HTTP::Request('GET', $url); $AGENT->register($request,\&callback_for_parse); } elsif (PING $url) { print "Pinging $url\n" if $VERBOSE; $URL_LIST{$url}{Status} = "[PING]"; my $request = new HTTP::Request('GET', $url); $AGENT->register($request,\&callback_for_ping); } else { $URL_LIST{$url}{Status} = "Skipped"; } $url; } sub callback_for_parse { my ($content, $response, $protocol, $entry) = @_; print "PARSE: Handling answer from '",$response->request->url,": ", length($content), " bytes, Code ", $response->code, ", ", $response->message,"\n" if $VERBOSE; if (length $content) { $response->add_content($content); if (length($response->content) < $MAX_CONTENT and $response->content_type =~ /text\/html/i) { return length $content; # go get some more } } parse_content_for_response($response); $response->content(""); # discard it (free up memory) return C_ENDCON; # no more data from here } sub callback_for_ping { my ($content, $response, $protocol, $entry) = @_; print "PING: Handling answer from '",$response->request->url,": ", length($content), " bytes, Code ", $response->code, ", ", $response->message,"\n" if $VERBOSE; my $url = $response->request->url; if ($response->is_success) { $URL_LIST{$url}{Status} = "Verified (contents not examined)"; } else { $URL_LIST{$url}{Status} = "NOT Verified (status = ".($response->code).")"; } return C_ENDCON; # ping ok, end connection } sub parse_content_for_response { my $response = shift; my $url = $response->request->url; unless ($response->is_success) { $URL_LIST{$url}{Status} = "NOT Verified (status = ".($response->code).")"; return; } unless ($response->content_type =~ /text\/html/i) { $URL_LIST{$url}{Status} = "Verified (content not HTML)"; return; } my $base = $response->base; $URL_LIST{$url}{Base} = $base if $base ne $url; my $p = ParseLink->new; for ($response->content =~ /(.+|\n)/g) { $p->parse($_); $p->new_line() if $_ eq "\n"; } $p->parse(undef); # signal the end of parse my $links = $p->get_links; # key is relative url, value is href for my $link (sort keys %$links) { my $abs = url($link,$base); $abs->frag(undef); # blow away any frag $abs = add_url($abs->abs->as_string); print "... $abs\n" if $VERBOSE; ## requested url is used for forward relative xref links, ## but actual url after redirection is used for backwards links. my ($forward_rel, $backward_rel) = do { map { $_ || "." } url($abs, $url)->rel, url($base, $abs)->rel; }; my $where = $links->{$link}; # key is line number, val is count for my $line (sort keys %$where) { $URL_LIST{$abs}{Source}{"$backward_rel at line $line"} += $where->{$line}; $URL_LIST{$url}{Dest}{"$forward_rel at line $line"} += $where->{$line}; } } $URL_LIST{$url}{Status} = "Verified (and parsed)"; }