#!/usr/bin/perl use lib "/home/merlyn/CPAN/lib"; use LWP::UserAgent; use HTML::Parser; use URI::URL; ## begin configure @CHECK = # list of initial starting points qw(http://www.teleport.com/~merlyn/); sub PARSE { # verify existance, parse for further URLs ## $_[0] is the absolute URL $_[0] =~ m!^http://www\.(teleport|stonehenge)\.com/~merlyn! and not $_[0] =~ /refindex/; } sub PING { # verify existence, but don't parse ## $_[0] is the absolute URL $_[0] =~ m!^(http|ftp|gopher):!; } ## end configure { package ParseLink; @ISA = qw(HTML::Parser); sub start { # called by parse my $this = shift; my ($tag, $attr) = @_; if ($tag eq "a") { $this->{links}{$attr->{href}}++; } elsif ($tag eq "img") { $this->{links}{$attr->{src}}++; } } sub get_links { my $this = shift; sort keys %{$this->{links}}; } } # end of ParseLink $ua = new LWP::UserAgent; $ua->agent("hverify/1.0"); $ua->env_proxy; $| = 1; MAINLOOP: while ($thisurl = shift @CHECK) { $thisurl =~ s/%7e/~/ig; # ugh :-) next if $did{$thisurl}++; if (PARSE $thisurl) { warn "fetching $thisurl\n"; $request = new HTTP::Request('GET',$thisurl); $response = $ua->request($request); # fetch! unless ($response->is_success) { warn "Cannot fetch $thisurl (status ", $response->code, " ", $response->message,")\n"; next MAINLOOP; } next MAINLOOP unless $response->content_type =~ /text\/html/i; $base = $response->base; my $p = ParseLink->new; $p->parse($response->content); $p->parse(undef); for $link ($p->get_links) { $abs = url($link, $base)->abs; warn "... $link => $abs\n"; push(@CHECK, $abs); } next MAINLOOP; } if (PING $thisurl) { warn "verifying $thisurl\n"; for $method (qw(HEAD GET)) { $request = new HTTP::Request($method,$thisurl); $response = $ua->request($request); # fetch! next MAINLOOP if $response->is_success; # ok } warn "Cannot fetch $thisurl (status ", $response->code, " ", $response->message,")\n"; next MAINLOOP; } warn "[skipping $thisurl]\n"; }