#!/usr/bin/perl -w use strict; $|++; use constant POCO_HTTP => "ua"; use POE qw(Component::Client::HTTP); my $TOP = "http://directory.google.comm/Top/Computers/Programming/Languages/Perl/"; POE::Component::Client::HTTP->spawn(Alias => POCO_HTTP, Timeout => 30); POE::Component::My::Master->spawn(UA => POCO_HTTP, TODO => [$TOP]); $poe_kernel->run; exit 0; BEGIN { package POE::Component::My::Master; use POE::Session; # for constants sub spawn { my $class = shift; POE::Session->create (package_states => [$class => [qw(_start ready done)]], heap => {KIDMAX => 10, KIDS => 0, @_}); } sub _start { my $heap = $_[HEAP]; for (@{$heap->{TODO}}) { $heap->{DONE}{$_ = make_canonical($_)} = 1; } $_[KERNEL]->yield("ready", "initial"); } sub ready { ## warn "ready because $_[ARG0]\n"; my $heap = $_[HEAP]; my $kernel = $_[KERNEL]; return if $heap->{KIDS} >= $heap->{KIDMAX}; return unless my $url = shift @{$heap->{TODO}}; ## warn "doing: $url\n"; $heap->{KIDS}++; POE::Component::My::Checker->spawn (UA => $heap->{UA}, URL => $url, POSTBACK => $_[SESSION]->postback("done", $url), ); $kernel->yield("ready", "looping"); } sub done { my $heap = $_[HEAP]; my ($request,$response) = @_[ARG0,ARG1]; my ($url) = @$request; my @links = @{$response->[0]}; for (@links) { $_ = make_canonical($_); push @{$heap->{TODO}}, $_ unless $heap->{DONE}{$_}++; } $heap->{KIDS}--; $_[KERNEL]->yield("ready", "child done"); } sub make_canonical { # not a POE require URI; my $uri = URI->new(shift); $uri->fragment(undef); # toss fragment $uri->canonical->as_string; # return value } } # end POE::Component::My::Master BEGIN { package POE::Component::My::Checker; use POE::Session; sub spawn { my $class = shift; POE::Session->create (package_states => [$class => [qw(_start response)]], heap => {@_}); } sub _start { require HTTP::Request::Common; my $heap = $_[HEAP]; my $url = $heap->{URL}; my $request = HTTP::Request::Common::GET($url); $_[KERNEL]->post($heap->{UA}, 'request', 'response', $request); } sub response { my $url = $_[HEAP]{URL}; my ($request_packet, $response_packet) = @_[ARG0, ARG1]; my ($request, $request_tag) = @$request_packet; my ($response) = @$response_packet; my @links; if ($response->is_success) { if ($response->base =~ m{^\Q$TOP}) { if ($response->content_type eq "text/html") { require HTML::SimpleLinkExtor; my $e = HTML::SimpleLinkExtor->new($response->base); $e->parse($response->content); @links = grep m{^http:}, $e->links; ## warn "parsed: $url\n"; } else { ## warn "not HTML: $url\n"; } } else { warn "valid: $url\n"; } } else { warn "BAD (", $response->code, "): $url\n"; } $_[HEAP]{POSTBACK}(\@links); } } # end POE::Component::My::Checker