#!/usr/bin/perl -w
use strict;
$|++;    # autoflush STDOUT so the trace output appears as each page is handled

## Harvest the "profanity" paragraph of movie reviews on screenit.com by
## walking Google search results (100 hits per index page) and fetching
## each matching review from Google's cache.  Results are stored in a DBM
## file so that reviews already collected are skipped on later runs.
##
## NOTE(review): the copy of this script under review had been collapsed
## onto a few lines and appeared to have its HTML tag literals stripped.
## The <DL>, </DL>, <TITLE> and </TITLE> literals below were reconstructed
## from the surrounding comments and error messages -- confirm them against
## a real cached page before relying on this.

## config
my $DATA_DB = "/home/merlyn/Web/profanity_quiz";
## end config

use LWP::UserAgent;
use HTTP::Request::Common;
use URI;

dbmopen(my %DATA, $DATA_DB, 0644) or die "Cannot open db: $!";
## %DATA format: for each movie, keyed by partial URL,
## value is "$title\n$profanity_paragraph_with_newlines", as in:
## $DATA{"www.screenit.com/movies/1997/gone_fishin.html"} =
##   "GONE FISHIN'\n<DL>\nmany\nlines\n</DL>\n";

my $ua = LWP::UserAgent->new;

## The loop has no upper bound: it stops at the first index page that
## fails to load or that yields no cache links.
PAGE:
for (my $start = 0; ; $start += 100) {
    ## fetch each index page:
    my $uri = URI->new("http://www.google.com/search");
    $uri->query_form(
        'q'      => "site:screenit.com profanity",
        'num'    => 100,
        'start'  => $start,
        'filter' => 0,
    );
    my $response = $ua->simple_request(GET($uri));
    last PAGE unless $response->is_success;

    ## parse the index page looking for links to movie pages in cache:
    my @urls = $response->content =~ m{A HREF=/search\?q=cache:(.*?)\+}g;
    last PAGE unless @urls;

    ## fetch each cached movie page if it fits the profile:
    MOVIE:
    for my $url (@urls) {
        ## only paths of the form movies/YYYY/... are review pages
        unless ($url =~ m{movies/\d\d\d\d/}) {
            print "skipping $url\n";
            next MOVIE;
        }
        if (exists $DATA{$url}) {
            print "skipping $url because we have it\n";
            next MOVIE;
        }

        ## get cached movie page from cache (this replaces the whole
        ## query of $uri, which is rebuilt for the next index page):
        $uri->query_form('q' => "cache:$url");
        my $res = $ua->simple_request(GET($uri));
        print $uri, " ==>\n";
        unless ($res->is_success) {
            print "___ FAILURE ___\n", $res->as_string, "______\n";
            next MOVIE;
        }
        my $html = $res->content;

        ## look for profanity paragraph: a <DL> block whose first or
        ## second line ends with the word "profanity", up to the first
        ## </DL> line that is followed by a blank line.
        ## NOTE(review): closing tags are tolerated between "profanity"
        ## and the newline because a tag may have been stripped from
        ## this spot as well -- verify against a real page.
        my ($prof) = $html =~ m{
            \n
            (
                <DL> .*?
                (?: \n .*? )??
                profanity (?: </[^>\n]+> )* \n
                (?: .+ \n )*?
                </DL> \n
            )
            \n
        }ix;
        unless (defined $prof) {
            print "can't find profanity DL in\n", $html;
            next MOVIE;
        }

        ## look for title, dropping the optional
        ## "SCREEN IT! <word> REVIEW:" prefix:
        my ($title) = $html =~ m{
            <TITLE>
            (?: SCREEN \s+ IT! \s+ \S+ \s+ REVIEW: \s+ )?
            ( .+ )
            </TITLE>
        }ix;
        unless (defined $title) {
            print "can't find title in\n", $html;
            next MOVIE;
        }
        print "... $title\n";    # for tracing

        ## save data:
        $DATA{$url} = "$title\n$prof";
    }
}

print scalar(keys %DATA), " total movies for the quiz!\n";

## release the DBM file explicitly rather than relying on program exit
dbmclose(%DATA);