#!/usr/bin/perl
use strict;
use warnings;    # lexical warnings for this file, replacing the global -w switch

## turn on autoflush for STDOUT so the progress lines printed during the
## crawl show up immediately
$| = 1;

## config
## base path handed to dbmopen for the persistent quiz data
my $DATA_DB = "/home/merlyn/Web/profanity_quiz";
## end config

use LWP::UserAgent;
use HTTP::Request::Common;
use URI;

## bind %DATA to the DBM file at $DATA_DB (mode 0644 is used if the file
## has to be created); stores into %DATA are written to that file.
## The path is included in the error so a failure is easy to diagnose.
dbmopen my %DATA, $DATA_DB, 0644
    or die "Cannot open db $DATA_DB: $!";
## %DATA format: for each movie, keyed by partial URL,
## value is "$title\n$profanity_paragraph_with_newlines", as in:
##   $DATA{"www.screenit.com/movies/1997/gone_fishin.html"} =
##     "GONE FISHIN'\nmany\nlines\n\n";
## Main crawl: page through Google's results for screenit.com "profanity"
## pages 100 hits at a time, fetch Google's cached copy of every movie page
## not already in %DATA, and store "$title\n$profanity_paragraph" under the
## page's partial URL.  Progress and failures are printed to STDOUT.
my $ua = LWP::UserAgent->new;

INDEX_PAGE:
for (my $start = 0; ; $start += 100) {
    ## fetch each index page:
    my $uri = URI->new("http://www.google.com/search");
    $uri->query_form(
        'q'      => "site:screenit.com profanity",
        'num'    => 100,
        'start'  => $start,
        'filter' => 0,
    );
    my $response = $ua->simple_request(GET $uri);
    unless ($response->is_success) {
        ## say why the crawl stops instead of ending silently
        print "index fetch failed at start=$start: ",
            $response->status_line, "\n";
        last INDEX_PAGE;
    }

    ## parse the index page looking for links to movie pages in cache:
    my @urls = $response->content =~ m{A HREF=/search\?q=cache:(.*?)\+}g;

    ## no cache links on this index page: nothing left to process
    last INDEX_PAGE unless @urls;

    ## fetch each cached movie page if it fits the profile:
    MOVIE:
    for my $url (@urls) {
        ## only URLs with a movies/<4-digit year>/ component are of interest
        unless ($url =~ m{movies/\d\d\d\d/}) {
            print "skipping $url\n";
            next MOVIE;
        }

        ## an entry already stored in the database is not fetched again
        if ($DATA{$url}) {
            print "skipping $url because we have it\n";
            next MOVIE;
        }

        ## get cached movie page from cache
        ## (reuses $uri; only its query string is replaced):
        $uri->query_form('q' => "cache:$url");
        my $res = $ua->simple_request(GET $uri);
        print $uri, " ==>\n";
        unless ($res->is_success) {
            print "___ FAILURE ___\n", $res->as_string, "______\n";
            next MOVIE;
        }

        ## read the body once; both patterns below search the same text
        my $html = $res->content;

        ## look for profanity paragraph:
        ## NOTE(review): the failure message below mentions a "DL", but the
        ## pattern contains no such markup -- tags may have been lost from
        ## this pattern; confirm against the actual page layout.
        unless ($html =~ m{
            \n
            (
            .*?
            (?:\n.*?)??
            profanity\n
            (?:.+\n)*?
            \n
            )
            \n
        }ix) {
            print "can't find profanity DL in\n", $html;
            next MOVIE;
        }
        my $prof = $1;

        ## look for title:
        ## The match is anchored on the page's <title> element.  Without an
        ## anchor the pattern matched the first non-empty line of any page,
        ## so the captured "title" was wrong and this check could not fail.
        ## The optional "SCREEN IT! <word> REVIEW:" prefix is dropped.
        unless ($html =~ m{
            <title> \s*
            (?:
                SCREEN \s+ IT! \s+ \S+ \s+ REVIEW: \s+
            )?
            (
                .+?
            )
            \s* </title>
        }ix) {
            print "can't find title in\n", $html;
            next MOVIE;
        }
        my $title = $1;
        print "... $title\n";    # for tracing

        ## save data:
        $DATA{$url} = "$title\n$prof";
    }
}

## report how many movies the database now holds
print scalar(keys %DATA), " total movies for the quiz!\n";