## (stray excerpt of the title pattern used further down: (?: SCREEN \s+ IT! \s+ \S+ \s+ REVIEW: \s+ )? ( .+ ) )

#!/usr/bin/perl -w
use strict;
$| = 1;  # flush the selected output handle after every print, so tracing shows up at once

## config
## path handed to dbmopen for the quiz database
my $DATA_DB = "/home/merlyn/Web/profanity_quiz";
## end config

use LWP::UserAgent;
use HTTP::Request::Common;
use URI;

## bind %DATA to the DBM file named by $DATA_DB (mask 0644); fatal if that fails
dbmopen(my %DATA, $DATA_DB, 0644) or die "Cannot open db: $!";

## Layout of %DATA: one entry per movie.
##   key   - the partial URL of the movie page
##   value - "$title\n$profanity_paragraph_with_newlines"
## For example:
##   $DATA{"www.screenit.com/movies/1997/gone_fishin.html"} =
##     "GONE FISHIN'\n<DL>many\nlines\n</DL>\n";

## one user agent shared by every request the script makes
my $ua = LWP::UserAgent->new();

## Walk Google's result pages for screenit.com "profanity" hits, 100 results
## per page, and store every cached movie page not already present in %DATA.
## The walk ends when an index page cannot be fetched (reported on STDOUT)
## or when an index page yields no cache links.
for (my $start = 0; ; $start += 100) {
  ## fetch each index page:
  my $uri = URI->new("http://www.google.com/search");
  $uri->query_form('q' => "site:screenit.com profanity",
                   'num' => 100, 'start' => $start, 'filter' => 0);
  my $response = $ua->simple_request(GET $uri);
  unless ($response->is_success) {
    ## say why the walk is ending, so an HTTP failure is not mistaken
    ## for having simply run out of results
    print "stopping: index page at start=$start failed: ",
      $response->status_line, "\n";
    last;
  }

  ## parse the index page looking for links to movie pages in cache:
  ## (each capture is the text between "cache:" and the following "+")
  my @urls = $response->content =~ m{A HREF=/search\?q=cache:(.*?)\+}g;
  last unless @urls;

  ## fetch each cached movie page if it fits the profile:
  for my $url (@urls) {
    ## only URLs containing movies/<four digits>/ are movie pages
    unless ($url =~ m{movies/\d\d\d\d/}) {
      print "skipping $url\n";
      next;
    }
    ## already harvested on an earlier pass (stored values are never empty)
    if ($DATA{$url}) {
      print "skipping $url because we have it\n";
      next;
    }

    ## get cached movie page from cache:
    ## (work on a copy so the index-page query in $uri is left untouched;
    ## query_form replaces the whole query with just q=cache:...)
    my $cache_uri = $uri->clone;
    $cache_uri->query_form('q' => "cache:$url");
    my $res = $ua->simple_request(GET $cache_uri);
    print $cache_uri, " ==>\n";
    unless ($res->is_success) {
      print "___ FAILURE ___\n", $res->as_string, "______\n";
      next;
    }

    ## read the body once; it is matched twice and echoed on failure
    my $page = $res->content;

    ## look for profanity paragraph:
    ## a <dl> ... </dl> chunk that starts right after a newline, has
    ## "profanity</a>" ending the <dl> line or the line after it, runs
    ## through non-empty lines up to "</dl>", and is followed by an
    ## empty line.  Matching is case-insensitive.
    unless ($page =~ m{
                       \n
                       (
                        <dl>
                        .*?
                        (?:\n.*?)??
                        profanity</a>\n
                        (?:.+\n)*?
                        </dl>\n
                       )
                       \n
                      }ix) {
      print "can't find profanity DL in\n", $page;
      next;
    }
    my $prof = $1;

    ## look for title:
    ## the <title> text on a single line, minus an optional leading
    ## "SCREEN IT! <word> REVIEW: " prefix
    unless ($page =~ m{
                       <title>
                       (?:
                        SCREEN \s+ IT! \s+ \S+ \s+ REVIEW: \s+
                       )?
                       (
                        .+
                       )
                       </title>
                      }ix) {
      print "can't find title in\n", $page;
      next;
    }
    my $title = $1;
    print "... $title\n";       # for tracing

    ## save data:
    $DATA{$url} = "$title\n$prof";
  }
}

## report how many entries %DATA holds at the end of the run
my $movie_count = keys %DATA;
print $movie_count, " total movies for the quiz!\n";