#!/home/merlyn/bin/perl -w
use strict;
$| = 1;

## Copyright (c) 1996,97,98,99 by Randal L. Schwartz
## This program is free software; you can redistribute it
## and/or modify it under the same terms as Perl itself.

use URI;
use LWP;
use Storable;

use constant DAYS => 24 * 60 * 60;      # for configs below

## begin configure

# Where the crawl state (%OLD, %NEW, @TODO) is persisted between runs.
my $DATABASE = "/home/merlyn/.slinkydata";

# List of initial starting points for the crawl.
# BUGFIX: these previously read "www.stonehenge.comX/" and
# "www.effectiveperl.comX/" -- the stray trailing "X" on each host made
# the seed URLs unreachable, and meant they could never match the
# /\.stonehenge\.com$/ and /\.effectiveperl\.com$/ host checks in
# PARSE below, so the configured sites would only ever be pinged, not
# parsed.
my @CHECK =
  qw(http://www.stonehenge.com/
     http://www.effectiveperl.com/);

## PARSE($uri): classify an absolute URI for the crawler.
##   return  2 to parse if HTML
##   return  1 to merely verify existence
##   return  0 to not even verify existence, but still xref
##   return -1 to ignore entirely
sub PARSE {
  my $url = shift;                      # URI object (absolute)

  # Only ftp/gopher/http URLs are worth even pinging.
  for ($url->scheme) {
    return 0 unless /^ftp$/ or /^gopher$/ or /^http$/;
  }
  for ($url->host) {
    if (/\.stonehenge\.com$/) {
      for ($url->path_query) {
        return -1 if /\/\?[DNS]=[AD]$/; # silly mod_index sort links
      }
      for ($url->path) {
        # CGI areas and column archives: xref only, don't fetch.
        return 0 if /^\/(cgi|fors)\// or /col\d\d|refindex/;
        # Picture JPEGs: ignore entirely.
        return -1 if /^\/merlyn\/Pictures\/.*\.jpg$/s;
      }
      return 2;                         # default stonehenge.com: parse
    }
    return 2 if /\.effectiveperl\.com$/;
    return 1;                           # ping the world
  }
}

## HACK_URL($uri): canonicalize a URI and apply site-specific
## rewrites: fold w3.stonehenge.com into www.stonehenge.com, unwrap
## /cgi/go/ redirector links (restarting canonicalization on the
## target), and strip trailing index.html.  Returns the canonical URI.
sub HACK_URL {
  my $url = shift;                      # URI object
  {
    $url = $url->canonical;
    if ($url->scheme eq "http") {
      $url->host("www.stonehenge.com")
        if $url->host eq "w3.stonehenge.com";
      if ($url->host eq "www.stonehenge.com") {
        # /cgi/go/TARGET is a redirector: chase TARGET from scratch.
        ($url = URI->new("$1")), redo
          if $url->path_query =~ /^\/cgi\/go\/(.*)/s;
        # DIR/index.html is the same resource as DIR/.
        $url->path("$1")
          if $url->path =~ /^(.*\/)index\.html$/s;
      }
    }
  }
  $url->canonical;
}

my $VERBOSE = 2;              # 0 = quiet, 1 = noise, 2 = lots of noise
my $RECHECK = 10;             # seconds between rechecking any URL
my $RECHECK_GOOD = 1 * DAYS;  # seconds between rechecking good URLs
my $FOLLOW_GHOST = 14 * DAYS; # seconds before tossing bad URLs links
my $REPORT = 3 * DAYS;        # seconds before bad enough to report
my $TIMEOUT = 15;             # seconds to timeout on fetch
my $FOLLOW_REDIRECT = 1;      # follow redirects?
## end configure (no user-serviceable parts below this line)

BEGIN {
  ## ParseLink: an HTML::Parser subclass that collects every <a href>
  ## and <img src> link seen, keyed by the line number(s) on which
  ## each appears.  The driver feeds content a chunk at a time and
  ## calls new_line() once per newline so line numbers stay accurate.
  package ParseLink;
  use HTML::Parser;
  use vars qw(@ISA);
  @ISA = qw(HTML::Parser);

  # Advance the current line counter by one.
  sub new_line {
    my $self = shift;
    $self->{Line}++;
  }

  ## $self->{Links} = {
  ##   "url" => { "line" => "count", "line" => "count" ... }, ...
  ## };

  # Tag-open callback, called by parse: note href/src links with the
  # 1-based line number on which they occur.
  sub start {
    my $self = shift;
    my ($tag, $attr) = @_;
    my $link;
    $link = $attr->{href} if $tag eq "a";
    $link = $attr->{src} if $tag eq "img";
    if (defined $link) {
      $self->{Links}{$link}{$self->{Line} + 1}++;
    }
  }

  # $instance->get_links(): return the accumulated link hashref.
  sub get_links {
    my $self = shift;
    $self->{Links};
  }
}                               # end of ParseLink

BEGIN {
  ## One shared user agent, closed over by fetch() below.
  my $AGENT = LWP::UserAgent->new;
  $AGENT->agent("slinky/1.02 " . $AGENT->agent);
  $AGENT->env_proxy;
  $AGENT->timeout($TIMEOUT);

  # fetch($request): perform the request, following redirects only
  # when $FOLLOW_REDIRECT is set (request() follows, simple_request()
  # does not).  Returns the HTTP::Response.
  sub fetch {
    if ($FOLLOW_REDIRECT) {
      $AGENT->request(shift);
    } else {
      $AGENT->simple_request(shift);
    }
  }
}

## the persistent data:
my (%OLD, %NEW, @TODO);
my $SAVE_WANTED = 0;

# Restore prior crawl state, if any, from the Storable database.
if (-r $DATABASE) {
  my $restore = retrieve $DATABASE
    or die "Cannot retrieve from $DATABASE\n";
  %OLD = %{$restore->[0]};
  %NEW = %{$restore->[1]};
  @TODO = @{$restore->[2]};
  warn "database restored\n" if $VERBOSE;
}

# Any of these signals requests a graceful checkpoint-and-exit; the
# flag is polled at safe points in the main loop.
for (qw(HUP INT QUIT ALRM TERM)) {
  $SIG{$_} = sub { $SAVE_WANTED = 1 };
}
# Optional first argument: run for that many seconds, then ALRM
# triggers the save-and-exit path above.
alarm(shift) if @ARGV;

## $NEW{"some url"} = {
##   From => {
##     { "url" => { "line" => 1, "line" => 1, ... } },
##     { "url" => { "line" => 1, "line" => 1, ... } },
##   },
##   To => [like From]
##   Base => "base",           ## if base != url
##   Status => "Whatever",
##   Checked => time,          ## when did we last look?
##   Good => time,             ## when was it good (if ever)
##   LastModified => time,     ## when it was good, when was it last modified?
## }

URL: {
  unless (@TODO) {
    ## prime the pump: roll %NEW into %OLD and re-seed from @CHECK.
    %OLD = %NEW;
    %NEW = ();
    for (0..$#CHECK) {
      my $url = HACK_URL(URI->new($CHECK[$_]));
      add_link("REQUESTED:", $_, $url);
    }
  }
  warn @TODO." to go...\n" if $VERBOSE;
  # Pick a random pending URL so the crawl spreads load around.
  my $url = splice(@TODO, rand @TODO, 1); # the lucky winner is...
$url = URI->new($url);

  # Carry timing state for this URL forward from the previous pass.
  for (qw(Checked Good LastModified)) {
    $NEW{$url}{$_} = $OLD{$url}{$_} || 0;
  }

  my $parse = PARSE($url);
  if ($parse >= 2) {            # fetch, and parse for links if HTML
    warn "Parsing $url\n" if $VERBOSE;
    my $links = $OLD{$url}{To} || {}; # fall back to last known links
    my $base;
    # Skip the fetch if checked within $RECHECK, or if recently good
    # and we hold a Last-Modified stamp.  NOTE(review): "and" binds
    # tighter than "or", so this is
    #   recent-check OR (recently-good AND has-LastModified).
    if (time < $NEW{$url}{Checked} + $RECHECK
        or time < $NEW{$url}{Good} + $RECHECK_GOOD
        and $NEW{$url}{LastModified} > 0) {
      warn ".. too early to recheck\n" if $VERBOSE;
      $NEW{$url}{Status} = $OLD{$url}{Status};
    } else {
      # Conditional GET: a 304 below means unchanged since last good.
      my $req = HTTP::Request->new(GET => $url);
      $req->if_modified_since($NEW{$url}{LastModified});
      my $res = fetch($req);
      # Signal arrived mid-fetch: requeue this URL so nothing is lost,
      # and let the continue block checkpoint and exit.
      push(@TODO, $url->as_string), next if $SAVE_WANTED;
      if ($res->is_success) {
        warn ".. successful fetch\n" if $VERBOSE;
        $base = $res->base;
        $NEW{$url}{Checked} = $NEW{$url}{Good} = time;
        $NEW{$url}{Base} = $base if $base ne $url;
        $NEW{$url}{LastModified} = $res->last_modified || $res->date;
        unless ($res->content_type =~ /text\/html/i) {
          warn ".. not HTML\n" if $VERBOSE;
          $NEW{$url}{Status} = "Verified (content = ".($res->content_type).")";
          next;
        }
        $NEW{$url}{Status} = "Verified (and parsed)";
        # Feed content to ParseLink one run-of-text or newline at a
        # time, ticking the line counter on each newline so recorded
        # links carry line numbers.
        my $p = ParseLink->new;
        for ($res->content =~ /(.+|\n)/g) {
          $p->parse($_);
          $p->new_line() if $_ eq "\n";
        }
        $p->parse(undef);       # signal the end of parse
        $links = $p->get_links; # key is relative url, value is href
      } elsif ($res->code == 304) {
        warn ".. not modified\n" if $VERBOSE;
        $NEW{$url}{Status} = $OLD{$url}{Status};
        $NEW{$url}{Checked} = $NEW{$url}{Good} = time;
      } else {
        warn ".. not verified\n" if $VERBOSE;
        $NEW{$url}{Status} = "NOT Verified (status = ".($res->code).")";
        $NEW{$url}{Checked} = time;
        # Bad for longer than $FOLLOW_GHOST: stop chasing its links.
        next if time > $NEW{$url}{Good} + $FOLLOW_GHOST;
        warn ".. but following ghost links\n" if $VERBOSE;
      }
    }
    # Cross-reference every link found (or remembered) on this page.
    for my $link (sort keys %$links) {
      my $abs = $link;
      if ($base) {              # we fetched a page
        $abs = URI->new_abs($link,$base);
        $abs->fragment(undef);  # blow away any fragment
        $abs = HACK_URL($abs)->as_string;
      }
      # NOTE(review): this string literal contains a hard newline as
      # found in the file -- it looks like a line-wrapping artifact;
      # presumably it once read "... $abs ($link)\n" on one line.
      # Preserved byte-for-byte here.
      warn "... 
$abs ($link)\n" if $VERBOSE > 1;
      for my $line (sort keys %{$links->{$link}}) {
        add_link($url, $line, $abs) if PARSE(URI->new($abs)) >= 0;
      }
    }
    next;
  } elsif ($parse >= 1) {       # merely verify that it exists
    warn "Pinging $url\n" if $VERBOSE;
    if (time < $NEW{$url}{Checked} + $RECHECK
        or time < $NEW{$url}{Good} + $RECHECK_GOOD) {
      warn ".. too early to recheck\n" if $VERBOSE;
      $NEW{$url}{Status} = $OLD{$url}{Status};
    } else {
      my $res;
      # Cheap HEAD first, then GET for servers that refuse HEAD.
      for my $method (qw(HEAD GET)) {
        my $req = new HTTP::Request($method,$url);
        $res = fetch($req);
        # Signal arrived: requeue and jump straight to the continue
        # block of the labeled URL loop to checkpoint.
        push(@TODO, $url->as_string), next URL if $SAVE_WANTED;
        if ($res->is_success) {
          $NEW{$url}{Status} = "Verified (contents not examined)";
          $NEW{$url}{Checked} = $NEW{$url}{Good} = time;
          $NEW{$url}{LastModified} = 0;
          next URL;
        }
      }
      # Both methods failed; $res holds the final (GET) response.
      $NEW{$url}{Status} = "NOT Verified (status = ".($res->code).")";
      $NEW{$url}{Checked} = time;
      next;
    }
  } else {                      # $parse < 0
    warn "Skipping $url\n" if $VERBOSE;
    $NEW{$url}{Status} = "Skipped";
    $NEW{$url}{Checked} = 0;    # we no longer check this
    next;
  }
} continue {
  # Checkpoint-and-exit path taken when a signal set $SAVE_WANTED.
  if ($SAVE_WANTED) {
    warn "dumping data to $DATABASE...\n" if $VERBOSE;
    store [\%OLD, \%NEW, \@TODO], $DATABASE;
    exit 0;
  }
  redo if @TODO;                # more work: loop back to the URL block
  # Queue drained: save state, then report URLs that have been bad
  # for longer than $REPORT.
  warn "dumping data to $DATABASE...\n" if $VERBOSE;
  store [\%OLD, \%NEW, \@TODO], $DATABASE;
  print "\nBEGIN REPORT at ".localtime()."\n\n";
  for my $url (sort keys %NEW) {
    next if $url =~ /^requested:/i; # skip the synthetic seed entries
    my $entry = $NEW{$url};     # href
    # Only report if last check succeeded-ago exceeds the threshold.
    next unless $entry->{Checked} > $entry->{Good} + $REPORT;
    my $status = $entry->{Status};
    my $base = $entry->{Base};
    print "$url";
    print " (base $base)" if defined $base;
    print ":\n status: $status\n";
    for (qw(Checked Good LastModified)) {
      if (my $stamp = $entry->{$_}) {
        print " $_ => ".localtime($stamp)."\n";
      }
    }
    dump_relative($url, "from", $entry->{From});
    dump_relative($url, "to", $entry->{To});
  }
  print "\nEND REPORT\n\n";
  ## redo;
  exit 0;
}

## subroutines

# add_link($from, $line, $to): record that $from links to $to at
# $line, in both the From and To cross-reference maps, and queue $to
# for probing if it has never been seen before.
sub add_link {
  my ($from,$line,$to) = @_;
  $NEW{$to}{From}{$from}{$line}++;
  $NEW{$from}{To}{$to}{$line}++;
  unless (defined $NEW{$to}{Status}) {
    push @TODO, $to;
    $NEW{$to}{Status} =
"[to be probed]";
  }
}

# dump_relative($url, $label, $urls): for each URL in %$urls, print it
# relative to $url (labelled "from"/"to") followed by the numerically
# sorted line numbers on which the link occurs.
sub dump_relative {
  my ($url,$label,$urls) = @_;
  for my $other_url (sort keys %$urls) {
    # rel() yields an empty string for identical URLs; show "." then.
    my $relative = URI->new($other_url)->rel($url) || ".";
    print " $label $relative at ";
    print join " ", sort { $a <=> $b } keys %{$urls->{$other_url}};
    print "\n";
  }
}

sub URI::mailto::authority { ""; } # workaround bug