package My_HTML_Filter;
use strict;
require XML::LibXML;
my $PARSER = XML::LibXML->new;

sub new {
  my $class = shift;
  my $permitted = shift;
  return bless { permitted => $permitted }, $class;
}

sub strip {
  my $self = shift;
  my $html = shift;

  my $dom = $PARSER->parse_html_string($html) or die "Cannot parse";
  my $permitted = $self->{permitted};

  my $cur = $dom->firstChild;
  while ($cur) {
    my $delete = 0;             # default to safe

    ## I really really hate switching on class names
    ## but this is a bad interface design {sigh}
    if (ref $cur eq "XML::LibXML::Element") {
      ## "that which is not explicitly permitted is forbidden!"
      if (my $ok_attr = $permitted->{$cur->nodeName}) {
        ## so this element is permitted, but what about its attributes?
        for my $att ($cur->attributes) {
          my $name = $att->nodeName;
          $cur->removeAttribute($name) unless $ok_attr->{$name};
        }
        ## now descend if any kids
        if (my $next = $cur->firstChild) {
          $cur = $next;
          next;                 # don't execute code at bottom
        }
      } else {
        ## bogon - delete!
        ## we must hoist any kids to be after our current position in
        ## reverse order, since we always inserting right after old node
        my $parent = $cur->parentNode or die "Expecting parent of $cur";
        for (reverse $cur->childNodes) {
          $parent->insertAfter($_, $cur);
        }
        ## and flag this one for deletion
        $delete = 1;
        ## fall out
      }
    } elsif (ref $cur eq "XML::LibXML::Text"
             or ref $cur eq "XML::LibXML::CDATASection") {
      ## fall out
    } elsif (ref $cur eq "XML::LibXML::Dtd"
             or ref $cur eq "XML::LibXML::Comment") {
      ## delete these
      $delete = 1;
      ## fall out
    } else {
      warn "[what to do with a $cur?]"; # I hope we don't hit this
    }

    ## determine next node ala XPath "following::node()[1]"
    my $next = $cur;
    {
      if (my $sib = $next->nextSibling) {
        $next = $sib;
        last;
      }
      ## no sibling... must try parent node's sibling
      $next = $next->parentNode;
      redo if $next;
    }
    ## $next might be undef at this point, and we'll be done

    ## delete the current node if needed
    $cur->parentNode->removeChild($cur)
      if $delete;

    $cur = $next;
  }

  my $output_html = $dom->toStringHTML;
  $output_html =~ s/.*\n//;     # strip the doctype

  return $output_html;
}

1;