package My_HTML_Filter;
use strict;
require XML::LibXML;
my $PARSER = XML::LibXML->new;
sub new {
my $class = shift;
my $permitted = shift;
return bless { permitted => $permitted }, $class;
}
sub strip {
my $self = shift;
my $html = shift;
my $dom = $PARSER->parse_html_string($html) or die "Cannot parse";
my $permitted = $self->{permitted};
my $cur = $dom->firstChild;
while ($cur) {
my $delete = 0; # default to safe
## I really really hate switching on class names
## but this is a bad interface design {sigh}
if (ref $cur eq "XML::LibXML::Element") {
## "that which is not explicitly permitted is forbidden!"
if (my $ok_attr = $permitted->{$cur->nodeName}) {
## so this element is permitted, but what about its attributes?
for my $att ($cur->attributes) {
my $name = $att->nodeName;
$cur->removeAttribute($name) unless $ok_attr->{$name};
}
## now descend if any kids
if (my $next = $cur->firstChild) {
$cur = $next;
next; # don't execute code at bottom
}
} else {
## bogon - delete!
## we must hoist any kids to be after our current position in
## reverse order, since we always inserting right after old node
my $parent = $cur->parentNode or die "Expecting parent of $cur";
for (reverse $cur->childNodes) {
$parent->insertAfter($_, $cur);
}
## and flag this one for deletion
$delete = 1;
## fall out
}
} elsif (ref $cur eq "XML::LibXML::Text"
or ref $cur eq "XML::LibXML::CDATASection") {
## fall out
} elsif (ref $cur eq "XML::LibXML::Dtd"
or ref $cur eq "XML::LibXML::Comment") {
## delete these
$delete = 1;
## fall out
} else {
warn "[what to do with a $cur?]"; # I hope we don't hit this
}
## determine next node ala XPath "following::node()[1]"
my $next = $cur;
{
if (my $sib = $next->nextSibling) {
$next = $sib;
last;
}
## no sibling... must try parent node's sibling
$next = $next->parentNode;
redo if $next;
}
## $next might be undef at this point, and we'll be done
## delete the current node if needed
$cur->parentNode->removeChild($cur)
if $delete;
$cur = $next;
}
my $output_html = $dom->toStringHTML;
$output_html =~ s/.*\n//; # strip the doctype
return $output_html;
}
1;