package My_HTML_Filter; use strict; require XML::LibXML; my $PARSER = XML::LibXML->new; sub new { my $class = shift; my $permitted = shift; return bless { permitted => $permitted }, $class; } sub strip { my $self = shift; my $html = shift; my $dom = $PARSER->parse_html_string($html) or die "Cannot parse"; my $permitted = $self->{permitted}; my $cur = $dom->firstChild; while ($cur) { my $delete = 0; # default to safe ## I really really hate switching on class names ## but this is a bad interface design {sigh} if (ref $cur eq "XML::LibXML::Element") { ## "that which is not explicitly permitted is forbidden!" if (my $ok_attr = $permitted->{$cur->nodeName}) { ## so this element is permitted, but what about its attributes? for my $att ($cur->attributes) { my $name = $att->nodeName; $cur->removeAttribute($name) unless $ok_attr->{$name}; } ## now descend if any kids if (my $next = $cur->firstChild) { $cur = $next; next; # don't execute code at bottom } } else { ## bogon - delete! ## we must hoist any kids to be after our current position in ## reverse order, since we always inserting right after old node my $parent = $cur->parentNode or die "Expecting parent of $cur"; for (reverse $cur->childNodes) { $parent->insertAfter($_, $cur); } ## and flag this one for deletion $delete = 1; ## fall out } } elsif (ref $cur eq "XML::LibXML::Text" or ref $cur eq "XML::LibXML::CDATASection") { ## fall out } elsif (ref $cur eq "XML::LibXML::Dtd" or ref $cur eq "XML::LibXML::Comment") { ## delete these $delete = 1; ## fall out } else { warn "[what to do with a $cur?]"; # I hope we don't hit this } ## determine next node ala XPath "following::node()[1]" my $next = $cur; { if (my $sib = $next->nextSibling) { $next = $sib; last; } ## no sibling... must try parent node's sibling $next = $next->parentNode; redo if $next; } ## $next might be undef at this point, and we'll be done ## delete the current node if needed $cur->parentNode->removeChild($cur) if $delete; $cur = $next; } my $output_html = $dom->toStringHTML; $output_html =~ s/.*\n//; # strip the doctype return $output_html; } 1;