mirror of
https://github.com/OpenXE-org/OpenXE.git
synced 2025-01-15 16:21:14 +01:00
182 lines
7.4 KiB
PHP
182 lines
7.4 KiB
PHP
<?php
/**
 * Takes a well formed list of tokens and fixes their nesting.
 *
 * HTML elements dictate which elements are allowed to be their children,
 * for example, you can't have a p tag in a span tag.  Other elements have
 * much more rigorous definitions: tables, for instance, require a specific
 * order for their elements.  There are also constraints not expressible by
 * document type definitions, such as the chameleon nature of ins/del
 * tags and global child exclusions.
 *
 * The first major objective of this strategy is to iterate through all
 * the nodes and determine whether or not their children conform to the
 * element's definition.  If they do not, the child definition may
 * optionally supply an amended list of elements that is valid or
 * require that the entire node be deleted (and the previous node
 * rescanned).
 *
 * The second objective is to ensure that explicitly excluded elements of
 * an element do not appear in its children.  Code that accomplishes this
 * task is pervasive through the strategy, though the two are distinct tasks
 * and could, theoretically, be separated (although it's not recommended).
 *
 * @note Whether or not unrecognized children are silently dropped or
 *       translated into text depends on the child definitions.
 *
 * @todo Enable nodes to be bubbled out of the structure.  This is
 *       easier with our new algorithm.
 */
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Validates the nesting of a token list and returns the corrected list.
     *
     * The tokens are converted to a node tree, the tree is walked in
     * post-order with an explicit stack (children are validated before
     * their parent), and the resulting tree is flattened back to tokens.
     * A node is marked dead when it is excluded by one of its ancestors
     * (unless Core.DisableExcludes is set) or when its child definition
     * rejects it outright; dead nodes are filtered out of their parent's
     * child list before the parent itself is validated.
     *
     * While running, the context variables 'IsInline', 'CurrentNode' and
     * 'CurrentToken' are registered; they are destroyed before returning.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {

        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();
        $parent_def = $definition->info_parent_def;

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        //
        // NOTE(review): $is_inline, $node and $token below are handed to
        // $context->register(); presumably the context binds them by
        // reference so that later assignments in the loop are visible to
        // child definitions and the error collector. Do not rename or
        // replace these variables without confirming that.
        $is_inline = $parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $node = $top_node;
        // dummy token (only the first element of the pair is needed)
        list($token) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits.  This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //  function f($node) {
        //      foreach ($node->children as $child) {
        //          f($child);
        //      }
        //      validate($node);
        //  }
        //
        // Thus, we will represent a stack frame as
        //      array($node, $is_inline, $excludes, $ix)
        // where $excludes maps the element names excluded by the node's
        // ancestors and $ix is the index of the next child of $node that
        // still has to be visited.

        $stack = array(
            array($top_node,
                  $parent_def->descendants_are_inline,
                  $parent_def->excludes, // exclusions
                  0)
        );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);
            // recursive call
            $go = false;
            // the root frame is the only one that leaves the stack empty
            // once popped; it is validated against the parent definition
            $def = empty($stack) ? $parent_def : $definition->info[$node->name];
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if ($child instanceof HTMLPurifier_Node_Element) {
                    $go = true;
                    // resume this node at the next child once the
                    // element child has been fully processed
                    $stack[] = array($node, $is_inline, $excludes, $ix);
                    $stack[] = array($child,
                        // ToDo: I don't think it matters if it's def or
                        // child_def, but double check this...
                        $is_inline || $def->descendants_are_inline,
                        empty($def->excludes) ? $excludes
                                              : array_merge($excludes, $def->excludes),
                        0);
                    break;
                }
            }
            if ($go) {
                continue;
            }
            list($token) = $node->toTokenPair();
            // base case
            if ($excludes_enabled && isset($excludes[$node->name])) {
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                }
            } else {
                // XXX I suppose it would be slightly more efficient to
                // avoid the allocation here and have children
                // strategies handle it
                $children = array();
                foreach ($node->children as $child) {
                    if (!$child->dead) {
                        $children[] = $child;
                    }
                }
                $result = $def->child->validateChildren($children, $config, $context);
                if ($result === true) {
                    // children are valid as they stand; still store the
                    // filtered list so that dead children are dropped
                    $node->children = $children;
                } elseif ($result === false) {
                    $node->dead = true;
                    if ($e) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                    }
                } else {
                    // the child definition supplied an amended child list
                    $node->children = $result;
                    if ($e) {
                        // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                        if (empty($result) && !empty($children)) {
                            $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                        } elseif ($result != $children) {
                            $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                        }
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // the root frame sits at the bottom of the stack and is therefore
        // the last one popped, so $node is the top node again here
        return HTMLPurifier_Arborize::flatten($node, $config, $context);
    }
}

// vim: et sw=4 sts=4