*/
/*
- HTML Purifier 2.1.2 - Standards Compliant HTML Filtering
- Copyright (C) 2006 Edward Z. Yang
+ HTML Purifier 2.1.3 - Standards Compliant HTML Filtering
+ Copyright (C) 2006-2007 Edward Z. Yang
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
// constants are slow, but we'll make one exception
define('HTMLPURIFIER_PREFIX', dirname(__FILE__));
-// almost every class has an undocumented dependency to these, so make sure
-// they get included
-require_once 'HTMLPurifier/ConfigSchema.php'; // important
+// every class has an undocumented dependency to these, must be included!
+require_once 'HTMLPurifier/ConfigSchema.php'; // fatal errors if not included
require_once 'HTMLPurifier/Config.php';
require_once 'HTMLPurifier/Context.php';
HTMLPurifier_ConfigSchema::define(
'Core', 'CollectErrors', false, 'bool', '
Whether or not to collect errors found while filtering the document. This
-is a useful way to give feedback to your users. CURRENTLY NOT IMPLEMENTED.
-This directive has been available since 2.0.0.
+is a useful way to give feedback to your users. <strong>Warning:</strong>
+Currently this feature is very patchy and experimental, with lots of
+possible error messages not yet implemented. It will not cause any problems,
+but it may not help your users either. This directive has been available
+since 2.0.0.
');
/**
- * Main library execution class.
+ * Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
*
- * Facade that performs calls to the HTMLPurifier_Lexer,
- * HTMLPurifier_Strategy and HTMLPurifier_Generator subsystems in order to
- * purify HTML.
+ * @note There are several points in which configuration can be specified
+ * for HTML Purifier. The precedence of these (from lowest to
+ * highest) is as follows:
+ * -# Instance: new HTMLPurifier($config)
+ * -# Invocation: purify($html, $config)
+ * These configurations are entirely independent of each other and
+ * are *not* merged.
*
* @todo We need an easier way to inject strategies, it'll probably end
* up getting done through config though.
class HTMLPurifier
{
- var $version = '2.1.2';
+ var $version = '2.1.3';
var $config;
- var $filters;
+ var $filters = array();
var $strategy, $generator;
/**
- * Final HTMLPurifier_Context of last run purification. Might be an array.
+ * Resultant HTMLPurifier_Context of last run purification. Is an array
+ * of contexts if the last called method was purifyArray().
* @public
*/
var $context;
$context->register('ErrorCollector', $error_collector);
}
+ // setup id_accumulator context, necessary due to the fact that
+ // AttrValidator can be called from many places
+ $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
+ $context->register('IDAccumulator', $id_accumulator);
+
$html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);
for ($i = 0, $size = count($this->filters); $i < $size; $i++) {
/**
* Singleton for enforcing just one HTML Purifier in your system
+ * @param $prototype Optional prototype HTMLPurifier instance to
+ * overload singleton with.
*/
function &getInstance($prototype = null) {
static $htmlpurifier;
$result = $uri->validate($config, $context);
if (!$result) break;
- // chained validation
+ // chained filtering
$uri_def =& $config->getDefinition('URI');
$result = $uri_def->filter($uri, $config, $context);
if (!$result) break;
<?php
require_once 'HTMLPurifier/AttrDef.php';
-require_once 'HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php';
class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
{
}
+// sub-implementations
+require_once 'HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php';
$definition = $config->getHTMLDefinition();
$e =& $context->get('ErrorCollector', true);
+ // initialize IDAccumulator if necessary
+ $ok =& $context->get('IDAccumulator', true);
+ if (!$ok) {
+ $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
+ $context->register('IDAccumulator', $id_accumulator);
+ }
+
// initialize CurrentToken if necessary
$current_token =& $context->get('CurrentToken', true);
if (!$current_token) $context->register('CurrentToken', $token);
var $type = 'optional';
function validateChildren($tokens_of_children, $config, &$context) {
$result = parent::validateChildren($tokens_of_children, $config, $context);
- if ($result === false) return array();
+ if ($result === false) { // parent rejected the children outright
+ if (empty($tokens_of_children)) return true; // no children were supplied: answer true rather than a replacement list (presumably "leave as is" - confirm with the consuming strategy)
+ else return array(); // children were supplied but rejected: hand back an empty array instead of false
+ }
return $result;
}
}
/**
* HTML Purifier's version
*/
- var $version = '2.1.2';
+ var $version = '2.1.3';
/**
* Two-level associative array of configuration directives
/**
* Adds a custom element to your HTML definition
* @note See HTMLPurifier_HTMLModule::addElement for detailed
- * parameter descriptions.
+ * parameter and return value descriptions.
*/
- function addElement($element_name, $type, $contents, $attr_collections, $attributes) {
+ function &addElement($element_name, $type, $contents, $attr_collections, $attributes) {
$module =& $this->getAnonymousModule();
// assume that if the user is calling this, the element
// is safe. This may not be a good idea
- $module->addElement($element_name, true, $type, $contents, $attr_collections, $attributes);
+ $element =& $module->addElement($element_name, true, $type, $contents, $attr_collections, $attributes); // literal true is the hard-coded "safe" argument the comment above refers to
+ return $element; // a by-reference function must return a variable, hence the named local
+ }
+
+ /**
+ * Adds a blank element to your HTML definition, for overriding
+ * existing behavior
+ * @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
+ * parameter and return value descriptions.
+ */
+ function &addBlankElement($element_name) {
+ $module =& $this->getAnonymousModule(); // same anonymous module that addElement() writes to
+ $element =& $module->addBlankElement($element_name); // forwarded unchanged to the module
+ return $element; // returned by reference, same pattern as addElement()
}
/**
require_once 'HTMLPurifier/AttrTransform/ImgSpace.php';
require_once 'HTMLPurifier/AttrTransform/EnumToCSS.php';
+require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php';
+
class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends
HTMLPurifier_HTMLModule_Tidy
{
{
var $name = 'Tidy_Strict';
var $defaultLevel = 'light';
+
+ function makeFixes() { // extends the parent's fix list with one blockquote entry
+ $r = parent::makeFixes(); // begin with whatever the parent class defines
+ $r['blockquote#content_model_type'] = 'strictblockquote'; // the type string getChildDef() in this class matches on
+ return $r;
+ }
+
+ var $defines_child_def = true;
+ function getChildDef($def) { // maps the 'strictblockquote' content model type to its ChildDef class
+ if ($def->content_model_type != 'strictblockquote') return parent::getChildDef($def); // every other type is left to the parent
+ return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model); // built from the element's content_model value
+ }
}
require_once 'HTMLPurifier/HTMLModule/Tidy.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/XHTMLAndHTML4.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/XHTML.php';
-require_once 'HTMLPurifier/HTMLModule/Tidy/XHTMLStrict.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/Proprietary.php';
HTMLPurifier_ConfigSchema::define(
$this->doctypes->register(
'XHTML 1.0 Strict', true,
array_merge($common, $xml, $non_xml),
- array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_XHTMLStrict', 'Tidy_Proprietary'),
+ array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary'),
array(),
'-//W3C//DTD XHTML 1.0 Strict//EN',
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
$this->doctypes->register(
'XHTML 1.1', true,
array_merge($common, $xml, array('Ruby')),
- array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_XHTMLStrict'), // Tidy_XHTML1_1
+ array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict'), // Tidy_XHTML1_1
array(),
'-//W3C//DTD XHTML 1.1//EN',
'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
<?php
+HTMLPurifier_ConfigSchema::define(
+ 'Attr', 'IDBlacklist', array(), 'list',
+ 'Array of IDs not allowed in the document.'
+);
+
/**
* Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
* @note In Slashdot-speak, dupe means duplicate.
- * @note This class does not accept $config or $context, thus, it is the
- * burden of the callee to register the appropriate errors or
- * configuration.
+ * @note The default constructor does not accept $config or $context objects:
+ * you must use the static build() factory method to perform initialization.
*/
class HTMLPurifier_IDAccumulator
{
*/
var $ids = array();
+ /**
+ * Builds an IDAccumulator, also initializing the default blacklist
+ * @param $config Instance of HTMLPurifier_Config
+ * @param $context Instance of HTMLPurifier_Context
+ * @return Fully initialized HTMLPurifier_IDAccumulator
+ * @static
+ */
+ function build($config, &$context) { // note: $context is accepted but never read in this body
+ $id_accumulator = new HTMLPurifier_IDAccumulator(); // constructor takes no arguments
+ $id_accumulator->load($config->get('Attr', 'IDBlacklist')); // hand the configured Attr.IDBlacklist list to load()
+ return $id_accumulator;
+ }
+
/**
* Add an ID to the lookup table.
* @param $id ID to be added.
* Injects tokens into the document while parsing for well-formedness.
* This enables "formatter-like" functionality such as auto-paragraphing,
* smiley-ification and linkification to take place.
+ *
+ * @todo Allow injectors to request a re-run on their output. This
+ * would help if an operation is recursive.
*/
class HTMLPurifier_Injector
{
*/
function handleElement(&$token) {}
+ /**
+ * Notifier that is called when an end token is processed
+ * @note This differs from handlers in that the token is read-only
+ */
+ function notifyEnd($token) {}
+
+
}
'AutoFormat', 'AutoParagraph', false, 'bool', '
<p>
This directive turns on auto-paragraphing, where double newlines are
- converted in to paragraphs whenever possible. Auto-paragraphing
- applies when:
+ converted in to paragraphs whenever possible. Auto-paragraphing:
</p>
<ul>
- <li>There are inline elements or text in the root node</li>
- <li>There are inline elements or text with double newlines or
- block elements in nodes that allow paragraph tags</li>
- <li>There are double newlines in paragraph tags</li>
+ <li>Always applies to inline elements or text in the root node,</li>
+ <li>Applies to inline elements or text with double newlines in nodes
+ that allow paragraph tags,</li>
+ <li>Applies to double newlines in paragraph tags</li>
</ul>
<p>
<code>p</code> tags must be allowed for this directive to take effect.
We do not use <code>br</code> tags for paragraphing, as that is
semantically incorrect.
</p>
+<p>
+ To prevent auto-paragraphing as a content-producer, refrain from using
+ double-newlines except to specify a new paragraph or in contexts where
+ it has special meaning (whitespace usually has no meaning except in
+ tags like <code>pre</code>, so this should not be difficult.) To prevent
+ the paragraphing of inline text adjacent to block elements, wrap them
+ in <code>div</code> tags (the behavior is slightly different outside of
+ the root node.)
+</p>
<p>
This directive has been available since 2.0.1.
</p>
$ok = false;
// test if up-coming tokens are either block or have
// a double newline in them
+ $nesting = 0;
for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
if ($this->inputTokens[$i]->type == 'start'){
if (!$this->_isInline($this->inputTokens[$i])) {
- $ok = true;
+ // we haven't found a double-newline, and
+ // we've hit a block element, so don't paragraph
+ $ok = false;
+ break;
}
- break;
+ $nesting++;
+ }
+ if ($this->inputTokens[$i]->type == 'end') {
+ if ($nesting <= 0) break;
+ $nesting--;
}
- if ($this->inputTokens[$i]->type == 'end') break;
if ($this->inputTokens[$i]->type == 'text') {
+ // found it!
if (strpos($this->inputTokens[$i]->data, "\n\n") !== false) {
$ok = true;
+ break;
}
- if (!$this->inputTokens[$i]->is_whitespace) break;
}
}
if ($ok) {
}
HTMLPurifier_ConfigSchema::define(
- 'Core', 'AcceptFullDocuments', true, 'bool',
- 'This parameter determines whether or not the filter should accept full '.
- 'HTML documents, not just HTML fragments. When on, it will '.
- 'drop all sections except the content between body.'
-);
+ 'Core', 'ConvertDocumentToFragment', true, 'bool', '
+This parameter determines whether or not the filter should convert
+input that is a full document with html and body tags to a fragment
+of just the contents of a body tag. This parameter is simply something
+HTML Purifier can do during an edge-case: for most inputs, this
+processing is not necessary.
+');
+HTMLPurifier_ConfigSchema::defineAlias('Core', 'AcceptFullDocuments', 'Core', 'ConvertDocumentToFragment');
HTMLPurifier_ConfigSchema::define(
'Core', 'LexerImpl', null, 'mixed/null', '
function normalize($html, $config, &$context) {
// extract body from document if applicable
- if ($config->get('Core', 'AcceptFullDocuments')) {
+ if ($config->get('Core', 'ConvertDocumentToFragment')) {
$html = $this->extractBody($html);
}
$segment = substr($html, $cursor, $strlen_segment);
+ if ($segment === false) {
+ // somehow, we attempted to access beyond the end of
+ // the string, defense-in-depth, reported by Nate Abele
+ break;
+ }
+
// Check if it's a comment
if (
- substr($segment, 0, 3) == '!--'
+ substr($segment, 0, 3) === '!--'
) {
// re-determine segment length, looking for -->
$position_comment_end = strpos($html, '-->', $cursor);
// trailing slash. Remember, we could have a tag like <br>, so
// any later token processing scripts must convert improperly
// classified EmptyTags from StartTags.
- $is_self_closing= (strrpos($segment,'/') === $strlen_segment-1);
+ $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
if ($is_self_closing) {
$strlen_segment--;
$segment = substr($segment, 0, $strlen_segment);
\r
}\r
\r
-// begin PHP5P source code here\r
-\r
/*\r
\r
Copyright 2007 Jeroen van der Meer <http://jero.net/> \r
}\r
}\r
\r
- private function generateImpliedEndTags(array $exclude = array()) {\r
+ private function generateImpliedEndTags($exclude = array()) {\r
/* When the steps below require the UA to generate implied end tags,\r
then, if the current node is a dd element, a dt element, an li element,\r
a p element, a td element, a th element, or a tr element, the UA must\r
}\r
}\r
\r
- private function getElementCategory($name) {\r
+ private function getElementCategory($node) {\r
+ $name = $node->tagName;\r
if(in_array($name, $this->special))\r
return self::SPECIAL;\r
\r
return $this->dom;\r
}\r
}\r
+?>\r
//################################################################//
// Process result by interpreting $result
- if ($result === true) {
+ if ($result === true || $child_tokens === $result) {
// leave the node as is
// register start token as a parental node start
$definition = $config->getHTMLDefinition();
- // CurrentNesting
+ // local variables
+ $result = array();
+ $generator = new HTMLPurifier_Generator();
+ $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
+ $e =& $context->get('ErrorCollector', true);
+
+ // member variables
$this->currentNesting = array();
- $context->register('CurrentNesting', $this->currentNesting);
+ $this->inputIndex = false;
+ $this->inputTokens =& $tokens;
+ $this->outputTokens =& $result;
- // InputIndex
- $this->inputIndex = false;
+ // context variables
+ $context->register('CurrentNesting', $this->currentNesting);
$context->register('InputIndex', $this->inputIndex);
-
- // InputTokens
$context->register('InputTokens', $tokens);
- $this->inputTokens =& $tokens;
-
- // OutputTokens
- $result = array();
- $this->outputTokens =& $result;
-
- // %Core.EscapeInvalidTags
- $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
- $generator = new HTMLPurifier_Generator();
-
- $e =& $context->get('ErrorCollector', true);
// -- begin INJECTOR --
trigger_error("Cannot enable $name injector because $error is not allowed", E_USER_WARNING);
}
+ // warning: most foreach loops follow the convention $i => $x.
+ // be sure, for PHP4 compatibility, to only perform write operations
+ // directly referencing the object using $i: $x is only safe for reads
+
// -- end INJECTOR --
$token = false;
// if all goes well, this token will be passed through unharmed
$token = $tokens[$this->inputIndex];
+ //printTokens($tokens, $this->inputIndex);
+
foreach ($this->injectors as $i => $x) {
if ($x->skip > 0) $this->injectors[$i]->skip--;
}
if ($token->type === 'text') {
// injector handler code; duplicated for performance reasons
foreach ($this->injectors as $i => $x) {
- if (!$x->skip) $x->handleText($token);
+ if (!$x->skip) $this->injectors[$i]->handleText($token);
if (is_array($token)) {
$this->currentInjector = $i;
break;
// injector handler code; duplicated for performance reasons
if ($ok) {
foreach ($this->injectors as $i => $x) {
- if (!$x->skip) $x->handleElement($token);
+ if (!$x->skip) $this->injectors[$i]->handleElement($token);
if (is_array($token)) {
$this->currentInjector = $i;
break;
$current_parent = array_pop($this->currentNesting);
if ($current_parent->name == $token->name) {
$result[] = $token;
+ foreach ($this->injectors as $i => $x) {
+ $this->injectors[$i]->notifyEnd($token);
+ }
continue;
}
// okay, we found it, close all the skipped tags
// note that skipped tags contains the element we need closed
- $size = count($skipped_tags);
- for ($i = $size - 1; $i > 0; $i--) {
- if ($e && !isset($skipped_tags[$i]->armor['MakeWellFormed_TagClosedError'])) {
+ for ($i = count($skipped_tags) - 1; $i >= 0; $i--) {
+ if ($i && $e && !isset($skipped_tags[$i]->armor['MakeWellFormed_TagClosedError'])) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$i]);
}
- $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
+ $result[] = $new_token = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
+ foreach ($this->injectors as $j => $x) { // $j, not $i!!!
+ $this->injectors[$j]->notifyEnd($new_token);
+ }
}
- $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
-
}
$context->destroy('CurrentNesting');
$context->destroy('InputIndex');
$context->destroy('CurrentToken');
- // we're at the end now, fix all still unclosed tags
- // not using processToken() because at this point we don't
- // care about current nesting
+ // we're at the end now, fix all still unclosed tags (this is
+ // duplicated from the end of the loop with some slight modifications)
+ // not using $skipped_tags since it would invariably be all of them
if (!empty($this->currentNesting)) {
- $size = count($this->currentNesting);
- for ($i = $size - 1; $i >= 0; $i--) {
+ for ($i = count($this->currentNesting) - 1; $i >= 0; $i--) {
if ($e && !isset($this->currentNesting[$i]->armor['MakeWellFormed_TagClosedError'])) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $this->currentNesting[$i]);
}
- $result[] =
- new HTMLPurifier_Token_End($this->currentNesting[$i]->name);
+ $result[] = $new_token = new HTMLPurifier_Token_End($this->currentNesting[$i]->name);
+ foreach ($this->injectors as $j => $x) { // $j, not $i!!!
+ $this->injectors[$j]->notifyEnd($new_token);
+ }
}
}
// adjust the injector skips based on the array substitution
if ($this->injectors) {
- $offset = count($token) + 1;
+ $offset = count($token);
for ($i = 0; $i <= $this->currentInjector; $i++) {
+ // because of the skip back, we need to add one more
+ // for uninitialized injectors. I'm not exactly
+ // sure why this is the case, but I think it has to
+ // do with the fact that we're decrementing skips
+ // before re-checking text
+ if (!$this->injectors[$i]->skip) $this->injectors[$i]->skip++;
$this->injectors[$i]->skip += $offset;
}
}
// mostly everything's good, but
// we need to make sure required attributes are in order
if (
+ ($token->type === 'start' || $token->type === 'empty') &&
$definition->info[$token->name]->required_attr &&
($token->name != 'img' || $remove_invalid_img) // ensure config option still works
) {
$token->armor['ValidateAttributes'] = true;
}
- // CAN BE GENERICIZED
if (isset($hidden_elements[$token->name]) && $token->type == 'start') {
$textify_comments = $token->name;
} elseif ($token->name === $textify_comments && $token->type == 'end') {
require_once 'HTMLPurifier/AttrValidator.php';
-HTMLPurifier_ConfigSchema::define(
- 'Attr', 'IDBlacklist', array(), 'list',
- 'Array of IDs not allowed in the document.');
-
/**
* Validate all attributes in the tokens.
*/
function execute($tokens, $config, &$context) {
- // setup id_accumulator context
- $id_accumulator = new HTMLPurifier_IDAccumulator();
- $id_accumulator->load($config->get('Attr', 'IDBlacklist'));
- $context->register('IDAccumulator', $id_accumulator);
-
// setup validator
$validator = new HTMLPurifier_AttrValidator();
$tokens[$key] = $token; // for PHP 4
}
-
- $context->destroy('IDAccumulator');
$context->destroy('CurrentToken');
return $tokens;
<?php
/**
- * Chainable filters for custom URI processing
+ * Chainable filters for custom URI processing.
+ *
+ * These filters can perform custom actions on a URI filter object,
+ * including transformation or blacklisting.
+ *
+ * @warning This filter is called before scheme object validation occurs.
+ * Make sure, if you require a specific scheme object, that you
+ * check that it exists. This allows filters to convert
+ * proprietary URI schemes into regular ones.
*/
class HTMLPurifier_URIFilter
{
+
+ /**
+ * Unique identifier of filter
+ */
var $name;
/**
* @param &$uri Reference to URI object
* @param $config Instance of HTMLPurifier_Config
* @param &$context Instance of HTMLPurifier_Context
+ * @return bool Whether or not to continue processing: false indicates
+ * URL is no good, true indicates continue processing. Note that
+ * all changes are committed directly on the URI object
*/
function filter(&$uri, $config, &$context) {
// base implementation does no filtering: it only raises E_USER_ERROR and
// never touches $uri (presumably subclasses are meant to override it)
trigger_error('Cannot call abstract function', E_USER_ERROR);
}
+
}
// absolute URI already: don't change
if (!is_null($uri->host)) return true;
$scheme_obj = $uri->getSchemeObj($config, $context);
+ if (!$scheme_obj) {
+ // scheme not recognized
+ return false;
+ }
if (!$scheme_obj->hierarchical) {
// non-hierarchal URI with explicit scheme, don't change
return true;
-Description of HTML Purifier v2.1.2 Lite library import into Moodle
+Description of HTML Purifier v2.1.3 Lite library import into Moodle
Changes:
* HTMLModule/Text.php - added <nolink>, <tex>, <lang> and <algebra> tags
* HTMLModule/XMLCommonAttributes.php - remove xml:lang - needed for multilang
- * AttrDef/Lang.php - relaxt lang check - needed for multilang
+ * AttrDef/Lang.php - relax lang check - needed for multilang
skodak