]> git.mjollnir.org Git - moodle.git/commitdiff
MDL-14894 html purifier updated, merged from MOODLE_19_STABLE
authorskodak <skodak>
Mon, 19 May 2008 06:24:33 +0000 (06:24 +0000)
committerskodak <skodak>
Mon, 19 May 2008 06:24:33 +0000 (06:24 +0000)
27 files changed:
lib/htmlpurifier/HTMLPurifier.php
lib/htmlpurifier/HTMLPurifier/AttrDef.php
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS.php
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Background.php
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Border.php
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Color.php
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/URI.php
lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php
lib/htmlpurifier/HTMLPurifier/CSSDefinition.php
lib/htmlpurifier/HTMLPurifier/Config.php
lib/htmlpurifier/HTMLPurifier/DefinitionCache.php
lib/htmlpurifier/HTMLPurifier/DefinitionCacheFactory.php
lib/htmlpurifier/HTMLPurifier/ElementDef.php
lib/htmlpurifier/HTMLPurifier/Encoder.php
lib/htmlpurifier/HTMLPurifier/HTMLDefinition.php
lib/htmlpurifier/HTMLPurifier/IDAccumulator.php
lib/htmlpurifier/HTMLPurifier/Language.php
lib/htmlpurifier/HTMLPurifier/Language/messages/en-x-testmini.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/LanguageFactory.php
lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php
lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php
lib/htmlpurifier/HTMLPurifier/PercentEncoder.php
lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php
lib/htmlpurifier/HTMLPurifier/URI.php
lib/htmlpurifier/HTMLPurifier/URIParser.php
lib/htmlpurifier/readme_moodle.txt

index e9dfe5f4048fafcd8e2a35da1f5b58ed9aee7a65..a7bba317e5e2697fdbcf958ebe5d09a4ac5078f8 100644 (file)
@@ -22,7 +22,7 @@
  */
 
 /*
-    HTML Purifier 2.1.3 - Standards Compliant HTML Filtering
+    HTML Purifier 2.1.4 - Standards Compliant HTML Filtering
     Copyright (C) 2006-2007 Edward Z. Yang
 
     This library is free software; you can redistribute it and/or
@@ -83,7 +83,7 @@ since 2.0.0.
 class HTMLPurifier
 {
     
-    var $version = '2.1.3';
+    var $version = '2.1.4';
     
     var $config;
     var $filters = array();
@@ -213,7 +213,7 @@ class HTMLPurifier
      * @param $prototype Optional prototype HTMLPurifier instance to
      *                   overload singleton with.
      */
-    function &getInstance($prototype = null) {
+    function &instance($prototype = null) {
         static $htmlpurifier;
         if (!$htmlpurifier || $prototype) {
             if (is_a($prototype, 'HTMLPurifier')) {
@@ -227,6 +227,9 @@ class HTMLPurifier
         return $htmlpurifier;
     }
     
+    function &getInstance($prototype = null) {
+        return HTMLPurifier::instance($prototype);
+    }
     
 }
 
index 882b6260432ac0b7eab44c6b7c42c4a9ea018c9c..e94ee713d2e6e962b1bdadd3cd8a2a0213ea6a23 100644 (file)
@@ -82,5 +82,13 @@ class HTMLPurifier_AttrDef
         return $this;
     }
     
+    /**
+     * Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
+     * properly. THIS IS A HACK!
+     */
+    function mungeRgb($string) {
+        return preg_replace('/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/', 'rgb(\1,\2,\3)', $string);
+    }
+    
 }
 
index d0f49bc4adeac21e2d1b9fe117434faa4bd40152..71523be1f1df6b6e321eedb352e20d1739a86139 100644 (file)
@@ -38,7 +38,20 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
             list($property, $value) = explode(':', $declaration, 2);
             $property = trim($property);
             $value    = trim($value);
-            if (!isset($definition->info[$property])) continue;
+            $ok = false;
+            do {
+                if (isset($definition->info[$property])) {
+                    $ok = true;
+                    break;
+                }
+                if (ctype_lower($property)) break;
+                $property = strtolower($property);
+                if (isset($definition->info[$property])) {
+                    $ok = true;
+                    break;
+                }
+            } while(0);
+            if (!$ok) continue;
             // inefficient call, since the validator will do this again
             if (strtolower(trim($value)) !== 'inherit') {
                 // inherit works for everything (but only on the base property)
index b82e98e5812449110296f3a9f18d546aca3766c1..a5c1046a827492a2a9fc70b4112c6a4e89d679af 100644 (file)
@@ -31,6 +31,9 @@ class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
         $string = $this->parseCDATA($string);
         if ($string === '') return false;
         
+        // munge rgb() decl if necessary
+        $string = $this->mungeRgb($string);
+        
         // assumes URI doesn't have spaces in it
         $bits = explode(' ', strtolower($string)); // bits to process
         
index f6d4d684e3c52877a612c3239bab5120cb25cb3d..4eb3e25abd58806e0c5be2f02f029200f5aeabb3 100644 (file)
@@ -22,7 +22,7 @@ class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
     
     function validate($string, $config, &$context) {
         $string = $this->parseCDATA($string);
-        // we specifically will not support rgb() syntax with spaces
+        $string = $this->mungeRgb($string);
         $bits = explode(' ', $string);
         $done = array(); // segments we've finished
         $ret = ''; // return value
index 30b38f9293531e464d827c7827c1a1ef1cfe6eee..a6711f717556635fff5ee85a97659a469a5d5bd0 100644 (file)
@@ -39,20 +39,13 @@ class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
         if ($colors === null) $colors = $config->get('Core', 'ColorKeywords');
         
         $color = trim($color);
-        if (!$color) return false;
+        if ($color === '') return false;
         
         $lower = strtolower($color);
         if (isset($colors[$lower])) return $colors[$lower];
         
-        if ($color[0] === '#') {
-            // hexadecimal handling
-            $hex = substr($color, 1);
-            $length = strlen($hex);
-            if ($length !== 3 && $length !== 6) return false;
-            if (!ctype_xdigit($hex)) return false;
-        } else {
+        if (strpos($color, 'rgb(') !== false) {
             // rgb literal handling
-            if (strpos($color, 'rgb(')) return false;
             $length = strlen($color);
             if (strpos($color, ')') !== $length - 1) return false;
             $triad = substr($color, 4, $length - 4 - 1);
@@ -90,6 +83,17 @@ class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
             }
             $new_triad = implode(',', $new_parts);
             $color = "rgb($new_triad)";
+        } else {
+            // hexadecimal handling
+            if ($color[0] === '#') {
+                $hex = substr($color, 1);
+            } else {
+                $hex = $color;
+                $color = '#' . $color;
+            }
+            $length = strlen($hex);
+            if ($length !== 3 && $length !== 6) return false;
+            if (!ctype_xdigit($hex)) return false;
         }
         
         return $color;
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php
new file mode 100644 (file)
index 0000000..b0a6db9
--- /dev/null
@@ -0,0 +1,26 @@
+<?php
+
+/**
+ * Decorator which enables CSS properties to be disabled for specific elements.
+ */
+class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
+{
+    var $def, $element;
+    
+    /**
+     * @param $def Definition to wrap
+     * @param $element Element to deny
+     */
+    function HTMLPurifier_AttrDef_CSS_DenyElementDecorator(&$def, $element) {
+        $this->def =& $def;
+        $this->element = $element;
+    }
+    /**
+     * Checks if CurrentToken is set and equal to $this->element
+     */
+    function validate($string, $config, $context) {
+        $token = $context->get('CurrentToken', true);
+        if ($token && $token->name == $this->element) return false;
+        return $this->def->validate($string, $config, $context);
+    }
+}
index 0e9a5f4739839ea4903907effc4861ad27244d63..52b4193b98271c9620a1f22a91faf640dadb4be9 100644 (file)
@@ -68,7 +68,7 @@ HTMLPurifier_ConfigSchema::define(
 class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
 {
     
-    var $parser, $percentEncoder;
+    var $parser;
     var $embedsResource;
     
     /**
@@ -76,7 +76,6 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
      */
     function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
         $this->parser = new HTMLPurifier_URIParser();
-        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
         $this->embedsResource = (bool) $embeds_resource;
     }
     
@@ -84,9 +83,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
         
         if ($config->get('URI', 'Disable')) return false;
         
-        // initial operations
         $uri = $this->parseCDATA($uri);
-        $uri = $this->percentEncoder->normalize($uri);
         
         // parse the URI
         $uri = $this->parser->parse($uri);
@@ -122,13 +119,6 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
         $context->destroy('EmbeddedURI');
         if (!$ok) return false;
         
-        // munge scheme off if necessary (this must be last)
-        if (!is_null($uri->scheme) && is_null($uri->host)) {
-            if ($uri_def->defaultScheme == $uri->scheme) {
-                $uri->scheme = null;
-            }
-        }
-        
         // back to string
         $result = $uri->toString();
         
index ac729ebd93c03579d5101a8620428910f1e589be..4812ad1d3dc46ae59b4ede735d357172a8189c17 100644 (file)
@@ -40,11 +40,23 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
         $ipv4 = $this->ipv4->validate($string, $config, $context);
         if ($ipv4 !== false) return $ipv4;
         
-        // validate a domain name here, do filtering, etc etc etc
+        // A regular domain name.
         
-        // We could use this, but it would break I18N domain names
-        //$match = preg_match('/^[a-z0-9][\w\-\.]*[a-z0-9]$/i', $string);
-        //if (!$match) return false;
+        // This breaks I18N domain names, but we don't have proper IRI support,
+        // so force users to insert Punycode. If there's complaining we'll 
+        // try to fix things into an international friendly form.
+        
+        // The productions describing this are:
+        $a   = '[a-z]';     // alpha
+        $an  = '[a-z0-9]';  // alphanum
+        $and = '[a-z0-9-]'; // alphanum | "-"
+        // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
+        $domainlabel   = "$an($and*$an)?";
+        // toplabel    = alpha | alpha *( alphanum | "-" ) alphanum
+        $toplabel      = "$a($and*$an)?";
+        // hostname    = *( domainlabel "." ) toplabel [ "." ]
+        $match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string);
+        if (!$match) return false;
         
         return $string;
     }
index 2acf7cf83b2f0e0af4854dea5b84e95b9114df49..2fc73b905d286f651480a5bb6a55f02c91b32dd7 100644 (file)
@@ -7,6 +7,7 @@ require_once 'HTMLPurifier/AttrDef/CSS/BackgroundPosition.php';
 require_once 'HTMLPurifier/AttrDef/CSS/Border.php';
 require_once 'HTMLPurifier/AttrDef/CSS/Color.php';
 require_once 'HTMLPurifier/AttrDef/CSS/Composite.php';
+require_once 'HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php';
 require_once 'HTMLPurifier/AttrDef/CSS/Font.php';
 require_once 'HTMLPurifier/AttrDef/CSS/FontFamily.php';
 require_once 'HTMLPurifier/AttrDef/CSS/Length.php';
@@ -176,12 +177,13 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
         ));
         
         $this->info['width'] =
-        $this->info['height'] = 
+        $this->info['height'] =
+        new HTMLPurifier_AttrDef_CSS_DenyElementDecorator(
         new HTMLPurifier_AttrDef_CSS_Composite(array(
             new HTMLPurifier_AttrDef_CSS_Length(true),
             new HTMLPurifier_AttrDef_CSS_Percentage(true),
             new HTMLPurifier_AttrDef_Enum(array('auto'))
-        ));
+        )), 'img');
         
         $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
         
index 203542f0aa7c296b4f1481994ce33ccec1684fda..1c043aeb713b3f8be14e500c629da06843ec8ca8 100644 (file)
@@ -42,7 +42,7 @@ class HTMLPurifier_Config
     /**
      * HTML Purifier's version
      */
-    var $version = '2.1.3';
+    var $version = '2.1.4';
     
     /**
      * Two-level associative array of configuration directives
index d4c9d239f2e8eba84cdcfbc711982bf415965bd4..5b14fdfe4dd7f1f9bb3b2f3be4c38a92ebd10dff 100644 (file)
@@ -120,6 +120,9 @@ class HTMLPurifier_DefinitionCache
     
     /**
      * Clears all expired (older version or revision) objects from cache
+     * @note Be careful when implementing this method as flush. Flush must
+     *       not interfere with other Definition types, and cleanup()
+     *       should not be repeatedly called by userland code.
      */
     function cleanup($config) {
         trigger_error('Cannot call abstract method', E_USER_ERROR);
index acc661828a4ee532f7c05777a7f0a6e1e1cbefb3..dead92a32e9217d902be75db6a9d800aa66a6196 100644 (file)
@@ -1,6 +1,7 @@
 <?php
 
 require_once 'HTMLPurifier/DefinitionCache.php';
+require_once 'HTMLPurifier/DefinitionCache/Serializer.php';
 
 HTMLPurifier_ConfigSchema::define(
     'Cache', 'DefinitionImpl', 'Serializer', 'string/null', '
@@ -10,10 +11,6 @@ to disable caching (not recommended, as you will see a definite
 performance degradation). This directive has been available since 2.0.0.
 ');
 
-HTMLPurifier_ConfigSchema::defineAllowedValues(
-    'Cache', 'DefinitionImpl', array('Serializer')
-);
-
 HTMLPurifier_ConfigSchema::defineAlias(
     'Core', 'DefinitionCache',
     'Cache', 'DefinitionImpl'
@@ -27,6 +24,7 @@ class HTMLPurifier_DefinitionCacheFactory
 {
     
     var $caches = array('Serializer' => array());
+    var $implementations = array();
     var $decorators = array();
     
     /**
@@ -51,14 +49,21 @@ class HTMLPurifier_DefinitionCacheFactory
         return $instance;
     }
     
+    /**
+     * Registers a new definition cache object
+     * @param $short Short name of cache object, for reference
+     * @param $long Full class name of cache object, for construction 
+     */
+    function register($short, $long) {
+        $this->implementations[$short] = $long;
+    }
+    
     /**
      * Factory method that creates a cache object based on configuration
      * @param $name Name of definitions handled by cache
      * @param $config Instance of HTMLPurifier_Config
      */
     function &create($type, $config) {
-        // only one implementation as for right now, $config will
-        // be used to determine implementation
         $method = $config->get('Cache', 'DefinitionImpl');
         if ($method === null) {
             $null = new HTMLPurifier_DefinitionCache_Null($type);
@@ -67,7 +72,17 @@ class HTMLPurifier_DefinitionCacheFactory
         if (!empty($this->caches[$method][$type])) {
             return $this->caches[$method][$type];
         }
-        $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
+        if (
+          isset($this->implementations[$method]) &&
+          class_exists($class = $this->implementations[$method])
+        ) {
+            $cache = new $class($type);
+        } else {
+            if ($method != 'Serializer') {
+                trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
+            }
+            $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
+        }
         foreach ($this->decorators as $decorator) {
             $new_cache = $decorator->decorate($cache);
             // prevent infinite recursion in PHP 4
index 21e1a5a764b15009a49726283a2077c8f14fd35d..b6439d1a5b6de3e0488f5fd2283895b0d9647e6b 100644 (file)
@@ -82,7 +82,7 @@ class HTMLPurifier_ElementDef
     
     /**
      * List of the names of required attributes this element has. Dynamically
-     * populated.
+     * populated by HTMLPurifier_HTMLDefinition::getElement
      * @public
      */
     var $required_attr = array();
index e5adf83f5985ec06f37ac7f76e36d373659a4146..31ebb785ff7324a74b82578b36c4cdbcbfaa8c0e 100644 (file)
@@ -62,6 +62,11 @@ class HTMLPurifier_Encoder
         trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
     }
     
+    /**
+     * Error-handler that mutes errors, alternative to shut-up operator.
+     */
+    function muteErrorHandler() {}
+    
     /**
      * Cleans a UTF-8 string for well-formedness and SGML validity
      * 
@@ -106,9 +111,18 @@ class HTMLPurifier_Encoder
         static $iconv = null;
         if ($iconv === null) $iconv = function_exists('iconv');
         
+        // UTF-8 validity is checked since PHP 4.3.5
+        // This is an optimization: if the string is already valid UTF-8, no
+        // need to do iconv/php stuff. 99% of the time, this will be the case.
+        if (preg_match('/^.{1}/us', $str)) {
+            return strtr($str, $non_sgml_chars);
+        }
+        
         if ($iconv && !$force_php) {
             // do the shortcut way
-            $str = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
+            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
+            $str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
+            restore_error_handler();
             return strtr($str, $non_sgml_chars);
         }
         
index e13e0c62b0e8183430d8f0dfeb5879270b99e8d1..51367ca4038bb20afc8c21f7d520da57892e097d 100644 (file)
@@ -222,6 +222,8 @@ class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
     
     /**
      * Adds a custom attribute to a pre-existing element
+     * @note This is strictly convenience, and does not have a corresponding
+     *       method in HTMLPurifier_HTMLModule
      * @param $element_name String element name to add attribute to
      * @param $attr_name String name of attribute
      * @param $def Attribute definition, can be string or object, see
@@ -229,7 +231,11 @@ class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
      */
     function addAttribute($element_name, $attr_name, $def) {
         $module =& $this->getAnonymousModule();
-        $element =& $module->addBlankElement($element_name);
+        if (!isset($module->info[$element_name])) {
+            $element =& $module->addBlankElement($element_name);
+        } else {
+            $element =& $module->info[$element_name];
+        }
         $element->attr[$attr_name] = $def;
     }
     
index 60715afc1ecbea36154a07d0a34ed6de363ed343..e746e565bbd1a89cf25325d33c42f89bac7ba036 100644 (file)
@@ -28,9 +28,9 @@ class HTMLPurifier_IDAccumulator
      * @static
      */
     function build($config, &$context) {
-        $id_accumulator = new HTMLPurifier_IDAccumulator();
-        $id_accumulator->load($config->get('Attr', 'IDBlacklist'));
-        return $id_accumulator;
+        $acc = new HTMLPurifier_IDAccumulator();
+        $acc->load($config->get('Attr', 'IDBlacklist'));
+        return $acc;
     }
     
     /**
index c9a3c20fe2f7efc5a58e3666a9a5ce6f819873b6..c0833b7f7945e830e369071f042058c88d782367 100644 (file)
@@ -25,6 +25,13 @@ class HTMLPurifier_Language
      */
     var $errorNames = array();
     
+    /**
+     * True if no message file was found for this language, so English
+     * is being used instead. Check this if you'd like to notify the
+     * user that they've used a non-supported language.
+     */
+    var $error = false;
+    
     /**
      * Has the language object been loaded yet?
      * @private
diff --git a/lib/htmlpurifier/HTMLPurifier/Language/messages/en-x-testmini.php b/lib/htmlpurifier/HTMLPurifier/Language/messages/en-x-testmini.php
new file mode 100644 (file)
index 0000000..4b16cd2
--- /dev/null
@@ -0,0 +1,11 @@
+<?php
+
+// private language message file for unit testing purposes
+// this language file has no class associated with it
+
+$fallback = 'en';
+
+$messages = array(
+    'HTMLPurifier' => 'HTML Purifier XNone'
+);
+
index 9d26cd70375350f6a403a6e6052d0f532601b2fa..715c3fee9adde5c1c28afab6f8fe082c6e39bde7 100644 (file)
@@ -16,6 +16,7 @@ This directive has been available since 2.0.0.
  * caching and fallbacks.
  * @note Thanks to MediaWiki for the general logic, although this version
  *       has been entirely rewritten
+ * @todo Serialized cache for languages
  */
 class HTMLPurifier_LanguageFactory
 {
@@ -89,40 +90,42 @@ class HTMLPurifier_LanguageFactory
      * Creates a language object, handles class fallbacks
      * @param $config Instance of HTMLPurifier_Config
      * @param $context Instance of HTMLPurifier_Context
+     * @param $code Code to override configuration with. Private parameter.
      */
-    function create($config, &$context) {
+    function create($config, &$context, $code = false) {
         
         // validate language code
-        $code = $this->validator->validate(
-          $config->get('Core', 'Language'), $config, $context
-        );
+        if ($code === false) {
+            $code = $this->validator->validate(
+              $config->get('Core', 'Language'), $config, $context
+            );
+        } else {
+            $code = $this->validator->validate($code, $config, $context);
+        }
         if ($code === false) $code = 'en'; // malformed code becomes English
         
         $pcode = str_replace('-', '_', $code); // make valid PHP classname
         static $depth = 0; // recursion protection
         
         if ($code == 'en') {
-            $class = 'HTMLPurifier_Language';
-            $file  = $this->dir . '/Language.php';
+            $lang = new HTMLPurifier_Language($config, $context);
         } else {
             $class = 'HTMLPurifier_Language_' . $pcode;
             $file  = $this->dir . '/Language/classes/' . $code . '.php';
-            // PHP5/APC deps bug workaround can go here
-            // you can bypass the conditional include by loading the
-            // file yourself
-            if (file_exists($file) && !class_exists($class)) {
-                include_once $file;
-                               }
-        }
-        
-        if (!class_exists($class)) {
-            // go fallback
-            $fallback = HTMLPurifier_LanguageFactory::getFallbackFor($code);
-            $depth++;
-            $lang = HTMLPurifier_LanguageFactory::factory( $fallback );
-            $depth--;
-        } else {
-            $lang = new $class($config, $context);
+            if (file_exists($file)) {
+                include $file;
+                $lang = new $class($config, $context);
+            } else {
+                // Go fallback
+                $raw_fallback = $this->getFallbackFor($code);
+                $fallback = $raw_fallback ? $raw_fallback : 'en';
+                $depth++;
+                $lang = $this->create($config, $context, $fallback);
+                if (!$raw_fallback) {
+                    $lang->error = true;
+                }
+                $depth--;
+            }
         }
         $lang->code = $code;
         
index 56bd4a4828198aa82e43c7d3d6ed624370261e00..9aef335ba842bc82409d08a401c9189ead3fc117 100644 (file)
@@ -90,10 +90,27 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
             $tokens[] = $this->factory->createText($node->data);
             return;
         } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
-            // undo DOM's special treatment of <script> tags
-            $tokens[] = $this->factory->createText($this->parseData($node->data));
+            // undo libxml's special treatment of <script> and <style> tags
+            $last = end($tokens);
+            $data = $node->data;
+            // (note $node->tagname is already normalized)
+            if ($last instanceof HTMLPurifier_Token_Start && $last->name == 'script') {
+                $new_data = trim($data);
+                if (substr($new_data, 0, 4) === '<!--') {
+                    $data = substr($new_data, 4);
+                    if (substr($data, -3) === '-->') {
+                        $data = substr($data, 0, -3);
+                    } else {
+                        // Highly suspicious! Not sure what to do...
+                    }
+                }
+            }
+            $tokens[] = $this->factory->createText($this->parseData($data));
             return;
         } elseif ($node->nodeType === XML_COMMENT_NODE) {
+            // this code is only invoked for comments in script/style in versions
+            // of libxml pre-2.6.28 (regular comments, of course, are still
+            // handled regularly)
             $tokens[] = $this->factory->createComment($node->data);
             return;
         } elseif (
index 86c0a2112b09a01528eeeba018afef4666996f9d..1b1016874611bbb9232af95cc2bf852e7a6b7478 100644 (file)
@@ -168,7 +168,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 
                 // Check if it's a comment
                 if (
-                    substr($segment, 0, 3) === '!--'
+                    strncmp('!--', $segment, 3) === 0
                 ) {
                     // re-determine segment length, looking for -->
                     $position_comment_end = strpos($html, '-->', $cursor);
@@ -184,12 +184,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                     }
                     $strlen_segment = $position_comment_end - $cursor;
                     $segment = substr($html, $cursor, $strlen_segment);
-                    $token = new
-                        HTMLPurifier_Token_Comment(
-                            substr(
-                                $segment, 3, $strlen_segment - 3
-                            )
-                        );
+                    $token = new HTMLPurifier_Token_Comment(substr($segment, 3));
                     if ($maintain_line_numbers) {
                         $token->line = $current_line;
                         $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
index e32421e1c1a618e3723fc64708b5645ae1bd5c94..c3b6c8e6794d333770ffe211eb07cc48ad322304 100644 (file)
@@ -2,12 +2,68 @@
 
 /**
  * Class that handles operations involving percent-encoding in URIs.
+ *
+ * @warning
+ *      Be careful when reusing instances of PercentEncoder. The object
+ *      you use for normalize() SHOULD NOT be used for encode(), or
+ *      vice-versa.
  */
 class HTMLPurifier_PercentEncoder
 {
     
     /**
-     * Fix up percent-encoding by decoding unreserved characters and normalizing
+     * Reserved characters to preserve when using encode().
+     */
+    var $preserve = array();
+    
+    /**
+     * @param $preserve String of extra characters that should be preserved while using encode().
+     */
+    function HTMLPurifier_PercentEncoder($preserve = false) {
+        // unreserved letters, ought to const-ify
+        for ($i = 48; $i <= 57;  $i++) $this->preserve[$i] = true; // digits
+        for ($i = 65; $i <= 90;  $i++) $this->preserve[$i] = true; // upper-case
+        for ($i = 97; $i <= 122; $i++) $this->preserve[$i] = true; // lower-case
+        $this->preserve[45] = true; // Dash         -
+        $this->preserve[46] = true; // Period       .
+        $this->preserve[95] = true; // Underscore   _
+        $this->preserve[126]= true; // Tilde        ~
+        
+        // extra letters not to escape
+        if ($preserve !== false) {
+            for ($i = 0, $c = strlen($preserve); $i < $c; $i++) {
+                $this->preserve[ord($preserve[$i])] = true;
+            }
+        }
+    }
+    
+    /**
+     * Our replacement for urlencode, it encodes every character except the
+     * unreserved ones and any extra characters that were instructed to be preserved.
+     * @note
+     *      Assumes that the string has already been normalized, making any
+     *      and all percent escape sequences valid. Percents will not be
+     *      re-escaped, regardless of their status in $preserve
+     * @param $string String to be encoded
+     * @return Encoded string.
+     */
+    function encode($string) {
+        $ret = '';
+        for ($i = 0, $c = strlen($string); $i < $c; $i++) {
+            if ($string[$i] !== '%' && !isset($this->preserve[$int = ord($string[$i])]) ) {
+                $ret .= '%' . sprintf('%02X', $int);
+            } else {
+                $ret .= $string[$i];
+            }
+        }
+        return $ret;
+    }
+    
+    /**
+     * Fix up percent-encoding by decoding unreserved characters and normalizing.
+     * @warning This function is affected by $preserve, even though the
+     *          usual desired behavior is for this not to preserve those
+     *          characters. Be careful when reusing instances of PercentEncoder!
      * @param $string String to normalize
      */
     function normalize($string) {
@@ -27,12 +83,7 @@ class HTMLPurifier_PercentEncoder
                 continue;
             }
             $int = hexdec($encoding);
-            if (
-                ($int >= 48 && $int <= 57) || // digits
-                ($int >= 65 && $int <= 90) || // uppercase letters
-                ($int >= 97 && $int <= 122) || // lowercase letters
-                $int == 126 || $int == 45 || $int == 46 || $int == 95 // ~-._
-            ) {
+            if (isset($this->preserve[$int])) {
                 $ret .= chr($int) . $text;
                 continue;
             }
index 4b6f498f67307026cc26d0806fbcc229e9b30526..30208ba14793833c32cc6900641e23e2bfbf732b 100644 (file)
@@ -158,10 +158,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
                     // the parent
                     if (!isset($parent_info->child->elements[$token->name])) {
                         if ($e) $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
-                        // close the parent, then append the token
+                        // close the parent, then re-loop to reprocess token
                         $result[] = new HTMLPurifier_Token_End($parent->name);
-                        $result[] = $token;
-                        $this->currentNesting[] = $token;
+                        $this->inputIndex--;
                         continue;
                     }
                     
index ed7ffdd6a543bca9f7c3b9bb03d52697dc27254c..c68fc48866f3c55a91f9ccd7522af9fc666b6e22 100644 (file)
@@ -4,7 +4,12 @@ require_once 'HTMLPurifier/URIParser.php';
 require_once 'HTMLPurifier/URIFilter.php';
 
 /**
- * HTML Purifier's internal representation of a URI
+ * HTML Purifier's internal representation of a URI.
+ * @note
+ *      Internal data-structures are completely escaped. If the data needs
+ *      to be used in a non-URI context (which is very unlikely), be sure
+ *      to decode it first. The URI may not necessarily be well-formed until
+ *      validate() is called.
  */
 class HTMLPurifier_URI
 {
@@ -52,13 +57,27 @@ class HTMLPurifier_URI
     }
     
     /**
-     * Generic validation method applicable for all schemes
+     * Generic validation method applicable for all schemes. May modify
+     * this URI in order to get it into a compliant form.
      * @param $config Instance of HTMLPurifier_Config
      * @param $context Instance of HTMLPurifier_Context
      * @return True if validation/filtering succeeds, false if failure
      */
     function validate($config, &$context) {
         
+        // ABNF definitions from RFC 3986
+        $chars_sub_delims = '!$&\'()*+,;=';
+        $chars_gen_delims = ':/?#[]@';
+        $chars_pchar = $chars_sub_delims . ':@';
+        
+        // validate scheme (MUST BE FIRST!)
+        if (!is_null($this->scheme) && is_null($this->host)) {
+            $def = $config->getDefinition('URI');
+            if ($def->defaultScheme === $this->scheme) {
+                $this->scheme = null;
+            }
+        }
+        
         // validate host
         if (!is_null($this->host)) {
             $host_def = new HTMLPurifier_AttrDef_URI_Host();
@@ -66,18 +85,51 @@ class HTMLPurifier_URI
             if ($this->host === false) $this->host = null;
         }
         
+        // validate userinfo
+        if (!is_null($this->userinfo)) {
+            $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
+            $this->userinfo = $encoder->encode($this->userinfo);
+        }
+        
         // validate port
         if (!is_null($this->port)) {
             if ($this->port < 1 || $this->port > 65535) $this->port = null;
         }
         
-        // query and fragment are quite simple in terms of definition:
-        // *( pchar / "/" / "?" ), so define their validation routines
-        // when we start fixing percent encoding
-        
-        // path gets to be validated against a hodge-podge of rules depending
-        // on the status of authority and scheme, but it's not that important,
-        // esp. since it won't be applicable to everyone
+        // validate path
+        $path_parts = array();
+        $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
+        if (!is_null($this->host)) {
+            // path-abempty (hier and relative)
+            $this->path = $segments_encoder->encode($this->path);
+        } elseif ($this->path !== '' && $this->path[0] === '/') {
+            // path-absolute (hier and relative)
+            if (strlen($this->path) >= 2 && $this->path[1] === '/') {
+                // This shouldn't ever happen!
+                $this->path = '';
+            } else {
+                $this->path = $segments_encoder->encode($this->path);
+            }
+        } elseif (!is_null($this->scheme) && $this->path !== '') {
+            // path-rootless (hier)
+            // Short circuit evaluation means we don't need to check nz
+            $this->path = $segments_encoder->encode($this->path);
+        } elseif (is_null($this->scheme) && $this->path !== '') {
+            // path-noscheme (relative)
+            // (once again, not checking nz)
+            $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
+            $c = strpos($this->path, '/');
+            if ($c !== false) {
+                $this->path = 
+                    $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
+                    $segments_encoder->encode(substr($this->path, $c));
+            } else {
+                $this->path = $segment_nc_encoder->encode($this->path);
+            }
+        } else {
+            // path-empty (hier and relative)
+            $this->path = ''; // just to be safe
+        }
         
         return true;
         
index dff7e28ef86659e035774ef0ccdac3a6576325fa..8ba485cf35130fd444a2eaa0bfc5d1c0755a5c94 100644 (file)
@@ -4,24 +4,39 @@ require_once 'HTMLPurifier/URI.php';
 
 /**
  * Parses a URI into the components and fragment identifier as specified
- * by RFC 2396.
- * @todo Replace regexps with a native PHP parser
+ * by RFC 3986.
  */
 class HTMLPurifier_URIParser
 {
     
     /**
-     * Parses a URI
+     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
+     */
+    var $percentEncoder;
+    
+    function HTMLPurifier_URIParser() {
+        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
+    }
+    
+    /**
+     * Parses a URI.
      * @param $uri string URI to parse
-     * @return HTMLPurifier_URI representation of URI
+     * @return HTMLPurifier_URI representation of URI. This representation has
+     *         not been validated yet and may not conform to RFC.
      */
     function parse($uri) {
+        
+        $uri = $this->percentEncoder->normalize($uri);
+        
+        // Regexp is as per Appendix B.
+        // Note that ["<>] are an addition to the RFC's recommended 
+        // characters, because they represent external delimiters.
         $r_URI = '!'.
-            '(([^:/?#<>\'"]+):)?'. // 2. Scheme
-            '(//([^/?#<>\'"]*))?'. // 4. Authority
-            '([^?#<>\'"]*)'.       // 5. Path
-            '(\?([^#<>\'"]*))?'.   // 7. Query
-            '(#([^<>\'"]*))?'.     // 8. Fragment
+            '(([^:/?#"<>]+):)?'. // 2. Scheme
+            '(//([^/?#"<>]*))?'. // 4. Authority
+            '([^?#"<>]*)'.       // 5. Path
+            '(\?([^#"<>]*))?'.   // 7. Query
+            '(#([^"<>]*))?'.     // 8. Fragment
             '!';
         
         $matches = array();
@@ -38,13 +53,7 @@ class HTMLPurifier_URIParser
         
         // further parse authority
         if ($authority !== null) {
-            // ridiculously inefficient: it's a stacked regex!
-            $HEXDIG = '[A-Fa-f0-9]';
-            $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
-            $sub_delims = '!$&\'()'; // needs []
-            $pct_encoded = "%$HEXDIG$HEXDIG";
-            $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
-            $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
+            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
             $matches = array();
             preg_match($r_authority, $authority, $matches);
             $userinfo   = !empty($matches[1]) ? $matches[2] : null;
index 3731abf892b63ddb967cde295aeadd6e6f5dba27..fbec8286c9a1da0a603ea3fe2f8b45a941823a58 100644 (file)
@@ -1,4 +1,4 @@
-Description of HTML Purifier v2.1.3 Lite library import into Moodle
+Description of HTML Purifier v2.1.4 Lite library import into Moodle
 
 Changes:
 * HTMLModule/Text.php - added <nolink>, <tex>, <lang> and <algebra> tags