]> git.mjollnir.org Git - moodle.git/commitdiff
MDL-9151 HTML Purifier cleaning support - enable switch is in experimental section
authorskodak <skodak>
Wed, 18 Apr 2007 21:52:03 +0000 (21:52 +0000)
committerskodak <skodak>
Wed, 18 Apr 2007 21:52:03 +0000 (21:52 +0000)
MDL-9435 Reviewed url cleaning in redirect()

129 files changed:
admin/settings/misc.php
lang/en_utf8/admin.php
lang/en_utf8/docs/credits.html
lib/htmlpurifier/CREDITS [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier.auto.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier.func.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrCollections.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Background.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/BackgroundPosition.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Border.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Color.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Composite.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Font.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/FontFamily.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Length.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/ListStyle.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Multiple.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Number.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Percentage.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/TextDecoration.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/URI.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/Enum.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/ID.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Length.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/LinkTypes.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/MultiLength.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Nmtokens.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Pixels.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/Integer.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/Lang.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/Text.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/URI.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv4.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv6.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrTransform.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrTransform/BdoDir.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrTransform/BgColor.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrTransform/Border.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrTransform/Lang.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrTransform/Length.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrTransform/Name.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrTransform/TextAlign.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/AttrTypes.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/CSSDefinition.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ChildDef.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ChildDef/Chameleon.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ChildDef/Empty.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ChildDef/Optional.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ChildDef/Required.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Config.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ConfigDef.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ConfigDef/Directive.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ConfigDef/DirectiveAlias.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ConfigDef/Namespace.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ConfigSchema.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ContentSets.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Context.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/ElementDef.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Encoder.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/EntityLookup.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/EntityLookup/entities.ser [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/EntityParser.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Error.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Filter.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Filter/YouTube.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Generator.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLDefinition.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/Bdo.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/CommonAttributes.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/Edit.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/Hypertext.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/Image.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/Legacy.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/List.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/Presentation.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/StyleAttribute.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/Tables.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/Text.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToStrict.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToXHTML11.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/IDAccumulator.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Language.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Language/classes/en-x-test.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Language/messages/en-x-test.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Language/messages/en.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/LanguageFactory.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Lexer.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/PercentEncoder.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Printer.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Printer/CSSDefinition.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Printer/HTMLDefinition.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Strategy.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Strategy/Core.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Strategy/FixNesting.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Strategy/ValidateAttributes.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/TagTransform.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/TagTransform/Center.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/TagTransform/Font.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/TagTransform/Simple.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/Token.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/TokenFactory.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/URIScheme.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/URIScheme/ftp.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/URIScheme/http.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/URIScheme/https.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/URIScheme/mailto.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/URIScheme/news.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/URIScheme/nntp.php [new file with mode: 0644]
lib/htmlpurifier/HTMLPurifier/URISchemeRegistry.php [new file with mode: 0644]
lib/htmlpurifier/readme_moodle.txt [new file with mode: 0644]
lib/weblib.php

index 117dcbb291f6cb74c60947858036ef1a0d325b58..d910603816ec9c0641e33eb01e74f9146f9f0fb8 100644 (file)
@@ -7,6 +7,7 @@ $temp = new admin_settingpage('experimental', get_string('experimental', 'admin'
 $temp->add(new admin_setting_configcheckbox('enableajax', get_string('enableajax', 'admin'), get_string('configenableajax', 'admin'), 0));
 $temp->add(new admin_setting_configcheckbox('enableglobalsearch', get_string('enableglobalsearch', 'admin'), get_string('configenableglobalsearch', 'admin'), 0));
 $temp->add(new admin_setting_configcheckbox('smartpix', get_string('smartpix', 'admin'), get_string('configsmartpix', 'admin'), 0));
+$temp->add(new admin_setting_configcheckbox('enablehtmlpurifier', get_string('enablehtmlpurifier', 'admin'), get_string('configenablehtmlpurifier', 'admin'), 0));
 $ADMIN->add('misc', $temp);
 
 // XMLDB editor
index f79bab02b3d10084817eceadaf3d02a4eeacbaef..d75adb3ca6e667500f810be95b92ce94b97294bb 100644 (file)
@@ -87,6 +87,7 @@ $string['configeditordictionary'] = 'This value will be used if aspell doesn\'t
 $string['configenableajax'] = 'This setting allows you to control the use of AJAX (advanced client/server interfaces using Javascript) across the whole site.  With this setting enabled users can sill make a choice in their profile, otherwise AJAX is disabled for everybody.';
 $string['configenablecourserequests'] = 'This will allow any user to request a course be created.';
 $string['configenableglobalsearch'] = 'This setting enables global text searching in resources and activities, it is not compatible with PHP 4.';
+$string['configenablehtmlpurifier'] = 'Use HTML Purifier instead of KSES for cleaning of untrusted text. HTML Purifier is actively developed and is believed to be more secure, but it is more resource intensive. Expect minor visual differences in the resulting html code. Please note that embed and object tags can not be enabled, MathML tags and old lang tags are not supported. ';
 $string['configenablerssfeeds'] = 'This switch will enable RSS feeds from across the site.  To actually see any change you will need to enable RSS feeds in the individual modules too - go to the Modules settings under Admin Configuration.';
 $string['configenablerssfeedsdisabled'] = 'It is not available because RSS feeds are disabled in all the Site. To enable them, go to the Variables settings under Admin Configuration.';
 $string['configenablestats'] = 'If you choose \'yes\' here, Moodle\'s cronjob will process the logs and gather some statistics.  Depending on the amount of traffic on your site, this can take awhile. If you enable this, you will be able to see some interesting graphs and statistics about each of your courses, or on a sitewide basis.';
@@ -261,6 +262,7 @@ $string['editstrings'] = 'Edit words or phrases';
 $string['enableajax'] = 'Enable AJAX';
 $string['enablecourserequests'] = 'Enable course requests';
 $string['enableglobalsearch'] = 'Enable global search';
+$string['enablehtmlpurifier'] = 'Enable HTML Purifier';
 $string['enablerecordcache'] = 'Enable Record Cache';
 $string['enablerssfeeds'] = 'Enable RSS feeds';
 $string['enablestats'] = 'Enable statistics';
index 05fe0eae45ac1aa628eaaf79fb973cd18b76773a..dee954aef15f5ccd955fba4a6140aae73ed2c70d 100644 (file)
     URL: <a href="http://typo3.org/">http://typo3.org/</a><br />
   </p>
     </blockquote>
+
+    <p><b>HTML Purifier</b>&nbsp; - &nbsp; lib/htmlpurifier</p>
+    <blockquote>
+      <p>Standards-compliant HTML filter library.<br />
+    <br />
+    CVS version: 1.6.0<br />
+    Copyright (C) 2006 Edward Z. Yang<br />
+    License: GNU LGPL<br />
+    URL: <a href="http://hp.jpsband.org/">http://hp.jpsband.org/</a><br />
+  </p>
+    </blockquote>
     
   </blockquote>
   <p align="center"><font size="1"><a href="." target="_top">Moodle Documentation</a></font></p>
diff --git a/lib/htmlpurifier/CREDITS b/lib/htmlpurifier/CREDITS
new file mode 100644 (file)
index 0000000..c3e7bb8
--- /dev/null
@@ -0,0 +1,7 @@
+
+CREDITS
+
+Almost everything written by Edward Z. Yang (Ambush Commander).  Lots of thanks
+to the DevNetwork Community for their help (see docs/ref-devnetwork.html for
+more details), Feyd especially (namely IPv6 and optimization).  Thanks to RSnake
+for letting me package his fantastic XSS cheatsheet for a smoketest.
diff --git a/lib/htmlpurifier/HTMLPurifier.auto.php b/lib/htmlpurifier/HTMLPurifier.auto.php
new file mode 100644 (file)
index 0000000..a66fd2e
--- /dev/null
@@ -0,0 +1,10 @@
+<?php
+
+/**
+ * This is a stub include that automatically configures the include path.
+ *
+ * Including this file instead of HTMLPurifier.php means the caller does not
+ * have to put the library directory on the include path beforehand.
+ */
+
+// Prepend this file's directory so that the relative require below (and the
+// 'HTMLPurifier/...' requires inside the library) resolve against it first.
+set_include_path(dirname(__FILE__) . PATH_SEPARATOR . get_include_path() );
+require_once 'HTMLPurifier.php';
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier.func.php b/lib/htmlpurifier/HTMLPurifier.func.php
new file mode 100644 (file)
index 0000000..876ad7b
--- /dev/null
@@ -0,0 +1,21 @@
+<?php
+
+/**
+ * Function wrapper for HTML Purifier for quick use.
+ * @note This function only includes the library when it is called. While
+ *       this is efficient for instances when you only use HTML Purifier
+ *       on a few of your pages, it murders bytecode caching. You still
+ *       need to add HTML Purifier to your path.
+ * @note ''HTMLPurifier()'' is NOT the same as ''new HTMLPurifier()''
+ */
+
+function HTMLPurifier($html, $config = null) {
+    // One shared purifier per request: the library is loaded and the object
+    // built on the first call only, later calls reuse it.
+    static $shared = false;
+    if ($shared === false) {
+        require_once 'HTMLPurifier.php';
+        $shared = new HTMLPurifier();
+    }
+    // $config (if any) applies to this call only
+    $clean = $shared->purify($html, $config);
+    return $clean;
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier.php b/lib/htmlpurifier/HTMLPurifier.php
new file mode 100644 (file)
index 0000000..5a0ce99
--- /dev/null
@@ -0,0 +1,170 @@
+<?php
+
+/*!
+ * @mainpage
+ * 
+ * HTML Purifier is an HTML filter that will take an arbitrary snippet of
+ * HTML and rigorously test, validate and filter it into a version that
+ * is safe for output onto webpages. It achieves this by:
+ * 
+ *  -# Lexing (parsing into tokens) the document,
+ *  -# Executing various strategies on the tokens:
+ *      -# Removing all elements not in the whitelist,
+ *      -# Making the tokens well-formed,
+ *      -# Fixing the nesting of the nodes, and
+ *      -# Validating attributes of the nodes; and
+ *  -# Generating HTML from the purified tokens.
+ * 
+ * However, most users will only need to interface with the HTMLPurifier
+ * class, so this massive amount of infrastructure is usually concealed.
+ * If you plan on working with the internals, be sure to include
+ * HTMLPurifier_ConfigSchema and HTMLPurifier_Config.
+ */
+
+/*
+    HTML Purifier 1.6.0 - Standards Compliant HTML Filtering
+    Copyright (C) 2006 Edward Z. Yang
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+// almost every class has an undocumented dependency to these, so make sure
+// they get included
+require_once 'HTMLPurifier/ConfigSchema.php';
+require_once 'HTMLPurifier/Config.php';
+require_once 'HTMLPurifier/Context.php';
+
+require_once 'HTMLPurifier/Lexer.php';
+require_once 'HTMLPurifier/Generator.php';
+require_once 'HTMLPurifier/Strategy/Core.php';
+require_once 'HTMLPurifier/Encoder.php';
+
+/**
+ * Main library execution class.
+ * 
+ * Facade that performs calls to the HTMLPurifier_Lexer,
+ * HTMLPurifier_Strategy and HTMLPurifier_Generator subsystems in order to
+ * purify HTML.
+ * 
+ * @todo We need an easier way to inject strategies, it'll probably end
+ *       up getting done through config though.
+ */
+class HTMLPurifier
+{
+    
+    /** Version string of the bundled library. */
+    var $version = '1.6.0';
+    
+    /** Default configuration, built once by the constructor. */
+    var $config;
+    
+    /**
+     * Filter objects registered through addFilter(), in registration order.
+     * Must start out as an array: purify() calls count() on it even when
+     * no filter was ever added, and count(null) warns on PHP 7.2+ and
+     * throws on PHP 8.
+     */
+    var $filters = array();
+    
+    /** Lexer, strategy and generator objects created by the constructor. */
+    var $lexer, $strategy, $generator;
+    
+    /**
+     * Final HTMLPurifier_Context of last run purification. Might be an array
+     * (keyed like the input) after a call to purifyArray().
+     * @public
+     */
+    var $context;
+    
+    /**
+     * Initializes the purifier.
+     * @param $config Optional HTMLPurifier_Config object for all instances of
+     *                the purifier, if omitted, a default configuration is
+     *                supplied (which can be overridden on a per-use basis).
+     *                The parameter can also be any type that
+     *                HTMLPurifier_Config::create() supports.
+     */
+    function HTMLPurifier($config = null) {
+        
+        $this->config = HTMLPurifier_Config::create($config);
+        
+        $this->lexer        = HTMLPurifier_Lexer::create();
+        $this->strategy     = new HTMLPurifier_Strategy_Core();
+        $this->generator    = new HTMLPurifier_Generator();
+        
+    }
+    
+    /**
+     * Adds a filter to process the output. Pre-filters run in the order
+     * the filters were added, post-filters in the reverse order.
+     * @param $filter HTMLPurifier_Filter object
+     */
+    function addFilter($filter) {
+        $this->filters[] = $filter;
+    }
+    
+    /**
+     * Filters an HTML snippet/document to be XSS-free and standards-compliant.
+     * 
+     * @param $html String of HTML to purify
+     * @param $config HTMLPurifier_Config object for this operation, if omitted,
+     *                defaults to the config object specified during this
+     *                object's construction. The parameter can also be any type
+     *                that HTMLPurifier_Config::create() supports.
+     * @return Purified HTML
+     */
+    function purify($html, $config = null) {
+        
+        $config = $config ? HTMLPurifier_Config::create($config) : $this->config;
+        
+        // a fresh context per run; it is exposed afterwards as $this->context
+        $context = new HTMLPurifier_Context();
+        $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);
+        
+        // $size is reused by the post-filter loop below
+        for ($i = 0, $size = count($this->filters); $i < $size; $i++) {
+            $html = $this->filters[$i]->preFilter($html, $config, $context);
+        }
+        
+        // purified HTML
+        $html = 
+            $this->generator->generateFromTokens(
+                // list of tokens
+                $this->strategy->execute(
+                    // list of un-purified tokens
+                    $this->lexer->tokenizeHTML(
+                        // un-purified HTML
+                        $html, $config, $context
+                    ),
+                    $config, $context
+                ),
+                $config, $context
+            );
+        
+        // post-filters unwind in the opposite order of the pre-filters
+        for ($i = $size - 1; $i >= 0; $i--) {
+            $html = $this->filters[$i]->postFilter($html, $config, $context);
+        }
+        
+        $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
+        $this->context =& $context;
+        return $html;
+    }
+    
+    /**
+     * Filters an array of HTML snippets
+     * @param $array_of_html Array of HTML strings to purify; keys are kept
+     * @param $config Optional HTMLPurifier_Config object for this operation.
+     *                See HTMLPurifier::purify() for more details.
+     * @return Array of purified HTML
+     */
+    function purifyArray($array_of_html, $config = null) {
+        $context_array = array();
+        foreach ($array_of_html as $key => $html) {
+            $array_of_html[$key] = $this->purify($html, $config);
+            // purify() leaves the context of that run in $this->context
+            $context_array[$key] = $this->context;
+        }
+        $this->context = $context_array;
+        return $array_of_html;
+    }
+    
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrCollections.php b/lib/htmlpurifier/HTMLPurifier/AttrCollections.php
new file mode 100644 (file)
index 0000000..8318abb
--- /dev/null
@@ -0,0 +1,100 @@
+<?php
+
+require_once 'HTMLPurifier/AttrTypes.php';
+require_once 'HTMLPurifier/AttrDef/Lang.php';
+
+/**
+ * Defines common attribute collections that modules reference
+ */
+
+class HTMLPurifier_AttrCollections
+{
+    
+    /**
+     * Associative array of attribute collections, indexed by name
+     * @note Technically, the composition of these is more complicated,
+     *       but we bypass it using our own excludes property
+     */
+    var $info = array();
+    
+    /**
+     * Performs all expansions on internal data for use by other inclusions
+     * It also collects all attribute collection extensions from
+     * modules
+     * @param $attr_types HTMLPurifier_AttrTypes instance
+     * @param $modules Hash array of HTMLPurifier_HTMLModule members
+     */
+    function HTMLPurifier_AttrCollections($attr_types, $modules) {
+        $info =& $this->info;
+        // load extensions from the modules
+        foreach ($modules as $module) {
+            foreach ($module->attr_collections as $coll_i => $coll) {
+                foreach ($coll as $attr_i => $attr) {
+                    if ($attr_i === 0 && isset($info[$coll_i][$attr_i])) {
+                        // merge in includes
+                        $info[$coll_i][$attr_i] = array_merge(
+                            $info[$coll_i][$attr_i], $attr);
+                        continue;
+                    }
+                    $info[$coll_i][$attr_i] = $attr;
+                }
+            }
+        }
+        // perform internal expansions and inclusions
+        foreach ($info as $name => $attr) {
+            // merge attribute collections that include others
+            $this->performInclusions($info[$name]);
+            // replace string identifiers with actual attribute objects
+            $this->expandIdentifiers($info[$name], $attr_types);
+        }
+    }
+    
+    /**
+     * Takes a reference to an attribute associative array and performs
+     * all inclusions specified by the zero index, following inclusions of
+     * the included collections as well. Attributes already present in
+     * $attr are never overwritten. The zero index is removed afterwards.
+     * @param &$attr Reference to attribute array
+     */
+    function performInclusions(&$attr) {
+        if (!isset($attr[0])) return;
+        $merge = $attr[0];
+        $seen  = array(); // guards against collections that include each other
+        // loop through all the inclusions; $merge may grow while we iterate
+        for ($i = 0; isset($merge[$i]); $i++) {
+            $name = $merge[$i];
+            if (isset($seen[$name])) continue;
+            $seen[$name] = true;
+            // nothing to copy from a collection nobody defined
+            if (!isset($this->info[$name])) continue;
+            // foreach attribute of the inclusion, copy it over
+            foreach ($this->info[$name] as $key => $value) {
+                if (isset($attr[$key])) continue; // also catches more inclusions
+                $attr[$key] = $value;
+            }
+            if (isset($this->info[$name][0])) {
+                // recursion: queue the inclusions of the included collection
+                $merge = array_merge($merge, $this->info[$name][0]);
+            }
+        }
+        unset($attr[0]);
+    }
+    
+    /**
+     * Expands all string identifiers in an attribute array by replacing
+     * them with the appropriate values inside HTMLPurifier_AttrTypes.
+     * Entries set to false are removed, other non-string entries are left
+     * untouched, and unknown identifiers raise an E_USER_ERROR and are
+     * removed.
+     * @param &$attr Reference to attribute array
+     * @param $attr_types HTMLPurifier_AttrTypes instance
+     */
+    function expandIdentifiers(&$attr, $attr_types) {
+        foreach ($attr as $def_i => $def) {
+            if ($def_i === 0) continue;
+            // must be tested before is_string(): false is not a string
+            if ($def === false) {
+                unset($attr[$def_i]);
+                continue;
+            }
+            if (!is_string($def)) continue;
+            if (isset($attr_types->info[$def])) {
+                $attr[$def_i] = $attr_types->info[$def];
+            } else {
+                trigger_error('Attempted to reference undefined attribute type', E_USER_ERROR);
+                unset($attr[$def_i]);
+            }
+        }
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef.php b/lib/htmlpurifier/HTMLPurifier/AttrDef.php
new file mode 100644 (file)
index 0000000..334a7ac
--- /dev/null
@@ -0,0 +1,67 @@
+<?php
+
+/**
+ * Base class for all validating attribute definitions.
+ * 
+ * This family of classes forms the core for not only HTML attribute validation,
+ * but also any sort of string that needs to be validated or cleaned (which
+ * means CSS properties and composite definitions are defined here too).  
+ * Besides defining (through code) what precisely makes the string valid,
+ * subclasses are also responsible for cleaning the code if possible.
+ */
+
+class HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Flags an HTML attribute as minimized. Off by default; only a
+     * boolean-attribute definition would switch it on.
+     */
+    var $minimized = false;
+    
+    /**
+     * Abstract hook: validate and clean a string according to this
+     * definition. The base implementation only raises an error, so
+     * subclasses have to override it.
+     * 
+     * @public
+     * @param $string String to be validated and cleaned.
+     * @param $config Mandatory HTMLPurifier_Config object.
+     * @param $context Mandatory context object, passed by reference
+     *                 (presumably the HTMLPurifier_Context of the
+     *                 current run - confirm against callers).
+     */
+    function validate($string, $config, &$context) {
+        trigger_error('Cannot call abstract function', E_USER_ERROR);
+    }
+    
+    /**
+     * Helper that normalizes a string the way HTML 4 treats CDATA
+     * attribute values, see <http://www.w3.org/TR/html4/types.html#h-6.2>:
+     * surrounding whitespace is trimmed, line feeds are dropped, and
+     * carriage returns and tabs each become a single space. Handy for
+     * CDATA attributes and for most CSS values.
+     * 
+     * @note trim() strips more whitespace characters than the spec
+     *       lists, so this is not strictly standards compliant.
+     * 
+     * @warning XML (section 3.3.3, referenced by XHTML 1.0 section 4.7)
+     *          handles whitespace differently: line breaks would have to
+     *          be normalized to "\n", and trimming/collapsing is meant
+     *          for NMTOKENs only. Since the input is not necessarily
+     *          XML, the behavior here may still be the right one.
+     * 
+     * @public
+     * @param $string String to normalize
+     * @return Normalized string
+     */
+    function parseCDATA($string) {
+        $trimmed  = trim($string);
+        $no_feeds = str_replace("\n", '', $trimmed);
+        return str_replace(array("\r", "\t"), ' ', $no_feeds);
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS.php
new file mode 100644 (file)
index 0000000..220ec0d
--- /dev/null
@@ -0,0 +1,69 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/CSSDefinition.php';
+
+/**
+ * Validates the HTML attribute style, otherwise known as CSS.
+ * @note We don't implement the whole CSS specification, so it might be
+ *       difficult to reuse this component in the context of validating
+ *       actual stylesheet declarations.
+ * @note If we were really serious about validating the CSS, we would
+ *       tokenize the styles and then parse the tokens. Obviously, we
+ *       are not doing that. Doing that could seriously harm performance,
+ *       but would make these components a lot more viable for a CSS
+ *       filtering solution.
+ */
+class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Validates the contents of a style attribute, declaration by
+     * declaration.
+     * @param $css Raw value of the style attribute
+     * @param $config HTMLPurifier_Config object, source of the CSS definition
+     * @param $context Context object, handed on to the property validators
+     * @return Cleaned "property:value;" list, or false if nothing survived
+     */
+    function validate($css, $config, &$context) {
+        
+        $css = $this->parseCDATA($css);
+        $definition = $config->getCSSDefinition();
+        
+        // Splitting on ';' is not what the spec prescribes, but it is fast
+        // and escaped semicolons are rare. They CAN show up inside URIs,
+        // see AttrDef/CSS/URI.php for details.
+        
+        // keyed by property, so a repeated property keeps its last valid value
+        $accepted = array();
+        
+        foreach (explode(';', $css) as $chunk) {
+            // drop empty chunks and chunks without a colon past position 0
+            if (!$chunk || !strpos($chunk, ':')) continue;
+            $pair = explode(':', $chunk, 2);
+            $name = trim($pair[0]);
+            $raw  = trim($pair[1]);
+            // properties the definition does not know are discarded
+            if (!isset($definition->info[$name])) continue;
+            if (strtolower($raw) === 'inherit') {
+                // inherit works for everything (but only on the base property)
+                $cleaned = 'inherit';
+            } else {
+                $cleaned = $definition->info[$name]->validate(
+                    $raw, $config, $context );
+            }
+            if ($cleaned === false) continue;
+            $accepted[$name] = $cleaned;
+        }
+        
+        // The output is assembled in a second pass; that costs a little,
+        // but collecting by property first is what removes duplicates.
+        
+        $output = '';
+        foreach ($accepted as $name => $cleaned) {
+            $output .= "$name:$cleaned;";
+        }
+        
+        return $output ? $output : false;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Background.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Background.php
new file mode 100644 (file)
index 0000000..42d8bcf
--- /dev/null
@@ -0,0 +1,87 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/CSSDefinition.php';
+
+/**
+ * Validates shorthand CSS property background by matching each space-separated token against the component validators.
+ * @warning Does not support url tokens that have internal spaces.
+ */
+class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Local copy of component validators, keyed by full property name.
+     * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
+     */
+    var $info;
+    
+    function HTMLPurifier_AttrDef_CSS_Background($config) {
+        $def = $config->getCSSDefinition();
+        $this->info['background-color'] = $def->info['background-color'];
+        $this->info['background-image'] = $def->info['background-image'];
+        $this->info['background-repeat'] = $def->info['background-repeat'];
+        $this->info['background-attachment'] = $def->info['background-attachment'];
+        $this->info['background-position'] = $def->info['background-position'];
+    }
+    
+    function validate($string, $config, &$context) {
+        
+        // regular pre-processing; an empty value is rejected outright
+        $string = $this->parseCDATA($string);
+        if ($string === '') return false;
+        
+        // assumes URI doesn't have spaces in it; note the whole value (any URI included) is lowercased here
+        $bits = explode(' ', strtolower($string)); // bits to process
+        
+        $caught = array();
+        $caught['color']    = false;
+        $caught['image']    = false;
+        $caught['repeat']   = false;
+        $caught['attachment'] = false;
+        $caught['position'] = false;
+        
+        $i = 0; // number of catches
+        $none = false; // NOTE(review): never read in this method
+        
+        foreach ($bits as $bit) {
+            if ($bit === '') continue;
+            foreach ($caught as $key => $status) {
+                if ($key != 'position') {
+                    if ($status !== false) continue;
+                    $r = $this->info['background-' . $key]->validate($bit, $config, $context);
+                } else {
+                    $r = $bit; // position tokens are collected raw and validated together below
+                }
+                if ($r === false) continue;
+                if ($key == 'position') {
+                    if ($caught[$key] === false) $caught[$key] = '';
+                    $caught[$key] .= $r . ' ';
+                } else {
+                    $caught[$key] = $r;
+                }
+                $i++;
+                break;
+            }
+        }
+        
+        if (!$i) return false;
+        if ($caught['position'] !== false) {
+            $caught['position'] = $this->info['background-position']->
+                validate($caught['position'], $config, $context);
+        }
+        
+        $ret = array();
+        foreach ($caught as $value) {
+            if ($value === false) continue;
+            $ret[] = $value;
+        }
+        
+        if (empty($ret)) return false;
+        return implode(' ', $ret);
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/BackgroundPosition.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/BackgroundPosition.php
new file mode 100644 (file)
index 0000000..77a3ddd
--- /dev/null
@@ -0,0 +1,130 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Length.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Percentage.php';
+
+/* W3C says:
+    [ // adjective and number must be in correct order, even if
+      // you could switch them without introducing ambiguity.
+      // some browsers support that syntax
+        [
+            <percentage> | <length> | left | center | right
+        ]
+        [ 
+            <percentage> | <length> | top | center | bottom
+        ]?
+    ] |
+    [ // this signifies that the vertical and horizontal adjectives
+      // can be arbitrarily ordered, however, there can only be two,
+      // one of each, or none at all
+        [
+            left | center | right
+        ] ||
+        [
+            top | center | bottom
+        ]
+    ]
+    top, left = 0%
+    center, (none) = 50%
+    bottom, right = 100%
+*/
+
+/* QuirksMode says:
+    keyword + length/percentage must be ordered correctly, as per W3C
+    
+    Internet Explorer and Opera, however, support arbitrary ordering. We
+    should fix it up.
+    
+    Minor issue though, not strictly necessary.
+*/
+
+// control freaks may appreciate the ability to convert these to
+// percentages or something, but it's not necessary
+
+/**
+ * Validates the value of background-position, emitting at most one horizontal and one vertical component.
+ */
+class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
+{
+    
+    var $length; // HTMLPurifier_AttrDef_CSS_Length instance each token is tested against
+    var $percentage; // HTMLPurifier_AttrDef_CSS_Percentage instance each token is tested against
+    
+    function HTMLPurifier_AttrDef_CSS_BackgroundPosition() {
+        $this->length     = new HTMLPurifier_AttrDef_CSS_Length();
+        $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
+    }
+    
+    function validate($string, $config, &$context) {
+        $string = $this->parseCDATA($string);
+        $bits = explode(' ', $string);
+        
+        $keywords = array();
+        $keywords['h'] = false; // left, right
+        $keywords['v'] = false; // top, bottom
+        $keywords['c'] = false; // center
+        $measures = array(); // validated lengths/percentages, in input order
+        
+        $i = 0; // number of tokens recognised as a keyword or a measure
+        
+        $lookup = array(
+            'top' => 'v',
+            'bottom' => 'v',
+            'left' => 'h',
+            'right' => 'h',
+            'center' => 'c'
+        );
+        
+        foreach ($bits as $bit) {
+            if ($bit === '') continue;
+            
+            // test for keyword (case-insensitive); a later keyword of the same kind overwrites an earlier one
+            $lbit = ctype_lower($bit) ? $bit : strtolower($bit);
+            if (isset($lookup[$lbit])) {
+                $status = $lookup[$lbit];
+                $keywords[$status] = $lbit;
+                $i++;
+            }
+            
+            // test for length
+            $r = $this->length->validate($bit, $config, $context);
+            if ($r !== false) {
+                $measures[] = $r;
+                $i++;
+            }
+            
+            // test for percentage
+            $r = $this->percentage->validate($bit, $config, $context);
+            if ($r !== false) {
+                $measures[] = $r;
+                $i++;
+            }
+            
+        }
+        
+        if (!$i) return false; // no valid values were caught
+        
+        
+        $ret = array();
+        
+        // horizontal slot: left/right keyword, else the first measure, else center
+        if     ($keywords['h'])     $ret[] = $keywords['h'];
+        elseif (count($measures))   $ret[] = array_shift($measures);
+        elseif ($keywords['c']) {
+            $ret[] = $keywords['c'];
+            $keywords['c'] = false; // prevent re-use: center = center center
+        }
+        // vertical slot: top/bottom keyword, else the next measure, else center if still unused
+        if     ($keywords['v'])     $ret[] = $keywords['v'];
+        elseif (count($measures))   $ret[] = array_shift($measures);
+        elseif ($keywords['c'])     $ret[] = $keywords['c'];
+        
+        if (empty($ret)) return false;
+        return implode(' ', $ret);
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Border.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Border.php
new file mode 100644 (file)
index 0000000..583f14f
--- /dev/null
@@ -0,0 +1,45 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Validates the border shorthand property as defined by CSS (width, style and color tokens, in any order).
+ */
+class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Local copy of properties this property is shorthand for (border-top-color stands in for the color component).
+     */
+    var $info = array();
+    
+    function HTMLPurifier_AttrDef_CSS_Border($config) {
+        $def = $config->getCSSDefinition();
+        $this->info['border-width'] = $def->info['border-width'];
+        $this->info['border-style'] = $def->info['border-style'];
+        $this->info['border-top-color'] = $def->info['border-top-color'];
+    }
+    
+    function validate($string, $config, &$context) {
+        $string = $this->parseCDATA($string);
+        // we specifically will not support rgb() syntax with spaces
+        $bits = explode(' ', $string);
+        $done = array(); // segments we've finished
+        $ret = ''; // return value
+        foreach ($bits as $bit) {
+            foreach ($this->info as $propname => $validator) {
+                if (isset($done[$propname])) continue; // each component may be set only once
+                $r = $validator->validate($bit, $config, $context);
+                if ($r !== false) {
+                    $ret .= $r . ' ';
+                    $done[$propname] = true;
+                    break;
+                }
+            }
+        }
+        return ($ret === '') ? false : rtrim($ret); // no token validated: fail like the sibling validators, not ''
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Color.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Color.php
new file mode 100644 (file)
index 0000000..4e6a78a
--- /dev/null
@@ -0,0 +1,97 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Validates Color as defined by CSS: a keyword, a #RGB / #RRGGBB hex value, or an rgb() triad of integers or percentages.
+ */
+class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Color keyword lookup table (lowercase keyword => hex value returned in its place).
+     * @todo Extend it to include all usually allowed colors.
+     */
+    var $colors = array(
+        'maroon'    => '#800000',
+        'red'       => '#F00',
+        'orange'    => '#FFA500',
+        'yellow'    => '#FF0',
+        'olive'     => '#808000',
+        'purple'    => '#800080',
+        'fuchsia'   => '#F0F',
+        'white'     => '#FFF',
+        'lime'      => '#0F0',
+        'green'     => '#008000',
+        'navy'      => '#000080',
+        'blue'      => '#00F',
+        'aqua'      => '#0FF',
+        'teal'      => '#008080',
+        'black'     => '#000',
+        'silver'    => '#C0C0C0',
+        'gray'      => '#808080'
+    );
+    
+    function validate($color, $config, &$context) {
+        
+        $color = trim($color);
+        if ($color === '') return false;
+        
+        $lower = strtolower($color);
+        if (isset($this->colors[$lower])) return $this->colors[$lower];
+        
+        if ($color[0] === '#') {
+            // hexadecimal handling
+            $hex = substr($color, 1);
+            $length = strlen($hex);
+            if ($length !== 3 && $length !== 6) return false;
+            if (!ctype_xdigit($hex)) return false;
+        } else {
+            // rgb literal handling: must start with rgb( in any case; strpos() gives 0 for that and false for "absent"
+            if (strpos($lower, 'rgb(') !== 0) return false;
+            $length = strlen($color);
+            if (strpos($color, ')') !== $length - 1) return false;
+            $triad = substr($color, 4, $length - 4 - 1);
+            $parts = explode(',', $triad);
+            if (count($parts) !== 3) return false;
+            $type = false; // to ensure that they're all the same type
+            $new_parts = array();
+            foreach ($parts as $part) {
+                $part = trim($part);
+                if ($part === '') return false;
+                $length = strlen($part);
+                if ($part[$length - 1] === '%') {
+                    // handle percents, clamped to 0-100
+                    if (!$type) {
+                        $type = 'percentage';
+                    } elseif ($type !== 'percentage') {
+                        return false;
+                    }
+                    $num = (float) substr($part, 0, $length - 1);
+                    if ($num < 0) $num = 0;
+                    if ($num > 100) $num = 100;
+                    $new_parts[] = "$num%";
+                } else {
+                    // handle integers, clamped to 0-255
+                    if (!$type) {
+                        $type = 'integer';
+                    } elseif ($type !== 'integer') {
+                        return false;
+                    }
+                    $num = (int) $part;
+                    if ($num < 0) $num = 0;
+                    if ($num > 255) $num = 255;
+                    $new_parts[] = (string) $num;
+                }
+            }
+            $new_triad = implode(',', $new_parts);
+            $color = "rgb($new_triad)";
+        }
+        
+        return $color;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Composite.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Composite.php
new file mode 100644 (file)
index 0000000..9d2803d
--- /dev/null
@@ -0,0 +1,38 @@
+<?php
+
+/**
+ * Allows multiple validators to attempt to validate attribute.
+ * 
+ * Composite is just what it sounds like: a composite of many validators.
+ * This means that multiple HTMLPurifier_AttrDef objects will have a whack
+ * at the string, in list order.  The first non-false result is returned.  This is
+ * especially useful for CSS values, which often are a choice between
+ * an enumerated set of predefined values or a flexible data type.
+ */
+class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * List of HTMLPurifier_AttrDef objects that may process strings
+     * @protected
+     */
+    var $defs;
+    
+    /**
+     * @param $defs List of HTMLPurifier_AttrDef objects
+     */
+    function HTMLPurifier_AttrDef_CSS_Composite($defs) {
+        $this->defs = $defs;
+    }
+    
+    function validate($string, $config, &$context) {
+        // NOTE(review): indexes $this->defs instead of using $def - presumably to avoid a by-value copy under PHP 4; confirm
+        foreach ($this->defs as $i => $def) {
+            $result = $this->defs[$i]->validate($string, $config, $context);
+            if ($result !== false) return $result;
+        }
+        return false; // every validator rejected the string
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Font.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Font.php
new file mode 100644 (file)
index 0000000..1b3b090
--- /dev/null
@@ -0,0 +1,154 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Validates shorthand CSS property font: optional style/variant/weight, then size[/line-height], then family.
+ */
+class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Local copy of component validators.
+     * 
+     * @note If we moved specific CSS property definitions to their own
+     *       classes instead of having them be assembled at run time by
+     *       CSSDefinition, this wouldn't be necessary.  We'd instantiate
+     *       our own copies.
+     */
+    var $info = array();
+    
+    /**
+     * System font keywords; a value equal to one of these is returned lowercased as-is.
+     */
+    var $system_fonts = array(
+        'caption' => true,
+        'icon' => true,
+        'menu' => true,
+        'message-box' => true,
+        'small-caption' => true,
+        'status-bar' => true
+    );
+    
+    function HTMLPurifier_AttrDef_CSS_Font($config) {
+        $def = $config->getCSSDefinition();
+        $this->info['font-style']   = $def->info['font-style'];
+        $this->info['font-variant'] = $def->info['font-variant'];
+        $this->info['font-weight']  = $def->info['font-weight'];
+        $this->info['font-size']    = $def->info['font-size'];
+        $this->info['line-height']  = $def->info['line-height'];
+        $this->info['font-family']  = $def->info['font-family'];
+    }
+    
+    function validate($string, $config, &$context) {
+        
+        // regular pre-processing
+        $string = $this->parseCDATA($string);
+        if ($string === '') return false;
+        
+        // check if it's one of the keywords
+        $lowercase_string = strtolower($string);
+        if (isset($this->system_fonts[$lowercase_string])) {
+            return $lowercase_string;
+        }
+        
+        $bits = explode(' ', $string); // bits to process
+        $stage = 0; // what we're looking for (0: style/variant/weight, 1: size and line-height, 2: family)
+        $caught = array(); // which stage 0 properties have we caught?
+        $stage_1 = array('font-style', 'font-variant', 'font-weight'); // validators tried during stage 0
+        $final = ''; // output
+        
+        for ($i = 0, $size = count($bits); $i < $size; $i++) {
+            if ($bits[$i] === '') continue;
+            switch ($stage) {
+                
+                // attempting to catch font-style, font-variant or font-weight
+                case 0:
+                    foreach ($stage_1 as $validator_name) {
+                        if (isset($caught[$validator_name])) continue;
+                        $r = $this->info[$validator_name]->validate(
+                                                $bits[$i], $config, $context);
+                        if ($r !== false) {
+                            $final .= $r . ' ';
+                            $caught[$validator_name] = true;
+                            break;
+                        }
+                    }
+                    // all three caught, continue on
+                    if (count($caught) >= 3) $stage = 1;
+                    if ($r !== false) break; // token consumed; otherwise fall through and try it as font-size
+                
+                // attempting to catch font-size and perhaps line-height
+                case 1:
+                    $found_slash = false;
+                    if (strpos($bits[$i], '/') !== false) {
+                        list($font_size, $line_height) =
+                                                    explode('/', $bits[$i]);
+                        if ($line_height === '') {
+                            // ooh, there's a space after the slash!
+                            $line_height = false;
+                            $found_slash = true;
+                        }
+                    } else {
+                        $font_size = $bits[$i];
+                        $line_height = false;
+                    }
+                    $r = $this->info['font-size']->validate(
+                                              $font_size, $config, $context);
+                    if ($r !== false) {
+                        $final .= $r;
+                        // attempt to catch line-height
+                        if ($line_height === false) {
+                            // we need to scroll forward
+                            for ($j = $i + 1; $j < $size; $j++) {
+                                if ($bits[$j] === '') continue;
+                                if ($bits[$j] === '/') {
+                                    if ($found_slash) {
+                                        return false;
+                                    } else {
+                                        $found_slash = true;
+                                        continue;
+                                    }
+                                }
+                                $line_height = $bits[$j];
+                                break;
+                            }
+                        } else {
+                            // slash already found
+                            $found_slash = true;
+                            $j = $i;
+                        }
+                        if ($found_slash) {
+                            $i = $j; // skip the tokens consumed while looking for line-height
+                            $r = $this->info['line-height']->validate(
+                                              $line_height, $config, $context);
+                            if ($r !== false) {
+                                $final .= '/' . $r;
+                            }
+                        }
+                        $final .= ' ';
+                        $stage = 2;
+                        break;
+                    }
+                    return false; // font-size is mandatory
+                
+                // attempting to catch font-family
+                case 2:
+                    $font_family =
+                        implode(' ', array_slice($bits, $i, $size - $i));
+                    $r = $this->info['font-family']->validate(
+                                              $font_family, $config, $context);
+                    if ($r !== false) {
+                        $final .= $r . ' ';
+                        // processing completed successfully
+                        return rtrim($final);
+                    }
+                    return false;
+            }
+        }
+        return false; // ran out of tokens before a font-family was validated
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/FontFamily.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/FontFamily.php
new file mode 100644 (file)
index 0000000..15cbbf3
--- /dev/null
@@ -0,0 +1,66 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+// whitelisting allowed fonts would be nice
+
+/**
+ * Validates a comma-separated font family list according to CSS spec, dropping entries it does not accept
+ */
+class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Generic font family keywords.
+     * @protected
+     */
+    var $generic_names = array(
+        'serif' => true,
+        'sans-serif' => true,
+        'monospace' => true,
+        'fantasy' => true,
+        'cursive' => true
+    );
+    
+    function validate($string, $config, &$context) {
+        $string = $this->parseCDATA($string);
+        // assume that no font names contain commas in them
+        $fonts = explode(',', $string);
+        $final = '';
+        foreach($fonts as $font) {
+            $font = trim($font);
+            if ($font === '') continue;
+            // match a generic name (case-sensitive lookup)
+            if (isset($this->generic_names[$font])) {
+                $final .= $font . ', ';
+                continue;
+            }
+            // match a quoted name: strip the quotes, skipping the entry if they are unbalanced or empty
+            if ($font[0] === '"' || $font[0] === "'") {
+                $length = strlen($font);
+                if ($length <= 2) continue;
+                $quote = $font[0];
+                if ($font[$length - 1] !== $quote) continue;
+                $font = substr($font, 1, $length - 2);
+            }
+            // process font
+            if (ctype_alnum($font)) {
+                // very simple font, allow it in unharmed
+                $final .= $font . ', ';
+                continue;
+            }
+            $nospace = str_replace(array(' ', '.', '!'), '', $font);
+            if (ctype_alnum($nospace)) {
+                // font with spaces, dots or exclamation marks in it: emit it single-quoted
+                $final .= "'$font', ";
+                continue;
+            }
+        } // entries matching none of the cases above are dropped
+        $final = rtrim($final, ', ');
+        if ($final === '') return false;
+        return $final;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Length.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Length.php
new file mode 100644 (file)
index 0000000..7da26a8
--- /dev/null
@@ -0,0 +1,56 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
+
+/**
+ * Represents a Length as defined by CSS: a number followed by a two-letter unit, or a bare 0.
+ */
+class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Valid unit lookup table.
+     * @warning The code assumes all units are two characters long.  Be careful
+     *          if we have to change this behavior!
+     */
+    var $units = array('em' => true, 'ex' => true, 'px' => true, 'in' => true,
+         'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true);
+    /**
+     * Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation to
+     */
+    var $number_def;
+    
+    /**
+     * @param $non_negative Bool indication whether or not negative values are
+     *                      forbidden (passed straight to the number validator).
+     */
+    function HTMLPurifier_AttrDef_CSS_Length($non_negative = false) {
+        $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
+    }
+    
+    function validate($length, $config, &$context) {
+        
+        $length = $this->parseCDATA($length);
+        if ($length === '') return false;
+        if ($length === '0') return '0'; // zero is the only length allowed without a unit
+        $strlen = strlen($length);
+        if ($strlen === 1) return false; // impossible! one character cannot hold a number plus a unit
+        
+        // we assume all units are two characters
+        $unit = substr($length, $strlen - 2);
+        if (!ctype_lower($unit)) $unit = strtolower($unit);
+        $number = substr($length, 0, $strlen - 2);
+        
+        if (!isset($this->units[$unit])) return false;
+        
+        $number = $this->number_def->validate($number, $config, $context);
+        if ($number === false) return false;
+        
+        return $number . $unit;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/ListStyle.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/ListStyle.php
new file mode 100644 (file)
index 0000000..2d2ed12
--- /dev/null
@@ -0,0 +1,80 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Validates shorthand CSS property list-style (type, position and image tokens, in any order).
+ * @warning Does not support url tokens that have internal spaces.
+ */
+class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Local copy of component validators.
+     * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
+     */
+    var $info;
+    
+    function HTMLPurifier_AttrDef_CSS_ListStyle($config) {
+        $def = $config->getCSSDefinition();
+        $this->info['list-style-type']     = $def->info['list-style-type'];
+        $this->info['list-style-position'] = $def->info['list-style-position'];
+        $this->info['list-style-image'] = $def->info['list-style-image'];
+    }
+    
+    function validate($string, $config, &$context) {
+        
+        // regular pre-processing
+        $string = $this->parseCDATA($string);
+        if ($string === '') return false;
+        
+        // assumes URI doesn't have spaces in it
+        $bits = explode(' ', strtolower($string)); // bits to process
+        
+        $caught = array();
+        $caught['type']     = false;
+        $caught['position'] = false;
+        $caught['image']    = false;
+        
+        $i = 0; // number of catches
+        $none = false; // whether a 'none' result has already been seen
+        
+        foreach ($bits as $bit) {
+            if ($i >= 3) break; // optimization bit: all components caught, keep the result and ignore the rest
+            if ($bit === '') continue;
+            foreach ($caught as $key => $status) {
+                if ($status !== false) continue;
+                $r = $this->info['list-style-' . $key]->validate($bit, $config, $context);
+                if ($r === false) continue;
+                if ($r === 'none') {
+                    if ($none) continue;
+                    else $none = true;
+                    if ($key == 'image') continue;
+                }
+                $caught[$key] = $r;
+                $i++;
+                break;
+            }
+        }
+        
+        if (!$i) return false;
+        
+        $ret = array();
+        
+        // construct type
+        if ($caught['type']) $ret[] = $caught['type'];
+        
+        // construct image
+        if ($caught['image']) $ret[] = $caught['image'];
+        
+        // construct position
+        if ($caught['position']) $ret[] = $caught['position'];
+        
+        if (empty($ret)) return false;
+        return implode(' ', $ret);
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Multiple.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Multiple.php
new file mode 100644 (file)
index 0000000..0d1c840
--- /dev/null
@@ -0,0 +1,58 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Framework class for strings that involve multiple values.
+ * 
+ * Certain CSS properties such as border-width and margin allow multiple
+ * lengths to be specified.  This class can take a vanilla border-width
+ * definition and multiply it, usually into a max of four; parts that fail validation are dropped.
+ * 
+ * @note Even though the CSS specification isn't clear about it, inherit
+ *       can only be used alone: it will never manifest as part of a multi
+ *       shorthand declaration.  Thus, this class does not allow inherit.
+ */
+class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Instance of component definition to defer validation to.
+     */
+    var $single;
+    
+    /**
+     * Max number of values allowed.
+     */
+    var $max;
+    
+    /**
+     * @param $single HTMLPurifier_AttrDef to multiply
+     * @param $max Max number of values allowed (usually four)
+     */
+    function HTMLPurifier_AttrDef_CSS_Multiple($single, $max = 4) {
+        $this->single = $single;
+        $this->max = $max;
+    }
+    
+    function validate($string, $config, &$context) {
+        $string = $this->parseCDATA($string);
+        if ($string === '') return false;
+        $parts = explode(' ', $string); // parseCDATA replaced \r, \t and \n
+        $length = count($parts);
+        $final = '';
+        for ($i = 0, $num = 0; $i < $length && $num < $this->max; $i++) { // $num counts accepted values only
+            if (ctype_space($parts[$i])) continue;
+            $result = $this->single->validate($parts[$i], $config, $context);
+            if ($result !== false) {
+                $final .= $result . ' ';
+                $num++;
+            }
+        }
+        if ($final === '') return false; // nothing validated
+        return rtrim($final);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Number.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Number.php
new file mode 100644 (file)
index 0000000..48f1335
--- /dev/null
@@ -0,0 +1,61 @@
+<?php
+
+/**
+ * Validates a number as defined by the CSS spec (optional sign, digits, optional fraction; leading-dot form allowed).
+ */
+class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Bool indicating whether or not only positive values allowed.
+     */
+    var $non_negative = false;
+    
+    /**
+     * @param $non_negative Bool indicating whether negatives are forbidden
+     */
+    function HTMLPurifier_AttrDef_CSS_Number($non_negative = false) {
+        $this->non_negative = $non_negative;
+    }
+    
+    function validate($number, $config, &$context) {
+        
+        $number = $this->parseCDATA($number);
+        
+        if ($number === '') return false;
+        
+        $sign = '';
+        switch ($number[0]) {
+            case '-':
+                if ($this->non_negative) return false;
+                $sign = '-'; // falls through to strip the sign character
+            case '+':
+                $number = substr($number, 1);
+        }
+        
+        if (ctype_digit($number)) {
+            $number = ltrim($number, '0');
+            return $number ? $sign . $number : '0';
+        }
+        if (strpos($number, '.') === false) return false; // neither an integer nor a decimal
+        
+        list($left, $right) = explode('.', $number, 2);
+        if ($left === '' && $right === '') return false; // a lone '.' is not a number
+        if ($left !== '' && !ctype_digit($left)) return false; // an empty integer part (.5) is valid CSS
+        $left = ltrim($left, '0');
+        
+        $right = rtrim($right, '0');
+        
+        if ($right === '') {
+            return $left ? $sign . $left : '0';
+        } elseif (!ctype_digit($right)) {
+            return false;
+        }
+        
+        return $sign . $left . '.' . $right;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Percentage.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/Percentage.php
new file mode 100644 (file)
index 0000000..cc96f15
--- /dev/null
@@ -0,0 +1,43 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
+
+/**
+ * Validates a Percentage as defined by the CSS spec: a CSS number immediately followed by '%'.
+ */
+class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
+     */
+    var $number_def;
+    
+    /**
+     * @param $non_negative Bool indicating whether to forbid negative values
+     */
+    function HTMLPurifier_AttrDef_CSS_Percentage($non_negative = false) {
+        $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
+    }
+    
+    function validate($string, $config, &$context) {
+        
+        $string = $this->parseCDATA($string);
+        
+        if ($string === '') return false;
+        $length = strlen($string);
+        if ($length === 1) return false; // one character cannot hold both a number and '%'
+        if ($string[$length - 1] !== '%') return false;
+        
+        $number = substr($string, 0, $length - 1);
+        $number = $this->number_def->validate($number, $config, $context);
+        
+        if ($number === false) return false;
+        return "$number%";
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/TextDecoration.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/TextDecoration.php
new file mode 100644 (file)
index 0000000..294dd83
--- /dev/null
@@ -0,0 +1,41 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Validates the value for the CSS property text-decoration
+ * @note This class could be generalized into a version that acts sort of
+ *       like Enum except you can compound the allowed values.
+ */
+class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Lookup table of the decoration keywords that may be combined.
+     * @protected
+     */
+    var $allowed_values = array(
+        'line-through' => true,
+        'overline' => true,
+        'underline' => true
+    );
+    
+    /**
+     * @param $string Raw text-decoration value
+     * @return 'none', or the space separated list of recognised
+     *         decoration keywords (unknown words are dropped), or false
+     *         when nothing valid remains
+     */
+    function validate($string, $config, &$context) {
+        
+        $string = strtolower($this->parseCDATA($string));
+        
+        // 'none' is a legal value of its own (it is how decorations are
+        // switched off), but it may not be mixed with other keywords,
+        // so it is only accepted when it is the whole value
+        if ($string === 'none') return $string;
+        
+        $parts = explode(' ', $string);
+        $final = '';
+        foreach ($parts as $part) {
+            if (isset($this->allowed_values[$part])) {
+                $final .= $part . ' ';
+            }
+        }
+        $final = rtrim($final);
+        if ($final === '') return false;
+        return $final;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/URI.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/CSS/URI.php
new file mode 100644 (file)
index 0000000..b310907
--- /dev/null
@@ -0,0 +1,58 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef/URI.php';
+
+/**
+ * Validates a URI in CSS syntax, which uses url('http://example.com')
+ * @note While theoretically speaking a URI in a CSS document could
+ *       be non-embedded, as of CSS2 there is no such usage so we're
+ *       generalizing it. This may need to be changed in the future.
+ * @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
+ *          the separator, you cannot put a literal semicolon in
+ *          in the URI. Try percent encoding it, in that case.
+ */
+class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
+{
+    
+    function HTMLPurifier_AttrDef_CSS_URI() {
+        $this->HTMLPurifier_AttrDef_URI(true); // always embedded
+    }
+    
+    /**
+     * Unwraps a CSS url(...) token, validates the URI inside with the
+     * parent validator, and wraps the result again.
+     * @param $uri_string Raw CSS value, e.g. url('http://example.com')
+     * @return Re-escaped "url(...)" string, or false when the wrapper is
+     *         malformed or the parent rejects the URI
+     */
+    function validate($uri_string, $config, &$context) {
+        // parse the URI out of the string and then pass it onto
+        // the parent object
+        
+        $uri_string = $this->parseCDATA($uri_string);
+        if (strpos($uri_string, 'url(') !== 0) return false;
+        // substr() can hand back false instead of '' when nothing follows
+        // the prefix, so force a string before measuring and indexing it
+        $uri_string = (string) substr($uri_string, 4);
+        $new_length = strlen($uri_string) - 1;
+        // a bare "url(" has no character left that could be the ')'
+        if ($new_length < 0) return false;
+        if ($uri_string[$new_length] != ')') return false;
+        $uri = trim((string) substr($uri_string, 0, $new_length));
+        
+        if (isset($uri[0]) && ($uri[0] == "'" || $uri[0] == '"')) {
+            $quote = $uri[0];
+            $new_length = strlen($uri) - 1;
+            // the closing quote has to be a second character: a lone
+            // quote must not be accepted as its own terminator
+            if ($new_length < 1 || $uri[$new_length] !== $quote) return false;
+            $uri = (string) substr($uri, 1, $new_length - 1);
+        }
+        
+        // undo CSS backslash escapes before handing over to the parent
+        $keys   = array(  '(',   ')',   ',',   ' ',   '"',   "'");
+        $values = array('\\(', '\\)', '\\,', '\\ ', '\\"', "\\'");
+        $uri = str_replace($values, $keys, $uri);
+        
+        $result = parent::validate($uri, $config, $context);
+        
+        if ($result === false) return false;
+        
+        // escape necessary characters according to CSS spec
+        // except for the comma, none of these should appear in the
+        // URI at all
+        $result = str_replace($keys, $values, $result);
+        
+        return "url($result)";
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/Enum.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/Enum.php
new file mode 100644 (file)
index 0000000..3246318
--- /dev/null
@@ -0,0 +1,46 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+// Enum = Enumerated
+/**
+ * Validates a keyword against a list of valid values.
+ */
+class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Valid values stored as array keys so membership is an isset() test.
+     */
+    var $valid_values   = array();
+    
+    /**
+     * Bool, true when matching must respect case.
+     * @note In general this is always case insensitive.
+     */
+    var $case_sensitive = false; // values according to W3C spec
+    
+    /**
+     * @param $valid_values List of valid values
+     * @param $case_sensitive Bool indicating whether or not case sensitive
+     */
+    function HTMLPurifier_AttrDef_Enum(
+        $valid_values = array(), $case_sensitive = false
+    ) {
+        $this->case_sensitive = $case_sensitive;
+        // flip the list so the values become lookup keys
+        $this->valid_values = array_flip($valid_values);
+    }
+    
+    /**
+     * @param $string Candidate keyword
+     * @return The trimmed (and, unless case sensitive, lowercased) keyword
+     *         when it is one of the valid values, otherwise false
+     */
+    function validate($string, $config, &$context) {
+        $candidate = trim($string);
+        // only pay for strtolower() when there is something to fold
+        if (!$this->case_sensitive && !ctype_lower($candidate)) {
+            $candidate = strtolower($candidate);
+        }
+        if (!isset($this->valid_values[$candidate])) {
+            return false;
+        }
+        return $candidate;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/ID.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/ID.php
new file mode 100644 (file)
index 0000000..c8bf299
--- /dev/null
@@ -0,0 +1,121 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/IDAccumulator.php';
+
+HTMLPurifier_ConfigSchema::define( // Attr.EnableID: read by the ID validator before anything else
+    'Attr', 'EnableID', false, 'bool',
+    'Allows the ID attribute in HTML.  This is disabled by default '.
+    'due to the fact that without proper configuration user input can '.
+    'easily break the validation of a webpage by specifying an ID that is '.
+    'already on the surrounding HTML.  If you don\'t mind throwing caution to '.
+    'the wind, enable this directive, but I strongly recommend you also '.
+    'consider blacklisting IDs you use (%Attr.IDBlacklist) or prefixing all '.
+    'user supplied IDs (%Attr.IDPrefix).  This directive has been available '.
+    'since 1.2.0, and when set to true reverts to the behavior of pre-1.2.0 '.
+    'versions.'
+);
+HTMLPurifier_ConfigSchema::defineAlias( // presumably makes HTML.EnableAttrID resolve to Attr.EnableID - confirm against ConfigSchema
+    'HTML', 'EnableAttrID', 'Attr', 'EnableID'
+);
+// Attr.IDPrefix: string put in front of user supplied IDs by the ID validator
+HTMLPurifier_ConfigSchema::define(
+    'Attr', 'IDPrefix', '', 'string',
+    'String to prefix to IDs.  If you have no idea what IDs your pages '.
+    'may use, you may opt to simply add a prefix to all user-submitted ID '.
+    'attributes so that they are still usable, but will not conflict with '.
+    'core page IDs. Example: setting the directive to \'user_\' will result in '.
+    'a user submitted \'foo\' to become \'user_foo\'  Be sure to set '.
+    '%HTML.EnableAttrID to true before using '.
+    'this.  This directive was available since 1.2.0.'
+);
+// Attr.IDPrefixLocal: appended to Attr.IDPrefix; the validator warns if it is set while IDPrefix is empty
+HTMLPurifier_ConfigSchema::define(
+    'Attr', 'IDPrefixLocal', '', 'string',
+    'Temporary prefix for IDs used in conjunction with %Attr.IDPrefix.  If '.
+    'you need to allow multiple sets of '.
+    'user content on web page, you may need to have a seperate prefix that '.
+    'changes with each iteration.  This way, seperately submitted user content '.
+    'displayed on the same page doesn\'t clobber each other. Ideal values '.
+    'are unique identifiers for the content it represents (i.e. the id of '.
+    'the row in the database). Be sure to add a seperator (like an underscore) '.
+    'at the end.  Warning: this directive will not work unless %Attr.IDPrefix '.
+    'is set to a non-empty value! This directive was available since 1.2.0.'
+);
+// Attr.IDBlacklistRegexp: optional pattern passed to preg_match() by the ID validator; null leaves the check off
+HTMLPurifier_ConfigSchema::define(
+    'Attr', 'IDBlacklistRegexp', null, 'string/null',
+    'PCRE regular expression to be matched against all IDs. If the expression '.
+    'is matches, the ID is rejected. Use this with care: may cause '.
+    'significant degradation. ID matching is done after all other '.
+    'validation. This directive was available since 1.6.0.'
+);
+
+/**
+ * Validates the HTML attribute ID.
+ *
+ * validate() refuses every ID while %Attr.EnableID is off, applies the
+ * configured prefixes, rejects IDs already held by the 'IDAccumulator'
+ * context object, checks the permitted characters and the optional
+ * %Attr.IDBlacklistRegexp, and records each accepted ID in the
+ * accumulator.
+ *
+ * @warning Even though this is the id processor, it
+ *          will ignore the directive Attr:IDBlacklist, since it will only
+ *          go according to the ID accumulator. Since the accumulator is
+ *          automatically generated, it will have already absorbed the
+ *          blacklist. If you're hacking around, make sure you use load()!
+ */
+
+class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
+{
+    
+    // ref functionality disabled, since we also have to verify
+    // whether or not the ID it refers to exists
+    /**
+     * @param $id Raw value of the id attribute
+     * @param $config Configuration object; the Attr directives EnableID,
+     *        IDPrefix, IDPrefixLocal and IDBlacklistRegexp are read
+     * @param $context Context object that must provide 'IDAccumulator'
+     * @return The trimmed and possibly prefixed ID, or false when rejected
+     */
+    function validate($id, $config, &$context) {
+        // directive off: every id attribute is rejected
+        if (!$config->get('Attr', 'EnableID')) return false;
+        
+        $id = trim($id); // trim it first
+        
+        if ($id === '') return false;
+        // IDPrefixLocal is only honoured together with a non-empty IDPrefix
+        $prefix = $config->get('Attr', 'IDPrefix');
+        if ($prefix !== '') {
+            $prefix .= $config->get('Attr', 'IDPrefixLocal');
+            // prevent re-appending the prefix
+            if (strpos($id, $prefix) !== 0) $id = $prefix . $id;
+        } elseif ($config->get('Attr', 'IDPrefixLocal') !== '') {
+            trigger_error('%Attr.IDPrefixLocal cannot be used unless '.
+                '%Attr.IDPrefix is set', E_USER_WARNING);
+        }
+        // an ID already present in the accumulator is rejected as a duplicate
+        //if (!$this->ref) {
+            $id_accumulator =& $context->get('IDAccumulator');
+            if (isset($id_accumulator->ids[$id])) return false;
+        //}
+        
+        // we purposely avoid using regex, hopefully this is faster
+        // trim() with a character list strips permitted characters from both ends; any remainder contains an illegal one
+        if (ctype_alpha($id)) {
+            $result = true;
+        } else {
+            if (!ctype_alpha(@$id[0])) return false;
+            $trim = trim( // primitive style of regexps, I suppose
+                $id,
+                'A..Za..z0..9:-._'
+              );
+            $result = ($trim === '');
+        }
+        // a blacklist match rejects the ID whatever $result says
+        $regexp = $config->get('Attr', 'IDBlacklistRegexp');
+        if ($regexp && preg_match($regexp, $id)) {
+            return false;
+        }
+        // only IDs that passed the character check are recorded
+        if (/*!$this->ref && */$result) $id_accumulator->add($id);
+        
+        // the ID (trimmed, and prefixed when a prefix is configured) is
+        // returned when it passed the character check above; otherwise
+        // false is returned.
+        return $result ? $id : false;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Length.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Length.php
new file mode 100644 (file)
index 0000000..ac83295
--- /dev/null
@@ -0,0 +1,44 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/AttrDef/HTML/Pixels.php';
+
+/**
+ * Validates the HTML type length (not to be confused with CSS's length).
+ * 
+ * This accepts integer pixels or percentages as lengths for certain
+ * HTML attributes.
+ */
+
+class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
+{
+    
+    /**
+     * @param $string Raw length value
+     * @return Whatever the pixel validator accepts, or an integer
+     *         percentage clamped to the range 0%..100%, or false
+     */
+    function validate($string, $config, &$context) {
+        
+        $value = trim($string);
+        if ($value === '') return false;
+        
+        // pixel values are the parent's business; take its answer if any
+        $pixels = parent::validate($value, $config, $context);
+        if ($pixels !== false) return $pixels;
+        
+        // otherwise only "<number>%" is acceptable
+        $last = strlen($value) - 1;
+        if ($value[$last] !== '%') return false;
+        
+        $percent = substr($value, 0, $last);
+        if (!is_numeric($percent)) return false;
+        
+        // integer percentages only, clamped to 0..100
+        $percent = (int) $percent;
+        if ($percent < 0) {
+            $percent = 0;
+        } elseif ($percent > 100) {
+            $percent = 100;
+        }
+        
+        return $percent . '%';
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/LinkTypes.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/LinkTypes.php
new file mode 100644 (file)
index 0000000..94a47ba
--- /dev/null
@@ -0,0 +1,75 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+HTMLPurifier_ConfigSchema::define( // Attr.AllowedRel: lookup consulted by the link-type validator for 'rel'
+    'Attr', 'AllowedRel', array(), 'lookup',
+    'List of allowed forward document relationships in the rel attribute. '.
+    'Common values may be nofollow or print. By default, this is empty, '.
+    'meaning that no document relationships are allowed. This directive '.
+    'was available since 1.6.0.'
+);
+// Attr.AllowedRev: the same kind of lookup, consulted for 'rev'
+HTMLPurifier_ConfigSchema::define(
+    'Attr', 'AllowedRev', array(), 'lookup',
+    'List of allowed reverse document relationships in the rev attribute. '.
+    'This attribute is a bit of an edge-case; if you don\'t know what it '.
+    'is for, stay away. This directive was available since 1.6.0.'
+);
+
+/**
+ * Validates a rel/rev link attribute against a directive of allowed values
+ * @note We cannot use Enum because link types allow multiple
+ *       values.
+ * @note Assumes link types are ASCII text
+ */
+class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
+{
+    
+    /** Maps the attribute name to the Attr directive holding its whitelist */
+    var $configLookup = array(
+        'rel' => 'AllowedRel',
+        'rev' => 'AllowedRev'
+    );
+    
+    /** Name of the Attr directive this instance reads. */
+    var $name;
+    
+    /**
+     * @param $name Attribute being validated, 'rel' or 'rev'; any other
+     *        name triggers an E_USER_ERROR
+     */
+    function HTMLPurifier_AttrDef_HTML_LinkTypes($name) {
+        if (isset($this->configLookup[$name])) {
+            $this->name = $this->configLookup[$name];
+            return;
+        }
+        trigger_error('Unrecognized attribute name for link '.
+            'relationship.', E_USER_ERROR);
+    }
+    
+    /**
+     * @param $string Space separated list of link types
+     * @return The allowed link types, lowercased, without duplicates and
+     *         joined by single spaces, or false when none is allowed
+     */
+    function validate($string, $config, &$context) {
+        
+        $allowed = $config->get('Attr', $this->name);
+        if (empty($allowed)) return false;
+        
+        $tokens = explode(' ', $this->parseCDATA($string));
+        
+        // collect as keys so repeated link types collapse into one entry
+        $seen = array();
+        foreach ($tokens as $token) {
+            $token = strtolower(trim($token));
+            if (isset($allowed[$token])) {
+                $seen[$token] = true;
+            }
+        }
+        
+        if (empty($seen)) return false;
+        
+        return implode(' ', array_keys($seen));
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/MultiLength.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/MultiLength.php
new file mode 100644 (file)
index 0000000..f50259b
--- /dev/null
@@ -0,0 +1,44 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/AttrDef/HTML/Length.php';
+
+/**
+ * Validates a MultiLength as defined by the HTML spec.
+ * 
+ * A multilength is either a integer (pixel count), a percentage, or
+ * a relative number.
+ */
+class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
+{
+    
+    /**
+     * @param $string Raw multilength value
+     * @return Whatever the length validator accepts, or a normalized
+     *         relative length ('*', '0' or 'N*'), or false
+     */
+    function validate($string, $config, &$context) {
+        
+        $value = trim($string);
+        if ($value === '') return false;
+        
+        // pixels and percentages are answered by the parent
+        $simple = parent::validate($value, $config, $context);
+        if ($simple !== false) return $simple;
+        
+        // what remains must be a relative length ending in '*'
+        $last = strlen($value) - 1;
+        if ($value[$last] !== '*') return false;
+        
+        $weight = substr($value, 0, $last);
+        
+        // a bare '*' is already in its shortest form
+        if ($weight == '') return '*';
+        if (!is_numeric($weight)) return false;
+        
+        $weight = (int) $weight;
+        if ($weight < 0) return false;
+        
+        switch ($weight) {
+            case 0:
+                return '0';
+            case 1:
+                return '*'; // '1*' is written as '*'
+        }
+        return $weight . '*';
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Nmtokens.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Nmtokens.php
new file mode 100644 (file)
index 0000000..1eaeaa7
--- /dev/null
@@ -0,0 +1,51 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/Config.php';
+
+/**
+ * Validates contents based on NMTOKENS attribute type.
+ * @note The only current use for this is the class attribute in HTML
+ * @note Could have some functionality factored out into Nmtoken class
+ * @warning We cannot assume this class will be used only for 'class'
+ *          attributes. Not sure how to hook in magic behavior, then.
+ */
+class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * @param $string Whitespace separated list of tokens
+     * @return The tokens matching the pattern below, joined by single
+     *         spaces, or false when the input is empty, '0', or holds no
+     *         matching token
+     */
+    function validate($string, $config, &$context) {
+        
+        $string = trim($string);
+        
+        // '' and '0' both convert to false and are treated as invalid
+        if (!$string) return false;
+        
+        // OPTIMIZABLE!
+        // U+00A1 and up codepoints and escapes are not supported; a token
+        // must stand between whitespace and/or the string boundaries
+        $pattern = '/(?:(?<=\s)|\A)'. // look behind for space or string start
+                   '((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)'.
+                   '(?:(?=\s)|\z)/'; // look ahead for space or string end
+        $matches = array();
+        preg_match_all($pattern, $string, $matches);
+        
+        $tokens = $matches[1];
+        if (empty($tokens)) return false;
+        
+        // the captured tokens contain no whitespace, so a plain join
+        // rebuilds the normalized list
+        return implode(' ', $tokens);
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Pixels.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/HTML/Pixels.php
new file mode 100644 (file)
index 0000000..4c29091
--- /dev/null
@@ -0,0 +1,37 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Validates an integer representation of pixels according to the HTML spec.
+ */
+class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * @param $string Raw pixel value, optionally carrying a 'px' suffix
+     * @return Integer string clamped to the range 0..1200, or false when
+     *         the value is empty or not numeric
+     */
+    function validate($string, $config, &$context) {
+        
+        $value = trim($string);
+        if ($value === '0') return $value;
+        if ($value === '')  return false;
+        
+        // tolerate a trailing unit by cutting it off
+        $length = strlen($value);
+        $suffix = substr($value, $length - 2);
+        if ($suffix == 'px') {
+            $value = substr($value, 0, $length - 2);
+        }
+        if (!is_numeric($value)) return false;
+        
+        // upper-bound value, extremely high values can
+        // crash operating systems, see <http://ha.ckers.org/imagecrash.html>
+        // WARNING, above link WILL crash you if you're using Windows
+        $pixels = max(0, min(1200, (int) $value));
+        
+        return (string) $pixels;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/Integer.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/Integer.php
new file mode 100644 (file)
index 0000000..d6953d6
--- /dev/null
@@ -0,0 +1,75 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Validates an integer.
+ * @note While this class was modeled off the CSS definition, no currently
+ *       allowed CSS uses this type.  The properties that do are: widows,
+ *       orphans, z-index, counter-increment, counter-reset.  Some of the
+ *       HTML attributes, however, find use for a non-negative version of this.
+ */
+class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * Bool, true when negative values are accepted
+     */
+    var $negative = true;
+    
+    /**
+     * Bool, true when zero is accepted
+     */
+    var $zero = true;
+    
+    /**
+     * Bool, true when positive values are accepted
+     */
+    var $positive = true;
+    
+    /**
+     * @param $negative Bool indicating whether or not negative values are allowed
+     * @param $zero Bool indicating whether or not zero is allowed
+     * @param $positive Bool indicating whether or not positive values are allowed
+     */
+    function HTMLPurifier_AttrDef_Integer(
+        $negative = true, $zero = true, $positive = true
+    ) {
+        $this->positive = $positive;
+        $this->zero     = $zero;
+        $this->negative = $negative;
+    }
+    
+    /**
+     * @param $integer Raw integer string, optionally signed
+     * @return The integer as a string (a leading plus sign is removed and
+     *         '-0' becomes '0'), or false when it is malformed or falls
+     *         into a disallowed range
+     */
+    function validate($integer, $config, &$context) {
+        
+        $integer = $this->parseCDATA($integer);
+        if ($integer === '') return false;
+        
+        // a simple typecast would accept fringe cases that must fail, so
+        // the digits are isolated and checked by hand
+        $sign   = $integer[0];
+        $digits = $integer;
+        if ($this->negative && $sign === '-') {
+            $digits = substr($integer, 1);
+            if ($digits === '0') $integer = '0'; // rm minus sign for zero
+        } elseif ($this->positive && $sign === '+') {
+            $integer = substr($integer, 1); // rm unnecessary plus
+            $digits  = $integer;
+        }
+        
+        // nothing but digits may follow the optional sign
+        if (!ctype_digit($digits)) return false;
+        
+        // reject values from a range this instance does not allow
+        $out_of_scope =
+            (!$this->zero     && $integer == 0) ||
+            (!$this->positive && $integer > 0)  ||
+            (!$this->negative && $integer < 0);
+        if ($out_of_scope) return false;
+        
+        return $integer;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/Lang.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/Lang.php
new file mode 100644 (file)
index 0000000..72d67f6
--- /dev/null
@@ -0,0 +1,75 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Validates the HTML attribute lang, effectively a language code.
+ * @note Built according to RFC 3066, which obsoleted RFC 1766
+ */
+class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * @param $string Raw language code
+     * @return Lowercased language code cut off in front of the first
+     *         malformed subtag, or false when the primary subtag itself
+     *         is invalid
+     */
+    function validate($string, $config, &$context) {
+        
+        $string = trim($string);
+        if (!$string) return false;
+        
+        $subtags     = explode('-', $string);
+        $num_subtags = count($subtags);
+        
+        // primary subtag: a single 'x' or 'i', or two to three letters
+        $primary = $subtags[0];
+        $length  = strlen($primary);
+        if ($length == 1) {
+            if ($primary != 'x' && $primary != 'i') return false;
+        } elseif ($length == 2 || $length == 3) {
+            if (!ctype_alpha($primary)) return false;
+            $primary = strtolower($primary);
+        } else {
+            return false;
+        }
+        
+        // remaining subtags: one to eight alphanumerics each; the first
+        // malformed one ends the code, keeping what was accepted so far
+        $new_string = $primary;
+        for ($i = 1; $i < $num_subtags; $i++) {
+            $subtag = $subtags[$i];
+            $length = strlen($subtag);
+            if ($length == 0 || $length > 8 || !ctype_alnum($subtag)) {
+                break;
+            }
+            // the second subtag may only be one character long if it is 'x'
+            if ($i == 1 && $length == 1 && $subtag != 'x') {
+                break;
+            }
+            $new_string .= '-' . strtolower($subtag);
+        }
+        
+        return $new_string;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/Text.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/Text.php
new file mode 100644 (file)
index 0000000..eb2a24a
--- /dev/null
@@ -0,0 +1,17 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Validates arbitrary text according to the HTML spec.
+ */
+class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * @param $string Raw attribute text
+     * @return The text after the inherited CDATA parsing; never rejected
+     *         here
+     */
+    function validate($string, $config, &$context) {
+        $cdata = $this->parseCDATA($string);
+        return $cdata;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI.php
new file mode 100644 (file)
index 0000000..7102718
--- /dev/null
@@ -0,0 +1,296 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/URIScheme.php';
+require_once 'HTMLPurifier/URISchemeRegistry.php';
+require_once 'HTMLPurifier/AttrDef/URI/Host.php';
+require_once 'HTMLPurifier/PercentEncoder.php';
+
+// URI.DefaultScheme: scheme assumed by the URI validator when a URI
+// carries none of its own
+HTMLPurifier_ConfigSchema::define(
+    'URI', 'DefaultScheme', 'http', 'string',
+    'Defines through what scheme the output will be served, in order to '.
+    'select the proper object validator when no scheme information is present.'
+);
+
+// URI.Host: the site's own domain (description wording repaired: it used
+// to read "whether or an absolute URI ... or not")
+HTMLPurifier_ConfigSchema::define(
+    'URI', 'Host', null, 'string/null',
+    'Defines the domain name of the server, so we can determine whether '.
+    'an absolute URI is from your website or not.  Not strictly necessary, '.
+    'as users should be using relative URIs to reference resources on your '.
+    'website.  It will, however, let you use absolute URIs to link to '.
+    'subdomains of the domain you post here: i.e. example.com will allow '.
+    'sub.example.com.  However, higher up domains will still be excluded: '.
+    'if you set %URI.Host to sub.example.com, example.com will be blocked. '.
+    'This directive has been available since 1.2.0.'
+);
+
+// URI.DisableExternal (description repaired: a missing space used to
+// join the words into "nolinks")
+HTMLPurifier_ConfigSchema::define(
+    'URI', 'DisableExternal', false, 'bool',
+    'Disables links to external websites.  This is a highly effective '.
+    'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no '.
+    'links or images outside of your domain will be allowed.  Non-linkified '.
+    'URIs will still be preserved.  If you want to be able to link to '.
+    'subdomains or use absolute URIs, specify %URI.Host for your website. '.
+    'This directive has been available since 1.2.0.'
+);
+
+// URI.DisableExternalResources: like DisableExternal, but the URI
+// validator applies it only to URIs that embed a resource
+HTMLPurifier_ConfigSchema::define(
+    'URI', 'DisableExternalResources', false, 'bool',
+    'Disables the embedding of external resources, preventing users from '.
+    'embedding things like images from other hosts. This prevents '.
+    'access tracking (good for email viewers), bandwidth leeching, '.
+    'cross-site request forging, goatse.cx posting, and '.
+    'other nasties, but also results in '.
+    'a loss of end-user functionality (they can\'t directly post a pic '.
+    'they posted from Flickr anymore). Use it if you don\'t have a '.
+    'robust user-content moderation team. This directive has been '.
+    'available since 1.3.0.'
+);
+
+HTMLPurifier_ConfigSchema::define(
+    'URI', 'DisableResources', false, 'bool',
+    'Disables embedding resources, essentially meaning no pictures. You can '.
+    'still link to them though. See %URI.DisableExternalResources for why '.
+    'this might be a good idea. This directive has been available since 1.3.0.'
+);
+
+HTMLPurifier_ConfigSchema::define(
+    'URI', 'Munge', null, 'string/null',
+    'Munges all browsable (usually http, https and ftp) URI\'s into some URL '.
+    'redirection service. Pass this directive a URI, with %s inserted where '.
+    'the url-encoded original URI should be inserted (sample: '.
+    '<code>http://www.google.com/url?q=%s</code>). '.
+    'This prevents PageRank leaks, while being as transparent as possible '.
+    'to users (you may also want to add some client side JavaScript to '.
+    'override the text in the statusbar). Warning: many security experts '.
+    'believe that this form of protection does not deter spam-bots. '.
+    'You can also use this directive to redirect users to a splash page '.
+    'telling them they are leaving your website. '.
+    'This directive has been available since 1.3.0.'
+);
+
+HTMLPurifier_ConfigSchema::define(
+    'URI', 'HostBlacklist', array(), 'list',
+    'List of strings that are forbidden in the host of any URI. Use it to '.
+    'kill domain names of spam, etc. Note that it will catch anything in '.
+    'the domain, so <tt>moo.com</tt> will catch <tt>moo.com.example.com</tt>. '.
+    'This directive has been available since 1.3.0.'
+);
+
+// URI.Disable: checked first by the URI validator, which then rejects
+// every URI
+HTMLPurifier_ConfigSchema::define(
+    'URI', 'Disable', false, 'bool',
+    'Disables all URIs in all forms. Not sure why you\'d want to do that '.
+    '(after all, the Internet\'s founded on the notion of a hyperlink). '.
+    'This directive has been available since 1.3.0.'
+);
+HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable');
+
+/**
+ * Validates a URI as defined by RFC 3986.
+ * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
+ */
+class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
+{
+    
+    var $host;
+    var $PercentEncoder;
+    var $embeds_resource;
+    
+    /**
+     * @param $embeds_resource_resource Does the URI here result in an extra HTTP request?
+     */
+    function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
+        $this->host = new HTMLPurifier_AttrDef_URI_Host();
+        $this->PercentEncoder = new HTMLPurifier_PercentEncoder();
+        $this->embeds_resource = (bool) $embeds_resource;
+    }
+    
+    function validate($uri, $config, &$context) {
+        
+        // We'll write stack-based parsers later, for now, use regexps to
+        // get things working as fast as possible (irony)
+        
+        if ($config->get('URI', 'Disable')) return false;
+        
+        // parse as CDATA
+        $uri = $this->parseCDATA($uri);
+        
+        // fix up percent-encoding
+        $uri = $this->PercentEncoder->normalize($uri);
+        
+        // while it would be nice to use parse_url(), that's specifically
+        // for HTTP and thus won't work for our generic URI parsing
+        
+        // according to the RFC... (but this cuts corners, i.e. non-validating)
+        $r_URI = '!'.
+            '(([^:/?#<>\'"]+):)?'. // 2. Scheme
+            '(//([^/?#<>\'"]*))?'. // 4. Authority
+            '([^?#<>\'"]*)'.       // 5. Path
+            '(\?([^#<>\'"]*))?'.   // 7. Query
+            '(#([^<>\'"]*))?'.     // 8. Fragment
+            '!';
+        
+        $matches = array();
+        $result = preg_match($r_URI, $uri, $matches);
+        
+        if (!$result) return false; // invalid URI
+        
+        // seperate out parts
+        $scheme     = !empty($matches[1]) ? $matches[2] : null;
+        $authority  = !empty($matches[3]) ? $matches[4] : null;
+        $path       = $matches[5]; // always present, can be empty
+        $query      = !empty($matches[6]) ? $matches[7] : null;
+        $fragment   = !empty($matches[8]) ? $matches[9] : null;
+        
+        
+        
+        $registry =& HTMLPurifier_URISchemeRegistry::instance();
+        if ($scheme !== null) {
+            // no need to validate the scheme's fmt since we do that when we
+            // retrieve the specific scheme object from the registry
+            $scheme = ctype_lower($scheme) ? $scheme : strtolower($scheme);
+            $scheme_obj = $registry->getScheme($scheme, $config, $context);
+            if (!$scheme_obj) return false; // invalid scheme, clean it out
+        } else {
+            $scheme_obj = $registry->getScheme(
+                $config->get('URI', 'DefaultScheme'), $config, $context
+            );
+        }
+        
+        
+        // the URI we're processing embeds_resource a resource in the page, but the URI
+        // it references cannot be located
+        if ($this->embeds_resource && !$scheme_obj->browsable) {
+            return false;
+        }
+        
+        
+        if ($authority !== null) {
+            
+            // remove URI if it's absolute and we disabled externals or
+            // if it's absolute and embedded and we disabled external resources
+            unset($our_host);
+            if (
+                $config->get('URI', 'DisableExternal') ||
+                (
+                    $config->get('URI', 'DisableExternalResources') &&
+                    $this->embeds_resource
+                )
+            ) {
+                $our_host = $config->get('URI', 'Host');
+                if ($our_host === null) return false;
+            }
+            
+            $HEXDIG = '[A-Fa-f0-9]';
+            $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
+            $sub_delims = '!$&\'()'; // needs []
+            $pct_encoded = "%$HEXDIG$HEXDIG";
+            $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
+            $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
+            $matches = array();
+            preg_match($r_authority, $authority, $matches);
+            // overloads regexp!
+            $userinfo   = !empty($matches[1]) ? $matches[2] : null;
+            $host       = !empty($matches[3]) ? $matches[3] : null;
+            $port       = !empty($matches[4]) ? $matches[5] : null;
+            
+            // validate port
+            if ($port !== null) {
+                $port = (int) $port;
+                if ($port < 1 || $port > 65535) $port = null;
+            }
+            
+            $host = $this->host->validate($host, $config, $context);
+            if ($host === false) $host = null;
+            
+            if ($this->checkBlacklist($host, $config, $context)) return false;
+            
+            // more lenient absolute checking
+            if (isset($our_host)) {
+                $host_parts = array_reverse(explode('.', $host));
+                // could be cached
+                $our_host_parts = array_reverse(explode('.', $our_host));
+                foreach ($our_host_parts as $i => $discard) {
+                    if (!isset($host_parts[$i])) return false;
+                    if ($host_parts[$i] != $our_host_parts[$i]) return false;
+                }
+            }
+            
+            // userinfo and host are validated within the regexp
+            
+        } else {
+            $port = $host = $userinfo = null;
+        }
+        
+        
+        // query and fragment are quite simple in terms of definition:
+        // *( pchar / "/" / "?" ), so define their validation routines
+        // when we start fixing percent encoding
+        
+        
+        
+        // path gets to be validated against a hodge-podge of rules depending
+        // on the status of authority and scheme, but it's not that important,
+        // esp. since it won't be applicable to everyone
+        
+        
+        
+        // okay, now we defer execution to the subobject for more processing
+        // note that $fragment is omitted
+        list($userinfo, $host, $port, $path, $query) = 
+            $scheme_obj->validateComponents(
+                $userinfo, $host, $port, $path, $query, $config, $context
+            );
+        
+        
+        // reconstruct authority
+        $authority = null;
+        if (!is_null($userinfo) || !is_null($host) || !is_null($port)) {
+            $authority = '';
+            if($userinfo !== null) $authority .= $userinfo . '@';
+            $authority .= $host;
+            if($port !== null) $authority .= ':' . $port;
+        }
+        
+        // reconstruct the result
+        $result = '';
+        if ($scheme !== null) $result .= "$scheme:";
+        if ($authority !== null) $result .= "//$authority";
+        $result .= $path;
+        if ($query !== null) $result .= "?$query";
+        if ($fragment !== null) $result .= "#$fragment";
+        
+        // munge if necessary
+        $munge = $config->get('URI', 'Munge');
+        if (!empty($scheme_obj->browsable) && $munge !== null) {
+            if ($authority !== null) {
+                $result = str_replace('%s', rawurlencode($result), $munge);
+            }
+        }
+        
+        return $result;
+        
+    }
+    
+    /**
+     * Checks a host against an array blacklist
+     * @param $host Host to check
+     * @param $config HTMLPurifier_Config instance
+     * @param $context HTMLPurifier_Context instance
+     * @return bool Is spam?
+     */
+    function checkBlacklist($host, &$config, &$context) {
+        $blacklist = $config->get('URI', 'HostBlacklist');
+        if (empty($blacklist) || (string) $host === '') return false;
+        $host = strtolower($host); // host names compare case-insensitively
+        foreach ($blacklist as $blacklisted_host_fragment) {
+            $fragment = strtolower((string) $blacklisted_host_fragment);
+            // strpos() cannot search for an empty needle, so skip blank entries
+            if ($fragment !== '' && strpos($host, $fragment) !== false) return true;
+        }
+        return false;
+    }
+    
+}
+
+?>
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email.php
new file mode 100644 (file)
index 0000000..80b8d36
--- /dev/null
@@ -0,0 +1,17 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
+{
+    /**
+     * Unpacks a mailbox into its display-name and address.
+     * @note Unimplemented stub: the body is empty, so it returns null.
+     * @param $string Mailbox string to unpack
+     */
+    function unpack($string) {
+        // needs to be implemented
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php
new file mode 100644 (file)
index 0000000..e35b1b4
--- /dev/null
@@ -0,0 +1,23 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef/URI/Email.php';
+
+/**
+ * Primitive email validation class based on the regexp found at
+ * http://www.regular-expressions.info/email.html
+ */
+class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
+{
+    /** Returns the trimmed address, or false when it is not of the form user@host.tld */
+    function validate($string, $config, &$context) {
+        // no support for named mailboxes i.e. "Bob <bob@example.com>"
+        // that needs more percent encoding to be done
+        if ($string == '') return false;
+        $string = trim($string);
+        // '+' is legal in the local part; TLDs may exceed 4 letters (.museum)
+        $result = preg_match('/^[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\z/i', $string);
+        return $result ? $string : false;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php
new file mode 100644 (file)
index 0000000..5344cda
--- /dev/null
@@ -0,0 +1,54 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/AttrDef/URI/IPv4.php';
+require_once 'HTMLPurifier/AttrDef/URI/IPv6.php';
+
+/**
+ * Host validator: accepts bracketed IPv6 literals and dotted IPv4 addresses;
+ * any other string is currently passed through without DNS checks.
+ */
+class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
+{
+    /** Sub-validator for IPv4 addresses (HTMLPurifier_AttrDef_URI_IPv4) */
+    var $ipv4;
+    
+    /** Sub-validator for IPv6 addresses (HTMLPurifier_AttrDef_URI_IPv6) */
+    var $ipv6;
+    
+    /** Creates the two IP sub-validators. */
+    function HTMLPurifier_AttrDef_URI_Host() {
+        $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
+        $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
+    }
+    
+    /**
+     * @param $string Host component of a URI
+     * @return '' for an empty host, false for a bad bracketed literal,
+     *         otherwise the validated (or passed-through) host string
+     */
+    function validate($string, $config, &$context) {
+        if ($string === '') return '';
+        
+        $last = strlen($string) - 1;
+        if ($last > 0 && $string[0] === '[' && $string[$last] === ']') {
+            // IPv6 literal: check the part between the brackets, then re-wrap
+            $inner = substr($string, 1, $last - 1);
+            $checked = $this->ipv6->validate($inner, $config, $context);
+            return ($checked === false) ? false : '[' . $checked . ']';
+        }
+        
+        // need to do checks on unusual encodings too
+        $dotted = $this->ipv4->validate($string, $config, $context);
+        if ($dotted !== false) return $dotted;
+        
+        // validate a domain name here, do filtering, etc etc etc
+        // We could use this, but it would break I18N domain names
+        //$match = preg_match('/^[a-z0-9][\w\-\.]*[a-z0-9]$/i', $string);
+        //if (!$match) return false;
+        
+        return $string;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv4.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv4.php
new file mode 100644 (file)
index 0000000..0730bbc
--- /dev/null
@@ -0,0 +1,36 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+
+/**
+ * Validates an IPv4 address in dotted-decimal form (four octets, 0-255 each)
+ * @author Feyd @ forums.devnetwork.net (public domain)
+ */
+class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
+{
+    
+    /**
+     * IPv4 regex fragment (unanchored, no delimiters), protected so that
+     * IPv6 can reuse it
+     * @protected
+     */
+    var $ip4;
+    
+    /** Builds the dotted-quad pattern from a single-octet pattern. */
+    function HTMLPurifier_AttrDef_URI_IPv4() {
+        $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
+        $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
+    }
+    
+    /**
+     * @param $aIP Candidate address
+     * @return $aIP unchanged if it is a valid IPv4 address, false otherwise
+     */
+    function validate($aIP, $config, &$context) {
+        // \z rather than $: "$" would also accept a trailing newline
+        if (preg_match('#^' . $this->ip4 . '\z#s', $aIP)) return $aIP;
+        return false;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv6.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv6.php
new file mode 100644 (file)
index 0000000..73f085e
--- /dev/null
@@ -0,0 +1,99 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef/URI/IPv4.php';
+
+/**
+ * Validates an IPv6 address.
+ * @author Feyd @ forums.devnetwork.net (public domain)
+ * @note This function requires brackets to have been removed from address
+ *       in URI.
+ */
+class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
+{
+    /**
+     * @param $aIP Address without brackets; may end in a dotted IPv4 part
+     *             and/or a /0 - /128 prefix length
+     * @return The address exactly as passed in if valid, false otherwise
+     */
+    function validate($aIP, $config, &$context) {
+        $original = $aIP;
+        
+        $hex = '[0-9a-fA-F]';
+        $blk = '(?:' . $hex . '{1,4})';
+        $pre = '(?:/(?:12[0-8]|1[0-1][0-9]|[1-9][0-9]|[0-9]))';   // /0 - /128
+        
+        //      prefix check
+        if (strpos($aIP, '/') !== false)
+        {
+                // \z rather than $, which would tolerate a trailing newline
+                if (preg_match('#' . $pre . '\z#s', $aIP, $find))
+                {
+                        $aIP = substr($aIP, 0, 0-strlen($find[0]));
+                        unset($find);
+                }
+                else
+                {
+                        return false;
+                }
+        }
+        
+        //      IPv4-compatibility check: a trailing dotted quad is rewritten
+        //      as the two hex groups it stands for
+        if (preg_match('#(?<=:'.')' . $this->ip4 . '\z#s', $aIP, $find))
+        {
+                $aIP = substr($aIP, 0, 0-strlen($find[0]));
+                $ip = explode('.', $find[0]);
+                $aIP .= sprintf('%02x%02x:%02x%02x', $ip[0], $ip[1], $ip[2], $ip[3]);
+                unset($find, $ip);
+        }
+        
+        //      compression check
+        $aIP = explode('::', $aIP);
+        $c = count($aIP);
+        if ($c > 2)
+        {
+                return false;
+        }
+        elseif ($c == 2)
+        {
+                list($first, $second) = $aIP;
+                // an empty side of "::" contributes no groups at all
+                $first  = ($first  === '') ? array() : explode(':', $first);
+                $second = ($second === '') ? array() : explode(':', $second);
+               
+                // "::" must stand for at least one group of zeros
+                $zeros = 8 - count($first) - count($second);
+                if ($zeros < 1)
+                {
+                        return false;
+                }
+               
+                $aIP = array_merge($first, array_fill(0, $zeros, '0'), $second);
+                unset($first,$second);
+        }
+        else
+        {
+                $aIP = explode(':', $aIP[0]);
+        }
+        $c = count($aIP);
+        
+        if ($c != 8)
+        {
+                return false;
+        }
+       
+        //      Every piece must be 1-4 hex digits; empty pieces (stray
+        //      leading/trailing colons) are rejected.
+        foreach ($aIP as $piece)
+        {
+                if (!preg_match('#^' . $blk . '\z#s', $piece))
+                {
+                        return false;
+                }
+        }
+        
+        return $original;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform.php
new file mode 100644 (file)
index 0000000..3513669
--- /dev/null
@@ -0,0 +1,34 @@
+<?php
+
+/**
+ * Processes an entire attribute array for corrections needing multiple values.
+ * 
+ * Occasionally, a certain attribute will need to be removed and popped onto
+ * another value.  Instead of creating a complex return syntax for
+ * HTMLPurifier_AttrDef, we just pass the whole attribute array to a
+ * specialized object and have that do the special work.  That is the
+ * family of HTMLPurifier_AttrTransform.
+ * 
+ * An attribute transformation can be assigned to run before or after
+ * HTMLPurifier_AttrDef validation.  See HTMLPurifier_HTMLDefinition for
+ * more details.
+ */
+
+class HTMLPurifier_AttrTransform
+{
+    /**
+     * Abstract: makes changes to the attributes dependent on multiple values.
+     * This base implementation only raises an E_USER_ERROR; subclasses
+     * are expected to override it.
+     * @param $attr Assoc array of attributes, usually from
+     *              HTMLPurifier_Token_Tag::$attr
+     * @param $config Mandatory HTMLPurifier_Config object.
+     * @param $context Mandatory HTMLPurifier_Context object
+     * @returns Processed attribute array.
+     */
+    function transform($attr, $config, &$context) {
+        trigger_error('Cannot call abstract function', E_USER_ERROR);
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/BdoDir.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/BdoDir.php
new file mode 100644 (file)
index 0000000..0ea5eb6
--- /dev/null
@@ -0,0 +1,31 @@
+<?php
+
+require_once 'HTMLPurifier/AttrTransform.php';
+
+// this MUST be placed in post, as it assumes that any value in dir is valid
+
+HTMLPurifier_ConfigSchema::define(
+    'Attr', 'DefaultTextDir', 'ltr', 'string',
+    'Defines the default text direction (ltr or rtl) of the document '.
+    'being parsed.  This generally is the same as the value of the dir '.
+    'attribute in HTML, or ltr if that is not specified.'
+);
+HTMLPurifier_ConfigSchema::defineAllowedValues(
+    'Attr', 'DefaultTextDir', array( 'ltr', 'rtl' )
+);
+
+/**
+ * Post-transform guaranteeing that every bdo tag carries a dir attribute.
+ */
+class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
+{
+    function transform($attr, $config, &$context) {
+        // an existing dir is trusted as-is; otherwise use the configured default
+        if (!isset($attr['dir'])) {
+            $attr['dir'] = $config->get('Attr', 'DefaultTextDir');
+        }
+        return $attr;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/BgColor.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/BgColor.php
new file mode 100644 (file)
index 0000000..abfd034
--- /dev/null
@@ -0,0 +1,28 @@
+<?php
+
+require_once 'HTMLPurifier/AttrTransform.php';
+
+/**
+ * Pre-transform: moves the deprecated bgcolor attribute into the style
+ * attribute as a background-color declaration.
+ */
+class HTMLPurifier_AttrTransform_BgColor
+extends HTMLPurifier_AttrTransform {
+    /**
+     * @param $attr Assoc array of attributes
+     * @return Attribute array with bgcolor folded into style, if it was set
+     */
+    function transform($attr, $config, &$context) {
+        if (isset($attr['bgcolor'])) {
+            // some validation should happen here
+            $declaration = 'background-color:' . $attr['bgcolor'] . ';';
+            unset($attr['bgcolor']);
+            // prepend, so an existing style declaration comes later
+            $existing = isset($attr['style']) ? $attr['style'] : '';
+            $attr['style'] = $declaration . $existing;
+        }
+        return $attr;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Border.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Border.php
new file mode 100644 (file)
index 0000000..0b745d3
--- /dev/null
@@ -0,0 +1,28 @@
+<?php
+
+require_once 'HTMLPurifier/AttrTransform.php';
+
+/**
+ * Pre-transform: rewrites the deprecated border attribute as a CSS
+ * border declaration inside the style attribute.
+ */
+class HTMLPurifier_AttrTransform_Border
+extends HTMLPurifier_AttrTransform {
+
+    function transform($attr, $config, &$context) {
+        if (isset($attr['border'])) {
+            // some validation should happen here
+            $width = $attr['border'];
+            unset($attr['border']);
+            
+            // the new declaration goes in front of any existing style text
+            $tail = isset($attr['style']) ? $attr['style'] : '';
+            $attr['style'] = 'border:' . $width . 'px solid;' . $tail;
+        }
+        
+        return $attr;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php
new file mode 100644 (file)
index 0000000..4ff356d
--- /dev/null
@@ -0,0 +1,50 @@
+<?php
+
+require_once 'HTMLPurifier/AttrTransform.php';
+
+// must be called POST validation
+
+HTMLPurifier_ConfigSchema::define(
+    'Attr', 'DefaultInvalidImage', '', 'string',
+    'This is the default image an img tag will be pointed to if it does '.
+    'not have a valid src attribute.  In future versions, we may allow the '.
+    'image tag to be removed completely, but due to design issues, this is '.
+    'not possible right now.'
+);
+
+HTMLPurifier_ConfigSchema::define(
+    'Attr', 'DefaultInvalidImageAlt', 'Invalid image', 'string',
+    'This is the content of the alt tag of an invalid image if the user '.
+    'had not previously specified an alt attribute.  It has no effect when the '.
+    'image is valid but there was no alt attribute present.'
+);
+
+/**
+ * Post-transform that fills in img's required attributes (src and alt)
+ * whenever they are missing.
+ */
+class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
+{
+    
+    function transform($attr, $config, &$context) {
+        
+        // remember whether the src was supplied before it gets defaulted
+        $had_src = isset($attr['src']);
+        if (!$had_src) {
+            $attr['src'] = $config->get('Attr', 'DefaultInvalidImage');
+        }
+        
+        if (isset($attr['alt'])) return $attr;
+        
+        // real image: name it after its file; placeholder: configured text
+        $attr['alt'] = $had_src
+            ? basename($attr['src'])
+            : $config->get('Attr', 'DefaultInvalidImageAlt');
+        
+        return $attr;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Lang.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Lang.php
new file mode 100644 (file)
index 0000000..acb1786
--- /dev/null
@@ -0,0 +1,30 @@
+<?php
+
+require_once 'HTMLPurifier/AttrTransform.php';
+
+/**
+ * Post-transform that keeps lang and xml:lang in step: whichever is
+ * missing is copied from the other, and xml:lang wins if they differ.
+ * @note Theoretically speaking, this could be a pre-transform, but putting
+ *       post is more efficient.
+ */
+class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
+{
+    
+    function transform($attr, $config, &$context) {
+        $plain = isset($attr['lang']) ? $attr['lang'] : false;
+        $xml   = isset($attr['xml:lang']) ? $attr['xml:lang'] : false;
+        
+        if ($xml !== false) {
+            // xml:lang is authoritative whenever it is present
+            $attr['lang'] = $xml;
+        } elseif ($plain !== false) {
+            $attr['xml:lang'] = $plain;
+        }
+        
+        return $attr;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Length.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Length.php
new file mode 100644 (file)
index 0000000..16d3d1d
--- /dev/null
@@ -0,0 +1,33 @@
+<?php
+
+require_once 'HTMLPurifier/AttrTransform.php';
+
+/**
+ * Converts a width/height style length attribute into a CSS declaration.
+ */
+class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform
+{
+    /** Name of the HTML attribute to convert */
+    var $name;
+    /** Name of the CSS property that receives the value */
+    var $cssName;
+    
+    function HTMLPurifier_AttrTransform_Length($name, $css_name = null) {
+        // the CSS property defaults to the attribute's own name
+        $this->cssName = $css_name ? $css_name : $name;
+        $this->name = $name;
+    }
+    
+    function transform($attr, $config, &$context) {
+        $key = $this->name;
+        if (!isset($attr[$key])) return $attr;
+        // bare digits are pixel counts; anything else is passed through
+        $value = ctype_digit($attr[$key]) ? $attr[$key] . 'px' : $attr[$key];
+        unset($attr[$key]);
+        $rest = isset($attr['style']) ? $attr['style'] : '';
+        $attr['style'] = $this->cssName . ":$value;" . $rest;
+        return $attr;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Name.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Name.php
new file mode 100644 (file)
index 0000000..0f815b6
--- /dev/null
@@ -0,0 +1,31 @@
+<?php
+
+require_once 'HTMLPurifier/AttrTransform.php';
+
+/**
+ * Pre-transform that converts the deprecated name attribute into an id,
+ * unless the element already has an id.
+ */
+class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform
+{
+    
+    /**
+     * @return Attribute array without name; id is set from it when absent
+     */
+    function transform($attr, $config, &$context) {
+        
+        if (isset($attr['name'])) {
+            // an existing id takes precedence; name is dropped either way
+            if (!isset($attr['id'])) {
+                $attr['id'] = $attr['name'];
+            }
+            unset($attr['name']);
+        }
+        
+        return $attr;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/TextAlign.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/TextAlign.php
new file mode 100644 (file)
index 0000000..09088fe
--- /dev/null
@@ -0,0 +1,36 @@
+<?php
+
+require_once 'HTMLPurifier/AttrTransform.php';
+
+/**
+ * Pre-transform that turns the deprecated align attribute into a CSS
+ * text-align declaration.
+ */
+class HTMLPurifier_AttrTransform_TextAlign
+extends HTMLPurifier_AttrTransform {
+
+    /**
+     * @param $attr Assoc array of attributes
+     * @return Array without align; style gains text-align for known values
+     */
+    function transform($attr, $config, &$context) {
+        
+        if (!isset($attr['align'])) return $attr;
+        
+        // normalise, then drop the attribute whether or not it is usable
+        $keyword = strtolower(trim($attr['align']));
+        unset($attr['align']);
+        
+        $allowed = array('left', 'right', 'center', 'justify');
+        if (in_array($keyword, $allowed, true)) {
+            $style = isset($attr['style']) ? $attr['style'] : '';
+            $attr['style'] = "text-align:$keyword;" . $style;
+        }
+        
+        return $attr;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTypes.php b/lib/htmlpurifier/HTMLPurifier/AttrTypes.php
new file mode 100644 (file)
index 0000000..e13d0d3
--- /dev/null
@@ -0,0 +1,41 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef/HTML/ID.php';
+require_once 'HTMLPurifier/AttrDef/HTML/Length.php';
+require_once 'HTMLPurifier/AttrDef/HTML/MultiLength.php';
+require_once 'HTMLPurifier/AttrDef/HTML/Nmtokens.php';
+require_once 'HTMLPurifier/AttrDef/HTML/Pixels.php';
+require_once 'HTMLPurifier/AttrDef/Integer.php';
+require_once 'HTMLPurifier/AttrDef/Text.php';
+require_once 'HTMLPurifier/AttrDef/URI.php';
+
+/**
+ * Registry mapping attribute type names to HTMLPurifier_AttrDef validators
+ */
+class HTMLPurifier_AttrTypes
+{
+    /**
+     * Assoc array: attribute type identifier => validator instance
+     * @public
+     */
+    var $info = array();
+    
+    /** Populates the lookup array with one validator per supported type */
+    function HTMLPurifier_AttrTypes() {
+        $this->info = array(
+            'CDATA'       => new HTMLPurifier_AttrDef_Text(),
+            'ID'          => new HTMLPurifier_AttrDef_HTML_ID(),
+            'Length'      => new HTMLPurifier_AttrDef_HTML_Length(),
+            'MultiLength' => new HTMLPurifier_AttrDef_HTML_MultiLength(),
+            'NMTOKENS'    => new HTMLPurifier_AttrDef_HTML_Nmtokens(),
+            'Pixels'      => new HTMLPurifier_AttrDef_HTML_Pixels(),
+            'Text'        => new HTMLPurifier_AttrDef_Text(),
+            'URI'         => new HTMLPurifier_AttrDef_URI(),
+            
+            // number is really a positive integer (one or more digits)
+            'Number'      => new HTMLPurifier_AttrDef_Integer(false, false, true)
+        );
+    }
+}
+
+?>
diff --git a/lib/htmlpurifier/HTMLPurifier/CSSDefinition.php b/lib/htmlpurifier/HTMLPurifier/CSSDefinition.php
new file mode 100644 (file)
index 0000000..5de49b6
--- /dev/null
@@ -0,0 +1,213 @@
+<?php
+
+require_once 'HTMLPurifier/AttrDef/CSS/Background.php';
+require_once 'HTMLPurifier/AttrDef/CSS/BackgroundPosition.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Border.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Color.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Composite.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Font.php';
+require_once 'HTMLPurifier/AttrDef/CSS/FontFamily.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Length.php';
+require_once 'HTMLPurifier/AttrDef/CSS/ListStyle.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Multiple.php';
+require_once 'HTMLPurifier/AttrDef/CSS/Percentage.php';
+require_once 'HTMLPurifier/AttrDef/CSS/TextDecoration.php';
+require_once 'HTMLPurifier/AttrDef/CSS/URI.php';
+require_once 'HTMLPurifier/AttrDef/Enum.php';
+
+/**
+ * Defines allowed CSS attributes and what their values are.
+ * @see HTMLPurifier_HTMLDefinition
+ */
+class HTMLPurifier_CSSDefinition
+{
+    
+    /**
+     * Assoc array of attribute name to definition object.
+     */
+    var $info = array();
+    
+    /**
+     * Constructs the info array.  The meat of this class.
+     * @param $config Configuration object, forwarded to the ListStyle,
+     *                Background, Font and Border shorthand validators
+     */
+    function setup($config) {
+        $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
+            array('left', 'right', 'center', 'justify'), false);
+        
+        $border_style =
+        $this->info['border-bottom-style'] = 
+        $this->info['border-right-style'] = 
+        $this->info['border-left-style'] = 
+        $this->info['border-top-style'] =  new HTMLPurifier_AttrDef_Enum(
+            array('none', 'hidden', 'dotted', 'dashed', 'solid', 'double',
+            'groove', 'ridge', 'inset', 'outset'), false);
+        
+        $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style);
+        
+        $this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
+            array('none', 'left', 'right', 'both'), false);
+        $this->info['float'] = new HTMLPurifier_AttrDef_Enum(
+            array('none', 'left', 'right'), false);
+        $this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
+            array('normal', 'italic', 'oblique'), false);
+        $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
+            array('normal', 'small-caps'), false);
+        
+        $uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite(
+            array(
+                new HTMLPurifier_AttrDef_Enum(array('none')),
+                new HTMLPurifier_AttrDef_CSS_URI()
+            )
+        );
+        
+        $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
+            array('inside', 'outside'), false);
+        $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
+            array('disc', 'circle', 'square', 'decimal', 'lower-roman',
+            'upper-roman', 'lower-alpha', 'upper-alpha', 'none'), false);
+        $this->info['list-style-image'] = $uri_or_none;
+        
+        $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);
+        
+        $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
+            array('capitalize', 'uppercase', 'lowercase', 'none'), false);
+        $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();
+        
+        $this->info['background-image'] = $uri_or_none;
+        $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
+            array('repeat', 'repeat-x', 'repeat-y', 'no-repeat')
+        );
+        $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
+            array('scroll', 'fixed')
+        );
+        $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();
+        
+        $border_color = 
+        $this->info['border-top-color'] = 
+        $this->info['border-bottom-color'] = 
+        $this->info['border-left-color'] = 
+        $this->info['border-right-color'] = 
+        $this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_Enum(array('transparent')),
+            new HTMLPurifier_AttrDef_CSS_Color()
+        ));
+        
+        $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);
+        
+        $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color);
+        
+        $border_width = 
+        $this->info['border-top-width'] = 
+        $this->info['border-bottom-width'] = 
+        $this->info['border-left-width'] = 
+        $this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
+            new HTMLPurifier_AttrDef_CSS_Length(true) //disallow negative
+        ));
+        
+        $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width);
+        
+        $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_Enum(array('normal')),
+            new HTMLPurifier_AttrDef_CSS_Length()
+        ));
+        
+        $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_Enum(array('normal')),
+            new HTMLPurifier_AttrDef_CSS_Length()
+        ));
+        
+        $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_Enum(array('xx-small', 'x-small',
+                'small', 'medium', 'large', 'x-large', 'xx-large',
+                'larger', 'smaller')),
+            new HTMLPurifier_AttrDef_CSS_Percentage(),
+            new HTMLPurifier_AttrDef_CSS_Length()
+        ));
+        
+        $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_Enum(array('normal')),
+            new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
+            new HTMLPurifier_AttrDef_CSS_Length(true),
+            new HTMLPurifier_AttrDef_CSS_Percentage(true)
+        ));
+        
+        $margin =
+        $this->info['margin-top'] = 
+        $this->info['margin-bottom'] = 
+        $this->info['margin-left'] = 
+        $this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_CSS_Length(),
+            new HTMLPurifier_AttrDef_CSS_Percentage(),
+            new HTMLPurifier_AttrDef_Enum(array('auto'))
+        ));
+        
+        $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin);
+        
+        // non-negative
+        $padding =
+        $this->info['padding-top'] = 
+        $this->info['padding-bottom'] = 
+        $this->info['padding-left'] = 
+        $this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_CSS_Length(true),
+            new HTMLPurifier_AttrDef_CSS_Percentage(true)
+        ));
+        
+        $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding);
+        
+        $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_CSS_Length(),
+            new HTMLPurifier_AttrDef_CSS_Percentage()
+        ));
+        
+        $this->info['width'] =
+        $this->info['height'] = 
+        new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_CSS_Length(true),
+            new HTMLPurifier_AttrDef_CSS_Percentage(true),
+            new HTMLPurifier_AttrDef_Enum(array('auto'))
+        ));
+        
+        $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
+        
+        $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();
+        
+        // this could use specialized code
+        $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
+            array('normal', 'bold', 'bolder', 'lighter', '100', '200', '300',
+            '400', '500', '600', '700', '800', '900'), false);
+        
+        // MUST be called after other font properties, as it references
+        // a CSSDefinition object
+        $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);
+        
+        // same here
+        $this->info['border'] =
+        $this->info['border-bottom'] = 
+        $this->info['border-top'] = 
+        $this->info['border-left'] = 
+        $this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);
+        // the CSS 2.1 keywords for border-collapse are "collapse" and "separate"
+        $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(array(
+            'collapse', 'separate'));
+        
+        $this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(array(
+            'top', 'bottom'));
+        
+        $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(array(
+            'auto', 'fixed'));
+        
+        $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+            new HTMLPurifier_AttrDef_Enum(array('baseline', 'sub', 'super',
+                'top', 'text-top', 'middle', 'bottom', 'text-bottom')),
+            new HTMLPurifier_AttrDef_CSS_Length(),
+            new HTMLPurifier_AttrDef_CSS_Percentage()
+        ));
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef.php b/lib/htmlpurifier/HTMLPurifier/ChildDef.php
new file mode 100644 (file)
index 0000000..bed43ca
--- /dev/null
@@ -0,0 +1,55 @@
+<?php
+
+// HTMLPurifier_ChildDef and inheritance have three types of output:
+// true = leave nodes as is
+// false = delete parent node and all children
+// array(...) = replace children nodes with these
+
+// Registers the Core.EscapeInvalidChildren directive with the values false
+// and 'bool', followed by its human-readable description.
+// NOTE(review): the argument order (namespace, name, default, type,
+// description) is inferred from the values passed here -- confirm against
+// HTMLPurifier_ConfigSchema::define().
+HTMLPurifier_ConfigSchema::define(
+    'Core', 'EscapeInvalidChildren', false, 'bool',
+    'When true, a child is found that is not allowed in the context of the '.
+    'parent element will be transformed into text as if it were ASCII. When '.
+    'false, that element and all internal tags will be dropped, though text '.
+    'will be preserved.  There is no option for dropping the element but '.
+    'preserving child nodes.'
+);
+
+/**
+ * Base class for child definitions: describes which child nodes are allowed
+ * and validates a list of child tokens against that description.
+ *
+ * The class is abstract in spirit; validateChildren() must be overridden.
+ */
+class HTMLPurifier_ChildDef
+{
+    /**
+     * Name of the kind of child definition, usually the lowercased
+     * right-most part of the class name. Occasionally consulted for context.
+     * @public
+     */
+    var $type;
+
+    /**
+     * Bool telling whether an empty list of children is acceptable.
+     *
+     * Needed for re-checking: a change made to a child node may leave its
+     * parent in a state that is no longer allowed.
+     *
+     * @public
+     */
+    var $allow_empty;
+
+    /**
+     * Checks the child tokens against the definition and reports what has
+     * to change.
+     *
+     * @public
+     * @param $tokens_of_children Array of HTMLPurifier_Token
+     * @param $config HTMLPurifier_Config object
+     * @param $context HTMLPurifier_Context object
+     * @return bool true to leave the nodes as they are, bool false to
+     *         remove the parent node, or an array of replacement child
+     *         tokens
+     */
+    function validateChildren($tokens_of_children, $config, &$context) {
+        // subclasses are expected to override this method
+        $message = 'Call to abstract function';
+        trigger_error($message, E_USER_ERROR);
+    }
+}
+
+?>
diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Chameleon.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Chameleon.php
new file mode 100644 (file)
index 0000000..afe0299
--- /dev/null
@@ -0,0 +1,51 @@
+<?php
+
+require_once 'HTMLPurifier/ChildDef.php';
+
+/**
+ * Child definition that delegates to one of two definitions depending on
+ * the context the element appears in.
+ *
+ * The del and ins tags are the motivating case: which elements they may
+ * contain depends on whether they sit in a block or an inline context.
+ * Chameleon handles this by holding one definition per context. Although
+ * the mechanism is somewhat generalized, it is specifically intended for
+ * those two tags.
+ */
+class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
+{
+
+    /**
+     * Definition object applied in an inline context. Usually stricter.
+     * @public
+     */
+    var $inline;
+
+    /**
+     * Definition object applied in a block context.
+     * @public
+     */
+    var $block;
+
+    var $type = 'chameleon';
+
+    /**
+     * Builds one HTMLPurifier_ChildDef_Optional per context.
+     * @param $inline List of elements to allow when inline.
+     * @param $block List of elements to allow when block.
+     */
+    function HTMLPurifier_ChildDef_Chameleon($inline, $block) {
+        $this->inline = new HTMLPurifier_ChildDef_Optional($inline);
+        $this->block  = new HTMLPurifier_ChildDef_Optional($block);
+    }
+
+    /**
+     * Validates with the block definition only when the context variable
+     * 'IsInline' is exactly false; every other value selects the inline
+     * definition.
+     */
+    function validateChildren($tokens_of_children, $config, &$context) {
+        $is_inline = $context->get('IsInline');
+        if ($is_inline !== false) {
+            return $this->inline->validateChildren(
+                $tokens_of_children, $config, $context);
+        }
+        return $this->block->validateChildren(
+            $tokens_of_children, $config, $context);
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php
new file mode 100644 (file)
index 0000000..de18cd7
--- /dev/null
@@ -0,0 +1,75 @@
+<?php
+
+require_once 'HTMLPurifier/ChildDef.php';
+
+/**
+ * Custom validation class, accepts DTD child definitions
+ *
+ * @warning Currently this class is an all or nothing proposition, that is,
+ *          it will only give a bool return value.
+ * @note This class is currently not used by any code, although it is unit
+ *       tested.
+ */
+class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
+{
+    var $type = 'custom';
+    var $allow_empty = false;
+    /**
+     * Allowed child pattern as defined by the DTD
+     */
+    var $dtd_regex;
+    /**
+     * PCRE regex derived from $dtd_regex
+     * @private
+     */
+    var $_pcre_regex;
+    /**
+     * Stores the DTD pattern and compiles it right away.
+     * @param $dtd_regex Allowed child pattern from the DTD
+     */
+    function HTMLPurifier_ChildDef_Custom($dtd_regex) {
+        $this->dtd_regex = $dtd_regex;
+        $this->_compileRegex();
+    }
+    /**
+     * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
+     *
+     * Spaces are stripped, the pattern is wrapped in parentheses when it
+     * does not already start with one, every comma is made optional and
+     * every element name is rewritten to "(,?name)".
+     */
+    function _compileRegex() {
+        $raw = str_replace(' ', '', $this->dtd_regex);
+        // substr() replaces the $raw{0} offset syntax, which newer PHP
+        // versions no longer parse, and does not raise a notice when the
+        // pattern is an empty string
+        if (substr($raw, 0, 1) !== '(') {
+            $raw = "($raw)";
+        }
+        $reg = str_replace(',', ',?', $raw);
+        $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg);
+        $this->_pcre_regex = $reg;
+    }
+    /**
+     * Matches the comma separated names of the direct children against the
+     * compiled pattern.
+     *
+     * @param $tokens_of_children Array of HTMLPurifier_Token
+     * @param $config HTMLPurifier_Config object (not used here)
+     * @param $context HTMLPurifier_Context object (not used here)
+     * @return bool true when the children match the pattern, false otherwise
+     */
+    function validateChildren($tokens_of_children, $config, &$context) {
+        $list_of_children = '';
+        $nesting = 0; // depth into the nest
+        foreach ($tokens_of_children as $token) {
+            if (!empty($token->is_whitespace)) continue;
+
+            // must be decided before the depth is adjusted for this token
+            $is_child = ($nesting == 0); // direct
+
+            if ($token->type == 'start') {
+                $nesting++;
+            } elseif ($token->type == 'end') {
+                $nesting--;
+            }
+
+            if ($is_child) {
+                $list_of_children .= $token->name . ',';
+            }
+        }
+        $list_of_children = rtrim($list_of_children, ',');
+
+        $okay =
+            preg_match(
+                '/^'.$this->_pcre_regex.'$/',
+                $list_of_children
+            );
+
+        return (bool) $okay;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Empty.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Empty.php
new file mode 100644 (file)
index 0000000..1ab4fdd
--- /dev/null
@@ -0,0 +1,22 @@
+<?php
+
+require_once 'HTMLPurifier/ChildDef.php';
+
+/**
+ * Child definition for elements that may not contain anything.
+ * @warning validateChildren() in this class is actually never called, because
+ *          empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
+ *          before child definitions are parsed in earnest by
+ *          HTMLPurifier_Strategy_FixNesting.
+ */
+class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
+{
+    var $allow_empty = true;
+    var $type = 'empty';
+
+    /**
+     * Nothing to configure.
+     */
+    function HTMLPurifier_ChildDef_Empty() {}
+
+    /**
+     * Always answers with an empty replacement list, whatever was passed in.
+     * @return array empty array of child tokens
+     */
+    function validateChildren($tokens_of_children, $config, &$context) {
+        $no_children = array();
+        return $no_children;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Optional.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Optional.php
new file mode 100644 (file)
index 0000000..cc88832
--- /dev/null
@@ -0,0 +1,23 @@
+<?php
+
+require_once 'HTMLPurifier/ChildDef/Required.php';
+
+/**
+ * Definition that allows a set of elements and also accepts having no
+ * children at all.
+ * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required;
+ *       really, one shouldn't inherit from the other.  The only altered
+ *       behavior is that a false returned by the parent is replaced with an
+ *       empty array, so this class never returns false.
+ */
+class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
+{
+    var $allow_empty = true;
+    var $type = 'optional';
+
+    /**
+     * Runs the parent validation and turns its false result into array().
+     * @return bool true or an array of replacement child tokens
+     */
+    function validateChildren($tokens_of_children, $config, &$context) {
+        $outcome = parent::validateChildren($tokens_of_children, $config, $context);
+        return ($outcome === false) ? array() : $outcome;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Required.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Required.php
new file mode 100644 (file)
index 0000000..c6f706e
--- /dev/null
@@ -0,0 +1,107 @@
+<?php
+
+require_once 'HTMLPurifier/ChildDef.php';
+
+/**
+ * Definition that allows a set of elements, but disallows empty children.
+ */
+class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
+{
+    /**
+     * Lookup table of allowed elements.
+     * @public
+     */
+    var $elements = array();
+    /**
+     * Accepts the allowed elements as a '|' separated string, as a plain
+     * list of names, or as a ready-made lookup table (name => true).
+     * @param $elements List of allowed element names (lowercase).
+     */
+    function HTMLPurifier_ChildDef_Required($elements) {
+        if (is_string($elements)) {
+            $elements = str_replace(' ', '', $elements);
+            $elements = explode('|', $elements);
+        }
+        $keys = array_keys($elements);
+        // keys 0..n-1 mean a plain list, which is turned into a lookup table
+        if ($keys == array_keys($keys)) {
+            $elements = array_flip($elements);
+            foreach ($elements as $i => $x) {
+                $elements[$i] = true;
+                // drop empty names
+                if (empty($i)) unset($elements[$i]);
+            }
+        }
+        $this->elements = $elements;
+        $this->gen = new HTMLPurifier_Generator();
+    }
+    var $allow_empty = false;
+    var $type = 'required';
+    /**
+     * HTMLPurifier_Generator used to turn disallowed tokens into text.
+     * Declared explicitly so that it is not created as a dynamic property.
+     */
+    var $gen;
+    /**
+     * Filters the children down to the allowed elements.
+     *
+     * @param $tokens_of_children Array of HTMLPurifier_Token
+     * @param $config HTMLPurifier_Config object, read for
+     *                Core.EscapeInvalidChildren
+     * @param $context HTMLPurifier_Context object (not used here)
+     * @return bool false when nothing (or only whitespace) is left, bool
+     *         true when nothing had to change, otherwise the array of
+     *         replacement child tokens
+     */
+    function validateChildren($tokens_of_children, $config, &$context) {
+        // if there are no tokens, delete parent node
+        if (empty($tokens_of_children)) return false;
+
+        // the new set of children
+        $result = array();
+
+        // current depth into the nest
+        $nesting = 0;
+
+        // whether or not we're deleting a node
+        $is_deleting = false;
+
+        // whether or not parsed character data is allowed
+        // this controls whether or not we silently drop a tag
+        // or generate escaped HTML from it
+        $pcdata_allowed = isset($this->elements['#PCDATA']);
+
+        // a little sanity check to make sure it's not ALL whitespace
+        $all_whitespace = true;
+
+        // some configuration
+        $escape_invalid_children = $config->get('Core', 'EscapeInvalidChildren');
+
+        foreach ($tokens_of_children as $token) {
+            if (!empty($token->is_whitespace)) {
+                $result[] = $token;
+                continue;
+            }
+            $all_whitespace = false; // phew, we're not talking about whitespace
+
+            // must be decided before the depth is adjusted for this token
+            $is_child = ($nesting == 0);
+
+            if ($token->type == 'start') {
+                $nesting++;
+            } elseif ($token->type == 'end') {
+                $nesting--;
+            }
+
+            if ($is_child) {
+                // a direct child decides the fate of itself and of every
+                // token nested inside it
+                $is_deleting = !isset($this->elements[$token->name]);
+            }
+
+            // one disposal rule for direct children and nested tokens alike
+            if (!$is_deleting || ($pcdata_allowed && $token->type == 'text')) {
+                $result[] = $token;
+            } elseif ($pcdata_allowed && $escape_invalid_children) {
+                $result[] =
+                    new HTMLPurifier_Token_Text(
+                        $this->gen->generateFromToken( $token, $config )
+                    );
+            } else {
+                // drop silently
+            }
+        }
+        if (empty($result)) return false;
+        if ($all_whitespace) return false;
+        if ($tokens_of_children == $result) return true;
+        return $result;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php
new file mode 100644 (file)
index 0000000..9280a9f
--- /dev/null
@@ -0,0 +1,76 @@
+<?php
+
+require_once 'HTMLPurifier/ChildDef/Required.php';
+
+/**
+ * Child definition for blockquote in strict mode: the contents are accepted
+ * liberally first and then re-wrapped so that they validate.
+ */
+class   HTMLPurifier_ChildDef_StrictBlockquote
+extends HTMLPurifier_ChildDef_Required
+{
+    var $real_elements;
+    var $fake_elements;
+    var $allow_empty = true;
+    var $type = 'strictblockquote';
+    var $init = false;
+
+    /**
+     * Validates the children against the 'Flow' content set plus #PCDATA,
+     * then wraps every top-level run that starts with text or with an
+     * element outside $this->elements in the definition's block wrapper.
+     *
+     * @return array of child tokens; an empty array when the parent
+     *         validation returned false
+     */
+    function validateChildren($tokens_of_children, $config, &$context) {
+
+        $def = $config->getHTMLDefinition();
+        if (!$this->init) {
+            // first call: remember the real lookup and build the liberal one
+            $this->real_elements = $this->elements;
+            $this->fake_elements = $def->info_content_sets['Flow'];
+            $this->fake_elements['#PCDATA'] = true;
+            $this->init = true;
+        }
+
+        // let the parent class validate against the liberal lookup, then
+        // put the real one back
+        $this->elements = $this->fake_elements;
+        $result = parent::validateChildren($tokens_of_children, $config, $context);
+        $this->elements = $this->real_elements;
+
+        if ($result === false) return array();
+        if ($result === true) $result = $tokens_of_children;
+
+        $block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper);
+        $block_wrap_end   = new HTMLPurifier_Token_End(  $def->info_block_wrapper);
+        $is_inline = false;
+        $depth = 0;
+        $ret = array();
+
+        // assuming that there are no comment tokens
+        foreach ($result as $token) {
+            // wrappers are only opened or closed between top-level nodes
+            if (!$depth) {
+                if (!$is_inline) {
+                    if (
+                        $token->type == 'text' ||
+                        !isset($this->elements[$token->name])
+                    ) {
+                        // an inline run begins
+                        $is_inline = true;
+                        $ret[] = $block_wrap_start;
+                    }
+                } elseif (
+                    ($token->type == 'start' || $token->type == 'empty') &&
+                    isset($this->elements[$token->name])
+                ) {
+                    // an allowed element ends the inline run
+                    $ret[] = $block_wrap_end;
+                    $is_inline = false;
+                }
+            }
+            $ret[] = $token;
+            if ($token->type == 'start') $depth++;
+            if ($token->type == 'end')   $depth--;
+        }
+        if ($is_inline) $ret[] = $block_wrap_end;
+        return $ret;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php
new file mode 100644 (file)
index 0000000..3534cdd
--- /dev/null
@@ -0,0 +1,142 @@
+<?php
+
+require_once 'HTMLPurifier/ChildDef.php';
+
+/**
+ * Definition for tables
+ *
+ * Groups the direct children of a table into caption, columns, thead,
+ * tfoot and body content, and returns them in that order.
+ */
+class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
+{
+    var $allow_empty = false;
+    var $type = 'table';
+    function HTMLPurifier_ChildDef_Table() {}
+    /**
+     * @param $tokens_of_children Array of HTMLPurifier_Token
+     * @param $config HTMLPurifier_Config object (not used here)
+     * @param $context HTMLPurifier_Context object (not used here)
+     * @return bool false when there are no children or no tr/tbody content
+     *         was collected, bool true when the reordered tokens are
+     *         identical to the input, otherwise the array of reordered
+     *         tokens
+     */
+    function validateChildren($tokens_of_children, $config, &$context) {
+        if (empty($tokens_of_children)) return false;
+
+        // this ensures that the loop gets run one last time before closing
+        // up. It's a little bit of a hack, but it works! Just make sure you
+        // get rid of the token later.
+        $tokens_of_children[] = false;
+
+        // only one of these elements is allowed in a table
+        $caption = false;
+        $thead   = false;
+        $tfoot   = false;
+
+        // as many of these as you want
+        $cols    = array();
+        $content = array();
+
+        $nesting = 0; // current depth so we can determine nodes
+        $is_collecting = false; // are we globbing together tokens to package
+                                // into one of the collectors?
+        $collection = array(); // collected nodes
+        $tag_index = 0; // the first node might be whitespace,
+                            // so this tells us where the start tag is
+
+        foreach ($tokens_of_children as $token) {
+            // must be decided before the depth is adjusted for this token
+            $is_child = ($nesting == 0);
+
+            if ($token === false) {
+                // terminating sequence started
+            } elseif ($token->type == 'start') {
+                $nesting++;
+            } elseif ($token->type == 'end') {
+                $nesting--;
+            }
+
+            // handle node collection
+            if ($is_collecting) {
+                if ($is_child) {
+                    // okay, let's stash the tokens away
+                    // first token tells us the type of the collection
+                    switch ($collection[$tag_index]->name) {
+                        case 'tr':
+                        case 'tbody':
+                            $content[] = $collection;
+                            break;
+                        case 'caption':
+                            // only the first caption is kept
+                            if ($caption !== false) break;
+                            $caption = $collection;
+                            break;
+                        case 'thead':
+                        case 'tfoot':
+                            // access the appropriate variable, $thead or $tfoot
+                            $var = $collection[$tag_index]->name;
+                            if ($$var === false) {
+                                $$var = $collection;
+                            } else {
+                                // transmutate the first and last entries into
+                                // tbody tags, and then put into content
+                                $collection[$tag_index]->name = 'tbody';
+                                $collection[count($collection)-1]->name = 'tbody';
+                                $content[] = $collection;
+                            }
+                            break;
+                         case 'colgroup':
+                            $cols[] = $collection;
+                            break;
+                    }
+                    $collection = array();
+                    $is_collecting = false;
+                    $tag_index = 0;
+                } else {
+                    // add the node to the collection
+                    $collection[] = $token;
+                }
+            }
+
+            // terminate
+            if ($token === false) break;
+
+            if ($is_child) {
+                // determine what we're dealing with
+                if ($token->name == 'col') {
+                    // the only empty tag in the possie, we can handle it
+                    // immediately
+                    $cols[] = array_merge($collection, array($token));
+                    $collection = array();
+                    $tag_index = 0;
+                    continue;
+                }
+                // The switch is the last statement of the loop body, so
+                // "break" moves on to the next token.  A bare "continue"
+                // here behaves exactly like "break" but makes newer PHP
+                // versions emit a warning.
+                switch($token->name) {
+                    case 'caption':
+                    case 'colgroup':
+                    case 'thead':
+                    case 'tfoot':
+                    case 'tbody':
+                    case 'tr':
+                        $is_collecting = true;
+                        $collection[] = $token;
+                        break;
+                    default:
+                        // leading whitespace is kept with the node that
+                        // follows it; anything else is dropped
+                        if ($token->type == 'text' && $token->is_whitespace) {
+                            $collection[] = $token;
+                            $tag_index++;
+                        }
+                        break;
+                }
+            }
+        }
+
+        if (empty($content)) return false;
+
+        $ret = array();
+        if ($caption !== false) $ret = array_merge($ret, $caption);
+        // $cols is always an array, so it needs no guard
+        foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
+        if ($thead !== false)   $ret = array_merge($ret, $thead);
+        if ($tfoot !== false)   $ret = array_merge($ret, $tfoot);
+        foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
+        if (!empty($collection) && $is_collecting == false){
+            // grab the trailing space
+            $ret = array_merge($ret, $collection);
+        }
+
+        array_pop($tokens_of_children); // remove phantom token
+
+        return ($ret === $tokens_of_children) ? true : $ret;
+
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Config.php b/lib/htmlpurifier/HTMLPurifier/Config.php
new file mode 100644 (file)
index 0000000..c94e01f
--- /dev/null
@@ -0,0 +1,222 @@
+<?php
+
+/**
+ * Configuration object that triggers customizable behavior.
+ *
+ * @warning This class is strongly defined: that means that the class
+ *          will fail if an undefined directive is retrieved or set.
+ *
+ * @note Many classes that could (although many times don't) use the
+ *       configuration object make it a mandatory parameter.  This is
+ *       because a configuration object should always be forwarded,
+ *       otherwise, you run the risk of missing a parameter and then
+ *       being stumped when a configuration directive doesn't work.
+ */
+class HTMLPurifier_Config
+{
+
+    /**
+     * Two-level associative array of configuration directives
+     */
+    var $conf;
+
+    /**
+     * Reference HTMLPurifier_ConfigSchema for value checking
+     */
+    var $def;
+
+    /**
+     * Cached instance of HTMLPurifier_HTMLDefinition
+     */
+    var $html_definition;
+
+    /**
+     * Cached instance of HTMLPurifier_CSSDefinition
+     */
+    var $css_definition;
+
+    /**
+     * @param $definition HTMLPurifier_ConfigSchema that defines what directives
+     *                    are allowed.
+     */
+    function HTMLPurifier_Config(&$definition) {
+        $this->conf = $definition->defaults; // set up, copy in defaults
+        $this->def  = $definition; // keep a copy around for checking
+    }
+
+    /**
+     * Convenience constructor that creates a config object based on a mixed var
+     * @static
+     * @param mixed $config Variable that defines the state of the config
+     *                      object. Can be: a HTMLPurifier_Config() object,
+     *                      an array of directives based on loadArray(),
+     *                      or a string filename of an ini file.
+     * @return Configured HTMLPurifier_Config object
+     */
+    function create($config) {
+        if (is_a($config, 'HTMLPurifier_Config')) return $config;
+        $ret = HTMLPurifier_Config::createDefault();
+        if (is_string($config)) $ret->loadIni($config);
+        elseif (is_array($config)) $ret->loadArray($config);
+        return $ret;
+    }
+
+    /**
+     * Convenience constructor that creates a default configuration object.
+     * @static
+     * @return Default HTMLPurifier_Config object.
+     */
+    function createDefault() {
+        $definition =& HTMLPurifier_ConfigSchema::instance();
+        $config = new HTMLPurifier_Config($definition);
+        return $config;
+    }
+
+    /**
+     * Retrieves a value from the configuration.
+     * @param $namespace String namespace
+     * @param $key String key
+     * @param $from_alias Not used by this method
+     * @return The stored value; nothing when the directive is undefined
+     *         or is an alias (an error is triggered in both cases)
+     */
+    function get($namespace, $key, $from_alias = false) {
+        if (!isset($this->def->info[$namespace][$key])) {
+            trigger_error('Cannot retrieve value of undefined directive',
+                E_USER_WARNING);
+            return;
+        }
+        if ($this->def->info[$namespace][$key]->class == 'alias') {
+            trigger_error('Cannot get value from aliased directive, use real name',
+                E_USER_ERROR);
+            return;
+        }
+        return $this->conf[$namespace][$key];
+    }
+
+    /**
+     * Retrieves an array of directives to values from a given namespace
+     * @param $namespace String namespace
+     */
+    function getBatch($namespace) {
+        if (!isset($this->def->info[$namespace])) {
+            trigger_error('Cannot retrieve undefined namespace',
+                E_USER_WARNING);
+            return;
+        }
+        return $this->conf[$namespace];
+    }
+
+    /**
+     * Sets a value to configuration.
+     *
+     * An aliased directive is resolved to its target once.  The value is
+     * validated against the directive's type, value aliases are resolved
+     * and the list of allowed values is enforced before anything is stored.
+     * Storing a value in the HTML, Attr or CSS namespace drops the matching
+     * cached definition.
+     *
+     * @param $namespace String namespace
+     * @param $key String key
+     * @param $value Mixed value
+     * @param $from_alias Bool true when called while resolving an alias
+     */
+    function set($namespace, $key, $value, $from_alias = false) {
+        if (!isset($this->def->info[$namespace][$key])) {
+            trigger_error('Cannot set undefined directive to value',
+                E_USER_WARNING);
+            return;
+        }
+        if ($this->def->info[$namespace][$key]->class == 'alias') {
+            if ($from_alias) {
+                trigger_error('Double-aliases not allowed, please fix '.
+                    'ConfigSchema bug');
+                // stop here: following the chain any further would recurse
+                // without end if the aliases form a cycle
+                return;
+            }
+            $this->set($this->def->info[$namespace][$key]->namespace,
+                       $this->def->info[$namespace][$key]->name,
+                       $value, true);
+            return;
+        }
+        $value = $this->def->validate(
+                    $value,
+                    $this->def->info[$namespace][$key]->type,
+                    $this->def->info[$namespace][$key]->allow_null
+                 );
+        if (is_string($value)) {
+            // resolve value alias if defined
+            if (isset($this->def->info[$namespace][$key]->aliases[$value])) {
+                $value = $this->def->info[$namespace][$key]->aliases[$value];
+            }
+            if ($this->def->info[$namespace][$key]->allowed !== true) {
+                // check to see if the value is allowed
+                if (!isset($this->def->info[$namespace][$key]->allowed[$value])) {
+                    trigger_error('Value not supported', E_USER_WARNING);
+                    return;
+                }
+            }
+        }
+        if ($this->def->isError($value)) {
+            trigger_error('Value is of invalid type', E_USER_WARNING);
+            return;
+        }
+        $this->conf[$namespace][$key] = $value;
+        if ($namespace == 'HTML' || $namespace == 'Attr') {
+            // reset HTML definition if relevant attributes changed
+            $this->html_definition = null;
+        }
+        if ($namespace == 'CSS') {
+            $this->css_definition = null;
+        }
+    }
+
+    /**
+     * Retrieves reference to the HTML definition.
+     * @param $raw Return a copy that has not been setup yet. Must be
+     *             called before it's been setup, otherwise won't work.
+     */
+    function &getHTMLDefinition($raw = false) {
+        if (
+            empty($this->html_definition) || // hasn't ever been setup
+            ($raw && $this->html_definition->setup) // requesting new one
+        ) {
+            $this->html_definition = new HTMLPurifier_HTMLDefinition($this);
+            if ($raw) return $this->html_definition; // no setup!
+        }
+        if (!$this->html_definition->setup) $this->html_definition->setup();
+        return $this->html_definition;
+    }
+
+    /**
+     * Retrieves reference to the CSS definition
+     */
+    function &getCSSDefinition() {
+        if ($this->css_definition === null) {
+            $this->css_definition = new HTMLPurifier_CSSDefinition();
+            $this->css_definition->setup($this);
+        }
+        return $this->css_definition;
+    }
+
+    /**
+     * Loads configuration values from an array with the following structure:
+     * Namespace.Directive => Value
+     *
+     * Underscores in keys are treated as dots.  A key without a dot is
+     * taken as a namespace whose value is an array of Directive => Value.
+     * @param $config_array Configuration associative array
+     */
+    function loadArray($config_array) {
+        foreach ($config_array as $key => $value) {
+            $key = str_replace('_', '.', $key);
+            if (strpos($key, '.') !== false) {
+                // condensed form
+                list($namespace, $directive) = explode('.', $key);
+                $this->set($namespace, $directive, $value);
+            } else {
+                $namespace = $key;
+                $namespace_values = $value;
+                foreach ($namespace_values as $directive => $value) {
+                    $this->set($namespace, $directive, $value);
+                }
+            }
+        }
+    }
+
+    /**
+     * Loads configuration values from an ini file
+     * @param $filename Name of ini file
+     */
+    function loadIni($filename) {
+        $array = parse_ini_file($filename, true);
+        if (!is_array($array)) {
+            // parse_ini_file() gives false on failure; report it instead
+            // of handing a non-array to loadArray()
+            trigger_error('Cannot load configuration from ini file',
+                E_USER_WARNING);
+            return;
+        }
+        $this->loadArray($array);
+    }
+
+}
+
+?>
diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigDef.php b/lib/htmlpurifier/HTMLPurifier/ConfigDef.php
new file mode 100644 (file)
index 0000000..b92640d
--- /dev/null
@@ -0,0 +1,10 @@
+<?php
+
+/**
+ * Base class for configuration entity
+ */
+class HTMLPurifier_ConfigDef {
+    // Kind of configuration entity. The subclasses in this directory set it
+    // to 'directive', 'alias' or 'namespace'; the base class leaves it false.
+    var $class = false;
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigDef/Directive.php b/lib/htmlpurifier/HTMLPurifier/ConfigDef/Directive.php
new file mode 100644 (file)
index 0000000..3902654
--- /dev/null
@@ -0,0 +1,74 @@
+<?php
+
+require_once 'HTMLPurifier/ConfigDef.php';
+
+/**
+ * Plain structure holding the definition of one directive.
+ * @note Default values are not stored in this structure
+ */
+class HTMLPurifier_ConfigDef_Directive extends HTMLPurifier_ConfigDef
+{
+
+    var $class = 'directive';
+
+    /**
+     * Allowed type of the directive. Values are:
+     *      - string
+     *      - istring (case insensitive string)
+     *      - int
+     *      - float
+     *      - bool
+     *      - lookup (array of value => true)
+     *      - list (regular numbered index array)
+     *      - hash (array of key => value)
+     *      - mixed (anything goes)
+     */
+    var $type = 'mixed';
+
+    /**
+     * Plaintext descriptions of what the configuration entity is. They are
+     * organized by file and line number, so multiple descriptions are
+     * allowed.
+     */
+    var $descriptions = array();
+
+    /**
+     * Is null allowed? Has no effect for mixed type.
+     * @bool
+     */
+    var $allow_null = false;
+
+    /**
+     * Lookup table of allowed values of the element, bool true if all allowed.
+     */
+    var $allowed = true;
+
+    /**
+     * Hash of value aliases, i.e. values that are equivalent.
+     */
+    var $aliases = array();
+
+    /**
+     * Every argument is optional; an argument left at null keeps the
+     * default of the property with the same name.
+     */
+    function HTMLPurifier_ConfigDef_Directive(
+        $type = null,
+        $descriptions = null,
+        $allow_null = null,
+        $allowed = null,
+        $aliases = null
+    ) {
+        $overrides = array(
+            'type'         => $type,
+            'descriptions' => $descriptions,
+            'allow_null'   => $allow_null,
+            'allowed'      => $allowed,
+            'aliases'      => $aliases
+        );
+        foreach ($overrides as $property => $given) {
+            if ($given !== null) {
+                $this->$property = $given;
+            }
+        }
+    }
+
+    /**
+     * Records a description under the file and line it came from.
+     */
+    function addDescription($file, $line, $description) {
+        if (!isset($this->descriptions[$file])) {
+            $this->descriptions[$file] = array();
+        }
+        $this->descriptions[$file][$line] = $description;
+    }
+
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigDef/DirectiveAlias.php b/lib/htmlpurifier/HTMLPurifier/ConfigDef/DirectiveAlias.php
new file mode 100644 (file)
index 0000000..81a4451
--- /dev/null
@@ -0,0 +1,27 @@
+<?php
+
+require_once 'HTMLPurifier/ConfigDef.php';
+
+/**
+ * Structure object describing a directive alias
+ */
+class HTMLPurifier_ConfigDef_DirectiveAlias extends HTMLPurifier_ConfigDef
+{
+    var $class = 'alias';
+    
+    /**
+     * Namespace being aliased to
+     */
+    var $namespace;
+    /**
+     * Directive being aliased to
+     */
+    var $name;
+    
+    function HTMLPurifier_ConfigDef_DirectiveAlias($namespace, $name) {
+        $this->namespace = $namespace;
+        $this->name = $name;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigDef/Namespace.php b/lib/htmlpurifier/HTMLPurifier/ConfigDef/Namespace.php
new file mode 100644 (file)
index 0000000..f53892b
--- /dev/null
@@ -0,0 +1,23 @@
+<?php
+
+require_once 'HTMLPurifier/ConfigDef.php';
+
+/**
+ * Structure object describing a namespace
+ */
+class HTMLPurifier_ConfigDef_Namespace extends HTMLPurifier_ConfigDef {
+    
+    function HTMLPurifier_ConfigDef_Namespace($description = null) {
+        $this->description = $description;
+    }
+    
+    var $class = 'namespace';
+    
+    /**
+     * String description of what kinds of directives go in this namespace.
+     */
+    var $description;
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema.php b/lib/htmlpurifier/HTMLPurifier/ConfigSchema.php
new file mode 100644 (file)
index 0000000..9f1f3e3
--- /dev/null
@@ -0,0 +1,386 @@
+<?php
+
+require_once 'HTMLPurifier/Error.php';
+require_once 'HTMLPurifier/ConfigDef.php';
+require_once 'HTMLPurifier/ConfigDef/Namespace.php';
+require_once 'HTMLPurifier/ConfigDef/Directive.php';
+require_once 'HTMLPurifier/ConfigDef/DirectiveAlias.php';
+
+/**
+ * Configuration definition, defines directives and their defaults.
+ * @todo The ability to define things multiple times is confusing and should
+ *       be factored out to its own function named registerDependency() or 
+ *       addNote(), where only the namespace.name and an extra descriptions
+ *       documenting the nature of the dependency are needed.  Since it's
+ *       possible that the dependency is registered before the configuration
+ *       is defined, deferring it to some sort of cache until it actually
+ *       gets defined would be wise, keeping it opaque until it does get
+ *       defined. We could add a finalize() method which would cause it to
+ *       error out if we get a dangling dependency.  It's difficult, however,
+ *       to know whether or not it's a dependency, or a codependency, that is
+ *       neither of them fully depends on it. Where does the configuration go
+ *       then?  This could be partially resolved by allowing blanket definitions
+ *       and then splitting them up into finer-grained versions, however, there
+ *       might be implementation difficulties in ini files regarding order of
+ *       execution.
+ */
+class HTMLPurifier_ConfigSchema {
+    
+    /**
+     * Defaults of the directives and namespaces.
+     * @note This shares the exact same structure as HTMLPurifier_Config::$conf
+     */
+    var $defaults = array();
+    
+    /**
+     * Definition of the directives.
+     */
+    var $info = array();
+    
+    /**
+     * Definition of namespaces.
+     */
+    var $info_namespace = array();
+    
+    /**
+     * Lookup table of allowed types.
+     */
+    var $types = array(
+        'string'    => 'String',
+        'istring'   => 'Case-insensitive string',
+        'int'       => 'Integer',
+        'float'     => 'Float',
+        'bool'      => 'Boolean',
+        'lookup'    => 'Lookup array',
+        'list'      => 'Array list',
+        'hash'      => 'Associative array',
+        'mixed'     => 'Mixed'
+    );
+    
+    /**
+     * Initializes the default namespaces.
+     */
+    function initialize() {
+        $this->defineNamespace('Core', 'Core features that are always available.');
+        $this->defineNamespace('Attr', 'Features regarding attribute validation.');
+        $this->defineNamespace('URI', 'Features regarding Uniform Resource Identifiers.');
+        $this->defineNamespace('HTML', 'Configuration regarding allowed HTML.');
+        $this->defineNamespace('CSS', 'Configuration regarding allowed CSS.');
+        $this->defineNamespace('Test', 'Developer testing configuration for our unit tests.');
+    }
+    
+    /**
+     * Retrieves an instance of the application-wide configuration definition.
+     * @static
+     */
+    function &instance($prototype = null) {
+        static $instance;
+        if ($prototype !== null && $prototype !== true) {
+            $instance = $prototype; // install the caller-supplied schema
+        } elseif ($instance === null || $prototype === true) { // first use, or reset via true
+            $instance = new HTMLPurifier_ConfigSchema();
+            $instance->initialize();
+        }
+        return $instance;
+    }
+    
+    /**
+     * Defines a directive for configuration
+     * @static
+     * @warning Will fail if directive's namespace is undefined
+     * @param $namespace Namespace the directive is in
+     * @param $name Key of directive
+     * @param $default Default value of directive
+     * @param $type Allowed type of the directive. See
+     *      HTMLPurifier_DirectiveDef::$type for allowed values
+     * @param $description Description of directive for documentation
+     */
+    function define(
+        $namespace, $name, $default, $type, 
+        $description
+    ) {
+        $def =& HTMLPurifier_ConfigSchema::instance();
+        if (!isset($def->info[$namespace])) {
+            trigger_error('Cannot define directive for undefined namespace',
+                E_USER_ERROR);
+            return;
+        }
+        if (!ctype_alnum($name)) {
+            trigger_error('Directive name must be alphanumeric',
+                E_USER_ERROR);
+            return;
+        }
+        if (empty($description)) {
+            trigger_error('Description must be non-empty',
+                E_USER_ERROR);
+            return;
+        }
+        // split off modifiers first ('string/null' => 'string', null allowed)
+        // so a redefinition is compared against the stored base type
+        $type_values = explode('/', $type, 2);
+        $type = $type_values[0];
+        $modifier = isset($type_values[1]) ? $type_values[1] : false;
+        $allow_null = ($modifier === 'null');
+        if (isset($def->info[$namespace][$name])) {
+            if (
+                $def->info[$namespace][$name]->type       !== $type ||
+                $def->info[$namespace][$name]->allow_null !== $allow_null ||
+                $def->defaults[$namespace][$name]         !== $default
+            ) {
+                trigger_error('Inconsistent default or type, cannot redefine');
+                return;
+            }
+        } else {
+            if (!isset($def->types[$type])) {
+                trigger_error('Invalid type for configuration directive',
+                    E_USER_ERROR);
+                return;
+            }
+            $default = $def->validate($default, $type, $allow_null);
+            if ($def->isError($default)) {
+                trigger_error('Default value does not match directive type',
+                    E_USER_ERROR);
+                return;
+            }
+            $def->info[$namespace][$name] = new HTMLPurifier_ConfigDef_Directive();
+            $def->info[$namespace][$name]->type = $type;
+            $def->info[$namespace][$name]->allow_null = $allow_null;
+            $def->defaults[$namespace][$name]   = $default;
+        }
+        $backtrace = debug_backtrace();
+        $file = $def->mungeFilename($backtrace[0]['file']);
+        $line = $backtrace[0]['line'];
+        $def->info[$namespace][$name]->addDescription($file,$line,$description);
+    }
+    
+    /**
+     * Defines a namespace for directives to be put into.
+     * @static
+     * @param $namespace Namespace's name
+     * @param $description Description of the namespace
+     */
+    function defineNamespace($namespace, $description) {
+        $def =& HTMLPurifier_ConfigSchema::instance();
+        if (isset($def->info[$namespace])) {
+            trigger_error('Cannot redefine namespace', E_USER_ERROR);
+            return;
+        }
+        if (!ctype_alnum($namespace)) {
+            trigger_error('Namespace name must be alphanumeric',
+                E_USER_ERROR);
+            return;
+        }
+        if (empty($description)) {
+            trigger_error('Description must be non-empty',
+                E_USER_ERROR);
+            return;
+        }
+        $def->info[$namespace] = array();
+        $def->info_namespace[$namespace] = new HTMLPurifier_ConfigDef_Namespace();
+        $def->info_namespace[$namespace]->description = $description;
+        $def->defaults[$namespace] = array();
+    }
+    
+    /**
+     * Defines a directive value alias.
+     * 
+     * Directive value aliases are convenient for developers because it lets
+     * them set a directive to several values and get the same result.
+     * @static
+     * @param $namespace Directive's namespace
+     * @param $name Name of Directive
+     * @param $aliases Hash mapping each alias (key) to the real value (value)
+     *      the alias will be converted into
+     */
+    function defineValueAliases($namespace, $name, $aliases) {
+        $def =& HTMLPurifier_ConfigSchema::instance();
+        if (!isset($def->info[$namespace][$name])) {
+            trigger_error('Cannot set value alias for non-existant directive',
+                E_USER_ERROR);
+            return;
+        }
+        $directive =& $def->info[$namespace][$name];
+        foreach ($aliases as $alias => $real) {
+            // allowed === true means every value is allowed: skip the lookup check
+            if ($directive->allowed !== true && !isset($directive->allowed[$real])) {
+                trigger_error('Cannot define alias to value that is not allowed',
+                    E_USER_ERROR);
+                return;
+            }
+            if (isset($directive->allowed[$alias])) {
+                trigger_error('Cannot define alias over allowed value',
+                    E_USER_ERROR);
+                return;
+            }
+            $directive->aliases[$alias] = $real;
+        }
+    }
+    
+    /**
+     * Defines a set of allowed values for a directive.
+     * @static
+     * @param $namespace Namespace of directive
+     * @param $name Name of directive
+     * @param $allowed_values Arraylist of allowed values
+     */
+    function defineAllowedValues($namespace, $name, $allowed_values) {
+        $def =& HTMLPurifier_ConfigSchema::instance();
+        if (!isset($def->info[$namespace][$name])) {
+            trigger_error('Cannot define allowed values for undefined directive',
+                E_USER_ERROR);
+            return;
+        }
+        $directive =& $def->info[$namespace][$name];
+        $type = $directive->type;
+        if ($type != 'string' && $type != 'istring') {
+            trigger_error('Cannot define allowed values for directive whose type is not string',
+                E_USER_ERROR);
+            return;
+        }
+        if ($directive->allowed === true) {
+            $directive->allowed = array();
+        }
+        foreach ($allowed_values as $value) {
+            $directive->allowed[$value] = true;
+        }
+        if ($def->defaults[$namespace][$name] !== null &&
+            !isset($directive->allowed[$def->defaults[$namespace][$name]])) {
+            trigger_error('Default value must be in allowed range of variables',
+                E_USER_ERROR);
+            $directive->allowed = true; // undo undo!
+            return;
+        }
+    }
+    
+    /**
+     * Defines a directive alias for backwards compatibility
+     * @static
+     * @param $namespace
+     * @param $name Directive that will be aliased
+     * @param $new_namespace
+     * @param $new_name Directive that the alias will be to
+     */
+    function defineAlias($namespace, $name, $new_namespace, $new_name) {
+        $def =& HTMLPurifier_ConfigSchema::instance();
+        if (!isset($def->info[$namespace])) {
+            trigger_error('Cannot define directive alias in undefined namespace',
+                E_USER_ERROR);
+            return;
+        }
+        if (!ctype_alnum($name)) {
+            trigger_error('Directive name must be alphanumeric',
+                E_USER_ERROR);
+            return;
+        }
+        if (isset($def->info[$namespace][$name])) {
+            trigger_error('Cannot define alias over directive',
+                E_USER_ERROR);
+            return;
+        }
+        if (!isset($def->info[$new_namespace][$new_name])) {
+            trigger_error('Cannot define alias to undefined directive',
+                E_USER_ERROR);
+            return;
+        }
+        if ($def->info[$new_namespace][$new_name]->class == 'alias') {
+            trigger_error('Cannot define alias to alias',
+                E_USER_ERROR);
+            return;
+        }
+        $def->info[$namespace][$name] =
+            new HTMLPurifier_ConfigDef_DirectiveAlias(
+                $new_namespace, $new_name);
+    }
+    
+    /**
+     * Validate a variable according to type. Returns an HTMLPurifier_Error if invalid.
+     */
+    function validate($var, $type, $allow_null = false) {
+        if (!isset($this->types[$type])) {
+            trigger_error('Invalid type', E_USER_ERROR);
+            return;
+        }
+        if ($allow_null && $var === null) return null;
+        switch ($type) {
+            case 'mixed':
+                return $var;
+            case 'istring':
+            case 'string':
+                if (!is_string($var)) break;
+                if ($type === 'istring') $var = strtolower($var);
+                return $var;
+            case 'int':
+                if (is_string($var) && ctype_digit($var)) $var = (int) $var;
+                elseif (!is_int($var)) break;
+                return $var;
+            case 'float':
+                if (is_string($var) && is_numeric($var)) $var = (float) $var;
+                elseif (!is_float($var)) break;
+                return $var;
+            case 'bool':
+                if (is_int($var) && ($var === 0 || $var === 1)) {
+                    $var = (bool) $var;
+                } elseif (is_string($var)) {
+                    if ($var == 'on' || $var == 'true' || $var == '1') {
+                        $var = true;
+                    } elseif ($var == 'off' || $var == 'false' || $var == '0') {
+                        $var = false;
+                    } else {
+                        break;
+                    }
+                } elseif (!is_bool($var)) break;
+                return $var;
+            case 'list':
+            case 'hash':
+            case 'lookup':
+                if (is_string($var)) {
+                    // simplistic string to array method that only works
+                    // for simple lists of tag names or alphanumeric characters
+                    $var = explode(',',$var);
+                    // remove spaces
+                    foreach ($var as $i => $j) $var[$i] = trim($j);
+                }
+                if (!is_array($var)) break;
+                $keys = array_keys($var);
+                if ($keys === array_keys($keys)) {
+                    if ($type == 'list') return $var;
+                    elseif ($type == 'lookup') {
+                        $new = array();
+                        foreach ($var as $key) {
+                            $new[$key] = true;
+                        }
+                        return $new;
+                    } else break;
+                }
+                if ($type === 'lookup') {
+                    foreach ($var as $key => $value) {
+                        $var[$key] = true;
+                    }
+                }
+                return $var;
+        }
+        $error = new HTMLPurifier_Error();
+        return $error;
+    }
+    
+    /**
+     * Takes an absolute path and munges it into a more manageable relative path
+     */
+    function mungeFilename($filename) {
+        // keep the path from the last occurrence of 'HTMLPurifier' onwards
+        $start = strrpos($filename, 'HTMLPurifier');
+        $relative = substr($filename, $start);
+        return str_replace('\\', '/', $relative); // backslashes become slashes
+    }
+    
+    /**
+     * Checks if var is an HTMLPurifier_Error object
+     */
+    function isError($var) {
+        // only objects can qualify; is_a() then settles whether the object
+        // is an HTMLPurifier_Error
+        return is_object($var) && is_a($var, 'HTMLPurifier_Error');
+    }
+}
+
+?>
diff --git a/lib/htmlpurifier/HTMLPurifier/ContentSets.php b/lib/htmlpurifier/HTMLPurifier/ContentSets.php
new file mode 100644 (file)
index 0000000..de5c532
--- /dev/null
@@ -0,0 +1,148 @@
+<?php
+
+// common defs that we'll support by default
+require_once 'HTMLPurifier/ChildDef.php';
+require_once 'HTMLPurifier/ChildDef/Empty.php';
+require_once 'HTMLPurifier/ChildDef/Required.php';
+require_once 'HTMLPurifier/ChildDef/Optional.php';
+
+class HTMLPurifier_ContentSets
+{
+    
+    /**
+     * List of content set strings (pipe separators) indexed by name.
+     * @public
+     */
+    var $info = array();
+    
+    /**
+     * List of content set lookups (element => true) indexed by name.
+     * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
+     * @public
+     */
+    var $lookup = array();
+    
+    /**
+     * Synchronized list of defined content sets (keys of info)
+     */
+    var $keys = array();
+    /**
+     * Synchronized list of defined content values (values of info)
+     */
+    var $values = array();
+    
+    /**
+     * Merges in module's content sets, expands identifiers in the content
+     * sets and populates the keys, values and lookup member variables.
+     * @param $modules List of HTMLPurifier_HTMLModule
+     */
+    function HTMLPurifier_ContentSets($modules) {
+        if (!is_array($modules)) $modules = array($modules);
+        // populate content_sets based on module hints
+        // sorry, no way of overloading
+        foreach ($modules as $module_i => $module) {
+            foreach ($module->content_sets as $key => $value) {
+                if (isset($this->info[$key])) {
+                    // add it into the existing content set
+                    $this->info[$key] = $this->info[$key] . ' | ' . $value;
+                } else {
+                    $this->info[$key] = $value;
+                }
+            }
+        }
+        // perform content_set expansions
+        $this->keys = array_keys($this->info);
+        foreach ($this->info as $i => $set) {
+            // only performed once, so infinite recursion is not
+            // a problem
+            $this->info[$i] =
+                str_replace(
+                    $this->keys,
+                    // must be recalculated each time due to
+                    // changing substitutions
+                    array_values($this->info),
+                $set);
+        }
+        $this->values = array_values($this->info);
+        
+        // generate lookup tables
+        foreach ($this->info as $name => $set) {
+            $this->lookup[$name] = $this->convertToLookup($set);
+        }
+    }
+    
+    /**
+     * Accepts a definition; generates and assigns a ChildDef for it
+     * @param $def HTMLPurifier_ElementDef reference
+     * @param $module Module that defined the ElementDef
+     */
+    function generateChildDef(&$def, $module) {
+        if (!empty($def->child)) return; // already done!
+        $content_model = $def->content_model;
+        if (is_string($content_model)) {
+            $def->content_model = str_replace(
+                $this->keys, $this->values, $content_model);
+        }
+        $def->child = $this->getChildDef($def, $module);
+    }
+    
+    /**
+     * Instantiates a ChildDef based on content_model and content_model_type
+     * member variables in HTMLPurifier_ElementDef
+     * @note This will also defer to modules for custom HTMLPurifier_ChildDef
+     *       subclasses that need content set expansion
+     * @param $def HTMLPurifier_ElementDef to have ChildDef extracted
+     * @return HTMLPurifier_ChildDef corresponding to ElementDef
+     */
+    function getChildDef($def, $module) {
+        $value = $def->content_model;
+        if (is_object($value)) {
+            trigger_error(
+                'Literal object child definitions should be stored in '.
+                'ElementDef->child not ElementDef->content_model',
+                E_USER_NOTICE
+            );
+            return $value;
+        }
+        switch ($def->content_model_type) {
+            case 'required':
+                return new HTMLPurifier_ChildDef_Required($value);
+            case 'optional':
+                return new HTMLPurifier_ChildDef_Optional($value);
+            case 'empty':
+                return new HTMLPurifier_ChildDef_Empty();
+            case 'custom':
+                return new HTMLPurifier_ChildDef_Custom($value);
+        }
+        // defer to its module
+        $return = false;
+        if ($module->defines_child_def) { // save a func call
+            $return = $module->getChildDef($def);
+        }
+        if ($return !== false) return $return;
+        // error-out
+        trigger_error(
+            'Could not determine which ChildDef class to instantiate',
+            E_USER_ERROR
+        );
+        return false;
+    }
+    
+    /**
+     * Converts a string list of elements separated by pipes into
+     * a lookup array.
+     * @param $string List of elements
+     * @return Lookup array of elements
+     */
+    function convertToLookup($string) {
+        $lookup = array();
+        // spaces are stripped first; pipes then delimit the element names
+        foreach (explode('|', str_replace(' ', '', $string)) as $element) {
+            $lookup[$element] = true;
+        }
+        return $lookup;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Context.php b/lib/htmlpurifier/HTMLPurifier/Context.php
new file mode 100644 (file)
index 0000000..ce6fe51
--- /dev/null
@@ -0,0 +1,76 @@
+<?php
+
+/**
+ * Registry object that contains information about the current context.
+ */
+class HTMLPurifier_Context
+{
+    
+    /**
+     * Private array that stores the references.
+     * @private
+     */
+    var $_storage = array();
+    
+    /**
+     * Registers a variable into the context.
+     * @param $name String name
+     * @param $ref Variable to be registered
+     */
+    function register($name, &$ref) {
+        if (isset($this->_storage[$name])) {
+            trigger_error('Name collision, cannot re-register',
+                          E_USER_ERROR);
+            return;
+        }
+        $this->_storage[$name] =& $ref;
+    }
+    
+    /**
+     * Retrieves a variable reference from the context.
+     * @param $name String name
+     */
+    function &get($name) {
+        if (!isset($this->_storage[$name])) {
+            trigger_error('Attempted to retrieve non-existent variable',
+                          E_USER_ERROR);
+            $var = null; // so we can return by reference
+            return $var;
+        }
+        return $this->_storage[$name];
+    }
+    
+    /**
+     * Destroys a variable in the context.
+     * @param $name String name
+     */
+    function destroy($name) {
+        if (!isset($this->_storage[$name])) {
+            trigger_error('Attempted to destroy non-existent variable',
+                          E_USER_ERROR);
+            return;
+        }
+        unset($this->_storage[$name]);
+    }
+    
+    /**
+     * Checks whether or not the variable exists.
+     * @param $name String name
+     */
+    function exists($name) {
+        return isset($this->_storage[$name]);
+    }
+    
+    /**
+     * Loads a series of variables from an associative array
+     * @param $context_array Assoc array of variables to load
+     */
+    function loadArray(&$context_array) {
+        foreach ($context_array as $key => $discard) {
+            $this->register($key, $context_array[$key]);
+        }
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ElementDef.php b/lib/htmlpurifier/HTMLPurifier/ElementDef.php
new file mode 100644 (file)
index 0000000..21bc5f3
--- /dev/null
@@ -0,0 +1,122 @@
+<?php
+
+/**
+ * Structure that stores an HTML element definition. Used by
+ * HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
+ */
+class HTMLPurifier_ElementDef
+{
+    
+    /**
+     * Does the definition work by itself, or is it created solely
+     * for the purpose of merging into another definition?
+     */
+    var $standalone = true;
+    
+    /**
+     * Associative array of attribute name to HTMLPurifier_AttrDef
+     * @note Before being processed by HTMLPurifier_AttrCollections
+     *       when modules are finalized during
+     *       HTMLPurifier_HTMLDefinition->setup(), this array may also
+     *       contain an array at index 0 that indicates which attribute
+     *       collections to load into the full array. It may also
+     *       contain string identifiers in lieu of HTMLPurifier_AttrDef,
+     *       see HTMLPurifier_AttrTypes on how they are expanded during
+     *       HTMLPurifier_HTMLDefinition->setup() processing.
+     * @public
+     */
+    var $attr = array();
+    
+    /**
+     * Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
+     * @public
+     */
+    var $attr_transform_pre = array();
+    
+    /**
+     * Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
+     * @public
+     */
+    var $attr_transform_post = array();
+    
+    
+    
+    /**
+     * HTMLPurifier_ChildDef of this tag.
+     * @public
+     */
+    var $child;
+    
+    /**
+     * Abstract string representation of internal ChildDef rules. See
+     * HTMLPurifier_ContentSets for how this is parsed and then transformed
+     * into an HTMLPurifier_ChildDef.
+     * @public
+     */
+    var $content_model;
+    
+    /**
+     * Value of $child->type, used to determine which ChildDef to use,
+     * used in combination with $content_model.
+     * @public
+     */
+    var $content_model_type;
+    
+    
+    
+    /**
+     * Lookup table of tags that close this tag. Used during parsing
+     * to make sure we don't attempt to nest unclosed tags.
+     * @public
+     */
+    var $auto_close = array();
+    
+    /**
+     * Does the element have a content model (#PCDATA | Inline)*? This
+     * is important for chameleon ins and del processing in 
+     * HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
+     * have to worry about this one.
+     * @public
+     */
+    var $descendants_are_inline;
+    
+    /**
+     * Lookup table of tags excluded from all descendants of this tag.
+     * @public
+     */
+    var $excludes = array();
+    
+    /**
+     * Merges the values of another element definition into this one.
+     * Values from the new element def take precedence if a value is
+     * not mergeable.
+     */
+    function mergeIn($def) {
+        // later keys take precedence; integer key 0 is not an attribute
+        // but the list of attribute collections to include
+        foreach($def->attr as $k => $v) {
+            if ($k === 0) { // strict: before PHP 8 any non-numeric string key == 0
+                // merge the includes into this definition
+                // sorry, no way to override an include
+                foreach ($v as $v2) {
+                    $this->attr[0][] = $v2;
+                }
+                continue;
+            }
+            $this->attr[$k] = $v;
+        }
+        foreach($def->attr_transform_pre    as $k => $v) $this->attr_transform_pre[$k]  = $v;
+        foreach($def->attr_transform_post   as $k => $v) $this->attr_transform_post[$k] = $v;
+        foreach($def->auto_close            as $k => $v) $this->auto_close[$k]          = $v;
+        foreach($def->excludes              as $k => $v) $this->excludes[$k]            = $v;
+        
+        if(!is_null($def->child)) $this->child = $def->child;
+        if(!empty($def->content_model)) $this->content_model .= ' | ' . $def->content_model;
+        if(!empty($def->content_model_type)) $this->content_model_type = $def->content_model_type;
+        if(!is_null($def->descendants_are_inline)) $this->descendants_are_inline = $def->descendants_are_inline;
+        
+    }
+    
+}
+
+?>
diff --git a/lib/htmlpurifier/HTMLPurifier/Encoder.php b/lib/htmlpurifier/HTMLPurifier/Encoder.php
new file mode 100644 (file)
index 0000000..1a22b45
--- /dev/null
@@ -0,0 +1,403 @@
+<?php
+
+require_once 'HTMLPurifier/EntityLookup.php';
+
+// Schema entry for %Core.Encoding; the concatenated string is the
+// directive's description text.
+HTMLPurifier_ConfigSchema::define(
+    'Core', 'Encoding', 'utf-8', 'istring', 
+    'If for some reason you are unable to convert all webpages to UTF-8, '. 
+    'you can use this directive as a stop-gap compatibility change to '. 
+    'let HTML Purifier deal with non UTF-8 input.  This technique has '. 
+    'notable deficiencies: absolutely no characters outside of the selected '. 
+    'character encoding will be preserved, not even the ones that have '. 
+    'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
+    'that automatically resolves all entities), making it pretty useless '.
+    'for anything except the most I18N-blind applications, although '.
+    '%Core.EscapeNonASCIICharacters offers fixes this trouble with '.
+    'another tradeoff. This directive '.
+    'only accepts ISO-8859-1 if iconv is not enabled.'
+);
+
+// Schema entry for %Core.EscapeNonASCIICharacters, which is read by
+// HTMLPurifier_Encoder::convertFromUTF8().
+HTMLPurifier_ConfigSchema::define(
+    'Core', 'EscapeNonASCIICharacters', false, 'bool',
+    'This directive overcomes a deficiency in %Core.Encoding by blindly '.
+    'converting all non-ASCII characters into decimal numeric entities before '.
+    'converting it to its native encoding. This means that even '.
+    'characters that can be expressed in the non-UTF-8 encoding will '.
+    'be entity-ized, which can be a real downer for encodings like Big5. '.
+    'It also assumes that the ASCII repetoire is available, although '.
+    'this is the case for almost all encodings. Anyway, use UTF-8! This '.
+    'directive has been available since 1.4.0.'
+);
+
+// Without iconv, the converters in HTMLPurifier_Encoder can only pass
+// utf-8 through or use utf8_encode()/utf8_decode() for iso-8859-1, so the
+// directive is limited to those two values (plus one spelling alias).
+if ( !function_exists('iconv') ) {
+    // only encodings with native PHP support
+    HTMLPurifier_ConfigSchema::defineAllowedValues(
+        'Core', 'Encoding', array(
+            'utf-8',
+            'iso-8859-1'
+        )
+    );
+    HTMLPurifier_ConfigSchema::defineValueAliases(
+        'Core', 'Encoding', array(
+            'iso8859-1' => 'iso-8859-1'
+        )
+    );
+}
+
+// Schema entry for %Test.ForceNoIconv, which the converters consult before
+// choosing the iconv code path.
+HTMLPurifier_ConfigSchema::define(
+    'Test', 'ForceNoIconv', false, 'bool', 
+    'When set to true, HTMLPurifier_Encoder will act as if iconv does not '.
+    'exist and use only pure PHP implementations.'
+);
+
+/**
+ * A UTF-8 specific character encoder that handles cleaning and transforming.
+ * @note All functions in this class should be static.
+ */
+class HTMLPurifier_Encoder
+{
+    
+    /**
+     * Constructor throws fatal error if you attempt to instantiate class
+     */
+    function HTMLPurifier_Encoder() {
+        // Old-style constructor: the method shares its name with the class.
+        // NOTE(review): PHP 8 no longer treats same-named methods as
+        // constructors, so this guard would not run there - confirm the
+        // targeted PHP version.
+        trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
+    }
+    
+    /**
+     * Cleans a UTF-8 string for well-formedness and SGML validity
+     * 
+     * It will parse according to UTF-8 and return a valid UTF8 string, with
+     * non-SGML codepoints excluded.
+     * 
+     * @static
+     * @note Just for reference, the non-SGML code points are 0 to 31 and
+     *       127 to 159, inclusive.  However, we allow code points 9, 10
+     *       and 13, which are the tab, line feed and carriage return
+     *       respectively. 128 and above the code points map to multibyte
+     *       UTF-8 representations.
+     * 
+     * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
+     *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
+     *       LGPL license.  Notes on what changed are inside, but in general,
+     *       the original code transformed UTF-8 text into an array of integer
+     *       Unicode codepoints. Understandably, transforming that back to
+     *       a string would be somewhat expensive, so the function was modded to
+     *       directly operate on the string.  However, this discourages code
+     *       reuse, and the logic enumerated here would be useful for any
+     *       function that needs to be able to understand UTF-8 characters.
+     *       As of right now, only smart lossless character encoding converters
+     *       would need that, and I'm probably not going to implement them.
+     *       Once again, PHP 6 should solve all our problems.
+     */
+    function cleanUTF8($str, $force_php = false) {
+        
+        // Lookup table for strtr(): every non-SGML character (as a UTF-8
+        // string) maps to the empty string. Built on first call only.
+        static $non_sgml_chars = array();
+        if (empty($non_sgml_chars)) {
+            for ($i = 0; $i <= 31; $i++) {
+                // non-SGML ASCII chars
+                // save \r, \t and \n
+                if ($i == 9 || $i == 13 || $i == 10) continue;
+                $non_sgml_chars[chr($i)] = '';
+            }
+            for ($i = 127; $i <= 159; $i++) {
+                $non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
+            }
+        }
+        
+        static $iconv = null;
+        if ($iconv === null) $iconv = function_exists('iconv');
+        
+        if ($iconv && !$force_php) {
+            // do the shortcut way
+            $cleaned = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
+            // iconv() returns false when it gives up on the input. Feeding
+            // that to strtr() would silently turn the whole string into '',
+            // so in that case fall through to the pure PHP implementation.
+            if ($cleaned !== false) {
+                return strtr($cleaned, $non_sgml_chars);
+            }
+        }
+        
+        $mState = 0; // cached expected number of octets after the current octet
+                     // until the beginning of the next UTF8 character sequence
+        $mUcs4  = 0; // cached Unicode character
+        $mBytes = 1; // cached expected number of octets in the current sequence
+        
+        // original code involved an $out that was an array of Unicode
+        // codepoints.  Instead of having to convert back into UTF-8, we've
+        // decided to directly append valid UTF-8 characters onto a string
+        // $out once they're done.  $char accumulates raw bytes, while $mUcs4
+        // turns into the Unicode code point, so there's some redundancy.
+        
+        $out = '';
+        $char = '';
+        
+        $len = strlen($str);
+        for($i = 0; $i < $len; $i++) {
+            // square brackets: the curly brace offset syntax ($str{$i}) is
+            // a parse error as of PHP 8
+            $in = ord($str[$i]);
+            $char .= $str[$i]; // append byte to char
+            if (0 == $mState) {
+                // When mState is zero we expect either a US-ASCII character 
+                // or a multi-octet sequence.
+                if (0 == (0x80 & ($in))) {
+                    // US-ASCII, pass straight through.
+                    if (($in <= 31 || $in == 127) && 
+                        !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
+                    ) {
+                        // control characters, remove
+                    } else {
+                        $out .= $char;
+                    }
+                    // reset
+                    $char = '';
+                    $mBytes = 1;
+                } elseif (0xC0 == (0xE0 & ($in))) {
+                    // First octet of 2 octet sequence
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
+                    $mState = 1;
+                    $mBytes = 2;
+                } elseif (0xE0 == (0xF0 & ($in))) {
+                    // First octet of 3 octet sequence
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
+                    $mState = 2;
+                    $mBytes = 3;
+                } elseif (0xF0 == (0xF8 & ($in))) {
+                    // First octet of 4 octet sequence
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x07) << 18;
+                    $mState = 3;
+                    $mBytes = 4;
+                } elseif (0xF8 == (0xFC & ($in))) {
+                    // First octet of 5 octet sequence.
+                    // 
+                    // This is illegal because the encoded codepoint must be 
+                    // either:
+                    // (a) not the shortest form or
+                    // (b) outside the Unicode range of 0-0x10FFFF.
+                    // Rather than trying to resynchronize, we will carry on 
+                    // until the end of the sequence and let the later error
+                    // handling code catch it.
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x03) << 24;
+                    $mState = 4;
+                    $mBytes = 5;
+                } elseif (0xFC == (0xFE & ($in))) {
+                    // First octet of 6 octet sequence, see comments for 5
+                    // octet sequence.
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 1) << 30;
+                    $mState = 5;
+                    $mBytes = 6;
+                } else {
+                    // Current octet is neither in the US-ASCII range nor a 
+                    // legal first octet of a multi-octet sequence.
+                    $mState = 0;
+                    $mUcs4  = 0;
+                    $mBytes = 1;
+                    $char = '';
+                }
+            } else {
+                // When mState is non-zero, we expect a continuation of the
+                // multi-octet sequence
+                if (0x80 == (0xC0 & ($in))) {
+                    // Legal continuation.
+                    $shift = ($mState - 1) * 6;
+                    $tmp = $in;
+                    $tmp = ($tmp & 0x0000003F) << $shift;
+                    $mUcs4 |= $tmp;
+                    
+                    if (0 == --$mState) {
+                        // End of the multi-octet sequence. mUcs4 now contains
+                        // the final Unicode codepoint to be output
+                        
+                        // Check for illegal sequences and codepoints.
+                        
+                        // From Unicode 3.1, non-shortest form is illegal
+                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
+                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
+                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
+                            (4 < $mBytes) ||
+                            // From Unicode 3.2, surrogate characters = illegal
+                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
+                            // Codepoints outside the Unicode range are illegal
+                            ($mUcs4 > 0x10FFFF)
+                        ) {
+                            // illegal sequence: drop the accumulated bytes
+                        } elseif (0xFEFF != $mUcs4 && // omit BOM
+                            !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
+                        ) {
+                            $out .= $char;
+                        }
+                        // initialize UTF8 cache (reset)
+                        $mState = 0;
+                        $mUcs4  = 0;
+                        $mBytes = 1;
+                        $char = '';
+                    }
+                } else {
+                    // ((0xC0 & (*in) != 0x80) && (mState != 0))
+                    // Incomplete multi-octet sequence.
+                    // used to result in complete fail, but we'll reset
+                    $mState = 0;
+                    $mUcs4  = 0;
+                    $mBytes = 1;
+                    $char ='';
+                }
+            }
+        }
+        return $out;
+    }
+    
+    /**
+     * Translates a Unicode codepoint into its corresponding UTF-8 character.
+     * @static
+     * @note Based on Feyd's function at
+     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
+     *       which is in public domain.
+     * @note While we're going to do code point parsing anyway, a good
+     *       optimization would be to refuse to translate code points that
+     *       are non-SGML characters.  However, this could lead to duplication.
+     * @note This is very similar to the unichr function in
+     *       maintenance/generate-entity-file.php (although this is superior,
+     *       due to its sanity checks).
+     */
+    
+    // +----------+----------+----------+----------+
+    // | 33222222 | 22221111 | 111111   |          |
+    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
+    // +----------+----------+----------+----------+
+    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
+    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
+    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
+    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
+    // +----------+----------+----------+----------+
+    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
+    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
+    // +----------+----------+----------+----------+ 
+    
+    function unichr($code) {
+        // Refuse anything that is not a Unicode scalar value: negative
+        // numbers, values past U+10FFFF and the surrogate block
+        // U+D800..U+DFFF all yield an empty string.
+        $is_surrogate = ($code >= 0xD800 && $code <= 0xDFFF);
+        if ($code < 0 || $code > 0x10FFFF || $is_surrogate) {
+            return '';
+        }
+        
+        // one byte: plain ASCII
+        if ($code < 0x80) {
+            return chr($code);
+        }
+        
+        // every multi-byte form ends with the low six bits
+        $last = chr(($code & 0x3F) | 0x80);
+        
+        // two bytes: 110yyyyy 10xxxxxx
+        if ($code < 0x800) {
+            return chr(($code >> 6) | 0xC0) . $last;
+        }
+        
+        $second_last = chr((($code >> 6) & 0x3F) | 0x80);
+        
+        // three bytes: 1110zzzz 10yyyyyy 10xxxxxx
+        if ($code < 0x10000) {
+            return chr(($code >> 12) | 0xE0) . $second_last . $last;
+        }
+        
+        // four bytes: 11110www 10wwzzzz 10yyyyyy 10xxxxxx
+        $lead  = chr(($code >> 18) | 0xF0);
+        $third = chr((($code >> 12) & 0x3F) | 0x80);
+        return $lead . $third . $second_last . $last;
+    }
+    
+    /**
+     * Converts a string to UTF-8 based on configuration.
+     * @static
+     */
+    function convertToUTF8($str, $config, &$context) {
+        // availability of iconv is probed once and remembered
+        static $have_iconv = null;
+        if ($have_iconv === null) {
+            $have_iconv = function_exists('iconv');
+        }
+        
+        $source_charset = $config->get('Core', 'Encoding');
+        if ($source_charset === 'utf-8') {
+            // nothing to convert
+            return $str;
+        }
+        
+        // %Test.ForceNoIconv is only consulted when iconv exists
+        $use_iconv = $have_iconv && !$config->get('Test', 'ForceNoIconv');
+        if ($use_iconv) {
+            return @iconv($source_charset, 'utf-8//IGNORE', $str);
+        }
+        if ($source_charset === 'iso-8859-1') {
+            // the only conversion available without iconv
+            return @utf8_encode($str);
+        }
+        trigger_error('Encoding not supported', E_USER_ERROR);
+    }
+    
+    /**
+     * Converts a string from UTF-8 based on configuration.
+     * @static
+     * @note Currently, this is a lossy conversion, with unexpressable
+     *       characters being omitted.
+     */
+    function convertFromUTF8($str, $config, &$context) {
+        // availability of iconv is probed once and remembered
+        static $have_iconv = null;
+        if ($have_iconv === null) {
+            $have_iconv = function_exists('iconv');
+        }
+        
+        $target_charset = $config->get('Core', 'Encoding');
+        if ($target_charset === 'utf-8') {
+            // nothing to convert
+            return $str;
+        }
+        
+        // optionally turn every non-ASCII character into a numeric entity
+        // first, so that nothing is lost in the conversion below
+        if ($config->get('Core', 'EscapeNonASCIICharacters')) {
+            $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
+        }
+        
+        // %Test.ForceNoIconv is only consulted when iconv exists
+        $use_iconv = $have_iconv && !$config->get('Test', 'ForceNoIconv');
+        if ($use_iconv) {
+            return @iconv('utf-8', $target_charset . '//IGNORE', $str);
+        }
+        if ($target_charset === 'iso-8859-1') {
+            // the only conversion available without iconv
+            return @utf8_decode($str);
+        }
+        trigger_error('Encoding not supported', E_USER_ERROR);
+    }
+    
+    /**
+     * Lossless (character-wise) conversion of HTML to ASCII
+     * @static
+     * @param $str UTF-8 string to be converted to ASCII
+     * @returns ASCII encoded string with non-ASCII character entity-ized
+     * @warning Adapted from MediaWiki, claiming fair use: this is a common
+     *       algorithm. If you disagree with this license fudgery,
+     *       implement it yourself.
+     * @note Uses decimal numeric entities since they are best supported.
+     * @note This is a DUMB function: it has no concept of keeping
+     *       character entities that the projected character encoding
+     *       can allow. We could possibly implement a smart version
+     *       but that would require it to also know which Unicode
+     *       codepoints the charset supported (not an easy task).
+     * @note Sort of like cleanUTF8(), but it assumes that $str is
+     *       well-formed UTF-8
+     */
+    function convertToASCIIDumbLossless($str) {
+        $out       = '';
+        $codepoint = 0; // code point assembled so far
+        $pending   = 0; // continuation bytes still expected
+        $length    = strlen($str);
+        for ($pos = 0; $pos < $length; $pos++) {
+            $byte = ord($str[$pos]);
+            
+            if ($byte >= 0xC0) {
+                // lead byte: keep its payload bits and note how many
+                // continuation bytes are expected
+                if ($byte <= 0xDF) {        // 110x xxxx
+                    $codepoint = $byte & 0x1F;
+                    $pending = 1;
+                } elseif ($byte <= 0xEF) {  // 1110 xxxx
+                    $codepoint = $byte & 0x0F;
+                    $pending = 2;
+                } else {                    // 1111 0xxx
+                    $codepoint = $byte & 0x07;
+                    $pending = 3;
+                }
+                continue;
+            }
+            
+            if ($byte >= 0x80) {
+                // continuation byte (10xx xxxx): shift in six more bits
+                $codepoint = ($codepoint << 6) + ($byte & 0x3F);
+                $pending--;
+                if ($pending <= 0) {
+                    // character complete, emit it as a decimal entity
+                    $out .= "&#" . $codepoint . ";";
+                }
+                continue;
+            }
+            
+            // plain ASCII (0xxx xxxx) is copied through untouched
+            $out .= $str[$pos];
+            $pending = 0;
+        }
+        return $out;
+    }
+    
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/EntityLookup.php b/lib/htmlpurifier/HTMLPurifier/EntityLookup.php
new file mode 100644 (file)
index 0000000..f950cc2
--- /dev/null
@@ -0,0 +1,46 @@
+<?php
+
+/**
+ * Object that provides entity lookup table from entity name to character
+ */
+class HTMLPurifier_EntityLookup {
+    
+    /**
+     * Associative array mapping an entity name to the character it stands for.
+     * @public
+     */
+    var $table;
+    
+    /**
+     * Fills $table by unserializing the contents of a file.
+     * @param $file Path of the serialized table; when omitted, the bundled
+     *              EntityLookup/entities.ser next to this file is used.
+     * @note The serialized contents are versioned, but were generated
+     *       using the maintenance script generate_entity_file.php
+     * @warning This is not in constructor to help enforce the Singleton
+     */
+    function setup($file = false) {
+        $path = $file ? $file : dirname(__FILE__) . '/EntityLookup/entities.ser';
+        $serialized = file_get_contents($path);
+        $this->table = unserialize($serialized);
+    }
+    
+    /**
+     * Returns the shared lookup object, creating and loading it on first use.
+     * @static
+     * @param $prototype Optional custom lookup object; when given it replaces
+     *                   the shared instance and is returned.
+     */
+    function instance($prototype = false) {
+        // no references, since PHP doesn't copy unless modified
+        static $instance = null;
+        if ($prototype) {
+            // caller supplied its own table holder: adopt it
+            $instance = $prototype;
+            return $instance;
+        }
+        if (!$instance) {
+            $instance = new HTMLPurifier_EntityLookup();
+            $instance->setup();
+        }
+        return $instance;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/EntityLookup/entities.ser b/lib/htmlpurifier/HTMLPurifier/EntityLookup/entities.ser
new file mode 100644 (file)
index 0000000..f2b8b8f
--- /dev/null
@@ -0,0 +1 @@
+a:246:{s:4:"nbsp";s:2:" ";s:5:"iexcl";s:2:"¡";s:4:"cent";s:2:"¢";s:5:"pound";s:2:"£";s:6:"curren";s:2:"¤";s:3:"yen";s:2:"¥";s:6:"brvbar";s:2:"¦";s:4:"sect";s:2:"§";s:3:"uml";s:2:"¨";s:4:"copy";s:2:"©";s:4:"ordf";s:2:"ª";s:5:"laquo";s:2:"«";s:3:"not";s:2:"¬";s:3:"shy";s:2:"­";s:3:"reg";s:2:"®";s:4:"macr";s:2:"¯";s:3:"deg";s:2:"°";s:6:"plusmn";s:2:"±";s:5:"acute";s:2:"´";s:5:"micro";s:2:"µ";s:4:"para";s:2:"¶";s:6:"middot";s:2:"·";s:5:"cedil";s:2:"¸";s:4:"ordm";s:2:"º";s:5:"raquo";s:2:"»";s:6:"iquest";s:2:"¿";s:6:"Agrave";s:2:"À";s:6:"Aacute";s:2:"Á";s:5:"Acirc";s:2:"Â";s:6:"Atilde";s:2:"Ã";s:4:"Auml";s:2:"Ä";s:5:"Aring";s:2:"Å";s:5:"AElig";s:2:"Æ";s:6:"Ccedil";s:2:"Ç";s:6:"Egrave";s:2:"È";s:6:"Eacute";s:2:"É";s:5:"Ecirc";s:2:"Ê";s:4:"Euml";s:2:"Ë";s:6:"Igrave";s:2:"Ì";s:6:"Iacute";s:2:"Í";s:5:"Icirc";s:2:"Î";s:4:"Iuml";s:2:"Ï";s:3:"ETH";s:2:"Ð";s:6:"Ntilde";s:2:"Ñ";s:6:"Ograve";s:2:"Ò";s:6:"Oacute";s:2:"Ó";s:5:"Ocirc";s:2:"Ô";s:6:"Otilde";s:2:"Õ";s:4:"Ouml";s:2:"Ö";s:5:"times";s:2:"×";s:6:"Oslash";s:2:"Ø";s:6:"Ugrave";s:2:"Ù";s:6:"Uacute";s:2:"Ú";s:5:"Ucirc";s:2:"Û";s:4:"Uuml";s:2:"Ü";s:6:"Yacute";s:2:"Ý";s:5:"THORN";s:2:"Þ";s:5:"szlig";s:2:"ß";s:6:"agrave";s:2:"à";s:6:"aacute";s:2:"á";s:5:"acirc";s:2:"â";s:6:"atilde";s:2:"ã";s:4:"auml";s:2:"ä";s:5:"aring";s:2:"å";s:5:"aelig";s:2:"æ";s:6:"ccedil";s:2:"ç";s:6:"egrave";s:2:"è";s:6:"eacute";s:2:"é";s:5:"ecirc";s:2:"ê";s:4:"euml";s:2:"ë";s:6:"igrave";s:2:"ì";s:6:"iacute";s:2:"í";s:5:"icirc";s:2:"î";s:4:"iuml";s:2:"ï";s:3:"eth";s:2:"ð";s:6:"ntilde";s:2:"ñ";s:6:"ograve";s:2:"ò";s:6:"oacute";s:2:"ó";s:5:"ocirc";s:2:"ô";s:6:"otilde";s:2:"õ";s:4:"ouml";s:2:"ö";s:6:"divide";s:2:"÷";s:6:"oslash";s:2:"ø";s:6:"ugrave";s:2:"ù";s:6:"uacute";s:2:"ú";s:5:"ucirc";s:2:"û";s:4:"uuml";s:2:"ü";s:6:"yacute";s:2:"ý";s:5:"thorn";s:2:"þ";s:4:"yuml";s:2:"ÿ";s:4:"quot";s:1:""";s:3:"amp";s:1:"&";s:2:"lt";s:1:"<";s:2:"gt";s:1:">";s:4:"apos";s:1:"'";s:5:"OElig";s:2:"Œ";s:5:"oelig";s:2:"œ";s:6:"Scaron";s:2:"Š";s:6:"scaron";s:2:"š";s:4:"Yuml";s:2:"Ÿ
";s:4:"circ";s:2:"ˆ";s:5:"tilde";s:2:"˜";s:4:"ensp";s:3:" ";s:4:"emsp";s:3:" ";s:6:"thinsp";s:3:" ";s:4:"zwnj";s:3:"‌";s:3:"zwj";s:3:"‍";s:3:"lrm";s:3:"‎";s:3:"rlm";s:3:"‏";s:5:"ndash";s:3:"–";s:5:"mdash";s:3:"—";s:5:"lsquo";s:3:"‘";s:5:"rsquo";s:3:"’";s:5:"sbquo";s:3:"‚";s:5:"ldquo";s:3:"“";s:5:"rdquo";s:3:"”";s:5:"bdquo";s:3:"„";s:6:"dagger";s:3:"†";s:6:"Dagger";s:3:"‡";s:6:"permil";s:3:"‰";s:6:"lsaquo";s:3:"‹";s:6:"rsaquo";s:3:"›";s:4:"euro";s:3:"€";s:4:"fnof";s:2:"ƒ";s:5:"Alpha";s:2:"Α";s:4:"Beta";s:2:"Β";s:5:"Gamma";s:2:"Γ";s:5:"Delta";s:2:"Δ";s:7:"Epsilon";s:2:"Ε";s:4:"Zeta";s:2:"Ζ";s:3:"Eta";s:2:"Η";s:5:"Theta";s:2:"Θ";s:4:"Iota";s:2:"Ι";s:5:"Kappa";s:2:"Κ";s:6:"Lambda";s:2:"Λ";s:2:"Mu";s:2:"Μ";s:2:"Nu";s:2:"Ν";s:2:"Xi";s:2:"Ξ";s:7:"Omicron";s:2:"Ο";s:2:"Pi";s:2:"Π";s:3:"Rho";s:2:"Ρ";s:5:"Sigma";s:2:"Σ";s:3:"Tau";s:2:"Τ";s:7:"Upsilon";s:2:"Υ";s:3:"Phi";s:2:"Φ";s:3:"Chi";s:2:"Χ";s:3:"Psi";s:2:"Ψ";s:5:"Omega";s:2:"Ω";s:5:"alpha";s:2:"α";s:4:"beta";s:2:"β";s:5:"gamma";s:2:"γ";s:5:"delta";s:2:"δ";s:7:"epsilon";s:2:"ε";s:4:"zeta";s:2:"ζ";s:3:"eta";s:2:"η";s:5:"theta";s:2:"θ";s:4:"iota";s:2:"ι";s:5:"kappa";s:2:"κ";s:6:"lambda";s:2:"λ";s:2:"mu";s:2:"μ";s:2:"nu";s:2:"ν";s:2:"xi";s:2:"ξ";s:7:"omicron";s:2:"ο";s:2:"pi";s:2:"π";s:3:"rho";s:2:"ρ";s:6:"sigmaf";s:2:"ς";s:5:"sigma";s:2:"σ";s:3:"tau";s:2:"τ";s:7:"upsilon";s:2:"υ";s:3:"phi";s:2:"φ";s:3:"chi";s:2:"χ";s:3:"psi";s:2:"ψ";s:5:"omega";s:2:"ω";s:8:"thetasym";s:2:"ϑ";s:5:"upsih";s:2:"ϒ";s:3:"piv";s:2:"ϖ";s:4:"bull";s:3:"•";s:6:"hellip";s:3:"…";s:5:"prime";s:3:"′";s:5:"Prime";s:3:"″";s:5:"oline";s:3:"‾";s:5:"frasl";s:3:"⁄";s:6:"weierp";s:3:"℘";s:5:"image";s:3:"ℑ";s:4:"real";s:3:"ℜ";s:5:"trade";s:3:"™";s:7:"alefsym";s:3:"ℵ";s:4:"larr";s:3:"←";s:4:"uarr";s:3:"↑";s:4:"rarr";s:3:"→";s:4:"darr";s:3:"↓";s:4:"harr";s:3:"↔";s:5:"crarr";s:3:"↵";s:4:"lArr";s:3:"⇐";s:4:"uArr";s:3:"⇑";s:4:"rArr";s:3:"⇒";s:4:"dArr";s:3:"⇓";s:4:"hArr";s:3:"⇔";s:6:"forall";s:3:"∀";s:4:"part";s:3:"∂";s:5:"exist";s:3:"∃";s:5:"empty";s:3:"∅";s:5:"nabla
";s:3:"∇";s:4:"isin";s:3:"∈";s:5:"notin";s:3:"∉";s:2:"ni";s:3:"∋";s:4:"prod";s:3:"∏";s:3:"sum";s:3:"∑";s:5:"minus";s:3:"−";s:6:"lowast";s:3:"∗";s:5:"radic";s:3:"√";s:4:"prop";s:3:"∝";s:5:"infin";s:3:"∞";s:3:"ang";s:3:"∠";s:3:"and";s:3:"∧";s:2:"or";s:3:"∨";s:3:"cap";s:3:"∩";s:3:"cup";s:3:"∪";s:3:"int";s:3:"∫";s:3:"sim";s:3:"∼";s:4:"cong";s:3:"≅";s:5:"asymp";s:3:"≈";s:2:"ne";s:3:"≠";s:5:"equiv";s:3:"≡";s:2:"le";s:3:"≤";s:2:"ge";s:3:"≥";s:3:"sub";s:3:"⊂";s:3:"sup";s:3:"⊃";s:4:"nsub";s:3:"⊄";s:4:"sube";s:3:"⊆";s:4:"supe";s:3:"⊇";s:5:"oplus";s:3:"⊕";s:6:"otimes";s:3:"⊗";s:4:"perp";s:3:"⊥";s:4:"sdot";s:3:"⋅";s:5:"lceil";s:3:"⌈";s:5:"rceil";s:3:"⌉";s:6:"lfloor";s:3:"⌊";s:6:"rfloor";s:3:"⌋";s:4:"lang";s:3:"〈";s:4:"rang";s:3:"〉";s:3:"loz";s:3:"◊";s:6:"spades";s:3:"♠";s:5:"clubs";s:3:"♣";s:6:"hearts";s:3:"♥";s:5:"diams";s:3:"♦";}
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/EntityParser.php b/lib/htmlpurifier/HTMLPurifier/EntityParser.php
new file mode 100644 (file)
index 0000000..069c5ce
--- /dev/null
@@ -0,0 +1,158 @@
+<?php
+
+require_once 'HTMLPurifier/EntityLookup.php';
+require_once 'HTMLPurifier/Encoder.php';
+
+// if want to implement error collecting here, we'll need to use some sort
+// of global data (probably trigger_error) because it's impossible to pass
+// $config or $context to the callback functions.
+
+/**
+ * Handles referencing and dereferencing character entities
+ */
+class HTMLPurifier_EntityParser
+{
+    
+    /**
+     * Reference to entity lookup table.
+     * @protected
+     */
+    var $_entity_lookup;
+    
+    /**
+     * Callback regex string for parsing entities.
+     * @protected
+     */                             
+    var $_substituteEntitiesRegex =
+'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';
+//     1. hex             2. dec      3. string
+    
+    
+    /**
+     * Decimal to parsed string conversion table for special entities.
+     * @protected
+     */
+    var $_special_dec2str =
+            array(
+                    34 => '"',
+                    38 => '&',
+                    39 => "'",
+                    60 => '<',
+                    62 => '>'
+            );
+    
+    /**
+     * Stripped entity names to decimal conversion table for special entities.
+     * @protected
+     */
+    var $_special_ent2dec =
+            array(
+                    'quot' => 34,
+                    'amp'  => 38,
+                    'lt'   => 60,
+                    'gt'   => 62
+            );
+    
+    /**
+     * Substitutes non-special entities with their parsed equivalents. Since
+     * running this whenever you have parsed character is t3h 5uck, we run
+     * it before everything else.
+     * 
+     * @protected
+     * @param $string String to have non-special entities parsed.
+     * @returns Parsed string.
+     */
+    function substituteNonSpecialEntities($string) {
+        // The shared regex also matches entities whose trailing semicolon
+        // is missing, but that detection should not be relied upon.
+        $callback = array($this, 'nonSpecialEntityCallback');
+        $pattern  = $this->_substituteEntitiesRegex;
+        return preg_replace_callback($pattern, $callback, $string);
+    }
+    
+    /**
+     * Callback function for substituteNonSpecialEntities() that does the work.
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
+     */
+    
+    function nonSpecialEntityCallback($matches) {
+        // replaces everything except the special entities, which are
+        // handed back exactly as they were matched
+        $whole = $matches[0];
+        
+        if (@$whole[1] === '#') {
+            // numeric reference: group 1 holds hex digits, group 2 decimal
+            $codepoint = (@$whole[2] === 'x') ? hexdec($matches[1]) : (int) $matches[2];
+            
+            // abort for special characters
+            if (isset($this->_special_dec2str[$codepoint])) {
+                return $whole;
+            }
+            return HTMLPurifier_Encoder::unichr($codepoint);
+        }
+        
+        // named reference: group 3 holds the name
+        $name = $matches[3];
+        if (isset($this->_special_ent2dec[$name])) {
+            return $whole;
+        }
+        
+        // the lookup table is fetched lazily on the first named entity
+        if (!$this->_entity_lookup) {
+            require_once 'HTMLPurifier/EntityLookup.php';
+            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+        }
+        
+        // unknown names are left as they were
+        return isset($this->_entity_lookup->table[$name]) ?
+            $this->_entity_lookup->table[$name] :
+            $whole;
+    }
+    
+    /**
+     * Substitutes only special entities with their parsed equivalents.
+     * 
+     * @notice We try to avoid calling this function because otherwise, it
+     * would have to be called a lot (for every parsed section).
+     * 
+     * @protected
+     * @param $string String to have special entities parsed.
+     * @returns Parsed string.
+     */
+    function substituteSpecialEntities($string) {
+        // same regex as for the non-special entities, different callback
+        $callback = array($this, 'specialEntityCallback');
+        $pattern  = $this->_substituteEntitiesRegex;
+        return preg_replace_callback($pattern, $callback, $string);
+    }
+    
+    /**
+     * Callback function for substituteSpecialEntities() that does the work.
+     * 
+     * This callback has same syntax as nonSpecialEntityCallback().
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE-style matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
+     */
+    function specialEntityCallback($matches) {
+        $entity = $matches[0];
+        $is_num = (@$matches[0][1] === '#');
+        if ($is_num) {
+            // numeric reference: group 1 holds hex digits, group 2 decimal
+            $is_hex = (@$entity[2] === 'x');
+            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
+            return isset($this->_special_dec2str[$int]) ?
+                $this->_special_dec2str[$int] :
+                $entity;
+        } else {
+            // named reference: _special_ent2dec only yields the decimal
+            // code, so it has to be mapped through _special_dec2str to get
+            // the character. Returning the code itself would turn "&amp;"
+            // into "38".
+            return isset($this->_special_ent2dec[$matches[3]]) ?
+                $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
+                $entity;
+        }
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Error.php b/lib/htmlpurifier/HTMLPurifier/Error.php
new file mode 100644 (file)
index 0000000..adc81dc
--- /dev/null
@@ -0,0 +1,8 @@
+<?php
+
+/**
+ * Empty marker class: functions return an instance of it to signify an
+ * error when null doesn't cut it as an error value.
+ */
+class HTMLPurifier_Error {}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Filter.php b/lib/htmlpurifier/HTMLPurifier/Filter.php
new file mode 100644 (file)
index 0000000..94c5ae7
--- /dev/null
@@ -0,0 +1,39 @@
+<?php
+
+/**
+ * Represents a pre or post processing filter on HTML Purifier's output
+ * 
+ * Sometimes, a little ad-hoc fixing of HTML has to be done before
+ * it gets sent through HTML Purifier: you can use filters to achieve
+ * this effect. For instance, YouTube videos can be preserved using
+ * this manner. You could have used a decorator for this task, but
+ * PHP's support for them is not terribly robust, so we're going
+ * to just loop through the filters.
+ * 
+ * Filters should be exited first in, last out. If there are three filters,
+ * named 1, 2 and 3, the order of execution should go 1->preFilter,
+ * 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
+ * 1->postFilter.
+ */
+
+class HTMLPurifier_Filter
+{
+    
+    /**
+     * Name of the filter for identification purposes
+     */
+    var $name;
+    
+    /**
+     * Pre-processor function, handles HTML before HTML Purifier
+     * @param $html HTML to be processed
+     * @param $config Unused by this base implementation
+     * @param $context Unused by this base implementation
+     * @returns The HTML to pass on. The base implementation returns it
+     *          unchanged, so a subclass that overrides only one of the
+     *          two hooks does not discard the document in the other.
+     */
+    function preFilter($html, $config, &$context) {
+        return $html;
+    }
+    
+    /**
+     * Post-processor function, handles HTML after HTML Purifier
+     * @param $html HTML to be processed
+     * @param $config Unused by this base implementation
+     * @param $context Unused by this base implementation
+     * @returns The HTML to pass on; unchanged in the base implementation.
+     */
+    function postFilter($html, $config, &$context) {
+        return $html;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Filter/YouTube.php b/lib/htmlpurifier/HTMLPurifier/Filter/YouTube.php
new file mode 100644 (file)
index 0000000..433f17c
--- /dev/null
@@ -0,0 +1,34 @@
+<?php
+
+require_once 'HTMLPurifier/Filter.php';
+
+class HTMLPurifier_Filter_YouTube extends HTMLPurifier_Filter
+{
+    
+    var $name = 'YouTube preservation';
+    
+    // Swap each YouTube <object> embed for a placeholder span holding the video id.
+    function preFilter($html, $config, &$context) {
+        $pre_regex = '#<object[^>]+>.+?'.
+            'http://www.youtube.com/v/([A-Za-z0-9\-_]+).+?</object>#s';
+        $pre_replace = '<span class="youtube-embed">\1</span>';
+        return preg_replace($pre_regex, $pre_replace, $html);
+    }
+    
+    // Expand the placeholder spans written by preFilter() back into embed markup.
+    function postFilter($html, $config, &$context) {
+        $post_regex = '#<span class="youtube-embed">([A-Za-z0-9\-_]+)</span>#';
+        $post_replace = '<object width="425" height="350" data="http://www.youtube.com/v/\1">'.
+            '<param name="movie" value="http://www.youtube.com/v/\1"></param>'.
+            '<param name="wmode" value="transparent"></param>'.
+            // leading spaces on the next fragments keep the embed attributes separated
+            '<!--[if IE]><embed src="http://www.youtube.com/v/\1"'.
+            ' type="application/x-shockwave-flash"'.
+            ' wmode="transparent" width="425" height="350" />'.
+            '<![endif]--></object>';
+        return preg_replace($post_regex, $post_replace, $html);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Generator.php b/lib/htmlpurifier/HTMLPurifier/Generator.php
new file mode 100644 (file)
index 0000000..b6a9aa2
--- /dev/null
@@ -0,0 +1,158 @@
+<?php
+
+require_once 'HTMLPurifier/Lexer.php';
+
+HTMLPurifier_ConfigSchema::define(
+    'Core', 'CleanUTF8DuringGeneration', false, 'bool',
+    'When true, HTMLPurifier_Generator will also check all strings it '.
+    'escapes for UTF-8 well-formedness as a defense in depth measure. '.
+    'This could cause a considerable performance impact, and is not '.
+    'strictly necessary due to the fact that the Lexers should have '.
+    'ensured that all the UTF-8 strings were well-formed.  Note that '.
+    'the configuration value is only read at the beginning of '.
+    'generateFromTokens.'
+);
+
+HTMLPurifier_ConfigSchema::define(
+    'Core', 'XHTML', true, 'bool',
+    'Determines whether or not output is XHTML or not.  When disabled, HTML '.
+    'Purifier goes into HTML 4.01 removes XHTML-specific markup constructs, '.
+    'such as boolean attribute expansion and trailing slashes in empty tags. '.
+    'This directive was available since 1.1.'
+);
+
+// extension constraints could be factored into ConfigSchema
+HTMLPurifier_ConfigSchema::define(
+    'Core', 'TidyFormat', false, 'bool',
+    '<p>Determines whether or not to run Tidy on the final output for pretty '.
+    'formatting reasons, such as indentation and wrap.</p><p>This can greatly '.
+    'improve readability for editors who are hand-editing the HTML, but is '.
+    'by no means necessary as HTML Purifier has already fixed all major '.
+    'errors the HTML may have had. Tidy is a non-default extension, and this directive '.
+    'will silently fail if Tidy is not available.</p><p>If you are looking to make '.
+    'the overall look of your page\'s source better, I recommend running Tidy '.
+    'on the entire page rather than just user-content (after all, the '.
+    'indentation relative to the containing blocks will be incorrect).</p><p>This '.
+    'directive was available since 1.1.1.</p>'
+);
+
+/**
+ * Generates HTML from tokens.
+ */
+class HTMLPurifier_Generator
+{
+    
+    /**
+     * Bool cache of %Core.CleanUTF8DuringGeneration
+     * @private
+     */
+    var $_clean_utf8 = false;
+    
+    /**
+     * Bool cache of %Core.XHTML
+     * @private
+     */
+    var $_xhtml = true;
+    
+    /**
+     * Generates HTML from an array of tokens.
+     * @param $tokens Array of HTMLPurifier_Token
+     * @param $config HTMLPurifier_Config object
+     * @return Generated HTML
+     */
+    function generateFromTokens($tokens, $config, &$context) {
+        $html = '';
+        if (!$config) $config = HTMLPurifier_Config::createDefault();
+        $this->_clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration');
+        $this->_xhtml = $config->get('Core', 'XHTML');
+        if (!$tokens) return '';
+        foreach ($tokens as $token) {
+            $html .= $this->generateFromToken($token);
+        }
+        if ($config->get('Core', 'TidyFormat') && extension_loaded('tidy')) {
+            
+            $tidy_options = array(
+               'indent'=> true,
+               'output-xhtml' => $this->_xhtml,
+               'show-body-only' => true,
+               'indent-spaces' => 2,
+               'wrap' => 68,
+            );
+            if (version_compare(PHP_VERSION, '5', '<')) {
+                tidy_set_encoding('utf8');
+                foreach ($tidy_options as $key => $value) {
+                    tidy_setopt($key, $value);
+                }
+                tidy_parse_string($html);
+                tidy_clean_repair();
+                $html = tidy_get_output();
+            } else {
+                $tidy = new Tidy;
+                $tidy->parseString($html, $tidy_options, 'utf8');
+                $tidy->cleanRepair();
+                $html = (string) $tidy;
+            }
+        }
+        return $html;
+    }
+    
+    /**
+     * Generates HTML from a single token.
+     * @param $token HTMLPurifier_Token object.
+     * @return Generated HTML
+     */
+    function generateFromToken($token) {
+        if (!isset($token->type)) return '';
+        if ($token->type == 'start') {
+            $attr = $this->generateAttributes($token->attr);
+            return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
+            
+        } elseif ($token->type == 'end') {
+            return '</' . $token->name . '>';
+            
+        } elseif ($token->type == 'empty') {
+            $attr = $this->generateAttributes($token->attr);
+             return '<' . $token->name . ($attr ? ' ' : '') . $attr .
+                ( $this->_xhtml ? ' /': '' )
+                . '>';
+            
+        } elseif ($token->type == 'text') {
+            return $this->escape($token->data);
+            
+        } else {
+            return '';
+            
+        }
+    }
+    
+    /**
+     * Generates attribute declarations from attribute array.
+     * @param $assoc_array_of_attributes Attribute array
+     * @return Generated HTML fragment for insertion.
+     */
+    function generateAttributes($assoc_array_of_attributes) {
+        $html = '';
+        foreach ($assoc_array_of_attributes as $key => $value) {
+            if (!$this->_xhtml) {
+                // remove namespaced attributes
+                if (strpos($key, ':') !== false) continue;
+                // also needed: check for attribute minimization
+            }
+            $html .= $key.'="'.$this->escape($value).'" ';
+        }
+        return rtrim($html);
+    }
+    
+    /**
+     * Escapes raw text data.
+     * @param $string String data to escape for HTML.
+     * @return String escaped data.
+     */
+    function escape($string) {
+        if ($this->_clean_utf8) $string = HTMLPurifier_Lexer::cleanUTF8($string);
+        return htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLDefinition.php b/lib/htmlpurifier/HTMLPurifier/HTMLDefinition.php
new file mode 100644 (file)
index 0000000..3af445c
--- /dev/null
@@ -0,0 +1,281 @@
+<?php
+
+// components
+require_once 'HTMLPurifier/HTMLModuleManager.php';
+
+// this definition and its modules MUST NOT define configuration directives
+// outside of the HTML or Attr namespaces
+
+// will be superseded by more accurate doctype declaration schemes
+HTMLPurifier_ConfigSchema::define(
+    'HTML', 'Strict', false, 'bool',
+    'Determines whether or not to use Transitional (loose) or Strict rulesets. '.
+    'This directive has been available since 1.3.0.'
+);
+
+HTMLPurifier_ConfigSchema::define(
+    'HTML', 'BlockWrapper', 'p', 'string',
+    'String name of element to wrap inline elements that are inside a block '.
+    'context.  This only occurs in the children of blockquote in strict mode. '.
+    'Example: by default value, <code>&lt;blockquote&gt;Foo&lt;/blockquote&gt;</code> '.
+    'would become <code>&lt;blockquote&gt;&lt;p&gt;Foo&lt;/p&gt;&lt;/blockquote&gt;</code>. The '.
+    '<code>&lt;p&gt;</code> tags can be replaced '.
+    'with whatever you desire, as long as it is a block level element. '.
+    'This directive has been available since 1.3.0.'
+);
+
+HTMLPurifier_ConfigSchema::define(
+    'HTML', 'Parent', 'div', 'string',
+    'String name of element that HTML fragment passed to library will be '.
+    'inserted in.  An interesting variation would be using span as the '.
+    'parent element, meaning that only inline tags would be allowed. '.
+    'This directive has been available since 1.3.0.'
+);
+
+HTMLPurifier_ConfigSchema::define(
+    'HTML', 'AllowedElements', null, 'lookup/null',
+    'If HTML Purifier\'s tag set is unsatisfactory for your needs, you '.
+    'can overload it with your own list of tags to allow.  Note that this '.
+    'method is subtractive: it does its job by taking away from HTML Purifier '.
+    'usual feature set, so you cannot add a tag that HTML Purifier never '.
+    'supported in the first place (like embed, form or head).  If you change this, you '.
+    'probably also want to change %HTML.AllowedAttributes. '.
+    '<strong>Warning:</strong> If another directive conflicts with the '.
+    'elements here, <em>that</em> directive will win and override. '.
+    'This directive has been available since 1.3.0.'
+);
+
+HTMLPurifier_ConfigSchema::define(
+    'HTML', 'AllowedAttributes', null, 'lookup/null',
+    'IF HTML Purifier\'s attribute set is unsatisfactory, overload it! '.
+    'The syntax is \'tag.attr\' or \'*.attr\' for the global attributes '.
+    '(style, id, class, dir, lang, xml:lang).'.
+    '<strong>Warning:</strong> If another directive conflicts with the '.
+    'elements here, <em>that</em> directive will win and override. For '.
+    'example, %HTML.EnableAttrID will take precedence over *.id in this '.
+    'directive.  You must set that directive to true before you can use '.
+    'IDs at all. This directive has been available since 1.3.0.'
+);
+
+/**
+ * Definition of the purified HTML that describes allowed children,
+ * attributes, and many other things.
+ * 
+ * Conventions:
+ * 
+ * All member variables that are prefixed with info
+ * (including the main $info array) are used by HTML Purifier internals
+ * and should not be directly edited when customizing the HTMLDefinition.
+ * They can usually be set via configuration directives or custom
+ * modules.
+ * 
+ * On the other hand, member variables without the info prefix are used
+ * internally by the HTMLDefinition and MUST NOT be used by other HTML
+ * Purifier internals. Many of them, however, are public, and may be
+ * edited by userspace code to tweak the behavior of HTMLDefinition.
+ * 
+ * HTMLPurifier_Printer_HTMLDefinition is a notable exception to this
+ * rule: in the interest of comprehensiveness, it will sniff everything.
+ */
+class HTMLPurifier_HTMLDefinition
+{
+    
+    /** FULLY-PUBLIC VARIABLES */
+    
+    /**
+     * Associative array of element names to HTMLPurifier_ElementDef
+     * @public
+     */
+    var $info = array();
+    
+    /**
+     * Associative array of global attribute name to attribute definition.
+     * @public
+     */
+    var $info_global_attr = array();
+    
+    /**
+     * String name of parent element HTML will be going into.
+     * @public
+     */
+    var $info_parent = 'div';
+    
+    /**
+     * Definition for parent element, allows parent element to be a
+     * tag that's not allowed inside the HTML fragment.
+     * @public
+     */
+    var $info_parent_def;
+    
+    /**
+     * String name of element used to wrap inline elements in block context
+     * @note This is rarely used except for BLOCKQUOTEs in strict mode
+     * @public
+     */
+    var $info_block_wrapper = 'p';
+    
+    /**
+     * Associative array of deprecated tag name to HTMLPurifier_TagTransform
+     * @public
+     */
+    var $info_tag_transform = array();
+    
+    /**
+     * Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
+     * @public
+     */
+    var $info_attr_transform_pre = array();
+    
+    /**
+     * Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
+     * @public
+     */
+    var $info_attr_transform_post = array();
+    
+    /**
+     * Nested lookup array of content set name (Block, Inline) to
+     * element name to whether or not it belongs in that content set.
+     * @public
+     */
+    var $info_content_sets = array();
+    
+    
+    
+    /** PUBLIC BUT INTERNAL VARIABLES */
+    
+    var $setup = false; /**< Has setup() been called yet? */
+    var $config; /**< Temporary instance of HTMLPurifier_Config */
+    
+    var $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */
+    
+    /**
+     * Performs low-cost, preliminary initialization.
+     * @param $config Instance of HTMLPurifier_Config
+     */
+    function HTMLPurifier_HTMLDefinition(&$config) {
+        $this->config =& $config;
+        $this->manager = new HTMLPurifier_HTMLModuleManager();
+    }
+    
+    /**
+     * Processes internals into form usable by HTMLPurifier internals. 
+     * Modifying the definition after calling this function should not
+     * be done.
+     */
+    function setup() {
+        
+        // multiple call guard
+        if ($this->setup) {return;} else {$this->setup = true;}
+        
+        $this->processModules();
+        $this->setupConfigStuff();
+        
+        unset($this->config);
+        unset($this->manager);
+        
+    }
+    
+    /**
+     * Extract out the information from the manager
+     */
+    function processModules() {
+        
+        $this->manager->setup($this->config);
+        
+        foreach ($this->manager->activeModules as $module) {
+            foreach($module->info_tag_transform         as $k => $v) $this->info_tag_transform[$k]      = $v;
+            foreach($module->info_attr_transform_pre    as $k => $v) $this->info_attr_transform_pre[$k] = $v;
+            foreach($module->info_attr_transform_post   as $k => $v) $this->info_attr_transform_post[$k]= $v;
+        }
+        
+        $this->info = $this->manager->getElements($this->config);
+        $this->info_content_sets = $this->manager->contentSets->lookup;
+        
+    }
+    
+    /**
+     * Sets up stuff based on config. We need a better way of doing this.
+     */
+    function setupConfigStuff() {
+        
+        $block_wrapper = $this->config->get('HTML', 'BlockWrapper');
+        if (isset($this->info_content_sets['Block'][$block_wrapper])) {
+            $this->info_block_wrapper = $block_wrapper;
+        } else {
+            trigger_error('Cannot use non-block element as block wrapper.',
+                E_USER_ERROR);
+        }
+        
+        $parent = $this->config->get('HTML', 'Parent');
+        $def = $this->manager->getElement($parent, $this->config);
+        if ($def) {
+            $this->info_parent = $parent;
+            $this->info_parent_def = $def;
+        } else {
+            trigger_error('Cannot use unrecognized element as parent.',
+                E_USER_ERROR);
+            $this->info_parent_def = $this->manager->getElement(
+                $this->info_parent, $this->config);
+        }
+        
+        // support template text
+        $support = "(for information on implementing this, see the ".
+                   "support forums) ";
+        
+        // setup allowed elements, SubtractiveWhitelist module
+        $allowed_elements = $this->config->get('HTML', 'AllowedElements');
+        if (is_array($allowed_elements)) {
+            foreach ($this->info as $name => $d) {
+                if(!isset($allowed_elements[$name])) unset($this->info[$name]);
+                unset($allowed_elements[$name]);
+            }
+            // emit errors
+            foreach ($allowed_elements as $element => $d) {
+                trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
+            }
+        }
+        
+        $allowed_attributes = $this->config->get('HTML', 'AllowedAttributes');
+        $allowed_attributes_mutable = $allowed_attributes; // by copy!
+        if (is_array($allowed_attributes)) {
+            foreach ($this->info_global_attr as $attr_key => $info) {
+                if (!isset($allowed_attributes["*.$attr_key"])) {
+                    unset($this->info_global_attr[$attr_key]);
+                } elseif (isset($allowed_attributes_mutable["*.$attr_key"])) {
+                    unset($allowed_attributes_mutable["*.$attr_key"]);
+                }
+            }
+            foreach ($this->info as $tag => $info) {
+                foreach ($info->attr as $attr => $attr_info) {
+                    if (!isset($allowed_attributes["$tag.$attr"]) &&
+                        !isset($allowed_attributes["*.$attr"])) {
+                        unset($this->info[$tag]->attr[$attr]);
+                    } else {
+                        // the attribute is in use: mark both the specific and
+                        // the wildcard entry as consumed, so that neither one
+                        // is reported as unsupported in the error pass below
+                        unset($allowed_attributes_mutable["$tag.$attr"]);
+                        unset($allowed_attributes_mutable["*.$attr"]);
+                    }
+                }
+            }
+            // emit errors
+            foreach ($allowed_attributes_mutable as $elattr => $d) {
+                list($element, $attribute) = explode('.', $elattr);
+                if ($element == '*') {
+                    trigger_error("Global attribute '$attribute' is not ".
+                        "supported in any elements $support",
+                        E_USER_WARNING);
+                } else {
+                    trigger_error("Attribute '$attribute' in element '$element' not supported $support",
+                        E_USER_WARNING);
+                }
+            }
+        }
+        
+    }
+    
+    
+}
+
+?>
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule.php
new file mode 100644 (file)
index 0000000..930b605
--- /dev/null
@@ -0,0 +1,125 @@
+<?php
+
+/**
+ * Represents an XHTML 1.1 module, with information on elements, tags
+ * and attributes.
+ * @note Even though this is technically XHTML 1.1, it is also used for
+ *       regular HTML parsing. We are using modularization as a convenient
+ *       way to represent the internals of HTMLDefinition, and our
+ *       implementation is by no means conforming and does not directly
+ *       use the normative DTDs or XML schemas.
+ * @note The public variables in a module should almost directly
+ *       correspond to the variables in HTMLPurifier_HTMLDefinition.
+ *       However, the prefix info carries no special meaning in these
+ *       objects (include it anyway if that's the correspondence though).
+ */
+
+class HTMLPurifier_HTMLModule
+{
+    /**
+     * Short unique string identifier of the module
+     */
+    var $name;
+    
+    /**
+     * Dynamically set integer that specifies when the module was loaded in.
+     */
+    var $order;
+    
+    /**
+     * Informally, a list of elements this module changes. Not used in
+     * any significant way.
+     * @protected
+     */
+    var $elements = array();
+    
+    /**
+     * Associative array of element names to element definitions.
+     * Some definitions may be incomplete, to be merged in later
+     * with the full definition.
+     * @public
+     */
+    var $info = array();
+    
+    /**
+     * Associative array of content set names to content set additions.
+     * This is commonly used to, say, add an A element to the Inline
+     * content set. This corresponds to an internal variable $content_sets
+     * and NOT info_content_sets member variable of HTMLDefinition.
+     * @public
+     */
+    var $content_sets = array();
+    
+    /**
+     * Associative array of attribute collection names to attribute
+     * collection additions. More rarely used for adding attributes to
+     * the global collections. Example is the StyleAttribute module adding
+     * the style attribute to the Core. Corresponds to HTMLDefinition's
+     * attr_collections->info, since the object's data is only info,
+     * with extra behavior associated with it.
+     * @public
+     */
+    var $attr_collections = array();
+    
+    /**
+     * Associative array of deprecated tag name to HTMLPurifier_TagTransform
+     * @public
+     */
+    var $info_tag_transform = array();
+    
+    /**
+     * List of HTMLPurifier_AttrTransform to be performed before validation.
+     * @public
+     */
+    var $info_attr_transform_pre = array();
+    
+    /**
+     * List of HTMLPurifier_AttrTransform to be performed after validation.
+     * @public
+     */
+    var $info_attr_transform_post = array();
+    
+    /**
+     * Boolean flag that indicates whether or not getChildDef is implemented.
+     * For optimization reasons: may save a call to a function. Be sure
+     * to set it if you do implement getChildDef(), otherwise it will have
+     * no effect!
+     * @public
+     */
+    var $defines_child_def = false;
+    
+    /**
+     * Retrieves a proper HTMLPurifier_ChildDef subclass based on 
+     * content_model and content_model_type member variables of
+     * the HTMLPurifier_ElementDef class. There is a similar function
+     * in HTMLPurifier_HTMLDefinition.
+     * @param $def HTMLPurifier_ElementDef instance
+     * @return HTMLPurifier_ChildDef subclass
+     * @public
+     */
+    function getChildDef($def) {return false;}
+    
+    /**
+     * Hook method that lets module perform arbitrary operations on
+     * HTMLPurifier_HTMLDefinition before the module gets processed.
+     * @param $definition Reference to HTMLDefinition being setup
+     */
+    function preProcess(&$definition) {}
+    
+    /**
+     * Hook method that lets module perform arbitrary operations
+     * on HTMLPurifier_HTMLDefinition after the module gets processed.
+     * @param $definition Reference to HTMLDefinition being setup
+     */
+    function postProcess(&$definition) {}
+    
+    /**
+     * Hook method that is called when a module gets registered to
+     * the definition.
+     * @param $definition Reference to HTMLDefinition being setup
+     */
+    function setup(&$definition) {}
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Bdo.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Bdo.php
new file mode 100644 (file)
index 0000000..17e5e98
--- /dev/null
@@ -0,0 +1,43 @@
+<?php
+
+require_once 'HTMLPurifier/HTMLModule.php';
+require_once 'HTMLPurifier/AttrTransform/BdoDir.php';
+
+/**
+ * XHTML 1.1 Bi-directional Text Module, defines elements that
+ * declare directionality of content. Text Extension Module.
+ */
+class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'Bdo';
+    var $elements = array('bdo');
+    var $info = array();
+    var $content_sets = array('Inline' => 'bdo');
+    var $attr_collections = array(
+        'I18N' => array('dir' => false)
+    );
+    
+    function HTMLPurifier_HTMLModule_Bdo() {
+        $dir = new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false);
+        $this->attr_collections['I18N']['dir'] = $dir;
+        $this->info['bdo'] = new HTMLPurifier_ElementDef();
+        $this->info['bdo']->attr = array(
+            0 => array('Core', 'Lang'),
+            'dir' => $dir, // required
+            // The Abstract Module specification has the attribute
+            // inclusions wrong for bdo: bdo allows
+            // xml:lang too (and we'll toss in lang for good measure,
+            // though it is not allowed for XHTML 1.1, this will
+            // be managed with a global attribute transform)
+        );
+        $this->info['bdo']->content_model = '#PCDATA | Inline';
+        $this->info['bdo']->content_model_type = 'optional';
+        // provides fallback behavior if dir's missing (dir is required)
+        $this->info['bdo']->attr_transform_post['required-dir'] =
+            new HTMLPurifier_AttrTransform_BdoDir();
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/CommonAttributes.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/CommonAttributes.php
new file mode 100644 (file)
index 0000000..8f17c2f
--- /dev/null
@@ -0,0 +1,31 @@
+<?php
+
+class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
+{
+    var $name = 'CommonAttributes';
+    
+    var $attr_collections = array(
+        'Core' => array(
+            0 => array('Style'),
+            // 'xml:space' => false,
+            'class' => 'NMTOKENS',
+            'id' => 'ID',
+            'title' => 'CDATA',
+        ),
+        'Lang' => array(
+            'xml:lang' => false, // see constructor
+        ),
+        'I18N' => array(
+            0 => array('Lang'), // proprietary, for xml:lang/lang
+        ),
+        'Common' => array(
+            0 => array('Core', 'I18N')
+        )
+    );
+    
+    function HTMLPurifier_HTMLModule_CommonAttributes() {
+        $this->attr_collections['Lang']['xml:lang'] = new HTMLPurifier_AttrDef_Lang();
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Edit.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Edit.php
new file mode 100644 (file)
index 0000000..6a41590
--- /dev/null
@@ -0,0 +1,46 @@
+<?php
+
+require_once 'HTMLPurifier/HTMLModule.php';
+require_once 'HTMLPurifier/ChildDef/Chameleon.php';
+
+/**
+ * XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
+ * Module.
+ */
+class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'Edit';
+    var $elements = array('del', 'ins');
+    var $info = array();
+    var $content_sets = array('Inline' => 'del | ins');
+    
+    function HTMLPurifier_HTMLModule_Edit() {
+        foreach ($this->elements as $element) {
+            $this->info[$element] = new HTMLPurifier_ElementDef();
+            $this->info[$element]->attr = array(
+                0 => array('Common'),
+                'cite' => 'URI',
+                // 'datetime' => 'Datetime' // Datetime not implemented
+            );
+            // Inline context ! Block context (exclamation mark is
+            // separator, see getChildDef for parsing)
+            $this->info[$element]->content_model =
+                '#PCDATA | Inline ! #PCDATA | Flow';
+            // HTML 4.01 specifies that ins/del must not contain block
+            // elements when used in an inline context, chameleon is
+            // a complicated workaround to achieve this effect
+            $this->info[$element]->content_model_type = 'chameleon';
+        }
+    }
+    
+    var $defines_child_def = true;
+    function getChildDef($def) {
+        if ($def->content_model_type != 'chameleon') return false;
+        $value = explode('!', $def->content_model);
+        return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Hypertext.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Hypertext.php
new file mode 100644 (file)
index 0000000..e285e8b
--- /dev/null
@@ -0,0 +1,37 @@
+<?php
+
+require_once 'HTMLPurifier/HTMLModule.php';
+require_once 'HTMLPurifier/AttrDef/HTML/LinkTypes.php';
+
+/**
+ * XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
+ */
+class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'Hypertext';
+    var $elements = array('a');
+    var $info = array();
+    var $content_sets = array('Inline' => 'a');
+    
+    function HTMLPurifier_HTMLModule_Hypertext() {
+        $this->info['a'] = new HTMLPurifier_ElementDef();
+        $this->info['a']->attr = array(
+            0 => array('Common'),
+            // 'accesskey' => 'Character',
+            // 'charset' => 'Charset',
+            'href' => 'URI',
+            //'hreflang' => 'LanguageCode',
+            'rel' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rel'),
+            'rev' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rev'),
+            //'tabindex' => 'Number',
+            //'type' => 'ContentType',
+        );
+        $this->info['a']->content_model = '#PCDATA | Inline';
+        $this->info['a']->content_model_type = 'optional';
+        $this->info['a']->excludes = array('a' => true);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Image.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Image.php
new file mode 100644 (file)
index 0000000..3852836
--- /dev/null
@@ -0,0 +1,38 @@
+<?php
+
+require_once 'HTMLPurifier/HTMLModule.php';
+
+require_once 'HTMLPurifier/AttrDef/URI.php';
+require_once 'HTMLPurifier/AttrTransform/ImgRequired.php';
+
+/**
+ * XHTML 1.1 Image Module provides basic image embedding.
+ * @note There is specialized code for removing empty images in
+ *       HTMLPurifier_Strategy_RemoveForeignElements
+ */
+class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'Image';
+    var $elements = array('img');
+    var $info = array();
+    var $content_sets = array('Inline' => 'img');
+    
+    function HTMLPurifier_HTMLModule_Image() {
+        $this->info['img'] = new HTMLPurifier_ElementDef();
+        $this->info['img']->attr = array(
+            0 => array('Common'),
+            'alt' => 'Text',
+            'height' => 'Length',
+            'longdesc' => 'URI', 
+            'src' => new HTMLPurifier_AttrDef_URI(true), // embedded
+            'width' => 'Length'
+        );
+        $this->info['img']->content_model_type = 'empty';
+        $this->info['img']->attr_transform_post[] =
+            new HTMLPurifier_AttrTransform_ImgRequired();
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Legacy.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Legacy.php
new file mode 100644 (file)
index 0000000..a0613a2
--- /dev/null
@@ -0,0 +1,60 @@
+<?php
+
+/**
+ * XHTML 1.1 Legacy module defines elements that were previously 
+ * deprecated.
+ * 
+ * @note Not all legacy elements have been implemented yet, which
+ *       is a bit of a reverse problem as compared to browsers! In
+ *       addition, this legacy module may implement a bit more than
+ *       mandated by XHTML 1.1.
+ * 
+ * This module can be used in combination with TransformToStrict in order
+ * to transform as many deprecated elements as possible, but retain
+ * questionably deprecated elements that do not have good alternatives
+ * as well as transform elements that don't have an implementation.
+ * See docs/ref-strictness.txt for more details.
+ */
+
+class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
+{
+    
+    // incomplete
+    
+    var $name = 'Legacy';
+    var $elements = array('u', 's', 'strike');
+    var $non_standalone_elements = array('li', 'ol', 'address', 'blockquote');
+    
+    function HTMLPurifier_HTMLModule_Legacy() {
+        // setup new elements
+        foreach ($this->elements as $name) {
+            $this->info[$name] = new HTMLPurifier_ElementDef();
+            // for u, s, strike, as more elements get added, add
+            // conditionals as necessary
+            $this->info[$name]->content_model = 'Inline | #PCDATA';
+            $this->info[$name]->content_model_type = 'optional';
+            $this->info[$name]->attr[0] = array('Common');
+        }
+        
+        // setup modifications to old elements
+        foreach ($this->non_standalone_elements as $name) {
+            $this->info[$name] = new HTMLPurifier_ElementDef();
+            $this->info[$name]->standalone = false;
+        }
+        
+        $this->info['li']->attr['value'] = new HTMLPurifier_AttrDef_Integer();
+        $this->info['ol']->attr['start'] = new HTMLPurifier_AttrDef_Integer();
+        
+        $this->info['address']->content_model = 'Inline | #PCDATA | p';
+        $this->info['address']->content_model_type = 'optional';
+        $this->info['address']->child = false;
+        
+        $this->info['blockquote']->content_model = 'Flow | #PCDATA';
+        $this->info['blockquote']->content_model_type = 'optional';
+        $this->info['blockquote']->child = false;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/List.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/List.php
new file mode 100644 (file)
index 0000000..c74982d
--- /dev/null
@@ -0,0 +1,46 @@
+<?php
+
+require_once 'HTMLPurifier/HTMLModule.php';
+
+/**
+ * XHTML 1.1 List Module, defines list-oriented elements. Core Module.
+ */
+class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'List';
+    var $elements = array('dl', 'dt', 'dd', 'ol', 'ul', 'li');
+    var $info = array(); // element name => HTMLPurifier_ElementDef, filled in by the constructor
+    // According to the abstract schema, the List content set is a fully formed
+    // one or more expr, but it invariably occurs in an optional declaration
+    // so we're not going to do that subtlety. It might cause trouble
+    // if a user defines "List" and expects that multiple lists are
+    // allowed to be specified, but then again, that's not very intuitive.
+    // Furthermore, the actual XML Schema may disagree. Regardless,
+    // we don't have support for such nested expressions without using
+    // the incredibly inefficient and draconian Custom ChildDef.
+    var $content_sets = array('List' => 'dl | ol | ul', 'Flow' => 'List');
+    
+    function HTMLPurifier_HTMLModule_List() { // PHP 4 style constructor: builds every element definition
+        foreach ($this->elements as $element) {
+            $this->info[$element] = new HTMLPurifier_ElementDef();
+            $this->info[$element]->attr = array(0 => array('Common'));
+            if ($element == 'li' || $element == 'dd') {
+                $this->info[$element]->content_model = '#PCDATA | Flow';
+                $this->info[$element]->content_model_type = 'optional';
+            } elseif ($element == 'ol' || $element == 'ul') {
+                $this->info[$element]->content_model = 'li';
+                $this->info[$element]->content_model_type = 'required';
+            }
+        }
+        $this->info['dt']->content_model = '#PCDATA | Inline'; // dt and dl are not covered by the loop's branches
+        $this->info['dt']->content_model_type = 'optional';
+        $this->info['dl']->content_model = 'dt | dd';
+        $this->info['dl']->content_model_type = 'required';
+        // presumably: a new li implicitly closes a still-open li (TODO confirm); this could be a LOT more robust
+        $this->info['li']->auto_close = array('li' => true);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Presentation.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Presentation.php
new file mode 100644 (file)
index 0000000..42d9c11
--- /dev/null
@@ -0,0 +1,41 @@
+<?php
+
+require_once 'HTMLPurifier/HTMLModule.php';
+
+/**
+ * XHTML 1.1 Presentation Module, defines simple presentation-related
+ * markup. Text Extension Module.
+ * @note The official XML Schema and DTD specs further divide this into
+ *       two modules:
+ *          - Block Presentation (hr)
+ *          - Inline Presentation (b, big, i, small, sub, sup, tt)
+ *       We have chosen not to heed this distinction, as content_sets
+ *       provides satisfactory disambiguation.
+ */
+class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'Presentation';
+    var $elements = array('b', 'big', 'hr', 'i', 'small', 'sub', 'sup', 'tt');
+    var $info = array(); // element name => HTMLPurifier_ElementDef, filled in by the constructor
+    var $content_sets = array(
+        'Block' => 'hr', // the only block-level element of this module
+        'Inline' => 'b | big | i | small | sub | sup | tt'
+    );
+    
+    function HTMLPurifier_HTMLModule_Presentation() { // PHP 4 style constructor
+        foreach ($this->elements as $element) {
+            $this->info[$element] = new HTMLPurifier_ElementDef();
+            $this->info[$element]->attr = array(0 => array('Common'));
+            if ($element == 'hr') {
+                $this->info[$element]->content_model_type = 'empty'; // hr gets no content model string
+            } else {
+                $this->info[$element]->content_model = '#PCDATA | Inline';
+                $this->info[$element]->content_model_type = 'optional';
+            }
+        }
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/StyleAttribute.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/StyleAttribute.php
new file mode 100644 (file)
index 0000000..5ee5d1c
--- /dev/null
@@ -0,0 +1,27 @@
+<?php
+
+require_once 'HTMLPurifier/HTMLModule.php';
+require_once 'HTMLPurifier/AttrDef/CSS.php';
+
+/**
+ * XHTML 1.1 Style Attribute Module, defines the style attribute and adds
+ * it to the Core attribute collection.
+ */
+class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'StyleAttribute';
+    var $attr_collections = array(
+        // The inclusion routine differs from the Abstract Modules but
+        // is in line with the DTD and XML Schemas.
+        'Style' => array('style' => false), // placeholder, replaced with an AttrDef_CSS instance in the constructor
+        'Core' => array(0 => array('Style')) // index 0 lists collections to include, as with array('Common') elsewhere
+    );
+    
+    function HTMLPurifier_HTMLModule_StyleAttribute() { // PHP 4 style constructor
+        $this->attr_collections['Style']['style'] = new HTMLPurifier_AttrDef_CSS(); // objects cannot be built in a var initializer
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tables.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tables.php
new file mode 100644 (file)
index 0000000..ea41f5b
--- /dev/null
@@ -0,0 +1,88 @@
+<?php
+
+require_once 'HTMLPurifier/HTMLModule.php';
+require_once 'HTMLPurifier/ChildDef/Table.php';
+
+/**
+ * XHTML 1.1 Tables Module, fully defines accessible table elements.
+ */
+class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'Tables';
+    var $elements = array('caption', 'table', 'td', 'th', 'tr', 'col',
+        'colgroup', 'tbody', 'thead', 'tfoot');
+    var $info = array(); // element name => HTMLPurifier_ElementDef, filled in by the constructor
+    var $content_sets = array('Block' => 'table');
+    
+    function HTMLPurifier_HTMLModule_Tables() { // PHP 4 style constructor
+        foreach ($this->elements as $e) {
+            $this->info[$e] = new HTMLPurifier_ElementDef();
+            $this->info[$e]->attr = array(0 => array('Common'));
+            $attr =& $this->info[$e]->attr; // reference, so the writes below land in the element definition
+            if ($e == 'caption') continue; // caption only takes the Common attributes
+            if ($e == 'table'){
+                $attr['border'] = 'Pixels';
+                $attr['cellpadding'] = 'Length';
+                $attr['cellspacing'] = 'Length';
+                $attr['frame'] = new HTMLPurifier_AttrDef_Enum(array(
+                    'void', 'above', 'below', 'hsides', 'lhs', 'rhs',
+                    'vsides', 'box', 'border'
+                ), false);
+                $attr['rules'] = new HTMLPurifier_AttrDef_Enum(array(
+                    'none', 'groups', 'rows', 'cols', 'all'
+                ), false);
+                $attr['summary'] = 'Text';
+                $attr['width'] = 'Length';
+                continue; // table does not receive the align/valign/charoff attributes below
+            }
+            if ($e == 'col' || $e == 'colgroup') {
+                $attr['span'] = 'Number';
+                $attr['width'] = 'MultiLength';
+            }
+            if ($e == 'td' || $e == 'th') {
+                $attr['abbr'] = 'Text';
+                $attr['colspan'] = 'Number';
+                $attr['rowspan'] = 'Number';
+            }
+            $attr['align'] = new HTMLPurifier_AttrDef_Enum(array( // shared by every element except caption and table
+                'left', 'center', 'right', 'justify', 'char'
+            ), false);
+            $attr['valign'] = new HTMLPurifier_AttrDef_Enum(array(
+                'top', 'middle', 'bottom', 'baseline'
+            ), false);
+            $attr['charoff'] = 'Length';
+        }
+        $this->info['caption']->content_model = '#PCDATA | Inline';
+        $this->info['caption']->content_model_type = 'optional';
+        
+        // Is done directly because it doesn't leverage substitution
+        // mechanisms. True model is:
+        // 'caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))'
+        $this->info['table']->child = new HTMLPurifier_ChildDef_Table();
+        
+        $this->info['td']->content_model = 
+        $this->info['th']->content_model = '#PCDATA | Flow'; // chained assignment: td and th share one model
+        $this->info['td']->content_model_type = 
+        $this->info['th']->content_model_type = 'optional';
+        
+        $this->info['tr']->content_model = 'td | th';
+        $this->info['tr']->content_model_type = 'required';
+        
+        $this->info['col']->content_model_type = 'empty';
+        
+        $this->info['colgroup']->content_model = 'col';
+        $this->info['colgroup']->content_model_type = 'optional';
+        
+        $this->info['tbody']->content_model = 
+        $this->info['thead']->content_model = 
+        $this->info['tfoot']->content_model = 'tr'; // chained assignment: all three row groups contain tr
+        $this->info['tbody']->content_model_type = 
+        $this->info['thead']->content_model_type = 
+        $this->info['tfoot']->content_model_type = 'required';
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Text.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Text.php
new file mode 100644 (file)
index 0000000..bac0598
--- /dev/null
@@ -0,0 +1,78 @@
+<?php
+
+require_once 'HTMLPurifier/HTMLModule.php';
+
+/**
+ * XHTML 1.1 Text Module, defines basic text containers. Core Module.
+ * @note In the normative XML Schema specification, this module
+ *       is further abstracted into the following modules:
+ *          - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
+ *          - Block Structural (div, p)
+ *          - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
+ *          - Inline Structural (br, span)
+ *       We have elected not to follow suit, but this may change.
+ */
+class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'Text';
+    
+    var $elements = array('abbr', 'acronym', 'address', 'blockquote',
+        'br', 'cite', 'code', 'dfn', 'div', 'em', 'h1', 'h2', 'h3',
+        'h4', 'h5', 'h6', 'kbd', 'p', 'pre', 'q', 'samp', 'span', 'strong',
+        'var', 'nolink', 'tex', 'algebra'); //moodle modification: nolink, tex and algebra are added
+    
+    var $info = array(); // element name => HTMLPurifier_ElementDef, filled in by the constructor
+    
+    var $content_sets = array(
+        'Heading' => 'h1 | h2 | h3 | h4 | h5 | h6',
+        'Block' => 'address | blockquote | div | p | pre | nolink | tex | algebra', //moodle modification
+        'Inline' => 'abbr | acronym | br | cite | code | dfn | em | kbd | q | samp | span | strong | var',
+        'Flow' => 'Heading | Block | Inline'
+    );
+    
+    function HTMLPurifier_HTMLModule_Text() { // PHP 4 style constructor
+        foreach ($this->elements as $element) {
+            $this->info[$element] = new HTMLPurifier_ElementDef();
+            // attributes: br gets Core only, blockquote and q also get cite, the rest get Common
+            if ($element == 'br') {
+                $this->info[$element]->attr = array(0 => array('Core'));
+            } elseif ($element == 'blockquote' || $element == 'q') {
+                $this->info[$element]->attr = array(0 => array('Common'), 'cite' => 'URI');
+            } else {
+                $this->info[$element]->attr = array(0 => array('Common'));
+            }
+            // content models: inline content is the default, with br, blockquote and div as exceptions
+            if ($element == 'br') {
+                $this->info[$element]->content_model_type = 'empty';
+            } elseif ($element == 'blockquote') {
+                $this->info[$element]->content_model = 'Heading | Block | List';
+                $this->info[$element]->content_model_type = 'optional';
+            } elseif ($element == 'div') {
+                $this->info[$element]->content_model = '#PCDATA | Flow';
+                $this->info[$element]->content_model_type = 'optional';
+            } else {
+                $this->info[$element]->content_model = '#PCDATA | Inline';
+                $this->info[$element]->content_model_type = 'optional';
+            }
+        }
+        // SGML permits exclusions for all descendants, but this is
+        // not possible with DTDs or XML Schemas. W3C has elected to
+        // use complicated compositions of content_models to simulate
+        // exclusion for children, but we go the simpler, SGML-style
+        // route of flat-out exclusions. Note that the Abstract Module
+        // is blithely unaware of such distinctions.
+        $this->info['pre']->excludes = array_flip(array( // array_flip turns the list into name => index for isset() lookups
+            'img', 'big', 'small',
+            'object', 'applet', 'font', 'basefont' // generally not allowed
+        ));
+        $this->info['p']->auto_close = array_flip(array( // presumably: these opening tags implicitly end an open p - TODO confirm
+            'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt',
+            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre',
+            'table', 'ul', 'nolink', 'tex', 'algebra' //moodle modification
+        ));
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToStrict.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToStrict.php
new file mode 100644 (file)
index 0000000..cdbe373
--- /dev/null
@@ -0,0 +1,108 @@
+<?php
+
+require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php';
+
+require_once 'HTMLPurifier/TagTransform/Simple.php';
+require_once 'HTMLPurifier/TagTransform/Center.php';
+require_once 'HTMLPurifier/TagTransform/Font.php';
+
+require_once 'HTMLPurifier/AttrTransform/Lang.php';
+require_once 'HTMLPurifier/AttrTransform/TextAlign.php';
+require_once 'HTMLPurifier/AttrTransform/BgColor.php';
+require_once 'HTMLPurifier/AttrTransform/Border.php';
+require_once 'HTMLPurifier/AttrTransform/Name.php';
+require_once 'HTMLPurifier/AttrTransform/Length.php';
+
+/**
+ * Proprietary module that transforms deprecated elements into Strict
+ * HTML (see HTML 4.01 and XHTML 1.0) when possible.
+ */
+
+class HTMLPurifier_HTMLModule_TransformToStrict extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'TransformToStrict';
+    
+    // we're actually modifying these elements, not defining them
+    var $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p',
+        'blockquote', 'table', 'td', 'th', 'tr', 'img', 'a', 'hr');
+    
+    var $info_tag_transform = array(
+        // placeholders, see constructor for definitions
+        'font'  => false,
+        'menu'  => false,
+        'dir'   => false,
+        'center'=> false
+    );
+    
+    var $attr_collections = array(
+        'Lang' => array(
+            'lang' => false // placeholder, replaced with an AttrDef_Lang instance in the constructor
+        )
+    );
+    
+    var $info_attr_transform_post = array(
+        'lang' => false // placeholder, replaced with an AttrTransform_Lang instance in the constructor
+    );
+    
+    function HTMLPurifier_HTMLModule_TransformToStrict() { // PHP 4 style constructor
+        
+        // deprecated tag transforms (objects cannot be built in the var initializers above)
+        $this->info_tag_transform['font']   = new HTMLPurifier_TagTransform_Font();
+        $this->info_tag_transform['menu']   = new HTMLPurifier_TagTransform_Simple('ul');
+        $this->info_tag_transform['dir']    = new HTMLPurifier_TagTransform_Simple('ul');
+        $this->info_tag_transform['center'] = new HTMLPurifier_TagTransform_Center();
+        
+        foreach ($this->elements as $name) {
+            $this->info[$name] = new HTMLPurifier_ElementDef();
+            $this->info[$name]->standalone = false; // modification of an existing element, not a full definition
+        }
+        
+        // deprecated attribute transforms; the chained assignment gives all seven elements the same transform
+        $this->info['h1']->attr_transform_pre['align'] =
+        $this->info['h2']->attr_transform_pre['align'] =
+        $this->info['h3']->attr_transform_pre['align'] =
+        $this->info['h4']->attr_transform_pre['align'] =
+        $this->info['h5']->attr_transform_pre['align'] =
+        $this->info['h6']->attr_transform_pre['align'] =
+        $this->info['p'] ->attr_transform_pre['align'] = 
+                    new HTMLPurifier_AttrTransform_TextAlign();
+        
+        // xml:lang <=> lang mirroring, implement in TransformToStrict,
+        // this is overridden in TransformToXHTML11
+        $this->info_attr_transform_post['lang'] = new HTMLPurifier_AttrTransform_Lang();
+        $this->attr_collections['Lang']['lang'] = new HTMLPurifier_AttrDef_Lang();
+        
+        // this should not be applied to XHTML 1.0 Transitional, ONLY
+        // XHTML 1.0 Strict. We may need three classes
+        $this->info['blockquote']->content_model_type = 'strictblockquote'; // custom type, resolved by getChildDef() below
+        $this->info['blockquote']->child = false; // recalculate please!
+        
+        $this->info['table']->attr_transform_pre['bgcolor'] = 
+        $this->info['tr']->attr_transform_pre['bgcolor'] = 
+        $this->info['td']->attr_transform_pre['bgcolor'] = 
+        $this->info['th']->attr_transform_pre['bgcolor'] = new HTMLPurifier_AttrTransform_BgColor();
+        
+        $this->info['img']->attr_transform_pre['border'] = new HTMLPurifier_AttrTransform_Border();
+        
+        $this->info['img']->attr_transform_pre['name'] = 
+        $this->info['a']->attr_transform_pre['name'] = new HTMLPurifier_AttrTransform_Name();
+        
+        $this->info['td']->attr_transform_pre['width'] = 
+        $this->info['th']->attr_transform_pre['width'] = 
+        $this->info['hr']->attr_transform_pre['width'] = new HTMLPurifier_AttrTransform_Length('width');
+        
+        $this->info['td']->attr_transform_pre['height'] = 
+        $this->info['th']->attr_transform_pre['height'] = new HTMLPurifier_AttrTransform_Length('height');
+        
+    }
+    
+    var $defines_child_def = true; // presumably tells the caller to consult getChildDef() - flag is read outside this file, confirm
+    function getChildDef($def) { // returns a StrictBlockquote child def for 'strictblockquote' models, false otherwise
+        if ($def->content_model_type != 'strictblockquote') return false;
+        return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToXHTML11.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToXHTML11.php
new file mode 100644 (file)
index 0000000..0915f5b
--- /dev/null
@@ -0,0 +1,30 @@
+<?php
+
+/**
+ * Proprietary module that transforms XHTML 1.0 deprecated aspects into
+ * XHTML 1.1 compliant ones, when possible. For maximum effectiveness,
+ * HTMLPurifier_HTMLModule_TransformToStrict must also be loaded
+ * (otherwise, elements that were deprecated from Transitional to Strict
+ * will not be transformed).
+ * 
+ * XHTML 1.1 compliant documents are automatically XHTML 1.0 compliant too,
+ * although they may not be as friendly to legacy browsers.
+ */
+
+class HTMLPurifier_HTMLModule_TransformToXHTML11 extends HTMLPurifier_HTMLModule
+{
+    
+    var $name = 'TransformToXHTML11';
+    var $attr_collections = array(
+        'Lang' => array(
+            'lang' => false // remove it (overrides the lang attribute that TransformToStrict defines)
+        )
+    );
+    
+    var $info_attr_transform_post = array(
+        'lang' => false // remove it (overrides the lang transform that TransformToStrict defines)
+    );
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php b/lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php
new file mode 100644 (file)
index 0000000..e009047
--- /dev/null
@@ -0,0 +1,558 @@
+<?php
+
+require_once 'HTMLPurifier/HTMLModule.php';
+require_once 'HTMLPurifier/ElementDef.php';
+
+require_once 'HTMLPurifier/ContentSets.php';
+require_once 'HTMLPurifier/AttrTypes.php';
+require_once 'HTMLPurifier/AttrCollections.php';
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/AttrDef/Enum.php';
+
+// W3C modules
+require_once 'HTMLPurifier/HTMLModule/CommonAttributes.php';
+require_once 'HTMLPurifier/HTMLModule/Text.php';
+require_once 'HTMLPurifier/HTMLModule/Hypertext.php';
+require_once 'HTMLPurifier/HTMLModule/List.php';
+require_once 'HTMLPurifier/HTMLModule/Presentation.php';
+require_once 'HTMLPurifier/HTMLModule/Edit.php';
+require_once 'HTMLPurifier/HTMLModule/Bdo.php';
+require_once 'HTMLPurifier/HTMLModule/Tables.php';
+require_once 'HTMLPurifier/HTMLModule/Image.php';
+require_once 'HTMLPurifier/HTMLModule/StyleAttribute.php';
+require_once 'HTMLPurifier/HTMLModule/Legacy.php';
+
+// proprietary modules
+require_once 'HTMLPurifier/HTMLModule/TransformToStrict.php';
+require_once 'HTMLPurifier/HTMLModule/TransformToXHTML11.php';
+
+HTMLPurifier_ConfigSchema::define(
+    'HTML', 'Doctype', null, 'string/null',
+    'Doctype to use, valid values are HTML 4.01 Transitional, HTML 4.01 '.
+    'Strict, XHTML 1.0 Transitional, XHTML 1.0 Strict, XHTML 1.1. '.
+    'Technically speaking this is not actually a doctype (as it does '.
+    'not identify a corresponding DTD), but we are using this name '.
+    'for sake of simplicity. This will override any older directives '.
+    'like %Core.XHTML or %HTML.Strict.'
+);
+
+class HTMLPurifier_HTMLModuleManager
+{
+    
+    /**
+     * Array of HTMLPurifier_HTMLModule instances, indexed by the module's $name property.
+     * All known modules, regardless of use, are in this array.
+     */
+    var $modules = array();
+    
+    /**
+     * String doctype we will validate against. See $validModules for use.
+     * 
+     * @note
+     * There is a special doctype '*' that acts both as the "default"
+     * doctype if a customized system only defines one doctype and
+     * also a catch-all doctype that gets merged into all the other
+     * module collections. When possible, use a private collection to
+     * share modules between doctypes: this special doctype is to
+     * make life more convenient for users.
+     */
+    var $doctype;
+    var $doctypeAliases = array(); /**< Lookup array of strings to real doctypes */
+    
+    /**
+     * Associative array: $collections[$type][$doctype] = list of modules.
+     * This is used to logically separate types of functionality so that
+     * based on the doctype and other configuration settings they may
+     * be easily switched on and off. Custom setups may not need
+     * to use this abstraction, opting to have only one big collection
+     * with one valid doctype.
+     */
+    var $collections = array();
+    
+    /**
+     * Modules that may be used in a valid doctype of this kind.
+     * Correctional and leniency modules should not be placed in this
+     * array unless the user said so: don't stuff every possible lenient
+     * module for this doctype in here.
+     */
+    var $validModules = array();
+    var $validCollections = array(); /**< Collections to merge into $validModules */
+    
+    /**
+     * Modules that we will allow in input, subset of $validModules. Single
+     * element definitions may result in us consulting validModules.
+     */
+    var $activeModules = array();
+    var $activeCollections = array(); /**< Collections to merge into $activeModules */
+    
+    var $counter = 0; /**< Designates next available integer order for modules. */
+    var $initialized = false; /**< Says whether initialize() was called */
+    
+    /**
+     * Specifies what doctype to siphon new modules from addModule() to,
+     * or false to disable the functionality. Must be used in conjunction
+     * with $autoCollection.
+     */
+    var $autoDoctype = false;
+    /**
+     * Specifies what collection to siphon new modules from addModule() to,
+     * or false to disable the functionality. Must be used in conjunction
+     * with $autoDoctype.
+     */
+    var $autoCollection = false;
+    
+    /** Associative array of element name to defining modules (always array) */
+    var $elementLookup = array();
+    
+    /** List of prefixes we should use for resolving small names */
+    var $prefixes = array('HTMLPurifier_HTMLModule_');
+    
+    var $contentSets; /**< Instance of HTMLPurifier_ContentSets */
+    var $attrTypes; /**< Instance of HTMLPurifier_AttrTypes */
+    var $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */
+    
+    /**
+     * @param $blank If true, don't do any initializing
+     */
+    function HTMLPurifier_HTMLModuleManager($blank = false) { // PHP 4 style constructor
+        
+        // the only editable internal object. The rest need to
+        // be manipulated through modules
+        $this->attrTypes = new HTMLPurifier_AttrTypes();
+        
+        if (!$blank) $this->initialize(); // default: register the bundled modules and collections
+        
+    }
+    
+    function initialize() { // registers the bundled modules and the default collections
+        $this->initialized = true;
+        
+        // load default modules to the recognized modules list (not active)
+        $modules = array(
+            // define
+            'CommonAttributes',
+            'Text', 'Hypertext', 'List', 'Presentation',
+            'Edit', 'Bdo', 'Tables', 'Image', 'StyleAttribute',
+            // define-redefine
+            'Legacy',
+            // redefine
+            'TransformToStrict', 'TransformToXHTML11'
+        );
+        foreach ($modules as $module) {
+            $this->addModule($module); // short names are resolved against $this->prefixes
+        }
+        
+        // Safe modules for supported doctypes. These are included
+        // in the valid and active module lists by default.
+        // A nested array at index 0 appears to name other lists to include (see processCollections).
+        $this->collections['Safe'] = array(
+            '_Common' => array( // leading _ indicates private
+                'CommonAttributes', 'Text', 'Hypertext', 'List',
+                'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
+                'StyleAttribute'
+            ),
+            // HTML definitions, defer to XHTML definitions
+            'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')),
+            'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')),
+            // XHTML definitions
+            'XHTML 1.0 Transitional' => array( array('XHTML 1.0 Strict'), 'Legacy' ),
+            'XHTML 1.0 Strict' => array(array('_Common')),
+            'XHTML 1.1' => array(array('_Common')),
+        );
+        
+        // Modules that specify elements that are unsafe from untrusted
+        // third-parties. These should be registered in $validModules but
+        // almost never $activeModules unless you really know what you're
+        // doing.
+        $this->collections['Unsafe'] = array();
+        
+        // Modules to import if lenient mode (attempt to convert everything
+        // to a valid representation) is on. These must not be in $validModules
+        // unless specified so.
+        $this->collections['Lenient'] = array(
+            'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')),
+            'XHTML 1.0 Strict' => array('TransformToStrict'),
+            'XHTML 1.1' => array(array('XHTML 1.0 Strict'), 'TransformToXHTML11')
+        );
+        
+        // Modules to import if correctional mode (correct everything that
+        // is feasible to strict mode) is on. These must not be in $validModules
+        // unless specified so.
+        $this->collections['Correctional'] = array(
+            'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')),
+            'XHTML 1.0 Transitional' => array('TransformToStrict'), // probably want a different one
+        );
+        
+        // User-space modules, custom code or whatever
+        $this->collections['Extension'] = array();
+        
+        // setup active versus valid modules. ORDER IS IMPORTANT!
+        // definition modules
+        $this->makeCollectionActive('Safe');
+        $this->makeCollectionValid('Unsafe');
+        // redefinition modules
+        $this->makeCollectionActive('Lenient');
+        $this->makeCollectionActive('Correctional');
+        
+        $this->autoDoctype    = '*'; // set last: only modules added after this point are auto-filed by addModule()
+        $this->autoCollection = 'Extension'; // i.e. later addModule() calls land in collections['Extension']['*']
+        
+    }
+    
+    /**
+     * Adds a module to the recognized module list. This does not
+     * do anything else: the module must be added to a corresponding
+     * collection to be "activated".
+     * @param $module Mixed: string module name, with or without
+     *                HTMLPurifier_HTMLModule prefix, or instance of
+     *                subclass of HTMLPurifier_HTMLModule.
+     */
+    function addModule($module) {
+        if (is_string($module)) {
+            $original_module = $module; // kept for prefix resolution and the error message
+            if (!class_exists($module)) { // not a full class name: try each registered prefix in order
+                foreach ($this->prefixes as $prefix) {
+                    $module = $prefix . $original_module;
+                    if (class_exists($module)) break;
+                }
+            }
+            if (!class_exists($module)) { // no prefix produced an existing class
+                trigger_error($original_module . ' module does not exist',
+                    E_USER_ERROR);
+                return; // only reached when a custom error handler lets execution continue
+            }
+            $module = new $module(); // instantiate the resolved class with no arguments
+        }
+        $module->order = $this->counter++; // assign then increment
+        $this->modules[$module->name] = $module; // keyed by the module's $name; a later module with the same name replaces it
+        if ($this->autoDoctype !== false && $this->autoCollection !== false) { // both must be set for auto-filing
+            $this->collections[$this->autoCollection][$this->autoDoctype][] = $module->name;
+        }
+    }
+    
+    /**
+     * Makes a collection active, while also making it valid if not
+     * already done so. See $activeModules for the semantics of "active".
+     * @param $collection_name Name of collection to activate
+     */
+    function makeCollectionActive($collection_name) {
+        if (!in_array($collection_name, $this->validCollections)) { // active implies valid, so register it as valid first
+            $this->makeCollectionValid($collection_name);
+        }
+        $this->activeCollections[] = $collection_name; // appended unconditionally: repeated calls add duplicates
+    }
+    
+    /**
+     * Makes a collection valid. See $validModules for the semantics of "valid"
+     */
+    function makeCollectionValid($collection_name) {
+        $this->validCollections[] = $collection_name; // appended without a duplicate check
+    }
+    
+    /**
+     * Adds a class prefix that addModule() will use to resolve a
+     * string name to a concrete class
+     */
+    function addPrefix($prefix) {
+        $this->prefixes[] = (string) $prefix; // appended, so earlier prefixes are tried first by addModule()
+    }
+    
+    function setup($config) { // resolves the doctype and builds the module lists, lookup table and shared sets
+        
+        // load up the autocollection
+        if ($this->autoCollection !== false) {
+            $this->makeCollectionActive($this->autoCollection);
+        }
+        
+        // retrieve the doctype, then map it through any registered alias
+        $this->doctype = $this->getDoctype($config);
+        if (isset($this->doctypeAliases[$this->doctype])) {
+            $this->doctype = $this->doctypeAliases[$this->doctype];
+        }
+        
+        // process module collections to module name => module instance form
+        foreach ($this->collections as $col_i => $x) {
+            $this->processCollections($this->collections[$col_i]); // indexed access: the collection is passed by reference
+        }
+        
+        $this->validModules  = $this->assembleModules($this->validCollections);
+        $this->activeModules = $this->assembleModules($this->activeCollections);
+        
+        // setup lookup table based on all valid modules
+        foreach ($this->validModules as $module) {
+            foreach ($module->info as $name => $def) {
+                if (!isset($this->elementLookup[$name])) {
+                    $this->elementLookup[$name] = array();
+                }
+                $this->elementLookup[$name][] = $module->name; // one element may be listed by several modules
+            }
+        }
+        
+        // note the different choice: content sets use the valid modules, attribute collections the active ones
+        $this->contentSets = new HTMLPurifier_ContentSets(
+            // content models that contain non-allowed elements are 
+            // harmless because RemoveForeignElements will ensure
+            // they never get in anyway, and there is usually no
+            // reason why you should want to restrict a content
+            // model beyond what is mandated by the doctype.
+            // Note, however, that this means redefinitions of
+            // content models can't be tossed in validModules willy-nilly:
+            // that stuff still is regulated by configuration.
+            $this->validModules
+        );
+        $this->attrCollections = new HTMLPurifier_AttrCollections(
+            $this->attrTypes,
+            // only explicitly allowed modules are allowed to affect
+            // the global attribute collections. This means there's
+            // a distinction between loading the Bdo module, and the
+            // bdo element: Bdo will enable the dir attribute on all
+            // elements, while bdo will only define the bdo element,
+            // which will not have an editable directionality. This might
+            // catch people who are loading only elements by surprise, so
+            // we should consider loading an entire module if all the
+            // elements it defines are requested by the user, especially
+            // if it affects the global attribute collections.
+            $this->activeModules
+        );
+        
+    }
+    
+    /**
+     * Takes a list of collections and merges together all the defined
+     * modules for the current doctype from those collections.
+     * @param $collections List of collection suffixes we should grab
+     *                     modules from (like 'Safe' or 'Lenient')
+     */
+    function assembleModules($collections) {
+        // module name => module; the += array union keeps the first entry
+        // seen for a key, so earlier collections win over later ones
+        $assembled = array();
+        // number of collections that had an entry for the current doctype
+        $matched = 0;
+        
+        foreach ($collections as $name) {
+            if (!isset($this->collections[$name])) {
+                trigger_error("$name collection is undefined", E_USER_ERROR);
+                continue;
+            }
+            $by_doctype = $this->collections[$name];
+            
+            // the catch-all doctype is merged in unless we already are the
+            // catch-all doctype, or the specific doctype opts out below
+            $use_catch_all = $this->doctype !== '*' && isset($by_doctype['*']);
+            
+            if (isset($by_doctype[$this->doctype])) {
+                $specific = $by_doctype[$this->doctype];
+                if (isset($specific['*'])) {
+                    // a '*' key is a flag rather than a module: strip it
+                    // and skip the catch-all doctype for this collection
+                    unset($specific['*']);
+                    $use_catch_all = false;
+                }
+                $assembled += $specific;
+                $matched++;
+            }
+            
+            if ($use_catch_all) {
+                $assembled += $by_doctype['*'];
+            }
+        }
+        
+        if ($matched < 1) {
+            // possible XSS injection if user-specified doctypes
+            // are allowed
+            trigger_error("Doctype {$this->doctype} does not exist, ".
+                "check for typos (if you desire a doctype that allows ".
+                "no elements, use an empty array collection)", E_USER_ERROR);
+        }
+        return $assembled;
+    }
+    
+    /**
+     * Takes a collection and performs inclusions and substitutions for it.
+     * @param $cols Reference to collections class member variable
+     */
+    function processCollections(&$cols) {
+        
+        // Works in three passes over $cols, which is modified in place:
+        //   1. expand inclusion lists (an array stored at index 0),
+        //   2. swap module names for the instances in $this->modules and
+        //      sort each collection by the modules' ->order,
+        //   3. drop collections whose name starts with an underscore.
+        
+        // $cols is the set of collections
+        // $col_i is the name (index) of a collection
+        // $col is a collection/list of modules
+        
+        // perform inclusions
+        foreach ($cols as $col_i => $col) {
+            // names of collections already pulled into $col_i
+            $seen = array();
+            if (!empty($col[0]) && is_array($col[0])) {
+                $seen[$col_i] = true; // recursion reporting
+                $includes = $col[0];
+                unset($cols[$col_i][0]); // remove inclusions value, recursion guard
+            } else {
+                $includes = array();
+            }
+            if (empty($includes)) continue;
+            // isset() rather than count(): $includes grows while we walk it
+            // whenever an included collection has inclusions of its own
+            for ($i = 0; isset($includes[$i]); $i++) {
+                $inc = $includes[$i];
+                if (isset($seen[$inc])) {
+                    trigger_error(
+                        "Circular inclusion detected in $col_i collection",
+                        E_USER_ERROR
+                    );
+                    continue;
+                } else {
+                    $seen[$inc] = true;
+                }
+                if (!isset($cols[$inc])) {
+                    trigger_error(
+                        "Collection $col_i tried to include undefined ".
+                        "collection $inc", E_USER_ERROR);
+                    continue;
+                }
+                foreach ($cols[$inc] as $module) {
+                    if (is_array($module)) { // another inclusion!
+                        // queue nested inclusions for the enclosing for loop
+                        foreach ($module as $inc2) $includes[] = $inc2;
+                        continue;
+                    }
+                    $cols[$col_i][] = $module; // merge in the other modules
+                }
+            }
+        }
+        
+        // replace with real modules, invert module from list to
+        // assoc array of module name to module instance
+        foreach ($cols as $col_i => $col) {
+            // set when the collection carries a '*' => false entry
+            $ignore_global = false;
+            // module name => sort order, kept parallel to $cols[$col_i]
+            $order = array();
+            foreach ($col as $module_i => $module) {
+                // every entry is removed, then re-added keyed by module name
+                unset($cols[$col_i][$module_i]);
+                if (is_array($module)) {
+                    trigger_error("Illegal inclusion array at index".
+                        " $module_i found collection $col_i, inclusion".
+                        " arrays must be at start of collection (index 0)",
+                        E_USER_ERROR);
+                    continue;
+                }
+                if ($module_i === '*' && $module === false) {
+                    $ignore_global = true;
+                    continue;
+                }
+                if (!isset($this->modules[$module])) {
+                    trigger_error(
+                        "Collection $col_i references undefined ".
+                        "module $module",
+                        E_USER_ERROR
+                    );
+                    continue;
+                }
+                $module = $this->modules[$module];
+                $cols[$col_i][$module->name] = $module;
+                $order[$module->name] = $module->order;
+            }
+            // sort $order ascending and reorder the collection in step
+            array_multisort(
+                $order, SORT_ASC, SORT_NUMERIC, $cols[$col_i]
+            );
+            // the flag is restored only after sorting, so it is never
+            // passed through array_multisort
+            if ($ignore_global) $cols[$col_i]['*'] = false;
+        }
+        
+        // delete pseudo-collections
+        foreach ($cols as $col_i => $col) {
+            // NOTE(review): assumes $col_i is a non-empty string key
+            if ($col_i[0] == '_') unset($cols[$col_i]);
+        }
+        
+    }
+    
+    /**
+     * Retrieves the doctype from the configuration object
+     */
+    function getDoctype($config) {
+        // an explicitly configured doctype always wins
+        $configured = $config->get('HTML', 'Doctype');
+        if ($configured !== null) return $configured;
+        
+        if (!$this->initialized) {
+            // don't do HTML-oriented backwards compatibility stuff:
+            // use either the auto-doctype, or the catch-all doctype
+            return $this->autoDoctype ? $this->autoDoctype : '*';
+        }
+        
+        // backwards compatibility: build the doctype name from the
+        // Core.XHTML and HTML.Strict directives
+        $family  = $config->get('Core', 'XHTML')  ? 'XHTML 1.0' : 'HTML 4.01';
+        $flavour = $config->get('HTML', 'Strict') ? ' Strict'   : ' Transitional';
+        return $family . $flavour;
+    }
+    
+    /**
+     * Retrieves merged element definitions for all active elements.
+     * @note We may want to generate an elements array during setup
+     *       and pass that on, because a specific combination of
+     *       elements may trigger the loading of a module.
+     * @param $config Instance of HTMLPurifier_Config, for determining
+     *                stray elements.
+     */
+    function getElements($config) {
+        // element name => merged definition, for every element listed by
+        // an active module; an element listed by several modules is
+        // simply resolved again and overwritten with the same lookup
+        $definitions = array();
+        foreach ($this->activeModules as $active) {
+            $names = $active->elements;
+            foreach ($names as $element_name) {
+                $definitions[$element_name] =
+                    $this->getElement($element_name, $config);
+            }
+        }
+        return $definitions;
+    }
+    
+    /**
+     * Retrieves a single merged element definition
+     * @param $name Name of element
+     * @param $config Instance of HTMLPurifier_Config, may not be necessary.
+     */
+    function getElement($name, $config) {
+        
+        // no module defines this element
+        if (!isset($this->elementLookup[$name])) return false;
+        
+        // false until a standalone definition is found; later
+        // definitions are merged into it
+        $merged = false;
+        $valid  = $this->validModules;
+        
+        foreach ($this->elementLookup[$name] as $module_name) {
+            
+            $module    = $valid[$module_name];
+            $candidate = $module->info[$name];
+            
+            if ($merged) {
+                $merged->mergeIn($candidate);
+            } elseif ($candidate->standalone) {
+                $merged = $candidate;
+            } else {
+                // a non-standalone definition with nothing to merge into
+                // is dropped (it could be deferred to the end instead)
+                continue;
+            }
+            
+            // attribute value expansions
+            $this->attrCollections->performInclusions($merged->attr);
+            $this->attrCollections->expandIdentifiers(
+                $merged->attr, $this->attrTypes
+            );
+            
+            // descendants_are_inline, for ChildDef_Chameleon; ins and del
+            // are deliberately excluded
+            $mentions_inline =
+                is_string($merged->content_model) &&
+                strpos($merged->content_model, 'Inline') !== false;
+            if ($mentions_inline && $name != 'del' && $name != 'ins') {
+                $merged->descendants_are_inline = true;
+            }
+            
+            $this->contentSets->generateChildDef($merged, $module);
+        }
+        
+        return $merged;
+        
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/IDAccumulator.php b/lib/htmlpurifier/HTMLPurifier/IDAccumulator.php
new file mode 100644 (file)
index 0000000..40ff238
--- /dev/null
@@ -0,0 +1,42 @@
+<?php
+
+/**
+ * Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
+ * @note In Slashdot-speak, dupe means duplicate.
+ * @note This class does not accept $config or $context, thus, it is the
+ *       burden of the caller to register the appropriate errors or
+ *       configuration.
+ */
+class HTMLPurifier_IDAccumulator
+{
+    
+    /**
+     * IDs registered so far, stored as array keys (id => true) so that
+     * membership can be tested with isset().
+     * @public
+     */
+    var $ids = array();
+    
+    /**
+     * Registers a single ID.
+     * @param $id ID to register.
+     * @return Bool true if the ID was new and has been recorded, false
+     *         if it was already present (nothing changes in that case)
+     */
+    function add($id) {
+        $is_new = !isset($this->ids[$id]);
+        if ($is_new) {
+            $this->ids[$id] = true;
+        }
+        return $is_new;
+    }
+    
+    /**
+     * Registers many IDs at once.
+     * @param $array_of_ids Array whose values are the IDs to register
+     * @note Duplicates are silently accepted, and nothing is returned
+     */
+    function load($array_of_ids) {
+        foreach ($array_of_ids as $known_id) {
+            $this->ids[$known_id] = true;
+        }
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Language.php b/lib/htmlpurifier/HTMLPurifier/Language.php
new file mode 100644 (file)
index 0000000..ca6fe03
--- /dev/null
@@ -0,0 +1,56 @@
+<?php
+
+require_once 'HTMLPurifier/LanguageFactory.php';
+
+class HTMLPurifier_Language
+{
+    
+    /**
+     * ISO 639 language code of language. Prefers shortest possible version
+     */
+    var $code = 'en';
+    
+    /**
+     * Fallback language code, false when there is none
+     */
+    var $fallback = false;
+    
+    /**
+     * Array of localizable messages, keyed by message identifier
+     */
+    var $messages = array();
+    
+    /**
+     * Whether load() has already populated this object
+     * @private
+     */
+    var $_loaded = false;
+    
+    /**
+     * Lazily populates this object from the language factory's cache:
+     * every property named in the factory's $keys list is copied over
+     * from the cache entry for $this->code.
+     * @note Calling this more than once is harmless; only the first
+     *       call does any work
+     */
+    function load() {
+        if (!$this->_loaded) {
+            $factory = HTMLPurifier_LanguageFactory::instance();
+            $factory->loadLanguage($this->code);
+            foreach ($factory->keys as $property) {
+                $this->$property = $factory->cache[$this->code][$property];
+            }
+            $this->_loaded = true;
+        }
+    }
+    
+    /**
+     * Looks up a localised message, loading the language first if needed.
+     * @param $key string identifier of message
+     * @return string localised message, or an empty string when the
+     *         message is not defined
+     */
+    function getMessage($key) {
+        if (!$this->_loaded) {
+            $this->load();
+        }
+        return isset($this->messages[$key]) ? $this->messages[$key] : '';
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Language/classes/en-x-test.php b/lib/htmlpurifier/HTMLPurifier/Language/classes/en-x-test.php
new file mode 100644 (file)
index 0000000..303ba4b
--- /dev/null
@@ -0,0 +1,12 @@
+<?php
+
+// private class for unit testing
+
+/**
+ * Deliberately empty subclass: it adds nothing to HTMLPurifier_Language
+ * and exists only so that a language-specific class can be found for
+ * the en-x-test code.
+ */
+class HTMLPurifier_Language_en_x_test extends HTMLPurifier_Language
+{
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Language/messages/en-x-test.php b/lib/htmlpurifier/HTMLPurifier/Language/messages/en-x-test.php
new file mode 100644 (file)
index 0000000..115662b
--- /dev/null
@@ -0,0 +1,11 @@
+<?php
+
+// private language message file for unit testing purposes
+
+// language whose values fill in whatever this file leaves undefined
+$fallback = 'en';
+
+// message identifier => localised text
+$messages = array('htmlpurifier' => 'HTML Purifier X');
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Language/messages/en.php b/lib/htmlpurifier/HTMLPurifier/Language/messages/en.php
new file mode 100644 (file)
index 0000000..7650b81
--- /dev/null
@@ -0,0 +1,12 @@
+<?php
+
+// English is the root language: it has no fallback
+$fallback = false;
+
+// message identifier => localised text
+$messages = array(
+    'htmlpurifier' => 'HTML Purifier',
+    'pizza'        => 'Pizza', // for unit testing purposes
+);
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/LanguageFactory.php b/lib/htmlpurifier/HTMLPurifier/LanguageFactory.php
new file mode 100644 (file)
index 0000000..7097ced
--- /dev/null
@@ -0,0 +1,196 @@
+<?php
+
+require_once 'HTMLPurifier/Language.php';
+require_once 'HTMLPurifier/AttrDef/Lang.php';
+
+/**
+ * Class responsible for generating HTMLPurifier_Language objects, managing
+ * caching and fallbacks.
+ * @note Thanks to MediaWiki for the general logic, although this version
+ *       has been entirely rewritten
+ */
+class HTMLPurifier_LanguageFactory
+{
+    
+    /**
+     * Cache of language code information used to load HTMLPurifier_Language objects
+     * Structure is: $factory->cache[$language_code][$key] = $value
+     * @value array map
+     */
+    var $cache;
+    
+    /**
+     * Valid keys in the HTMLPurifier_Language object. Designates which
+     * variables to slurp out of a message file.
+     * @value array list
+     */
+    var $keys = array('fallback', 'messages');
+    
+    /**
+     * Instance of HTMLPurifier_AttrDef_Lang to validate language codes
+     * @value object HTMLPurifier_AttrDef_Lang
+     */
+    var $validator;
+    
+    /**
+     * Cached copy of dirname(__FILE__), directory of current file without
+     * trailing slash
+     * @value string filename
+     */
+    var $dir;
+    
+    /**
+     * Keys whose contents are a hash map and can be merged
+     * @value array lookup
+     */
+    var $mergeable_keys_map = array('messages' => true);
+    
+    /**
+     * Keys whose contents are a list and can be merged
+     * @value array lookup
+     */
+    var $mergeable_keys_list = array();
+    
+    /**
+     * Retrieve sole instance of the factory.
+     * @static
+     * @param $prototype Optional prototype to overload sole instance with,
+     *                   or bool true to reset to default factory.
+     * @return Reference to the shared factory instance
+     */
+    function &instance($prototype = null) {
+        static $instance = null;
+        if ($prototype !== null && $prototype !== true) {
+            $instance = $prototype;
+        } elseif ($instance === null || $prototype === true) {
+            // true must not be stored as the instance itself: it is the
+            // documented request to rebuild the default factory
+            $instance = new HTMLPurifier_LanguageFactory();
+            $instance->setup();
+        }
+        return $instance;
+    }
+    
+    /**
+     * Sets up the singleton, much like a constructor
+     * @note Prevents people from getting this outside of the singleton
+     */
+    function setup() {
+        $this->validator = new HTMLPurifier_AttrDef_Lang();
+        $this->dir = dirname(__FILE__);
+    }
+    
+    /**
+     * Creates a language object, handles class fallbacks
+     * @param $code string language code
+     * @return HTMLPurifier_Language (or subclass) whose ->code is the
+     *         validated form of $code, even when the object itself was
+     *         built from a fallback language's class
+     */
+    function create($code) {
+        
+        $config = $context = false; // hope it doesn't use these!
+        $code = $this->validator->validate($code, $config, $context);
+        if ($code === false) $code = 'en'; // malformed code becomes English
+        
+        $pcode = str_replace('-', '_', $code); // make valid PHP classname
+        static $depth = 0; // nesting depth of fallback calls; nothing reads it yet
+        
+        if ($code == 'en') {
+            $class = 'HTMLPurifier_Language';
+        } else {
+            $class = 'HTMLPurifier_Language_' . $pcode;
+            $file  = $this->dir . '/Language/classes/' . $code . '.php';
+            // PHP5/APC deps bug workaround can go here
+            // you can bypass the conditional include by loading the
+            // file yourself
+            if (file_exists($file) && !class_exists($class)) {
+                include_once $file;
+            }
+        }
+        
+        if (!class_exists($class)) {
+            // no dedicated class: build the object from the fallback
+            // language instead. Both calls go through this factory; the
+            // classes previously named here do not provide them.
+            $fallback = $this->getFallbackFor($code);
+            if (!$fallback) $fallback = 'en'; // no usable fallback: English
+            $depth++;
+            $lang = $this->create($fallback);
+            $depth--;
+        } else {
+            $lang = new $class;
+        }
+        $lang->code = $code;
+        
+        return $lang;
+        
+    }
+    
+    /**
+     * Returns the fallback language for language
+     * @note Loads the original language into cache
+     * @param $code string language code
+     * @return Fallback language code as stored in the cache
+     */
+    function getFallbackFor($code) {
+        $this->loadLanguage($code);
+        return $this->cache[$code]['fallback'];
+    }
+    
+    /**
+     * Loads language into the cache, handles message file and fallbacks
+     * @param $code string language code
+     * @note A language without a message file is served entirely from
+     *       its fallback chain
+     */
+    function loadLanguage($code) {
+        static $languages_seen = array(); // recursion guard
+        
+        // abort if we've already loaded it
+        if (isset($this->cache[$code])) return;
+        
+        // generate filename
+        $filename = $this->dir . '/Language/messages/' . $code . '.php';
+        
+        // default fallback : may be overwritten by the ensuing include
+        $fallback = ($code != 'en') ? 'en' : false;
+        
+        // load primary localisation
+        if (!file_exists($filename)) {
+            // skip the include: will rely solely on fallback
+            $cache = array();
+        } else {
+            // the message file assigns plain local variables, which
+            // compact() then gathers by the names listed in $this->keys
+            include $filename;
+            $cache = compact($this->keys);
+        }
+        
+        // load fallback localisation
+        if (!empty($fallback)) {
+            
+            // infinite recursion guard: the language is marked before
+            // recursing, so meeting it again means the chain is circular
+            if (isset($languages_seen[$code])) {
+                trigger_error('Circular fallback reference in language ' .
+                    $code, E_USER_ERROR);
+                $fallback = 'en';
+            }
+            $languages_seen[$code] = true;
+            
+            // load the fallback recursively
+            $this->loadLanguage($fallback);
+            $fallback_cache = $this->cache[$fallback];
+            
+            // merge fallback with current language
+            foreach ($this->keys as $key) {
+                $has_own      = isset($cache[$key]);
+                $has_fallback = isset($fallback_cache[$key]);
+                if ($has_own && $has_fallback) {
+                    if (isset($this->mergeable_keys_map[$key])) {
+                        // own entries win, fallback fills the gaps
+                        $cache[$key] = $cache[$key] + $fallback_cache[$key];
+                    } elseif (isset($this->mergeable_keys_list[$key])) {
+                        $cache[$key] = array_merge($fallback_cache[$key], $cache[$key]);
+                    }
+                } elseif (!$has_own) {
+                    // inherit from the fallback; null keeps the key
+                    // defined when neither side provides a value
+                    $cache[$key] = $has_fallback ? $fallback_cache[$key] : null;
+                }
+                // otherwise only this language defines the key: keep it
+            }
+            
+        }
+        
+        // save to cache for later retrieval
+        $this->cache[$code] = $cache;
+        
+        return;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer.php b/lib/htmlpurifier/HTMLPurifier/Lexer.php
new file mode 100644 (file)
index 0000000..e7242e1
--- /dev/null
@@ -0,0 +1,237 @@
+<?php
+
+require_once 'HTMLPurifier/Token.php';
+require_once 'HTMLPurifier/Encoder.php';
+require_once 'HTMLPurifier/EntityParser.php';
+
+// Registers the Core.AcceptFullDocuments directive, which normalize()
+// reads to decide whether extractBody() is applied to the input.
+// NOTE(review): the third and fourth arguments are presumably the
+// default value (true) and the value type ('bool') -- confirm against
+// HTMLPurifier_ConfigSchema::define().
+HTMLPurifier_ConfigSchema::define(
+    'Core', 'AcceptFullDocuments', true, 'bool',
+    'This parameter determines whether or not the filter should accept full '.
+    'HTML documents, not just HTML fragments.  When on, it will '.
+    'drop all sections except the content between body.'
+);
+
+/**
+ * Forgivingly lexes HTML (SGML-style) markup into tokens.
+ * 
+ * A lexer parses a string of SGML-style markup and converts them into
+ * corresponding tokens.  It doesn't check for well-formedness, although its
+ * internal mechanism may make this automatic (such as the case of
+ * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
+ * from.
+ * 
+ * A lexer is HTML-oriented: it might work with XML, but it's not
+ * recommended, as we adhere to a subset of the specification for optimization
+ * reasons.
+ * 
+ * This class should not be directly instantiated, but you may use create() to
+ * retrieve a default copy of the lexer.  Being a supertype, this class
+ * does not actually define any implementation, but offers commonly used
+ * convenience functions for subclasses.
+ * 
+ * @note The unit tests will instantiate this class for testing purposes, as
+ *       many of the utility functions require a class to be instantiated.
+ *       Be careful when porting this class to PHP 5.
+ * 
+ * @par
+ * 
+ * @note
+ * We use tokens rather than create a DOM representation because DOM would:
+ * 
+ * @par
+ *  -# Require more processing power to create,
+ *  -# Require recursion to iterate,
+ *  -# Must be compatible with PHP 5's DOM (otherwise duplication),
+ *  -# Has the entire document structure (html and body not needed), and
+ *  -# Has unknown readability improvement.
+ * 
+ * @par
+ * What the last item means is that the functions for manipulating tokens are
+ * already fairly compact, and when well-commented, more abstraction may not
+ * be needed.
+ * 
+ * @see HTMLPurifier_Token
+ */
+class HTMLPurifier_Lexer
+{
+    
+    /**
+     * Entity parser used for everything the fast-path table below
+     * does not cover.
+     * @protected
+     */
+    var $_entity_parser;
+    
+    /**
+     * PHP 4 style constructor; subclasses with their own constructor
+     * have to call it so that the entity parser is available.
+     */
+    function HTMLPurifier_Lexer() {
+        $this->_entity_parser = new HTMLPurifier_EntityParser();
+    }
+    
+    
+    /**
+     * Most common entity to raw value conversion table for special entities.
+     * @protected
+     */
+    var $_special_entity2str =
+            array(
+                    '&quot;' => '"',
+                    '&amp;'  => '&',
+                    '&lt;'   => '<',
+                    '&gt;'   => '>',
+                    '&#39;'  => "'",
+                    '&#039;' => "'",
+                    '&#x27;' => "'"
+            );
+    
+    /**
+     * Parses special entities into the proper characters.
+     * 
+     * This string will translate escaped versions of the special characters
+     * into the correct ones.
+     * 
+     * @warning
+     * You should be able to treat the output of this function as
+     * completely parsed, but that's only because all other entities should
+     * have been handled previously in substituteNonSpecialEntities()
+     * 
+     * @param $string String character data to be parsed.
+     * @returns Parsed character data.
+     */
+    function parseData($string) {
+        
+        // following functions require at least one character
+        if ($string === '') return '';
+        
+        // subtracts amps that cannot possibly be escaped: those followed
+        // by a space, and one that is the very last character
+        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
+            ($string[strlen($string)-1] === '&' ? 1 : 0);
+        
+        if (!$num_amp) return $string; // abort if no entities
+        $num_esc_amp = substr_count($string, '&amp;');
+        $string = strtr($string, $this->_special_entity2str);
+        
+        // code duplication for sake of optimization, see above
+        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
+            ($string[strlen($string)-1] === '&' ? 1 : 0);
+        
+        // every candidate amp left over is accounted for by a decoded
+        // &amp;, so the table above already handled everything
+        if ($num_amp_2 <= $num_esc_amp) return $string;
+        
+        // hmm... now we have some uncommon entities. Use the callback.
+        $string = $this->_entity_parser->substituteSpecialEntities($string);
+        return $string;
+    }
+    
+    /**
+     * Lexes an HTML string into tokens.
+     * 
+     * Abstract here: this base implementation only raises an error.
+     * 
+     * @param $string String HTML.
+     * @param $config Configuration object, passed on to normalize() by
+     *                the concrete lexers.
+     * @param $context Context object, passed by reference.
+     * @return HTMLPurifier_Token array representation of HTML.
+     */
+    function tokenizeHTML($string, $config, &$context) {
+        trigger_error('Call to abstract class', E_USER_ERROR);
+    }
+    
+    /**
+     * Retrieves or sets the default Lexer as a Prototype Factory.
+     * 
+     * Depending on what PHP version you are running, the abstract base
+     * Lexer class will determine which concrete Lexer is best for you:
+     * HTMLPurifier_Lexer_DirectLex for PHP 4, and HTMLPurifier_Lexer_DOMLex
+     * for PHP 5 and beyond.
+     * 
+     * Passing the optional prototype lexer parameter will override the
+     * default with your own implementation.  A copy/reference of the prototype
+     * lexer will now be returned when you request a new lexer.
+     * 
+     * @static
+     * 
+     * @note
+     * Though it is possible to call this factory method from subclasses,
+     * such usage is not recommended.
+     * 
+     * @param $prototype Optional prototype lexer.
+     * @return Concrete lexer.
+     */
+    function create($prototype = null) {
+        // we don't really care if it's a reference or a copy
+        static $lexer = null;
+        if ($prototype) {
+            $lexer = $prototype;
+        }
+        if (empty($lexer)) {
+            if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
+                class_exists('DOMDocument')) { // check for DOM support
+                require_once 'HTMLPurifier/Lexer/DOMLex.php';
+                $lexer = new HTMLPurifier_Lexer_DOMLex();
+            } else {
+                require_once 'HTMLPurifier/Lexer/DirectLex.php';
+                $lexer = new HTMLPurifier_Lexer_DirectLex();
+            }
+        }
+        return $lexer;
+    }
+    
+    /**
+     * Translates CDATA sections into regular sections (through escaping).
+     * 
+     * @static
+     * @protected
+     * @param $string HTML string to process.
+     * @returns HTML with CDATA sections escaped.
+     */
+    function escapeCDATA($string) {
+        // s modifier: a CDATA section may span several lines.
+        // .*? instead of .+?: an empty section has to end at its own
+        // terminator rather than at the next "]]>" in the document.
+        return preg_replace_callback(
+            '/<!\[CDATA\[(.*?)\]\]>/s',
+            array('HTMLPurifier_Lexer', 'CDATACallback'),
+            $string
+        );
+    }
+    
+    /**
+     * Callback function for escapeCDATA() that does the work.
+     * 
+     * @static
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @params $matches PCRE matches array, with index 0 the entire match
+     *                  and 1 the inside of the CDATA section.
+     * @returns Escaped internals of the CDATA section.
+     */
+    function CDATACallback($matches) {
+        // not exactly sure why the character set is needed, but whatever
+        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
+    }
+    
+    /**
+     * Takes a piece of HTML and normalizes it by converting entities, fixing
+     * encoding, extracting bits, and other good stuff.
+     * 
+     * @param $html String HTML to normalize.
+     * @param $config Configuration object; Core.AcceptFullDocuments is read.
+     * @param $context Context object, passed by reference (not used here).
+     * @returns Normalized HTML string.
+     */
+    function normalize($html, $config, &$context) {
+        
+        // extract body from document if applicable
+        if ($config->get('Core', 'AcceptFullDocuments')) {
+            $html = $this->extractBody($html);
+        }
+        
+        // escape CDATA
+        $html = $this->escapeCDATA($html);
+        
+        // expand entities that aren't the big five
+        $html = $this->_entity_parser->substituteNonSpecialEntities($html);
+        
+        // clean into wellformed UTF-8 string for an SGML context: this has
+        // to be done after entity expansion because the entities sometimes
+        // represent non-SGML characters (horror, horror!)
+        $html = HTMLPurifier_Encoder::cleanUTF8($html);
+        
+        return $html;
+    }
+    
+    /**
+     * Takes a string of HTML (fragment or document) and returns the content
+     * 
+     * @param $html String HTML fragment or document.
+     * @returns Contents of the first body element, or $html unchanged
+     *          when no non-empty body element is found.
+     */
+    function extractBody($html) {
+        $matches = array();
+        $result = preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches);
+        if ($result) {
+            return $matches[1];
+        } else {
+            return $html;
+        }
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php b/lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php
new file mode 100644 (file)
index 0000000..9286b02
--- /dev/null
@@ -0,0 +1,152 @@
+<?php
+
+require_once 'HTMLPurifier/Lexer.php';
+require_once 'HTMLPurifier/TokenFactory.php';
+
+/**
+ * Parser that uses PHP 5's DOM extension (part of the core).
+ * 
+ * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
+ * It gives us a forgiving HTML parser, which we use to transform the HTML
+ * into a DOM, and then into the tokens.  It is blazingly fast (for large
+ * documents, it performs twenty times faster than
+ * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
+ * 
+ * @note Any empty elements will have empty tokens associated with them, even if
+ * this is prohibited by the spec. This cannot be fixed until the spec
+ * comes into play.
+ * 
+ * @note PHP's DOM extension does not actually parse any entities, we use
+ *       our own function to do that.
+ * 
+ * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
+ *          If this is a huge problem, due to the fact that HTML is hand
+ *          edited and you are unable to get a parser cache that caches the
+ *          the output of HTML Purifier while keeping the original HTML lying
+ *          around, you may want to run Tidy on the resulting output or use
+ *          HTMLPurifier_DirectLex
+ */
+
+class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
+{
+    
+    /** HTMLPurifier_TokenFactory that builds every token emitted here */
+    private $factory;
+    
+    public function __construct() {
+        // the parent is a PHP 4 style class, so its constructor has to
+        // be invoked by name
+        parent::HTMLPurifier_Lexer();
+        $this->factory = new HTMLPurifier_TokenFactory();
+    }
+    
+    /**
+     * Lexes HTML by letting DOMDocument parse it and then walking the
+     * resulting tree.
+     * 
+     * @param $string  String HTML to tokenize.
+     * @param $config  Configuration object, handed to normalize().
+     * @param $context Context object, handed to normalize().
+     * @returns Array-list of tokens.
+     */
+    public function tokenizeHTML($string, $config, &$context) {
+        
+        $fragment = $this->normalize($string, $config, $context);
+        
+        // wrap the fragment in a full document, essential for UTF-8; the
+        // extra div gives us a single known node to start walking from
+        $document =
+            '<!DOCTYPE html '.
+                'PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
+                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'.
+            '<html><head>'.
+            '<meta http-equiv="Content-Type" content="text/html;'.
+                ' charset=utf-8" />'.
+            '</head><body><div>'.$fragment.'</div></body></html>';
+        
+        $doc = new DOMDocument();
+        $doc->encoding = 'UTF-8'; // technically does nothing, but whatever
+        
+        // DOM complains loudly about badly broken HTML, so the call is
+        // muted. A custom error handler that ignores error_reporting
+        // will still see these errors (reported by a Drupal plugin of
+        // HTML Purifier); a temporary error handler of our own would be
+        // the cleaner solution
+        @$doc->loadHTML($document);
+        
+        // html > body > div: the wrapper added above
+        $html_node = $doc->getElementsByTagName('html')->item(0);
+        $body_node = $html_node->getElementsByTagName('body')->item(0);
+        $wrapper   = $body_node->getElementsByTagName('div')->item(0);
+        
+        $tokens = array();
+        $this->tokenizeDOM($wrapper, $tokens);
+        return $tokens;
+    }
+    
+    /**
+     * Recursive function that tokenizes a node, putting it into an accumulator.
+     * 
+     * @param $node     DOMNode to be tokenized.
+     * @param $tokens   Array-list of already tokenized tokens.
+     * @param $collect  Whether start/end (or empty) tokens are emitted for
+     *                  $node itself; false for the outermost call because
+     *                  that node is the implicit wrapper DIV.
+     * @returns Nothing; tokens of node are appended to $tokens.
+     */
+    protected function tokenizeDOM($node, &$tokens, $collect = false) {
+        
+        // non-element nodes are handled (or dropped) right here.
+        // Character reference nodes are not expected because entities
+        // were preprocessed before parsing
+        switch ($node->nodeType) {
+            case XML_TEXT_NODE:
+            case XML_CDATA_SECTION_NODE:
+                $tokens[] = $this->factory->createText($node->data);
+                return;
+            case XML_COMMENT_NODE:
+                $tokens[] = $this->factory->createComment($node->data);
+                return;
+            case XML_ELEMENT_NODE:
+                break;
+            default:
+                // not-well tested: there may be other nodes we have to grab
+                return;
+        }
+        
+        $attr = array();
+        if ($node->hasAttributes()) {
+            $attr = $this->transformAttrToAssoc($node->attributes);
+        }
+        $tag = $node->tagName;
+        
+        // an element counts as empty only when it has no child nodes
+        if (!$node->childNodes->length) {
+            if ($collect) {
+                $tokens[] = $this->factory->createEmpty($tag, $attr);
+            }
+            return;
+        }
+        
+        if ($collect) { // don't wrap on first iteration
+            $tokens[] = $this->factory->createStart($tag, $attr);
+        }
+        foreach ($node->childNodes as $child) {
+            // $tokens is an accumulator, so no array_merge is needed
+            $this->tokenizeDOM($child, $tokens, true);
+        }
+        if ($collect) {
+            $tokens[] = $this->factory->createEnd($tag);
+        }
+        
+    }
+    
+    /**
+     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
+     * 
+     * @param $node_map DOMNamedNodeMap of DOMAttr objects.
+     * @returns Associative array of attribute name => attribute value.
+     */
+    protected function transformAttrToAssoc($node_map) {
+        // relies on the node map being iterable and exposing ->length
+        $assoc = array();
+        if ($node_map->length !== 0) {
+            foreach ($node_map as $attribute) {
+                $assoc[$attribute->name] = $attribute->value;
+            }
+        }
+        return $assoc;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php b/lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php
new file mode 100644 (file)
index 0000000..65d95a7
--- /dev/null
@@ -0,0 +1,309 @@
+<?php
+
+require_once 'HTMLPurifier/Lexer.php';
+
+/**
+ * Our in-house implementation of a parser.
+ * 
+ * A pure PHP parser, DirectLex has absolutely no dependencies, making
+ * it a reasonably good default for PHP4.  Written with efficiency in mind,
+ * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
+ * pales in comparison to HTMLPurifier_Lexer_DOMLex.  It will support UTF-8
+ * completely eventually.
+ * 
+ * @todo Reread XML spec and document differences.
+ * 
+ * @todo Determine correct behavior in transforming comment data. (preserve dashes?)
+ */
+class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
+{
+    
+    /**
+     * Whitespace characters for str(c)spn.
+     * @protected
+     */
+    var $_whitespace = "\x20\x09\x0D\x0A";
+    
+    /**
+     * Splits an HTML string into a flat list of token objects.
+     * 
+     * The string is first passed through normalize(), then scanned for
+     * '<' and '>' to cut it into text, comment, start, end and empty-tag
+     * tokens.
+     * 
+     * @param $html String of HTML to tokenize.
+     * @param $config Configuration object, passed on to helper methods.
+     * @param $context Context object (by reference), passed on to helpers.
+     * @returns Array of HTMLPurifier_Token_* objects; an empty array if the
+     *          loop guard (200000 iterations) trips.
+     */
+    function tokenizeHTML($html, $config, &$context) {
+        
+        $html = $this->normalize($html, $config, $context);
+        
+        $cursor = 0; // our location in the text
+        $inside_tag = false; // whether or not we're parsing the inside of a tag
+        $array = array(); // result array
+        
+        // infinite loop protection
+        // has to be pretty big, since html docs can be big
+        // we allow two hundred thousand iterations... more than enough?
+        $loops = 0;
+        
+        while(true) {
+            
+            // infinite loop protection
+            if (++$loops > 200000) return array();
+            
+            $position_next_lt = strpos($html, '<', $cursor);
+            $position_next_gt = strpos($html, '>', $cursor);
+            
+            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
+            if ($position_next_lt === $cursor) {
+                $inside_tag = true;
+                $cursor++;
+            }
+            
+            if (!$inside_tag && $position_next_lt !== false) {
+                // We are not inside tag and there still is another tag to parse
+                $array[] = new
+                    HTMLPurifier_Token_Text(
+                        $this->parseData(
+                            substr(
+                                $html, $cursor, $position_next_lt - $cursor
+                            )
+                        )
+                    );
+                $cursor  = $position_next_lt + 1;
+                $inside_tag = true;
+                continue;
+            } elseif (!$inside_tag) {
+                // We are not inside tag but there are no more tags
+                // If we're already at the end, break
+                if ($cursor === strlen($html)) break;
+                // Create Text of rest of string
+                $array[] = new
+                    HTMLPurifier_Token_Text(
+                        $this->parseData(
+                            substr(
+                                $html, $cursor
+                            )
+                        )
+                    );
+                break;
+            } elseif ($inside_tag && $position_next_gt !== false) {
+                // We are in tag and it is well formed
+                // Grab the internals of the tag
+                $strlen_segment = $position_next_gt - $cursor;
+                $segment = substr($html, $cursor, $strlen_segment);
+                
+                // Check if it's a comment
+                if (
+                    substr($segment, 0, 3) == '!--' &&
+                    substr($segment, $strlen_segment-2, 2) == '--'
+                ) {
+                    $array[] = new
+                        HTMLPurifier_Token_Comment(
+                            substr(
+                                $segment, 3, $strlen_segment - 5
+                            )
+                        );
+                    $inside_tag = false;
+                    $cursor = $position_next_gt + 1;
+                    continue;
+                }
+                
+                // Check if it's an end tag
+                $is_end_tag = (strpos($segment,'/') === 0);
+                if ($is_end_tag) {
+                    $type = substr($segment, 1);
+                    $array[] = new HTMLPurifier_Token_End($type);
+                    $inside_tag = false;
+                    $cursor = $position_next_gt + 1;
+                    continue;
+                }
+                
+                // Check if it is explicitly self closing, if so, remove
+                // trailing slash. Remember, we could have a tag like <br>, so
+                // any later token processing scripts must convert improperly
+                // classified EmptyTags from StartTags.
+                // The LAST slash is the one that matters: an earlier slash
+                // (e.g. inside src="a/b") must not hide the trailing one.
+                $is_self_closing= (strrpos($segment,'/') === $strlen_segment-1);
+                if ($is_self_closing) {
+                    $strlen_segment--;
+                    $segment = substr($segment, 0, $strlen_segment);
+                }
+                
+                // Check if there are any attributes
+                $position_first_space = strcspn($segment, $this->_whitespace);
+                
+                if ($position_first_space >= $strlen_segment) {
+                    if ($is_self_closing) {
+                        $array[] = new HTMLPurifier_Token_Empty($segment);
+                    } else {
+                        $array[] = new HTMLPurifier_Token_Start($segment);
+                    }
+                    $inside_tag = false;
+                    $cursor = $position_next_gt + 1;
+                    continue;
+                }
+                
+                // Grab out all the data
+                $type = substr($segment, 0, $position_first_space);
+                $attribute_string =
+                    trim(
+                        substr(
+                            $segment, $position_first_space
+                        )
+                    );
+                if ($attribute_string) {
+                    $attr = $this->parseAttributeString(
+                                    $attribute_string
+                                  , $config, $context
+                              );
+                } else {
+                    $attr = array();
+                }
+                
+                if ($is_self_closing) {
+                    $array[] = new HTMLPurifier_Token_Empty($type, $attr);
+                } else {
+                    $array[] = new HTMLPurifier_Token_Start($type, $attr);
+                }
+                $cursor = $position_next_gt + 1;
+                $inside_tag = false;
+                continue;
+            } else {
+                // Inside a tag that is never closed by '>': emit the rest
+                // of the document as text, putting the consumed '<' back.
+                $array[] = new
+                    HTMLPurifier_Token_Text(
+                        '<' .
+                        $this->parseData(
+                            substr($html, $cursor)
+                        )
+                    );
+                break;
+            }
+        }
+        return $array;
+    }
+    
+    /**
+     * Takes the inside of an HTML tag and makes an assoc array of attributes.
+     * 
+     * Malformed input is handled leniently: a quote that is never closed
+     * takes the remainder of the string as its value, and a chunk with no
+     * attribute name (e.g. a stray '=') is skipped.
+     * 
+     * @param $string Inside of tag excluding name.
+     * @param $config Configuration object (not used by this method itself).
+     * @param $context Context object, by reference (not used by this method
+     *                 itself).
+     * @returns Assoc array of attributes; an empty array if the loop guard
+     *          (1000 iterations) trips.
+     */
+    function parseAttributeString($string, $config, &$context) {
+        $string = (string) $string; // quick typecast
+        
+        if ($string == '') return array(); // no attributes
+        
+        // let's see if we can abort as quickly as possible
+        // one equal sign, no whitespace => one attribute
+        $num_equal = substr_count($string, '=');
+        // any of the whitespace characters separates attributes, not just 0x20
+        $has_space = (strcspn($string, $this->_whitespace) < strlen($string));
+        if ($num_equal === 0 && !$has_space) {
+            // bool attribute
+            return array($string => $string);
+        } elseif ($num_equal === 1 && !$has_space) {
+            // only one attribute
+            list($key, $quoted_value) = explode('=', $string);
+            $quoted_value = trim($quoted_value);
+            // compare against '' explicitly: "0" is a legitimate value
+            if ($key === '') return array();
+            if ($quoted_value === '') return array($key => '');
+            $first_char = $quoted_value[0];
+            $last_char  = $quoted_value[strlen($quoted_value)-1];
+            
+            $same_quote = ($first_char == $last_char);
+            $open_quote = ($first_char == '"' || $first_char == "'");
+            
+            if ( $same_quote && $open_quote) {
+                // well behaved
+                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
+            } else {
+                // not well behaved
+                if ($open_quote) {
+                    $value = substr($quoted_value, 1);
+                } else {
+                    $value = $quoted_value;
+                }
+            }
+            return array($key => $value);
+        }
+        
+        // setup loop environment
+        $array  = array(); // return assoc array of attributes
+        $cursor = 0; // current position in string (moves forward)
+        $size   = strlen($string); // size of the string (stays the same)
+        
+        // if we have unquoted attributes, the parser expects a terminating
+        // space, so let's guarantee that there's always a terminating space.
+        $string .= ' ';
+        
+        // infinite loop protection
+        $loops = 0;
+        
+        while(true) {
+            
+            // infinite loop protection
+            if (++$loops > 1000) return array();
+            
+            if ($cursor >= $size) {
+                break;
+            }
+            
+            // scroll past leading whitespace
+            $cursor += strspn($string, $this->_whitespace, $cursor);
+            
+            // grab the key
+            
+            $key_begin = $cursor; //we're currently at the start of the key
+            
+            // scroll past all characters that are the key (not whitespace or =)
+            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
+            
+            $key_end = $cursor; // now at the end of the key
+            
+            $key = (string) substr($string, $key_begin, $key_end - $key_begin);
+            
+            if ($key === '') {
+                // No name here: we are either past the end or sitting on a
+                // stray '='. Skip the malformed chunk up to the next
+                // whitespace so the cursor always makes progress.
+                $cursor += strcspn($string, $this->_whitespace, $cursor);
+                continue;
+            }
+            
+            // scroll past all whitespace
+            $cursor += strspn($string, $this->_whitespace, $cursor);
+            
+            if ($cursor >= $size) {
+                $array[$key] = $key;
+                break;
+            }
+            
+            // if the next character is an equal sign, we've got a regular
+            // pair, otherwise, it's a bool attribute
+            $first_char = $string[$cursor]; // safe: $cursor < $size here
+            
+            if ($first_char == '=') {
+                // key="value"
+                
+                $cursor++;
+                $cursor += strspn($string, $this->_whitespace, $cursor);
+                
+                // we might be in front of a quote right now
+                
+                $char = isset($string[$cursor]) ? $string[$cursor] : '';
+                
+                if ($char == '"' || $char == "'") {
+                    // it's quoted, end bound is $char
+                    $cursor++;
+                    $value_begin = $cursor;
+                    $cursor = strpos($string, $char, $cursor);
+                    if ($cursor === false) {
+                        // Unterminated quote: the rest of the original
+                        // string (without the padding space) is the value,
+                        // and there is nothing left to parse after it.
+                        $array[$key] = $this->parseData(
+                            (string) substr(
+                                $string, $value_begin, $size - $value_begin
+                            )
+                        );
+                        break;
+                    }
+                    $value_end = $cursor;
+                } else {
+                    // it's not quoted, end bound is whitespace
+                    $value_begin = $cursor;
+                    $cursor += strcspn($string, $this->_whitespace, $cursor);
+                    $value_end = $cursor;
+                }
+                
+                $value = substr($string, $value_begin, $value_end - $value_begin);
+                $array[$key] = $this->parseData($value);
+                $cursor++; // step over the closing quote / terminating space
+                
+            } else {
+                // boolattr
+                $array[$key] = $key;
+            }
+        }
+        return $array;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php b/lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php
new file mode 100644 (file)
index 0000000..18777ef
--- /dev/null
@@ -0,0 +1,110 @@
+<?php
+
+require_once 'XML/HTMLSax3.php'; // PEAR
+require_once 'HTMLPurifier/Lexer.php';
+
+/**
+ * Proof-of-concept lexer that uses the PEAR package XML_HTMLSax3 to parse HTML.
+ * 
+ * PEAR, not surprisingly, also has a SAX parser for HTML.  I don't know
+ * very much about implementation, but it's fairly well written.  However, that
+ * abstraction comes at a price: performance. You need to have it installed,
+ * and if the API changes, it might break our adapter. Not sure whether or not
+ * it's UTF-8 aware, but it has some entity parsing trouble (in all areas,
+ * text and attributes).
+ * 
+ * Quite personally, I don't recommend using the PEAR class, and the defaults
+ * don't use it. The unit tests do perform the tests on the SAX parser too, but
+ * whatever it does for poorly formed HTML is up to it.
+ * 
+ * @todo Generalize so that XML_HTMLSax is also supported.
+ * 
+ * @warning Entity-resolution inside attributes is broken.
+ */
+
+class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
+{
+    
+    /**
+     * Internal accumulator array for SAX parsers.
+     * @protected
+     */
+    var $tokens = array();
+    
+    /**
+     * Tokenizes a string by feeding it to an XML_HTMLSax3 parser whose
+     * event handlers (below) append tokens to $this->tokens.
+     * 
+     * @param $string String of HTML to tokenize.
+     * @param $config Configuration object, passed to normalize().
+     * @param $context Context object (by reference), passed to normalize().
+     * @returns Array of tokens collected during the parse.
+     */
+    function tokenizeHTML($string, $config, &$context) {
+        
+        // start from a clean accumulator on every call
+        $this->tokens = array();
+        
+        $string = $this->normalize($string, $config, $context);
+        
+        $parser = new XML_HTMLSax3();
+        $parser->set_object($this);
+        $parser->set_element_handler('openHandler','closeHandler');
+        $parser->set_data_handler('dataHandler');
+        $parser->set_escape_handler('escapeHandler');
+        
+        // doesn't seem to work correctly for attributes
+        $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
+        
+        $parser->parse($string);
+        
+        return $this->tokens;
+        
+    }
+    
+    /**
+     * Open tag event handler, interface is defined by PEAR package.
+     * Appends an Empty token when $closed is true, a Start token otherwise.
+     */
+    function openHandler(&$parser, $name, $attrs, $closed) {
+        // entities are not resolved in attrs
+        foreach ($attrs as $key => $attr) {
+            $attrs[$key] = $this->parseData($attr);
+        }
+        if ($closed) {
+            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
+        } else {
+            $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
+        }
+        return true;
+    }
+    
+    /**
+     * Close tag event handler, interface is defined by PEAR package.
+     * Appends an End token unless the previous token is an Empty token.
+     */
+    function closeHandler(&$parser, $name) {
+        // HTMLSax3 seems to always send empty tags an extra close tag
+        // check and ignore if you see it:
+        // [TESTME] to make sure it doesn't overreach
+        // NOTE(review): the check does not look at $name, so any close
+        // event directly following an Empty token is ignored -- confirm
+        // this cannot swallow a parent's end tag.
+        $count = count($this->tokens);
+        // guard: a close event may arrive before any token was collected
+        if ($count > 0 && $this->tokens[$count - 1]->type == 'empty') {
+            return true;
+        }
+        $this->tokens[] = new HTMLPurifier_Token_End($name);
+        return true;
+    }
+    
+    /**
+     * Data event handler, interface is defined by PEAR package.
+     * Appends the data as a Text token.
+     */
+    function dataHandler(&$parser, $data) {
+        $this->tokens[] = new HTMLPurifier_Token_Text($data);
+        return true;
+    }
+    
+    /**
+     * Escaped text handler, interface is defined by PEAR package.
+     * Only data starting with '--' is turned into a Comment token; any
+     * other escape is ignored here.
+     */
+    function escapeHandler(&$parser, $data) {
+        if (strpos($data, '--') === 0) {
+            $this->tokens[] = new HTMLPurifier_Token_Comment($data);
+        }
+        // CDATA is handled elsewhere, but if it was handled here:
+        //if (strpos($data, '[CDATA[') === 0) {
+        //    $this->tokens[] = new HTMLPurifier_Token_Text(
+        //        substr($data, 7, strlen($data) - 9) );
+        //}
+        return true;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/PercentEncoder.php b/lib/htmlpurifier/HTMLPurifier/PercentEncoder.php
new file mode 100644 (file)
index 0000000..7a12caa
--- /dev/null
@@ -0,0 +1,47 @@
+<?php
+
+/**
+ * Class that handles operations involving percent-encoding in URIs.
+ */
+class HTMLPurifier_PercentEncoder
+{
+    
+    /**
+     * Normalizes the percent-encoding of a string.
+     * 
+     * Escapes of unreserved characters (letters, digits, ~ - . _) are
+     * decoded, every other valid escape has its hex digits upper-cased,
+     * and a '%' that is not followed by two hex digits becomes '%25'.
+     * 
+     * @param $string String to normalize
+     * @returns Normalized string
+     */
+    function normalize($string) {
+        if ($string == '') return '';
+        $chunks = explode('%', $string);
+        // whatever precedes the first '%' is copied through untouched
+        $out = array_shift($chunks);
+        foreach ($chunks as $chunk) {
+            $hex = substr($chunk, 0, 2);
+            if (strlen($chunk) < 2 || !ctype_xdigit($hex)) {
+                // not a real escape: encode the percent sign itself
+                $out .= '%25' . $chunk;
+                continue;
+            }
+            $rest = substr($chunk, 2);
+            $code = hexdec($hex);
+            $is_digit = ($code >= 48 && $code <= 57);
+            $is_upper = ($code >= 65 && $code <= 90);
+            $is_lower = ($code >= 97 && $code <= 122);
+            // 126 '~', 45 '-', 46 '.', 95 '_'
+            $is_mark  = ($code == 126 || $code == 45 || $code == 46 || $code == 95);
+            if ($is_digit || $is_upper || $is_lower || $is_mark) {
+                // unreserved character: store it decoded
+                $out .= chr($code) . $rest;
+            } else {
+                // keep the escape, with canonical upper-case hex digits
+                $out .= '%' . strtoupper($hex) . $rest;
+            }
+        }
+        return $out;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Printer.php b/lib/htmlpurifier/HTMLPurifier/Printer.php
new file mode 100644 (file)
index 0000000..14135fd
--- /dev/null
@@ -0,0 +1,149 @@
+<?php
+
+require_once 'HTMLPurifier/Generator.php';
+require_once 'HTMLPurifier/Token.php';
+require_once 'HTMLPurifier/Encoder.php';
+
+class HTMLPurifier_Printer
+{
+    
+    /**
+     * Instance of HTMLPurifier_Generator for HTML generation convenience funcs
+     */
+    var $generator;
+    
+    /**
+     * Instance of HTMLPurifier_Config, for easy access
+     */
+    var $config;
+    
+    /**
+     * Initialize $generator.
+     */
+    function HTMLPurifier_Printer() {
+        $this->generator = new HTMLPurifier_Generator();
+    }
+    
+    /**
+     * Main function that renders object or aspect of that object
+     * (empty in this base class).
+     * @param $config Configuration object
+     */
+    function render($config) {}
+    
+    /**
+     * Returns a start tag
+     * @param $tag Tag name
+     * @param $attr Attribute array; any falsy value is treated as "none"
+     */
+    function start($tag, $attr = array()) {
+        return $this->generator->generateFromToken(
+                    new HTMLPurifier_Token_Start($tag, $attr ? $attr : array())
+               );
+    }
+    
+    /**
+     * Returns an end tag
+     * @param $tag Tag name
+     */
+    function end($tag) {
+        return $this->generator->generateFromToken(
+                    new HTMLPurifier_Token_End($tag)
+               );
+    }
+    
+    /**
+     * Prints a complete element with content inside
+     * @param $tag Tag name
+     * @param $contents Element contents
+     * @param $attr Tag attributes
+     * @param $escape Bool whether or not to escape contents
+     */
+    function element($tag, $contents, $attr = array(), $escape = true) {
+        return $this->start($tag, $attr) .
+               ($escape ? $this->escape($contents) : $contents) .
+               $this->end($tag);
+    }
+    
+    /**
+     * Prints a simple key/value row in a table.
+     * @param $name Key
+     * @param $value Value; booleans are shown as 'On' / 'Off'
+     */
+    function row($name, $value) {
+        if (is_bool($value)) $value = $value ? 'On' : 'Off';
+        return
+            $this->start('tr') . "\n" .
+                $this->element('th', $name) . "\n" .
+                $this->element('td', $value) . "\n" .
+            $this->end('tr')
+        ;
+    }
+    
+    /**
+     * Escapes a string for HTML output.
+     * @param $string String to escape
+     */
+    function escape($string) {
+        $string = HTMLPurifier_Encoder::cleanUTF8($string);
+        $string = htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
+        return $string;
+    }
+    
+    /**
+     * Takes a list of strings and turns them into a single list
+     * @param $array List of strings
+     * @param $polite Bool whether or not to add an "and" before the last
+     * @returns Comma-separated string, or 'None' for an empty list
+     */
+    function listify($array, $polite = false) {
+        if (empty($array)) return 'None';
+        $ret = '';
+        $i = count($array); // number of items still to be written
+        foreach ($array as $value) {
+            $i--;
+            $ret .= $value;
+            if ($i > 0 && !($polite && $i == 1)) $ret .= ', ';
+            // the leading space keeps "and" from fusing with the item
+            if ($polite && $i == 1) $ret .= ' and ';
+        }
+        return $ret;
+    }
+    
+    /**
+     * Retrieves the class of an object without prefixes, as well as metadata
+     * @param $obj Object to determine class of
+     * @param $sec_prefix Further prefix to remove
+     * @returns Short class name followed by parentheses, which hold extra
+     *          details for enum, composite and multiple definitions
+     */
+    function getClass($obj, $sec_prefix = '') {
+        static $five = null;
+        if ($five === null) $five = version_compare(PHP_VERSION, '5', '>=');
+        $prefix = 'HTMLPurifier_' . $sec_prefix;
+        // before PHP 5 the prefix has to be lower-cased to match get_class()
+        if (!$five) $prefix = strtolower($prefix);
+        $class = str_replace($prefix, '', get_class($obj));
+        $lclass = strtolower($class);
+        $class .= '(';
+        switch ($lclass) {
+            case 'enum':
+                $values = array();
+                foreach ($obj->valid_values as $value => $bool) {
+                    $values[] = $value;
+                }
+                $class .= implode(', ', $values);
+                break;
+            case 'composite':
+                $values = array();
+                foreach ($obj->defs as $def) {
+                    $values[] = $this->getClass($def, $sec_prefix);
+                }
+                $class .= implode(', ', $values);
+                break;
+            case 'multiple':
+                $class .= $this->getClass($obj->single, $sec_prefix) . ', ';
+                $class .= $obj->max;
+                break;
+        }
+        $class .= ')';
+        return $class;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Printer/CSSDefinition.php b/lib/htmlpurifier/HTMLPurifier/Printer/CSSDefinition.php
new file mode 100644 (file)
index 0000000..7745f5f
--- /dev/null
@@ -0,0 +1,40 @@
+<?php
+
+require_once 'HTMLPurifier/Printer.php';
+
+class HTMLPurifier_Printer_CSSDefinition extends HTMLPurifier_Printer
+{
+    
+    /**
+     * CSS definition obtained from the config in render()
+     */
+    var $def;
+    
+    /**
+     * Renders a table with one row per CSS property in the definition,
+     * showing the property name next to its definition class.
+     * @param $config Configuration object the CSS definition is read from
+     * @returns HTML string
+     */
+    function render($config) {
+        $this->def = $config->getCSSDefinition();
+        
+        // wrapper, caption and table head
+        $html  = '';
+        $html .= $this->start('div', array('class' => 'HTMLPurifier_Printer'))
+               . $this->start('table')
+               . $this->element('caption', 'Properties ($info)');
+        $html .= $this->start('thead')
+               . $this->start('tr')
+               . $this->element('th', 'Property', array('class' => 'heavy'))
+               . $this->element('th', 'Definition', array('class' => 'heavy', 'style' => 'width:auto;'))
+               . $this->end('tr')
+               . $this->end('thead');
+        
+        // one row per property, sorted by property name
+        ksort($this->def->info);
+        foreach ($this->def->info as $property => $attr_def) {
+            $html .= $this->row($property, $this->getClass($attr_def, 'AttrDef_'));
+        }
+        
+        return $html . $this->end('table') . $this->end('div');
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Printer/HTMLDefinition.php b/lib/htmlpurifier/HTMLPurifier/Printer/HTMLDefinition.php
new file mode 100644 (file)
index 0000000..a677c58
--- /dev/null
@@ -0,0 +1,210 @@
+<?php
+
+require_once 'HTMLPurifier/Printer.php';
+
+class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
+{
+    
+    /**
+     * Instance of HTMLPurifier_HTMLDefinition, for easy access
+     */
+    var $def;
+    
+    /**
+     * Renders the "Environment" table followed by the per-element table.
+     * @param $config Configuration object the HTML definition is read from
+     * @returns HTML string
+     */
+    function render($config) {
+        $this->config =& $config;
+        
+        $this->def = $config->getHTMLDefinition();
+        $def =& $this->def;
+        
+        $out  = '';
+        $out .= $this->start('div', array('class' => 'HTMLPurifier_Printer'));
+        $out .= $this->start('table');
+        $out .= $this->element('caption', 'Environment');
+        
+        $out .= $this->row('Parent of fragment', $def->info_parent);
+        $out .= $this->renderChildren($def->info_parent_def->child);
+        $out .= $this->row('Block wrap name', $def->info_block_wrapper);
+        
+        // the cell holds ready-made markup, so escaping is switched off
+        $out .= $this->start('tr')
+              . $this->element('th', 'Global attributes')
+              . $this->element('td', $this->listifyAttr($def->info_global_attr),0,0)
+              . $this->end('tr');
+        
+        $out .= $this->start('tr');
+        $out .= $this->element('th', 'Tag transforms');
+        $transforms = array();
+        foreach ($def->info_tag_transform as $from => $transform) {
+            $transforms[] = "<$from> with " .
+                $this->getClass($transform, 'TagTransform_');
+        }
+        $out .= $this->element('td', $this->listify($transforms));
+        $out .= $this->end('tr');
+        
+        $out .= $this->start('tr')
+              . $this->element('th', 'Pre-AttrTransform')
+              . $this->element('td', $this->listifyObjectList($def->info_attr_transform_pre))
+              . $this->end('tr');
+        
+        $out .= $this->start('tr')
+              . $this->element('th', 'Post-AttrTransform')
+              . $this->element('td', $this->listifyObjectList($def->info_attr_transform_post))
+              . $this->end('tr');
+        
+        $out .= $this->end('table');
+        
+        $out .= $this->renderInfo();
+        
+        $out .= $this->end('div');
+        
+        return $out;
+    }
+    
+    /**
+     * Renders the Elements ($info) table
+     * @returns HTML string
+     */
+    function renderInfo() {
+        $out  = $this->start('table');
+        $out .= $this->element('caption', 'Elements ($info)');
+        ksort($this->def->info);
+        
+        // overview rows: heading plus the list of all allowed tags
+        $out .= $this->start('tr')
+              . $this->element('th', 'Allowed tags', array('colspan' => 2, 'class' => 'heavy'))
+              . $this->end('tr');
+        $out .= $this->start('tr')
+              . $this->element('td', $this->listifyTagLookup($this->def->info), array('colspan' => 2))
+              . $this->end('tr');
+        
+        // one group of rows per element
+        foreach ($this->def->info as $tag => $element_def) {
+            $out .= $this->start('tr')
+                  . $this->element('th', "<$tag>", array('class'=>'heavy', 'colspan' => 2))
+                  . $this->end('tr');
+            $out .= $this->start('tr')
+                  . $this->element('th', 'Inline content')
+                  . $this->element('td', $element_def->descendants_are_inline ? 'Yes' : 'No')
+                  . $this->end('tr');
+            
+            // the optional rows are only shown when there is something to list
+            if (!empty($element_def->excludes)) {
+                $out .= $this->start('tr')
+                      . $this->element('th', 'Excludes')
+                      . $this->element('td', $this->listifyTagLookup($element_def->excludes))
+                      . $this->end('tr');
+            }
+            if (!empty($element_def->attr_transform_pre)) {
+                $out .= $this->start('tr')
+                      . $this->element('th', 'Pre-AttrTransform')
+                      . $this->element('td', $this->listifyObjectList($element_def->attr_transform_pre))
+                      . $this->end('tr');
+            }
+            if (!empty($element_def->attr_transform_post)) {
+                $out .= $this->start('tr')
+                      . $this->element('th', 'Post-AttrTransform')
+                      . $this->element('td', $this->listifyObjectList($element_def->attr_transform_post))
+                      . $this->end('tr');
+            }
+            if (!empty($element_def->auto_close)) {
+                $out .= $this->start('tr')
+                      . $this->element('th', 'Auto closed by')
+                      . $this->element('td', $this->listifyTagLookup($element_def->auto_close))
+                      . $this->end('tr');
+            }
+            
+            $out .= $this->start('tr')
+                  . $this->element('th', 'Allowed attributes')
+                  . $this->element('td',$this->listifyAttr($element_def->attr),0,0)
+                  . $this->end('tr');
+            
+            $out .= $this->renderChildren($element_def->child);
+        }
+        $out .= $this->end('table');
+        return $out;
+    }
+    
+    /** 
+     * Renders a row describing the allowed children of an element
+     * (two rows, block and inline, for a 'chameleon' definition).
+     * @param $def HTMLPurifier_ChildDef of pertinent element
+     * @returns HTML string
+     */
+    function renderChildren($def) {
+        $context = new HTMLPurifier_Context();
+        $out = $this->start('tr');
+        $elements = array();
+        $th_attr  = array();
+        if (isset($def->elements)) {
+            if ($def->type == 'strictblockquote') {
+                // NOTE(review): presumably run so that $def->elements is
+                // populated before it is read -- confirm in the ChildDef.
+                $def->validateChildren(array(), $this->config, $context);
+            }
+            $elements = $def->elements;
+        } else {
+            switch ($def->type) {
+                case 'chameleon':
+                    // the heading spans the block row and the inline row
+                    $th_attr['rowspan'] = 2;
+                    break;
+                case 'empty':
+                    // nothing to list
+                    break;
+                case 'table':
+                    $elements = array_flip(array('col', 'caption', 'colgroup', 'thead',
+                        'tfoot', 'tbody', 'tr'));
+                    break;
+            }
+        }
+        $out .= $this->element('th', 'Allowed children', $th_attr);
+        
+        if ($def->type == 'chameleon') {
+            
+            $out .= $this->element('td',
+                '<em>Block</em>: ' .
+                $this->escape($this->listifyTagLookup($def->block->elements)),0,0);
+            $out .= $this->end('tr');
+            $out .= $this->start('tr');
+            $out .= $this->element('td',
+                '<em>Inline</em>: ' .
+                $this->escape($this->listifyTagLookup($def->inline->elements)),0,0);
+            
+        } else {
+            $out .= $this->element('td',
+                '<em>'.ucfirst($def->type).'</em>: ' .
+                $this->escape($this->listifyTagLookup($elements)),0,0);
+        }
+        $out .= $this->end('tr');
+        return $out;
+    }
+    
+    /** 
+     * Listifies a tag lookup table, keeping only '#PCDATA' and tags that
+     * are present in the definition's $info.
+     * @param $array Tag lookup array in form of array('tagname' => true)
+     */
+    function listifyTagLookup($array) {
+        ksort($array);
+        $names = array();
+        foreach (array_keys($array) as $name) {
+            if ($name === '#PCDATA' || isset($this->def->info[$name])) {
+                $names[] = $name;
+            }
+        }
+        return $this->listify($names);
+    }
+    
+    /**
+     * Listifies a list of objects by retrieving class names and internal state
+     * @param $array List of objects
+     * @todo Also add information about internal state
+     */
+    function listifyObjectList($array) {
+        ksort($array);
+        $names = array();
+        foreach ($array as $object) {
+            $names[] = $this->getClass($object, 'AttrTransform_');
+        }
+        return $this->listify($names);
+    }
+    
+    /**
+     * Listifies a hash of attributes to AttrDef classes; entries whose
+     * value is exactly false are left out.
+     * @param $array Array hash in form of array('attrname' => HTMLPurifier_AttrDef)
+     */
+    function listifyAttr($array) {
+        ksort($array);
+        $items = array();
+        foreach ($array as $attr_name => $attr_def) {
+            if ($attr_def !== false) {
+                $items[] = "$attr_name&nbsp;=&nbsp;<i>" . $this->getClass($attr_def, 'AttrDef_') . '</i>';
+            }
+        }
+        return $this->listify($items);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy.php b/lib/htmlpurifier/HTMLPurifier/Strategy.php
new file mode 100644 (file)
index 0000000..746b0a2
--- /dev/null
@@ -0,0 +1,33 @@
+<?php
+
+/**
+ * Supertype for classes that define a strategy for modifying/purifying tokens.
+ * 
+ * While HTMLPurifier's core purpose is fixing HTML into something proper, 
+ * strategies provide plug points for extra configuration or even extra
+ * features, such as custom tags, custom parsing of text, etc.
+ */
+
+HTMLPurifier_ConfigSchema::define(
+    'Core', 'EscapeInvalidTags', false, 'bool',
+    'When true, invalid tags will be written back to the document as plain '.
+    'text.  Otherwise, they are silently dropped.'
+);
+class HTMLPurifier_Strategy
+{
+    
+    /**
+     * Executes the strategy on the tokens.
+     * 
+     * This base implementation is a placeholder: it only raises an
+     * E_USER_ERROR and returns nothing, so subclasses must override it.
+     * 
+     * @param $tokens Array of HTMLPurifier_Token objects to be operated on.
+     * @param $config Configuration options
+     * @param $context Context object, passed by reference
+     * @return Processed array of token objects.
+     */
+    function execute($tokens, $config, &$context) {
+        trigger_error('Cannot call abstract function', E_USER_ERROR);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php b/lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php
new file mode 100644 (file)
index 0000000..bd86874
--- /dev/null
@@ -0,0 +1,30 @@
+<?php
+
+require_once 'HTMLPurifier/Strategy.php';
+require_once 'HTMLPurifier/Config.php';
+
+/**
+ * Composite strategy that runs multiple strategies on tokens.
+ */
+class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy
+{
+    
+    /**
+     * List of strategies to run tokens through.
+     * Strategies are executed in array order.
+     */
+    var $strategies = array();
+    
+    /**
+     * Constructor of the abstract composite: raises an E_USER_ERROR.
+     * A subclass that defines its own constructor (and fills $strategies
+     * there) does not run this one.
+     */
+    function HTMLPurifier_Strategy_Composite() {
+        trigger_error('Attempt to instantiate abstract object', E_USER_ERROR);
+    }
+    
+    /**
+     * Runs the tokens through every strategy in $strategies, feeding the
+     * output of each strategy into the next one.
+     * 
+     * @param $tokens Array of token objects to process
+     * @param $config Configuration object, handed to every strategy
+     * @param $context Context object, handed to every strategy by reference
+     * @return Token array returned by the last strategy, or $tokens
+     *         unchanged when no strategies are registered
+     */
+    function execute($tokens, $config, &$context) {
+        foreach ($this->strategies as $strategy) {
+            $tokens = $strategy->execute($tokens, $config, $context);
+        }
+        return $tokens;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/Core.php b/lib/htmlpurifier/HTMLPurifier/Strategy/Core.php
new file mode 100644 (file)
index 0000000..66e7bb3
--- /dev/null
@@ -0,0 +1,25 @@
+<?php
+
+require_once 'HTMLPurifier/Strategy/Composite.php';
+
+require_once 'HTMLPurifier/Strategy/RemoveForeignElements.php';
+require_once 'HTMLPurifier/Strategy/MakeWellFormed.php';
+require_once 'HTMLPurifier/Strategy/FixNesting.php';
+require_once 'HTMLPurifier/Strategy/ValidateAttributes.php';
+
+/**
+ * Core strategy composed of the big four strategies.
+ */
+class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite
+{
+    
+    /**
+     * Registers the four core strategies in the order they must run:
+     * remove foreign elements, make well formed, fix nesting and
+     * finally validate attributes.
+     */
+    function HTMLPurifier_Strategy_Core() {
+        array_push(
+            $this->strategies,
+            new HTMLPurifier_Strategy_RemoveForeignElements(),
+            new HTMLPurifier_Strategy_MakeWellFormed(),
+            new HTMLPurifier_Strategy_FixNesting(),
+            new HTMLPurifier_Strategy_ValidateAttributes()
+        );
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/FixNesting.php b/lib/htmlpurifier/HTMLPurifier/Strategy/FixNesting.php
new file mode 100644 (file)
index 0000000..08f9075
--- /dev/null
@@ -0,0 +1,292 @@
+<?php
+
+require_once 'HTMLPurifier/Strategy.php';
+require_once 'HTMLPurifier/HTMLDefinition.php';
+
+/**
+ * Takes a well formed list of tokens and fixes their nesting.
+ * 
+ * HTML elements dictate which elements are allowed to be their children,
+ * for example, you can't have a p tag in a span tag.  Other elements have
+ * much more rigorous definitions: tables, for instance, require a specific
+ * order for their elements.  There are also constraints not expressible by
+ * document type definitions, such as the chameleon nature of ins/del
+ * tags and global child exclusions.
+ * 
+ * The first major objective of this strategy is to iterate through all the
+ * nodes (not tokens) of the list of tokens and determine whether or not
+ * their children conform to the element's definition.  If they do not, the
+ * child definition may optionally supply an amended list of elements that
+ * is valid or require that the entire node be deleted (and the previous
+ * node rescanned).
+ * 
+ * The second objective is to ensure that explicitly excluded elements of
+ * an element do not appear in its children.  Code that accomplishes this
+ * task is pervasive through the strategy, though the two are distinct tasks
+ * and could, theoretically, be separated (although it's not recommended).
+ * 
+ * @note Whether or not unrecognized children are silently dropped or
+ *       translated into text depends on the child definitions.
+ * 
+ * @todo Enable nodes to be bubbled out of the structure.
+ */
+
+class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
+{
+    
+    /**
+     * Validates the children of every node in the token list and removes
+     * or replaces content that its parent element does not allow.
+     * 
+     * The tokens are temporarily wrapped in a start/end pair for the
+     * definition's parent element; that pair is stripped again before
+     * returning. While running, the context variable 'IsInline' is
+     * registered and it is destroyed at the end.
+     * 
+     * @param $tokens Array of token objects; start and end tokens must be
+     *        balanced, the child-gathering loop depends on it
+     * @param $config Configuration object supplying the HTML definition
+     * @param $context Context object, passed by reference
+     * @return Array of token objects with the nesting fixed
+     */
+    function execute($tokens, $config, &$context) {
+        //####################################################################//
+        // Pre-processing
+        
+        // get a copy of the HTML definition
+        $definition = $config->getHTMLDefinition();
+        
+        // insert implicit "parent" node, will be removed at end.
+        // ! we might want to move this to configuration
+        // DEFINITION CALL
+        $parent_name = $definition->info_parent;
+        array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
+        $tokens[] = new HTMLPurifier_Token_End($parent_name);
+        
+        // setup the context variables
+        $is_inline = false; // reference var that we alter
+        $context->register('IsInline', $is_inline);
+        
+        //####################################################################//
+        // Loop initialization
+        
+        // stack that contains the indexes of all parents,
+        // $stack[count($stack)-1] being the current parent
+        $stack = array();
+        
+        // stack that contains all elements that are excluded
+        // same structure as $stack, but it is only populated when an element
+        // with exclusions is processed, i.e. there won't be empty exclusions.
+        $exclude_stack = array();
+        
+        //####################################################################//
+        // Loop
+        
+        // iterate through all start nodes. Determining the start node
+        // is complicated so it has been omitted from the loop construct
+        for ($i = 0, $size = count($tokens) ; $i < $size; ) {
+            
+            //################################################################//
+            // Gather information on children
+            
+            // child token accumulator
+            $child_tokens = array();
+            
+            // scroll to the end of this node, report number, and collect
+            // all children
+            // (the loop has no exit condition of its own: it only stops at
+            // the end token that brings $depth back to zero)
+            for ($j = $i, $depth = 0; ; $j++) {
+                if ($tokens[$j]->type == 'start') {
+                    $depth++;
+                    // skip token assignment on first iteration, this is the
+                    // token we currently are on
+                    if ($depth == 1) continue;
+                } elseif ($tokens[$j]->type == 'end') {
+                    $depth--;
+                    // skip token assignment on last iteration, this is the
+                    // end token of the token we're currently on
+                    if ($depth == 0) break;
+                }
+                $child_tokens[] = $tokens[$j];
+            }
+            
+            // $i is index of start token
+            // $j is index of end token
+            
+            //################################################################//
+            // Gather information on parent
+            
+            // calculate parent information
+            // (assignment inside the condition is intended: $count is
+            // reused by the inline-context calculation below)
+            if ($count = count($stack)) {
+                $parent_index = $stack[$count-1];
+                $parent_name  = $tokens[$parent_index]->name;
+                if ($parent_index == 0) {
+                    $parent_def   = $definition->info_parent_def;
+                } else {
+                    $parent_def   = $definition->info[$parent_name];
+                }
+            } else {
+                // unknown info, it won't be used anyway
+                $parent_index = $parent_name = $parent_def = null;
+            }
+            
+            // calculate context
+            if ($is_inline === false) {
+                // check if conditions make it inline
+                if (!empty($parent_def) && $parent_def->descendants_are_inline) {
+                    $is_inline = $count - 1;
+                }
+            } else {
+                // check if we're out of inline
+                if ($count === $is_inline) {
+                    $is_inline = false;
+                }
+            }
+            
+            //################################################################//
+            // Determine whether element is explicitly excluded SGML-style
+            
+            // determine whether or not element is excluded by checking all
+            // parent exclusions. The array should not be very large, two
+            // elements at most.
+            $excluded = false;
+            if (!empty($exclude_stack)) {
+                foreach ($exclude_stack as $lookup) {
+                    if (isset($lookup[$tokens[$i]->name])) {
+                        $excluded = true;
+                        // no need to continue processing
+                        break;
+                    }
+                }
+            }
+            
+            //################################################################//
+            // Perform child validation
+            
+            if ($excluded) {
+                // there is an exclusion, remove the entire node
+                $result = false;
+                $excludes = array(); // not used, but good to initialize anyway
+            } else {
+                // DEFINITION CALL
+                if ($i === 0) {
+                    // special processing for the first node
+                    $def = $definition->info_parent_def;
+                } else {
+                    $def = $definition->info[$tokens[$i]->name];
+                    
+                }
+                
+                if (!empty($def->child)) {
+                    // have DTD child def validate children
+                    $result = $def->child->validateChildren(
+                        $child_tokens, $config, $context);
+                } else {
+                    // weird, no child definition, get rid of everything
+                    $result = false;
+                }
+                
+                // determine whether or not this element has any exclusions
+                $excludes = $def->excludes;
+            }
+            
+            // $result is now a bool or array
+            
+            //################################################################//
+            // Process result by interpreting $result
+            
+            if ($result === true) {
+                // leave the node as is
+                
+                // register start token as a parental node start
+                $stack[] = $i;
+                
+                // register exclusions if there are any
+                if (!empty($excludes)) $exclude_stack[] = $excludes;
+                
+                // move cursor to next possible start node
+                $i++;
+                
+            } elseif($result === false) {
+                // remove entire node
+                
+                // calculate length of inner tokens and current tokens
+                $length = $j - $i + 1;
+                
+                // perform removal
+                array_splice($tokens, $i, $length);
+                
+                // update size
+                $size -= $length;
+                
+                // there is no start token to register,
+                // current node is now the next possible start node
+                // unless it turns out that we need to do a double-check
+                
+                if (!$parent_def->child->allow_empty) {
+                    // we need to do a double-check
+                    $i = $parent_index;
+                    array_pop($stack);
+                }
+                
+                // PROJECTED OPTIMIZATION: Process all children elements before
+                // reprocessing parent node.
+                
+            } else {
+                // replace node with $result
+                
+                // calculate length of inner tokens
+                $length = $j - $i - 1;
+                
+                // perform replacement
+                array_splice($tokens, $i + 1, $length, $result);
+                
+                // update size
+                $size -= $length;
+                $size += count($result);
+                
+                // register start token as a parental node start
+                $stack[] = $i;
+                
+                // register exclusions if there are any
+                if (!empty($excludes)) $exclude_stack[] = $excludes;
+                
+                // move cursor to next possible start node
+                $i++;
+                
+            }
+            
+            //################################################################//
+            // Scroll to next start node
+            
+            // We assume, at this point, that $i is the index of the token
+            // that is the first possible new start point for a node.
+            
+            // Test if the token indeed is a start tag, if not, move forward
+            // and test again.
+            // (recount first: the splices above may have changed the length)
+            $size = count($tokens);
+            while ($i < $size and $tokens[$i]->type != 'start') {
+                if ($tokens[$i]->type == 'end') {
+                    // pop a token index off the stack if we ended a node
+                    array_pop($stack);
+                    // pop an exclusion lookup off exclusion stack if
+                    // we ended node and that node had exclusions
+                    if ($i == 0 || $i == $size - 1) {
+                        // use specialized var if it's the super-parent
+                        $s_excludes = $definition->info_parent_def->excludes;
+                    } else {
+                        $s_excludes = $definition->info[$tokens[$i]->name]->excludes;
+                    }
+                    if ($s_excludes) {
+                        array_pop($exclude_stack);
+                    }
+                }
+                $i++;
+            }
+            
+        }
+        
+        //####################################################################//
+        // Post-processing
+        
+        // remove implicit parent tokens at the beginning and end
+        array_shift($tokens);
+        array_pop($tokens);
+        
+        // remove context variables
+        $context->destroy('IsInline');
+        
+        //####################################################################//
+        // Return
+        
+        return $tokens;
+        
+    }
+    
+}
+
+?>
diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php b/lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php
new file mode 100644 (file)
index 0000000..84580d3
--- /dev/null
@@ -0,0 +1,158 @@
+<?php
+
+require_once 'HTMLPurifier/Strategy.php';
+require_once 'HTMLPurifier/HTMLDefinition.php';
+require_once 'HTMLPurifier/Generator.php';
+
+/**
+ * Takes tokens and makes them well-formed (balances end tags, etc.)
+ */
+class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
+{
+    
+    /**
+     * Rebuilds the token list so that start and end tags are balanced.
+     * 
+     * Non-tag tokens are copied through. Start and empty tags are converted
+     * to match the element's child definition type, a parent is closed
+     * automatically when its auto_close lookup lists the new tag, end tags
+     * that skip over open elements close those elements as well, and end
+     * tags with no matching open element are dropped (or written back as
+     * text when Core.EscapeInvalidTags is on). Elements still open at the
+     * end of the input are closed.
+     * 
+     * @param $tokens Array of token objects; every tag name is looked up in
+     *        the HTML definition without an existence check
+     * @param $config Configuration object
+     * @param $context Context object, passed by reference
+     * @return Array of token objects with balanced tags
+     */
+    function execute($tokens, $config, &$context) {
+        $definition = $config->getHTMLDefinition();
+        $generator = new HTMLPurifier_Generator();
+        $result = array();
+        // stack of the start tokens that are currently open
+        $current_nesting = array();
+        $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
+        foreach ($tokens as $token) {
+            // anything that is not a tag is passed through untouched
+            if (empty( $token->is_tag )) {
+                $result[] = $token;
+                continue;
+            }
+            
+            // DEFINITION CALL
+            $info = $definition->info[$token->name]->child;
+            
+            // test if it claims to be a start tag but is empty
+            if ($info->type == 'empty' &&
+                $token->type == 'start' ) {
+                
+                $result[] = new HTMLPurifier_Token_Empty($token->name,
+                                                         $token->attr);
+                continue;
+            }
+            
+            // test if it claims to be empty but really is a start tag
+            if ($info->type != 'empty' &&
+                $token->type == 'empty' ) {
+                
+                $result[] = new HTMLPurifier_Token_Start($token->name,
+                                                         $token->attr);
+                $result[] = new HTMLPurifier_Token_End($token->name);
+                
+                continue;
+            }
+            
+            // automatically insert empty tags
+            if ($token->type == 'empty') {
+                $result[] = $token;
+                continue;
+            }
+            
+            // we give start tags precedence, so automatically accept unless...
+            // it's one of those special cases
+            if ($token->type == 'start') {
+                
+                // if there's a parent, check for special case
+                if (!empty($current_nesting)) {
+                    
+                    $parent = array_pop($current_nesting);
+                    $parent_name = $parent->name;
+                    $parent_info = $definition->info[$parent_name];
+                    
+                    // the parent stays popped here: it is closed and the
+                    // new tag takes its place on the nesting stack
+                    if (isset($parent_info->auto_close[$token->name])) {
+                        $result[] = new HTMLPurifier_Token_End($parent_name);
+                        $result[] = $token;
+                        $current_nesting[] = $token;
+                        continue;
+                    }
+                    
+                    $current_nesting[] = $parent; // undo the pop
+                }
+                
+                $result[] = $token;
+                $current_nesting[] = $token;
+                continue;
+            }
+            
+            // sanity check
+            if ($token->type != 'end') continue;
+            
+            // okay, we're dealing with a closing tag
+            
+            // make sure that we have something open
+            if (empty($current_nesting)) {
+                if ($escape_invalid_tags) {
+                    $result[] = new HTMLPurifier_Token_Text(
+                        $generator->generateFromToken($token, $config, $context)
+                    );
+                }
+                continue;
+            }
+            
+            // first, check for the simplest case: everything closes neatly
+            
+            // current_nesting is modified
+            $current_parent = array_pop($current_nesting);
+            if ($current_parent->name == $token->name) {
+                $result[] = $token;
+                continue;
+            }
+            
+            // undo the array_pop
+            $current_nesting[] = $current_parent;
+            
+            // okay, so we're trying to close the wrong tag
+            
+            // scroll back the entire nest, trying to find our tag
+            // feature could be to specify how far you'd like to go
+            $size = count($current_nesting);
+            // -2 because -1 is the last element, but we already checked that
+            $skipped_tags = false;
+            for ($i = $size - 2; $i >= 0; $i--) {
+                if ($current_nesting[$i]->name == $token->name) {
+                    // current nesting is modified
+                    // (everything from index $i to the top of the stack is
+                    // removed from it and returned)
+                    $skipped_tags = array_splice($current_nesting, $i);
+                    break;
+                }
+            }
+            
+            // we still didn't find the tag, so translate to text
+            if ($skipped_tags === false) {
+                if ($escape_invalid_tags) {
+                    $result[] = new HTMLPurifier_Token_Text(
+                        $generator->generateFromToken($token, $config, $context)
+                    );
+                }
+                continue;
+            }
+            
+            // okay, we found it, close all the skipped tags
+            // note that skipped tags contains the element we need closed
+            // (iterating backwards closes the innermost element first)
+            $size = count($skipped_tags);
+            for ($i = $size - 1; $i >= 0; $i--) {
+                $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
+            }
+            
+            // done!
+            
+        }
+        
+        // we're at the end now, fix all still unclosed tags
+        
+        if (!empty($current_nesting)) {
+            $size = count($current_nesting);
+            for ($i = $size - 1; $i >= 0; $i--) {
+                $result[] =
+                    new HTMLPurifier_Token_End($current_nesting[$i]->name);
+            }
+        }
+        
+        return $result;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php b/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php
new file mode 100644 (file)
index 0000000..27caf36
--- /dev/null
@@ -0,0 +1,86 @@
+<?php
+
+require_once 'HTMLPurifier/Strategy.php';
+require_once 'HTMLPurifier/HTMLDefinition.php';
+require_once 'HTMLPurifier/Generator.php';
+require_once 'HTMLPurifier/TagTransform.php';
+
+HTMLPurifier_ConfigSchema::define(
+    'Core', 'RemoveInvalidImg', true, 'bool',
+    'This directive enables pre-emptive URI checking in <code>img</code> '.
+    'tags, as the attribute validation strategy is not authorized to '.
+    'remove elements from the document.  This directive has been available '.
+    'since 1.3.0, revert to pre-1.3.0 behavior by setting to false.'
+);
+
+/**
+ * Removes all unrecognized tags from the list of tokens.
+ * 
+ * This strategy iterates through all the tokens and removes unrecognized
+ * tokens. If a token is not recognized but a TagTransform is defined for
+ * that element, the element will be transformed accordingly.
+ */
+
+class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
+{
+    
+    /**
+     * Filters the token list down to tags known to the HTML definition
+     * (or transformable into one) and text.
+     * 
+     * Tags with an entry in the definition are kept, except img tags whose
+     * src is missing, has no attribute definition or fails validation.
+     * Tags with a registered tag transform are replaced by the transform's
+     * result. Any other tag is dropped, or converted to a text token
+     * when Core.EscapeInvalidTags is on. Comments and all remaining
+     * non-text token types are dropped.
+     * 
+     * @param $tokens Array of token objects
+     * @param $config Configuration object
+     * @param $context Context object, passed by reference
+     * @return Array containing only the tokens that were kept
+     */
+    function execute($tokens, $config, &$context) {
+        $definition = $config->getHTMLDefinition();
+        $generator = new HTMLPurifier_Generator();
+        $result = array();
+        $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
+        foreach($tokens as $token) {
+            if (!empty( $token->is_tag )) {
+                // DEFINITION CALL
+                if (isset($definition->info[$token->name])) {
+                    // leave untouched, except for a few special cases:
+                    
+                    // hard-coded image special case, pre-emptively drop
+                    // if not available. Probably not abstract-able
+                    if ( $token->name == 'img' ) {
+                        // no src attribute at all: drop the image
+                        if (!isset($token->attr['src'])) {
+                            continue;
+                        }
+                        // the definition has no src attribute for img:
+                        // nothing to validate against, drop the image
+                        if (!isset($definition->info['img']->attr['src'])) {
+                            continue;
+                        }
+                        // store the validated value back on the token
+                        $token->attr['src'] =
+                            $definition->
+                                info['img']->
+                                    attr['src']->
+                                        validate($token->attr['src'],
+                                            $config, $context);
+                        // validation failed: drop the image
+                        if ($token->attr['src'] === false) continue;
+                    }
+                    
+                } elseif (
+                    isset($definition->info_tag_transform[$token->name])
+                ) {
+                    // there is a transformation for this tag
+                    // DEFINITION CALL
+                    $token = $definition->
+                                info_tag_transform[$token->name]->
+                                    transform($token, $config, $context);
+                } elseif ($escape_invalid_tags) {
+                    // invalid tag, generate HTML and insert in
+                    $token = new HTMLPurifier_Token_Text(
+                        $generator->generateFromToken($token, $config, $context)
+                    );
+                } else {
+                    // unknown tag and escaping is off: drop it
+                    continue;
+                }
+            } elseif ($token->type == 'comment') {
+                // strip comments
+                continue;
+            } elseif ($token->type == 'text') {
+                // text is kept as it is; fall through to the append below
+            } else {
+                // any other token type is discarded
+                continue;
+            }
+            $result[] = $token;
+        }
+        return $result;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/ValidateAttributes.php b/lib/htmlpurifier/HTMLPurifier/Strategy/ValidateAttributes.php
new file mode 100644 (file)
index 0000000..07744f8
--- /dev/null
@@ -0,0 +1,129 @@
+<?php
+
+require_once 'HTMLPurifier/Strategy.php';
+require_once 'HTMLPurifier/HTMLDefinition.php';
+require_once 'HTMLPurifier/IDAccumulator.php';
+
+HTMLPurifier_ConfigSchema::define(
+    'Attr', 'IDBlacklist', array(), 'list',
+    'Array of IDs not allowed in the document.');
+
+/**
+ * Validate all attributes in the tokens.
+ */
+
+class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
+{
+    
+    /**
+     * Validates the attributes of every start and empty token.
+     * 
+     * For each such token the attributes go through the global and
+     * element-specific pre-transforms, are validated one by one against
+     * the element's attribute definitions (falling back to the global
+     * ones), and then go through the global and element-specific
+     * post-transforms. Attributes that fail validation or have no
+     * definition are removed. While running, the context variable
+     * 'IDAccumulator' is registered and it is destroyed at the end.
+     * 
+     * @param $tokens Array of token objects; every start/empty tag name is
+     *        looked up in the HTML definition without an existence check
+     * @param $config Configuration object
+     * @param $context Context object, passed by reference
+     * @return The token array with the attributes of its tags replaced by
+     *         the validated ones
+     */
+    function execute($tokens, $config, &$context) {
+        
+        $definition = $config->getHTMLDefinition();
+        
+        // setup id_accumulator context
+        $id_accumulator = new HTMLPurifier_IDAccumulator();
+        $id_accumulator->load($config->get('Attr', 'IDBlacklist'));
+        $context->register('IDAccumulator', $id_accumulator);
+        
+        // create alias to global definition array, see also $defs
+        // DEFINITION CALL
+        $d_defs = $definition->info_global_attr;
+        
+        foreach ($tokens as $key => $token) {
+            
+            // only process tokens that have attributes,
+            //   namely start and empty tags
+            if ($token->type !== 'start' && $token->type !== 'empty') continue;
+            
+            // copy out attributes for easy manipulation
+            $attr = $token->attr;
+            
+            // do global transformations (pre)
+            // nothing currently utilizes this
+            foreach ($definition->info_attr_transform_pre as $transform) {
+                $attr = $transform->transform($attr, $config, $context);
+            }
+            
+            // do local transformations only applicable to this element (pre)
+            // ex. <p align="right"> to <p style="text-align:right;">
+            foreach ($definition->info[$token->name]->attr_transform_pre
+                as $transform
+            ) {
+                $attr = $transform->transform($attr, $config, $context);
+            }
+            
+            // create alias to this element's attribute definition array, see
+            // also $d_defs (global attribute definition array)
+            // DEFINITION CALL
+            $defs = $definition->info[$token->name]->attr;
+            
+            // iterate through all the attribute keypairs
+            // Watch out for name collisions: $key has previously been used
+            foreach ($attr as $attr_key => $value) {
+                
+                // call the definition
+                if ( isset($defs[$attr_key]) ) {
+                    // there is a local definition defined
+                    if ($defs[$attr_key] === false) {
+                        // We've explicitly been told not to allow this attribute.
+                        // This is usually when there's a global definition
+                        // that must be overridden.
+                        // Theoretically speaking, we could have a
+                        // AttrDef_DenyAll, but this is faster!
+                        $result = false;
+                    } else {
+                        // validate according to the element's definition
+                        $result = $defs[$attr_key]->validate(
+                                        $value, $config, $context
+                                   );
+                    }
+                } elseif ( isset($d_defs[$attr_key]) ) {
+                    // there is a global definition defined, validate according
+                    // to the global definition
+                    $result = $d_defs[$attr_key]->validate(
+                                    $value, $config, $context
+                               );
+                } else {
+                    // system never heard of the attribute? DELETE!
+                    $result = false;
+                }
+                
+                // put the results into effect
+                // (any result that is neither false/null nor a string
+                // leaves the attribute value as it was)
+                if ($result === false || $result === null) {
+                    // remove the attribute
+                    unset($attr[$attr_key]);
+                } elseif (is_string($result)) {
+                    // simple substitution
+                    $attr[$attr_key] = $result;
+                }
+                
+                // we'd also want slightly more complicated substitution
+                // involving an array as the return value,
+                // although we're not sure how colliding attributes would
+                // resolve (certain ones would be completely overridden,
+                // others would prepend themselves).
+            }
+            
+            // post transforms
+            
+            // ex. <x lang="fr"> to <x lang="fr" xml:lang="fr">
+            foreach ($definition->info_attr_transform_post as $transform) {
+                $attr = $transform->transform($attr, $config, $context);
+            }
+            
+            // ex. <bdo> to <bdo dir="ltr">
+            foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
+                $attr = $transform->transform($attr, $config, $context);
+            }
+            
+            // commit changes
+            // could interfere with flyweight implementation
+            $tokens[$key]->attr = $attr;
+        }
+        $context->destroy('IDAccumulator');
+        
+        return $tokens;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/TagTransform.php b/lib/htmlpurifier/HTMLPurifier/TagTransform.php
new file mode 100644 (file)
index 0000000..f5dc5c9
--- /dev/null
@@ -0,0 +1,29 @@
+<?php
+
+require_once 'HTMLPurifier/Token.php';
+
+/**
+ * Defines a mutation of an obsolete tag into a valid tag.
+ */
+class HTMLPurifier_TagTransform
+{
+    
+    /**
+     * Tag name to transform the tag to.
+     * @public
+     */
+    var $transform_to;
+    
+    /**
+     * Transforms the obsolete tag into the valid tag.
+     * 
+     * This base implementation is a placeholder: it only raises an
+     * E_USER_ERROR and returns nothing, so subclasses must override it.
+     * 
+     * @param $tag Tag to be transformed.
+     * @param $config Mandatory HTMLPurifier_Config object
+     * @param $context Mandatory HTMLPurifier_Context object
+     * @return Overriding implementations are expected to return the token
+     *         that replaces $tag
+     */
+    function transform($tag, $config, &$context) {
+        trigger_error('Call to abstract function', E_USER_ERROR);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/TagTransform/Center.php b/lib/htmlpurifier/HTMLPurifier/TagTransform/Center.php
new file mode 100644 (file)
index 0000000..571bb9d
--- /dev/null
@@ -0,0 +1,34 @@
+<?php
+
+require_once 'HTMLPurifier/TagTransform.php';
+
+/**
+ * Transforms CENTER tags into proper version (DIV with text-align CSS)
+ * 
+ * Any non-end tag becomes a DIV whose style attribute begins with
+ * "text-align:center;", followed by whatever inline style it already had.
+ * No other attribute of the original tag is inspected or altered.
+ */
+class HTMLPurifier_TagTransform_Center extends HTMLPurifier_TagTransform
+{
+    var $transform_to = 'div';
+    
+    /**
+     * @param $tag Token to transform; its type and attr are read
+     * @param $config Not used by this transform
+     * @param $context Not used by this transform
+     * @return New token named after $transform_to
+     */
+    function transform($tag, $config, &$context) {
+        // closing tags have nothing to carry over
+        if ($tag->type == 'end') {
+            $closing = new HTMLPurifier_Token_End($this->transform_to);
+            return $closing;
+        }
+        
+        // our declaration goes first, the author's inline CSS after it
+        $style = 'text-align:center;';
+        if (isset($tag->attr['style'])) {
+            $style .= $tag->attr['style'];
+        }
+        
+        $replacement = $tag->copy();
+        $replacement->name = $this->transform_to;
+        $replacement->attr = $tag->attr;
+        $replacement->attr['style'] = $style;
+        return $replacement;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/TagTransform/Font.php b/lib/htmlpurifier/HTMLPurifier/TagTransform/Font.php
new file mode 100644 (file)
index 0000000..ae6d783
--- /dev/null
@@ -0,0 +1,83 @@
+<?php
+
+require_once 'HTMLPurifier/TagTransform.php';
+
+/**
+ * Transforms FONT tags to the proper form (SPAN with CSS styling)
+ * 
+ * The three proprietary FONT attributes (color, face and size) are removed
+ * from the tag and re-expressed as CSS declarations placed at the front of
+ * the style attribute.
+ * 
+ * @note Size does not map cleanly to CSS, so a fixed lookup table is used.
+ *       Thanks to
+ *       http://style.cleverchimp.com/font_size_intervals/altintervals.html
+ *       for reasonable mappings.
+ */
+class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
+{
+    
+    var $transform_to = 'span';
+    
+    /**
+     * FONT size value => CSS font-size value. Sizes missing from this
+     * table produce no font-size declaration at all.
+     */
+    var $_size_lookup = array(
+        '1' => 'xx-small',
+        '2' => 'small',
+        '3' => 'medium',
+        '4' => 'large',
+        '5' => 'x-large',
+        '6' => 'xx-large',
+        '7' => '300%',
+        '-1' => 'smaller',
+        '+1' => 'larger',
+        '-2' => '60%',
+        '+2' => '150%',
+        '+4' => '300%'
+    );
+    
+    /**
+     * @param $tag Token to transform; its type and attr are read
+     * @param $config Not used by this transform
+     * @param $context Not used by this transform
+     * @return New token named after $transform_to
+     */
+    function transform($tag, $config, &$context) {
+        
+        // closing tags have nothing to carry over
+        if ($tag->type == 'end') {
+            $closing = new HTMLPurifier_Token_End($this->transform_to);
+            return $closing;
+        }
+        
+        $attr = $tag->attr;
+        $css = '';
+        
+        // color and face are copied verbatim into their CSS property
+        $verbatim = array('color' => 'color', 'face' => 'font-family');
+        foreach ($verbatim as $attr_name => $property) {
+            if (isset($attr[$attr_name])) {
+                $css .= $property . ':' . $attr[$attr_name] . ';';
+                unset($attr[$attr_name]);
+            }
+        }
+        
+        // size goes through the lookup table; it is removed either way
+        if (isset($attr['size'])) {
+            $size = $attr['size'];
+            unset($attr['size']);
+            if (isset($this->_size_lookup[$size])) {
+                $css .= 'font-size:' . $this->_size_lookup[$size] . ';';
+            }
+        }
+        
+        // generated declarations precede any inline style already present
+        if ($css) {
+            if (isset($attr['style'])) {
+                $css .= $attr['style'];
+            }
+            $attr['style'] = $css;
+        }
+        
+        $replacement = $tag->copy();
+        $replacement->name = $this->transform_to;
+        $replacement->attr = $attr;
+        
+        return $replacement;
+        
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/TagTransform/Simple.php b/lib/htmlpurifier/HTMLPurifier/TagTransform/Simple.php
new file mode 100644 (file)
index 0000000..6ffd0ea
--- /dev/null
@@ -0,0 +1,26 @@
+<?php
+
+require_once 'HTMLPurifier/TagTransform.php';
+
+/**
+ * Simple transformation, just change tag name to something else.
+ */
+class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
+{
+    
+    /**
+     * Stores the replacement tag name.
+     * @param $transform_to Tag name to transform to.
+     */
+    function HTMLPurifier_TagTransform_Simple($transform_to) {
+        $this->transform_to = $transform_to;
+    }
+    
+    /**
+     * Returns a copy of the token carrying the replacement name; every
+     * token type takes the same path.
+     * @param $tag Token to rename
+     * @param $config Not used by this transform
+     * @param $context Not used by this transform
+     */
+    function transform($tag, $config, &$context) {
+        $renamed = $tag->copy();
+        $renamed->name = $this->transform_to;
+        return $renamed;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Token.php b/lib/htmlpurifier/HTMLPurifier/Token.php
new file mode 100644 (file)
index 0000000..555e76f
--- /dev/null
@@ -0,0 +1,168 @@
+<?php
+
+/**
+ * Defines a set of immutable value object tokens for HTML representation.
+ * 
+ * @file
+ */
+
+/**
+ * Abstract base token class that all others inherit from.
+ * 
+ * It declares the $type marker and a copy() placeholder; the concrete
+ * token classes in this file supply their own $type value and copy().
+ */
+class HTMLPurifier_Token {
+    var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
+    
+    /**
+     * Copies the tag into a new one (clone substitute).
+     * 
+     * This base version cannot copy anything: it triggers an E_USER_ERROR
+     * ('Cannot copy abstract class') and has no return statement.
+     * @return Copied token
+     */
+    function copy() {
+        trigger_error('Cannot copy abstract class', E_USER_ERROR);
+    }
+}
+
+/**
+ * Abstract class of a tag token (start, end or empty), and its behavior.
+ */
+class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
+{
+    /**
+     * Bool marker that is true on every tag token.
+     * 
+     * Lets callers test <tt>!empty($obj->is_tag)</tt> instead of paying
+     * for an <tt>is_a()</tt> function call.
+     * 
+     * @public
+     */
+    var $is_tag = true;
+    
+    /**
+     * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
+     * 
+     * @note XML tag names are case sensitive, strictly speaking, but these
+     * tokens cater to HTML, where tag names are case insensitive.
+     * 
+     * @public
+     */
+    var $name;
+    
+    /**
+     * Associative array of the tag's attributes.
+     */
+    var $attr = array();
+    
+    /**
+     * Non-overloaded constructor: lower-cases the tag name and the
+     * attribute keys.
+     * 
+     * @param $name String name.
+     * @param $attr Associative array of attributes.
+     */
+    function HTMLPurifier_Token_Tag($name, $attr = array()) {
+        if (ctype_lower($name)) {
+            $this->name = $name;
+        } else {
+            $this->name = strtolower($name);
+        }
+        foreach ($attr as $given => $value) {
+            // keys made up only of lower-case letters need no work
+            if (ctype_lower($given)) {
+                continue;
+            }
+            $lowered = strtolower($given);
+            // an entry already stored under the lower-case key wins
+            if (!isset($attr[$lowered])) {
+                $attr[$lowered] = $attr[$given];
+            }
+            // remove the original spelling only when lowering changed it
+            if ($lowered !== $given) {
+                unset($attr[$given]);
+            }
+        }
+        $this->attr = $attr;
+    }
+}
+
+/**
+ * Concrete start token class.
+ */
+class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
+{
+    var $type = 'start';
+    
+    /**
+     * Builds a new start token from this token's name and attributes.
+     * @return Copied token
+     */
+    function copy() {
+        $duplicate = new HTMLPurifier_Token_Start($this->name, $this->attr);
+        return $duplicate;
+    }
+}
+
+/**
+ * Concrete empty token class.
+ */
+class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
+{
+    var $type = 'empty';
+    
+    /**
+     * Builds a new empty token from this token's name and attributes.
+     * @return Copied token
+     */
+    function copy() {
+        $duplicate = new HTMLPurifier_Token_Empty($this->name, $this->attr);
+        return $duplicate;
+    }
+}
+
+/**
+ * Concrete end token class.
+ * 
+ * @warning The inherited constructor accepts attributes even though end
+ * tags cannot have any. This is for optimization reasons, as under normal
+ * circumstances the Lexers do not pass attributes.
+ */
+class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
+{
+    var $type = 'end';
+    
+    /**
+     * Builds a new end token from this token's name only; attributes are
+     * not passed on to the copy.
+     * @return Copied token
+     */
+    function copy() {
+        $duplicate = new HTMLPurifier_Token_End($this->name);
+        return $duplicate;
+    }
+}
+
+/**
+ * Concrete text token class.
+ * 
+ * Text tokens comprise of regular parsed character data (PCDATA) and raw
+ * character data (from the CDATA sections). Internally, their data is
+ * parsed with all entities expanded. The token carries the "tag name"
+ * #PCDATA, which is how the DTD represents text in permissible child nodes.
+ */
+class HTMLPurifier_Token_Text extends HTMLPurifier_Token
+{
+    
+    var $name = '#PCDATA'; /**< PCDATA tag name compatible with DTD. @public */
+    var $type = 'text';
+    var $data; /**< Parsed character data of text. @public */
+    var $is_whitespace; /**< Bool indicating if node is whitespace. @public */
+    
+    /**
+     * Constructor: stores the data and records the result of
+     * ctype_space() on it as the whitespace flag.
+     * 
+     * @param $data String parsed character data.
+     */
+    function HTMLPurifier_Token_Text($data) {
+        $this->is_whitespace = ctype_space($data);
+        $this->data = $data;
+    }
+    
+    /**
+     * Builds a new text token from this token's data.
+     * @return Copied token
+     */
+    function copy() {
+        $duplicate = new HTMLPurifier_Token_Text($this->data);
+        return $duplicate;
+    }
+    
+}
+
+/**
+ * Concrete comment token class. Generally will be ignored.
+ */
+class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
+{
+    var $data; /**< Character data within comment. @public */
+    var $type = 'comment';
+    
+    /**
+     * Transparent constructor: stores the data untouched.
+     * 
+     * @param $data String comment data.
+     */
+    function HTMLPurifier_Token_Comment($data) {
+        $this->data = $data;
+    }
+    
+    /**
+     * Builds a new comment token from this token's data.
+     * @return Copied token
+     */
+    function copy() {
+        $duplicate = new HTMLPurifier_Token_Comment($this->data);
+        return $duplicate;
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/TokenFactory.php b/lib/htmlpurifier/HTMLPurifier/TokenFactory.php
new file mode 100644 (file)
index 0000000..25cc412
--- /dev/null
@@ -0,0 +1,96 @@
+<?php
+
+require_once 'HTMLPurifier/Token.php';
+
+/**
+ * Factory for token generation (PHP 5 only).
+ * 
+ * One blank token of every concrete type is built up front. Each create*()
+ * call clones the matching blank and then runs the token's constructor
+ * method on the clone to fill it in.
+ * 
+ * @note Benchmarking indicated that the new operator is much slower than
+ *       the clone operator (even discounting the cost of the constructor);
+ *       this class exists for that optimization.  Porting it to PHP 4 may
+ *       be worthwhile because it makes code easier to read.  Other than
+ *       that there is not much point, as there are no parallel
+ *       HTMLPurifier_Token hierarchies (the main reason to want an
+ *       abstract factory).
+ */
+class HTMLPurifier_TokenFactory
+{
+    
+    /**
+     * Blank tokens that get cloned ("p" stands for prototype).
+     * @private
+     */
+    private $p_start, $p_end, $p_empty, $p_text, $p_comment;
+    
+    /**
+     * Builds one blank prototype per concrete token class.
+     */
+    public function __construct() {
+        $this->p_start   = new HTMLPurifier_Token_Start('', array());
+        $this->p_end     = new HTMLPurifier_Token_End('');
+        $this->p_empty   = new HTMLPurifier_Token_Empty('', array());
+        $this->p_text    = new HTMLPurifier_Token_Text('');
+        $this->p_comment = new HTMLPurifier_Token_Comment('');
+    }
+    
+    /**
+     * Creates a HTMLPurifier_Token_Start.
+     * @param $name Tag name
+     * @param $attr Associative array of attributes
+     * @return Generated HTMLPurifier_Token_Start
+     */
+    public function createStart($name, $attr = array()) {
+        $token = clone $this->p_start;
+        $token->HTMLPurifier_Token_Tag($name, $attr);
+        return $token;
+    }
+    
+    /**
+     * Creates a HTMLPurifier_Token_End.
+     * @param $name Tag name
+     * @return Generated HTMLPurifier_Token_End
+     */
+    public function createEnd($name) {
+        $token = clone $this->p_end;
+        $token->HTMLPurifier_Token_Tag($name);
+        return $token;
+    }
+    
+    /**
+     * Creates a HTMLPurifier_Token_Empty.
+     * @param $name Tag name
+     * @param $attr Associative array of attributes
+     * @return Generated HTMLPurifier_Token_Empty
+     */
+    public function createEmpty($name, $attr = array()) {
+        $token = clone $this->p_empty;
+        $token->HTMLPurifier_Token_Tag($name, $attr);
+        return $token;
+    }
+    
+    /**
+     * Creates a HTMLPurifier_Token_Text.
+     * @param $data Data of text token
+     * @return Generated HTMLPurifier_Token_Text
+     */
+    public function createText($data) {
+        $token = clone $this->p_text;
+        $token->HTMLPurifier_Token_Text($data);
+        return $token;
+    }
+    
+    /**
+     * Creates a HTMLPurifier_Token_Comment.
+     * @param $data Data of comment token
+     * @return Generated HTMLPurifier_Token_Comment
+     */
+    public function createComment($data) {
+        $token = clone $this->p_comment;
+        $token->HTMLPurifier_Token_Comment($data);
+        return $token;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/URIScheme.php b/lib/htmlpurifier/HTMLPurifier/URIScheme.php
new file mode 100644 (file)
index 0000000..20a9781
--- /dev/null
@@ -0,0 +1,44 @@
+<?php
+
+/**
+ * Validator for the components of a URI for a specific scheme
+ */
+class HTMLPurifier_URIScheme
+{
+    
+    /**
+     * Scheme's default port (integer)
+     * @public
+     */
+    var $default_port = null;
+    
+    /**
+     * Whether or not URIs of this scheme are locatable by a browser:
+     * http and ftp are accessible, while mailto and news are not.
+     * @public
+     */
+    var $browsable = false;
+    
+    /**
+     * Validates the components of a URI
+     * 
+     * The only processing done here is on the port: a port that loosely
+     * equals $default_port is replaced by null.  All other components are
+     * handed back unchanged.
+     * @note Children that define a default port should call this
+     *       implementation, as it does the port processing.
+     * @note Fragment is omitted as that is scheme independent
+     * @param $userinfo User info found before at sign in authority
+     * @param $host Hostname in authority
+     * @param $port Port found after colon in authority
+     * @param $path Path of URI
+     * @param $query Query of URI, found after question mark
+     * @param $config HTMLPurifier_Config object
+     * @param $context HTMLPurifier_Context object
+     * @return array($userinfo, $host, $port, $path, $query)
+     */
+    function validateComponents(
+        $userinfo, $host, $port, $path, $query, $config, &$context
+    ) {
+        // loose comparison on purpose kept: "80" and 80 both match
+        $port = ($this->default_port == $port) ? null : $port;
+        return array($userinfo, $host, $port, $path, $query);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/URIScheme/ftp.php b/lib/htmlpurifier/HTMLPurifier/URIScheme/ftp.php
new file mode 100644 (file)
index 0000000..dab9c98
--- /dev/null
@@ -0,0 +1,45 @@
+<?php
+
+require_once 'HTMLPurifier/URIScheme.php';
+
+/**
+ * Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
+ */
+class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
+    
+    var $default_port = 21;
+    var $browsable = true; // usually
+    
+    /**
+     * Runs the parent's port handling, then tidies up the path.
+     * 
+     * Text after the last semicolon is kept as a ";type=" suffix only when
+     * it is exactly type=a, type=i or type=d.  A "type" key with any other
+     * code is dropped; anything else is put back with its semicolon
+     * written as %3B.  Remaining semicolons in the path are encoded too,
+     * and the query component is always returned as null.
+     */
+    function validateComponents(
+        $userinfo, $host, $port, $path, $query, $config, &$context
+    ) {
+        $checked = parent::validateComponents(
+            $userinfo, $host, $port, $path, $query, $config, $context
+        );
+        list($userinfo, $host, $port, $path, $query) = $checked;
+        
+        $cut = strrpos($path, ';'); // last semicolon, if any
+        if ($cut === false) {
+            return array($userinfo, $host, $port, $path, null);
+        }
+        
+        $tail = substr($path, $cut + 1); // text after the semicolon
+        $path = substr($path, 0, $cut);
+        $suffix = '';
+        
+        if (strpos($tail, '=') === false) {
+            // not a key=value pair: keep it as ordinary path text
+            $path .= '%3B' . $tail;
+        } else {
+            list($key, $typecode) = explode('=', $tail, 2);
+            if ($key !== 'type') {
+                // invalid key, tack it back on encoded
+                $path .= '%3B' . $tail;
+            } elseif (in_array($typecode, array('a', 'i', 'd'), true)) {
+                $suffix = ";type=$typecode";
+            }
+        }
+        
+        $path = str_replace(';', '%3B', $path);
+        return array($userinfo, $host, $port, $path . $suffix, null);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/URIScheme/http.php b/lib/htmlpurifier/HTMLPurifier/URIScheme/http.php
new file mode 100644 (file)
index 0000000..54b250d
--- /dev/null
@@ -0,0 +1,24 @@
+<?php
+
+require_once 'HTMLPurifier/URIScheme.php';
+
+/**
+ * Validates http (HyperText Transfer Protocol) as defined by RFC 2616
+ */
+class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {
+    
+    var $default_port = 80;
+    var $browsable = true;
+    
+    /**
+     * Applies the parent's port handling and returns the result with the
+     * userinfo component replaced by null.
+     */
+    function validateComponents(
+        $userinfo, $host, $port, $path, $query, $config, &$context
+    ) {
+        $checked = parent::validateComponents(
+            $userinfo, $host, $port, $path, $query, $config, $context
+        );
+        // slots: 0 userinfo, 1 host, 2 port, 3 path, 4 query
+        return array(null, $checked[1], $checked[2], $checked[3], $checked[4]);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/URIScheme/https.php b/lib/htmlpurifier/HTMLPurifier/URIScheme/https.php
new file mode 100644 (file)
index 0000000..7f89659
--- /dev/null
@@ -0,0 +1,14 @@
+<?php
+
+require_once 'HTMLPurifier/URIScheme/http.php';
+
+/**
+ * Validates https (Secure HTTP) according to http scheme.
+ * 
+ * Only the default port is overridden here; everything else is inherited
+ * from HTMLPurifier_URIScheme_http.
+ */
+class HTMLPurifier_URIScheme_https extends HTMLPurifier_URIScheme_http {
+    
+    var $default_port = 443;
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/URIScheme/mailto.php b/lib/htmlpurifier/HTMLPurifier/URIScheme/mailto.php
new file mode 100644 (file)
index 0000000..2292072
--- /dev/null
@@ -0,0 +1,30 @@
+<?php
+
+require_once 'HTMLPurifier/URIScheme.php';
+
+// VERY RELAXED! Shouldn't cause problems, not even Firefox checks if the
+// email is valid, but be careful!
+
+/**
+ * Validates mailto (for E-mail) according to RFC 2368
+ * @todo Validate the email address
+ * @todo Filter allowed query parameters
+ */
+
+class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme {
+    
+    var $browsable = false;
+    
+    /**
+     * Applies the parent's processing and keeps only path and query;
+     * userinfo, host and port are returned as null.
+     */
+    function validateComponents(
+        $userinfo, $host, $port, $path, $query, $config, &$context
+    ) {
+        $checked = parent::validateComponents(
+            $userinfo, $host, $port, $path, $query, $config, $context
+        );
+        // we need to validate path against RFC 2368's addr-spec
+        // slots: 3 path, 4 query
+        return array(null, null, null, $checked[3], $checked[4]);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/URIScheme/news.php b/lib/htmlpurifier/HTMLPurifier/URIScheme/news.php
new file mode 100644 (file)
index 0000000..c9d1c2b
--- /dev/null
@@ -0,0 +1,24 @@
+<?php
+
+require_once 'HTMLPurifier/URIScheme.php';
+
+/**
+ * Validates news (Usenet) as defined by generic RFC 1738
+ */
+class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme {
+    
+    var $browsable = false;
+    
+    /**
+     * Applies the parent's processing and keeps only the path; every
+     * other component is returned as null.
+     */
+    function validateComponents(
+        $userinfo, $host, $port, $path, $query, $config, &$context
+    ) {
+        $checked = parent::validateComponents(
+            $userinfo, $host, $port, $path, $query, $config, $context
+        );
+        // typecode check needed on path
+        // slot 3 is the path
+        return array(null, null, null, $checked[3], null);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/URIScheme/nntp.php b/lib/htmlpurifier/HTMLPurifier/URIScheme/nntp.php
new file mode 100644 (file)
index 0000000..49fca4c
--- /dev/null
@@ -0,0 +1,24 @@
+<?php
+
+require_once 'HTMLPurifier/URIScheme.php';
+
+/**
+ * Validates nntp (Network News Transfer Protocol) as defined by generic RFC 1738
+ */
+class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme {
+    
+    var $default_port = 119;
+    var $browsable = false;
+    
+    /**
+     * Applies the parent's port handling and keeps host, port and path;
+     * userinfo and query are returned as null.
+     */
+    function validateComponents(
+        $userinfo, $host, $port, $path, $query, $config, &$context
+    ) {
+        $checked = parent::validateComponents(
+            $userinfo, $host, $port, $path, $query, $config, $context
+        );
+        // slots: 1 host, 2 port, 3 path
+        return array(null, $checked[1], $checked[2], $checked[3], null);
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/URISchemeRegistry.php b/lib/htmlpurifier/HTMLPurifier/URISchemeRegistry.php
new file mode 100644 (file)
index 0000000..d840068
--- /dev/null
@@ -0,0 +1,104 @@
+<?php
+
+// Registers URI.AllowedSchemes, which getScheme() in this file reads to
+// decide whether a scheme may be loaded at all.
+// NOTE(review): the array is presumably the directive's default value and
+// 'lookup' its type - confirm against HTMLPurifier_ConfigSchema::define().
+HTMLPurifier_ConfigSchema::define(
+    'URI', 'AllowedSchemes', array(
+        'http'  => true, // "Hypertext Transfer Protocol"
+        'https' => true, // HTTP over SSL (Secure Socket Layer)
+        // quite useful, but not necessary
+        'mailto' => true,// Email
+        'ftp'   => true, // "File Transfer Protocol"
+        'irc'   => true, // "Internet Relay Chat", usually needs another app
+        // for Usenet, these two are similar, but distinct
+        'nntp'  => true, // individual Netnews articles
+        'news'  => true  // newsgroup or individual Netnews articles
+    ), 'lookup',
+    'Whitelist that defines the schemes that a URI is allowed to have.  This '.
+    'prevents XSS attacks from using pseudo-schemes like javascript or mocha.'
+);
+
+// Registers URI.OverrideAllowedSchemes; getScheme() in this file consults
+// it before applying the whitelist above to registered schemes.
+HTMLPurifier_ConfigSchema::define(
+    'URI', 'OverrideAllowedSchemes', true, 'bool',
+    'If this is set to true (which it is by default), you can override '.
+    '%URI.AllowedSchemes by simply registering a HTMLPurifier_URIScheme '.
+    'to the registry.  If false, you will also have to update that directive '.
+    'in order to add more schemes.'
+);
+
+/**
+ * Registry for retrieving specific URI scheme validator objects.
+ */
+class HTMLPurifier_URISchemeRegistry
+{
+    
+    /**
+     * Retrieve sole instance of the registry.
+     * @static
+     * @param $prototype Optional registry object to install as the sole
+     *                   instance, or bool true to reset to a fresh default
+     *                   registry.  Any other non-object value is ignored.
+     * @note Pass a registry object $prototype with a compatible interface
+     *       and the function will store it and return it all further times.
+     * @return Reference to the current registry instance
+     */
+    function &instance($prototype = null) {
+        static $instance = null;
+        if ($prototype === true) {
+            // Must be tested before the object case: true is a reset
+            // request, not a registry to store.  Previously the boolean
+            // itself ended up stored as the "registry".
+            $instance = new HTMLPurifier_URISchemeRegistry();
+        } elseif (is_object($prototype)) {
+            $instance = $prototype;
+        } elseif ($instance === null) {
+            $instance = new HTMLPurifier_URISchemeRegistry();
+        }
+        return $instance;
+    }
+    
+    /**
+     * Cache of retrieved schemes.
+     * @protected
+     */
+    var $schemes = array();
+    
+    /**
+     * Directory where scheme objects can be found
+     * NOTE(review): getScheme() reads and writes $this->_dir, not this
+     * property - confirm which name is intended before unifying them.
+     * @private
+     */
+    var $_scheme_dir = null;
+    
+    /**
+     * Retrieves a scheme validator object
+     * 
+     * Schemes added through register() are served from the cache.  Any
+     * other scheme must be listed in %URI.AllowedSchemes before its class
+     * file is included from the URIScheme directory next to this file.
+     * @param $scheme String scheme name like http or mailto
+     * @param $config HTMLPurifier_Config object
+     * @param $context HTMLPurifier_Context object
+     * @return Reference to the scheme object, or to null when the scheme
+     *         is not allowed or no matching class could be loaded
+     */
+    function &getScheme($scheme, $config, &$context) {
+        if (!$config) $config = HTMLPurifier_Config::createDefault();
+        $null = null; // for the sake of passing by reference
+        
+        // important, otherwise attacker could include arbitrary file
+        $allowed_schemes = $config->get('URI', 'AllowedSchemes');
+        if (!$config->get('URI', 'OverrideAllowedSchemes') &&
+            !isset($allowed_schemes[$scheme])
+        ) {
+            return $null;
+        }
+        
+        // cached (or explicitly registered) schemes are returned as-is
+        if (isset($this->schemes[$scheme])) return $this->schemes[$scheme];
+        if (empty($this->_dir)) $this->_dir = dirname(__FILE__) . '/URIScheme/';
+        
+        // nothing is ever included for a scheme outside the whitelist
+        if (!isset($allowed_schemes[$scheme])) return $null;
+        
+        // a missing file is tolerated; the class check below catches it
+        @include_once $this->_dir . $scheme . '.php';
+        $class = 'HTMLPurifier_URIScheme_' . $scheme;
+        if (!class_exists($class)) return $null;
+        $this->schemes[$scheme] = new $class();
+        return $this->schemes[$scheme];
+    }
+    
+    /**
+     * Registers a custom scheme to the cache.
+     * @param $scheme Scheme name
+     * @param $scheme_obj HTMLPurifier_URIScheme object, stored by reference
+     */
+    function register($scheme, &$scheme_obj) {
+        $this->schemes[$scheme] =& $scheme_obj;
+    }
+    
+}
+
+?>
diff --git a/lib/htmlpurifier/readme_moodle.txt b/lib/htmlpurifier/readme_moodle.txt
new file mode 100644 (file)
index 0000000..53cf107
--- /dev/null
@@ -0,0 +1,8 @@
+Description of HTML Purifier v1.6.0 library import into Moodle
+
+Changes:
+ * Text.php - added nolink, tex and algebra tags
+
+skodak
+
+$Id$
index 986b01c41c101eb138cac0b27cc2fe8dd8011434..5f34a0805f8782c2e87ba94ce953826f1a41b562 100644 (file)
@@ -1658,7 +1658,12 @@ function trusttext_prepare_edit(&$text, &$format, $usehtmleditor, $context) {
  */
 function clean_text($text, $format=FORMAT_MOODLE) {
 
-    global $ALLOWED_TAGS;
+    // $CFG is read for enablehtmlpurifier; $ALLOWED_TAGS by the strip_tags() fallback
+    global $ALLOWED_TAGS, $CFG;
+
+    if (empty($text) or is_numeric($text)) {
+        return (string)$text;
+    }
 
     switch ($format) {
         case FORMAT_PLAIN:
@@ -1667,17 +1669,21 @@ function clean_text($text, $format=FORMAT_MOODLE) {
 
         default:
 
-        /// Fix non standard entity notations
-            $text = preg_replace('/(&#[0-9]+)(;?)/', "\\1;", $text);
-            $text = preg_replace('/(&#x[0-9a-fA-F]+)(;?)/', "\\1;", $text);
-
-        /// Remove tags that are not allowed
-            $text = strip_tags($text, $ALLOWED_TAGS);
-
-        /// Clean up embedded scripts and , using kses
-            $text = cleanAttributes($text);
+            if (!empty($CFG->enablehtmlpurifier)) {
+                $text = purify_html($text);
+            } else {
+            /// Fix non standard entity notations
+                $text = preg_replace('/(&#[0-9]+)(;?)/', "\\1;", $text);
+                $text = preg_replace('/(&#x[0-9a-fA-F]+)(;?)/', "\\1;", $text);
+    
+            /// Remove tags that are not allowed
+                $text = strip_tags($text, $ALLOWED_TAGS);
+    
+            /// Clean up embedded scripts and , using kses
+                $text = cleanAttributes($text);
+            }
 
-        /// Remove script events
+        /// Remove potential script events - some extra protection for undiscovered bugs in our code
             $text = eregi_replace("([^a-z])language([[:space:]]*)=", "\\1Xlanguage=", $text);
             $text = eregi_replace("([^a-z])on([a-z]+)([[:space:]]*)=", "\\1Xon\\2=", $text);
 
@@ -1685,6 +1691,24 @@ function clean_text($text, $format=FORMAT_MOODLE) {
     }
 }
 
+/**
+ * KSES replacement: cleans $text with an HTML Purifier built on first use.
+ */
+function purify_html($text) {
+    global $CFG;
+    static $purifier = false;
+    if ($purifier) {
+        return $purifier->purify($text);
+    }
+    require_once $CFG->libdir.'/htmlpurifier/HTMLPurifier.auto.php';
+    $settings = HTMLPurifier_Config::createDefault();
+    $settings->set('Core', 'AcceptFullDocuments', false);
+    //$settings->set('HTML', 'Strict', true);
+    $settings->set('URI', 'AllowedSchemes', array('http'=>1, 'https'=>1, 'ftp'=>1, 'irc'=>1, 'nntp'=>1, 'news'=>1, 'rtsp'=>1, 'teamspeak'=>1, 'gopher'=>1, 'mms'=>1));
+    $purifier = new HTMLPurifier($settings);
+    return $purifier->purify($text);
+}
+
 /**
  * This function takes a string and examines it for HTML tags.
  * If tags are detected it passes the string to a helper function {@link cleanAttributes2()}
@@ -5032,13 +5056,9 @@ function redirect($url, $message='', $delay=-1, $adminroot = '') {
 
     $message = clean_text($message);
 
-    $url = html_entity_decode($url);
-    $url = str_replace(array("\n", "\r"), '', $url); // some more cleaning
-    $encodedurl = htmlentities($url);
-    $tmpstr = clean_text('<a href="'.$encodedurl.'" />'); //clean encoded URL
-    $encodedurl = substr($tmpstr, 9, strlen($tmpstr)-13);
-    $url = html_entity_decode($encodedurl);
-    $surl = addslashes($url);
+    $encodedurl = preg_replace("/\&(?![a-zA-Z0-9#]{1,8};)/", "&amp;", $url);
+    $encodedurl = preg_replace('/^.*href="([^"]*)".*$/', "\\1", clean_text('<a href="'.$encodedurl.'" />'));
+    $url = str_replace('&amp;', '&', $encodedurl);
 
 /// At developer debug level. Don't redirect if errors have been printed on screen.
 /// Currenly only works in PHP 5.2+
@@ -5081,7 +5101,7 @@ function redirect($url, $message='', $delay=-1, $adminroot = '') {
         @header('Location: '.$url);
         //another way for older browsers and already sent headers (eg trailing whitespace in config.php)
         echo '<meta http-equiv="refresh" content="'. $delay .'; url='. $encodedurl .'" />';
-        echo '<script type="text/javascript">'. "\n" .'//<![CDATA['. "\n". "location.replace('$surl');". "\n". '//]]>'. "\n". '</script>';   // To cope with Mozilla bug
+        echo '<script type="text/javascript">'. "\n" .'//<![CDATA['. "\n". "location.replace('".addslashes_js($url)."');". "\n". '//]]>'. "\n". '</script>';   // To cope with Mozilla bug
         die;
     }
 
@@ -5104,7 +5124,7 @@ function redirect($url, $message='', $delay=-1, $adminroot = '') {
 //<![CDATA[
 
   function redirect() {
-      document.location.replace('<?php echo $surl ?>');
+      document.location.replace('<?php echo addslashes_js($url) ?>');
   }
   setTimeout("redirect()", <?php echo ($delay * 1000) ?>);
 //]]>