From 5adad31057ff7ab44254cfc446add0b261a74733 Mon Sep 17 00:00:00 2001
From: skodak %s
where
+ the url-encoded original URI should be inserted (sample:
+ http://www.google.com/url?q=%s
).
+
+ Uses for this directive: +
++ This directive has been available since 1.3.0. +
+'); -HTMLPurifier_ConfigSchema::define( - 'URI', 'Munge', null, 'string/null', - 'Munges all browsable (usually http, https and ftp) URI\'s into some URL '. - 'redirection service. Pass this directive a URI, with %s inserted where '. - 'the url-encoded original URI should be inserted (sample: '. - 'http://www.google.com/url?q=%s
). '.
- 'This prevents PageRank leaks, while being as transparent as possible '.
- 'to users (you may also want to add some client side JavaScript to '.
- 'override the text in the statusbar). Warning: many security experts '.
- 'believe that this form of protection does not deter spam-bots. '.
- 'You can also use this directive to redirect users to a splash page '.
- 'telling them they are leaving your website. '.
- 'This directive has been available since 1.3.0.'
-);
+// disabling directives
HTMLPurifier_ConfigSchema::define(
- 'URI', 'HostBlacklist', array(), 'list',
- 'List of strings that are forbidden in the host of any URI. Use it to '.
- 'kill domain names of spam, etc. Note that it will catch anything in '.
- 'the domain, so moo.com will catch moo.com.example.com. '.
- 'This directive has been available since 1.3.0.'
-);
+ 'URI', 'Disable', false, 'bool', '
++ Disables all URIs in all forms. Not sure why you\'d want to do that + (after all, the Internet\'s founded on the notion of a hyperlink). + This directive has been available since 1.3.0. +
+'); +HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable'); HTMLPurifier_ConfigSchema::define( - 'URI', 'Disable', false, 'bool', - 'Disables all URIs in all forms. Not sure why you\'d want to do that '. - '(after all, the Internet\'s founded on the notion of a hyperlink). '. - 'This directive has been available since 1.3.0.' -); -HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable'); + 'URI', 'DisableResources', false, 'bool', ' ++ Disables embedding resources, essentially meaning no pictures. You can + still link to them though. See %URI.DisableExternalResources for why + this might be a good idea. This directive has been available since 1.3.0. +
+'); /** * Validates a URI as defined by RFC 3986. @@ -92,205 +67,83 @@ HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable'); class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef { - var $host; - var $PercentEncoder; - var $embeds_resource; + var $parser, $percentEncoder; + var $embedsResource; /** * @param $embeds_resource_resource Does the URI here result in an extra HTTP request? */ function HTMLPurifier_AttrDef_URI($embeds_resource = false) { - $this->host = new HTMLPurifier_AttrDef_URI_Host(); - $this->PercentEncoder = new HTMLPurifier_PercentEncoder(); - $this->embeds_resource = (bool) $embeds_resource; + $this->parser = new HTMLPurifier_URIParser(); + $this->percentEncoder = new HTMLPurifier_PercentEncoder(); + $this->embedsResource = (bool) $embeds_resource; } function validate($uri, $config, &$context) { - // We'll write stack-based parsers later, for now, use regexps to - // get things working as fast as possible (irony) - if ($config->get('URI', 'Disable')) return false; - // parse as CDATA + // initial operations $uri = $this->parseCDATA($uri); + $uri = $this->percentEncoder->normalize($uri); - // fix up percent-encoding - $uri = $this->PercentEncoder->normalize($uri); - - // while it would be nice to use parse_url(), that's specifically - // for HTTP and thus won't work for our generic URI parsing + // parse the URI + $uri = $this->parser->parse($uri); + if ($uri === false) return false; - // according to the RFC... (but this cuts corners, i.e. non-validating) - $r_URI = '!'. - '(([^:/?#<>\'"]+):)?'. // 2. Scheme - '(//([^/?#<>\'"]*))?'. // 4. Authority - '([^?#<>\'"]*)'. // 5. Path - '(\?([^#<>\'"]*))?'. // 7. Query - '(#([^<>\'"]*))?'. // 8. 
Fragment - '!'; + // add embedded flag to context for validators + $context->register('EmbeddedURI', $this->embedsResource); - $matches = array(); - $result = preg_match($r_URI, $uri, $matches); - - if (!$result) return false; // invalid URI - - // seperate out parts - $scheme = !empty($matches[1]) ? $matches[2] : null; - $authority = !empty($matches[3]) ? $matches[4] : null; - $path = $matches[5]; // always present, can be empty - $query = !empty($matches[6]) ? $matches[7] : null; - $fragment = !empty($matches[8]) ? $matches[9] : null; - - - - $registry =& HTMLPurifier_URISchemeRegistry::instance(); - if ($scheme !== null) { - // no need to validate the scheme's fmt since we do that when we - // retrieve the specific scheme object from the registry - $scheme = ctype_lower($scheme) ? $scheme : strtolower($scheme); - $scheme_obj = $registry->getScheme($scheme, $config, $context); - if (!$scheme_obj) return false; // invalid scheme, clean it out - } else { - $scheme_obj = $registry->getScheme( - $config->get('URI', 'DefaultScheme'), $config, $context - ); - } - - - // the URI we're processing embeds_resource a resource in the page, but the URI - // it references cannot be located - if ($this->embeds_resource && !$scheme_obj->browsable) { - return false; - } - - - if ($authority !== null) { + $ok = false; + do { - // remove URI if it's absolute and we disabled externals or - // if it's absolute and embedded and we disabled external resources - unset($our_host); - if ( - $config->get('URI', 'DisableExternal') || - ( - $config->get('URI', 'DisableExternalResources') && - $this->embeds_resource - ) - ) { - $our_host = $config->get('URI', 'Host'); - if ($our_host === null) return false; - } + // generic validation + $result = $uri->validate($config, $context); + if (!$result) break; - $HEXDIG = '[A-Fa-f0-9]'; - $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with [] - $sub_delims = '!$&\'()'; // needs [] - $pct_encoded = "%$HEXDIG$HEXDIG"; - $r_userinfo = 
"(?:[$unreserved$sub_delims:]|$pct_encoded)*"; - $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/"; - $matches = array(); - preg_match($r_authority, $authority, $matches); - // overloads regexp! - $userinfo = !empty($matches[1]) ? $matches[2] : null; - $host = !empty($matches[3]) ? $matches[3] : null; - $port = !empty($matches[4]) ? $matches[5] : null; + // chained validation + $uri_def =& $config->getDefinition('URI'); + $result = $uri_def->filter($uri, $config, $context); + if (!$result) break; - // validate port - if ($port !== null) { - $port = (int) $port; - if ($port < 1 || $port > 65535) $port = null; - } - - $host = $this->host->validate($host, $config, $context); - if ($host === false) $host = null; - - if ($this->checkBlacklist($host, $config, $context)) return false; + // scheme-specific validation + $scheme_obj = $uri->getSchemeObj($config, $context); + if (!$scheme_obj) break; + if ($this->embedsResource && !$scheme_obj->browsable) break; + $result = $scheme_obj->validate($uri, $config, $context); + if (!$result) break; - // more lenient absolute checking - if (isset($our_host)) { - $host_parts = array_reverse(explode('.', $host)); - // could be cached - $our_host_parts = array_reverse(explode('.', $our_host)); - foreach ($our_host_parts as $i => $discard) { - if (!isset($host_parts[$i])) return false; - if ($host_parts[$i] != $our_host_parts[$i]) return false; - } - } - - // userinfo and host are validated within the regexp + // survived gauntlet + $ok = true; - } else { - $port = $host = $userinfo = null; - } - - - // query and fragment are quite simple in terms of definition: - // *( pchar / "/" / "?" ), so define their validation routines - // when we start fixing percent encoding - - - - // path gets to be validated against a hodge-podge of rules depending - // on the status of authority and scheme, but it's not that important, - // esp. 
since it won't be applicable to everyone - + } while (false); + $context->destroy('EmbeddedURI'); + if (!$ok) return false; - // okay, now we defer execution to the subobject for more processing - // note that $fragment is omitted - list($userinfo, $host, $port, $path, $query) = - $scheme_obj->validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context - ); - - - // reconstruct authority - $authority = null; - if (!is_null($userinfo) || !is_null($host) || !is_null($port)) { - $authority = ''; - if($userinfo !== null) $authority .= $userinfo . '@'; - $authority .= $host; - if($port !== null) $authority .= ':' . $port; + // munge scheme off if necessary (this must be last) + if (!is_null($uri->scheme) && is_null($uri->host)) { + if ($uri_def->defaultScheme == $uri->scheme) { + $uri->scheme = null; + } } - // reconstruct the result - $result = ''; - if ($scheme !== null) $result .= "$scheme:"; - if ($authority !== null) $result .= "//$authority"; - $result .= $path; - if ($query !== null) $result .= "?$query"; - if ($fragment !== null) $result .= "#$fragment"; + // back to string + $result = $uri->toString(); - // munge if necessary - $munge = $config->get('URI', 'Munge'); - if (!empty($scheme_obj->browsable) && $munge !== null) { - if ($authority !== null) { - $result = str_replace('%s', rawurlencode($result), $munge); - } + // munge entire URI if necessary + if ( + !is_null($uri->host) && // indicator for authority + !empty($scheme_obj->browsable) && + !is_null($munge = $config->get('URI', 'Munge')) + ) { + $result = str_replace('%s', rawurlencode($result), $munge); } return $result; } - /** - * Checks a host against an array blacklist - * @param $host Host to check - * @param $config HTMLPurifier_Config instance - * @param $context HTMLPurifier_Context instance - * @return bool Is spam? 
- */ - function checkBlacklist($host, &$config, &$context) { - $blacklist = $config->get('URI', 'HostBlacklist'); - if (!empty($blacklist)) { - foreach($blacklist as $blacklisted_host_fragment) { - if (strpos($host, $blacklisted_host_fragment) !== false) { - return true; - } - } - } - return false; - } - } -?> + diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email.php index 80b8d367e1..5a7085db7a 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email.php @@ -14,4 +14,3 @@ class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php index e35b1b4b28..6623f1907f 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php @@ -20,4 +20,3 @@ class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_UR } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php index 5344cdac25..ac729ebd93 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/Host.php @@ -51,4 +51,3 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv4.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv4.php index 0730bbc8ac..9a1af293ba 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv4.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv4.php @@ -15,13 +15,10 @@ class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef */ var $ip4; - function HTMLPurifier_AttrDef_URI_IPv4() { - $oct = 
'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255 - $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})"; - } - function validate($aIP, $config, &$context) { + if (!$this->ip4) $this->_loadRegex(); + if (preg_match('#^' . $this->ip4 . '$#s', $aIP)) { return $aIP; @@ -31,6 +28,14 @@ class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef } + /** + * Lazy load function to prevent regex from being stuffed in + * cache. + */ + function _loadRegex() { + $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255 + $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})"; + } + } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv6.php b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv6.php index 73f085e55e..f48b803dd7 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv6.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrDef/URI/IPv6.php @@ -13,6 +13,8 @@ class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4 function validate($aIP, $config, &$context) { + if (!$this->ip4) $this->_loadRegex(); + $original = $aIP; $hex = '[0-9a-fA-F]'; @@ -96,4 +98,3 @@ class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4 } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform.php index 2fa07b4755..ce69fcbe82 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform.php @@ -55,4 +55,3 @@ class HTMLPurifier_AttrTransform } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/BdoDir.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/BdoDir.php index 0ea5eb6dc2..f127feb2b2 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/BdoDir.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/BdoDir.php @@ -28,4 +28,3 @@ class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform } -?> \ No newline at end of 
file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/BgColor.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/BgColor.php index a7bb2b4564..de2867efdd 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/BgColor.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/BgColor.php @@ -23,4 +23,3 @@ extends HTMLPurifier_AttrTransform { } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/BoolToCSS.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/BoolToCSS.php index f4a16a7f17..25548eea7c 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/BoolToCSS.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/BoolToCSS.php @@ -36,4 +36,3 @@ extends HTMLPurifier_AttrTransform { } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Border.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Border.php index 10c62e3c5b..7da4f6a804 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Border.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Border.php @@ -17,4 +17,3 @@ class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform { } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/EnumToCSS.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/EnumToCSS.php index ed4dfc32dd..0470413dd4 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/EnumToCSS.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/EnumToCSS.php @@ -57,4 +57,3 @@ class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform { } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php index 4ff356d889..d042805538 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php @@ -20,7 +20,10 @@ HTMLPurifier_ConfigSchema::define( ); /** - * Post-transform that ensures 
the required attrs of img (alt and src) are set + * Transform that supplies default values for the src and alt attributes + * in img tags, as well as prevents the img tag from being removed + * because of a missing alt tag. This needs to be registered as both + * a pre and post attribute transform. */ class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform { @@ -29,6 +32,7 @@ class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform $src = true; if (!isset($attr['src'])) { + if ($config->get('Core', 'RemoveInvalidImg')) return $attr; $attr['src'] = $config->get('Attr', 'DefaultInvalidImage'); $src = false; } @@ -47,4 +51,3 @@ class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgSpace.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgSpace.php index 53c787e2c9..60d5edc781 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgSpace.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgSpace.php @@ -44,4 +44,3 @@ extends HTMLPurifier_AttrTransform { } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Lang.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Lang.php index acb1786ae9..899f5c8dc5 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Lang.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Lang.php @@ -27,4 +27,3 @@ class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Length.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Length.php index 2292aa133e..a8904c5e44 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Length.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Length.php @@ -26,4 +26,3 @@ class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform } -?> \ No newline at end of file diff --git 
a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Name.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Name.php index f14c147989..248d0e02fe 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Name.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Name.php @@ -18,4 +18,3 @@ class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/TextAlign.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/TextAlign.php deleted file mode 100644 index 09088fe176..0000000000 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/TextAlign.php +++ /dev/null @@ -1,36 +0,0 @@ - 1, - 'right' => 1, - 'center' => 1, - 'justify' => 1); - - if (!isset($values[$align])) { - return $attr; - } - - $attr['style'] = isset($attr['style']) ? $attr['style'] : ''; - $attr['style'] = "text-align:$align;" . $attr['style']; - - return $attr; - - } - -} - -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTypes.php b/lib/htmlpurifier/HTMLPurifier/AttrTypes.php index e13d0d3005..4cb70be7ad 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTypes.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTypes.php @@ -1,10 +1,14 @@ info['Enum'] = new HTMLPurifier_AttrDef_Enum(); + $this->info['Bool'] = new HTMLPurifier_AttrDef_HTML_Bool(); + $this->info['CDATA'] = new HTMLPurifier_AttrDef_Text(); $this->info['ID'] = new HTMLPurifier_AttrDef_HTML_ID(); $this->info['Length'] = new HTMLPurifier_AttrDef_HTML_Length(); @@ -32,10 +41,42 @@ class HTMLPurifier_AttrTypes $this->info['Pixels'] = new HTMLPurifier_AttrDef_HTML_Pixels(); $this->info['Text'] = new HTMLPurifier_AttrDef_Text(); $this->info['URI'] = new HTMLPurifier_AttrDef_URI(); + $this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang(); + $this->info['Color'] = new HTMLPurifier_AttrDef_HTML_Color(); // number is really a positive integer (one or more digits) + // FIXME: ^^ not always, see start and value of list items 
$this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true); } + + /** + * Retrieves a type + * @param $type String type name + * @return Object AttrDef for type + */ + function get($type) { + + // determine if there is any extra info tacked on + if (strpos($type, '#') !== false) list($type, $string) = explode('#', $type, 2); + else $string = ''; + + if (!isset($this->info[$type])) { + trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR); + return; + } + + return $this->info[$type]->make($string); + + } + + /** + * Sets a new implementation for a type + * @param $type String type name + * @param $impl Object AttrDef for type + */ + function set($type, $impl) { + $this->info[$type] = $impl; + } } -?> + diff --git a/lib/htmlpurifier/HTMLPurifier/AttrValidator.php b/lib/htmlpurifier/HTMLPurifier/AttrValidator.php new file mode 100644 index 0000000000..f02bd2087c --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/AttrValidator.php @@ -0,0 +1,139 @@ +getHTMLDefinition(); + $e =& $context->get('ErrorCollector', true); + + // initialize CurrentToken if necessary + $current_token =& $context->get('CurrentToken', true); + if (!$current_token) $context->register('CurrentToken', $token); + + if ($token->type !== 'start' && $token->type !== 'empty') return $token; + + // create alias to global definition array, see also $defs + // DEFINITION CALL + $d_defs = $definition->info_global_attr; + + // reference attributes for easy manipulation + $attr =& $token->attr; + + // do global transformations (pre) + // nothing currently utilizes this + foreach ($definition->info_attr_transform_pre as $transform) { + $attr = $transform->transform($o = $attr, $config, $context); + if ($e && ($attr != $o)) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + } + + // do local transformations only applicable to this element (pre) + // ex.to
+ foreach ($definition->info[$token->name]->attr_transform_pre as $transform) { + $attr = $transform->transform($o = $attr, $config, $context); + if ($e && ($attr != $o)) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + } + + // create alias to this element's attribute definition array, see + // also $d_defs (global attribute definition array) + // DEFINITION CALL + $defs = $definition->info[$token->name]->attr; + + $attr_key = false; + $context->register('CurrentAttr', $attr_key); + + // iterate through all the attribute keypairs + // Watch out for name collisions: $key has previously been used + foreach ($attr as $attr_key => $value) { + + // call the definition + if ( isset($defs[$attr_key]) ) { + // there is a local definition defined + if ($defs[$attr_key] === false) { + // We've explicitly been told not to allow this element. + // This is usually when there's a global definition + // that must be overridden. + // Theoretically speaking, we could have a + // AttrDef_DenyAll, but this is faster! + $result = false; + } else { + // validate according to the element's definition + $result = $defs[$attr_key]->validate( + $value, $config, $context + ); + } + } elseif ( isset($d_defs[$attr_key]) ) { + // there is a global definition defined, validate according + // to the global definition + $result = $d_defs[$attr_key]->validate( + $value, $config, $context + ); + } else { + // system never heard of the attribute? DELETE! + $result = false; + } + + // put the results into effect + if ($result === false || $result === null) { + // this is a generic error message that should replaced + // with more specific ones when possible + if ($e) $e->send(E_ERROR, 'AttrValidator: Attribute removed'); + + // remove the attribute + unset($attr[$attr_key]); + } elseif (is_string($result)) { + // generally, if a substitution is happening, there + // was some sort of implicit correction going on. 
We'll + // delegate it to the attribute classes to say exactly what. + + // simple substitution + $attr[$attr_key] = $result; + } + + // we'd also want slightly more complicated substitution + // involving an array as the return value, + // although we're not sure how colliding attributes would + // resolve (certain ones would be completely overriden, + // others would prepend themselves). + } + + $context->destroy('CurrentAttr'); + + // post transforms + + // global (error reporting untested) + foreach ($definition->info_attr_transform_post as $transform) { + $attr = $transform->transform($o = $attr, $config, $context); + if ($e && ($attr != $o)) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + } + + // local (error reporting untested) + foreach ($definition->info[$token->name]->attr_transform_post as $transform) { + $attr = $transform->transform($o = $attr, $config, $context); + if ($e && ($attr != $o)) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + } + + // destroy CurrentToken if we made it ourselves + if (!$current_token) $context->destroy('CurrentToken'); + + } + + +} + diff --git a/lib/htmlpurifier/HTMLPurifier/CSSDefinition.php b/lib/htmlpurifier/HTMLPurifier/CSSDefinition.php index 23a66ab76a..8de2aa7b70 100644 --- a/lib/htmlpurifier/HTMLPurifier/CSSDefinition.php +++ b/lib/htmlpurifier/HTMLPurifier/CSSDefinition.php @@ -1,5 +1,7 @@ + Revision identifier for your custom definition. See + %HTML.DefinitionRev for details. This directive has been available + since 2.0.0. +
+'); + /** * Defines allowed CSS attributes and what their values are. * @see HTMLPurifier_HTMLDefinition */ -class HTMLPurifier_CSSDefinition +class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition { + var $type = 'CSS'; + /** * Assoc array of attribute name to definition object. */ @@ -30,7 +43,7 @@ class HTMLPurifier_CSSDefinition /** * Constructs the info array. The meat of this class. */ - function setup($config) { + function doSetup($config) { $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum( array('left', 'right', 'center', 'justify'), false); @@ -213,4 +226,3 @@ class HTMLPurifier_CSSDefinition } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef.php b/lib/htmlpurifier/HTMLPurifier/ChildDef.php index bed43cacd3..5236d266c5 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef.php @@ -36,6 +36,11 @@ class HTMLPurifier_ChildDef */ var $allow_empty; + /** + * Lookup array of all elements that this definition could possibly allow + */ + var $elements = array(); + /** * Validates nodes according to definition and returns modification. 
* @@ -52,4 +57,4 @@ class HTMLPurifier_ChildDef } } -?> + diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Chameleon.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Chameleon.php index afe0299fa7..b338354d38 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef/Chameleon.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef/Chameleon.php @@ -35,6 +35,7 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef function HTMLPurifier_ChildDef_Chameleon($inline, $block) { $this->inline = new HTMLPurifier_ChildDef_Optional($inline); $this->block = new HTMLPurifier_ChildDef_Optional($block); + $this->elements = $this->block->elements; } function validateChildren($tokens_of_children, $config, &$context) { @@ -48,4 +49,3 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef } } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php index de18cd7070..ba722d0595 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php @@ -38,8 +38,27 @@ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef if ($raw{0} != '(') { $raw = "($raw)"; } - $reg = str_replace(',', ',?', $raw); - $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg); + $el = '[#a-zA-Z0-9_.-]+'; + $reg = $raw; + + // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M + // DOING! Seriously: if there's problems, please report them. 
+ + // collect all elements into the $elements array + preg_match_all("/$el/", $reg, $matches); + foreach ($matches[0] as $match) { + $this->elements[$match] = true; + } + + // setup all elements as parentheticals with leading commas + $reg = preg_replace("/$el/", '(,\\0)', $reg); + + // remove commas when they were not solicited + $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg); + + // remove all non-paranthetical commas: they are handled by first regex + $reg = preg_replace("/,\(/", '(', $reg); + $this->_pcre_regex = $reg; } function validateChildren($tokens_of_children, $config, &$context) { @@ -60,11 +79,11 @@ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef $list_of_children .= $token->name . ','; } } - $list_of_children = rtrim($list_of_children, ','); - + // add leading comma to deal with stray comma declarations + $list_of_children = ',' . rtrim($list_of_children, ','); $okay = preg_match( - '/^'.$this->_pcre_regex.'$/', + '/^,?'.$this->_pcre_regex.'$/', $list_of_children ); @@ -72,4 +91,3 @@ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef } } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Empty.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Empty.php index 1ab4fdd657..6e63730770 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef/Empty.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef/Empty.php @@ -19,4 +19,3 @@ class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef } } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Optional.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Optional.php index cc8883263e..779a7f06b9 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef/Optional.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef/Optional.php @@ -20,4 +20,3 @@ class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required } } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Required.php 
b/lib/htmlpurifier/HTMLPurifier/ChildDef/Required.php index c6f706e29a..f4d908b05d 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef/Required.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef/Required.php @@ -25,11 +25,10 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef $elements = array_flip($elements); foreach ($elements as $i => $x) { $elements[$i] = true; - if (empty($i)) unset($elements[$i]); + if (empty($i)) unset($elements[$i]); // remove blank } } $this->elements = $elements; - $this->gen = new HTMLPurifier_Generator(); } var $allow_empty = false; var $type = 'required'; @@ -57,6 +56,12 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef // some configuration $escape_invalid_children = $config->get('Core', 'EscapeInvalidChildren'); + // generator + static $gen = null; + if ($gen === null) { + $gen = new HTMLPurifier_Generator(); + } + foreach ($tokens_of_children as $token) { if (!empty($token->is_whitespace)) { $result[] = $token; @@ -80,7 +85,7 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef $result[] = $token; } elseif ($pcdata_allowed && $escape_invalid_children) { $result[] = new HTMLPurifier_Token_Text( - $this->gen->generateFromToken($token, $config) + $gen->generateFromToken($token, $config) ); } continue; @@ -91,7 +96,7 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef } elseif ($pcdata_allowed && $escape_invalid_children) { $result[] = new HTMLPurifier_Token_Text( - $this->gen->generateFromToken( $token, $config ) + $gen->generateFromToken( $token, $config ) ); } else { // drop silently @@ -104,4 +109,3 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef } } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php index 9280a9f50a..60dcbc4a15 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php +++ 
b/lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php @@ -45,8 +45,8 @@ extends HTMLPurifier_ChildDef_Required if (!$is_inline) { if (!$depth) { if ( - $token->type == 'text' || - !isset($this->elements[$token->name]) + ($token->type == 'text' && !$token->is_whitespace) || + ($token->type != 'text' && !isset($this->elements[$token->name])) ) { $is_inline = true; $ret[] = $block_wrap_start; @@ -73,4 +73,3 @@ extends HTMLPurifier_ChildDef_Required } } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php index 3534cdd0a6..ca3c83cc0e 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef/Table.php @@ -9,6 +9,8 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef { var $allow_empty = false; var $type = 'table'; + var $elements = array('tr' => true, 'tbody' => true, 'thead' => true, + 'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true); function HTMLPurifier_ChildDef_Table() {} function validateChildren($tokens_of_children, $config, &$context) { if (empty($tokens_of_children)) return false; @@ -139,4 +141,3 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef } } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Config.php b/lib/htmlpurifier/HTMLPurifier/Config.php index c94e01f636..b9b77178e0 100644 --- a/lib/htmlpurifier/HTMLPurifier/Config.php +++ b/lib/htmlpurifier/HTMLPurifier/Config.php @@ -1,5 +1,29 @@ +if (!defined('PHP_EOL')) { + switch (strtoupper(substr(PHP_OS, 0, 3))) { + case 'WIN': + define('PHP_EOL', "\r\n"); + break; + case 'DAR': + define('PHP_EOL', "\r"); + break; + default: + define('PHP_EOL', "\n"); + } +} + /** * Configuration object that triggers customizable behavior. 
* @@ -15,6 +39,11 @@ class HTMLPurifier_Config { + /** + * HTML Purifier's version + */ + var $version = '2.1.1'; + /** * Two-level associative array of configuration directives */ @@ -26,14 +55,31 @@ class HTMLPurifier_Config var $def; /** - * Cached instance of HTMLPurifier_HTMLDefinition + * Indexed array of definitions + */ + var $definitions; + + /** + * Bool indicator whether or not config is finalized */ - var $html_definition; + var $finalized = false; /** - * Cached instance of HTMLPurifier_CSSDefinition + * Bool indicator whether or not to automatically finalize + * the object if a read operation is done */ - var $css_definition; + var $autoFinalize = true; + + /** + * Namespace indexed array of serials for specific namespaces (see + * getSerial for more info). + */ + var $serials = array(); + + /** + * Serial for entire configuration object + */ + var $serial; /** * @param $definition HTMLPurifier_ConfigSchema that defines what directives @@ -54,7 +100,10 @@ class HTMLPurifier_Config * @return Configured HTMLPurifier_Config object */ function create($config) { - if (is_a($config, 'HTMLPurifier_Config')) return $config; + if (is_a($config, 'HTMLPurifier_Config')) { + // pass-through + return $config; + } $ret = HTMLPurifier_Config::createDefault(); if (is_string($config)) $ret->loadIni($config); elseif (is_array($config)) $ret->loadArray($config); @@ -78,13 +127,16 @@ class HTMLPurifier_Config * @param $key String key */ function get($namespace, $key, $from_alias = false) { + if (!$this->finalized && $this->autoFinalize) $this->finalize(); if (!isset($this->def->info[$namespace][$key])) { - trigger_error('Cannot retrieve value of undefined directive', + // can't add % due to SimpleTest bug + trigger_error('Cannot retrieve value of undefined directive ' . 
htmlspecialchars("$namespace.$key"), E_USER_WARNING); return; } if ($this->def->info[$namespace][$key]->class == 'alias') { - trigger_error('Cannot get value from aliased directive, use real name', + $d = $this->def->info[$namespace][$key]; + trigger_error('Cannot get value from aliased directive, use real name ' . $d->namespace . '.' . $d->name, E_USER_ERROR); return; } @@ -96,14 +148,50 @@ class HTMLPurifier_Config * @param $namespace String namespace */ function getBatch($namespace) { + if (!$this->finalized && $this->autoFinalize) $this->finalize(); if (!isset($this->def->info[$namespace])) { - trigger_error('Cannot retrieve undefined namespace', + trigger_error('Cannot retrieve undefined namespace ' . htmlspecialchars($namespace), E_USER_WARNING); return; } return $this->conf[$namespace]; } + /** + * Returns a md5 signature of a segment of the configuration object + * that uniquely identifies that particular configuration + * @note Revision is handled specially and is removed from the batch + * before processing! + * @param $namespace Namespace to get serial for + */ + function getBatchSerial($namespace) { + if (empty($this->serials[$namespace])) { + $batch = $this->getBatch($namespace); + unset($batch['DefinitionRev']); + $this->serials[$namespace] = md5(serialize($batch)); + } + return $this->serials[$namespace]; + } + + /** + * Returns a md5 signature for the entire configuration object + * that uniquely identifies that particular configuration + */ + function getSerial() { + if (empty($this->serial)) { + $this->serial = md5(serialize($this->getAll())); + } + return $this->serial; + } + + /** + * Retrieves all directives, organized by namespace + */ + function getAll() { + if (!$this->finalized && $this->autoFinalize) $this->finalize(); + return $this->conf; + } + /** * Sets a value to configuration. 
* @param $namespace String namespace @@ -111,15 +199,16 @@ class HTMLPurifier_Config * @param $value Mixed value */ function set($namespace, $key, $value, $from_alias = false) { + if ($this->isFinalized('Cannot set directive after finalization')) return; if (!isset($this->def->info[$namespace][$key])) { - trigger_error('Cannot set undefined directive to value', + trigger_error('Cannot set undefined directive ' . htmlspecialchars("$namespace.$key") . ' to value', E_USER_WARNING); return; } if ($this->def->info[$namespace][$key]->class == 'alias') { if ($from_alias) { trigger_error('Double-aliases not allowed, please fix '. - 'ConfigSchema bug'); + 'ConfigSchema bug with' . "$namespace.$key"); } $this->set($this->def->info[$namespace][$key]->namespace, $this->def->info[$namespace][$key]->name, @@ -128,7 +217,7 @@ class HTMLPurifier_Config } $value = $this->def->validate( $value, - $this->def->info[$namespace][$key]->type, + $type = $this->def->info[$namespace][$key]->type, $this->def->info[$namespace][$key]->allow_null ); if (is_string($value)) { @@ -139,23 +228,36 @@ class HTMLPurifier_Config if ($this->def->info[$namespace][$key]->allowed !== true) { // check to see if the value is allowed if (!isset($this->def->info[$namespace][$key]->allowed[$value])) { - trigger_error('Value not supported', E_USER_WARNING); + trigger_error('Value not supported, valid values are: ' . + $this->_listify($this->def->info[$namespace][$key]->allowed), E_USER_WARNING); return; } } } if ($this->def->isError($value)) { - trigger_error('Value is of invalid type', E_USER_WARNING); + trigger_error('Value for ' . "$namespace.$key" . ' is of invalid type, should be ' . 
$type, E_USER_WARNING); return; } $this->conf[$namespace][$key] = $value; - if ($namespace == 'HTML' || $namespace == 'Attr') { - // reset HTML definition if relevant attributes changed - $this->html_definition = null; - } - if ($namespace == 'CSS') { - $this->css_definition = null; + + // reset definitions if the directives they depend on changed + // this is a very costly process, so it's discouraged + // with finalization + if ($namespace == 'HTML' || $namespace == 'CSS') { + $this->definitions[$namespace] = null; } + + $this->serials[$namespace] = false; + } + + /** + * Convenience function for error reporting + * @private + */ + function _listify($lookup) { + $list = array(); + foreach ($lookup as $name => $b) $list[] = $name; + return implode(', ', $list); } /** @@ -164,26 +266,76 @@ class HTMLPurifier_Config * called before it's been setup, otherwise won't work. */ function &getHTMLDefinition($raw = false) { - if ( - empty($this->html_definition) || // hasn't ever been setup - ($raw && $this->html_definition->setup) // requesting new one - ) { - $this->html_definition = new HTMLPurifier_HTMLDefinition($this); - if ($raw) return $this->html_definition; // no setup! 
- } - if (!$this->html_definition->setup) $this->html_definition->setup(); - return $this->html_definition; + $def =& $this->getDefinition('HTML', $raw); + return $def; // prevent PHP 4.4.0 from complaining } /** * Retrieves reference to the CSS definition */ - function &getCSSDefinition() { - if ($this->css_definition === null) { - $this->css_definition = new HTMLPurifier_CSSDefinition(); - $this->css_definition->setup($this); + function &getCSSDefinition($raw = false) { + $def =& $this->getDefinition('CSS', $raw); + return $def; + } + + /** + * Retrieves a definition + * @param $type Type of definition: HTML, CSS, etc + * @param $raw Whether or not definition should be returned raw + */ + function &getDefinition($type, $raw = false) { + if (!$this->finalized && $this->autoFinalize) $this->finalize(); + $factory = HTMLPurifier_DefinitionCacheFactory::instance(); + $cache = $factory->create($type, $this); + if (!$raw) { + // see if we can quickly supply a definition + if (!empty($this->definitions[$type])) { + if (!$this->definitions[$type]->setup) { + $this->definitions[$type]->setup($this); + $cache->set($this->definitions[$type], $this); + } + return $this->definitions[$type]; + } + // memory check missed, try cache + $this->definitions[$type] = $cache->get($this); + if ($this->definitions[$type]) { + // definition in cache, return it + return $this->definitions[$type]; + } + } elseif ( + !empty($this->definitions[$type]) && + !$this->definitions[$type]->setup + ) { + // raw requested, raw in memory, quick return + return $this->definitions[$type]; + } + // quick checks failed, let's create the object + if ($type == 'HTML') { + $this->definitions[$type] = new HTMLPurifier_HTMLDefinition(); + } elseif ($type == 'CSS') { + $this->definitions[$type] = new HTMLPurifier_CSSDefinition(); + } elseif ($type == 'URI') { + $this->definitions[$type] = new HTMLPurifier_URIDefinition(); + } else { + trigger_error("Definition of $type type not supported"); + $false = false; + 
return $false; + } + // quick abort if raw + if ($raw) { + if (is_null($this->get($type, 'DefinitionID'))) { + // fatally error out if definition ID not set + trigger_error("Cannot retrieve raw version without specifying %$type.DefinitionID", E_USER_ERROR); + $false = new HTMLPurifier_Error(); + return $false; + } + return $this->definitions[$type]; } - return $this->css_definition; + // set it up + $this->definitions[$type]->setup($this); + // save in cache + $cache->set($this->definitions[$type], $this); + return $this->definitions[$type]; } /** @@ -192,6 +344,7 @@ class HTMLPurifier_Config * @param $config_array Configuration associative array */ function loadArray($config_array) { + if ($this->isFinalized('Cannot load directives after finalization')) return; foreach ($config_array as $key => $value) { $key = str_replace('_', '.', $key); if (strpos($key, '.') !== false) { @@ -208,15 +361,134 @@ class HTMLPurifier_Config } } + /** + * Returns a list of array(namespace, directive) for all directives + * that are allowed in a web-form context as per an allowed + * namespaces/directives list. 
+ * @param $allowed List of allowed namespaces/directives + * @static + */ + function getAllowedDirectivesForForm($allowed) { + $schema = HTMLPurifier_ConfigSchema::instance(); + if ($allowed !== true) { + if (is_string($allowed)) $allowed = array($allowed); + $allowed_ns = array(); + $allowed_directives = array(); + $blacklisted_directives = array(); + foreach ($allowed as $ns_or_directive) { + if (strpos($ns_or_directive, '.') !== false) { + // directive + if ($ns_or_directive[0] == '-') { + $blacklisted_directives[substr($ns_or_directive, 1)] = true; + } else { + $allowed_directives[$ns_or_directive] = true; + } + } else { + // namespace + $allowed_ns[$ns_or_directive] = true; + } + } + } + $ret = array(); + foreach ($schema->info as $ns => $keypairs) { + foreach ($keypairs as $directive => $def) { + if ($allowed !== true) { + if (isset($blacklisted_directives["$ns.$directive"])) continue; + if (!isset($allowed_directives["$ns.$directive"]) && !isset($allowed_ns[$ns])) continue; + } + if ($def->class == 'alias') continue; + if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') continue; + $ret[] = array($ns, $directive); + } + } + return $ret; + } + + /** + * Loads configuration values from $_GET/$_POST that were posted + * via ConfigForm + * @param $array $_GET or $_POST array to import + * @param $index Index/name that the config variables are in + * @param $allowed List of allowed namespaces/directives + * @param $mq_fix Boolean whether or not to enable magic quotes fix + * @static + */ + function loadArrayFromForm($array, $index, $allowed = true, $mq_fix = true) { + $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix); + $config = HTMLPurifier_Config::create($ret); + return $config; + } + + /** + * Merges in configuration values from $_GET/$_POST to object. NOT STATIC. 
+ * @note Same parameters as loadArrayFromForm + */ + function mergeArrayFromForm($array, $index, $allowed = true, $mq_fix = true) { + $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix); + $this->loadArray($ret); + } + + /** + * Prepares an array from a form into something usable for the more + * strict parts of HTMLPurifier_Config + * @static + */ + function prepareArrayFromForm($array, $index, $allowed = true, $mq_fix = true) { + $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array(); + $mq = get_magic_quotes_gpc() && $mq_fix; + + $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed); + $ret = array(); + foreach ($allowed as $key) { + list($ns, $directive) = $key; + $skey = "$ns.$directive"; + if (!empty($array["Null_$skey"])) { + $ret[$ns][$directive] = null; + continue; + } + if (!isset($array[$skey])) continue; + $value = $mq ? stripslashes($array[$skey]) : $array[$skey]; + $ret[$ns][$directive] = $value; + } + return $ret; + } + /** * Loads configuration values from an ini file * @param $filename Name of ini file */ function loadIni($filename) { + if ($this->isFinalized('Cannot load directives after finalization')) return; $array = parse_ini_file($filename, true); $this->loadArray($array); } + /** + * Checks whether or not the configuration object is finalized. 
+ * @param $error String error message, or false for no error + */ + function isFinalized($error = false) { + if ($this->finalized && $error) { + trigger_error($error, E_USER_ERROR); + } + return $this->finalized; + } + + /** + * Finalizes configuration only if auto finalize is on and not + * already finalized + */ + function autoFinalize() { + if (!$this->finalized && $this->autoFinalize) $this->finalize(); + } + + /** + * Finalizes a configuration object, prohibiting further change + */ + function finalize() { + $this->finalized = true; + } + } -?> + diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigDef.php b/lib/htmlpurifier/HTMLPurifier/ConfigDef.php index b92640dc61..21825e01b8 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigDef.php +++ b/lib/htmlpurifier/HTMLPurifier/ConfigDef.php @@ -7,4 +7,3 @@ class HTMLPurifier_ConfigDef { var $class = false; } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigDef/Directive.php b/lib/htmlpurifier/HTMLPurifier/ConfigDef/Directive.php index 39026540b3..21c33fae8d 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigDef/Directive.php +++ b/lib/htmlpurifier/HTMLPurifier/ConfigDef/Directive.php @@ -61,6 +61,12 @@ class HTMLPurifier_ConfigDef_Directive extends HTMLPurifier_ConfigDef */ var $aliases = array(); + /** + * Advisory list of directive aliases, i.e. 
other directives that + * redirect here + */ + var $directiveAliases = array(); + /** * Adds a description to the array */ @@ -71,4 +77,3 @@ class HTMLPurifier_ConfigDef_Directive extends HTMLPurifier_ConfigDef } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigDef/DirectiveAlias.php b/lib/htmlpurifier/HTMLPurifier/ConfigDef/DirectiveAlias.php index 81a4451413..6637802621 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigDef/DirectiveAlias.php +++ b/lib/htmlpurifier/HTMLPurifier/ConfigDef/DirectiveAlias.php @@ -24,4 +24,3 @@ class HTMLPurifier_ConfigDef_DirectiveAlias extends HTMLPurifier_ConfigDef } } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigDef/Namespace.php b/lib/htmlpurifier/HTMLPurifier/ConfigDef/Namespace.php index f53892b47e..21d732114f 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigDef/Namespace.php +++ b/lib/htmlpurifier/HTMLPurifier/ConfigDef/Namespace.php @@ -20,4 +20,3 @@ class HTMLPurifier_ConfigDef_Namespace extends HTMLPurifier_ConfigDef { } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema.php b/lib/htmlpurifier/HTMLPurifier/ConfigSchema.php index 940e8e6199..d6700e6ec1 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema.php +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema.php @@ -6,8 +6,11 @@ require_once 'HTMLPurifier/ConfigDef/Namespace.php'; require_once 'HTMLPurifier/ConfigDef/Directive.php'; require_once 'HTMLPurifier/ConfigDef/DirectiveAlias.php'; +if (!defined('HTMLPURIFIER_SCHEMA_STRICT')) define('HTMLPURIFIER_SCHEMA_STRICT', false); + /** * Configuration definition, defines directives and their defaults. 
+ * @note If you update this, please update Printer_ConfigForm * @todo The ability to define things multiple times is confusing and should * be factored out to its own function named registerDependency() or * addNote(), where only the namespace.name and an extra descriptions @@ -48,6 +51,8 @@ class HTMLPurifier_ConfigSchema { var $types = array( 'string' => 'String', 'istring' => 'Case-insensitive string', + 'text' => 'Text', + 'itext' => 'Case-insensitive text', 'int' => 'Integer', 'float' => 'Float', 'bool' => 'Boolean', @@ -66,6 +71,10 @@ class HTMLPurifier_ConfigSchema { $this->defineNamespace('URI', 'Features regarding Uniform Resource Identifiers.'); $this->defineNamespace('HTML', 'Configuration regarding allowed HTML.'); $this->defineNamespace('CSS', 'Configuration regarding allowed CSS.'); + $this->defineNamespace('AutoFormat', 'Configuration for activating auto-formatting functionality (also known asInjector
s)');
+ $this->defineNamespace('AutoFormatParam', 'Configuration for customizing auto-formatting functionality');
+ $this->defineNamespace('Output', 'Configuration relating to the generation of (X)HTML.');
+ $this->defineNamespace('Cache', 'Configuration for DefinitionCache and related subclasses.');
$this->defineNamespace('Test', 'Developer testing configuration for our unit tests.');
}
@@ -95,27 +104,30 @@ class HTMLPurifier_ConfigSchema {
* HTMLPurifier_DirectiveDef::$type for allowed values
* @param $description Description of directive for documentation
*/
- function define(
- $namespace, $name, $default, $type,
- $description
- ) {
+ function define($namespace, $name, $default, $type, $description) {
$def =& HTMLPurifier_ConfigSchema::instance();
- if (!isset($def->info[$namespace])) {
- trigger_error('Cannot define directive for undefined namespace',
- E_USER_ERROR);
- return;
- }
- if (!ctype_alnum($name)) {
- trigger_error('Directive name must be alphanumeric',
- E_USER_ERROR);
- return;
- }
- if (empty($description)) {
- trigger_error('Description must be non-empty',
- E_USER_ERROR);
- return;
+
+ // basic sanity checks
+ if (HTMLPURIFIER_SCHEMA_STRICT) {
+ if (!isset($def->info[$namespace])) {
+ trigger_error('Cannot define directive for undefined namespace',
+ E_USER_ERROR);
+ return;
+ }
+ if (!ctype_alnum($name)) {
+ trigger_error('Directive name must be alphanumeric',
+ E_USER_ERROR);
+ return;
+ }
+ if (empty($description)) {
+ trigger_error('Description must be non-empty',
+ E_USER_ERROR);
+ return;
+ }
}
+
if (isset($def->info[$namespace][$name])) {
+ // already defined
if (
$def->info[$namespace][$name]->type !== $type ||
$def->defaults[$namespace][$name] !== $default
@@ -124,29 +136,35 @@ class HTMLPurifier_ConfigSchema {
return;
}
} else {
- // process modifiers
+ // needs defining
+
+ // process modifiers (OPTIMIZE!)
$type_values = explode('/', $type, 2);
$type = $type_values[0];
$modifier = isset($type_values[1]) ? $type_values[1] : false;
$allow_null = ($modifier === 'null');
- if (!isset($def->types[$type])) {
- trigger_error('Invalid type for configuration directive',
- E_USER_ERROR);
- return;
- }
- $default = $def->validate($default, $type, $allow_null);
- if ($def->isError($default)) {
- trigger_error('Default value does not match directive type',
- E_USER_ERROR);
- return;
+ if (HTMLPURIFIER_SCHEMA_STRICT) {
+ if (!isset($def->types[$type])) {
+ trigger_error('Invalid type for configuration directive',
+ E_USER_ERROR);
+ return;
+ }
+ $default = $def->validate($default, $type, $allow_null);
+ if ($def->isError($default)) {
+ trigger_error('Default value does not match directive type',
+ E_USER_ERROR);
+ return;
+ }
}
+
$def->info[$namespace][$name] =
new HTMLPurifier_ConfigDef_Directive();
$def->info[$namespace][$name]->type = $type;
$def->info[$namespace][$name]->allow_null = $allow_null;
$def->defaults[$namespace][$name] = $default;
}
+ if (!HTMLPURIFIER_SCHEMA_STRICT) return;
$backtrace = debug_backtrace();
$file = $def->mungeFilename($backtrace[0]['file']);
$line = $backtrace[0]['line'];
@@ -161,19 +179,21 @@ class HTMLPurifier_ConfigSchema {
*/
function defineNamespace($namespace, $description) {
$def =& HTMLPurifier_ConfigSchema::instance();
- if (isset($def->info[$namespace])) {
- trigger_error('Cannot redefine namespace', E_USER_ERROR);
- return;
- }
- if (!ctype_alnum($namespace)) {
- trigger_error('Namespace name must be alphanumeric',
- E_USER_ERROR);
- return;
- }
- if (empty($description)) {
- trigger_error('Description must be non-empty',
- E_USER_ERROR);
- return;
+ if (HTMLPURIFIER_SCHEMA_STRICT) {
+ if (isset($def->info[$namespace])) {
+ trigger_error('Cannot redefine namespace', E_USER_ERROR);
+ return;
+ }
+ if (!ctype_alnum($namespace)) {
+ trigger_error('Namespace name must be alphanumeric',
+ E_USER_ERROR);
+ return;
+ }
+ if (empty($description)) {
+ trigger_error('Description must be non-empty',
+ E_USER_ERROR);
+ return;
+ }
}
$def->info[$namespace] = array();
$def->info_namespace[$namespace] = new HTMLPurifier_ConfigDef_Namespace();
@@ -194,23 +214,25 @@ class HTMLPurifier_ConfigSchema {
*/
function defineValueAliases($namespace, $name, $aliases) {
$def =& HTMLPurifier_ConfigSchema::instance();
- if (!isset($def->info[$namespace][$name])) {
+ if (HTMLPURIFIER_SCHEMA_STRICT && !isset($def->info[$namespace][$name])) {
trigger_error('Cannot set value alias for non-existant directive',
E_USER_ERROR);
return;
}
foreach ($aliases as $alias => $real) {
- if (!$def->info[$namespace][$name] !== true &&
- !isset($def->info[$namespace][$name]->allowed[$real])
- ) {
- trigger_error('Cannot define alias to value that is not allowed',
- E_USER_ERROR);
- return;
- }
- if (isset($def->info[$namespace][$name]->allowed[$alias])) {
- trigger_error('Cannot define alias over allowed value',
- E_USER_ERROR);
- return;
+ if (HTMLPURIFIER_SCHEMA_STRICT) {
+ if (!$def->info[$namespace][$name] !== true &&
+ !isset($def->info[$namespace][$name]->allowed[$real])
+ ) {
+ trigger_error('Cannot define alias to value that is not allowed',
+ E_USER_ERROR);
+ return;
+ }
+ if (isset($def->info[$namespace][$name]->allowed[$alias])) {
+ trigger_error('Cannot define alias over allowed value',
+ E_USER_ERROR);
+ return;
+ }
}
$def->info[$namespace][$name]->aliases[$alias] = $real;
}
@@ -225,14 +247,14 @@ class HTMLPurifier_ConfigSchema {
*/
function defineAllowedValues($namespace, $name, $allowed_values) {
$def =& HTMLPurifier_ConfigSchema::instance();
- if (!isset($def->info[$namespace][$name])) {
+ if (HTMLPURIFIER_SCHEMA_STRICT && !isset($def->info[$namespace][$name])) {
trigger_error('Cannot define allowed values for undefined directive',
E_USER_ERROR);
return;
}
$directive =& $def->info[$namespace][$name];
$type = $directive->type;
- if ($type != 'string' && $type != 'istring') {
+ if (HTMLPURIFIER_SCHEMA_STRICT && $type != 'string' && $type != 'istring') {
trigger_error('Cannot define allowed values for directive whose type is not string',
E_USER_ERROR);
return;
@@ -243,8 +265,11 @@ class HTMLPurifier_ConfigSchema {
foreach ($allowed_values as $value) {
$directive->allowed[$value] = true;
}
- if ($def->defaults[$namespace][$name] !== null &&
- !isset($directive->allowed[$def->defaults[$namespace][$name]])) {
+ if (
+ HTMLPURIFIER_SCHEMA_STRICT &&
+ $def->defaults[$namespace][$name] !== null &&
+ !isset($directive->allowed[$def->defaults[$namespace][$name]])
+ ) {
trigger_error('Default value must be in allowed range of variables',
E_USER_ERROR);
$directive->allowed = true; // undo undo!
@@ -262,34 +287,37 @@ class HTMLPurifier_ConfigSchema {
*/
function defineAlias($namespace, $name, $new_namespace, $new_name) {
$def =& HTMLPurifier_ConfigSchema::instance();
- if (!isset($def->info[$namespace])) {
- trigger_error('Cannot define directive alias in undefined namespace',
- E_USER_ERROR);
- return;
- }
- if (!ctype_alnum($name)) {
- trigger_error('Directive name must be alphanumeric',
- E_USER_ERROR);
- return;
- }
- if (isset($def->info[$namespace][$name])) {
- trigger_error('Cannot define alias over directive',
- E_USER_ERROR);
- return;
- }
- if (!isset($def->info[$new_namespace][$new_name])) {
- trigger_error('Cannot define alias to undefined directive',
- E_USER_ERROR);
- return;
- }
- if ($def->info[$new_namespace][$new_name]->class == 'alias') {
- trigger_error('Cannot define alias to alias',
- E_USER_ERROR);
- return;
+ if (HTMLPURIFIER_SCHEMA_STRICT) {
+ if (!isset($def->info[$namespace])) {
+ trigger_error('Cannot define directive alias in undefined namespace',
+ E_USER_ERROR);
+ return;
+ }
+ if (!ctype_alnum($name)) {
+ trigger_error('Directive name must be alphanumeric',
+ E_USER_ERROR);
+ return;
+ }
+ if (isset($def->info[$namespace][$name])) {
+ trigger_error('Cannot define alias over directive',
+ E_USER_ERROR);
+ return;
+ }
+ if (!isset($def->info[$new_namespace][$new_name])) {
+ trigger_error('Cannot define alias to undefined directive',
+ E_USER_ERROR);
+ return;
+ }
+ if ($def->info[$new_namespace][$new_name]->class == 'alias') {
+ trigger_error('Cannot define alias to alias',
+ E_USER_ERROR);
+ return;
+ }
}
$def->info[$namespace][$name] =
new HTMLPurifier_ConfigDef_DirectiveAlias(
$new_namespace, $new_name);
+ $def->info[$new_namespace][$new_name]->directiveAliases[] = "$namespace.$name";
}
/**
@@ -303,11 +331,14 @@ class HTMLPurifier_ConfigSchema {
if ($allow_null && $var === null) return null;
switch ($type) {
case 'mixed':
+ //if (is_string($var)) $var = unserialize($var);
return $var;
case 'istring':
case 'string':
+ case 'text': // no difference, just is longer/multiple line string
+ case 'itext':
if (!is_string($var)) break;
- if ($type === 'istring') $var = strtolower($var);
+ if ($type === 'istring' || $type === 'itext') $var = strtolower($var);
return $var;
case 'int':
if (is_string($var) && ctype_digit($var)) $var = (int) $var;
@@ -338,11 +369,25 @@ class HTMLPurifier_ConfigSchema {
// a single empty string item, but having an empty
// array is more intuitive
if ($var == '') return array();
- // simplistic string to array method that only works
- // for simple lists of tag names or alphanumeric characters
- $var = explode(',',$var);
+ if (strpos($var, "\n") === false && strpos($var, "\r") === false) {
+ // simplistic string to array method that only works
+ // for simple lists of tag names or alphanumeric characters
+ $var = explode(',',$var);
+ } else {
+ $var = preg_split('/(,|[\n\r]+)/', $var);
+ }
// remove spaces
foreach ($var as $i => $j) $var[$i] = trim($j);
+ if ($type === 'hash') {
+ // key:value,key2:value2
+ $nvar = array();
+ foreach ($var as $keypair) {
+ $c = explode(':', $keypair, 2);
+ if (!isset($c[1])) continue;
+ $nvar[$c[0]] = $c[1];
+ }
+ $var = $nvar;
+ }
}
if (!is_array($var)) break;
$keys = array_keys($var);
@@ -371,6 +416,7 @@ class HTMLPurifier_ConfigSchema {
* Takes an absolute path and munges it into a more manageable relative path
*/
function mungeFilename($filename) {
+ if (!HTMLPURIFIER_SCHEMA_STRICT) return $filename;
$offset = strrpos($filename, 'HTMLPurifier');
$filename = substr($filename, $offset);
$filename = str_replace('\\', '/', $filename);
@@ -387,4 +433,4 @@ class HTMLPurifier_ConfigSchema {
}
}
-?>
+
diff --git a/lib/htmlpurifier/HTMLPurifier/ContentSets.php b/lib/htmlpurifier/HTMLPurifier/ContentSets.php
index de5c532e18..7baf7a3101 100644
--- a/lib/htmlpurifier/HTMLPurifier/ContentSets.php
+++ b/lib/htmlpurifier/HTMLPurifier/ContentSets.php
@@ -5,6 +5,9 @@ require_once 'HTMLPurifier/ChildDef.php';
require_once 'HTMLPurifier/ChildDef/Empty.php';
require_once 'HTMLPurifier/ChildDef/Required.php';
require_once 'HTMLPurifier/ChildDef/Optional.php';
+require_once 'HTMLPurifier/ChildDef/Custom.php';
+
+// NOT UNIT TESTED!!!
class HTMLPurifier_ContentSets
{
@@ -145,4 +148,3 @@ class HTMLPurifier_ContentSets
}
-?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Context.php b/lib/htmlpurifier/HTMLPurifier/Context.php
index ce6fe51e05..a78a6fb6f6 100644
--- a/lib/htmlpurifier/HTMLPurifier/Context.php
+++ b/lib/htmlpurifier/HTMLPurifier/Context.php
@@ -2,6 +2,8 @@
/**
* Registry object that contains information about the current context.
+ * @warning Is a bit buggy when variables are set to null: it thinks
+ * they don't exist! So use false instead, please.
*/
class HTMLPurifier_Context
{
@@ -19,7 +21,7 @@ class HTMLPurifier_Context
*/
function register($name, &$ref) {
if (isset($this->_storage[$name])) {
- trigger_error('Name collision, cannot re-register',
+ trigger_error("Name $name produces collision, cannot re-register",
E_USER_ERROR);
return;
}
@@ -29,11 +31,14 @@ class HTMLPurifier_Context
/**
* Retrieves a variable reference from the context.
* @param $name String name
+ * @param $ignore_error Boolean whether or not to ignore error
*/
- function &get($name) {
+ function &get($name, $ignore_error = false) {
if (!isset($this->_storage[$name])) {
- trigger_error('Attempted to retrieve non-existent variable',
- E_USER_ERROR);
+ if (!$ignore_error) {
+ trigger_error("Attempted to retrieve non-existent variable $name",
+ E_USER_ERROR);
+ }
$var = null; // so we can return by reference
return $var;
}
@@ -46,7 +51,7 @@ class HTMLPurifier_Context
*/
function destroy($name) {
if (!isset($this->_storage[$name])) {
- trigger_error('Attempted to destroy non-existent variable',
+ trigger_error("Attempted to destroy non-existent variable $name",
E_USER_ERROR);
return;
}
@@ -73,4 +78,3 @@ class HTMLPurifier_Context
}
-?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Definition.php b/lib/htmlpurifier/HTMLPurifier/Definition.php
new file mode 100644
index 0000000000..8f958e4798
--- /dev/null
+++ b/lib/htmlpurifier/HTMLPurifier/Definition.php
@@ -0,0 +1,40 @@
+setup) return;
+ $this->setup = true;
+ $this->doSetup($config);
+ }
+
+}
+
diff --git a/lib/htmlpurifier/HTMLPurifier/DefinitionCache.php b/lib/htmlpurifier/HTMLPurifier/DefinitionCache.php
new file mode 100644
index 0000000000..d4c9d239f2
--- /dev/null
+++ b/lib/htmlpurifier/HTMLPurifier/DefinitionCache.php
@@ -0,0 +1,128 @@
+type = $type;
+ }
+
+ /**
+ * Generates a unique identifier for a particular configuration
+     * @param $config Instance of HTMLPurifier_Config
+ */
+ function generateKey($config) {
+ return $config->version . '-' . // possibly replace with function calls
+ $config->getBatchSerial($this->type) . '-' .
+ $config->get($this->type, 'DefinitionRev');
+ }
+
+ /**
+ * Tests whether or not a key is old with respect to the configuration's
+ * version and revision number.
+ * @param $key Key to test
+ * @param $config Instance of HTMLPurifier_Config to test against
+ */
+ function isOld($key, $config) {
+ if (substr_count($key, '-') < 2) return true;
+ list($version, $hash, $revision) = explode('-', $key, 3);
+ $compare = version_compare($version, $config->version);
+ // version mismatch, is always old
+ if ($compare != 0) return true;
+ // versions match, ids match, check revision number
+ if (
+ $hash == $config->getBatchSerial($this->type) &&
+ $revision < $config->get($this->type, 'DefinitionRev')
+ ) return true;
+ return false;
+ }
+
+ /**
+     * Checks if a definition's type jibes with the cache's type
+ * @note Throws an error on failure
+ * @param $def Definition object to check
+ * @return Boolean true if good, false if not
+ */
+ function checkDefType($def) {
+ if ($def->type !== $this->type) {
+ trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}");
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Adds a definition object to the cache
+ */
+ function add($def, $config) {
+ trigger_error('Cannot call abstract method', E_USER_ERROR);
+ }
+
+ /**
+ * Unconditionally saves a definition object to the cache
+ */
+ function set($def, $config) {
+ trigger_error('Cannot call abstract method', E_USER_ERROR);
+ }
+
+ /**
+ * Replace an object in the cache
+ */
+ function replace($def, $config) {
+ trigger_error('Cannot call abstract method', E_USER_ERROR);
+ }
+
+ /**
+ * Retrieves a definition object from the cache
+ */
+ function get($config) {
+ trigger_error('Cannot call abstract method', E_USER_ERROR);
+ }
+
+ /**
+     * Removes a definition object from the cache
+ */
+ function remove($config) {
+ trigger_error('Cannot call abstract method', E_USER_ERROR);
+ }
+
+ /**
+ * Clears all objects from cache
+ */
+ function flush($config) {
+ trigger_error('Cannot call abstract method', E_USER_ERROR);
+ }
+
+ /**
+ * Clears all expired (older version or revision) objects from cache
+ */
+ function cleanup($config) {
+ trigger_error('Cannot call abstract method', E_USER_ERROR);
+ }
+}
+
diff --git a/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator.php b/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator.php
new file mode 100644
index 0000000000..14fca85974
--- /dev/null
+++ b/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator.php
@@ -0,0 +1,59 @@
+copy();
+ // reference is necessary for mocks in PHP 4
+ $decorator->cache =& $cache;
+ $decorator->type = $cache->type;
+ return $decorator;
+ }
+
+    /**
+     * Cross-compatible clone substitute: hands back a newly constructed
+     * decorator object rather than a clone of this one
+     */
+    function copy() {
+        $duplicate = new HTMLPurifier_DefinitionCache_Decorator();
+        return $duplicate;
+    }
+
+    /**
+     * Forwards add() to the wrapped cache and returns its result
+     */
+    function add($def, $config) {
+        return $this->cache->add($def, $config);
+    }
+    
+    /**
+     * Forwards set() to the wrapped cache and returns its result
+     */
+    function set($def, $config) {
+        return $this->cache->set($def, $config);
+    }
+    
+    /**
+     * Forwards replace() to the wrapped cache and returns its result
+     */
+    function replace($def, $config) {
+        return $this->cache->replace($def, $config);
+    }
+    
+    /**
+     * Forwards get() to the wrapped cache and returns its result
+     */
+    function get($config) {
+        return $this->cache->get($config);
+    }
+    
+    /**
+     * Forwards remove() to the wrapped cache and returns its result.
+     * Every other cache operation is forwarded here; without this method a
+     * remove() on a decorated cache would never reach the wrapped cache.
+     */
+    function remove($config) {
+        return $this->cache->remove($config);
+    }
+    
+    /**
+     * Forwards flush() to the wrapped cache and returns its result
+     */
+    function flush($config) {
+        return $this->cache->flush($config);
+    }
+    
+    /**
+     * Forwards cleanup() to the wrapped cache and returns its result
+     */
+    function cleanup($config) {
+        return $this->cache->cleanup($config);
+    }
+
+}
+
diff --git a/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator/Cleanup.php b/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator/Cleanup.php
new file mode 100644
index 0000000000..eb47c433fb
--- /dev/null
+++ b/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator/Cleanup.php
@@ -0,0 +1,44 @@
+definitions[$this->generateKey($config)] = $def;
+ return $status;
+ }
+
+    /**
+     * Saves through the parent and, when the parent reports success, also
+     * keeps the definition in $this->definitions under the config's key
+     */
+    function set($def, $config) {
+        $saved = parent::set($def, $config);
+        if (!$saved) return $saved;
+        $key = $this->generateKey($config);
+        $this->definitions[$key] = $def;
+        return $saved;
+    }
+
+    /**
+     * Replaces through the parent; only a successful replace updates the
+     * copy held in $this->definitions
+     */
+    function replace($def, $config) {
+        $replaced = parent::replace($def, $config);
+        if ($replaced) {
+            $key = $this->generateKey($config);
+            $this->definitions[$key] = $def;
+        }
+        return $replaced;
+    }
+
+    /**
+     * Returns the definition held in $this->definitions for this config,
+     * asking the parent (and remembering its answer) when nothing is set
+     */
+    function get($config) {
+        $key = $this->generateKey($config);
+        if (!isset($this->definitions[$key])) {
+            $this->definitions[$key] = parent::get($config);
+        }
+        return $this->definitions[$key];
+    }
+
+}
+
diff --git a/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator/Template.php.in b/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator/Template.php.in
new file mode 100644
index 0000000000..62235e225d
--- /dev/null
+++ b/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator/Template.php.in
@@ -0,0 +1,46 @@
+
+ Absolute path with no trailing slash to store serialized definitions in.
+ Default is within the
+ HTML Purifier library inside DefinitionCache/Serializer. This
+ path must be writable by the webserver. This directive has been
+ available since 2.0.0.
+
+');
+
+class HTMLPurifier_DefinitionCache_Serializer extends
+      HTMLPurifier_DefinitionCache
+{
+    
+    /**
+     * Stores a definition only if no serial file exists yet for the config
+     * @param $def Definition object to serialize
+     * @param $config Configuration object the file path is derived from
+     * @return Result of _write() on success; false if the type check fails,
+     *         the file already exists or the directory cannot be prepared
+     */
+    function add($def, $config) {
+        // report failure as false, like every other failure path here
+        if (!$this->checkDefType($def)) return false;
+        $file = $this->generateFilePath($config);
+        if (file_exists($file)) return false;
+        if (!$this->_prepareDir($config)) return false;
+        return $this->_write($file, serialize($def));
+    }
+    
+    /**
+     * Stores a definition whether or not a serial file already exists
+     * @param $def Definition object to serialize
+     * @param $config Configuration object the file path is derived from
+     * @return Result of _write() on success; false if the type check fails
+     *         or the directory cannot be prepared
+     */
+    function set($def, $config) {
+        if (!$this->checkDefType($def)) return false;
+        $file = $this->generateFilePath($config);
+        if (!$this->_prepareDir($config)) return false;
+        return $this->_write($file, serialize($def));
+    }
+    
+    /**
+     * Overwrites a definition only if its serial file already exists
+     * @param $def Definition object to serialize
+     * @param $config Configuration object the file path is derived from
+     * @return Result of _write() on success; false if the type check fails,
+     *         the file is missing or the directory cannot be prepared
+     */
+    function replace($def, $config) {
+        if (!$this->checkDefType($def)) return false;
+        $file = $this->generateFilePath($config);
+        if (!file_exists($file)) return false;
+        if (!$this->_prepareDir($config)) return false;
+        return $this->_write($file, serialize($def));
+    }
+    
+    /**
+     * Loads the definition stored for the config
+     * @param $config Configuration object the file path is derived from
+     * @return Unserialized file contents, or false if the file is missing
+     */
+    function get($config) {
+        $file = $this->generateFilePath($config);
+        if (!file_exists($file)) return false;
+        // NOTE(review): unserialize() fully trusts whatever is in the cache
+        // directory, so that directory must not be writable by untrusted users
+        return unserialize(file_get_contents($file));
+    }
+    
+    /**
+     * Deletes the serial file stored for the config
+     * @param $config Configuration object the file path is derived from
+     * @return Result of unlink(), or false if the file is missing
+     */
+    function remove($config) {
+        $file = $this->generateFilePath($config);
+        if (!file_exists($file)) return false;
+        return unlink($file);
+    }
+    
+    /**
+     * Deletes every file in this type's directory whose name does not
+     * start with a dot
+     * @param $config Configuration object the directory is derived from
+     * @return True once the directory has been walked, false if the
+     *         directory cannot be prepared or opened
+     */
+    function flush($config) {
+        if (!$this->_prepareDir($config)) return false;
+        $dir = $this->generateDirectoryPath($config);
+        $dh = opendir($dir);
+        // never hand a failed handle to readdir(): it is not guaranteed to
+        // answer with false, in which case the loop below would not end
+        if ($dh === false) return false;
+        while (false !== ($filename = readdir($dh))) {
+            if (empty($filename)) continue;
+            if ($filename[0] === '.') continue;
+            unlink($dir . '/' . $filename);
+        }
+        closedir($dh);
+        return true;
+    }
+    
+    /**
+     * Deletes the files in this type's directory that isOld() reports as
+     * outdated; names starting with a dot are left alone
+     * @param $config Configuration object the directory is derived from
+     * @return True once the directory has been walked, false if the
+     *         directory cannot be prepared or opened
+     */
+    function cleanup($config) {
+        if (!$this->_prepareDir($config)) return false;
+        $dir = $this->generateDirectoryPath($config);
+        $dh = opendir($dir);
+        // same guard as in flush(): do not iterate over a failed handle
+        if ($dh === false) return false;
+        while (false !== ($filename = readdir($dh))) {
+            if (empty($filename)) continue;
+            if ($filename[0] === '.') continue;
+            // drop the last four characters (the '.ser' suffix that
+            // generateFilePath() appends) to get the key back
+            $key = substr($filename, 0, strlen($filename) - 4);
+            if ($this->isOld($key, $config)) unlink($dir . '/' . $filename);
+        }
+        closedir($dh);
+        return true;
+    }
+    
+    /**
+     * Generates the file path to the serial file corresponding to
+     * the configuration and definition name
+     */
+    function generateFilePath($config) {
+        $key = $this->generateKey($config);
+        return $this->generateDirectoryPath($config) . '/' . $key . '.ser';
+    }
+    
+    /**
+     * Generates the path to the directory containing this cache's serial
+     * files
+     * @note No trailing slash
+     */
+    function generateDirectoryPath($config) {
+        $base = $this->generateBaseDirectoryPath($config);
+        return $base . '/' . $this->type;
+    }
+    
+    /**
+     * Generates path to base directory that contains all definition type
+     * serials: %Cache.SerializerPath, or the library's own
+     * DefinitionCache/Serializer directory when that directive is null
+     */
+    function generateBaseDirectoryPath($config) {
+        $base = $config->get('Cache', 'SerializerPath');
+        $base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base;
+        return $base;
+    }
+    
+    /**
+     * Convenience wrapper function for file_put_contents
+     * @param $file File name to write to
+     * @param $data Data to write into file
+     * @return Number of bytes written if success, or false if failure.
+     */
+    function _write($file, $data) {
+        // remember across calls whether the native function is available
+        static $file_put_contents;
+        if ($file_put_contents === null) {
+            $file_put_contents = function_exists('file_put_contents');
+        }
+        if ($file_put_contents) {
+            return file_put_contents($file, $data);
+        }
+        // fallback for PHP versions without file_put_contents()
+        $fh = fopen($file, 'w');
+        if (!$fh) return false;
+        $status = fwrite($fh, $data);
+        fclose($fh);
+        return $status;
+    }
+    
+    /**
+     * Prepares the directory that this type stores the serials in,
+     * creating it inside the base directory when it does not exist yet
+     * @return True if successful
+     */
+    function _prepareDir($config) {
+        $directory = $this->generateDirectoryPath($config);
+        if (!is_dir($directory)) {
+            $base = $this->generateBaseDirectoryPath($config);
+            if (!is_dir($base)) {
+                // message literal kept verbatim, including its line break
+                trigger_error('Base directory '.$base.' does not exist,
+ please create or change using %Cache.SerializerPath',
+                    E_USER_ERROR);
+                return false;
+            } elseif (!$this->_testPermissions($base)) {
+                return false;
+            }
+            // do not report success for a directory that was never created;
+            // the is_dir() re-check tolerates it having appeared meanwhile
+            if (!mkdir($directory) && !is_dir($directory)) {
+                trigger_error('Could not create directory '.$directory,
+                    E_USER_WARNING);
+                return false;
+            }
+        } elseif (!$this->_testPermissions($directory)) {
+            return false;
+        }
+        return true;
+    }
+    
+    /**
+     * Tests permissions on a directory and throws out friendly
+     * error messages and attempts to chmod it itself if possible
+     * @param $dir Directory path to test
+     * @return True if the directory is writable or was chmod'ed
+     *         successfully, false otherwise
+     */
+    function _testPermissions($dir) {
+        // early abort, if it is writable, everything is hunky-dory
+        if (is_writable($dir)) return true;
+        if (!is_dir($dir)) {
+            // generally, you'll want to handle this beforehand
+            // so a more specific error message can be given
+            trigger_error('Directory '.$dir.' does not exist',
+                E_USER_ERROR);
+            return false;
+        }
+        if (function_exists('posix_getuid')) {
+            // POSIX system, we can give more specific advice
+            if (fileowner($dir) === posix_getuid()) {
+                // we can chmod it ourselves, but only claim success
+                // when chmod() actually worked
+                if (chmod($dir, 0755)) return true;
+                $chmod = '755';
+            } elseif (filegroup($dir) === posix_getgid()) {
+                $chmod = '775';
+            } else {
+                // PHP's probably running as nobody, so we'll
+                // need to give global permissions
+                $chmod = '777';
+            }
+            trigger_error('Directory '.$dir.' not writable, '.
+                'please chmod to ' . $chmod,
+                E_USER_ERROR);
+        } else {
+            // generic error message
+            trigger_error('Directory '.$dir.' not writable, '.
+                'please alter file permissions',
+                E_USER_ERROR);
+        }
+        return false;
+    }
+    
+}
+
diff --git a/lib/htmlpurifier/HTMLPurifier/DefinitionCacheFactory.php b/lib/htmlpurifier/HTMLPurifier/DefinitionCacheFactory.php
new file mode 100644
index 0000000000..acc661828a
--- /dev/null
+++ b/lib/htmlpurifier/HTMLPurifier/DefinitionCacheFactory.php
@@ -0,0 +1,94 @@
+ array());
+ var $decorators = array();
+
+ /**
+ * Initialize default decorators
+ * @note Registers the decorator with the short name 'Cleanup' through
+ * addDecorator()
+ */
+ function setup() {
+ $this->addDecorator('Cleanup');
+ }
+
+    /**
+     * Retrieves an instance of global definition cache factory.
+     * @param $prototype Optional. Null (default) returns the current global
+     *        instance, building a default one on first use. Boolean true
+     *        discards the current instance and builds a fresh default one.
+     *        Any other value is installed as the global instance as-is.
+     * @return Reference to the global instance
+     * @static
+     */
+    function &instance($prototype = null) {
+        static $instance;
+        // true must be tested before the generic "prototype given" case,
+        // otherwise the boolean itself would be stored as the instance
+        if ($prototype === true || ($prototype === null && $instance === null)) {
+            $instance = new HTMLPurifier_DefinitionCacheFactory();
+            $instance->setup();
+        } elseif ($prototype !== null) {
+            $instance = $prototype;
+        }
+        return $instance;
+    }
+
+ /**
+ * Factory method that creates a cache object based on configuration
+ * @param $type Name of definitions handled by cache
+ * @param $config Instance of HTMLPurifier_Config
+ * @return Reference to a cache object: a new Null cache when
+ * %Cache.DefinitionImpl is null, otherwise the (possibly
+ * decorated) Serializer cache remembered for this type
+ */
+ function &create($type, $config) {
+ // only one implementation as for right now, $config will
+ // be used to determine implementation
+ $method = $config->get('Cache', 'DefinitionImpl');
+ if ($method === null) {
+ // no implementation configured: a fresh Null cache is returned and
+ // is not stored in $this->caches
+ $null = new HTMLPurifier_DefinitionCache_Null($type);
+ return $null;
+ }
+ // reuse the object built earlier for this implementation and type
+ if (!empty($this->caches[$method][$type])) {
+ return $this->caches[$method][$type];
+ }
+ // a Serializer is built whatever non-null value $method holds
+ $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
+ // wrap the cache in every registered decorator, in registration order
+ foreach ($this->decorators as $decorator) {
+ $new_cache = $decorator->decorate($cache);
+ // prevent infinite recursion in PHP 4
+ unset($cache);
+ $cache = $new_cache;
+ }
+ $this->caches[$method][$type] = $cache;
+ return $this->caches[$method][$type];
+ }
+
+    /**
+     * Registers a decorator to add to all new cache objects
+     * @param $decorator Decorator object, or a short name as a string; a
+     *        string is expanded to the class name
+     *        HTMLPurifier_DefinitionCache_Decorator_<name>, which is then
+     *        instantiated. The decorator is stored under its ->name.
+     */
+    function addDecorator($decorator) {
+        if (!is_string($decorator)) {
+            $this->decorators[$decorator->name] = $decorator;
+            return;
+        }
+        $class = "HTMLPurifier_DefinitionCache_Decorator_$decorator";
+        $instance = new $class;
+        $this->decorators[$instance->name] = $instance;
+    }
+
+}
+
diff --git a/lib/htmlpurifier/HTMLPurifier/Doctype.php b/lib/htmlpurifier/HTMLPurifier/Doctype.php
new file mode 100644
index 0000000000..7afdcd74a2
--- /dev/null
+++ b/lib/htmlpurifier/HTMLPurifier/Doctype.php
@@ -0,0 +1,66 @@
+renderDoctype.
+ * If structure changes, please update that function.
+ */
+class HTMLPurifier_Doctype
+{
+    /** Full name of the doctype */
+    var $name;
+    
+    /**
+     * Standard modules this doctype uses, given either as string
+     * identifiers or as literal module objects
+     */
+    var $modules = array();
+    
+    /** Modules used for tidying up code */
+    var $tidyModules = array();
+    
+    /** Whether the language is derived from XML (i.e. XHTML) */
+    var $xml = true;
+    
+    /** Aliases under which this doctype is also known */
+    var $aliases = array();
+    
+    /** Public DTD identifier */
+    var $dtdPublic;
+    
+    /** System DTD identifier */
+    var $dtdSystem;
+    
+    /**
+     * Constructor; every argument is copied onto the member of the
+     * corresponding name ($dtd_public and $dtd_system land in
+     * $dtdPublic and $dtdSystem)
+     */
+    function HTMLPurifier_Doctype($name = null, $xml = true, $modules = array(),
+        $tidyModules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null
+    ) {
+        $this->name         = $name;
+        $this->xml          = $xml;
+        $this->modules      = $modules;
+        $this->tidyModules  = $tidyModules;
+        $this->aliases      = $aliases;
+        $this->dtdPublic    = $dtd_public;
+        $this->dtdSystem    = $dtd_system;
+    }
+    
+    /**
+     * Clones the doctype via a serialize/unserialize round trip; use
+     * before resolving modes and the like
+     */
+    function copy() {
+        $frozen = serialize($this);
+        return unserialize($frozen);
+    }
+}
+
diff --git a/lib/htmlpurifier/HTMLPurifier/DoctypeRegistry.php b/lib/htmlpurifier/HTMLPurifier/DoctypeRegistry.php
new file mode 100644
index 0000000000..e657b3da4b
--- /dev/null
+++ b/lib/htmlpurifier/HTMLPurifier/DoctypeRegistry.php
@@ -0,0 +1,124 @@
+doctypes[$doctype->name] =& $doctype;
+ $name = $doctype->name;
+ // hookup aliases
+ foreach ($doctype->aliases as $alias) {
+ if (isset($this->doctypes[$alias])) continue;
+ $this->aliases[$alias] = $name;
+ }
+ // remove old aliases
+ if (isset($this->aliases[$name])) unset($this->aliases[$name]);
+ return $doctype;
+ }
+
+    /**
+     * Looks up a registered doctype by name and returns it by reference
+     * @note Aliases are resolved to the real name first
+     * @note When possible, use the more fully-featured make()
+     * @note For an unknown name an E_USER_ERROR is raised and a new
+     *       HTMLPurifier_Doctype built from that name is returned
+     * @param $doctype Name of doctype
+     * @return Reference to doctype object
+     */
+    function &get($doctype) {
+        if (isset($this->aliases[$doctype])) $doctype = $this->aliases[$doctype];
+        if (isset($this->doctypes[$doctype])) {
+            return $this->doctypes[$doctype];
+        }
+        trigger_error('Doctype ' . htmlspecialchars($doctype) . ' does not exist', E_USER_ERROR);
+        // returning by reference needs a variable, not an expression
+        $anon = new HTMLPurifier_Doctype($doctype);
+        return $anon;
+    }
+
+    /**
+     * Picks the doctype named by the configuration object and returns
+     * a copy of it
+     * @note Use this function to get a copy of doctype that config
+     *       can hold on to (this is necessary in order to tell
+     *       Generator whether or not the current document is XML
+     *       based or not).
+     */
+    function make($config) {
+        $name = $this->getDoctypeFromConfig($config);
+        $registered = $this->get($name);
+        return $registered->copy();
+    }
+
+    /**
+     * Retrieves the doctype name from the configuration object:
+     * %HTML.Doctype, then %HTML.CustomDoctype, and otherwise a name
+     * assembled from the %HTML.XHTML and %HTML.Strict flags
+     */
+    function getDoctypeFromConfig($config) {
+        // recommended test: the first non-empty directive wins
+        foreach (array('Doctype', 'CustomDoctype') as $directive) {
+            $doctype = $config->get('HTML', $directive);
+            if (!empty($doctype)) return $doctype;
+        }
+        // backwards-compatibility
+        $family = $config->get('HTML', 'XHTML') ? 'XHTML 1.0' : 'HTML 4.01';
+        $flavor = $config->get('HTML', 'Strict') ? ' Strict' : ' Transitional';
+        return $family . $flavor;
+    }
+
+}
+
diff --git a/lib/htmlpurifier/HTMLPurifier/ElementDef.php b/lib/htmlpurifier/HTMLPurifier/ElementDef.php
index 73c94abe13..21e1a5a764 100644
--- a/lib/htmlpurifier/HTMLPurifier/ElementDef.php
+++ b/lib/htmlpurifier/HTMLPurifier/ElementDef.php
@@ -3,6 +3,8 @@
/**
* Structure that stores an HTML element definition. Used by
* HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
+ * @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
+ * Please update that class too.
*/
class HTMLPurifier_ElementDef
{
@@ -51,6 +53,8 @@ class HTMLPurifier_ElementDef
* Abstract string representation of internal ChildDef rules. See
* HTMLPurifier_ContentSets for how this is parsed and then transformed
* into an HTMLPurifier_ChildDef.
+ * @warning This is a temporary variable that is not available after
+ * being processed by HTMLDefinition
* @public
*/
var $content_model;
@@ -58,19 +62,15 @@ class HTMLPurifier_ElementDef
/**
* Value of $child->type, used to determine which ChildDef to use,
* used in combination with $content_model.
+ * @warning This must be lowercase
+ * @warning This is a temporary variable that is not available after
+ * being processed by HTMLDefinition
* @public
*/
var $content_model_type;
- /**
- * Lookup table of tags that close this tag. Used during parsing
- * to make sure we don't attempt to nest unclosed tags.
- * @public
- */
- var $auto_close = array();
-
/**
* Does the element have a content model (#PCDATA | Inline)*? This
* is important for chameleon ins and del processing in
@@ -78,14 +78,47 @@ class HTMLPurifier_ElementDef
* have to worry about this one.
* @public
*/
- var $descendants_are_inline;
+ var $descendants_are_inline = false;
+
+ /**
+ * List of the names of required attributes this element has. Dynamically
+ * populated.
+ * @public
+ */
+ var $required_attr = array();
/**
* Lookup table of tags excluded from all descendants of this tag.
+ * @note SGML permits exclusions for all descendants, but this is
+ * not possible with DTDs or XML Schemas. W3C has elected to
+ * use complicated compositions of content_models to simulate
+ * exclusion for children, but we go the simpler, SGML-style
+ * route of flat-out exclusions, which correctly apply to
+ * all descendants and not just children. Note that the XHTML
+ * Modularization Abstract Modules are blithely unaware of such
+ * distinctions.
* @public
*/
var $excludes = array();
+ /**
+ * Is this element safe for untrusted users to use?
+ */
+ var $safe;
+
+    /**
+     * Low-level factory constructor: builds a standalone element def
+     * and fills in the four given values ($safe is cast to boolean)
+     * @static
+     */
+    function create($safe, $content_model, $content_model_type, $attr) {
+        $element = new HTMLPurifier_ElementDef();
+        $element->safe               = (bool) $safe;
+        $element->content_model      = $content_model;
+        $element->content_model_type = $content_model_type;
+        $element->attr               = $attr;
+        return $element;
+    }
+
/**
* Merges the values of another element definition into this one.
* Values from the new element def take precedence if a value is
@@ -99,24 +132,56 @@ class HTMLPurifier_ElementDef
// merge in the includes
// sorry, no way to override an include
foreach ($v as $v2) {
- $def->attr[0][] = $v2;
+ $this->attr[0][] = $v2;
}
continue;
}
+ if ($v === false) {
+ if (isset($this->attr[$k])) unset($this->attr[$k]);
+ continue;
+ }
$this->attr[$k] = $v;
}
- foreach($def->attr_transform_pre as $k => $v) $this->attr_transform_pre[$k] = $v;
- foreach($def->attr_transform_post as $k => $v) $this->attr_transform_post[$k] = $v;
- foreach($def->auto_close as $k => $v) $this->auto_close[$k] = $v;
- foreach($def->excludes as $k => $v) $this->excludes[$k] = $v;
+ $this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre);
+ $this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post);
+ $this->_mergeAssocArray($this->excludes, $def->excludes);
+ if(!empty($def->content_model)) {
+ $this->content_model .= ' | ' . $def->content_model;
+ $this->child = false;
+ }
+ if(!empty($def->content_model_type)) {
+ $this->content_model_type = $def->content_model_type;
+ $this->child = false;
+ }
if(!is_null($def->child)) $this->child = $def->child;
- if(!empty($def->content_model)) $this->content_model .= ' | ' . $def->content_model;
- if(!empty($def->content_model_type)) $this->content_model_type = $def->content_model_type;
- if(!is_null($def->descendants_are_inline)) $this->descendants_are_inline = $def->descendants_are_inline;
+ if($def->descendants_are_inline) $this->descendants_are_inline = $def->descendants_are_inline;
+ if(!is_null($def->safe)) $this->safe = $def->safe;
}
+    /**
+     * Merges one array into another; an entry whose value is exactly
+     * false removes the matching key from the target instead
+     * @param $a1 Array by reference that is merged into
+     * @param $a2 Array that merges into $a1
+     */
+    function _mergeAssocArray(&$a1, $a2) {
+        foreach ($a2 as $key => $value) {
+            if ($value !== false) {
+                $a1[$key] = $value;
+            } elseif (isset($a1[$key])) {
+                unset($a1[$key]);
+            }
+        }
+    }
+
+    /**
+     * Retrieves a copy of the element definition, produced by a
+     * serialize/unserialize round trip
+     */
+    function copy() {
+        $frozen = serialize($this);
+        return unserialize($frozen);
+    }
+
}
-?>
+
diff --git a/lib/htmlpurifier/HTMLPurifier/Encoder.php b/lib/htmlpurifier/HTMLPurifier/Encoder.php
index 1a22b4525c..e5adf83f59 100644
--- a/lib/htmlpurifier/HTMLPurifier/Encoder.php
+++ b/lib/htmlpurifier/HTMLPurifier/Encoder.php
@@ -1,7 +1,5 @@
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/EntityLookup.php b/lib/htmlpurifier/HTMLPurifier/EntityLookup.php
index f950cc2231..8204867be3 100644
--- a/lib/htmlpurifier/HTMLPurifier/EntityLookup.php
+++ b/lib/htmlpurifier/HTMLPurifier/EntityLookup.php
@@ -19,7 +19,7 @@ class HTMLPurifier_EntityLookup {
*/
function setup($file = false) {
if (!$file) {
- $file = dirname(__FILE__) . '/EntityLookup/entities.ser';
+ $file = HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
}
$this->table = unserialize(file_get_contents($file));
}
@@ -43,4 +43,3 @@ class HTMLPurifier_EntityLookup {
}
-?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/EntityParser.php b/lib/htmlpurifier/HTMLPurifier/EntityParser.php
index 069c5ce17e..2547241350 100644
--- a/lib/htmlpurifier/HTMLPurifier/EntityParser.php
+++ b/lib/htmlpurifier/HTMLPurifier/EntityParser.php
@@ -24,8 +24,8 @@ class HTMLPurifier_EntityParser
* @protected
*/
var $_substituteEntitiesRegex =
-'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';
-// 1. hex 2. dec 3. string
+'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
+// 1. hex 2. dec 3. string (XML style)
/**
@@ -97,7 +97,6 @@ class HTMLPurifier_EntityParser
} else {
if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
if (!$this->_entity_lookup) {
- require_once 'HTMLPurifier/EntityLookup.php';
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
}
if (isset($this->_entity_lookup->table[$matches[3]])) {
@@ -155,4 +154,3 @@ class HTMLPurifier_EntityParser
}
-?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/Error.php b/lib/htmlpurifier/HTMLPurifier/Error.php
index adc81dc56d..2ca4d7323c 100644
--- a/lib/htmlpurifier/HTMLPurifier/Error.php
+++ b/lib/htmlpurifier/HTMLPurifier/Error.php
@@ -5,4 +5,3 @@
*/
class HTMLPurifier_Error {}
-?>
\ No newline at end of file
diff --git a/lib/htmlpurifier/HTMLPurifier/ErrorCollector.php b/lib/htmlpurifier/HTMLPurifier/ErrorCollector.php
new file mode 100644
index 0000000000..70ac5d9a00
--- /dev/null
+++ b/lib/htmlpurifier/HTMLPurifier/ErrorCollector.php
@@ -0,0 +1,118 @@
+locale =& $context->get('Locale');
+ $this->generator =& $context->get('Generator');
+ $this->context =& $context;
+ }
+
+ /**
+ * Sends an error message to the collector for later use
+ * @param $severity int Error severity, PHP error style (don't use E_USER_)
+ * @param $msg string Error message text
+ * @note Any further arguments are collected and handed to the locale's
+ * formatMessage() together with $msg
+ * @note The line number is not a parameter: it is read from the
+ * context (CurrentToken's line, else CurrentLine)
+ */
+ function send($severity, $msg) {
+
+ $args = array();
+ if (func_num_args() > 2) {
+ $args = func_get_args();
+ // drop $severity (array_shift renumbers the keys from 0), then drop
+ // $msg without renumbering, so the extra arguments keep keys 1, 2, ...
+ array_shift($args);
+ unset($args[0]);
+ }
+
+ $token = $this->context->get('CurrentToken', true);
+ $line = $token ? $token->line : $this->context->get('CurrentLine', true);
+ $attr = $this->context->get('CurrentAttr', true);
+
+ // perform special substitutions, also add custom parameters
+ $subst = array();
+ if (!is_null($token)) {
+ $args['CurrentToken'] = $token;
+ }
+ if (!is_null($attr)) {
+ $subst['$CurrentAttr.Name'] = $attr;
+ if (isset($token->attr[$attr])) $subst['$CurrentAttr.Value'] = $token->attr[$attr];
+ }
+
+ // without any arguments the plain message lookup is enough
+ if (empty($args)) {
+ $msg = $this->locale->getMessage($msg);
+ } else {
+ $msg = $this->locale->formatMessage($msg, $args);
+ }
+
+ if (!empty($subst)) $msg = strtr($msg, $subst);
+
+ // each stored entry is array(line, severity, final message text)
+ $this->errors[] = array($line, $severity, $msg);
+ }
+
+ /**
+ * Retrieves raw error data for custom formatter to use
+ * @return List of collected errors; entries are appended as
+ * array(line, severity, message text)
+ */
+ function getRaw() {
+ return $this->errors;
+ }
+
+ /**
+ * Default HTML formatting implementation for error messages
+ * @param $config Configuration array, vital for HTML output nature
+ */
+ function getHTMLFormatted($config) {
+ $ret = array();
+
+ $errors = $this->errors;
+
+ // sort error array by line
+ // line numbers are enabled if they aren't explicitly disabled
+ if ($config->get('Core', 'MaintainLineNumbers') !== false) {
+ $has_line = array();
+ $lines = array();
+ $original_order = array();
+ foreach ($errors as $i => $error) {
+ $has_line[] = (int) (bool) $error[0];
+ $lines[] = $error[0];
+ $original_order[] = $i;
+ }
+ array_multisort($has_line, SORT_DESC, $lines, SORT_ASC, $original_order, SORT_ASC, $errors);
+ }
+
+ foreach ($errors as $error) {
+ list($line, $severity, $msg) = $error;
+ $string = '';
+ $string .= '' . $this->locale->getErrorName($severity) . ': ';
+ $string .= $this->generator->escape($msg);
+ if ($line) {
+ // have javascript link generation that causes
+ // textarea to skip to the specified line
+ $string .= $this->locale->formatMessage(
+ 'ErrorCollector: At line', array('line' => $line));
+ }
+ $ret[] = $string;
+ }
+
+ if (empty($errors)) {
+ return '' . $this->locale->getMessage('ErrorCollector: No errors') . '
'; + } else { + return 'This can greatly '. - 'improve readability for editors who are hand-editing the HTML, but is '. - 'by no means necessary as HTML Purifier has already fixed all major '. - 'errors the HTML may have had. Tidy is a non-default extension, and this directive '. - 'will silently fail if Tidy is not available.
If you are looking to make '. - 'the overall look of your page\'s source better, I recommend running Tidy '. - 'on the entire page rather than just user-content (after all, the '. - 'indentation relative to the containing blocks will be incorrect).
This '. - 'directive was available since 1.1.1.
' + 'Output', 'TidyFormat', false, 'bool', << + Determines whether or not to run Tidy on the final output for pretty + formatting reasons, such as indentation and wrap. + ++ This can greatly improve readability for editors who are hand-editing + the HTML, but is by no means necessary as HTML Purifier has already + fixed all major errors the HTML may have had. Tidy is a non-default + extension, and this directive will silently fail if Tidy is not + available. +
++ If you are looking to make the overall look of your page's source + better, I recommend running Tidy on the entire page rather than just + user-content (after all, the indentation relative to the containing + blocks will be incorrect). +
++ This directive was available since 1.1.1. +
+HTML ); +HTMLPurifier_ConfigSchema::defineAlias('Core', 'TidyFormat', 'Output', 'TidyFormat'); + +HTMLPurifier_ConfigSchema::define('Output', 'Newline', null, 'string/null', ' ++ Newline string to format final output with. If left null, HTML Purifier + will auto-detect the default newline type of the system and use that; + you can manually override it here. Remember, \r\n is Windows, \r + is Mac, and \n is Unix. This directive was available since 2.0.1. +
+'); /** * Generates HTML from tokens. + * @todo Refactor interface so that configuration/context is determined + * upon instantiation, no need for messy generateFromTokens() calls */ class HTMLPurifier_Generator { /** - * Bool cache of %Core.CleanUTF8DuringGeneration + * Bool cache of %HTML.XHTML * @private */ - var $_clean_utf8 = false; + var $_xhtml = true; /** - * Bool cache of %Core.XHTML + * Bool cache of %Output.CommentScriptContents * @private */ - var $_xhtml = true; + var $_scriptFix = false; + + /** + * Cache of HTMLDefinition + * @private + */ + var $_def; /** * Generates HTML from an array of tokens. @@ -63,13 +79,28 @@ class HTMLPurifier_Generator function generateFromTokens($tokens, $config, &$context) { $html = ''; if (!$config) $config = HTMLPurifier_Config::createDefault(); - $this->_clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration'); - $this->_xhtml = $config->get('Core', 'XHTML'); + $this->_scriptFix = $config->get('Output', 'CommentScriptContents'); + + $this->_def = $config->getHTMLDefinition(); + $this->_xhtml = $this->_def->doctype->xml; + if (!$tokens) return ''; - foreach ($tokens as $token) { - $html .= $this->generateFromToken($token); + for ($i = 0, $size = count($tokens); $i < $size; $i++) { + if ($this->_scriptFix && $tokens[$i]->name === 'script' + && $i + 2 < $size && $tokens[$i+2]->type == 'end') { + // script special case + // the contents of the script block must be ONE token + // for this to work + $html .= $this->generateFromToken($tokens[$i++]); + $html .= $this->generateScriptFromToken($tokens[$i++]); + // We're not going to do this: it wouldn't be valid anyway + //while ($tokens[$i]->name != 'script') { + // $html .= $this->generateScriptFromToken($tokens[$i++]); + //} + } + $html .= $this->generateFromToken($tokens[$i]); } - if ($config->get('Core', 'TidyFormat') && extension_loaded('tidy')) { + if ($config->get('Output', 'TidyFormat') && extension_loaded('tidy')) { $tidy_options = array( 'indent'=> true, @@ 
-93,6 +124,10 @@ class HTMLPurifier_Generator $html = (string) $tidy; } } + // normalize newlines to system + $nl = $config->get('Output', 'Newline'); + if ($nl === null) $nl = PHP_EOL; + $html = str_replace("\n", $nl, $html); return $html; } @@ -104,14 +139,14 @@ class HTMLPurifier_Generator function generateFromToken($token) { if (!isset($token->type)) return ''; if ($token->type == 'start') { - $attr = $this->generateAttributes($token->attr); + $attr = $this->generateAttributes($token->attr, $token->name); return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>'; } elseif ($token->type == 'end') { return '' . $token->name . '>'; } elseif ($token->type == 'empty') { - $attr = $this->generateAttributes($token->attr); + $attr = $this->generateAttributes($token->attr, $token->name); return '<' . $token->name . ($attr ? ' ' : '') . $attr . ( $this->_xhtml ? ' /': '' ) . '>'; @@ -125,18 +160,35 @@ class HTMLPurifier_Generator } } + /** + * Special case processor for the contents of script tags + * @warning This runs into problems if there's already a literal + * --> somewhere inside the script contents. + */ + function generateScriptFromToken($token) { + if ($token->type != 'text') return $this->generateFromToken($token); + // return ''; + // more advanced version: + // thanks+$config = HTMLPurifier_Config::createDefault(); +$config->set(\'HTML\', \'DefinitionID\', \'1\'); +$def = $config->getHTMLDefinition(); +$def->addAttribute(\'a\', \'tabindex\', \'Number\'); ++
+ In the above example, the configuration is still at the defaults, but + using the advanced API, an extra attribute has been added. The + configuration object normally has no way of knowing that this change + has taken place, so it needs an extra directive: %HTML.DefinitionID. + If someone else attempts to use the default configuration, these two + pieces of code will not clobber each other in the cache, since one has + an extra directive attached to it. +
++ This directive has been available since 2.0.0, and in that version or + later you must specify a value to this directive to use the + advanced API features. +
+'); HTMLPurifier_ConfigSchema::define( - 'HTML', 'BlockWrapper', 'p', 'string', - 'String name of element to wrap inline elements that are inside a block '. - 'context. This only occurs in the children of blockquote in strict mode. '. - 'Example: by default value,<blockquote>Foo</blockquote>
'.
- 'would become <blockquote><p>Foo</p></blockquote>
. The '.
- '<p>
tags can be replaced '.
- 'with whatever you desire, as long as it is a block level element. '.
- 'This directive has been available since 1.3.0.'
-);
+ 'HTML', 'DefinitionRev', 1, 'int', '
++ Revision identifier for your custom definition specified in + %HTML.DefinitionID. This serves the same purpose: uniquely identifying + your custom definition, but this one does so in a chronological + context: revision 3 is more up-to-date then revision 2. Thus, when + this gets incremented, the cache handling is smart enough to clean + up any older revisions of your definition as well as flush the + cache. This directive has been available since 2.0.0. +
+'); HTMLPurifier_ConfigSchema::define( - 'HTML', 'Parent', 'div', 'string', - 'String name of element that HTML fragment passed to library will be '. - 'inserted in. An interesting variation would be using span as the '. - 'parent element, meaning that only inline tags would be allowed. '. - 'This directive has been available since 1.3.0.' -); + 'HTML', 'BlockWrapper', 'p', 'string', ' ++ String name of element to wrap inline elements that are inside a block + context. This only occurs in the children of blockquote in strict mode. +
+
+ Example: by default value,
+ <blockquote>Foo</blockquote>
would become
+ <blockquote><p>Foo</p></blockquote>
.
+ The <p>
tags can be replaced with whatever you desire,
+ as long as it is a block level element. This directive has been available
+ since 1.3.0.
+
+ String name of element that HTML fragment passed to library will be + inserted in. An interesting variation would be using span as the + parent element, meaning that only inline tags would be allowed. + This directive has been available since 1.3.0. +
+'); HTMLPurifier_ConfigSchema::define( - 'HTML', 'AllowedAttributes', null, 'lookup/null', - 'IF HTML Purifier\'s attribute set is unsatisfactory, overload it! '. - 'The syntax is \'tag.attr\' or \'*.attr\' for the global attributes '. - '(style, id, class, dir, lang, xml:lang).'. - 'Warning: If another directive conflicts with the '. - 'elements here, that directive will win and override. For '. - 'example, %HTML.EnableAttrID will take precedence over *.id in this '. - 'directive. You must set that directive to true before you can use '. - 'IDs at all. This directive has been available since 1.3.0.' -); + 'HTML', 'AllowedElements', null, 'lookup/null', ' ++ If HTML Purifier\'s tag set is unsatisfactory for your needs, you + can overload it with your own list of tags to allow. Note that this + method is subtractive: it does its job by taking away from HTML Purifier + usual feature set, so you cannot add a tag that HTML Purifier never + supported in the first place (like embed, form or head). If you + change this, you probably also want to change %HTML.AllowedAttributes. +
++ Warning: If another directive conflicts with the + elements here, that directive will win and override. + This directive has been available since 1.3.0. +
+'); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'AllowedAttributes', null, 'lookup/null', ' ++ If HTML Purifier\'s attribute set is unsatisfactory, overload it! + The syntax is "tag.attr" or "*.attr" for the global attributes + (style, id, class, dir, lang, xml:lang). +
++ Warning: If another directive conflicts with the + elements here, that directive will win and override. For + example, %HTML.EnableAttrID will take precedence over *.id in this + directive. You must set that directive to true before you can use + IDs at all. This directive has been available since 1.3.0. +
+'); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'Allowed', null, 'itext/null', ' +
+ This is a convenience directive that rolls the functionality of
+ %HTML.AllowedElements and %HTML.AllowedAttributes into one directive.
+ Specify elements and attributes that are allowed using:
+ element1[attr1|attr2],element2...
. You can also use
+ newlines instead of commas to separate elements.
+
+ Warning:
+ All of the constraints on the component directives are still enforced.
+ The syntax is a subset of TinyMCE\'s valid_elements
+ whitelist: directly copy-pasting it here will probably result in
+ broken whitelists. If %HTML.AllowedElements or %HTML.AllowedAttributes
+ are set, this directive has no effect.
+ This directive has been available since 2.0.0.
+
This directive has been available since 2.0.0
+' ); +HTMLPurifier_ConfigSchema::defineAllowedValues( + 'HTML', 'TidyLevel', array('none', 'light', 'medium', 'heavy') +); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'TidyAdd', array(), 'lookup', ' +Fixes to add to the default set of Tidy fixes as per your level. This +directive has been available since 2.0.0. +' ); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'TidyRemove', array(), 'lookup', ' +Fixes to remove from the default set of Tidy fixes as per your level. This +directive has been available since 2.0.0. +' ); + +/** + * Abstract class for a set of proprietary modules that clean up (tidy) + * poorly written HTML. + */ +class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule +{ + + /** + * List of supported levels. Index zero is a special case "no fixes" + * level. + */ + var $levels = array(0 => 'none', 'light', 'medium', 'heavy'); + + /** + * Default level to place all fixes in. Disabled by default + */ + var $defaultLevel = null; + + /** + * Lists of fixes used by getFixesForLevel(). Format is: + * HTMLModule_Tidy->fixesForLevel[$level] = array('fix-1', 'fix-2'); + */ + var $fixesForLevel = array( + 'light' => array(), + 'medium' => array(), + 'heavy' => array() + ); + + /** + * Lazy load constructs the module by determining the necessary + * fixes to create and then delegating to the populate() function. + * @todo Wildcard matching and error reporting when an added or + * subtracted fix has no effect. 
+ */ + function construct($config) { + + // create fixes, initialize fixesForLevel + $fixes = $this->makeFixes(); + $this->makeFixesForLevel($fixes); + + // figure out which fixes to use + $level = $config->get('HTML', 'TidyLevel'); + $fixes_lookup = $this->getFixesForLevel($level); + + // get custom fix declarations: these need namespace processing + $add_fixes = $config->get('HTML', 'TidyAdd'); + $remove_fixes = $config->get('HTML', 'TidyRemove'); + + foreach ($fixes as $name => $fix) { + // needs to be refactored a little to implement globbing + if ( + isset($remove_fixes[$name]) || + (!isset($add_fixes[$name]) && !isset($fixes_lookup[$name])) + ) { + unset($fixes[$name]); + } + } + + // populate this module with necessary fixes + $this->populate($fixes); + + } + + /** + * Retrieves all fixes per a level, returning fixes for that specific + * level as well as all levels below it. + * @param $level String level identifier, see $levels for valid values + * @return Lookup up table of fixes + */ + function getFixesForLevel($level) { + if ($level == $this->levels[0]) { + return array(); + } + $activated_levels = array(); + for ($i = 1, $c = count($this->levels); $i < $c; $i++) { + $activated_levels[] = $this->levels[$i]; + if ($this->levels[$i] == $level) break; + } + if ($i == $c) { + trigger_error( + 'Tidy level ' . htmlspecialchars($level) . ' not recognized', + E_USER_WARNING + ); + return array(); + } + $ret = array(); + foreach ($activated_levels as $level) { + foreach ($this->fixesForLevel[$level] as $fix) { + $ret[$fix] = true; + } + } + return $ret; + } + + /** + * Dynamically populates the $fixesForLevel member variable using + * the fixes array. It may be custom overloaded, used in conjunction + * with $defaultLevel, or not used at all. + */ + function makeFixesForLevel($fixes) { + if (!isset($this->defaultLevel)) return; + if (!isset($this->fixesForLevel[$this->defaultLevel])) { + trigger_error( + 'Default level ' . $this->defaultLevel . 
' does not exist', + E_USER_ERROR + ); + return; + } + $this->fixesForLevel[$this->defaultLevel] = array_keys($fixes); + } + + /** + * Populates the module with transforms and other special-case code + * based on a list of fixes passed to it + * @param $lookup Lookup table of fixes to activate + */ + function populate($fixes) { + foreach ($fixes as $name => $fix) { + // determine what the fix is for + list($type, $params) = $this->getFixType($name); + switch ($type) { + case 'attr_transform_pre': + case 'attr_transform_post': + $attr = $params['attr']; + if (isset($params['element'])) { + $element = $params['element']; + if (empty($this->info[$element])) { + $e =& $this->addBlankElement($element); + } else { + $e =& $this->info[$element]; + } + } else { + $type = "info_$type"; + $e =& $this; + } + $f =& $e->$type; + $f[$attr] = $fix; + break; + case 'tag_transform': + $this->info_tag_transform[$params['element']] = $fix; + break; + case 'child': + case 'content_model_type': + $element = $params['element']; + if (empty($this->info[$element])) { + $e =& $this->addBlankElement($element); + } else { + $e =& $this->info[$element]; + } + $e->$type = $fix; + break; + default: + trigger_error("Fix type $type not supported", E_USER_ERROR); + break; + } + } + } + + /** + * Parses a fix name and determines what kind of fix it is, as well + * as other information defined by the fix + * @param $name String name of fix + * @return array(string $fix_type, array $fix_parameters) + * @note $fix_parameters is type dependant, see populate() for usage + * of these parameters + */ + function getFixType($name) { + // parse it + $property = $attr = null; + if (strpos($name, '#') !== false) list($name, $property) = explode('#', $name); + if (strpos($name, '@') !== false) list($name, $attr) = explode('@', $name); + + // figure out the parameters + $params = array(); + if ($name !== '') $params['element'] = $name; + if (!is_null($attr)) $params['attr'] = $attr; + + // special case: 
attribute transform + if (!is_null($attr)) { + if (is_null($property)) $property = 'pre'; + $type = 'attr_transform_' . $property; + return array($type, $params); + } + + // special case: tag transform + if (is_null($property)) { + return array('tag_transform', $params); + } + + return array($property, $params); + + } + + /** + * Defines all fixes the module will perform in a compact + * associative array of fix name to fix implementation. + * @abstract + */ + function makeFixes() {} + +} + + diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tidy/Proprietary.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tidy/Proprietary.php new file mode 100644 index 0000000000..3b4b116024 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tidy/Proprietary.php @@ -0,0 +1,17 @@ + 'text-align:left;', + 'right' => 'text-align:right;', + 'top' => 'caption-side:top;', + 'bottom' => 'caption-side:bottom;' // not supported by IE + )); + + // @align for img ------------------------------------------------- + $r['img@align'] = + new HTMLPurifier_AttrTransform_EnumToCSS('align', array( + 'left' => 'float:left;', + 'right' => 'float:right;', + 'top' => 'vertical-align:top;', + 'middle' => 'vertical-align:middle;', + 'bottom' => 'vertical-align:baseline;', + )); + + // @align for table ----------------------------------------------- + $r['table@align'] = + new HTMLPurifier_AttrTransform_EnumToCSS('align', array( + 'left' => 'float:left;', + 'center' => 'margin-left:auto;margin-right:auto;', + 'right' => 'float:right;' + )); + + // @align for hr ----------------------------------------------- + $r['hr@align'] = + new HTMLPurifier_AttrTransform_EnumToCSS('align', array( + // we use both text-align and margin because these work + // for different browsers (IE and Firefox, respectively) + // and the melange makes for a pretty cross-compatible + // solution + 'left' => 'margin-left:0;margin-right:auto;text-align:left;', + 'center' => 
'margin-left:auto;margin-right:auto;text-align:center;', + 'right' => 'margin-left:auto;margin-right:0;text-align:right;' + )); + + // @align for h1, h2, h3, h4, h5, h6, p, div ---------------------- + // {{{ + $align_lookup = array(); + $align_values = array('left', 'right', 'center', 'justify'); + foreach ($align_values as $v) $align_lookup[$v] = "text-align:$v;"; + // }}} + $r['h1@align'] = + $r['h2@align'] = + $r['h3@align'] = + $r['h4@align'] = + $r['h5@align'] = + $r['h6@align'] = + $r['p@align'] = + $r['div@align'] = + new HTMLPurifier_AttrTransform_EnumToCSS('align', $align_lookup); + + // @bgcolor for table, tr, td, th --------------------------------- + $r['table@bgcolor'] = + $r['td@bgcolor'] = + $r['th@bgcolor'] = + new HTMLPurifier_AttrTransform_BgColor(); + + // @border for img ------------------------------------------------ + $r['img@border'] = new HTMLPurifier_AttrTransform_Border(); + + // @clear for br -------------------------------------------------- + $r['br@clear'] = + new HTMLPurifier_AttrTransform_EnumToCSS('clear', array( + 'left' => 'clear:left;', + 'right' => 'clear:right;', + 'all' => 'clear:both;', + 'none' => 'clear:none;', + )); + + // @height for td, th --------------------------------------------- + $r['td@height'] = + $r['th@height'] = + new HTMLPurifier_AttrTransform_Length('height'); + + // @hspace for img ------------------------------------------------ + $r['img@hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace'); + + // @name for img, a ----------------------------------------------- + $r['img@name'] = + $r['a@name'] = new HTMLPurifier_AttrTransform_Name(); + + // @noshade for hr ------------------------------------------------ + // this transformation is not precise but often good enough. 
+ // different browsers use different styles to designate noshade + $r['hr@noshade'] = + new HTMLPurifier_AttrTransform_BoolToCSS( + 'noshade', + 'color:#808080;background-color:#808080;border:0;' + ); + + // @nowrap for td, th --------------------------------------------- + $r['td@nowrap'] = + $r['th@nowrap'] = + new HTMLPurifier_AttrTransform_BoolToCSS( + 'nowrap', + 'white-space:nowrap;' + ); + + // @size for hr -------------------------------------------------- + $r['hr@size'] = new HTMLPurifier_AttrTransform_Length('size', 'height'); + + // @type for li, ol, ul ------------------------------------------- + // {{{ + $ul_types = array( + 'disc' => 'list-style-type:disc;', + 'square' => 'list-style-type:square;', + 'circle' => 'list-style-type:circle;' + ); + $ol_types = array( + '1' => 'list-style-type:decimal;', + 'i' => 'list-style-type:lower-roman;', + 'I' => 'list-style-type:upper-roman;', + 'a' => 'list-style-type:lower-alpha;', + 'A' => 'list-style-type:upper-alpha;' + ); + $li_types = $ul_types + $ol_types; + // }}} + + $r['ul@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ul_types); + $r['ol@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ol_types, true); + $r['li@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $li_types, true); + + // @vspace for img ------------------------------------------------ + $r['img@vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace'); + + // @width for hr, td, th ------------------------------------------ + $r['td@width'] = + $r['th@width'] = + $r['hr@width'] = new HTMLPurifier_AttrTransform_Length('width'); + + return $r; + + } + +} + +class HTMLPurifier_HTMLModule_Tidy_Transitional extends + HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 +{ + var $name = 'Tidy_Transitional'; + var $defaultLevel = 'heavy'; +} + +class HTMLPurifier_HTMLModule_Tidy_Strict extends + HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 +{ + var $name = 'Tidy_Strict'; + var $defaultLevel = 'light'; +} + diff --git 
a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tidy/XHTMLStrict.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tidy/XHTMLStrict.php new file mode 100644 index 0000000000..b701491ecd --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tidy/XHTMLStrict.php @@ -0,0 +1,26 @@ +content_model_type != 'strictblockquote') return false; + return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model); + } + +} + diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToStrict.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToStrict.php deleted file mode 100644 index 0b6c8370ab..0000000000 --- a/lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToStrict.php +++ /dev/null @@ -1,200 +0,0 @@ - false, - 'menu' => false, - 'dir' => false, - 'center'=> false - ); - - var $attr_collections = array( - 'Lang' => array( - 'lang' => false // placeholder - ) - ); - - var $info_attr_transform_post = array( - 'lang' => false // placeholder - ); - - function HTMLPurifier_HTMLModule_TransformToStrict() { - - // behavior with transformations when there's another CSS property - // working on it is interesting: the CSS will *always* override - // the deprecated attribute, whereas an inline CSS declaration will - // override the corresponding declaration in, say, an external - // stylesheet. This behavior won't affect most people, but it - // does represent an operational difference we CANNOT fix. 
- - // deprecated tag transforms - $this->info_tag_transform['font'] = new HTMLPurifier_TagTransform_Font(); - $this->info_tag_transform['menu'] = new HTMLPurifier_TagTransform_Simple('ul'); - $this->info_tag_transform['dir'] = new HTMLPurifier_TagTransform_Simple('ul'); - $this->info_tag_transform['center'] = new HTMLPurifier_TagTransform_Center(); - - foreach ($this->elements as $name) { - $this->info[$name] = new HTMLPurifier_ElementDef(); - $this->info[$name]->standalone = false; - } - - // deprecated attribute transforms - - // align battery - $align_lookup = array(); - $align_values = array('left', 'right', 'center', 'justify'); - foreach ($align_values as $v) $align_lookup[$v] = "text-align:$v;"; - $this->info['h1']->attr_transform_pre['align'] = - $this->info['h2']->attr_transform_pre['align'] = - $this->info['h3']->attr_transform_pre['align'] = - $this->info['h4']->attr_transform_pre['align'] = - $this->info['h5']->attr_transform_pre['align'] = - $this->info['h6']->attr_transform_pre['align'] = - $this->info['p'] ->attr_transform_pre['align'] = - new HTMLPurifier_AttrTransform_EnumToCSS('align', $align_lookup); - - // xml:lang <=> lang mirroring, implement in TransformToStrict, - // this is overridden in TransformToXHTML11 - $this->info_attr_transform_post['lang'] = new HTMLPurifier_AttrTransform_Lang(); - $this->attr_collections['Lang']['lang'] = new HTMLPurifier_AttrDef_Lang(); - - // this should not be applied to XHTML 1.0 Transitional, ONLY - // XHTML 1.0 Strict. We may need three classes - $this->info['blockquote']->content_model_type = 'strictblockquote'; - $this->info['blockquote']->child = false; // recalculate please! 
- - $this->info['table']->attr_transform_pre['bgcolor'] = - $this->info['tr']->attr_transform_pre['bgcolor'] = - $this->info['td']->attr_transform_pre['bgcolor'] = - $this->info['th']->attr_transform_pre['bgcolor'] = new HTMLPurifier_AttrTransform_BgColor(); - - $this->info['img']->attr_transform_pre['border'] = new HTMLPurifier_AttrTransform_Border(); - - $this->info['img']->attr_transform_pre['name'] = - $this->info['a']->attr_transform_pre['name'] = new HTMLPurifier_AttrTransform_Name(); - - $this->info['td']->attr_transform_pre['width'] = - $this->info['th']->attr_transform_pre['width'] = - $this->info['hr']->attr_transform_pre['width'] = new HTMLPurifier_AttrTransform_Length('width'); - - $this->info['td']->attr_transform_pre['nowrap'] = - $this->info['th']->attr_transform_pre['nowrap'] = new HTMLPurifier_AttrTransform_BoolToCSS('nowrap', 'white-space:nowrap;'); - - $this->info['td']->attr_transform_pre['height'] = - $this->info['th']->attr_transform_pre['height'] = new HTMLPurifier_AttrTransform_Length('height'); - - $this->info['img']->attr_transform_pre['hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace'); - $this->info['img']->attr_transform_pre['vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace'); - - $this->info['hr']->attr_transform_pre['size'] = new HTMLPurifier_AttrTransform_Length('size', 'height'); - - // this transformation is not precise but often good enough. 
- // different browsers use different styles to designate noshade - $this->info['hr']->attr_transform_pre['noshade'] = new HTMLPurifier_AttrTransform_BoolToCSS('noshade', 'color:#808080;background-color:#808080;border: 0;'); - - $this->info['br']->attr_transform_pre['clear'] = - new HTMLPurifier_AttrTransform_EnumToCSS('clear', array( - 'left' => 'clear:left;', - 'right' => 'clear:right;', - 'all' => 'clear:both;', - 'none' => 'clear:none;', - )); - - // this is a slightly unreasonable attribute - $this->info['caption']->attr_transform_pre['align'] = - new HTMLPurifier_AttrTransform_EnumToCSS('align', array( - // we're following IE's behavior, not Firefox's, due - // to the fact that no one supports caption-side:right, - // W3C included (with CSS 2.1) - 'left' => 'text-align:left;', - 'right' => 'text-align:right;', - 'top' => 'caption-side:top;', - 'bottom' => 'caption-side:bottom;' // not supported by IE - )); - - $this->info['table']->attr_transform_pre['align'] = - new HTMLPurifier_AttrTransform_EnumToCSS('align', array( - 'left' => 'float:left;', - 'center' => 'margin-left:auto;margin-right:auto;', - 'right' => 'float:right;' - )); - - $this->info['img']->attr_transform_pre['align'] = - new HTMLPurifier_AttrTransform_EnumToCSS('align', array( - 'left' => 'float:left;', - 'right' => 'float:right;', - 'top' => 'vertical-align:top;', - 'middle' => 'vertical-align:middle;', - 'bottom' => 'vertical-align:baseline;', - )); - - $this->info['hr']->attr_transform_pre['align'] = - new HTMLPurifier_AttrTransform_EnumToCSS('align', array( - 'left' => 'margin-left:0;margin-right:auto;text-align:left;', - 'center' => 'margin-left:auto;margin-right:auto;text-align:center;', - 'right' => 'margin-left:auto;margin-right:0;text-align:right;' - )); - - $ul_types = array( - 'disc' => 'list-style-type:disc;', - 'square' => 'list-style-type:square;', - 'circle' => 'list-style-type:circle;' - ); - $ol_types = array( - '1' => 'list-style-type:decimal;', - 'i' => 
'list-style-type:lower-roman;', - 'I' => 'list-style-type:upper-roman;', - 'a' => 'list-style-type:lower-alpha;', - 'A' => 'list-style-type:upper-alpha;' - ); - $li_types = $ul_types + $ol_types; - - $this->info['ul']->attr_transform_pre['type'] = - new HTMLPurifier_AttrTransform_EnumToCSS('type', $ul_types); - $this->info['ol']->attr_transform_pre['type'] = - new HTMLPurifier_AttrTransform_EnumToCSS('type', $ol_types, true); - $this->info['li']->attr_transform_pre['type'] = - new HTMLPurifier_AttrTransform_EnumToCSS('type', $li_types, true); - - - } - - var $defines_child_def = true; - function getChildDef($def) { - if ($def->content_model_type != 'strictblockquote') return false; - return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model); - } - -} - -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToXHTML11.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToXHTML11.php deleted file mode 100644 index 68aac61312..0000000000 --- a/lib/htmlpurifier/HTMLPurifier/HTMLModule/TransformToXHTML11.php +++ /dev/null @@ -1,36 +0,0 @@ - array( - 'lang' => false // remove it - ) - ); - - var $info_attr_transform_post = array( - 'lang' => false // remove it - ); - - function HTMLPurifier_HTMLModule_TransformToXHTML11() { - $this->info_attr_transform_pre['lang'] = new HTMLPurifier_AttrTransform_Lang(); - } - -} - -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/XMLCommonAttributes.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/XMLCommonAttributes.php new file mode 100644 index 0000000000..67f7fc8ae1 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/XMLCommonAttributes.php @@ -0,0 +1,15 @@ + array( + 'xml:lang' => 'LanguageCode', + ) + ); +} + diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php b/lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php index 81ef13a5f4..d4f10d0c7c 100644 --- a/lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php +++ 
b/lib/htmlpurifier/HTMLPurifier/HTMLModuleManager.php @@ -2,6 +2,8 @@ require_once 'HTMLPurifier/HTMLModule.php'; require_once 'HTMLPurifier/ElementDef.php'; +require_once 'HTMLPurifier/Doctype.php'; +require_once 'HTMLPurifier/DoctypeRegistry.php'; require_once 'HTMLPurifier/ContentSets.php'; require_once 'HTMLPurifier/AttrTypes.php'; @@ -23,188 +25,209 @@ require_once 'HTMLPurifier/HTMLModule/Image.php'; require_once 'HTMLPurifier/HTMLModule/StyleAttribute.php'; require_once 'HTMLPurifier/HTMLModule/Legacy.php'; require_once 'HTMLPurifier/HTMLModule/Target.php'; +require_once 'HTMLPurifier/HTMLModule/Scripting.php'; +require_once 'HTMLPurifier/HTMLModule/XMLCommonAttributes.php'; +require_once 'HTMLPurifier/HTMLModule/NonXMLCommonAttributes.php'; +require_once 'HTMLPurifier/HTMLModule/Ruby.php'; -// proprietary modules -require_once 'HTMLPurifier/HTMLModule/TransformToStrict.php'; -require_once 'HTMLPurifier/HTMLModule/TransformToXHTML11.php'; +// tidy modules +require_once 'HTMLPurifier/HTMLModule/Tidy.php'; +require_once 'HTMLPurifier/HTMLModule/Tidy/XHTMLAndHTML4.php'; +require_once 'HTMLPurifier/HTMLModule/Tidy/XHTML.php'; +require_once 'HTMLPurifier/HTMLModule/Tidy/XHTMLStrict.php'; +require_once 'HTMLPurifier/HTMLModule/Tidy/Proprietary.php'; HTMLPurifier_ConfigSchema::define( - 'HTML', 'Doctype', null, 'string/null', - 'Doctype to use, valid values are HTML 4.01 Transitional, HTML 4.01 '. - 'Strict, XHTML 1.0 Transitional, XHTML 1.0 Strict, XHTML 1.1. '. + 'HTML', 'Doctype', '', 'string', + 'Doctype to use during filtering. '. 'Technically speaking this is not actually a doctype (as it does '. 'not identify a corresponding DTD), but we are using this name '. - 'for sake of simplicity. This will override any older directives '. - 'like %Core.XHTML or %HTML.Strict.' + 'for sake of simplicity. When non-blank, this will override any older directives '. + 'like %HTML.XHTML or %HTML.Strict.' 
); +HTMLPurifier_ConfigSchema::defineAllowedValues('HTML', 'Doctype', array( + '', 'HTML 4.01 Transitional', 'HTML 4.01 Strict', + 'XHTML 1.0 Transitional', 'XHTML 1.0 Strict', + 'XHTML 1.1' +)); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'CustomDoctype', null, 'string/null', +' +A custom doctype for power-users who defined their own document +type. This directive only applies when %HTML.Doctype is blank. +This directive has been available since 2.0.1. +' +); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'Trusted', false, 'bool', + 'Indicates whether or not the user input is trusted or not. If the '. + 'input is trusted, a more expansive set of allowed tags and attributes '. + 'will be used. This directive has been available since 2.0.0.' +); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'AllowedModules', null, 'lookup/null', ' ++ A doctype comes with a set of usual modules to use. Without having + to muck about with the doctypes, you can quickly activate or + disable these modules by specifying which modules you wish to allow + with this directive. This is most useful for unit testing specific + modules, although end users may find it useful for their own ends. +
++ If you specify a module that does not exist, the manager will silently + fail to use it, so be careful! User-defined modules are not affected + by this directive. Modules defined in %HTML.CoreModules are not + affected by this directive. This directive has been available since 2.0.0. +
+'); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'CoreModules', array( + 'Structure' => true, + 'Text' => true, + 'Hypertext' => true, + 'List' => true, + 'NonXMLCommonAttributes' => true, + 'XMLCommonAttributes' => true, + 'CommonAttributes' => true + ), 'lookup', ' ++ Certain modularized doctypes (XHTML, namely) have certain modules + that must be included for the doctype to be a conforming document + type: put those modules here. By default, XHTML\'s core modules + are used. You can set this to a blank array to disable core module + protection, but this is not recommended. This directive has been + available since 2.0.0. +
+'); class HTMLPurifier_HTMLModuleManager { /** - * Array of HTMLPurifier_Module instances, indexed by module's class name. - * All known modules, regardless of use, are in this array. + * Instance of HTMLPurifier_DoctypeRegistry + * @public */ - var $modules = array(); + var $doctypes; /** - * String doctype we will validate against. See $validModules for use. - * - * @note - * There is a special doctype '*' that acts both as the "default" - * doctype if a customized system only defines one doctype and - * also a catch-all doctype that gets merged into all the other - * module collections. When possible, use a private collection to - * share modules between doctypes: this special doctype is to - * make life more convenient for users. + * Instance of current doctype + * @public */ var $doctype; - var $doctypeAliases = array(); /**< Lookup array of strings to real doctypes */ /** - * Associative array: $collections[$type][$doctype] = list of modules. - * This is used to logically separate types of functionality so that - * based on the doctype and other configuration settings they may - * be easily switched and on and off. Custom setups may not need - * to use this abstraction, opting to have only one big collection - * with one valid doctype. + * Instance of HTMLPurifier_AttrTypes + * @public */ - var $collections = array(); + var $attrTypes; /** - * Modules that may be used in a valid doctype of this kind. - * Correctional and leniency modules should not be placed in this - * array unless the user said so: don't stuff every possible lenient - * module for this doctype in here. + * Active instances of modules for the specified doctype are + * indexed, by name, in this array. */ - var $validModules = array(); - var $validCollections = array(); /**< Collections to merge into $validModules */ + var $modules = array(); /** - * Modules that we will allow in input, subset of $validModules. Single - * element definitions may result in us consulting validModules. 
+ * Array of recognized HTMLPurifier_Module instances, indexed by + * module's class name. This array is usually lazy loaded, but a + * user can overload a module by pre-emptively registering it. */ - var $activeModules = array(); - var $activeCollections = array(); /**< Collections to merge into $activeModules */ - - var $counter = 0; /**< Designates next available integer order for modules. */ - var $initialized = false; /**< Says whether initialize() was called */ + var $registeredModules = array(); /** - * Specifies what doctype to siphon new modules from addModule() to, - * or false to disable the functionality. Must be used in conjunction - * with $autoCollection. + * List of extra modules that were added by the user using addModule(). + * These get unconditionally merged into the current doctype, whatever + * it may be. */ - var $autoDoctype = false; + var $userModules = array(); + /** - * Specifies what collection to siphon new modules from addModule() to, - * or false to disable the functionality. Must be used in conjunction - * with $autoCollection. + * Associative array of element name to list of modules that have + * definitions for the element; this array is dynamically filled. 
*/ - var $autoCollection = false; - - /** Associative array of element name to defining modules (always array) */ var $elementLookup = array(); - /** List of prefixes we should use for resolving small names */ + /** List of prefixes we should use for registering small names */ var $prefixes = array('HTMLPurifier_HTMLModule_'); - var $contentSets; /**< Instance of HTMLPurifier_ContentSets */ - var $attrTypes; /**< Instance of HTMLPurifier_AttrTypes */ + var $contentSets; /**< Instance of HTMLPurifier_ContentSets */ var $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */ - /** - * @param $blank If true, don't do any initializing - */ - function HTMLPurifier_HTMLModuleManager($blank = false) { + /** If set to true, unsafe elements and attributes will be allowed */ + var $trusted = false; + + function HTMLPurifier_HTMLModuleManager() { - // the only editable internal object. The rest need to - // be manipulated through modules + // editable internal objects $this->attrTypes = new HTMLPurifier_AttrTypes(); + $this->doctypes = new HTMLPurifier_DoctypeRegistry(); - if (!$blank) $this->initialize(); + // setup default HTML doctypes - } - - function initialize() { - $this->initialized = true; - - // load default modules to the recognized modules list (not active) - $modules = array( - // define - 'CommonAttributes', - 'Text', 'Hypertext', 'List', 'Presentation', - 'Edit', 'Bdo', 'Tables', 'Image', 'StyleAttribute', - 'Target', - // define-redefine - 'Legacy', - // redefine - 'TransformToStrict', 'TransformToXHTML11' + // module reuse + $common = array( + 'CommonAttributes', 'Text', 'Hypertext', 'List', + 'Presentation', 'Edit', 'Bdo', 'Tables', 'Image', + 'StyleAttribute', 'Scripting' ); - foreach ($modules as $module) { - $this->addModule($module); - } - - // Safe modules for supported doctypes. 
These are included - // in the valid and active module lists by default - $this->collections['Safe'] = array( - '_Common' => array( // leading _ indicates private - 'CommonAttributes', 'Text', 'Hypertext', 'List', - 'Presentation', 'Edit', 'Bdo', 'Tables', 'Image', - 'StyleAttribute' - ), - // HTML definitions, defer to XHTML definitions - 'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')), - 'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')), - // XHTML definitions - 'XHTML 1.0 Transitional' => array( array('XHTML 1.0 Strict'), 'Legacy', 'Target' ), - 'XHTML 1.0 Strict' => array(array('_Common')), - 'XHTML 1.1' => array(array('_Common')), + $transitional = array('Legacy', 'Target'); + $xml = array('XMLCommonAttributes'); + $non_xml = array('NonXMLCommonAttributes'); + + $this->doctypes->register( + 'HTML 4.01 Transitional', false, + array_merge($common, $transitional, $non_xml), + array('Tidy_Transitional', 'Tidy_Proprietary'), + array(), + '-//W3C//DTD HTML 4.01 Transitional//EN', + 'http://www.w3.org/TR/html4/loose.dtd' ); - // Modules that specify elements that are unsafe from untrusted - // third-parties. These should be registered in $validModules but - // almost never $activeModules unless you really know what you're - // doing. - $this->collections['Unsafe'] = array(); - - // Modules to import if lenient mode (attempt to convert everything - // to a valid representation) is on. These must not be in $validModules - // unless specified so. 
- $this->collections['Lenient'] = array( - 'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')), - 'XHTML 1.0 Strict' => array('TransformToStrict'), - 'XHTML 1.1' => array(array('XHTML 1.0 Strict'), 'TransformToXHTML11') + $this->doctypes->register( + 'HTML 4.01 Strict', false, + array_merge($common, $non_xml), + array('Tidy_Strict', 'Tidy_Proprietary'), + array(), + '-//W3C//DTD HTML 4.01//EN', + 'http://www.w3.org/TR/html4/strict.dtd' ); - // Modules to import if correctional mode (correct everything that - // is feasible to strict mode) is on. These must not be in $validModules - // unless specified so. - $this->collections['Correctional'] = array( - 'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')), - 'XHTML 1.0 Transitional' => array('TransformToStrict'), // probably want a different one + $this->doctypes->register( + 'XHTML 1.0 Transitional', true, + array_merge($common, $transitional, $xml, $non_xml), + array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary'), + array(), + '-//W3C//DTD XHTML 1.0 Transitional//EN', + 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' ); - // User-space modules, custom code or whatever - $this->collections['Extension'] = array(); - - // setup active versus valid modules. ORDER IS IMPORTANT! 
- // definition modules - $this->makeCollectionActive('Safe'); - $this->makeCollectionValid('Unsafe'); - // redefinition modules - $this->makeCollectionActive('Lenient'); - $this->makeCollectionActive('Correctional'); + $this->doctypes->register( + 'XHTML 1.0 Strict', true, + array_merge($common, $xml, $non_xml), + array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_XHTMLStrict', 'Tidy_Proprietary'), + array(), + '-//W3C//DTD XHTML 1.0 Strict//EN', + 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd' + ); - $this->autoDoctype = '*'; - $this->autoCollection = 'Extension'; + $this->doctypes->register( + 'XHTML 1.1', true, + array_merge($common, $xml, array('Ruby')), + array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_XHTMLStrict'), // Tidy_XHTML1_1 + array(), + '-//W3C//DTD XHTML 1.1//EN', + 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' + ); } /** - * Adds a module to the recognized module list. This does not - * do anything else: the module must be added to a corresponding - * collection to be "activated". + * Registers a module to the recognized module list, useful for + * overloading pre-existing modules. * @param $module Mixed: string module name, with or without * HTMLPurifier_HTMLModule prefix, or instance of * subclass of HTMLPurifier_HTMLModule. @@ -217,10 +240,15 @@ class HTMLPurifier_HTMLModuleManager * - Check for literal object name * - Throw fatal error * If your object name collides with an internal class, specify - * your module manually. + * your module manually. All modules must have been included + * externally: registerModule will not perform inclusions for you! + * @warning If your module has the same name as an already loaded + * module, your module will overload the old one WITHOUT + * warning. 
*/ - function addModule($module) { + function registerModule($module) { if (is_string($module)) { + // attempt to load the module $original_module = $module; $ok = false; foreach ($this->prefixes as $prefix) { @@ -240,16 +268,19 @@ class HTMLPurifier_HTMLModuleManager } $module = new $module(); } - $module->order = $this->counter++; // assign then increment - $this->modules[$module->name] = $module; - if ($this->autoDoctype !== false && $this->autoCollection !== false) { - $this->collections[$this->autoCollection][$this->autoDoctype][] = $module->name; + if (empty($module->name)) { + trigger_error('Module instance of ' . get_class($module) . ' must have name'); + return; } + $this->registeredModules[$module->name] = $module; } /** * Safely tests for class existence without invoking __autoload in PHP5 + * or greater. * @param $name String class name to test + * @note If any other class needs it, we'll need to stash in a + * conjectured "compatibility" class * @private */ function _classExists($name) { @@ -265,55 +296,63 @@ class HTMLPurifier_HTMLModuleManager } /** - * Makes a collection active, while also making it valid if not - * already done so. See $activeModules for the semantics of "active". - * @param $collection_name Name of collection to activate - */ - function makeCollectionActive($collection_name) { - if (!in_array($collection_name, $this->validCollections)) { - $this->makeCollectionValid($collection_name); - } - $this->activeCollections[] = $collection_name; - } - - /** - * Makes a collection valid. 
See $validModules for the semantics of "valid" + * Adds a module to the current doctype by first registering it, + * and then tacking it on to the active doctype */ - function makeCollectionValid($collection_name) { - $this->validCollections[] = $collection_name; + function addModule($module) { + $this->registerModule($module); + if (is_object($module)) $module = $module->name; + $this->userModules[] = $module; } /** - * Adds a class prefix that addModule() will use to resolve a + * Adds a class prefix that registerModule() will use to resolve a * string name to a concrete class */ function addPrefix($prefix) { - $this->prefixes[] = (string) $prefix; + $this->prefixes[] = $prefix; } + /** + * Performs processing on modules, after being called you may + * use getElement() and getElements() + * @param $config Instance of HTMLPurifier_Config + */ function setup($config) { - // load up the autocollection - if ($this->autoCollection !== false) { - $this->makeCollectionActive($this->autoCollection); - } + $this->trusted = $config->get('HTML', 'Trusted'); + + // generate + $this->doctype = $this->doctypes->make($config); + $modules = $this->doctype->modules; - // retrieve the doctype - $this->doctype = $this->getDoctype($config); - if (isset($this->doctypeAliases[$this->doctype])) { - $this->doctype = $this->doctypeAliases[$this->doctype]; + // take out the default modules that aren't allowed + $lookup = $config->get('HTML', 'AllowedModules'); + $special_cases = $config->get('HTML', 'CoreModules'); + + if (is_array($lookup)) { + foreach ($modules as $k => $m) { + if (isset($special_cases[$m])) continue; + if (!isset($lookup[$m])) unset($modules[$k]); + } } - // process module collections to module name => module instance form - foreach ($this->collections as $col_i => $x) { - $this->processCollections($this->collections[$col_i]); + // merge in custom modules + $modules = array_merge($modules, $this->userModules); + + foreach ($modules as $module) { + 
$this->processModule($module); } - $this->validModules = $this->assembleModules($this->validCollections); - $this->activeModules = $this->assembleModules($this->activeCollections); + foreach ($this->doctype->tidyModules as $module) { + $this->processModule($module); + if (method_exists($this->modules[$module], 'construct')) { + $this->modules[$module]->construct($config); + } + } // setup lookup table based on all valid modules - foreach ($this->validModules as $module) { + foreach ($this->modules as $module) { foreach ($module->info as $name => $def) { if (!isset($this->elementLookup[$name])) { $this->elementLookup[$name] = array(); @@ -324,214 +363,51 @@ class HTMLPurifier_HTMLModuleManager // note the different choice $this->contentSets = new HTMLPurifier_ContentSets( - // content models that contain non-allowed elements are - // harmless because RemoveForeignElements will ensure - // they never get in anyway, and there is usually no - // reason why you should want to restrict a content - // model beyond what is mandated by the doctype. - // Note, however, that this means redefinitions of - // content models can't be tossed in validModels willy-nilly: - // that stuff still is regulated by configuration. - $this->validModules + // content set assembly deals with all possible modules, + // not just ones deemed to be "safe" + $this->modules ); $this->attrCollections = new HTMLPurifier_AttrCollections( $this->attrTypes, - // only explicitly allowed modules are allowed to affect - // the global attribute collections. This mean's there's - // a distinction between loading the Bdo module, and the - // bdo element: Bdo will enable the dir attribute on all - // elements, while bdo will only define the bdo element, - // which will not have an editable directionality. 
This might - // catch people who are loading only elements by surprise, so - // we should consider loading an entire module if all the - // elements it defines are requested by the user, especially - // if it affects the global attribute collections. - $this->activeModules + // there is no way to directly disable a global attribute, + // but using AllowedAttributes or simply not including + // the module in your custom doctype should be sufficient + $this->modules ); - } /** - * Takes a list of collections and merges together all the defined - * modules for the current doctype from those collections. - * @param $collections List of collection suffixes we should grab - * modules from (like 'Safe' or 'Lenient') + * Takes a module and adds it to the active module collection, + * registering it if necessary. */ - function assembleModules($collections) { - $modules = array(); - $numOfCollectionsUsed = 0; - foreach ($collections as $name) { - $disable_global = false; - if (!isset($this->collections[$name])) { - trigger_error("$name collection is undefined", E_USER_ERROR); - continue; - } - $cols = $this->collections[$name]; - if (isset($cols[$this->doctype])) { - if (isset($cols[$this->doctype]['*'])) { - unset($cols[$this->doctype]['*']); - $disable_global = true; - } - $modules += $cols[$this->doctype]; - $numOfCollectionsUsed++; - } - // accept catch-all doctype - if ( - $this->doctype !== '*' && - isset($cols['*']) && - !$disable_global - ) { - $modules += $cols['*']; - } + function processModule($module) { + if (!isset($this->registeredModules[$module]) || is_object($module)) { + $this->registerModule($module); } - - if ($numOfCollectionsUsed < 1) { - // possible XSS injection if user-specified doctypes - // are allowed - trigger_error("Doctype {$this->doctype} does not exist, ". - "check for typos (if you desire a doctype that allows ". 
- "no elements, use an empty array collection)", E_USER_ERROR); - } - return $modules; + $this->modules[$module] = $this->registeredModules[$module]; } /** - * Takes a collection and performs inclusions and substitutions for it. - * @param $cols Reference to collections class member variable + * Retrieves merged element definitions. + * @return Array of HTMLPurifier_ElementDef */ - function processCollections(&$cols) { - - // $cols is the set of collections - // $col_i is the name (index) of a collection - // $col is a collection/list of modules - - // perform inclusions - foreach ($cols as $col_i => $col) { - $seen = array(); - if (!empty($col[0]) && is_array($col[0])) { - $seen[$col_i] = true; // recursion reporting - $includes = $col[0]; - unset($cols[$col_i][0]); // remove inclusions value, recursion guard - } else { - $includes = array(); - } - if (empty($includes)) continue; - for ($i = 0; isset($includes[$i]); $i++) { - $inc = $includes[$i]; - if (isset($seen[$inc])) { - trigger_error( - "Circular inclusion detected in $col_i collection", - E_USER_ERROR - ); - continue; - } else { - $seen[$inc] = true; - } - if (!isset($cols[$inc])) { - trigger_error( - "Collection $col_i tried to include undefined ". - "collection $inc", E_USER_ERROR); - continue; - } - foreach ($cols[$inc] as $module) { - if (is_array($module)) { // another inclusion! - foreach ($module as $inc2) $includes[] = $inc2; - continue; - } - $cols[$col_i][] = $module; // merge in the other modules - } - } - } - - // replace with real modules, invert module from list to - // assoc array of module name to module instance - foreach ($cols as $col_i => $col) { - $ignore_global = false; - $order = array(); - foreach ($col as $module_i => $module) { - unset($cols[$col_i][$module_i]); - if (is_array($module)) { - trigger_error("Illegal inclusion array at index". - " $module_i found collection $col_i, inclusion". 
- " arrays must be at start of collection (index 0)", - E_USER_ERROR); - continue; - } - if ($module_i === '*' && $module === false) { - $ignore_global = true; - continue; - } - if (!isset($this->modules[$module])) { - trigger_error( - "Collection $col_i references undefined ". - "module $module", - E_USER_ERROR - ); - continue; - } - $module = $this->modules[$module]; - $cols[$col_i][$module->name] = $module; - $order[$module->name] = $module->order; - } - array_multisort( - $order, SORT_ASC, SORT_NUMERIC, $cols[$col_i] - ); - if ($ignore_global) $cols[$col_i]['*'] = false; - } - - // delete pseudo-collections - foreach ($cols as $col_i => $col) { - if ($col_i[0] == '_') unset($cols[$col_i]); - } - - } - - /** - * Retrieves the doctype from the configuration object - */ - function getDoctype($config) { - $doctype = $config->get('HTML', 'Doctype'); - if ($doctype !== null) { - return $doctype; - } - if (!$this->initialized) { - // don't do HTML-oriented backwards compatibility stuff - // use either the auto-doctype, or the catch-all doctype - return $this->autoDoctype ? $this->autoDoctype : '*'; - } - // this is backwards-compatibility stuff - if ($config->get('Core', 'XHTML')) { - $doctype = 'XHTML 1.0'; - } else { - $doctype = 'HTML 4.01'; - } - if ($config->get('HTML', 'Strict')) { - $doctype .= ' Strict'; - } else { - $doctype .= ' Transitional'; - } - return $doctype; - } - - /** - * Retrieves merged element definitions for all active elements. - * @note We may want to generate an elements array during setup - * and pass that on, because a specific combination of - * elements may trigger the loading of a module. - * @param $config Instance of HTMLPurifier_Config, for determining - * stray elements. 
- */ - function getElements($config) { + function getElements() { $elements = array(); - foreach ($this->activeModules as $module) { + foreach ($this->modules as $module) { foreach ($module->info as $name => $v) { if (isset($elements[$name])) continue; - $elements[$name] = $this->getElement($name, $config); + // if element is not safe, don't use it + if (!$this->trusted && ($v->safe === false)) continue; + $elements[$name] = $this->getElement($name); } } - // standalone elements now loaded + // remove dud elements, this happens when an element that + // appeared to be safe actually wasn't + foreach ($elements as $n => $v) { + if ($v === false) unset($elements[$n]); + } return $elements; @@ -540,13 +416,16 @@ class HTMLPurifier_HTMLModuleManager /** * Retrieves a single merged element definition * @param $name Name of element - * @param $config Instance of HTMLPurifier_Config, may not be necessary. + * @param $trusted Boolean trusted overriding parameter: set to true + * if you want the full version of an element + * @return Merged HTMLPurifier_ElementDef */ - function getElement($name, $config) { + function getElement($name, $trusted = null) { $def = false; + if ($trusted === null) $trusted = $this->trusted; - $modules = $this->validModules; + $modules = $this->modules; if (!isset($this->elementLookup[$name])) { return false; @@ -555,9 +434,23 @@ class HTMLPurifier_HTMLModuleManager foreach($this->elementLookup[$name] as $module_name) { $module = $modules[$module_name]; - $new_def = $module->info[$name]; + + // copy is used because, ideally speaking, the original + // definition should not be modified. Usually, this will + // make no difference, but for consistency's sake + $new_def = $module->info[$name]->copy(); + + // refuse to create/merge in a definition that is deemed unsafe + if (!$trusted && ($new_def->safe === false)) { + $def = false; + continue; + } if (!$def && $new_def->standalone) { + // element with unknown safety is not to be trusted. 
+ // however, a merge-in definition with undefined safety + // is fine + if (!$trusted && !$new_def->safe) continue; $def = $new_def; } elseif ($def) { $def->mergeIn($new_def); @@ -583,6 +476,13 @@ class HTMLPurifier_HTMLModuleManager $this->contentSets->generateChildDef($def, $module); } + + // add information on required attributes + foreach ($def->attr as $attr_name => $attr_def) { + if ($attr_def->required) { + $def->required_attr[] = $attr_name; + } + } return $def; @@ -590,4 +490,4 @@ class HTMLPurifier_HTMLModuleManager } -?> + diff --git a/lib/htmlpurifier/HTMLPurifier/IDAccumulator.php b/lib/htmlpurifier/HTMLPurifier/IDAccumulator.php index 40ff2384bb..525c9aa080 100644 --- a/lib/htmlpurifier/HTMLPurifier/IDAccumulator.php +++ b/lib/htmlpurifier/HTMLPurifier/IDAccumulator.php @@ -39,4 +39,3 @@ class HTMLPurifier_IDAccumulator } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Injector.php b/lib/htmlpurifier/HTMLPurifier/Injector.php new file mode 100644 index 0000000000..5901716387 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/Injector.php @@ -0,0 +1,111 @@ +inputTokens that indicates what token + * is currently being processed. + */ + var $inputIndex; + + /** + * Array of elements and attributes this injector creates and therefore + * need to be allowed by the definition. Takes form of + * array('element' => array('attr', 'attr2'), 'element2') + */ + var $needed = array(); + + /** + * Prepares the injector by giving it the config and context objects: + * this allows references to important variables to be made within + * the injector. This function also checks if the HTML environment + * will work with the Injector: if p tags are not allowed, the + * Auto-Paragraphing injector should not be enabled. 
+ * @param $config Instance of HTMLPurifier_Config + * @param $context Instance of HTMLPurifier_Context + * @return Boolean false if success, string of missing needed element/attribute if failure + */ + function prepare($config, &$context) { + $this->htmlDefinition = $config->getHTMLDefinition(); + // perform $needed checks + foreach ($this->needed as $element => $attributes) { + if (is_int($element)) $element = $attributes; + if (!isset($this->htmlDefinition->info[$element])) return $element; + if (!is_array($attributes)) continue; + foreach ($attributes as $name) { + if (!isset($this->htmlDefinition->info[$element]->attr[$name])) return "$element.$name"; + } + } + $this->currentNesting =& $context->get('CurrentNesting'); + $this->inputTokens =& $context->get('InputTokens'); + $this->inputIndex =& $context->get('InputIndex'); + return false; + } + + /** + * Tests if the context node allows a certain element + * @param $name Name of element to test for + * @return True if element is allowed, false if it is not + */ + function allowsElement($name) { + if (!empty($this->currentNesting)) { + $parent_token = array_pop($this->currentNesting); + $this->currentNesting[] = $parent_token; + $parent = $this->htmlDefinition->info[$parent_token->name]; + } else { + $parent = $this->htmlDefinition->info_parent_def; + } + if (!isset($parent->child->elements[$name]) || isset($parent->excludes[$name])) { + return false; + } + return true; + } + + /** + * Handler that is called when a text token is processed + */ + function handleText(&$token) {} + + /** + * Handler that is called when a start or empty token is processed + */ + function handleElement(&$token) {} + +} + diff --git a/lib/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php b/lib/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php new file mode 100644 index 0000000000..6e0a6a3ed5 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php @@ -0,0 +1,267 @@ + + This directive turns on 
auto-paragraphing, where double newlines are + converted in to paragraphs whenever possible. Auto-paragraphing + applies when: + +
+ p
tags must be allowed for this directive to take effect.
+ We do not use br
tags for paragraphing, as that is
+ semantically incorrect.
+
+ This directive has been available since 2.0.1. +
+'); + +/** + * Injector that auto paragraphs text in the root node based on + * double-spacing. + */ +class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector +{ + + var $name = 'AutoParagraph'; + var $needed = array('p'); + + function _pStart() { + $par = new HTMLPurifier_Token_Start('p'); + $par->armor['MakeWellFormed_TagClosedError'] = true; + return $par; + } + + function handleText(&$token) { + $text = $token->data; + if (empty($this->currentNesting)) { + if (!$this->allowsElement('p')) return; + // case 1: we're in root node (and it allows paragraphs) + $token = array($this->_pStart()); + $this->_splitText($text, $token); + } elseif ($this->currentNesting[count($this->currentNesting)-1]->name == 'p') { + // case 2: we're in a paragraph + $token = array(); + $this->_splitText($text, $token); + } elseif ($this->allowsElement('p')) { + // case 3: we're in an element that allows paragraphs + if (strpos($text, "\n\n") !== false) { + // case 3.1: this text node has a double-newline + $token = array($this->_pStart()); + $this->_splitText($text, $token); + } else { + $ok = false; + // test if up-coming tokens are either block or have + // a double newline in them + for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) { + if ($this->inputTokens[$i]->type == 'start'){ + if (!$this->_isInline($this->inputTokens[$i])) { + $ok = true; + } + break; + } + if ($this->inputTokens[$i]->type == 'end') break; + if ($this->inputTokens[$i]->type == 'text') { + if (strpos($this->inputTokens[$i]->data, "\n\n") !== false) { + $ok = true; + } + if (!$this->inputTokens[$i]->is_whitespace) break; + } + } + if ($ok) { + // case 3.2: this text node is next to another node + // that will start a paragraph + $token = array($this->_pStart(), $token); + } + } + } + + } + + function handleElement(&$token) { + // check if we're inside a tag already + if (!empty($this->currentNesting)) { + if ($this->allowsElement('p')) { + // special case: we're in an element 
that allows paragraphs + + // this token is already paragraph, abort + if ($token->name == 'p') return; + + // this token is a block level, abort + if (!$this->_isInline($token)) return; + + // check if this token is adjacent to the parent token + $prev = $this->inputTokens[$this->inputIndex - 1]; + if ($prev->type != 'start') { + // not adjacent, we can abort early + // add lead paragraph tag if our token is inline + // and the previous tag was an end paragraph + if ( + $prev->name == 'p' && $prev->type == 'end' && + $this->_isInline($token) + ) { + $token = array($this->_pStart(), $token); + } + return; + } + + // this token is the first child of the element that allows + // paragraph. We have to peek ahead and see whether or not + // there is anything inside that suggests that a paragraph + // will be needed + $ok = false; + // maintain a mini-nesting counter, this lets us bail out + // early if possible + $j = 1; // current nesting, one is due to parent (we recalculate current token) + for ($i = $this->inputIndex; isset($this->inputTokens[$i]); $i++) { + if ($this->inputTokens[$i]->type == 'start') $j++; + if ($this->inputTokens[$i]->type == 'end') $j--; + if ($this->inputTokens[$i]->type == 'text') { + if (strpos($this->inputTokens[$i]->data, "\n\n") !== false) { + $ok = true; + break; + } + } + if ($j <= 0) break; + } + if ($ok) { + $token = array($this->_pStart(), $token); + } + } + return; + } + + // check if the start tag counts as a "block" element + if (!$this->_isInline($token)) return; + + // append a paragraph tag before the token + $token = array($this->_pStart(), $token); + } + + /** + * Splits up a text in paragraph tokens and appends them + * to the result stream that will replace the original + * @param $data String text data that will be processed + * into paragraphs + * @param $result Reference to array of tokens that the + * tags will be appended onto + * @param $config Instance of HTMLPurifier_Config + * @param $context Instance of 
HTMLPurifier_Context + * @private + */ + function _splitText($data, &$result) { + $raw_paragraphs = explode("\n\n", $data); + + // remove empty paragraphs + $paragraphs = array(); + $needs_start = false; + $needs_end = false; + + $c = count($raw_paragraphs); + if ($c == 1) { + // there were no double-newlines, abort quickly + $result[] = new HTMLPurifier_Token_Text($data); + return; + } + + for ($i = 0; $i < $c; $i++) { + $par = $raw_paragraphs[$i]; + if (trim($par) !== '') { + $paragraphs[] = $par; + continue; + } + if ($i == 0 && empty($result)) { + // The empty result indicates that the AutoParagraph + // injector did not add any start paragraph tokens. + // The fact that the first paragraph is empty indicates + // that there was a double-newline at the start of the + // data. + // Combined together, this means that we are in a paragraph, + // and the newline means we should start a new one. + $result[] = new HTMLPurifier_Token_End('p'); + // However, the start token should only be added if + // there is more processing to be done (i.e. there are + // real paragraphs in here). If there are none, the + // next start paragraph tag will be handled by the + // next run-around the injector + $needs_start = true; + } elseif ($i + 1 == $c) { + // a double-paragraph at the end indicates that + // there is an overriding need to start a new paragraph + // for the next section. 
This has no effect until + // we've processed all of the other paragraphs though + $needs_end = true; + } + } + + // check if there are no "real" paragraphs to be processed + if (empty($paragraphs)) { + return; + } + + // add a start tag if an end tag was added while processing + // the raw paragraphs (that happens if there's a leading double + // newline) + if ($needs_start) $result[] = $this->_pStart(); + + // append the paragraphs onto the result + foreach ($paragraphs as $par) { + $result[] = new HTMLPurifier_Token_Text($par); + $result[] = new HTMLPurifier_Token_End('p'); + $result[] = $this->_pStart(); + } + + // remove trailing start token, if one is needed, it will + // be handled the next time this injector is called + array_pop($result); + + // check the outside to determine whether or not the + // end paragraph tag should be removed. It should be removed + // unless the next non-whitespace token is a paragraph + // or a block element. + $remove_paragraph_end = true; + + if (!$needs_end) { + // Start of the checks one after the current token's index + for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) { + if ($this->inputTokens[$i]->type == 'start' || $this->inputTokens[$i]->type == 'empty') { + $remove_paragraph_end = $this->_isInline($this->inputTokens[$i]); + } + // check if we can abort early (whitespace means we carry-on!) 
+ if ($this->inputTokens[$i]->type == 'text' && !$this->inputTokens[$i]->is_whitespace) break; + // end tags will automatically be handled by MakeWellFormed, + // so we don't have to worry about them + if ($this->inputTokens[$i]->type == 'end') break; + } + } else { + $remove_paragraph_end = false; + } + + // check the outside to determine whether or not the + // end paragraph tag should be removed + if ($remove_paragraph_end) { + array_pop($result); + } + + } + + /** + * Returns true if passed token is inline (and, ergo, allowed in + * paragraph tags) + * @private + */ + function _isInline($token) { + return isset($this->htmlDefinition->info['p']->child->elements[$token->name]); + } + +} + diff --git a/lib/htmlpurifier/HTMLPurifier/Injector/Linkify.php b/lib/htmlpurifier/HTMLPurifier/Injector/Linkify.php new file mode 100644 index 0000000000..bf7abfa977 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/Injector/Linkify.php @@ -0,0 +1,56 @@ + + This directive turns on linkification, auto-linking http, ftp and + https URLs.a
tags with the href
attribute
+ must be allowed. This directive has been available since 2.0.1.
+
+');
+
+/**
+ * Injector that converts plain-text http, https and ftp URLs into links.
+ *
+ * Every URL found in a text token is wrapped in an a element whose href
+ * attribute is the URL itself; the text between URLs is kept as text tokens.
+ */
+class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
+{
+
+    var $name = 'Linkify';
+
+    /**
+     * Elements and attributes this injector creates: a with href.
+     */
+    var $needed = array('a' => array('href'));
+
+    /**
+     * Replaces a text token that contains URLs with a list of tokens in
+     * which every URL is a link. The token is left untouched when the
+     * context does not allow a elements, when the text contains no URL,
+     * or when the text could not be split.
+     * @param $token Reference to the text token being processed; set to
+     *        an array of replacement tokens when at least one URL is found
+     */
+    function handleText(&$token) {
+        if (!$this->allowsElement('a')) return;
+
+        if (strpos($token->data, '://') === false) {
+            // our really quick heuristic failed, abort
+            // this may not work so well if we want to match things like
+            // "google.com", but then again, most people don't
+            return;
+        }
+
+        // there may be URL(s). Let's split the string:
+        // Note: this regex is extremely permissive
+        $bits = preg_split('#((?:https?|ftp)://[^\s\'"<>()]+)#S', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);
+
+        // preg_split() returns false on failure, and a single piece when
+        // "://" was present but no URL matched: there is nothing to
+        // linkify in either case, so keep the original token as it is
+        if (!is_array($bits) || count($bits) < 2) return;
+
+        $token = array();
+
+        // with PREG_SPLIT_DELIM_CAPTURE, plain text sits at even offsets
+        // and the captured URLs at odd offsets
+        foreach ($bits as $i => $bit) {
+            if ($i % 2 === 0) {
+                // text between links, empty pieces are dropped
+                if ($bit === '') continue;
+                $token[] = new HTMLPurifier_Token_Text($bit);
+            } else {
+                $token[] = new HTMLPurifier_Token_Start('a', array('href' => $bit));
+                $token[] = new HTMLPurifier_Token_Text($bit);
+                $token[] = new HTMLPurifier_Token_End('a');
+            }
+        }
+
+    }
+
+}
+
diff --git a/lib/htmlpurifier/HTMLPurifier/Injector/PurifierLinkify.php b/lib/htmlpurifier/HTMLPurifier/Injector/PurifierLinkify.php
new file mode 100644
index 0000000000..a7686297c2
--- /dev/null
+++ b/lib/htmlpurifier/HTMLPurifier/Injector/PurifierLinkify.php
@@ -0,0 +1,65 @@
+
+ Internal auto-formatter that converts configuration directives in
+ syntax %Namespace.Directive to links. <code>a</code> tags
+ with the <code>href</code> attribute must be allowed.
+ This directive has been available since 2.0.1.
+
+');
+
+HTMLPurifier_ConfigSchema::define(
+ 'AutoFormatParam', 'PurifierLinkifyDocURL', '#%s', 'string', '
++ Location of configuration documentation to link to, let %s substitute + into the configuration\'s namespace and directive names sans the percent + sign. This directive has been available since 2.0.1. +
+'); + +/** + * Injector that converts configuration directive syntax %Namespace.Directive + * to links + */ +class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector +{ + + var $name = 'PurifierLinkify'; + var $docURL; + var $needed = array('a' => array('href')); + + function prepare($config, &$context) { + $this->docURL = $config->get('AutoFormatParam', 'PurifierLinkifyDocURL'); + return parent::prepare($config, $context); + } + + function handleText(&$token) { + if (!$this->allowsElement('a')) return; + if (strpos($token->data, '%') === false) return; + + $bits = preg_split('#%([a-z0-9]+\.[a-z0-9]+)#Si', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE); + $token = array(); + + // $i = index + // $c = count + // $l = is link + for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) { + if (!$l) { + if ($bits[$i] === '') continue; + $token[] = new HTMLPurifier_Token_Text($bits[$i]); + } else { + $token[] = new HTMLPurifier_Token_Start('a', + array('href' => str_replace('%s', $bits[$i], $this->docURL))); + $token[] = new HTMLPurifier_Token_Text('%' . $bits[$i]); + $token[] = new HTMLPurifier_Token_End('a'); + } + } + + } + +} + diff --git a/lib/htmlpurifier/HTMLPurifier/Language.php b/lib/htmlpurifier/HTMLPurifier/Language.php index ca6fe03138..c9a3c20fe2 100644 --- a/lib/htmlpurifier/HTMLPurifier/Language.php +++ b/lib/htmlpurifier/HTMLPurifier/Language.php @@ -20,12 +20,27 @@ class HTMLPurifier_Language */ var $messages = array(); + /** + * Array of localizable error codes + */ + var $errorNames = array(); + /** * Has the language object been loaded yet? 
* @private */ var $_loaded = false; + /** + * Instances of HTMLPurifier_Config and HTMLPurifier_Context + */ + var $config, $context; + + function HTMLPurifier_Language($config, &$context) { + $this->config = $config; + $this->context =& $context; + } + /** * Loads language object with necessary info from factory cache * @note This is a lazy loader @@ -41,16 +56,98 @@ class HTMLPurifier_Language } /** - * Retrieves a localised message. Does not perform any operations. + * Retrieves a localised message. * @param $key string identifier of message * @return string localised message */ function getMessage($key) { if (!$this->_loaded) $this->load(); - if (!isset($this->messages[$key])) return ''; + if (!isset($this->messages[$key])) return "[$key]"; return $this->messages[$key]; } + /** + * Retrieves a localised error name. + * @param $int integer error number, corresponding to PHP's error + * reporting + * @return string localised message + */ + function getErrorName($int) { + if (!$this->_loaded) $this->load(); + if (!isset($this->errorNames[$int])) return "[Error: $int]"; + return $this->errorNames[$int]; + } + + /** + * Converts an array list into a string readable representation + */ + function listify($array) { + $sep = $this->getMessage('Item separator'); + $sep_last = $this->getMessage('Item separator last'); + $ret = ''; + for ($i = 0, $c = count($array); $i < $c; $i++) { + if ($i == 0) { + } elseif ($i + 1 < $c) { + $ret .= $sep; + } else { + $ret .= $sep_last; + } + $ret .= $array[$i]; + } + return $ret; + } + + /** + * Formats a localised message with passed parameters + * @param $key string identifier of message + * @param $args Parameters to substitute in + * @return string localised message + * @todo Implement conditionals? 
Right now, some messages make + * reference to line numbers, but those aren't always available + */ + function formatMessage($key, $args = array()) { + if (!$this->_loaded) $this->load(); + if (!isset($this->messages[$key])) return "[$key]"; + $raw = $this->messages[$key]; + $subst = array(); + $generator = false; + foreach ($args as $i => $value) { + if (is_object($value)) { + if (is_a($value, 'HTMLPurifier_Token')) { + // factor this out some time + if (!$generator) $generator = $this->context->get('Generator'); + if (isset($value->name)) $subst['$'.$i.'.Name'] = $value->name; + if (isset($value->data)) $subst['$'.$i.'.Data'] = $value->data; + $subst['$'.$i.'.Compact'] = + $subst['$'.$i.'.Serialized'] = $generator->generateFromToken($value); + // a more complex algorithm for compact representation + // could be introduced for all types of tokens. This + // may need to be factored out into a dedicated class + if (!empty($value->attr)) { + $stripped_token = $value->copy(); + $stripped_token->attr = array(); + $subst['$'.$i.'.Compact'] = $generator->generateFromToken($stripped_token); + } + $subst['$'.$i.'.Line'] = $value->line ? $value->line : 'unknown'; + } + continue; + } elseif (is_array($value)) { + $keys = array_keys($value); + if (array_keys($keys) === $keys) { + // list + $subst['$'.$i] = $this->listify($value); + } else { + // associative array + // no $i implementation yet, sorry + $subst['$'.$i.'.Keys'] = $this->listify($keys); + $subst['$'.$i.'.Values'] = $this->listify(array_values($value)); + } + continue; + } + $subst['$' . 
$i] = $value; + } + return strtr($raw, $subst); + } + } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Language/classes/en-x-test.php b/lib/htmlpurifier/HTMLPurifier/Language/classes/en-x-test.php index 303ba4bae0..cbf0e612b6 100644 --- a/lib/htmlpurifier/HTMLPurifier/Language/classes/en-x-test.php +++ b/lib/htmlpurifier/HTMLPurifier/Language/classes/en-x-test.php @@ -9,4 +9,3 @@ class HTMLPurifier_Language_en_x_test extends HTMLPurifier_Language } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Language/messages/en-x-test.php b/lib/htmlpurifier/HTMLPurifier/Language/messages/en-x-test.php index 115662bda9..3eac9ec65c 100644 --- a/lib/htmlpurifier/HTMLPurifier/Language/messages/en-x-test.php +++ b/lib/htmlpurifier/HTMLPurifier/Language/messages/en-x-test.php @@ -5,7 +5,6 @@ $fallback = 'en'; $messages = array( - 'htmlpurifier' => 'HTML Purifier X' + 'HTMLPurifier' => 'HTML Purifier X' ); -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Language/messages/en.php b/lib/htmlpurifier/HTMLPurifier/Language/messages/en.php index 7650b81803..b16c3ff385 100644 --- a/lib/htmlpurifier/HTMLPurifier/Language/messages/en.php +++ b/lib/htmlpurifier/HTMLPurifier/Language/messages/en.php @@ -4,9 +4,54 @@ $fallback = false; $messages = array( -'htmlpurifier' => 'HTML Purifier', -'pizza' => 'Pizza', // for unit testing purposes +'HTMLPurifier' => 'HTML Purifier', +// for unit testing purposes +'LanguageFactoryTest: Pizza' => 'Pizza', +'LanguageTest: List' => '$1', +'LanguageTest: Hash' => '$1.Keys; $1.Values', + +'Item separator' => ', ', +'Item separator last' => ' and ', // non-Harvard style + +'ErrorCollector: No errors' => 'No errors detected. 
However, because error reporting is still incomplete, there may have been errors that the error collector was not notified of; please inspect the output HTML carefully.', +'ErrorCollector: At line' => ' at line $line', + +'Lexer: Unclosed comment' => 'Unclosed comment', +'Lexer: Unescaped lt' => 'Unescaped less-than sign (<) should be <', +'Lexer: Missing gt' => 'Missing greater-than sign (>), previous less-than sign (<) should be escaped', +'Lexer: Missing attribute key' => 'Attribute declaration has no key', +'Lexer: Missing end quote' => 'Attribute declaration has no end quote', + +'Strategy_RemoveForeignElements: Tag transform' => '<$1> element transformed into $CurrentToken.Serialized', +'Strategy_RemoveForeignElements: Missing required attribute' => '$CurrentToken.Compact element missing required attribute $1', +'Strategy_RemoveForeignElements: Foreign element to text' => 'Unrecognized $CurrentToken.Serialized tag converted to text', +'Strategy_RemoveForeignElements: Foreign element removed' => 'Unrecognized $CurrentToken.Serialized tag removed', +'Strategy_RemoveForeignElements: Comment removed' => 'Comment containing "$CurrentToken.Data" removed', +'Strategy_RemoveForeignElements: Foreign meta element removed' => 'Unrecognized $CurrentToken.Serialized meta tag and all descendants removed', +'Strategy_RemoveForeignElements: Token removed to end' => 'Tags and text starting from $1 element where removed to end', + +'Strategy_MakeWellFormed: Unnecessary end tag removed' => 'Unnecessary $CurrentToken.Serialized tag removed', +'Strategy_MakeWellFormed: Unnecessary end tag to text' => 'Unnecessary $CurrentToken.Serialized tag converted to text', +'Strategy_MakeWellFormed: Tag auto closed' => '$1.Compact started on line $1.Line auto-closed by $CurrentToken.Compact', +'Strategy_MakeWellFormed: Stray end tag removed' => 'Stray $CurrentToken.Serialized tag removed', +'Strategy_MakeWellFormed: Stray end tag to text' => 'Stray $CurrentToken.Serialized tag converted to 
text', +'Strategy_MakeWellFormed: Tag closed by element end' => '$1.Compact tag started on line $1.Line closed by end of $CurrentToken.Serialized', +'Strategy_MakeWellFormed: Tag closed by document end' => '$1.Compact tag started on line $1.Line closed by end of document', + +'Strategy_FixNesting: Node removed' => '$CurrentToken.Compact node removed', +'Strategy_FixNesting: Node excluded' => '$CurrentToken.Compact node removed due to descendant exclusion by ancestor element', +'Strategy_FixNesting: Node reorganized' => 'Contents of $CurrentToken.Compact node reorganized to enforce its content model', +'Strategy_FixNesting: Node contents removed' => 'Contents of $CurrentToken.Compact node removed', + +'AttrValidator: Attributes transformed' => 'Attributes on $CurrentToken.Compact transformed from $1.Keys to $2.Keys', +'AttrValidator: Attribute removed' => '$CurrentAttr.Name attribute on $CurrentToken.Compact removed', + +); + +$errorNames = array( + E_ERROR => 'Error', + E_WARNING => 'Warning', + E_NOTICE => 'Notice' ); -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/LanguageFactory.php b/lib/htmlpurifier/HTMLPurifier/LanguageFactory.php index 7097ced767..9d26cd7037 100644 --- a/lib/htmlpurifier/HTMLPurifier/LanguageFactory.php +++ b/lib/htmlpurifier/HTMLPurifier/LanguageFactory.php @@ -3,6 +3,14 @@ require_once 'HTMLPurifier/Language.php'; require_once 'HTMLPurifier/AttrDef/Lang.php'; +HTMLPurifier_ConfigSchema::define( + 'Core', 'Language', 'en', 'string', ' +ISO 639 language code for localizable things in HTML Purifier to use, +which is mainly error reporting. There is currently only an English (en) +translation, so this directive is currently useless. +This directive has been available since 2.0.0. +'); + /** * Class responsible for generating HTMLPurifier_Language objects, managing * caching and fallbacks. @@ -24,7 +32,7 @@ class HTMLPurifier_LanguageFactory * variables to slurp out of a message file. 
* @value array list */ - var $keys = array('fallback', 'messages'); + var $keys = array('fallback', 'messages', 'errorNames'); /** * Instance of HTMLPurifier_AttrDef_Lang to validate language codes @@ -43,7 +51,7 @@ class HTMLPurifier_LanguageFactory * Keys whose contents are a hash map and can be merged * @value array lookup */ - var $mergeable_keys_map = array('messages' => true); + var $mergeable_keys_map = array('messages' => true, 'errorNames' => true); /** * Keys whose contents are a list and can be merged @@ -74,17 +82,20 @@ class HTMLPurifier_LanguageFactory */ function setup() { $this->validator = new HTMLPurifier_AttrDef_Lang(); - $this->dir = dirname(__FILE__); + $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier'; } /** * Creates a language object, handles class fallbacks - * @param $code string language code + * @param $config Instance of HTMLPurifier_Config + * @param $context Instance of HTMLPurifier_Context */ - function create($code) { + function create($config, &$context) { - $config = $context = false; // hope it doesn't use these! 
- $code = $this->validator->validate($code, $config, $context); + // validate language code + $code = $this->validator->validate( + $config->get('Core', 'Language'), $config, $context + ); if ($code === false) $code = 'en'; // malformed code becomes English $pcode = str_replace('-', '_', $code); // make valid PHP classname @@ -100,18 +111,18 @@ class HTMLPurifier_LanguageFactory // you can bypass the conditional include by loading the // file yourself if (file_exists($file) && !class_exists($class)) { - include_once $file; - } + include_once $file; + } } if (!class_exists($class)) { // go fallback - $fallback = HTMLPurifier_Language::getFallbackFor($code); + $fallback = HTMLPurifier_LanguageFactory::getFallbackFor($code); $depth++; - $lang = Language::factory( $fallback ); + $lang = HTMLPurifier_LanguageFactory::factory( $fallback ); $depth--; } else { - $lang = new $class; + $lang = new $class($config, $context); } $lang->code = $code; @@ -172,15 +183,15 @@ class HTMLPurifier_LanguageFactory // merge fallback with current language foreach ( $this->keys as $key ) { - if (isset($cache[$key]) && isset($fallback_cache[$key])) { + if (isset($cache[$key]) && isset($fallback_cache[$key])) { if (isset($this->mergeable_keys_map[$key])) { $cache[$key] = $cache[$key] + $fallback_cache[$key]; } elseif (isset($this->mergeable_keys_list[$key])) { $cache[$key] = array_merge( $fallback_cache[$key], $cache[$key] ); } - } else { - $cache[$key] = $fallback_cache[$key]; - } + } else { + $cache[$key] = $fallback_cache[$key]; + } } } @@ -193,4 +204,3 @@ class HTMLPurifier_LanguageFactory } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer.php b/lib/htmlpurifier/HTMLPurifier/Lexer.php index e7242e1e36..b1bd1ed0e1 100644 --- a/lib/htmlpurifier/HTMLPurifier/Lexer.php +++ b/lib/htmlpurifier/HTMLPurifier/Lexer.php @@ -4,6 +4,14 @@ require_once 'HTMLPurifier/Token.php'; require_once 'HTMLPurifier/Encoder.php'; require_once 'HTMLPurifier/EntityParser.php'; +// 
implementations +require_once 'HTMLPurifier/Lexer/DirectLex.php'; +if (version_compare(PHP_VERSION, "5", ">=")) { + // You can remove the if statement if you are running PHP 5 only. + // We ought to get the strict version to follow those rules. + require_once 'HTMLPurifier/Lexer/DOMLex.php'; +} + HTMLPurifier_ConfigSchema::define( 'Core', 'AcceptFullDocuments', true, 'bool', 'This parameter determines whether or not the filter should accept full '. @@ -11,6 +19,63 @@ HTMLPurifier_ConfigSchema::define( 'drop all sections except the content between body.' ); +HTMLPurifier_ConfigSchema::define( + 'Core', 'LexerImpl', null, 'mixed/null', ' ++ This parameter determines what lexer implementation can be used. The + valid values are: +
+HTMLPurifier_Lexer
.
+ I may remove this option simply because I don\'t expect anyone
+ to use it.
+ + This directive has been available since 2.0.0. +
+' +); + +HTMLPurifier_ConfigSchema::define( + 'Core', 'MaintainLineNumbers', null, 'bool/null', ' ++ If true, HTML Purifier will add line number information to all tokens. + This is useful when error reporting is turned on, but can result in + significant performance degradation and should not be used when + unnecessary. This directive must be used with the DirectLex lexer, + as the DOMLex lexer does not (yet) support this functionality. + If the value is null, an appropriate value will be selected based + on other configuration. This directive has been available since 2.0.0. +
+'); + +HTMLPurifier_ConfigSchema::define( + 'Core', 'AggressivelyFixLt', false, 'bool', ' +This directive enables aggressive pre-filter fixes HTML Purifier can +perform in order to ensure that open angled-brackets do not get killed +during parsing stage. Enabling this will result in two preg_replace_callback +calls and one preg_replace call for every bit of HTML passed through here. +It is not necessary and will have no effect for PHP 4. +This directive has been available since 2.1.0. +'); + /** * Forgivingly lexes HTML (SGML-style) markup into tokens. * @@ -55,11 +120,87 @@ HTMLPurifier_ConfigSchema::define( class HTMLPurifier_Lexer { + // -- STATIC ---------------------------------------------------------- + + /** + * Retrieves or sets the default Lexer as a Prototype Factory. + * + * Depending on what PHP version you are running, the abstract base + * Lexer class will determine which concrete Lexer is best for you: + * HTMLPurifier_Lexer_DirectLex for PHP 4, and HTMLPurifier_Lexer_DOMLex + * for PHP 5 and beyond. This general rule has a few exceptions to it + * involving special features that only DirectLex implements. + * + * @static + * + * @note The behavior of this class has changed, rather than accepting + * a prototype object, it now accepts a configuration object. + * To specify your own prototype, set %Core.LexerImpl to it. + * This change in behavior de-singletonizes the lexer object. + * + * @note In PHP4, it is possible to call this factory method from + * subclasses, such usage is not recommended and not + * forwards-compatible. + * + * @param $prototype Optional prototype lexer or configuration object + * @return Concrete lexer. 
+ */ + function create($config) { + + if (!is_a($config, 'HTMLPurifier_Config')) { + $lexer = $config; + trigger_error("Passing a prototype to + HTMLPurifier_Lexer::create() is deprecated, please instead + use %Core.LexerImpl", E_USER_WARNING); + } else { + $lexer = $config->get('Core', 'LexerImpl'); + } + + if (is_object($lexer)) { + return $lexer; + } + + if (is_null($lexer)) { do { + // auto-detection algorithm + + // once PHP DOM implements native line numbers, or we + // hack out something using XSLT, remove this stipulation + $line_numbers = $config->get('Core', 'MaintainLineNumbers'); + if ( + $line_numbers === true || + ($line_numbers === null && $config->get('Core', 'CollectErrors')) + ) { + $lexer = 'DirectLex'; + break; + } + + if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5 + class_exists('DOMDocument')) { // check for DOM support + $lexer = 'DOMLex'; + } else { + $lexer = 'DirectLex'; + } + + } while(0); } // do..while so we can break + + // instantiate recognized string names + switch ($lexer) { + case 'DOMLex': + return new HTMLPurifier_Lexer_DOMLex(); + case 'DirectLex': + return new HTMLPurifier_Lexer_DirectLex(); + default: + trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR); + } + + } + + // -- CONVENIENCE MEMBERS --------------------------------------------- + function HTMLPurifier_Lexer() { $this->_entity_parser = new HTMLPurifier_EntityParser(); } - /** * Most common entity to raw value conversion table for special entities. * @protected @@ -123,46 +264,6 @@ class HTMLPurifier_Lexer trigger_error('Call to abstract class', E_USER_ERROR); } - /** - * Retrieves or sets the default Lexer as a Prototype Factory. - * - * Depending on what PHP version you are running, the abstract base - * Lexer class will determine which concrete Lexer is best for you: - * HTMLPurifier_Lexer_DirectLex for PHP 4, and HTMLPurifier_Lexer_DOMLex - * for PHP 5 and beyond. 
- * - * Passing the optional prototype lexer parameter will override the - * default with your own implementation. A copy/reference of the prototype - * lexer will now be returned when you request a new lexer. - * - * @static - * - * @note - * Though it is possible to call this factory method from subclasses, - * such usage is not recommended. - * - * @param $prototype Optional prototype lexer. - * @return Concrete lexer. - */ - function create($prototype = null) { - // we don't really care if it's a reference or a copy - static $lexer = null; - if ($prototype) { - $lexer = $prototype; - } - if (empty($lexer)) { - if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5 - class_exists('DOMDocument')) { // check for DOM support - require_once 'HTMLPurifier/Lexer/DOMLex.php'; - $lexer = new HTMLPurifier_Lexer_DOMLex(); - } else { - require_once 'HTMLPurifier/Lexer/DirectLex.php'; - $lexer = new HTMLPurifier_Lexer_DirectLex(); - } - } - return $lexer; - } - /** * Translates CDATA sections into regular sections (through escaping). 
* @@ -173,7 +274,18 @@ class HTMLPurifier_Lexer */ function escapeCDATA($string) { return preg_replace_callback( - '//', + '//s', + array('HTMLPurifier_Lexer', 'CDATACallback'), + $string + ); + } + + /** + * Special CDATA case that is especiall convoluted for )#si', + array('HTMLPurifier_Lexer_DirectLex', 'scriptCallback'), $html); + } + $html = $this->normalize($html, $config, $context); $cursor = 0; // our location in the text $inside_tag = false; // whether or not we're parsing the inside of a tag $array = array(); // result array + $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers'); + + if ($maintain_line_numbers === null) { + // automatically determine line numbering by checking + // if error collection is on + $maintain_line_numbers = $config->get('Core', 'CollectErrors'); + } + + if ($maintain_line_numbers) $current_line = 1; + else $current_line = false; + $context->register('CurrentLine', $current_line); + $nl = "\n"; + // how often to manually recalculate. This will ALWAYS be right, + // but it's pretty wasteful. Set to 0 to turn off + $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval'); + + $e = false; + if ($config->get('Core', 'CollectErrors')) { + $e =& $context->get('ErrorCollector'); + } + // infinite loop protection // has to be pretty big, since html docs can be big // we're allow two hundred thousand tags... more than enough? + // NOTE: this is also used for synchronization, so watch out $loops = 0; while(true) { @@ -42,10 +92,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // infinite loop protection if (++$loops > 200000) return array(); + // recalculate lines + if ( + $maintain_line_numbers && // line number tracking is on + $synchronize_interval && // synchronization is on + $cursor > 0 && // cursor is further than zero + $loops % $synchronize_interval === 0 // time to synchronize! 
+ ) { + $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor); + } + $position_next_lt = strpos($html, '<', $cursor); $position_next_gt = strpos($html, '>', $cursor); // triggers on "asdf" but not "asdf " + // special case to set up context if ($position_next_lt === $cursor) { $inside_tag = true; $cursor++; @@ -53,7 +114,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if (!$inside_tag && $position_next_lt !== false) { // We are not inside tag and there still is another tag to parse - $array[] = new + $token = new HTMLPurifier_Token_Text( $this->parseData( substr( @@ -61,6 +122,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) ) ); + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor); + } + $array[] = $token; $cursor = $position_next_lt + 1; $inside_tag = true; continue; @@ -69,7 +135,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // If we're already at the end, break if ($cursor === strlen($html)) break; // Create Text of rest of string - $array[] = new + $token = new HTMLPurifier_Token_Text( $this->parseData( substr( @@ -77,26 +143,54 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) ) ); + if ($maintain_line_numbers) $token->line = $current_line; + $array[] = $token; break; } elseif ($inside_tag && $position_next_gt !== false) { // We are in tag and it is well formed // Grab the internals of the tag $strlen_segment = $position_next_gt - $cursor; + + if ($strlen_segment < 1) { + // there's nothing to process! 
+ $token = new HTMLPurifier_Token_Text('<'); + $cursor++; + continue; + } + $segment = substr($html, $cursor, $strlen_segment); // Check if it's a comment if ( - substr($segment, 0, 3) == '!--' && - substr($segment, $strlen_segment-2, 2) == '--' + substr($segment, 0, 3) == '!--' ) { - $array[] = new + // re-determine segment length, looking for --> + $position_comment_end = strpos($html, '-->', $cursor); + if ($position_comment_end === false) { + // uh oh, we have a comment that extends to + // infinity. Can't be helped: set comment + // end position to end of string + if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment'); + $position_comment_end = strlen($html); + $end = true; + } else { + $end = false; + } + $strlen_segment = $position_comment_end - $cursor; + $segment = substr($html, $cursor, $strlen_segment); + $token = new HTMLPurifier_Token_Comment( substr( - $segment, 3, $strlen_segment - 5 + $segment, 3, $strlen_segment - 3 ) ); + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment); + } + $array[] = $token; + $cursor = $end ? $position_comment_end : $position_comment_end + 3; $inside_tag = false; - $cursor = $position_next_gt + 1; continue; } @@ -104,7 +198,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $is_end_tag = (strpos($segment,'/') === 0); if ($is_end_tag) { $type = substr($segment, 1); - $array[] = new HTMLPurifier_Token_End($type); + $token = new HTMLPurifier_Token_End($type); + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); + } + $array[] = $token; $inside_tag = false; $cursor = $position_next_gt + 1; continue; @@ -113,8 +212,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // Check leading character is alnum, if not, we may // have accidently grabbed an emoticon. 
Translate into // text and go our merry way - if (!ctype_alnum($segment[0])) { - $array[] = new + if (!ctype_alpha($segment[0])) { + // XML: $segment[0] !== '_' && $segment[0] !== ':' + if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt'); + $token = new HTMLPurifier_Token_Text( '<' . $this->parseData( @@ -122,6 +223,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) . '>' ); + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); + } + $array[] = $token; $cursor = $position_next_gt + 1; $inside_tag = false; continue; @@ -142,10 +248,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if ($position_first_space >= $strlen_segment) { if ($is_self_closing) { - $array[] = new HTMLPurifier_Token_Empty($segment); + $token = new HTMLPurifier_Token_Empty($segment); } else { - $array[] = new HTMLPurifier_Token_Start($segment); + $token = new HTMLPurifier_Token_Start($segment); + } + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); } + $array[] = $token; $inside_tag = false; $cursor = $position_next_gt + 1; continue; @@ -169,28 +280,56 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } if ($is_self_closing) { - $array[] = new HTMLPurifier_Token_Empty($type, $attr); + $token = new HTMLPurifier_Token_Empty($type, $attr); } else { - $array[] = new HTMLPurifier_Token_Start($type, $attr); + $token = new HTMLPurifier_Token_Start($type, $attr); } + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); + } + $array[] = $token; $cursor = $position_next_gt + 1; $inside_tag = false; continue; } else { - $array[] = new + // inside tag, but there's no ending > sign + if ($e) $e->send(E_WARNING, 'Lexer: Missing gt'); + $token = new 
HTMLPurifier_Token_Text( '<' . $this->parseData( substr($html, $cursor) ) ); + if ($maintain_line_numbers) $token->line = $current_line; + // no cursor scroll? Hmm... + $array[] = $token; break; } break; } + + $context->destroy('CurrentLine'); return $array; } + /** + * PHP 4 compatible substr_count that implements offset and length + */ + function substrCount($haystack, $needle, $offset, $length) { + static $oldVersion; + if ($oldVersion === null) { + $oldVersion = version_compare(PHP_VERSION, '5.1', '<'); + } + if ($oldVersion) { + $haystack = substr($haystack, $offset, $length); + return substr_count($haystack, $needle); + } else { + return substr_count($haystack, $needle, $offset, $length); + } + } + /** * Takes the inside of an HTML tag and makes an assoc array of attributes. * @@ -202,6 +341,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if ($string == '') return array(); // no attributes + $e = false; + if ($config->get('Core', 'CollectErrors')) { + $e =& $context->get('ErrorCollector'); + } + // let's see if we can abort as quickly as possible // one equal sign, no spaces => one attribute $num_equal = substr_count($string, '='); @@ -213,7 +357,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // only one attribute list($key, $quoted_value) = explode('=', $string); $quoted_value = trim($quoted_value); - if (!$key) return array(); + if (!$key) { + if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key'); + return array(); + } if (!$quoted_value) return array($key => ''); $first_char = @$quoted_value[0]; $last_char = @$quoted_value[strlen($quoted_value)-1]; @@ -227,11 +374,13 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } else { // not well behaved if ($open_quote) { + if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote'); $value = substr($quoted_value, 1); } else { $value = $quoted_value; } } + if ($value === false) $value = ''; return array($key => $value); } @@ -246,18 +395,19 @@ class 
HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // infinite loop protection $loops = 0; - while(true) { // infinite loop protection - if (++$loops > 1000) return array(); + if (++$loops > 1000) { + trigger_error('Infinite loop detected in attribute parsing', E_USER_WARNING); + return array(); + } if ($cursor >= $size) { break; } $cursor += ($value = strspn($string, $this->_whitespace, $cursor)); - // grab the key $key_begin = $cursor; //we're currently at the start of the key @@ -269,7 +419,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $key = substr($string, $key_begin, $key_end - $key_begin); - if (!$key) continue; // empty key + if (!$key) { + if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key'); + $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop + continue; // empty key + } // scroll past all whitespace $cursor += strspn($string, $this->_whitespace, $cursor); @@ -289,6 +443,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $cursor++; $cursor += strspn($string, $this->_whitespace, $cursor); + if ($cursor === false) { + $array[$key] = ''; + break; + } + // we might be in front of a quote right now $char = @$string[$cursor]; @@ -306,7 +465,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $value_end = $cursor; } + // we reached a premature end + if ($cursor === false) { + $cursor = $size; + $value_end = $cursor; + } + $value = substr($string, $value_begin, $value_end - $value_begin); + if ($value === false) $value = ''; $array[$key] = $this->parseData($value); $cursor++; @@ -314,6 +480,9 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // boolattr if ($key !== '') { $array[$key] = $key; + } else { + // purely theoretical + if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key'); } } @@ -323,4 +492,3 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } -?> \ No newline at end of file diff --git 
a/lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php b/lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php index 18777ef7e8..3888229b07 100644 --- a/lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php +++ b/lib/htmlpurifier/HTMLPurifier/Lexer/PEARSax3.php @@ -107,4 +107,3 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/PercentEncoder.php b/lib/htmlpurifier/HTMLPurifier/PercentEncoder.php index 7a12caaa76..e32421e1c1 100644 --- a/lib/htmlpurifier/HTMLPurifier/PercentEncoder.php +++ b/lib/htmlpurifier/HTMLPurifier/PercentEncoder.php @@ -44,4 +44,3 @@ class HTMLPurifier_PercentEncoder } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Printer.php b/lib/htmlpurifier/HTMLPurifier/Printer.php index 14135fd8db..7e20daafe3 100644 --- a/lib/htmlpurifier/HTMLPurifier/Printer.php +++ b/lib/htmlpurifier/HTMLPurifier/Printer.php @@ -4,6 +4,8 @@ require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/Token.php'; require_once 'HTMLPurifier/Encoder.php'; +// OUT OF DATE, NEEDS UPDATING! 
+ class HTMLPurifier_Printer { @@ -24,11 +26,21 @@ class HTMLPurifier_Printer $this->generator = new HTMLPurifier_Generator(); } + /** + * Give generator necessary configuration if possible + */ + function prepareGenerator($config) { + // hack for smoketests/configForm.php + if (empty($config->conf['HTML'])) return; + $context = new HTMLPurifier_Context(); + $this->generator->generateFromTokens(array(), $config, $context); + } + /** * Main function that renders object or aspect of that object - * @param $config Configuration object + * @note Parameters vary depending on printer */ - function render($config) {} + // function render() {} /** * Returns a start tag @@ -64,6 +76,18 @@ class HTMLPurifier_Printer $this->end($tag); } + function elementEmpty($tag, $attr = array()) { + return $this->generator->generateFromToken( + new HTMLPurifier_Token_Empty($tag, $attr) + ); + } + + function text($text) { + return $this->generator->generateFromToken( + new HTMLPurifier_Token_Text($text) + ); + } + /** * Prints a simple key/value row in a table. 
* @param $name Key @@ -146,4 +170,3 @@ class HTMLPurifier_Printer } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Printer/CSSDefinition.php b/lib/htmlpurifier/HTMLPurifier/Printer/CSSDefinition.php index 7745f5f444..7d3ad61e98 100644 --- a/lib/htmlpurifier/HTMLPurifier/Printer/CSSDefinition.php +++ b/lib/htmlpurifier/HTMLPurifier/Printer/CSSDefinition.php @@ -37,4 +37,3 @@ class HTMLPurifier_Printer_CSSDefinition extends HTMLPurifier_Printer } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Printer/ConfigForm.css b/lib/htmlpurifier/HTMLPurifier/Printer/ConfigForm.css new file mode 100644 index 0000000000..0653bbb0c0 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/Printer/ConfigForm.css @@ -0,0 +1,8 @@ + +.hp-config {} + +.hp-config tbody th {text-align:right; padding-right:0.5em;} +.hp-config thead, .hp-config .namespace {background:#3C578C; color:#FFF;} +.hp-config .namespace th {text-align:center;} +.hp-config .verbose {display:none;} +.hp-config .controls {text-align:center;} diff --git a/lib/htmlpurifier/HTMLPurifier/Printer/ConfigForm.js b/lib/htmlpurifier/HTMLPurifier/Printer/ConfigForm.js new file mode 100644 index 0000000000..119ca4a04d --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/Printer/ConfigForm.js @@ -0,0 +1,3 @@ +function toggleWriteability(id_of_patient, checked) { + document.getElementById(id_of_patient).disabled = checked; +} \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Printer/ConfigForm.php b/lib/htmlpurifier/HTMLPurifier/Printer/ConfigForm.php new file mode 100644 index 0000000000..31da35f8ac --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/Printer/ConfigForm.php @@ -0,0 +1,320 @@ +docURL = $doc_url; + $this->name = $name; + $this->compress = $compress; + $this->fields['default'] = new HTMLPurifier_Printer_ConfigForm_default(); + $this->fields['bool'] = new HTMLPurifier_Printer_ConfigForm_bool(); + } + + /** + * @param $cols Integer columns of textarea, 
null to use default + * @param $rows Integer rows of textarea, null to use default + */ + function setTextareaDimensions($cols = null, $rows = null) { + if ($cols) $this->fields['default']->cols = $cols; + if ($rows) $this->fields['default']->rows = $rows; + } + + /** + * Retrieves styling, in case the directory it's in is not publically + * available + */ + function getCSS() { + return file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/Printer/ConfigForm.css'); + } + + /** + * Retrieves JavaScript, in case directory is not public + */ + function getJavaScript() { + return file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/Printer/ConfigForm.js'); + } + + /** + * Returns HTML output for a configuration form + * @param $config Configuration object of current form state + * @param $allowed Optional namespace(s) and directives to restrict form to. + */ + function render($config, $allowed = true, $render_controls = true) { + $this->config = $config; + $this->prepareGenerator($config); + + $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed); + $all = array(); + foreach ($allowed as $key) { + list($ns, $directive) = $key; + $all[$ns][$directive] = $config->get($ns, $directive); + } + + $ret = ''; + $ret .= $this->start('table', array('class' => 'hp-config')); + $ret .= $this->start('thead'); + $ret .= $this->start('tr'); + $ret .= $this->element('th', 'Directive'); + $ret .= $this->element('th', 'Value'); + $ret .= $this->end('tr'); + $ret .= $this->end('thead'); + foreach ($all as $ns => $directives) { + $ret .= $this->renderNamespace($ns, $directives); + } + if ($render_controls) { + $ret .= $this->start('tfoot'); + $ret .= $this->start('tr'); + $ret .= $this->start('td', array('colspan' => 2, 'class' => 'controls')); + $ret .= $this->elementEmpty('input', array('type' => 'Submit', 'value' => 'Submit')); + $ret .= '[Reset]'; + $ret .= $this->end('td'); + $ret .= $this->end('tr'); + $ret .= $this->end('tfoot'); + } + $ret .= 
$this->end('table'); + return $ret; + } + + /** + * Renders a single namespace + * @param $ns String namespace name + * @param $directive Associative array of directives to values + * @protected + */ + function renderNamespace($ns, $directives) { + $ret = ''; + $ret .= $this->start('tbody', array('class' => 'namespace')); + $ret .= $this->start('tr'); + $ret .= $this->element('th', $ns, array('colspan' => 2)); + $ret .= $this->end('tr'); + $ret .= $this->end('tbody'); + $ret .= $this->start('tbody'); + foreach ($directives as $directive => $value) { + $ret .= $this->start('tr'); + $ret .= $this->start('th'); + if ($this->docURL) { + $url = str_replace('%s', urlencode("$ns.$directive"), $this->docURL); + $ret .= $this->start('a', array('href' => $url)); + } + $attr = array('for' => "{$this->name}:$ns.$directive"); + + // crop directive name if it's too long + if (!$this->compress || (strlen($directive) < $this->compress)) { + $directive_disp = $directive; + } else { + $directive_disp = substr($directive, 0, $this->compress - 2) . 
'...'; + $attr['title'] = $directive; + } + + $ret .= $this->element( + 'label', + $directive_disp, + // component printers must create an element with this id + $attr + ); + if ($this->docURL) $ret .= $this->end('a'); + $ret .= $this->end('th'); + + $ret .= $this->start('td'); + $def = $this->config->def->info[$ns][$directive]; + $type = $def->type; + if (!isset($this->fields[$type])) $type = 'default'; + $type_obj = $this->fields[$type]; + if ($def->allow_null) { + $type_obj = new HTMLPurifier_Printer_ConfigForm_NullDecorator($type_obj); + } + $ret .= $type_obj->render($ns, $directive, $value, $this->name, $this->config); + $ret .= $this->end('td'); + $ret .= $this->end('tr'); + } + $ret .= $this->end('tbody'); + return $ret; + } + +} + +/** + * Printer decorator for directives that accept null + */ +class HTMLPurifier_Printer_ConfigForm_NullDecorator extends HTMLPurifier_Printer { + /** + * Printer being decorated + */ + var $obj; + /** + * @param $obj Printer to decorate + */ + function HTMLPurifier_Printer_ConfigForm_NullDecorator($obj) { + parent::HTMLPurifier_Printer(); + $this->obj = $obj; + } + function render($ns, $directive, $value, $name, $config) { + $this->prepareGenerator($config); + $ret = ''; + $ret .= $this->start('label', array('for' => "$name:Null_$ns.$directive")); + $ret .= $this->element('span', "$ns.$directive:", array('class' => 'verbose')); + $ret .= $this->text(' Null/Disabled'); + $ret .= $this->end('label'); + $attr = array( + 'type' => 'checkbox', + 'value' => '1', + 'class' => 'null-toggle', + 'name' => "$name"."[Null_$ns.$directive]", + 'id' => "$name:Null_$ns.$directive", + 'onclick' => "toggleWriteability('$name:$ns.$directive',checked)" // INLINE JAVASCRIPT!!!! 
+ ); + if ($value === null) $attr['checked'] = 'checked'; + $ret .= $this->elementEmpty('input', $attr); + $ret .= $this->text(' or '); + $ret .= $this->elementEmpty('br'); + $ret .= $this->obj->render($ns, $directive, $value, $name, $config); + return $ret; + } +} + +/** + * Swiss-army knife configuration form field printer + */ +class HTMLPurifier_Printer_ConfigForm_default extends HTMLPurifier_Printer { + var $cols = 18; + var $rows = 5; + function render($ns, $directive, $value, $name, $config) { + $this->prepareGenerator($config); + // this should probably be split up a little + $ret = ''; + $def = $config->def->info[$ns][$directive]; + if (is_array($value)) { + switch ($def->type) { + case 'lookup': + $array = $value; + $value = array(); + foreach ($array as $val => $b) { + $value[] = $val; + } + case 'list': + $value = implode(PHP_EOL, $value); + break; + case 'hash': + $nvalue = ''; + foreach ($value as $i => $v) { + $nvalue .= "$i:$v" . PHP_EOL; + } + $value = $nvalue; + break; + default: + $value = ''; + } + } + if ($def->type === 'mixed') { + return 'Not supported'; + $value = serialize($value); + } + $attr = array( + 'name' => "$name"."[$ns.$directive]", + 'id' => "$name:$ns.$directive" + ); + if ($value === null) $attr['disabled'] = 'disabled'; + if (is_array($def->allowed)) { + $ret .= $this->start('select', $attr); + foreach ($def->allowed as $val => $b) { + $attr = array(); + if ($value == $val) $attr['selected'] = 'selected'; + $ret .= $this->element('option', $val, $attr); + } + $ret .= $this->end('select'); + } elseif ( + $def->type == 'text' || $def->type == 'itext' || + $def->type == 'list' || $def->type == 'hash' || $def->type == 'lookup' + ) { + $attr['cols'] = $this->cols; + $attr['rows'] = $this->rows; + $ret .= $this->start('textarea', $attr); + $ret .= $this->text($value); + $ret .= $this->end('textarea'); + } else { + $attr['value'] = $value; + $attr['type'] = 'text'; + $ret .= $this->elementEmpty('input', $attr); + } + return $ret; + } 
+} + +/** + * Bool form field printer + */ +class HTMLPurifier_Printer_ConfigForm_bool extends HTMLPurifier_Printer { + function render($ns, $directive, $value, $name, $config) { + $this->prepareGenerator($config); + $ret = ''; + $ret .= $this->start('div', array('id' => "$name:$ns.$directive")); + + $ret .= $this->start('label', array('for' => "$name:Yes_$ns.$directive")); + $ret .= $this->element('span', "$ns.$directive:", array('class' => 'verbose')); + $ret .= $this->text(' Yes'); + $ret .= $this->end('label'); + + $attr = array( + 'type' => 'radio', + 'name' => "$name"."[$ns.$directive]", + 'id' => "$name:Yes_$ns.$directive", + 'value' => '1' + ); + if ($value) $attr['checked'] = 'checked'; + $ret .= $this->elementEmpty('input', $attr); + + $ret .= $this->start('label', array('for' => "$name:No_$ns.$directive")); + $ret .= $this->element('span', "$ns.$directive:", array('class' => 'verbose')); + $ret .= $this->text(' No'); + $ret .= $this->end('label'); + + $attr = array( + 'type' => 'radio', + 'name' => "$name"."[$ns.$directive]", + 'id' => "$name:No_$ns.$directive", + 'value' => '0' + ); + if (!$value) $attr['checked'] = 'checked'; + $ret .= $this->elementEmpty('input', $attr); + + $ret .= $this->end('div'); + + return $ret; + } +} + diff --git a/lib/htmlpurifier/HTMLPurifier/Printer/HTMLDefinition.php b/lib/htmlpurifier/HTMLPurifier/Printer/HTMLDefinition.php index a677c58bf6..52650c6308 100644 --- a/lib/htmlpurifier/HTMLPurifier/Printer/HTMLDefinition.php +++ b/lib/htmlpurifier/HTMLPurifier/Printer/HTMLDefinition.php @@ -15,9 +15,44 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer $this->config =& $config; $this->def = $config->getHTMLDefinition(); - $def =& $this->def; $ret .= $this->start('div', array('class' => 'HTMLPurifier_Printer')); + + $ret .= $this->renderDoctype(); + $ret .= $this->renderEnvironment(); + $ret .= $this->renderContentSets(); + $ret .= $this->renderInfo(); + + $ret .= $this->end('div'); + + return $ret; + } 
+ + /** + * Renders the Doctype table + */ + function renderDoctype() { + $doctype = $this->def->doctype; + $ret = ''; + $ret .= $this->start('table'); + $ret .= $this->element('caption', 'Doctype'); + $ret .= $this->row('Name', $doctype->name); + $ret .= $this->row('XML', $doctype->xml ? 'Yes' : 'No'); + $ret .= $this->row('Default Modules', implode($doctype->modules, ', ')); + $ret .= $this->row('Default Tidy Modules', implode($doctype->tidyModules, ', ')); + $ret .= $this->end('table'); + return $ret; + } + + + /** + * Renders environment table, which is miscellaneous info + */ + function renderEnvironment() { + $def = $this->def; + + $ret = ''; + $ret .= $this->start('table'); $ret .= $this->element('caption', 'Environment'); @@ -51,13 +86,22 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer $ret .= $this->end('tr'); $ret .= $this->end('table'); - - - $ret .= $this->renderInfo(); - - - $ret .= $this->end('div'); - + return $ret; + } + + /** + * Renders the Content Sets table + */ + function renderContentSets() { + $ret = ''; + $ret .= $this->start('table'); + $ret .= $this->element('caption', 'Content Sets'); + foreach ($this->def->info_content_sets as $name => $lookup) { + $ret .= $this->heavyHeader($name); + $ret .= $this->start('tr'); + $ret .= $this->element('td', $this->listifyTagLookup($lookup)); + $ret .= $this->end('tr'); + } return $ret; } @@ -69,15 +113,13 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer $ret .= $this->start('table'); $ret .= $this->element('caption', 'Elements ($info)'); ksort($this->def->info); - $ret .= $this->start('tr'); - $ret .= $this->element('th', 'Allowed tags', array('colspan' => 2, 'class' => 'heavy')); - $ret .= $this->end('tr'); + $ret .= $this->heavyHeader('Allowed tags', 2); $ret .= $this->start('tr'); $ret .= $this->element('td', $this->listifyTagLookup($this->def->info), array('colspan' => 2)); $ret .= $this->end('tr'); foreach ($this->def->info as $name => $def) { 
$ret .= $this->start('tr'); - $ret .= $this->element('th', "<$name>", array('class'=>'heavy', 'colspan' => 2)); + $ret .= $this->element('th', "<$name>" . ($def->safe ? '' : ' (unsafe)'), array('class'=>'heavy' . ($def->safe ? '' : ' unsafe'), 'colspan' => 2)); $ret .= $this->end('tr'); $ret .= $this->start('tr'); $ret .= $this->element('th', 'Inline content'); @@ -109,9 +151,13 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer } $ret .= $this->start('tr'); $ret .= $this->element('th', 'Allowed attributes'); - $ret .= $this->element('td',$this->listifyAttr($def->attr),0,0); + $ret .= $this->element('td',$this->listifyAttr($def->attr), array(), 0); $ret .= $this->end('tr'); + if (!empty($def->required_attr)) { + $ret .= $this->row('Required attributes', $this->listify($def->required_attr)); + } + $ret .= $this->renderChildren($def->child); } $ret .= $this->end('table'); @@ -154,6 +200,11 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer 'Inline: ' . $this->escape($this->listifyTagLookup($def->inline->elements)),0,0); + } elseif ($def->type == 'custom') { + + $ret .= $this->element('td', ''.ucfirst($def->type).': ' . + $def->dtd_regex); + } else { $ret .= $this->element('td', ''.ucfirst($def->type).': ' . 
@@ -205,6 +256,16 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer return $this->listify($list); } + /** + * Creates a heavy header row + */ + function heavyHeader($text, $num = 1) { + $ret = ''; + $ret .= $this->start('tr'); + $ret .= $this->element('th', $text, array('colspan' => $num, 'class' => 'heavy')); + $ret .= $this->end('tr'); + return $ret; + } + } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy.php b/lib/htmlpurifier/HTMLPurifier/Strategy.php index 746b0a2d6e..a6ab7e8bca 100644 --- a/lib/htmlpurifier/HTMLPurifier/Strategy.php +++ b/lib/htmlpurifier/HTMLPurifier/Strategy.php @@ -30,4 +30,3 @@ class HTMLPurifier_Strategy } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php b/lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php index bd86874798..fcd230f472 100644 --- a/lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php +++ b/lib/htmlpurifier/HTMLPurifier/Strategy/Composite.php @@ -27,4 +27,3 @@ class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/Core.php b/lib/htmlpurifier/HTMLPurifier/Strategy/Core.php index 66e7bb3634..93d051046a 100644 --- a/lib/htmlpurifier/HTMLPurifier/Strategy/Core.php +++ b/lib/htmlpurifier/HTMLPurifier/Strategy/Core.php @@ -22,4 +22,3 @@ class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite } -?> \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/FixNesting.php b/lib/htmlpurifier/HTMLPurifier/Strategy/FixNesting.php index 08f907562f..51a14a78f4 100644 --- a/lib/htmlpurifier/HTMLPurifier/Strategy/FixNesting.php +++ b/lib/htmlpurifier/HTMLPurifier/Strategy/FixNesting.php @@ -42,16 +42,21 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy $definition = $config->getHTMLDefinition(); // insert implicit "parent" node, will be removed at end. - // ! 
we might want to move this to configuration // DEFINITION CALL $parent_name = $definition->info_parent; array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name)); $tokens[] = new HTMLPurifier_Token_End($parent_name); - // setup the context variables - $is_inline = false; // reference var that we alter + // setup the context variable 'IsInline', for chameleon processing + // is 'false' when we are not inline, 'true' when it must always + // be inline, and an integer when it is inline for a certain + // branch of the document tree + $is_inline = $definition->info_parent_def->descendants_are_inline; $context->register('IsInline', $is_inline); + // setup error collector + $e =& $context->get('ErrorCollector', true); + //####################################################################// // Loop initialization @@ -60,10 +65,16 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy $stack = array(); // stack that contains all elements that are excluded - // same structure as $stack, but it is only populated when an element - // with exclusions is processed, i.e. there won't be empty exclusions. + // it is organized by parent elements, similar to $stack, + // but it is only populated when an element with exclusions is + // processed, i.e. there won't be empty exclusions. $exclude_stack = array(); + // variable that contains the start token while we are processing + // nodes. 
This enables error reporting to do its job + $start_token = false; + $context->register('CurrentToken', $start_token); + //####################################################################// // Loop @@ -97,6 +108,8 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy // $i is index of start token // $j is index of end token + $start_token = $tokens[$i]; // to make token available via CurrentToken + //################################################################// // Gather information on parent @@ -110,7 +123,10 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy $parent_def = $definition->info[$parent_name]; } } else { - // unknown info, it won't be used anyway + // processing as if the parent were the "root" node + // unknown info, it won't be used anyway, in the future, + // we may want to enforce one element only (this is + // necessary for HTML Purifier to clean entire documents $parent_index = $parent_name = $parent_def = null; } @@ -194,6 +210,14 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy } elseif($result === false) { // remove entire node + if ($e) { + if ($excluded) { + $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded'); + } else { + $e->send(E_ERROR, 'Strategy_FixNesting: Node removed'); + } + } + // calculate length of inner tokens and current tokens $length = $j - $i + 1; @@ -207,6 +231,12 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy // current node is now the next possible start node // unless it turns out that we need to do a double-check + // this is a rought heuristic that covers 100% of HTML's + // cases and 99% of all other cases. A child definition + // that would be tricked by this would be something like: + // ( | a b c) where it's all or nothing. 
Fortunately, + // our current implementation claims that that case would + // not allow empty, even if it did if (!$parent_def->child->allow_empty) { // we need to do a double-check $i = $parent_index; @@ -222,6 +252,14 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy // calculate length of inner tokens $length = $j - $i - 1; + if ($e) { + if (empty($result) && $length) { + $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed'); + } else { + $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized'); + } + } + // perform replacement array_splice($tokens, $i + 1, $length, $result); @@ -279,6 +317,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy // remove context variables $context->destroy('IsInline'); + $context->destroy('CurrentToken'); //####################################################################// // Return @@ -289,4 +328,4 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy } -?> + diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php b/lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php index 84580d3d34..b3e8aa7453 100644 --- a/lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php +++ b/lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php @@ -4,127 +4,234 @@ require_once 'HTMLPurifier/Strategy.php'; require_once 'HTMLPurifier/HTMLDefinition.php'; require_once 'HTMLPurifier/Generator.php'; +require_once 'HTMLPurifier/Injector/AutoParagraph.php'; +require_once 'HTMLPurifier/Injector/Linkify.php'; +require_once 'HTMLPurifier/Injector/PurifierLinkify.php'; + +HTMLPurifier_ConfigSchema::define( + 'AutoFormat', 'Custom', array(), 'list', ' ++ This directive can be used to add custom auto-format injectors. + Specify an array of injector names (class name minus the prefix) + or concrete implementations. Injector class must exist. This directive + has been available since 2.0.1. +
+' +); + /** * Takes tokens makes them well-formed (balance end tags, etc.) */ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy { + /** + * Locally shared variable references + * @private + */ + var $inputTokens, $inputIndex, $outputTokens, $currentNesting, + $currentInjector, $injectors; + function execute($tokens, $config, &$context) { + $definition = $config->getHTMLDefinition(); - $generator = new HTMLPurifier_Generator(); + + // CurrentNesting + $this->currentNesting = array(); + $context->register('CurrentNesting', $this->currentNesting); + + // InputIndex + $this->inputIndex = false; + $context->register('InputIndex', $this->inputIndex); + + // InputTokens + $context->register('InputTokens', $tokens); + $this->inputTokens =& $tokens; + + // OutputTokens $result = array(); - $current_nesting = array(); + $this->outputTokens =& $result; + + // %Core.EscapeInvalidTags $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags'); - foreach ($tokens as $token) { - if (empty( $token->is_tag )) { - $result[] = $token; - continue; + $generator = new HTMLPurifier_Generator(); + + $e =& $context->get('ErrorCollector', true); + + // -- begin INJECTOR -- + + $this->injectors = array(); + + $injectors = $config->getBatch('AutoFormat'); + $custom_injectors = $injectors['Custom']; + unset($injectors['Custom']); // special case + foreach ($injectors as $injector => $b) { + $injector = "HTMLPurifier_Injector_$injector"; + if (!$b) continue; + $this->injectors[] = new $injector; + } + foreach ($custom_injectors as $injector) { + if (is_string($injector)) { + $injector = "HTMLPurifier_Injector_$injector"; + $injector = new $injector; } + $this->injectors[] = $injector; + } + + // array index of the injector that resulted in an array + // substitution. 
This enables processTokens() to know which + // injectors are affected by the added tokens and which are + // not (namely, the ones after the current injector are not + // affected) + $this->currentInjector = false; + + // give the injectors references to the definition and context + // variables for performance reasons + foreach ($this->injectors as $i => $x) { + $error = $this->injectors[$i]->prepare($config, $context); + if (!$error) continue; + list($injector) = array_splice($this->injectors, $i, 1); + $name = $injector->name; + trigger_error("Cannot enable $name injector because $error is not allowed", E_USER_WARNING); + } + + // -- end INJECTOR -- + + $token = false; + $context->register('CurrentToken', $token); + + for ($this->inputIndex = 0; isset($tokens[$this->inputIndex]); $this->inputIndex++) { - // DEFINITION CALL - $info = $definition->info[$token->name]->child; + // if all goes well, this token will be passed through unharmed + $token = $tokens[$this->inputIndex]; - // test if it claims to be a start tag but is empty - if ($info->type == 'empty' && - $token->type == 'start' ) { - - $result[] = new HTMLPurifier_Token_Empty($token->name, - $token->attr); - continue; + foreach ($this->injectors as $i => $x) { + if ($x->skip > 0) $this->injectors[$i]->skip--; } - // test if it claims to be empty but really is a start tag - if ($info->type != 'empty' && - $token->type == 'empty' ) { - - $result[] = new HTMLPurifier_Token_Start($token->name, - $token->attr); - $result[] = new HTMLPurifier_Token_End($token->name); - + // quick-check: if it's not a tag, no need to process + if (empty( $token->is_tag )) { + if ($token->type === 'text') { + // injector handler code; duplicated for performance reasons + foreach ($this->injectors as $i => $x) { + if (!$x->skip) $x->handleText($token); + if (is_array($token)) { + $this->currentInjector = $i; + break; + } + } + } + $this->processToken($token, $config, $context); continue; } - // automatically insert empty tags - 
if ($token->type == 'empty') { - $result[] = $token; - continue; - } + $info = $definition->info[$token->name]->child; - // we give start tags precedence, so automatically accept unless... - // it's one of those special cases - if ($token->type == 'start') { + // quick tag checks: anything that's *not* an end tag + $ok = false; + if ($info->type == 'empty' && $token->type == 'start') { + // test if it claims to be a start tag but is empty + $token = new HTMLPurifier_Token_Empty($token->name, $token->attr); + $ok = true; + } elseif ($info->type != 'empty' && $token->type == 'empty' ) { + // claims to be empty but really is a start tag + $token = array( + new HTMLPurifier_Token_Start($token->name, $token->attr), + new HTMLPurifier_Token_End($token->name) + ); + $ok = true; + } elseif ($token->type == 'empty') { + // real empty token + $ok = true; + } elseif ($token->type == 'start') { + // start tag - // if there's a parent, check for special case - if (!empty($current_nesting)) { + // ...unless they also have to close their parent + if (!empty($this->currentNesting)) { - $parent = array_pop($current_nesting); - $parent_name = $parent->name; - $parent_info = $definition->info[$parent_name]; + $parent = array_pop($this->currentNesting); + $parent_info = $definition->info[$parent->name]; - if (isset($parent_info->auto_close[$token->name])) { - $result[] = new HTMLPurifier_Token_End($parent_name); + // this can be replaced with a more general algorithm: + // if the token is not allowed by the parent, auto-close + // the parent + if (!isset($parent_info->child->elements[$token->name])) { + if ($e) $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent); + // close the parent, then append the token + $result[] = new HTMLPurifier_Token_End($parent->name); $result[] = $token; - $current_nesting[] = $token; + $this->currentNesting[] = $token; continue; } - $current_nesting[] = $parent; // undo the pop + $this->currentNesting[] = $parent; // undo the pop } - - 
$result[] = $token; - $current_nesting[] = $token; + $ok = true; + } + + // injector handler code; duplicated for performance reasons + if ($ok) { + foreach ($this->injectors as $i => $x) { + if (!$x->skip) $x->handleElement($token); + if (is_array($token)) { + $this->currentInjector = $i; + break; + } + } + $this->processToken($token, $config, $context); continue; } - // sanity check + // sanity check: we should be dealing with a closing tag if ($token->type != 'end') continue; - // okay, we're dealing with a closing tag - // make sure that we have something open - if (empty($current_nesting)) { + if (empty($this->currentNesting)) { if ($escape_invalid_tags) { + if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text'); $result[] = new HTMLPurifier_Token_Text( $generator->generateFromToken($token, $config, $context) ); + } elseif ($e) { + $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed'); } continue; } // first, check for the simplest case: everything closes neatly - - // current_nesting is modified - $current_parent = array_pop($current_nesting); + $current_parent = array_pop($this->currentNesting); if ($current_parent->name == $token->name) { $result[] = $token; continue; } - // undo the array_pop - $current_nesting[] = $current_parent; - // okay, so we're trying to close the wrong tag - // scroll back the entire nest, trying to find our tag - // feature could be to specify how far you'd like to go - $size = count($current_nesting); + // undo the pop previous pop + $this->currentNesting[] = $current_parent; + + // scroll back the entire nest, trying to find our tag. 
+ // (feature could be to specify how far you'd like to go) + $size = count($this->currentNesting); // -2 because -1 is the last element, but we already checked that $skipped_tags = false; for ($i = $size - 2; $i >= 0; $i--) { - if ($current_nesting[$i]->name == $token->name) { + if ($this->currentNesting[$i]->name == $token->name) { // current nesting is modified - $skipped_tags = array_splice($current_nesting, $i); + $skipped_tags = array_splice($this->currentNesting, $i); break; } } - // we still didn't find the tag, so translate to text + // we still didn't find the tag, so remove if ($skipped_tags === false) { if ($escape_invalid_tags) { $result[] = new HTMLPurifier_Token_Text( $generator->generateFromToken($token, $config, $context) ); + if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text'); + } elseif ($e) { + $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed'); } continue; } @@ -132,27 +239,68 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // okay, we found it, close all the skipped tags // note that skipped tags contains the element we need closed $size = count($skipped_tags); - for ($i = $size - 1; $i >= 0; $i--) { + for ($i = $size - 1; $i > 0; $i--) { + if ($e && !isset($skipped_tags[$i]->armor['MakeWellFormed_TagClosedError'])) { + $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$i]); + } $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name); } - // done! 
+ $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name); } - // we're at the end now, fix all still unclosed tags + $context->destroy('CurrentNesting'); + $context->destroy('InputTokens'); + $context->destroy('InputIndex'); + $context->destroy('CurrentToken'); - if (!empty($current_nesting)) { - $size = count($current_nesting); + // we're at the end now, fix all still unclosed tags + // not using processToken() because at this point we don't + // care about current nesting + if (!empty($this->currentNesting)) { + $size = count($this->currentNesting); for ($i = $size - 1; $i >= 0; $i--) { + if ($e && !isset($this->currentNesting[$i]->armor['MakeWellFormed_TagClosedError'])) { + $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $this->currentNesting[$i]); + } $result[] = - new HTMLPurifier_Token_End($current_nesting[$i]->name); + new HTMLPurifier_Token_End($this->currentNesting[$i]->name); } } + unset($this->outputTokens, $this->injectors, $this->currentInjector, + $this->currentNesting, $this->inputTokens, $this->inputIndex); + return $result; } + function processToken($token, $config, &$context) { + if (is_array($token)) { + // the original token was overloaded by an injector, time + // to some fancy acrobatics + + // $this->inputIndex is decremented so that the entire set gets + // re-processed + array_splice($this->inputTokens, $this->inputIndex--, 1, $token); + + // adjust the injector skips based on the array substitution + if ($this->injectors) { + $offset = count($token) + 1; + for ($i = 0; $i <= $this->currentInjector; $i++) { + $this->injectors[$i]->skip += $offset; + } + } + } elseif ($token) { + // regular case + $this->outputTokens[] = $token; + if ($token->type == 'start') { + $this->currentNesting[] = $token; + } elseif ($token->type == 'end') { + array_pop($this->currentNesting); // not actually used + } + } + } + } -?> \ No newline at end of file diff --git 
a/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php b/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php index cb5c4dd1b3..2c280b23d7 100644 --- a/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php +++ b/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php @@ -5,12 +5,43 @@ require_once 'HTMLPurifier/HTMLDefinition.php'; require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/TagTransform.php'; +require_once 'HTMLPurifier/AttrValidator.php'; + +HTMLPurifier_ConfigSchema::define( + 'Core', 'RemoveInvalidImg', true, 'bool', ' +
+ This directive enables pre-emptive URI checking in img
+ tags, as the attribute validation strategy is not authorized to
+ remove elements from the document. This directive has been available
+ since 1.3.0, revert to pre-1.3.0 behavior by setting to false.
+
+ This directive enables HTML Purifier to remove not only script tags + but all of their contents. This directive has been deprecated since 2.1.0, + and when not set the value of %Core.HiddenElements will take + precedence. This directive has been available since 2.0.0, and can be used to + revert to pre-2.0.0 behavior by setting it to false. +
+' +); + HTMLPurifier_ConfigSchema::define( - 'Core', 'RemoveInvalidImg', true, 'bool', - 'This directive enables pre-emptive URI checking inimg
'.
- 'tags, as the attribute validation strategy is not authorized to '.
- 'remove elements from the document. This directive has been available '.
- 'since 1.3.0, revert to pre-1.3.0 behavior by setting to false.'
+ 'Core', 'HiddenElements', array('script' => true, 'style' => true), 'lookup', '
+
+ This directive is a lookup array of elements which should have their
+ contents removed when they are not allowed by the HTML definition.
+ For example, the contents of a script
tag are not
+ normally shown in a document, so if script tags are to be removed,
+ their contents should be removed too. This is opposed to a b
+ tag, which defines some presentational changes but does not hide its
+ contents.
+
to
- foreach ($definition->info[$token->name]->attr_transform_pre
- as $transform
- ) {
- $attr = $transform->transform($attr, $config, $context);
- }
+ // skip tokens that are armored
+ if (!empty($token->armor['ValidateAttributes'])) continue;
- // create alias to this element's attribute definition array, see
- // also $d_defs (global attribute definition array)
- // DEFINITION CALL
- $defs = $definition->info[$token->name]->attr;
+ // note that we have no facilities here for removing tokens
+ $validator->validateToken($token, $config, $context);
- // iterate through all the attribute keypairs
- // Watch out for name collisions: $key has previously been used
- foreach ($attr as $attr_key => $value) {
-
- // call the definition
- if ( isset($defs[$attr_key]) ) {
- // there is a local definition defined
- if ($defs[$attr_key] === false) {
- // We've explicitly been told not to allow this element.
- // This is usually when there's a global definition
- // that must be overridden.
- // Theoretically speaking, we could have a
- // AttrDef_DenyAll, but this is faster!
- $result = false;
- } else {
- // validate according to the element's definition
- $result = $defs[$attr_key]->validate(
- $value, $config, $context
- );
- }
- } elseif ( isset($d_defs[$attr_key]) ) {
- // there is a global definition defined, validate according
- // to the global definition
- $result = $d_defs[$attr_key]->validate(
- $value, $config, $context
- );
- } else {
- // system never heard of the attribute? DELETE!
- $result = false;
- }
-
- // put the results into effect
- if ($result === false || $result === null) {
- // remove the attribute
- unset($attr[$attr_key]);
- } elseif (is_string($result)) {
- // simple substitution
- $attr[$attr_key] = $result;
- }
-
- // we'd also want slightly more complicated substitution
- // involving an array as the return value,
- // although we're not sure how colliding attributes would
- // resolve (certain ones would be completely overriden,
- // others would prepend themselves).
- }
-
- // post transforms
-
- // ex.
+ Revision identifier for your custom definition. See + %HTML.DefinitionRev for details. This directive has been available + since 2.1.0. +
+'); + +// informative URI directives + +HTMLPurifier_ConfigSchema::define( + 'URI', 'DefaultScheme', 'http', 'string', ' ++ Defines through what scheme the output will be served, in order to + select the proper object validator when no scheme information is present. +
+'); + +HTMLPurifier_ConfigSchema::define( + 'URI', 'Host', null, 'string/null', ' ++ Defines the domain name of the server, so we can determine whether or + an absolute URI is from your website or not. Not strictly necessary, + as users should be using relative URIs to reference resources on your + website. It will, however, let you use absolute URIs to link to + subdomains of the domain you post here: i.e. example.com will allow + sub.example.com. However, higher up domains will still be excluded: + if you set %URI.Host to sub.example.com, example.com will be blocked. + Note: This directive overrides %URI.Base because + a given page may be on a sub-domain, but you wish HTML Purifier to be + more relaxed and allow some of the parent domains too. + This directive has been available since 1.2.0. +
+'); + +HTMLPurifier_ConfigSchema::define( + 'URI', 'Base', null, 'string/null', ' ++ The base URI is the URI of the document this purified HTML will be + inserted into. This information is important if HTML Purifier needs + to calculate absolute URIs from relative URIs, such as when %URI.MakeAbsolute + is on. You may use a non-absolute URI for this value, but behavior + may vary (%URI.MakeAbsolute deals nicely with both absolute and + relative paths, but forwards-compatibility is not guaranteed). + Warning: If set, the scheme on this URI + overrides the one specified by %URI.DefaultScheme. This directive has + been available since 2.1.0. +
+'); + +class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition +{ + + var $type = 'URI'; + var $filters = array(); + var $registeredFilters = array(); + + /** + * HTMLPurifier_URI object of the base specified at %URI.Base + */ + var $base; + + /** + * String host to consider "home" base + */ + var $host; + + /** + * Name of default scheme based on %URI.DefaultScheme and %URI.Base + */ + var $defaultScheme; + + function HTMLPurifier_URIDefinition() { + $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal()); + $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources()); + $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist()); + $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute()); + } + + function registerFilter($filter) { + $this->registeredFilters[$filter->name] = $filter; + } + + function addFilter($filter, $config) { + $filter->prepare($config); + $this->filters[$filter->name] = $filter; + } + + function doSetup($config) { + $this->setupMemberVariables($config); + $this->setupFilters($config); + } + + function setupFilters($config) { + foreach ($this->registeredFilters as $name => $filter) { + $conf = $config->get('URI', $name); + if ($conf !== false && $conf !== null) { + $this->addFilter($filter, $config); + } + } + unset($this->registeredFilters); + } + + function setupMemberVariables($config) { + $this->host = $config->get('URI', 'Host'); + $base_uri = $config->get('URI', 'Base'); + if (!is_null($base_uri)) { + $parser = new HTMLPurifier_URIParser(); + $this->base = $parser->parse($base_uri); + $this->defaultScheme = $this->base->scheme; + if (is_null($this->host)) $this->host = $this->base->host; + } + if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI', 'DefaultScheme'); + } + + function filter(&$uri, $config, &$context) { + foreach ($this->filters as $name => $x) { + $result = $this->filters[$name]->filter($uri, $config, $context); + if (!$result) return false; + } + 
return true; + } + +} diff --git a/lib/htmlpurifier/HTMLPurifier/URIFilter.php b/lib/htmlpurifier/HTMLPurifier/URIFilter.php new file mode 100644 index 0000000000..e0066f3bf0 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/URIFilter.php @@ -0,0 +1,24 @@ +get('URI', 'Host'); + if ($our_host !== null) $this->ourHostParts = array_reverse(explode('.', $our_host)); + } + function filter(&$uri, $config, &$context) { + if (is_null($uri->host)) return true; + if ($this->ourHostParts === false) return false; + $host_parts = array_reverse(explode('.', $uri->host)); + foreach ($this->ourHostParts as $i => $x) { + if (!isset($host_parts[$i])) return false; + if ($host_parts[$i] != $this->ourHostParts[$i]) return false; + } + return true; + } +} + diff --git a/lib/htmlpurifier/HTMLPurifier/URIFilter/DisableExternalResources.php b/lib/htmlpurifier/HTMLPurifier/URIFilter/DisableExternalResources.php new file mode 100644 index 0000000000..dc00e74110 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/URIFilter/DisableExternalResources.php @@ -0,0 +1,26 @@ +get('EmbeddedURI', true)) return true; + return parent::filter($uri, $config, $context); + } +} + diff --git a/lib/htmlpurifier/HTMLPurifier/URIFilter/HostBlacklist.php b/lib/htmlpurifier/HTMLPurifier/URIFilter/HostBlacklist.php new file mode 100644 index 0000000000..d3429d5cbf --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/URIFilter/HostBlacklist.php @@ -0,0 +1,28 @@ +moo.com will catch moo.com.example.com. '. + 'This directive has been available since 1.3.0.' 
+); + +class HTMLPurifier_URIFilter_HostBlacklist extends HTMLPurifier_URIFilter +{ + var $name = 'HostBlacklist'; + var $blacklist = array(); + function prepare($config) { + $this->blacklist = $config->get('URI', 'HostBlacklist'); + } + function filter(&$uri, $config, &$context) { + foreach($this->blacklist as $blacklisted_host_fragment) { + if (strpos($uri->host, $blacklisted_host_fragment) !== false) { + return false; + } + } + return true; + } +} diff --git a/lib/htmlpurifier/HTMLPurifier/URIFilter/MakeAbsolute.php b/lib/htmlpurifier/HTMLPurifier/URIFilter/MakeAbsolute.php new file mode 100644 index 0000000000..9935dc6ee9 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/URIFilter/MakeAbsolute.php @@ -0,0 +1,115 @@ + + Converts all URIs into absolute forms. This is useful when the HTML + being filtered assumes a specific base path, but will actually be + viewed in a different context (and setting an alternate base URI is + not possible). %URI.Base must be set for this directive to work. + This directive has been available since 2.1.0. 
+ +'); + +class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter +{ + var $name = 'MakeAbsolute'; + var $base; + var $basePathStack = array(); + function prepare($config) { + $def = $config->getDefinition('URI'); + $this->base = $def->base; + if (is_null($this->base)) { + trigger_error('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration', E_USER_ERROR); + return; + } + $this->base->fragment = null; // fragment is invalid for base URI + $stack = explode('/', $this->base->path); + array_pop($stack); // discard last segment + $stack = $this->_collapseStack($stack); // do pre-parsing + $this->basePathStack = $stack; + } + function filter(&$uri, $config, &$context) { + if (is_null($this->base)) return true; // abort early + if ( + $uri->path === '' && is_null($uri->scheme) && + is_null($uri->host) && is_null($uri->query) && is_null($uri->fragment) + ) { + // reference to current document + $uri = $this->base->copy(); + return true; + } + if (!is_null($uri->scheme)) { + // absolute URI already: don't change + if (!is_null($uri->host)) return true; + $scheme_obj = $uri->getSchemeObj($config, $context); + if (!$scheme_obj->hierarchical) { + // non-hierarchal URI with explicit scheme, don't change + return true; + } + // special case: had a scheme but always is hierarchical and had no authority + } + if (!is_null($uri->host)) { + // network path, don't bother + return true; + } + if ($uri->path === '') { + $uri->path = $this->base->path; + }elseif ($uri->path[0] !== '/') { + // relative path, needs more complicated processing + $stack = explode('/', $uri->path); + $new_stack = array_merge($this->basePathStack, $stack); + $new_stack = $this->_collapseStack($new_stack); + $uri->path = implode('/', $new_stack); + } + // re-combine + $uri->scheme = $this->base->scheme; + if (is_null($uri->userinfo)) $uri->userinfo = $this->base->userinfo; + if (is_null($uri->host)) $uri->host = $this->base->host; + if (is_null($uri->port)) 
$uri->port = $this->base->port; + return true; + } + + /** + * Resolve dots and double-dots in a path stack + * @private + */ + function _collapseStack($stack) { + $result = array(); + for ($i = 0; isset($stack[$i]); $i++) { + $is_folder = false; + // absorb an internally duplicated slash + if ($stack[$i] == '' && $i && isset($stack[$i+1])) continue; + if ($stack[$i] == '..') { + if (!empty($result)) { + $segment = array_pop($result); + if ($segment === '' && empty($result)) { + // error case: attempted to back out too far: + // restore the leading slash + $result[] = ''; + } elseif ($segment === '..') { + $result[] = '..'; // cannot remove .. with .. + } + } else { + // relative path, preserve the double-dots + $result[] = '..'; + } + $is_folder = true; + continue; + } + if ($stack[$i] == '.') { + // silently absorb + $is_folder = true; + continue; + } + $result[] = $stack[$i]; + } + if ($is_folder) $result[] = ''; + return $result; + } +} + diff --git a/lib/htmlpurifier/HTMLPurifier/URIParser.php b/lib/htmlpurifier/HTMLPurifier/URIParser.php new file mode 100644 index 0000000000..dff7e28ef8 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/URIParser.php @@ -0,0 +1,62 @@ +\'"]+):)?'. // 2. Scheme + '(//([^/?#<>\'"]*))?'. // 4. Authority + '([^?#<>\'"]*)'. // 5. Path + '(\?([^#<>\'"]*))?'. // 7. Query + '(#([^<>\'"]*))?'. // 8. Fragment + '!'; + + $matches = array(); + $result = preg_match($r_URI, $uri, $matches); + + if (!$result) return false; // *really* invalid URI + + // seperate out parts + $scheme = !empty($matches[1]) ? $matches[2] : null; + $authority = !empty($matches[3]) ? $matches[4] : null; + $path = $matches[5]; // always present, can be empty + $query = !empty($matches[6]) ? $matches[7] : null; + $fragment = !empty($matches[8]) ? $matches[9] : null; + + // further parse authority + if ($authority !== null) { + // ridiculously inefficient: it's a stacked regex! 
+ $HEXDIG = '[A-Fa-f0-9]'; + $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with [] + $sub_delims = '!$&\'()'; // needs [] + $pct_encoded = "%$HEXDIG$HEXDIG"; + $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*"; + $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/"; + $matches = array(); + preg_match($r_authority, $authority, $matches); + $userinfo = !empty($matches[1]) ? $matches[2] : null; + $host = !empty($matches[3]) ? $matches[3] : ''; + $port = !empty($matches[4]) ? (int) $matches[5] : null; + } else { + $port = $host = $userinfo = null; + } + + return new HTMLPurifier_URI( + $scheme, $userinfo, $host, $port, $path, $query, $fragment); + } + +} + diff --git a/lib/htmlpurifier/HTMLPurifier/URIScheme.php b/lib/htmlpurifier/HTMLPurifier/URIScheme.php index 20a9781b48..41c02f70d2 100644 --- a/lib/htmlpurifier/HTMLPurifier/URIScheme.php +++ b/lib/htmlpurifier/HTMLPurifier/URIScheme.php @@ -19,26 +19,25 @@ class HTMLPurifier_URIScheme */ var $browsable = false; + /** + * Whether or not the URI always uses