From a857b7ea3828d53eaac2856045fe8d9a48dd274e Mon Sep 17 00:00:00 2001 From: skodak Date: Sat, 1 Nov 2008 18:58:29 +0000 Subject: [PATCH] MDL-17099 htmlpurifier: new version from upstream --- lib/htmlpurifier/HTMLPurifier.php | 6 +- .../HTMLPurifier.safe-includes.php | 387 +- .../HTMLPurifier/AttrDef/HTML/LinkTypes.php | 5 +- .../HTMLPurifier/AttrTransform/Background.php | 22 + .../AttrTransform/ImgRequired.php | 7 +- .../HTMLPurifier/AttrTransform/Input.php | 39 + .../HTMLPurifier/AttrTransform/Textarea.php | 16 + lib/htmlpurifier/HTMLPurifier/AttrTypes.php | 3 + .../HTMLPurifier/AttrValidator.php | 22 +- lib/htmlpurifier/HTMLPurifier/ChildDef.php | 8 + .../HTMLPurifier/ChildDef/Custom.php | 2 - .../ChildDef/StrictBlockquote.php | 31 +- lib/htmlpurifier/HTMLPurifier/Config.php | 2 +- .../HTMLPurifier/ConfigSchema/schema.ser | 2 +- .../schema/Attr.AllowedFrameTargets.txt | 22 +- .../ConfigSchema/schema/Attr.AllowedRel.txt | 16 +- .../ConfigSchema/schema/Attr.AllowedRev.txt | 16 +- .../schema/Attr.DefaultImageAlt.txt | 9 + .../schema/Attr.DefaultInvalidImage.txt | 16 +- .../schema/Attr.DefaultInvalidImageAlt.txt | 14 +- .../schema/Attr.DefaultTextDir.txt | 18 +- .../ConfigSchema/schema/Attr.EnableID.txt | 30 +- .../ConfigSchema/schema/Attr.IDBlacklist.txt | 8 +- .../schema/Attr.IDBlacklistRegexp.txt | 16 +- .../ConfigSchema/schema/Attr.IDPrefix.txt | 22 +- .../schema/Attr.IDPrefixLocal.txt | 26 +- .../HTMLPurifier/ConfigSchema/schema/Attr.txt | 4 +- .../schema/AutoFormat.AutoParagraph.txt | 60 +- .../ConfigSchema/schema/AutoFormat.Custom.txt | 24 +- .../schema/AutoFormat.DisplayLinkURI.txt | 10 + .../schema/AutoFormat.Linkify.txt | 24 +- .../schema/AutoFormat.PurifierLinkify.txt | 24 +- .../schema/AutoFormat.RemoveEmpty.txt | 44 + .../ConfigSchema/schema/AutoFormat.txt | 4 +- .../AutoFormatParam.PurifierLinkifyDocURL.txt | 24 +- .../ConfigSchema/schema/AutoFormatParam.txt | 4 +- .../ConfigSchema/schema/CSS.DefinitionRev.txt | 22 +- .../ConfigSchema/schema/CSS.Proprietary.txt | 20 +- .../HTMLPurifier/ConfigSchema/schema/CSS.txt | 4 +- .../schema/Cache.DefinitionImpl.txt | 26 +- .../schema/Cache.SerializerPath.txt | 26 +- .../ConfigSchema/schema/Cache.txt | 4 +- .../schema/Core.AggressivelyFixLt.txt | 30 +- .../schema/Core.CollectErrors.txt | 22 +- .../schema/Core.ColorKeywords.txt | 58 +- .../schema/Core.ConvertDocumentToFragment.txt | 26 +- .../Core.DirectLexLineNumberSyncInterval.txt | 34 +- .../ConfigSchema/schema/Core.Encoding.txt | 28 +- .../schema/Core.EscapeInvalidChildren.txt | 18 +- .../schema/Core.EscapeInvalidTags.txt | 12 +- .../schema/Core.EscapeNonASCIICharacters.txt | 24 +- .../schema/Core.HiddenElements.txt | 38 +- .../ConfigSchema/schema/Core.Language.txt | 22 +- .../ConfigSchema/schema/Core.LexerImpl.txt | 66 +- .../schema/Core.MaintainLineNumbers.txt | 32 +- .../schema/Core.RemoveInvalidImg.txt | 24 +- .../schema/Core.RemoveScriptContents.txt | 22 +- .../HTMLPurifier/ConfigSchema/schema/Core.txt | 4 +- .../schema/Filter.ExtractStyleBlocks.txt | 50 +- .../ConfigSchema/schema/Filter.txt | 4 +- ...FilterParam.ExtractStyleBlocksEscaping.txt | 28 +- .../FilterParam.ExtractStyleBlocksScope.txt | 56 +- .../ConfigSchema/schema/HTML.Allowed.txt | 44 +- .../schema/HTML.AllowedAttributes.txt | 38 +- .../schema/HTML.AllowedElements.txt | 36 +- .../schema/HTML.AllowedModules.txt | 40 +- .../ConfigSchema/schema/HTML.BlockWrapper.txt | 36 +- .../ConfigSchema/schema/HTML.CoreModules.txt | 46 +- .../schema/HTML.CustomDoctype.txt | 20 +- .../ConfigSchema/schema/HTML.DefinitionID.txt | 66 +- .../schema/HTML.DefinitionRev.txt | 32 +- .../ConfigSchema/schema/HTML.Doctype.txt | 20 +- .../ConfigSchema/schema/HTML.Parent.txt | 24 +- .../ConfigSchema/schema/HTML.Strict.txt | 16 +- .../ConfigSchema/schema/HTML.TidyAdd.txt | 16 +- .../ConfigSchema/schema/HTML.TidyLevel.txt | 46 +- .../ConfigSchema/schema/HTML.TidyRemove.txt | 16 +- .../ConfigSchema/schema/HTML.Trusted.txt | 14 +- .../ConfigSchema/schema/HTML.XHTML.txt | 20 +- .../HTMLPurifier/ConfigSchema/schema/HTML.txt | 4 +- .../schema/Output.CommentScriptContents.txt | 18 +- .../ConfigSchema/schema/Output.Newline.txt | 26 +- .../ConfigSchema/schema/Output.SortAttr.txt | 13 + .../ConfigSchema/schema/Output.TidyFormat.txt | 48 +- .../ConfigSchema/schema/Output.txt | 4 +- .../ConfigSchema/schema/Test.ForceNoIconv.txt | 12 +- .../HTMLPurifier/ConfigSchema/schema/Test.txt | 4 +- .../schema/URI.AllowedSchemes.txt | 28 +- .../ConfigSchema/schema/URI.Base.txt | 34 +- .../ConfigSchema/schema/URI.DefaultScheme.txt | 20 +- .../ConfigSchema/schema/URI.DefinitionID.txt | 22 +- .../ConfigSchema/schema/URI.DefinitionRev.txt | 22 +- .../ConfigSchema/schema/URI.Disable.txt | 26 +- .../schema/URI.DisableExternal.txt | 20 +- .../schema/URI.DisableExternalResources.txt | 24 +- .../schema/URI.DisableResources.txt | 24 +- .../ConfigSchema/schema/URI.Host.txt | 38 +- .../ConfigSchema/schema/URI.HostBlacklist.txt | 16 +- .../ConfigSchema/schema/URI.MakeAbsolute.txt | 24 +- .../ConfigSchema/schema/URI.Munge.txt | 164 +- .../schema/URI.OverrideAllowedSchemes.txt | 16 +- .../HTMLPurifier/ConfigSchema/schema/URI.txt | 4 +- lib/htmlpurifier/HTMLPurifier/ContentSets.php | 54 +- .../DefinitionCache/Decorator/Template.php.in | 92 +- .../DefinitionCache/Serializer/README | 1 + lib/htmlpurifier/HTMLPurifier/Encoder.php | 8 +- .../HTMLPurifier/ErrorCollector.php | 158 +- lib/htmlpurifier/HTMLPurifier/ErrorStruct.php | 58 + lib/htmlpurifier/HTMLPurifier/Generator.php | 7 + .../HTMLPurifier/HTMLModule/Forms.php | 117 + .../HTMLPurifier/HTMLModule/Name.php | 16 + .../HTMLPurifier/HTMLModule/Object.php | 92 +- .../HTMLPurifier/HTMLModule/Tidy/Name.php | 23 + .../HTMLModule/Tidy/Proprietary.php | 10 +- .../HTMLModule/Tidy/XHTMLAndHTML4.php | 4 - .../HTMLPurifier/HTMLModuleManager.php | 26 +- lib/htmlpurifier/HTMLPurifier/Injector.php | 109 +- .../HTMLPurifier/Injector/AutoParagraph.php | 403 +- .../HTMLPurifier/Injector/DisplayLinkURI.php | 24 + .../HTMLPurifier/Injector/RemoveEmpty.php | 40 + .../HTMLPurifier/Injector/SafeObject.php | 5 +- .../HTMLPurifier/Language/messages/en.php | 9 +- lib/htmlpurifier/HTMLPurifier/Lexer.php | 101 +- .../HTMLPurifier/Lexer/DOMLex.php | 5 +- .../HTMLPurifier/Lexer/DirectLex.php | 75 +- lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php | 7812 ++++++++--------- .../HTMLPurifier/Strategy/MakeWellFormed.php | 421 +- .../Strategy/RemoveForeignElements.php | 20 + .../HTMLPurifier/StringHashParser.php | 2 +- lib/htmlpurifier/HTMLPurifier/Token.php | 35 +- .../HTMLPurifier/Token/Comment.php | 3 +- lib/htmlpurifier/HTMLPurifier/Token/End.php | 6 +- lib/htmlpurifier/HTMLPurifier/Token/Tag.php | 3 +- lib/htmlpurifier/HTMLPurifier/Token/Text.php | 3 +- .../URIFilter/DisableExternal.php | 2 +- .../HTMLPurifier/URIFilter/MakeAbsolute.php | 9 +- .../HTMLPurifier/URIFilter/Munge.php | 6 +- .../HTMLPurifier/UnitConverter.php | 12 + lib/htmlpurifier/readme_moodle.txt | 2 +- 139 files changed, 6846 insertions(+), 5755 deletions(-) create mode 100644 lib/htmlpurifier/HTMLPurifier/AttrTransform/Background.php create mode 100644 lib/htmlpurifier/HTMLPurifier/AttrTransform/Input.php create mode 100644 lib/htmlpurifier/HTMLPurifier/AttrTransform/Textarea.php create mode 100644 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultImageAlt.txt create mode 100644 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.DisplayLinkURI.txt create mode 100644 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.RemoveEmpty.txt create mode 100644 lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.SortAttr.txt create mode 100644 lib/htmlpurifier/HTMLPurifier/DefinitionCache/Serializer/README create mode 100644 lib/htmlpurifier/HTMLPurifier/ErrorStruct.php create mode 100644 lib/htmlpurifier/HTMLPurifier/HTMLModule/Forms.php create mode 100644 lib/htmlpurifier/HTMLPurifier/HTMLModule/Name.php create mode 100644 lib/htmlpurifier/HTMLPurifier/HTMLModule/Tidy/Name.php create mode 100644 lib/htmlpurifier/HTMLPurifier/Injector/DisplayLinkURI.php create mode 100644 lib/htmlpurifier/HTMLPurifier/Injector/RemoveEmpty.php diff --git a/lib/htmlpurifier/HTMLPurifier.php b/lib/htmlpurifier/HTMLPurifier.php index da2269dab4..629f3b2693 100644 --- a/lib/htmlpurifier/HTMLPurifier.php +++ b/lib/htmlpurifier/HTMLPurifier.php @@ -19,7 +19,7 @@ */ /* - HTML Purifier 3.1.1 - Standards Compliant HTML Filtering + HTML Purifier 3.2.0 - Standards Compliant HTML Filtering Copyright (C) 2006-2008 Edward Z. Yang This library is free software; you can redistribute it and/or @@ -55,10 +55,10 @@ class HTMLPurifier { /** Version of HTML Purifier */ - public $version = '3.1.1'; + public $version = '3.2.0'; /** Constant with version of HTML Purifier */ - const VERSION = '3.1.1'; + const VERSION = '3.2.0'; /** Global configuration object */ public $config; diff --git a/lib/htmlpurifier/HTMLPurifier.safe-includes.php b/lib/htmlpurifier/HTMLPurifier.safe-includes.php index 42acd76bbb..05cc0a3c33 100644 --- a/lib/htmlpurifier/HTMLPurifier.safe-includes.php +++ b/lib/htmlpurifier/HTMLPurifier.safe-includes.php @@ -1,189 +1,198 @@ - $bool) $ret_array[] = $part; - $string = implode(' ', $ret_array); + $string = implode(' ', array_keys($ret_lookup)); return $string; diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Background.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Background.php new file mode 100644 index 0000000000..f7f0fedab7 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Background.php @@ -0,0 +1,22 @@ +confiscateAttr($attr, 'background'); + // some validation should happen here + + $this->prependCSS($attr, "background-image:url($background);"); + + return $attr; + + } + +} + diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php index 52c716759a..e938f525dc 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/ImgRequired.php @@ -22,7 +22,12 @@ class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform if (!isset($attr['alt'])) { if ($src) { - $attr['alt'] = basename($attr['src']); + $alt = $config->get('Attr', 'DefaultImageAlt'); + if ($alt === null) { + $attr['alt'] = basename($attr['src']); + } else { + $attr['alt'] = $alt; + } } else { $attr['alt'] = $config->get('Attr', 'DefaultInvalidImageAlt'); } diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Input.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Input.php new file mode 100644 index 0000000000..f082c4de27 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Input.php @@ -0,0 +1,39 @@ +pixels = new HTMLPurifier_AttrDef_HTML_Pixels(); + } + + public function transform($attr, $config, $context) { + if (!isset($attr['type'])) $t = 'text'; + else $t = strtolower($attr['type']); + if (isset($attr['checked']) && $t !== 'radio' && $t !== 'checkbox') { + unset($attr['checked']); + } + if (isset($attr['maxlength']) && $t !== 'text' && $t !== 'password') { + unset($attr['maxlength']); + } + if (isset($attr['size']) && $t !== 'text' && $t !== 'password') { + $result = $this->pixels->validate($attr['size'], $config, $context); + if ($result === false) unset($attr['size']); + else $attr['size'] = $result; + } + if (isset($attr['src']) && $t !== 'image') { + unset($attr['src']); + } + if (!isset($attr['value']) && ($t === 'radio' || $t === 'checkbox')) { + $attr['value'] = ''; + } + return $attr; + } + +} + diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTransform/Textarea.php b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Textarea.php new file mode 100644 index 0000000000..c7cc9888af --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/AttrTransform/Textarea.php @@ -0,0 +1,16 @@ + + */ +class HTMLPurifier_AttrTransform_Textarea extends HTMLPurifier_AttrTransform +{ + + public function transform($attr, $config, $context) { + // Calculated from Firefox + if (!isset($attr['cols'])) $attr['cols'] = '22'; + if (!isset($attr['rows'])) $attr['rows'] = '3'; + return $attr; + } + +} \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/AttrTypes.php b/lib/htmlpurifier/HTMLPurifier/AttrTypes.php index 9262a098c1..c3116a162e 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrTypes.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrTypes.php @@ -32,6 +32,9 @@ class HTMLPurifier_AttrTypes // unimplemented aliases $this->info['ContentType'] = new HTMLPurifier_AttrDef_Text(); + $this->info['ContentTypes'] = new HTMLPurifier_AttrDef_Text(); + $this->info['Charsets'] = new HTMLPurifier_AttrDef_Text(); + $this->info['Character'] = new HTMLPurifier_AttrDef_Text(); // number is really a positive integer (one or more digits) // FIXME: ^^ not always, see start and value of list items diff --git a/lib/htmlpurifier/HTMLPurifier/AttrValidator.php b/lib/htmlpurifier/HTMLPurifier/AttrValidator.php index ba4510e3a4..fb913fc020 100644 --- a/lib/htmlpurifier/HTMLPurifier/AttrValidator.php +++ b/lib/htmlpurifier/HTMLPurifier/AttrValidator.php @@ -35,8 +35,8 @@ class HTMLPurifier_AttrValidator if (!$current_token) $context->register('CurrentToken', $token); if ( - !$token instanceof HTMLPurifier_Token_Start && - !$token instanceof HTMLPurifier_Token_Empty + !$token instanceof HTMLPurifier_Token_Start && + !$token instanceof HTMLPurifier_Token_Empty ) return $token; // create alias to global definition array, see also $defs @@ -50,14 +50,18 @@ class HTMLPurifier_AttrValidator // nothing currently utilizes this foreach ($definition->info_attr_transform_pre as $transform) { $attr = $transform->transform($o = $attr, $config, $context); - if ($e && ($attr != $o)) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + if ($e) { + if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + } } // do local transformations only applicable to this element (pre) // ex.

to

foreach ($definition->info[$token->name]->attr_transform_pre as $transform) { $attr = $transform->transform($o = $attr, $config, $context); - if ($e && ($attr != $o)) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + if ($e) { + if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + } } // create alias to this element's attribute definition array, see @@ -114,6 +118,8 @@ class HTMLPurifier_AttrValidator // simple substitution $attr[$attr_key] = $result; + } else { + // nothing happens } // we'd also want slightly more complicated substitution @@ -130,13 +136,17 @@ class HTMLPurifier_AttrValidator // global (error reporting untested) foreach ($definition->info_attr_transform_post as $transform) { $attr = $transform->transform($o = $attr, $config, $context); - if ($e && ($attr != $o)) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + if ($e) { + if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + } } // local (error reporting untested) foreach ($definition->info[$token->name]->attr_transform_post as $transform) { $attr = $transform->transform($o = $attr, $config, $context); - if ($e && ($attr != $o)) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + if ($e) { + if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); + } } $token->attr = $attr; diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef.php b/lib/htmlpurifier/HTMLPurifier/ChildDef.php index 0cc345674d..38f3692f51 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef.php @@ -24,6 +24,14 @@ abstract class HTMLPurifier_ChildDef */ public $elements = array(); + /** + * Get lookup of tag names that should not close this element automatically. + * All other elements will do so. + */ + public function getNonAutoCloseElements($config) { + return $this->elements; + } + /** * Validates nodes according to definition and returns modification. * diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php index 4ba0788858..f79bfcae60 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef/Custom.php @@ -5,8 +5,6 @@ * * @warning Currently this class is an all or nothing proposition, that is, * it will only give a bool return value. - * @note This class is currently not used by any code, although it is unit - * tested. */ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef { diff --git a/lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php b/lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php index ecdb17ff64..ec0890d917 100644 --- a/lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php +++ b/lib/htmlpurifier/HTMLPurifier/ChildDef/StrictBlockquote.php @@ -10,16 +10,19 @@ class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Requi public $allow_empty = true; public $type = 'strictblockquote'; protected $init = false; + + /** + * @note We don't want MakeWellFormed to auto-close inline elements since + * they might be allowed. + */ + public function getNonAutoCloseElements($config) { + $this->init($config); + return $this->fake_elements; + } + public function validateChildren($tokens_of_children, $config, $context) { - $def = $config->getHTMLDefinition(); - if (!$this->init) { - // allow all inline elements - $this->real_elements = $this->elements; - $this->fake_elements = $def->info_content_sets['Flow']; - $this->fake_elements['#PCDATA'] = true; - $this->init = true; - } + $this->init($config); // trick the parent class into thinking it allows more $this->elements = $this->fake_elements; @@ -29,6 +32,7 @@ class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Requi if ($result === false) return array(); if ($result === true) $result = $tokens_of_children; + $def = $config->getHTMLDefinition(); $block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper); $block_wrap_end = new HTMLPurifier_Token_End( $def->info_block_wrapper); $is_inline = false; @@ -68,5 +72,16 @@ class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Requi if ($is_inline) $ret[] = $block_wrap_end; return $ret; } + + private function init($config) { + if (!$this->init) { + $def = $config->getHTMLDefinition(); + // allow all inline elements + $this->real_elements = $this->elements; + $this->fake_elements = $def->info_content_sets['Flow']; + $this->fake_elements['#PCDATA'] = true; + $this->init = true; + } + } } diff --git a/lib/htmlpurifier/HTMLPurifier/Config.php b/lib/htmlpurifier/HTMLPurifier/Config.php index e2fe21b6b5..aca8498285 100644 --- a/lib/htmlpurifier/HTMLPurifier/Config.php +++ b/lib/htmlpurifier/HTMLPurifier/Config.php @@ -20,7 +20,7 @@ class HTMLPurifier_Config /** * HTML Purifier's version */ - public $version = '3.1.1'; + public $version = '3.2.0'; /** * Bool indicator whether or not to automatically finalize diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema.ser b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema.ser index 1af702b7df..1ac5ff691e 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema.ser +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema.ser @@ -1 +1 @@ -O:25:"HTMLPurifier_ConfigSchema":2:{s:8:"defaults";a:12:{s:4:"Attr";a:11:{s:19:"AllowedFrameTargets";a:0:{}s:10:"AllowedRel";a:0:{}s:10:"AllowedRev";a:0:{}s:19:"DefaultInvalidImage";s:0:"";s:22:"DefaultInvalidImageAlt";s:13:"Invalid image";s:14:"DefaultTextDir";s:3:"ltr";s:8:"EnableID";b:0;s:11:"IDBlacklist";a:0:{}s:17:"IDBlacklistRegexp";N;s:8:"IDPrefix";s:0:"";s:13:"IDPrefixLocal";s:0:"";}s:10:"AutoFormat";a:4:{s:13:"AutoParagraph";b:0;s:6:"Custom";a:0:{}s:7:"Linkify";b:0;s:15:"PurifierLinkify";b:0;}s:15:"AutoFormatParam";a:1:{s:21:"PurifierLinkifyDocURL";s:3:"#%s";}s:3:"CSS";a:6:{s:14:"AllowImportant";b:0;s:11:"AllowTricky";b:0;s:17:"AllowedProperties";N;s:13:"DefinitionRev";i:1;s:12:"MaxImgLength";s:6:"1200px";s:11:"Proprietary";b:0;}s:5:"Cache";a:2:{s:14:"DefinitionImpl";s:10:"Serializer";s:14:"SerializerPath";N;}s:4:"Core";a:15:{s:17:"AggressivelyFixLt";b:0;s:13:"CollectErrors";b:0;s:13:"ColorKeywords";a:17:{s:6:"maroon";s:7:"#800000";s:3:"red";s:7:"#FF0000";s:6:"orange";s:7:"#FFA500";s:6:"yellow";s:7:"#FFFF00";s:5:"olive";s:7:"#808000";s:6:"purple";s:7:"#800080";s:7:"fuchsia";s:7:"#FF00FF";s:5:"white";s:7:"#FFFFFF";s:4:"lime";s:7:"#00FF00";s:5:"green";s:7:"#008000";s:4:"navy";s:7:"#000080";s:4:"blue";s:7:"#0000FF";s:4:"aqua";s:7:"#00FFFF";s:4:"teal";s:7:"#008080";s:5:"black";s:7:"#000000";s:6:"silver";s:7:"#C0C0C0";s:4:"gray";s:7:"#808080";}s:25:"ConvertDocumentToFragment";b:1;s:31:"DirectLexLineNumberSyncInterval";i:0;s:8:"Encoding";s:5:"utf-8";s:21:"EscapeInvalidChildren";b:0;s:17:"EscapeInvalidTags";b:0;s:24:"EscapeNonASCIICharacters";b:0;s:14:"HiddenElements";a:2:{s:6:"script";b:1;s:5:"style";b:1;}s:8:"Language";s:2:"en";s:9:"LexerImpl";N;s:19:"MaintainLineNumbers";N;s:16:"RemoveInvalidImg";b:1;s:20:"RemoveScriptContents";N;}s:6:"Filter";a:3:{s:6:"Custom";a:0:{}s:18:"ExtractStyleBlocks";b:0;s:7:"YouTube";b:0;}s:11:"FilterParam";a:3:{s:26:"ExtractStyleBlocksEscaping";b:1;s:23:"ExtractStyleBlocksScope";N;s:26:"ExtractStyleBlocksTidyImpl";N;}s:4:"HTML";a:23:{s:7:"Allowed";N;s:17:"AllowedAttributes";N;s:15:"AllowedElements";N;s:14:"AllowedModules";N;s:12:"BlockWrapper";s:1:"p";s:11:"CoreModules";a:7:{s:9:"Structure";b:1;s:4:"Text";b:1;s:9:"Hypertext";b:1;s:4:"List";b:1;s:22:"NonXMLCommonAttributes";b:1;s:19:"XMLCommonAttributes";b:1;s:16:"CommonAttributes";b:1;}s:13:"CustomDoctype";N;s:12:"DefinitionID";N;s:13:"DefinitionRev";i:1;s:7:"Doctype";N;s:19:"ForbiddenAttributes";a:0:{}s:17:"ForbiddenElements";a:0:{}s:12:"MaxImgLength";i:1200;s:6:"Parent";s:3:"div";s:11:"Proprietary";b:0;s:9:"SafeEmbed";b:0;s:10:"SafeObject";b:0;s:6:"Strict";b:0;s:7:"TidyAdd";a:0:{}s:9:"TidyLevel";s:6:"medium";s:10:"TidyRemove";a:0:{}s:7:"Trusted";b:0;s:5:"XHTML";b:1;}s:6:"Output";a:3:{s:21:"CommentScriptContents";b:1;s:7:"Newline";N;s:10:"TidyFormat";b:0;}s:4:"Test";a:1:{s:12:"ForceNoIconv";b:0;}s:3:"URI";a:16:{s:14:"AllowedSchemes";a:6:{s:4:"http";b:1;s:5:"https";b:1;s:6:"mailto";b:1;s:3:"ftp";b:1;s:4:"nntp";b:1;s:4:"news";b:1;}s:4:"Base";N;s:13:"DefaultScheme";s:4:"http";s:12:"DefinitionID";N;s:13:"DefinitionRev";i:1;s:7:"Disable";b:0;s:15:"DisableExternal";b:0;s:24:"DisableExternalResources";b:0;s:16:"DisableResources";b:0;s:4:"Host";N;s:13:"HostBlacklist";a:0:{}s:12:"MakeAbsolute";b:0;s:5:"Munge";N;s:14:"MungeResources";b:0;s:14:"MungeSecretKey";N;s:22:"OverrideAllowedSchemes";b:1;}}s:4:"info";a:12:{s:4:"Attr";a:12:{s:19:"AllowedFrameTargets";i:8;s:10:"AllowedRel";i:8;s:10:"AllowedRev";i:8;s:19:"DefaultInvalidImage";i:1;s:22:"DefaultInvalidImageAlt";i:1;s:14:"DefaultTextDir";O:8:"stdClass":2:{s:4:"type";i:1;s:7:"allowed";a:2:{s:3:"ltr";b:1;s:3:"rtl";b:1;}}s:8:"EnableID";i:7;s:11:"IDBlacklist";i:9;s:17:"IDBlacklistRegexp";i:-1;s:8:"IDPrefix";i:1;s:13:"IDPrefixLocal";i:1;s:10:"DisableURI";O:8:"stdClass":3:{s:9:"namespace";s:3:"URI";s:4:"name";s:7:"Disable";s:7:"isAlias";b:1;}}s:10:"AutoFormat";a:4:{s:13:"AutoParagraph";i:7;s:6:"Custom";i:9;s:7:"Linkify";i:7;s:15:"PurifierLinkify";i:7;}s:15:"AutoFormatParam";a:1:{s:21:"PurifierLinkifyDocURL";i:1;}s:3:"CSS";a:6:{s:14:"AllowImportant";i:7;s:11:"AllowTricky";i:7;s:17:"AllowedProperties";i:-8;s:13:"DefinitionRev";i:5;s:12:"MaxImgLength";i:-1;s:11:"Proprietary";i:7;}s:5:"Cache";a:2:{s:14:"DefinitionImpl";i:-1;s:14:"SerializerPath";i:-1;}s:4:"Core";a:20:{s:15:"DefinitionCache";O:8:"stdClass":3:{s:9:"namespace";s:5:"Cache";s:4:"name";s:14:"DefinitionImpl";s:7:"isAlias";b:1;}s:17:"AggressivelyFixLt";i:7;s:13:"CollectErrors";i:7;s:13:"ColorKeywords";i:10;s:25:"ConvertDocumentToFragment";i:7;s:19:"AcceptFullDocuments";O:8:"stdClass":3:{s:9:"namespace";s:4:"Core";s:4:"name";s:25:"ConvertDocumentToFragment";s:7:"isAlias";b:1;}s:31:"DirectLexLineNumberSyncInterval";i:5;s:8:"Encoding";i:2;s:21:"EscapeInvalidChildren";i:7;s:17:"EscapeInvalidTags";i:7;s:24:"EscapeNonASCIICharacters";i:7;s:14:"HiddenElements";i:8;s:8:"Language";i:1;s:9:"LexerImpl";i:-11;s:19:"MaintainLineNumbers";i:-7;s:16:"RemoveInvalidImg";i:7;s:20:"RemoveScriptContents";i:-7;s:5:"XHTML";O:8:"stdClass":3:{s:9:"namespace";s:4:"HTML";s:4:"name";s:5:"XHTML";s:7:"isAlias";b:1;}s:21:"CommentScriptContents";O:8:"stdClass":3:{s:9:"namespace";s:6:"Output";s:4:"name";s:21:"CommentScriptContents";s:7:"isAlias";b:1;}s:10:"TidyFormat";O:8:"stdClass":3:{s:9:"namespace";s:6:"Output";s:4:"name";s:10:"TidyFormat";s:7:"isAlias";b:1;}}s:6:"Filter";a:5:{s:6:"Custom";i:9;s:18:"ExtractStyleBlocks";i:7;s:7:"YouTube";i:7;s:26:"ExtractStyleBlocksEscaping";O:8:"stdClass":3:{s:9:"namespace";s:11:"FilterParam";s:4:"name";s:26:"ExtractStyleBlocksEscaping";s:7:"isAlias";b:1;}s:23:"ExtractStyleBlocksScope";O:8:"stdClass":3:{s:9:"namespace";s:11:"FilterParam";s:4:"name";s:23:"ExtractStyleBlocksScope";s:7:"isAlias";b:1;}}s:11:"FilterParam";a:3:{s:26:"ExtractStyleBlocksEscaping";i:7;s:23:"ExtractStyleBlocksScope";i:-1;s:26:"ExtractStyleBlocksTidyImpl";i:-11;}s:4:"HTML";a:24:{s:12:"EnableAttrID";O:8:"stdClass":3:{s:9:"namespace";s:4:"Attr";s:4:"name";s:8:"EnableID";s:7:"isAlias";b:1;}s:7:"Allowed";i:-4;s:17:"AllowedAttributes";i:-8;s:15:"AllowedElements";i:-8;s:14:"AllowedModules";i:-8;s:12:"BlockWrapper";i:1;s:11:"CoreModules";i:8;s:13:"CustomDoctype";i:-1;s:12:"DefinitionID";i:-1;s:13:"DefinitionRev";i:5;s:7:"Doctype";O:8:"stdClass":3:{s:4:"type";i:1;s:10:"allow_null";b:1;s:7:"allowed";a:5:{s:22:"HTML 4.01 Transitional";b:1;s:16:"HTML 4.01 Strict";b:1;s:22:"XHTML 1.0 Transitional";b:1;s:16:"XHTML 1.0 Strict";b:1;s:9:"XHTML 1.1";b:1;}}s:19:"ForbiddenAttributes";i:8;s:17:"ForbiddenElements";i:8;s:12:"MaxImgLength";i:-5;s:6:"Parent";i:1;s:11:"Proprietary";i:7;s:9:"SafeEmbed";i:7;s:10:"SafeObject";i:7;s:6:"Strict";i:7;s:7:"TidyAdd";i:8;s:9:"TidyLevel";O:8:"stdClass":2:{s:4:"type";i:1;s:7:"allowed";a:4:{s:4:"none";b:1;s:5:"light";b:1;s:6:"medium";b:1;s:5:"heavy";b:1;}}s:10:"TidyRemove";i:8;s:7:"Trusted";i:7;s:5:"XHTML";i:7;}s:6:"Output";a:3:{s:21:"CommentScriptContents";i:7;s:7:"Newline";i:-1;s:10:"TidyFormat";i:7;}s:4:"Test";a:1:{s:12:"ForceNoIconv";i:7;}s:3:"URI";a:16:{s:14:"AllowedSchemes";i:8;s:4:"Base";i:-1;s:13:"DefaultScheme";i:1;s:12:"DefinitionID";i:-1;s:13:"DefinitionRev";i:5;s:7:"Disable";i:7;s:15:"DisableExternal";i:7;s:24:"DisableExternalResources";i:7;s:16:"DisableResources";i:7;s:4:"Host";i:-1;s:13:"HostBlacklist";i:9;s:12:"MakeAbsolute";i:7;s:5:"Munge";i:-1;s:14:"MungeResources";i:7;s:14:"MungeSecretKey";i:-1;s:22:"OverrideAllowedSchemes";i:7;}}} \ No newline at end of file +O:25:"HTMLPurifier_ConfigSchema":2:{s:8:"defaults";a:12:{s:4:"Attr";a:12:{s:19:"AllowedFrameTargets";a:0:{}s:10:"AllowedRel";a:0:{}s:10:"AllowedRev";a:0:{}s:15:"DefaultImageAlt";N;s:19:"DefaultInvalidImage";s:0:"";s:22:"DefaultInvalidImageAlt";s:13:"Invalid image";s:14:"DefaultTextDir";s:3:"ltr";s:8:"EnableID";b:0;s:11:"IDBlacklist";a:0:{}s:17:"IDBlacklistRegexp";N;s:8:"IDPrefix";s:0:"";s:13:"IDPrefixLocal";s:0:"";}s:10:"AutoFormat";a:6:{s:13:"AutoParagraph";b:0;s:6:"Custom";a:0:{}s:14:"DisplayLinkURI";b:0;s:7:"Linkify";b:0;s:15:"PurifierLinkify";b:0;s:11:"RemoveEmpty";b:0;}s:15:"AutoFormatParam";a:1:{s:21:"PurifierLinkifyDocURL";s:3:"#%s";}s:3:"CSS";a:6:{s:14:"AllowImportant";b:0;s:11:"AllowTricky";b:0;s:17:"AllowedProperties";N;s:13:"DefinitionRev";i:1;s:12:"MaxImgLength";s:6:"1200px";s:11:"Proprietary";b:0;}s:5:"Cache";a:2:{s:14:"DefinitionImpl";s:10:"Serializer";s:14:"SerializerPath";N;}s:4:"Core";a:15:{s:17:"AggressivelyFixLt";b:1;s:13:"CollectErrors";b:0;s:13:"ColorKeywords";a:17:{s:6:"maroon";s:7:"#800000";s:3:"red";s:7:"#FF0000";s:6:"orange";s:7:"#FFA500";s:6:"yellow";s:7:"#FFFF00";s:5:"olive";s:7:"#808000";s:6:"purple";s:7:"#800080";s:7:"fuchsia";s:7:"#FF00FF";s:5:"white";s:7:"#FFFFFF";s:4:"lime";s:7:"#00FF00";s:5:"green";s:7:"#008000";s:4:"navy";s:7:"#000080";s:4:"blue";s:7:"#0000FF";s:4:"aqua";s:7:"#00FFFF";s:4:"teal";s:7:"#008080";s:5:"black";s:7:"#000000";s:6:"silver";s:7:"#C0C0C0";s:4:"gray";s:7:"#808080";}s:25:"ConvertDocumentToFragment";b:1;s:31:"DirectLexLineNumberSyncInterval";i:0;s:8:"Encoding";s:5:"utf-8";s:21:"EscapeInvalidChildren";b:0;s:17:"EscapeInvalidTags";b:0;s:24:"EscapeNonASCIICharacters";b:0;s:14:"HiddenElements";a:2:{s:6:"script";b:1;s:5:"style";b:1;}s:8:"Language";s:2:"en";s:9:"LexerImpl";N;s:19:"MaintainLineNumbers";N;s:16:"RemoveInvalidImg";b:1;s:20:"RemoveScriptContents";N;}s:6:"Filter";a:3:{s:6:"Custom";a:0:{}s:18:"ExtractStyleBlocks";b:0;s:7:"YouTube";b:0;}s:11:"FilterParam";a:3:{s:26:"ExtractStyleBlocksEscaping";b:1;s:23:"ExtractStyleBlocksScope";N;s:26:"ExtractStyleBlocksTidyImpl";N;}s:4:"HTML";a:23:{s:7:"Allowed";N;s:17:"AllowedAttributes";N;s:15:"AllowedElements";N;s:14:"AllowedModules";N;s:12:"BlockWrapper";s:1:"p";s:11:"CoreModules";a:7:{s:9:"Structure";b:1;s:4:"Text";b:1;s:9:"Hypertext";b:1;s:4:"List";b:1;s:22:"NonXMLCommonAttributes";b:1;s:19:"XMLCommonAttributes";b:1;s:16:"CommonAttributes";b:1;}s:13:"CustomDoctype";N;s:12:"DefinitionID";N;s:13:"DefinitionRev";i:1;s:7:"Doctype";N;s:19:"ForbiddenAttributes";a:0:{}s:17:"ForbiddenElements";a:0:{}s:12:"MaxImgLength";i:1200;s:6:"Parent";s:3:"div";s:11:"Proprietary";b:0;s:9:"SafeEmbed";b:0;s:10:"SafeObject";b:0;s:6:"Strict";b:0;s:7:"TidyAdd";a:0:{}s:9:"TidyLevel";s:6:"medium";s:10:"TidyRemove";a:0:{}s:7:"Trusted";b:0;s:5:"XHTML";b:1;}s:6:"Output";a:4:{s:21:"CommentScriptContents";b:1;s:7:"Newline";N;s:8:"SortAttr";b:0;s:10:"TidyFormat";b:0;}s:4:"Test";a:1:{s:12:"ForceNoIconv";b:0;}s:3:"URI";a:16:{s:14:"AllowedSchemes";a:6:{s:4:"http";b:1;s:5:"https";b:1;s:6:"mailto";b:1;s:3:"ftp";b:1;s:4:"nntp";b:1;s:4:"news";b:1;}s:4:"Base";N;s:13:"DefaultScheme";s:4:"http";s:12:"DefinitionID";N;s:13:"DefinitionRev";i:1;s:7:"Disable";b:0;s:15:"DisableExternal";b:0;s:24:"DisableExternalResources";b:0;s:16:"DisableResources";b:0;s:4:"Host";N;s:13:"HostBlacklist";a:0:{}s:12:"MakeAbsolute";b:0;s:5:"Munge";N;s:14:"MungeResources";b:0;s:14:"MungeSecretKey";N;s:22:"OverrideAllowedSchemes";b:1;}}s:4:"info";a:12:{s:4:"Attr";a:13:{s:19:"AllowedFrameTargets";i:8;s:10:"AllowedRel";i:8;s:10:"AllowedRev";i:8;s:15:"DefaultImageAlt";i:-1;s:19:"DefaultInvalidImage";i:1;s:22:"DefaultInvalidImageAlt";i:1;s:14:"DefaultTextDir";O:8:"stdClass":2:{s:4:"type";i:1;s:7:"allowed";a:2:{s:3:"ltr";b:1;s:3:"rtl";b:1;}}s:8:"EnableID";i:7;s:11:"IDBlacklist";i:9;s:17:"IDBlacklistRegexp";i:-1;s:8:"IDPrefix";i:1;s:13:"IDPrefixLocal";i:1;s:10:"DisableURI";O:8:"stdClass":3:{s:9:"namespace";s:3:"URI";s:4:"name";s:7:"Disable";s:7:"isAlias";b:1;}}s:10:"AutoFormat";a:6:{s:13:"AutoParagraph";i:7;s:6:"Custom";i:9;s:14:"DisplayLinkURI";i:7;s:7:"Linkify";i:7;s:15:"PurifierLinkify";i:7;s:11:"RemoveEmpty";i:7;}s:15:"AutoFormatParam";a:1:{s:21:"PurifierLinkifyDocURL";i:1;}s:3:"CSS";a:6:{s:14:"AllowImportant";i:7;s:11:"AllowTricky";i:7;s:17:"AllowedProperties";i:-8;s:13:"DefinitionRev";i:5;s:12:"MaxImgLength";i:-1;s:11:"Proprietary";i:7;}s:5:"Cache";a:2:{s:14:"DefinitionImpl";i:-1;s:14:"SerializerPath";i:-1;}s:4:"Core";a:20:{s:15:"DefinitionCache";O:8:"stdClass":3:{s:9:"namespace";s:5:"Cache";s:4:"name";s:14:"DefinitionImpl";s:7:"isAlias";b:1;}s:17:"AggressivelyFixLt";i:7;s:13:"CollectErrors";i:7;s:13:"ColorKeywords";i:10;s:25:"ConvertDocumentToFragment";i:7;s:19:"AcceptFullDocuments";O:8:"stdClass":3:{s:9:"namespace";s:4:"Core";s:4:"name";s:25:"ConvertDocumentToFragment";s:7:"isAlias";b:1;}s:31:"DirectLexLineNumberSyncInterval";i:5;s:8:"Encoding";i:2;s:21:"EscapeInvalidChildren";i:7;s:17:"EscapeInvalidTags";i:7;s:24:"EscapeNonASCIICharacters";i:7;s:14:"HiddenElements";i:8;s:8:"Language";i:1;s:9:"LexerImpl";i:-11;s:19:"MaintainLineNumbers";i:-7;s:16:"RemoveInvalidImg";i:7;s:20:"RemoveScriptContents";i:-7;s:5:"XHTML";O:8:"stdClass":3:{s:9:"namespace";s:4:"HTML";s:4:"name";s:5:"XHTML";s:7:"isAlias";b:1;}s:21:"CommentScriptContents";O:8:"stdClass":3:{s:9:"namespace";s:6:"Output";s:4:"name";s:21:"CommentScriptContents";s:7:"isAlias";b:1;}s:10:"TidyFormat";O:8:"stdClass":3:{s:9:"namespace";s:6:"Output";s:4:"name";s:10:"TidyFormat";s:7:"isAlias";b:1;}}s:6:"Filter";a:5:{s:6:"Custom";i:9;s:18:"ExtractStyleBlocks";i:7;s:7:"YouTube";i:7;s:26:"ExtractStyleBlocksEscaping";O:8:"stdClass":3:{s:9:"namespace";s:11:"FilterParam";s:4:"name";s:26:"ExtractStyleBlocksEscaping";s:7:"isAlias";b:1;}s:23:"ExtractStyleBlocksScope";O:8:"stdClass":3:{s:9:"namespace";s:11:"FilterParam";s:4:"name";s:23:"ExtractStyleBlocksScope";s:7:"isAlias";b:1;}}s:11:"FilterParam";a:3:{s:26:"ExtractStyleBlocksEscaping";i:7;s:23:"ExtractStyleBlocksScope";i:-1;s:26:"ExtractStyleBlocksTidyImpl";i:-11;}s:4:"HTML";a:24:{s:12:"EnableAttrID";O:8:"stdClass":3:{s:9:"namespace";s:4:"Attr";s:4:"name";s:8:"EnableID";s:7:"isAlias";b:1;}s:7:"Allowed";i:-4;s:17:"AllowedAttributes";i:-8;s:15:"AllowedElements";i:-8;s:14:"AllowedModules";i:-8;s:12:"BlockWrapper";i:1;s:11:"CoreModules";i:8;s:13:"CustomDoctype";i:-1;s:12:"DefinitionID";i:-1;s:13:"DefinitionRev";i:5;s:7:"Doctype";O:8:"stdClass":3:{s:4:"type";i:1;s:10:"allow_null";b:1;s:7:"allowed";a:5:{s:22:"HTML 4.01 Transitional";b:1;s:16:"HTML 4.01 Strict";b:1;s:22:"XHTML 1.0 Transitional";b:1;s:16:"XHTML 1.0 Strict";b:1;s:9:"XHTML 1.1";b:1;}}s:19:"ForbiddenAttributes";i:8;s:17:"ForbiddenElements";i:8;s:12:"MaxImgLength";i:-5;s:6:"Parent";i:1;s:11:"Proprietary";i:7;s:9:"SafeEmbed";i:7;s:10:"SafeObject";i:7;s:6:"Strict";i:7;s:7:"TidyAdd";i:8;s:9:"TidyLevel";O:8:"stdClass":2:{s:4:"type";i:1;s:7:"allowed";a:4:{s:4:"none";b:1;s:5:"light";b:1;s:6:"medium";b:1;s:5:"heavy";b:1;}}s:10:"TidyRemove";i:8;s:7:"Trusted";i:7;s:5:"XHTML";i:7;}s:6:"Output";a:4:{s:21:"CommentScriptContents";i:7;s:7:"Newline";i:-1;s:8:"SortAttr";i:7;s:10:"TidyFormat";i:7;}s:4:"Test";a:1:{s:12:"ForceNoIconv";i:7;}s:3:"URI";a:16:{s:14:"AllowedSchemes";i:8;s:4:"Base";i:-1;s:13:"DefaultScheme";i:1;s:12:"DefinitionID";i:-1;s:13:"DefinitionRev";i:5;s:7:"Disable";i:7;s:15:"DisableExternal";i:7;s:24:"DisableExternalResources";i:7;s:16:"DisableResources";i:7;s:4:"Host";i:-1;s:13:"HostBlacklist";i:9;s:12:"MakeAbsolute";i:7;s:5:"Munge";i:-1;s:14:"MungeResources";i:7;s:14:"MungeSecretKey";i:-1;s:22:"OverrideAllowedSchemes";i:7;}}} \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedFrameTargets.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedFrameTargets.txt index 6cecfa2f00..e1d82cf900 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedFrameTargets.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedFrameTargets.txt @@ -1,11 +1,11 @@ -Attr.AllowedFrameTargets -TYPE: lookup -DEFAULT: array() ---DESCRIPTION-- -Lookup table of all allowed link frame targets. Some commonly used link -targets include _blank, _self, _parent and _top. Values should be -lowercase, as validation will be done in a case-sensitive manner despite -W3C's recommendation. XHTML 1.0 Strict does not permit the target attribute -so this directive will have no effect in that doctype. XHTML 1.1 does not -enable the Target module by default, you will have to manually enable it -(see the module documentation for more details.) +Attr.AllowedFrameTargets +TYPE: lookup +DEFAULT: array() +--DESCRIPTION-- +Lookup table of all allowed link frame targets. Some commonly used link +targets include _blank, _self, _parent and _top. Values should be +lowercase, as validation will be done in a case-sensitive manner despite +W3C's recommendation. XHTML 1.0 Strict does not permit the target attribute +so this directive will have no effect in that doctype. XHTML 1.1 does not +enable the Target module by default, you will have to manually enable it +(see the module documentation for more details.) diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedRel.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedRel.txt index 26bbc37130..e9f3febe68 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedRel.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedRel.txt @@ -1,8 +1,8 @@ -Attr.AllowedRel -TYPE: lookup -VERSION: 1.6.0 -DEFAULT: array() ---DESCRIPTION-- -List of allowed forward document relationships in the rel attribute. Common -values may be nofollow or print. By default, this is empty, meaning that no -document relationships are allowed. +Attr.AllowedRel +TYPE: lookup +VERSION: 1.6.0 +DEFAULT: array() +--DESCRIPTION-- +List of allowed forward document relationships in the rel attribute. Common +values may be nofollow or print. By default, this is empty, meaning that no +document relationships are allowed. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedRev.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedRev.txt index b007bc58ac..95cc372e1b 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedRev.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.AllowedRev.txt @@ -1,8 +1,8 @@ -Attr.AllowedRev -TYPE: lookup -VERSION: 1.6.0 -DEFAULT: array() ---DESCRIPTION-- -List of allowed reverse document relationships in the rev attribute. This -attribute is a bit of an edge-case; if you don't know what it is for, stay -away. +Attr.AllowedRev +TYPE: lookup +VERSION: 1.6.0 +DEFAULT: array() +--DESCRIPTION-- +List of allowed reverse document relationships in the rev attribute. This +attribute is a bit of an edge-case; if you don't know what it is for, stay +away. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultImageAlt.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultImageAlt.txt new file mode 100644 index 0000000000..e0be51be86 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultImageAlt.txt @@ -0,0 +1,9 @@ +Attr.DefaultImageAlt +TYPE: string/null +DEFAULT: null +--DESCRIPTION-- +This is the content of the alt tag of an image if the user had not +previously specified an alt attribute. This applies to all images without +a valid alt attribute, as opposed to %Attr.DefaultInvalidImageAlt, which +only applies to invalid images, and overrides in the case of an invalid image. +Default behavior with null is to use the basename of the src tag for the alt. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultInvalidImage.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultInvalidImage.txt index 3d5c45d4ff..ebd15275ef 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultInvalidImage.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultInvalidImage.txt @@ -1,8 +1,8 @@ -Attr.DefaultInvalidImage -TYPE: string -DEFAULT: '' ---DESCRIPTION-- -This is the default image an img tag will be pointed to if it does not have -a valid src attribute. In future versions, we may allow the image tag to -be removed completely, but due to design issues, this is not possible right -now. +Attr.DefaultInvalidImage +TYPE: string +DEFAULT: '' +--DESCRIPTION-- +This is the default image an img tag will be pointed to if it does not have +a valid src attribute. In future versions, we may allow the image tag to +be removed completely, but due to design issues, this is not possible right +now. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultInvalidImageAlt.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultInvalidImageAlt.txt index cfc9f904be..883e12e414 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultInvalidImageAlt.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultInvalidImageAlt.txt @@ -1,7 +1,7 @@ -Attr.DefaultInvalidImageAlt -TYPE: string -DEFAULT: 'Invalid image' ---DESCRIPTION-- -This is the content of the alt tag of an invalid image if the user had not -previously specified an alt attribute. It has no effect when the image is -valid but there was no alt attribute present. +Attr.DefaultInvalidImageAlt +TYPE: string +DEFAULT: 'Invalid image' +--DESCRIPTION-- +This is the content of the alt tag of an invalid image if the user had not +previously specified an alt attribute. It has no effect when the image is +valid but there was no alt attribute present. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultTextDir.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultTextDir.txt index 80296fccf4..f138c991b9 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultTextDir.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.DefaultTextDir.txt @@ -1,9 +1,9 @@ -Attr.DefaultTextDir -TYPE: string -DEFAULT: 'ltr' ---DESCRIPTION-- -Defines the default text direction (ltr or rtl) of the document being -parsed. This generally is the same as the value of the dir attribute in -HTML, or ltr if that is not specified. ---ALLOWED-- -'ltr', 'rtl' +Attr.DefaultTextDir +TYPE: string +DEFAULT: 'ltr' +--DESCRIPTION-- +Defines the default text direction (ltr or rtl) of the document being +parsed. This generally is the same as the value of the dir attribute in +HTML, or ltr if that is not specified. +--ALLOWED-- +'ltr', 'rtl' diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.EnableID.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.EnableID.txt index 358d6d07a6..8803e01f98 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.EnableID.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.EnableID.txt @@ -1,15 +1,15 @@ -Attr.EnableID -TYPE: bool -DEFAULT: false -VERSION: 1.2.0 ---DESCRIPTION-- -Allows the ID attribute in HTML. This is disabled by default due to the -fact that without proper configuration user input can easily break the -validation of a webpage by specifying an ID that is already on the -surrounding HTML. If you don't mind throwing caution to the wind, enable -this directive, but I strongly recommend you also consider blacklisting IDs -you use (%Attr.IDBlacklist) or prefixing all user supplied IDs -(%Attr.IDPrefix). When set to true HTML Purifier reverts to the behavior of -pre-1.2.0 versions. ---ALIASES-- -HTML.EnableAttrID +Attr.EnableID +TYPE: bool +DEFAULT: false +VERSION: 1.2.0 +--DESCRIPTION-- +Allows the ID attribute in HTML. This is disabled by default due to the +fact that without proper configuration user input can easily break the +validation of a webpage by specifying an ID that is already on the +surrounding HTML. If you don't mind throwing caution to the wind, enable +this directive, but I strongly recommend you also consider blacklisting IDs +you use (%Attr.IDBlacklist) or prefixing all user supplied IDs +(%Attr.IDPrefix). When set to true HTML Purifier reverts to the behavior of +pre-1.2.0 versions. +--ALIASES-- +HTML.EnableAttrID diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDBlacklist.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDBlacklist.txt index 16fc46e16b..7ab584440b 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDBlacklist.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDBlacklist.txt @@ -1,4 +1,4 @@ -Attr.IDBlacklist -TYPE: list -DEFAULT: array() -DESCRIPTION: Array of IDs not allowed in the document. +Attr.IDBlacklist +TYPE: list +DEFAULT: array() +DESCRIPTION: Array of IDs not allowed in the document. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDBlacklistRegexp.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDBlacklistRegexp.txt index 98f7c5a632..08a74a0a46 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDBlacklistRegexp.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDBlacklistRegexp.txt @@ -1,8 +1,8 @@ -Attr.IDBlacklistRegexp -TYPE: string/null -VERSION: 1.6.0 -DEFAULT: NULL ---DESCRIPTION-- -PCRE regular expression to be matched against all IDs. If the expression is -matches, the ID is rejected. Use this with care: may cause significant -degradation. ID matching is done after all other validation. +Attr.IDBlacklistRegexp +TYPE: string/null +VERSION: 1.6.0 +DEFAULT: NULL +--DESCRIPTION-- +PCRE regular expression to be matched against all IDs. If the expression is +matches, the ID is rejected. Use this with care: may cause significant +degradation. ID matching is done after all other validation. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDPrefix.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDPrefix.txt index f996c08436..362aa2424c 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDPrefix.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDPrefix.txt @@ -1,11 +1,11 @@ -Attr.IDPrefix -TYPE: string -VERSION: 1.2.0 -DEFAULT: '' ---DESCRIPTION-- -String to prefix to IDs. If you have no idea what IDs your pages may use, -you may opt to simply add a prefix to all user-submitted ID attributes so -that they are still usable, but will not conflict with core page IDs. -Example: setting the directive to 'user_' will result in a user submitted -'foo' to become 'user_foo' Be sure to set %HTML.EnableAttrID to true -before using this. +Attr.IDPrefix +TYPE: string +VERSION: 1.2.0 +DEFAULT: '' +--DESCRIPTION-- +String to prefix to IDs. If you have no idea what IDs your pages may use, +you may opt to simply add a prefix to all user-submitted ID attributes so +that they are still usable, but will not conflict with core page IDs. +Example: setting the directive to 'user_' will result in a user submitted +'foo' to become 'user_foo' Be sure to set %HTML.EnableAttrID to true +before using this. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDPrefixLocal.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDPrefixLocal.txt index b403fa9cd2..b4f6b315b2 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDPrefixLocal.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.IDPrefixLocal.txt @@ -1,13 +1,13 @@ -Attr.IDPrefixLocal -TYPE: string -VERSION: 1.2.0 -DEFAULT: '' ---DESCRIPTION-- -Temporary prefix for IDs used in conjunction with %Attr.IDPrefix. If you -need to allow multiple sets of user content on web page, you may need to -have a seperate prefix that changes with each iteration. This way, -seperately submitted user content displayed on the same page doesn't -clobber each other. Ideal values are unique identifiers for the content it -represents (i.e. the id of the row in the database). Be sure to add a -seperator (like an underscore) at the end. Warning: this directive will -not work unless %Attr.IDPrefix is set to a non-empty value! +Attr.IDPrefixLocal +TYPE: string +VERSION: 1.2.0 +DEFAULT: '' +--DESCRIPTION-- +Temporary prefix for IDs used in conjunction with %Attr.IDPrefix. If you +need to allow multiple sets of user content on web page, you may need to +have a seperate prefix that changes with each iteration. This way, +seperately submitted user content displayed on the same page doesn't +clobber each other. Ideal values are unique identifiers for the content it +represents (i.e. the id of the row in the database). Be sure to add a +seperator (like an underscore) at the end. Warning: this directive will +not work unless %Attr.IDPrefix is set to a non-empty value! diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.txt index fb18894fe9..de0591d37d 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Attr.txt @@ -1,2 +1,2 @@ -Attr -DESCRIPTION: Features regarding attribute validation. +Attr +DESCRIPTION: Features regarding attribute validation. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.AutoParagraph.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.AutoParagraph.txt index e1f7a475ec..1548c1f22d 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.AutoParagraph.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.AutoParagraph.txt @@ -1,30 +1,30 @@ -AutoFormat.AutoParagraph -TYPE: bool -VERSION: 2.0.1 -DEFAULT: false ---DESCRIPTION-- - -

- This directive turns on auto-paragraphing, where double newlines are - converted in to paragraphs whenever possible. Auto-paragraphing: -

- -

- p tags must be allowed for this directive to take effect. - We do not use br tags for paragraphing, as that is - semantically incorrect. -

-

- To prevent auto-paragraphing as a content-producer, refrain from using - double-newlines except to specify a new paragraph or in contexts where - it has special meaning (whitespace usually has no meaning except in - tags like pre, so this should not be difficult.) To prevent - the paragraphing of inline text adjacent to block elements, wrap them - in div tags (the behavior is slightly different outside of - the root node.) -

+AutoFormat.AutoParagraph +TYPE: bool +VERSION: 2.0.1 +DEFAULT: false +--DESCRIPTION-- + +

+ This directive turns on auto-paragraphing, where double newlines are + converted in to paragraphs whenever possible. Auto-paragraphing: +

+ +

+ p tags must be allowed for this directive to take effect. + We do not use br tags for paragraphing, as that is + semantically incorrect. +

+

+ To prevent auto-paragraphing as a content-producer, refrain from using + double-newlines except to specify a new paragraph or in contexts where + it has special meaning (whitespace usually has no meaning except in + tags like pre, so this should not be difficult.) To prevent + the paragraphing of inline text adjacent to block elements, wrap them + in div tags (the behavior is slightly different outside of + the root node.) +

diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.Custom.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.Custom.txt index c5a363b399..a8bd4eb116 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.Custom.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.Custom.txt @@ -1,12 +1,12 @@ -AutoFormat.Custom -TYPE: list -VERSION: 2.0.1 -DEFAULT: array() ---DESCRIPTION-- - -

- This directive can be used to add custom auto-format injectors. - Specify an array of injector names (class name minus the prefix) - or concrete implementations. Injector class must exist. -

- +AutoFormat.Custom +TYPE: list +VERSION: 2.0.1 +DEFAULT: array() +--DESCRIPTION-- + +

+ This directive can be used to add custom auto-format injectors. + Specify an array of injector names (class name minus the prefix) + or concrete implementations. Injector class must exist. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.DisplayLinkURI.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.DisplayLinkURI.txt new file mode 100644 index 0000000000..28a157ed48 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.DisplayLinkURI.txt @@ -0,0 +1,10 @@ +AutoFormat.DisplayLinkURI +TYPE: bool +VERSION: 3.2.0 +DEFAULT: false +--DESCRIPTION-- +

+ This directive turns on the in-text display of URIs in <a> tags, and disables + those links. For example, example becomes + example (http://example.com). +

diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.Linkify.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.Linkify.txt index 554086f359..6c1cd7b671 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.Linkify.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.Linkify.txt @@ -1,12 +1,12 @@ -AutoFormat.Linkify -TYPE: bool -VERSION: 2.0.1 -DEFAULT: false ---DESCRIPTION-- - -

- This directive turns on linkification, auto-linking http, ftp and - https URLs. a tags with the href attribute - must be allowed. -

- +AutoFormat.Linkify +TYPE: bool +VERSION: 2.0.1 +DEFAULT: false +--DESCRIPTION-- + +

+ This directive turns on linkification, auto-linking http, ftp and + https URLs. a tags with the href attribute + must be allowed. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.PurifierLinkify.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.PurifierLinkify.txt index c7bcaf283e..cd7f0b8eb4 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.PurifierLinkify.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.PurifierLinkify.txt @@ -1,12 +1,12 @@ -AutoFormat.PurifierLinkify -TYPE: bool -VERSION: 2.0.1 -DEFAULT: false ---DESCRIPTION-- - -

- Internal auto-formatter that converts configuration directives in - syntax %Namespace.Directive to links. a tags - with the href attribute must be allowed. -

- +AutoFormat.PurifierLinkify +TYPE: bool +VERSION: 2.0.1 +DEFAULT: false +--DESCRIPTION-- + +

+ Internal auto-formatter that converts configuration directives in + syntax %Namespace.Directive to links. a tags + with the href attribute must be allowed. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.RemoveEmpty.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.RemoveEmpty.txt new file mode 100644 index 0000000000..9edf0d6256 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.RemoveEmpty.txt @@ -0,0 +1,44 @@ +AutoFormat.RemoveEmpty +TYPE: bool +VERSION: 3.2.0 +DEFAULT: false +--DESCRIPTION-- +

+ When enabled, HTML Purifier will attempt to remove empty elements that + contribute no semantic information to the document. The following types + of nodes will be removed: +

+ +

+ Please be very careful when using this functionality; while it may not + seem that empty elements contain useful information, they can alter the + layout of a document given appropriate styling. This directive is most + useful when you are processing machine-generated HTML, please avoid using + it on regular user HTML. +

+

+ Elements that contain only whitespace will be treated as empty. Non-breaking + spaces, however, do not count as whitespace. +

+

+ This algorithm is not perfect; you may still notice some empty tags, + particularly if a node had elements, but those elements were later removed + because they were not permitted in that context, or tags that, after + being auto-closed by another tag, where empty. This is for safety reasons + to prevent clever code from breaking validation. The general rule of thumb: + if a tag looked empty on the way end, it will get removed; if HTML Purifier + made it empty, it will stay. +

diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.txt index 4a7b5521f7..c63fb69985 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormat.txt @@ -1,2 +1,2 @@ -AutoFormat -DESCRIPTION: Configuration for activating auto-formatting functionality (also known as Injectors) +AutoFormat +DESCRIPTION: Configuration for activating auto-formatting functionality (also known as Injectors) diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormatParam.PurifierLinkifyDocURL.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormatParam.PurifierLinkifyDocURL.txt index 79d3358061..11d3b4e132 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormatParam.PurifierLinkifyDocURL.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormatParam.PurifierLinkifyDocURL.txt @@ -1,12 +1,12 @@ -AutoFormatParam.PurifierLinkifyDocURL -TYPE: string -VERSION: 2.0.1 -DEFAULT: '#%s' ---DESCRIPTION-- - -

- Location of configuration documentation to link to, let %s substitute - into the configuration's namespace and directive names sans the percent - sign. -

- +AutoFormatParam.PurifierLinkifyDocURL +TYPE: string +VERSION: 2.0.1 +DEFAULT: '#%s' +--DESCRIPTION-- + +

+ Location of configuration documentation to link to, let %s substitute + into the configuration's namespace and directive names sans the percent + sign. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormatParam.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormatParam.txt index 0ed78846ed..95bffcd88f 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormatParam.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/AutoFormatParam.txt @@ -1,2 +1,2 @@ -AutoFormatParam -DESCRIPTION: Configuration for customizing auto-formatting functionality +AutoFormatParam +DESCRIPTION: Configuration for customizing auto-formatting functionality diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.DefinitionRev.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.DefinitionRev.txt index e1e5992680..5b40de2a05 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.DefinitionRev.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.DefinitionRev.txt @@ -1,11 +1,11 @@ -CSS.DefinitionRev -TYPE: int -VERSION: 2.0.0 -DEFAULT: 1 ---DESCRIPTION-- - -

- Revision identifier for your custom definition. See - %HTML.DefinitionRev for details. -

- +CSS.DefinitionRev +TYPE: int +VERSION: 2.0.0 +DEFAULT: 1 +--DESCRIPTION-- + +

+ Revision identifier for your custom definition. See + %HTML.DefinitionRev for details. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.Proprietary.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.Proprietary.txt index c9c068fc39..cc883dc578 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.Proprietary.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.Proprietary.txt @@ -1,10 +1,10 @@ -CSS.Proprietary -TYPE: bool -VERSION: 3.0.0 -DEFAULT: false ---DESCRIPTION-- - -

- Whether or not to allow safe, proprietary CSS values. -

- +CSS.Proprietary +TYPE: bool +VERSION: 3.0.0 +DEFAULT: false +--DESCRIPTION-- + +

+ Whether or not to allow safe, proprietary CSS values. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.txt index 060f5d3715..8bafc4b3c8 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/CSS.txt @@ -1,2 +1,2 @@ -CSS -DESCRIPTION: Configuration regarding allowed CSS. +CSS +DESCRIPTION: Configuration regarding allowed CSS. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.DefinitionImpl.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.DefinitionImpl.txt index e9fa699ae8..192db1f9e6 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.DefinitionImpl.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.DefinitionImpl.txt @@ -1,13 +1,13 @@ -Cache.DefinitionImpl -TYPE: string/null -VERSION: 2.0.0 -DEFAULT: 'Serializer' ---DESCRIPTION-- - -This directive defines which method to use when caching definitions, -the complex data-type that makes HTML Purifier tick. Set to null -to disable caching (not recommended, as you will see a definite -performance degradation). - ---ALIASES-- -Core.DefinitionCache +Cache.DefinitionImpl +TYPE: string/null +VERSION: 2.0.0 +DEFAULT: 'Serializer' +--DESCRIPTION-- + +This directive defines which method to use when caching definitions, +the complex data-type that makes HTML Purifier tick. Set to null +to disable caching (not recommended, as you will see a definite +performance degradation). + +--ALIASES-- +Core.DefinitionCache diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.SerializerPath.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.SerializerPath.txt index 3682cbb37e..600c567ecf 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.SerializerPath.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.SerializerPath.txt @@ -1,13 +1,13 @@ -Cache.SerializerPath -TYPE: string/null -VERSION: 2.0.0 -DEFAULT: NULL ---DESCRIPTION-- - -

- Absolute path with no trailing slash to store serialized definitions in. - Default is within the - HTML Purifier library inside DefinitionCache/Serializer. This - path must be writable by the webserver. -

- +Cache.SerializerPath +TYPE: string/null +VERSION: 2.0.0 +DEFAULT: NULL +--DESCRIPTION-- + +

+ Absolute path with no trailing slash to store serialized definitions in. + Default is within the + HTML Purifier library inside DefinitionCache/Serializer. This + path must be writable by the webserver. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.txt index 2f7aaa268e..6edaae0c47 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Cache.txt @@ -1,2 +1,2 @@ -Cache -DESCRIPTION: Configuration for DefinitionCache and related subclasses. +Cache +DESCRIPTION: Configuration for DefinitionCache and related subclasses. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.AggressivelyFixLt.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.AggressivelyFixLt.txt index 0d60b89f6b..7fdca4924a 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.AggressivelyFixLt.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.AggressivelyFixLt.txt @@ -1,13 +1,17 @@ -Core.AggressivelyFixLt -TYPE: bool -VERSION: 2.1.0 -DEFAULT: false ---DESCRIPTION-- - -This directive enables aggressive pre-filter fixes HTML Purifier can -perform in order to ensure that open angled-brackets do not get killed -during parsing stage. Enabling this will result in two preg_replace_callback -calls and one preg_replace call for every bit of HTML passed through here. -It is not necessary and will have no effect for PHP 4. - - +Core.AggressivelyFixLt +TYPE: bool +VERSION: 2.1.0 +DEFAULT: true +--DESCRIPTION-- +

+ This directive enables aggressive pre-filter fixes HTML Purifier can + perform in order to ensure that open angled-brackets do not get killed + during parsing stage. Enabling this will result in two preg_replace_callback + calls and at least two preg_replace calls for every HTML document parsed; + if your users make very well-formed HTML, you can set this directive false. + This has no effect when DirectLex is used. +

+

+ Notice: This directive's default turned from false to true + in HTML Purifier 3.2.0. +

\ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.CollectErrors.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.CollectErrors.txt index dcf20563d2..ccaf2ae3d9 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.CollectErrors.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.CollectErrors.txt @@ -1,11 +1,11 @@ -Core.CollectErrors -TYPE: bool -VERSION: 2.0.0 -DEFAULT: false ---DESCRIPTION-- - -Whether or not to collect errors found while filtering the document. This -is a useful way to give feedback to your users. Warning: -Currently this feature is very patchy and experimental, with lots of -possible error messages not yet implemented. It will not cause any -problems, but it may not help your users either. +Core.CollectErrors +TYPE: bool +VERSION: 2.0.0 +DEFAULT: false +--DESCRIPTION-- + +Whether or not to collect errors found while filtering the document. This +is a useful way to give feedback to your users. Warning: +Currently this feature is very patchy and experimental, with lots of +possible error messages not yet implemented. It will not cause any +problems, but it may not help your users either. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt index 28c9682107..6ec36dbbcd 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt @@ -1,29 +1,29 @@ -Core.ColorKeywords -TYPE: hash -VERSION: 2.0.0 ---DEFAULT-- -array ( - 'maroon' => '#800000', - 'red' => '#FF0000', - 'orange' => '#FFA500', - 'yellow' => '#FFFF00', - 'olive' => '#808000', - 'purple' => '#800080', - 'fuchsia' => '#FF00FF', - 'white' => '#FFFFFF', - 'lime' => '#00FF00', - 'green' => '#008000', - 'navy' => '#000080', - 'blue' => '#0000FF', - 'aqua' => '#00FFFF', - 'teal' => '#008080', - 'black' => '#000000', - 'silver' => '#C0C0C0', - 'gray' => '#808080', -) ---DESCRIPTION-- - -Lookup array of color names to six digit hexadecimal number corresponding -to color, with preceding hash mark. Used when parsing colors. - - +Core.ColorKeywords +TYPE: hash +VERSION: 2.0.0 +--DEFAULT-- +array ( + 'maroon' => '#800000', + 'red' => '#FF0000', + 'orange' => '#FFA500', + 'yellow' => '#FFFF00', + 'olive' => '#808000', + 'purple' => '#800080', + 'fuchsia' => '#FF00FF', + 'white' => '#FFFFFF', + 'lime' => '#00FF00', + 'green' => '#008000', + 'navy' => '#000080', + 'blue' => '#0000FF', + 'aqua' => '#00FFFF', + 'teal' => '#008080', + 'black' => '#000000', + 'silver' => '#C0C0C0', + 'gray' => '#808080', +) +--DESCRIPTION-- + +Lookup array of color names to six digit hexadecimal number corresponding +to color, with preceding hash mark. Used when parsing colors. + + diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ConvertDocumentToFragment.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ConvertDocumentToFragment.txt index 0f03d3aada..3f832c1046 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ConvertDocumentToFragment.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.ConvertDocumentToFragment.txt @@ -1,13 +1,13 @@ -Core.ConvertDocumentToFragment -TYPE: bool -DEFAULT: true ---DESCRIPTION-- - -This parameter determines whether or not the filter should convert -input that is a full document with html and body tags to a fragment -of just the contents of a body tag. This parameter is simply something -HTML Purifier can do during an edge-case: for most inputs, this -processing is not necessary. - ---ALIASES-- -Core.AcceptFullDocuments +Core.ConvertDocumentToFragment +TYPE: bool +DEFAULT: true +--DESCRIPTION-- + +This parameter determines whether or not the filter should convert +input that is a full document with html and body tags to a fragment +of just the contents of a body tag. This parameter is simply something +HTML Purifier can do during an edge-case: for most inputs, this +processing is not necessary. + +--ALIASES-- +Core.AcceptFullDocuments diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.DirectLexLineNumberSyncInterval.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.DirectLexLineNumberSyncInterval.txt index 392adb5a18..1ecc663787 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.DirectLexLineNumberSyncInterval.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.DirectLexLineNumberSyncInterval.txt @@ -1,17 +1,17 @@ -Core.DirectLexLineNumberSyncInterval -TYPE: int -VERSION: 2.0.0 -DEFAULT: 0 ---DESCRIPTION-- - -

- Specifies the number of tokens the DirectLex line number tracking - implementations should process before attempting to resyncronize the - current line count by manually counting all previous new-lines. When - at 0, this functionality is disabled. Lower values will decrease - performance, and this is only strictly necessary if the counting - algorithm is buggy (in which case you should report it as a bug). - This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is - not being used. -

- +Core.DirectLexLineNumberSyncInterval +TYPE: int +VERSION: 2.0.0 +DEFAULT: 0 +--DESCRIPTION-- + +

+ Specifies the number of tokens the DirectLex line number tracking + implementations should process before attempting to resyncronize the + current line count by manually counting all previous new-lines. When + at 0, this functionality is disabled. Lower values will decrease + performance, and this is only strictly necessary if the counting + algorithm is buggy (in which case you should report it as a bug). + This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is + not being used. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.Encoding.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.Encoding.txt index 9bca95c7c7..ab75dbd995 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.Encoding.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.Encoding.txt @@ -1,14 +1,14 @@ -Core.Encoding -TYPE: istring -DEFAULT: 'utf-8' ---DESCRIPTION-- -If for some reason you are unable to convert all webpages to UTF-8, you can -use this directive as a stop-gap compatibility change to let HTML Purifier -deal with non UTF-8 input. This technique has notable deficiencies: -absolutely no characters outside of the selected character encoding will be -preserved, not even the ones that have been ampersand escaped (this is due -to a UTF-8 specific feature that automatically resolves all -entities), making it pretty useless for anything except the most I18N-blind -applications, although %Core.EscapeNonASCIICharacters offers fixes this -trouble with another tradeoff. This directive only accepts ISO-8859-1 if -iconv is not enabled. +Core.Encoding +TYPE: istring +DEFAULT: 'utf-8' +--DESCRIPTION-- +If for some reason you are unable to convert all webpages to UTF-8, you can +use this directive as a stop-gap compatibility change to let HTML Purifier +deal with non UTF-8 input. This technique has notable deficiencies: +absolutely no characters outside of the selected character encoding will be +preserved, not even the ones that have been ampersand escaped (this is due +to a UTF-8 specific feature that automatically resolves all +entities), making it pretty useless for anything except the most I18N-blind +applications, although %Core.EscapeNonASCIICharacters offers fixes this +trouble with another tradeoff. This directive only accepts ISO-8859-1 if +iconv is not enabled. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidChildren.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidChildren.txt index cca96c6018..554d6a249b 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidChildren.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidChildren.txt @@ -1,9 +1,9 @@ -Core.EscapeInvalidChildren -TYPE: bool -DEFAULT: false ---DESCRIPTION-- -When true, a child is found that is not allowed in the context of the -parent element will be transformed into text as if it were ASCII. When -false, that element and all internal tags will be dropped, though text will -be preserved. There is no option for dropping the element but preserving -child nodes. +Core.EscapeInvalidChildren +TYPE: bool +DEFAULT: false +--DESCRIPTION-- +When true, a child is found that is not allowed in the context of the +parent element will be transformed into text as if it were ASCII. When +false, that element and all internal tags will be dropped, though text will +be preserved. There is no option for dropping the element but preserving +child nodes. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidTags.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidTags.txt index 6e02a19bb4..779a573ece 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidTags.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidTags.txt @@ -1,6 +1,6 @@ -Core.EscapeInvalidTags -TYPE: bool -DEFAULT: false ---DESCRIPTION-- -When true, invalid tags will be written back to the document as plain text. -Otherwise, they are silently dropped. +Core.EscapeInvalidTags +TYPE: bool +DEFAULT: false +--DESCRIPTION-- +When true, invalid tags will be written back to the document as plain text. +Otherwise, they are silently dropped. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeNonASCIICharacters.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeNonASCIICharacters.txt index f8b7d38bfb..93b8a4b203 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeNonASCIICharacters.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.EscapeNonASCIICharacters.txt @@ -1,12 +1,12 @@ -Core.EscapeNonASCIICharacters -TYPE: bool -VERSION: 1.4.0 -DEFAULT: false ---DESCRIPTION-- -This directive overcomes a deficiency in %Core.Encoding by blindly -converting all non-ASCII characters into decimal numeric entities before -converting it to its native encoding. This means that even characters that -can be expressed in the non-UTF-8 encoding will be entity-ized, which can -be a real downer for encodings like Big5. It also assumes that the ASCII -repetoire is available, although this is the case for almost all encodings. -Anyway, use UTF-8! +Core.EscapeNonASCIICharacters +TYPE: bool +VERSION: 1.4.0 +DEFAULT: false +--DESCRIPTION-- +This directive overcomes a deficiency in %Core.Encoding by blindly +converting all non-ASCII characters into decimal numeric entities before +converting it to its native encoding. This means that even characters that +can be expressed in the non-UTF-8 encoding will be entity-ized, which can +be a real downer for encodings like Big5. It also assumes that the ASCII +repetoire is available, although this is the case for almost all encodings. +Anyway, use UTF-8! diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.HiddenElements.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.HiddenElements.txt index 0a86d96a02..f317221c8b 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.HiddenElements.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.HiddenElements.txt @@ -1,19 +1,19 @@ -Core.HiddenElements -TYPE: lookup ---DEFAULT-- -array ( - 'script' => true, - 'style' => true, -) ---DESCRIPTION-- - -

- This directive is a lookup array of elements which should have their - contents removed when they are not allowed by the HTML definition. - For example, the contents of a script tag are not - normally shown in a document, so if script tags are to be removed, - their contents should be removed to. This is opposed to a b - tag, which defines some presentational changes but does not hide its - contents. -

- +Core.HiddenElements +TYPE: lookup +--DEFAULT-- +array ( + 'script' => true, + 'style' => true, +) +--DESCRIPTION-- + +

+ This directive is a lookup array of elements which should have their + contents removed when they are not allowed by the HTML definition. + For example, the contents of a script tag are not + normally shown in a document, so if script tags are to be removed, + their contents should be removed to. This is opposed to a b + tag, which defines some presentational changes but does not hide its + contents. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.Language.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.Language.txt index 1be60037fc..38d7e661de 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.Language.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.Language.txt @@ -1,11 +1,11 @@ -Core.Language -TYPE: string -VERSION: 2.0.0 -DEFAULT: 'en' ---DESCRIPTION-- - -ISO 639 language code for localizable things in HTML Purifier to use, -which is mainly error reporting. There is currently only an English (en) -translation, so this directive is currently useless. - - +Core.Language +TYPE: string +VERSION: 2.0.0 +DEFAULT: 'en' +--DESCRIPTION-- + +ISO 639 language code for localizable things in HTML Purifier to use, +which is mainly error reporting. There is currently only an English (en) +translation, so this directive is currently useless. + + diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.LexerImpl.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.LexerImpl.txt index 62125a4710..c32a6f6517 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.LexerImpl.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.LexerImpl.txt @@ -1,33 +1,33 @@ -Core.LexerImpl -TYPE: mixed/null -VERSION: 2.0.0 -DEFAULT: NULL ---DESCRIPTION-- - -

- This parameter determines what lexer implementation can be used. The - valid values are: -

-
-
null
-
- Recommended, the lexer implementation will be auto-detected based on - your PHP-version and configuration. -
-
string lexer identifier
-
- This is a slim way of manually overridding the implementation. - Currently recognized values are: DOMLex (the default PHP5 -implementation) - and DirectLex (the default PHP4 implementation). Only use this if - you know what you are doing: usually, the auto-detection will - manage things for cases you aren't even aware of. -
-
object lexer instance
-
- Super-advanced: you can specify your own, custom, implementation that - implements the interface defined by HTMLPurifier_Lexer. - I may remove this option simply because I don't expect anyone - to use it. -
-
+Core.LexerImpl +TYPE: mixed/null +VERSION: 2.0.0 +DEFAULT: NULL +--DESCRIPTION-- + +

+ This parameter determines what lexer implementation can be used. The + valid values are: +

+
+
null
+
+ Recommended, the lexer implementation will be auto-detected based on + your PHP-version and configuration. +
+
string lexer identifier
+
+ This is a slim way of manually overridding the implementation. + Currently recognized values are: DOMLex (the default PHP5 +implementation) + and DirectLex (the default PHP4 implementation). Only use this if + you know what you are doing: usually, the auto-detection will + manage things for cases you aren't even aware of. +
+
object lexer instance
+
+ Super-advanced: you can specify your own, custom, implementation that + implements the interface defined by HTMLPurifier_Lexer. + I may remove this option simply because I don't expect anyone + to use it. +
+
diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.MaintainLineNumbers.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.MaintainLineNumbers.txt index de10208f85..00fa755020 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.MaintainLineNumbers.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.MaintainLineNumbers.txt @@ -1,16 +1,16 @@ -Core.MaintainLineNumbers -TYPE: bool/null -VERSION: 2.0.0 -DEFAULT: NULL ---DESCRIPTION-- - -

- If true, HTML Purifier will add line number information to all tokens. - This is useful when error reporting is turned on, but can result in - significant performance degradation and should not be used when - unnecessary. This directive must be used with the DirectLex lexer, - as the DOMLex lexer does not (yet) support this functionality. - If the value is null, an appropriate value will be selected based - on other configuration. -

- +Core.MaintainLineNumbers +TYPE: bool/null +VERSION: 2.0.0 +DEFAULT: NULL +--DESCRIPTION-- + +

+ If true, HTML Purifier will add line number information to all tokens. + This is useful when error reporting is turned on, but can result in + significant performance degradation and should not be used when + unnecessary. This directive must be used with the DirectLex lexer, + as the DOMLex lexer does not (yet) support this functionality. + If the value is null, an appropriate value will be selected based + on other configuration. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.RemoveInvalidImg.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.RemoveInvalidImg.txt index 7ac86b3169..3b5c8fd0cd 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.RemoveInvalidImg.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.RemoveInvalidImg.txt @@ -1,12 +1,12 @@ -Core.RemoveInvalidImg -TYPE: bool -DEFAULT: true -VERSION: 1.3.0 ---DESCRIPTION-- - -

- This directive enables pre-emptive URI checking in img - tags, as the attribute validation strategy is not authorized to - remove elements from the document. Revert to pre-1.3.0 behavior by setting to false. -

- +Core.RemoveInvalidImg +TYPE: bool +DEFAULT: true +VERSION: 1.3.0 +--DESCRIPTION-- + +

+ This directive enables pre-emptive URI checking in img + tags, as the attribute validation strategy is not authorized to + remove elements from the document. Revert to pre-1.3.0 behavior by setting to false. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.RemoveScriptContents.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.RemoveScriptContents.txt index 531718ba7b..1d0e9779ef 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.RemoveScriptContents.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.RemoveScriptContents.txt @@ -1,11 +1,11 @@ -Core.RemoveScriptContents -TYPE: bool/null -DEFAULT: NULL -VERSION: 2.0.0 -DEPRECATED-VERSION: 2.1.0 -DEPRECATED-USE: Core.HiddenElements ---DESCRIPTION-- -

- This directive enables HTML Purifier to remove not only script tags - but all of their contents. -

+Core.RemoveScriptContents +TYPE: bool/null +DEFAULT: NULL +VERSION: 2.0.0 +DEPRECATED-VERSION: 2.1.0 +DEPRECATED-USE: Core.HiddenElements +--DESCRIPTION-- +

+ This directive enables HTML Purifier to remove not only script tags + but all of their contents. +

diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.txt index 3240014d19..296982a5df 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Core.txt @@ -1,2 +1,2 @@ -Core -DESCRIPTION: Core features that are always available. +Core +DESCRIPTION: Core features that are always available. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Filter.ExtractStyleBlocks.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Filter.ExtractStyleBlocks.txt index 36d11e9594..aaa7416351 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Filter.ExtractStyleBlocks.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Filter.ExtractStyleBlocks.txt @@ -14,13 +14,49 @@ EXTERNAL: CSSTidy

Sample usage:

-
set('Filter', 'ExtractStyleBlocks', true);
-$purifier = new HTMLPurifier($config);
-$styles = $purifier->context->get('StyleBlocks');
-foreach ($styles as $style) {
-    echo '\n";
-}]]>
+
';
+?>
+
+
+
+  Filter.ExtractStyleBlocks
+body {color:#F00;} Some text';
+
+    $config = HTMLPurifier_Config::createDefault();
+    $config->set('Filter', 'ExtractStyleBlocks', true);
+    $purifier = new HTMLPurifier($config);
+    
+    $html = $purifier->purify($dirty);
+    
+    // This implementation writes the stylesheets to the styles/ directory.
+    // You can also echo the styles inside the document, but it's a bit
+    // more difficult to make sure they get interpreted properly by
+    // browsers; try the usual CSS armoring techniques.
+    $styles = $purifier->context->get('StyleBlocks');
+    $dir = 'styles/';
+    if (!is_dir($dir)) mkdir($dir);
+    $hash = sha1($_GET['html']);
+    foreach ($styles as $i => $style) {
+        file_put_contents($name = $dir . $hash . "_$i");
+        echo '';
+    }
+?>
+
+
+  
+ +
+ + +]]>

Warning: It is possible for a user to mount an imagecrash attack using this CSS. Counter-measures are difficult; diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Filter.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Filter.txt index 9fad43a8fa..cf5f89cdc0 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Filter.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Filter.txt @@ -1,2 +1,2 @@ -Filter -DESCRIPTION: Directives for turning filters on and off, or specifying custom filters. +Filter +DESCRIPTION: Directives for turning filters on and off, or specifying custom filters. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/FilterParam.ExtractStyleBlocksEscaping.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/FilterParam.ExtractStyleBlocksEscaping.txt index d20010c7a6..96ac5e5d1c 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/FilterParam.ExtractStyleBlocksEscaping.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/FilterParam.ExtractStyleBlocksEscaping.txt @@ -1,14 +1,14 @@ -FilterParam.ExtractStyleBlocksEscaping -TYPE: bool -VERSION: 3.0.0 -DEFAULT: true -ALIASES: Filter.ExtractStyleBlocksEscaping ---DESCRIPTION-- - -

- Whether or not to escape the dangerous characters <, > and & - as \3C, \3E and \26, respectively. This is can be safely set to false - if the contents of StyleBlocks will be placed in an external stylesheet, - where there is no risk of it being interpreted as HTML. -

- +FilterParam.ExtractStyleBlocksEscaping +TYPE: bool +VERSION: 3.0.0 +DEFAULT: true +ALIASES: Filter.ExtractStyleBlocksEscaping +--DESCRIPTION-- + +

+ Whether or not to escape the dangerous characters <, > and & + as \3C, \3E and \26, respectively. This is can be safely set to false + if the contents of StyleBlocks will be placed in an external stylesheet, + where there is no risk of it being interpreted as HTML. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/FilterParam.ExtractStyleBlocksScope.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/FilterParam.ExtractStyleBlocksScope.txt index ec29078d85..366df44563 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/FilterParam.ExtractStyleBlocksScope.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/FilterParam.ExtractStyleBlocksScope.txt @@ -1,28 +1,28 @@ -FilterParam.ExtractStyleBlocksScope -TYPE: string/null -VERSION: 3.0.0 -DEFAULT: NULL -ALIASES: Filter.ExtractStyleBlocksScope ---DESCRIPTION-- - -

- If you would like users to be able to define external stylesheets, but - only allow them to specify CSS declarations for a specific node and - prevent them from fiddling with other elements, use this directive. - It accepts any valid CSS selector, and will prepend this to any - CSS declaration extracted from the document. For example, if this - directive is set to #user-content and a user uses the - selector a:hover, the final selector will be - #user-content a:hover. -

-

- The comma shorthand may be used; consider the above example, with - #user-content, #user-content2, the final selector will - be #user-content a:hover, #user-content2 a:hover. -

-

- Warning: It is possible for users to bypass this measure - using a naughty + selector. This is a bug in CSS Tidy 1.3, not HTML - Purifier, and I am working to get it fixed. Until then, HTML Purifier - performs a basic check to prevent this. -

+FilterParam.ExtractStyleBlocksScope +TYPE: string/null +VERSION: 3.0.0 +DEFAULT: NULL +ALIASES: Filter.ExtractStyleBlocksScope +--DESCRIPTION-- + +

+ If you would like users to be able to define external stylesheets, but + only allow them to specify CSS declarations for a specific node and + prevent them from fiddling with other elements, use this directive. + It accepts any valid CSS selector, and will prepend this to any + CSS declaration extracted from the document. For example, if this + directive is set to #user-content and a user uses the + selector a:hover, the final selector will be + #user-content a:hover. +

+

+ The comma shorthand may be used; consider the above example, with + #user-content, #user-content2, the final selector will + be #user-content a:hover, #user-content2 a:hover. +

+

+ Warning: It is possible for users to bypass this measure + using a naughty + selector. This is a bug in CSS Tidy 1.3, not HTML + Purifier, and I am working to get it fixed. Until then, HTML Purifier + performs a basic check to prevent this. +

diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Allowed.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Allowed.txt index 9329eb1544..5fcd12441c 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Allowed.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Allowed.txt @@ -1,22 +1,22 @@ -HTML.Allowed -TYPE: itext/null -VERSION: 2.0.0 -DEFAULT: NULL ---DESCRIPTION-- - -

- This is a convenience directive that rolls the functionality of - %HTML.AllowedElements and %HTML.AllowedAttributes into one directive. - Specify elements and attributes that are allowed using: - element1[attr1|attr2],element2.... You can also use - newlines instead of commas to separate elements. -

-

- Warning: - All of the constraints on the component directives are still enforced. - The syntax is a subset of TinyMCE's valid_elements - whitelist: directly copy-pasting it here will probably result in - broken whitelists. If %HTML.AllowedElements or %HTML.AllowedAttributes - are set, this directive has no effect. -

- +HTML.Allowed +TYPE: itext/null +VERSION: 2.0.0 +DEFAULT: NULL +--DESCRIPTION-- + +

+ This is a convenience directive that rolls the functionality of + %HTML.AllowedElements and %HTML.AllowedAttributes into one directive. + Specify elements and attributes that are allowed using: + element1[attr1|attr2],element2.... You can also use + newlines instead of commas to separate elements. +

+

+ Warning: + All of the constraints on the component directives are still enforced. + The syntax is a subset of TinyMCE's valid_elements + whitelist: directly copy-pasting it here will probably result in + broken whitelists. If %HTML.AllowedElements or %HTML.AllowedAttributes + are set, this directive has no effect. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedAttributes.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedAttributes.txt index 6ff12fc5fa..88e696e812 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedAttributes.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedAttributes.txt @@ -1,19 +1,19 @@ -HTML.AllowedAttributes -TYPE: lookup/null -VERSION: 1.3.0 -DEFAULT: NULL ---DESCRIPTION-- - -

- If HTML Purifier's attribute set is unsatisfactory, overload it! - The syntax is "tag.attr" or "*.attr" for the global attributes - (style, id, class, dir, lang, xml:lang). -

-

- Warning: If another directive conflicts with the - elements here, that directive will win and override. For - example, %HTML.EnableAttrID will take precedence over *.id in this - directive. You must set that directive to true before you can use - IDs at all. -

- +HTML.AllowedAttributes +TYPE: lookup/null +VERSION: 1.3.0 +DEFAULT: NULL +--DESCRIPTION-- + +

+ If HTML Purifier's attribute set is unsatisfactory, overload it! + The syntax is "tag.attr" or "*.attr" for the global attributes + (style, id, class, dir, lang, xml:lang). +

+

+ Warning: If another directive conflicts with the + elements here, that directive will win and override. For + example, %HTML.EnableAttrID will take precedence over *.id in this + directive. You must set that directive to true before you can use + IDs at all. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedElements.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedElements.txt index 031a300678..b667b3da8b 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedElements.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedElements.txt @@ -1,18 +1,18 @@ -HTML.AllowedElements -TYPE: lookup/null -VERSION: 1.3.0 -DEFAULT: NULL ---DESCRIPTION-- -

- If HTML Purifier's tag set is unsatisfactory for your needs, you - can overload it with your own list of tags to allow. Note that this - method is subtractive: it does its job by taking away from HTML Purifier - usual feature set, so you cannot add a tag that HTML Purifier never - supported in the first place (like embed, form or head). If you - change this, you probably also want to change %HTML.AllowedAttributes. -

-

- Warning: If another directive conflicts with the - elements here, that directive will win and override. -

- +HTML.AllowedElements +TYPE: lookup/null +VERSION: 1.3.0 +DEFAULT: NULL +--DESCRIPTION-- +

+ If HTML Purifier's tag set is unsatisfactory for your needs, you + can overload it with your own list of tags to allow. Note that this + method is subtractive: it does its job by taking away from HTML Purifier + usual feature set, so you cannot add a tag that HTML Purifier never + supported in the first place (like embed, form or head). If you + change this, you probably also want to change %HTML.AllowedAttributes. +

+

+ Warning: If another directive conflicts with the + elements here, that directive will win and override. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedModules.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedModules.txt index 7b8367e1b8..85e39f7647 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedModules.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.AllowedModules.txt @@ -1,20 +1,20 @@ -HTML.AllowedModules -TYPE: lookup/null -VERSION: 2.0.0 -DEFAULT: NULL ---DESCRIPTION-- - -

- A doctype comes with a set of usual modules to use. Without having - to mucking about with the doctypes, you can quickly activate or - disable these modules by specifying which modules you wish to allow - with this directive. This is most useful for unit testing specific - modules, although end users may find it useful for their own ends. -

-

- If you specify a module that does not exist, the manager will silently - fail to use it, so be careful! User-defined modules are not affected - by this directive. Modules defined in %HTML.CoreModules are not - affected by this directive. -

- +HTML.AllowedModules +TYPE: lookup/null +VERSION: 2.0.0 +DEFAULT: NULL +--DESCRIPTION-- + +

+ A doctype comes with a set of usual modules to use. Without having + to mucking about with the doctypes, you can quickly activate or + disable these modules by specifying which modules you wish to allow + with this directive. This is most useful for unit testing specific + modules, although end users may find it useful for their own ends. +

+

+ If you specify a module that does not exist, the manager will silently + fail to use it, so be careful! User-defined modules are not affected + by this directive. Modules defined in %HTML.CoreModules are not + affected by this directive. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.BlockWrapper.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.BlockWrapper.txt index 47fe1433f4..e12edb6414 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.BlockWrapper.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.BlockWrapper.txt @@ -1,18 +1,18 @@ -HTML.BlockWrapper -TYPE: string -VERSION: 1.3.0 -DEFAULT: 'p' ---DESCRIPTION-- - -

- String name of element to wrap inline elements that are inside a block - context. This only occurs in the children of blockquote in strict mode. -

-

- Example: by default value, - <blockquote>Foo</blockquote> would become - <blockquote><p>Foo</p></blockquote>. - The <p> tags can be replaced with whatever you desire, - as long as it is a block level element. -

- +HTML.BlockWrapper +TYPE: string +VERSION: 1.3.0 +DEFAULT: 'p' +--DESCRIPTION-- + +

+ String name of element to wrap inline elements that are inside a block + context. This only occurs in the children of blockquote in strict mode. +

+

+ Example: by default value, + <blockquote>Foo</blockquote> would become + <blockquote><p>Foo</p></blockquote>. + The <p> tags can be replaced with whatever you desire, + as long as it is a block level element. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.CoreModules.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.CoreModules.txt index 78bffdbe08..c629f66b45 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.CoreModules.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.CoreModules.txt @@ -1,23 +1,23 @@ -HTML.CoreModules -TYPE: lookup -VERSION: 2.0.0 ---DEFAULT-- -array ( - 'Structure' => true, - 'Text' => true, - 'Hypertext' => true, - 'List' => true, - 'NonXMLCommonAttributes' => true, - 'XMLCommonAttributes' => true, - 'CommonAttributes' => true, -) ---DESCRIPTION-- - -

- Certain modularized doctypes (XHTML, namely), have certain modules - that must be included for the doctype to be an conforming document - type: put those modules here. By default, XHTML's core modules - are used. You can set this to a blank array to disable core module - protection, but this is not recommended. -

- +HTML.CoreModules +TYPE: lookup +VERSION: 2.0.0 +--DEFAULT-- +array ( + 'Structure' => true, + 'Text' => true, + 'Hypertext' => true, + 'List' => true, + 'NonXMLCommonAttributes' => true, + 'XMLCommonAttributes' => true, + 'CommonAttributes' => true, +) +--DESCRIPTION-- + +

+ Certain modularized doctypes (XHTML, namely), have certain modules + that must be included for the doctype to be an conforming document + type: put those modules here. By default, XHTML's core modules + are used. You can set this to a blank array to disable core module + protection, but this is not recommended. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.CustomDoctype.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.CustomDoctype.txt index 4b6d39cfbf..ee410ff07c 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.CustomDoctype.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.CustomDoctype.txt @@ -1,10 +1,10 @@ -HTML.CustomDoctype -TYPE: string/null -VERSION: 2.0.1 -DEFAULT: NULL ---DESCRIPTION-- - -A custom doctype for power-users who defined there own document -type. This directive only applies when %HTML.Doctype is blank. - - +HTML.CustomDoctype +TYPE: string/null +VERSION: 2.0.1 +DEFAULT: NULL +--DESCRIPTION-- + +A custom doctype for power-users who defined there own document +type. This directive only applies when %HTML.Doctype is blank. + + diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.DefinitionID.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.DefinitionID.txt index 07f6b67cf9..36a3e9dab4 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.DefinitionID.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.DefinitionID.txt @@ -1,33 +1,33 @@ -HTML.DefinitionID -TYPE: string/null -DEFAULT: NULL -VERSION: 2.0.0 ---DESCRIPTION-- - -

- Unique identifier for a custom-built HTML definition. If you edit - the raw version of the HTMLDefinition, introducing changes that the - configuration object does not reflect, you must specify this variable. - If you change your custom edits, you should change this directive, or - clear your cache. Example: -

-
-$config = HTMLPurifier_Config::createDefault();
-$config->set('HTML', 'DefinitionID', '1');
-$def = $config->getHTMLDefinition();
-$def->addAttribute('a', 'tabindex', 'Number');
-
-

- In the above example, the configuration is still at the defaults, but - using the advanced API, an extra attribute has been added. The - configuration object normally has no way of knowing that this change - has taken place, so it needs an extra directive: %HTML.DefinitionID. - If someone else attempts to use the default configuration, these two - pieces of code will not clobber each other in the cache, since one has - an extra directive attached to it. -

-

- You must specify a value to this directive to use the - advanced API features. -

- +HTML.DefinitionID +TYPE: string/null +DEFAULT: NULL +VERSION: 2.0.0 +--DESCRIPTION-- + +

+ Unique identifier for a custom-built HTML definition. If you edit + the raw version of the HTMLDefinition, introducing changes that the + configuration object does not reflect, you must specify this variable. + If you change your custom edits, you should change this directive, or + clear your cache. Example: +

+
+$config = HTMLPurifier_Config::createDefault();
+$config->set('HTML', 'DefinitionID', '1');
+$def = $config->getHTMLDefinition();
+$def->addAttribute('a', 'tabindex', 'Number');
+
+

+ In the above example, the configuration is still at the defaults, but + using the advanced API, an extra attribute has been added. The + configuration object normally has no way of knowing that this change + has taken place, so it needs an extra directive: %HTML.DefinitionID. + If someone else attempts to use the default configuration, these two + pieces of code will not clobber each other in the cache, since one has + an extra directive attached to it. +

+

+ You must specify a value to this directive to use the + advanced API features. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.DefinitionRev.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.DefinitionRev.txt index dfee8e7741..bc084fa37f 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.DefinitionRev.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.DefinitionRev.txt @@ -1,16 +1,16 @@ -HTML.DefinitionRev -TYPE: int -VERSION: 2.0.0 -DEFAULT: 1 ---DESCRIPTION-- - -

- Revision identifier for your custom definition specified in - %HTML.DefinitionID. This serves the same purpose: uniquely identifying - your custom definition, but this one does so in a chronological - context: revision 3 is more up-to-date then revision 2. Thus, when - this gets incremented, the cache handling is smart enough to clean - up any older revisions of your definition as well as flush the - cache. -

- +HTML.DefinitionRev +TYPE: int +VERSION: 2.0.0 +DEFAULT: 1 +--DESCRIPTION-- + +

+ Revision identifier for your custom definition specified in + %HTML.DefinitionID. This serves the same purpose: uniquely identifying + your custom definition, but this one does so in a chronological + context: revision 3 is more up-to-date then revision 2. Thus, when + this gets incremented, the cache handling is smart enough to clean + up any older revisions of your definition as well as flush the + cache. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Doctype.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Doctype.txt index 1c58e2a371..051cf20a62 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Doctype.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Doctype.txt @@ -1,10 +1,10 @@ -HTML.Doctype -TYPE: string/null -DEFAULT: NULL ---DESCRIPTION-- -Doctype to use during filtering. Technically speaking this is not actually -a doctype (as it does not identify a corresponding DTD), but we are using -this name for sake of simplicity. When non-blank, this will override any -older directives like %HTML.XHTML or %HTML.Strict. ---ALLOWED-- -'HTML 4.01 Transitional', 'HTML 4.01 Strict', 'XHTML 1.0 Transitional', 'XHTML 1.0 Strict', 'XHTML 1.1' +HTML.Doctype +TYPE: string/null +DEFAULT: NULL +--DESCRIPTION-- +Doctype to use during filtering. Technically speaking this is not actually +a doctype (as it does not identify a corresponding DTD), but we are using +this name for sake of simplicity. When non-blank, this will override any +older directives like %HTML.XHTML or %HTML.Strict. +--ALLOWED-- +'HTML 4.01 Transitional', 'HTML 4.01 Strict', 'XHTML 1.0 Transitional', 'XHTML 1.0 Strict', 'XHTML 1.1' diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Parent.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Parent.txt index 0e680f62e9..ecc2f77b60 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Parent.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Parent.txt @@ -1,12 +1,12 @@ -HTML.Parent -TYPE: string -VERSION: 1.3.0 -DEFAULT: 'div' ---DESCRIPTION-- - -

- String name of element that HTML fragment passed to library will be - inserted in. An interesting variation would be using span as the - parent element, meaning that only inline tags would be allowed. -

- +HTML.Parent +TYPE: string +VERSION: 1.3.0 +DEFAULT: 'div' +--DESCRIPTION-- + +

+ String name of element that HTML fragment passed to library will be + inserted in. An interesting variation would be using span as the + parent element, meaning that only inline tags would be allowed. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Strict.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Strict.txt index 39f8179634..c9697bde82 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Strict.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Strict.txt @@ -1,8 +1,8 @@ -HTML.Strict -TYPE: bool -VERSION: 1.3.0 -DEFAULT: false -DEPRECATED-VERSION: 1.7.0 -DEPRECATED-USE: HTML.Doctype ---DESCRIPTION-- -Determines whether or not to use Transitional (loose) or Strict rulesets. +HTML.Strict +TYPE: bool +VERSION: 1.3.0 +DEFAULT: false +DEPRECATED-VERSION: 1.7.0 +DEPRECATED-USE: HTML.Doctype +--DESCRIPTION-- +Determines whether or not to use Transitional (loose) or Strict rulesets. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyAdd.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyAdd.txt index 7bf3c6d409..7fadf06c73 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyAdd.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyAdd.txt @@ -1,8 +1,8 @@ -HTML.TidyAdd -TYPE: lookup -VERSION: 2.0.0 -DEFAULT: array() ---DESCRIPTION-- - -Fixes to add to the default set of Tidy fixes as per your level. - +HTML.TidyAdd +TYPE: lookup +VERSION: 2.0.0 +DEFAULT: array() +--DESCRIPTION-- + +Fixes to add to the default set of Tidy fixes as per your level. + diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyLevel.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyLevel.txt index 7b98bc7e93..9383bbaaed 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyLevel.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyLevel.txt @@ -1,23 +1,23 @@ -HTML.TidyLevel -TYPE: string -VERSION: 2.0.0 -DEFAULT: 'medium' ---DESCRIPTION-- - -

General level of cleanliness the Tidy module should enforce. -There are four allowed values:

-
-
none
-
No extra tidying should be done
-
light
-
Only fix elements that would be discarded otherwise due to - lack of support in doctype
-
medium
-
Enforce best practices
-
heavy
-
Transform all deprecated elements and attributes to standards - compliant equivalents
-
- ---ALLOWED-- -'none', 'light', 'medium', 'heavy' +HTML.TidyLevel +TYPE: string +VERSION: 2.0.0 +DEFAULT: 'medium' +--DESCRIPTION-- + +

General level of cleanliness the Tidy module should enforce. +There are four allowed values:

+
+
none
+
No extra tidying should be done
+
light
+
Only fix elements that would be discarded otherwise due to + lack of support in doctype
+
medium
+
Enforce best practices
+
heavy
+
Transform all deprecated elements and attributes to standards + compliant equivalents
+
+ +--ALLOWED-- +'none', 'light', 'medium', 'heavy' diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyRemove.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyRemove.txt index 1e220695e0..4717f39fdb 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyRemove.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.TidyRemove.txt @@ -1,8 +1,8 @@ -HTML.TidyRemove -TYPE: lookup -VERSION: 2.0.0 -DEFAULT: array() ---DESCRIPTION-- - -Fixes to remove from the default set of Tidy fixes as per your level. - +HTML.TidyRemove +TYPE: lookup +VERSION: 2.0.0 +DEFAULT: array() +--DESCRIPTION-- + +Fixes to remove from the default set of Tidy fixes as per your level. + diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Trusted.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Trusted.txt index 9785137dc9..6a4015e7bf 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Trusted.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.Trusted.txt @@ -1,7 +1,7 @@ -HTML.Trusted -TYPE: bool -VERSION: 2.0.0 -DEFAULT: false ---DESCRIPTION-- -Indicates whether or not the user input is trusted or not. If the input is -trusted, a more expansive set of allowed tags and attributes will be used. +HTML.Trusted +TYPE: bool +VERSION: 2.0.0 +DEFAULT: false +--DESCRIPTION-- +Indicates whether or not the user input is trusted or not. If the input is +trusted, a more expansive set of allowed tags and attributes will be used. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.XHTML.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.XHTML.txt index 7909203ef5..8cca4b052a 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.XHTML.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.XHTML.txt @@ -1,10 +1,10 @@ -HTML.XHTML -TYPE: bool -DEFAULT: true -VERSION: 1.1.0 -DEPRECATED-VERSION: 1.7.0 -DEPRECATED-USE: HTML.Doctype ---DESCRIPTION-- -Determines whether or not output is XHTML 1.0 or HTML 4.01 flavor. ---ALIASES-- -Core.XHTML +HTML.XHTML +TYPE: bool +DEFAULT: true +VERSION: 1.1.0 +DEPRECATED-VERSION: 1.7.0 +DEPRECATED-USE: HTML.Doctype +--DESCRIPTION-- +Determines whether or not output is XHTML 1.0 or HTML 4.01 flavor. +--ALIASES-- +Core.XHTML diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.txt index 9b8b3a74a4..9350008639 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/HTML.txt @@ -1,2 +1,2 @@ -HTML -DESCRIPTION: Configuration regarding allowed HTML. +HTML +DESCRIPTION: Configuration regarding allowed HTML. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.CommentScriptContents.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.CommentScriptContents.txt index 171b0fff6c..5a77edb4d2 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.CommentScriptContents.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.CommentScriptContents.txt @@ -1,9 +1,9 @@ -Output.CommentScriptContents -TYPE: bool -VERSION: 2.0.0 -DEFAULT: true ---DESCRIPTION-- -Determines whether or not HTML Purifier should attempt to fix up the -contents of script tags for legacy browsers with comments. ---ALIASES-- -Core.CommentScriptContents +Output.CommentScriptContents +TYPE: bool +VERSION: 2.0.0 +DEFAULT: true +--DESCRIPTION-- +Determines whether or not HTML Purifier should attempt to fix up the +contents of script tags for legacy browsers with comments. +--ALIASES-- +Core.CommentScriptContents diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.Newline.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.Newline.txt index 2021572516..2dd17999df 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.Newline.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.Newline.txt @@ -1,13 +1,13 @@ -Output.Newline -TYPE: string/null -VERSION: 2.0.1 -DEFAULT: NULL ---DESCRIPTION-- - -

- Newline string to format final output with. If left null, HTML Purifier - will auto-detect the default newline type of the system and use that; - you can manually override it here. Remember, \r\n is Windows, \r - is Mac, and \n is Unix. -

- +Output.Newline +TYPE: string/null +VERSION: 2.0.1 +DEFAULT: NULL +--DESCRIPTION-- + +

+ Newline string to format final output with. If left null, HTML Purifier + will auto-detect the default newline type of the system and use that; + you can manually override it here. Remember, \r\n is Windows, \r + is Mac, and \n is Unix. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.SortAttr.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.SortAttr.txt new file mode 100644 index 0000000000..5392b3bd51 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.SortAttr.txt @@ -0,0 +1,13 @@ +Output.SortAttr +TYPE: bool +VERSION: 3.2.0 +DEFAULT: false +--DESCRIPTION-- +

+ If true, HTML Purifier will sort attributes by name before writing them back + to the document, converting a tag like: <el b="" a="" c="" /> + to <el a="" b="" c="" />. This is a workaround for + a bug in FCKeditor which causes it to swap attributes order, adding noise + to text diffs. If you're not seeing this bug, chances are, you don't need + this directive. +

diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.TidyFormat.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.TidyFormat.txt index 5d7f29d3f8..6d26c4ded9 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.TidyFormat.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.TidyFormat.txt @@ -1,24 +1,24 @@ -Output.TidyFormat -TYPE: bool -VERSION: 1.1.1 -DEFAULT: false ---DESCRIPTION-- -

- Determines whether or not to run Tidy on the final output for pretty - formatting reasons, such as indentation and wrap. -

-

- This can greatly improve readability for editors who are hand-editing - the HTML, but is by no means necessary as HTML Purifier has already - fixed all major errors the HTML may have had. Tidy is a non-default - extension, and this directive will silently fail if Tidy is not - available. -

-

- If you are looking to make the overall look of your page's source - better, I recommend running Tidy on the entire page rather than just - user-content (after all, the indentation relative to the containing - blocks will be incorrect). -

---ALIASES-- -Core.TidyFormat +Output.TidyFormat +TYPE: bool +VERSION: 1.1.1 +DEFAULT: false +--DESCRIPTION-- +

+ Determines whether or not to run Tidy on the final output for pretty + formatting reasons, such as indentation and wrap. +

+

+ This can greatly improve readability for editors who are hand-editing + the HTML, but is by no means necessary as HTML Purifier has already + fixed all major errors the HTML may have had. Tidy is a non-default + extension, and this directive will silently fail if Tidy is not + available. +

+

+ If you are looking to make the overall look of your page's source + better, I recommend running Tidy on the entire page rather than just + user-content (after all, the indentation relative to the containing + blocks will be incorrect). +

+--ALIASES-- +Core.TidyFormat diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.txt index 134e141f09..abd4b1b39e 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Output.txt @@ -1,2 +1,2 @@ -Output -DESCRIPTION: Configuration relating to the generation of (X)HTML. +Output +DESCRIPTION: Configuration relating to the generation of (X)HTML. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Test.ForceNoIconv.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Test.ForceNoIconv.txt index 99337628db..2cd7067ddc 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Test.ForceNoIconv.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Test.ForceNoIconv.txt @@ -1,6 +1,6 @@ -Test.ForceNoIconv -TYPE: bool -DEFAULT: false ---DESCRIPTION-- -When set to true, HTMLPurifier_Encoder will act as if iconv does not exist -and use only pure PHP implementations. +Test.ForceNoIconv +TYPE: bool +DEFAULT: false +--DESCRIPTION-- +When set to true, HTMLPurifier_Encoder will act as if iconv does not exist +and use only pure PHP implementations. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Test.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Test.txt index fec6f8d8f8..a9928fec20 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Test.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/Test.txt @@ -1,2 +1,2 @@ -Test -DESCRIPTION: Developer testing configuration for our unit tests. +Test +DESCRIPTION: Developer testing configuration for our unit tests. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.AllowedSchemes.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.AllowedSchemes.txt index 2686bac420..cf1271c0eb 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.AllowedSchemes.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.AllowedSchemes.txt @@ -1,14 +1,14 @@ -URI.AllowedSchemes -TYPE: lookup ---DEFAULT-- -array ( - 'http' => true, - 'https' => true, - 'mailto' => true, - 'ftp' => true, - 'nntp' => true, - 'news' => true, -) ---DESCRIPTION-- -Whitelist that defines the schemes that a URI is allowed to have. This -prevents XSS attacks from using pseudo-schemes like javascript or mocha. +URI.AllowedSchemes +TYPE: lookup +--DEFAULT-- +array ( + 'http' => true, + 'https' => true, + 'mailto' => true, + 'ftp' => true, + 'nntp' => true, + 'news' => true, +) +--DESCRIPTION-- +Whitelist that defines the schemes that a URI is allowed to have. This +prevents XSS attacks from using pseudo-schemes like javascript or mocha. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Base.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Base.txt index a0f8d97b05..f68b82596e 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Base.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Base.txt @@ -1,17 +1,17 @@ -URI.Base -TYPE: string/null -VERSION: 2.1.0 -DEFAULT: NULL ---DESCRIPTION-- - -

- The base URI is the URI of the document this purified HTML will be - inserted into. This information is important if HTML Purifier needs - to calculate absolute URIs from relative URIs, such as when %URI.MakeAbsolute - is on. You may use a non-absolute URI for this value, but behavior - may vary (%URI.MakeAbsolute deals nicely with both absolute and - relative paths, but forwards-compatibility is not guaranteed). - Warning: If set, the scheme on this URI - overrides the one specified by %URI.DefaultScheme. -

- +URI.Base +TYPE: string/null +VERSION: 2.1.0 +DEFAULT: NULL +--DESCRIPTION-- + +

+ The base URI is the URI of the document this purified HTML will be + inserted into. This information is important if HTML Purifier needs + to calculate absolute URIs from relative URIs, such as when %URI.MakeAbsolute + is on. You may use a non-absolute URI for this value, but behavior + may vary (%URI.MakeAbsolute deals nicely with both absolute and + relative paths, but forwards-compatibility is not guaranteed). + Warning: If set, the scheme on this URI + overrides the one specified by %URI.DefaultScheme. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefaultScheme.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefaultScheme.txt index 2f39c2fad3..ba472edadc 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefaultScheme.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefaultScheme.txt @@ -1,10 +1,10 @@ -URI.DefaultScheme -TYPE: string -DEFAULT: 'http' ---DESCRIPTION-- - -

- Defines through what scheme the output will be served, in order to - select the proper object validator when no scheme information is present. -

- +URI.DefaultScheme +TYPE: string +DEFAULT: 'http' +--DESCRIPTION-- + +

+ Defines through what scheme the output will be served, in order to + select the proper object validator when no scheme information is present. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefinitionID.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefinitionID.txt index 20bfc1db77..dd324a5923 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefinitionID.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefinitionID.txt @@ -1,11 +1,11 @@ -URI.DefinitionID -TYPE: string/null -VERSION: 2.1.0 -DEFAULT: NULL ---DESCRIPTION-- - -

- Unique identifier for a custom-built URI definition. If you want - to add custom URIFilters, you must specify this value. -

- +URI.DefinitionID +TYPE: string/null +VERSION: 2.1.0 +DEFAULT: NULL +--DESCRIPTION-- + +

+ Unique identifier for a custom-built URI definition. If you want + to add custom URIFilters, you must specify this value. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefinitionRev.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefinitionRev.txt index 7dabdc6d7f..9049897b66 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefinitionRev.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DefinitionRev.txt @@ -1,11 +1,11 @@ -URI.DefinitionRev -TYPE: int -VERSION: 2.1.0 -DEFAULT: 1 ---DESCRIPTION-- - -

- Revision identifier for your custom definition. See - %HTML.DefinitionRev for details. -

- +URI.DefinitionRev +TYPE: int +VERSION: 2.1.0 +DEFAULT: 1 +--DESCRIPTION-- + +

+ Revision identifier for your custom definition. See + %HTML.DefinitionRev for details. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Disable.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Disable.txt index a97b2e29e2..85fe3df10a 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Disable.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Disable.txt @@ -1,13 +1,13 @@ -URI.Disable -TYPE: bool -VERSION: 1.3.0 -DEFAULT: false ---DESCRIPTION-- - -

- Disables all URIs in all forms. Not sure why you'd want to do that - (after all, the Internet's founded on the notion of a hyperlink). -

- ---ALIASES-- -Attr.DisableURI +URI.Disable +TYPE: bool +VERSION: 1.3.0 +DEFAULT: false +--DESCRIPTION-- + +

+ Disables all URIs in all forms. Not sure why you'd want to do that + (after all, the Internet's founded on the notion of a hyperlink). +

+ +--ALIASES-- +Attr.DisableURI diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableExternal.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableExternal.txt index 6d3ceba86f..5f9dc31f9f 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableExternal.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableExternal.txt @@ -1,10 +1,10 @@ -URI.DisableExternal -TYPE: bool -VERSION: 1.2.0 -DEFAULT: false ---DESCRIPTION-- -Disables links to external websites. This is a highly effective anti-spam -and anti-pagerank-leech measure, but comes at a hefty price: nolinks or -images outside of your domain will be allowed. Non-linkified URIs will -still be preserved. If you want to be able to link to subdomains or use -absolute URIs, specify %URI.Host for your website. +URI.DisableExternal +TYPE: bool +VERSION: 1.2.0 +DEFAULT: false +--DESCRIPTION-- +Disables links to external websites. This is a highly effective anti-spam +and anti-pagerank-leech measure, but comes at a hefty price: nolinks or +images outside of your domain will be allowed. Non-linkified URIs will +still be preserved. If you want to be able to link to subdomains or use +absolute URIs, specify %URI.Host for your website. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableExternalResources.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableExternalResources.txt index 37f5d13f8a..af43b61402 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableExternalResources.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableExternalResources.txt @@ -1,12 +1,12 @@ -URI.DisableExternalResources -TYPE: bool -VERSION: 1.3.0 -DEFAULT: false ---DESCRIPTION-- -Disables the embedding of external resources, preventing users from -embedding things like images from other hosts. This prevents access -tracking (good for email viewers), bandwidth leeching, cross-site request -forging, goatse.cx posting, and other nasties, but also results in a loss -of end-user functionality (they can't directly post a pic they posted from -Flickr anymore). Use it if you don't have a robust user-content moderation -team. +URI.DisableExternalResources +TYPE: bool +VERSION: 1.3.0 +DEFAULT: false +--DESCRIPTION-- +Disables the embedding of external resources, preventing users from +embedding things like images from other hosts. This prevents access +tracking (good for email viewers), bandwidth leeching, cross-site request +forging, goatse.cx posting, and other nasties, but also results in a loss +of end-user functionality (they can't directly post a pic they posted from +Flickr anymore). Use it if you don't have a robust user-content moderation +team. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableResources.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableResources.txt index a456051451..2f4e57417a 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableResources.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.DisableResources.txt @@ -1,12 +1,12 @@ -URI.DisableResources -TYPE: bool -VERSION: 1.3.0 -DEFAULT: false ---DESCRIPTION-- - -

- Disables embedding resources, essentially meaning no pictures. You can - still link to them though. See %URI.DisableExternalResources for why - this might be a good idea. -

- +URI.DisableResources +TYPE: bool +VERSION: 1.3.0 +DEFAULT: false +--DESCRIPTION-- + +

+ Disables embedding resources, essentially meaning no pictures. You can + still link to them though. See %URI.DisableExternalResources for why + this might be a good idea. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Host.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Host.txt index 15356ac206..4815940753 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Host.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Host.txt @@ -1,19 +1,19 @@ -URI.Host -TYPE: string/null -VERSION: 1.2.0 -DEFAULT: NULL ---DESCRIPTION-- - -

- Defines the domain name of the server, so we can determine whether or - an absolute URI is from your website or not. Not strictly necessary, - as users should be using relative URIs to reference resources on your - website. It will, however, let you use absolute URIs to link to - subdomains of the domain you post here: i.e. example.com will allow - sub.example.com. However, higher up domains will still be excluded: - if you set %URI.Host to sub.example.com, example.com will be blocked. - Note: This directive overrides %URI.Base because - a given page may be on a sub-domain, but you wish HTML Purifier to be - more relaxed and allow some of the parent domains too. -

- +URI.Host +TYPE: string/null +VERSION: 1.2.0 +DEFAULT: NULL +--DESCRIPTION-- + +

+ Defines the domain name of the server, so we can determine whether or + an absolute URI is from your website or not. Not strictly necessary, + as users should be using relative URIs to reference resources on your + website. It will, however, let you use absolute URIs to link to + subdomains of the domain you post here: i.e. example.com will allow + sub.example.com. However, higher up domains will still be excluded: + if you set %URI.Host to sub.example.com, example.com will be blocked. + Note: This directive overrides %URI.Base because + a given page may be on a sub-domain, but you wish HTML Purifier to be + more relaxed and allow some of the parent domains too. +

+ diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.HostBlacklist.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.HostBlacklist.txt index 5f5cbc5693..a62a354374 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.HostBlacklist.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.HostBlacklist.txt @@ -1,8 +1,8 @@ -URI.HostBlacklist -TYPE: list -VERSION: 1.3.0 -DEFAULT: array() ---DESCRIPTION-- -List of strings that are forbidden in the host of any URI. Use it to kill -domain names of spam, etc. Note that it will catch anything in the domain, -so moo.com will catch moo.com.example.com. +URI.HostBlacklist +TYPE: list +VERSION: 1.3.0 +DEFAULT: array() +--DESCRIPTION-- +List of strings that are forbidden in the host of any URI. Use it to kill +domain names of spam, etc. Note that it will catch anything in the domain, +so moo.com will catch moo.com.example.com. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.MakeAbsolute.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.MakeAbsolute.txt index 2d2b40a5e4..0832b2cc67 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.MakeAbsolute.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.MakeAbsolute.txt @@ -1,12 +1,12 @@ -URI.MakeAbsolute -TYPE: bool -VERSION: 2.1.0 -DEFAULT: false ---DESCRIPTION-- - -

- Converts all URIs into absolute forms. This is useful when the HTML - being filtered assumes a specific base path, but will actually be - viewed in a different context (and setting an alternate base URI is - not possible). %URI.Base must be set for this directive to work. -

+URI.MakeAbsolute +TYPE: bool +VERSION: 2.1.0 +DEFAULT: false +--DESCRIPTION-- + +

+ Converts all URIs into absolute forms. This is useful when the HTML + being filtered assumes a specific base path, but will actually be + viewed in a different context (and setting an alternate base URI is + not possible). %URI.Base must be set for this directive to work. +

diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Munge.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Munge.txt index 7743ac2923..ed9e142d8b 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Munge.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.Munge.txt @@ -1,82 +1,82 @@ -URI.Munge -TYPE: string/null -VERSION: 1.3.0 -DEFAULT: NULL ---DESCRIPTION-- - -

- Munges all browsable (usually http, https and ftp) - absolute URIs into another URI, usually a URI redirection service. - This directive accepts a URI, formatted with a %s where - the url-encoded original URI should be inserted (sample: - http://www.google.com/url?q=%s). -

-

- Uses for this directive: -

- -

- Prior to HTML Purifier 3.1.1, this directive also enabled the munging - of browsable external resources, which could break things if your redirection - script was a splash page or used meta tags. To revert to - previous behavior, please use %URI.MungeResources. -

-

- You may want to also use %URI.MungeSecretKey along with this directive - in order to enforce what URIs your redirector script allows. Open - redirector scripts can be a security risk and negatively affect the - reputation of your domain name. -

-

- Starting with HTML Purifier 3.1.1, there is also these substitutions: -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDescriptionExample <a href="">
%r1 - The URI embeds a resource
(blank) - The URI is merely a link
%nThe name of the tag this URI came froma
%mThe name of the attribute this URI came fromhref
%pThe name of the CSS property this URI came from, or blank if irrelevant
-

- Admittedly, these letters are somewhat arbitrary; the only stipulation - was that they couldn't be a through f. r is for resource (I would have preferred - e, but you take what you can get), n is for name, m - was picked because it came after n (and I couldn't use a), p is for - property. -

+URI.Munge +TYPE: string/null +VERSION: 1.3.0 +DEFAULT: NULL +--DESCRIPTION-- + +

+ Munges all browsable (usually http, https and ftp) + absolute URIs into another URI, usually a URI redirection service. + This directive accepts a URI, formatted with a %s where + the url-encoded original URI should be inserted (sample: + http://www.google.com/url?q=%s). +

+

+ Uses for this directive: +

+ +

+ Prior to HTML Purifier 3.1.1, this directive also enabled the munging + of browsable external resources, which could break things if your redirection + script was a splash page or used meta tags. To revert to + previous behavior, please use %URI.MungeResources. +

+

+ You may want to also use %URI.MungeSecretKey along with this directive + in order to enforce what URIs your redirector script allows. Open + redirector scripts can be a security risk and negatively affect the + reputation of your domain name. +

+

+ Starting with HTML Purifier 3.1.1, there is also these substitutions: +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeyDescriptionExample <a href="">
%r1 - The URI embeds a resource
(blank) - The URI is merely a link
%nThe name of the tag this URI came froma
%mThe name of the attribute this URI came fromhref
%pThe name of the CSS property this URI came from, or blank if irrelevant
+

+ Admittedly, these letters are somewhat arbitrary; the only stipulation + was that they couldn't be a through f. r is for resource (I would have preferred + e, but you take what you can get), n is for name, m + was picked because it came after n (and I couldn't use a), p is for + property. +

diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.OverrideAllowedSchemes.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.OverrideAllowedSchemes.txt index c75b133142..e910ee5403 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.OverrideAllowedSchemes.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.OverrideAllowedSchemes.txt @@ -1,8 +1,8 @@ -URI.OverrideAllowedSchemes -TYPE: bool -DEFAULT: true ---DESCRIPTION-- -If this is set to true (which it is by default), you can override -%URI.AllowedSchemes by simply registering a HTMLPurifier_URIScheme to the -registry. If false, you will also have to update that directive in order -to add more schemes. +URI.OverrideAllowedSchemes +TYPE: bool +DEFAULT: true +--DESCRIPTION-- +If this is set to true (which it is by default), you can override +%URI.AllowedSchemes by simply registering a HTMLPurifier_URIScheme to the +registry. If false, you will also have to update that directive in order +to add more schemes. diff --git a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.txt b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.txt index 114ecb17f3..4bcdb95b1f 100644 --- a/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.txt +++ b/lib/htmlpurifier/HTMLPurifier/ConfigSchema/schema/URI.txt @@ -1,2 +1,2 @@ -URI -DESCRIPTION: Features regarding Uniform Resource Identifiers. +URI +DESCRIPTION: Features regarding Uniform Resource Identifiers. diff --git a/lib/htmlpurifier/HTMLPurifier/ContentSets.php b/lib/htmlpurifier/HTMLPurifier/ContentSets.php index 16438fda44..15c8d97fee 100644 --- a/lib/htmlpurifier/HTMLPurifier/ContentSets.php +++ b/lib/htmlpurifier/HTMLPurifier/ContentSets.php @@ -37,33 +37,35 @@ class HTMLPurifier_ContentSets // sorry, no way of overloading foreach ($modules as $module_i => $module) { foreach ($module->content_sets as $key => $value) { - if (isset($this->info[$key])) { + $temp = $this->convertToLookup($value); + if (isset($this->lookup[$key])) { // add it into the existing content set - $this->info[$key] = $this->info[$key] . ' | ' . $value; + $this->lookup[$key] = array_merge($this->lookup[$key], $temp); } else { - $this->info[$key] = $value; + $this->lookup[$key] = $temp; } } } - // perform content_set expansions - $this->keys = array_keys($this->info); - foreach ($this->info as $i => $set) { - // only performed once, so infinite recursion is not - // a problem - $this->info[$i] = - str_replace( - $this->keys, - // must be recalculated each time due to - // changing substitutions - array_values($this->info), - $set); + $old_lookup = false; + while ($old_lookup !== $this->lookup) { + $old_lookup = $this->lookup; + foreach ($this->lookup as $i => $set) { + $add = array(); + foreach ($set as $element => $x) { + if (isset($this->lookup[$element])) { + $add += $this->lookup[$element]; + unset($this->lookup[$i][$element]); + } + } + $this->lookup[$i] += $add; + } } - $this->values = array_values($this->info); - // generate lookup tables - foreach ($this->info as $name => $set) { - $this->lookup[$name] = $this->convertToLookup($set); + foreach ($this->lookup as $key => $lookup) { + $this->info[$key] = implode(' | ', array_keys($lookup)); } + $this->keys = array_keys($this->info); + $this->values = array_values($this->info); } /** @@ -75,12 +77,22 @@ class HTMLPurifier_ContentSets if (!empty($def->child)) return; // already done! $content_model = $def->content_model; if (is_string($content_model)) { - $def->content_model = str_replace( - $this->keys, $this->values, $content_model); + // Assume that $this->keys is alphanumeric + $def->content_model = preg_replace_callback( + '/\b(' . implode('|', $this->keys) . ')\b/', + array($this, 'generateChildDefCallback'), + $content_model + ); + //$def->content_model = str_replace( + // $this->keys, $this->values, $content_model); } $def->child = $this->getChildDef($def, $module); } + public function generateChildDefCallback($matches) { + return $this->info[$matches[0]]; + } + /** * Instantiates a ChildDef based on content_model and content_model_type * member variables in HTMLPurifier_ElementDef diff --git a/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator/Template.php.in b/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator/Template.php.in index 62235e225d..d741764c86 100644 --- a/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator/Template.php.in +++ b/lib/htmlpurifier/HTMLPurifier/DefinitionCache/Decorator/Template.php.in @@ -1,46 +1,46 @@ -get('Test', 'ForceNoIconv')) { $str = iconv($encoding, 'utf-8//IGNORE', $str); + if ($str === false) { + // $encoding is not a valid encoding + restore_error_handler(); + trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR); + return ''; + } // If the string is bjorked by Shift_JIS or a similar encoding // that doesn't support all of ASCII, convert the naughty // characters to their true byte-wise ASCII/UTF-8 equivalents. @@ -282,7 +288,7 @@ class HTMLPurifier_Encoder restore_error_handler(); return $str; } - trigger_error('Encoding not supported', E_USER_ERROR); + trigger_error('Encoding not supported, please install iconv', E_USER_ERROR); } /** diff --git a/lib/htmlpurifier/HTMLPurifier/ErrorCollector.php b/lib/htmlpurifier/HTMLPurifier/ErrorCollector.php index 34354844c8..16cc89c67e 100644 --- a/lib/htmlpurifier/HTMLPurifier/ErrorCollector.php +++ b/lib/htmlpurifier/HTMLPurifier/ErrorCollector.php @@ -7,22 +7,37 @@ class HTMLPurifier_ErrorCollector { - protected $errors = array(); + /** + * Identifiers for the returned error array. These are purposely numeric + * so list() can be used. + */ + const LINENO = 0; + const SEVERITY = 1; + const MESSAGE = 2; + const CHILDREN = 3; + + protected $errors; + protected $_current; + protected $_stacks = array(array()); protected $locale; protected $generator; protected $context; + protected $lines = array(); + public function __construct($context) { $this->locale =& $context->get('Locale'); - $this->generator =& $context->get('Generator'); $this->context = $context; + $this->_current =& $this->_stacks[0]; + $this->errors =& $this->_stacks[0]; } /** * Sends an error message to the collector for later use - * @param $line Integer line number, or HTMLPurifier_Token that caused error * @param $severity int Error severity, PHP error style (don't use E_USER_) * @param $msg string Error message text + * @param $subst1 string First substitution for $msg + * @param $subst2 string ... */ public function send($severity, $msg) { @@ -35,6 +50,7 @@ class HTMLPurifier_ErrorCollector $token = $this->context->get('CurrentToken', true); $line = $token ? $token->line : $this->context->get('CurrentLine', true); + $col = $token ? $token->col : $this->context->get('CurrentCol', true); $attr = $this->context->get('CurrentAttr', true); // perform special substitutions, also add custom parameters @@ -55,13 +71,66 @@ class HTMLPurifier_ErrorCollector if (!empty($subst)) $msg = strtr($msg, $subst); - $this->errors[] = array($line, $severity, $msg); + // (numerically indexed) + $error = array( + self::LINENO => $line, + self::SEVERITY => $severity, + self::MESSAGE => $msg, + self::CHILDREN => array() + ); + $this->_current[] = $error; + + + // NEW CODE BELOW ... + + $struct = null; + // Top-level errors are either: + // TOKEN type, if $value is set appropriately, or + // "syntax" type, if $value is null + $new_struct = new HTMLPurifier_ErrorStruct(); + $new_struct->type = HTMLPurifier_ErrorStruct::TOKEN; + if ($token) $new_struct->value = clone $token; + if (is_int($line) && is_int($col)) { + if (isset($this->lines[$line][$col])) { + $struct = $this->lines[$line][$col]; + } else { + $struct = $this->lines[$line][$col] = $new_struct; + } + // These ksorts may present a performance problem + ksort($this->lines[$line], SORT_NUMERIC); + } else { + if (isset($this->lines[-1])) { + $struct = $this->lines[-1]; + } else { + $struct = $this->lines[-1] = $new_struct; + } + } + ksort($this->lines, SORT_NUMERIC); + + // Now, check if we need to operate on a lower structure + if (!empty($attr)) { + $struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr); + if (!$struct->value) { + $struct->value = array($attr, 'PUT VALUE HERE'); + } + } + if (!empty($cssprop)) { + $struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop); + if (!$struct->value) { + // if we tokenize CSS this might be a little more difficult to do + $struct->value = array($cssprop, 'PUT VALUE HERE'); + } + } + + // Ok, structs are all setup, now time to register the error + $struct->addError($severity, $msg); } /** * Retrieves raw error data for custom formatter to use - * @param List of arrays in format of array(Error message text, - * token that caused error, tokens surrounding token) + * @param List of arrays in format of array(line of error, + * error severity, error message, + * recursive sub-errors array) */ public function getRaw() { return $this->errors; @@ -70,38 +139,25 @@ class HTMLPurifier_ErrorCollector /** * Default HTML formatting implementation for error messages * @param $config Configuration array, vital for HTML output nature + * @param $errors Errors array to display; used for recursion. */ - public function getHTMLFormatted($config) { + public function getHTMLFormatted($config, $errors = null) { $ret = array(); - $errors = $this->errors; - - // sort error array by line - // line numbers are enabled if they aren't explicitly disabled - if ($config->get('Core', 'MaintainLineNumbers') !== false) { - $has_line = array(); - $lines = array(); - $original_order = array(); - foreach ($errors as $i => $error) { - $has_line[] = (int) (bool) $error[0]; - $lines[] = $error[0]; - $original_order[] = $i; - } - array_multisort($has_line, SORT_DESC, $lines, SORT_ASC, $original_order, SORT_ASC, $errors); - } + $this->generator = new HTMLPurifier_Generator($config, $this->context); + if ($errors === null) $errors = $this->errors; + + // 'At line' message needs to be removed - foreach ($errors as $error) { - list($line, $severity, $msg) = $error; - $string = ''; - $string .= '' . $this->locale->getErrorName($severity) . ': '; - $string .= $this->generator->escape($msg); - if ($line) { - // have javascript link generation that causes - // textarea to skip to the specified line - $string .= $this->locale->formatMessage( - 'ErrorCollector: At line', array('line' => $line)); + // generation code for new structure goes here. It needs to be recursive. + foreach ($this->lines as $line => $col_array) { + if ($line == -1) continue; + foreach ($col_array as $col => $struct) { + $this->_renderStruct($ret, $struct, $line, $col); } - $ret[] = $string; + } + if (isset($this->lines[-1])) { + $this->_renderStruct($ret, $this->lines[-1]); } if (empty($errors)) { @@ -112,5 +168,41 @@ class HTMLPurifier_ErrorCollector } + private function _renderStruct(&$ret, $struct, $line = null, $col = null) { + $stack = array($struct); + $context_stack = array(array()); + while ($current = array_pop($stack)) { + $context = array_pop($context_stack); + foreach ($current->errors as $error) { + list($severity, $msg) = $error; + $string = ''; + $string .= '
'; + // W3C uses an icon to indicate the severity of the error. + $error = $this->locale->getErrorName($severity); + $string .= "$error "; + if (!is_null($line) && !is_null($col)) { + $string .= "Line $line, Column $col: "; + } else { + $string .= 'End of Document: '; + } + $string .= '' . $this->generator->escape($msg) . ' '; + $string .= '
'; + // Here, have a marker for the character on the column appropriate. + // Be sure to clip extremely long lines. + //$string .= '
';
+                //$string .= '';
+                //$string .= '
'; + $ret[] = $string; + } + foreach ($current->children as $type => $array) { + $context[] = $current; + $stack = array_merge($stack, array_reverse($array, true)); + for ($i = count($array); $i > 0; $i--) { + $context_stack[] = $context; + } + } + } + } + } diff --git a/lib/htmlpurifier/HTMLPurifier/ErrorStruct.php b/lib/htmlpurifier/HTMLPurifier/ErrorStruct.php new file mode 100644 index 0000000000..9da712b714 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/ErrorStruct.php @@ -0,0 +1,58 @@ +children[$type][$id])) { + $this->children[$type][$id] = new HTMLPurifier_ErrorStruct(); + $this->children[$type][$id]->type = $type; + } + return $this->children[$type][$id]; + } + + public function addError($severity, $message) { + $this->errors[] = array($severity, $message); + } + +} diff --git a/lib/htmlpurifier/HTMLPurifier/Generator.php b/lib/htmlpurifier/HTMLPurifier/Generator.php index 42ea17fe6b..d4c42fbff9 100644 --- a/lib/htmlpurifier/HTMLPurifier/Generator.php +++ b/lib/htmlpurifier/HTMLPurifier/Generator.php @@ -26,6 +26,11 @@ class HTMLPurifier_Generator */ private $_def; + /** + * Cache of %Output.SortAttr + */ + private $_sortAttr; + /** * Configuration for the generator */ @@ -38,6 +43,7 @@ class HTMLPurifier_Generator public function __construct($config, $context) { $this->config = $config; $this->_scriptFix = $config->get('Output', 'CommentScriptContents'); + $this->_sortAttr = $config->get('Output', 'SortAttr'); $this->_def = $config->getHTMLDefinition(); $this->_xhtml = $this->_def->doctype->xml; } @@ -142,6 +148,7 @@ class HTMLPurifier_Generator */ public function generateAttributes($assoc_array_of_attributes, $element = false) { $html = ''; + if ($this->_sortAttr) ksort($assoc_array_of_attributes); foreach ($assoc_array_of_attributes as $key => $value) { if (!$this->_xhtml) { // Remove namespaced attributes diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Forms.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Forms.php new file mode 100644 index 0000000000..6a2dc69a82 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Forms.php @@ -0,0 +1,117 @@ + 'Form', + 'Inline' => 'Formctrl', + ); + + public function setup($config) { + $form = $this->addElement('form', 'Form', + 'Required: Heading | List | Block | fieldset', 'Common', array( + 'accept' => 'ContentTypes', + 'accept-charset' => 'Charsets', + 'action*' => 'URI', + 'method' => 'Enum#get,post', + // really ContentType, but these two are the only ones used today + 'enctype' => 'Enum#application/x-www-form-urlencoded,multipart/form-data', + )); + $form->excludes = array('form' => true); + + $input = $this->addElement('input', 'Formctrl', 'Empty', 'Common', array( + 'accept' => 'ContentTypes', + 'accesskey' => 'Character', + 'alt' => 'Text', + 'checked' => 'Bool#checked', + 'disabled' => 'Bool#disabled', + 'maxlength' => 'Number', + 'name' => 'CDATA', + 'readonly' => 'Bool#readonly', + 'size' => 'Number', + 'src' => 'URI#embeds', + 'tabindex' => 'Number', + 'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image', + 'value' => 'CDATA', + )); + $input->attr_transform_post[] = new HTMLPurifier_AttrTransform_Input(); + + $this->addElement('select', 'Formctrl', 'Required: optgroup | option', 'Common', array( + 'disabled' => 'Bool#disabled', + 'multiple' => 'Bool#multiple', + 'name' => 'CDATA', + 'size' => 'Number', + 'tabindex' => 'Number', + )); + + $this->addElement('option', false, 'Optional: #PCDATA', 'Common', array( + 'disabled' => 'Bool#disabled', + 'label' => 'Text', + 'selected' => 'Bool#selected', + 'value' => 'CDATA', + )); + // It's illegal for there to be more than one selected, but not + // be multiple. Also, no selected means undefined behavior. This might + // be difficult to implement; perhaps an injector, or a context variable. + + $textarea = $this->addElement('textarea', 'Formctrl', 'Optional: #PCDATA', 'Common', array( + 'accesskey' => 'Character', + 'cols*' => 'Number', + 'disabled' => 'Bool#disabled', + 'name' => 'CDATA', + 'readonly' => 'Bool#readonly', + 'rows*' => 'Number', + 'tabindex' => 'Number', + )); + $textarea->attr_transform_pre[] = new HTMLPurifier_AttrTransform_Textarea(); + + $button = $this->addElement('button', 'Formctrl', 'Optional: #PCDATA | Heading | List | Block | Inline', 'Common', array( + 'accesskey' => 'Character', + 'disabled' => 'Bool#disabled', + 'name' => 'CDATA', + 'tabindex' => 'Number', + 'type' => 'Enum#button,submit,reset', + 'value' => 'CDATA', + )); + + // For exclusions, ideally we'd specify content sets, not literal elements + $button->excludes = $this->makeLookup( + 'form', 'fieldset', // Form + 'input', 'select', 'textarea', 'label', 'button', // Formctrl + 'a' // as per HTML 4.01 spec, this is omitted by modularization + ); + + // Extra exclusion: img usemap="" is not permitted within this element. + // We'll omit this for now, since we don't have any good way of + // indicating it yet. + + // This is HIGHLY user-unfriendly; we need a custom child-def for this + $this->addElement('fieldset', 'Form', 'Custom: (#WS?,legend,(Flow|#PCDATA)*)', 'Common'); + + $label = $this->addElement('label', 'Formctrl', 'Optional: #PCDATA | Inline', 'Common', array( + 'accesskey' => 'Character', + // 'for' => 'IDREF', // IDREF not implemented, cannot allow + )); + $label->excludes = array('label' => true); + + $this->addElement('legend', false, 'Optional: #PCDATA | Inline', 'Common', array( + 'accesskey' => 'Character', + )); + + $this->addElement('optgroup', false, 'Required: option', 'Common', array( + 'disabled' => 'Bool#disabled', + 'label*' => 'Text', + )); + + // Don't forget an injector for . This one's a little complex + // because it maps to multiple elements. + + } +} + diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Name.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Name.php new file mode 100644 index 0000000000..db2ab4f4c1 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Name.php @@ -0,0 +1,16 @@ +addBlankElement($name); + $element->attr['name'] = 'ID'; + } + } + +} diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Object.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Object.php index 7dfd6b3134..cd4ce2648a 100644 --- a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Object.php +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Object.php @@ -1,46 +1,46 @@ - to cater to legacy browsers: this - * module does not allow this sort of behavior - */ -class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule -{ - - public $name = 'Object'; - public $safe = false; - - public function setup($config) { - - $this->addElement('object', 'Inline', 'Optional: #PCDATA | Flow | param', 'Common', - array( - 'archive' => 'URI', - 'classid' => 'URI', - 'codebase' => 'URI', - 'codetype' => 'Text', - 'data' => 'URI', - 'declare' => 'Bool#declare', - 'height' => 'Length', - 'name' => 'CDATA', - 'standby' => 'Text', - 'tabindex' => 'Number', - 'type' => 'ContentType', - 'width' => 'Length' - ) - ); - - $this->addElement('param', false, 'Empty', false, - array( - 'id' => 'ID', - 'name*' => 'Text', - 'type' => 'Text', - 'value' => 'Text', - 'valuetype' => 'Enum#data,ref,object' - ) - ); - - } - -} - + to cater to legacy browsers: this + * module does not allow this sort of behavior + */ +class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule +{ + + public $name = 'Object'; + public $safe = false; + + public function setup($config) { + + $this->addElement('object', 'Inline', 'Optional: #PCDATA | Flow | param', 'Common', + array( + 'archive' => 'URI', + 'classid' => 'URI', + 'codebase' => 'URI', + 'codetype' => 'Text', + 'data' => 'URI', + 'declare' => 'Bool#declare', + 'height' => 'Length', + 'name' => 'CDATA', + 'standby' => 'Text', + 'tabindex' => 'Number', + 'type' => 'ContentType', + 'width' => 'Length' + ) + ); + + $this->addElement('param', false, 'Empty', false, + array( + 'id' => 'ID', + 'name*' => 'Text', + 'type' => 'Text', + 'value' => 'Text', + 'valuetype' => 'Enum#data,ref,object' + ) + ); + + } + +} + diff --git a/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tidy/Name.php b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tidy/Name.php new file mode 100644 index 0000000000..9056e5fe14 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/HTMLModule/Tidy/Name.php @@ -0,0 +1,23 @@ +doctypes->register( 'HTML 4.01 Strict', false, array_merge($common, $non_xml), - array('Tidy_Strict', 'Tidy_Proprietary'), + array('Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'), array(), '-//W3C//DTD HTML 4.01//EN', 'http://www.w3.org/TR/html4/strict.dtd' @@ -91,7 +95,7 @@ class HTMLPurifier_HTMLModuleManager $this->doctypes->register( 'XHTML 1.0 Transitional', true, array_merge($common, $transitional, $xml, $non_xml), - array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary'), + array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'), array(), '-//W3C//DTD XHTML 1.0 Transitional//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' @@ -100,7 +104,7 @@ class HTMLPurifier_HTMLModuleManager $this->doctypes->register( 'XHTML 1.0 Strict', true, array_merge($common, $xml, $non_xml), - array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary'), + array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'), array(), '-//W3C//DTD XHTML 1.0 Strict//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd' @@ -109,7 +113,7 @@ class HTMLPurifier_HTMLModuleManager $this->doctypes->register( 'XHTML 1.1', true, array_merge($common, $xml, array('Ruby')), - array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict'), // Tidy_XHTML1_1 + array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1 array(), '-//W3C//DTD XHTML 1.1//EN', 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' @@ -212,9 +216,6 @@ class HTMLPurifier_HTMLModuleManager } } - // merge in custom modules - $modules = array_merge($modules, $this->userModules); - // add proprietary module (this gets special treatment because // it is completely removed from doctypes, etc.) if ($config->get('HTML', 'Proprietary')) { @@ -229,6 +230,9 @@ class HTMLPurifier_HTMLModuleManager $modules[] = 'SafeEmbed'; } + // merge in custom modules + $modules = array_merge($modules, $this->userModules); + foreach ($modules as $module) { $this->processModule($module); $this->modules[$module]->setup($config); @@ -378,7 +382,11 @@ class HTMLPurifier_HTMLModuleManager $this->contentSets->generateChildDef($def, $module); } - + + // This can occur if there is a blank definition, but no base to + // mix it in with + if (!$def) return false; + // add information on required attributes foreach ($def->attr as $attr_name => $attr_def) { if ($attr_def->required) { diff --git a/lib/htmlpurifier/HTMLPurifier/Injector.php b/lib/htmlpurifier/HTMLPurifier/Injector.php index 1947c340fe..2d6a861cc0 100644 --- a/lib/htmlpurifier/HTMLPurifier/Injector.php +++ b/lib/htmlpurifier/HTMLPurifier/Injector.php @@ -5,6 +5,11 @@ * This enables "formatter-like" functionality such as auto-paragraphing, * smiley-ification and linkification to take place. * + * A note on how handlers create changes; this is done by assigning a new + * value to the $token reference. These values can take a variety of forms and + * are best described HTMLPurifier_Strategy_MakeWellFormed->processToken() + * documentation. + * * @todo Allow injectors to request a re-run on their output. This * would help if an operation is recursive. */ @@ -16,13 +21,6 @@ abstract class HTMLPurifier_Injector */ public $name; - /** - * Amount of tokens the injector needs to skip + 1. Because - * the decrement is the first thing that happens, this needs to - * be one greater than the "real" skip count. - */ - public $skip = 1; - /** * Instance of HTMLPurifier_HTMLDefinition */ @@ -54,6 +52,32 @@ abstract class HTMLPurifier_Injector */ public $needed = array(); + /** + * Index of inputTokens to rewind to. + */ + protected $rewind = false; + + /** + * Rewind to a spot to re-perform processing. This is useful if you + * deleted a node, and now need to see if this change affected any + * earlier nodes. Rewinding does not affect other injectors, and can + * result in infinite loops if not used carefully. + * @warning HTML Purifier will prevent you from fast-forwarding with this + * function. + */ + public function rewind($index) { + $this->rewind = $index; + } + + /** + * Retrieves rewind, and then unsets it. + */ + public function getRewind() { + $r = $this->rewind; + $this->rewind = false; + return $r; + } + /** * Prepares the injector by giving it the config and context objects: * this allows references to important variables to be made within @@ -116,6 +140,69 @@ abstract class HTMLPurifier_Injector return true; } + /** + * Iterator function, which starts with the next token and continues until + * you reach the end of the input tokens. + * @warning Please prevent previous references from interfering with this + * functions by setting $i = null beforehand! + * @param &$i Current integer index variable for inputTokens + * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference + */ + protected function forward(&$i, &$current) { + if ($i === null) $i = $this->inputIndex + 1; + else $i++; + if (!isset($this->inputTokens[$i])) return false; + $current = $this->inputTokens[$i]; + return true; + } + + /** + * Similar to _forward, but accepts a third parameter $nesting (which + * should be initialized at 0) and stops when we hit the end tag + * for the node $this->inputIndex starts in. + */ + protected function forwardUntilEndToken(&$i, &$current, &$nesting) { + $result = $this->forward($i, $current); + if (!$result) return false; + if ($nesting === null) $nesting = 0; + if ($current instanceof HTMLPurifier_Token_Start) $nesting++; + elseif ($current instanceof HTMLPurifier_Token_End) { + if ($nesting <= 0) return false; + $nesting--; + } + return true; + } + + /** + * Iterator function, starts with the previous token and continues until + * you reach the beginning of input tokens. + * @warning Please prevent previous references from interfering with this + * functions by setting $i = null beforehand! + * @param &$i Current integer index variable for inputTokens + * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference + */ + protected function backward(&$i, &$current) { + if ($i === null) $i = $this->inputIndex - 1; + else $i--; + if ($i < 0) return false; + $current = $this->inputTokens[$i]; + return true; + } + + /** + * Initializes the iterator at the current position. Use in a do {} while; + * loop to force the _forward and _backward functions to start at the + * current location. + * @warning Please prevent previous references from interfering with this + * functions by setting $i = null beforehand! + * @param &$i Current integer index variable for inputTokens + * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference + */ + protected function current(&$i, &$current) { + if ($i === null) $i = $this->inputIndex; + $current = $this->inputTokens[$i]; + } + /** * Handler that is called when a text token is processed */ @@ -126,9 +213,17 @@ abstract class HTMLPurifier_Injector */ public function handleElement(&$token) {} + /** + * Handler that is called when an end token is processed + */ + public function handleEnd(&$token) { + $this->notifyEnd($token); + } + /** * Notifier that is called when an end token is processed * @note This differs from handlers in that the token is read-only + * @deprecated */ public function notifyEnd($token) {} diff --git a/lib/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php b/lib/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php index e51f4ece22..3a7666242f 100644 --- a/lib/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php +++ b/lib/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php @@ -3,6 +3,8 @@ /** * Injector that auto paragraphs text in the root node based on * double-spacing. + * @todo Ensure all states are unit tested, including variations as well. + * @todo Make a graph of the flow control for this Injector. */ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector { @@ -18,116 +20,177 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector public function handleText(&$token) { $text = $token->data; - if (empty($this->currentNesting)) { - if (!$this->allowsElement('p')) return; - // case 1: we're in root node (and it allows paragraphs) - $token = array($this->_pStart()); - $this->_splitText($text, $token); - } elseif ($this->currentNesting[count($this->currentNesting)-1]->name == 'p') { - // case 2: we're in a paragraph - $token = array(); - $this->_splitText($text, $token); - } elseif ($this->allowsElement('p')) { - // case 3: we're in an element that allows paragraphs - if (strpos($text, "\n\n") !== false) { - // case 3.1: this text node has a double-newline - $token = array($this->_pStart()); - $this->_splitText($text, $token); - } else { - $ok = false; - // test if up-coming tokens are either block or have - // a double newline in them - $nesting = 0; - for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) { - if ($this->inputTokens[$i] instanceof HTMLPurifier_Token_Start){ - if (!$this->_isInline($this->inputTokens[$i])) { - // we haven't found a double-newline, and - // we've hit a block element, so don't paragraph - $ok = false; - break; - } - $nesting++; - } - if ($this->inputTokens[$i] instanceof HTMLPurifier_Token_End) { - if ($nesting <= 0) break; - $nesting--; - } - if ($this->inputTokens[$i] instanceof HTMLPurifier_Token_Text) { - // found it! - if (strpos($this->inputTokens[$i]->data, "\n\n") !== false) { - $ok = true; - break; - } - } + // Does the current parent allow

tags? + if ($this->allowsElement('p')) { + if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) { + // Note that we have differing behavior when dealing with text + // in the anonymous root node, or a node inside the document. + // If the text as a double-newline, the treatment is the same; + // if it doesn't, see the next if-block if you're in the document. + + $i = $nesting = null; + if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) { + // State 1.1: ... ^ (whitespace, then document end) + // ---- + // This is a degenerate case + } else { + // State 1.2: PAR1 + // ---- + + // State 1.3: PAR1\n\nPAR2 + // ------------ + + // State 1.4:

PAR1\n\nPAR2 (see State 2) + // ------------ + $token = array($this->_pStart()); + $this->_splitText($text, $token); } - if ($ok) { - // case 3.2: this text node is next to another node - // that will start a paragraph + } else { + // State 2:
PAR1... (similar to 1.4) + // ---- + + // We're in an element that allows paragraph tags, but we're not + // sure if we're going to need them. + if ($this->_pLookAhead()) { + // State 2.1:
PAR1PAR1\n\nPAR2 + // ---- + // Note: This will always be the first child, since any + // previous inline element would have triggered this very + // same routine, and found the double newline. One possible + // exception would be a comment. $token = array($this->_pStart(), $token); + } else { + // State 2.2.1:
PAR1
+ // ---- + + // State 2.2.2:
PAR1PAR1
+ // ---- } } + // Is the current parent a

tag? + } elseif ( + !empty($this->currentNesting) && + $this->currentNesting[count($this->currentNesting)-1]->name == 'p' + ) { + // State 3.1: ...

PAR1 + // ---- + + // State 3.2: ...

PAR1\n\nPAR2 + // ------------ + $token = array(); + $this->_splitText($text, $token); + // Abort! + } else { + // State 4.1: ...PAR1 + // ---- + + // State 4.2: ...PAR1\n\nPAR2 + // ------------ } - } public function handleElement(&$token) { - // check if we're inside a tag already - if (!empty($this->currentNesting)) { - if ($this->allowsElement('p')) { - // special case: we're in an element that allows paragraphs - - // this token is already paragraph, abort - if ($token->name == 'p') return; - - // this token is a block level, abort - if (!$this->_isInline($token)) return; - - // check if this token is adjacent to the parent token - $prev = $this->inputTokens[$this->inputIndex - 1]; - if (!$prev instanceof HTMLPurifier_Token_Start) { - // not adjacent, we can abort early - // add lead paragraph tag if our token is inline - // and the previous tag was an end paragraph - if ( - $prev->name == 'p' && $prev instanceof HTMLPurifier_Token_End && - $this->_isInline($token) - ) { - $token = array($this->_pStart(), $token); - } - return; - } - - // this token is the first child of the element that allows - // paragraph. We have to peek ahead and see whether or not - // there is anything inside that suggests that a paragraph - // will be needed - $ok = false; - // maintain a mini-nesting counter, this lets us bail out - // early if possible - $j = 1; // current nesting, one is due to parent (we recalculate current token) - for ($i = $this->inputIndex; isset($this->inputTokens[$i]); $i++) { - if ($this->inputTokens[$i] instanceof HTMLPurifier_Token_Start) $j++; - if ($this->inputTokens[$i] instanceof HTMLPurifier_Token_End) $j--; - if ($this->inputTokens[$i] instanceof HTMLPurifier_Token_Text) { - if (strpos($this->inputTokens[$i]->data, "\n\n") !== false) { - $ok = true; - break; + // We don't have to check if we're already in a

tag for block + // tokens, because the tag would have been autoclosed by MakeWellFormed. + if ($this->allowsElement('p')) { + if (!empty($this->currentNesting)) { + if ($this->_isInline($token)) { + // State 1:

... + // --- + + // Check if this token is adjacent to the parent token + // (seek backwards until token isn't whitespace) + $i = null; + $this->backward($i, $prev); + + if (!$prev instanceof HTMLPurifier_Token_Start) { + // Token wasn't adjacent + + if ( + $prev instanceof HTMLPurifier_Token_Text && + substr($prev->data, -2) === "\n\n" + ) { + // State 1.1.4:

PAR1

\n\n + // --- + + // Quite frankly, this should be handled by splitText + $token = array($this->_pStart(), $token); + } else { + // State 1.1.1:

PAR1

+ // --- + + // State 1.1.2:

+ // --- + + // State 1.1.3:
PAR + // --- + } + + } else { + // State 1.2.1:
+ // --- + + // Lookahead to see if

is needed. + if ($this->_pLookAhead()) { + // State 1.3.1:

PAR1\n\nPAR2 + // --- + $token = array($this->_pStart(), $token); + } else { + // State 1.3.2:
PAR1
+ // --- + + // State 1.3.3:
PAR1
\n\n
+ // --- } } - if ($j <= 0) break; + } else { + // State 2.3: ...
+ // ----- } - if ($ok) { + } else { + if ($this->_isInline($token)) { + // State 3.1: + // --- + // This is where the {p} tag is inserted, not reflected in + // inputTokens yet, however. $token = array($this->_pStart(), $token); + } else { + // State 3.2:
+ // ----- + } + + $i = null; + if ($this->backward($i, $prev)) { + if ( + !$prev instanceof HTMLPurifier_Token_Text + ) { + // State 3.1.1: ...

{p} + // --- + + // State 3.2.1: ...

+ // ----- + + if (!is_array($token)) $token = array($token); + array_unshift($token, new HTMLPurifier_Token_Text("\n\n")); + } else { + // State 3.1.2: ...

\n\n{p} + // --- + + // State 3.2.2: ...

\n\n
+ // ----- + + // Note: PAR cannot occur because PAR would have been + // wrapped in

tags. + } } } - return; + } else { + // State 2.2:

  • + // ---- + + // State 2.4:

    + // --- } - - // check if the start tag counts as a "block" element - if (!$this->_isInline($token)) return; - - // append a paragraph tag before the token - $token = array($this->_pStart(), $token); } /** @@ -142,96 +205,80 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector */ private function _splitText($data, &$result) { $raw_paragraphs = explode("\n\n", $data); - - // remove empty paragraphs - $paragraphs = array(); + $paragraphs = array(); // without empty paragraphs $needs_start = false; $needs_end = false; $c = count($raw_paragraphs); if ($c == 1) { - // there were no double-newlines, abort quickly + // There were no double-newlines, abort quickly. In theory this + // should never happen. $result[] = new HTMLPurifier_Token_Text($data); return; } - for ($i = 0; $i < $c; $i++) { $par = $raw_paragraphs[$i]; if (trim($par) !== '') { $paragraphs[] = $par; - continue; - } - if ($i == 0 && empty($result)) { - // The empty result indicates that the AutoParagraph - // injector did not add any start paragraph tokens. - // The fact that the first paragraph is empty indicates - // that there was a double-newline at the start of the - // data. - // Combined together, this means that we are in a paragraph, - // and the newline means we should start a new one. - $result[] = new HTMLPurifier_Token_End('p'); - // However, the start token should only be added if - // there is more processing to be done (i.e. there are - // real paragraphs in here). If there are none, the - // next start paragraph tag will be handled by the - // next run-around the injector - $needs_start = true; - } elseif ($i + 1 == $c) { - // a double-paragraph at the end indicates that - // there is an overriding need to start a new paragraph - // for the next section. This has no effect until - // we've processed all of the other paragraphs though - $needs_end = true; + } else { + if ($i == 0) { + // Double newline at the front + if (empty($result)) { + // The empty result indicates that the AutoParagraph + // injector did not add any start paragraph tokens. + // This means that we have been in a paragraph for + // a while, and the newline means we should start a new one. + $result[] = new HTMLPurifier_Token_End('p'); + $result[] = new HTMLPurifier_Token_Text("\n\n"); + // However, the start token should only be added if + // there is more processing to be done (i.e. there are + // real paragraphs in here). If there are none, the + // next start paragraph tag will be handled by the + // next call to the injector + $needs_start = true; + } else { + // We just started a new paragraph! + // Reinstate a double-newline for presentation's sake, since + // it was in the source code. + array_unshift($result, new HTMLPurifier_Token_Text("\n\n")); + } + } elseif ($i + 1 == $c) { + // Double newline at the end + // There should be a trailing

    when we're finally done. + $needs_end = true; + } } } - // check if there are no "real" paragraphs to be processed + // Check if this was just a giant blob of whitespace. Move this earlier, + // perhaps? if (empty($paragraphs)) { return; } - // add a start tag if an end tag was added while processing - // the raw paragraphs (that happens if there's a leading double - // newline) - if ($needs_start) $result[] = $this->_pStart(); + // Add the start tag indicated by \n\n at the beginning of $data + if ($needs_start) { + $result[] = $this->_pStart(); + } - // append the paragraphs onto the result + // Append the paragraphs onto the result foreach ($paragraphs as $par) { $result[] = new HTMLPurifier_Token_Text($par); $result[] = new HTMLPurifier_Token_End('p'); + $result[] = new HTMLPurifier_Token_Text("\n\n"); $result[] = $this->_pStart(); } - // remove trailing start token, if one is needed, it will - // be handled the next time this injector is called + // Remove trailing start token; Injector will handle this later if + // it was indeed needed. This prevents from needing to do a lookahead, + // at the cost of a lookbehind later. array_pop($result); - // check the outside to determine whether or not the - // end paragraph tag should be removed. It should be removed - // unless the next non-whitespace token is a paragraph - // or a block element. - $remove_paragraph_end = true; - + // If there is no need for an end tag, remove all of it and let + // MakeWellFormed close it later. if (!$needs_end) { - // Start of the checks one after the current token's index - for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) { - if ($this->inputTokens[$i] instanceof HTMLPurifier_Token_Start || $this->inputTokens[$i] instanceof HTMLPurifier_Token_Empty) { - $remove_paragraph_end = $this->_isInline($this->inputTokens[$i]); - } - // check if we can abort early (whitespace means we carry-on!) - if ($this->inputTokens[$i] instanceof HTMLPurifier_Token_Text && !$this->inputTokens[$i]->is_whitespace) break; - // end tags will automatically be handled by MakeWellFormed, - // so we don't have to worry about them - if ($this->inputTokens[$i] instanceof HTMLPurifier_Token_End) break; - } - } else { - $remove_paragraph_end = false; - } - - // check the outside to determine whether or not the - // end paragraph tag should be removed - if ($remove_paragraph_end) { - array_pop($result); + array_pop($result); // removes \n\n + array_pop($result); // removes

    } } @@ -244,5 +291,49 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector return isset($this->htmlDefinition->info['p']->child->elements[$token->name]); } + /** + * Looks ahead in the token list and determines whether or not we need + * to insert a

    tag. + */ + private function _pLookAhead() { + $this->current($i, $current); + if ($current instanceof HTMLPurifier_Token_Start) $nesting = 1; + else $nesting = 0; + $ok = false; + while ($this->forwardUntilEndToken($i, $current, $nesting)) { + $result = $this->_checkNeedsP($current); + if ($result !== null) { + $ok = $result; + break; + } + } + return $ok; + } + + /** + * Determines if a particular token requires an earlier inline token + * to get a paragraph. This should be used with _forwardUntilEndToken + */ + private function _checkNeedsP($current) { + if ($current instanceof HTMLPurifier_Token_Start){ + if (!$this->_isInline($current)) { + //

    PAR1
    + // ---- + // Terminate early, since we hit a block element + return false; + } + } elseif ($current instanceof HTMLPurifier_Token_Text) { + if (strpos($current->data, "\n\n") !== false) { + //
    PAR1PAR1\n\nPAR2 + // ---- + return true; + } else { + //
    PAR1PAR1... + // ---- + } + } + return null; + } + } diff --git a/lib/htmlpurifier/HTMLPurifier/Injector/DisplayLinkURI.php b/lib/htmlpurifier/HTMLPurifier/Injector/DisplayLinkURI.php new file mode 100644 index 0000000000..6fc566d70c --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/Injector/DisplayLinkURI.php @@ -0,0 +1,24 @@ +start->attr['href'])){ + $url = $token->start->attr['href']; + unset($token->start->attr['href']); + $token = array($token, new HTMLPurifier_Token_Text(" ($url)")); + } else { + // nothing to display + } + } +} \ No newline at end of file diff --git a/lib/htmlpurifier/HTMLPurifier/Injector/RemoveEmpty.php b/lib/htmlpurifier/HTMLPurifier/Injector/RemoveEmpty.php new file mode 100644 index 0000000000..53bb0f00c4 --- /dev/null +++ b/lib/htmlpurifier/HTMLPurifier/Injector/RemoveEmpty.php @@ -0,0 +1,40 @@ +config = $config; + $this->context = $context; + $this->attrValidator = new HTMLPurifier_AttrValidator(); + } + + public function handleElement(&$token) { + if (!$token instanceof HTMLPurifier_Token_Start) return; + $next = false; + for ($i = $this->inputIndex + 1, $c = count($this->inputTokens); $i < $c; $i++) { + $next = $this->inputTokens[$i]; + if ($next instanceof HTMLPurifier_Token_Text && $next->is_whitespace) continue; + break; + } + if (!$next || ($next instanceof HTMLPurifier_Token_End && $next->name == $token->name)) { + if ($token->name == 'colgroup') return; + $this->attrValidator->validateToken($token, $this->config, $this->context); + $token->armor['ValidateAttributes'] = true; + if (isset($token->attr['id']) || isset($token->attr['name'])) return; + $token = $i - $this->inputIndex + 1; + for ($b = $this->inputIndex - 1; $b > 0; $b--) { + $prev = $this->inputTokens[$b]; + if ($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace) continue; + break; + } + // This is safe because we removed the token that triggered this. + $this->rewind($b - 1); + return; + } + } + +} diff --git a/lib/htmlpurifier/HTMLPurifier/Injector/SafeObject.php b/lib/htmlpurifier/HTMLPurifier/Injector/SafeObject.php index f379406866..1ac6c35936 100644 --- a/lib/htmlpurifier/HTMLPurifier/Injector/SafeObject.php +++ b/lib/htmlpurifier/HTMLPurifier/Injector/SafeObject.php @@ -72,7 +72,10 @@ class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector } } - public function notifyEnd($token) { + public function handleEnd(&$token) { + // This is the WRONG way of handling the object and param stacks; + // we should be inserting them directly on the relevant object tokens + // so that the global stack handling handles it. if ($token->name == 'object') { array_pop($this->objectStack); array_pop($this->paramStack); diff --git a/lib/htmlpurifier/HTMLPurifier/Language/messages/en.php b/lib/htmlpurifier/HTMLPurifier/Language/messages/en.php index b16c3ff385..212acd822b 100644 --- a/lib/htmlpurifier/HTMLPurifier/Language/messages/en.php +++ b/lib/htmlpurifier/HTMLPurifier/Language/messages/en.php @@ -15,7 +15,8 @@ $messages = array( 'Item separator last' => ' and ', // non-Harvard style 'ErrorCollector: No errors' => 'No errors detected. However, because error reporting is still incomplete, there may have been errors that the error collector was not notified of; please inspect the output HTML carefully.', -'ErrorCollector: At line' => ' at line $line', +'ErrorCollector: At line' => ' at line $line', +'ErrorCollector: Incidental errors' => 'Incidental errors', 'Lexer: Unclosed comment' => 'Unclosed comment', 'Lexer: Unescaped lt' => 'Unescaped less-than sign (<) should be <', @@ -30,6 +31,8 @@ $messages = array( 'Strategy_RemoveForeignElements: Comment removed' => 'Comment containing "$CurrentToken.Data" removed', 'Strategy_RemoveForeignElements: Foreign meta element removed' => 'Unrecognized $CurrentToken.Serialized meta tag and all descendants removed', 'Strategy_RemoveForeignElements: Token removed to end' => 'Tags and text starting from $1 element where removed to end', +'Strategy_RemoveForeignElements: Trailing hyphen in comment removed' => 'Trailing hyphen(s) in comment removed', +'Strategy_RemoveForeignElements: Hyphens in comment collapsed' => 'Double hyphens in comments are not allowed, and were collapsed into single hyphens', 'Strategy_MakeWellFormed: Unnecessary end tag removed' => 'Unnecessary $CurrentToken.Serialized tag removed', 'Strategy_MakeWellFormed: Unnecessary end tag to text' => 'Unnecessary $CurrentToken.Serialized tag converted to text', @@ -50,8 +53,8 @@ $messages = array( ); $errorNames = array( - E_ERROR => 'Error', + E_ERROR => 'Error', E_WARNING => 'Warning', - E_NOTICE => 'Notice' + E_NOTICE => 'Notice' ); diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer.php b/lib/htmlpurifier/HTMLPurifier/Lexer.php index 01364f65b9..043c0ea0db 100644 --- a/lib/htmlpurifier/HTMLPurifier/Lexer.php +++ b/lib/htmlpurifier/HTMLPurifier/Lexer.php @@ -42,6 +42,12 @@ class HTMLPurifier_Lexer { + /** + * Whether or not this lexer implements line-number/column-number tracking. + * If it does, set to true. + */ + public $tracksLineNumbers = false; + // -- STATIC ---------------------------------------------------------- /** @@ -70,46 +76,65 @@ class HTMLPurifier_Lexer $lexer = $config->get('Core', 'LexerImpl'); } - if (is_object($lexer)) { - return $lexer; - } + $needs_tracking = + $config->get('Core', 'MaintainLineNumbers') || + $config->get('Core', 'CollectErrors'); - if (is_null($lexer)) { do { - // auto-detection algorithm + $inst = null; + if (is_object($lexer)) { + $inst = $lexer; + } else { - // once PHP DOM implements native line numbers, or we - // hack out something using XSLT, remove this stipulation - $line_numbers = $config->get('Core', 'MaintainLineNumbers'); - if ( - $line_numbers === true || - ($line_numbers === null && $config->get('Core', 'CollectErrors')) - ) { - $lexer = 'DirectLex'; - break; - } + if (is_null($lexer)) { do { + // auto-detection algorithm + + if ($needs_tracking) { + $lexer = 'DirectLex'; + break; + } + + if ( + class_exists('DOMDocument') && + method_exists('DOMDocument', 'loadHTML') && + !extension_loaded('domxml') + ) { + // check for DOM support, because while it's part of the + // core, it can be disabled compile time. Also, the PECL + // domxml extension overrides the default DOM, and is evil + // and nasty and we shan't bother to support it + $lexer = 'DOMLex'; + } else { + $lexer = 'DirectLex'; + } + + } while(0); } // do..while so we can break - if (class_exists('DOMDocument')) { - // check for DOM support, because, surprisingly enough, - // it's *not* part of the core! - $lexer = 'DOMLex'; - } else { - $lexer = 'DirectLex'; + // instantiate recognized string names + switch ($lexer) { + case 'DOMLex': + $inst = new HTMLPurifier_Lexer_DOMLex(); + break; + case 'DirectLex': + $inst = new HTMLPurifier_Lexer_DirectLex(); + break; + case 'PH5P': + $inst = new HTMLPurifier_Lexer_PH5P(); + break; + default: + throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer)); } - - } while(0); } // do..while so we can break + } + + if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated'); - // instantiate recognized string names - switch ($lexer) { - case 'DOMLex': - return new HTMLPurifier_Lexer_DOMLex(); - case 'DirectLex': - return new HTMLPurifier_Lexer_DirectLex(); - case 'PH5P': - return new HTMLPurifier_Lexer_PH5P(); - default: - trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR); + // once PHP DOM implements native line numbers, or we + // hack out something using XSLT, remove this stipulation + if ($needs_tracking && !$inst->tracksLineNumbers) { + throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'); } + return $inst; + } // -- CONVENIENCE MEMBERS --------------------------------------------- @@ -226,11 +251,6 @@ class HTMLPurifier_Lexer */ public function normalize($html, $config, $context) { - // extract body from document if applicable - if ($config->get('Core', 'ConvertDocumentToFragment')) { - $html = $this->extractBody($html); - } - // normalize newlines to \n $html = str_replace("\r\n", "\n", $html); $html = str_replace("\r", "\n", $html); @@ -243,6 +263,11 @@ class HTMLPurifier_Lexer // escape CDATA $html = $this->escapeCDATA($html); + // extract body from document if applicable + if ($config->get('Core', 'ConvertDocumentToFragment')) { + $html = $this->extractBody($html); + } + // expand entities that aren't the big five $html = $this->_entity_parser->substituteNonSpecialEntities($html); diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php b/lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php index dc0adff275..3ffe7dbdf5 100644 --- a/lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php +++ b/lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php @@ -45,7 +45,10 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer $char = '[^a-z!\/]'; $comment = "/|\z)/is"; $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html); - $html = preg_replace("/<($char)/i", '<\\1', $html); + do { + $old = $html; + $html = preg_replace("/<($char)/i", '<\\1', $html); + } while ($html !== $old); $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments } diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php b/lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php index 985ad68084..45ddc876e4 100644 --- a/lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php +++ b/lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php @@ -13,6 +13,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer { + public $tracksLineNumbers = true; + /** * Whitespace characters for str(c)spn. */ @@ -42,6 +44,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $inside_tag = false; // whether or not we're parsing the inside of a tag $array = array(); // result array + // This is also treated to mean maintain *column* numbers too $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers'); if ($maintain_line_numbers === null) { @@ -50,9 +53,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $maintain_line_numbers = $config->get('Core', 'CollectErrors'); } - if ($maintain_line_numbers) $current_line = 1; - else $current_line = false; + if ($maintain_line_numbers) { + $current_line = 1; + $current_col = 0; + $length = strlen($html); + } else { + $current_line = false; + $current_col = false; + $length = false; + } $context->register('CurrentLine', $current_line); + $context->register('CurrentCol', $current_col); $nl = "\n"; // how often to manually recalculate. This will ALWAYS be right, // but it's pretty wasteful. Set to 0 to turn off @@ -68,14 +79,31 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer while(++$loops) { - // recalculate lines - if ( - $maintain_line_numbers && // line number tracking is on - $synchronize_interval && // synchronization is on - $cursor > 0 && // cursor is further than zero - $loops % $synchronize_interval === 0 // time to synchronize! - ) { - $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor); + // $cursor is either at the start of a token, or inside of + // a tag (i.e. there was a < immediately before it), as indicated + // by $inside_tag + + if ($maintain_line_numbers) { + + // $rcursor, however, is always at the start of a token. + $rcursor = $cursor - (int) $inside_tag; + + // Column number is cheap, so we calculate it every round. + // We're interested at the *end* of the newline string, so + // we need to add strlen($nl) == 1 to $nl_pos before subtracting it + // from our "rcursor" position. + $nl_pos = strrpos($html, $nl, $rcursor - $length); + $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1); + + // recalculate lines + if ( + $synchronize_interval && // synchronization is on + $cursor > 0 && // cursor is further than zero + $loops % $synchronize_interval === 0 // time to synchronize! + ) { + $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor); + } + } $position_next_lt = strpos($html, '<', $cursor); @@ -99,7 +127,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) ); if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor); } $array[] = $token; @@ -119,7 +147,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) ) ); - if ($maintain_line_numbers) $token->line = $current_line; + if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col); $array[] = $token; break; } elseif ($inside_tag && $position_next_gt !== false) { @@ -167,7 +195,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) ); if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment); } $array[] = $token; @@ -182,7 +210,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $type = substr($segment, 1); $token = new HTMLPurifier_Token_End($type); if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); } $array[] = $token; @@ -197,20 +225,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if (!ctype_alpha($segment[0])) { // XML: $segment[0] !== '_' && $segment[0] !== ':' if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt'); - $token = new - HTMLPurifier_Token_Text( - '<' . - $this->parseData( - $segment - ) . - '>' - ); + $token = new HTMLPurifier_Token_Text('<'); if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); } $array[] = $token; - $cursor = $position_next_gt + 1; $inside_tag = false; continue; } @@ -235,7 +255,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $token = new HTMLPurifier_Token_Start($segment); } if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); } $array[] = $token; @@ -267,7 +287,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $token = new HTMLPurifier_Token_Start($type, $attr); } if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); } $array[] = $token; @@ -284,7 +304,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer substr($html, $cursor) ) ); - if ($maintain_line_numbers) $token->line = $current_line; + if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col); // no cursor scroll? Hmm... $array[] = $token; break; @@ -293,6 +313,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } $context->destroy('CurrentLine'); + $context->destroy('CurrentCol'); return $array; } diff --git a/lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php b/lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php index 81659e5051..fa1bf973e0 100644 --- a/lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php +++ b/lib/htmlpurifier/HTMLPurifier/Lexer/PH5P.php @@ -1,3906 +1,3906 @@ -normalize($html, $config, $context); - $new_html = $this->wrapHTML($new_html, $config, $context); - try { - $parser = new HTML5($new_html); - $doc = $parser->save(); - } catch (DOMException $e) { - // Uh oh, it failed. Punt to DirectLex. - $lexer = new HTMLPurifier_Lexer_DirectLex(); - $context->register('PH5PError', $e); // save the error, so we can detect it - return $lexer->tokenizeHTML($html, $config, $context); // use original HTML - } - $tokens = array(); - $this->tokenizeDOM( - $doc->getElementsByTagName('html')->item(0)-> // - getElementsByTagName('body')->item(0)-> // - getElementsByTagName('div')->item(0) //
    - , $tokens); - return $tokens; - } - -} - -/* - -Copyright 2007 Jeroen van der Meer - -Permission is hereby granted, free of charge, to any person obtaining a -copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -*/ - -class HTML5 { - private $data; - private $char; - private $EOF; - private $state; - private $tree; - private $token; - private $content_model; - private $escape = false; - private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute', - 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;', - 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;', - 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;', - 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;', - 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;', - 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;', - 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;', - 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;', - 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN', - 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;', - 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;', - 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig', - 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;', - 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;', - 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil', - 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;', - 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;', - 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;', - 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth', - 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12', - 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt', - 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc', - 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;', - 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;', - 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;', - 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro', - 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;', - 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;', - 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;', - 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash', - 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;', - 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;', - 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;', - 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;', - 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;', - 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;', - 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;', - 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;', - 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc', - 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;', - 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;'); - - const PCDATA = 0; - const RCDATA = 1; - const CDATA = 2; - const PLAINTEXT = 3; - - const DOCTYPE = 0; - const STARTTAG = 1; - const ENDTAG = 2; - const COMMENT = 3; - const CHARACTR = 4; - const EOF = 5; - - public function __construct($data) { - $data = str_replace("\r\n", "\n", $data); - $data = str_replace("\r", null, $data); - - $this->data = $data; - $this->char = -1; - $this->EOF = strlen($data); - $this->tree = new HTML5TreeConstructer; - $this->content_model = self::PCDATA; - - $this->state = 'data'; - - while($this->state !== null) { - $this->{$this->state.'State'}(); - } - } - - public function save() { - return $this->tree->save(); - } - - private function char() { - return ($this->char < $this->EOF) - ? $this->data[$this->char] - : false; - } - - private function character($s, $l = 0) { - if($s + $l < $this->EOF) { - if($l === 0) { - return $this->data[$s]; - } else { - return substr($this->data, $s, $l); - } - } - } - - private function characters($char_class, $start) { - return preg_replace('#^(['.$char_class.']+).*#s', '\\1', substr($this->data, $start)); - } - - private function dataState() { - // Consume the next input character - $this->char++; - $char = $this->char(); - - if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) { - /* U+0026 AMPERSAND (&) - When the content model flag is set to one of the PCDATA or RCDATA - states: switch to the entity data state. Otherwise: treat it as per - the "anything else" entry below. */ - $this->state = 'entityData'; - - } elseif($char === '-') { - /* If the content model flag is set to either the RCDATA state or - the CDATA state, and the escape flag is false, and there are at - least three characters before this one in the input stream, and the - last four characters in the input stream, including this one, are - U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS, - and U+002D HYPHEN-MINUS (""), - set the escape flag to false. */ - if(($this->content_model === self::RCDATA || - $this->content_model === self::CDATA) && $this->escape === true && - $this->character($this->char, 3) === '-->') { - $this->escape = false; - } - - /* In any case, emit the input character as a character token. - Stay in the data state. */ - $this->emitToken(array( - 'type' => self::CHARACTR, - 'data' => $char - )); - - } elseif($this->char === $this->EOF) { - /* EOF - Emit an end-of-file token. */ - $this->EOF(); - - } elseif($this->content_model === self::PLAINTEXT) { - /* When the content model flag is set to the PLAINTEXT state - THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of - the text and emit it as a character token. */ - $this->emitToken(array( - 'type' => self::CHARACTR, - 'data' => substr($this->data, $this->char) - )); - - $this->EOF(); - - } else { - /* Anything else - THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that - otherwise would also be treated as a character token and emit it - as a single character token. Stay in the data state. */ - $len = strcspn($this->data, '<&', $this->char); - $char = substr($this->data, $this->char, $len); - $this->char += $len - 1; - - $this->emitToken(array( - 'type' => self::CHARACTR, - 'data' => $char - )); - - $this->state = 'data'; - } - } - - private function entityDataState() { - // Attempt to consume an entity. - $entity = $this->entity(); - - // If nothing is returned, emit a U+0026 AMPERSAND character token. - // Otherwise, emit the character token that was returned. - $char = (!$entity) ? '&' : $entity; - $this->emitToken(array( - 'type' => self::CHARACTR, - 'data' => $char - )); - - // Finally, switch to the data state. - $this->state = 'data'; - } - - private function tagOpenState() { - switch($this->content_model) { - case self::RCDATA: - case self::CDATA: - /* If the next input character is a U+002F SOLIDUS (/) character, - consume it and switch to the close tag open state. If the next - input character is not a U+002F SOLIDUS (/) character, emit a - U+003C LESS-THAN SIGN character token and switch to the data - state to process the next input character. */ - if($this->character($this->char + 1) === '/') { - $this->char++; - $this->state = 'closeTagOpen'; - - } else { - $this->emitToken(array( - 'type' => self::CHARACTR, - 'data' => '<' - )); - - $this->state = 'data'; - } - break; - - case self::PCDATA: - // If the content model flag is set to the PCDATA state - // Consume the next input character: - $this->char++; - $char = $this->char(); - - if($char === '!') { - /* U+0021 EXCLAMATION MARK (!) - Switch to the markup declaration open state. */ - $this->state = 'markupDeclarationOpen'; - - } elseif($char === '/') { - /* U+002F SOLIDUS (/) - Switch to the close tag open state. */ - $this->state = 'closeTagOpen'; - - } elseif(preg_match('/^[A-Za-z]$/', $char)) { - /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z - Create a new start tag token, set its tag name to the lowercase - version of the input character (add 0x0020 to the character's code - point), then switch to the tag name state. (Don't emit the token - yet; further details will be filled in before it is emitted.) */ - $this->token = array( - 'name' => strtolower($char), - 'type' => self::STARTTAG, - 'attr' => array() - ); - - $this->state = 'tagName'; - - } elseif($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Parse error. Emit a U+003C LESS-THAN SIGN character token and a - U+003E GREATER-THAN SIGN character token. Switch to the data state. */ - $this->emitToken(array( - 'type' => self::CHARACTR, - 'data' => '<>' - )); - - $this->state = 'data'; - - } elseif($char === '?') { - /* U+003F QUESTION MARK (?) - Parse error. Switch to the bogus comment state. */ - $this->state = 'bogusComment'; - - } else { - /* Anything else - Parse error. Emit a U+003C LESS-THAN SIGN character token and - reconsume the current input character in the data state. */ - $this->emitToken(array( - 'type' => self::CHARACTR, - 'data' => '<' - )); - - $this->char--; - $this->state = 'data'; - } - break; - } - } - - private function closeTagOpenState() { - $next_node = strtolower($this->characters('A-Za-z', $this->char + 1)); - $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName; - - if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) && - (!$the_same || ($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/', - $this->character($this->char + 1 + strlen($next_node))) || $this->EOF === $this->char)))) { - /* If the content model flag is set to the RCDATA or CDATA states then - examine the next few characters. If they do not match the tag name of - the last start tag token emitted (case insensitively), or if they do but - they are not immediately followed by one of the following characters: - * U+0009 CHARACTER TABULATION - * U+000A LINE FEED (LF) - * U+000B LINE TABULATION - * U+000C FORM FEED (FF) - * U+0020 SPACE - * U+003E GREATER-THAN SIGN (>) - * U+002F SOLIDUS (/) - * EOF - ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character - token, a U+002F SOLIDUS character token, and switch to the data state - to process the next input character. */ - $this->emitToken(array( - 'type' => self::CHARACTR, - 'data' => 'state = 'data'; - - } else { - /* Otherwise, if the content model flag is set to the PCDATA state, - or if the next few characters do match that tag name, consume the - next input character: */ - $this->char++; - $char = $this->char(); - - if(preg_match('/^[A-Za-z]$/', $char)) { - /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z - Create a new end tag token, set its tag name to the lowercase version - of the input character (add 0x0020 to the character's code point), then - switch to the tag name state. (Don't emit the token yet; further details - will be filled in before it is emitted.) */ - $this->token = array( - 'name' => strtolower($char), - 'type' => self::ENDTAG - ); - - $this->state = 'tagName'; - - } elseif($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Parse error. Switch to the data state. */ - $this->state = 'data'; - - } elseif($this->char === $this->EOF) { - /* EOF - Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F - SOLIDUS character token. Reconsume the EOF character in the data state. */ - $this->emitToken(array( - 'type' => self::CHARACTR, - 'data' => 'char--; - $this->state = 'data'; - - } else { - /* Parse error. Switch to the bogus comment state. */ - $this->state = 'bogusComment'; - } - } - } - - private function tagNameState() { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Switch to the before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the EOF - character in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } elseif($char === '/') { - /* U+002F SOLIDUS (/) - Parse error unless this is a permitted slash. Switch to the before - attribute name state. */ - $this->state = 'beforeAttributeName'; - - } else { - /* Anything else - Append the current input character to the current tag token's tag name. - Stay in the tag name state. */ - $this->token['name'] .= strtolower($char); - $this->state = 'tagName'; - } - } - - private function beforeAttributeNameState() { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Stay in the before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif($char === '/') { - /* U+002F SOLIDUS (/) - Parse error unless this is a permitted slash. Stay in the before - attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the EOF - character in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } else { - /* Anything else - Start a new attribute in the current tag token. Set that attribute's - name to the current input character, and its value to the empty string. - Switch to the attribute name state. */ - $this->token['attr'][] = array( - 'name' => strtolower($char), - 'value' => null - ); - - $this->state = 'attributeName'; - } - } - - private function attributeNameState() { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Stay in the before attribute name state. */ - $this->state = 'afterAttributeName'; - - } elseif($char === '=') { - /* U+003D EQUALS SIGN (=) - Switch to the before attribute value state. */ - $this->state = 'beforeAttributeValue'; - - } elseif($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif($char === '/' && $this->character($this->char + 1) !== '>') { - /* U+002F SOLIDUS (/) - Parse error unless this is a permitted slash. Switch to the before - attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the EOF - character in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } else { - /* Anything else - Append the current input character to the current attribute's name. - Stay in the attribute name state. */ - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['name'] .= strtolower($char); - - $this->state = 'attributeName'; - } - } - - private function afterAttributeNameState() { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Stay in the after attribute name state. */ - $this->state = 'afterAttributeName'; - - } elseif($char === '=') { - /* U+003D EQUALS SIGN (=) - Switch to the before attribute value state. */ - $this->state = 'beforeAttributeValue'; - - } elseif($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif($char === '/' && $this->character($this->char + 1) !== '>') { - /* U+002F SOLIDUS (/) - Parse error unless this is a permitted slash. Switch to the - before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the EOF - character in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } else { - /* Anything else - Start a new attribute in the current tag token. Set that attribute's - name to the current input character, and its value to the empty string. - Switch to the attribute name state. */ - $this->token['attr'][] = array( - 'name' => strtolower($char), - 'value' => null - ); - - $this->state = 'attributeName'; - } - } - - private function beforeAttributeValueState() { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Stay in the before attribute value state. */ - $this->state = 'beforeAttributeValue'; - - } elseif($char === '"') { - /* U+0022 QUOTATION MARK (") - Switch to the attribute value (double-quoted) state. */ - $this->state = 'attributeValueDoubleQuoted'; - - } elseif($char === '&') { - /* U+0026 AMPERSAND (&) - Switch to the attribute value (unquoted) state and reconsume - this input character. */ - $this->char--; - $this->state = 'attributeValueUnquoted'; - - } elseif($char === '\'') { - /* U+0027 APOSTROPHE (') - Switch to the attribute value (single-quoted) state. */ - $this->state = 'attributeValueSingleQuoted'; - - } elseif($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } else { - /* Anything else - Append the current input character to the current attribute's value. - Switch to the attribute value (unquoted) state. */ - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['value'] .= $char; - - $this->state = 'attributeValueUnquoted'; - } - } - - private function attributeValueDoubleQuotedState() { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if($char === '"') { - /* U+0022 QUOTATION MARK (") - Switch to the before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif($char === '&') { - /* U+0026 AMPERSAND (&) - Switch to the entity in attribute value state. */ - $this->entityInAttributeValueState('double'); - - } elseif($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the character - in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } else { - /* Anything else - Append the current input character to the current attribute's value. - Stay in the attribute value (double-quoted) state. */ - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['value'] .= $char; - - $this->state = 'attributeValueDoubleQuoted'; - } - } - - private function attributeValueSingleQuotedState() { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if($char === '\'') { - /* U+0022 QUOTATION MARK (') - Switch to the before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif($char === '&') { - /* U+0026 AMPERSAND (&) - Switch to the entity in attribute value state. */ - $this->entityInAttributeValueState('single'); - - } elseif($this->char === $this->EOF) { - /* EOF - Parse error. Emit the current tag token. Reconsume the character - in the data state. */ - $this->emitToken($this->token); - - $this->char--; - $this->state = 'data'; - - } else { - /* Anything else - Append the current input character to the current attribute's value. - Stay in the attribute value (single-quoted) state. */ - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['value'] .= $char; - - $this->state = 'attributeValueSingleQuoted'; - } - } - - private function attributeValueUnquotedState() { - // Consume the next input character: - $this->char++; - $char = $this->character($this->char); - - if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - /* U+0009 CHARACTER TABULATION - U+000A LINE FEED (LF) - U+000B LINE TABULATION - U+000C FORM FEED (FF) - U+0020 SPACE - Switch to the before attribute name state. */ - $this->state = 'beforeAttributeName'; - - } elseif($char === '&') { - /* U+0026 AMPERSAND (&) - Switch to the entity in attribute value state. */ - $this->entityInAttributeValueState(); - - } elseif($char === '>') { - /* U+003E GREATER-THAN SIGN (>) - Emit the current tag token. Switch to the data state. */ - $this->emitToken($this->token); - $this->state = 'data'; - - } else { - /* Anything else - Append the current input character to the current attribute's value. - Stay in the attribute value (unquoted) state. */ - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['value'] .= $char; - - $this->state = 'attributeValueUnquoted'; - } - } - - private function entityInAttributeValueState() { - // Attempt to consume an entity. - $entity = $this->entity(); - - // If nothing is returned, append a U+0026 AMPERSAND character to the - // current attribute's value. Otherwise, emit the character token that - // was returned. - $char = (!$entity) - ? '&' - : $entity; - - $last = count($this->token['attr']) - 1; - $this->token['attr'][$last]['value'] .= $char; - } - - private function bogusCommentState() { - /* Consume every character up to the first U+003E GREATER-THAN SIGN - character (>) or the end of the file (EOF), whichever comes first. Emit - a comment token whose data is the concatenation of all the characters - starting from and including the character that caused the state machine - to switch into the bogus comment state, up to and including the last - consumed character before the U+003E character, if any, or up to the - end of the file otherwise. (If the comment was started by the end of - the file (EOF), the token is empty.) */ - $data = $this->characters('^>', $this->char); - $this->emitToken(array( - 'data' => $data, - 'type' => self::COMMENT - )); - - $this->char += strlen($data); - - /* Switch to the data state. */ - $this->state = 'data'; - - /* If the end of the file was reached, reconsume the EOF character. */ - if($this->char === $this->EOF) { - $this->char = $this->EOF - 1; - } - } - - private function markupDeclarationOpenState() { - /* If the next two characters are both U+002D HYPHEN-MINUS (-) - characters, consume those two characters, create a comment token whose - data is the empty string, and switch to the comment state. */ - if($this->character($this->char + 1, 2) === '--') { - $this->char += 2; - $this->state = 'comment'; - $this->token = array( - 'data' => null, - 'type' => self::COMMENT - ); - - /* Otherwise if the next seven chacacters are a case-insensitive match - for the word "DOCTYPE", then consume those characters and switch to the - DOCTYPE state. */ - } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') { - $this->char += 7; - $this->state = 'doctype'; - - /* Otherwise, is is a parse error. Switch to the bogus comment state. - The next character that is consumed, if any, is the first character - that will be in the comment. */ - } else { - $this->char++; - $this->state = 'bogusComment'; - } - } - - private function commentState() { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - /* U+002D HYPHEN-MINUS (-) */ - if($char === '-') { - /* Switch to the comment dash state */ - $this->state = 'commentDash'; - - /* EOF */ - } elseif($this->char === $this->EOF) { - /* Parse error. Emit the comment token. Reconsume the EOF character - in the data state. */ - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - /* Anything else */ - } else { - /* Append the input character to the comment token's data. Stay in - the comment state. */ - $this->token['data'] .= $char; - } - } - - private function commentDashState() { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - /* U+002D HYPHEN-MINUS (-) */ - if($char === '-') { - /* Switch to the comment end state */ - $this->state = 'commentEnd'; - - /* EOF */ - } elseif($this->char === $this->EOF) { - /* Parse error. Emit the comment token. Reconsume the EOF character - in the data state. */ - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - /* Anything else */ - } else { - /* Append a U+002D HYPHEN-MINUS (-) character and the input - character to the comment token's data. Switch to the comment state. */ - $this->token['data'] .= '-'.$char; - $this->state = 'comment'; - } - } - - private function commentEndState() { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if($char === '>') { - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif($char === '-') { - $this->token['data'] .= '-'; - - } elseif($this->char === $this->EOF) { - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - } else { - $this->token['data'] .= '--'.$char; - $this->state = 'comment'; - } - } - - private function doctypeState() { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - $this->state = 'beforeDoctypeName'; - - } else { - $this->char--; - $this->state = 'beforeDoctypeName'; - } - } - - private function beforeDoctypeNameState() { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - // Stay in the before DOCTYPE name state. - - } elseif(preg_match('/^[a-z]$/', $char)) { - $this->token = array( - 'name' => strtoupper($char), - 'type' => self::DOCTYPE, - 'error' => true - ); - - $this->state = 'doctypeName'; - - } elseif($char === '>') { - $this->emitToken(array( - 'name' => null, - 'type' => self::DOCTYPE, - 'error' => true - )); - - $this->state = 'data'; - - } elseif($this->char === $this->EOF) { - $this->emitToken(array( - 'name' => null, - 'type' => self::DOCTYPE, - 'error' => true - )); - - $this->char--; - $this->state = 'data'; - - } else { - $this->token = array( - 'name' => $char, - 'type' => self::DOCTYPE, - 'error' => true - ); - - $this->state = 'doctypeName'; - } - } - - private function doctypeNameState() { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - $this->state = 'AfterDoctypeName'; - - } elseif($char === '>') { - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif(preg_match('/^[a-z]$/', $char)) { - $this->token['name'] .= strtoupper($char); - - } elseif($this->char === $this->EOF) { - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - } else { - $this->token['name'] .= $char; - } - - $this->token['error'] = ($this->token['name'] === 'HTML') - ? false - : true; - } - - private function afterDoctypeNameState() { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { - // Stay in the DOCTYPE name state. - - } elseif($char === '>') { - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif($this->char === $this->EOF) { - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - } else { - $this->token['error'] = true; - $this->state = 'bogusDoctype'; - } - } - - private function bogusDoctypeState() { - /* Consume the next input character: */ - $this->char++; - $char = $this->char(); - - if($char === '>') { - $this->emitToken($this->token); - $this->state = 'data'; - - } elseif($this->char === $this->EOF) { - $this->emitToken($this->token); - $this->char--; - $this->state = 'data'; - - } else { - // Stay in the bogus DOCTYPE state. - } - } - - private function entity() { - $start = $this->char; - - // This section defines how to consume an entity. This definition is - // used when parsing entities in text and in attributes. - - // The behaviour depends on the identity of the next character (the - // one immediately after the U+0026 AMPERSAND character): - - switch($this->character($this->char + 1)) { - // U+0023 NUMBER SIGN (#) - case '#': - - // The behaviour further depends on the character after the - // U+0023 NUMBER SIGN: - switch($this->character($this->char + 1)) { - // U+0078 LATIN SMALL LETTER X - // U+0058 LATIN CAPITAL LETTER X - case 'x': - case 'X': - // Follow the steps below, but using the range of - // characters U+0030 DIGIT ZERO through to U+0039 DIGIT - // NINE, U+0061 LATIN SMALL LETTER A through to U+0066 - // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER - // A, through to U+0046 LATIN CAPITAL LETTER F (in other - // words, 0-9, A-F, a-f). - $char = 1; - $char_class = '0-9A-Fa-f'; - break; - - // Anything else - default: - // Follow the steps below, but using the range of - // characters U+0030 DIGIT ZERO through to U+0039 DIGIT - // NINE (i.e. just 0-9). - $char = 0; - $char_class = '0-9'; - break; - } - - // Consume as many characters as match the range of characters - // given above. - $this->char++; - $e_name = $this->characters($char_class, $this->char + $char + 1); - $entity = $this->character($start, $this->char); - $cond = strlen($e_name) > 0; - - // The rest of the parsing happens bellow. - break; - - // Anything else - default: - // Consume the maximum number of characters possible, with the - // consumed characters case-sensitively matching one of the - // identifiers in the first column of the entities table. - $e_name = $this->characters('0-9A-Za-z;', $this->char + 1); - $len = strlen($e_name); - - for($c = 1; $c <= $len; $c++) { - $id = substr($e_name, 0, $c); - $this->char++; - - if(in_array($id, $this->entities)) { - if ($e_name[$c-1] !== ';') { - if ($c < $len && $e_name[$c] == ';') { - $this->char++; // consume extra semicolon - } - } - $entity = $id; - break; - } - } - - $cond = isset($entity); - // The rest of the parsing happens bellow. - break; - } - - if(!$cond) { - // If no match can be made, then this is a parse error. No - // characters are consumed, and nothing is returned. - $this->char = $start; - return false; - } - - // Return a character token for the character corresponding to the - // entity name (as given by the second column of the entities table). - return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8'); - } - - private function emitToken($token) { - $emit = $this->tree->emitToken($token); - - if(is_int($emit)) { - $this->content_model = $emit; - - } elseif($token['type'] === self::ENDTAG) { - $this->content_model = self::PCDATA; - } - } - - private function EOF() { - $this->state = null; - $this->tree->emitToken(array( - 'type' => self::EOF - )); - } -} - -class HTML5TreeConstructer { - public $stack = array(); - - private $phase; - private $mode; - private $dom; - private $foster_parent = null; - private $a_formatting = array(); - - private $head_pointer = null; - private $form_pointer = null; - - private $scoping = array('button','caption','html','marquee','object','table','td','th'); - private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u'); - private $special = array('address','area','base','basefont','bgsound', - 'blockquote','body','br','center','col','colgroup','dd','dir','div','dl', - 'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5', - 'h6','head','hr','iframe','image','img','input','isindex','li','link', - 'listing','menu','meta','noembed','noframes','noscript','ol','optgroup', - 'option','p','param','plaintext','pre','script','select','spacer','style', - 'tbody','textarea','tfoot','thead','title','tr','ul','wbr'); - - // The different phases. - const INIT_PHASE = 0; - const ROOT_PHASE = 1; - const MAIN_PHASE = 2; - const END_PHASE = 3; - - // The different insertion modes for the main phase. - const BEFOR_HEAD = 0; - const IN_HEAD = 1; - const AFTER_HEAD = 2; - const IN_BODY = 3; - const IN_TABLE = 4; - const IN_CAPTION = 5; - const IN_CGROUP = 6; - const IN_TBODY = 7; - const IN_ROW = 8; - const IN_CELL = 9; - const IN_SELECT = 10; - const AFTER_BODY = 11; - const IN_FRAME = 12; - const AFTR_FRAME = 13; - - // The different types of elements. - const SPECIAL = 0; - const SCOPING = 1; - const FORMATTING = 2; - const PHRASING = 3; - - const MARKER = 0; - - public function __construct() { - $this->phase = self::INIT_PHASE; - $this->mode = self::BEFOR_HEAD; - $this->dom = new DOMDocument; - - $this->dom->encoding = 'UTF-8'; - $this->dom->preserveWhiteSpace = true; - $this->dom->substituteEntities = true; - $this->dom->strictErrorChecking = false; - } - - // Process tag tokens - public function emitToken($token) { - switch($this->phase) { - case self::INIT_PHASE: return $this->initPhase($token); break; - case self::ROOT_PHASE: return $this->rootElementPhase($token); break; - case self::MAIN_PHASE: return $this->mainPhase($token); break; - case self::END_PHASE : return $this->trailingEndPhase($token); break; - } - } - - private function initPhase($token) { - /* Initially, the tree construction stage must handle each token - emitted from the tokenisation stage as follows: */ - - /* A DOCTYPE token that is marked as being in error - A comment token - A start tag token - An end tag token - A character token that is not one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE - An end-of-file token */ - if((isset($token['error']) && $token['error']) || - $token['type'] === HTML5::COMMENT || - $token['type'] === HTML5::STARTTAG || - $token['type'] === HTML5::ENDTAG || - $token['type'] === HTML5::EOF || - ($token['type'] === HTML5::CHARACTR && isset($token['data']) && - !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) { - /* This specification does not define how to handle this case. In - particular, user agents may ignore the entirety of this specification - altogether for such documents, and instead invoke special parse modes - with a greater emphasis on backwards compatibility. */ - - $this->phase = self::ROOT_PHASE; - return $this->rootElementPhase($token); - - /* A DOCTYPE token marked as being correct */ - } elseif(isset($token['error']) && !$token['error']) { - /* Append a DocumentType node to the Document node, with the name - attribute set to the name given in the DOCTYPE token (which will be - "HTML"), and the other attributes specific to DocumentType objects - set to null, empty lists, or the empty string as appropriate. */ - $doctype = new DOMDocumentType(null, null, 'HTML'); - - /* Then, switch to the root element phase of the tree construction - stage. */ - $this->phase = self::ROOT_PHASE; - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/', - $token['data'])) { - /* Append that character to the Document node. */ - $text = $this->dom->createTextNode($token['data']); - $this->dom->appendChild($text); - } - } - - private function rootElementPhase($token) { - /* After the initial phase, as each token is emitted from the tokenisation - stage, it must be processed as described in this section. */ - - /* A DOCTYPE token */ - if($token['type'] === HTML5::DOCTYPE) { - // Parse error. Ignore the token. - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the Document object with the data - attribute set to the data given in the comment token. */ - $comment = $this->dom->createComment($token['data']); - $this->dom->appendChild($comment); - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - } elseif($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { - /* Append that character to the Document node. */ - $text = $this->dom->createTextNode($token['data']); - $this->dom->appendChild($text); - - /* A character token that is not one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED - (FF), or U+0020 SPACE - A start tag token - An end tag token - An end-of-file token */ - } elseif(($token['type'] === HTML5::CHARACTR && - !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || - $token['type'] === HTML5::STARTTAG || - $token['type'] === HTML5::ENDTAG || - $token['type'] === HTML5::EOF) { - /* Create an HTMLElement node with the tag name html, in the HTML - namespace. Append it to the Document object. Switch to the main - phase and reprocess the current token. */ - $html = $this->dom->createElement('html'); - $this->dom->appendChild($html); - $this->stack[] = $html; - - $this->phase = self::MAIN_PHASE; - return $this->mainPhase($token); - } - } - - private function mainPhase($token) { - /* Tokens in the main phase must be handled as follows: */ - - /* A DOCTYPE token */ - if($token['type'] === HTML5::DOCTYPE) { - // Parse error. Ignore the token. - - /* A start tag token with the tag name "html" */ - } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') { - /* If this start tag token was not the first start tag token, then - it is a parse error. */ - - /* For each attribute on the token, check to see if the attribute - is already present on the top element of the stack of open elements. - If it is not, add the attribute and its corresponding value to that - element. */ - foreach($token['attr'] as $attr) { - if(!$this->stack[0]->hasAttribute($attr['name'])) { - $this->stack[0]->setAttribute($attr['name'], $attr['value']); - } - } - - /* An end-of-file token */ - } elseif($token['type'] === HTML5::EOF) { - /* Generate implied end tags. */ - $this->generateImpliedEndTags(); - - /* Anything else. */ - } else { - /* Depends on the insertion mode: */ - switch($this->mode) { - case self::BEFOR_HEAD: return $this->beforeHead($token); break; - case self::IN_HEAD: return $this->inHead($token); break; - case self::AFTER_HEAD: return $this->afterHead($token); break; - case self::IN_BODY: return $this->inBody($token); break; - case self::IN_TABLE: return $this->inTable($token); break; - case self::IN_CAPTION: return $this->inCaption($token); break; - case self::IN_CGROUP: return $this->inColumnGroup($token); break; - case self::IN_TBODY: return $this->inTableBody($token); break; - case self::IN_ROW: return $this->inRow($token); break; - case self::IN_CELL: return $this->inCell($token); break; - case self::IN_SELECT: return $this->inSelect($token); break; - case self::AFTER_BODY: return $this->afterBody($token); break; - case self::IN_FRAME: return $this->inFrameset($token); break; - case self::AFTR_FRAME: return $this->afterFrameset($token); break; - case self::END_PHASE: return $this->trailingEndPhase($token); break; - } - } - } - - private function beforeHead($token) { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - if($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { - /* Append the character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data attribute - set to the data given in the comment token. */ - $this->insertComment($token['data']); - - /* A start tag token with the tag name "head" */ - } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') { - /* Create an element for the token, append the new element to the - current node and push it onto the stack of open elements. */ - $element = $this->insertElement($token); - - /* Set the head element pointer to this new element node. */ - $this->head_pointer = $element; - - /* Change the insertion mode to "in head". */ - $this->mode = self::IN_HEAD; - - /* A start tag token whose tag name is one of: "base", "link", "meta", - "script", "style", "title". Or an end tag with the tag name "html". - Or a character token that is not one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE. Or any other start tag token */ - } elseif($token['type'] === HTML5::STARTTAG || - ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') || - ($token['type'] === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]$/', - $token['data']))) { - /* Act as if a start tag token with the tag name "head" and no - attributes had been seen, then reprocess the current token. */ - $this->beforeHead(array( - 'name' => 'head', - 'type' => HTML5::STARTTAG, - 'attr' => array() - )); - - return $this->inHead($token); - - /* Any other end tag */ - } elseif($token['type'] === HTML5::ENDTAG) { - /* Parse error. Ignore the token. */ - } - } - - private function inHead($token) { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE. - - THIS DIFFERS FROM THE SPEC: If the current node is either a title, style - or script element, append the character to the current node regardless - of its content. */ - if(($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || ( - $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName, - array('title', 'style', 'script')))) { - /* Append the character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data attribute - set to the data given in the comment token. */ - $this->insertComment($token['data']); - - } elseif($token['type'] === HTML5::ENDTAG && - in_array($token['name'], array('title', 'style', 'script'))) { - array_pop($this->stack); - return HTML5::PCDATA; - - /* A start tag with the tag name "title" */ - } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') { - /* Create an element for the token and append the new element to the - node pointed to by the head element pointer, or, if that is null - (innerHTML case), to the current node. */ - if($this->head_pointer !== null) { - $element = $this->insertElement($token, false); - $this->head_pointer->appendChild($element); - - } else { - $element = $this->insertElement($token); - } - - /* Switch the tokeniser's content model flag to the RCDATA state. */ - return HTML5::RCDATA; - - /* A start tag with the tag name "style" */ - } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') { - /* Create an element for the token and append the new element to the - node pointed to by the head element pointer, or, if that is null - (innerHTML case), to the current node. */ - if($this->head_pointer !== null) { - $element = $this->insertElement($token, false); - $this->head_pointer->appendChild($element); - - } else { - $this->insertElement($token); - } - - /* Switch the tokeniser's content model flag to the CDATA state. */ - return HTML5::CDATA; - - /* A start tag with the tag name "script" */ - } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') { - /* Create an element for the token. */ - $element = $this->insertElement($token, false); - $this->head_pointer->appendChild($element); - - /* Switch the tokeniser's content model flag to the CDATA state. */ - return HTML5::CDATA; - - /* A start tag with the tag name "base", "link", or "meta" */ - } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], - array('base', 'link', 'meta'))) { - /* Create an element for the token and append the new element to the - node pointed to by the head element pointer, or, if that is null - (innerHTML case), to the current node. */ - if($this->head_pointer !== null) { - $element = $this->insertElement($token, false); - $this->head_pointer->appendChild($element); - array_pop($this->stack); - - } else { - $this->insertElement($token); - } - - /* An end tag with the tag name "head" */ - } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') { - /* If the current node is a head element, pop the current node off - the stack of open elements. */ - if($this->head_pointer->isSameNode(end($this->stack))) { - array_pop($this->stack); - - /* Otherwise, this is a parse error. */ - } else { - // k - } - - /* Change the insertion mode to "after head". */ - $this->mode = self::AFTER_HEAD; - - /* A start tag with the tag name "head" or an end tag except "html". */ - } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') || - ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) { - // Parse error. Ignore the token. - - /* Anything else */ - } else { - /* If the current node is a head element, act as if an end tag - token with the tag name "head" had been seen. */ - if($this->head_pointer->isSameNode(end($this->stack))) { - $this->inHead(array( - 'name' => 'head', - 'type' => HTML5::ENDTAG - )); - - /* Otherwise, change the insertion mode to "after head". */ - } else { - $this->mode = self::AFTER_HEAD; - } - - /* Then, reprocess the current token. */ - return $this->afterHead($token); - } - } - - private function afterHead($token) { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - if($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { - /* Append the character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data attribute - set to the data given in the comment token. */ - $this->insertComment($token['data']); - - /* A start tag token with the tag name "body" */ - } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') { - /* Insert a body element for the token. */ - $this->insertElement($token); - - /* Change the insertion mode to "in body". */ - $this->mode = self::IN_BODY; - - /* A start tag token with the tag name "frameset" */ - } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') { - /* Insert a frameset element for the token. */ - $this->insertElement($token); - - /* Change the insertion mode to "in frameset". */ - $this->mode = self::IN_FRAME; - - /* A start tag token whose tag name is one of: "base", "link", "meta", - "script", "style", "title" */ - } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], - array('base', 'link', 'meta', 'script', 'style', 'title'))) { - /* Parse error. Switch the insertion mode back to "in head" and - reprocess the token. */ - $this->mode = self::IN_HEAD; - return $this->inHead($token); - - /* Anything else */ - } else { - /* Act as if a start tag token with the tag name "body" and no - attributes had been seen, and then reprocess the current token. */ - $this->afterHead(array( - 'name' => 'body', - 'type' => HTML5::STARTTAG, - 'attr' => array() - )); - - return $this->inBody($token); - } - } - - private function inBody($token) { - /* Handle the token as follows: */ - - switch($token['type']) { - /* A character token */ - case HTML5::CHARACTR: - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Append the token's character to the current node. */ - $this->insertText($token['data']); - break; - - /* A comment token */ - case HTML5::COMMENT: - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $this->insertComment($token['data']); - break; - - case HTML5::STARTTAG: - switch($token['name']) { - /* A start tag token whose tag name is one of: "script", - "style" */ - case 'script': case 'style': - /* Process the token as if the insertion mode had been "in - head". */ - return $this->inHead($token); - break; - - /* A start tag token whose tag name is one of: "base", "link", - "meta", "title" */ - case 'base': case 'link': case 'meta': case 'title': - /* Parse error. Process the token as if the insertion mode - had been "in head". */ - return $this->inHead($token); - break; - - /* A start tag token with the tag name "body" */ - case 'body': - /* Parse error. If the second element on the stack of open - elements is not a body element, or, if the stack of open - elements has only one node on it, then ignore the token. - (innerHTML case) */ - if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') { - // Ignore - - /* Otherwise, for each attribute on the token, check to see - if the attribute is already present on the body element (the - second element) on the stack of open elements. If it is not, - add the attribute and its corresponding value to that - element. */ - } else { - foreach($token['attr'] as $attr) { - if(!$this->stack[1]->hasAttribute($attr['name'])) { - $this->stack[1]->setAttribute($attr['name'], $attr['value']); - } - } - } - break; - - /* A start tag whose tag name is one of: "address", - "blockquote", "center", "dir", "div", "dl", "fieldset", - "listing", "menu", "ol", "p", "ul" */ - case 'address': case 'blockquote': case 'center': case 'dir': - case 'div': case 'dl': case 'fieldset': case 'listing': - case 'menu': case 'ol': case 'p': case 'ul': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been - seen. */ - if($this->elementInScope('p')) { - $this->emitToken(array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - )); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - break; - - /* A start tag whose tag name is "form" */ - case 'form': - /* If the form element pointer is not null, ignore the - token with a parse error. */ - if($this->form_pointer !== null) { - // Ignore. - - /* Otherwise: */ - } else { - /* If the stack of open elements has a p element in - scope, then act as if an end tag with the tag name p - had been seen. */ - if($this->elementInScope('p')) { - $this->emitToken(array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - )); - } - - /* Insert an HTML element for the token, and set the - form element pointer to point to the element created. */ - $element = $this->insertElement($token); - $this->form_pointer = $element; - } - break; - - /* A start tag whose tag name is "li", "dd" or "dt" */ - case 'li': case 'dd': case 'dt': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been - seen. */ - if($this->elementInScope('p')) { - $this->emitToken(array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - )); - } - - $stack_length = count($this->stack) - 1; - - for($n = $stack_length; 0 <= $n; $n--) { - /* 1. Initialise node to be the current node (the - bottommost node of the stack). */ - $stop = false; - $node = $this->stack[$n]; - $cat = $this->getElementCategory($node->tagName); - - /* 2. If node is an li, dd or dt element, then pop all - the nodes from the current node up to node, including - node, then stop this algorithm. */ - if($token['name'] === $node->tagName || ($token['name'] !== 'li' - && ($node->tagName === 'dd' || $node->tagName === 'dt'))) { - for($x = $stack_length; $x >= $n ; $x--) { - array_pop($this->stack); - } - - break; - } - - /* 3. If node is not in the formatting category, and is - not in the phrasing category, and is not an address or - div element, then stop this algorithm. */ - if($cat !== self::FORMATTING && $cat !== self::PHRASING && - $node->tagName !== 'address' && $node->tagName !== 'div') { - break; - } - } - - /* Finally, insert an HTML element with the same tag - name as the token's. */ - $this->insertElement($token); - break; - - /* A start tag token whose tag name is "plaintext" */ - case 'plaintext': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been - seen. */ - if($this->elementInScope('p')) { - $this->emitToken(array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - )); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - return HTML5::PLAINTEXT; - break; - - /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4", - "h5", "h6" */ - case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been seen. */ - if($this->elementInScope('p')) { - $this->emitToken(array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - )); - } - - /* If the stack of open elements has in scope an element whose - tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then - this is a parse error; pop elements from the stack until an - element with one of those tag names has been popped from the - stack. */ - while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) { - array_pop($this->stack); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - break; - - /* A start tag whose tag name is "a" */ - case 'a': - /* If the list of active formatting elements contains - an element whose tag name is "a" between the end of the - list and the last marker on the list (or the start of - the list if there is no marker on the list), then this - is a parse error; act as if an end tag with the tag name - "a" had been seen, then remove that element from the list - of active formatting elements and the stack of open - elements if the end tag didn't already remove it (it - might not have if the element is not in table scope). */ - $leng = count($this->a_formatting); - - for($n = $leng - 1; $n >= 0; $n--) { - if($this->a_formatting[$n] === self::MARKER) { - break; - - } elseif($this->a_formatting[$n]->nodeName === 'a') { - $this->emitToken(array( - 'name' => 'a', - 'type' => HTML5::ENDTAG - )); - break; - } - } - - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $el = $this->insertElement($token); - - /* Add that element to the list of active formatting - elements. */ - $this->a_formatting[] = $el; - break; - - /* A start tag whose tag name is one of: "b", "big", "em", "font", - "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ - case 'b': case 'big': case 'em': case 'font': case 'i': - case 'nobr': case 's': case 'small': case 'strike': - case 'strong': case 'tt': case 'u': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $el = $this->insertElement($token); - - /* Add that element to the list of active formatting - elements. */ - $this->a_formatting[] = $el; - break; - - /* A start tag token whose tag name is "button" */ - case 'button': - /* If the stack of open elements has a button element in scope, - then this is a parse error; act as if an end tag with the tag - name "button" had been seen, then reprocess the token. (We don't - do that. Unnecessary.) */ - if($this->elementInScope('button')) { - $this->inBody(array( - 'name' => 'button', - 'type' => HTML5::ENDTAG - )); - } - - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Insert a marker at the end of the list of active - formatting elements. */ - $this->a_formatting[] = self::MARKER; - break; - - /* A start tag token whose tag name is one of: "marquee", "object" */ - case 'marquee': case 'object': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Insert a marker at the end of the list of active - formatting elements. */ - $this->a_formatting[] = self::MARKER; - break; - - /* A start tag token whose tag name is "xmp" */ - case 'xmp': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Switch the content model flag to the CDATA state. */ - return HTML5::CDATA; - break; - - /* A start tag whose tag name is "table" */ - case 'table': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been seen. */ - if($this->elementInScope('p')) { - $this->emitToken(array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - )); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Change the insertion mode to "in table". */ - $this->mode = self::IN_TABLE; - break; - - /* A start tag whose tag name is one of: "area", "basefont", - "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */ - case 'area': case 'basefont': case 'bgsound': case 'br': - case 'embed': case 'img': case 'param': case 'spacer': - case 'wbr': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Immediately pop the current node off the stack of open elements. */ - array_pop($this->stack); - break; - - /* A start tag whose tag name is "hr" */ - case 'hr': - /* If the stack of open elements has a p element in scope, - then act as if an end tag with the tag name p had been seen. */ - if($this->elementInScope('p')) { - $this->emitToken(array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - )); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Immediately pop the current node off the stack of open elements. */ - array_pop($this->stack); - break; - - /* A start tag whose tag name is "image" */ - case 'image': - /* Parse error. Change the token's tag name to "img" and - reprocess it. (Don't ask.) */ - $token['name'] = 'img'; - return $this->inBody($token); - break; - - /* A start tag whose tag name is "input" */ - case 'input': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an input element for the token. */ - $element = $this->insertElement($token, false); - - /* If the form element pointer is not null, then associate the - input element with the form element pointed to by the form - element pointer. */ - $this->form_pointer !== null - ? $this->form_pointer->appendChild($element) - : end($this->stack)->appendChild($element); - - /* Pop that input element off the stack of open elements. */ - array_pop($this->stack); - break; - - /* A start tag whose tag name is "isindex" */ - case 'isindex': - /* Parse error. */ - // w/e - - /* If the form element pointer is not null, - then ignore the token. */ - if($this->form_pointer === null) { - /* Act as if a start tag token with the tag name "form" had - been seen. */ - $this->inBody(array( - 'name' => 'body', - 'type' => HTML5::STARTTAG, - 'attr' => array() - )); - - /* Act as if a start tag token with the tag name "hr" had - been seen. */ - $this->inBody(array( - 'name' => 'hr', - 'type' => HTML5::STARTTAG, - 'attr' => array() - )); - - /* Act as if a start tag token with the tag name "p" had - been seen. */ - $this->inBody(array( - 'name' => 'p', - 'type' => HTML5::STARTTAG, - 'attr' => array() - )); - - /* Act as if a start tag token with the tag name "label" - had been seen. */ - $this->inBody(array( - 'name' => 'label', - 'type' => HTML5::STARTTAG, - 'attr' => array() - )); - - /* Act as if a stream of character tokens had been seen. */ - $this->insertText('This is a searchable index. '. - 'Insert your search keywords here: '); - - /* Act as if a start tag token with the tag name "input" - had been seen, with all the attributes from the "isindex" - token, except with the "name" attribute set to the value - "isindex" (ignoring any explicit "name" attribute). */ - $attr = $token['attr']; - $attr[] = array('name' => 'name', 'value' => 'isindex'); - - $this->inBody(array( - 'name' => 'input', - 'type' => HTML5::STARTTAG, - 'attr' => $attr - )); - - /* Act as if a stream of character tokens had been seen - (see below for what they should say). */ - $this->insertText('This is a searchable index. '. - 'Insert your search keywords here: '); - - /* Act as if an end tag token with the tag name "label" - had been seen. */ - $this->inBody(array( - 'name' => 'label', - 'type' => HTML5::ENDTAG - )); - - /* Act as if an end tag token with the tag name "p" had - been seen. */ - $this->inBody(array( - 'name' => 'p', - 'type' => HTML5::ENDTAG - )); - - /* Act as if a start tag token with the tag name "hr" had - been seen. */ - $this->inBody(array( - 'name' => 'hr', - 'type' => HTML5::ENDTAG - )); - - /* Act as if an end tag token with the tag name "form" had - been seen. */ - $this->inBody(array( - 'name' => 'form', - 'type' => HTML5::ENDTAG - )); - } - break; - - /* A start tag whose tag name is "textarea" */ - case 'textarea': - $this->insertElement($token); - - /* Switch the tokeniser's content model flag to the - RCDATA state. */ - return HTML5::RCDATA; - break; - - /* A start tag whose tag name is one of: "iframe", "noembed", - "noframes" */ - case 'iframe': case 'noembed': case 'noframes': - $this->insertElement($token); - - /* Switch the tokeniser's content model flag to the CDATA state. */ - return HTML5::CDATA; - break; - - /* A start tag whose tag name is "select" */ - case 'select': - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Change the insertion mode to "in select". */ - $this->mode = self::IN_SELECT; - break; - - /* A start or end tag whose tag name is one of: "caption", "col", - "colgroup", "frame", "frameset", "head", "option", "optgroup", - "tbody", "td", "tfoot", "th", "thead", "tr". */ - case 'caption': case 'col': case 'colgroup': case 'frame': - case 'frameset': case 'head': case 'option': case 'optgroup': - case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': - case 'tr': - // Parse error. Ignore the token. - break; - - /* A start or end tag whose tag name is one of: "event-source", - "section", "nav", "article", "aside", "header", "footer", - "datagrid", "command" */ - case 'event-source': case 'section': case 'nav': case 'article': - case 'aside': case 'header': case 'footer': case 'datagrid': - case 'command': - // Work in progress! - break; - - /* A start tag token not covered by the previous entries */ - default: - /* Reconstruct the active formatting elements, if any. */ - $this->reconstructActiveFormattingElements(); - - $this->insertElement($token, true, true); - break; - } - break; - - case HTML5::ENDTAG: - switch($token['name']) { - /* An end tag with the tag name "body" */ - case 'body': - /* If the second element in the stack of open elements is - not a body element, this is a parse error. Ignore the token. - (innerHTML case) */ - if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') { - // Ignore. - - /* If the current node is not the body element, then this - is a parse error. */ - } elseif(end($this->stack)->nodeName !== 'body') { - // Parse error. - } - - /* Change the insertion mode to "after body". */ - $this->mode = self::AFTER_BODY; - break; - - /* An end tag with the tag name "html" */ - case 'html': - /* Act as if an end tag with tag name "body" had been seen, - then, if that token wasn't ignored, reprocess the current - token. */ - $this->inBody(array( - 'name' => 'body', - 'type' => HTML5::ENDTAG - )); - - return $this->afterBody($token); - break; - - /* An end tag whose tag name is one of: "address", "blockquote", - "center", "dir", "div", "dl", "fieldset", "listing", "menu", - "ol", "pre", "ul" */ - case 'address': case 'blockquote': case 'center': case 'dir': - case 'div': case 'dl': case 'fieldset': case 'listing': - case 'menu': case 'ol': case 'pre': case 'ul': - /* If the stack of open elements has an element in scope - with the same tag name as that of the token, then generate - implied end tags. */ - if($this->elementInScope($token['name'])) { - $this->generateImpliedEndTags(); - - /* Now, if the current node is not an element with - the same tag name as that of the token, then this - is a parse error. */ - // w/e - - /* If the stack of open elements has an element in - scope with the same tag name as that of the token, - then pop elements from this stack until an element - with that tag name has been popped from the stack. */ - for($n = count($this->stack) - 1; $n >= 0; $n--) { - if($this->stack[$n]->nodeName === $token['name']) { - $n = -1; - } - - array_pop($this->stack); - } - } - break; - - /* An end tag whose tag name is "form" */ - case 'form': - /* If the stack of open elements has an element in scope - with the same tag name as that of the token, then generate - implied end tags. */ - if($this->elementInScope($token['name'])) { - $this->generateImpliedEndTags(); - - } - - if(end($this->stack)->nodeName !== $token['name']) { - /* Now, if the current node is not an element with the - same tag name as that of the token, then this is a parse - error. */ - // w/e - - } else { - /* Otherwise, if the current node is an element with - the same tag name as that of the token pop that element - from the stack. */ - array_pop($this->stack); - } - - /* In any case, set the form element pointer to null. */ - $this->form_pointer = null; - break; - - /* An end tag whose tag name is "p" */ - case 'p': - /* If the stack of open elements has a p element in scope, - then generate implied end tags, except for p elements. */ - if($this->elementInScope('p')) { - $this->generateImpliedEndTags(array('p')); - - /* If the current node is not a p element, then this is - a parse error. */ - // k - - /* If the stack of open elements has a p element in - scope, then pop elements from this stack until the stack - no longer has a p element in scope. */ - for($n = count($this->stack) - 1; $n >= 0; $n--) { - if($this->elementInScope('p')) { - array_pop($this->stack); - - } else { - break; - } - } - } - break; - - /* An end tag whose tag name is "dd", "dt", or "li" */ - case 'dd': case 'dt': case 'li': - /* If the stack of open elements has an element in scope - whose tag name matches the tag name of the token, then - generate implied end tags, except for elements with the - same tag name as the token. */ - if($this->elementInScope($token['name'])) { - $this->generateImpliedEndTags(array($token['name'])); - - /* If the current node is not an element with the same - tag name as the token, then this is a parse error. */ - // w/e - - /* If the stack of open elements has an element in scope - whose tag name matches the tag name of the token, then - pop elements from this stack until an element with that - tag name has been popped from the stack. */ - for($n = count($this->stack) - 1; $n >= 0; $n--) { - if($this->stack[$n]->nodeName === $token['name']) { - $n = -1; - } - - array_pop($this->stack); - } - } - break; - - /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4", - "h5", "h6" */ - case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': - $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); - - /* If the stack of open elements has in scope an element whose - tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then - generate implied end tags. */ - if($this->elementInScope($elements)) { - $this->generateImpliedEndTags(); - - /* Now, if the current node is not an element with the same - tag name as that of the token, then this is a parse error. */ - // w/e - - /* If the stack of open elements has in scope an element - whose tag name is one of "h1", "h2", "h3", "h4", "h5", or - "h6", then pop elements from the stack until an element - with one of those tag names has been popped from the stack. */ - while($this->elementInScope($elements)) { - array_pop($this->stack); - } - } - break; - - /* An end tag whose tag name is one of: "a", "b", "big", "em", - "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ - case 'a': case 'b': case 'big': case 'em': case 'font': - case 'i': case 'nobr': case 's': case 'small': case 'strike': - case 'strong': case 'tt': case 'u': - /* 1. Let the formatting element be the last element in - the list of active formatting elements that: - * is between the end of the list and the last scope - marker in the list, if any, or the start of the list - otherwise, and - * has the same tag name as the token. - */ - while(true) { - for($a = count($this->a_formatting) - 1; $a >= 0; $a--) { - if($this->a_formatting[$a] === self::MARKER) { - break; - - } elseif($this->a_formatting[$a]->tagName === $token['name']) { - $formatting_element = $this->a_formatting[$a]; - $in_stack = in_array($formatting_element, $this->stack, true); - $fe_af_pos = $a; - break; - } - } - - /* If there is no such node, or, if that node is - also in the stack of open elements but the element - is not in scope, then this is a parse error. Abort - these steps. The token is ignored. */ - if(!isset($formatting_element) || ($in_stack && - !$this->elementInScope($token['name']))) { - break; - - /* Otherwise, if there is such a node, but that node - is not in the stack of open elements, then this is a - parse error; remove the element from the list, and - abort these steps. */ - } elseif(isset($formatting_element) && !$in_stack) { - unset($this->a_formatting[$fe_af_pos]); - $this->a_formatting = array_merge($this->a_formatting); - break; - } - - /* 2. Let the furthest block be the topmost node in the - stack of open elements that is lower in the stack - than the formatting element, and is not an element in - the phrasing or formatting categories. There might - not be one. */ - $fe_s_pos = array_search($formatting_element, $this->stack, true); - $length = count($this->stack); - - for($s = $fe_s_pos + 1; $s < $length; $s++) { - $category = $this->getElementCategory($this->stack[$s]->nodeName); - - if($category !== self::PHRASING && $category !== self::FORMATTING) { - $furthest_block = $this->stack[$s]; - } - } - - /* 3. If there is no furthest block, then the UA must - skip the subsequent steps and instead just pop all - the nodes from the bottom of the stack of open - elements, from the current node up to the formatting - element, and remove the formatting element from the - list of active formatting elements. */ - if(!isset($furthest_block)) { - for($n = $length - 1; $n >= $fe_s_pos; $n--) { - array_pop($this->stack); - } - - unset($this->a_formatting[$fe_af_pos]); - $this->a_formatting = array_merge($this->a_formatting); - break; - } - - /* 4. Let the common ancestor be the element - immediately above the formatting element in the stack - of open elements. */ - $common_ancestor = $this->stack[$fe_s_pos - 1]; - - /* 5. If the furthest block has a parent node, then - remove the furthest block from its parent node. */ - if($furthest_block->parentNode !== null) { - $furthest_block->parentNode->removeChild($furthest_block); - } - - /* 6. Let a bookmark note the position of the - formatting element in the list of active formatting - elements relative to the elements on either side - of it in the list. */ - $bookmark = $fe_af_pos; - - /* 7. Let node and last node be the furthest block. - Follow these steps: */ - $node = $furthest_block; - $last_node = $furthest_block; - - while(true) { - for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) { - /* 7.1 Let node be the element immediately - prior to node in the stack of open elements. */ - $node = $this->stack[$n]; - - /* 7.2 If node is not in the list of active - formatting elements, then remove node from - the stack of open elements and then go back - to step 1. */ - if(!in_array($node, $this->a_formatting, true)) { - unset($this->stack[$n]); - $this->stack = array_merge($this->stack); - - } else { - break; - } - } - - /* 7.3 Otherwise, if node is the formatting - element, then go to the next step in the overall - algorithm. */ - if($node === $formatting_element) { - break; - - /* 7.4 Otherwise, if last node is the furthest - block, then move the aforementioned bookmark to - be immediately after the node in the list of - active formatting elements. */ - } elseif($last_node === $furthest_block) { - $bookmark = array_search($node, $this->a_formatting, true) + 1; - } - - /* 7.5 If node has any children, perform a - shallow clone of node, replace the entry for - node in the list of active formatting elements - with an entry for the clone, replace the entry - for node in the stack of open elements with an - entry for the clone, and let node be the clone. */ - if($node->hasChildNodes()) { - $clone = $node->cloneNode(); - $s_pos = array_search($node, $this->stack, true); - $a_pos = array_search($node, $this->a_formatting, true); - - $this->stack[$s_pos] = $clone; - $this->a_formatting[$a_pos] = $clone; - $node = $clone; - } - - /* 7.6 Insert last node into node, first removing - it from its previous parent node if any. */ - if($last_node->parentNode !== null) { - $last_node->parentNode->removeChild($last_node); - } - - $node->appendChild($last_node); - - /* 7.7 Let last node be node. */ - $last_node = $node; - } - - /* 8. Insert whatever last node ended up being in - the previous step into the common ancestor node, - first removing it from its previous parent node if - any. */ - if($last_node->parentNode !== null) { - $last_node->parentNode->removeChild($last_node); - } - - $common_ancestor->appendChild($last_node); - - /* 9. Perform a shallow clone of the formatting - element. */ - $clone = $formatting_element->cloneNode(); - - /* 10. Take all of the child nodes of the furthest - block and append them to the clone created in the - last step. */ - while($furthest_block->hasChildNodes()) { - $child = $furthest_block->firstChild; - $furthest_block->removeChild($child); - $clone->appendChild($child); - } - - /* 11. Append that clone to the furthest block. */ - $furthest_block->appendChild($clone); - - /* 12. Remove the formatting element from the list - of active formatting elements, and insert the clone - into the list of active formatting elements at the - position of the aforementioned bookmark. */ - $fe_af_pos = array_search($formatting_element, $this->a_formatting, true); - unset($this->a_formatting[$fe_af_pos]); - $this->a_formatting = array_merge($this->a_formatting); - - $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1); - $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting)); - $this->a_formatting = array_merge($af_part1, array($clone), $af_part2); - - /* 13. Remove the formatting element from the stack - of open elements, and insert the clone into the stack - of open elements immediately after (i.e. in a more - deeply nested position than) the position of the - furthest block in that stack. */ - $fe_s_pos = array_search($formatting_element, $this->stack, true); - $fb_s_pos = array_search($furthest_block, $this->stack, true); - unset($this->stack[$fe_s_pos]); - - $s_part1 = array_slice($this->stack, 0, $fb_s_pos); - $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack)); - $this->stack = array_merge($s_part1, array($clone), $s_part2); - - /* 14. Jump back to step 1 in this series of steps. */ - unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block); - } - break; - - /* An end tag token whose tag name is one of: "button", - "marquee", "object" */ - case 'button': case 'marquee': case 'object': - /* If the stack of open elements has an element in scope whose - tag name matches the tag name of the token, then generate implied - tags. */ - if($this->elementInScope($token['name'])) { - $this->generateImpliedEndTags(); - - /* Now, if the current node is not an element with the same - tag name as the token, then this is a parse error. */ - // k - - /* Now, if the stack of open elements has an element in scope - whose tag name matches the tag name of the token, then pop - elements from the stack until that element has been popped from - the stack, and clear the list of active formatting elements up - to the last marker. */ - for($n = count($this->stack) - 1; $n >= 0; $n--) { - if($this->stack[$n]->nodeName === $token['name']) { - $n = -1; - } - - array_pop($this->stack); - } - - $marker = end(array_keys($this->a_formatting, self::MARKER, true)); - - for($n = count($this->a_formatting) - 1; $n > $marker; $n--) { - array_pop($this->a_formatting); - } - } - break; - - /* Or an end tag whose tag name is one of: "area", "basefont", - "bgsound", "br", "embed", "hr", "iframe", "image", "img", - "input", "isindex", "noembed", "noframes", "param", "select", - "spacer", "table", "textarea", "wbr" */ - case 'area': case 'basefont': case 'bgsound': case 'br': - case 'embed': case 'hr': case 'iframe': case 'image': - case 'img': case 'input': case 'isindex': case 'noembed': - case 'noframes': case 'param': case 'select': case 'spacer': - case 'table': case 'textarea': case 'wbr': - // Parse error. Ignore the token. - break; - - /* An end tag token not covered by the previous entries */ - default: - for($n = count($this->stack) - 1; $n >= 0; $n--) { - /* Initialise node to be the current node (the bottommost - node of the stack). */ - $node = end($this->stack); - - /* If node has the same tag name as the end tag token, - then: */ - if($token['name'] === $node->nodeName) { - /* Generate implied end tags. */ - $this->generateImpliedEndTags(); - - /* If the tag name of the end tag token does not - match the tag name of the current node, this is a - parse error. */ - // k - - /* Pop all the nodes from the current node up to - node, including node, then stop this algorithm. */ - for($x = count($this->stack) - $n; $x >= $n; $x--) { - array_pop($this->stack); - } - - } else { - $category = $this->getElementCategory($node); - - if($category !== self::SPECIAL && $category !== self::SCOPING) { - /* Otherwise, if node is in neither the formatting - category nor the phrasing category, then this is a - parse error. Stop this algorithm. The end tag token - is ignored. */ - return false; - } - } - } - break; - } - break; - } - } - - private function inTable($token) { - $clear = array('html', 'table'); - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - if($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { - /* Append the character to the current node. */ - $text = $this->dom->createTextNode($token['data']); - end($this->stack)->appendChild($text); - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $comment = $this->dom->createComment($token['data']); - end($this->stack)->appendChild($comment); - - /* A start tag whose tag name is "caption" */ - } elseif($token['type'] === HTML5::STARTTAG && - $token['name'] === 'caption') { - /* Clear the stack back to a table context. */ - $this->clearStackToTableContext($clear); - - /* Insert a marker at the end of the list of active - formatting elements. */ - $this->a_formatting[] = self::MARKER; - - /* Insert an HTML element for the token, then switch the - insertion mode to "in caption". */ - $this->insertElement($token); - $this->mode = self::IN_CAPTION; - - /* A start tag whose tag name is "colgroup" */ - } elseif($token['type'] === HTML5::STARTTAG && - $token['name'] === 'colgroup') { - /* Clear the stack back to a table context. */ - $this->clearStackToTableContext($clear); - - /* Insert an HTML element for the token, then switch the - insertion mode to "in column group". */ - $this->insertElement($token); - $this->mode = self::IN_CGROUP; - - /* A start tag whose tag name is "col" */ - } elseif($token['type'] === HTML5::STARTTAG && - $token['name'] === 'col') { - $this->inTable(array( - 'name' => 'colgroup', - 'type' => HTML5::STARTTAG, - 'attr' => array() - )); - - $this->inColumnGroup($token); - - /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */ - } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], - array('tbody', 'tfoot', 'thead'))) { - /* Clear the stack back to a table context. */ - $this->clearStackToTableContext($clear); - - /* Insert an HTML element for the token, then switch the insertion - mode to "in table body". */ - $this->insertElement($token); - $this->mode = self::IN_TBODY; - - /* A start tag whose tag name is one of: "td", "th", "tr" */ - } elseif($token['type'] === HTML5::STARTTAG && - in_array($token['name'], array('td', 'th', 'tr'))) { - /* Act as if a start tag token with the tag name "tbody" had been - seen, then reprocess the current token. */ - $this->inTable(array( - 'name' => 'tbody', - 'type' => HTML5::STARTTAG, - 'attr' => array() - )); - - return $this->inTableBody($token); - - /* A start tag whose tag name is "table" */ - } elseif($token['type'] === HTML5::STARTTAG && - $token['name'] === 'table') { - /* Parse error. Act as if an end tag token with the tag name "table" - had been seen, then, if that token wasn't ignored, reprocess the - current token. */ - $this->inTable(array( - 'name' => 'table', - 'type' => HTML5::ENDTAG - )); - - return $this->mainPhase($token); - - /* An end tag whose tag name is "table" */ - } elseif($token['type'] === HTML5::ENDTAG && - $token['name'] === 'table') { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. (innerHTML case) */ - if(!$this->elementInScope($token['name'], true)) { - return false; - - /* Otherwise: */ - } else { - /* Generate implied end tags. */ - $this->generateImpliedEndTags(); - - /* Now, if the current node is not a table element, then this - is a parse error. */ - // w/e - - /* Pop elements from this stack until a table element has been - popped from the stack. */ - while(true) { - $current = end($this->stack)->nodeName; - array_pop($this->stack); - - if($current === 'table') { - break; - } - } - - /* Reset the insertion mode appropriately. */ - $this->resetInsertionMode(); - } - - /* An end tag whose tag name is one of: "body", "caption", "col", - "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ - } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], - array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', - 'tfoot', 'th', 'thead', 'tr'))) { - // Parse error. Ignore the token. - - /* Anything else */ - } else { - /* Parse error. Process the token as if the insertion mode was "in - body", with the following exception: */ - - /* If the current node is a table, tbody, tfoot, thead, or tr - element, then, whenever a node would be inserted into the current - node, it must instead be inserted into the foster parent element. */ - if(in_array(end($this->stack)->nodeName, - array('table', 'tbody', 'tfoot', 'thead', 'tr'))) { - /* The foster parent element is the parent element of the last - table element in the stack of open elements, if there is a - table element and it has such a parent element. If there is no - table element in the stack of open elements (innerHTML case), - then the foster parent element is the first element in the - stack of open elements (the html element). Otherwise, if there - is a table element in the stack of open elements, but the last - table element in the stack of open elements has no parent, or - its parent node is not an element, then the foster parent - element is the element before the last table element in the - stack of open elements. */ - for($n = count($this->stack) - 1; $n >= 0; $n--) { - if($this->stack[$n]->nodeName === 'table') { - $table = $this->stack[$n]; - break; - } - } - - if(isset($table) && $table->parentNode !== null) { - $this->foster_parent = $table->parentNode; - - } elseif(!isset($table)) { - $this->foster_parent = $this->stack[0]; - - } elseif(isset($table) && ($table->parentNode === null || - $table->parentNode->nodeType !== XML_ELEMENT_NODE)) { - $this->foster_parent = $this->stack[$n - 1]; - } - } - - $this->inBody($token); - } - } - - private function inCaption($token) { - /* An end tag whose tag name is "caption" */ - if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. (innerHTML case) */ - if(!$this->elementInScope($token['name'], true)) { - // Ignore - - /* Otherwise: */ - } else { - /* Generate implied end tags. */ - $this->generateImpliedEndTags(); - - /* Now, if the current node is not a caption element, then this - is a parse error. */ - // w/e - - /* Pop elements from this stack until a caption element has - been popped from the stack. */ - while(true) { - $node = end($this->stack)->nodeName; - array_pop($this->stack); - - if($node === 'caption') { - break; - } - } - - /* Clear the list of active formatting elements up to the last - marker. */ - $this->clearTheActiveFormattingElementsUpToTheLastMarker(); - - /* Switch the insertion mode to "in table". */ - $this->mode = self::IN_TABLE; - } - - /* A start tag whose tag name is one of: "caption", "col", "colgroup", - "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag - name is "table" */ - } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'], - array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', - 'thead', 'tr'))) || ($token['type'] === HTML5::ENDTAG && - $token['name'] === 'table')) { - /* Parse error. Act as if an end tag with the tag name "caption" - had been seen, then, if that token wasn't ignored, reprocess the - current token. */ - $this->inCaption(array( - 'name' => 'caption', - 'type' => HTML5::ENDTAG - )); - - return $this->inTable($token); - - /* An end tag whose tag name is one of: "body", "col", "colgroup", - "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ - } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], - array('body', 'col', 'colgroup', 'html', 'tbody', 'tfoot', 'th', - 'thead', 'tr'))) { - // Parse error. Ignore the token. - - /* Anything else */ - } else { - /* Process the token as if the insertion mode was "in body". */ - $this->inBody($token); - } - } - - private function inColumnGroup($token) { - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - if($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { - /* Append the character to the current node. */ - $text = $this->dom->createTextNode($token['data']); - end($this->stack)->appendChild($text); - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $comment = $this->dom->createComment($token['data']); - end($this->stack)->appendChild($comment); - - /* A start tag whose tag name is "col" */ - } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') { - /* Insert a col element for the token. Immediately pop the current - node off the stack of open elements. */ - $this->insertElement($token); - array_pop($this->stack); - - /* An end tag whose tag name is "colgroup" */ - } elseif($token['type'] === HTML5::ENDTAG && - $token['name'] === 'colgroup') { - /* If the current node is the root html element, then this is a - parse error, ignore the token. (innerHTML case) */ - if(end($this->stack)->nodeName === 'html') { - // Ignore - - /* Otherwise, pop the current node (which will be a colgroup - element) from the stack of open elements. Switch the insertion - mode to "in table". */ - } else { - array_pop($this->stack); - $this->mode = self::IN_TABLE; - } - - /* An end tag whose tag name is "col" */ - } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') { - /* Parse error. Ignore the token. */ - - /* Anything else */ - } else { - /* Act as if an end tag with the tag name "colgroup" had been seen, - and then, if that token wasn't ignored, reprocess the current token. */ - $this->inColumnGroup(array( - 'name' => 'colgroup', - 'type' => HTML5::ENDTAG - )); - - return $this->inTable($token); - } - } - - private function inTableBody($token) { - $clear = array('tbody', 'tfoot', 'thead', 'html'); - - /* A start tag whose tag name is "tr" */ - if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') { - /* Clear the stack back to a table body context. */ - $this->clearStackToTableContext($clear); - - /* Insert a tr element for the token, then switch the insertion - mode to "in row". */ - $this->insertElement($token); - $this->mode = self::IN_ROW; - - /* A start tag whose tag name is one of: "th", "td" */ - } elseif($token['type'] === HTML5::STARTTAG && - ($token['name'] === 'th' || $token['name'] === 'td')) { - /* Parse error. Act as if a start tag with the tag name "tr" had - been seen, then reprocess the current token. */ - $this->inTableBody(array( - 'name' => 'tr', - 'type' => HTML5::STARTTAG, - 'attr' => array() - )); - - return $this->inRow($token); - - /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */ - } elseif($token['type'] === HTML5::ENDTAG && - in_array($token['name'], array('tbody', 'tfoot', 'thead'))) { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. */ - if(!$this->elementInScope($token['name'], true)) { - // Ignore - - /* Otherwise: */ - } else { - /* Clear the stack back to a table body context. */ - $this->clearStackToTableContext($clear); - - /* Pop the current node from the stack of open elements. Switch - the insertion mode to "in table". */ - array_pop($this->stack); - $this->mode = self::IN_TABLE; - } - - /* A start tag whose tag name is one of: "caption", "col", "colgroup", - "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */ - } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'], - array('caption', 'col', 'colgroup', 'tbody', 'tfoor', 'thead'))) || - ($token['type'] === HTML5::STARTTAG && $token['name'] === 'table')) { - /* If the stack of open elements does not have a tbody, thead, or - tfoot element in table scope, this is a parse error. Ignore the - token. (innerHTML case) */ - if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) { - // Ignore. - - /* Otherwise: */ - } else { - /* Clear the stack back to a table body context. */ - $this->clearStackToTableContext($clear); - - /* Act as if an end tag with the same tag name as the current - node ("tbody", "tfoot", or "thead") had been seen, then - reprocess the current token. */ - $this->inTableBody(array( - 'name' => end($this->stack)->nodeName, - 'type' => HTML5::ENDTAG - )); - - return $this->mainPhase($token); - } - - /* An end tag whose tag name is one of: "body", "caption", "col", - "colgroup", "html", "td", "th", "tr" */ - } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], - array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) { - /* Parse error. Ignore the token. */ - - /* Anything else */ - } else { - /* Process the token as if the insertion mode was "in table". */ - $this->inTable($token); - } - } - - private function inRow($token) { - $clear = array('tr', 'html'); - - /* A start tag whose tag name is one of: "th", "td" */ - if($token['type'] === HTML5::STARTTAG && - ($token['name'] === 'th' || $token['name'] === 'td')) { - /* Clear the stack back to a table row context. */ - $this->clearStackToTableContext($clear); - - /* Insert an HTML element for the token, then switch the insertion - mode to "in cell". */ - $this->insertElement($token); - $this->mode = self::IN_CELL; - - /* Insert a marker at the end of the list of active formatting - elements. */ - $this->a_formatting[] = self::MARKER; - - /* An end tag whose tag name is "tr" */ - } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. (innerHTML case) */ - if(!$this->elementInScope($token['name'], true)) { - // Ignore. - - /* Otherwise: */ - } else { - /* Clear the stack back to a table row context. */ - $this->clearStackToTableContext($clear); - - /* Pop the current node (which will be a tr element) from the - stack of open elements. Switch the insertion mode to "in table - body". */ - array_pop($this->stack); - $this->mode = self::IN_TBODY; - } - - /* A start tag whose tag name is one of: "caption", "col", "colgroup", - "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */ - } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], - array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) { - /* Act as if an end tag with the tag name "tr" had been seen, then, - if that token wasn't ignored, reprocess the current token. */ - $this->inRow(array( - 'name' => 'tr', - 'type' => HTML5::ENDTAG - )); - - return $this->inCell($token); - - /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */ - } elseif($token['type'] === HTML5::ENDTAG && - in_array($token['name'], array('tbody', 'tfoot', 'thead'))) { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. */ - if(!$this->elementInScope($token['name'], true)) { - // Ignore. - - /* Otherwise: */ - } else { - /* Otherwise, act as if an end tag with the tag name "tr" had - been seen, then reprocess the current token. */ - $this->inRow(array( - 'name' => 'tr', - 'type' => HTML5::ENDTAG - )); - - return $this->inCell($token); - } - - /* An end tag whose tag name is one of: "body", "caption", "col", - "colgroup", "html", "td", "th" */ - } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], - array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) { - /* Parse error. Ignore the token. */ - - /* Anything else */ - } else { - /* Process the token as if the insertion mode was "in table". */ - $this->inTable($token); - } - } - - private function inCell($token) { - /* An end tag whose tag name is one of: "td", "th" */ - if($token['type'] === HTML5::ENDTAG && - ($token['name'] === 'td' || $token['name'] === 'th')) { - /* If the stack of open elements does not have an element in table - scope with the same tag name as that of the token, then this is a - parse error and the token must be ignored. */ - if(!$this->elementInScope($token['name'], true)) { - // Ignore. - - /* Otherwise: */ - } else { - /* Generate implied end tags, except for elements with the same - tag name as the token. */ - $this->generateImpliedEndTags(array($token['name'])); - - /* Now, if the current node is not an element with the same tag - name as the token, then this is a parse error. */ - // k - - /* Pop elements from this stack until an element with the same - tag name as the token has been popped from the stack. */ - while(true) { - $node = end($this->stack)->nodeName; - array_pop($this->stack); - - if($node === $token['name']) { - break; - } - } - - /* Clear the list of active formatting elements up to the last - marker. */ - $this->clearTheActiveFormattingElementsUpToTheLastMarker(); - - /* Switch the insertion mode to "in row". (The current node - will be a tr element at this point.) */ - $this->mode = self::IN_ROW; - } - - /* A start tag whose tag name is one of: "caption", "col", "colgroup", - "tbody", "td", "tfoot", "th", "thead", "tr" */ - } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], - array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', - 'thead', 'tr'))) { - /* If the stack of open elements does not have a td or th element - in table scope, then this is a parse error; ignore the token. - (innerHTML case) */ - if(!$this->elementInScope(array('td', 'th'), true)) { - // Ignore. - - /* Otherwise, close the cell (see below) and reprocess the current - token. */ - } else { - $this->closeCell(); - return $this->inRow($token); - } - - /* A start tag whose tag name is one of: "caption", "col", "colgroup", - "tbody", "td", "tfoot", "th", "thead", "tr" */ - } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], - array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', - 'thead', 'tr'))) { - /* If the stack of open elements does not have a td or th element - in table scope, then this is a parse error; ignore the token. - (innerHTML case) */ - if(!$this->elementInScope(array('td', 'th'), true)) { - // Ignore. - - /* Otherwise, close the cell (see below) and reprocess the current - token. */ - } else { - $this->closeCell(); - return $this->inRow($token); - } - - /* An end tag whose tag name is one of: "body", "caption", "col", - "colgroup", "html" */ - } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], - array('body', 'caption', 'col', 'colgroup', 'html'))) { - /* Parse error. Ignore the token. */ - - /* An end tag whose tag name is one of: "table", "tbody", "tfoot", - "thead", "tr" */ - } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], - array('table', 'tbody', 'tfoot', 'thead', 'tr'))) { - /* If the stack of open elements does not have an element in table - scope with the same tag name as that of the token (which can only - happen for "tbody", "tfoot" and "thead", or, in the innerHTML case), - then this is a parse error and the token must be ignored. */ - if(!$this->elementInScope($token['name'], true)) { - // Ignore. - - /* Otherwise, close the cell (see below) and reprocess the current - token. */ - } else { - $this->closeCell(); - return $this->inRow($token); - } - - /* Anything else */ - } else { - /* Process the token as if the insertion mode was "in body". */ - $this->inBody($token); - } - } - - private function inSelect($token) { - /* Handle the token as follows: */ - - /* A character token */ - if($token['type'] === HTML5::CHARACTR) { - /* Append the token's character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $this->insertComment($token['data']); - - /* A start tag token whose tag name is "option" */ - } elseif($token['type'] === HTML5::STARTTAG && - $token['name'] === 'option') { - /* If the current node is an option element, act as if an end tag - with the tag name "option" had been seen. */ - if(end($this->stack)->nodeName === 'option') { - $this->inSelect(array( - 'name' => 'option', - 'type' => HTML5::ENDTAG - )); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* A start tag token whose tag name is "optgroup" */ - } elseif($token['type'] === HTML5::STARTTAG && - $token['name'] === 'optgroup') { - /* If the current node is an option element, act as if an end tag - with the tag name "option" had been seen. */ - if(end($this->stack)->nodeName === 'option') { - $this->inSelect(array( - 'name' => 'option', - 'type' => HTML5::ENDTAG - )); - } - - /* If the current node is an optgroup element, act as if an end tag - with the tag name "optgroup" had been seen. */ - if(end($this->stack)->nodeName === 'optgroup') { - $this->inSelect(array( - 'name' => 'optgroup', - 'type' => HTML5::ENDTAG - )); - } - - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* An end tag token whose tag name is "optgroup" */ - } elseif($token['type'] === HTML5::ENDTAG && - $token['name'] === 'optgroup') { - /* First, if the current node is an option element, and the node - immediately before it in the stack of open elements is an optgroup - element, then act as if an end tag with the tag name "option" had - been seen. */ - $elements_in_stack = count($this->stack); - - if($this->stack[$elements_in_stack - 1]->nodeName === 'option' && - $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') { - $this->inSelect(array( - 'name' => 'option', - 'type' => HTML5::ENDTAG - )); - } - - /* If the current node is an optgroup element, then pop that node - from the stack of open elements. Otherwise, this is a parse error, - ignore the token. */ - if($this->stack[$elements_in_stack - 1] === 'optgroup') { - array_pop($this->stack); - } - - /* An end tag token whose tag name is "option" */ - } elseif($token['type'] === HTML5::ENDTAG && - $token['name'] === 'option') { - /* If the current node is an option element, then pop that node - from the stack of open elements. Otherwise, this is a parse error, - ignore the token. */ - if(end($this->stack)->nodeName === 'option') { - array_pop($this->stack); - } - - /* An end tag whose tag name is "select" */ - } elseif($token['type'] === HTML5::ENDTAG && - $token['name'] === 'select') { - /* If the stack of open elements does not have an element in table - scope with the same tag name as the token, this is a parse error. - Ignore the token. (innerHTML case) */ - if(!$this->elementInScope($token['name'], true)) { - // w/e - - /* Otherwise: */ - } else { - /* Pop elements from the stack of open elements until a select - element has been popped from the stack. */ - while(true) { - $current = end($this->stack)->nodeName; - array_pop($this->stack); - - if($current === 'select') { - break; - } - } - - /* Reset the insertion mode appropriately. */ - $this->resetInsertionMode(); - } - - /* A start tag whose tag name is "select" */ - } elseif($token['name'] === 'select' && - $token['type'] === HTML5::STARTTAG) { - /* Parse error. Act as if the token had been an end tag with the - tag name "select" instead. */ - $this->inSelect(array( - 'name' => 'select', - 'type' => HTML5::ENDTAG - )); - - /* An end tag whose tag name is one of: "caption", "table", "tbody", - "tfoot", "thead", "tr", "td", "th" */ - } elseif(in_array($token['name'], array('caption', 'table', 'tbody', - 'tfoot', 'thead', 'tr', 'td', 'th')) && $token['type'] === HTML5::ENDTAG) { - /* Parse error. */ - // w/e - - /* If the stack of open elements has an element in table scope with - the same tag name as that of the token, then act as if an end tag - with the tag name "select" had been seen, and reprocess the token. - Otherwise, ignore the token. */ - if($this->elementInScope($token['name'], true)) { - $this->inSelect(array( - 'name' => 'select', - 'type' => HTML5::ENDTAG - )); - - $this->mainPhase($token); - } - - /* Anything else */ - } else { - /* Parse error. Ignore the token. */ - } - } - - private function afterBody($token) { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - if($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { - /* Process the token as it would be processed if the insertion mode - was "in body". */ - $this->inBody($token); - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the first element in the stack of open - elements (the html element), with the data attribute set to the - data given in the comment token. */ - $comment = $this->dom->createComment($token['data']); - $this->stack[0]->appendChild($comment); - - /* An end tag with the tag name "html" */ - } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') { - /* If the parser was originally created in order to handle the - setting of an element's innerHTML attribute, this is a parse error; - ignore the token. (The element will be an html element in this - case.) (innerHTML case) */ - - /* Otherwise, switch to the trailing end phase. */ - $this->phase = self::END_PHASE; - - /* Anything else */ - } else { - /* Parse error. Set the insertion mode to "in body" and reprocess - the token. */ - $this->mode = self::IN_BODY; - return $this->inBody($token); - } - } - - private function inFrameset($token) { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ - if($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { - /* Append the character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $this->insertComment($token['data']); - - /* A start tag with the tag name "frameset" */ - } elseif($token['name'] === 'frameset' && - $token['type'] === HTML5::STARTTAG) { - $this->insertElement($token); - - /* An end tag with the tag name "frameset" */ - } elseif($token['name'] === 'frameset' && - $token['type'] === HTML5::ENDTAG) { - /* If the current node is the root html element, then this is a - parse error; ignore the token. (innerHTML case) */ - if(end($this->stack)->nodeName === 'html') { - // Ignore - - } else { - /* Otherwise, pop the current node from the stack of open - elements. */ - array_pop($this->stack); - - /* If the parser was not originally created in order to handle - the setting of an element's innerHTML attribute (innerHTML case), - and the current node is no longer a frameset element, then change - the insertion mode to "after frameset". */ - $this->mode = self::AFTR_FRAME; - } - - /* A start tag with the tag name "frame" */ - } elseif($token['name'] === 'frame' && - $token['type'] === HTML5::STARTTAG) { - /* Insert an HTML element for the token. */ - $this->insertElement($token); - - /* Immediately pop the current node off the stack of open elements. */ - array_pop($this->stack); - - /* A start tag with the tag name "noframes" */ - } elseif($token['name'] === 'noframes' && - $token['type'] === HTML5::STARTTAG) { - /* Process the token as if the insertion mode had been "in body". */ - $this->inBody($token); - - /* Anything else */ - } else { - /* Parse error. Ignore the token. */ - } - } - - private function afterFrameset($token) { - /* Handle the token as follows: */ - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ - if($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { - /* Append the character to the current node. */ - $this->insertText($token['data']); - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the current node with the data - attribute set to the data given in the comment token. */ - $this->insertComment($token['data']); - - /* An end tag with the tag name "html" */ - } elseif($token['name'] === 'html' && - $token['type'] === HTML5::ENDTAG) { - /* Switch to the trailing end phase. */ - $this->phase = self::END_PHASE; - - /* A start tag with the tag name "noframes" */ - } elseif($token['name'] === 'noframes' && - $token['type'] === HTML5::STARTTAG) { - /* Process the token as if the insertion mode had been "in body". */ - $this->inBody($token); - - /* Anything else */ - } else { - /* Parse error. Ignore the token. */ - } - } - - private function trailingEndPhase($token) { - /* After the main phase, as each token is emitted from the tokenisation - stage, it must be processed as described in this section. */ - - /* A DOCTYPE token */ - if($token['type'] === HTML5::DOCTYPE) { - // Parse error. Ignore the token. - - /* A comment token */ - } elseif($token['type'] === HTML5::COMMENT) { - /* Append a Comment node to the Document object with the data - attribute set to the data given in the comment token. */ - $comment = $this->dom->createComment($token['data']); - $this->dom->appendChild($comment); - - /* A character token that is one of one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE */ - } elseif($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { - /* Process the token as it would be processed in the main phase. */ - $this->mainPhase($token); - - /* A character token that is not one of U+0009 CHARACTER TABULATION, - U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), - or U+0020 SPACE. Or a start tag token. Or an end tag token. */ - } elseif(($token['type'] === HTML5::CHARACTR && - preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || - $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) { - /* Parse error. Switch back to the main phase and reprocess the - token. */ - $this->phase = self::MAIN_PHASE; - return $this->mainPhase($token); - - /* An end-of-file token */ - } elseif($token['type'] === HTML5::EOF) { - /* OMG DONE!! */ - } - } - - private function insertElement($token, $append = true, $check = false) { - // Proprietary workaround for libxml2's limitations with tag names - if ($check) { - // Slightly modified HTML5 tag-name modification, - // removing anything that's not an ASCII letter, digit, or hyphen - $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']); - // Remove leading hyphens and numbers - $token['name'] = ltrim($token['name'], '-0..9'); - // In theory, this should ever be needed, but just in case - if ($token['name'] === '') $token['name'] = 'span'; // arbitrary generic choice - } - - $el = $this->dom->createElement($token['name']); - - foreach($token['attr'] as $attr) { - if(!$el->hasAttribute($attr['name'])) { - $el->setAttribute($attr['name'], $attr['value']); - } - } - - $this->appendToRealParent($el); - $this->stack[] = $el; - - return $el; - } - - private function insertText($data) { - $text = $this->dom->createTextNode($data); - $this->appendToRealParent($text); - } - - private function insertComment($data) { - $comment = $this->dom->createComment($data); - $this->appendToRealParent($comment); - } - - private function appendToRealParent($node) { - if($this->foster_parent === null) { - end($this->stack)->appendChild($node); - - } elseif($this->foster_parent !== null) { - /* If the foster parent element is the parent element of the - last table element in the stack of open elements, then the new - node must be inserted immediately before the last table element - in the stack of open elements in the foster parent element; - otherwise, the new node must be appended to the foster parent - element. */ - for($n = count($this->stack) - 1; $n >= 0; $n--) { - if($this->stack[$n]->nodeName === 'table' && - $this->stack[$n]->parentNode !== null) { - $table = $this->stack[$n]; - break; - } - } - - if(isset($table) && $this->foster_parent->isSameNode($table->parentNode)) - $this->foster_parent->insertBefore($node, $table); - else - $this->foster_parent->appendChild($node); - - $this->foster_parent = null; - } - } - - private function elementInScope($el, $table = false) { - if(is_array($el)) { - foreach($el as $element) { - if($this->elementInScope($element, $table)) { - return true; - } - } - - return false; - } - - $leng = count($this->stack); - - for($n = 0; $n < $leng; $n++) { - /* 1. Initialise node to be the current node (the bottommost node of - the stack). */ - $node = $this->stack[$leng - 1 - $n]; - - if($node->tagName === $el) { - /* 2. If node is the target node, terminate in a match state. */ - return true; - - } elseif($node->tagName === 'table') { - /* 3. Otherwise, if node is a table element, terminate in a failure - state. */ - return false; - - } elseif($table === true && in_array($node->tagName, array('caption', 'td', - 'th', 'button', 'marquee', 'object'))) { - /* 4. Otherwise, if the algorithm is the "has an element in scope" - variant (rather than the "has an element in table scope" variant), - and node is one of the following, terminate in a failure state. */ - return false; - - } elseif($node === $node->ownerDocument->documentElement) { - /* 5. Otherwise, if node is an html element (root element), terminate - in a failure state. (This can only happen if the node is the topmost - node of the stack of open elements, and prevents the next step from - being invoked if there are no more elements in the stack.) */ - return false; - } - - /* Otherwise, set node to the previous entry in the stack of open - elements and return to step 2. (This will never fail, since the loop - will always terminate in the previous step if the top of the stack - is reached.) */ - } - } - - private function reconstructActiveFormattingElements() { - /* 1. If there are no entries in the list of active formatting elements, - then there is nothing to reconstruct; stop this algorithm. */ - $formatting_elements = count($this->a_formatting); - - if($formatting_elements === 0) { - return false; - } - - /* 3. Let entry be the last (most recently added) element in the list - of active formatting elements. */ - $entry = end($this->a_formatting); - - /* 2. If the last (most recently added) entry in the list of active - formatting elements is a marker, or if it is an element that is in the - stack of open elements, then there is nothing to reconstruct; stop this - algorithm. */ - if($entry === self::MARKER || in_array($entry, $this->stack, true)) { - return false; - } - - for($a = $formatting_elements - 1; $a >= 0; true) { - /* 4. If there are no entries before entry in the list of active - formatting elements, then jump to step 8. */ - if($a === 0) { - $step_seven = false; - break; - } - - /* 5. Let entry be the entry one earlier than entry in the list of - active formatting elements. */ - $a--; - $entry = $this->a_formatting[$a]; - - /* 6. If entry is neither a marker nor an element that is also in - thetack of open elements, go to step 4. */ - if($entry === self::MARKER || in_array($entry, $this->stack, true)) { - break; - } - } - - while(true) { - /* 7. Let entry be the element one later than entry in the list of - active formatting elements. */ - if(isset($step_seven) && $step_seven === true) { - $a++; - $entry = $this->a_formatting[$a]; - } - - /* 8. Perform a shallow clone of the element entry to obtain clone. */ - $clone = $entry->cloneNode(); - - /* 9. Append clone to the current node and push it onto the stack - of open elements so that it is the new current node. */ - end($this->stack)->appendChild($clone); - $this->stack[] = $clone; - - /* 10. Replace the entry for entry in the list with an entry for - clone. */ - $this->a_formatting[$a] = $clone; - - /* 11. If the entry for clone in the list of active formatting - elements is not the last entry in the list, return to step 7. */ - if(end($this->a_formatting) !== $clone) { - $step_seven = true; - } else { - break; - } - } - } - - private function clearTheActiveFormattingElementsUpToTheLastMarker() { - /* When the steps below require the UA to clear the list of active - formatting elements up to the last marker, the UA must perform the - following steps: */ - - while(true) { - /* 1. Let entry be the last (most recently added) entry in the list - of active formatting elements. */ - $entry = end($this->a_formatting); - - /* 2. Remove entry from the list of active formatting elements. */ - array_pop($this->a_formatting); - - /* 3. If entry was a marker, then stop the algorithm at this point. - The list has been cleared up to the last marker. */ - if($entry === self::MARKER) { - break; - } - } - } - - private function generateImpliedEndTags($exclude = array()) { - /* When the steps below require the UA to generate implied end tags, - then, if the current node is a dd element, a dt element, an li element, - a p element, a td element, a th element, or a tr element, the UA must - act as if an end tag with the respective tag name had been seen and - then generate implied end tags again. */ - $node = end($this->stack); - $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude); - - while(in_array(end($this->stack)->nodeName, $elements)) { - array_pop($this->stack); - } - } - - private function getElementCategory($node) { - $name = $node->tagName; - if(in_array($name, $this->special)) - return self::SPECIAL; - - elseif(in_array($name, $this->scoping)) - return self::SCOPING; - - elseif(in_array($name, $this->formatting)) - return self::FORMATTING; - - else - return self::PHRASING; - } - - private function clearStackToTableContext($elements) { - /* When the steps above require the UA to clear the stack back to a - table context, it means that the UA must, while the current node is not - a table element or an html element, pop elements from the stack of open - elements. If this causes any elements to be popped from the stack, then - this is a parse error. */ - while(true) { - $node = end($this->stack)->nodeName; - - if(in_array($node, $elements)) { - break; - } else { - array_pop($this->stack); - } - } - } - - private function resetInsertionMode() { - /* 1. Let last be false. */ - $last = false; - $leng = count($this->stack); - - for($n = $leng - 1; $n >= 0; $n--) { - /* 2. Let node be the last node in the stack of open elements. */ - $node = $this->stack[$n]; - - /* 3. If node is the first node in the stack of open elements, then - set last to true. If the element whose innerHTML attribute is being - set is neither a td element nor a th element, then set node to the - element whose innerHTML attribute is being set. (innerHTML case) */ - if($this->stack[0]->isSameNode($node)) { - $last = true; - } - - /* 4. If node is a select element, then switch the insertion mode to - "in select" and abort these steps. (innerHTML case) */ - if($node->nodeName === 'select') { - $this->mode = self::IN_SELECT; - break; - - /* 5. If node is a td or th element, then switch the insertion mode - to "in cell" and abort these steps. */ - } elseif($node->nodeName === 'td' || $node->nodeName === 'th') { - $this->mode = self::IN_CELL; - break; - - /* 6. If node is a tr element, then switch the insertion mode to - "in row" and abort these steps. */ - } elseif($node->nodeName === 'tr') { - $this->mode = self::IN_ROW; - break; - - /* 7. If node is a tbody, thead, or tfoot element, then switch the - insertion mode to "in table body" and abort these steps. */ - } elseif(in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) { - $this->mode = self::IN_TBODY; - break; - - /* 8. If node is a caption element, then switch the insertion mode - to "in caption" and abort these steps. */ - } elseif($node->nodeName === 'caption') { - $this->mode = self::IN_CAPTION; - break; - - /* 9. If node is a colgroup element, then switch the insertion mode - to "in column group" and abort these steps. (innerHTML case) */ - } elseif($node->nodeName === 'colgroup') { - $this->mode = self::IN_CGROUP; - break; - - /* 10. If node is a table element, then switch the insertion mode - to "in table" and abort these steps. */ - } elseif($node->nodeName === 'table') { - $this->mode = self::IN_TABLE; - break; - - /* 11. If node is a head element, then switch the insertion mode - to "in body" ("in body"! not "in head"!) and abort these steps. - (innerHTML case) */ - } elseif($node->nodeName === 'head') { - $this->mode = self::IN_BODY; - break; - - /* 12. If node is a body element, then switch the insertion mode to - "in body" and abort these steps. */ - } elseif($node->nodeName === 'body') { - $this->mode = self::IN_BODY; - break; - - /* 13. If node is a frameset element, then switch the insertion - mode to "in frameset" and abort these steps. (innerHTML case) */ - } elseif($node->nodeName === 'frameset') { - $this->mode = self::IN_FRAME; - break; - - /* 14. If node is an html element, then: if the head element - pointer is null, switch the insertion mode to "before head", - otherwise, switch the insertion mode to "after head". In either - case, abort these steps. (innerHTML case) */ - } elseif($node->nodeName === 'html') { - $this->mode = ($this->head_pointer === null) - ? self::BEFOR_HEAD - : self::AFTER_HEAD; - - break; - - /* 15. If last is true, then set the insertion mode to "in body" - and abort these steps. (innerHTML case) */ - } elseif($last) { - $this->mode = self::IN_BODY; - break; - } - } - } - - private function closeCell() { - /* If the stack of open elements has a td or th element in table scope, - then act as if an end tag token with that tag name had been seen. */ - foreach(array('td', 'th') as $cell) { - if($this->elementInScope($cell, true)) { - $this->inCell(array( - 'name' => $cell, - 'type' => HTML5::ENDTAG - )); - - break; - } - } - } - - public function save() { - return $this->dom; - } -} -?> +normalize($html, $config, $context); + $new_html = $this->wrapHTML($new_html, $config, $context); + try { + $parser = new HTML5($new_html); + $doc = $parser->save(); + } catch (DOMException $e) { + // Uh oh, it failed. Punt to DirectLex. + $lexer = new HTMLPurifier_Lexer_DirectLex(); + $context->register('PH5PError', $e); // save the error, so we can detect it + return $lexer->tokenizeHTML($html, $config, $context); // use original HTML + } + $tokens = array(); + $this->tokenizeDOM( + $doc->getElementsByTagName('html')->item(0)-> // + getElementsByTagName('body')->item(0)-> // + getElementsByTagName('div')->item(0) //
    + , $tokens); + return $tokens; + } + +} + +/* + +Copyright 2007 Jeroen van der Meer + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*/ + +class HTML5 { + private $data; + private $char; + private $EOF; + private $state; + private $tree; + private $token; + private $content_model; + private $escape = false; + private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute', + 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;', + 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;', + 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;', + 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;', + 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;', + 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;', + 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;', + 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;', + 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN', + 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;', + 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;', + 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig', + 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;', + 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;', + 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil', + 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;', + 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;', + 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;', + 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth', + 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12', + 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt', + 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc', + 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;', + 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;', + 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;', + 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro', + 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;', + 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;', + 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;', + 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash', + 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;', + 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;', + 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;', + 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;', + 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;', + 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;', + 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;', + 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;', + 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc', + 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;', + 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;'); + + const PCDATA = 0; + const RCDATA = 1; + const CDATA = 2; + const PLAINTEXT = 3; + + const DOCTYPE = 0; + const STARTTAG = 1; + const ENDTAG = 2; + const COMMENT = 3; + const CHARACTR = 4; + const EOF = 5; + + public function __construct($data) { + $data = str_replace("\r\n", "\n", $data); + $data = str_replace("\r", null, $data); + + $this->data = $data; + $this->char = -1; + $this->EOF = strlen($data); + $this->tree = new HTML5TreeConstructer; + $this->content_model = self::PCDATA; + + $this->state = 'data'; + + while($this->state !== null) { + $this->{$this->state.'State'}(); + } + } + + public function save() { + return $this->tree->save(); + } + + private function char() { + return ($this->char < $this->EOF) + ? $this->data[$this->char] + : false; + } + + private function character($s, $l = 0) { + if($s + $l < $this->EOF) { + if($l === 0) { + return $this->data[$s]; + } else { + return substr($this->data, $s, $l); + } + } + } + + private function characters($char_class, $start) { + return preg_replace('#^(['.$char_class.']+).*#s', '\\1', substr($this->data, $start)); + } + + private function dataState() { + // Consume the next input character + $this->char++; + $char = $this->char(); + + if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) { + /* U+0026 AMPERSAND (&) + When the content model flag is set to one of the PCDATA or RCDATA + states: switch to the entity data state. Otherwise: treat it as per + the "anything else" entry below. */ + $this->state = 'entityData'; + + } elseif($char === '-') { + /* If the content model flag is set to either the RCDATA state or + the CDATA state, and the escape flag is false, and there are at + least three characters before this one in the input stream, and the + last four characters in the input stream, including this one, are + U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS, + and U+002D HYPHEN-MINUS (""), + set the escape flag to false. */ + if(($this->content_model === self::RCDATA || + $this->content_model === self::CDATA) && $this->escape === true && + $this->character($this->char, 3) === '-->') { + $this->escape = false; + } + + /* In any case, emit the input character as a character token. + Stay in the data state. */ + $this->emitToken(array( + 'type' => self::CHARACTR, + 'data' => $char + )); + + } elseif($this->char === $this->EOF) { + /* EOF + Emit an end-of-file token. */ + $this->EOF(); + + } elseif($this->content_model === self::PLAINTEXT) { + /* When the content model flag is set to the PLAINTEXT state + THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of + the text and emit it as a character token. */ + $this->emitToken(array( + 'type' => self::CHARACTR, + 'data' => substr($this->data, $this->char) + )); + + $this->EOF(); + + } else { + /* Anything else + THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that + otherwise would also be treated as a character token and emit it + as a single character token. Stay in the data state. */ + $len = strcspn($this->data, '<&', $this->char); + $char = substr($this->data, $this->char, $len); + $this->char += $len - 1; + + $this->emitToken(array( + 'type' => self::CHARACTR, + 'data' => $char + )); + + $this->state = 'data'; + } + } + + private function entityDataState() { + // Attempt to consume an entity. + $entity = $this->entity(); + + // If nothing is returned, emit a U+0026 AMPERSAND character token. + // Otherwise, emit the character token that was returned. + $char = (!$entity) ? '&' : $entity; + $this->emitToken(array( + 'type' => self::CHARACTR, + 'data' => $char + )); + + // Finally, switch to the data state. + $this->state = 'data'; + } + + private function tagOpenState() { + switch($this->content_model) { + case self::RCDATA: + case self::CDATA: + /* If the next input character is a U+002F SOLIDUS (/) character, + consume it and switch to the close tag open state. If the next + input character is not a U+002F SOLIDUS (/) character, emit a + U+003C LESS-THAN SIGN character token and switch to the data + state to process the next input character. */ + if($this->character($this->char + 1) === '/') { + $this->char++; + $this->state = 'closeTagOpen'; + + } else { + $this->emitToken(array( + 'type' => self::CHARACTR, + 'data' => '<' + )); + + $this->state = 'data'; + } + break; + + case self::PCDATA: + // If the content model flag is set to the PCDATA state + // Consume the next input character: + $this->char++; + $char = $this->char(); + + if($char === '!') { + /* U+0021 EXCLAMATION MARK (!) + Switch to the markup declaration open state. */ + $this->state = 'markupDeclarationOpen'; + + } elseif($char === '/') { + /* U+002F SOLIDUS (/) + Switch to the close tag open state. */ + $this->state = 'closeTagOpen'; + + } elseif(preg_match('/^[A-Za-z]$/', $char)) { + /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z + Create a new start tag token, set its tag name to the lowercase + version of the input character (add 0x0020 to the character's code + point), then switch to the tag name state. (Don't emit the token + yet; further details will be filled in before it is emitted.) */ + $this->token = array( + 'name' => strtolower($char), + 'type' => self::STARTTAG, + 'attr' => array() + ); + + $this->state = 'tagName'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Emit a U+003C LESS-THAN SIGN character token and a + U+003E GREATER-THAN SIGN character token. Switch to the data state. */ + $this->emitToken(array( + 'type' => self::CHARACTR, + 'data' => '<>' + )); + + $this->state = 'data'; + + } elseif($char === '?') { + /* U+003F QUESTION MARK (?) + Parse error. Switch to the bogus comment state. */ + $this->state = 'bogusComment'; + + } else { + /* Anything else + Parse error. Emit a U+003C LESS-THAN SIGN character token and + reconsume the current input character in the data state. */ + $this->emitToken(array( + 'type' => self::CHARACTR, + 'data' => '<' + )); + + $this->char--; + $this->state = 'data'; + } + break; + } + } + + private function closeTagOpenState() { + $next_node = strtolower($this->characters('A-Za-z', $this->char + 1)); + $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName; + + if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) && + (!$the_same || ($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/', + $this->character($this->char + 1 + strlen($next_node))) || $this->EOF === $this->char)))) { + /* If the content model flag is set to the RCDATA or CDATA states then + examine the next few characters. If they do not match the tag name of + the last start tag token emitted (case insensitively), or if they do but + they are not immediately followed by one of the following characters: + * U+0009 CHARACTER TABULATION + * U+000A LINE FEED (LF) + * U+000B LINE TABULATION + * U+000C FORM FEED (FF) + * U+0020 SPACE + * U+003E GREATER-THAN SIGN (>) + * U+002F SOLIDUS (/) + * EOF + ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character + token, a U+002F SOLIDUS character token, and switch to the data state + to process the next input character. */ + $this->emitToken(array( + 'type' => self::CHARACTR, + 'data' => 'state = 'data'; + + } else { + /* Otherwise, if the content model flag is set to the PCDATA state, + or if the next few characters do match that tag name, consume the + next input character: */ + $this->char++; + $char = $this->char(); + + if(preg_match('/^[A-Za-z]$/', $char)) { + /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z + Create a new end tag token, set its tag name to the lowercase version + of the input character (add 0x0020 to the character's code point), then + switch to the tag name state. (Don't emit the token yet; further details + will be filled in before it is emitted.) */ + $this->token = array( + 'name' => strtolower($char), + 'type' => self::ENDTAG + ); + + $this->state = 'tagName'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Parse error. Switch to the data state. */ + $this->state = 'data'; + + } elseif($this->char === $this->EOF) { + /* EOF + Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F + SOLIDUS character token. Reconsume the EOF character in the data state. */ + $this->emitToken(array( + 'type' => self::CHARACTR, + 'data' => 'char--; + $this->state = 'data'; + + } else { + /* Parse error. Switch to the bogus comment state. */ + $this->state = 'bogusComment'; + } + } + } + + private function tagNameState() { + // Consume the next input character: + $this->char++; + $char = $this->character($this->char); + + if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000B LINE TABULATION + U+000C FORM FEED (FF) + U+0020 SPACE + Switch to the before attribute name state. */ + $this->state = 'beforeAttributeName'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $this->state = 'data'; + + } elseif($this->char === $this->EOF) { + /* EOF + Parse error. Emit the current tag token. Reconsume the EOF + character in the data state. */ + $this->emitToken($this->token); + + $this->char--; + $this->state = 'data'; + + } elseif($char === '/') { + /* U+002F SOLIDUS (/) + Parse error unless this is a permitted slash. Switch to the before + attribute name state. */ + $this->state = 'beforeAttributeName'; + + } else { + /* Anything else + Append the current input character to the current tag token's tag name. + Stay in the tag name state. */ + $this->token['name'] .= strtolower($char); + $this->state = 'tagName'; + } + } + + private function beforeAttributeNameState() { + // Consume the next input character: + $this->char++; + $char = $this->character($this->char); + + if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000B LINE TABULATION + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the before attribute name state. */ + $this->state = 'beforeAttributeName'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $this->state = 'data'; + + } elseif($char === '/') { + /* U+002F SOLIDUS (/) + Parse error unless this is a permitted slash. Stay in the before + attribute name state. */ + $this->state = 'beforeAttributeName'; + + } elseif($this->char === $this->EOF) { + /* EOF + Parse error. Emit the current tag token. Reconsume the EOF + character in the data state. */ + $this->emitToken($this->token); + + $this->char--; + $this->state = 'data'; + + } else { + /* Anything else + Start a new attribute in the current tag token. Set that attribute's + name to the current input character, and its value to the empty string. + Switch to the attribute name state. */ + $this->token['attr'][] = array( + 'name' => strtolower($char), + 'value' => null + ); + + $this->state = 'attributeName'; + } + } + + private function attributeNameState() { + // Consume the next input character: + $this->char++; + $char = $this->character($this->char); + + if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000B LINE TABULATION + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the before attribute name state. */ + $this->state = 'afterAttributeName'; + + } elseif($char === '=') { + /* U+003D EQUALS SIGN (=) + Switch to the before attribute value state. */ + $this->state = 'beforeAttributeValue'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $this->state = 'data'; + + } elseif($char === '/' && $this->character($this->char + 1) !== '>') { + /* U+002F SOLIDUS (/) + Parse error unless this is a permitted slash. Switch to the before + attribute name state. */ + $this->state = 'beforeAttributeName'; + + } elseif($this->char === $this->EOF) { + /* EOF + Parse error. Emit the current tag token. Reconsume the EOF + character in the data state. */ + $this->emitToken($this->token); + + $this->char--; + $this->state = 'data'; + + } else { + /* Anything else + Append the current input character to the current attribute's name. + Stay in the attribute name state. */ + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['name'] .= strtolower($char); + + $this->state = 'attributeName'; + } + } + + private function afterAttributeNameState() { + // Consume the next input character: + $this->char++; + $char = $this->character($this->char); + + if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000B LINE TABULATION + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the after attribute name state. */ + $this->state = 'afterAttributeName'; + + } elseif($char === '=') { + /* U+003D EQUALS SIGN (=) + Switch to the before attribute value state. */ + $this->state = 'beforeAttributeValue'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $this->state = 'data'; + + } elseif($char === '/' && $this->character($this->char + 1) !== '>') { + /* U+002F SOLIDUS (/) + Parse error unless this is a permitted slash. Switch to the + before attribute name state. */ + $this->state = 'beforeAttributeName'; + + } elseif($this->char === $this->EOF) { + /* EOF + Parse error. Emit the current tag token. Reconsume the EOF + character in the data state. */ + $this->emitToken($this->token); + + $this->char--; + $this->state = 'data'; + + } else { + /* Anything else + Start a new attribute in the current tag token. Set that attribute's + name to the current input character, and its value to the empty string. + Switch to the attribute name state. */ + $this->token['attr'][] = array( + 'name' => strtolower($char), + 'value' => null + ); + + $this->state = 'attributeName'; + } + } + + private function beforeAttributeValueState() { + // Consume the next input character: + $this->char++; + $char = $this->character($this->char); + + if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000B LINE TABULATION + U+000C FORM FEED (FF) + U+0020 SPACE + Stay in the before attribute value state. */ + $this->state = 'beforeAttributeValue'; + + } elseif($char === '"') { + /* U+0022 QUOTATION MARK (") + Switch to the attribute value (double-quoted) state. */ + $this->state = 'attributeValueDoubleQuoted'; + + } elseif($char === '&') { + /* U+0026 AMPERSAND (&) + Switch to the attribute value (unquoted) state and reconsume + this input character. */ + $this->char--; + $this->state = 'attributeValueUnquoted'; + + } elseif($char === '\'') { + /* U+0027 APOSTROPHE (') + Switch to the attribute value (single-quoted) state. */ + $this->state = 'attributeValueSingleQuoted'; + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $this->state = 'data'; + + } else { + /* Anything else + Append the current input character to the current attribute's value. + Switch to the attribute value (unquoted) state. */ + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['value'] .= $char; + + $this->state = 'attributeValueUnquoted'; + } + } + + private function attributeValueDoubleQuotedState() { + // Consume the next input character: + $this->char++; + $char = $this->character($this->char); + + if($char === '"') { + /* U+0022 QUOTATION MARK (") + Switch to the before attribute name state. */ + $this->state = 'beforeAttributeName'; + + } elseif($char === '&') { + /* U+0026 AMPERSAND (&) + Switch to the entity in attribute value state. */ + $this->entityInAttributeValueState('double'); + + } elseif($this->char === $this->EOF) { + /* EOF + Parse error. Emit the current tag token. Reconsume the character + in the data state. */ + $this->emitToken($this->token); + + $this->char--; + $this->state = 'data'; + + } else { + /* Anything else + Append the current input character to the current attribute's value. + Stay in the attribute value (double-quoted) state. */ + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['value'] .= $char; + + $this->state = 'attributeValueDoubleQuoted'; + } + } + + private function attributeValueSingleQuotedState() { + // Consume the next input character: + $this->char++; + $char = $this->character($this->char); + + if($char === '\'') { + /* U+0022 QUOTATION MARK (') + Switch to the before attribute name state. */ + $this->state = 'beforeAttributeName'; + + } elseif($char === '&') { + /* U+0026 AMPERSAND (&) + Switch to the entity in attribute value state. */ + $this->entityInAttributeValueState('single'); + + } elseif($this->char === $this->EOF) { + /* EOF + Parse error. Emit the current tag token. Reconsume the character + in the data state. */ + $this->emitToken($this->token); + + $this->char--; + $this->state = 'data'; + + } else { + /* Anything else + Append the current input character to the current attribute's value. + Stay in the attribute value (single-quoted) state. */ + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['value'] .= $char; + + $this->state = 'attributeValueSingleQuoted'; + } + } + + private function attributeValueUnquotedState() { + // Consume the next input character: + $this->char++; + $char = $this->character($this->char); + + if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { + /* U+0009 CHARACTER TABULATION + U+000A LINE FEED (LF) + U+000B LINE TABULATION + U+000C FORM FEED (FF) + U+0020 SPACE + Switch to the before attribute name state. */ + $this->state = 'beforeAttributeName'; + + } elseif($char === '&') { + /* U+0026 AMPERSAND (&) + Switch to the entity in attribute value state. */ + $this->entityInAttributeValueState(); + + } elseif($char === '>') { + /* U+003E GREATER-THAN SIGN (>) + Emit the current tag token. Switch to the data state. */ + $this->emitToken($this->token); + $this->state = 'data'; + + } else { + /* Anything else + Append the current input character to the current attribute's value. + Stay in the attribute value (unquoted) state. */ + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['value'] .= $char; + + $this->state = 'attributeValueUnquoted'; + } + } + + private function entityInAttributeValueState() { + // Attempt to consume an entity. + $entity = $this->entity(); + + // If nothing is returned, append a U+0026 AMPERSAND character to the + // current attribute's value. Otherwise, emit the character token that + // was returned. + $char = (!$entity) + ? '&' + : $entity; + + $last = count($this->token['attr']) - 1; + $this->token['attr'][$last]['value'] .= $char; + } + + private function bogusCommentState() { + /* Consume every character up to the first U+003E GREATER-THAN SIGN + character (>) or the end of the file (EOF), whichever comes first. Emit + a comment token whose data is the concatenation of all the characters + starting from and including the character that caused the state machine + to switch into the bogus comment state, up to and including the last + consumed character before the U+003E character, if any, or up to the + end of the file otherwise. (If the comment was started by the end of + the file (EOF), the token is empty.) */ + $data = $this->characters('^>', $this->char); + $this->emitToken(array( + 'data' => $data, + 'type' => self::COMMENT + )); + + $this->char += strlen($data); + + /* Switch to the data state. */ + $this->state = 'data'; + + /* If the end of the file was reached, reconsume the EOF character. */ + if($this->char === $this->EOF) { + $this->char = $this->EOF - 1; + } + } + + private function markupDeclarationOpenState() { + /* If the next two characters are both U+002D HYPHEN-MINUS (-) + characters, consume those two characters, create a comment token whose + data is the empty string, and switch to the comment state. */ + if($this->character($this->char + 1, 2) === '--') { + $this->char += 2; + $this->state = 'comment'; + $this->token = array( + 'data' => null, + 'type' => self::COMMENT + ); + + /* Otherwise if the next seven chacacters are a case-insensitive match + for the word "DOCTYPE", then consume those characters and switch to the + DOCTYPE state. */ + } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') { + $this->char += 7; + $this->state = 'doctype'; + + /* Otherwise, is is a parse error. Switch to the bogus comment state. + The next character that is consumed, if any, is the first character + that will be in the comment. */ + } else { + $this->char++; + $this->state = 'bogusComment'; + } + } + + private function commentState() { + /* Consume the next input character: */ + $this->char++; + $char = $this->char(); + + /* U+002D HYPHEN-MINUS (-) */ + if($char === '-') { + /* Switch to the comment dash state */ + $this->state = 'commentDash'; + + /* EOF */ + } elseif($this->char === $this->EOF) { + /* Parse error. Emit the comment token. Reconsume the EOF character + in the data state. */ + $this->emitToken($this->token); + $this->char--; + $this->state = 'data'; + + /* Anything else */ + } else { + /* Append the input character to the comment token's data. Stay in + the comment state. */ + $this->token['data'] .= $char; + } + } + + private function commentDashState() { + /* Consume the next input character: */ + $this->char++; + $char = $this->char(); + + /* U+002D HYPHEN-MINUS (-) */ + if($char === '-') { + /* Switch to the comment end state */ + $this->state = 'commentEnd'; + + /* EOF */ + } elseif($this->char === $this->EOF) { + /* Parse error. Emit the comment token. Reconsume the EOF character + in the data state. */ + $this->emitToken($this->token); + $this->char--; + $this->state = 'data'; + + /* Anything else */ + } else { + /* Append a U+002D HYPHEN-MINUS (-) character and the input + character to the comment token's data. Switch to the comment state. */ + $this->token['data'] .= '-'.$char; + $this->state = 'comment'; + } + } + + private function commentEndState() { + /* Consume the next input character: */ + $this->char++; + $char = $this->char(); + + if($char === '>') { + $this->emitToken($this->token); + $this->state = 'data'; + + } elseif($char === '-') { + $this->token['data'] .= '-'; + + } elseif($this->char === $this->EOF) { + $this->emitToken($this->token); + $this->char--; + $this->state = 'data'; + + } else { + $this->token['data'] .= '--'.$char; + $this->state = 'comment'; + } + } + + private function doctypeState() { + /* Consume the next input character: */ + $this->char++; + $char = $this->char(); + + if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { + $this->state = 'beforeDoctypeName'; + + } else { + $this->char--; + $this->state = 'beforeDoctypeName'; + } + } + + private function beforeDoctypeNameState() { + /* Consume the next input character: */ + $this->char++; + $char = $this->char(); + + if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { + // Stay in the before DOCTYPE name state. + + } elseif(preg_match('/^[a-z]$/', $char)) { + $this->token = array( + 'name' => strtoupper($char), + 'type' => self::DOCTYPE, + 'error' => true + ); + + $this->state = 'doctypeName'; + + } elseif($char === '>') { + $this->emitToken(array( + 'name' => null, + 'type' => self::DOCTYPE, + 'error' => true + )); + + $this->state = 'data'; + + } elseif($this->char === $this->EOF) { + $this->emitToken(array( + 'name' => null, + 'type' => self::DOCTYPE, + 'error' => true + )); + + $this->char--; + $this->state = 'data'; + + } else { + $this->token = array( + 'name' => $char, + 'type' => self::DOCTYPE, + 'error' => true + ); + + $this->state = 'doctypeName'; + } + } + + private function doctypeNameState() { + /* Consume the next input character: */ + $this->char++; + $char = $this->char(); + + if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { + $this->state = 'AfterDoctypeName'; + + } elseif($char === '>') { + $this->emitToken($this->token); + $this->state = 'data'; + + } elseif(preg_match('/^[a-z]$/', $char)) { + $this->token['name'] .= strtoupper($char); + + } elseif($this->char === $this->EOF) { + $this->emitToken($this->token); + $this->char--; + $this->state = 'data'; + + } else { + $this->token['name'] .= $char; + } + + $this->token['error'] = ($this->token['name'] === 'HTML') + ? false + : true; + } + + private function afterDoctypeNameState() { + /* Consume the next input character: */ + $this->char++; + $char = $this->char(); + + if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { + // Stay in the DOCTYPE name state. + + } elseif($char === '>') { + $this->emitToken($this->token); + $this->state = 'data'; + + } elseif($this->char === $this->EOF) { + $this->emitToken($this->token); + $this->char--; + $this->state = 'data'; + + } else { + $this->token['error'] = true; + $this->state = 'bogusDoctype'; + } + } + + private function bogusDoctypeState() { + /* Consume the next input character: */ + $this->char++; + $char = $this->char(); + + if($char === '>') { + $this->emitToken($this->token); + $this->state = 'data'; + + } elseif($this->char === $this->EOF) { + $this->emitToken($this->token); + $this->char--; + $this->state = 'data'; + + } else { + // Stay in the bogus DOCTYPE state. + } + } + + private function entity() { + $start = $this->char; + + // This section defines how to consume an entity. This definition is + // used when parsing entities in text and in attributes. + + // The behaviour depends on the identity of the next character (the + // one immediately after the U+0026 AMPERSAND character): + + switch($this->character($this->char + 1)) { + // U+0023 NUMBER SIGN (#) + case '#': + + // The behaviour further depends on the character after the + // U+0023 NUMBER SIGN: + switch($this->character($this->char + 1)) { + // U+0078 LATIN SMALL LETTER X + // U+0058 LATIN CAPITAL LETTER X + case 'x': + case 'X': + // Follow the steps below, but using the range of + // characters U+0030 DIGIT ZERO through to U+0039 DIGIT + // NINE, U+0061 LATIN SMALL LETTER A through to U+0066 + // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER + // A, through to U+0046 LATIN CAPITAL LETTER F (in other + // words, 0-9, A-F, a-f). + $char = 1; + $char_class = '0-9A-Fa-f'; + break; + + // Anything else + default: + // Follow the steps below, but using the range of + // characters U+0030 DIGIT ZERO through to U+0039 DIGIT + // NINE (i.e. just 0-9). + $char = 0; + $char_class = '0-9'; + break; + } + + // Consume as many characters as match the range of characters + // given above. + $this->char++; + $e_name = $this->characters($char_class, $this->char + $char + 1); + $entity = $this->character($start, $this->char); + $cond = strlen($e_name) > 0; + + // The rest of the parsing happens bellow. + break; + + // Anything else + default: + // Consume the maximum number of characters possible, with the + // consumed characters case-sensitively matching one of the + // identifiers in the first column of the entities table. + $e_name = $this->characters('0-9A-Za-z;', $this->char + 1); + $len = strlen($e_name); + + for($c = 1; $c <= $len; $c++) { + $id = substr($e_name, 0, $c); + $this->char++; + + if(in_array($id, $this->entities)) { + if ($e_name[$c-1] !== ';') { + if ($c < $len && $e_name[$c] == ';') { + $this->char++; // consume extra semicolon + } + } + $entity = $id; + break; + } + } + + $cond = isset($entity); + // The rest of the parsing happens bellow. + break; + } + + if(!$cond) { + // If no match can be made, then this is a parse error. No + // characters are consumed, and nothing is returned. + $this->char = $start; + return false; + } + + // Return a character token for the character corresponding to the + // entity name (as given by the second column of the entities table). + return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8'); + } + + private function emitToken($token) { + $emit = $this->tree->emitToken($token); + + if(is_int($emit)) { + $this->content_model = $emit; + + } elseif($token['type'] === self::ENDTAG) { + $this->content_model = self::PCDATA; + } + } + + private function EOF() { + $this->state = null; + $this->tree->emitToken(array( + 'type' => self::EOF + )); + } +} + +class HTML5TreeConstructer { + public $stack = array(); + + private $phase; + private $mode; + private $dom; + private $foster_parent = null; + private $a_formatting = array(); + + private $head_pointer = null; + private $form_pointer = null; + + private $scoping = array('button','caption','html','marquee','object','table','td','th'); + private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u'); + private $special = array('address','area','base','basefont','bgsound', + 'blockquote','body','br','center','col','colgroup','dd','dir','div','dl', + 'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5', + 'h6','head','hr','iframe','image','img','input','isindex','li','link', + 'listing','menu','meta','noembed','noframes','noscript','ol','optgroup', + 'option','p','param','plaintext','pre','script','select','spacer','style', + 'tbody','textarea','tfoot','thead','title','tr','ul','wbr'); + + // The different phases. + const INIT_PHASE = 0; + const ROOT_PHASE = 1; + const MAIN_PHASE = 2; + const END_PHASE = 3; + + // The different insertion modes for the main phase. + const BEFOR_HEAD = 0; + const IN_HEAD = 1; + const AFTER_HEAD = 2; + const IN_BODY = 3; + const IN_TABLE = 4; + const IN_CAPTION = 5; + const IN_CGROUP = 6; + const IN_TBODY = 7; + const IN_ROW = 8; + const IN_CELL = 9; + const IN_SELECT = 10; + const AFTER_BODY = 11; + const IN_FRAME = 12; + const AFTR_FRAME = 13; + + // The different types of elements. + const SPECIAL = 0; + const SCOPING = 1; + const FORMATTING = 2; + const PHRASING = 3; + + const MARKER = 0; + + public function __construct() { + $this->phase = self::INIT_PHASE; + $this->mode = self::BEFOR_HEAD; + $this->dom = new DOMDocument; + + $this->dom->encoding = 'UTF-8'; + $this->dom->preserveWhiteSpace = true; + $this->dom->substituteEntities = true; + $this->dom->strictErrorChecking = false; + } + + // Process tag tokens + public function emitToken($token) { + switch($this->phase) { + case self::INIT_PHASE: return $this->initPhase($token); break; + case self::ROOT_PHASE: return $this->rootElementPhase($token); break; + case self::MAIN_PHASE: return $this->mainPhase($token); break; + case self::END_PHASE : return $this->trailingEndPhase($token); break; + } + } + + private function initPhase($token) { + /* Initially, the tree construction stage must handle each token + emitted from the tokenisation stage as follows: */ + + /* A DOCTYPE token that is marked as being in error + A comment token + A start tag token + An end tag token + A character token that is not one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE + An end-of-file token */ + if((isset($token['error']) && $token['error']) || + $token['type'] === HTML5::COMMENT || + $token['type'] === HTML5::STARTTAG || + $token['type'] === HTML5::ENDTAG || + $token['type'] === HTML5::EOF || + ($token['type'] === HTML5::CHARACTR && isset($token['data']) && + !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) { + /* This specification does not define how to handle this case. In + particular, user agents may ignore the entirety of this specification + altogether for such documents, and instead invoke special parse modes + with a greater emphasis on backwards compatibility. */ + + $this->phase = self::ROOT_PHASE; + return $this->rootElementPhase($token); + + /* A DOCTYPE token marked as being correct */ + } elseif(isset($token['error']) && !$token['error']) { + /* Append a DocumentType node to the Document node, with the name + attribute set to the name given in the DOCTYPE token (which will be + "HTML"), and the other attributes specific to DocumentType objects + set to null, empty lists, or the empty string as appropriate. */ + $doctype = new DOMDocumentType(null, null, 'HTML'); + + /* Then, switch to the root element phase of the tree construction + stage. */ + $this->phase = self::ROOT_PHASE; + + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE */ + } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/', + $token['data'])) { + /* Append that character to the Document node. */ + $text = $this->dom->createTextNode($token['data']); + $this->dom->appendChild($text); + } + } + + private function rootElementPhase($token) { + /* After the initial phase, as each token is emitted from the tokenisation + stage, it must be processed as described in this section. */ + + /* A DOCTYPE token */ + if($token['type'] === HTML5::DOCTYPE) { + // Parse error. Ignore the token. + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the Document object with the data + attribute set to the data given in the comment token. */ + $comment = $this->dom->createComment($token['data']); + $this->dom->appendChild($comment); + + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE */ + } elseif($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { + /* Append that character to the Document node. */ + $text = $this->dom->createTextNode($token['data']); + $this->dom->appendChild($text); + + /* A character token that is not one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED + (FF), or U+0020 SPACE + A start tag token + An end tag token + An end-of-file token */ + } elseif(($token['type'] === HTML5::CHARACTR && + !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || + $token['type'] === HTML5::STARTTAG || + $token['type'] === HTML5::ENDTAG || + $token['type'] === HTML5::EOF) { + /* Create an HTMLElement node with the tag name html, in the HTML + namespace. Append it to the Document object. Switch to the main + phase and reprocess the current token. */ + $html = $this->dom->createElement('html'); + $this->dom->appendChild($html); + $this->stack[] = $html; + + $this->phase = self::MAIN_PHASE; + return $this->mainPhase($token); + } + } + + private function mainPhase($token) { + /* Tokens in the main phase must be handled as follows: */ + + /* A DOCTYPE token */ + if($token['type'] === HTML5::DOCTYPE) { + // Parse error. Ignore the token. + + /* A start tag token with the tag name "html" */ + } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') { + /* If this start tag token was not the first start tag token, then + it is a parse error. */ + + /* For each attribute on the token, check to see if the attribute + is already present on the top element of the stack of open elements. + If it is not, add the attribute and its corresponding value to that + element. */ + foreach($token['attr'] as $attr) { + if(!$this->stack[0]->hasAttribute($attr['name'])) { + $this->stack[0]->setAttribute($attr['name'], $attr['value']); + } + } + + /* An end-of-file token */ + } elseif($token['type'] === HTML5::EOF) { + /* Generate implied end tags. */ + $this->generateImpliedEndTags(); + + /* Anything else. */ + } else { + /* Depends on the insertion mode: */ + switch($this->mode) { + case self::BEFOR_HEAD: return $this->beforeHead($token); break; + case self::IN_HEAD: return $this->inHead($token); break; + case self::AFTER_HEAD: return $this->afterHead($token); break; + case self::IN_BODY: return $this->inBody($token); break; + case self::IN_TABLE: return $this->inTable($token); break; + case self::IN_CAPTION: return $this->inCaption($token); break; + case self::IN_CGROUP: return $this->inColumnGroup($token); break; + case self::IN_TBODY: return $this->inTableBody($token); break; + case self::IN_ROW: return $this->inRow($token); break; + case self::IN_CELL: return $this->inCell($token); break; + case self::IN_SELECT: return $this->inSelect($token); break; + case self::AFTER_BODY: return $this->afterBody($token); break; + case self::IN_FRAME: return $this->inFrameset($token); break; + case self::AFTR_FRAME: return $this->afterFrameset($token); break; + case self::END_PHASE: return $this->trailingEndPhase($token); break; + } + } + } + + private function beforeHead($token) { + /* Handle the token as follows: */ + + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE */ + if($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { + /* Append the character to the current node. */ + $this->insertText($token['data']); + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the current node with the data attribute + set to the data given in the comment token. */ + $this->insertComment($token['data']); + + /* A start tag token with the tag name "head" */ + } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') { + /* Create an element for the token, append the new element to the + current node and push it onto the stack of open elements. */ + $element = $this->insertElement($token); + + /* Set the head element pointer to this new element node. */ + $this->head_pointer = $element; + + /* Change the insertion mode to "in head". */ + $this->mode = self::IN_HEAD; + + /* A start tag token whose tag name is one of: "base", "link", "meta", + "script", "style", "title". Or an end tag with the tag name "html". + Or a character token that is not one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE. Or any other start tag token */ + } elseif($token['type'] === HTML5::STARTTAG || + ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') || + ($token['type'] === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]$/', + $token['data']))) { + /* Act as if a start tag token with the tag name "head" and no + attributes had been seen, then reprocess the current token. */ + $this->beforeHead(array( + 'name' => 'head', + 'type' => HTML5::STARTTAG, + 'attr' => array() + )); + + return $this->inHead($token); + + /* Any other end tag */ + } elseif($token['type'] === HTML5::ENDTAG) { + /* Parse error. Ignore the token. */ + } + } + + private function inHead($token) { + /* Handle the token as follows: */ + + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE. + + THIS DIFFERS FROM THE SPEC: If the current node is either a title, style + or script element, append the character to the current node regardless + of its content. */ + if(($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || ( + $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName, + array('title', 'style', 'script')))) { + /* Append the character to the current node. */ + $this->insertText($token['data']); + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the current node with the data attribute + set to the data given in the comment token. */ + $this->insertComment($token['data']); + + } elseif($token['type'] === HTML5::ENDTAG && + in_array($token['name'], array('title', 'style', 'script'))) { + array_pop($this->stack); + return HTML5::PCDATA; + + /* A start tag with the tag name "title" */ + } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') { + /* Create an element for the token and append the new element to the + node pointed to by the head element pointer, or, if that is null + (innerHTML case), to the current node. */ + if($this->head_pointer !== null) { + $element = $this->insertElement($token, false); + $this->head_pointer->appendChild($element); + + } else { + $element = $this->insertElement($token); + } + + /* Switch the tokeniser's content model flag to the RCDATA state. */ + return HTML5::RCDATA; + + /* A start tag with the tag name "style" */ + } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') { + /* Create an element for the token and append the new element to the + node pointed to by the head element pointer, or, if that is null + (innerHTML case), to the current node. */ + if($this->head_pointer !== null) { + $element = $this->insertElement($token, false); + $this->head_pointer->appendChild($element); + + } else { + $this->insertElement($token); + } + + /* Switch the tokeniser's content model flag to the CDATA state. */ + return HTML5::CDATA; + + /* A start tag with the tag name "script" */ + } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') { + /* Create an element for the token. */ + $element = $this->insertElement($token, false); + $this->head_pointer->appendChild($element); + + /* Switch the tokeniser's content model flag to the CDATA state. */ + return HTML5::CDATA; + + /* A start tag with the tag name "base", "link", or "meta" */ + } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], + array('base', 'link', 'meta'))) { + /* Create an element for the token and append the new element to the + node pointed to by the head element pointer, or, if that is null + (innerHTML case), to the current node. */ + if($this->head_pointer !== null) { + $element = $this->insertElement($token, false); + $this->head_pointer->appendChild($element); + array_pop($this->stack); + + } else { + $this->insertElement($token); + } + + /* An end tag with the tag name "head" */ + } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') { + /* If the current node is a head element, pop the current node off + the stack of open elements. */ + if($this->head_pointer->isSameNode(end($this->stack))) { + array_pop($this->stack); + + /* Otherwise, this is a parse error. */ + } else { + // k + } + + /* Change the insertion mode to "after head". */ + $this->mode = self::AFTER_HEAD; + + /* A start tag with the tag name "head" or an end tag except "html". */ + } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') || + ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) { + // Parse error. Ignore the token. + + /* Anything else */ + } else { + /* If the current node is a head element, act as if an end tag + token with the tag name "head" had been seen. */ + if($this->head_pointer->isSameNode(end($this->stack))) { + $this->inHead(array( + 'name' => 'head', + 'type' => HTML5::ENDTAG + )); + + /* Otherwise, change the insertion mode to "after head". */ + } else { + $this->mode = self::AFTER_HEAD; + } + + /* Then, reprocess the current token. */ + return $this->afterHead($token); + } + } + + private function afterHead($token) { + /* Handle the token as follows: */ + + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE */ + if($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { + /* Append the character to the current node. */ + $this->insertText($token['data']); + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the current node with the data attribute + set to the data given in the comment token. */ + $this->insertComment($token['data']); + + /* A start tag token with the tag name "body" */ + } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') { + /* Insert a body element for the token. */ + $this->insertElement($token); + + /* Change the insertion mode to "in body". */ + $this->mode = self::IN_BODY; + + /* A start tag token with the tag name "frameset" */ + } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') { + /* Insert a frameset element for the token. */ + $this->insertElement($token); + + /* Change the insertion mode to "in frameset". */ + $this->mode = self::IN_FRAME; + + /* A start tag token whose tag name is one of: "base", "link", "meta", + "script", "style", "title" */ + } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], + array('base', 'link', 'meta', 'script', 'style', 'title'))) { + /* Parse error. Switch the insertion mode back to "in head" and + reprocess the token. */ + $this->mode = self::IN_HEAD; + return $this->inHead($token); + + /* Anything else */ + } else { + /* Act as if a start tag token with the tag name "body" and no + attributes had been seen, and then reprocess the current token. */ + $this->afterHead(array( + 'name' => 'body', + 'type' => HTML5::STARTTAG, + 'attr' => array() + )); + + return $this->inBody($token); + } + } + + private function inBody($token) { + /* Handle the token as follows: */ + + switch($token['type']) { + /* A character token */ + case HTML5::CHARACTR: + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + + /* Append the token's character to the current node. */ + $this->insertText($token['data']); + break; + + /* A comment token */ + case HTML5::COMMENT: + /* Append a Comment node to the current node with the data + attribute set to the data given in the comment token. */ + $this->insertComment($token['data']); + break; + + case HTML5::STARTTAG: + switch($token['name']) { + /* A start tag token whose tag name is one of: "script", + "style" */ + case 'script': case 'style': + /* Process the token as if the insertion mode had been "in + head". */ + return $this->inHead($token); + break; + + /* A start tag token whose tag name is one of: "base", "link", + "meta", "title" */ + case 'base': case 'link': case 'meta': case 'title': + /* Parse error. Process the token as if the insertion mode + had been "in head". */ + return $this->inHead($token); + break; + + /* A start tag token with the tag name "body" */ + case 'body': + /* Parse error. If the second element on the stack of open + elements is not a body element, or, if the stack of open + elements has only one node on it, then ignore the token. + (innerHTML case) */ + if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') { + // Ignore + + /* Otherwise, for each attribute on the token, check to see + if the attribute is already present on the body element (the + second element) on the stack of open elements. If it is not, + add the attribute and its corresponding value to that + element. */ + } else { + foreach($token['attr'] as $attr) { + if(!$this->stack[1]->hasAttribute($attr['name'])) { + $this->stack[1]->setAttribute($attr['name'], $attr['value']); + } + } + } + break; + + /* A start tag whose tag name is one of: "address", + "blockquote", "center", "dir", "div", "dl", "fieldset", + "listing", "menu", "ol", "p", "ul" */ + case 'address': case 'blockquote': case 'center': case 'dir': + case 'div': case 'dl': case 'fieldset': case 'listing': + case 'menu': case 'ol': case 'p': case 'ul': + /* If the stack of open elements has a p element in scope, + then act as if an end tag with the tag name p had been + seen. */ + if($this->elementInScope('p')) { + $this->emitToken(array( + 'name' => 'p', + 'type' => HTML5::ENDTAG + )); + } + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + break; + + /* A start tag whose tag name is "form" */ + case 'form': + /* If the form element pointer is not null, ignore the + token with a parse error. */ + if($this->form_pointer !== null) { + // Ignore. + + /* Otherwise: */ + } else { + /* If the stack of open elements has a p element in + scope, then act as if an end tag with the tag name p + had been seen. */ + if($this->elementInScope('p')) { + $this->emitToken(array( + 'name' => 'p', + 'type' => HTML5::ENDTAG + )); + } + + /* Insert an HTML element for the token, and set the + form element pointer to point to the element created. */ + $element = $this->insertElement($token); + $this->form_pointer = $element; + } + break; + + /* A start tag whose tag name is "li", "dd" or "dt" */ + case 'li': case 'dd': case 'dt': + /* If the stack of open elements has a p element in scope, + then act as if an end tag with the tag name p had been + seen. */ + if($this->elementInScope('p')) { + $this->emitToken(array( + 'name' => 'p', + 'type' => HTML5::ENDTAG + )); + } + + $stack_length = count($this->stack) - 1; + + for($n = $stack_length; 0 <= $n; $n--) { + /* 1. Initialise node to be the current node (the + bottommost node of the stack). */ + $stop = false; + $node = $this->stack[$n]; + $cat = $this->getElementCategory($node->tagName); + + /* 2. If node is an li, dd or dt element, then pop all + the nodes from the current node up to node, including + node, then stop this algorithm. */ + if($token['name'] === $node->tagName || ($token['name'] !== 'li' + && ($node->tagName === 'dd' || $node->tagName === 'dt'))) { + for($x = $stack_length; $x >= $n ; $x--) { + array_pop($this->stack); + } + + break; + } + + /* 3. If node is not in the formatting category, and is + not in the phrasing category, and is not an address or + div element, then stop this algorithm. */ + if($cat !== self::FORMATTING && $cat !== self::PHRASING && + $node->tagName !== 'address' && $node->tagName !== 'div') { + break; + } + } + + /* Finally, insert an HTML element with the same tag + name as the token's. */ + $this->insertElement($token); + break; + + /* A start tag token whose tag name is "plaintext" */ + case 'plaintext': + /* If the stack of open elements has a p element in scope, + then act as if an end tag with the tag name p had been + seen. */ + if($this->elementInScope('p')) { + $this->emitToken(array( + 'name' => 'p', + 'type' => HTML5::ENDTAG + )); + } + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + return HTML5::PLAINTEXT; + break; + + /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4", + "h5", "h6" */ + case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': + /* If the stack of open elements has a p element in scope, + then act as if an end tag with the tag name p had been seen. */ + if($this->elementInScope('p')) { + $this->emitToken(array( + 'name' => 'p', + 'type' => HTML5::ENDTAG + )); + } + + /* If the stack of open elements has in scope an element whose + tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then + this is a parse error; pop elements from the stack until an + element with one of those tag names has been popped from the + stack. */ + while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) { + array_pop($this->stack); + } + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + break; + + /* A start tag whose tag name is "a" */ + case 'a': + /* If the list of active formatting elements contains + an element whose tag name is "a" between the end of the + list and the last marker on the list (or the start of + the list if there is no marker on the list), then this + is a parse error; act as if an end tag with the tag name + "a" had been seen, then remove that element from the list + of active formatting elements and the stack of open + elements if the end tag didn't already remove it (it + might not have if the element is not in table scope). */ + $leng = count($this->a_formatting); + + for($n = $leng - 1; $n >= 0; $n--) { + if($this->a_formatting[$n] === self::MARKER) { + break; + + } elseif($this->a_formatting[$n]->nodeName === 'a') { + $this->emitToken(array( + 'name' => 'a', + 'type' => HTML5::ENDTAG + )); + break; + } + } + + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + + /* Insert an HTML element for the token. */ + $el = $this->insertElement($token); + + /* Add that element to the list of active formatting + elements. */ + $this->a_formatting[] = $el; + break; + + /* A start tag whose tag name is one of: "b", "big", "em", "font", + "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ + case 'b': case 'big': case 'em': case 'font': case 'i': + case 'nobr': case 's': case 'small': case 'strike': + case 'strong': case 'tt': case 'u': + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + + /* Insert an HTML element for the token. */ + $el = $this->insertElement($token); + + /* Add that element to the list of active formatting + elements. */ + $this->a_formatting[] = $el; + break; + + /* A start tag token whose tag name is "button" */ + case 'button': + /* If the stack of open elements has a button element in scope, + then this is a parse error; act as if an end tag with the tag + name "button" had been seen, then reprocess the token. (We don't + do that. Unnecessary.) */ + if($this->elementInScope('button')) { + $this->inBody(array( + 'name' => 'button', + 'type' => HTML5::ENDTAG + )); + } + + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + /* Insert a marker at the end of the list of active + formatting elements. */ + $this->a_formatting[] = self::MARKER; + break; + + /* A start tag token whose tag name is one of: "marquee", "object" */ + case 'marquee': case 'object': + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + /* Insert a marker at the end of the list of active + formatting elements. */ + $this->a_formatting[] = self::MARKER; + break; + + /* A start tag token whose tag name is "xmp" */ + case 'xmp': + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + /* Switch the content model flag to the CDATA state. */ + return HTML5::CDATA; + break; + + /* A start tag whose tag name is "table" */ + case 'table': + /* If the stack of open elements has a p element in scope, + then act as if an end tag with the tag name p had been seen. */ + if($this->elementInScope('p')) { + $this->emitToken(array( + 'name' => 'p', + 'type' => HTML5::ENDTAG + )); + } + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + /* Change the insertion mode to "in table". */ + $this->mode = self::IN_TABLE; + break; + + /* A start tag whose tag name is one of: "area", "basefont", + "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */ + case 'area': case 'basefont': case 'bgsound': case 'br': + case 'embed': case 'img': case 'param': case 'spacer': + case 'wbr': + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + /* Immediately pop the current node off the stack of open elements. */ + array_pop($this->stack); + break; + + /* A start tag whose tag name is "hr" */ + case 'hr': + /* If the stack of open elements has a p element in scope, + then act as if an end tag with the tag name p had been seen. */ + if($this->elementInScope('p')) { + $this->emitToken(array( + 'name' => 'p', + 'type' => HTML5::ENDTAG + )); + } + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + /* Immediately pop the current node off the stack of open elements. */ + array_pop($this->stack); + break; + + /* A start tag whose tag name is "image" */ + case 'image': + /* Parse error. Change the token's tag name to "img" and + reprocess it. (Don't ask.) */ + $token['name'] = 'img'; + return $this->inBody($token); + break; + + /* A start tag whose tag name is "input" */ + case 'input': + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + + /* Insert an input element for the token. */ + $element = $this->insertElement($token, false); + + /* If the form element pointer is not null, then associate the + input element with the form element pointed to by the form + element pointer. */ + $this->form_pointer !== null + ? $this->form_pointer->appendChild($element) + : end($this->stack)->appendChild($element); + + /* Pop that input element off the stack of open elements. */ + array_pop($this->stack); + break; + + /* A start tag whose tag name is "isindex" */ + case 'isindex': + /* Parse error. */ + // w/e + + /* If the form element pointer is not null, + then ignore the token. */ + if($this->form_pointer === null) { + /* Act as if a start tag token with the tag name "form" had + been seen. */ + $this->inBody(array( + 'name' => 'body', + 'type' => HTML5::STARTTAG, + 'attr' => array() + )); + + /* Act as if a start tag token with the tag name "hr" had + been seen. */ + $this->inBody(array( + 'name' => 'hr', + 'type' => HTML5::STARTTAG, + 'attr' => array() + )); + + /* Act as if a start tag token with the tag name "p" had + been seen. */ + $this->inBody(array( + 'name' => 'p', + 'type' => HTML5::STARTTAG, + 'attr' => array() + )); + + /* Act as if a start tag token with the tag name "label" + had been seen. */ + $this->inBody(array( + 'name' => 'label', + 'type' => HTML5::STARTTAG, + 'attr' => array() + )); + + /* Act as if a stream of character tokens had been seen. */ + $this->insertText('This is a searchable index. '. + 'Insert your search keywords here: '); + + /* Act as if a start tag token with the tag name "input" + had been seen, with all the attributes from the "isindex" + token, except with the "name" attribute set to the value + "isindex" (ignoring any explicit "name" attribute). */ + $attr = $token['attr']; + $attr[] = array('name' => 'name', 'value' => 'isindex'); + + $this->inBody(array( + 'name' => 'input', + 'type' => HTML5::STARTTAG, + 'attr' => $attr + )); + + /* Act as if a stream of character tokens had been seen + (see below for what they should say). */ + $this->insertText('This is a searchable index. '. + 'Insert your search keywords here: '); + + /* Act as if an end tag token with the tag name "label" + had been seen. */ + $this->inBody(array( + 'name' => 'label', + 'type' => HTML5::ENDTAG + )); + + /* Act as if an end tag token with the tag name "p" had + been seen. */ + $this->inBody(array( + 'name' => 'p', + 'type' => HTML5::ENDTAG + )); + + /* Act as if a start tag token with the tag name "hr" had + been seen. */ + $this->inBody(array( + 'name' => 'hr', + 'type' => HTML5::ENDTAG + )); + + /* Act as if an end tag token with the tag name "form" had + been seen. */ + $this->inBody(array( + 'name' => 'form', + 'type' => HTML5::ENDTAG + )); + } + break; + + /* A start tag whose tag name is "textarea" */ + case 'textarea': + $this->insertElement($token); + + /* Switch the tokeniser's content model flag to the + RCDATA state. */ + return HTML5::RCDATA; + break; + + /* A start tag whose tag name is one of: "iframe", "noembed", + "noframes" */ + case 'iframe': case 'noembed': case 'noframes': + $this->insertElement($token); + + /* Switch the tokeniser's content model flag to the CDATA state. */ + return HTML5::CDATA; + break; + + /* A start tag whose tag name is "select" */ + case 'select': + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + /* Change the insertion mode to "in select". */ + $this->mode = self::IN_SELECT; + break; + + /* A start or end tag whose tag name is one of: "caption", "col", + "colgroup", "frame", "frameset", "head", "option", "optgroup", + "tbody", "td", "tfoot", "th", "thead", "tr". */ + case 'caption': case 'col': case 'colgroup': case 'frame': + case 'frameset': case 'head': case 'option': case 'optgroup': + case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': + case 'tr': + // Parse error. Ignore the token. + break; + + /* A start or end tag whose tag name is one of: "event-source", + "section", "nav", "article", "aside", "header", "footer", + "datagrid", "command" */ + case 'event-source': case 'section': case 'nav': case 'article': + case 'aside': case 'header': case 'footer': case 'datagrid': + case 'command': + // Work in progress! + break; + + /* A start tag token not covered by the previous entries */ + default: + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + + $this->insertElement($token, true, true); + break; + } + break; + + case HTML5::ENDTAG: + switch($token['name']) { + /* An end tag with the tag name "body" */ + case 'body': + /* If the second element in the stack of open elements is + not a body element, this is a parse error. Ignore the token. + (innerHTML case) */ + if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') { + // Ignore. + + /* If the current node is not the body element, then this + is a parse error. */ + } elseif(end($this->stack)->nodeName !== 'body') { + // Parse error. + } + + /* Change the insertion mode to "after body". */ + $this->mode = self::AFTER_BODY; + break; + + /* An end tag with the tag name "html" */ + case 'html': + /* Act as if an end tag with tag name "body" had been seen, + then, if that token wasn't ignored, reprocess the current + token. */ + $this->inBody(array( + 'name' => 'body', + 'type' => HTML5::ENDTAG + )); + + return $this->afterBody($token); + break; + + /* An end tag whose tag name is one of: "address", "blockquote", + "center", "dir", "div", "dl", "fieldset", "listing", "menu", + "ol", "pre", "ul" */ + case 'address': case 'blockquote': case 'center': case 'dir': + case 'div': case 'dl': case 'fieldset': case 'listing': + case 'menu': case 'ol': case 'pre': case 'ul': + /* If the stack of open elements has an element in scope + with the same tag name as that of the token, then generate + implied end tags. */ + if($this->elementInScope($token['name'])) { + $this->generateImpliedEndTags(); + + /* Now, if the current node is not an element with + the same tag name as that of the token, then this + is a parse error. */ + // w/e + + /* If the stack of open elements has an element in + scope with the same tag name as that of the token, + then pop elements from this stack until an element + with that tag name has been popped from the stack. */ + for($n = count($this->stack) - 1; $n >= 0; $n--) { + if($this->stack[$n]->nodeName === $token['name']) { + $n = -1; + } + + array_pop($this->stack); + } + } + break; + + /* An end tag whose tag name is "form" */ + case 'form': + /* If the stack of open elements has an element in scope + with the same tag name as that of the token, then generate + implied end tags. */ + if($this->elementInScope($token['name'])) { + $this->generateImpliedEndTags(); + + } + + if(end($this->stack)->nodeName !== $token['name']) { + /* Now, if the current node is not an element with the + same tag name as that of the token, then this is a parse + error. */ + // w/e + + } else { + /* Otherwise, if the current node is an element with + the same tag name as that of the token pop that element + from the stack. */ + array_pop($this->stack); + } + + /* In any case, set the form element pointer to null. */ + $this->form_pointer = null; + break; + + /* An end tag whose tag name is "p" */ + case 'p': + /* If the stack of open elements has a p element in scope, + then generate implied end tags, except for p elements. */ + if($this->elementInScope('p')) { + $this->generateImpliedEndTags(array('p')); + + /* If the current node is not a p element, then this is + a parse error. */ + // k + + /* If the stack of open elements has a p element in + scope, then pop elements from this stack until the stack + no longer has a p element in scope. */ + for($n = count($this->stack) - 1; $n >= 0; $n--) { + if($this->elementInScope('p')) { + array_pop($this->stack); + + } else { + break; + } + } + } + break; + + /* An end tag whose tag name is "dd", "dt", or "li" */ + case 'dd': case 'dt': case 'li': + /* If the stack of open elements has an element in scope + whose tag name matches the tag name of the token, then + generate implied end tags, except for elements with the + same tag name as the token. */ + if($this->elementInScope($token['name'])) { + $this->generateImpliedEndTags(array($token['name'])); + + /* If the current node is not an element with the same + tag name as the token, then this is a parse error. */ + // w/e + + /* If the stack of open elements has an element in scope + whose tag name matches the tag name of the token, then + pop elements from this stack until an element with that + tag name has been popped from the stack. */ + for($n = count($this->stack) - 1; $n >= 0; $n--) { + if($this->stack[$n]->nodeName === $token['name']) { + $n = -1; + } + + array_pop($this->stack); + } + } + break; + + /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4", + "h5", "h6" */ + case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': + $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); + + /* If the stack of open elements has in scope an element whose + tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then + generate implied end tags. */ + if($this->elementInScope($elements)) { + $this->generateImpliedEndTags(); + + /* Now, if the current node is not an element with the same + tag name as that of the token, then this is a parse error. */ + // w/e + + /* If the stack of open elements has in scope an element + whose tag name is one of "h1", "h2", "h3", "h4", "h5", or + "h6", then pop elements from the stack until an element + with one of those tag names has been popped from the stack. */ + while($this->elementInScope($elements)) { + array_pop($this->stack); + } + } + break; + + /* An end tag whose tag name is one of: "a", "b", "big", "em", + "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ + case 'a': case 'b': case 'big': case 'em': case 'font': + case 'i': case 'nobr': case 's': case 'small': case 'strike': + case 'strong': case 'tt': case 'u': + /* 1. Let the formatting element be the last element in + the list of active formatting elements that: + * is between the end of the list and the last scope + marker in the list, if any, or the start of the list + otherwise, and + * has the same tag name as the token. + */ + while(true) { + for($a = count($this->a_formatting) - 1; $a >= 0; $a--) { + if($this->a_formatting[$a] === self::MARKER) { + break; + + } elseif($this->a_formatting[$a]->tagName === $token['name']) { + $formatting_element = $this->a_formatting[$a]; + $in_stack = in_array($formatting_element, $this->stack, true); + $fe_af_pos = $a; + break; + } + } + + /* If there is no such node, or, if that node is + also in the stack of open elements but the element + is not in scope, then this is a parse error. Abort + these steps. The token is ignored. */ + if(!isset($formatting_element) || ($in_stack && + !$this->elementInScope($token['name']))) { + break; + + /* Otherwise, if there is such a node, but that node + is not in the stack of open elements, then this is a + parse error; remove the element from the list, and + abort these steps. */ + } elseif(isset($formatting_element) && !$in_stack) { + unset($this->a_formatting[$fe_af_pos]); + $this->a_formatting = array_merge($this->a_formatting); + break; + } + + /* 2. Let the furthest block be the topmost node in the + stack of open elements that is lower in the stack + than the formatting element, and is not an element in + the phrasing or formatting categories. There might + not be one. */ + $fe_s_pos = array_search($formatting_element, $this->stack, true); + $length = count($this->stack); + + for($s = $fe_s_pos + 1; $s < $length; $s++) { + $category = $this->getElementCategory($this->stack[$s]->nodeName); + + if($category !== self::PHRASING && $category !== self::FORMATTING) { + $furthest_block = $this->stack[$s]; + } + } + + /* 3. If there is no furthest block, then the UA must + skip the subsequent steps and instead just pop all + the nodes from the bottom of the stack of open + elements, from the current node up to the formatting + element, and remove the formatting element from the + list of active formatting elements. */ + if(!isset($furthest_block)) { + for($n = $length - 1; $n >= $fe_s_pos; $n--) { + array_pop($this->stack); + } + + unset($this->a_formatting[$fe_af_pos]); + $this->a_formatting = array_merge($this->a_formatting); + break; + } + + /* 4. Let the common ancestor be the element + immediately above the formatting element in the stack + of open elements. */ + $common_ancestor = $this->stack[$fe_s_pos - 1]; + + /* 5. If the furthest block has a parent node, then + remove the furthest block from its parent node. */ + if($furthest_block->parentNode !== null) { + $furthest_block->parentNode->removeChild($furthest_block); + } + + /* 6. Let a bookmark note the position of the + formatting element in the list of active formatting + elements relative to the elements on either side + of it in the list. */ + $bookmark = $fe_af_pos; + + /* 7. Let node and last node be the furthest block. + Follow these steps: */ + $node = $furthest_block; + $last_node = $furthest_block; + + while(true) { + for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) { + /* 7.1 Let node be the element immediately + prior to node in the stack of open elements. */ + $node = $this->stack[$n]; + + /* 7.2 If node is not in the list of active + formatting elements, then remove node from + the stack of open elements and then go back + to step 1. */ + if(!in_array($node, $this->a_formatting, true)) { + unset($this->stack[$n]); + $this->stack = array_merge($this->stack); + + } else { + break; + } + } + + /* 7.3 Otherwise, if node is the formatting + element, then go to the next step in the overall + algorithm. */ + if($node === $formatting_element) { + break; + + /* 7.4 Otherwise, if last node is the furthest + block, then move the aforementioned bookmark to + be immediately after the node in the list of + active formatting elements. */ + } elseif($last_node === $furthest_block) { + $bookmark = array_search($node, $this->a_formatting, true) + 1; + } + + /* 7.5 If node has any children, perform a + shallow clone of node, replace the entry for + node in the list of active formatting elements + with an entry for the clone, replace the entry + for node in the stack of open elements with an + entry for the clone, and let node be the clone. */ + if($node->hasChildNodes()) { + $clone = $node->cloneNode(); + $s_pos = array_search($node, $this->stack, true); + $a_pos = array_search($node, $this->a_formatting, true); + + $this->stack[$s_pos] = $clone; + $this->a_formatting[$a_pos] = $clone; + $node = $clone; + } + + /* 7.6 Insert last node into node, first removing + it from its previous parent node if any. */ + if($last_node->parentNode !== null) { + $last_node->parentNode->removeChild($last_node); + } + + $node->appendChild($last_node); + + /* 7.7 Let last node be node. */ + $last_node = $node; + } + + /* 8. Insert whatever last node ended up being in + the previous step into the common ancestor node, + first removing it from its previous parent node if + any. */ + if($last_node->parentNode !== null) { + $last_node->parentNode->removeChild($last_node); + } + + $common_ancestor->appendChild($last_node); + + /* 9. Perform a shallow clone of the formatting + element. */ + $clone = $formatting_element->cloneNode(); + + /* 10. Take all of the child nodes of the furthest + block and append them to the clone created in the + last step. */ + while($furthest_block->hasChildNodes()) { + $child = $furthest_block->firstChild; + $furthest_block->removeChild($child); + $clone->appendChild($child); + } + + /* 11. Append that clone to the furthest block. */ + $furthest_block->appendChild($clone); + + /* 12. Remove the formatting element from the list + of active formatting elements, and insert the clone + into the list of active formatting elements at the + position of the aforementioned bookmark. */ + $fe_af_pos = array_search($formatting_element, $this->a_formatting, true); + unset($this->a_formatting[$fe_af_pos]); + $this->a_formatting = array_merge($this->a_formatting); + + $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1); + $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting)); + $this->a_formatting = array_merge($af_part1, array($clone), $af_part2); + + /* 13. Remove the formatting element from the stack + of open elements, and insert the clone into the stack + of open elements immediately after (i.e. in a more + deeply nested position than) the position of the + furthest block in that stack. */ + $fe_s_pos = array_search($formatting_element, $this->stack, true); + $fb_s_pos = array_search($furthest_block, $this->stack, true); + unset($this->stack[$fe_s_pos]); + + $s_part1 = array_slice($this->stack, 0, $fb_s_pos); + $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack)); + $this->stack = array_merge($s_part1, array($clone), $s_part2); + + /* 14. Jump back to step 1 in this series of steps. */ + unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block); + } + break; + + /* An end tag token whose tag name is one of: "button", + "marquee", "object" */ + case 'button': case 'marquee': case 'object': + /* If the stack of open elements has an element in scope whose + tag name matches the tag name of the token, then generate implied + tags. */ + if($this->elementInScope($token['name'])) { + $this->generateImpliedEndTags(); + + /* Now, if the current node is not an element with the same + tag name as the token, then this is a parse error. */ + // k + + /* Now, if the stack of open elements has an element in scope + whose tag name matches the tag name of the token, then pop + elements from the stack until that element has been popped from + the stack, and clear the list of active formatting elements up + to the last marker. */ + for($n = count($this->stack) - 1; $n >= 0; $n--) { + if($this->stack[$n]->nodeName === $token['name']) { + $n = -1; + } + + array_pop($this->stack); + } + + $marker = end(array_keys($this->a_formatting, self::MARKER, true)); + + for($n = count($this->a_formatting) - 1; $n > $marker; $n--) { + array_pop($this->a_formatting); + } + } + break; + + /* Or an end tag whose tag name is one of: "area", "basefont", + "bgsound", "br", "embed", "hr", "iframe", "image", "img", + "input", "isindex", "noembed", "noframes", "param", "select", + "spacer", "table", "textarea", "wbr" */ + case 'area': case 'basefont': case 'bgsound': case 'br': + case 'embed': case 'hr': case 'iframe': case 'image': + case 'img': case 'input': case 'isindex': case 'noembed': + case 'noframes': case 'param': case 'select': case 'spacer': + case 'table': case 'textarea': case 'wbr': + // Parse error. Ignore the token. + break; + + /* An end tag token not covered by the previous entries */ + default: + for($n = count($this->stack) - 1; $n >= 0; $n--) { + /* Initialise node to be the current node (the bottommost + node of the stack). */ + $node = end($this->stack); + + /* If node has the same tag name as the end tag token, + then: */ + if($token['name'] === $node->nodeName) { + /* Generate implied end tags. */ + $this->generateImpliedEndTags(); + + /* If the tag name of the end tag token does not + match the tag name of the current node, this is a + parse error. */ + // k + + /* Pop all the nodes from the current node up to + node, including node, then stop this algorithm. */ + for($x = count($this->stack) - $n; $x >= $n; $x--) { + array_pop($this->stack); + } + + } else { + $category = $this->getElementCategory($node); + + if($category !== self::SPECIAL && $category !== self::SCOPING) { + /* Otherwise, if node is in neither the formatting + category nor the phrasing category, then this is a + parse error. Stop this algorithm. The end tag token + is ignored. */ + return false; + } + } + } + break; + } + break; + } + } + + private function inTable($token) { + $clear = array('html', 'table'); + + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE */ + if($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { + /* Append the character to the current node. */ + $text = $this->dom->createTextNode($token['data']); + end($this->stack)->appendChild($text); + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the current node with the data + attribute set to the data given in the comment token. */ + $comment = $this->dom->createComment($token['data']); + end($this->stack)->appendChild($comment); + + /* A start tag whose tag name is "caption" */ + } elseif($token['type'] === HTML5::STARTTAG && + $token['name'] === 'caption') { + /* Clear the stack back to a table context. */ + $this->clearStackToTableContext($clear); + + /* Insert a marker at the end of the list of active + formatting elements. */ + $this->a_formatting[] = self::MARKER; + + /* Insert an HTML element for the token, then switch the + insertion mode to "in caption". */ + $this->insertElement($token); + $this->mode = self::IN_CAPTION; + + /* A start tag whose tag name is "colgroup" */ + } elseif($token['type'] === HTML5::STARTTAG && + $token['name'] === 'colgroup') { + /* Clear the stack back to a table context. */ + $this->clearStackToTableContext($clear); + + /* Insert an HTML element for the token, then switch the + insertion mode to "in column group". */ + $this->insertElement($token); + $this->mode = self::IN_CGROUP; + + /* A start tag whose tag name is "col" */ + } elseif($token['type'] === HTML5::STARTTAG && + $token['name'] === 'col') { + $this->inTable(array( + 'name' => 'colgroup', + 'type' => HTML5::STARTTAG, + 'attr' => array() + )); + + $this->inColumnGroup($token); + + /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */ + } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], + array('tbody', 'tfoot', 'thead'))) { + /* Clear the stack back to a table context. */ + $this->clearStackToTableContext($clear); + + /* Insert an HTML element for the token, then switch the insertion + mode to "in table body". */ + $this->insertElement($token); + $this->mode = self::IN_TBODY; + + /* A start tag whose tag name is one of: "td", "th", "tr" */ + } elseif($token['type'] === HTML5::STARTTAG && + in_array($token['name'], array('td', 'th', 'tr'))) { + /* Act as if a start tag token with the tag name "tbody" had been + seen, then reprocess the current token. */ + $this->inTable(array( + 'name' => 'tbody', + 'type' => HTML5::STARTTAG, + 'attr' => array() + )); + + return $this->inTableBody($token); + + /* A start tag whose tag name is "table" */ + } elseif($token['type'] === HTML5::STARTTAG && + $token['name'] === 'table') { + /* Parse error. Act as if an end tag token with the tag name "table" + had been seen, then, if that token wasn't ignored, reprocess the + current token. */ + $this->inTable(array( + 'name' => 'table', + 'type' => HTML5::ENDTAG + )); + + return $this->mainPhase($token); + + /* An end tag whose tag name is "table" */ + } elseif($token['type'] === HTML5::ENDTAG && + $token['name'] === 'table') { + /* If the stack of open elements does not have an element in table + scope with the same tag name as the token, this is a parse error. + Ignore the token. (innerHTML case) */ + if(!$this->elementInScope($token['name'], true)) { + return false; + + /* Otherwise: */ + } else { + /* Generate implied end tags. */ + $this->generateImpliedEndTags(); + + /* Now, if the current node is not a table element, then this + is a parse error. */ + // w/e + + /* Pop elements from this stack until a table element has been + popped from the stack. */ + while(true) { + $current = end($this->stack)->nodeName; + array_pop($this->stack); + + if($current === 'table') { + break; + } + } + + /* Reset the insertion mode appropriately. */ + $this->resetInsertionMode(); + } + + /* An end tag whose tag name is one of: "body", "caption", "col", + "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ + } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], + array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', + 'tfoot', 'th', 'thead', 'tr'))) { + // Parse error. Ignore the token. + + /* Anything else */ + } else { + /* Parse error. Process the token as if the insertion mode was "in + body", with the following exception: */ + + /* If the current node is a table, tbody, tfoot, thead, or tr + element, then, whenever a node would be inserted into the current + node, it must instead be inserted into the foster parent element. */ + if(in_array(end($this->stack)->nodeName, + array('table', 'tbody', 'tfoot', 'thead', 'tr'))) { + /* The foster parent element is the parent element of the last + table element in the stack of open elements, if there is a + table element and it has such a parent element. If there is no + table element in the stack of open elements (innerHTML case), + then the foster parent element is the first element in the + stack of open elements (the html element). Otherwise, if there + is a table element in the stack of open elements, but the last + table element in the stack of open elements has no parent, or + its parent node is not an element, then the foster parent + element is the element before the last table element in the + stack of open elements. */ + for($n = count($this->stack) - 1; $n >= 0; $n--) { + if($this->stack[$n]->nodeName === 'table') { + $table = $this->stack[$n]; + break; + } + } + + if(isset($table) && $table->parentNode !== null) { + $this->foster_parent = $table->parentNode; + + } elseif(!isset($table)) { + $this->foster_parent = $this->stack[0]; + + } elseif(isset($table) && ($table->parentNode === null || + $table->parentNode->nodeType !== XML_ELEMENT_NODE)) { + $this->foster_parent = $this->stack[$n - 1]; + } + } + + $this->inBody($token); + } + } + + private function inCaption($token) { + /* An end tag whose tag name is "caption" */ + if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') { + /* If the stack of open elements does not have an element in table + scope with the same tag name as the token, this is a parse error. + Ignore the token. (innerHTML case) */ + if(!$this->elementInScope($token['name'], true)) { + // Ignore + + /* Otherwise: */ + } else { + /* Generate implied end tags. */ + $this->generateImpliedEndTags(); + + /* Now, if the current node is not a caption element, then this + is a parse error. */ + // w/e + + /* Pop elements from this stack until a caption element has + been popped from the stack. */ + while(true) { + $node = end($this->stack)->nodeName; + array_pop($this->stack); + + if($node === 'caption') { + break; + } + } + + /* Clear the list of active formatting elements up to the last + marker. */ + $this->clearTheActiveFormattingElementsUpToTheLastMarker(); + + /* Switch the insertion mode to "in table". */ + $this->mode = self::IN_TABLE; + } + + /* A start tag whose tag name is one of: "caption", "col", "colgroup", + "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag + name is "table" */ + } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'], + array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', + 'thead', 'tr'))) || ($token['type'] === HTML5::ENDTAG && + $token['name'] === 'table')) { + /* Parse error. Act as if an end tag with the tag name "caption" + had been seen, then, if that token wasn't ignored, reprocess the + current token. */ + $this->inCaption(array( + 'name' => 'caption', + 'type' => HTML5::ENDTAG + )); + + return $this->inTable($token); + + /* An end tag whose tag name is one of: "body", "col", "colgroup", + "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ + } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], + array('body', 'col', 'colgroup', 'html', 'tbody', 'tfoot', 'th', + 'thead', 'tr'))) { + // Parse error. Ignore the token. + + /* Anything else */ + } else { + /* Process the token as if the insertion mode was "in body". */ + $this->inBody($token); + } + } + + private function inColumnGroup($token) { + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE */ + if($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { + /* Append the character to the current node. */ + $text = $this->dom->createTextNode($token['data']); + end($this->stack)->appendChild($text); + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the current node with the data + attribute set to the data given in the comment token. */ + $comment = $this->dom->createComment($token['data']); + end($this->stack)->appendChild($comment); + + /* A start tag whose tag name is "col" */ + } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') { + /* Insert a col element for the token. Immediately pop the current + node off the stack of open elements. */ + $this->insertElement($token); + array_pop($this->stack); + + /* An end tag whose tag name is "colgroup" */ + } elseif($token['type'] === HTML5::ENDTAG && + $token['name'] === 'colgroup') { + /* If the current node is the root html element, then this is a + parse error, ignore the token. (innerHTML case) */ + if(end($this->stack)->nodeName === 'html') { + // Ignore + + /* Otherwise, pop the current node (which will be a colgroup + element) from the stack of open elements. Switch the insertion + mode to "in table". */ + } else { + array_pop($this->stack); + $this->mode = self::IN_TABLE; + } + + /* An end tag whose tag name is "col" */ + } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') { + /* Parse error. Ignore the token. */ + + /* Anything else */ + } else { + /* Act as if an end tag with the tag name "colgroup" had been seen, + and then, if that token wasn't ignored, reprocess the current token. */ + $this->inColumnGroup(array( + 'name' => 'colgroup', + 'type' => HTML5::ENDTAG + )); + + return $this->inTable($token); + } + } + + private function inTableBody($token) { + $clear = array('tbody', 'tfoot', 'thead', 'html'); + + /* A start tag whose tag name is "tr" */ + if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') { + /* Clear the stack back to a table body context. */ + $this->clearStackToTableContext($clear); + + /* Insert a tr element for the token, then switch the insertion + mode to "in row". */ + $this->insertElement($token); + $this->mode = self::IN_ROW; + + /* A start tag whose tag name is one of: "th", "td" */ + } elseif($token['type'] === HTML5::STARTTAG && + ($token['name'] === 'th' || $token['name'] === 'td')) { + /* Parse error. Act as if a start tag with the tag name "tr" had + been seen, then reprocess the current token. */ + $this->inTableBody(array( + 'name' => 'tr', + 'type' => HTML5::STARTTAG, + 'attr' => array() + )); + + return $this->inRow($token); + + /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */ + } elseif($token['type'] === HTML5::ENDTAG && + in_array($token['name'], array('tbody', 'tfoot', 'thead'))) { + /* If the stack of open elements does not have an element in table + scope with the same tag name as the token, this is a parse error. + Ignore the token. */ + if(!$this->elementInScope($token['name'], true)) { + // Ignore + + /* Otherwise: */ + } else { + /* Clear the stack back to a table body context. */ + $this->clearStackToTableContext($clear); + + /* Pop the current node from the stack of open elements. Switch + the insertion mode to "in table". */ + array_pop($this->stack); + $this->mode = self::IN_TABLE; + } + + /* A start tag whose tag name is one of: "caption", "col", "colgroup", + "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */ + } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'], + array('caption', 'col', 'colgroup', 'tbody', 'tfoor', 'thead'))) || + ($token['type'] === HTML5::STARTTAG && $token['name'] === 'table')) { + /* If the stack of open elements does not have a tbody, thead, or + tfoot element in table scope, this is a parse error. Ignore the + token. (innerHTML case) */ + if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) { + // Ignore. + + /* Otherwise: */ + } else { + /* Clear the stack back to a table body context. */ + $this->clearStackToTableContext($clear); + + /* Act as if an end tag with the same tag name as the current + node ("tbody", "tfoot", or "thead") had been seen, then + reprocess the current token. */ + $this->inTableBody(array( + 'name' => end($this->stack)->nodeName, + 'type' => HTML5::ENDTAG + )); + + return $this->mainPhase($token); + } + + /* An end tag whose tag name is one of: "body", "caption", "col", + "colgroup", "html", "td", "th", "tr" */ + } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], + array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) { + /* Parse error. Ignore the token. */ + + /* Anything else */ + } else { + /* Process the token as if the insertion mode was "in table". */ + $this->inTable($token); + } + } + + private function inRow($token) { + $clear = array('tr', 'html'); + + /* A start tag whose tag name is one of: "th", "td" */ + if($token['type'] === HTML5::STARTTAG && + ($token['name'] === 'th' || $token['name'] === 'td')) { + /* Clear the stack back to a table row context. */ + $this->clearStackToTableContext($clear); + + /* Insert an HTML element for the token, then switch the insertion + mode to "in cell". */ + $this->insertElement($token); + $this->mode = self::IN_CELL; + + /* Insert a marker at the end of the list of active formatting + elements. */ + $this->a_formatting[] = self::MARKER; + + /* An end tag whose tag name is "tr" */ + } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') { + /* If the stack of open elements does not have an element in table + scope with the same tag name as the token, this is a parse error. + Ignore the token. (innerHTML case) */ + if(!$this->elementInScope($token['name'], true)) { + // Ignore. + + /* Otherwise: */ + } else { + /* Clear the stack back to a table row context. */ + $this->clearStackToTableContext($clear); + + /* Pop the current node (which will be a tr element) from the + stack of open elements. Switch the insertion mode to "in table + body". */ + array_pop($this->stack); + $this->mode = self::IN_TBODY; + } + + /* A start tag whose tag name is one of: "caption", "col", "colgroup", + "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */ + } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], + array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) { + /* Act as if an end tag with the tag name "tr" had been seen, then, + if that token wasn't ignored, reprocess the current token. */ + $this->inRow(array( + 'name' => 'tr', + 'type' => HTML5::ENDTAG + )); + + return $this->inCell($token); + + /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */ + } elseif($token['type'] === HTML5::ENDTAG && + in_array($token['name'], array('tbody', 'tfoot', 'thead'))) { + /* If the stack of open elements does not have an element in table + scope with the same tag name as the token, this is a parse error. + Ignore the token. */ + if(!$this->elementInScope($token['name'], true)) { + // Ignore. + + /* Otherwise: */ + } else { + /* Otherwise, act as if an end tag with the tag name "tr" had + been seen, then reprocess the current token. */ + $this->inRow(array( + 'name' => 'tr', + 'type' => HTML5::ENDTAG + )); + + return $this->inCell($token); + } + + /* An end tag whose tag name is one of: "body", "caption", "col", + "colgroup", "html", "td", "th" */ + } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], + array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) { + /* Parse error. Ignore the token. */ + + /* Anything else */ + } else { + /* Process the token as if the insertion mode was "in table". */ + $this->inTable($token); + } + } + + private function inCell($token) { + /* An end tag whose tag name is one of: "td", "th" */ + if($token['type'] === HTML5::ENDTAG && + ($token['name'] === 'td' || $token['name'] === 'th')) { + /* If the stack of open elements does not have an element in table + scope with the same tag name as that of the token, then this is a + parse error and the token must be ignored. */ + if(!$this->elementInScope($token['name'], true)) { + // Ignore. + + /* Otherwise: */ + } else { + /* Generate implied end tags, except for elements with the same + tag name as the token. */ + $this->generateImpliedEndTags(array($token['name'])); + + /* Now, if the current node is not an element with the same tag + name as the token, then this is a parse error. */ + // k + + /* Pop elements from this stack until an element with the same + tag name as the token has been popped from the stack. */ + while(true) { + $node = end($this->stack)->nodeName; + array_pop($this->stack); + + if($node === $token['name']) { + break; + } + } + + /* Clear the list of active formatting elements up to the last + marker. */ + $this->clearTheActiveFormattingElementsUpToTheLastMarker(); + + /* Switch the insertion mode to "in row". (The current node + will be a tr element at this point.) */ + $this->mode = self::IN_ROW; + } + + /* A start tag whose tag name is one of: "caption", "col", "colgroup", + "tbody", "td", "tfoot", "th", "thead", "tr" */ + } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], + array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', + 'thead', 'tr'))) { + /* If the stack of open elements does not have a td or th element + in table scope, then this is a parse error; ignore the token. + (innerHTML case) */ + if(!$this->elementInScope(array('td', 'th'), true)) { + // Ignore. + + /* Otherwise, close the cell (see below) and reprocess the current + token. */ + } else { + $this->closeCell(); + return $this->inRow($token); + } + + /* A start tag whose tag name is one of: "caption", "col", "colgroup", + "tbody", "td", "tfoot", "th", "thead", "tr" */ + } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], + array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', + 'thead', 'tr'))) { + /* If the stack of open elements does not have a td or th element + in table scope, then this is a parse error; ignore the token. + (innerHTML case) */ + if(!$this->elementInScope(array('td', 'th'), true)) { + // Ignore. + + /* Otherwise, close the cell (see below) and reprocess the current + token. */ + } else { + $this->closeCell(); + return $this->inRow($token); + } + + /* An end tag whose tag name is one of: "body", "caption", "col", + "colgroup", "html" */ + } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], + array('body', 'caption', 'col', 'colgroup', 'html'))) { + /* Parse error. Ignore the token. */ + + /* An end tag whose tag name is one of: "table", "tbody", "tfoot", + "thead", "tr" */ + } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], + array('table', 'tbody', 'tfoot', 'thead', 'tr'))) { + /* If the stack of open elements does not have an element in table + scope with the same tag name as that of the token (which can only + happen for "tbody", "tfoot" and "thead", or, in the innerHTML case), + then this is a parse error and the token must be ignored. */ + if(!$this->elementInScope($token['name'], true)) { + // Ignore. + + /* Otherwise, close the cell (see below) and reprocess the current + token. */ + } else { + $this->closeCell(); + return $this->inRow($token); + } + + /* Anything else */ + } else { + /* Process the token as if the insertion mode was "in body". */ + $this->inBody($token); + } + } + + private function inSelect($token) { + /* Handle the token as follows: */ + + /* A character token */ + if($token['type'] === HTML5::CHARACTR) { + /* Append the token's character to the current node. */ + $this->insertText($token['data']); + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the current node with the data + attribute set to the data given in the comment token. */ + $this->insertComment($token['data']); + + /* A start tag token whose tag name is "option" */ + } elseif($token['type'] === HTML5::STARTTAG && + $token['name'] === 'option') { + /* If the current node is an option element, act as if an end tag + with the tag name "option" had been seen. */ + if(end($this->stack)->nodeName === 'option') { + $this->inSelect(array( + 'name' => 'option', + 'type' => HTML5::ENDTAG + )); + } + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + /* A start tag token whose tag name is "optgroup" */ + } elseif($token['type'] === HTML5::STARTTAG && + $token['name'] === 'optgroup') { + /* If the current node is an option element, act as if an end tag + with the tag name "option" had been seen. */ + if(end($this->stack)->nodeName === 'option') { + $this->inSelect(array( + 'name' => 'option', + 'type' => HTML5::ENDTAG + )); + } + + /* If the current node is an optgroup element, act as if an end tag + with the tag name "optgroup" had been seen. */ + if(end($this->stack)->nodeName === 'optgroup') { + $this->inSelect(array( + 'name' => 'optgroup', + 'type' => HTML5::ENDTAG + )); + } + + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + /* An end tag token whose tag name is "optgroup" */ + } elseif($token['type'] === HTML5::ENDTAG && + $token['name'] === 'optgroup') { + /* First, if the current node is an option element, and the node + immediately before it in the stack of open elements is an optgroup + element, then act as if an end tag with the tag name "option" had + been seen. */ + $elements_in_stack = count($this->stack); + + if($this->stack[$elements_in_stack - 1]->nodeName === 'option' && + $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') { + $this->inSelect(array( + 'name' => 'option', + 'type' => HTML5::ENDTAG + )); + } + + /* If the current node is an optgroup element, then pop that node + from the stack of open elements. Otherwise, this is a parse error, + ignore the token. */ + if($this->stack[$elements_in_stack - 1] === 'optgroup') { + array_pop($this->stack); + } + + /* An end tag token whose tag name is "option" */ + } elseif($token['type'] === HTML5::ENDTAG && + $token['name'] === 'option') { + /* If the current node is an option element, then pop that node + from the stack of open elements. Otherwise, this is a parse error, + ignore the token. */ + if(end($this->stack)->nodeName === 'option') { + array_pop($this->stack); + } + + /* An end tag whose tag name is "select" */ + } elseif($token['type'] === HTML5::ENDTAG && + $token['name'] === 'select') { + /* If the stack of open elements does not have an element in table + scope with the same tag name as the token, this is a parse error. + Ignore the token. (innerHTML case) */ + if(!$this->elementInScope($token['name'], true)) { + // w/e + + /* Otherwise: */ + } else { + /* Pop elements from the stack of open elements until a select + element has been popped from the stack. */ + while(true) { + $current = end($this->stack)->nodeName; + array_pop($this->stack); + + if($current === 'select') { + break; + } + } + + /* Reset the insertion mode appropriately. */ + $this->resetInsertionMode(); + } + + /* A start tag whose tag name is "select" */ + } elseif($token['name'] === 'select' && + $token['type'] === HTML5::STARTTAG) { + /* Parse error. Act as if the token had been an end tag with the + tag name "select" instead. */ + $this->inSelect(array( + 'name' => 'select', + 'type' => HTML5::ENDTAG + )); + + /* An end tag whose tag name is one of: "caption", "table", "tbody", + "tfoot", "thead", "tr", "td", "th" */ + } elseif(in_array($token['name'], array('caption', 'table', 'tbody', + 'tfoot', 'thead', 'tr', 'td', 'th')) && $token['type'] === HTML5::ENDTAG) { + /* Parse error. */ + // w/e + + /* If the stack of open elements has an element in table scope with + the same tag name as that of the token, then act as if an end tag + with the tag name "select" had been seen, and reprocess the token. + Otherwise, ignore the token. */ + if($this->elementInScope($token['name'], true)) { + $this->inSelect(array( + 'name' => 'select', + 'type' => HTML5::ENDTAG + )); + + $this->mainPhase($token); + } + + /* Anything else */ + } else { + /* Parse error. Ignore the token. */ + } + } + + private function afterBody($token) { + /* Handle the token as follows: */ + + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE */ + if($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { + /* Process the token as it would be processed if the insertion mode + was "in body". */ + $this->inBody($token); + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the first element in the stack of open + elements (the html element), with the data attribute set to the + data given in the comment token. */ + $comment = $this->dom->createComment($token['data']); + $this->stack[0]->appendChild($comment); + + /* An end tag with the tag name "html" */ + } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') { + /* If the parser was originally created in order to handle the + setting of an element's innerHTML attribute, this is a parse error; + ignore the token. (The element will be an html element in this + case.) (innerHTML case) */ + + /* Otherwise, switch to the trailing end phase. */ + $this->phase = self::END_PHASE; + + /* Anything else */ + } else { + /* Parse error. Set the insertion mode to "in body" and reprocess + the token. */ + $this->mode = self::IN_BODY; + return $this->inBody($token); + } + } + + private function inFrameset($token) { + /* Handle the token as follows: */ + + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ + if($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { + /* Append the character to the current node. */ + $this->insertText($token['data']); + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the current node with the data + attribute set to the data given in the comment token. */ + $this->insertComment($token['data']); + + /* A start tag with the tag name "frameset" */ + } elseif($token['name'] === 'frameset' && + $token['type'] === HTML5::STARTTAG) { + $this->insertElement($token); + + /* An end tag with the tag name "frameset" */ + } elseif($token['name'] === 'frameset' && + $token['type'] === HTML5::ENDTAG) { + /* If the current node is the root html element, then this is a + parse error; ignore the token. (innerHTML case) */ + if(end($this->stack)->nodeName === 'html') { + // Ignore + + } else { + /* Otherwise, pop the current node from the stack of open + elements. */ + array_pop($this->stack); + + /* If the parser was not originally created in order to handle + the setting of an element's innerHTML attribute (innerHTML case), + and the current node is no longer a frameset element, then change + the insertion mode to "after frameset". */ + $this->mode = self::AFTR_FRAME; + } + + /* A start tag with the tag name "frame" */ + } elseif($token['name'] === 'frame' && + $token['type'] === HTML5::STARTTAG) { + /* Insert an HTML element for the token. */ + $this->insertElement($token); + + /* Immediately pop the current node off the stack of open elements. */ + array_pop($this->stack); + + /* A start tag with the tag name "noframes" */ + } elseif($token['name'] === 'noframes' && + $token['type'] === HTML5::STARTTAG) { + /* Process the token as if the insertion mode had been "in body". */ + $this->inBody($token); + + /* Anything else */ + } else { + /* Parse error. Ignore the token. */ + } + } + + private function afterFrameset($token) { + /* Handle the token as follows: */ + + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ + if($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { + /* Append the character to the current node. */ + $this->insertText($token['data']); + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the current node with the data + attribute set to the data given in the comment token. */ + $this->insertComment($token['data']); + + /* An end tag with the tag name "html" */ + } elseif($token['name'] === 'html' && + $token['type'] === HTML5::ENDTAG) { + /* Switch to the trailing end phase. */ + $this->phase = self::END_PHASE; + + /* A start tag with the tag name "noframes" */ + } elseif($token['name'] === 'noframes' && + $token['type'] === HTML5::STARTTAG) { + /* Process the token as if the insertion mode had been "in body". */ + $this->inBody($token); + + /* Anything else */ + } else { + /* Parse error. Ignore the token. */ + } + } + + private function trailingEndPhase($token) { + /* After the main phase, as each token is emitted from the tokenisation + stage, it must be processed as described in this section. */ + + /* A DOCTYPE token */ + if($token['type'] === HTML5::DOCTYPE) { + // Parse error. Ignore the token. + + /* A comment token */ + } elseif($token['type'] === HTML5::COMMENT) { + /* Append a Comment node to the Document object with the data + attribute set to the data given in the comment token. */ + $comment = $this->dom->createComment($token['data']); + $this->dom->appendChild($comment); + + /* A character token that is one of one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE */ + } elseif($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { + /* Process the token as it would be processed in the main phase. */ + $this->mainPhase($token); + + /* A character token that is not one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + or U+0020 SPACE. Or a start tag token. Or an end tag token. */ + } elseif(($token['type'] === HTML5::CHARACTR && + preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || + $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) { + /* Parse error. Switch back to the main phase and reprocess the + token. */ + $this->phase = self::MAIN_PHASE; + return $this->mainPhase($token); + + /* An end-of-file token */ + } elseif($token['type'] === HTML5::EOF) { + /* OMG DONE!! */ + } + } + + private function insertElement($token, $append = true, $check = false) { + // Proprietary workaround for libxml2's limitations with tag names + if ($check) { + // Slightly modified HTML5 tag-name modification, + // removing anything that's not an ASCII letter, digit, or hyphen + $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']); + // Remove leading hyphens and numbers + $token['name'] = ltrim($token['name'], '-0..9'); + // In theory, this should ever be needed, but just in case + if ($token['name'] === '') $token['name'] = 'span'; // arbitrary generic choice + } + + $el = $this->dom->createElement($token['name']); + + foreach($token['attr'] as $attr) { + if(!$el->hasAttribute($attr['name'])) { + $el->setAttribute($attr['name'], $attr['value']); + } + } + + $this->appendToRealParent($el); + $this->stack[] = $el; + + return $el; + } + + private function insertText($data) { + $text = $this->dom->createTextNode($data); + $this->appendToRealParent($text); + } + + private function insertComment($data) { + $comment = $this->dom->createComment($data); + $this->appendToRealParent($comment); + } + + private function appendToRealParent($node) { + if($this->foster_parent === null) { + end($this->stack)->appendChild($node); + + } elseif($this->foster_parent !== null) { + /* If the foster parent element is the parent element of the + last table element in the stack of open elements, then the new + node must be inserted immediately before the last table element + in the stack of open elements in the foster parent element; + otherwise, the new node must be appended to the foster parent + element. */ + for($n = count($this->stack) - 1; $n >= 0; $n--) { + if($this->stack[$n]->nodeName === 'table' && + $this->stack[$n]->parentNode !== null) { + $table = $this->stack[$n]; + break; + } + } + + if(isset($table) && $this->foster_parent->isSameNode($table->parentNode)) + $this->foster_parent->insertBefore($node, $table); + else + $this->foster_parent->appendChild($node); + + $this->foster_parent = null; + } + } + + private function elementInScope($el, $table = false) { + if(is_array($el)) { + foreach($el as $element) { + if($this->elementInScope($element, $table)) { + return true; + } + } + + return false; + } + + $leng = count($this->stack); + + for($n = 0; $n < $leng; $n++) { + /* 1. Initialise node to be the current node (the bottommost node of + the stack). */ + $node = $this->stack[$leng - 1 - $n]; + + if($node->tagName === $el) { + /* 2. If node is the target node, terminate in a match state. */ + return true; + + } elseif($node->tagName === 'table') { + /* 3. Otherwise, if node is a table element, terminate in a failure + state. */ + return false; + + } elseif($table === true && in_array($node->tagName, array('caption', 'td', + 'th', 'button', 'marquee', 'object'))) { + /* 4. Otherwise, if the algorithm is the "has an element in scope" + variant (rather than the "has an element in table scope" variant), + and node is one of the following, terminate in a failure state. */ + return false; + + } elseif($node === $node->ownerDocument->documentElement) { + /* 5. Otherwise, if node is an html element (root element), terminate + in a failure state. (This can only happen if the node is the topmost + node of the stack of open elements, and prevents the next step from + being invoked if there are no more elements in the stack.) */ + return false; + } + + /* Otherwise, set node to the previous entry in the stack of open + elements and return to step 2. (This will never fail, since the loop + will always terminate in the previous step if the top of the stack + is reached.) */ + } + } + + private function reconstructActiveFormattingElements() { + /* 1. If there are no entries in the list of active formatting elements, + then there is nothing to reconstruct; stop this algorithm. */ + $formatting_elements = count($this->a_formatting); + + if($formatting_elements === 0) { + return false; + } + + /* 3. Let entry be the last (most recently added) element in the list + of active formatting elements. */ + $entry = end($this->a_formatting); + + /* 2. If the last (most recently added) entry in the list of active + formatting elements is a marker, or if it is an element that is in the + stack of open elements, then there is nothing to reconstruct; stop this + algorithm. */ + if($entry === self::MARKER || in_array($entry, $this->stack, true)) { + return false; + } + + for($a = $formatting_elements - 1; $a >= 0; true) { + /* 4. If there are no entries before entry in the list of active + formatting elements, then jump to step 8. */ + if($a === 0) { + $step_seven = false; + break; + } + + /* 5. Let entry be the entry one earlier than entry in the list of + active formatting elements. */ + $a--; + $entry = $this->a_formatting[$a]; + + /* 6. If entry is neither a marker nor an element that is also in + thetack of open elements, go to step 4. */ + if($entry === self::MARKER || in_array($entry, $this->stack, true)) { + break; + } + } + + while(true) { + /* 7. Let entry be the element one later than entry in the list of + active formatting elements. */ + if(isset($step_seven) && $step_seven === true) { + $a++; + $entry = $this->a_formatting[$a]; + } + + /* 8. Perform a shallow clone of the element entry to obtain clone. */ + $clone = $entry->cloneNode(); + + /* 9. Append clone to the current node and push it onto the stack + of open elements so that it is the new current node. */ + end($this->stack)->appendChild($clone); + $this->stack[] = $clone; + + /* 10. Replace the entry for entry in the list with an entry for + clone. */ + $this->a_formatting[$a] = $clone; + + /* 11. If the entry for clone in the list of active formatting + elements is not the last entry in the list, return to step 7. */ + if(end($this->a_formatting) !== $clone) { + $step_seven = true; + } else { + break; + } + } + } + + private function clearTheActiveFormattingElementsUpToTheLastMarker() { + /* When the steps below require the UA to clear the list of active + formatting elements up to the last marker, the UA must perform the + following steps: */ + + while(true) { + /* 1. Let entry be the last (most recently added) entry in the list + of active formatting elements. */ + $entry = end($this->a_formatting); + + /* 2. Remove entry from the list of active formatting elements. */ + array_pop($this->a_formatting); + + /* 3. If entry was a marker, then stop the algorithm at this point. + The list has been cleared up to the last marker. */ + if($entry === self::MARKER) { + break; + } + } + } + + private function generateImpliedEndTags($exclude = array()) { + /* When the steps below require the UA to generate implied end tags, + then, if the current node is a dd element, a dt element, an li element, + a p element, a td element, a th element, or a tr element, the UA must + act as if an end tag with the respective tag name had been seen and + then generate implied end tags again. */ + $node = end($this->stack); + $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude); + + while(in_array(end($this->stack)->nodeName, $elements)) { + array_pop($this->stack); + } + } + + private function getElementCategory($node) { + $name = $node->tagName; + if(in_array($name, $this->special)) + return self::SPECIAL; + + elseif(in_array($name, $this->scoping)) + return self::SCOPING; + + elseif(in_array($name, $this->formatting)) + return self::FORMATTING; + + else + return self::PHRASING; + } + + private function clearStackToTableContext($elements) { + /* When the steps above require the UA to clear the stack back to a + table context, it means that the UA must, while the current node is not + a table element or an html element, pop elements from the stack of open + elements. If this causes any elements to be popped from the stack, then + this is a parse error. */ + while(true) { + $node = end($this->stack)->nodeName; + + if(in_array($node, $elements)) { + break; + } else { + array_pop($this->stack); + } + } + } + + private function resetInsertionMode() { + /* 1. Let last be false. */ + $last = false; + $leng = count($this->stack); + + for($n = $leng - 1; $n >= 0; $n--) { + /* 2. Let node be the last node in the stack of open elements. */ + $node = $this->stack[$n]; + + /* 3. If node is the first node in the stack of open elements, then + set last to true. If the element whose innerHTML attribute is being + set is neither a td element nor a th element, then set node to the + element whose innerHTML attribute is being set. (innerHTML case) */ + if($this->stack[0]->isSameNode($node)) { + $last = true; + } + + /* 4. If node is a select element, then switch the insertion mode to + "in select" and abort these steps. (innerHTML case) */ + if($node->nodeName === 'select') { + $this->mode = self::IN_SELECT; + break; + + /* 5. If node is a td or th element, then switch the insertion mode + to "in cell" and abort these steps. */ + } elseif($node->nodeName === 'td' || $node->nodeName === 'th') { + $this->mode = self::IN_CELL; + break; + + /* 6. If node is a tr element, then switch the insertion mode to + "in row" and abort these steps. */ + } elseif($node->nodeName === 'tr') { + $this->mode = self::IN_ROW; + break; + + /* 7. If node is a tbody, thead, or tfoot element, then switch the + insertion mode to "in table body" and abort these steps. */ + } elseif(in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) { + $this->mode = self::IN_TBODY; + break; + + /* 8. If node is a caption element, then switch the insertion mode + to "in caption" and abort these steps. */ + } elseif($node->nodeName === 'caption') { + $this->mode = self::IN_CAPTION; + break; + + /* 9. If node is a colgroup element, then switch the insertion mode + to "in column group" and abort these steps. (innerHTML case) */ + } elseif($node->nodeName === 'colgroup') { + $this->mode = self::IN_CGROUP; + break; + + /* 10. If node is a table element, then switch the insertion mode + to "in table" and abort these steps. */ + } elseif($node->nodeName === 'table') { + $this->mode = self::IN_TABLE; + break; + + /* 11. If node is a head element, then switch the insertion mode + to "in body" ("in body"! not "in head"!) and abort these steps. + (innerHTML case) */ + } elseif($node->nodeName === 'head') { + $this->mode = self::IN_BODY; + break; + + /* 12. If node is a body element, then switch the insertion mode to + "in body" and abort these steps. */ + } elseif($node->nodeName === 'body') { + $this->mode = self::IN_BODY; + break; + + /* 13. If node is a frameset element, then switch the insertion + mode to "in frameset" and abort these steps. (innerHTML case) */ + } elseif($node->nodeName === 'frameset') { + $this->mode = self::IN_FRAME; + break; + + /* 14. If node is an html element, then: if the head element + pointer is null, switch the insertion mode to "before head", + otherwise, switch the insertion mode to "after head". In either + case, abort these steps. (innerHTML case) */ + } elseif($node->nodeName === 'html') { + $this->mode = ($this->head_pointer === null) + ? self::BEFOR_HEAD + : self::AFTER_HEAD; + + break; + + /* 15. If last is true, then set the insertion mode to "in body" + and abort these steps. (innerHTML case) */ + } elseif($last) { + $this->mode = self::IN_BODY; + break; + } + } + } + + private function closeCell() { + /* If the stack of open elements has a td or th element in table scope, + then act as if an end tag token with that tag name had been seen. */ + foreach(array('td', 'th') as $cell) { + if($this->elementInScope($cell, true)) { + $this->inCell(array( + 'name' => $cell, + 'type' => HTML5::ENDTAG + )); + + break; + } + } + } + + public function save() { + return $this->dom; + } +} +?> diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php b/lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php index 1ca6271158..0698c4e457 100644 --- a/lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php +++ b/lib/htmlpurifier/HTMLPurifier/Strategy/MakeWellFormed.php @@ -7,31 +7,61 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy { /** - * Locally shared variable references + * Array stream of tokens being processed. */ - protected $inputTokens, $inputIndex, $outputTokens, $currentNesting, - $currentInjector, $injectors; + protected $tokens; + + /** + * Current index in $tokens. + */ + protected $t; + + /** + * Current nesting of elements. + */ + protected $stack; + + /** + * Injectors active in this stream processing. + */ + protected $injectors; + + /** + * Current instance of HTMLPurifier_Config. + */ + protected $config; + + /** + * Current instance of HTMLPurifier_Context. + */ + protected $context; public function execute($tokens, $config, $context) { $definition = $config->getHTMLDefinition(); // local variables - $result = array(); $generator = new HTMLPurifier_Generator($config, $context); $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags'); $e = $context->get('ErrorCollector', true); + $t = false; // token index + $i = false; // injector index + $token = false; // the current token + $reprocess = false; // whether or not to reprocess the same token + $stack = array(); // member variables - $this->currentNesting = array(); - $this->inputIndex = false; - $this->inputTokens =& $tokens; - $this->outputTokens =& $result; + $this->stack =& $stack; + $this->t =& $t; + $this->tokens =& $tokens; + $this->config = $config; + $this->context = $context; // context variables - $context->register('CurrentNesting', $this->currentNesting); - $context->register('InputIndex', $this->inputIndex); + $context->register('CurrentNesting', $stack); + $context->register('InputIndex', $t); $context->register('InputTokens', $tokens); + $context->register('CurrentToken', $token); // -- begin INJECTOR -- @@ -58,73 +88,119 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy $this->injectors[] = $injector; } - // array index of the injector that resulted in an array - // substitution. This enables processTokens() to know which - // injectors are affected by the added tokens and which are - // not (namely, the ones after the current injector are not - // affected) - $this->currentInjector = false; - // give the injectors references to the definition and context // variables for performance reasons - foreach ($this->injectors as $i => $injector) { + foreach ($this->injectors as $ix => $injector) { $error = $injector->prepare($config, $context); if (!$error) continue; - array_splice($this->injectors, $i, 1); // rm the injector + array_splice($this->injectors, $ix, 1); // rm the injector trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING); } - // warning: most foreach loops follow the convention $i => $injector. - // Don't define these as loop-wide variables, please! - // -- end INJECTOR -- - $token = false; - $context->register('CurrentToken', $token); + // a note on punting: + // In order to reduce code duplication, whenever some code needs + // to make HTML changes in order to make things "correct", the + // new HTML gets sent through the purifier, regardless of its + // status. This means that if we add a start token, because it + // was totally necessary, we don't have to update nesting; we just + // punt ($reprocess = true; continue;) and it does that for us. // isset is in loop because $tokens size changes during loop exec - for ($this->inputIndex = 0; isset($tokens[$this->inputIndex]); $this->inputIndex++) { - - // if all goes well, this token will be passed through unharmed - $token = $tokens[$this->inputIndex]; + for ( + $t = 0; + $t == 0 || isset($tokens[$t - 1]); + // only increment if we don't need to reprocess + $reprocess ? $reprocess = false : $t++ + ) { - //printTokens($tokens, $this->inputIndex); + // check for a rewind + if (is_int($i) && $i >= 0) { + // possibility: disable rewinding if the current token has a + // rewind set on it already. This would offer protection from + // infinite loop, but might hinder some advanced rewinding. + $rewind_to = $this->injectors[$i]->getRewind(); + if (is_int($rewind_to) && $rewind_to < $t) { + if ($rewind_to < 0) $rewind_to = 0; + while ($t > $rewind_to) { + $t--; + $prev = $tokens[$t]; + // indicate that other injectors should not process this token, + // but we need to reprocess it + unset($prev->skip[$i]); + $prev->rewind = $i; + if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->stack); + elseif ($prev instanceof HTMLPurifier_Token_End) $this->stack[] = $prev->start; + } + } + $i = false; + } - foreach ($this->injectors as $injector) { - if ($injector->skip > 0) $injector->skip--; + // handle case of document end + if (!isset($tokens[$t])) { + // kill processing if stack is empty + if (empty($this->stack)) break; + + // peek + $top_nesting = array_pop($this->stack); + $this->stack[] = $top_nesting; + + // send error + if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) { + $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting); + } + + // append, don't splice, since this is the end + $tokens[] = new HTMLPurifier_Token_End($top_nesting->name); + + // punt! + $reprocess = true; + continue; } + // if all goes well, this token will be passed through unharmed + $token = $tokens[$t]; + + //echo '
    '; + //printTokens($tokens, $t); + //var_dump($this->stack); + // quick-check: if it's not a tag, no need to process - if (empty( $token->is_tag )) { + if (empty($token->is_tag)) { if ($token instanceof HTMLPurifier_Token_Text) { - // injector handler code; duplicated for performance reasons - foreach ($this->injectors as $i => $injector) { - if (!$injector->skip) $injector->handleText($token); - if (is_array($token)) { - $this->currentInjector = $i; - break; - } - } + foreach ($this->injectors as $i => $injector) { + if (isset($token->skip[$i])) continue; + if ($token->rewind !== null && $token->rewind !== $i) continue; + $injector->handleText($token); + $this->processToken($token, $i); + $reprocess = true; + break; + } } - $this->processToken($token, $config, $context); + // another possibility is a comment continue; } - $info = $definition->info[$token->name]->child; + if (isset($definition->info[$token->name])) { + $type = $definition->info[$token->name]->child->type; + } else { + $type = false; // Type is unknown, treat accordingly + } // quick tag checks: anything that's *not* an end tag $ok = false; - if ($info->type === 'empty' && $token instanceof HTMLPurifier_Token_Start) { - // test if it claims to be a start tag but is empty + if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) { + // claims to be a start tag but is empty $token = new HTMLPurifier_Token_Empty($token->name, $token->attr); $ok = true; - } elseif ($info->type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) { + } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) { // claims to be empty but really is a start tag - $token = array( - new HTMLPurifier_Token_Start($token->name, $token->attr), - new HTMLPurifier_Token_End($token->name) - ); - $ok = true; + $this->swap(new HTMLPurifier_Token_End($token->name)); + $this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr)); + // punt (since we had to modify the input stream in a non-trivial way) + $reprocess = true; + continue; } elseif ($token instanceof HTMLPurifier_Token_Empty) { // real empty token $ok = true; @@ -132,62 +208,88 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // start tag // ...unless they also have to close their parent - if (!empty($this->currentNesting)) { + if (!empty($this->stack)) { - $parent = array_pop($this->currentNesting); - $parent_info = $definition->info[$parent->name]; + $parent = array_pop($this->stack); + $this->stack[] = $parent; - // this can be replaced with a more general algorithm: - // if the token is not allowed by the parent, auto-close - // the parent - if (!isset($parent_info->child->elements[$token->name])) { + if (isset($definition->info[$parent->name])) { + $elements = $definition->info[$parent->name]->child->getNonAutoCloseElements($config); + $autoclose = !isset($elements[$token->name]); + } else { + $autoclose = false; + } + + if ($autoclose) { if ($e) $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent); - // close the parent, then re-loop to reprocess token - $result[] = new HTMLPurifier_Token_End($parent->name); - $this->inputIndex--; + // insert parent end tag before this tag + $new_token = new HTMLPurifier_Token_End($parent->name); + $new_token->start = $parent; + $this->insertBefore($new_token); + $reprocess = true; continue; } - $this->currentNesting[] = $parent; // undo the pop } $ok = true; } - // injector handler code; duplicated for performance reasons if ($ok) { foreach ($this->injectors as $i => $injector) { - if (!$injector->skip) $injector->handleElement($token); - if (is_array($token)) { - $this->currentInjector = $i; - break; + if (isset($token->skip[$i])) continue; + if ($token->rewind !== null && $token->rewind !== $i) continue; + $injector->handleElement($token); + $this->processToken($token, $i); + $reprocess = true; + break; + } + if (!$reprocess) { + // ah, nothing interesting happened; do normal processing + $this->swap($token); + if ($token instanceof HTMLPurifier_Token_Start) { + $this->stack[] = $token; + } elseif ($token instanceof HTMLPurifier_Token_End) { + throw new HTMLPurifier_Exception('Improper handling of end tag in start code; possible error in MakeWellFormed'); } } - $this->processToken($token, $config, $context); continue; } // sanity check: we should be dealing with a closing tag - if (!$token instanceof HTMLPurifier_Token_End) continue; + if (!$token instanceof HTMLPurifier_Token_End) { + throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier'); + } // make sure that we have something open - if (empty($this->currentNesting)) { + if (empty($this->stack)) { if ($escape_invalid_tags) { if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text'); - $result[] = new HTMLPurifier_Token_Text( + $this->swap(new HTMLPurifier_Token_Text( $generator->generateFromToken($token) - ); - } elseif ($e) { - $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed'); + )); + } else { + $this->remove(); + if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed'); } + $reprocess = true; continue; } - // first, check for the simplest case: everything closes neatly - $current_parent = array_pop($this->currentNesting); + // first, check for the simplest case: everything closes neatly. + // Eventually, everything passes through here; if there are problems + // we modify the input stream accordingly and then punt, so that + // the tokens get processed again. + $current_parent = array_pop($this->stack); if ($current_parent->name == $token->name) { - $result[] = $token; + $token->start = $current_parent; foreach ($this->injectors as $i => $injector) { - $injector->notifyEnd($token); + if (isset($token->skip[$i])) continue; + if ($token->rewind !== null && $token->rewind !== $i) continue; + $injector->handleEnd($token); + $this->processToken($token, $i); + $this->stack[] = $current_parent; + $reprocess = true; + break; } continue; } @@ -195,47 +297,56 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // okay, so we're trying to close the wrong tag // undo the pop previous pop - $this->currentNesting[] = $current_parent; + $this->stack[] = $current_parent; // scroll back the entire nest, trying to find our tag. // (feature could be to specify how far you'd like to go) - $size = count($this->currentNesting); + $size = count($this->stack); // -2 because -1 is the last element, but we already checked that $skipped_tags = false; - for ($i = $size - 2; $i >= 0; $i--) { - if ($this->currentNesting[$i]->name == $token->name) { - // current nesting is modified - $skipped_tags = array_splice($this->currentNesting, $i); + for ($j = $size - 2; $j >= 0; $j--) { + if ($this->stack[$j]->name == $token->name) { + $skipped_tags = array_slice($this->stack, $j); break; } } - // we still didn't find the tag, so remove + // we didn't find the tag, so remove if ($skipped_tags === false) { if ($escape_invalid_tags) { - $result[] = new HTMLPurifier_Token_Text( + $this->swap(new HTMLPurifier_Token_Text( $generator->generateFromToken($token) - ); + )); if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text'); - } elseif ($e) { - $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed'); + } else { + $this->remove(); + if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed'); } + $reprocess = true; continue; } - // okay, we found it, close all the skipped tags - // note that skipped tags contains the element we need closed - for ($i = count($skipped_tags) - 1; $i >= 0; $i--) { - // please don't redefine $i! - if ($i && $e && !isset($skipped_tags[$i]->armor['MakeWellFormed_TagClosedError'])) { - $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$i]); - } - $result[] = $new_token = new HTMLPurifier_Token_End($skipped_tags[$i]->name); - foreach ($this->injectors as $injector) { - $injector->notifyEnd($new_token); + // do errors, in REVERSE $j order: a,b,c with
    + $c = count($skipped_tags); + if ($e) { + for ($j = $c - 1; $j > 0; $j--) { + // notice we exclude $j == 0, i.e. the current ending tag, from + // the errors... + if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) { + $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]); + } } } + // insert tags, in FORWARD $j order: c,b,a with
    + for ($j = 1; $j < $c; $j++) { + // ...as well as from the insertions + $new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name); + $new_token->start = $skipped_tags[$j]; + $this->insertBefore($new_token); + } + $reprocess = true; + continue; } $context->destroy('CurrentNesting'); @@ -243,59 +354,77 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy $context->destroy('InputIndex'); $context->destroy('CurrentToken'); - // we're at the end now, fix all still unclosed tags (this is - // duplicated from the end of the loop with some slight modifications) - // not using $skipped_tags since it would invariably be all of them - if (!empty($this->currentNesting)) { - for ($i = count($this->currentNesting) - 1; $i >= 0; $i--) { - // please don't redefine $i! - if ($e && !isset($this->currentNesting[$i]->armor['MakeWellFormed_TagClosedError'])) { - $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $this->currentNesting[$i]); - } - $result[] = $new_token = new HTMLPurifier_Token_End($this->currentNesting[$i]->name); - foreach ($this->injectors as $injector) { - $injector->notifyEnd($new_token); - } + unset($this->injectors, $this->stack, $this->tokens, $this->t); + return $tokens; + } + + /** + * Processes arbitrary token values for complicated substitution patterns. + * In general: + * + * If $token is an array, it is a list of tokens to substitute for the + * current token. These tokens then get individually processed. If there + * is a leading integer in the list, that integer determines how many + * tokens from the stream should be removed. + * + * If $token is a regular token, it is swapped with the current token. + * + * If $token is false, the current token is deleted. + * + * If $token is an integer, that number of tokens (with the first token + * being the current one) will be deleted. + * + * @param $token Token substitution value + * @param $injector Injector that performed the substitution; default is if + * this is not an injector related operation. + */ + protected function processToken($token, $injector = -1) { + + // normalize forms of token + if (is_object($token)) $token = array(1, $token); + if (is_int($token)) $token = array($token); + if ($token === false) $token = array(1); + if (!is_array($token)) throw new HTMLPurifier_Exception('Invalid token type from injector'); + if (!is_int($token[0])) array_unshift($token, 1); + if ($token[0] === 0) throw new HTMLPurifier_Exception('Deleting zero tokens is not valid'); + + // $token is now an array with the following form: + // array(number nodes to delete, new node 1, new node 2, ...) + + $delete = array_shift($token); + $old = array_splice($this->tokens, $this->t, $delete, $token); + + if ($injector > -1) { + // determine appropriate skips + $oldskip = isset($old[0]) ? $old[0]->skip : array(); + foreach ($token as $object) { + $object->skip = $oldskip; + $object->skip[$injector] = true; } } - unset($this->outputTokens, $this->injectors, $this->currentInjector, - $this->currentNesting, $this->inputTokens, $this->inputIndex); - - return $result; } - function processToken($token, $config, $context) { - if (is_array($token)) { - // the original token was overloaded by an injector, time - // to some fancy acrobatics - - // $this->inputIndex is decremented so that the entire set gets - // re-processed - array_splice($this->inputTokens, $this->inputIndex--, 1, $token); - - // adjust the injector skips based on the array substitution - if ($this->injectors) { - $offset = count($token); - for ($i = 0; $i <= $this->currentInjector; $i++) { - // because of the skip back, we need to add one more - // for uninitialized injectors. I'm not exactly - // sure why this is the case, but I think it has to - // do with the fact that we're decrementing skips - // before re-checking text - if (!$this->injectors[$i]->skip) $this->injectors[$i]->skip++; - $this->injectors[$i]->skip += $offset; - } - } - } elseif ($token) { - // regular case - $this->outputTokens[] = $token; - if ($token instanceof HTMLPurifier_Token_Start) { - $this->currentNesting[] = $token; - } elseif ($token instanceof HTMLPurifier_Token_End) { - array_pop($this->currentNesting); // not actually used - } - } + /** + * Inserts a token before the current token. Cursor now points to this token + */ + private function insertBefore($token) { + array_splice($this->tokens, $this->t, 0, array($token)); + } + + /** + * Removes current token. Cursor now points to new token occupying previously + * occupied space. + */ + private function remove() { + array_splice($this->tokens, $this->t, 1); + } + + /** + * Swap current token with new token. Cursor points to new token (no change). + */ + private function swap($token) { + $this->tokens[$this->t] = $token; } } diff --git a/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php b/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php index 165308dd0c..6a69299041 100644 --- a/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php +++ b/lib/htmlpurifier/HTMLPurifier/Strategy/RemoveForeignElements.php @@ -19,6 +19,9 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags'); $remove_invalid_img = $config->get('Core', 'RemoveInvalidImg'); + // currently only used to determine if comments should be kept + $trusted = $config->get('HTML', 'Trusted'); + $remove_script_contents = $config->get('Core', 'RemoveScriptContents'); $hidden_elements = $config->get('Core', 'HiddenElements'); @@ -125,6 +128,23 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy if ($textify_comments !== false) { $data = $token->data; $token = new HTMLPurifier_Token_Text($data); + } elseif ($trusted) { + // keep, but perform comment cleaning + if ($e) { + // perform check whether or not there's a trailing hyphen + if (substr($token->data, -1) == '-') { + $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed'); + } + } + $token->data = rtrim($token->data, '-'); + $found_double_hyphen = false; + while (strpos($token->data, '--') !== false) { + if ($e && !$found_double_hyphen) { + $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed'); + } + $found_double_hyphen = true; // prevent double-erroring + $token->data = str_replace('--', '-', $token->data); + } } else { // strip comments if ($e) $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed'); diff --git a/lib/htmlpurifier/HTMLPurifier/StringHashParser.php b/lib/htmlpurifier/HTMLPurifier/StringHashParser.php index e96897d136..e98271df48 100644 --- a/lib/htmlpurifier/HTMLPurifier/StringHashParser.php +++ b/lib/htmlpurifier/HTMLPurifier/StringHashParser.php @@ -78,6 +78,7 @@ class HTMLPurifier_StringHashParser if (strncmp('--', $line, 2) === 0) { // Multiline declaration $state = trim($line, '- '); + if (!isset($ret[$state])) $ret[$state] = ''; continue; } elseif (!$state) { $single = true; @@ -94,7 +95,6 @@ class HTMLPurifier_StringHashParser $single = false; $state = false; } else { - if (!isset($ret[$state])) $ret[$state] = ''; $ret[$state] .= "$line\n"; } } while (!feof($fh)); diff --git a/lib/htmlpurifier/HTMLPurifier/Token.php b/lib/htmlpurifier/HTMLPurifier/Token.php index 8803307bda..942a61d129 100644 --- a/lib/htmlpurifier/HTMLPurifier/Token.php +++ b/lib/htmlpurifier/HTMLPurifier/Token.php @@ -5,6 +5,7 @@ */ class HTMLPurifier_Token { public $line; /**< Line number node was on in source document. Null if unknown. */ + public $col; /**< Column of line node was on in source document. Null if unknown. */ /** * Lookup array of processing that this token is exempt from. @@ -13,17 +14,41 @@ class HTMLPurifier_Token { */ public $armor = array(); + /** + * Used during MakeWellFormed. + */ + public $skip; + public $rewind; + public function __get($n) { if ($n === 'type') { trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE); switch (get_class($this)) { - case 'HTMLPurifier_Token_Start': return 'start'; - case 'HTMLPurifier_Token_Empty': return 'empty'; - case 'HTMLPurifier_Token_End': return 'end'; - case 'HTMLPurifier_Token_Text': return 'text'; - case 'HTMLPurifier_Token_Comment': return 'comment'; + case 'HTMLPurifier_Token_Start': return 'start'; + case 'HTMLPurifier_Token_Empty': return 'empty'; + case 'HTMLPurifier_Token_End': return 'end'; + case 'HTMLPurifier_Token_Text': return 'text'; + case 'HTMLPurifier_Token_Comment': return 'comment'; default: return null; } } } + + /** + * Sets the position of the token in the source document. + */ + public function position($l = null, $c = null) { + $this->line = $l; + $this->col = $c; + } + + /** + * Convenience function for DirectLex settings line/col position. + */ + public function rawPosition($l, $c) { + if ($c === -1) $l++; + $this->line = $l; + $this->col = $c; + } + } diff --git a/lib/htmlpurifier/HTMLPurifier/Token/Comment.php b/lib/htmlpurifier/HTMLPurifier/Token/Comment.php index 1571a40dc1..6717478095 100644 --- a/lib/htmlpurifier/HTMLPurifier/Token/Comment.php +++ b/lib/htmlpurifier/HTMLPurifier/Token/Comment.php @@ -11,9 +11,10 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token * * @param $data String comment data. */ - public function __construct($data, $line = null) { + public function __construct($data, $line = null, $col = null) { $this->data = $data; $this->line = $line; + $this->col = $col; } } diff --git a/lib/htmlpurifier/HTMLPurifier/Token/End.php b/lib/htmlpurifier/HTMLPurifier/Token/End.php index fbf45e81cd..13cba031d9 100644 --- a/lib/htmlpurifier/HTMLPurifier/Token/End.php +++ b/lib/htmlpurifier/HTMLPurifier/Token/End.php @@ -9,5 +9,9 @@ */ class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag { - + /** + * Token that started this node. Added by MakeWellFormed. Please + * do not edit this! + */ + public $start; } diff --git a/lib/htmlpurifier/HTMLPurifier/Token/Tag.php b/lib/htmlpurifier/HTMLPurifier/Token/Tag.php index 43748f7008..795c40f665 100644 --- a/lib/htmlpurifier/HTMLPurifier/Token/Tag.php +++ b/lib/htmlpurifier/HTMLPurifier/Token/Tag.php @@ -33,7 +33,7 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token * @param $name String name. * @param $attr Associative array of attributes. */ - public function __construct($name, $attr = array(), $line = null) { + public function __construct($name, $attr = array(), $line = null, $col = null) { $this->name = ctype_lower($name) ? $name : strtolower($name); foreach ($attr as $key => $value) { // normalization only necessary when key is not lowercase @@ -49,5 +49,6 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token } $this->attr = $attr; $this->line = $line; + $this->col = $col; } } diff --git a/lib/htmlpurifier/HTMLPurifier/Token/Text.php b/lib/htmlpurifier/HTMLPurifier/Token/Text.php index 3942f8a0ae..02b53e5c84 100644 --- a/lib/htmlpurifier/HTMLPurifier/Token/Text.php +++ b/lib/htmlpurifier/HTMLPurifier/Token/Text.php @@ -21,10 +21,11 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token * * @param $data String parsed character data. */ - public function __construct($data, $line = null) { + public function __construct($data, $line = null, $col = null) { $this->data = $data; $this->is_whitespace = ctype_space($data); $this->line = $line; + $this->col = $col; } } diff --git a/lib/htmlpurifier/HTMLPurifier/URIFilter/DisableExternal.php b/lib/htmlpurifier/HTMLPurifier/URIFilter/DisableExternal.php index d48bce06bd..960e2b9b5a 100644 --- a/lib/htmlpurifier/HTMLPurifier/URIFilter/DisableExternal.php +++ b/lib/htmlpurifier/HTMLPurifier/URIFilter/DisableExternal.php @@ -5,7 +5,7 @@ class HTMLPurifier_URIFilter_DisableExternal extends HTMLPurifier_URIFilter public $name = 'DisableExternal'; protected $ourHostParts = false; public function prepare($config) { - $our_host = $config->get('URI', 'Host'); + $our_host = $config->getDefinition('URI')->host; if ($our_host !== null) $this->ourHostParts = array_reverse(explode('.', $our_host)); } public function filter(&$uri, $config, $context) { diff --git a/lib/htmlpurifier/HTMLPurifier/URIFilter/MakeAbsolute.php b/lib/htmlpurifier/HTMLPurifier/URIFilter/MakeAbsolute.php index 289db51ad3..8999826744 100644 --- a/lib/htmlpurifier/HTMLPurifier/URIFilter/MakeAbsolute.php +++ b/lib/htmlpurifier/HTMLPurifier/URIFilter/MakeAbsolute.php @@ -51,12 +51,18 @@ class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter } if ($uri->path === '') { $uri->path = $this->base->path; - }elseif ($uri->path[0] !== '/') { + } elseif ($uri->path[0] !== '/') { // relative path, needs more complicated processing $stack = explode('/', $uri->path); $new_stack = array_merge($this->basePathStack, $stack); + if ($new_stack[0] !== '' && !is_null($this->base->host)) { + array_unshift($new_stack, ''); + } $new_stack = $this->_collapseStack($new_stack); $uri->path = implode('/', $new_stack); + } else { + // absolute path, but still we should collapse + $uri->path = implode('/', $this->_collapseStack(explode('/', $uri->path))); } // re-combine $uri->scheme = $this->base->scheme; @@ -71,6 +77,7 @@ class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter */ private function _collapseStack($stack) { $result = array(); + $is_folder = false; for ($i = 0; isset($stack[$i]); $i++) { $is_folder = false; // absorb an internally duplicated slash diff --git a/lib/htmlpurifier/HTMLPurifier/URIFilter/Munge.php b/lib/htmlpurifier/HTMLPurifier/URIFilter/Munge.php index ad6578e669..d82d2b9203 100644 --- a/lib/htmlpurifier/HTMLPurifier/URIFilter/Munge.php +++ b/lib/htmlpurifier/HTMLPurifier/URIFilter/Munge.php @@ -28,7 +28,11 @@ class HTMLPurifier_URIFilter_Munge extends HTMLPurifier_URIFilter $this->replace = array_map('rawurlencode', $this->replace); $new_uri = strtr($this->target, $this->replace); - $uri = $this->parser->parse($new_uri); // overwrite + $new_uri = $this->parser->parse($new_uri); + // don't redirect if the target host is the same as the + // starting host + if ($uri->host === $new_uri->host) return true; + $uri = $new_uri; // overwrite return true; } diff --git a/lib/htmlpurifier/HTMLPurifier/UnitConverter.php b/lib/htmlpurifier/HTMLPurifier/UnitConverter.php index 6e3046102b..d69a132b26 100644 --- a/lib/htmlpurifier/HTMLPurifier/UnitConverter.php +++ b/lib/htmlpurifier/HTMLPurifier/UnitConverter.php @@ -234,6 +234,18 @@ class HTMLPurifier_UnitConverter * Scales a float to $scale digits right of decimal point, like BCMath. */ private function scale($r, $scale) { + if ($scale < 0) { + // The f sprintf type doesn't support negative numbers, so we + // need to cludge things manually. First get the string. + $r = sprintf('%.0f', (float) $r); + // Due to floating point precision loss, $r will more than likely + // look something like 4652999999999.9234. We grab one more digit + // than we need to precise from $r and then use that to round + // appropriately. + $precise = (string) round(substr($r, 0, strlen($r) + $scale), -1); + // Now we return it, truncating the zero that was rounded off. + return substr($precise, 0, -1) . str_repeat('0', -$scale + 1); + } return sprintf('%.' . $scale . 'f', (float) $r); } diff --git a/lib/htmlpurifier/readme_moodle.txt b/lib/htmlpurifier/readme_moodle.txt index 99e67dbc4f..c9708ae0cd 100644 --- a/lib/htmlpurifier/readme_moodle.txt +++ b/lib/htmlpurifier/readme_moodle.txt @@ -1,4 +1,4 @@ -Description of HTML Purifier v3.1.1 library import into Moodle +Description of HTML Purifier v3.2.0 library import into Moodle Changes: * HMLTModule/Text.php - added , , and tags -- 2.39.5