From c2e916df0fca5e23d46b51e607b137ca922d182a Mon Sep 17 00:00:00 2001 From: garvinhicking Date: Tue, 27 Sep 2005 13:16:14 +0000 Subject: [PATCH] Fix the whole UTF-8 Onyx/XML parsing logic to properly work. PHP5 did not need xml_parser_create() to specify a charset, but PHP4 does. PHP5 made errors when feeds in UTF-8 were double-recoded because of a xml_parser_set_option() call without specifying the charset in xml_parser_create. Hope this works. ;) --- bundled-libs/Onyx/RSS.php | 11 ++++--- docs/NEWS | 5 ++++ include/admin/import.inc.php | 15 +++------- include/admin/importers/generic.inc.php | 2 +- .../serendipity_event_spartacus.php | 15 ++++++++-- .../serendipity_event_xhtmlcleanup.php | 2 +- .../serendipity_plugin_remoterss.php | 29 ++++++++++--------- serendipity_config.inc.php | 8 +++++ 8 files changed, 55 insertions(+), 32 deletions(-) diff --git a/bundled-libs/Onyx/RSS.php b/bundled-libs/Onyx/RSS.php index aa8d782..11c2cb6 100644 --- a/bundled-libs/Onyx/RSS.php +++ b/bundled-libs/Onyx/RSS.php @@ -51,14 +51,14 @@ class ONYX_RSS * private $type; */ - function ONYX_RSS() + function ONYX_RSS($charset = 'UTF-8') { - $this->__construct(); + $this->__construct($charset); } // Forward compatibility with PHP v.5 // http://www.phpvolcano.com/eide/php5.php?page=start - function __construct() + function __construct($charset = 'UTF-8') { $this->conf = array(); $this->conf['error'] = '
Error on line %s of '.__FILE__.': %s
'; @@ -73,7 +73,10 @@ class ONYX_RSS return false; } - $this->parser = @xml_parser_create(); + if ($charset == 'native') { + $charset = LANG_CHARSET; + } + $this->parser = @xml_parser_create($charset); if (!is_resource($this->parser)) { $this->raiseError((__LINE__-3), ONYX_ERR_NO_PARSER); diff --git a/docs/NEWS b/docs/NEWS index 41d8f9a..ecb7c73 100644 --- a/docs/NEWS +++ b/docs/NEWS @@ -3,6 +3,11 @@ Version 0.9 () ------------------------------------------------------------------------ + * Change Onyx RSS parser and xml_parser_* functions to already specify + the source charset, so that PHP functions can do the recoding on + their own. Functionality differed on PHP4 and PHP5, this has now + been unified. Thanks a lot to W-Mark Kubacki! + * Enhance XHTML Cleanup plugin to recode double-encoded UTF-8 HTML entities on NON-UTF8 blogs. Patch thanks to W-Mark Kubacki! diff --git a/include/admin/import.inc.php b/include/admin/import.inc.php index 7bb4f7f..1d4935a 100644 --- a/include/admin/import.inc.php +++ b/include/admin/import.inc.php @@ -42,20 +42,13 @@ class Serendipity_Import { } function &decode($string) { - static $phpCharset = null; - $target = $this->data['charset']; - - if ($phpCharset === null) { - $phpCharset = version_compare(phpversion(), '4.3.11', '>='); - } - - if ($phpCharset == 1) { - // Luckily PHP5 supports - // xml_parser_set_option($this->parser, XML_OPTION_TARGET_ENCODING, LANG_CHARSET); - // which means we need no transcoding here. 
+ // The xml_parser_* functions already do the recoding when LANG_CHARSET is ISO-8859-1 or UTF-8 + if (LANG_CHARSET == 'ISO-8859-1' || LANG_CHARSET == 'UTF-8') { return $string; } + $target = $this->data['charset']; + switch($target) { case 'native': return $string; diff --git a/include/admin/importers/generic.inc.php b/include/admin/importers/generic.inc.php index 216a383..9d52b29 100644 --- a/include/admin/importers/generic.inc.php +++ b/include/admin/importers/generic.inc.php @@ -127,7 +127,7 @@ class Serendipity_Import_Generic extends Serendipity_Import { function import() { global $serendipity; - $c = &new Onyx_RSS(); + $c = &new Onyx_RSS($this->data['charset']); $c->parse($this->data['url']); $this->data['encoding'] = $c->rss['encoding']; diff --git a/plugins/serendipity_event_spartacus/serendipity_event_spartacus.php b/plugins/serendipity_event_spartacus/serendipity_event_spartacus.php index 983553c..a0a2490 100644 --- a/plugins/serendipity_event_spartacus/serendipity_event_spartacus.php +++ b/plugins/serendipity_event_spartacus/serendipity_event_spartacus.php @@ -259,6 +259,11 @@ class serendipity_event_spartacus extends serendipity_event } function decode(&$data) { + // The xml_parser_* functions already do the recoding when LANG_CHARSET is ISO-8859-1 or UTF-8 + if (LANG_CHARSET == 'ISO-8859-1' || LANG_CHARSET == 'UTF-8') { + return true; + } + switch (strtolower(LANG_CHARSET)) { case 'utf-8': // The XML file is UTF-8 format. No changes needed. 
@@ -326,10 +331,15 @@ class serendipity_event_spartacus extends serendipity_event } // XML functions - $xml_string = ''; + $xml_string = ''; if (preg_match('@(<\?xml.+\?>)@imsU', $xml, $xml_head)) { $xml_string = $xml_head[1]; } + + $encoding = 'UTF-8'; + if (preg_match('@encoding="([^"]+)"@', $xml_string, $xml_encoding)) { + $encoding = $xml_encoding[1]; + } preg_match_all('@(.*)@imsU', $xml, $xml_matches); if (!is_array($xml_matches)) { @@ -347,8 +357,9 @@ class serendipity_event_spartacus extends serendipity_event foreach($xml_matches[0] as $xml_index => $xml_package) { $i = 0; - $p = xml_parser_create(); + $p = xml_parser_create($encoding); xml_parser_set_option($p, XML_OPTION_CASE_FOLDING, 0); + @xml_parser_set_option($p, XML_OPTION_TARGET_ENCODING, LANG_CHARSET); $xml_package = $xml_string . "\n" . $xml_package; xml_parse_into_struct($p, $xml_package, $vals); xml_parser_free($p); diff --git a/plugins/serendipity_event_xhtmlcleanup/serendipity_event_xhtmlcleanup.php b/plugins/serendipity_event_xhtmlcleanup/serendipity_event_xhtmlcleanup.php index b1986c5..5b9299f 100644 --- a/plugins/serendipity_event_xhtmlcleanup/serendipity_event_xhtmlcleanup.php +++ b/plugins/serendipity_event_xhtmlcleanup/serendipity_event_xhtmlcleanup.php @@ -188,7 +188,7 @@ class serendipity_event_xhtmlcleanup extends serendipity_event $val = &$this->cleanup_val; // Instead of nasty regex-mangling we use the XML parser to get the attribute list of our input tag - $p = xml_parser_create(); + $p = xml_parser_create(LANG_CHARSET); @xml_parse_into_struct($p, $data[0], $vals, $index); xml_parser_free($p); diff --git a/plugins/serendipity_plugin_remoterss/serendipity_plugin_remoterss.php b/plugins/serendipity_plugin_remoterss/serendipity_plugin_remoterss.php index b3813ed..85eea67 100644 --- a/plugins/serendipity_plugin_remoterss/serendipity_plugin_remoterss.php +++ b/plugins/serendipity_plugin_remoterss/serendipity_plugin_remoterss.php @@ -83,7 +83,17 @@ class s9y_remoterss_XMLTree 
{ $data = str_replace('', '', $data); // XML functions - $p = xml_parser_create(); + $xml_string = ''; + if (preg_match('@(<\?xml.+\?>)@imsU', $data, $xml_head)) { + $xml_string = $xml_head[1]; + } + + $encoding = 'UTF-8'; + if (preg_match('@encoding="([^"]+)"@', $xml_string, $xml_encoding)) { + $encoding = $xml_encoding[1]; + } + + $p = xml_parser_create($encoding); // by: anony@mous.com - meets XML 1.0 specification @xml_parser_set_option($p, XML_OPTION_CASE_FOLDING, 0); xml_parser_set_option($p, XML_OPTION_TARGET_ENCODING, LANG_CHARSET); @@ -386,6 +396,7 @@ class serendipity_plugin_remoterss extends serendipity_plugin { $feedtype = $this->get_config('feedtype', 'rss'); $markup = $this->get_config('markup', 'false'); $bulletimg = $this->get_config('bulletimg'); + $charset = $this->get_config('charset', 'native'); if (!$number || !is_numeric($number) || $number < 1) { $showAll = true; @@ -407,7 +418,7 @@ class serendipity_plugin_remoterss extends serendipity_plugin { if ($feedtype == 'rss') { require_once S9Y_PEAR_PATH . 'Onyx/RSS.php'; - $c = &new Onyx_RSS(); + $c = &new Onyx_RSS($charset); $c->parse($rssuri); $this->encoding = $c->rss['encoding']; @@ -529,21 +540,13 @@ class serendipity_plugin_remoterss extends serendipity_plugin { } function &decode($string) { - static $phpCharset = null; - - if ($phpCharset === null) { - $phpCharset = version_compare(phpversion(), '4.3.11', '>='); - } + $target = $this->get_config('charset', 'native'); - if ($phpCharset == 1) { - // Luckily PHP5 supports - // xml_parser_set_option($this->parser, XML_OPTION_TARGET_ENCODING, LANG_CHARSET); - // which means we need no transcoding here. 
+ // The xml_parser_* functions already do the recoding when LANG_CHARSET is ISO-8859-1 or UTF-8 + if (LANG_CHARSET == 'ISO-8859-1' || LANG_CHARSET == 'UTF-8') { return $string; } - $target = $this->get_config('charset', 'native'); - switch($target) { case 'native': return $string; diff --git a/serendipity_config.inc.php b/serendipity_config.inc.php index 04a6bb4..6bbf564 100644 --- a/serendipity_config.inc.php +++ b/serendipity_config.inc.php @@ -227,6 +227,14 @@ serendipity_permalinkPatterns(); */ include(S9Y_INCLUDE_PATH . 'include/lang.inc.php'); +/* + * Reset charset definition now that final language is known + */ +$serendipity['charsets'] = array( + 'UTF-8/' => 'UTF-8', + '' => CHARSET_NATIVE +); + /* * Set current locale, if any has been defined */ -- 2.39.5