From: fmarier Date: Wed, 17 Dec 2008 02:55:33 +0000 (+0000) Subject: MDL-17542 weblib/html2text: replace html2text.php with a GPL alternative (which is... X-Git-Url: http://git.mjollnir.org/gw?a=commitdiff_plain;h=588acd0608f8af6e43da4c8bed50c2bbd8a78060;p=moodle.git MDL-17542 weblib/html2text: replace html2text.php with a GPL alternative (which is also faster) --- diff --git a/lib/html2text.php b/lib/html2text.php index 219efaa7ef..d298ee2e4b 100644 --- a/lib/html2text.php +++ b/lib/html2text.php @@ -1,167 +1,595 @@ * + * All rights reserved. * + * * + * This script is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * The GNU General Public License can be found at * + * http://www.gnu.org/copyleft/gpl.html. * + * * + * This script is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * Author(s): Jon Abernathy * + * * + * Last modified: 08/08/07 * + * * + *************************************************************************/ + + +/** + * Takes HTML and converts it to formatted, plain text. + * + * Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and + * correcting an error in the regexp search array. Fixed 7/30/03. + * + * Updated set_html() function's file reading mechanism, 9/25/03. + * + * Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding + * several more HTML entity codes to the $search and $replace arrays. + * Updated 11/7/03. + * + * Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for + * suggesting the addition of $allowed_tags and its supporting function + * (which I slightly modified). Updated 3/12/04. 
+ * + * Thanks to Justin Dearing for pointing out that a replacement for the + * <TITLE> tag was missing, and suggesting an appropriate fix. + * Updated 8/25/04. + * + * Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a + * display/formatting bug in the _build_link_list() function: email + * readers would show the left bracket and number ("[1") as part of the + * rendered email address. + * Updated 12/16/04. + * + * Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code + * to handle relative links, which I hadn't considered. I modified his + * code a bit to handle normal HTTP links and MAILTO links. Also for + * suggesting three additional HTML entity codes to search for. + * Updated 03/02/05. + * + * Thanks to Jacob Chandler for pointing out another link condition + * for the _build_link_list() function: "https". + * Updated 04/06/05. + * + * Thanks to Marc Bertrand (http://www.dresdensky.com/) for + * suggesting a revision to the word wrapping functionality; if you + * specify a $width of 0 or less, word wrapping will be ignored. + * Updated 11/02/06. + * + * *** Big housecleaning updates below: + * + * Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for + * suggesting the fix to handle </li> and blank lines (whitespace). + * Christian Basedau (http://www.movetheweb.de/) also suggested the + * blank lines fix. + * + * Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/), + * Christian Basedau, Norbert Laposa (http://ln5.co.uk/), + * Bas van de Weijer, and Marijn van Butselaar + * for pointing out my glaring error in the <th> handling. Marcus also + * supplied a host of fixes. + * + * Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing + * out that extra spaces should be compressed--a problem addressed with + * Marcus Bointon's fixes but that I had not yet incorporated. + * + * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for + * suggesting a valuable fix with <a> tag handling. 
+ * + * Thanks to Wojciech Bajon (again!) for suggesting fixes and additions, + * including the tag handling that Daniel Schledermann pointed + * out but that I had not yet incorporated. I haven't (yet) + * incorporated all of Wojciech's changes, though I may at some + * future time. + * + * *** End of the housecleaning updates. Updated 08/08/07. + * + * @author Jon Abernathy + * @version 1.0.0 + * @since PHP 4.0.2 + */ +class html2text +{ + + /** + * Contains the HTML content to convert. + * + * @var string $html + * @access public + */ + var $html; + + /** + * Contains the converted, formatted text. + * + * @var string $text + * @access public + */ + var $text; + + /** + * Maximum width of the formatted text, in columns. + * + * Set this value to 0 (or less) to ignore word wrapping + * and not constrain text to a fixed-width column. + * + * @var integer $width + * @access public + */ + var $width = 70; + + /** + * List of preg* regular expression patterns to search for, + * used in conjunction with $replace. + * + * @var array $search + * @access public + * @see $replace + */ + var $search = array( + "/\r/", // Non-legal carriage return + "/[\n\t]+/", // Newlines and tabs + '/[ ]{2,}/', // Runs of spaces, pre-handling + '/]*>.*?<\/script>/i', // ', $x ) + 8; // Moodle - $chr = ''; - } else if (!$is_open_tb) { - $is_open_tb = true; - } else { - $chr = '<'; - } - break; - - case '>': - if ( !$is_open_tb || $is_open_dq || $is_open_sq ) { - $chr = '>'; - } else { - $is_open_tb = false; - } - break; - - case '"': - if ( $is_open_tb && !$is_open_dq && !$is_open_sq ) { - $is_open_dq = true; - } else if ( $is_open_tb && $is_open_dq && !$is_open_sq ) { - $is_open_dq = false; - } else { - $chr = '"'; - } - break; - - case "'": - if ( $is_open_tb && !$is_open_dq && !$is_open_sq ) { - $is_open_sq = true; - } else if ( $is_open_tb && !$is_open_dq && $is_open_sq ) { - $is_open_sq = false; - } - break; - } - $goodStr .= $chr; + /** + * Returns the text, converted from HTML. 
+ * + * @access public + * @return string + */ + function get_text() + { + if ( !$this->_converted ) { + $this->_convert(); } - } // Moodle - //now that the page is valid (I hope) for strip_tags, strip all unwanted tags + return $this->text; + } - $goodStr = strip_tags( $goodStr, '<hr><h1><h2><h3><h4><h5><h6><div><p><pre><sup><ul><ol><br><dl><dt><table><caption><tr><li><dd><th><td><a><area><img><form><input><textarea><button><select><option>' ); + /** + * Prints the text, converted from HTML. + * + * @access public + * @return void + */ + function print_text() + { + print $this->get_text(); + } - //strip extra whitespace except between <pre> and <textarea> tags + /** + * Alias to print_text(), operates identically. + * + * @access public + * @return void + * @see print_text() + */ + function p() + { + print $this->get_text(); + } - $badStr = preg_split( "/<\/?pre[^>]*>/i", $goodStr ); + /** + * Sets the allowed HTML tags to pass through to the resulting text. + * + * Tags should be in the form "<p>", with no corresponding closing tag. + * + * @access public + * @return void + */ + function set_allowed_tags( $allowed_tags = '' ) + { + if ( !empty($allowed_tags) ) { + $this->allowed_tags = $allowed_tags; + } + } - for ( $x = 0; isset($badStr[$x]) && is_string( $badStr[$x] ); $x++ ) { // Moodle: added isset() test - if ( $x % 2 ) { $badStr[$x] = '<pre>'.$badStr[$x].'</pre>'; } else { - $goodStr = preg_split( "/<\/?textarea[^>]*>/i", $badStr[$x] ); - for ( $z = 0; isset($goodStr[$z]) && is_string( $goodStr[$z] ); $z++ ) { // Moodle: added isset() test - if ( $z % 2 ) { $goodStr[$z] = '<textarea>'.$goodStr[$z].'</textarea>'; } else { - $goodStr[$z] = str_replace(' ', ' ', $goodStr[$z] ); - } + /** + * Sets a base URL to handle relative links. + * + * @access public + * @return void + */ + function set_base_url( $url = '' ) + { + if ( empty($url) ) { + if ( !empty($_SERVER['HTTP_HOST']) ) { + $this->url = 'http://' . 
$_SERVER['HTTP_HOST']; + } else { + $this->url = ''; + } + } else { + // Strip any trailing slashes for consistency (relative + // URLs may already start with a slash like "/file.html") + if ( substr($url, -1) == '/' ) { + $url = substr($url, 0, -1); } - $badStr[$x] = implode('',$goodStr); + $this->url = $url; } } - $goodStr = implode('',$badStr); - - //remove all options from select inputs - - $goodStr = preg_replace( "/<option[^>]*>[^<]*/i", '', $goodStr ); - - //replace all tags with their text equivalents - - $goodStr = preg_replace( "/<(\/title|hr)[^>]*>/i", "\n --------------------\n", $goodStr ); - - $goodStr = preg_replace( "/<(h|div|p)[^>]*>/i", "\n\n", $goodStr ); - - $goodStr = preg_replace( "/<sup[^>]*>/i", '^', $goodStr ); - - $goodStr = preg_replace( "/<(ul|ol|br|dl|dt|table|caption|\/textarea|tr[^>]*>\s*<(td|th))[^>]*>/i", "\n", $goodStr ); - - $goodStr = preg_replace( "/<li[^>]*>/i", "\n� ", $goodStr ); - - $goodStr = preg_replace( "/<dd[^>]*>/i", "\n\t", $goodStr ); - - $goodStr = preg_replace( "/<(th|td)[^>]*>/i", "\t", $goodStr ); - - // $goodStr = preg_replace( "/<a[^>]* href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>/i", "[LINK: $2$4$6] ", $goodStr ); // Moodle - $goodStr = preg_replace( "/<a\s[^>]*href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>([^<]*)<\/a>/i", "$7 [$2$4$6]", $goodStr ); - - // $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "[IMAGE: $2$3$4] ", $goodStr ); // Moodle - $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "[$2$3$4] ", $goodStr ); - - $goodStr = preg_replace( "/<form[^>]* action=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "\n[FORM: $2$3$4] ", $goodStr ); - - $goodStr = preg_replace( "/<(input|textarea|button|select)[^>]*>/i", "[INPUT] ", $goodStr ); - - //strip all remaining tags (mostly 
closing tags) - - $goodStr = strip_tags( $goodStr ); - - //convert HTML entities - - $goodStr = strtr( $goodStr, array_flip( get_html_translation_table( HTML_ENTITIES ) ) ); + /** + * Workhorse function that does actual conversion. + * + * First performs custom tag replacement specified by $search and + * $replace arrays. Then strips any remaining HTML tags, reduces whitespace + * and newlines to a readable format, and word wraps the text to + * $width characters. + * + * @access private + * @return void + */ + function _convert() + { + // Variables used for building the link list + $this->_link_count = 0; + $this->_link_list = ''; + + $text = trim(stripslashes($this->html)); + + // Convert <PRE> + $this->_convert_pre($text); + + // Replace known html entities + $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8'); + + // Run our defined search-and-replace + $text = preg_replace($this->search, $this->replace, $text); + $text = preg_replace_callback($this->callback_search, array('html2text', '_preg_callback'), $text); + + // Strip any other HTML tags + $text = strip_tags($text, $this->allowed_tags); + + // Bring down number of empty lines to 2 max + $text = preg_replace("/\n\s+\n/", "\n\n", $text); + $text = preg_replace("/[\n]{3,}/", "\n\n", $text); + + // Add link list + if ( !empty($this->_link_list) ) { + $text .= "\n\nLinks:\n------\n" . $this->_link_list; + } - preg_replace( "/&#(\d+);/me", "chr('$1')", $goodStr ); + // Wrap the text to a readable format + // for PHP versions >= 4.0.2. Default width is 75 + // If width is 0 or less, don't wrap the text. 
+ if ( $this->width > 0 ) { + $text = wordwrap($text, $this->width); + } - //wordwrap + $this->text = $text; - // $goodStr = wordwrap( $goodStr ); // Moodle - $goodStr = wordwrap( $goodStr, 78 ); + $this->_converted = true; + } - //make sure there are no more than 3 linebreaks in a row and trim whitespace - $goodStr = preg_replace("/\r\n?|\f/", "\n", $goodStr); - $goodStr = preg_replace("/\n(\s*\n){2}/", "\n\n\n", $goodStr); - $goodStr = preg_replace("/[ \t]+(\n|$)/", "$1", $goodStr); - $goodStr = preg_replace("/^\n*|\n*$/", '', $goodStr); + /** + * Helper function called by preg_replace() on link replacement. + * + * Maintains an internal list of links to be displayed at the end of the + * text, with numeric indices to the original point in the text they + * appeared. Also makes an effort at identifying and handling absolute + * and relative links. + * + * @param string $link URL of the link + * @param string $display Part of the text to associate number with + * @access private + * @return string + */ + function _build_link_list( $link, $display ) + { + if ( !$this->_do_links ) return $display; + + if ( substr($link, 0, 7) == 'http://' || substr($link, 0, 8) == 'https://' || + substr($link, 0, 7) == 'mailto:' ) { + $this->_link_count++; + $this->_link_list .= "[" . $this->_link_count . "] $link\n"; + $additional = ' [' . $this->_link_count . ']'; + } elseif ( substr($link, 0, 11) == 'javascript:' ) { + // Don't count the link; ignore it + $additional = ''; + // what about href="#anchor" ? + } else { + $this->_link_count++; + $this->_link_list .= "[" . $this->_link_count . "] " . $this->url; + if ( substr($link, 0, 1) != '/' ) { + $this->_link_list .= '/'; + } + $this->_link_list .= "$link\n"; + $additional = ' [' . $this->_link_count . ']'; + } - return $goodStr; + return $display . $additional; + } + + /** + * Helper function for PRE body conversion. 
+ * + * @param string HTML content + * @access private + */ + function _convert_pre(&$text) + { + while(preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) + { + $result = preg_replace($this->pre_search, $this->pre_replace, $matches[1]); + $text = preg_replace('/<pre[^>]*>.*<\/pre>/ismU', '<div><br>' . $result . '<br></div>', $text, 1); + } + } + /** + * Callback function for preg_replace_callback use. + * + * @param array PREG matches + * @return string + * @access private + */ + function _preg_callback($matches) + { + switch($matches[1]) + { + case 'b': + case 'strong': + return $this->_strtoupper($matches[2]); + case 'hr': + return $this->_strtoupper("\t\t". $matches[2] ."\n"); + case 'h': + return $this->_strtoupper("\n\n". $matches[2] ."\n\n"); + case 'a': + return $this->_build_link_list($matches[3], $matches[4]); + } + } + + /** + * Strtoupper multibyte wrapper function + * + * @param string + * @return string + * @access private + */ + function _strtoupper($str) + { + if (function_exists('mb_strtoupper')) + return mb_strtoupper($str); + else + return strtoupper($str); + } } ?> diff --git a/lib/weblib.php b/lib/weblib.php index 1b8d7b58c2..5e45d088ae 100644 --- a/lib/weblib.php +++ b/lib/weblib.php @@ -2174,11 +2174,11 @@ function html_to_text($html) { require_once($CFG->libdir .'/html2text.php'); - $result = html2text($html); + $h2t = new html2text($html); + $result = $h2t->get_text(); - // html2text does not fix numerical entities so handle those here. - $tl=textlib_get_instance(); - $result = $tl->entities_to_utf8($result,false); + // html2text does not fix HTML entities so handle those here. + $result = html_entity_decode($result, ENT_NOQUOTES, 'UTF-8'); return $result; }