From b6c120104f520e753c8ef8331f4ede822f658539 Mon Sep 17 00:00:00 2001 From: moodler Date: Fri, 7 Nov 2003 03:25:09 +0000 Subject: [PATCH] A new function for converting HTML into text form --- lib/html2text.php | 162 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 lib/html2text.php diff --git a/lib/html2text.php b/lib/html2text.php new file mode 100644 index 0000000000..f96108f015 --- /dev/null +++ b/lib/html2text.php @@ -0,0 +1,162 @@ +, +Closing or tags without their corresponding opening tags, +Tags within tags, which will be rendered, even though they +should not be. + +Conversion requires a lot of preg_replace statements, so it can be quite slow with +large HTML files. + +****** +To use +****** + +This library requires PHP 4+ + +To use this library, put the following line in your script before the part that needs it: +require('PATH_TO_THIS_FILE/html2text.php'); + +To convert HTML/PHP to text: + $textVersion = html2text( $HTMLversion ); + +************ +Further info +************ + +For the technically minded, this is the process I use for converting HTML to approx text: + +REMOVE php start and end tags +REMOVE +ensure HTML uses entities in the right places (like inside tags) so strip_tags works properly + +carefully remove everything between them +strip_tags except the important ones +replace all \s that are after the start or a and before
 or end with a single space
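+<hr> or </title>
+\n          --------------------
+<h...>, <div>, <p>
+\n\n
+<sup>
+^
+<ul>, <ol>, <br>, <dl>, <dt>, <table>, <caption>, </textarea>, or <tr> followed by <(TH|TD)>
+\n
+<li>
+\n· 
+<dd>
+\n\t
+<th>, <td>
+\t
+<a href="...">
+[LINK:hrefWithout#]
+<img alt="...">
+[IMG:alt]
+<form action="...">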
    /*
     * Tail of the file-header notes (continues the tag -> text list above):
     *
     *   [FORM:action]
     *   <input|textarea|button|select>  ->  [INPUT]
     *   strip tags again, leaving nothing this time
     *   un-htmlspecialchars
     *   word wrap (this will also affect pre, but as this is intended for
     *   email use, I don't care)
     */

/**
 * Removes every span that starts with $open and ends with the next $close.
 *
 * Removal stops as soon as an opening delimiter has no closing delimiter
 * after it; the remaining text is returned untouched.
 *
 * @param string $text  Text to clean.
 * @param string $open  Opening delimiter (at least one character).
 * @param string $close Closing delimiter.
 * @return string Text with all complete delimited spans removed.
 */
function html2text_remove_delimited( $text, $open, $close ) {
    while( ( $start = strpos( $text, $open ) ) !== false ) {
        // Search strictly after the opening position so that a stray closing
        // delimiter earlier in the text cannot stop the clean-up.
        $end = strpos( $text, $close, $start + 1 );
        if( $end === false ) {
            break;
        }
        $text = substr( $text, 0, $start ) . substr( $text, $end + strlen( $close ) );
    }
    return $text;
}

/**
 * Converts an HTML fragment into an approximate plain-text rendering.
 *
 * Steps, in order: optional removal of PHP blocks, removal of HTML comments,
 * escaping of stray '<', '>' and '"' so strip_tags() parses the markup
 * reliably (style and script elements are dropped on the way), whitespace
 * collapsing outside pre/textarea, replacement of structural tags by text
 * markers, entity decoding, word wrapping at 70 columns and final trimming.
 *
 * NOTE(review): entities are decoded to single bytes (ISO-8859-1) and byte
 * 160 is treated as a non-breaking space, as the original code did with the
 * PHP 4 defaults. Multi-byte (UTF-8) input is presumably not handled
 * correctly by that last step - confirm against callers.
 *
 * @param string $badStr HTML markup to convert.
 * @param bool   $andPHP When true, PHP open/close tag pairs and everything
 *                       between them are removed first. Defaults to false:
 *                       the original body tested this name without ever
 *                       defining it, so the removal never ran.
 * @return string Plain-text approximation of the markup.
 */
function html2text( $badStr, $andPHP = false ) {
    $badStr = (string) $badStr;

    //remove PHP if requested (delimiters are split so this file stays parseable)
    if( $andPHP ) {
        $badStr = html2text_remove_delimited( $badStr, '<'.'?', '?'.'>' );
    }

    //remove comments
    $badStr = html2text_remove_delimited( $badStr, '<!--', '-->' );

    //now make sure all HTML tags are correctly written (> not in between quotes)
    $goodStr = '';
    $is_open_tb = false; // inside a tag
    $is_open_dq = false; // inside a double-quoted attribute value
    $is_open_sq = false; // inside a single-quoted attribute value
    $length = strlen( $badStr );
    for( $x = 0; $x < $length; $x++ ) {
        $chr = $badStr[$x];
        switch( $chr ) {
            case '<':
                if( $is_open_tb ) {
                    $chr = '&lt;';
                    break;
                }
                $closing = null;
                if( strtolower( substr( $badStr, $x + 1, 5 ) ) == 'style' ) {
                    $closing = '</style>';
                } elseif( strtolower( substr( $badStr, $x + 1, 6 ) ) == 'script' ) {
                    $closing = '</script>';
                }
                if( $closing === null ) {
                    $is_open_tb = true;
                    break;
                }
                // Skip the whole element. The search starts at the current
                // offset, so a stray closing tag earlier in the input cannot
                // be picked up; an unterminated element swallows the rest.
                $end = stripos( $badStr, $closing, $x );
                $x = ( $end === false ) ? $length : $end + strlen( $closing ) - 1;
                $chr = '';
                break;
            case '>':
                if( !$is_open_tb || $is_open_dq || $is_open_sq ) {
                    $chr = '&gt;';
                } else {
                    $is_open_tb = false;
                }
                break;
            case '"':
                if( $is_open_tb && !$is_open_dq && !$is_open_sq ) {
                    $is_open_dq = true;
                } elseif( $is_open_tb && $is_open_dq && !$is_open_sq ) {
                    $is_open_dq = false;
                } else {
                    $chr = '&quot;';
                }
                break;
            case "'":
                if( $is_open_tb && !$is_open_dq && !$is_open_sq ) {
                    $is_open_sq = true;
                } elseif( $is_open_tb && !$is_open_dq && $is_open_sq ) {
                    $is_open_sq = false;
                }
                break;
        }
        $goodStr .= $chr;
    }

    //now that the page is valid (I hope) for strip_tags, strip all unwanted tags
    $goodStr = strip_tags( $goodStr, '<hr><h1><h2><h3><h4><h5><h6><div><p><pre><sup><ul><ol><br><dl><dt><table><caption><tr><li><dd><th><td><a><area><img><form><input><textarea><button><select><option>' );

    //strip extra whitespace except between <pre> and <textarea> tags
    //(odd-numbered pieces of each split are the protected contents)
    $segments = preg_split( "/<\/?pre[^>]*>/i", $goodStr );
    foreach( $segments as $i => $segment ) {
        if( $i % 2 ) {
            $segments[$i] = '<pre>'.$segment.'</pre>';
            continue;
        }
        $parts = preg_split( "/<\/?textarea[^>]*>/i", $segment );
        foreach( $parts as $j => $part ) {
            $parts[$j] = ( $j % 2 )
                ? '<textarea>'.$part.'</textarea>'
                : preg_replace( "/\s+/", ' ', $part );
        }
        $segments[$i] = implode( '', $parts );
    }
    $goodStr = implode( '', $segments );

    //remove all options from select inputs
    $goodStr = preg_replace( "/<option[^>]*>[^<]*/i", '', $goodStr );

    //replace all tags with their text equivalents
    $goodStr = preg_replace( "/<(\/title|hr)[^>]*>/i", "\n --------------------\n", $goodStr );
    $goodStr = preg_replace( "/<(h|div|p)[^>]*>/i", "\n\n", $goodStr );
    $goodStr = preg_replace( "/<sup[^>]*>/i", '^', $goodStr );
    $goodStr = preg_replace( "/<(ul|ol|br|dl|dt|table|caption|\/textarea|tr[^>]*>\s*<(td|th))[^>]*>/i", "\n", $goodStr );
    $goodStr = preg_replace( "/<li[^>]*>/i", "\n· ", $goodStr );
    $goodStr = preg_replace( "/<dd[^>]*>/i", "\n\t", $goodStr );
    $goodStr = preg_replace( "/<(th|td)[^>]*>/i", "\t", $goodStr );
    // Moodle renders links as "[href] " and images as "{alt} "; the library
    // this was adapted from used "[LINK: href] " and "[IMAGE: alt] ".
    $goodStr = preg_replace( "/<a[^>]* href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>/i", '[$2$4$6] ', $goodStr );
    // Single quotes on the replacement: "{$" inside double quotes would be
    // parsed as the start of a variable interpolation.
    $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", '{$2$3$4} ', $goodStr );
    $goodStr = preg_replace( "/<form[^>]* action=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "\n[FORM: $2$3$4] ", $goodStr );
    $goodStr = preg_replace( "/<(input|textarea|button|select)[^>]*>/i", "[INPUT] ", $goodStr );

    //strip all remaining tags (mostly closing tags)
    $goodStr = strip_tags( $goodStr );

    //convert HTML entities: named and decimal references are decoded in one
    //pass so that already-decoded text is never decoded a second time
    $entities = array_flip( get_html_translation_table( HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1' ) );
    $goodStr = preg_replace_callback(
        '/&(?:#(\d+)|[A-Za-z][A-Za-z0-9]*);/',
        function( $match ) use ( $entities ) {
            if( isset( $match[1] ) && $match[1] !== '' ) {
                // same single-byte result the original chr('$1') aimed for
                return chr( (int) $match[1] % 256 );
            }
            return isset( $entities[$match[0]] ) ? $entities[$match[0]] : $match[0];
        },
        $goodStr
    );

    //wordwrap (Moodle wraps at 70 columns instead of the wordwrap() default)
    $goodStr = wordwrap( $goodStr, 70 );

    //make sure there are no more than 3 linebreaks in a row and trim whitespace
    $goodStr = str_replace( chr(160), ' ', $goodStr );
    $goodStr = preg_replace( "/\r\n?|\f/", "\n", $goodStr );
    $goodStr = preg_replace( "/\n(\s*\n){2}/", "\n\n\n", $goodStr );
    $goodStr = preg_replace( "/[ \t]+(\n|$)/", "$1", $goodStr );
    return preg_replace( "/^\n*|\n*$/", '', $goodStr );
}

// The closing PHP tag is intentionally omitted. Original patch trailer: "-- 2.39.5"