From: diml Date: Mon, 7 Apr 2008 20:36:47 +0000 (+0000) Subject: fixes HTM files parsing X-Git-Url: http://git.mjollnir.org/gw?a=commitdiff_plain;h=0f4471f5d357e353ae1d1e80efd00da1dc1c0862;p=moodle.git fixes HTM files parsing --- diff --git a/search/documents/physical_htm.php b/search/documents/physical_htm.php index 90b8c0100e..76d6073dd6 100644 --- a/search/documents/physical_htm.php +++ b/search/documents/physical_htm.php @@ -32,14 +32,24 @@ function get_text_for_indexing_htm(&$resource){ $meta_attributes = $matches[2]; $suffix = $matches[3]; if (preg_match('/name="(keywords|description)"/i', $meta_attributes)){ - preg_match('/content="[^"]+"/i', $meta_attributes, $matches); - $text = $prefix.' '.$matches[0].' '.$suffix; + preg_match('/content="([^"]+)"/i', $meta_attributes, $matches); + $text = $prefix.' '.$matches[1].' '.$suffix; } } - // filter all html tags - // $text = clean_text($text, FORMAT_PLAIN); - // NOTE : this is done in ResourceSearchDocument __constructor - $text = preg_replace("/<!--[^>]*-->/", '', $text); + // brutally filters all html tags + $text = preg_replace("/<[^>]*>/", '', $text); + $text = preg_replace("/<!--[^>]*-->/", '', $text); + $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8'); + $text = mb_convert_encoding($text, 'UTF-8', 'AUTO'); + + /* + * debug code for tracing input + echo "<hr/>";
+ $FILE = fopen("filetrace.log", 'w'); + fwrite($FILE, $text); + fclose($FILE); + echo "<hr/>";
+ */ if (!empty($CFG->block_search_limit_index_body)){ $text = shorten($text, $CFG->block_search_limit_index_body);