$meta_attributes = $matches[2];
$suffix = $matches[3];
if (preg_match('/name="(keywords|description)"/i', $meta_attributes)){
- preg_match('/content="[^"]+"/i', $meta_attributes, $matches);
- $text = $prefix.' '.$matches[0].' '.$suffix;
+ preg_match('/content="([^"]+)"/i', $meta_attributes, $matches);
+ $text = $prefix.' '.$matches[1].' '.$suffix;
}
}
- // filter all html tags
- // $text = clean_text($text, FORMAT_PLAIN);
- // NOTE : this is done in ResourceSearchDocument __constructor
- $text = preg_replace("/<!--[^>]*?-->/", '', $text);
+ // brutally filters all html tags
+ $text = preg_replace("/<[^>]*>/", '', $text);
+ $text = preg_replace("/<!--[^>]*-->/", '', $text);
+ $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8');
+ $text = mb_convert_encoding($text, 'UTF-8', 'AUTO');
+
+ /*
+ * debug code for tracing input
+ echo "<hr/>";
+ $FILE = fopen("filetrace.log", 'w');
+ fwrite($FILE, $text);
+ fclose($FILE);
+ echo "<hr/>";
+ */
if (!empty($CFG->block_search_limit_index_body)){
$text = shorten($text, $CFG->block_search_limit_index_body);