From 7bb11c76d19d5f3fa70a502bb5bee9259c064d70 Mon Sep 17 00:00:00 2001 From: stronk7 Date: Tue, 12 Jun 2007 18:15:55 +0000 Subject: [PATCH] First cut of the tokeniser library. Modified from drupal search.module code (see copyrights). The library converts any text/html into an array of tokens with their score (weight). Supports stop_words, cjk basic tokeniser (for indexers) and different modes of handling numbers. Merged from MOODLE_18_STABLE --- lib/tokeniserlib.php | 405 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 405 insertions(+) create mode 100644 lib/tokeniserlib.php diff --git a/lib/tokeniserlib.php b/lib/tokeniserlib.php new file mode 100644 index 0000000000..737875b126 --- /dev/null +++ b/lib/tokeniserlib.php @@ -0,0 +1,405 @@ + 25, + 'h2' => 18, + 'h3' => 15, + 'h4' => 12, + 'h5' => 9, + 'h6' => 6, + 'u' => 3, + 'b' => 3, + 'i' => 3, + 'strong' => 3, + 'em' => 3, + 'a' => 10); + + // Strip off all ignored tags to speed up processing, but insert space before/after + // them to keep word boundaries. + $text = str_replace(array('<', '>'), array(' <', '> '), $text); + $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>'); + + // Split HTML tags from plain text. + $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE); + // Note: PHP ensures the array consists of alternating delimiters and literals + // and begins and ends with a literal (inserting $null as required). + + $tag = FALSE; // Odd/even counter. Tag or no tag. + $score = 1; // Starting score per word + $accum = ' '; // Accumulator for cleaned up data + $tagstack = array(); // Stack with open tags + $tagwords = 0; // Counter for consecutive words + $focus = 1; // Focus state + + $results = array(0 => array()); // Accumulator for words for index + + foreach ($split as $value) { + if ($tag) { + // Increase or decrease score per word based on tag + list($tagname) = explode(' ', $value, 2); + $tagname = $textlib->strtolower($tagname); + // Closing or opening tag? + if ($tagname[0] == '/') { + $tagname = substr($tagname, 1); + // If we encounter unexpected tags, reset score to avoid incorrect boosting. + if (!count($tagstack) || $tagstack[0] != $tagname) { + $tagstack = array(); + $score = 1; + } + else { + // Remove from tag stack and decrement score + $score = max(1, $score - $tags[array_shift($tagstack)]); + } + } + else { + if (isset($tagstack[0]) && $tagstack[0] == $tagname) { + // None of the tags we look for make sense when nested identically. + // If they are, it's probably broken HTML. + $tagstack = array(); + $score = 1; + } + else { + // Add to open tag stack and increment score + array_unshift($tagstack, $tagname); + $score += $tags[$tagname]; + } + } + // A tag change occurred, reset counter. + $tagwords = 0; + } + else { + // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values + if ($value != '') { + $words = tokenise_split($value, $stop_words, $overlap_cjk, $join_numbers); + foreach ($words as $word) { + // Add word to accumulator + $accum .= $word .' '; + $num = is_numeric($word); + // Check word length + if ($num || $textlib->strlen($word) >= MINIMUM_WORD_SIZE) { + // Normalize numbers + if ($num && $join_numbers) { + $word = (int)ltrim($word, '-0'); + } + + if (!isset($results[0][$word])) { + $results[0][$word] = 0; + } + + $results[0][$word] += $score * $focus; + // Focus is a decaying value in terms of the amount of unique words up to this point. + // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words. + $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015)); + } + $tagwords++; + // Too many words inside a single tag probably mean a tag was accidentally left open. + if (count($tagstack) && $tagwords >= 15) { + $tagstack = array(); + $score = 1; + } + } + } + } + $tag = !$tag; + } + + $res = array(); + + if (isset($results[0]) && count($results[0]) > 0) { + $res = $results[0]; + arsort($res, SORT_NUMERIC); + } + + return $res; +} + +/// +/// Some helper functions (should be considered private) +/// + +/** + * Splits a string into tokens + */ +function tokenise_split($text, $stop_words, $overlap_cjk, $join_numbers) { + static $last = NULL; + static $lastsplit = NULL; + + if ($last == $text) { + return $lastsplit; + } + // Process words + $text = tokenise_simplify($text, $overlap_cjk, $join_numbers); + $words = explode(' ', $text); + // Limit word length to 50 + array_walk($words, 'tokenise_truncate_word'); + + // We have stop words, apply them + if (is_array($stop_words) && !empty($stop_words)) { + // Normalise them + $simp_stop_words = explode(' ', tokenise_simplify(implode(' ', $stop_words), $overlap_cjk, $join_numbers)); + // Extract from the list + $words = array_diff($words, $simp_stop_words); + } + // Save last keyword result + $last = $text; + $lastsplit = $words; + + return $words; +} + +/** + * Simplifies a string according to indexing rules. + */ +function tokenise_simplify($text, $overlap_cjk, $join_numbers) { + + $textlib = textlib_get_instance(); + + // Decode entities to UTF-8 + $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8'); + + // Lowercase + $text = $textlib->strtolower($text); + + // Simple CJK handling + if ($overlap_cjk) { + $text = preg_replace_callback('/['. PREG_CLASS_CJK .']+/u', 'tokenise_expand_cjk', $text); + } + + // To improve searching for numerical data such as dates, IP addresses + // or version numbers, we consider a group of numerical characters + // separated only by punctuation characters to be one piece. + // This also means that searching for e.g. '20/03/1984' also returns + // results with '20-03-1984' in them. + // Readable regexp: ([number]+)[punctuation]+(?=[number]) + if ($join_numbers) { + $text = preg_replace('/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u', '\1', $text); + } else { + // Keep all the detected numbers+punctuation in a safe place in order to restore them later + preg_match_all('/['. PREG_CLASS_NUMBERS .']+['. PREG_CLASS_PUNCTUATION . PREG_CLASS_NUMBERS .']+/u', $text, $foundseqs); + $storedseqs = array(); + foreach (array_unique($foundseqs[0]) as $ntkey => $value) { + $prefix = (string)(count($storedseqs) + 1); + $storedseqs[START_DELIM.$prefix.CENTER_DELIM.$ntkey.END_DELIM] = $value; + } + if (!empty($storedseqs)) { + $text = str_replace($storedseqs, array_keys($storedseqs), $text); + } + } + + // The dot, underscore and dash are simply removed. This allows meaningful + // search behaviour with acronyms and URLs. + $text = preg_replace('/[._-]+/', '', $text); + + // With the exception of the rules above, we consider all punctuation, + // marks, spacers, etc, to be a word boundary. + $text = preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE .']+/u', ' ', $text); + + // Restore, if not joining numbers, recover the original strings + if (!$join_numbers) { + if (!empty($storedseqs)) { + $text = str_replace(array_keys($storedseqs), $storedseqs, $text); + } + } + + return $text; +} + +/** + * Basic CJK tokeniser. Simply splits a string into consecutive, overlapping + * sequences of characters (MINIMUM_WORD_SIZE long). + */ +function tokenise_expand_cjk($matches) { + + $textlib = textlib_get_instance(); + + $str = $matches[0]; + $l = $textlib->strlen($str); + // Passthrough short words + if ($l <= MINIMUM_WORD_SIZE) { + return ' '. $str .' '; + } + $tokens = ' '; + // FIFO queue of characters + $chars = array(); + // Begin loop + for ($i = 0; $i < $l; ++$i) { + // Grab next character + $current = $textlib->substr($str, 0, 1); + $str = substr($str, strlen($current)); + $chars[] = $current; + if ($i >= MINIMUM_WORD_SIZE - 1) { + $tokens .= implode('', $chars) .' '; + array_shift($chars); + } + } + return $tokens; +} + +/** + * Helper function for array_walk in search_index_split. + * Truncates one string (token) to MAXIMUM_WORD_SIZE + */ +function tokenise_truncate_word(&$text) { + + $textlib = textlib_get_instance(); + $text = $textlib->substr($text, 0, MAXIMUM_WORD_SIZE); +} + +?> -- 2.39.5