From b585dc5fcb001ca52b35f9e8792d1af80e5823f7 Mon Sep 17 00:00:00 2001 From: mchampan Date: Tue, 25 Jul 2006 17:23:10 +0000 Subject: [PATCH] General updates, see README.txt. --- search/README.txt | 18 +- .../Zend/Search/Lucene/Index/SegmentInfo.php | 2 +- .../Search/Lucene/Search/QueryTokenizer.php | 11 +- search/documents/document.php | 2 +- search/documents/forum_document.php | 38 +--- search/documents/wiki_document.php | 2 +- search/indexer.php | 23 ++- search/indexersplash.php | 32 +--- search/lib.php | 35 ++-- search/query.php | 180 ++---------------- search/stats.php | 81 ++++---- search/tests/index.php | 4 +- 12 files changed, 149 insertions(+), 279 deletions(-) diff --git a/search/README.txt b/search/README.txt index e039bc9447..249c8f8b5e 100644 --- a/search/README.txt +++ b/search/README.txt @@ -1,7 +1,23 @@ +2006/07/25 +---------- +Query logic moved into the SearchQuery class in querylib.php. Should be able +to include this file in any page and run a query against the index (PHP 5 +checks must be added to those pages then, though). + +Index info can be retrieved using IndexInfo class in indexlib.php. + +Abstracted some stuff away, to reduce redundancy and decrease the +likelihood of errors. Improved the stats.php page to include some +diagnostics for administrators. + +delete.php skeleton created for removing deleted documents from the +index. cron.php will contain the logic for running delete.php, +update.php and eventually add.php. + 2006/07/11 ---------- (Warning: It took me 1900 seconds to index the forum, go make coffee -whilst you wait.) +whilst you wait.) [Moodle.org forum data] Forum search functions changed to use 'get_recordset' instead of 'get_records', for speed reasons. 
This provides a significant improvement, diff --git a/search/Zend/Search/Lucene/Index/SegmentInfo.php b/search/Zend/Search/Lucene/Index/SegmentInfo.php index 3defbed1dd..aeceab63b6 100644 --- a/search/Zend/Search/Lucene/Index/SegmentInfo.php +++ b/search/Zend/Search/Lucene/Index/SegmentInfo.php @@ -430,7 +430,7 @@ class Zend_Search_Lucene_Index_SegmentInfo $freqPointer = $prevTermInfo->freqPointer; $proxPointer = $prevTermInfo->proxPointer; for ($count = $prevPosition*$indexInterval + 1; - $count < $termCount && + $count <= $termCount && ( $this->_getFieldPosition($termFieldNum) < $searchDicField || ($this->_getFieldPosition($termFieldNum) == $searchDicField && strcmp($termValue, $term->text) < 0) ); diff --git a/search/Zend/Search/Lucene/Search/QueryTokenizer.php b/search/Zend/Search/Lucene/Search/QueryTokenizer.php index a59f8a8b12..4fe870bedc 100644 --- a/search/Zend/Search/Lucene/Search/QueryTokenizer.php +++ b/search/Zend/Search/Lucene/Search/QueryTokenizer.php @@ -64,7 +64,16 @@ class Zend_Search_Lucene_Search_QueryTokenizer implements Iterator $currentToken = ''; for ($count = 0; $count < strlen($inputString); $count++) { - if (ctype_alnum( $inputString{$count} )) { + if (ctype_alnum( $inputString{$count} ) || + $inputString{$count} == '_') { + $currentToken .= $inputString{$count}; + } else if ($inputString{$count} == '\\') { // Escaped character + $count++; + + if ($count == strlen($inputString)) { + throw new Zend_Search_Lucene_Exception('Non finished escape sequence.'); + } + $currentToken .= $inputString{$count}; } else { // Previous token is finished diff --git a/search/documents/document.php b/search/documents/document.php index 83762aa2ce..e9a9766b64 100644 --- a/search/documents/document.php +++ b/search/documents/document.php @@ -3,7 +3,7 @@ * extend. 
* */ - class SearchDocument extends Zend_Search_Lucene_Document { + abstract class SearchDocument extends Zend_Search_Lucene_Document { public function __construct(&$doc, &$data, $document_type, $course_id, $group_id) { $this->addField(Zend_Search_Lucene_Field::Keyword('id', $doc->id)); $this->addField(Zend_Search_Lucene_Field::Text('title', $doc->title)); diff --git a/search/documents/forum_document.php b/search/documents/forum_document.php index c1b81d7a4a..7b948d8eed 100644 --- a/search/documents/forum_document.php +++ b/search/documents/forum_document.php @@ -7,12 +7,7 @@ class ForumSearchDocument extends SearchDocument { public function __construct(&$post, $forum_id, $course_id, $group_id) { - // generic information - /*$doc->id = $post->id; - $doc->title = $post->subject; - $doc->author = $post->firstname." ".$post->lastname; - $doc->contents = $post->message;*/ - + // generic information $doc->id = $post['id']; $doc->title = $post['subject']; $doc->author = $post['firstname']." ".$post['lastname']; @@ -24,7 +19,7 @@ $data->forum = $forum_id; $data->discussion = $post['discussion']; - parent::__construct($doc, $data, SEARCH_FORUM_TYPE, $course_id, $group_id); + parent::__construct($doc, $data, SEARCH_TYPE_FORUM, $course_id, $group_id); } //constructor } //ForumSearchDocument @@ -34,7 +29,7 @@ } //forum_make_link function forum_iterator() { - //no @ = Undefined index: 82 in /home/michael/public_html/moodle/lib/datalib.php on line 2671 + //no @ = Undefined index: 82 in moodle/lib/datalib.php on line 2671 return @get_all_instances_in_courses("forum", get_courses()); } //forum_iterator @@ -72,33 +67,6 @@ return $documents; } //forum_get_content_for_index - //old slower version - function forum_get_content_for_index_old(&$forum) { - $documents = array(); - if (!$forum) return $documents; - - $posts = forum_get_discussions($forum->id); - if (!$posts) return $documents; - - foreach($posts as $post) { - if (is_object($post)) { - if (strlen($post->message) > 0 && 
($post->deleted != 1)) { - $documents[] = new ForumSearchDocument($post, $forum->id, $forum->course, $post->groupid); - } //if - - if ($children = forum_get_child_posts($post->id, $forum->id)) { - foreach ($children as $child) { - if (strlen($child->message) > 0 && ($child->deleted != 1)) { - $documents[] = new ForumSearchDocument($child, $forum->id, $forum->course, $post->groupid); - } //if - } //foreach - } //if - } //if - } //foreach - - return $documents; - } //forum_get_content_for_index_old - //reworked faster version from /mod/forum/lib.php function forum_get_discussions_fast($forum) { global $CFG, $USER; diff --git a/search/documents/wiki_document.php b/search/documents/wiki_document.php index 95820d1ca4..812e3fd239 100644 --- a/search/documents/wiki_document.php +++ b/search/documents/wiki_document.php @@ -32,7 +32,7 @@ $data->wiki = $wiki_id; // construct the parent class - parent::__construct($doc, $data, SEARCH_WIKI_TYPE, $course_id, $group_id); + parent::__construct($doc, $data, SEARCH_TYPE_WIKI, $course_id, $group_id); } //constructor } //WikiSearchDocument diff --git a/search/indexer.php b/search/indexer.php index a56302e59d..12968541d0 100644 --- a/search/indexer.php +++ b/search/indexer.php @@ -48,16 +48,19 @@ //php5 found, continue including php5-only files require_once("$CFG->dirroot/search/Zend/Search/Lucene.php"); - - if (get_config("search_indexer_busy") == 1) { - } //if - - //turn on busy flag - set_config("search_indexer_busy", 1); + mtrace('
Server Time: '.date('r',time())."\n");
+
+  if ($CFG->search_indexer_busy == '1') {
+    //means indexing was not finished previously
+    mtrace("Warning: Indexing was not successfully completed last time, restarting.\n");
+  } //if
+
+  //turn on busy flag
+  set_config('search_indexer_busy', '1');
   
   //paths
-  $index_path = $CFG->dataroot.'/search';
+  $index_path = SEARCH_INDEX_PATH;
   $index_db_file = "$CFG->dirroot/search/db/$CFG->dbtype.sql";  
   
   //setup directory in data root
@@ -103,6 +106,7 @@
   
   if ($mods = get_records_select('modules' /*'index this module?' where statement*/)) {
     foreach ($mods as $mod) {
+      if ($mod->name == 'forum') continue;
       $class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';              
       
       if (file_exists($class_file)) {
@@ -174,6 +178,9 @@
   mtrace('
'); //finished, turn busy flag off - set_config("search_indexer_busy", 0); + set_config("search_indexer_busy", "0"); + + //mark the time we last updated + set_config("search_indexer_run_date", time()); ?> \ No newline at end of file diff --git a/search/indexersplash.php b/search/indexersplash.php index aa3b91b5a8..c0899b669c 100644 --- a/search/indexersplash.php +++ b/search/indexersplash.php @@ -18,30 +18,18 @@ $phpversion = phpversion(); mtrace("Sorry, global search requires PHP 5.0.0 or later (currently using version $phpversion)"); exit(0); - } //if - - $index_path = "$CFG->dataroot/search"; - $index_dir = get_directory_list($index_path, '', false, false); - $index_filecount = count($index_dir); - - //check if the table exists in the db - $tables = $db->MetaTables(); - - if (in_array($CFG->prefix.'search_documents', $tables)) { - $db_count = count_records('search_documents'); - } else { - $db_count = 0; - } //else + } //if - //TODO: elaborate on error messages, when db!=0 and index=0 -> corrupt, etc. - if ($index_filecount != 0 or $db_count != 0) { - mtrace("
The data directory ($index_path) contains $index_filecount files, and\n"
-          ."there are $db_count records in the search_documents table.\n"
+  require_once("$CFG->dirroot/search/indexlib.php");    
+  $indexinfo = new IndexInfo();
+    
+  if ($indexinfo->valid()) {    
+    mtrace("
The data directory ($indexinfo->path) contains $indexinfo->filecount files, and\n"
+          ."there are ".$indexinfo->dbcount." records in the search_documents table.\n"
           ."\n"
-          ."This indicates that you have already succesfully indexed this site, or at least\n"
-          ."started and cancelled an indexing session. Follow the link if you are sure that\n"
-          ."you want to continue indexing - this will replace any existing index data (no\n"
-          ."Moodle data is affected).\n"
+          ."This indicates that you have already succesfully indexed this site. Follow the link\n"
+          ."if you are sure that you want to continue indexing - this will replace any existing\n"
+          ."index data (no Moodle data is affected).\n"
           ."\n"
           ."You are encouraged to use the 'Test indexing' script before continuing onto\n"
           ."indexing - this will check if the modules are set up correctly. Please correct\n"
diff --git a/search/lib.php b/search/lib.php
index 17d16784fd..964f27545f 100644
--- a/search/lib.php
+++ b/search/lib.php
@@ -1,19 +1,32 @@
 dataroot/search");
+  
+  //document types that can be searched  
+  define('SEARCH_TYPE_NONE', 'none');
+  define('SEARCH_TYPE_WIKI', 'wiki');
+  define('SEARCH_TYPE_FORUM', 'forum');  
   
   //returns all the document type constants
-  function search_get_document_types() {
-    $r = Array(SEARCH_WIKI_TYPE, SEARCH_NO_TYPE, SEARCH_FORUM_TYPE);
-    return $r;
+  function search_get_document_types($prefix='SEARCH_TYPE') {
+    $ret = array();
+    
+    foreach (get_defined_constants() as $key=>$value) {
+      if (substr($key, 0, strlen($prefix)) == $prefix) {
+        $ret[$key] = $value;
+      } //if
+    } //foreach
+    
+    return $ret;
   } //search_get_document_types
-  
+    
   //shortens a url so it can fit on the results page
   function search_shorten_url($url, $length=30) {    
     return substr($url, 0, $length)."...";
diff --git a/search/query.php b/search/query.php
index 3f6cfc029b..d33512e8e6 100644
--- a/search/query.php
+++ b/search/query.php
@@ -21,13 +21,13 @@
    *   All articles written by Helen Foster
    *   
    * */
-
-  require_once('../config.php');  
+    
+  require_once('../config.php');
   require_once("$CFG->dirroot/search/lib.php"); 
     
-  //check for php5, but don't die yet (see line 27)
-  if ($check = search_check_php5()) {  
-    require_once("$CFG->dirroot/search/Zend/Search/Lucene.php");
+  //check for php5, but don't die yet (see line 52)
+  if ($check = search_check_php5()) {      
+    require_once("$CFG->dirroot/search/querylib.php");    
     
     $query_string = optional_param('query_string', '', PARAM_CLEAN);
     $page_number  = optional_param('page', 1, PARAM_INT);
@@ -35,103 +35,10 @@
     if ($page_number < 1) {
       $page_number = 1;
     } //if
-        
-    $index_path = "$CFG->dataroot/search";
-    $no_index = false; //optimism!
-    $results_per_page = 10;
     
-    try {
-      $index = new Zend_Search_Lucene($index_path, false);
-    } catch(Exception $e) {
-      //print $e;
-      $no_index = true;
-    } //catch
+    $sq = new SearchQuery($query_string, $page_number, 10, true);  
   } //if
   
-  
-  //Result document class that contains all the display information we need
-  class ResultDocument {
-    public  $url,
-            $title,
-            $score,
-            $doctype,
-            $author;
-  } //ResultDocument  
-
-  //generates an HTML string of links to result pages
-  function page_numbers($query, $hits, $page=1, $results_per_page=20) {
-    //total result pages
-    $pages = ceil($hits/$results_per_page);
-    
-    $ret = "
"; - - //Back is disabled if we're on page 1 - if ($page > 1) { - $ret .= "< Back "; - } else { - $ret .= "< Back "; - } //else - - //don't the current page - for ($i = 1; $i <= $pages; $i++) { - if ($page == $i) { - $ret .= "[$i] "; - } else { - $ret .= "$i "; - } //else - } //for - - //Next disabled if we're on the last page - if ($page < $pages) { - $ret .= "Next > "; - } else { - $ret .= "Next > "; - } //else - - $ret .= "
"; - - //shorten really long page lists, to stop table distorting width-ways - if (strlen($ret) > 70) { - $start = 4; - $end = $page - 5; - $ret = preg_replace("/$start<\/a>.*?$end<\/a>/", '...', $ret); - - $start = $page + 5; - $end = $pages - 3; - $ret = preg_replace("/$start<\/a>.*?$end<\/a>/", '...', $ret); - } //if - - return $ret; - } //page_numbers - - //calculates whether a user is allowed to see this result - function can_display(&$user, $course_id, $group_id) { - return true; - } //can_display - - //caches the results of the last query, deletes the previous one also - function cache($id=false, &$object=false) { - //see if there was a previous query - $last_term = (isset($_SESSION['search_last_term'])) ? $_SESSION['search_last_term'] : false; - - //if this query is different from the last, clear out the last one - if ($id != false and $last_term != $id) { - unset($_SESSION[$last_term]); - session_unregister($last_term); - } //if - - //store the new query if id and object are passed in - if ($object and $id) { - $_SESSION['search_last_term'] = $id; - $_SESSION[$id] = $object; - return true; - //otherwise return the stored results - } else if ($id and isset($_SESSION[$id])) { - return $_SESSION[$id]; - } //else - } //cache - - if (!$site = get_site()) { redirect("index.php"); } //if @@ -166,17 +73,17 @@
is_valid_index()) { + print $sq->index_count(); } else { - print $index->count(); + print "0"; } //else print ' documents.'; - if ($no_index and isadmin()) { + if (!$sq->is_valid_index() and isadmin()) { print "

Admin: There appears to be no index, click here to create one."; } //if ?> @@ -185,43 +92,11 @@ is_valid()) { print_simple_box_start('center', '50%', 'white', 10); - search_stopwatch(); - - //if the cache is empty - if (!($hits = cache($query_string))) { - $resultdocs = array(); - $resultdoc = new ResultDocument; - - //generate a new result-set - $hits = $index->find(strtolower($query_string)); - - foreach ($hits as $hit) { - //check permissions on each result - if (can_display($USER, $hit->course_id, $hit->group_id)) { - $resultdoc->url = $hit->url; - $resultdoc->title = $hit->title; - $resultdoc->score = $hit->score; - $resultdoc->doctype = $hit->doctype; - $resultdoc->author = $hit->author; - - //and store it if it passes the test - $resultdocs[] = clone($resultdoc); - } //if - } //foreach - - //cache the results so we don't have to compute this on every page-load - cache($query_string, $resultdocs); - - //print "Using new results."; - } else { - //There was something in the cache, so we're using that to save time - //print "Using cached results."; - } //else - - $hit_count = count($hits); + search_stopwatch(); + $hit_count = $sq->count(); print "
"; @@ -229,27 +104,13 @@ print "
"; if ($hit_count > 0) { - if ($hit_count < $results_per_page) { - $page_number = 1; - } else if ($page_number > ceil($hit_count/$results_per_page)) { - $page_number = $hit_count/$results_per_page; - } //if - - $start = ($page_number - 1)*$results_per_page; - $end = $start + $results_per_page; - - $page_links = page_numbers($query_string, $hit_count, $page_number, $results_per_page); + $page_links = $sq->page_numbers(); + $hits = $sq->results(); print "
    "; - for ($i = $start; $i < $end; $i++) { - if ($i >= $hit_count) { - break; - } //if - - $listing = $hits[$i]; - - print "
  1. $listing->title
    \n" + foreach ($hits as $listing) { + print "
  2. $listing->title
    \n" ."".search_shorten_url($listing->url, 70)."
    \n" ."Type: ".$listing->doctype.", score: ".round($listing->score, 3).", author: ".$listing->author."
    \n" ."
  3. \n"; @@ -260,9 +121,6 @@ } //if print_simple_box_end(); - } //if - - if (!empty($query_string) and !$no_index) { ?>
    @@ -270,7 +128,7 @@
    dirroot/search/lib.php"); //check for php5, but don't die yet - if ($check = search_check_php5()) { - //filesystem stats - $index_path = "$CFG->dataroot/search"; - $index_size = display_size(get_directory_size($index_path)); - $index_dir = get_directory_list($index_path, '', false, false); - $index_filecount = count($index_dir); + if ($check = search_check_php5()) { + require_once("$CFG->dirroot/search/indexlib.php"); - //indexed documents stats (via db) - $db_exists = false; - $admin_tables = $db->MetaTables(); - - if (in_array($CFG->prefix.'search_documents', $admin_tables)) { - $db_exists = true; - $types = search_get_document_types(); - sort($types); - - //total documents - $type_counts['Total'] = count_records('search_documents'); - - foreach($types as $type) { - $c = count_records('search_documents', 'doctype', $type); - $type_counts[$type] = (int)$c; - } //foreach - } else { - $type_counts['Total'] = 0; - } //else - } //if + $indexinfo = new IndexInfo(); + } //if if (!$site = get_site()) { redirect("index.php"); @@ -66,12 +45,35 @@ $admin_table->cellspacing = 0; $admin_table->width = '500'; - $admin_table->data[] = array('Data directory', ''.$index_path.''); - $admin_table->data[] = array('Files in index directory', $index_filecount); - $admin_table->data[] = array('Total size', $index_size); + $admin_table->data[] = array('Data directory', ''.$indexinfo->path.''); + $admin_table->data[] = array('Files in index directory', $indexinfo->filecount); + $admin_table->data[] = array('Total size', $indexinfo->size); + + if ($indexinfo->time > 0) { + $admin_table->data[] = array('Created on', date('r', $indexinfo->time)); + } else { + $admin_table->data[] = array('Created on', '-'); + } //else + + if (!$indexinfo->valid($errors)) { + $admin_table->data[] = array('Errors', ' '); - if ($index_filecount == 0 or !$db_exists) { - $admin_table->data[] = array('Click to create index', "Indexer"); + foreach ($errors as $key=>$value) { + $admin_table->data[] = 
array($key.' ... ', $value); + } //foreach + + $admin_table->data[] = array('Solutions', ' '); + + if (isset($errors['dir'])) { + $admin_table->data[] = array('Check dir', 'Ensure the data directory exists and is writable.'); + } //if + + if (isset($errors['db'])) { + $admin_table->data[] = array('Check DB', 'Check your database for any problems.'); + } //if + + $admin_table->data[] = array('Run indexer test', 'tests/index.php'); + $admin_table->data[] = array('Run indexer', 'indexersplash.php'); } //if } //if @@ -83,9 +85,18 @@ $table->cellspacing = 0; $table->width = '500'; - $table->data[] = array('Database', 'search_documents'); - foreach($type_counts as $key => $value) { - $table->data[] = array($key, $value); + $table->data[] = array('Database', 'search_documents'); + + //add an extra field if we're admin + if (isadmin()) { + //don't want to confuse users if the two totals don't match (hint: they should) + $table->data[] = array('Documents in index', $indexinfo->indexcount); + } //if + + $table->data[] = array('Documents in database', $indexinfo->dbcount); + + foreach($indexinfo->types as $key => $value) { + $table->data[] = array("'$key' documents", $value); } //foreach if (isadmin()) { diff --git a/search/tests/index.php b/search/tests/index.php index 91f72bf601..49ddb39f99 100644 --- a/search/tests/index.php +++ b/search/tests/index.php @@ -49,8 +49,8 @@ if (file_exists($class_file)) { include_once($class_file); - if (!defined('SEARCH_'.strtoupper($mod->name).'_TYPE')) { - mtrace("ERROR: Constant 'SEARCH_".strtoupper($mod->name)."_TYPE' is not defined in /search/lib.php"); + if (!defined('SEARCH_TYPE_'.strtoupper($mod->name))) { + mtrace("ERROR: Constant 'SEARCH_TYPE_".strtoupper($mod->name)."' is not defined in /search/lib.php"); continue; } //if -- 2.39.5