+2006/08/21
+----------
+Fixed index document count, and created new config variable to store
+the size. (Search now has 3 global vars in $CFG, date, size and complete,
+see indexer.php for var names). Index size is cached to provide an always
+current value for the index - this is to take into account the fact that
+deleted documents are in fact not removed from the index, but instead just
+marked as deleted and not returned in search results. The actual document
+still features in the index, and skews sizes. When the index optimiser is
+completed in ZFS, then these deleted documents will be pruned, thus
+correctly modifying the index size.
+
+Additional commenting added.
+
+Query page logic very slightly modified to clean up GET string a bit (removed
+'p' variable).
+
+Add/delete functions added to other document types.
+
+A few TODO fields added to source, indicating changes still to come (or at
+least to be considered).
+
2006/08/16
----------
Add/delete/update cron functions finished - can be called separately
$dbcontrol = new IndexDBControl();
$addition_count = 0;
+ $indexdate = $CFG->search_indexer_run_date;
+
mtrace('<pre>Starting index update (additions)...');
- mtrace('Index size before: '.$index->count()."\n");
+ mtrace('Index size before: '.$CFG->search_index_size."\n");
+ //get all modules
if ($mods = get_records_select('modules')) {
+ //append virtual modules onto array
+ $mods = array_merge($mods, search_get_additional_modules());
+
foreach ($mods as $mod) {
+ //build include file and function names
$class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
$db_names_function = $mod->name.'_db_names';
$get_document_function = $mod->name.'_single_document';
if (file_exists($class_file)) {
require_once($class_file);
+ //if both required functions exist
if (function_exists($db_names_function) and function_exists($get_document_function)) {
mtrace("Checking $mod->name module for additions.");
$values = $db_names_function();
+ $where = (isset($values[4])) ? $values[4] : '';
- $sql = "select id, ".$values[0]." as docid from ".$values[1]."
- where id not in
- (select docid from ".SEARCH_DATABASE_TABLE." where doctype like '$mod->name')";
+ //select records in MODULE table, but not in SEARCH_DATABASE_TABLE
+ $sql = "select id, ".$values[0]." as docid from ".$values[1].
+ " where id not in".
+ " (select docid from ".SEARCH_DATABASE_TABLE." where doctype like '$mod->name')".
+ " and ".$values[2]." > $indexdate".
+ " $where";
$records = get_records_sql($sql);
+ //foreach record, build a module specific search document using the get_document function
if (is_array($records)) {
foreach($records as $record) {
$additions[] = $get_document_function($record->id);
} //foreach
} //if
+ //foreach document, add it to the index and database table
foreach ($additions as $add) {
++$addition_count;
//commit changes
$index->commit();
- //update index date
+ //update index date and size
set_config("search_indexer_run_date", time());
+ set_config("search_index_size", (int)$CFG->search_index_size + (int)$addition_count);
+ //print some additional info
mtrace("Added $addition_count documents.");
mtrace('Index size after: '.$index->count().'</pre>');
<?php
+ /* cron script to perform all the periodic search tasks
+ *
+ * delete.php
+ * updates the index by pruning deleted documents
+ *
+ * update.php
+ * updates document info in the index if the document has been modified since indexing
+ *
+ * add.php
+ * adds documents created since the last index run
+ */
+
require_once('../config.php');
require_once("$CFG->dirroot/search/lib.php");
$deletion_count = 0;
mtrace('<pre>Starting clean-up of removed records...');
- mtrace('Index size before: '.$index->count()."\n");
+ mtrace('Index size before: '.$CFG->search_index_size."\n");
if ($mods = get_records_select('modules')) {
+ $mods = array_merge($mods, search_get_additional_modules());
+
foreach ($mods as $mod) {
+ //build function names
$class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
$delete_function = $mod->name.'_delete';
$db_names_function = $mod->name.'_db_names';
mtrace("Checking $mod->name module for deletions.");
$values = $db_names_function();
- $sql = "select id, docid from ".SEARCH_DATABASE_TABLE."
- where doctype like '$mod->name'
- and docid not in
- (select ".$values[0]." from ".$values[1].")";
+ $sql = "select id, docid from ".SEARCH_DATABASE_TABLE.
+ " where doctype like '$mod->name'".
+ " and docid not in".
+ " (select ".$values[0]." from ".$values[1].")";
$records = get_records_sql($sql);
+ //build an array of all the deleted records
if (is_array($records)) {
foreach($records as $record) {
$deletions[] = $delete_function($record->docid);
} //if
foreach ($deletions as $delete) {
+ //find the specific document in the index, using its docid and doctype as keys
$doc = $index->find("+docid:$delete +doctype:$mod->name");
//get the record, should only be one
++$deletion_count;
mtrace(" Delete: $thisdoc->title (database id = $thisdoc->dbid, index id = $thisdoc->id, moodle instance id = $thisdoc->docid)");
+ //remove it from index and database table
$dbcontrol->delDocument($thisdoc);
$index->delete($thisdoc->id);
} //foreach
//commit changes
$index->commit();
- //update index date
+ //update index date and index size
set_config("search_indexer_run_date", time());
+ set_config("search_index_size", (int)$CFG->search_index_size - (int)$deletion_count);
mtrace("Finished $deletion_count removals.");
mtrace('Index size after: '.$index->count().'</pre>');
$this->addField(Zend_Search_Lucene_Field::UnIndexed('url', $doc->url));
$this->addField(Zend_Search_Lucene_Field::UnIndexed('date', $doc->date));
+ //additional data added on a per-module basis
$this->addField(Zend_Search_Lucene_Field::Binary('data', serialize($data)));
$this->addField(Zend_Search_Lucene_Field::Keyword('doctype', $document_type));
return $documents;
} //forum_get_content_for_index
+ /* Returns a single forum search document for one forum post.
+  * $id is matched against forum_posts.id (the earlier wording "forum_entry id"
+  * meant a post id). The post's discussion, and then that discussion's forum,
+  * are loaded to supply the forum id and course id passed to the document.
+  * NOTE(review): none of the three lookups checks that a row was found;
+  * presumably callers only pass ids that exist - confirm.
+  * NOTE(review): the group id is read from the post row; confirm forum_posts
+  * really carries a groupid column, the discussion row may be the intended source. */
+ function forum_single_document($id) {
+ $posts = get_recordset('forum_posts', 'id', $id);
+ $post = $posts->fields; //current row of the recordset, indexed by column name below
+
+ $discussions = get_recordset('forum_discussions', 'id', $post['discussion']); //the discussion this post belongs to
+ $discussion = $discussions->fields;
+
+ $forums = get_recordset('forum', 'id', $discussion['forum']); //the forum that owns the discussion
+ $forum = $forums->fields;
+
+ return new ForumSearchDocument($post, $forum['id'], $forum['course'], $post['groupid']);
+ } //forum_single_document
+
+ function forum_delete($info) { //pass-through: the deletion script hands in a docid from the search table and uses the returned value as the docid to look up in the index
+ return $info; //returned unchanged
+ } //forum_delete
+
+ /* Returns the column/table names the add, update and delete scripts use to
+  * build their SQL for forum posts. The array is positional:
+  *   [0] id column selected as the docid, [1] table to select from,
+  *   [2] column compared against the last index date when looking for additions,
+  *   [3] column compared against the last index date when looking for updates. */
+ function forum_db_names() {
+ //[primary id], [table name], [time created field name], [time modified field name]
+ return array('id', 'forum_posts', 'created', 'modified');
+ } //forum_db_names
+
//reworked faster version from /mod/forum/lib.php
function forum_get_discussions_fast($forum) {
global $CFG, $USER;
* */
require_once("$CFG->dirroot/search/documents/document.php");
- //require_once("$CFG->dirroot/mod/glossary/lib.php");
class GlossarySearchDocument extends SearchDocument {
public function __construct(&$entry, $glossary_id, $course_id, $group_id) {
return $documents;
} //glossary_get_content_for_index
+ //returns a single glossary search document based on a glossary_entry id
function glossary_single_document($id) {
$entries = get_recordset('glossary_entries', 'id', $id);
$entry = $entries->fields;
return new GlossarySearchDocument($entry, $entry['glossaryid'], $glossary['course'], -1);
} //glossary_single_document
+ //dummy delete function that converts docid from the search table to itself..
+ //this was here for a reason, but I can't remember it at the moment.
function glossary_delete($info) {
return $info;
} //glossary_delete
+ //returns the var names needed to build a sql query for addition/deletions
function glossary_db_names() {
- return array('id', 'glossary_entries', 'timemodified');
+ //[primary id], [table name], [time created field name], [time modified field name]
+ return array('id', 'glossary_entries', 'timecreated', 'timemodified');
} //glossary_db_names
?>
\ No newline at end of file
return $documents;
} //resource_get_content_for_index
+ /* Returns a single resource search document for one resource record.
+  *
+  * The row is only selected when its alltext is not blank and its type is
+  * not "file" - the same filter the original query applied.
+  *
+  * @param int $id id of the row in the resource table
+  * @return ResourceSearchDocument document built from the selected row
+  *
+  * NOTE(review): no check is made that a row was actually returned;
+  * presumably callers only pass ids that pass the filter - confirm.
+  * NOTE(review): the table name is written without a table prefix and with
+  * MySQL-style quoting - confirm this matches the target database.
+  * NOTE(review): the second and third NOT LIKE patterns look identical here;
+  * presumably one was a different whitespace character originally - confirm. */
+ function resource_single_document($id) {
+     //the id is concatenated straight into the SQL below, so force it to an integer
+     $id = (int)$id;
+
+     //a stray comma used to follow the TYPE condition, which made the whole
+     //statement invalid SQL; it has been removed
+     $resources = get_recordset_sql('SELECT *
+         FROM `resource`
+         WHERE alltext NOT LIKE ""
+         AND alltext NOT LIKE " "
+         AND alltext NOT LIKE " "
+         AND TYPE != "file"
+         AND id = '.$id);
+
+     //current row of the recordset
+     $resource = $resources->fields;
+
+     return new ResourceSearchDocument($resource);
+ } //resource_single_document
+
+ function resource_delete($info) { //pass-through: the deletion script hands in a docid from the search table and uses the returned value as the docid to look up in the index
+ return $info; //returned unchanged
+ } //resource_delete
+
+ /* Returns the column/table names the add, update and delete scripts use to
+  * build their SQL for resources. The array is positional:
+  *   [0] id column selected as the docid, [1] table to select from,
+  *   [2] column compared against the last index date when looking for additions,
+  *   [3] column compared against the last index date when looking for updates,
+  *   [4] extra SQL conditions for the additions query.
+  * 'timemodified' is supplied for both time slots.
+  *
+  * Element [4] is appended by the additions script after the conditions it
+  * has already written, so it has to continue that clause with AND; starting
+  * it with WHERE produced a second WHERE keyword and invalid SQL.
+  *
+  * @return array positional list described above */
+ function resource_db_names() {
+     //[primary id], [table name], [time created field name], [time modified field name], [additional where conditions for sql]
+     return array('id', 'resource', 'timemodified', 'timemodified', "AND alltext NOT LIKE '' AND alltext NOT LIKE ' ' AND alltext NOT LIKE ' ' AND TYPE != 'file'");
+ } //resource_db_names
+
?>
\ No newline at end of file
return $documents;
} //wiki_get_content_for_index
+ /* Returns a single wiki search document for one wiki page.
+  * $id is matched against wiki_pages.id (the earlier wording "wiki_entry id"
+  * meant a page id). The wiki_entries row referenced by the page's 'wiki'
+  * column is then loaded to supply the wiki id, course id and group id
+  * passed to the document.
+  * NOTE(review): neither lookup checks that a row was found; presumably
+  * callers only pass ids that exist - confirm. */
+ function wiki_single_document($id) {
+ $pages = get_recordset('wiki_pages', 'id', $id);
+ $page = $pages->fields; //current row of the recordset, indexed by column name below
+
+ $entries = get_recordset('wiki_entries', 'id', $page['wiki']); //the entry this page belongs to
+ $entry = $entries->fields;
+
+ return new WikiSearchDocument($page, $entry['wikiid'], $entry['course'], $entry['groupid']);
+ } //wiki_single_document
+
+ function wiki_delete($info) { //pass-through: the deletion script hands in a docid from the search table and uses the returned value as the docid to look up in the index
+ return $info; //returned unchanged
+ } //wiki_delete
+
+ /* Returns the column/table names the add, update and delete scripts use to
+  * build their SQL for wiki pages. The array is positional:
+  *   [0] id column selected as the docid, [1] table to select from,
+  *   [2] column compared against the last index date when looking for additions,
+  *   [3] column compared against the last index date when looking for updates. */
+ function wiki_db_names() {
+ //[primary id], [table name], [time created field name], [time modified field name]
+ return array('id', 'wiki_pages', 'created', 'lastmodified');
+ } //wiki_db_names
+
?>
\ No newline at end of file
// * mod_get_content_for_index
//are the sole basis for including a module in the index at the moment.
- if ($mods = get_records_select('modules' /*'index this module?' where statement*/)) {
- $mods = array_merge($mods, search_get_additional_modules());
+ if ($mods = get_records_select('modules' /*'index this module?' where statement*/)) {
+ //add virtual modules onto the back of the array
+ $mods = array_merge($mods, search_get_additional_modules());
- foreach ($mods as $mod) {
+ foreach ($mods as $mod) {
$class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
if (file_exists($class_file)) {
include_once($class_file);
+ //build function names
$iter_function = $mod->name.'_iterator';
$index_function = $mod->name.'_get_content_for_index';
//mark the time we last updated
set_config("search_indexer_run_date", time());
-
+
+ //and the index size
+ set_config("search_index_size", (int)$index->count());
+
?>
\ No newline at end of file
$this->path = $path;
+ //test to see if there is a valid index on disk, at the specified path
try {
$test_index = new Zend_Search_Lucene($this->path, false);
$validindex = true;
} catch(Exception $e) {
$validindex = false;
} //catch
-
+
+ //retrieve file system info about the index if it is valid
if ($validindex) {
$this->size = display_size(get_directory_size($this->path));
$index_dir = get_directory_list($this->path, '', false, false);
$this->indexcount = 0;
} //else
- $db_exists = false;
+ $db_exists = false; //for now
+ //get all the current tables in moodle
$admin_tables = $db->MetaTables();
+ //TODO: use new IndexDBControl class for database checks?
+
+ //check if our search table exists
if (in_array($CFG->prefix.SEARCH_DATABASE_TABLE, $admin_tables)) {
+ //retrieve database information if it does
$db_exists = true;
//total documents
$this->types = array();
} //else
+ //check if the busy flag is set
if ($CFG->search_indexer_busy == '1') {
$this->complete = false;
} else {
$this->complete = true;
} //if
+ //get the last run date for the indexer
if ($this->valid() && $CFG->search_indexer_run_date) {
$this->time = $CFG->search_indexer_run_date;
} else {
} //else
} //__construct
+ //returns false on error, and the error message via referenced variable $err
public function valid(&$err=null) {
$err = array();
$ret = true;
return $ret;
} //valid
+ //is the index dir valid
public function is_valid_dir() {
if ($this->filecount > 0) {
return true;
} //else
} //is_valid_dir
+ //is the db table valid
public function is_valid_db() {
if ($this->dbcount > 0) {
return true;
} //else
} //is_valid_db
+ //shorthand get method for the class variables
public function __get($var) {
if (in_array($var, array_keys(get_class_vars(get_class($this))))) {
return $this->$var;
/* DB Index control class
*
+ * Used to control the search index database table
* */
class IndexDBControl {
+ //does the table exist?
public function checkTableExists() {
global $CFG, $db;
} //else
} //checkTableExists
+ //is our database setup valid?
public function checkDB() {
global $CFG, $db;
return $ret;
} //checkDB
+ //add a document record to the table
public function addDocument($document=null) {
global $db;
return $id;
} //addDocument
+ //remove a document record from the index
public function delDocument($document) {
global $db;
//check for php5, but don't die yet (see line 52)
if ($check = search_check_php5()) {
- require_once("$CFG->dirroot/search/querylib.php");
+ require_once("$CFG->dirroot/search/querylib.php");
- $advanced = (optional_param('a', '0', PARAM_INT) == '1') ? true : false;
- $pages = (optional_param('p', '0', PARAM_INT) == '1') ? true : false;
+ $page_number = optional_param('page', -1, PARAM_INT);
+ $pages = ($page_number == -1) ? false : true;
+ $advanced = (optional_param('a', '0', PARAM_INT) == '1') ? true : false;
$query_string = optional_param('query_string', '', PARAM_CLEAN);
if ($pages && isset($_SESSION['search_advanced_query'])) {
+ //if both are set, then we are busy browsing through the result pages of an advanced query
$adv = unserialize($_SESSION['search_advanced_query']);
} else if ($advanced) {
+ //otherwise we are dealing with a new advanced query
unset($_SESSION['search_advanced_query']);
session_unregister('search_advanced_query');
+ //retrieve advanced query variables
$adv->mustappear = trim(optional_param('mustappear', '', PARAM_CLEAN), $chars);
$adv->notappear = trim(optional_param('notappear', '', PARAM_CLEAN), $chars);
$adv->canappear = trim(optional_param('canappear', '', PARAM_CLEAN), $chars);
} //else
if ($advanced) {
+ //parse the advanced variables into a query string
+ //TODO: move out to external query class (QueryParse?)
+
+ //chars to strip from strings (whitespace)
$chars = ' \t\n\r\0\x0B,;';
$query_string = '';
+ //get all available module types
$module_types = array_merge(array('All'), array_values(search_get_document_types()));
$adv->module = in_array($adv->module, $module_types) ? $adv->module : 'All';
+ //convert '1 2' into '+1 +2' for required words field
if (strlen(trim($adv->mustappear)) > 0) {
$query_string = ' +'.implode(' +', preg_split("/[\s,;]+/", $adv->mustappear));
} //if
+ //convert '1 2' into '-1 -2' for not wanted words field
if (strlen(trim($adv->notappear)) > 0) {
$query_string .= ' -'.implode(' -', preg_split("/[\s,;]+/", $adv->notappear));
} //if
+ //this field is left untouched, apart from whitespace being stripped
if (strlen(trim($adv->canappear)) > 0) {
$query_string .= ' '.implode(' ', preg_split("/[\s,;]+/", $adv->canappear));
} //if
+ //add module restriction
if ($adv->module != 'All') {
$query_string .= ' +doctype:'.$adv->module;
} //if
+ //create title search string
if (strlen(trim($adv->title)) > 0) {
$query_string .= ' +title:'.implode(' +title:', preg_split("/[\s,;]+/", $adv->title));
} //if
+ //create author search string
if (strlen(trim($adv->author)) > 0) {
$query_string .= ' +author:'.implode(' +author:', preg_split("/[\s,;]+/", $adv->author));
} //if
+ //save our options if the query is valid
if (!empty($query_string)) {
$_SESSION['search_advanced_query'] = serialize($adv);
} //if
} //if
- $page_number = optional_param('page', 1, PARAM_INT);
-
+ //normalise page number
if ($page_number < 1) {
$page_number = 1;
- } //if
+ } //if
+ //run the query against the index
$sq = new SearchQuery($query_string, $page_number, 10, true);
} //if
print 'Searching: ';
if ($sq->is_valid_index()) {
- print $sq->index_count();
+ //use cached variable to show up-to-date index size (takes deletions into account)
+ print $CFG->search_index_size;
} else {
print "0";
} //else
$hits = $sq->results();
if ($advanced) {
- $page_links = preg_replace("/query_string=[^&]+/", 'a=1&p=1', $page_links);
+ //if in advanced mode, search options are saved in the session, so
+ //we can remove the query string var from the page links, and replace
+ //it with a=1 (Advanced = on) instead
+ $page_links = preg_replace("/query_string=[^&]+/", 'a=1', $page_links);
} //if
print "<ol>";
return count($this->results);
} //count
- public function index_count() {
- return $this->index->count();
- } //index_count
+ //this shouldn't be in this class
+ //public function index_count() {
+ // return $this->index->count();
+ //} //index_count
public function is_valid() {
return ($this->validquery and $this->validindex);
$table->data[] = array('<strong>Database</strong>', '<em><strong>search_documents<strong></em>');
- //add an extra field if we're admin
+ //add extra fields if we're admin
if (isadmin()) {
//don't want to confuse users if the two totals don't match (hint: they should)
$table->data[] = array('Documents in index', $indexinfo->indexcount);
+
+ //*cough* they should match if deletions were actually removed from the index,
+ //as it turns out, they're only marked as deleted and not returned in search results
+ $table->data[] = array('Deletions in index', (int)$indexinfo->indexcount - (int)$indexinfo->dbcount);
} //if
$table->data[] = array('Documents in database', $indexinfo->dbcount);
mtrace("<pre>Starting index update (updates)...\n");
if ($mods = get_records_select('modules')) {
+ $mods = array_merge($mods, search_get_additional_modules());
+
foreach ($mods as $mod) {
$class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
$get_document_function = $mod->name.'_single_document';
mtrace("Checking $mod->name module for updates.");
$values = $db_names_function();
- $sql = "select id, ".$values[0]." as docid from ".$values[1]."
- where ".$values[2]." > $indexdate";
+ //TODO: check 'in' syntax with other RDBMS' (add and update.php as well)
+ $sql = "select id, ".$values[0]." as docid from ".$values[1].
+ " where ".$values[3]." > $indexdate".
+ " and id in (select docid from ".SEARCH_DATABASE_TABLE.")";
$records = get_records_sql($sql);