From 682d4032591a033060e3683c3f0c300c9de4d87e Mon Sep 17 00:00:00 2001 From: mchampan Date: Sun, 25 Jun 2006 23:07:36 +0000 Subject: [PATCH] Initial commit --- blocks/search/block_search.php | 70 ++ blocks/search/config_global.html | 19 + mod/wiki/lib.php | 120 ++++ search/README.txt | 22 + search/Zend/Exception.php | 30 + search/Zend/IMPORTANT.txt | 15 + search/Zend/LICENSE.txt | 27 + search/Zend/Search/Exception.php | 36 + search/Zend/Search/Lucene.php | 614 ++++++++++++++++++ .../Zend/Search/Lucene/Analysis/Analyzer.php | 96 +++ .../Lucene/Analysis/Analyzer/Common.php | 75 +++ .../Lucene/Analysis/Analyzer/Common/Text.php | 78 +++ .../Analyzer/Common/Text/CaseInsensitive.php | 46 ++ search/Zend/Search/Lucene/Analysis/Token.php | 171 +++++ .../Search/Lucene/Analysis/TokenFilter.php | 47 ++ .../Lucene/Analysis/TokenFilter/LowerCase.php | 57 ++ search/Zend/Search/Lucene/Document.php | 111 ++++ .../Zend/Search/Lucene/EncodingConverter.php | 32 + search/Zend/Search/Lucene/Exception.php | 36 + search/Zend/Search/Lucene/Field.php | 161 +++++ search/Zend/Search/Lucene/Index/FieldInfo.php | 45 ++ .../Zend/Search/Lucene/Index/SegmentInfo.php | 575 ++++++++++++++++ .../Search/Lucene/Index/SegmentWriter.php | 519 +++++++++++++++ search/Zend/Search/Lucene/Index/Term.php | 72 ++ search/Zend/Search/Lucene/Index/TermInfo.php | 79 +++ search/Zend/Search/Lucene/Index/Writer.php | 331 ++++++++++ search/Zend/Search/Lucene/Search/Query.php | 100 +++ .../Search/Lucene/Search/Query/MultiTerm.php | 439 +++++++++++++ .../Search/Lucene/Search/Query/Phrase.php | 426 ++++++++++++ .../Zend/Search/Lucene/Search/Query/Term.php | 128 ++++ search/Zend/Search/Lucene/Search/QueryHit.php | 108 +++ .../Zend/Search/Lucene/Search/QueryParser.php | 142 ++++ .../Zend/Search/Lucene/Search/QueryToken.php | 104 +++ .../Search/Lucene/Search/QueryTokenizer.php | 164 +++++ .../Zend/Search/Lucene/Search/Similarity.php | 553 ++++++++++++++++ .../Lucene/Search/Similarity/Default.php | 105 +++ 
search/Zend/Search/Lucene/Search/Weight.php | 61 ++ .../Search/Lucene/Search/Weight/MultiTerm.php | 135 ++++ .../Search/Lucene/Search/Weight/Phrase.php | 141 ++++ .../Zend/Search/Lucene/Search/Weight/Term.php | 146 +++++ .../Zend/Search/Lucene/Storage/Directory.php | 120 ++++ .../Lucene/Storage/Directory/Filesystem.php | 272 ++++++++ search/Zend/Search/Lucene/Storage/File.php | 371 +++++++++++ .../Search/Lucene/Storage/File/Filesystem.php | 171 +++++ search/Zend/Search/TODO.txt | 14 + search/db/mysql.sql | 15 + search/db/postgres7.sql | 21 + search/documents/document.php | 12 + search/documents/wiki_document.php | 28 + search/index.php | 10 + search/indexer.php | 152 +++++ search/indexersplash.php | 44 ++ search/lib.php | 59 ++ search/query.php | 116 ++++ search/stats.php | 91 +++ 55 files changed, 7732 insertions(+) create mode 100644 blocks/search/block_search.php create mode 100644 blocks/search/config_global.html create mode 100644 search/README.txt create mode 100755 search/Zend/Exception.php create mode 100644 search/Zend/IMPORTANT.txt create mode 100644 search/Zend/LICENSE.txt create mode 100644 search/Zend/Search/Exception.php create mode 100644 search/Zend/Search/Lucene.php create mode 100644 search/Zend/Search/Lucene/Analysis/Analyzer.php create mode 100644 search/Zend/Search/Lucene/Analysis/Analyzer/Common.php create mode 100644 search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php create mode 100644 search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php create mode 100644 search/Zend/Search/Lucene/Analysis/Token.php create mode 100644 search/Zend/Search/Lucene/Analysis/TokenFilter.php create mode 100644 search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php create mode 100644 search/Zend/Search/Lucene/Document.php create mode 100644 search/Zend/Search/Lucene/EncodingConverter.php create mode 100644 search/Zend/Search/Lucene/Exception.php create mode 100644 search/Zend/Search/Lucene/Field.php create mode 100644 
search/Zend/Search/Lucene/Index/FieldInfo.php create mode 100644 search/Zend/Search/Lucene/Index/SegmentInfo.php create mode 100644 search/Zend/Search/Lucene/Index/SegmentWriter.php create mode 100644 search/Zend/Search/Lucene/Index/Term.php create mode 100644 search/Zend/Search/Lucene/Index/TermInfo.php create mode 100644 search/Zend/Search/Lucene/Index/Writer.php create mode 100644 search/Zend/Search/Lucene/Search/Query.php create mode 100644 search/Zend/Search/Lucene/Search/Query/MultiTerm.php create mode 100644 search/Zend/Search/Lucene/Search/Query/Phrase.php create mode 100644 search/Zend/Search/Lucene/Search/Query/Term.php create mode 100644 search/Zend/Search/Lucene/Search/QueryHit.php create mode 100644 search/Zend/Search/Lucene/Search/QueryParser.php create mode 100644 search/Zend/Search/Lucene/Search/QueryToken.php create mode 100644 search/Zend/Search/Lucene/Search/QueryTokenizer.php create mode 100644 search/Zend/Search/Lucene/Search/Similarity.php create mode 100644 search/Zend/Search/Lucene/Search/Similarity/Default.php create mode 100644 search/Zend/Search/Lucene/Search/Weight.php create mode 100644 search/Zend/Search/Lucene/Search/Weight/MultiTerm.php create mode 100644 search/Zend/Search/Lucene/Search/Weight/Phrase.php create mode 100644 search/Zend/Search/Lucene/Search/Weight/Term.php create mode 100644 search/Zend/Search/Lucene/Storage/Directory.php create mode 100644 search/Zend/Search/Lucene/Storage/Directory/Filesystem.php create mode 100644 search/Zend/Search/Lucene/Storage/File.php create mode 100644 search/Zend/Search/Lucene/Storage/File/Filesystem.php create mode 100644 search/Zend/Search/TODO.txt create mode 100644 search/db/mysql.sql create mode 100644 search/db/postgres7.sql create mode 100644 search/documents/document.php create mode 100644 search/documents/wiki_document.php create mode 100644 search/index.php create mode 100644 search/indexer.php create mode 100644 search/indexersplash.php create mode 100644 search/lib.php create 
mode 100644 search/query.php create mode 100644 search/stats.php diff --git a/blocks/search/block_search.php b/blocks/search/block_search.php new file mode 100644 index 0000000000..3cafd10da7 --- /dev/null +++ b/blocks/search/block_search.php @@ -0,0 +1,70 @@ + get_string() + */ + + class block_search extends block_base { + + function init() { + $this->title = "Global Search"; //get_string() + $this->version = 20060625; + } //init + + // only one instance of this block is required + function instance_allow_multiple() { + return false; + } //instance_allow_multiple + + // label and button values can be set in admin + function has_config() { + return true; + } //has_config + + function get_content() { + global $CFG; + + //cache block contents + if ($this->content !== NULL) { + return $this->content; + } //if + + $this->content = new stdClass; + + //lazy check for the moment + if (check_php_version("5.0.0")) { + //fetch values if defined in admin, otherwise use defaults + $label = (isset($CFG->block_search_text)) ? $CFG->block_search_text : "Search Moodle"; + $button = (isset($CFG->block_search_button)) ? $CFG->block_search_button : "Go"; + + //basic search form + $this->content->text = + '
' + . "" + . '' + . '' + . '
'; + } else { + $this->content->text = "Sorry folks, PHP 5 is needed for the new search module."; + } //else + + //no footer, thanks + $this->content->footer = ''; + + return $this->content; + } //get_content + + function specialisation() { + //empty! + } //specialisation + + } //block_search + +?> \ No newline at end of file diff --git a/blocks/search/config_global.html b/blocks/search/config_global.html new file mode 100644 index 0000000000..63e6a1b591 --- /dev/null +++ b/blocks/search/config_global.html @@ -0,0 +1,19 @@ +
+ + "/>
+ + + "/>

+ + +
\ No newline at end of file diff --git a/mod/wiki/lib.php b/mod/wiki/lib.php index 54fe22d4f7..487847c9cb 100644 --- a/mod/wiki/lib.php +++ b/mod/wiki/lib.php @@ -352,6 +352,126 @@ function wiki_get_entries(&$wiki, $byindex=NULL) { } } + +/*==== Global search modifications + * Author: Michael Champanis (mchampan) + * Last date: 2006 06 25 + * These modifications allow wiki documents to be indexed in the new + * search engine module - they are probably not final, and as such + * shouldn't be used by other stuff for the time being + **/ + +//rescued and converted from ewikimoodlelib.php +//retrieves latest version of a page +function wiki_get_latest_page(&$entry, $pagename, $version=0) { + global $CFG; + + //need something like this in datalib.php? + switch ($CFG->dbtype) { + case 'mysql': + $f = 'mysql_real_escape_string'; + break; + case 'postgres7': + $f = 'pg_escape_string'; + break; + default: + $f = 'addslashes'; + } //switch + + $pagename = "'".$f($pagename)."'"; + + if ($version > 0 and is_int($version)) { + $version = "AND (version=$version)"; + } else { + $version = ''; + } //else + + $select = "(pagename=$pagename) AND wiki=".$entry->id." 
$version "; + $sort = 'version DESC'; + + //change this to recordset_select, as per http://docs.moodle.org/en/Datalib_Notes + if ($result_arr = get_records_select('wiki_pages', $select, $sort, '*', 0, 1)) { + foreach ($result_arr as $obj) { + $result_obj = $obj; + } //foreach + } //if + + if (isset($result_obj)) { + $result_obj->meta = @unserialize($result_obj->meta); + return $result_obj; + } else { + return false; + } //else +} //wiki_get_latest_page + +//fetches all pages, including old versions +function wiki_get_pages(&$entry) { + return get_records('wiki_pages', 'wiki', $entry->id); +} //wiki_get_pages + +//fetches all the latest versions of all the pages +function wiki_get_latest_pages(&$entry) { + //== (My)SQL for this + /* select * from wiki_pages + inner join + (select wiki_pages.pagename, max(wiki_pages.version) as ver + from wiki_pages group by pagename) as a + on ((wiki_pages.version = a.ver) and + (wiki_pages.pagename like a.pagename)) */ + + $pages = array(); + + //http://moodle.org/bugs/bug.php?op=show&bugid=5877&pos=0 + //if ($ids = get_records('wiki_pages', 'wiki', $entry->id, '', 'distinct pagename')) { + if ($rs = get_recordset('wiki_pages', 'wiki', $entry->id, '', 'distinct pagename')) { + $ids = $rs->GetRows(); + //-- + foreach ($ids as $id) { + $pages[] = wiki_get_latest_page($entry, $id[0]); + } //foreach + } else { + return false; + } //else + + return $pages; +} //wiki_get_latest_pages + +function wiki_iterator() { + return get_all_instances_in_courses("wiki", get_courses()); +} //wiki_search_index + +function wiki_get_content_for_index(&$wiki) { + $documents = array(); + + $entries = wiki_get_entries($wiki); + foreach($entries as $entry) { + //all pages + //$pages = wiki_get_pages($entry); + + //latest pages + $pages = wiki_get_latest_pages($entry); + $i = 0; + + if (is_array($pages)) { + foreach($pages as $page) { + if (strlen($page->content) > 0) { + $i++; + $documents[] = new WikiSearchDocument($page, $entry->wikiid, $entry->course, 
$entry->userid, $entry->groupid); + } //if + } //foreach + + //print "$entry->id : $i"; print "
"; + } else { + print $pages; + } //else + } //foreach + + return $documents; +} //wiki_get_content_for_index + +/*==== Global search modifications end */ + + function wiki_get_default_entry(&$wiki, &$course, $userid=0, $groupid=0) { /// Returns the wiki entry according to the wiki type. /// Optionally, will return wiki entry for $userid student wiki, or diff --git a/search/README.txt b/search/README.txt new file mode 100644 index 0000000000..c3d4ab18e6 --- /dev/null +++ b/search/README.txt @@ -0,0 +1,22 @@ +This is the initial release (prototype) of Moodle's new search module - +so basically watch out for sharp edges. + +The structure has not been finalised, but this is what is working at the +moment, when I start looking at other content to index, it will most likely +change. I don't recommend trying to make your own content modules indexable, +at least not until the whole flow is finalised. I will be implementing the +functions needed to index all of the default content modules on Moodle, so +expect that around mid-August. + +Wiki pages were my goal for this release, they can be indexed and searched, +but not updated or deleted at this stage (was waiting for ZF 0.14 actually). + +I need to check the PostgreSQL sql file, I don't have a PG7 install lying +around to test on, so the script is untested. + +To index for the first time, login as an admin user and browse to /search/index.php +or /search/stats.php - there will be a message and a link telling you to go index. 
+ +-- Michael Champanis (mchampan) + cynnical@gmail.com + Summer of Code 2006 \ No newline at end of file diff --git a/search/Zend/Exception.php b/search/Zend/Exception.php new file mode 100755 index 0000000000..c47fffba30 --- /dev/null +++ b/search/Zend/Exception.php @@ -0,0 +1,30 @@ +_directory = $directory; + $this->_closeDirOnExit = false; + } else { + $this->_directory = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory); + $this->_closeDirOnExit = true; + } + + if ($create) { + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true); + } else { + $this->_writer = null; + } + + $this->_segmentInfos = array(); + + $segmentsFile = $this->_directory->getFileObject('segments'); + + $format = $segmentsFile->readInt(); + + if ($format != (int)0xFFFFFFFF) { + throw new Zend_Search_Lucene_Exception('Wrong segments file format'); + } + + // read version + $segmentsFile->readLong(); + + // read counter + $segmentsFile->readInt(); + + $segments = $segmentsFile->readInt(); + + $this->_docCount = 0; + + // read segmentInfos + for ($count = 0; $count < $segments; $count++) { + $segName = $segmentsFile->readString(); + $segSize = $segmentsFile->readInt(); + $this->_docCount += $segSize; + + $this->_segmentInfos[$count] = + new Zend_Search_Lucene_Index_SegmentInfo($segName, + $segSize, + $this->_directory); + } + } + + + /** + * Object destructor + */ + public function __destruct() + { + $this->commit(); + + if ($this->_closeDirOnExit) { + $this->_directory->close(); + } + } + + /** + * Returns an instance of Zend_Search_Lucene_Index_Writer for the index + * + * @return Zend_Search_Lucene_Index_Writer + */ + public function getIndexWriter() + { + if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) { + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory); + } + + return $this->_writer; + } + + + /** + * Returns the Zend_Search_Lucene_Storage_Directory instance for this index. 
+ * + * @return Zend_Search_Lucene_Storage_Directory + */ + public function getDirectory() + { + return $this->_directory; + } + + + /** + * Returns the total number of documents in this index. + * + * @return integer + */ + public function count() + { + return $this->_docCount; + } + + + /** + * Performs a query against the index and returns an array + * of Zend_Search_Lucene_Search_QueryHit objects. + * Input is a string or Zend_Search_Lucene_Search_Query. + * + * @param mixed $query + * @return array ZSearchHit + */ + public function find($query) + { + if (is_string($query)) { + $query = Zend_Search_Lucene_Search_QueryParser::parse($query); + } + + if (!$query instanceof Zend_Search_Lucene_Search_Query) { + throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object'); + } + + $this->commit(); + + $hits = array(); + $scores = array(); + + $docNum = $this->count(); + for( $count=0; $count < $docNum; $count++ ) { + $docScore = $query->score( $count, $this); + if( $docScore != 0 ) { + $hit = new Zend_Search_Lucene_Search_QueryHit($this); + $hit->id = $count; + $hit->score = $docScore; + + $hits[] = $hit; + $scores[] = $docScore; + } + } + array_multisort($scores, SORT_DESC, SORT_REGULAR, $hits); + + return $hits; + } + + + /** + * Returns a list of all unique field names that exist in this index. + * + * @param boolean $indexed + * @return array + */ + public function getFieldNames($indexed = false) + { + $result = array(); + foreach( $this->_segmentInfos as $segmentInfo ) { + $result = array_merge($result, $segmentInfo->getFields($indexed)); + } + return $result; + } + + + /** + * Returns a Zend_Search_Lucene_Document object for the document + * number $id in this index. 
+ * + * @param integer|Zend_Search_Lucene_Search_QueryHit $id + * @return Zend_Search_Lucene_Document + */ + public function getDocument($id) + { + if ($id instanceof Zend_Search_Lucene_Search_QueryHit) { + /* @var $id Zend_Search_Lucene_Search_QueryHit */ + $id = $id->id; + } + + if ($id >= $this->_docCount) { + throw new Zend_Search_Lucene_Exception('Document id is out of the range.'); + } + + $segCount = 0; + $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count(); + while( $nextSegmentStartId <= $id ) { + $segCount++; + $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count(); + } + $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count(); + + $fdxFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdx'); + $fdxFile->seek( ($id-$segmentStartId)*8, SEEK_CUR ); + $fieldValuesPosition = $fdxFile->readLong(); + + $fdtFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdt'); + $fdtFile->seek( $fieldValuesPosition, SEEK_CUR ); + $fieldCount = $fdtFile->readVInt(); + + $doc = new Zend_Search_Lucene_Document(); + for( $count = 0; $count < $fieldCount; $count++ ) { + $fieldNum = $fdtFile->readVInt(); + $bits = $fdtFile->readByte(); + + $fieldInfo = $this->_segmentInfos[ $segCount ]->getField($fieldNum); + + if( !($bits & 2) ) { // Text data + $field = new Zend_Search_Lucene_Field($fieldInfo->name, + $fdtFile->readString(), + true, + $fieldInfo->isIndexed, + $bits & 1 ); + } else { + $field = new Zend_Search_Lucene_Field($fieldInfo->name, + $fdtFile->readBinary(), + true, + $fieldInfo->isIndexed, + $bits & 1 ); + } + + $doc->addField($field); + } + + return $doc; + } + + + /** + * Returns an array of all the documents which contain term. 
+ * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termDocs(Zend_Search_Lucene_Index_Term $term) + { + $result = array(); + $segmentStartDocId = 0; + + foreach ($this->_segmentInfos as $segInfo) { + $termInfo = $segInfo->getTermInfo($term); + + if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { + $segmentStartDocId += $segInfo->count(); + continue; + } + + $frqFile = $segInfo->openCompoundFile('.frq'); + $frqFile->seek($termInfo->freqPointer,SEEK_CUR); + $docId = 0; + for( $count=0; $count < $termInfo->docFreq; $count++ ) { + $docDelta = $frqFile->readVInt(); + if( $docDelta % 2 == 1 ) { + $docId += ($docDelta-1)/2; + } else { + $docId += $docDelta/2; + // read freq + $frqFile->readVInt(); + } + + $result[] = $segmentStartDocId + $docId; + } + + $segmentStartDocId += $segInfo->count(); + } + + return $result; + } + + + /** + * Returns an array of all term positions in the documents. + * Return array structure: array( docId => array( pos1, pos2, ...), ...) 
+ * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termPositions(Zend_Search_Lucene_Index_Term $term) + { + $result = array(); + $segmentStartDocId = 0; + foreach( $this->_segmentInfos as $segInfo ) { + $termInfo = $segInfo->getTermInfo($term); + + if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { + $segmentStartDocId += $segInfo->count(); + continue; + } + + $frqFile = $segInfo->openCompoundFile('.frq'); + $frqFile->seek($termInfo->freqPointer,SEEK_CUR); + $freqs = array(); + $docId = 0; + + for( $count = 0; $count < $termInfo->docFreq; $count++ ) { + $docDelta = $frqFile->readVInt(); + if( $docDelta % 2 == 1 ) { + $docId += ($docDelta-1)/2; + $freqs[ $docId ] = 1; + } else { + $docId += $docDelta/2; + $freqs[ $docId ] = $frqFile->readVInt(); + } + } + + $prxFile = $segInfo->openCompoundFile('.prx'); + $prxFile->seek($termInfo->proxPointer,SEEK_CUR); + foreach ($freqs as $docId => $freq) { + $termPosition = 0; + $positions = array(); + + for ($count = 0; $count < $freq; $count++ ) { + $termPosition += $prxFile->readVInt(); + $positions[] = $termPosition; + } + + $result[ $segmentStartDocId + $docId ] = $positions; + } + + $segmentStartDocId += $segInfo->count(); + } + + return $result; + } + + + /** + * Returns the number of documents in this index containing the $term. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return integer + */ + public function docFreq(Zend_Search_Lucene_Index_Term $term) + { + $result = 0; + foreach ($this->_segmentInfos as $segInfo) { + $termInfo = $segInfo->getTermInfo($term); + if ($termInfo !== null) { + $result += $termInfo->docFreq; + } + } + + return $result; + } + + + /** + * Retrive similarity used by index reader + * + * @return Zend_Search_Lucene_Search_Similarity + */ + public function getSimilarity() + { + return Zend_Search_Lucene_Search_Similarity::getDefault(); + } + + + /** + * Returns a normalization factor for "field, document" pair. 
+ * + * @param integer $id + * @param string $fieldName + * @return Zend_Search_Lucene_Document + */ + public function norm( $id, $fieldName ) + { + if ($id >= $this->_docCount) { + return null; + } + + $segmentStartId = 0; + foreach ($this->_segmentInfos as $segInfo) { + if ($segmentStartId + $segInfo->count() > $id) { + break; + } + + $segmentStartId += $segInfo->count(); + } + + if ($segInfo->isDeleted($id - $segmentStartId)) { + return 0; + } + + return $segInfo->norm($id - $segmentStartId, $fieldName); + } + + /** + * Returns true if any documents have been deleted from this index. + * + * @return boolean + */ + public function hasDeletions() + { + foreach ($this->_segmentInfos as $segmentInfo) { + if ($segmentInfo->hasDeletions()) { + return true; + } + } + + return false; + } + + + /** + * Deletes a document from the index. + * $id is an internal document id + * + * @param integer|Zend_Search_Lucene_Search_QueryHit $id + * @throws Zend_Search_Lucene_Exception + */ + public function delete($id) + { + if ($id instanceof Zend_Search_Lucene_Search_QueryHit) { + /* @var $id Zend_Search_Lucene_Search_QueryHit */ + $id = $id->id; + } + + if ($id >= $this->_docCount) { + throw new Zend_Search_Lucene_Exception('Document id is out of the range.'); + } + + $segCount = 0; + $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count(); + while( $nextSegmentStartId <= $id ) { + $segCount++; + $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count(); + } + + $this->_hasChanges = true; + $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count(); + $this->_segmentInfos[ $segCount ]->delete($id - $segmentStartId); + } + + + + /** + * Adds a document to this index. 
+ * + * @param Zend_Search_Lucene_Document $document + */ + public function addDocument(Zend_Search_Lucene_Document $document) + { + if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) { + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory); + } + + $this->_writer->addDocument($document); + } + + + /** + * Commit changes resulting from delete() or undeleteAll() operations. + * + * @todo delete() and undeleteAll processing. + */ + public function commit() + { + if ($this->_hasChanges) { + foreach ($this->_segmentInfos as $segInfo) { + $segInfo->writeChanges(); + } + + $this->_hasChanges = false; + } + + if ($this->_writer !== null) { + foreach ($this->_writer->commit() as $segmentName => $segmentInfo) { + if ($segmentInfo !== null) { + $this->_segmentInfos[] = $segmentInfo; + $this->_docCount += $segmentInfo->count(); + } else { + foreach ($this->_segmentInfos as $segId => $segInfo) { + if ($segInfo->getName() == $segmentName) { + unset($this->_segmentInfos[$segId]); + } + } + } + } + } + } + + + /************************************************************************* + @todo UNIMPLEMENTED + *************************************************************************/ + + /** + * Returns an array of all terms in this index. + * + * @todo Implementation + * @return array + */ + public function terms() + { + return array(); + } + + + /** + * Undeletes all documents currently marked as deleted in this index. + * + * @todo Implementation + */ + public function undeleteAll() + {} +} \ No newline at end of file diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer.php b/search/Zend/Search/Lucene/Analysis/Analyzer.php new file mode 100644 index 0000000000..febf88e614 --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/Analyzer.php @@ -0,0 +1,96 @@ +_filters[] = $filter; + } + + /** + * Apply filters to the token. 
+ * + * @param Zend_Search_Lucene_Analysis_Token $token + * @return Zend_Search_Lucene_Analysis_Token + */ + public function normalize(Zend_Search_Lucene_Analysis_Token $token) + { + foreach ($this->_filters as $filter) { + $token = $filter->normalize($token); + } + + return $token; + } +} + diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php new file mode 100644 index 0000000000..6f6f0dd936 --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php @@ -0,0 +1,78 @@ +normalize($token); + } + + return $tokenStream; + } +} + diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php new file mode 100644 index 0000000000..e5fc372628 --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php @@ -0,0 +1,46 @@ +addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase()); + } +} + diff --git a/search/Zend/Search/Lucene/Analysis/Token.php b/search/Zend/Search/Lucene/Analysis/Token.php new file mode 100644 index 0000000000..f2e9ee7cad --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/Token.php @@ -0,0 +1,171 @@ +_termText = $text; + $this->_startOffset = $start; + $this->_endOffset = $end; + $this->_type = $type; + + $this->_positionIncrement = 1; + } + + + /** + * positionIncrement setter + * + * @param integer $positionIncrement + */ + public function setPositionIncrement($positionIncrement) + { + $this->_positionIncrement = $positionIncrement; + } + + /** + * Returns the position increment of this Token. + * + * @return integer + */ + public function getPositionIncrement() + { + return $this->_positionIncrement; + } + + /** + * Returns the Token's term text. 
+ * + * @return string + */ + public function getTermText() + { + return $this->_termText; + } + + /** + * Returns this Token's starting offset, the position of the first character + * corresponding to this token in the source text. + * + * Note: + * The difference between getEndOffset() and getStartOffset() may not be equal + * to strlen(Zend_Search_Lucene_Analysis_Token::getTermText()), as the term text may have been altered + * by a stemmer or some other filter. + * + * @return integer + */ + public function getStartOffset() + { + return $this->_startOffset; + } + + /** + * Returns this Token's ending offset, one greater than the position of the + * last character corresponding to this token in the source text. + * + * @return integer + */ + public function getEndOffset() + { + return $this->_endOffset; + } + + /** + * Returns this Token's lexical type. Defaults to 'word'. + * + * @return string + */ + public function getType() + { + return $this->_type; + } +} + diff --git a/search/Zend/Search/Lucene/Analysis/TokenFilter.php b/search/Zend/Search/Lucene/Analysis/TokenFilter.php new file mode 100644 index 0000000000..a363aa1c1c --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/TokenFilter.php @@ -0,0 +1,47 @@ +getTermText() ), + $srcToken->getStartOffset(), + $srcToken->getEndOffset(), + $srcToken->getType()); + + $newToken->setPositionIncrement($srcToken->getPositionIncrement()); + + return $newToken; + } +} + diff --git a/search/Zend/Search/Lucene/Document.php b/search/Zend/Search/Lucene/Document.php new file mode 100644 index 0000000000..48e48cf17a --- /dev/null +++ b/search/Zend/Search/Lucene/Document.php @@ -0,0 +1,111 @@ +getFieldValue($offset); + } + + + /** + * Add a field object to this document. + * + * @param Zend_Search_Lucene_Field $field + */ + public function addField(Zend_Search_Lucene_Field $field) + { + $this->_fields[$field->name] = $field; + } + + + /** + * Return an array with the names of the fields in this document. 
+ * + * @return array + */ + public function getFieldNames() + { + return array_keys($this->_fields); + } + + + /** + * Returns Zend_Search_Lucene_Field object for a named field in this document. + * + * @param string $fieldName + * @return Zend_Search_Lucene_Field + */ + public function getField($fieldName) + { + if (!array_key_exists($fieldName, $this->_fields)) { + throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document."); + } + return $this->_fields[$fieldName]; + } + + + /** + * Returns the string value of a named field in this document. + * + * @see __get() + * @return string + */ + public function getFieldValue($fieldName) + { + return $this->getField($fieldName)->stringValue; + } + +} diff --git a/search/Zend/Search/Lucene/EncodingConverter.php b/search/Zend/Search/Lucene/EncodingConverter.php new file mode 100644 index 0000000000..9c22e5cd73 --- /dev/null +++ b/search/Zend/Search/Lucene/EncodingConverter.php @@ -0,0 +1,32 @@ +in_encoding = $in_encoding; + $this->out_encoding = $out_encoding; + } //constructor + + function handleError($err, $msg) { + $this->last_error = $msg; + } //handleError + + function convert($str) { + $this->last_error = FALSE; + + set_error_handler(array(&$this, 'handleError')); + $ret = iconv($this->in_encoding, $this->out_encoding, $str); + restore_error_handler(); + + return $ret; + } //convert + + function getLastError() { + return $this->last_error; + } //getLastError +} //EncodingConverter + +?> \ No newline at end of file diff --git a/search/Zend/Search/Lucene/Exception.php b/search/Zend/Search/Lucene/Exception.php new file mode 100644 index 0000000000..5b73b29c5e --- /dev/null +++ b/search/Zend/Search/Lucene/Exception.php @@ -0,0 +1,36 @@ +name = $name; + + if (!$isBinary) { + /* + $econv = new EncodingConverter(mb_detect_encoding($stringValue), 'ASCII//TRANSLIT'); + $this->stringValue = $econv->convert($stringValue); + + if ($econv->getLastError()) { + echo "Error: ".$econv->getLastError(); + 
echo "
"; + echo "x".$stringValue."x"; + exit(); + } else { + }*/ + + /** + * @todo Correct UTF-8 string should be required in future + * Until full UTF-8 support is not completed, string should be normalized to ANSII encoding + */ + + $this->stringValue = iconv('ISO-8859-1', 'ASCII//TRANSLIT', $stringValue); + //$this->stringValue = iconv(mb_detect_encoding($stringValue), 'ASCII//TRANSLIT', $stringValue); + } else { + $this->stringValue = $stringValue; + } + $this->isStored = $isStored; + $this->isIndexed = $isIndexed; + $this->isTokenized = $isTokenized; + $this->isBinary = $isBinary; + + $this->storeTermVector = false; + $this->boost = 1.0; + } + + + /** + * Constructs a String-valued Field that is not tokenized, but is indexed + * and stored. Useful for non-text fields, e.g. date or url. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function Keyword($name, $value) + { + return new self($name, $value, true, true, false); + } + + + /** + * Constructs a String-valued Field that is not tokenized nor indexed, + * but is stored in the index, for return with hits. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function UnIndexed($name, $value) + { + return new self($name, $value, true, false, false); + } + + + /** + * Constructs a Binary String valued Field that is not tokenized nor indexed, + * but is stored in the index, for return with hits. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function Binary($name, $value) + { + return new self($name, $value, true, false, false, true); + } + + /** + * Constructs a String-valued Field that is tokenized and indexed, + * and is stored in the index, for return with hits. Useful for short text + * fields, like "title" or "subject". Term vector will not be stored for this field. 
+ * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function Text($name, $value) + { + return new self($name, $value, true, true, true); + } + + + /** + * Constructs a String-valued Field that is tokenized and indexed, + * but that is not stored in the index. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function UnStored($name, $value) + { + return new self($name, $value, false, true, true); + } + +} + diff --git a/search/Zend/Search/Lucene/Index/FieldInfo.php b/search/Zend/Search/Lucene/Index/FieldInfo.php new file mode 100644 index 0000000000..4c11aaac77 --- /dev/null +++ b/search/Zend/Search/Lucene/Index/FieldInfo.php @@ -0,0 +1,45 @@ +name = $name; + $this->isIndexed = $isIndexed; + $this->number = $number; + $this->storeTermVector = $storeTermVector; + } +} + diff --git a/search/Zend/Search/Lucene/Index/SegmentInfo.php b/search/Zend/Search/Lucene/Index/SegmentInfo.php new file mode 100644 index 0000000000..3defbed1dd --- /dev/null +++ b/search/Zend/Search/Lucene/Index/SegmentInfo.php @@ -0,0 +1,575 @@ + normVector + * normVector is a binary string. + * Each byte corresponds to an indexed document in a segment and + * encodes normalization factor (float value, encoded by + * Zend_Search_Lucene_Search_Similarity::encodeNorm()) + * + * @var array + */ + private $_norms = array(); + + /** + * List of deleted documents. + * bitset if bitset extension is loaded or array otherwise. + * + * @var mixed + */ + private $_deleted; + + /** + * $this->_deleted update flag + * + * @var boolean + */ + private $_deletedDirty = false; + + /** + * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname, + * Documents count and Directory as a parameter. 
+ * + * @param string $name + * @param integer $docCount + * @param Zend_Search_Lucene_Storage_Directory $directory + */ + public function __construct($name, $docCount, $directory) + { + $this->_name = $name; + $this->_docCount = $docCount; + $this->_directory = $directory; + $this->_termDictionary = null; + + $this->_segFiles = array(); + if ($this->_directory->fileExists($name . '.cfs')) { + $cfsFile = $this->_directory->getFileObject($name . '.cfs'); + $segFilesCount = $cfsFile->readVInt(); + + for ($count = 0; $count < $segFilesCount; $count++) { + $dataOffset = $cfsFile->readLong(); + $fileName = $cfsFile->readString(); + $this->_segFiles[$fileName] = $dataOffset; + } + } + + $fnmFile = $this->openCompoundFile('.fnm'); + $fieldsCount = $fnmFile->readVInt(); + $fieldNames = array(); + $fieldNums = array(); + $this->_fields = array(); + for ($count=0; $count < $fieldsCount; $count++) { + $fieldName = $fnmFile->readString(); + $fieldBits = $fnmFile->readByte(); + $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName, + $fieldBits & 1, + $count, + $fieldBits & 2 ); + if ($fieldBits & 0x10) { + // norms are omitted for the indexed field + $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount); + } + + $fieldNums[$count] = $count; + $fieldNames[$count] = $fieldName; + } + array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums); + $this->_fieldsDicPositions = array_flip($fieldNums); + + try { + $delFile = $this->openCompoundFile('.del'); + + $byteCount = $delFile->readInt(); + $byteCount = ceil($byteCount/8); + $bitCount = $delFile->readInt(); + + if ($bitCount == 0) { + $delBytes = ''; + } else { + $delBytes = $delFile->readBytes($byteCount); + } + + if (extension_loaded('bitset')) { + $this->_deleted = $delBytes; + } else { + $this->_deleted = array(); + for ($count = 0; $count < $byteCount; $count++) { + $byte = ord($delBytes{$count}); + for ($bit = 0; $bit < 8; $bit++) { + if ($byte 
& (1<<$bit)) { + $this->_deleted[$count*8 + $bit] = 1; + } + } + } + + } + } catch(Zend_Search_Exception $e) { + if (strpos($e->getMessage(), 'compound file doesn\'t contain') !== false ) { + $this->_deleted = null; + } else { + throw $e; + } + } + } + + /** + * Opens index file stoted within compound index file + * + * @param string $extension + * @throws Zend_Search_Lucene_Exception + * @return Zend_Search_Lucene_Storage_File + */ + public function openCompoundFile($extension) + { + $filename = $this->_name . $extension; + + // Try to open common file first + if ($this->_directory->fileExists($filename)) { + return $this->_directory->getFileObject($filename); + } + + if( !isset($this->_segFiles[$filename]) ) { + throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain ' + . $filename . ' file.' ); + } + + $file = $this->_directory->getFileObject( $this->_name.".cfs" ); + $file->seek($this->_segFiles[$filename]); + return $file; + } + + /** + * Returns field index or -1 if field is not found + * + * @param string $fieldName + * @return integer + */ + public function getFieldNum($fieldName) + { + foreach( $this->_fields as $field ) { + if( $field->name == $fieldName ) { + return $field->number; + } + } + + return -1; + } + + /** + * Returns field info for specified field + * + * @param integer $fieldNum + * @return ZSearchFieldInfo + */ + public function getField($fieldNum) + { + return $this->_fields[$fieldNum]; + } + + /** + * Returns array of fields. + * if $indexed parameter is true, then returns only indexed fields. + * + * @param boolean $indexed + * @return array + */ + public function getFields($indexed = false) + { + $result = array(); + foreach( $this->_fields as $field ) { + if( (!$indexed) || $field->isIndexed ) { + $result[ $field->name ] = $field->name; + } + } + return $result; + } + + /** + * Returns the total number of documents in this segment. 
+ * + * @return integer + */ + public function count() + { + return $this->_docCount; + } + + /** + * Get field position in a fields dictionary + * + * @param integer $fieldNum + * @return integer + */ + private function _getFieldPosition($fieldNum) { + // Treat values which are not in a translation table as a 'direct value' + return isset($this->_fieldsDicPositions[$fieldNum]) ? + $this->_fieldsDicPositions[$fieldNum] : $fieldNum; + } + + /** + * Loads Term dictionary from TermInfoIndex file + */ + protected function _loadDictionary() + { + if ($this->_termDictionary !== null) { + return; + } + + $this->_termDictionary = array(); + $this->_termDictionaryInfos = array(); + + $tiiFile = $this->openCompoundFile('.tii'); + $tiVersion = $tiiFile->readInt(); + if ($tiVersion != (int)0xFFFFFFFE) { + throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format'); + } + + $indexTermCount = $tiiFile->readLong(); + $tiiFile->readInt(); // IndexInterval + $skipInterval = $tiiFile->readInt(); + + $prevTerm = ''; + $freqPointer = 0; + $proxPointer = 0; + $indexPointer = 0; + for ($count = 0; $count < $indexTermCount; $count++) { + $termPrefixLength = $tiiFile->readVInt(); + $termSuffix = $tiiFile->readString(); + $termValue = substr( $prevTerm, 0, $termPrefixLength ) . 
$termSuffix; + + $termFieldNum = $tiiFile->readVInt(); + $docFreq = $tiiFile->readVInt(); + $freqPointer += $tiiFile->readVInt(); + $proxPointer += $tiiFile->readVInt(); + if( $docFreq >= $skipInterval ) { + $skipDelta = $tiiFile->readVInt(); + } else { + $skipDelta = 0; + } + + $indexPointer += $tiiFile->readVInt(); + + $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue,$termFieldNum); + $this->_termDictionaryInfos[] = + new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer); + $prevTerm = $termValue; + } + } + + + /** + * Return segment name + * + * @return string + */ + public function getName() + { + return $this->_name; + } + + + /** + * Scans terms dictionary and returns term info + * + * @param Zend_Search_Lucene_Index_Term $term + * @return Zend_Search_Lucene_Index_TermInfo + */ + public function getTermInfo($term) + { + $this->_loadDictionary(); + + $searchField = $this->getFieldNum($term->field); + + if ($searchField == -1) { + return null; + } + $searchDicField = $this->_getFieldPosition($searchField); + + // search for appropriate value in dictionary + $lowIndex = 0; + $highIndex = count($this->_termDictionary)-1; + while ($highIndex >= $lowIndex) { + // $mid = ($highIndex - $lowIndex)/2; + $mid = ($highIndex + $lowIndex) >> 1; + $midTerm = $this->_termDictionary[$mid]; + + $fieldNum = $this->_getFieldPosition($midTerm->field); + $delta = $searchDicField - $fieldNum; + if ($delta == 0) { + $delta = strcmp($term->text, $midTerm->text); + } + + if ($delta < 0) { + $highIndex = $mid-1; + } elseif ($delta > 0) { + $lowIndex = $mid+1; + } else { + return $this->_termDictionaryInfos[$mid]; // We got it! 
+ } + } + + if ($highIndex == -1) { + // Term is out of the dictionary range + return null; + } + + $prevPosition = $highIndex; + $prevTerm = $this->_termDictionary[$prevPosition]; + $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ]; + + $tisFile = $this->openCompoundFile('.tis'); + $tiVersion = $tisFile->readInt(); + if ($tiVersion != (int)0xFFFFFFFE) { + throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format'); + } + + $termCount = $tisFile->readLong(); + $indexInterval = $tisFile->readInt(); + $skipInterval = $tisFile->readInt(); + + $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR); + + $termValue = $prevTerm->text; + $termFieldNum = $prevTerm->field; + $freqPointer = $prevTermInfo->freqPointer; + $proxPointer = $prevTermInfo->proxPointer; + for ($count = $prevPosition*$indexInterval + 1; + $count < $termCount && + ( $this->_getFieldPosition($termFieldNum) < $searchDicField || + ($this->_getFieldPosition($termFieldNum) == $searchDicField && + strcmp($termValue, $term->text) < 0) ); + $count++) { + $termPrefixLength = $tisFile->readVInt(); + $termSuffix = $tisFile->readString(); + $termFieldNum = $tisFile->readVInt(); + $termValue = substr( $termValue, 0, $termPrefixLength ) . 
$termSuffix; + + $docFreq = $tisFile->readVInt(); + $freqPointer += $tisFile->readVInt(); + $proxPointer += $tisFile->readVInt(); + if( $docFreq >= $skipInterval ) { + $skipOffset = $tisFile->readVInt(); + } else { + $skipOffset = 0; + } + } + + if ($termFieldNum == $searchField && $termValue == $term->text) { + return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); + } else { + return null; + } + } + + /** + * Returns normalization factor for specified documents + * + * @param integer $id + * @param string $fieldName + * @return string + */ + public function norm($id, $fieldName) + { + $fieldNum = $this->getFieldNum($fieldName); + + if ( !($this->_fields[$fieldNum]->isIndexed) ) { + return null; + } + + if ( !isset( $this->_norms[$fieldNum] )) { + $fFile = $this->openCompoundFile('.f' . $fieldNum); + $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount); + } + + return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum]{$id}) ); + } + + + /** + * Returns true if any documents have been deleted from this index segment. + * + * @return boolean + */ + public function hasDeletions() + { + return $this->_deleted !== null; + } + + + /** + * Deletes a document from the index segment. 
+ * $id is an internal document id + * + * @param integer + */ + public function delete($id) + { + $this->_deletedDirty = true; + + if (extension_loaded('bitset')) { + if ($this->_deleted === null) { + $this->_deleted = bitset_empty($id); + } + bitset_incl($this->_deleted, $id); + } else { + if ($this->_deleted === null) { + $this->_deleted = array(); + } + + $this->_deleted[$id] = 1; + } + } + + /** + * Checks, that document is deleted + * + * @param integer + * @return boolean + */ + public function isDeleted($id) + { + if ($this->_deleted === null) { + return false; + } + + if (extension_loaded('bitset')) { + return bitset_in($this->_deleted, $id); + } else { + return isset($this->_deleted[$id]); + } + } + + + /** + * Write changes if it's necessary. + */ + public function writeChanges() + { + if (!$this->_deletedDirty) { + return; + } + + if (extension_loaded('bitset')) { + $delBytes = $this->_deleted; + $bitCount = count(bitset_to_array($delBytes)); + } else { + $byteCount = floor($this->_docCount/8)+1; + $delBytes = str_repeat(chr(0), $byteCount); + for ($count = 0; $count < $byteCount; $count++) { + $byte = 0; + for ($bit = 0; $bit < 8; $bit++) { + if (isset($this->_deleted[$count*8 + $bit])) { + $byte |= (1<<$bit); + } + } + $delBytes{$count} = chr($byte); + } + $bitCount = count($this->_deleted); + } + + + $delFile = $this->_directory->createFile($this->_name . 
'.del'); + $delFile->writeInt($this->_docCount); + $delFile->writeInt($bitCount); + $delFile->writeBytes($delBytes); + + $this->_deletedDirty = false; + } +} + diff --git a/search/Zend/Search/Lucene/Index/SegmentWriter.php b/search/Zend/Search/Lucene/Index/SegmentWriter.php new file mode 100644 index 0000000000..6cb4477f99 --- /dev/null +++ b/search/Zend/Search/Lucene/Index/SegmentWriter.php @@ -0,0 +1,519 @@ +_directory = $directory; + $this->_name = $name; + $this->_docCount = 0; + + $this->_fields = array(); + $this->_termDocs = array(); + $this->_files = array(); + $this->_norms = array(); + $this->_fieldLengths = array(); + $this->_termDictionary = array(); + + $this->_fdxFile = null; + $this->_fdtFile = null; + } + + + /** + * Add field to the segment + * + * @param Zend_Search_Lucene_Field $field + */ + private function _addFieldInfo(Zend_Search_Lucene_Field $field) + { + if (!isset($this->_fields[$field->name])) { + $this->_fields[$field->name] = + new Zend_Search_Lucene_Index_FieldInfo($field->name, + $field->isIndexed, + count($this->_fields), + $field->storeTermVector); + } else { + $this->_fields[$field->name]->isIndexed |= $field->isIndexed; + $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector; + } + } + + + /** + * Adds a document to this segment. 
+ * + * @param Zend_Search_Lucene_Document $document + * @throws Zend_Search_Lucene_Exception + */ + public function addDocument(Zend_Search_Lucene_Document $document) + { + $storedFields = array(); + + foreach ($document->getFieldNames() as $fieldName) { + $field = $document->getField($fieldName); + $this->_addFieldInfo($field); + + if ($field->storeTermVector) { + /** + * @todo term vector storing support + */ + throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.'); + } + + if ($field->isIndexed) { + if ($field->isTokenized) { + $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue); + } else { + $tokenList = array(); + $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue)); + } + $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList); + + $position = 0; + foreach ($tokenList as $token) { + $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name); + $termKey = $term->key(); + + if (!isset($this->_termDictionary[$termKey])) { + // New term + $this->_termDictionary[$termKey] = $term; + $this->_termDocs[$termKey] = array(); + $this->_termDocs[$termKey][$this->_docCount] = array(); + } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) { + // Existing term, but new term entry + $this->_termDocs[$termKey][$this->_docCount] = array(); + } + $position += $token->getPositionIncrement(); + $this->_termDocs[$termKey][$this->_docCount][] = $position; + } + } + + if ($field->isStored) { + $storedFields[] = $field; + } + } + + if (count($storedFields) != 0) { + if (!isset($this->_fdxFile)) { + $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx'); + $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); + + $this->_files[] = $this->_name . '.fdx'; + $this->_files[] = $this->_name . 
'.fdt'; + } + + $this->_fdxFile->writeLong($this->_fdtFile->tell()); + $this->_fdtFile->writeVInt(count($storedFields)); + foreach ($storedFields as $field) { + $this->_fdtFile->writeVInt($this->_fields[$field->name]->number); + $fieldBits = ($field->isTokenized ? 0x01 : 0x00) | + ($field->isBinary ? 0x02 : 0x00) | + 0x00; /* 0x04 - third bit, compressed (ZLIB) */ + $this->_fdtFile->writeByte($fieldBits); + if ($field->isBinary) { + $this->_fdtFile->writeVInt(strlen($field->stringValue)); + $this->_fdtFile->writeBytes($field->stringValue); + } else { + $this->_fdtFile->writeString($field->stringValue); + } + } + } + + $this->_docCount++; + } + + + /** + * Dump Field Info (.fnm) segment file + */ + private function _dumpFNM() + { + $fnmFile = $this->_directory->createFile($this->_name . '.fnm'); + $fnmFile->writeVInt(count($this->_fields)); + + foreach ($this->_fields as $field) { + $fnmFile->writeString($field->name); + $fnmFile->writeByte(($field->isIndexed ? 0x01 : 0x00) | + ($field->storeTermVector ? 0x02 : 0x00) +// not supported yet 0x04 /* term positions are stored with the term vectors */ | +// not supported yet 0x08 /* term offsets are stored with the term vectors */ | + ); + + if ($field->isIndexed) { + $fieldNum = $this->_fields[$field->name]->number; + $fieldName = $field->name; + $similarity = Zend_Search_Lucene_Search_Similarity::getDefault(); + $norm = ''; + + for ($count = 0; $count < $this->_docCount; $count++) { + $numTokens = isset($this->_fieldLengths[$fieldName][$count]) ? + $this->_fieldLengths[$fieldName][$count] : 0; + $norm .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, $numTokens))); + } + + $normFileName = $this->_name . '.f' . $fieldNum; + $fFile = $this->_directory->createFile($normFileName); + $fFile->writeBytes($norm); + $this->_files[] = $normFileName; + } + } + + $this->_files[] = $this->_name . '.fnm'; + } + + + /** + * Dump Term Dictionary segment file entry. 
+ * Used to write entry to .tis or .tii files + * + * @param Zend_Search_Lucene_Storage_File $dicFile + * @param Zend_Search_Lucene_Index_Term $prevTerm + * @param Zend_Search_Lucene_Index_Term $term + * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo + * @param Zend_Search_Lucene_Index_TermInfo $termInfo + */ + private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile, + &$prevTerm, Zend_Search_Lucene_Index_Term $term, + &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo) + { + if (isset($prevTerm) && $prevTerm->field == $term->field) { + $prefixLength = 0; + while ($prefixLength < strlen($prevTerm->text) && + $prefixLength < strlen($term->text) && + $prevTerm->text{$prefixLength} == $term->text{$prefixLength} + ) { + $prefixLength++; + } + // Write preffix length + $dicFile->writeVInt($prefixLength); + // Write suffix + $dicFile->writeString( substr($term->text, $prefixLength) ); + } else { + // Write preffix length + $dicFile->writeVInt(0); + // Write suffix + $dicFile->writeString($term->text); + } + // Write field number + $dicFile->writeVInt($term->field); + // DocFreq (the count of documents which contain the term) + $dicFile->writeVInt($termInfo->docFreq); + + $prevTerm = $term; + + if (!isset($prevTermInfo)) { + // Write FreqDelta + $dicFile->writeVInt($termInfo->freqPointer); + // Write ProxDelta + $dicFile->writeVInt($termInfo->proxPointer); + } else { + // Write FreqDelta + $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer); + // Write ProxDelta + $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer); + } + // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval + if ($termInfo->skipOffset != 0) { + $dicFile->writeVInt($termInfo->skipOffset); + } + + $prevTermInfo = $termInfo; + } + + /** + * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files + */ + private function _dumpDictionary() + { + $termKeys = 
array_keys($this->_termDictionary); + sort($termKeys, SORT_STRING); + + $tisFile = $this->_directory->createFile($this->_name . '.tis'); + $tisFile->writeInt((int)0xFFFFFFFE); + $tisFile->writeLong(count($termKeys)); + $tisFile->writeInt(self::$indexInterval); + $tisFile->writeInt(self::$skipInterval); + + $tiiFile = $this->_directory->createFile($this->_name . '.tii'); + $tiiFile->writeInt((int)0xFFFFFFFE); + $tiiFile->writeLong(ceil((count($termKeys) + 2)/self::$indexInterval)); + $tiiFile->writeInt(self::$indexInterval); + $tiiFile->writeInt(self::$skipInterval); + + /** Dump dictionary header */ + $tiiFile->writeVInt(0); // preffix length + $tiiFile->writeString(''); // suffix + $tiiFile->writeInt((int)0xFFFFFFFF); // field number + $tiiFile->writeByte((int)0x0F); + $tiiFile->writeVInt(0); // DocFreq + $tiiFile->writeVInt(0); // FreqDelta + $tiiFile->writeVInt(0); // ProxDelta + $tiiFile->writeVInt(20); // IndexDelta + + $frqFile = $this->_directory->createFile($this->_name . '.frq'); + $prxFile = $this->_directory->createFile($this->_name . '.prx'); + + $termCount = 1; + + $prevTerm = null; + $prevTermInfo = null; + $prevIndexTerm = null; + $prevIndexTermInfo = null; + $prevIndexPosition = 20; + + foreach ($termKeys as $termId) { + $freqPointer = $frqFile->tell(); + $proxPointer = $prxFile->tell(); + + $prevDoc = 0; + foreach ($this->_termDocs[$termId] as $docId => $termPositions) { + $docDelta = ($docId - $prevDoc)*2; + $prevDoc = $docId; + if (count($termPositions) > 1) { + $frqFile->writeVInt($docDelta); + $frqFile->writeVInt(count($termPositions)); + } else { + $frqFile->writeVInt($docDelta + 1); + } + + $prevPosition = 0; + foreach ($termPositions as $position) { + $prxFile->writeVInt($position - $prevPosition); + $prevPosition = $position; + } + } + + if (count($this->_termDocs[$termId]) >= self::$skipInterval) { + /** + * @todo Write Skip Data to a freq file. 
+ * It's not used now, but make index more optimal + */ + $skipOffset = $frqFile->tell() - $freqPointer; + } else { + $skipOffset = 0; + } + + $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text, + $this->_fields[$this->_termDictionary[$termId]->field]->number); + $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]), + $freqPointer, $proxPointer, $skipOffset); + + $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo); + + if ($termCount % self::$indexInterval == 0) { + $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo); + + $indexPosition = $tisFile->tell(); + $tiiFile->writeVInt($indexPosition - $prevIndexPosition); + $prevIndexPosition = $indexPosition; + } + $termCount++; + } + + $this->_files[] = $this->_name . '.tis'; + $this->_files[] = $this->_name . '.tii'; + $this->_files[] = $this->_name . '.frq'; + $this->_files[] = $this->_name . '.prx'; + } + + + /** + * Generate compound index file + */ + private function _generateCFS() + { + $cfsFile = $this->_directory->createFile($this->_name . 
'.cfs'); + $cfsFile->writeVInt(count($this->_files)); + + $dataOffsetPointers = array(); + foreach ($this->_files as $fileName) { + $dataOffsetPointers[$fileName] = $cfsFile->tell(); + $cfsFile->writeLong(0); // write dummy data + $cfsFile->writeString($fileName); + } + + foreach ($this->_files as $fileName) { + // Get actual data offset + $dataOffset = $cfsFile->tell(); + // Seek to the data offset pointer + $cfsFile->seek($dataOffsetPointers[$fileName]); + // Write actual data offset value + $cfsFile->writeLong($dataOffset); + // Seek back to the end of file + $cfsFile->seek($dataOffset); + + $dataFile = $this->_directory->getFileObject($fileName); + $data = $dataFile->readBytes($this->_directory->fileLength($fileName)); + $cfsFile->writeBytes($data); + + $this->_directory->deleteFile($fileName); + } + } + + + /** + * Close segment, write it to disk and return segment info + * + * @return Zend_Search_Lucene_Index_SegmentInfo + */ + public function close() + { + if ($this->_docCount == 0) { + return null; + } + + $this->_dumpFNM(); + $this->_dumpDictionary(); + + $this->_generateCFS(); + + return new Zend_Search_Lucene_Index_SegmentInfo($this->_name, + $this->_docCount, + $this->_directory); + } + +} + diff --git a/search/Zend/Search/Lucene/Index/Term.php b/search/Zend/Search/Lucene/Index/Term.php new file mode 100644 index 0000000000..3deffa90ab --- /dev/null +++ b/search/Zend/Search/Lucene/Index/Term.php @@ -0,0 +1,72 @@ +field = $field; + $this->text = $text; + } + + + /** + * @todo docblock + */ + public function key() + { + return $this->field . chr(0) . 
$this->text; + } +} + diff --git a/search/Zend/Search/Lucene/Index/TermInfo.php b/search/Zend/Search/Lucene/Index/TermInfo.php new file mode 100644 index 0000000000..7dcfcc8a96 --- /dev/null +++ b/search/Zend/Search/Lucene/Index/TermInfo.php @@ -0,0 +1,79 @@ +docFreq = $docFreq; + $this->freqPointer = $freqPointer; + $this->proxPointer = $proxPointer; + $this->skipOffset = $skipOffset; + $this->indexPointer = $indexPointer; + } +} + diff --git a/search/Zend/Search/Lucene/Index/Writer.php b/search/Zend/Search/Lucene/Index/Writer.php new file mode 100644 index 0000000000..ef6c65526a --- /dev/null +++ b/search/Zend/Search/Lucene/Index/Writer.php @@ -0,0 +1,331 @@ + '.cfs', + '.fnm' => '.fnm', + '.fdx' => '.fdx', + '.fdt' => '.fdt', + '.tis' => '.tis', + '.tii' => '.tii', + '.frq' => '.frq', + '.prx' => '.prx', + '.tvx' => '.tvx', + '.tvd' => '.tvd', + '.tvf' => '.tvf', + '.del' => '.del' ); + + /** + * Opens the index for writing + * + * IndexWriter constructor needs Directory as a parameter. It should be + * a string with a path to the index folder or a Directory object. + * Second constructor parameter create is optional - true to create the + * index or overwrite the existing one. 
+     *
+     * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @param boolean $create
+     */
+    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $create = false)
+    {
+        $this->_directory = $directory;
+
+        if (!$create) {
+            // Open an existing index: validate the format marker, then load
+            // the three counters stored in the 'segments' file header.
+            $segmentsHeader = $this->_directory->getFileObject('segments');
+            $formatMarker   = $segmentsHeader->readInt();
+            if ($formatMarker != (int)0xFFFFFFFF) {
+                throw new Zend_Search_Lucene_Exception('Wrong segments file format');
+            }
+
+            // index version
+            $this->_version = $segmentsHeader->readLong();
+            // counter used to generate new segment names
+            $this->_segmentNameCounter = $segmentsHeader->readInt();
+            // number of segments
+            $this->_segments = $segmentsHeader->readInt();
+        } else {
+            // Create (or overwrite) the index: drop every file that belongs
+            // to an index, then write empty 'segments' and 'deletable' files.
+            foreach ($this->_directory->fileList() as $fileName) {
+                $isIndexFile = $fileName == 'deletable'
+                            || $fileName == 'segments'
+                            || isset(self::$_indexExtensions[ substr($fileName, strlen($fileName)-4)])
+                            || preg_match('/\.f\d+$/i', $fileName); /* norm files: ".f" followed by digits */
+
+                if ($isIndexFile) {
+                    $this->_directory->deleteFile($fileName);
+                }
+            }
+
+            $emptySegments = $this->_directory->createFile('segments');
+            $emptySegments->writeInt((int)0xFFFFFFFF);  // format marker
+            $emptySegments->writeLong(0);               // version
+            $emptySegments->writeInt(0);                // segment name counter
+            $emptySegments->writeInt(0);                // segment count
+
+            $emptyDeletable = $this->_directory->createFile('deletable');
+            $emptyDeletable->writeInt(0);               // entry counter
+
+            $this->_version = 0;
+            $this->_segmentNameCounter = 0;
+            $this->_segments = 0;
+        }
+
+        // Nothing is pending until the first addDocument() call.
+        $this->_newSegments = array();
+        $this->_currentSegment = null;
+    }
+
+    /**
+     * Adds a document to this index.
+     *
+     * @param Zend_Search_Lucene_Document $document
+     */
+    public function addDocument(Zend_Search_Lucene_Document $document)
+    {
+        // A segment writer is opened lazily, on the first document after
+        // construction or after the previous commit().
+        if ($this->_currentSegment === null) {
+            $segmentName = $this->_newSegmentName();
+            $this->_currentSegment = new Zend_Search_Lucene_Index_SegmentWriter($this->_directory, $segmentName);
+        }
+
+        $this->_currentSegment->addDocument($document);
+        $this->_version++;
+    }
+
+
+
+    /**
+     * Rewrites the 'segments' file so that it also lists the pending segments.
+     *
+     * A fresh header is written to 'segments.new', the entries of the current
+     * 'segments' file are copied behind it, one entry (name, document count)
+     * is appended per pending segment, and the new file is then renamed over
+     * the old one.
+     *
+     * @todo !!!!!Finish the implementation
+     *
+     * @throws Zend_Search_Lucene_Exception
+     */
+    private function _updateSegments()
+    {
+        $oldFile = $this->_directory->getFileObject('segments');
+        $newFile = $this->_directory->createFile('segments.new');
+
+        // Header: format marker, version, segment name counter, segment count.
+        $newFile->writeInt((int)0xFFFFFFFF);
+        $newFile->writeLong($this->_version);
+        $newFile->writeInt($this->_segmentNameCounter);
+
+        $this->_segments += count($this->_newSegments);
+        $newFile->writeInt($this->_segments);
+
+        // Copy everything that follows the old header.
+        // NOTE(review): 20 is presumably the header size (int + long + int + int) - confirm.
+        $oldFile->seek(20);
+        $existingLength  = $this->_directory->fileLength('segments') - 20;
+        $existingEntries = $oldFile->readBytes($existingLength);
+        $newFile->writeBytes($existingEntries);
+
+        foreach ($this->_newSegments as $name => $info) {
+            $newFile->writeString($name);
+            $newFile->writeInt($info->count());
+        }
+
+        $this->_directory->renameFile('segments.new', 'segments');
+    }
+
+
+    /**
+     * Commits pending changes.
+     *
+     * Closes the segment currently being written (if any), registers it in
+     * the 'segments' file together with any other pending segments, and
+     * clears the pending list.
+     *
+     * @return array  segments committed by this call, keyed by segment name
+     */
+    public function commit()
+    {
+        $writer = $this->_currentSegment;
+        if ($writer !== null) {
+            // close() yields null when the writer has nothing to store.
+            $closedSegment = $writer->close();
+            if ($closedSegment !== null) {
+                $this->_newSegments[$closedSegment->getName()] = $closedSegment;
+            }
+            $this->_currentSegment = null;
+        }
+
+        if (count($this->_newSegments) > 0) {
+            $this->_updateSegments();
+        }
+
+        $committed = $this->_newSegments;
+        $this->_newSegments = array();
+
+        return $committed;
+    }
+
+
+    /**
+     * Merges the provided indexes into this index.
+     *
+     * @param array $readers
+     * @return void
+     */
+    public function addIndexes($readers)
+    {
+        /**
+         * @todo implementation
+         * Not implemented yet: the body is empty, so $readers is ignored.
+         */
+    }
+
+
+    /**
+     * Returns the number of documents currently in this index.
+     *
+     * Not implemented yet: the body is empty, so nothing is returned.
+     *
+     * @param mixed $readers  never read by the current body.
+     *                        NOTE(review): presumably copied over from addIndexes() - confirm
+     * @return integer
+     */
+    public function docCount($readers)
+    {
+        /**
+         * @todo implementation
+         */
+    }
+
+
+    /**
+     * Flushes all changes to an index and closes all associated files.
+     *
+     * Not implemented yet: the body is empty, nothing is flushed or closed here.
+     *
+     * @return void
+     */
+    public function close()
+    {
+        /**
+         * @todo implementation
+         */
+    }
+
+
+    /**
+     * Merges all segments together into a single segment, optimizing
+     * an index for search.
+     *
+     * Not implemented yet: the body is empty.
+     *
+     * @return void
+     */
+    public function optimize()
+    {
+        /**
+         * @todo implementation
+         */
+    }
+
+    /**
+     * Get name for new segment
+     *
+     * The name is '_' followed by the current segment name counter written
+     * in base 36. The counter is incremented after it has been read.
+     *
+     * @return string
+     */
+    private function _newSegmentName()
+    {
+        return '_' . base_convert($this->_segmentNameCounter++, 10, 36);
+    }
+
+}
diff --git a/search/Zend/Search/Lucene/Search/Query.php b/search/Zend/Search/Lucene/Search/Query.php
new file mode 100644
index 0000000000..bf284970a1
--- /dev/null
+++ b/search/Zend/Search/Lucene/Search/Query.php
@@ -0,0 +1,100 @@
+_boost;
+    }
+
+    /**
+     * Sets the boost for this query clause to $boost.
+     *
+     * @param float $boost
+     * @return void
+     */
+    public function setBoost($boost)
+    {
+        $this->_boost = $boost;
+    }
+
+    /**
+     * Score specified document
+     *
+     * Abstract: must be provided by each concrete query class.
+     *
+     * @param integer $docId
+     * @param Zend_Search_Lucene $reader
+     * @return float
+     */
+    abstract public function score($docId, $reader);
+
+    /**
+     * Constructs an appropriate Weight implementation for this query.
+     *
+     * Abstract: must be provided by each concrete query class.
+     *
+     * @param Zend_Search_Lucene $reader
+     * @return Zend_Search_Lucene_Search_Weight
+     */
+    abstract protected function _createWeight($reader);
+
+    /**
+     * Constructs an initializes a Weight for a query.
+     *
+     * @param Zend_Search_Lucene $reader
+     */
+    protected function _initWeight($reader)
+    {
+        // Build the weight, then normalize it with the query norm that the
+        // reader's similarity derives from the sum of squared weights.
+        $this->_weight = $this->_createWeight($reader);
+        $sum = $this->_weight->sumOfSquaredWeights();
+        $queryNorm = $reader->getSimilarity()->queryNorm($sum);
+        $this->_weight->normalize($queryNorm);
+    }
+
+}
\ No newline at end of file
diff --git a/search/Zend/Search/Lucene/Search/Query/MultiTerm.php b/search/Zend/Search/Lucene/Search/Query/MultiTerm.php
new file mode 100644
index 0000000000..d3ec761bc6
--- /dev/null
+++ b/search/Zend/Search/Lucene/Search/Query/MultiTerm.php
@@ -0,0 +1,439 @@
+ (docId => array( pos1, pos2, ... ), ...)
+     * term2Id => (docId => array( pos1, pos2, ... ), ...)
+     *
+     * @var array
+     */
+    private $_termsPositions = array();
+
+
+    /**
+     * A score factor based on the fraction of all query terms
+     * that a document contains.
+     * float for conjunction queries
+     * array of float for non conjunction queries
+     *
+     * @var mixed
+     */
+    private $_coord = null;
+
+
+    /**
+     * Terms weights
+     * array of Zend_Search_Lucene_Search_Weight
+     *
+     * @var array
+     */
+    private $_weights = array();
+
+
+    /**
+     * Class constructor.  Create a new multi-term query object.
+     *
+     * Nothing is stored unless $terms is an array. The sign list is kept
+     * only when at least one sign is not strictly TRUE; when every term is
+     * required (or $signs is not an array) the sign list stays NULL.
+     *
+     * @param array|null $terms    Array of Zend_Search_Lucene_Index_Term objects
+     * @param array|null $signs    Array of signs.  Sign is boolean|null.
+     * @return void
+     */
+    public function __construct($terms = null, $signs = null)
+    {
+        /**
+         * @todo Check contents of $terms and $signs before adding them.
+         */
+        if (is_array($terms)) {
+            $this->_terms = $terms;
+
+            $this->_signs = null;
+            // Check if all terms are required
+            if (is_array($signs)) {
+                foreach ($signs as $sign) {
+                    if ($sign !== true) {
+                        // One non-required term is enough to need the full
+                        // sign list, so stop scanning here.
+                        $this->_signs = $signs;
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Add a $term (Zend_Search_Lucene_Index_Term) to this query.
+     *
+     * The sign is specified as:
+     *     TRUE  - term is required
+     *     FALSE - term is prohibited
+     *     NULL  - term is neither prohibited, nor required
+     *
+     * Internally $this->_signs === null stands for "every term is required"
+     * (that is how the constructor stores an all-TRUE sign list). The signs
+     * array is only materialized once a sign other than TRUE is added, and
+     * it is always kept index-aligned with $this->_terms.
+     *
+     * @param  Zend_Search_Lucene_Index_Term $term
+     * @param  boolean|null $sign
+     * @return void
+     */
+    public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign=null) {
+        // Nothing to record while all terms (including this one) are required.
+        if ($sign !== true || $this->_signs !== null) {
+            if ($this->_signs === null) {
+                // First non-required term: expand the implicit "all required"
+                // state into one explicit TRUE per term added so far.
+                $this->_signs = array();
+                foreach ($this->_terms as $prevTerm) {
+                    $this->_signs[] = true;
+                }
+            }
+            $this->_signs[] = $sign;
+        }
+
+        // Append the term only after the signs were expanded; the original
+        // appended it first, which produced one sign too many and shifted the
+        // new term's sign to the wrong index.
+        $this->_terms[] = $term;
+    }
+
+
+    /**
+     * Returns query term
+     *
+     * @return array
+     */
+    public function getTerms()
+    {
+        return $this->_terms;
+    }
+
+
+    /**
+     * Return terms signs
+     *
+     * @return array|null  NULL when all terms are required
+     */
+    public function getSigns()
+    {
+        return $this->_signs;
+    }
+
+
+    /**
+     * Set weight for specified term
+     *
+     * @param integer $num
+     * @param Zend_Search_Lucene_Search_Weight_Term $weight
+     */
+    public function setWeight($num, $weight)
+    {
+        $this->_weights[$num] = $weight;
+    }
+
+
+    /**
+     * Constructs an appropriate Weight implementation for this query.
+ * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + protected function _createWeight($reader) + { + return new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader); + } + + + /** + * Calculate result vector for Conjunction query + * (like '+something +another') + * + * @param Zend_Search_Lucene $reader + */ + private function _calculateConjunctionResult($reader) + { + if (extension_loaded('bitset')) { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = bitset_from_array($reader->termDocs($term)); + } else { + $this->_resVector = bitset_intersection( + $this->_resVector, + bitset_from_array($reader->termDocs($term)) ); + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } else { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = array_flip($reader->termDocs($term)); + } else { + $termDocs = array_flip($reader->termDocs($term)); + foreach($this->_resVector as $key=>$value) { + if (!isset( $termDocs[$key] )) { + unset( $this->_resVector[$key] ); + } + } + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } + } + + + /** + * Calculate result vector for non Conjunction query + * (like '+something -another') + * + * @param Zend_Search_Lucene $reader + */ + private function _calculateNonConjunctionResult($reader) + { + if (extension_loaded('bitset')) { + $required = null; + $neither = bitset_empty(); + $prohibited = bitset_empty(); + + foreach ($this->_terms as $termId => $term) { + $termDocs = bitset_from_array($reader->termDocs($term)); + + if ($this->_signs[$termId] === true) { + // required + if ($required !== null) { + $required = bitset_intersection($required, $termDocs); + } else { + $required = $termDocs; + } + } elseif ($this->_signs[$termId] === false) { + // prohibited + $prohibited = bitset_union($prohibited, $termDocs); + } else { + // neither required, 
nor prohibited + $neither = bitset_union($neither, $termDocs); + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + + if ($required === null) { + $required = $neither; + } + $this->_resVector = bitset_intersection( $required, + bitset_invert($prohibited, $reader->count()) ); + } else { + $required = null; + $neither = array(); + $prohibited = array(); + + foreach ($this->_terms as $termId => $term) { + $termDocs = array_flip($reader->termDocs($term)); + + if ($this->_signs[$termId] === true) { + // required + if ($required !== null) { + // substitute for bitset_intersection + foreach ($required as $key => $value) { + if (!isset( $termDocs[$key] )) { + unset($required[$key]); + } + } + } else { + $required = $termDocs; + } + } elseif ($this->_signs[$termId] === false) { + // prohibited + // substitute for bitset_union + foreach ($termDocs as $key => $value) { + $prohibited[$key] = $value; + } + } else { + // neither required, nor prohibited + // substitute for bitset_union + foreach ($termDocs as $key => $value) { + $neither[$key] = $value; + } + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + + if ($required === null) { + $required = $neither; + } + + foreach ($required as $key=>$value) { + if (isset( $prohibited[$key] )) { + unset($required[$key]); + } + } + $this->_resVector = $required; + } + } + + + /** + * Score calculator for conjunction queries (all terms are required) + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function _conjunctionScore($docId, $reader) + { + if ($this->_coord === null) { + $this->_coord = $reader->getSimilarity()->coord(count($this->_terms), + count($this->_terms) ); + } + + $score = 0.0; + + foreach ($this->_terms as $termId=>$term) { + $score += $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) ) * + $this->_weights[$termId]->getValue() * + $reader->norm($docId, $term->field); + } + + return $score * 
$this->_coord; + } + + + /** + * Score calculator for non conjunction queries (not all terms are required) + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function _nonConjunctionScore($docId, $reader) + { + if ($this->_coord === null) { + $this->_coord = array(); + + $maxCoord = 0; + foreach ($this->_signs as $sign) { + if ($sign !== false /* not prohibited */) { + $maxCoord++; + } + } + + for ($count = 0; $count <= $maxCoord; $count++) { + $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord); + } + } + + $score = 0.0; + $matchedTerms = 0; + foreach ($this->_terms as $termId=>$term) { + // Check if term is + if ($this->_signs[$termId] !== false && // not prohibited + isset($this->_termsPositions[$termId][$docId]) // matched + ) { + $matchedTerms++; + $score += + $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) ) * + $this->_weights[$termId]->getValue() * + $reader->norm($docId, $term->field); + } + } + + return $score * $this->_coord[$matchedTerms]; + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function score($docId, $reader) + { + if($this->_resVector === null) { + if ($this->_signs === null) { + $this->_calculateConjunctionResult($reader); + } else { + $this->_calculateNonConjunctionResult($reader); + } + + $this->_initWeight($reader); + } + + if ( (extension_loaded('bitset')) ? 
+ bitset_in($this->_resVector, $docId) : + isset($this->_resVector[$docId]) ) { + if ($this->_signs === null) { + return $this->_conjunctionScore($docId, $reader); + } else { + return $this->_nonConjunctionScore($docId, $reader); + } + } else { + return 0; + } + } +} + diff --git a/search/Zend/Search/Lucene/Search/Query/Phrase.php b/search/Zend/Search/Lucene/Search/Query/Phrase.php new file mode 100644 index 0000000000..b1d40b4bea --- /dev/null +++ b/search/Zend/Search/Lucene/Search/Query/Phrase.php @@ -0,0 +1,426 @@ + (docId => array( pos1, pos2, ... ), ...) + * term2Id => (docId => array( pos1, pos2, ... ), ...) + * + * @var array + */ + private $_termsPositions = array(); + + /** + * Class constructor. Create a new prase query. + * + * @param string $field Field to search. + * @param array $terms Terms to search Array of strings. + * @param array $offsets Relative term positions. Array of integers. + * @throws Zend_Search_Lucene_Exception + */ + public function __construct($terms = null, $offsets = null, $field = null) + { + $this->_slop = 0; + + if (is_array($terms)) { + $this->_terms = array(); + foreach ($terms as $termId => $termText) { + $this->_terms[$termId] = ($field !== null)? 
new Zend_Search_Lucene_Index_Term($termText, $field): + new Zend_Search_Lucene_Index_Term($termText); + } + } else if ($terms === null) { + $this->_terms = array(); + } else { + throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null'); + } + + if (is_array($offsets)) { + if (count($this->_terms) != count($offsets)) { + throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.'); + } + $this->_offsets = $offsets; + } else if ($offsets === null) { + $this->_offsets = array(); + foreach ($this->_terms as $termId => $term) { + $position = count($this->_offsets); + $this->_offsets[$termId] = $position; + } + } else { + throw new Zend_Search_Lucene_Exception('offsets argument must be array of strings or null'); + } + } + + /** + * Set slop + * + * @param integer $slop + */ + public function setSlop($slop) + { + $this->_slop = $slop; + } + + + /** + * Get slop + * + * @return integer + */ + public function getSlop() + { + return $this->_slop; + } + + + /** + * Adds a term to the end of the query phrase. + * The relative position of the term is specified explicitly or the one immediately + * after the last term added. + * + * @param Zend_Search_Lucene_Index_Term $term + * @param integer $position + */ + public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null) { + if ((count($this->_terms) != 0)&&(end($this->_terms)->field != $term->field)) { + throw new Zend_Search_Lucene_Exception('All phrase terms must be in the same field: ' . + $term->field . ':' . 
$term->text); + } + + $this->_terms[] = $term; + if ($position !== null) { + $this->_offsets[] = $position; + } else if (count($this->_offsets) != 0) { + $this->_offsets[] = end($this->_offsets) + 1; + } else { + $this->_offsets[] = 0; + } + } + + + /** + * Returns query term + * + * @return array + */ + public function getTerms() + { + return $this->_terms; + } + + + /** + * Set weight for specified term + * + * @param integer $num + * @param Zend_Search_Lucene_Search_Weight_Term $weight + */ + public function setWeight($num, $weight) + { + $this->_weights[$num] = $weight; + } + + + /** + * Constructs an appropriate Weight implementation for this query. + * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + protected function _createWeight($reader) + { + return new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader); + } + + + /** + * Calculate result vector + * + * @param Zend_Search_Lucene $reader + */ + private function _calculateResult($reader) + { + if (extension_loaded('bitset')) { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = bitset_from_array($reader->termDocs($term)); + } else { + $this->_resVector = bitset_intersection( + $this->_resVector, + bitset_from_array($reader->termDocs($term)) ); + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } else { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = array_flip($reader->termDocs($term)); + } else { + $termDocs = array_flip($reader->termDocs($term)); + foreach($this->_resVector as $key=>$value) { + if (!isset( $termDocs[$key] )) { + unset( $this->_resVector[$key] ); + } + } + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } + } + + + /** + * Score calculator for exact phrase queries (terms sequence is fixed) + * + * @param integer $docId + * @return float + */ + public function 
_exactPhraseFreq($docId) + { + $freq = 0; + + // Term Id with lowest cardinality + $lowCardTermId = null; + + // Calculate $lowCardTermId + foreach ($this->_terms as $termId => $term) { + if ($lowCardTermId === null || + count($this->_termsPositions[$termId][$docId]) < + count($this->_termsPositions[$lowCardTermId][$docId]) ) { + $lowCardTermId = $termId; + } + } + + // Walk through positions of the term with lowest cardinality + foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) { + // We expect phrase to be found + $freq++; + + // Walk through other terms + foreach ($this->_terms as $termId => $term) { + if ($termId != $lowCardTermId) { + $expectedPosition = $lowCardPos + + ($this->_offsets[$termId] - + $this->_offsets[$lowCardTermId]); + + if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) { + $freq--; // Phrase wasn't found. + break; + } + } + } + } + + return $freq; + } + + /** + * Score calculator for sloppy phrase queries (terms sequence is fixed) + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function _sloppyPhraseFreq($docId, Zend_Search_Lucene $reader) + { + $freq = 0; + + $phraseQueue = array(); + $phraseQueue[0] = array(); // empty phrase + $lastTerm = null; + + // Walk through the terms to create phrases. + foreach ($this->_terms as $termId => $term) { + $queueSize = count($phraseQueue); + $firstPass = true; + + // Walk through the term positions. + // Each term position produces a set of phrases. 
+ foreach ($this->_termsPositions[$termId][$docId] as $termPosition ) { + if ($firstPass) { + for ($count = 0; $count < $queueSize; $count++) { + $phraseQueue[$count][$termId] = $termPosition; + } + } else { + for ($count = 0; $count < $queueSize; $count++) { + if ($lastTerm !== null && + abs( $termPosition - $phraseQueue[$count][$lastTerm] - + ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) { + continue; + } + + $newPhraseId = count($phraseQueue); + $phraseQueue[$newPhraseId] = $phraseQueue[$count]; + $phraseQueue[$newPhraseId][$termId] = $termPosition; + } + + } + + $firstPass = false; + } + $lastTerm = $termId; + } + + + foreach ($phraseQueue as $phrasePos) { + $minDistance = null; + + for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) { + $distance = 0; + $start = reset($phrasePos) - reset($this->_offsets) + $shift; + + foreach ($this->_terms as $termId => $term) { + $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start); + + if($distance > $this->_slop) { + break; + } + } + + if ($minDistance === null || $distance < $minDistance) { + $minDistance = $distance; + } + } + + if ($minDistance <= $this->_slop) { + $freq += $reader->getSimilarity()->sloppyFreq($minDistance); + } + } + + return $freq; + } + + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function score($docId, $reader) + { + // optimize zero-term case + if (count($this->_terms) == 0) { + return 0; + } + + if($this->_resVector === null) { + $this->_calculateResult($reader); + $this->_initWeight($reader); + } + + if ( (extension_loaded('bitset')) ? 
+                bitset_in($this->_resVector, $docId) :
+                isset($this->_resVector[$docId])  ) {
+            // The document contains every term; now count how often the
+            // terms actually form the phrase.
+            if ($this->_slop == 0) {
+                $freq = $this->_exactPhraseFreq($docId);
+            } else {
+                $freq = $this->_sloppyPhraseFreq($docId, $reader);
+            }
+
+/*
+            return $reader->getSimilarity()->tf($freq) *
+                   $this->_weight->getValue() *
+                   $reader->norm($docId, reset($this->_terms)->field);
+*/
+            if ($freq != 0) {
+                $tf = $reader->getSimilarity()->tf($freq);
+                $weight = $this->_weight->getValue();
+                $norm = $reader->norm($docId, reset($this->_terms)->field);
+
+                return $tf*$weight*$norm;
+            }
+        }
+
+        // Either the document is not in the result vector, or all terms occur
+        // in it but never as the requested phrase ($freq == 0). The original
+        // fell off the end in the second case and returned NULL, not a number.
+        return 0;
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Search/Query/Term.php b/search/Zend/Search/Lucene/Search/Query/Term.php
new file mode 100644
index 0000000000..b0baf0f5ac
--- /dev/null
+++ b/search/Zend/Search/Lucene/Search/Query/Term.php
@@ -0,0 +1,128 @@
+ array( pos1, pos2, ... )
+     *
+     * @var array
+     */
+    private $_termPositions;
+
+
+    /**
+     * Zend_Search_Lucene_Search_Query_Term constructor
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @param boolean $sign
+     */
+    public function __construct( $term, $sign = true )
+    {
+        $this->_term = $term;
+        $this->_sign = $sign;
+    }
+
+
+    /**
+     * Constructs an appropriate Weight implementation for this query.
+ * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + protected function _createWeight($reader) + { + return new Zend_Search_Lucene_Search_Weight_Term($this->_term, $this, $reader); + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function score( $docId, $reader ) + { + if($this->_docVector===null) { + if (extension_loaded('bitset')) { + $this->_docVector = bitset_from_array( $reader->termDocs($this->_term) ); + } else { + $this->_docVector = array_flip($reader->termDocs($this->_term)); + } + + $this->_termPositions = $reader->termPositions($this->_term); + $this->_initWeight($reader); + } + + $match = extension_loaded('bitset') ? bitset_in($this->_docVector, $docId) : + isset($this->_docVector[$docId]); + if ($this->_sign && $match) { + return $reader->getSimilarity()->tf(count($this->_termPositions[$docId]) ) * + $this->_weight->getValue() * + $reader->norm($docId, $this->_term->field); + } else { + return 0; + } + } +} + diff --git a/search/Zend/Search/Lucene/Search/QueryHit.php b/search/Zend/Search/Lucene/Search/QueryHit.php new file mode 100644 index 0000000000..19ab381fe4 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryHit.php @@ -0,0 +1,108 @@ +_index = $index; + } + + + /** + * Convenience function for getting fields from the document + * associated with this hit. 
+ * + * @param string $offset + * @return string + */ + public function __get($offset) + { + return $this->getDocument()->getFieldValue($offset); + } + + + /** + * Return the document object for this hit + * + * @return Zend_Search_Lucene_Document + */ + public function getDocument() + { + if (!$this->_document instanceof Zend_Search_Lucene_Document) { + $this->_document = $this->_index->getDocument($this->id); + } + + return $this->_document; + } + + + /** + * Return the index object for this hit + * + * @return Zend_Search_Lucene + */ + public function getIndex() + { + return $this->_index; + } +} + diff --git a/search/Zend/Search/Lucene/Search/QueryParser.php b/search/Zend/Search/Lucene/Search/QueryParser.php new file mode 100644 index 0000000000..63b6497e05 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryParser.php @@ -0,0 +1,142 @@ +count()) { + throw new Zend_Search_Lucene_Exception('Syntax error: query string cannot be empty.'); + } + + // Term query + if ($tokens->count() == 1) { + if ($tokens->current()->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) { + return new Zend_Search_Lucene_Search_Query_Term(new Zend_Search_Lucene_Index_Term($tokens->current()->text, 'contents')); + } else { + throw new Zend_Search_Lucene_Exception('Syntax error: query string must contain at least one word.'); + } + } + + + /** + * MultiTerm Query + * + * Process each token that was returned by the tokenizer. 
+         */
+        $terms        = array();
+        $signs        = array();
+        $prevToken    = null;   // last non-field token seen, NULL until there is one
+        $openBrackets = 0;
+        $field        = 'contents';
+        foreach ($tokens as $token) {
+            switch ($token->type) {
+                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD:
+                    $terms[] = new Zend_Search_Lucene_Index_Term($token->text, $field);
+                    // A field prefix applies to one word only
+                    $field = 'contents';
+                    if ($prevToken !== null &&
+                        $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) {
+                            if ($prevToken->text == "+") {
+                                $signs[] = true;
+                            } else {
+                                $signs[] = false;
+                            }
+                    } else {
+                        $signs[] = null;
+                    }
+                    break;
+                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN:
+                    if ($prevToken !== null &&
+                        $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) {
+                            throw new Zend_Search_Lucene_Exception('Syntax error: sign operator must be followed by a word.');
+                    }
+                    break;
+                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD:
+                    $field = $token->text;
+                    // let previous token to be signed as next $prevToken
+                    $token = $prevToken;
+                    break;
+                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET:
+                    $token->text=='(' ? $openBrackets++ : $openBrackets--;
+            }
+            $prevToken = $token;
+        }
+
+        // Finish up parsing: check the last token in the query for an opening sign or parenthesis.
+        // $prevToken is still NULL when the query consists of field tokens only
+        // (they never become $prevToken), so guard before reading ->type; the
+        // empty term list is reported by the term count check further down.
+        if ($prevToken !== null &&
+            $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) {
+            throw new Zend_Search_Lucene_Exception('Syntax Error: sign operator must be followed by a word.');
+        }
+
+        // Finish up parsing: check that every opening bracket has a matching closing bracket.
+ if ($openBrackets != 0) { + throw new Zend_Search_Lucene_Exception('Syntax Error: mismatched parentheses, every opening must have closing.'); + } + + switch (count($terms)) { + case 0: + throw new Zend_Search_Lucene_Exception('Syntax error: bad term count.'); + case 1: + return new Zend_Search_Lucene_Search_Query_Term($terms[0],$signs[0] !== false); + default: + return new Zend_Search_Lucene_Search_Query_MultiTerm($terms,$signs); + } + } + +} + diff --git a/search/Zend/Search/Lucene/Search/QueryToken.php b/search/Zend/Search/Lucene/Search/QueryToken.php new file mode 100644 index 0000000000..56d3522c71 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryToken.php @@ -0,0 +1,104 @@ +type = $tokType; + $this->text = $tokText; + } +} + diff --git a/search/Zend/Search/Lucene/Search/QueryTokenizer.php b/search/Zend/Search/Lucene/Search/QueryTokenizer.php new file mode 100644 index 0000000000..a59f8a8b12 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryTokenizer.php @@ -0,0 +1,164 @@ +_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD, + $currentToken); + $currentToken = ''; + } + + if ($inputString{$count} == '+' || $inputString{$count} == '-') { + $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN, + $inputString{$count}); + } elseif ($inputString{$count} == '(' || $inputString{$count} == ')') { + $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET, + $inputString{$count}); + } elseif ($inputString{$count} == ':' && $this->count()) { + if ($this->_tokens[count($this->_tokens)-1]->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) { + $this->_tokens[count($this->_tokens)-1]->type = Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD; + } + } + } + } + + if (strlen($currentToken)) { + $this->_tokens[] = new 
Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD, $currentToken);
+        }
+    }
+
+
+    /**
+     * Returns number of tokens
+     *
+     * @return integer
+     */
+    public function count()
+    {
+        return count($this->_tokens);
+    }
+
+
+    /**
+     * Returns TRUE if a token exists at the current position.
+     *
+     * @return boolean
+     */
+    public function valid()
+    {
+        return $this->_currToken < $this->count();
+    }
+
+
+    /**
+     * Resets token stream: moves the current position back to 0.
+     *
+     * @return void
+     */
+    public function rewind()
+    {
+        $this->_currToken = 0;
+    }
+
+
+    /**
+     * Returns the token at the current position or FALSE if
+     * the position does not contain a valid token.
+     *
+     * @return Zend_Search_Lucene_Search_QueryToken|false
+     */
+    public function current()
+    {
+        return $this->valid() ? $this->_tokens[$this->_currToken] : false;
+    }
+
+
+    /**
+     * Advances to the next token.
+     *
+     * Note that the value returned is the new position (the incremented
+     * index), not a token object; use current() to fetch the token.
+     *
+     * @return integer
+     */
+    public function next()
+    {
+        return ++$this->_currToken;
+    }
+
+
+    /**
+     * Return the position of the current token.
+ * + * @return integer + */ + public function key() + { + return $this->_currToken; + } + +} + diff --git a/search/Zend/Search/Lucene/Search/Similarity.php b/search/Zend/Search/Lucene/Search/Similarity.php new file mode 100644 index 0000000000..74ecb1dda6 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/Similarity.php @@ -0,0 +1,553 @@ + 0.0, + 1 => 5.820766E-10, + 2 => 6.9849193E-10, + 3 => 8.1490725E-10, + 4 => 9.313226E-10, + 5 => 1.1641532E-9, + 6 => 1.3969839E-9, + 7 => 1.6298145E-9, + 8 => 1.8626451E-9, + 9 => 2.3283064E-9, + 10 => 2.7939677E-9, + 11 => 3.259629E-9, + 12 => 3.7252903E-9, + 13 => 4.656613E-9, + 14 => 5.5879354E-9, + 15 => 6.519258E-9, + 16 => 7.4505806E-9, + 17 => 9.313226E-9, + 18 => 1.1175871E-8, + 19 => 1.3038516E-8, + 20 => 1.4901161E-8, + 21 => 1.8626451E-8, + 22 => 2.2351742E-8, + 23 => 2.6077032E-8, + 24 => 2.9802322E-8, + 25 => 3.7252903E-8, + 26 => 4.4703484E-8, + 27 => 5.2154064E-8, + 28 => 5.9604645E-8, + 29 => 7.4505806E-8, + 30 => 8.940697E-8, + 31 => 1.0430813E-7, + 32 => 1.1920929E-7, + 33 => 1.4901161E-7, + 34 => 1.7881393E-7, + 35 => 2.0861626E-7, + 36 => 2.3841858E-7, + 37 => 2.9802322E-7, + 38 => 3.5762787E-7, + 39 => 4.172325E-7, + 40 => 4.7683716E-7, + 41 => 5.9604645E-7, + 42 => 7.1525574E-7, + 43 => 8.34465E-7, + 44 => 9.536743E-7, + 45 => 1.1920929E-6, + 46 => 1.4305115E-6, + 47 => 1.66893E-6, + 48 => 1.9073486E-6, + 49 => 2.3841858E-6, + 50 => 2.861023E-6, + 51 => 3.33786E-6, + 52 => 3.8146973E-6, + 53 => 4.7683716E-6, + 54 => 5.722046E-6, + 55 => 6.67572E-6, + 56 => 7.6293945E-6, + 57 => 9.536743E-6, + 58 => 1.1444092E-5, + 59 => 1.335144E-5, + 60 => 1.5258789E-5, + 61 => 1.9073486E-5, + 62 => 2.2888184E-5, + 63 => 2.670288E-5, + 64 => 3.0517578E-5, + 65 => 3.8146973E-5, + 66 => 4.5776367E-5, + 67 => 5.340576E-5, + 68 => 6.1035156E-5, + 69 => 7.6293945E-5, + 70 => 9.1552734E-5, + 71 => 1.0681152E-4, + 72 => 1.2207031E-4, + 73 => 1.5258789E-4, + 74 => 1.8310547E-4, + 75 => 2.1362305E-4, + 76 => 2.4414062E-4, + 77 
=> 3.0517578E-4, + 78 => 3.6621094E-4, + 79 => 4.272461E-4, + 80 => 4.8828125E-4, + 81 => 6.1035156E-4, + 82 => 7.324219E-4, + 83 => 8.544922E-4, + 84 => 9.765625E-4, + 85 => 0.0012207031, + 86 => 0.0014648438, + 87 => 0.0017089844, + 88 => 0.001953125, + 89 => 0.0024414062, + 90 => 0.0029296875, + 91 => 0.0034179688, + 92 => 0.00390625, + 93 => 0.0048828125, + 94 => 0.005859375, + 95 => 0.0068359375, + 96 => 0.0078125, + 97 => 0.009765625, + 98 => 0.01171875, + 99 => 0.013671875, + 100 => 0.015625, + 101 => 0.01953125, + 102 => 0.0234375, + 103 => 0.02734375, + 104 => 0.03125, + 105 => 0.0390625, + 106 => 0.046875, + 107 => 0.0546875, + 108 => 0.0625, + 109 => 0.078125, + 110 => 0.09375, + 111 => 0.109375, + 112 => 0.125, + 113 => 0.15625, + 114 => 0.1875, + 115 => 0.21875, + 116 => 0.25, + 117 => 0.3125, + 118 => 0.375, + 119 => 0.4375, + 120 => 0.5, + 121 => 0.625, + 122 => 0.75, + 123 => 0.875, + 124 => 1.0, + 125 => 1.25, + 126 => 1.5, + 127 => 1.75, + 128 => 2.0, + 129 => 2.5, + 130 => 3.0, + 131 => 3.5, + 132 => 4.0, + 133 => 5.0, + 134 => 6.0, + 135 => 7.0, + 136 => 8.0, + 137 => 10.0, + 138 => 12.0, + 139 => 14.0, + 140 => 16.0, + 141 => 20.0, + 142 => 24.0, + 143 => 28.0, + 144 => 32.0, + 145 => 40.0, + 146 => 48.0, + 147 => 56.0, + 148 => 64.0, + 149 => 80.0, + 150 => 96.0, + 151 => 112.0, + 152 => 128.0, + 153 => 160.0, + 154 => 192.0, + 155 => 224.0, + 156 => 256.0, + 157 => 320.0, + 158 => 384.0, + 159 => 448.0, + 160 => 512.0, + 161 => 640.0, + 162 => 768.0, + 163 => 896.0, + 164 => 1024.0, + 165 => 1280.0, + 166 => 1536.0, + 167 => 1792.0, + 168 => 2048.0, + 169 => 2560.0, + 170 => 3072.0, + 171 => 3584.0, + 172 => 4096.0, + 173 => 5120.0, + 174 => 6144.0, + 175 => 7168.0, + 176 => 8192.0, + 177 => 10240.0, + 178 => 12288.0, + 179 => 14336.0, + 180 => 16384.0, + 181 => 20480.0, + 182 => 24576.0, + 183 => 28672.0, + 184 => 32768.0, + 185 => 40960.0, + 186 => 49152.0, + 187 => 57344.0, + 188 => 65536.0, + 189 => 81920.0, + 190 => 98304.0, + 191 => 
114688.0, + 192 => 131072.0, + 193 => 163840.0, + 194 => 196608.0, + 195 => 229376.0, + 196 => 262144.0, + 197 => 327680.0, + 198 => 393216.0, + 199 => 458752.0, + 200 => 524288.0, + 201 => 655360.0, + 202 => 786432.0, + 203 => 917504.0, + 204 => 1048576.0, + 205 => 1310720.0, + 206 => 1572864.0, + 207 => 1835008.0, + 208 => 2097152.0, + 209 => 2621440.0, + 210 => 3145728.0, + 211 => 3670016.0, + 212 => 4194304.0, + 213 => 5242880.0, + 214 => 6291456.0, + 215 => 7340032.0, + 216 => 8388608.0, + 217 => 1.048576E7, + 218 => 1.2582912E7, + 219 => 1.4680064E7, + 220 => 1.6777216E7, + 221 => 2.097152E7, + 222 => 2.5165824E7, + 223 => 2.9360128E7, + 224 => 3.3554432E7, + 225 => 4.194304E7, + 226 => 5.0331648E7, + 227 => 5.8720256E7, + 228 => 6.7108864E7, + 229 => 8.388608E7, + 230 => 1.00663296E8, + 231 => 1.17440512E8, + 232 => 1.34217728E8, + 233 => 1.6777216E8, + 234 => 2.01326592E8, + 235 => 2.34881024E8, + 236 => 2.68435456E8, + 237 => 3.3554432E8, + 238 => 4.02653184E8, + 239 => 4.69762048E8, + 240 => 5.3687091E8, + 241 => 6.7108864E8, + 242 => 8.0530637E8, + 243 => 9.395241E8, + 244 => 1.07374182E9, + 245 => 1.34217728E9, + 246 => 1.61061274E9, + 247 => 1.87904819E9, + 248 => 2.14748365E9, + 249 => 2.68435456E9, + 250 => 3.22122547E9, + 251 => 3.75809638E9, + 252 => 4.2949673E9, + 253 => 5.3687091E9, + 254 => 6.4424509E9, + 255 => 7.5161928E9 ); + + + /** + * Set the default Similarity implementation used by indexing and search + * code. + * + * @param Zend_Search_Lucene_Search_Similarity $similarity + */ + static public function setDefault(Zend_Search_Lucene_Search_Similarity $similarity) + { + self::$_defaultImpl = $similarity; + } + + + /** + * Return the default Similarity implementation used by indexing and search + * code. 
+     *
+     * A Zend_Search_Lucene_Search_Similarity_Default instance is created
+     * lazily on first use when no implementation has been set yet.
+     *
+     * @return Zend_Search_Lucene_Search_Similarity
+     */
+    static public function getDefault()
+    {
+        if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
+            self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
+        }
+
+        return self::$_defaultImpl;
+    }
+
+
+    /**
+     * Computes the normalization value for a field given the total number of
+     * terms contained in a field.  These values, together with field boosts, are
+     * stored in an index and multiplied into scores for hits on each field by the
+     * search code.
+     *
+     * Matches in longer fields are less precise, so implementations of this
+     * method usually return smaller values when 'numTokens' is large,
+     * and larger values when 'numTokens' is small.
+     *
+     * Note that these values are computed under
+     * IndexWriter::addDocument(Document) and then stored using
+     * encodeNorm(float).  Thus they have limited precision, and documents
+     * must be re-indexed if this method is altered.
+     *
+     * fieldName - name of field
+     * numTokens - the total number of tokens contained in fields named
+     *             'fieldName' of 'doc'.
+     * Returns a normalization factor for hits on this field of this document
+     *
+     * @param string $fieldName
+     * @param integer $numTokens
+     * @return float
+     */
+    abstract public function lengthNorm($fieldName, $numTokens);
+
+    /**
+     * Computes the normalization value for a query given the sum of the squared
+     * weights of each of the query terms.  This value is then multiplied into the
+     * weight of each query term.
+     *
+     * This does not affect ranking, but rather just attempts to make scores
+     * from different queries comparable.
+     *
+     * sumOfSquaredWeights - the sum of the squares of query term weights
+     * Returns a normalization factor for query weights
+     *
+     * @param float $sumOfSquaredWeights
+     * @return float
+     */
+    abstract public function queryNorm($sumOfSquaredWeights);
+
+
+    /**
+     * Decodes a normalization factor stored in an index.
+     *
+     * Only the low eight bits of $byte are used as the table index.
+     *
+     * @param integer $byte
+     * @return float
+     */
+    static public function decodeNorm($byte)
+    {
+        return self::$_normTable[$byte & 0xFF];
+    }
+
+
+    /**
+     * Encodes a normalization factor for storage in an index.
+     *
+     * The encoding uses a five-bit exponent and three-bit mantissa, thus
+     * representing values from around 7x10^9 to 2x10^-9 with about one
+     * significant decimal digit of accuracy.  Zero is also represented.
+     * Negative numbers are rounded up to zero.  Values too large to represent
+     * are rounded down to the largest representable value.  Positive values too
+     * small to represent are rounded up to the smallest positive representable
+     * value.
+     *
+     * @param float $f
+     * @return integer
+     */
+    static function encodeNorm($f)
+    {
+        return self::_floatToByte($f);
+    }
+
+    /**
+     * Float to byte conversion
+     *
+     * Binary-searches self::$_normTable for $f and returns the index of the
+     * closest entry, honouring the rounding rules documented on encodeNorm().
+     *
+     * @param float $f
+     * @return integer  index into self::$_normTable (0..255)
+     */
+    static private function _floatToByte($f)
+    {
+        // round negatives up to zero
+        if ($f <= 0.0) {
+            return 0;
+        }
+
+        // search for appropriate value
+        $lowIndex = 0;
+        $highIndex = 255;
+        while ($highIndex >= $lowIndex) {
+            // $mid = ($highIndex - $lowIndex)/2;
+            $mid = ($highIndex + $lowIndex) >> 1;
+            $delta = $f - self::$_normTable[$mid];
+
+            if ($delta < 0) {
+                $highIndex = $mid-1;
+            } elseif ($delta > 0) {
+                $lowIndex = $mid+1;
+            } else {
+                return $mid; // We got it!
+            }
+        }
+
+        // No exact hit: $highIndex now addresses the largest entry below $f.
+        // Index 0 encodes exactly zero, so a positive value must never be
+        // rounded down to it - use the smallest positive entry instead
+        // (the original returned 0 for values closer to 0 than to entry 1).
+        if ($highIndex == 0) {
+            return 1;
+        }
+
+        // round to closest value
+        if ($highIndex != 255 &&
+            $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex+1] - $f ) {
+            return $highIndex + 1;
+        } else {
+            return $highIndex;
+        }
+    }
+
+
+    /**
+     * Computes a score factor based on a term or phrase's frequency in a
+     * document.  This value is multiplied by the idf(Term, Searcher)
+     * factor for each term in the query and these products are then summed to
+     * form the initial score for a document.
+ * + * Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when 'freq' is large, and smaller values when 'freq' + * is small. + * + * freq - the frequency of a term within a document + * Returns a score factor based on a term's within-document frequency + * + * @param float $freq + * @return float + */ + abstract public function tf($freq); + + /** + * Computes the amount of a sloppy phrase match, based on an edit distance. + * This value is summed for each sloppy phrase match in a document to form + * the frequency that is passed to tf(float). + * + * A phrase match with a small edit distance to a document passage more + * closely matches the document, so implementations of this method usually + * return larger values when the edit distance is small and smaller values + * when it is large. + * + * distance - the edit distance of this sloppy phrase match + * Returns the frequency increment for this match + * + * @param integer $distance + * @return float + */ + abstract public function sloppyFreq($distance); + + + /** + * Computes a score factor for a simple term or a phrase. + * + * The default implementation is: + * return idfFreq(searcher.docFreq(term), searcher.maxDoc()); + * + * input - the term in question or array of terms + * reader - reader the document collection being searched + * Returns a score factor for the term + * + * @param mixed $input + * @param Zend_Search_Lucene $reader + * @return a score factor for the term + */ + public function idf($input, $reader) + { + if (!is_array($input)) { + return $this->idfFreq($reader->docFreq($input), $reader->count()); + } else { + $idf = 0.0; + foreach ($input as $term) { + $idf += $this->idfFreq($reader->docFreq($term), $reader->count()); + } + return $idf; + } + } + + /** + * Computes a score factor based on a term's document frequency (the number + * of documents which contain the term). 
This value is multiplied by the + * tf(int) factor for each term in the query and these products are + * then summed to form the initial score for a document. + * + * Terms that occur in fewer documents are better indicators of topic, so + * implemenations of this method usually return larger values for rare terms, + * and smaller values for common terms. + * + * docFreq - the number of documents which contain the term + * numDocs - the total number of documents in the collection + * Returns a score factor based on the term's document frequency + * + * @param integer $docFreq + * @param integer $numDocs + * @return float + */ + abstract public function idfFreq($docFreq, $numDocs); + + /** + * Computes a score factor based on the fraction of all query terms that a + * document contains. This value is multiplied into scores. + * + * The presence of a large portion of the query terms indicates a better + * match with the query, so implemenations of this method usually return + * larger values when the ratio between these parameters is large and smaller + * values when the ratio between them is small. + * + * overlap - the number of query terms matched in the document + * maxOverlap - the total number of terms in the query + * Returns a score factor based on term overlap with the query + * + * @param integer $overlap + * @param integer $maxOverlap + * @return float + */ + abstract public function coord($overlap, $maxOverlap); +} + diff --git a/search/Zend/Search/Lucene/Search/Similarity/Default.php b/search/Zend/Search/Lucene/Search/Similarity/Default.php new file mode 100644 index 0000000000..6cafb59668 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/Similarity/Default.php @@ -0,0 +1,105 @@ +createWeight(). + * The sumOfSquaredWeights() method is then called on the top-level + * query to compute the query normalization factor Similarity->queryNorm(float). + * This factor is then passed to normalize(float). At this point the weighting + * is complete. 
+ * + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +abstract class Zend_Search_Lucene_Search_Weight +{ + /** + * The weight for this query. + * + * @return float + */ + abstract public function getValue(); + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + abstract public function sumOfSquaredWeights(); + + /** + * Assigns the query normalization factor to this. + * + * @param $norm + */ + abstract public function normalize($norm); +} + diff --git a/search/Zend/Search/Lucene/Search/Weight/MultiTerm.php b/search/Zend/Search/Lucene/Search/Weight/MultiTerm.php new file mode 100644 index 0000000000..448bb064eb --- /dev/null +++ b/search/Zend/Search/Lucene/Search/Weight/MultiTerm.php @@ -0,0 +1,135 @@ +_query = $query; + $this->_reader = $reader; + $this->_weights = array(); + + $signs = $query->getSigns(); + + foreach ($query->getTerms() as $num => $term) { + if ($signs === null || $signs[$num] === null || $signs[$num]) { + $this->_weights[$num] = new Zend_Search_Lucene_Search_Weight_Term($term, $query, $reader); + $query->setWeight($num, $this->_weights[$num]); + } + } + } + + + /** + * The weight for this query + * + * @return float + */ + public function getValue() + { + return $this->_query->getBoost(); + } + + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + public function sumOfSquaredWeights() + { + $sum = 0; + foreach ($this->_weights as $weight) { + // sum sub weights + $sum += $weight->sumOfSquaredWeights(); + } + + // boost each sub-weight + $sum *= $this->_query->getBoost() * $this->_query->getBoost(); + + // check for empty query (like '-something -another') + if ($sum == 0) { + $sum = 1.0; + } + return $sum; + } + + + /** + * Assigns the query normalization factor to this. 
+ * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + // incorporate boost + $queryNorm *= $this->_query->getBoost(); + + foreach ($this->_weights as $weight) { + $weight->normalize($queryNorm); + } + } +} + + diff --git a/search/Zend/Search/Lucene/Search/Weight/Phrase.php b/search/Zend/Search/Lucene/Search/Weight/Phrase.php new file mode 100644 index 0000000000..536659614c --- /dev/null +++ b/search/Zend/Search/Lucene/Search/Weight/Phrase.php @@ -0,0 +1,141 @@ +_query = $query; + $this->_reader = $reader; + } + + + /** + * The weight for this query + * + * @return float + */ + public function getValue() + { + return $this->_value; + } + + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + public function sumOfSquaredWeights() + { + // compute idf + $this->_idf = $this->_reader->getSimilarity()->idf($this->_query->getTerms(), $this->_reader); + + // compute query weight + $this->_queryWeight = $this->_idf * $this->_query->getBoost(); + + // square it + return $this->_queryWeight * $this->_queryWeight; + } + + + /** + * Assigns the query normalization factor to this. + * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + $this->_queryNorm = $queryNorm; + + // normalize query weight + $this->_queryWeight *= $queryNorm; + + // idf for documents + $this->_value = $this->_queryWeight * $this->_idf; + } +} + + diff --git a/search/Zend/Search/Lucene/Search/Weight/Term.php b/search/Zend/Search/Lucene/Search/Weight/Term.php new file mode 100644 index 0000000000..d502896a5b --- /dev/null +++ b/search/Zend/Search/Lucene/Search/Weight/Term.php @@ -0,0 +1,146 @@ +_term = $term; + $this->_query = $query; + $this->_reader = $reader; + } + + + /** + * The weight for this query + * + * @return float + */ + public function getValue() + { + return $this->_value; + } + + + /** + * The sum of squared weights of contained query clauses. 
+ * + * @return float + */ + public function sumOfSquaredWeights() + { + // compute idf + $this->_idf = $this->_reader->getSimilarity()->idf($this->_term, $this->_reader); + + // compute query weight + $this->_queryWeight = $this->_idf * $this->_query->getBoost(); + + // square it + return $this->_queryWeight * $this->_queryWeight; + } + + + /** + * Assigns the query normalization factor to this. + * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + $this->_queryNorm = $queryNorm; + + // normalize query weight + $this->_queryWeight *= $queryNorm; + + // idf for documents + $this->_value = $this->_queryWeight * $this->_idf; + } +} + diff --git a/search/Zend/Search/Lucene/Storage/Directory.php b/search/Zend/Search/Lucene/Storage/Directory.php new file mode 100644 index 0000000000..01ea380e3c --- /dev/null +++ b/search/Zend/Search/Lucene/Storage/Directory.php @@ -0,0 +1,120 @@ + Zend_Search_Lucene_Storage_File object + * + * @var array + * @throws Zend_Search_Lucene_Exception + */ + private $_fileHandlers; + + + /** + * Utility function to recursive directory creation + * + * @param string $dir + * @param integer $mode + * @param boolean $recursive + * @return boolean + */ + + static public function mkdirs($dir, $mode = 0777, $recursive = true) + { + if (is_null($dir) || $dir === '') { + return false; + } + if (is_dir($dir) || $dir === '/') { + return true; + } + if (self::mkdirs(dirname($dir), $mode, $recursive)) { + return mkdir($dir, $mode); + } + return false; + } + + + /** + * Object constructor + * Checks if $path is a directory or tries to create it. 
+ * + * @param string $path + * @throws Zend_Search_Lucene_Exception + */ + public function __construct($path) + { + if (!is_dir($path)) { + if (file_exists($path)) { + throw new Zend_Search_Lucene_Exception('Path exists, but it\'s not a directory'); + } else { + if (!self::mkdirs($path)) { + throw new Zend_Search_Lucene_Exception("Can't create directory '$path'."); + } + } + } + $this->_dirPath = $path; + $this->_fileHandlers = array(); + } + + + /** + * Closes the store. + * + * @return void + */ + public function close() + { + foreach ($this->_fileHandlers as $fileObject) { + $fileObject->close(); + } + + unset($this->_fileHandlers); + } + + + /** + * Returns an array of strings, one for each file in the directory. + * + * @return array + */ + public function fileList() + { + $result = array(); + + $dirContent = opendir( $this->_dirPath ); + while ($file = readdir($dirContent)) { + if (($file == '..')||($file == '.')) continue; + + $fullName = $this->_dirPath . '/' . $file; + + if( !is_dir($this->_dirPath . '/' . $file) ) { + $result[] = $file; + } + } + + return $result; + } + + /** + * Creates a new, empty file in the directory with the given $filename. + * + * @param string $filename + * @return Zend_Search_Lucene_Storage_File + */ + public function createFile($filename) + { + if (isset($this->_fileHandlers[$filename])) { + $this->_fileHandlers[$filename]->close(); + } + unset($this->_fileHandlers[$filename]); + $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename, 'w+b'); + return $this->_fileHandlers[$filename]; + } + + + /** + * Removes an existing $filename in the directory. + * + * @param string $filename + * @return void + */ + public function deleteFile($filename) + { + if (isset($this->_fileHandlers[$filename])) { + $this->_fileHandlers[$filename]->close(); + } + unset($this->_fileHandlers[$filename]); + unlink($this->_dirPath .'/'. 
$filename); + } + + + /** + * Returns true if a file with the given $filename exists. + * + * @param string $filename + * @return boolean + */ + public function fileExists($filename) + { + return isset($this->_fileHandlers[$filename]) || + file_exists($this->_dirPath . '/' . $filename); + } + + + /** + * Returns the length of a $filename in the directory. + * + * @param string $filename + * @return integer + */ + public function fileLength($filename) + { + if (isset( $this->_fileHandlers[$filename] )) { + return $this->_fileHandlers[$filename]->size(); + } + return filesize($this->_dirPath .'/'. $filename); + } + + + /** + * Returns the UNIX timestamp $filename was last modified. + * + * @param string $filename + * @return integer + */ + public function fileModified($filename) + { + return filemtime($this->_dirPath .'/'. $filename); + } + + + /** + * Renames an existing file in the directory. + * + * @param string $from + * @param string $to + * @return void + */ + public function renameFile($from, $to) + { + if ($this->_fileHandlers[$from] !== null) { + $this->_fileHandlers[$from]->close(); + } + unset($this->_fileHandlers[$from]); + + if ($this->_fileHandlers[$to] !== null) { + $this->_fileHandlers[$to]->close(); + } + unset($this->_fileHandlers[$to]); + + if (file_exists($this->_dirPath . '/' . $to)) { + unlink($this->_dirPath . '/' . $to); + } + + return @rename($this->_dirPath . '/' . $from, $this->_dirPath . '/' . $to); + } + + + /** + * Sets the modified time of $filename to now. + * + * @param string $filename + * @return void + */ + public function touchFile($filename) + { + return touch($this->_dirPath .'/'. $filename); + } + + + /** + * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory. 
+ * + * @param string $filename + * @return Zend_Search_Lucene_Storage_File + */ + public function getFileObject($filename) + { + if (isset( $this->_fileHandlers[$filename] )) { + $this->_fileHandlers[$filename]->seek(0); + return $this->_fileHandlers[$filename]; + } + + $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename); + return $this->_fileHandlers[$filename]; + } +} + diff --git a/search/Zend/Search/Lucene/Storage/File.php b/search/Zend/Search/Lucene/Storage/File.php new file mode 100644 index 0000000000..a53c75b709 --- /dev/null +++ b/search/Zend/Search/Lucene/Storage/File.php @@ -0,0 +1,371 @@ +_fread(1)); + } + + /** + * Writes a byte to the end of the file. + * + * @param integer $byte + */ + public function writeByte($byte) + { + return $this->_fwrite(chr($byte), 1); + } + + /** + * Read num bytes from the current position in the file + * and advances the file pointer. + * + * @param integer $num + * @return string + */ + public function readBytes($num) + { + return $this->_fread($num); + } + + /** + * Writes num bytes of data (all, if $num===null) to the end + * of the string. + * + * @param string $data + * @param integer $num + */ + public function writeBytes($data, $num=null) + { + $this->_fwrite($data, $num); + } + + + /** + * Reads an integer from the current position in the file + * and advances the file pointer. + * + * @return integer + */ + public function readInt() + { + $str = $this->_fread(4); + + return ord($str{0}) << 24 | + ord($str{1}) << 16 | + ord($str{2}) << 8 | + ord($str{3}); + } + + + /** + * Writes an integer to the end of file. + * + * @param integer $value + */ + public function writeInt($value) + { + settype($value, 'integer'); + $this->_fwrite( chr($value>>24 & 0xFF) . + chr($value>>16 & 0xFF) . + chr($value>>8 & 0xFF) . + chr($value & 0xFF), 4 ); + } + + + /** + * Returns a long integer from the current position in the file + * and advances the file pointer. 
+ * + * @return integer + */ + public function readLong() + { + $str = $this->_fread(8); + + /** + * PHP uses long as largest integer. fseek() uses long for offset. + * long has 4 bytes in a lot of systems. 4 bytes are discarded to prevent + * conversion to float. + * So, largest index segment file is 2Gb + */ + return /* ord($str{0}) << 56 | */ + /* ord($str{1}) << 48 | */ + /* ord($str{2}) << 40 | */ + /* ord($str{3}) << 32 | */ + ord($str{4}) << 24 | + ord($str{5}) << 16 | + ord($str{6}) << 8 | + ord($str{7}); + } + + /** + * Writes long integer to the end of file + * + * @param integer $value + */ + public function writeLong($value) + { + /** + * PHP uses long as largest integer. fseek() uses long for offset. + * long has 4 bytes in a lot of systems. 4 bytes are discarded to prevent + * conversion to float. + * So, largest index segment file is 2Gb + */ + settype($value, 'integer'); + $this->_fwrite( "\x00\x00\x00\x00" . + chr($value>>24 & 0xFF) . + chr($value>>16 & 0xFF) . + chr($value>>8 & 0xFF) . + chr($value & 0xFF), 8 ); + } + + + + /** + * Returns a variable-length integer from the current + * position in the file and advances the file pointer. + * + * @return integer + */ + public function readVInt() + { + $nextByte = ord($this->_fread(1)); + $val = $nextByte & 0x7F; + + for ($shift=7; ($nextByte & 0x80) != 0; $shift += 7) { + $nextByte = ord($this->_fread(1)); + $val |= ($nextByte & 0x7F) << $shift; + } + return $val; + } + + /** + * Writes a variable-length integer to the end of file. + * + * @param integer $value + */ + public function writeVInt($value) + { + settype($value, 'integer'); + while ($value > 0x7F) { + $this->_fwrite(chr( ($value & 0x7F)|0x80 )); + $value >>= 7; + } + $this->_fwrite(chr($value)); + } + + + /** + * Reads a string from the current position in the file + * and advances the file pointer. 
+ * + * @return string + */ + public function readString() + { + $strlen = $this->readVInt(); + if ($strlen == 0) { + return ''; + } else { + /** + * This implementation supports only Basic Multilingual Plane + * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support + * "supplementary characters" (characters whose code points are + * greater than 0xFFFF) + * Java 2 represents these characters as a pair of char (16-bit) + * values, the first from the high-surrogates range (0xD800-0xDBFF), + * the second from the low-surrogates range (0xDC00-0xDFFF). Then + * they are encoded as usual UTF-8 characters in six bytes. + * Standard UTF-8 representation uses four bytes for supplementary + * characters. + */ + + $str_val = $this->_fread($strlen); + + for ($count = 0; $count < $strlen; $count++ ) { + if (( ord($str_val{$count}) & 0xC0 ) == 0xC0) { + $addBytes = 1; + if (ord($str_val{$count}) & 0x20 ) { + $addBytes++; + + // Never used. Java2 doesn't encode strings in four bytes + if (ord($str_val{$count}) & 0x10 ) { + $addBytes++; + } + } + $str_val .= $this->_fread($addBytes); + $strlen += $addBytes; + + // Check for null character. Java2 encodes null character + // in two bytes. + if (ord($str_val{$count}) == 0xC0 && + ord($str_val{$count+1}) == 0x80 ) { + $str_val{$count} = 0; + $str_val = substr($str_val,0,$count+1) + . substr($str_val,$count+2); + } + $count += $addBytes; + } + } + + return $str_val; + } + } + + /** + * Writes a string to the end of file. 
+ * + * @param string $str + * @throws Zend_Search_Lucene_Exception + */ + public function writeString($str) + { + /** + * This implementation supports only Basic Multilingual Plane + * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support + * "supplementary characters" (characters whose code points are + * greater than 0xFFFF) + * Java 2 represents these characters as a pair of char (16-bit) + * values, the first from the high-surrogates range (0xD800-0xDBFF), + * the second from the low-surrogates range (0xDC00-0xDFFF). Then + * they are encoded as usual UTF-8 characters in six bytes. + * Standard UTF-8 representation uses four bytes for supplementary + * characters. + */ + + // convert input to a string before iterating string characters + settype($str, 'string'); + + $chars = $strlen = strlen($str); + $containNullChars = false; + + for ($count = 0; $count < $strlen; $count++ ) { + /** + * String is already in Java 2 representation. + * We should only calculate actual string length and replace + * \x00 by \xC0\x80 + */ + if ((ord($str{$count}) & 0xC0) == 0xC0) { + $addBytes = 1; + if (ord($str{$count}) & 0x20 ) { + $addBytes++; + + // Never used. Java2 doesn't encode strings in four bytes + // and we dont't support non-BMP characters + if (ord($str{$count}) & 0x10 ) { + $addBytes++; + } + } + $chars -= $addBytes; + + if (ord($str{$count}) == 0 ) { + $containNullChars = true; + } + $count += $addBytes; + } + } + + if ($chars < 0) { + throw new Zend_Search_Lucene_Exception('Invalid UTF-8 string'); + } + + $this->writeVInt($chars); + if ($containNullChars) { + $this->_fwrite(str_replace($str, "\x00", "\xC0\x80")); + } else { + $this->_fwrite($str); + } + } + + + /** + * Reads binary data from the current position in the file + * and advances the file pointer. 
+ * + * @return string + */ + public function readBinary() + { + return $this->_fread($this->readVInt()); + } +} \ No newline at end of file diff --git a/search/Zend/Search/Lucene/Storage/File/Filesystem.php b/search/Zend/Search/Lucene/Storage/File/Filesystem.php new file mode 100644 index 0000000000..7c33543dd6 --- /dev/null +++ b/search/Zend/Search/Lucene/Storage/File/Filesystem.php @@ -0,0 +1,171 @@ +_fileHandle = @fopen($filename, $mode); + + if ($this->_fileHandle===false) { + ini_set('track_errors', $trackErrors); + throw new Zend_Search_Lucene_Exception($php_errormsg); + } + + ini_set('track_errors', $trackErrors); + } + + /** + * Sets the file position indicator and advances the file pointer. + * The new position, measured in bytes from the beginning of the file, + * is obtained by adding offset to the position specified by whence, + * whose values are defined as follows: + * SEEK_SET - Set position equal to offset bytes. + * SEEK_CUR - Set position to current location plus offset. + * SEEK_END - Set position to end-of-file plus offset. (To move to + * a position before the end-of-file, you need to pass a negative value + * in offset.) + * SEEK_CUR is the only supported offset type for compound files + * + * Upon success, returns 0; otherwise, returns -1 + * + * @param integer $offset + * @param integer $whence + * @return integer + */ + public function seek($offset, $whence=SEEK_SET) + { + return fseek($this->_fileHandle, $offset, $whence); + } + + + /** + * Get file position. 
+ * + * @return integer + */ + public function tell() + { + return ftell($this->_fileHandle); + } + + + /** + * Close File object + */ + public function close() + { + if ($this->_fileHandle !== null ) { + @fclose($this->_fileHandle); + $this->_fileHandle = null; + } + } + + /** + * Get the size of the already opened file + * + * @return integer + */ + public function size() + { + $position = ftell($this->_fileHandle); + fseek($this->_fileHandle, 0, SEEK_END); + $size = ftell($this->_fileHandle); + fseek($this->_fileHandle,$position); + + return $size; + } + + /** + * Read a $length bytes from the file and advance the file pointer. + * + * @param integer $length + * @return string + */ + protected function _fread($length=1) + { + if ($length == 0) { + return ''; + } + + if ($length < 1024) { + return fread($this->_fileHandle, $length); + } + + $data = ''; + while ( $length > 0 && ($nextBlock = fread($this->_fileHandle, $length)) != false ) { + $data .= $nextBlock; + $length -= strlen($nextBlock); + } + return $data; + } + + + /** + * Writes $length number of bytes (all, if $length===null) to the end + * of the file. + * + * @param string $data + * @param integer $length + */ + protected function _fwrite($data, $length=null) + { + if ($length === null ) { + fwrite($this->_fileHandle, $data); + } else { + fwrite($this->_fileHandle, $data, $length); + } + } +} + diff --git a/search/Zend/Search/TODO.txt b/search/Zend/Search/TODO.txt new file mode 100644 index 0000000000..06f7b48792 --- /dev/null +++ b/search/Zend/Search/TODO.txt @@ -0,0 +1,14 @@ +@todo + +- Improve API: fix ZSearchMultiTermQuery($terms, $signs); + +- Analysis and indexing engine + +- Additional queries: phrase, wildcard, proximity, and range + +- Better class-level docblocks (most functions okay) + +- Some Windows issues(?) 
during indexing + +- Finish renaming classes to PEAR-like conventions + diff --git a/search/db/mysql.sql b/search/db/mysql.sql new file mode 100644 index 0000000000..867b5751e0 --- /dev/null +++ b/search/db/mysql.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS `search_documents` ( + `id` int(11) NOT NULL auto_increment, + `type` varchar(12) NOT NULL default 'none', + `title` varchar(100) NOT NULL default '', + `url` varchar(100) NOT NULL default '', + `updated` timestamp NOT NULL default CURRENT_TIMESTAMP, + `courseid` int(11) NOT NULL default '0', + `userid` int(11) NOT NULL default '0', + `groupid` int(11) NOT NULL default '0', + PRIMARY KEY (`id`) +) ENGINE=MyISAM AUTO_INCREMENT=1; + +DELETE FROM `search_documents` WHERE 1; +ALTER TABLE `search_documents` AUTO_INCREMENT =1; + diff --git a/search/db/postgres7.sql b/search/db/postgres7.sql new file mode 100644 index 0000000000..19e5fb3165 --- /dev/null +++ b/search/db/postgres7.sql @@ -0,0 +1,21 @@ +--probably a bit suspect, need to explicitly create +--id sequence (i.e. don't depend on postgres default seq naming)? 
+--not sure about table owner either + +CREATE TABLE search_documents +( + id serial, + "type" varchar(12) NOT NULL DEFAULT 'none', + title varchar(100) NOT NULL default '', + url varchar(100) NOT NULL default '', + updated timestamp NOT NULL DEFAULT NOW(), + courseid int4, + userid int4, + groupid int4, + CONSTRAINT id_pkey PRIMARY KEY (id) +) WITHOUT OIDS; + +--ALTER TABLE search_documents OWNER TO postgres; + +DELETE FROM search_documents; +SELECT setval('public.search_documents_id_seq', 1); diff --git a/search/documents/document.php b/search/documents/document.php new file mode 100644 index 0000000000..f5d4697d76 --- /dev/null +++ b/search/documents/document.php @@ -0,0 +1,12 @@ +addField(Zend_Search_Lucene_Field::Keyword('type', $document_type)); + $this->addField(Zend_Search_Lucene_Field::Keyword('courseid', $cid)); + $this->addField(Zend_Search_Lucene_Field::Keyword('userid', $uid)); + $this->addField(Zend_Search_Lucene_Field::Keyword('groupid', $gid)); + } //constructor + } //SearchDocument + +?> \ No newline at end of file diff --git a/search/documents/wiki_document.php b/search/documents/wiki_document.php new file mode 100644 index 0000000000..a6d75aef8d --- /dev/null +++ b/search/documents/wiki_document.php @@ -0,0 +1,28 @@ +dirroot/search/documents/document.php"); + + class WikiSearchDocument extends SearchDocument { + public function __construct(&$page, $wiki_id, $cid, $uid, $gid) { + $this->addField(Zend_Search_Lucene_Field::Text('title', $page->pagename)); + $this->addField(Zend_Search_Lucene_Field::Text('author', $page->author)); + $this->addField(Zend_Search_Lucene_Field::UnStored('contents', $page->content)); + + $this->addField(Zend_Search_Lucene_Field::Keyword('id', $page->id)); + $this->addField(Zend_Search_Lucene_Field::Keyword('version', $page->version)); + $this->addField(Zend_Search_Lucene_Field::Keyword('wiki', $wiki_id)); + + parent::__construct(SEARCH_WIKI_TYPE, $cid, $uid, $gid); + } //constructor + } //WikiSearchDocument + + function 
wiki_name_convert($str) { + return str_replace(' ', '+', $str); + } //wiki_name_convert + + function wiki_make_link(&$doc) { + global $CFG; + return $CFG->wwwroot.'/mod/wiki/view.php?wid='.$doc->wiki.'&page='.wiki_name_convert($doc->title).'&version='.$doc->version; + } //wiki_make_link + +?> \ No newline at end of file diff --git a/search/index.php b/search/index.php new file mode 100644 index 0000000000..8c4db6584e --- /dev/null +++ b/search/index.php @@ -0,0 +1,10 @@ +id, "wiki", "view all", "index.php?id=$course->id", "");*/ + + header("Location: query.php"); +?> \ No newline at end of file diff --git a/search/indexer.php b/search/indexer.php new file mode 100644 index 0000000000..b91b23ad21 --- /dev/null +++ b/search/indexer.php @@ -0,0 +1,152 @@ +dirroot/search/lib.php"); + + require_login(); + + if (!isadmin()) { + error("You need to be an admin user to use this page.", "$CFG->wwwroot/login/index.php"); + } //if + + $sure = strtolower(optional_param('areyousure', '', PARAM_ALPHA)); + + if ($sure != 'yes') { + mtrace("Sorry, you weren't sure enough (back to query page)."); + exit(0); + } //if + + //check for php5 (lib.php) + if (!search_check_php5()) { + $phpversion = phpversion(); + mtrace("Sorry, global search requires PHP 5.0.0 or later (currently using version $phpversion)"); + exit(0); + } //if + + require_once("$CFG->dirroot/search/Zend/Search/Lucene.php"); + + //begin timer + search_stopwatch(); + mtrace('
Server Time: '.date('r',time())."\n");
+  
+  //where the Lucene index files live, and which SQL script matches the configured database type
+  $index_path = $CFG->dataroot.'/search';
+  $index_db_file = "$CFG->dirroot/search/db/$CFG->dbtype.sql";
+
+  //make sure the data directory is there, creating it on first use
+  if (file_exists($index_path)) {
+    mtrace("Using $index_path as data directory.");
+  } else {
+    mtrace("Data directory ($index_path) does not exist, attempting to create.");
+    if (mkdir($index_path)) {
+      mtrace("Directory successfully created.");
+    } else {
+      //creation failed: report the problem through the search library's print-and-exit helper
+      search_pexit("Error creating data directory at: $index_path. Please correct.");
+    } //else
+  } //else
+
+  //stop accidental re-indexing (zzz)
+  //search_pexit("Not indexing at this time.");
+
+  //open the Lucene index stored in the data directory
+  //NOTE(review): the second argument (true) presumably requests a freshly created index - confirm against Zend_Search_Lucene::__construct()
+  $index = new Zend_Search_Lucene($index_path, true);
+  
+  //create the database tables by running the SQL script chosen above for the configured database type
+  ob_start(); //turn output buffering on - to hide modify_database() output
+  modify_database($index_db_file, '', false);
+  ob_end_clean(); //chuck the buffer and resume normal operation
+  
+  //the scripts in search/db/ end by deleting all rows from search_documents and
+  //resetting its id counter to 1, so the steps listed below appear to be covered there already
+  //empty database table goes here
+  // delete * from search_documents;
+  // set auto_increment back to 1
+  
+  //-------- debug stuff
+  /*
+  include_once("$CFG->dirroot/mod/wiki/lib.php");
+  
+  $wikis = get_all_instances_in_courses("wiki", get_courses());
+  #search_pexit($wikis[1]);
+  $entries = wiki_get_entries($wikis[1]);
+  #search_pexit($entries);
+    
+  #$r = wiki_get_pages($entries[134]);
+  $r = wiki_get_latest_pages($entries[95]);
+  
+  search_pexit($r);
+  //ignore me --------*/
+    
+  //Index every activity module that provides both search hooks in its lib.php:
+  //  <modname>_iterator()                 - returns the items to walk (e.g. module instances)
+  //  <modname>_get_content_for_index($i)  - returns the search documents for one such item
+  //Each document gets a summary row in search_documents, and the id of that row
+  //is stored in the Lucene document as the 'dbid' keyword so both can be matched up later.
+  mtrace('Starting activity modules');
+  if ($mods = get_records_select('modules' /*'index this module?' where statement*/)) {
+    foreach ($mods as $mod) {
+      $libfile = "$CFG->dirroot/mod/$mod->name/lib.php";
+      if (file_exists($libfile)) {
+        include_once($libfile);
+
+        $iter_function = $mod->name.'_iterator';
+        $index_function = $mod->name.'_get_content_for_index';
+        $include_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
+        $c = 0; //number of documents indexed for this module
+
+        if (function_exists($index_function) && function_exists($iter_function)) {
+          include_once($include_file);
+
+          mtrace("Processing module function $index_function ...");
+
+          //a hook may return false/null when there is nothing to index; treat that as an empty list
+          $items = $iter_function();
+          if (!is_array($items)) {
+            $items = array();
+          } //if
+
+          foreach ($items as $i) {
+            $documents = $index_function($i);
+            if (!is_array($documents)) {
+              continue;
+            } //if
+
+            //begin transaction
+
+            foreach ($documents as $document) {
+              //db sync increases indexing time from 55 sec to 73 (64 on Saturday?), so ~30%
+              //therefore, let us make a custom insert function for this search module
+
+              //data object for db - a fresh one per document so nothing leaks between rows
+              $doc = new stdClass;
+              $doc->type = $document->type;
+              //escape in a database-independent way (the previous mysql_real_escape_string()
+              //call only works with an open MySQL connection, yet a postgres schema is shipped too)
+              //NOTE(review): assumes insert_record() expects slash-escaped values - confirm
+              $doc->title = addslashes($document->title);
+              //NOTE(review): the search_documents schema in search/db/ has no 'update' or
+              //'permissions' column (the timestamp column is called 'updated'); presumably
+              //insert_record() ignores unknown fields - confirm, then drop or rename these
+              $doc->update = time();
+              $doc->permissions = 0;
+              $doc->url = 'none';
+              $doc->courseid = $document->courseid;
+              $doc->userid = $document->userid;
+              $doc->groupid = $document->groupid;
+
+              //insert summary into db
+              //NOTE(review): insert_record() normally prepends $CFG->prefix itself, while the
+              //SQL scripts create an unprefixed table - verify this works with a non-empty prefix
+              $id = insert_record($CFG->prefix.'search_documents', $doc);
+
+              if (!$id) {
+                //without a db row there is no valid 'dbid' to store, so do not index the document
+                mtrace("Could not insert a search_documents record for a $document->type document, skipping it.");
+                continue;
+              } //if
+
+              //synchronise db with index
+              $document->addField(Zend_Search_Lucene_Field::Keyword('dbid', $id));
+              $index->addDocument($document);
+              $c++;
+
+              //commit every 100 new documents, and print a status message
+              if (($c%100) == 0) {
+                $index->commit();
+                mtrace(".. $c");
+              } //if
+            } //foreach
+
+            //end transaction
+
+          } //foreach
+
+          //commit left over documents, and finish up
+          $index->commit();
+          mtrace("-- $c documents indexed");
+          mtrace('done.');
+        } //if
+      } //if
+    } //foreach
+  } //if
+  
+  //done modules
+  mtrace('Finished activity modules');
+  search_stopwatch();
+  mtrace(".
Back to query page."); + mtrace('
'); + +?> \ No newline at end of file diff --git a/search/indexersplash.php b/search/indexersplash.php new file mode 100644 index 0000000000..c10df92bd6 --- /dev/null +++ b/search/indexersplash.php @@ -0,0 +1,44 @@ +dirroot/search/lib.php"); + + require_login(); + + if (!isadmin()) { + error("You need to be an admin user to use this page.", "$CFG->wwwroot/login/index.php"); + } //if + + //check for php5 (lib.php) + if (!search_check_php5()) { + $phpversion = phpversion(); + mtrace("Sorry, global search requires PHP 5.0.0 or later (currently using version $phpversion)"); + exit(0); + } //if + + $index_path = "$CFG->dataroot/search"; + $index_dir = get_directory_list($index_path, '', false, false); + $index_filecount = count($index_dir); + + //check if the table exists in the db + $tables = $db->MetaTables(); + + if (array_search('search_documents', $tables)) { + $db_count = count_records($CFG->prefix.'search_documents'); + } else { + $db_count = 0; + } //else + + //elaborate on error messages, when db!=0 and index=0 -> corrupt, etc. + if ($index_filecount != 0 or $db_count != 0) { + mtrace("
The data directory ($index_path) contains $index_filecount files, and "
+          ."there are $db_count records in the search_documents table.");    
+    mtrace('');    
+    mtrace("This indicates that you have already indexed this site - click the following "
+          ."link if you're sure you want to continue: Go!");          
+    mtrace('');          
+    mtrace("Back to query page.");
+    mtrace("
"); + } else { + header('Location: indexer.php?areyousure=yes'); + } //else +?> \ No newline at end of file diff --git a/search/lib.php b/search/lib.php new file mode 100644 index 0000000000..081d9ef0bf --- /dev/null +++ b/search/lib.php @@ -0,0 +1,59 @@ +'; + print round(microtime(true) - $GLOBALS['search_script_start_time'], 6).' seconds'; + if (!$cli) print ''; + + unset($GLOBALS['search_script_start_time']); + } else { + $GLOBALS['search_script_start_time'] = microtime(true); + } //else + } //search_stopwatch + + //print and exit (for debugging) + function search_pexit($str = "") { + if (is_array($str) or is_object($str)) { + print_r($str); + } else if ($str) { + print $str."
"; + } //if + + exit(0); + } //search_pexit + +?> \ No newline at end of file diff --git a/search/query.php b/search/query.php new file mode 100644 index 0000000000..59169b5d2e --- /dev/null +++ b/search/query.php @@ -0,0 +1,116 @@ +dirroot/search/lib.php"); + + //check for php5, but don't die yet (see line 27) + if ($check = search_check_php5()) { + require_once("$CFG->dirroot/search/Zend/Search/Lucene.php"); + require_once("$CFG->dirroot/search/documents/wiki_document.php"); + + $query_string = optional_param('query_string', '', PARAM_CLEAN); + $index_path = "$CFG->dataroot/search"; + $no_index = false; //optimism! + + try { + $index = new Zend_Search_Lucene($index_path, false); + } catch(Exception $e) { + //print $e; + $no_index = true; + } //catch + } //if + + if (!$site = get_site()) { + redirect("index.php"); + } //if + + $strsearch = "Search"; //get_string(); + $strquery = "Enter your search query"; //get_string(); + + print_header("$site->shortname: $strsearch: $strquery", "$site->fullname", + "$strsearch -> $strquery"); + + //keep things pretty, even if php5 isn't available + if (!$check) { + print_heading(search_check_php5(true)); + print_footer(); + exit(0); + } //if + + print_simple_box_start('center', '100%', '', 20); + print_heading($strquery); + + print_simple_box_start('center', '', '', 20); +?> + +
+ +    Advanced search + Statistics +
+ +
+ +
+count(); +} //else + +print ' documents.'; + +if ($no_index and isadmin()) { + print "

Admin: There appears to be no index, click here to create one."; +} //if +?> +
+ +find(strtolower($query_string)); + + if (count($hits) > 0) { + $link_function = $hits[0]->type.'_make_link'; + } //if + + print "
"; + + print count($hits)." results returned for '".$query_string."'."; + print "

"; + + print "
    "; + + foreach ($hits as $listing) { + print "
  1. $listing->title
    \n" + ."".search_shorten_url($link_function($listing), 70)."
    \n" + ."Type: ".$listing->type.", score: ".round($listing->score, 3)."
    \n" + ."
  2. \n"; + } //foreach + + print "
"; + + print_simple_box_end(); + } //if + + if (!empty($query_string) and !$no_index) { +?> + +
+ It took to fetch these results. +
+ + \ No newline at end of file diff --git a/search/stats.php b/search/stats.php new file mode 100644 index 0000000000..caf23e765c --- /dev/null +++ b/search/stats.php @@ -0,0 +1,91 @@ +dirroot/search/lib.php"); + + //check for php5, but don't die yet + if ($check = search_check_php5()) { + //filesystem stats + $index_path = "$CFG->dataroot/search"; + $index_size = display_size(get_directory_size($index_path)); + $index_dir = get_directory_list($index_path, '', false, false); + $index_filecount = count($index_dir); + + //indexed documents stats + $tables = $db->MetaTables(); + + if (array_search('search_documents', $tables)) { + $types = search_get_document_types(); + sort($types); + + //total documents + $type_counts['Total'] = count_records($CFG->prefix.'search_documents'); + + foreach($types as $type) { + $c = count_records($CFG->prefix.'search_documents', 'type', $type); + $type_counts[$type] = (int)$c; + } //foreach + } else { + $type_counts['Total'] = 0; + } //else + } //if + + if (!$site = get_site()) { + redirect("index.php"); + } //if + + $strsearch = "Search"; //get_string(); + $strquery = "Search statistics"; //get_string(); + + print_header("$site->shortname: $strsearch: $strquery", "$site->fullname", + "$strsearch -> $strquery"); + + //keep things pretty, even if php5 isn't available + if (!$check) { + print_heading(search_check_php5(true)); + print_footer(); + exit(0); + } //if + + print_simple_box_start('center', '100%', '', 20); + print_heading($strquery); + + print_simple_box_start('center', '', '', 20); + + $table->tablealign = "center"; + $table->align = array ("right", "left"); + $table->wrap = array ("nowrap", "nowrap"); + $table->cellpadding = 5; + $table->cellspacing = 0; + $table->width = '500'; + + $table->data[] = array('Data directory', ''.$index_path.''); + $table->data[] = array('Files in index directory', $index_filecount); + $table->data[] = array('Total size', $index_size); + + if ($index_filecount == 0) { + $table->data[] = 
array('Click to create index', "Indexer"); + } //if + + $return_of_table->tablealign = "center"; + $return_of_table->align = array ("right", "left"); + $return_of_table->wrap = array ("nowrap", "nowrap"); + $return_of_table->cellpadding = 5; + $return_of_table->cellspacing = 0; + $return_of_table->width = '500'; + + $return_of_table->data[] = array('Database', 'search_documents'); + foreach($type_counts as $key => $value) { + $return_of_table->data[] = array($key, $value); + } //foreach + + if (isadmin()) { + print_table($table); + print_spacer(20); + } //if + + print_table($return_of_table); + + print_simple_box_end(); + print_simple_box_end(); + print_footer(); +?> \ No newline at end of file -- 2.39.5