From: diml Date: Mon, 9 Jul 2007 20:57:29 +0000 (+0000) Subject: Lucene Zend Implementation update (better handle of UTF8) X-Git-Url: http://git.mjollnir.org/gw?a=commitdiff_plain;h=8cfbeb81592532da2136660239f13621e67ea3fd;p=moodle.git Lucene Zend Implementation update (better handle of UTF8) --- diff --git a/search/Zend/Search/Exception.php b/search/Zend/Search/Exception.php index a111cf6fcc..291cc43ed5 100644 --- a/search/Zend/Search/Exception.php +++ b/search/Zend/Search/Exception.php @@ -14,7 +14,7 @@ * * @category Zend * @package Zend_Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -22,13 +22,13 @@ /** * Framework base exception */ -require_once 'Zend/Exception.php'; +require_once $CFG->dirroot.'/search/Zend/Exception.php'; /** * @category Zend * @package Zend_Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Exception extends Zend_Exception diff --git a/search/Zend/Search/Lucene.php b/search/Zend/Search/Lucene.php index 3e33b7c103..1f15c9a0ba 100644 --- a/search/Zend/Search/Lucene.php +++ b/search/Zend/Search/Lucene.php @@ -14,53 +14,78 @@ * * @category Zend * @package Zend_Search_Lucene - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Exception */ -require_once 'Zend/Search/Lucene/Exception.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; /** Zend_Search_Lucene_Document */ -require_once 'Zend/Search/Lucene/Document.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document.php'; + +/** Zend_Search_Lucene_Document_Html */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document/Html.php'; /** Zend_Search_Lucene_Storage_Directory */ -require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/Directory/Filesystem.php'; + +/** Zend_Search_Lucene_Storage_File_Memory */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/File/Memory.php'; /** Zend_Search_Lucene_Index_Term */ -require_once 'Zend/Search/Lucene/Index/Term.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Term.php'; /** Zend_Search_Lucene_Index_TermInfo */ -require_once 'Zend/Search/Lucene/Index/TermInfo.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/TermInfo.php'; /** Zend_Search_Lucene_Index_SegmentInfo */ -require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php'; /** Zend_Search_Lucene_Index_FieldInfo */ -require_once 'Zend/Search/Lucene/Index/FieldInfo.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/FieldInfo.php'; /** Zend_Search_Lucene_Index_Writer */ -require_once 'Zend/Search/Lucene/Index/Writer.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Writer.php'; /** Zend_Search_Lucene_Search_QueryParser */ -require_once 'Zend/Search/Lucene/Search/QueryParser.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParser.php'; /** Zend_Search_Lucene_Search_QueryHit */ -require_once 
'Zend/Search/Lucene/Search/QueryHit.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryHit.php'; /** Zend_Search_Lucene_Search_Similarity */ -require_once 'Zend/Search/Lucene/Search/Similarity.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Similarity.php'; + +/** Zend_Search_Lucene_Index_SegmentInfoPriorityQueue */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php'; + + +/** Zend_Search_Lucene_Interface */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Interface.php'; + +/** Zend_Search_Lucene_Proxy */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Proxy.php'; /** * @category Zend * @package Zend_Search_Lucene - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ -class Zend_Search_Lucene +class Zend_Search_Lucene implements Zend_Search_Lucene_Interface { + /** + * Default field name for search + * + * Null means search through all fields + * + * @var string + */ + private static $_defaultSearchField = null; + /** * File system adapter. 
* @@ -103,6 +128,51 @@ class Zend_Search_Lucene */ private $_hasChanges = false; + + /** + * Index lock object + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_lock; + + /** + * Signal, that index is already closed, changes are fixed and resources are cleaned up + * + * @var boolean + */ + private $_closed = false; + + /** + * Number of references to the index object + * + * @var integer + */ + private $_refCount = 0; + + + /** + * Create index + * + * @param mixed $directory + * @return Zend_Search_Lucene_Interface + */ + public static function create($directory) + { + return new Zend_Search_Lucene_Proxy(new Zend_Search_Lucene($directory, true)); + } + + /** + * Open index + * + * @param mixed $directory + * @return Zend_Search_Lucene_Interface + */ + public static function open($directory) + { + return new Zend_Search_Lucene_Proxy(new Zend_Search_Lucene($directory, false)); + } + /** * Opens the index. * @@ -126,13 +196,32 @@ class Zend_Search_Lucene $this->_closeDirOnExit = true; } + + // Get a shared lock to the index + $this->_lock = $this->_directory->createFile('index.lock'); + + $this->_segmentInfos = array(); + if ($create) { - $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true); + // Throw an exception if index is under processing now + if (!$this->_lock->lock(LOCK_EX, true)) { + throw new Zend_Search_Lucene_Exception('Can\'t create index. 
It\'s under processing now'); + } + + // Writer will create segments file for empty segments list + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, $this->_segmentInfos, true); + + if (!$this->_lock->lock(LOCK_SH)) { + throw new Zend_Search_Lucene_Exception('Can\'t reduce lock level from Exclusive to Shared'); + } } else { + // Wait if index is under switching from one set of segments to another (Index_Writer::_updateSegments()) + if (!$this->_lock->lock(LOCK_SH)) { + throw new Zend_Search_Lucene_Exception('Can\'t obtain shared index lock'); + } $this->_writer = null; } - $this->_segmentInfos = array(); $segmentsFile = $this->_directory->getFileObject('segments'); @@ -143,9 +232,10 @@ class Zend_Search_Lucene } // read version - $segmentsFile->readLong(); + // $segmentsFile->readLong(); + $segmentsFile->readInt(); $segmentsFile->readInt(); - // read counter + // read segment name counter $segmentsFile->readInt(); $segments = $segmentsFile->readInt(); @@ -158,35 +248,83 @@ class Zend_Search_Lucene $segSize = $segmentsFile->readInt(); $this->_docCount += $segSize; - $this->_segmentInfos[$count] = + $this->_segmentInfos[] = new Zend_Search_Lucene_Index_SegmentInfo($segName, $segSize, $this->_directory); } } - /** - * Object destructor + * Close current index and free resources */ - public function __destruct() + private function _close() { + if ($this->_closed) { + // index is already closed and resources are cleaned up + return; + } + $this->commit(); + // Free shared lock + $this->_lock->unlock(); + if ($this->_closeDirOnExit) { $this->_directory->close(); } + + $this->_directory = null; + $this->_writer = null; + $this->_segmentInfos = null; + + $this->_closed = true; + } + + /** + * Add reference to the index object + * + * @internal + */ + public function addReference() + { + $this->_refCount++; + } + + /** + * Remove reference from the index object + * + * When reference count becomes zero, index is closed and resources are cleaned up + * 
+ * @internal + */ + public function removeReference() + { + $this->_refCount--; + + if ($this->_refCount == 0) { + $this->_close(); + } + } + + /** + * Object destructor + */ + public function __destruct() + { + $this->_close(); } /** * Returns an instance of Zend_Search_Lucene_Index_Writer for the index * + * @internal * @return Zend_Search_Lucene_Index_Writer */ public function getIndexWriter() { if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) { - $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory); + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, $this->_segmentInfos); } return $this->_writer; @@ -205,7 +343,7 @@ class Zend_Search_Lucene /** - * Returns the total number of documents in this index. + * Returns the total number of documents in this index (including deleted documents). * * @return integer */ @@ -214,6 +352,192 @@ class Zend_Search_Lucene return $this->_docCount; } + /** + * Returns one greater than the largest possible document number. + * This may be used to, e.g., determine how big to allocate a structure which will have + * an element for every document number in an index. + * + * @return integer + */ + public function maxDoc() + { + return $this->count(); + } + + /** + * Returns the total number of non-deleted documents in this index. 
+ * + * @return integer + */ + public function numDocs() + { + $numDocs = 0; + + foreach ($this->_segmentInfos as $segmentInfo) { + $numDocs += $segmentInfo->numDocs(); + } + + return $numDocs; + } + + /** + * Checks, that document is deleted + * + * @param integer $id + * @return boolean + * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range + */ + public function isDeleted($id) + { + if ($id >= $this->_docCount) { + throw new Zend_Search_Lucene_Exception('Document id is out of the range.'); + } + + $segmentStartId = 0; + foreach ($this->_segmentInfos as $segmentInfo) { + if ($segmentStartId + $segmentInfo->count() > $id) { + break; + } + + $segmentStartId += $segmentInfo->count(); + } + + return $segmentInfo->isDeleted($id - $segmentStartId); + } + + /** + * Set default search field. + * + * Null means, that search is performed through all fields by default + * + * Default value is null + * + * @param string $fieldName + */ + public static function setDefaultSearchField($fieldName) + { + self::$_defaultSearchField = $fieldName; + } + + /** + * Get default search field. 
+ * + * Null means, that search is performed through all fields by default + * + * @return string + */ + public static function getDefaultSearchField() + { + return self::$_defaultSearchField; + } + + /** + * Retrieve index maxBufferedDocs option + * + * maxBufferedDocs is a minimal number of documents required before + * the buffered in-memory documents are written into a new Segment + * + * Default value is 10 + * + * @return integer + */ + public function getMaxBufferedDocs() + { + return $this->getIndexWriter()->maxBufferedDocs; + } + + /** + * Set index maxBufferedDocs option + * + * maxBufferedDocs is a minimal number of documents required before + * the buffered in-memory documents are written into a new Segment + * + * Default value is 10 + * + * @param integer $maxBufferedDocs + */ + public function setMaxBufferedDocs($maxBufferedDocs) + { + $this->getIndexWriter()->maxBufferedDocs = $maxBufferedDocs; + } + + /** + * Retrieve index maxMergeDocs option + * + * maxMergeDocs is a largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. + * Larger values are best for batched indexing and speedier searches. + * + * Default value is PHP_INT_MAX + * + * @return integer + */ + public function getMaxMergeDocs() + { + return $this->getIndexWriter()->maxMergeDocs; + } + + /** + * Set index maxMergeDocs option + * + * maxMergeDocs is a largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. + * Larger values are best for batched indexing and speedier searches. 
+ * + * Default value is PHP_INT_MAX + * + * @param integer $maxMergeDocs + */ + public function setMaxMergeDocs($maxMergeDocs) + { + $this->getIndexWriter()->maxMergeDocs = $maxMergeDocs; + } + + /** + * Retrieve index mergeFactor option + * + * mergeFactor determines how often segment indices are merged by addDocument(). + * With smaller values, less RAM is used while indexing, + * and searches on unoptimized indices are faster, + * but indexing speed is slower. + * With larger values, more RAM is used during indexing, + * and while searches on unoptimized indices are slower, + * indexing is faster. + * Thus larger values (> 10) are best for batch index creation, + * and smaller values (< 10) for indices that are interactively maintained. + * + * Default value is 10 + * + * @return integer + */ + public function getMergeFactor() + { + return $this->getIndexWriter()->mergeFactor; + } + + /** + * Set index mergeFactor option + * + * mergeFactor determines how often segment indices are merged by addDocument(). + * With smaller values, less RAM is used while indexing, + * and searches on unoptimized indices are faster, + * but indexing speed is slower. + * With larger values, more RAM is used during indexing, + * and while searches on unoptimized indices are slower, + * indexing is faster. + * Thus larger values (> 10) are best for batch index creation, + * and smaller values (< 10) for indices that are interactively maintained. + * + * Default value is 10 + * + * @param integer $maxMergeDocs + */ + public function setMergeFactor($mergeFactor) + { + $this->getIndexWriter()->mergeFactor = $mergeFactor; + } /** * Performs a query against the index and returns an array @@ -221,7 +545,8 @@ class Zend_Search_Lucene * Input is a string or Zend_Search_Lucene_Search_Query. 
* * @param mixed $query - * @return array ZSearchHit + * @return array Zend_Search_Lucene_Search_QueryHit + * @throws Zend_Search_Lucene_Exception */ public function find($query) { @@ -235,22 +560,115 @@ class Zend_Search_Lucene $this->commit(); - $hits = array(); + $hits = array(); $scores = array(); + $ids = array(); + + $query = $query->rewrite($this)->optimize($this); + + $query->execute($this); + + $topScore = 0; - $docNum = $this->count(); - for( $count=0; $count < $docNum; $count++ ) { - $docScore = $query->score( $count, $this); + foreach ($query->matchedDocs() as $id => $num) { + $docScore = $query->score($id, $this); if( $docScore != 0 ) { $hit = new Zend_Search_Lucene_Search_QueryHit($this); - $hit->id = $count; + $hit->id = $id; $hit->score = $docScore; - $hits[] = $hit; + $hits[] = $hit; + $ids[] = $id; $scores[] = $docScore; + + if ($docScore > $topScore) { + $topScore = $docScore; + } + } + } + + if (count($hits) == 0) { + // skip sorting, which may cause a error on empty index + return array(); + } + + if ($topScore > 1) { + foreach ($hits as $hit) { + $hit->score /= $topScore; + } + } + + if (func_num_args() == 1) { + // sort by scores + array_multisort($scores, SORT_DESC, SORT_NUMERIC, + $ids, SORT_ASC, SORT_NUMERIC, + $hits); + } else { + // sort by given field names + + $argList = func_get_args(); + $fieldNames = $this->getFieldNames(); + $sortArgs = array(); + + for ($count = 1; $count < count($argList); $count++) { + $fieldName = $argList[$count]; + + if (!is_string($fieldName)) { + throw new Zend_Search_Lucene_Exception('Field name must be a string.'); + } + + if (!in_array($fieldName, $fieldNames)) { + throw new Zend_Search_Lucene_Exception('Wrong field name.'); + } + + $valuesArray = array(); + foreach ($hits as $hit) { + try { + $value = $hit->getDocument()->getFieldValue($fieldName); + } catch (Zend_Search_Lucene_Exception $e) { + if (strpos($e->getMessage(), 'not found') === false) { + throw $e; + } else { + $value = null; + } + } + + 
$valuesArray[] = $value; + } + + $sortArgs[] = $valuesArray; + + if ($count + 1 < count($argList) && is_integer($argList[$count+1])) { + $count++; + $sortArgs[] = $argList[$count]; + + if ($count + 1 < count($argList) && is_integer($argList[$count+1])) { + $count++; + $sortArgs[] = $argList[$count]; + } else { + if ($argList[$count] == SORT_ASC || $argList[$count] == SORT_DESC) { + $sortArgs[] = SORT_REGULAR; + } else { + $sortArgs[] = SORT_ASC; + } + } + } else { + $sortArgs[] = SORT_ASC; + $sortArgs[] = SORT_REGULAR; + } } + + // Sort by id's if values are equal + $sortArgs[] = $ids; + $sortArgs[] = SORT_ASC; + $sortArgs[] = SORT_NUMERIC; + + // Array to be sorted + $sortArgs[] = &$hits; + + // Do sort + call_user_func_array('array_multisort', $sortArgs); } - array_multisort($scores, SORT_DESC, SORT_REGULAR, $hits); return $hits; } @@ -290,41 +708,45 @@ class Zend_Search_Lucene throw new Zend_Search_Lucene_Exception('Document id is out of the range.'); } - $segCount = 0; - $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count(); - while( $nextSegmentStartId <= $id ) { - $segCount++; - $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count(); + $segmentStartId = 0; + foreach ($this->_segmentInfos as $segmentInfo) { + if ($segmentStartId + $segmentInfo->count() > $id) { + break; + } + + $segmentStartId += $segmentInfo->count(); } - $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count(); - $fdxFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdx'); + $fdxFile = $segmentInfo->openCompoundFile('.fdx'); $fdxFile->seek( ($id-$segmentStartId)*8, SEEK_CUR ); $fieldValuesPosition = $fdxFile->readLong(); - $fdtFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdt'); - $fdtFile->seek( $fieldValuesPosition, SEEK_CUR ); + $fdtFile = $segmentInfo->openCompoundFile('.fdt'); + $fdtFile->seek($fieldValuesPosition, SEEK_CUR); $fieldCount = $fdtFile->readVInt(); $doc = new Zend_Search_Lucene_Document(); - for( $count = 0; 
$count < $fieldCount; $count++ ) { + for ($count = 0; $count < $fieldCount; $count++) { $fieldNum = $fdtFile->readVInt(); $bits = $fdtFile->readByte(); - $fieldInfo = $this->_segmentInfos[ $segCount ]->getField($fieldNum); + $fieldInfo = $segmentInfo->getField($fieldNum); - if( !($bits & 2) ) { // Text data + if (!($bits & 2)) { // Text data $field = new Zend_Search_Lucene_Field($fieldInfo->name, $fdtFile->readString(), + 'UTF-8', true, $fieldInfo->isIndexed, $bits & 1 ); - } else { + } else { // Binary data $field = new Zend_Search_Lucene_Field($fieldInfo->name, $fdtFile->readBinary(), + '', true, $fieldInfo->isIndexed, - $bits & 1 ); + $bits & 1, + true ); } $doc->addField($field); @@ -335,7 +757,26 @@ class Zend_Search_Lucene /** - * Returns an array of all the documents which contain term. + * Returns true if index contain documents with specified term. + * + * Is used for query optimization. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return boolean + */ + public function hasTerm(Zend_Search_Lucene_Index_Term $term) + { + foreach ($this->_segmentInfos as $segInfo) { + if ($segInfo->getTermInfo($term) instanceof Zend_Search_Lucene_Index_TermInfo) { + return true; + } + } + + return false; + } + + /** + * Returns IDs of all the documents containing term. * * @param Zend_Search_Lucene_Index_Term $term * @return array @@ -377,55 +818,40 @@ class Zend_Search_Lucene /** - * Returns an array of all term positions in the documents. - * Return array structure: array( docId => array( pos1, pos2, ...), ...) + * Returns an array of all term freqs. + * Result array structure: array(docId => freq, ...) 
* * @param Zend_Search_Lucene_Index_Term $term - * @return array + * @return integer */ - public function termPositions(Zend_Search_Lucene_Index_Term $term) + public function termFreqs(Zend_Search_Lucene_Index_Term $term) { $result = array(); $segmentStartDocId = 0; - foreach( $this->_segmentInfos as $segInfo ) { - $termInfo = $segInfo->getTermInfo($term); - - if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { - $segmentStartDocId += $segInfo->count(); - continue; - } - - $frqFile = $segInfo->openCompoundFile('.frq'); - $frqFile->seek($termInfo->freqPointer,SEEK_CUR); - $freqs = array(); - $docId = 0; - - for( $count = 0; $count < $termInfo->docFreq; $count++ ) { - $docDelta = $frqFile->readVInt(); - if( $docDelta % 2 == 1 ) { - $docId += ($docDelta-1)/2; - $freqs[ $docId ] = 1; - } else { - $docId += $docDelta/2; - $freqs[ $docId ] = $frqFile->readVInt(); - } - } + foreach ($this->_segmentInfos as $segmentInfo) { + $result += $segmentInfo->termFreqs($term, $segmentStartDocId); - $prxFile = $segInfo->openCompoundFile('.prx'); - $prxFile->seek($termInfo->proxPointer,SEEK_CUR); - foreach ($freqs as $docId => $freq) { - $termPosition = 0; - $positions = array(); + $segmentStartDocId += $segmentInfo->count(); + } - for ($count = 0; $count < $freq; $count++ ) { - $termPosition += $prxFile->readVInt(); - $positions[] = $termPosition; - } + return $result; + } - $result[ $segmentStartDocId + $docId ] = $positions; - } + /** + * Returns an array of all term positions in the documents. + * Result array structure: array(docId => array(pos1, pos2, ...), ...) 
+ * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termPositions(Zend_Search_Lucene_Index_Term $term) + { + $result = array(); + $segmentStartDocId = 0; + foreach ($this->_segmentInfos as $segmentInfo) { + $result += $segmentInfo->termPositions($term, $segmentStartDocId); - $segmentStartDocId += $segInfo->count(); + $segmentStartDocId += $segmentInfo->count(); } return $result; @@ -468,9 +894,9 @@ class Zend_Search_Lucene * * @param integer $id * @param string $fieldName - * @return Zend_Search_Lucene_Document + * @return float */ - public function norm( $id, $fieldName ) + public function norm($id, $fieldName) { if ($id >= $this->_docCount) { return null; @@ -527,16 +953,17 @@ class Zend_Search_Lucene throw new Zend_Search_Lucene_Exception('Document id is out of the range.'); } - $segCount = 0; - $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count(); - while( $nextSegmentStartId <= $id ) { - $segCount++; - $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count(); + $segmentStartId = 0; + foreach ($this->_segmentInfos as $segmentInfo) { + if ($segmentStartId + $segmentInfo->count() > $id) { + break; + } + + $segmentStartId += $segmentInfo->count(); } + $segmentInfo->delete($id - $segmentStartId); $this->_hasChanges = true; - $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count(); - $this->_segmentInfos[ $segCount ]->delete($id - $segmentStartId); } @@ -548,18 +975,26 @@ class Zend_Search_Lucene */ public function addDocument(Zend_Search_Lucene_Document $document) { - if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) { - $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory); - } - - $this->_writer->addDocument($document); + $this->getIndexWriter()->addDocument($document); + $this->_docCount++; } + /** + * Update document counter + */ + private function _updateDocCount() + { + $this->_docCount = 0; + foreach ($this->_segmentInfos as $segInfo) { + 
$this->_docCount += $segInfo->count(); + } + } + /** * Commit changes resulting from delete() or undeleteAll() operations. * - * @todo delete() and undeleteAll processing. + * @todo undeleteAll processing. */ public function commit() { @@ -572,38 +1007,73 @@ class Zend_Search_Lucene } if ($this->_writer !== null) { - foreach ($this->_writer->commit() as $segmentName => $segmentInfo) { - if ($segmentInfo !== null) { - $this->_segmentInfos[] = $segmentInfo; - $this->_docCount += $segmentInfo->count(); - } else { - foreach ($this->_segmentInfos as $segId => $segInfo) { - if ($segInfo->getName() == $segmentName) { - unset($this->_segmentInfos[$segId]); - } - } - } - } + $this->_writer->commit(); + + $this->_updateDocCount(); } } - /************************************************************************* - @todo UNIMPLEMENTED - *************************************************************************/ + /** + * Optimize index. + * + * Merges all segments into one + */ + public function optimize() + { + // Commit changes if any changes have been made + $this->commit(); + + if (count($this->_segmentInfos) > 1 || $this->hasDeletions()) { + $this->getIndexWriter()->optimize(); + $this->_updateDocCount(); + } + } + /** * Returns an array of all terms in this index. 
* - * @todo Implementation * @return array */ public function terms() { - return array(); + $result = array(); + + $segmentInfoQueue = new Zend_Search_Lucene_Index_SegmentInfoPriorityQueue(); + + foreach ($this->_segmentInfos as $segmentInfo) { + $segmentInfo->reset(); + + // Skip "empty" segments + if ($segmentInfo->currentTerm() !== null) { + $segmentInfoQueue->put($segmentInfo); + } + } + + while (($segmentInfo = $segmentInfoQueue->pop()) !== null) { + if ($segmentInfoQueue->top() === null || + $segmentInfoQueue->top()->currentTerm()->key() != + $segmentInfo->currentTerm()->key()) { + // We got new term + $result[] = $segmentInfo->currentTerm(); + } + + $segmentInfo->nextTerm(); + // check, if segment dictionary is finished + if ($segmentInfo->currentTerm() !== null) { + // Put segment back into the priority queue + $segmentInfoQueue->put($segmentInfo); + } + } + + return $result; } + /************************************************************************* + @todo UNIMPLEMENTED + *************************************************************************/ /** * Undeletes all documents currently marked as deleted in this index. * @@ -611,4 +1081,4 @@ class Zend_Search_Lucene */ public function undeleteAll() {} -} \ No newline at end of file +} diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer.php b/search/Zend/Search/Lucene/Analysis/Analyzer.php index febf88e614..e57f6a5ed5 100644 --- a/search/Zend/Search/Lucene/Analysis/Analyzer.php +++ b/search/Zend/Search/Lucene/Analysis/Analyzer.php @@ -15,20 +15,37 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Analysis_Token */ -require_once 'Zend/Search/Lucene/Analysis/Token.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Token.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php'; /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */ -require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php'; /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */ -require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php'; + +/** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php'; +/** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php'; /** @@ -44,7 +61,7 @@ require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.p * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. 
(http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -55,16 +72,74 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer * * @var Zend_Search_Lucene_Analysis_Analyzer */ - static private $_defaultImpl; + private static $_defaultImpl; + + /** + * Input string + * + * @var string + */ + protected $_input = null; + + /** + * Input string encoding + * + * @var string + */ + protected $_encoding = ''; /** * Tokenize text to a terms * Returns array of Zend_Search_Lucene_Analysis_Token objects * + * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding) + * * @param string $data * @return array */ - abstract public function tokenize($data); + public function tokenize($data, $encoding = '') + { + $this->setInput($data, $encoding); + + $tokenList = array(); + while (($nextToken = $this->nextToken()) !== null) { + $tokenList[] = $nextToken; + } + + return $tokenList; + } + + + /** + * Tokenization stream API + * Set input + * + * @param string $data + */ + public function setInput($data, $encoding = '') + { + $this->_input = $data; + $this->_encoding = $encoding; + $this->reset(); + } + + /** + * Reset token stream + */ + abstract public function reset(); + + /** + * Tokenization stream API + * Get next token + * Returns null at the end of stream + * + * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding) + * + * @return Zend_Search_Lucene_Analysis_Token|null + */ + abstract public function nextToken(); + + /** @@ -72,7 +147,7 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer * * @param Zend_Search_Lucene_Analysis_Analyzer $similarity */ - static public function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer) + public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer) { self::$_defaultImpl = $analyzer; } @@ -83,7 +158,7 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer * 
* @return Zend_Search_Lucene_Analysis_Analyzer */ - static public function getDefault() + public static function getDefault() { if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) { self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive(); @@ -91,6 +166,5 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer return self::$_defaultImpl; } - } diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php index 2ad8a0516b..c518e93071 100644 --- a/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php @@ -15,13 +15,13 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Analysis_Analyzer */ -require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer.php'; /** @@ -34,7 +34,7 @@ require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_Lucene_Analysis_Analyzer @@ -58,7 +58,7 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_L } /** - * Apply filters to the token. + * Apply filters to the token. Can return null when the token was removed. 
* * @param Zend_Search_Lucene_Analysis_Token $token * @return Zend_Search_Lucene_Analysis_Token @@ -67,6 +67,11 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_L { foreach ($this->_filters as $filter) { $token = $filter->normalize($token); + + // resulting token can be null if the filter removed it + if (is_null($token)) { + return null; + } } return $token; diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php index 6f6f0dd936..d084ebc4c5 100644 --- a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php @@ -15,64 +15,79 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Analysis_Analyzer_Common */ -require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Analysis_Analyzer_Common_Text extends Zend_Search_Lucene_Analysis_Analyzer_Common { /** - * Tokenize text to a terms - * Returns array of Zend_Search_Lucene_Analysis_Token objects + * Current position in a stream * - * @param string $data - * @return array + * @var integer */ - public function tokenize($data) + private $_position; + + /** + * Reset token stream + */ + public function reset() { - $tokenStream = array(); + $this->_position = 0; - $position = 0; - while ($position < strlen($data)) { - // skip white space - while ($position < strlen($data) && !ctype_alpha( $data{$position} )) { - $position++; - } + if ($this->_input === null) { + return; + } - $termStartPosition = $position; + // convert input into ascii + $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input); + $this->_encoding = 'ASCII'; + } - // read token - while ($position < strlen($data) && ctype_alpha( $data{$position} )) { - $position++; - } + /** + * Tokenization stream API + * Get next token + * Returns null at the end of stream + * + * @return Zend_Search_Lucene_Analysis_Token|null + */ + public function nextToken() + { + if ($this->_input === null) { + return null; + } - // Empty token, end of stream. - if ($position == $termStartPosition) { - break; + + do { + if (! preg_match('/[a-zA-Z]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position)) { + // It covers both cases a) there are no matches (preg_match(...) === 0) + // b) error occured (preg_match(...) 
=== FALSE) + return null; } - $token = new Zend_Search_Lucene_Analysis_Token(substr($data, - $termStartPosition, - $position-$termStartPosition), - $termStartPosition, - $position); - $tokenStream[] = $this->normalize($token); - } + $str = $match[0][0]; + $pos = $match[0][1]; + $endpos = $pos + strlen($str); + + $this->_position = $endpos; + + $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($str, $pos, $endpos)); + } while ($token === null); // try again if token is skipped - return $tokenStream; + return $token; } } diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php index e5fc372628..d9f786a870 100644 --- a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php @@ -15,23 +15,23 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */ -require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php'; /** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */ -require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php new file mode 100644 index 0000000000..d68b594a07 --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php @@ -0,0 +1,92 @@ +dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum extends Zend_Search_Lucene_Analysis_Analyzer_Common
+{
+    /**
+     * Offset in the input at which the search for the next token starts
+     *
+     * @var integer
+     */
+    private $_position;
+
+    /**
+     * Rewind the token stream
+     *
+     * A non-null input is passed through iconv() with the 'ASCII//TRANSLIT'
+     * target and the stored encoding name is switched to 'ASCII'.
+     */
+    public function reset()
+    {
+        $this->_position = 0;
+
+        if ($this->_input !== null) {
+            // convert input into ascii
+            $this->_input    = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
+            $this->_encoding = 'ASCII';
+        }
+    }
+
+    /**
+     * Tokenization stream API
+     * Fetch the next run of ASCII letters/digits as a token.
+     * Returns null once the input is exhausted (or when no input was set).
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function nextToken()
+    {
+        if ($this->_input === null) {
+            return null;
+        }
+
+        while (true) {
+            $found = preg_match('/[a-zA-Z0-9]+/', $this->_input, $hit, PREG_OFFSET_CAPTURE, $this->_position);
+            if (!$found) {
+                // Either nothing is left to match (0) or preg_match() failed (false)
+                return null;
+            }
+
+            list($termText, $startOffset) = $hit[0];
+            $endOffset = $startOffset + strlen($termText);
+
+            // the next search continues right behind this match
+            $this->_position = $endOffset;
+
+            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($termText, $startOffset, $endOffset));
+            if ($token !== null) {
+                return $token;
+            }
+            // a filter dropped the token: look for the following one
+        }
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php new file mode 100644 index 0000000000..6eab437229 --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php @@ -0,0 +1,46 @@ +dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
+
+/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum
+{
+    /**
+     * Registers the lower-case token filter on the letters-and-digits analyzer.
+     */
+    public function __construct()
+    {
+        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase();
+        $this->addFilter($lowerCaseFilter);
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php new file mode 100644 index 0000000000..674a3d9e64 --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php @@ -0,0 +1,169 @@ +dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc.
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +
+class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
+{
+    /**
+     * Current char position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_position;
+
+    /**
+     * Current binary position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_bytePosition;
+
+    /**
+     * Stream length (in characters, as reported by iconv_strlen())
+     *
+     * @var integer
+     */
+    private $_streamLength;
+
+    /**
+     * Reset token stream
+     *
+     * Rewinds both position counters, converts the input to UTF-8 when it was
+     * supplied in another encoding and caches its length in characters.
+     */
+    public function reset()
+    {
+        $this->_position     = 0;
+        $this->_bytePosition = 0;
+
+        if ($this->_input === null) {
+            // Nothing to tokenize; nextToken() returns null for a null input
+            $this->_streamLength = 0;
+            return;
+        }
+
+        // convert input into UTF-8
+        if (strcasecmp($this->_encoding, 'utf8' ) != 0  &&
+            strcasecmp($this->_encoding, 'utf-8') != 0 ) {
+            $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
+            $this->_encoding = 'UTF-8';
+        }
+
+        // Get UTF-8 string length.
+        // iconv_strlen() returns false for a malformed UTF-8 string; such input
+        // is treated as an empty stream, so no tokens are produced from it.
+        $length = iconv_strlen($this->_input, 'UTF-8');
+        $this->_streamLength = ($length === false) ? 0 : $length;
+    }
+
+    /**
+     * Check, that character is a letter
+     *
+     * Every multi-byte sequence is accepted as a letter; single bytes are
+     * checked with ctype_alpha().
+     *
+     * @param string $char
+     * @return boolean
+     */
+    private static function _isAlpha($char)
+    {
+        if (strlen($char) > 1) {
+            // It's an UTF-8 character
+            return true;
+        }
+
+        return ctype_alpha($char);
+    }
+
+    /**
+     * Get next UTF-8 char
+     *
+     * Reads one character at the current byte position and advances both the
+     * byte position and the character position past it.
+     *
+     * @return string
+     */
+    private function _nextChar()
+    {
+        $char = $this->_input[$this->_bytePosition++];
+
+        if (( ord($char) & 0xC0 ) == 0xC0) {
+            // Lead byte of a multi-byte sequence: bits 0x20 and 0x10 select
+            // how many continuation bytes follow (1, 2 or 3)
+            $addBytes = 1;
+            if (ord($char) & 0x20 ) {
+                $addBytes++;
+                if (ord($char) & 0x10 ) {
+                    $addBytes++;
+                }
+            }
+            $char .= substr($this->_input, $this->_bytePosition, $addBytes);
+            $this->_bytePosition += $addBytes;
+        }
+
+        $this->_position++;
+
+        return $char;
+    }
+
+    /**
+     * Tokenization stream API
+     * Get next token
+     * Returns null at the end of stream
+     *
+     * Token offsets are character positions; the end offset points just behind
+     * the last character of the term.
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function nextToken()
+    {
+        if ($this->_input === null) {
+            return null;
+        }
+
+        while ($this->_position < $this->_streamLength) {
+            // skip white space (any run of non-letter characters)
+            $char = '';
+            while ($this->_position < $this->_streamLength) {
+                $char = $this->_nextChar();
+                if (self::_isAlpha($char)) {
+                    break;
+                }
+                $char = '';
+            }
+
+            // Empty token, end of stream.
+            if ($char === '') {
+                return null;
+            }
+
+            $termStartPosition = $this->_position - 1;
+            $termEndPosition   = $this->_position;
+            $termText          = $char;
+
+            // read token
+            while ($this->_position < $this->_streamLength) {
+                $char = $this->_nextChar();
+                if (!self::_isAlpha($char)) {
+                    // The separator is consumed but does not belong to the term
+                    break;
+                }
+                $termText       .= $char;
+                // Remember the position behind the last accepted character, so the
+                // end offset is also right when the term runs up to the end of stream
+                $termEndPosition = $this->_position;
+            }
+
+            $token = new Zend_Search_Lucene_Analysis_Token(
+                                      $termText,
+                                      $termStartPosition,
+                                      $termEndPosition);
+            $token = $this->normalize($token);
+            if ($token !== null) {
+                return $token;
+            }
+            // Continue if token is skipped
+        }
+
+        return null;
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php new file mode 100644 index 0000000000..982b55418f --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php @@ -0,0 +1,169 @@ +dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc.
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +
+class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num extends Zend_Search_Lucene_Analysis_Analyzer_Common
+{
+    /**
+     * Current char position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_position;
+
+    /**
+     * Current binary position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_bytePosition;
+
+    /**
+     * Stream length (in characters, as reported by iconv_strlen())
+     *
+     * @var integer
+     */
+    private $_streamLength;
+
+    /**
+     * Reset token stream
+     *
+     * Rewinds both position counters, converts the input to UTF-8 when it was
+     * supplied in another encoding and caches its length in characters.
+     */
+    public function reset()
+    {
+        $this->_position     = 0;
+        $this->_bytePosition = 0;
+
+        if ($this->_input === null) {
+            // Nothing to tokenize; nextToken() returns null for a null input
+            $this->_streamLength = 0;
+            return;
+        }
+
+        // convert input into UTF-8
+        if (strcasecmp($this->_encoding, 'utf8' ) != 0  &&
+            strcasecmp($this->_encoding, 'utf-8') != 0 ) {
+            $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
+            $this->_encoding = 'UTF-8';
+        }
+
+        // Get UTF-8 string length.
+        // iconv_strlen() returns false for a malformed UTF-8 string; such input
+        // is treated as an empty stream, so no tokens are produced from it.
+        $length = iconv_strlen($this->_input, 'UTF-8');
+        $this->_streamLength = ($length === false) ? 0 : $length;
+    }
+
+    /**
+     * Check, that character is a letter or a digit
+     *
+     * Every multi-byte sequence is accepted; single bytes are checked with
+     * ctype_alnum().
+     *
+     * @param string $char
+     * @return boolean
+     */
+    private static function _isAlNum($char)
+    {
+        if (strlen($char) > 1) {
+            // It's an UTF-8 character
+            return true;
+        }
+
+        return ctype_alnum($char);
+    }
+
+    /**
+     * Get next UTF-8 char
+     *
+     * Reads one character at the current byte position and advances both the
+     * byte position and the character position past it.
+     *
+     * @return string
+     */
+    private function _nextChar()
+    {
+        $char = $this->_input[$this->_bytePosition++];
+
+        if (( ord($char) & 0xC0 ) == 0xC0) {
+            // Lead byte of a multi-byte sequence: bits 0x20 and 0x10 select
+            // how many continuation bytes follow (1, 2 or 3)
+            $addBytes = 1;
+            if (ord($char) & 0x20 ) {
+                $addBytes++;
+                if (ord($char) & 0x10 ) {
+                    $addBytes++;
+                }
+            }
+            $char .= substr($this->_input, $this->_bytePosition, $addBytes);
+            $this->_bytePosition += $addBytes;
+        }
+
+        $this->_position++;
+
+        return $char;
+    }
+
+    /**
+     * Tokenization stream API
+     * Get next token
+     * Returns null at the end of stream
+     *
+     * Token offsets are character positions; the end offset points just behind
+     * the last character of the term.
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function nextToken()
+    {
+        if ($this->_input === null) {
+            return null;
+        }
+
+        while ($this->_position < $this->_streamLength) {
+            // skip white space (any run of characters that are neither letters nor digits)
+            $char = '';
+            while ($this->_position < $this->_streamLength) {
+                $char = $this->_nextChar();
+                if (self::_isAlNum($char)) {
+                    break;
+                }
+                $char = '';
+            }
+
+            // Empty token, end of stream.
+            if ($char === '') {
+                return null;
+            }
+
+            $termStartPosition = $this->_position - 1;
+            $termEndPosition   = $this->_position;
+            $termText          = $char;
+
+            // read token
+            while ($this->_position < $this->_streamLength) {
+                $char = $this->_nextChar();
+                if (!self::_isAlNum($char)) {
+                    // The separator is consumed but does not belong to the term
+                    break;
+                }
+                $termText       .= $char;
+                // Remember the position behind the last accepted character, so the
+                // end offset is also right when the term runs up to the end of stream
+                $termEndPosition = $this->_position;
+            }
+
+            $token = new Zend_Search_Lucene_Analysis_Token(
+                                      $termText,
+                                      $termStartPosition,
+                                      $termEndPosition);
+            $token = $this->normalize($token);
+            if ($token !== null) {
+                return $token;
+            }
+            // Continue if token is skipped
+        }
+
+        return null;
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Analysis/Token.php b/search/Zend/Search/Lucene/Analysis/Token.php index f2e9ee7cad..91586b01f7 100644 --- a/search/Zend/Search/Lucene/Analysis/Token.php +++ b/search/Zend/Search/Lucene/Analysis/Token.php @@ -15,7 +15,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -24,7 +24,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Analysis_Token @@ -50,13 +50,6 @@ class Zend_Search_Lucene_Analysis_Token */ private $_endOffset; - /** - * Lexical type. - * - * @var string - */ - private $_type; - /** * The position of this token relative to the previous Token.
* @@ -90,12 +83,11 @@ class Zend_Search_Lucene_Analysis_Token * @param integer $end * @param string $type */ - public function __construct($text, $start, $end, $type = 'word' ) + public function __construct($text, $start, $end) { $this->_termText = $text; $this->_startOffset = $start; $this->_endOffset = $end; - $this->_type = $type; $this->_positionIncrement = 1; } @@ -157,15 +149,5 @@ class Zend_Search_Lucene_Analysis_Token { return $this->_endOffset; } - - /** - * Returns this Token's lexical type. Defaults to 'word'. - * - * @return string - */ - public function getType() - { - return $this->_type; - } } diff --git a/search/Zend/Search/Lucene/Analysis/TokenFilter.php b/search/Zend/Search/Lucene/Analysis/TokenFilter.php index a363aa1c1c..4d559a93b7 100644 --- a/search/Zend/Search/Lucene/Analysis/TokenFilter.php +++ b/search/Zend/Search/Lucene/Analysis/TokenFilter.php @@ -15,13 +15,13 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Analysis_Token */ -require_once 'Zend/Search/Lucene/Analysis/Token.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Token.php'; /** @@ -30,7 +30,7 @@ require_once 'Zend/Search/Lucene/Analysis/Token.php'; * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ diff --git a/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php b/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php index 5ea1edf832..01d25c6745 100644 --- a/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php +++ b/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php @@ -15,13 +15,13 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Analysis_TokenFilter */ -require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter.php'; /** @@ -30,7 +30,7 @@ require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php'; * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -44,10 +44,10 @@ class Zend_Search_Lucene_Analysis_TokenFilter_LowerCase extends Zend_Search_Luce */ public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) { - $newToken = new Zend_Search_Lucene_Analysis_Token(strtolower( $srcToken->getTermText() ), + $newToken = new Zend_Search_Lucene_Analysis_Token( + strtolower( $srcToken->getTermText() ), $srcToken->getStartOffset(), - $srcToken->getEndOffset(), - $srcToken->getType()); + $srcToken->getEndOffset()); $newToken->setPositionIncrement($srcToken->getPositionIncrement()); diff --git a/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php b/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php new file mode 100644 index 0000000000..83abfb22c3 --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php @@ -0,0 +1,68 @@ +dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter.php';
+
+
+/**
+ * Token filter that removes short words. What counts as a short word is configured through the constructor.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_TokenFilter_ShortWords extends Zend_Search_Lucene_Analysis_TokenFilter
+{
+    /**
+     * Minimum allowed term length
+     * @var integer
+     */
+    private $length;
+
+    /**
+     * Constructs new instance of this filter.
+     *
+     * @param integer $length  minimum allowed length of term which passes this filter (default 2)
+     */
+    public function __construct($length = 2) {
+        $this->length = $length;
+    }
+
+    /**
+     * Normalize Token or remove it (if null is returned)
+     *
+     * @param Zend_Search_Lucene_Analysis_Token $srcToken
+     * @return Zend_Search_Lucene_Analysis_Token|null  null when the term is shorter than the minimum
+     */
+    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
+        // NOTE(review): strlen() counts bytes, so a multi-byte UTF-8 term is measured
+        // in bytes rather than characters - confirm that this is intended
+        if (strlen($srcToken->getTermText()) < $this->length) {
+            return null;
+        }
+
+        return $srcToken;
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php b/search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php new file mode 100644 index 0000000000..f85d5d55d8 --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php @@ -0,0 +1,101 @@ +dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Exception.php';
+
+
+/**
+ * Token filter that removes stop words. These words must be provided as a plain list, example:
+ * $stopwords = array('the', 'an', 'a');
+ * (the constructor flips the list, so the words become the keys of the lookup set)
+ *
+ * We do recommend to provide all words in lowercase and concatenate this class after the lowercase filter.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_TokenFilter_StopWords extends Zend_Search_Lucene_Analysis_TokenFilter
+{
+    /**
+     * Stop words set: the words are the array keys, the values are not used
+     * @var array
+     */
+    private $_stopSet;
+
+    /**
+     * Constructs new instance of this filter.
+     *
+     * @param array $stopwords list of words that will be filtered out
+     */
+    public function __construct($stopwords = array()) {
+        // array_flip() turns the list values into keys for array_key_exists() lookups
+        $this->_stopSet = array_flip($stopwords);
+    }
+
+    /**
+     * Normalize Token or remove it (if null is returned)
+     *
+     * @param Zend_Search_Lucene_Analysis_Token $srcToken
+     * @return Zend_Search_Lucene_Analysis_Token|null  null when the term is a stop word
+     */
+    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
+        if (array_key_exists($srcToken->getTermText(), $this->_stopSet)) {
+            return null;
+        }
+
+        return $srcToken;
+    }
+
+    /**
+     * Fills stopwords set from a text file. Each line contains one stopword; lines are trimmed first and
+     * lines that are empty or start with '#' after trimming are ignored (as comments).
+     *
+     * You can call this method one or more times. New stopwords are always added to current set.
+     *
+     * @param string $filepath full path for text file with stopwords
+     * @throws Zend_Search_Exception When the file doesn`t exists or is not readable.
+     */
+    public function loadFromFile($filepath = null) {
+        if (! $filepath || ! file_exists($filepath)) {
+            throw new Zend_Search_Exception('You have to provide valid file path');
+        }
+        $fd = fopen($filepath, "r");
+        if (! $fd) {
+            throw new Zend_Search_Exception('Cannot open file ' . $filepath);
+        }
+        // fgets() returns false at end of file and on a read error; both end the loop
+        while (($line = fgets($fd)) !== false) {
+            $buffer = trim($line);
+            if (strlen($buffer) > 0 && $buffer[0] != '#') {
+                $this->_stopSet[$buffer] = 1;
+            }
+        }
+        if (!fclose($fd)) {
+            throw new Zend_Search_Exception('Cannot close file ' . $filepath);
+        }
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Document.php b/search/Zend/Search/Lucene/Document.php index 48e48cf17a..6309719568 100644 --- a/search/Zend/Search/Lucene/Document.php +++ b/search/Zend/Search/Lucene/Document.php @@ -15,13 +15,13 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Document - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc.
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Field */ -require_once 'Zend/Search/Lucene/Field.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Field.php'; /** @@ -30,7 +30,7 @@ require_once 'Zend/Search/Lucene/Field.php'; * @category Zend * @package Zend_Search_Lucene * @subpackage Document - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Document @@ -90,9 +90,9 @@ class Zend_Search_Lucene_Document */ public function getField($fieldName) { - if (!array_key_exists($fieldName, $this->_fields)) { - throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document."); - } + if (!array_key_exists($fieldName, $this->_fields)) { + throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document."); + } return $this->_fields[$fieldName]; } @@ -105,7 +105,17 @@ class Zend_Search_Lucene_Document */ public function getFieldValue($fieldName) { - return $this->getField($fieldName)->stringValue; + return $this->getField($fieldName)->value; } + /** + * Returns the string value of a named field in UTF-8 encoding. + * + * @see __get() + * @return string + */ + public function getFieldUtf8Value($fieldName) + { + return $this->getField($fieldName)->getUtf8Value(); + } } diff --git a/search/Zend/Search/Lucene/Document/Html.php b/search/Zend/Search/Lucene/Document/Html.php new file mode 100644 index 0000000000..c10c8236c5 --- /dev/null +++ b/search/Zend/Search/Lucene/Document/Html.php @@ -0,0 +1,310 @@ +dirroot.'/search/Zend/Search/Lucene/Document.php'; + + +/** + * HTML document. + * + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Document + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */
+class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
+{
+    /**
+     * List of document links
+     *
+     * @var array
+     */
+    private $_links = array();
+
+    /**
+     * List of document header links
+     *
+     * @var array
+     */
+    private $_headerLinks = array();
+
+    /**
+     * Stored DOM representation
+     *
+     * @var DOMDocument
+     */
+    private $_doc;
+
+    /**
+     * Object constructor
+     *
+     * Parses the HTML and adds the 'title' field, one field per named meta tag
+     * and the 'body' field; collects the href values of the <a> elements and
+     * of the <link> elements in the head.
+     *
+     * @param string  $data          HTML source, or a file name when $isFile is true
+     * @param boolean $isFile
+     * @param boolean $storeContent  true adds the body as a Text field, false as an UnStored field
+     */
+    private function __construct($data, $isFile, $storeContent)
+    {
+        $this->_doc = new DOMDocument();
+        $this->_doc->substituteEntities = true;
+
+        // Parser warnings are suppressed on purpose
+        if ($isFile) {
+            @$this->_doc->loadHTMLFile($data);
+        } else {
+            @$this->_doc->loadHTML($data);
+        }
+
+        $xpath = new DOMXPath($this->_doc);
+
+        $docTitle = '';
+        $titleNodes = $xpath->query('/html/head/title');
+        foreach ($titleNodes as $titleNode) {
+            // title should always have only one entry, but we process all nodeset entries
+            $docTitle .= $titleNode->nodeValue . ' ';
+        }
+        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, $this->_doc->actualEncoding));
+
+        $metaNodes = $xpath->query('/html/head/meta[@name]');
+        foreach ($metaNodes as $metaNode) {
+            $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
+                                                           $metaNode->getAttribute('content'),
+                                                           $this->_doc->actualEncoding));
+        }
+
+        $docBody = '';
+        $bodyNodes = $xpath->query('/html/body');
+        foreach ($bodyNodes as $bodyNode) {
+            // body should always have only one entry, but we process all nodeset entries
+            $this->_retrieveNodeText($bodyNode, $docBody);
+        }
+        if ($storeContent) {
+            $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, $this->_doc->actualEncoding));
+        } else {
+            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, $this->_doc->actualEncoding));
+        }
+
+        $linkNodes = $this->_doc->getElementsByTagName('a');
+        foreach ($linkNodes as $linkNode) {
+            if (($href = $linkNode->getAttribute('href')) != '') {
+                $this->_links[] = $href;
+            }
+        }
+        $this->_links = array_unique($this->_links);
+
+        $linkNodes = $xpath->query('/html/head/link');
+        foreach ($linkNodes as $linkNode) {
+            if (($href = $linkNode->getAttribute('href')) != '') {
+                $this->_headerLinks[] = $href;
+            }
+        }
+        $this->_headerLinks = array_unique($this->_headerLinks);
+    }
+
+    /**
+     * Get node text
+     *
+     * Appends the value of every text node below $node to $text, each followed by a space.
+     * We should exclude scripts, which may be not included into comment tags, CDATA sections,
+     *
+     * @param DOMNode $node
+     * @param string &$text
+     */
+    private function _retrieveNodeText(DOMNode $node, &$text)
+    {
+        if ($node->nodeType == XML_TEXT_NODE) {
+            $text .= $node->nodeValue ;
+            $text .= ' ';
+        } else if ($node->nodeType == XML_ELEMENT_NODE  &&  $node->nodeName != 'script') {
+            foreach ($node->childNodes as $childNode) {
+                $this->_retrieveNodeText($childNode, $text);
+            }
+        }
+    }
+
+    /**
+     * Get document HREF links
+     *
+     * @return array
+     */
+    public function getLinks()
+    {
+        return $this->_links;
+    }
+
+    /**
+     * Get document header links
+     *
+     * @return array
+     */
+    public function getHeaderLinks()
+    {
+        return $this->_headerLinks;
+    }
+
+    /**
+     * Load HTML document from a string
+     *
+     * @param string  $data
+     * @param boolean $storeContent
+     * @return Zend_Search_Lucene_Document_Html
+     */
+    public static function loadHTML($data, $storeContent = false)
+    {
+        return new Zend_Search_Lucene_Document_Html($data, false, $storeContent);
+    }
+
+    /**
+     * Load HTML document from a file
+     *
+     * @param string  $file
+     * @param boolean $storeContent
+     * @return Zend_Search_Lucene_Document_Html
+     */
+    public static function loadHTMLFile($file, $storeContent = false)
+    {
+        return new Zend_Search_Lucene_Document_Html($file, true, $storeContent);
+    }
+
+
+    /**
+     * Highlight text in text node
+     *
+     * @param DOMText $node
+     * @param array   $wordsToHighlight  lookup set, the terms are the array keys
+     * @param string  $color
+     */
+    public function _highlightTextNode(DOMText $node, $wordsToHighlight, $color)
+    {
+        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
+        $analyzer->setInput($node->nodeValue, $this->_doc->encoding);
+
+        $matchedTokens = array();
+
+        while (($token = $analyzer->nextToken()) !== null) {
+            if (isset($wordsToHighlight[$token->getTermText()])) {
+                $matchedTokens[] = $token;
+            }
+        }
+
+        if (count($matchedTokens) == 0) {
+            return;
+        }
+
+        // Work from the last match to the first one, so that splitting the node
+        // does not shift the offsets of the matches still to be processed
+        $matchedTokens = array_reverse($matchedTokens);
+
+        foreach ($matchedTokens as $token) {
+            // Cut text after matched token
+            $node->splitText($token->getEndOffset());
+
+            // Cut matched node
+            $matchedWordNode = $node->splitText($token->getStartOffset());
+
+            $highlightedNode = $this->_doc->createElement('b', $matchedWordNode->nodeValue);
+            $highlightedNode->setAttribute('style', 'color:black;background-color:' . $color);
+
+            $node->parentNode->replaceChild($highlightedNode, $matchedWordNode);
+        }
+    }
+
+
+    /**
+     * highlight words in content of the specified node
+     *
+     * @param DOMNode $contextNode
+     * @param array   $wordsToHighlight
+     * @param string  $color
+     */
+    public function _highlightNode(DOMNode $contextNode, $wordsToHighlight, $color)
+    {
+        $textNodes = array();
+
+        if (!$contextNode->hasChildNodes()) {
+            return;
+        }
+
+        foreach ($contextNode->childNodes as $childNode) {
+            if ($childNode->nodeType == XML_TEXT_NODE) {
+                // process node later to leave childNodes structure untouched
+                $textNodes[] = $childNode;
+            } else {
+                // Skip script nodes
+                if ($childNode->nodeName != 'script') {
+                    $this->_highlightNode($childNode, $wordsToHighlight, $color);
+                }
+            }
+        }
+
+        foreach ($textNodes as $textNode) {
+            $this->_highlightTextNode($textNode, $wordsToHighlight, $color);
+        }
+    }
+
+
+
+    /**
+     * Highlight text with specified color
+     *
+     * The words are run through the default analyzer and every matching term
+     * below the elements of /html/body is wrapped into a styled <b> element.
+     *
+     * @param string|array $words
+     * @param string $color
+     * @return string  HTML of the document after highlighting
+     */
+    public function highlight($words, $color = '#66ffff')
+    {
+        if (!is_array($words)) {
+            $words = array($words);
+        }
+        $wordsToHighlight = array();
+
+        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
+        foreach ($words as $wordString) {
+            $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString));
+        }
+
+        if (count($wordsToHighlight) == 0) {
+            return $this->_doc->saveHTML();
+        }
+
+        // Build a lookup set with the terms as keys
+        $wordsToHighlightFlipped = array();
+        foreach ($wordsToHighlight as $id => $token) {
+            $wordsToHighlightFlipped[$token->getTermText()] = $id;
+        }
+
+        $xpath = new DOMXPath($this->_doc);
+
+        $matchedNodes = $xpath->query("/html/body/*");
+        foreach ($matchedNodes as $matchedNode) {
+            $this->_highlightNode($matchedNode, $wordsToHighlightFlipped, $color);
+        }
+
+        // Return the highlighted HTML, as the no-words branch above already does
+        return $this->_doc->saveHTML();
+    }
+
+    /**
+     * Get HTML
+     *
+     * @return string
+     */
+    public function getHTML()
+    {
+        return $this->_doc->saveHTML();
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Exception.php
b/search/Zend/Search/Lucene/Exception.php index 5b73b29c5e..9d06e89522 100644 --- a/search/Zend/Search/Lucene/Exception.php +++ b/search/Zend/Search/Lucene/Exception.php @@ -14,7 +14,7 @@ * * @category Zend * @package Zend_Search_Lucene - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -22,13 +22,13 @@ /** * Framework base exception */ -require_once 'Zend/Search/Exception.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Exception.php'; /** * @category Zend * @package Zend_Search_Lucene - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Exception extends Zend_Search_Exception diff --git a/search/Zend/Search/Lucene/FSM.php b/search/Zend/Search/Lucene/FSM.php new file mode 100644 index 0000000000..31c9069fe4 --- /dev/null +++ b/search/Zend/Search/Lucene/FSM.php @@ -0,0 +1,433 @@ +dirroot.'/search/Zend/Search/Lucene/FSMAction.php'; + +/** Zend_Search_Exception */ +require_once $CFG->dirroot.'/search/Zend/Search/Exception.php'; + + +/** + * Abstract Finite State Machine + * + * Take a look on Wikipedia state machine description: http://en.wikipedia.org/wiki/Finite_state_machine + * + * Any type of Transducers (Moore machine or Mealy machine) also may be implemented by using this abstract FSM. + * process() methods invokes a specified actions which may construct FSM output. + * Actions may be also used to signal, that we have reached Accept State + * + * @category Zend + * @package Zend_Search_Lucene + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +abstract class Zend_Search_Lucene_FSM +{ + /** + * Machine States alphabet + * + * @var array + */ + private $_states = array(); + + /** + * Current state + * + * @var integer|string + */ + private $_currentState = null; + + /** + * Input alphabet + * + * @var array + */ + private $_inputAphabet = array(); + + /** + * State transition table + * + * [sourceState][input] => targetState + * + * @var array + */ + private $_rules = array(); + + /** + * List of entry actions + * Each action executes when entering the state + * + * [state] => action + * + * @var array + */ + private $_entryActions = array(); + + /** + * List of exit actions + * Each action executes when exiting the state + * + * [state] => action + * + * @var array + */ + private $_exitActions = array(); + + /** + * List of input actions + * Each action executes when entering the state + * + * [state][input] => action + * + * @var array + */ + private $_inputActions = array(); + + /** + * List of input actions + * Each action executes when entering the state + * + * [state1][state2] => action + * + * @var array + */ + private $_transitionActions = array(); + + /** + * Finite State machine constructor + * + * $states is an array of integers or strings with a list of possible machine states + * constructor treats fist list element as a sturt state (assignes it to $_current state). + * It may be reassigned by setState() call. + * States list may be empty and can be extended later by addState() or addStates() calls. + * + * $inputAphabet is the same as $states, but represents input alphabet + * it also may be extended later by addInputSymbols() or addInputSymbol() calls. 
+ * + * $rules parameter describes FSM transitions and has a structure: + * array( array(sourseState, input, targetState[, inputAction]), + * array(sourseState, input, targetState[, inputAction]), + * array(sourseState, input, targetState[, inputAction]), + * ... + * ) + * Rules also can be added later by addRules() and addRule() calls. + * + * FSM actions are very flexible and may be defined by addEntryAction(), addExitAction(), + * addInputAction() and addTransitionAction() calls. + * + * @param array $states + * @param array $inputAphabet + * @param array $rules + */ + public function __construct($states = array(), $inputAphabet = array(), $rules = array()) + { + $this->addStates($states); + $this->addInputSymbols($inputAphabet); + $this->addRules($rules); + } + + /** + * Add states to the state machine + * + * @param array $states + */ + public function addStates($states) + { + foreach ($states as $state) { + $this->addState($state); + } + } + + /** + * Add state to the state machine + * + * @param integer|string $state + */ + public function addState($state) + { + $this->_states[$state] = $state; + + if ($this->_currentState === null) { + $this->_currentState = $state; + } + } + + /** + * Set FSM state. + * No any action is invoked + * + * @param integer|string $state + * @throws Zend_Search_Exception + */ + public function setState($state) + { + if (!isset($this->_states[$state])) { + throw new Zend_Search_Exception('State \'' . $state . '\' is not on of the possible FSM states.'); + } + + $this->_currentState = $state; + } + + /** + * Get FSM state. 
+ * + * @return integer|string $state|null + */ + public function getState() + { + return $this->_currentState; + } + + /** + * Add symbols to the input alphabet + * + * @param array $inputAphabet + */ + public function addInputSymbols($inputAphabet) + { + foreach ($inputAphabet as $inputSymbol) { + $this->addInputSymbol($inputSymbol); + } + } + + /** + * Add symbol to the input alphabet + * + * @param integer|string $inputSymbol + */ + public function addInputSymbol($inputSymbol) + { + $this->_inputAphabet[$inputSymbol] = $inputSymbol; + } + + + /** + * Add transition rules + * + * array structure: + * array( array(sourseState, input, targetState[, inputAction]), + * array(sourseState, input, targetState[, inputAction]), + * array(sourseState, input, targetState[, inputAction]), + * ... + * ) + * + * @param array $rules + */ + public function addRules($rules) + { + foreach ($rules as $rule) { + $this->addrule($rule[0], $rule[1], $rule[2], isset($rule[3])?$rule[3]:null); + } + } + + /** + * Add symbol to the input alphabet + * + * @param integer|string $sourceState + * @param integer|string $input + * @param integer|string $targetState + * @param Zend_Search_Lucene_FSMAction|null $inputAction + * @throws Zend_Search_Exception + */ + public function addRule($sourceState, $input, $targetState, $inputAction = null) + { + if (!isset($this->_states[$sourceState])) { + throw new Zend_Search_Exception('Undefined source state (' . $sourceState . ').'); + } + if (!isset($this->_states[$targetState])) { + throw new Zend_Search_Exception('Undefined target state (' . $targetState . ').'); + } + if (!isset($this->_inputAphabet[$input])) { + throw new Zend_Search_Exception('Undefined input symbol (' . $input . ').'); + } + + if (!isset($this->_rules[$sourceState])) { + $this->_rules[$sourceState] = array(); + } + if (isset($this->_rules[$sourceState][$input])) { + throw new Zend_Search_Exception('Rule for {state,input} pair (' . $sourceState . ', '. $input . 
') is already defined.'); + } + + $this->_rules[$sourceState][$input] = $targetState; + + + if ($inputAction !== null) { + $this->addInputAction($sourceState, $input, $inputAction); + } + } + + + /** + * Add state entry action. + * Several entry actions are allowed. + * Action execution order is defined by addEntryAction() calls + * + * @param integer|string $state + * @param Zend_Search_Lucene_FSMAction $action + */ + public function addEntryAction($state, Zend_Search_Lucene_FSMAction $action) + { + if (!isset($this->_states[$state])) { + throw new Zend_Search_Exception('Undefined state (' . $state. ').'); + } + + if (!isset($this->_entryActions[$state])) { + $this->_entryActions[$state] = array(); + } + + $this->_entryActions[$state][] = $action; + } + + /** + * Add state exit action. + * Several exit actions are allowed. + * Action execution order is defined by addEntryAction() calls + * + * @param integer|string $state + * @param Zend_Search_Lucene_FSMAction $action + */ + public function addExitAction($state, Zend_Search_Lucene_FSMAction $action) + { + if (!isset($this->_states[$state])) { + throw new Zend_Search_Exception('Undefined state (' . $state. ').'); + } + + if (!isset($this->_exitActions[$state])) { + $this->_exitActions[$state] = array(); + } + + $this->_exitActions[$state][] = $action; + } + + /** + * Add input action (defined by {state, input} pair). + * Several input actions are allowed. + * Action execution order is defined by addInputAction() calls + * + * @param integer|string $state + * @param integer|string $input + * @param Zend_Search_Lucene_FSMAction $action + */ + public function addInputAction($state, $inputSymbol, Zend_Search_Lucene_FSMAction $action) + { + if (!isset($this->_states[$state])) { + throw new Zend_Search_Exception('Undefined state (' . $state. ').'); + } + if (!isset($this->_inputAphabet[$inputSymbol])) { + throw new Zend_Search_Exception('Undefined input symbol (' . $inputSymbol. 
').'); + } + + if (!isset($this->_inputActions[$state])) { + $this->_inputActions[$state] = array(); + } + if (!isset($this->_inputActions[$state][$inputSymbol])) { + $this->_inputActions[$state][$inputSymbol] = array(); + } + + $this->_inputActions[$state][$inputSymbol][] = $action; + } + + /** + * Add transition action (defined by {state, input} pair). + * Several transition actions are allowed. + * Action execution order is defined by addTransitionAction() calls + * + * @param integer|string $sourceState + * @param integer|string $targetState + * @param Zend_Search_Lucene_FSMAction $action + */ + public function addTransitionAction($sourceState, $targetState, Zend_Search_Lucene_FSMAction $action) + { + if (!isset($this->_states[$sourceState])) { + throw new Zend_Search_Exception('Undefined source state (' . $sourceState. ').'); + } + if (!isset($this->_states[$targetState])) { + throw new Zend_Search_Exception('Undefined source state (' . $targetState. ').'); + } + + if (!isset($this->_transitionActions[$sourceState])) { + $this->_transitionActions[$sourceState] = array(); + } + if (!isset($this->_transitionActions[$sourceState][$targetState])) { + $this->_transitionActions[$sourceState][$targetState] = array(); + } + + $this->_transitionActions[$sourceState][$targetState][] = $action; + } + + + /** + * Process an input + * + * @param mixed $input + * @throws Zend_Search_Exception + */ + public function process($input) + { + if (!isset($this->_rules[$this->_currentState])) { + throw new Zend_Search_Exception('There is no any rule for current state (' . $this->_currentState . ').'); + } + if (!isset($this->_rules[$this->_currentState][$input])) { + throw new Zend_Search_Exception('There is no any rule for {current state, input} pair (' . $this->_currentState . ', ' . $input . 
').'); + } + + $sourceState = $this->_currentState; + $targetState = $this->_rules[$this->_currentState][$input]; + + if ($sourceState != $targetState && isset($this->_exitActions[$sourceState])) { + foreach ($this->_exitActions[$sourceState] as $action) { + $action->doAction(); + } + } + if (isset($this->_inputActions[$sourceState]) && + isset($this->_inputActions[$sourceState][$input])) { + foreach ($this->_inputActions[$sourceState][$input] as $action) { + $action->doAction(); + } + } + + + $this->_currentState = $targetState; + + if (isset($this->_transitionActions[$sourceState]) && + isset($this->_transitionActions[$sourceState][$targetState])) { + foreach ($this->_transitionActions[$sourceState][$targetState] as $action) { + $action->doAction(); + } + } + if ($sourceState != $targetState && isset($this->_entryActions[$targetState])) { + foreach ($this->_entryActions[$targetState] as $action) { + $action->doAction(); + } + } + } + + public function reset() + { + if (count($this->_states) == 0) { + throw new Zend_Search_Exception('There is no any state defined for FSM.'); + } + + $this->_currentState = $this->_states[0]; + } +} + diff --git a/search/Zend/Search/Lucene/FSMAction.php b/search/Zend/Search/Lucene/FSMAction.php new file mode 100644 index 0000000000..606de7b613 --- /dev/null +++ b/search/Zend/Search/Lucene/FSMAction.php @@ -0,0 +1,65 @@ +_object = $object; + $this->_method = $method; + } + + public function doAction() + { + $methodName = $this->_method; + $this->_object->$methodName(); + } +} + diff --git a/search/Zend/Search/Lucene/Field.php b/search/Zend/Search/Lucene/Field.php index 5a18fcfc46..86cd22ccea 100644 --- a/search/Zend/Search/Lucene/Field.php +++ b/search/Zend/Search/Lucene/Field.php @@ -15,7 +15,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Document - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -31,15 +31,20 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Document - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Field { - public $kind; + /** + * Field name + * + * @var string + */ + public $name; - public $name = 'body'; - public $stringValue = null; + + public $value; public $isStored = false; public $isIndexed = true; public $isTokenized = true; @@ -47,26 +52,48 @@ class Zend_Search_Lucene_Field public $storeTermVector = false; + /** + * Field boos factor + * It's not stored directly in the index, but affects on normalizetion factor + * + * @var float + */ public $boost = 1.0; - public function __construct($name, $stringValue, $isStored, $isIndexed, $isTokenized, $isBinary = false) + /** + * Field value encoding. 
+ * + * @var string + */ + public $encoding; + + /** + * Object constructor + * + * @param string $name + * @param string $value + * @param string $encoding + * @param boolean $isStored + * @param boolean $isIndexed + * @param boolean $isTokenized + * @param boolean $isBinary + */ + public function __construct($name, $value, $encoding, $isStored, $isIndexed, $isTokenized, $isBinary = false) { - $this->name = $name; + $this->name = $name; + $this->value = $value; if (!$isBinary) { - /** - * @todo Correct UTF-8 string should be required in future - * Until full UTF-8 support is not completed, string should be normalized to ANSII encoding - */ - $this->stringValue = iconv(mb_detect_encoding($stringValue), 'ASCII//TRANSLIT', $stringValue); - //$this->stringValue = iconv('', 'ASCII//TRANSLIT', $stringValue); + $this->encoding = $encoding; + $this->isTokenized = $isTokenized; } else { - $this->stringValue = $stringValue; + $this->encoding = ''; + $this->isTokenized = false; } - $this->isStored = $isStored; - $this->isIndexed = $isIndexed; - $this->isTokenized = $isTokenized; - $this->isBinary = $isBinary; + + $this->isStored = $isStored; + $this->isIndexed = $isIndexed; + $this->isBinary = $isBinary; $this->storeTermVector = false; $this->boost = 1.0; @@ -79,11 +106,12 @@ class Zend_Search_Lucene_Field * * @param string $name * @param string $value + * @param string $encoding * @return Zend_Search_Lucene_Field */ - static public function Keyword($name, $value) + public static function Keyword($name, $value, $encoding = '') { - return new self($name, $value, true, true, false); + return new self($name, $value, $encoding, true, true, false); } @@ -93,11 +121,12 @@ class Zend_Search_Lucene_Field * * @param string $name * @param string $value + * @param string $encoding * @return Zend_Search_Lucene_Field */ - static public function UnIndexed($name, $value) + public static function UnIndexed($name, $value, $encoding = '') { - return new self($name, $value, true, false, 
false); + return new self($name, $value, $encoding, true, false, false); } @@ -107,11 +136,12 @@ class Zend_Search_Lucene_Field * * @param string $name * @param string $value + * @param string $encoding * @return Zend_Search_Lucene_Field */ - static public function Binary($name, $value) + public static function Binary($name, $value) { - return new self($name, $value, true, false, false, true); + return new self($name, $value, '', true, false, false, true); } /** @@ -121,11 +151,12 @@ class Zend_Search_Lucene_Field * * @param string $name * @param string $value + * @param string $encoding * @return Zend_Search_Lucene_Field */ - static public function Text($name, $value) + public static function Text($name, $value, $encoding = '') { - return new self($name, $value, true, true, true); + return new self($name, $value, $encoding, true, true, true); } @@ -135,12 +166,27 @@ class Zend_Search_Lucene_Field * * @param string $name * @param string $value + * @param string $encoding * @return Zend_Search_Lucene_Field */ - static public function UnStored($name, $value) + public static function UnStored($name, $value, $encoding = '') { - return new self($name, $value, false, true, true); + return new self($name, $value, $encoding, false, true, true); } + /** + * Get field value in UTF-8 encoding + * + * @return string + */ + public function getUtf8Value() + { + if (strcasecmp($this->encoding, 'utf8' ) == 0 || + strcasecmp($this->encoding, 'utf-8') == 0 ) { + return $this->value; + } else { + return iconv($this->encoding, 'UTF-8', $this->value); + } + } } diff --git a/search/Zend/Search/Lucene/Index/DictionaryLoader.php b/search/Zend/Search/Lucene/Index/DictionaryLoader.php new file mode 100644 index 0000000000..d3f0669c81 --- /dev/null +++ b/search/Zend/Search/Lucene/Index/DictionaryLoader.php @@ -0,0 +1,254 @@ +dirroot.'/search/Zend/Search/Lucene/Exception.php'; + + +/** + * Dictionary loader + * + * It's a dummy class which is created to encapsulate non-good structured code. 
+ * Manual "method inlining" is performed to increase dictionary index loading operation + * which is major bottelneck for search performance. + * + * + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Index_DictionaryLoader +{ + /** + * Dictionary index loader. + * + * It takes a string which is actually .tii index file data and + * returns two arrays - term and tremInfo lists. + * + * See Zend_Search_Lucene_Index_SegmintInfo class for details + * + * @param string $data + * @return array + * @throws Zend_Search_Lucene_Exception + */ + public static function load($data) + { + $termDictionary = array(); + $termInfos = array(); + $pos = 0; + + // $tiVersion = $tiiFile->readInt(); + $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]); + $pos += 4; + if ($tiVersion != (int)0xFFFFFFFE) { + throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format'); + } + + // $indexTermCount = = $tiiFile->readLong(); + if (PHP_INT_SIZE > 4) { + $indexTermCount = ord($data[$pos]) << 56 | + ord($data[$pos+1]) << 48 | + ord($data[$pos+2]) << 40 | + ord($data[$pos+3]) << 32 | + ord($data[$pos+4]) << 24 | + ord($data[$pos+5]) << 16 | + ord($data[$pos+6]) << 8 | + ord($data[$pos+7]); + } else { + if ((ord($data[$pos]) != 0) || + (ord($data[$pos+1]) != 0) || + (ord($data[$pos+2]) != 0) || + (ord($data[$pos+3]) != 0) || + ((ord($data[$pos+4]) & 0x80) != 0)) { + throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb'); + } + + $indexTermCount = ord($data[$pos+4]) << 24 | + ord($data[$pos+5]) << 16 | + ord($data[$pos+6]) << 8 | + ord($data[$pos+7]); + } + $pos += 8; + + // $tiiFile->readInt(); // IndexInterval + $pos += 4; + + // $skipInterval = $tiiFile->readInt(); + $skipInterval = 
ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]); + $pos += 4; + if ($indexTermCount < 1) { + throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index'); + } + + $prevTerm = ''; + $freqPointer = 0; + $proxPointer = 0; + $indexPointer = 0; + for ($count = 0; $count < $indexTermCount; $count++) { + //$termPrefixLength = $tiiFile->readVInt(); + $nbyte = ord($data[$pos++]); + $termPrefixLength = $nbyte & 0x7F; + for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) { + $nbyte = ord($data[$pos++]); + $termPrefixLength |= ($nbyte & 0x7F) << $shift; + } + + // $termSuffix = $tiiFile->readString(); + $nbyte = ord($data[$pos++]); + $len = $nbyte & 0x7F; + for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) { + $nbyte = ord($data[$pos++]); + $len |= ($nbyte & 0x7F) << $shift; + } + if ($len == 0) { + $termSuffix = ''; + } else { + $termSuffix = substr($data, $pos, $len); + $pos += $len; + for ($count1 = 0; $count1 < $len; $count1++ ) { + if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) { + $addBytes = 1; + if (ord($termSuffix[$count1]) & 0x20 ) { + $addBytes++; + } + $termSuffix .= substr($data, $pos, $addBytes); + $pos += $addBytes; + $len += $addBytes; + + // Check for null character. Java2 encodes null character + // in two bytes. + if (ord($termSuffix[$count1]) == 0xC0 && + ord($termSuffix[$count1+1]) == 0x80 ) { + $termSuffix[$count1] = 0; + $termSuffix = substr($termSuffix,0,$count1+1) + . substr($termSuffix,$count1+2); + } + $count1 += $addBytes; + } + } + } + + // $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . 
$termSuffix; + $pb = 0; $pc = 0; + while ($pb < strlen($prevTerm) && $pc < $termPrefixLength) { + $charBytes = 1; + if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) { + $charBytes++; + if (ord($prevTerm[$pb]) & 0x20 ) { + $charBytes++; + if (ord($prevTerm[$pb]) & 0x10 ) { + $charBytes++; + } + } + } + + if ($pb + $charBytes > strlen($data)) { + // wrong character + break; + } + + $pc++; + $pb += $charBytes; + } + $termValue = substr($prevTerm, 0, $pb) . $termSuffix; + + // $termFieldNum = $tiiFile->readVInt(); + $nbyte = ord($data[$pos++]); + $termFieldNum = $nbyte & 0x7F; + for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) { + $nbyte = ord($data[$pos++]); + $termFieldNum |= ($nbyte & 0x7F) << $shift; + } + + // $docFreq = $tiiFile->readVInt(); + $nbyte = ord($data[$pos++]); + $docFreq = $nbyte & 0x7F; + for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) { + $nbyte = ord($data[$pos++]); + $docFreq |= ($nbyte & 0x7F) << $shift; + } + + // $freqPointer += $tiiFile->readVInt(); + $nbyte = ord($data[$pos++]); + $vint = $nbyte & 0x7F; + for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) { + $nbyte = ord($data[$pos++]); + $vint |= ($nbyte & 0x7F) << $shift; + } + $freqPointer += $vint; + + // $proxPointer += $tiiFile->readVInt(); + $nbyte = ord($data[$pos++]); + $vint = $nbyte & 0x7F; + for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) { + $nbyte = ord($data[$pos++]); + $vint |= ($nbyte & 0x7F) << $shift; + } + $proxPointer += $vint; + + if( $docFreq >= $skipInterval ) { + // $skipDelta = $tiiFile->readVInt(); + $nbyte = ord($data[$pos++]); + $vint = $nbyte & 0x7F; + for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) { + $nbyte = ord($data[$pos++]); + $vint |= ($nbyte & 0x7F) << $shift; + } + $skipDelta = $vint; + } else { + $skipDelta = 0; + } + + // $indexPointer += $tiiFile->readVInt(); + $nbyte = ord($data[$pos++]); + $vint = $nbyte & 0x7F; + for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) { + $nbyte = ord($data[$pos++]); + $vint |= ($nbyte & 0x7F) << $shift; + } + 
$indexPointer += $vint; + + + // $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum); + $termDictionary[] = array($termFieldNum, $termValue); + + $termInfos[] = + // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer); + array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer); + + $prevTerm = $termValue; + } + + // Check special index entry mark + if ($termDictionary[0][0] != (int)0xFFFFFFFF) { + throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format'); + } else if (PHP_INT_SIZE > 4){ + // Treat 64-bit 0xFFFFFFFF as -1 + $termDictionary[0][0] = -1; + } + + return array(&$termDictionary, &$termInfos); + } +} + diff --git a/search/Zend/Search/Lucene/Index/FieldInfo.php b/search/Zend/Search/Lucene/Index/FieldInfo.php index 4c11aaac77..1d138b6575 100644 --- a/search/Zend/Search/Lucene/Index/FieldInfo.php +++ b/search/Zend/Search/Lucene/Index/FieldInfo.php @@ -15,7 +15,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -24,7 +24,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Index_FieldInfo diff --git a/search/Zend/Search/Lucene/Index/SegmentInfo.php b/search/Zend/Search/Lucene/Index/SegmentInfo.php index aeceab63b6..c6f7868fd5 100644 --- a/search/Zend/Search/Lucene/Index/SegmentInfo.php +++ b/search/Zend/Search/Lucene/Index/SegmentInfo.php @@ -15,20 +15,23 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ +/** Zend_Search_Lucene_Index_DictionaryLoader */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/DictionaryLoader.php'; + /** Zend_Search_Lucene_Exception */ -require_once 'Zend/Search/Lucene/Exception.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Index_SegmentInfo @@ -49,7 +52,12 @@ class Zend_Search_Lucene_Index_SegmentInfo /** * Term Dictionary Index - * Array of the Zend_Search_Lucene_Index_Term objects + * + * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because + * of performance considerations) + * [0] -> $termValue + * [1] -> $termFieldNum + * * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos * * @var array @@ -58,7 +66,14 @@ class Zend_Search_Lucene_Index_SegmentInfo /** * Term Dictionary Index TermInfos - * Array of the Zend_Search_Lucene_Index_TermInfo objects + * + * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because + * of performance considerations) + * [0] -> $docFreq + * [1] -> $freqPointer + * [2] -> $proxPointer + * [3] -> $skipOffset + * [4] -> $indexPointer * * @var array */ @@ -88,6 +103,14 @@ class Zend_Search_Lucene_Index_SegmentInfo */ private $_segFiles; + /** + * Associative array where the key is the file name and the value is file size (.csf). + * + * @var array + */ + private $_segFileSizes; + + /** * File system adapter. * @@ -122,6 +145,7 @@ class Zend_Search_Lucene_Index_SegmentInfo */ private $_deletedDirty = false; + /** * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname, * Documents count and Directory as a parameter. @@ -144,9 +168,15 @@ class Zend_Search_Lucene_Index_SegmentInfo for ($count = 0; $count < $segFilesCount; $count++) { $dataOffset = $cfsFile->readLong(); + if ($count != 0) { + $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles); + } $fileName = $cfsFile->readString(); $this->_segFiles[$fileName] = $dataOffset; } + if ($count != 0) { + $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . 
'.cfs') - $dataOffset; + } } $fnmFile = $this->openCompoundFile('.fnm'); @@ -197,7 +227,6 @@ class Zend_Search_Lucene_Index_SegmentInfo } } } - } } catch(Zend_Search_Exception $e) { if (strpos($e->getMessage(), 'compound file doesn\'t contain') !== false ) { @@ -212,16 +241,17 @@ class Zend_Search_Lucene_Index_SegmentInfo * Opens index file stoted within compound index file * * @param string $extension + * @param boolean $shareHandler * @throws Zend_Search_Lucene_Exception * @return Zend_Search_Lucene_Storage_File */ - public function openCompoundFile($extension) + public function openCompoundFile($extension, $shareHandler = true) { $filename = $this->_name . $extension; // Try to open common file first if ($this->_directory->fileExists($filename)) { - return $this->_directory->getFileObject($filename); + return $this->_directory->getFileObject($filename, $shareHandler); } if( !isset($this->_segFiles[$filename]) ) { @@ -229,11 +259,34 @@ class Zend_Search_Lucene_Index_SegmentInfo . $filename . ' file.' ); } - $file = $this->_directory->getFileObject( $this->_name.".cfs" ); + $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler); $file->seek($this->_segFiles[$filename]); return $file; } + /** + * Get compound file length + * + * @param string $extension + * @return integer + */ + public function compoundFileLength($extension) + { + $filename = $this->_name . $extension; + + // Try to get common file first + if ($this->_directory->fileExists($filename)) { + return $this->_directory->fileLength($filename); + } + + if( !isset($this->_segFileSizes[$filename]) ) { + throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain ' + . $filename . ' file.' 
); + } + + return $this->_segFileSizes[$filename]; + } + /** * Returns field index or -1 if field is not found * @@ -255,7 +308,7 @@ class Zend_Search_Lucene_Index_SegmentInfo * Returns field info for specified field * * @param integer $fieldNum - * @return ZSearchFieldInfo + * @return Zend_Search_Lucene_Index_FieldInfo */ public function getField($fieldNum) { @@ -281,77 +334,68 @@ class Zend_Search_Lucene_Index_SegmentInfo } /** - * Returns the total number of documents in this segment. + * Returns array of FieldInfo objects. * - * @return integer + * @return array */ - public function count() + public function getFieldInfos() { - return $this->_docCount; + return $this->_fields; } /** - * Get field position in a fields dictionary + * Returns the total number of documents in this segment (including deleted documents). * - * @param integer $fieldNum * @return integer */ - private function _getFieldPosition($fieldNum) { - // Treat values which are not in a translation table as a 'direct value' - return isset($this->_fieldsDicPositions[$fieldNum]) ? - $this->_fieldsDicPositions[$fieldNum] : $fieldNum; + public function count() + { + return $this->_docCount; } /** - * Loads Term dictionary from TermInfoIndex file + * Returns number of deleted documents. 
+ * + * @return integer */ - protected function _loadDictionary() + private function _deletedCount() { - if ($this->_termDictionary !== null) { - return; + if ($this->_deleted === null) { + return 0; } - $this->_termDictionary = array(); - $this->_termDictionaryInfos = array(); - - $tiiFile = $this->openCompoundFile('.tii'); - $tiVersion = $tiiFile->readInt(); - if ($tiVersion != (int)0xFFFFFFFE) { - throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format'); - } - - $indexTermCount = $tiiFile->readLong(); - $tiiFile->readInt(); // IndexInterval - $skipInterval = $tiiFile->readInt(); - - $prevTerm = ''; - $freqPointer = 0; - $proxPointer = 0; - $indexPointer = 0; - for ($count = 0; $count < $indexTermCount; $count++) { - $termPrefixLength = $tiiFile->readVInt(); - $termSuffix = $tiiFile->readString(); - $termValue = substr( $prevTerm, 0, $termPrefixLength ) . $termSuffix; - - $termFieldNum = $tiiFile->readVInt(); - $docFreq = $tiiFile->readVInt(); - $freqPointer += $tiiFile->readVInt(); - $proxPointer += $tiiFile->readVInt(); - if( $docFreq >= $skipInterval ) { - $skipDelta = $tiiFile->readVInt(); - } else { - $skipDelta = 0; - } - - $indexPointer += $tiiFile->readVInt(); + if (extension_loaded('bitset')) { + return count(bitset_to_array($this->_deleted)); + } else { + return count($this->_deleted); + } + } - $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue,$termFieldNum); - $this->_termDictionaryInfos[] = - new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer); - $prevTerm = $termValue; + /** + * Returns the total number of non-deleted documents in this segment. 
+ * + * @return integer + */ + public function numDocs() + { + if ($this->hasDeletions()) { + return $this->_docCount - $this->_deletedCount(); + } else { + return $this->_docCount; } } + /** + * Get field position in a fields dictionary + * + * @param integer $fieldNum + * @return integer + */ + private function _getFieldPosition($fieldNum) { + // Treat values which are not in a translation table as a 'direct value' + return isset($this->_fieldsDicPositions[$fieldNum]) ? + $this->_fieldsDicPositions[$fieldNum] : $fieldNum; + } /** * Return segment name @@ -364,15 +408,75 @@ class Zend_Search_Lucene_Index_SegmentInfo } + /** + * TermInfo cache + * + * Size is 1024. + * Numbers are used instead of class constants because of performance considerations + * + * @var array + */ + private $_termInfoCache = array(); + + private function _cleanUpTermInfoCache() + { + // Clean 256 term infos + foreach ($this->_termInfoCache as $key => $termInfo) { + unset($this->_termInfoCache[$key]); + + // leave 768 last used term infos + if (count($this->_termInfoCache) == 768) { + break; + } + } + } + /** * Scans terms dictionary and returns term info * * @param Zend_Search_Lucene_Index_Term $term * @return Zend_Search_Lucene_Index_TermInfo */ - public function getTermInfo($term) + public function getTermInfo(Zend_Search_Lucene_Index_Term $term) { - $this->_loadDictionary(); + $termKey = $term->key(); + if (isset($this->_termInfoCache[$termKey])) { + $termInfo = $this->_termInfoCache[$termKey]; + + // Move termInfo to the end of cache + unset($this->_termInfoCache[$termKey]); + $this->_termInfoCache[$termKey] = $termInfo; + + return $termInfo; + } + + + if ($this->_termDictionary === null) { + // Check, if index is already serialized + if ($this->_directory->fileExists($this->_name . '.sti')) { + // Prefetch dictionary index data + $stiFile = $this->_directory->getFileObject($this->_name . '.sti'); + $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . 
'.sti')); + + // Load dictionary index data + list($this->_termDictionary, $this->_termDictionaryInfos) = unserialize($stiFileData); + } else { + // Prefetch dictionary index data + $tiiFile = $this->openCompoundFile('.tii'); + $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii')); + + // Load dictionary index data + list($this->_termDictionary, $this->_termDictionaryInfos) = + Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData); + + $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos)); + $stiFile = $this->_directory->createFile($this->_name . '.sti'); + $stiFile->writeBytes($stiFileData); + } + + } + + $searchField = $this->getFieldNum($term->field); @@ -389,10 +493,10 @@ class Zend_Search_Lucene_Index_SegmentInfo $mid = ($highIndex + $lowIndex) >> 1; $midTerm = $this->_termDictionary[$mid]; - $fieldNum = $this->_getFieldPosition($midTerm->field); + $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */); $delta = $searchDicField - $fieldNum; if ($delta == 0) { - $delta = strcmp($term->text, $midTerm->text); + $delta = strcmp($term->text, $midTerm[1] /* text */); } if ($delta < 0) { @@ -400,7 +504,14 @@ class Zend_Search_Lucene_Index_SegmentInfo } elseif ($delta > 0) { $lowIndex = $mid+1; } else { - return $this->_termDictionaryInfos[$mid]; // We got it! + // return $this->_termDictionaryInfos[$mid]; // We got it! 
+ $a = $this->_termDictionaryInfos[$mid]; + $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]); + + // Put loaded termInfo into cache + $this->_termInfoCache[$termKey] = $termInfo; + + return $termInfo; } } @@ -411,7 +522,7 @@ class Zend_Search_Lucene_Index_SegmentInfo $prevPosition = $highIndex; $prevTerm = $this->_termDictionary[$prevPosition]; - $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ]; + $prevTermInfo = $this->_termDictionaryInfos[$prevPosition]; $tisFile = $this->openCompoundFile('.tis'); $tiVersion = $tisFile->readInt(); @@ -423,12 +534,12 @@ class Zend_Search_Lucene_Index_SegmentInfo $indexInterval = $tisFile->readInt(); $skipInterval = $tisFile->readInt(); - $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR); + $tisFile->seek($prevTermInfo[4] /* indexPointer */ - 20 /* header size*/, SEEK_CUR); - $termValue = $prevTerm->text; - $termFieldNum = $prevTerm->field; - $freqPointer = $prevTermInfo->freqPointer; - $proxPointer = $prevTermInfo->proxPointer; + $termValue = $prevTerm[1] /* text */; + $termFieldNum = $prevTerm[0] /* field */; + $freqPointer = $prevTermInfo[1] /* freqPointer */; + $proxPointer = $prevTermInfo[2] /* proxPointer */; for ($count = $prevPosition*$indexInterval + 1; $count <= $termCount && ( $this->_getFieldPosition($termFieldNum) < $searchDicField || @@ -438,7 +549,7 @@ class Zend_Search_Lucene_Index_SegmentInfo $termPrefixLength = $tisFile->readVInt(); $termSuffix = $tisFile->readString(); $termFieldNum = $tisFile->readVInt(); - $termValue = substr( $termValue, 0, $termPrefixLength ) . $termSuffix; + $termValue = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . 
$termSuffix; $docFreq = $tisFile->readVInt(); $freqPointer += $tisFile->readVInt(); @@ -451,10 +562,115 @@ class Zend_Search_Lucene_Index_SegmentInfo } if ($termFieldNum == $searchField && $termValue == $term->text) { - return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); + $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); } else { - return null; + $termInfo = null; + } + + // Put loaded termInfo into cache + $this->_termInfoCache[$termKey] = $termInfo; + + if (count($this->_termInfoCache) == 1024) { + $this->_cleanUpTermInfoCache(); + } + + return $termInfo; + } + + /** + * Returns term freqs array. + * Result array structure: array(docId => freq, ...) + * + * @param Zend_Search_Lucene_Index_Term $term + * @param integer $shift + * @return array + */ + public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0) + { + $termInfo = $this->getTermInfo($term); + + if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { + return array(); } + + $frqFile = $this->openCompoundFile('.frq'); + $frqFile->seek($termInfo->freqPointer,SEEK_CUR); + $result = array(); + $docId = 0; + + for ($count = 0; $count < $termInfo->docFreq; $count++) { + $docDelta = $frqFile->readVInt(); + if ($docDelta % 2 == 1) { + $docId += ($docDelta-1)/2; + $result[$shift + $docId] = 1; + } else { + $docId += $docDelta/2; + $result[$shift + $docId] = $frqFile->readVInt(); + } + } + + return $result; + } + + /** + * Returns term positions array. + * Result array structure: array(docId => array(pos1, pos2, ...), ...) 
+ * + * @param Zend_Search_Lucene_Index_Term $term + * @param integer $shift + * @return array + */ + public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0) + { + $termInfo = $this->getTermInfo($term); + + if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { + return array(); + } + + $frqFile = $this->openCompoundFile('.frq'); + $frqFile->seek($termInfo->freqPointer,SEEK_CUR); + $freqs = array(); + $docId = 0; + + for ($count = 0; $count < $termInfo->docFreq; $count++) { + $docDelta = $frqFile->readVInt(); + if ($docDelta % 2 == 1) { + $docId += ($docDelta-1)/2; + $freqs[$docId] = 1; + } else { + $docId += $docDelta/2; + $freqs[$docId] = $frqFile->readVInt(); + } + } + + $result = array(); + $prxFile = $this->openCompoundFile('.prx'); + $prxFile->seek($termInfo->proxPointer, SEEK_CUR); + foreach ($freqs as $docId => $freq) { + $termPosition = 0; + $positions = array(); + + for ($count = 0; $count < $freq; $count++ ) { + $termPosition += $prxFile->readVInt(); + $positions[] = $termPosition; + } + + $result[$shift + $docId] = $positions; + } + + return $result; + } + + /** + * Load normalization factors from an index file + * + * @param integer $fieldNum + */ + private function _loadNorm($fieldNum) + { + $fFile = $this->openCompoundFile('.f' . $fieldNum); + $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount); } /** @@ -462,7 +678,7 @@ class Zend_Search_Lucene_Index_SegmentInfo * * @param integer $id * @param string $fieldName - * @return string + * @return float */ public function norm($id, $fieldName) { @@ -472,14 +688,37 @@ class Zend_Search_Lucene_Index_SegmentInfo return null; } - if ( !isset( $this->_norms[$fieldNum] )) { - $fFile = $this->openCompoundFile('.f' . 
$fieldNum); - $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount); + if (!isset($this->_norms[$fieldNum])) { + $this->_loadNorm($fieldNum); } return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum]{$id}) ); } + /** + * Returns norm vector, encoded in a byte string + * + * @param string $fieldName + * @return string + */ + public function normVector($fieldName) + { + $fieldNum = $this->getFieldNum($fieldName); + + if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) { + $similarity = Zend_Search_Lucene_Search_Similarity::getDefault(); + + return str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )), + $this->_docCount); + } + + if (!isset($this->_norms[$fieldNum])) { + $this->_loadNorm($fieldNum); + } + + return $this->_norms[$fieldNum]; + } + /** * Returns true if any documents have been deleted from this index segment. @@ -571,5 +810,247 @@ class Zend_Search_Lucene_Index_SegmentInfo $this->_deletedDirty = false; } + + + + /** + * Term Dictionary File object for stream like terms reading + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_tisFile = null; + + /** + * Frequencies File object for stream like terms reading + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_frqFile = null; + + /** + * Offset of the .frq file in the compound file + * + * @var integer + */ + private $_frqFileOffset; + + /** + * Positions File object for stream like terms reading + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_prxFile = null; + + /** + * Offset of the .prx file in the compound file + * + * @var integer + */ + private $_prxFileOffset; + + + /** + * Number of terms in term stream + * + * @var integer + */ + private $_termCount = 0; + + /** + * Segment skip interval + * + * @var integer + */ + private $_skipInterval; + + /** + * Last TermInfo in a terms stream + * + * @var Zend_Search_Lucene_Index_TermInfo + */ + private $_lastTermInfo = null; + + /** + * Last 
Term in a terms stream + * + * @var Zend_Search_Lucene_Index_Term + */ + private $_lastTerm = null; + + /** + * Map of the document IDs + * Used to get new docID after removing deleted documents. + * It's not very efficient from a memory usage point of view, + * but much faster than other methods + * + * @var array|null + */ + private $_docMap = null; + + /** + * An array of all term positions in the documents. + * Array structure: array( docId => array( pos1, pos2, ...), ...) + * + * @var array + */ + private $_lastTermPositions; + + /** + * Reset terms stream + * + * $startId - id for the first document + * $compact - remove deleted documents + * + * Returns start document id for the next segment + * + * @param integer $startId + * @param boolean $compact + * @throws Zend_Search_Lucene_Exception + * @return integer + */ + public function reset($startId = 0, $compact = false) + { + if ($this->_tisFile !== null) { + $this->_tisFile = null; + } + + $this->_tisFile = $this->openCompoundFile('.tis', false); + $tiVersion = $this->_tisFile->readInt(); + if ($tiVersion != (int)0xFFFFFFFE) { + throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format'); + } + + $this->_termCount = $this->_tisFile->readLong(); + $this->_tisFile->readInt(); // Read Index interval + $this->_skipInterval = $this->_tisFile->readInt(); // Read skip interval + + if ($this->_frqFile !== null) { + $this->_frqFile = null; + } + $this->_frqFile = $this->openCompoundFile('.frq', false); + $this->_frqFileOffset = $this->_frqFile->tell(); + + if ($this->_prxFile !== null) { + $this->_prxFile = null; + } + $this->_prxFile = $this->openCompoundFile('.prx', false); + $this->_prxFileOffset = $this->_prxFile->tell(); + + $this->_lastTerm = new Zend_Search_Lucene_Index_Term('', -1); + $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0); + + $this->_docMap = array(); + for ($count = 0; $count < $this->_docCount; $count++) { + if (!$this->isDeleted($count)) { + 
$this->_docMap[$count] = $startId + ($compact ? count($this->_docMap) : $count); + } + } + + $this->nextTerm(); + return $startId + ($compact ? count($this->_docMap) : $this->_docCount); + } + + + /** + * Scans terms dictionary and returns next term + * + * @return Zend_Search_Lucene_Index_Term|null + */ + public function nextTerm() + { + if ($this->_tisFile === null || $this->_termCount == 0) { + $this->_lastTerm = null; + $this->_lastTermInfo = null; + + // may be necessary for "empty" segment + $this->_tisFile = null; + $this->_frqFile = null; + $this->_prxFile = null; + + return null; + } + + $termPrefixLength = $this->_tisFile->readVInt(); + $termSuffix = $this->_tisFile->readString(); + $termFieldNum = $this->_tisFile->readVInt(); + $termValue = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix; + + $this->_lastTerm = new Zend_Search_Lucene_Index_Term($termValue, $this->_fields[$termFieldNum]->name); + + $docFreq = $this->_tisFile->readVInt(); + $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt(); + $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt(); + if ($docFreq >= $this->_skipInterval) { + $skipOffset = $this->_tisFile->readVInt(); + } else { + $skipOffset = 0; + } + + $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); + + + $this->_lastTermPositions = array(); + + $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET); + $freqs = array(); $docId = 0; + for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) { + $docDelta = $this->_frqFile->readVInt(); + if( $docDelta % 2 == 1 ) { + $docId += ($docDelta-1)/2; + $freqs[ $docId ] = 1; + } else { + $docId += $docDelta/2; + $freqs[ $docId ] = $this->_frqFile->readVInt(); + } + } + + $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET); + foreach ($freqs as $docId 
=> $freq) { + $termPosition = 0; $positions = array(); + + for ($count = 0; $count < $freq; $count++ ) { + $termPosition += $this->_prxFile->readVInt(); + $positions[] = $termPosition; + } + + if (isset($this->_docMap[$docId])) { + $this->_lastTermPositions[$this->_docMap[$docId]] = $positions; + } + } + + + $this->_termCount--; + if ($this->_termCount == 0) { + $this->_tisFile = null; + $this->_frqFile = null; + $this->_prxFile = null; + } + + return $this->_lastTerm; + } + + + /** + * Returns term in current position + * + * @param Zend_Search_Lucene_Index_Term $term + * @return Zend_Search_Lucene_Index_Term|null + */ + public function currentTerm() + { + return $this->_lastTerm; + } + + + /** + * Returns an array of all term positions in the documents. + * Return array structure: array( docId => array( pos1, pos2, ...), ...) + * + * @return array + */ + public function currentTermPositions() + { + return $this->_lastTermPositions; + } } diff --git a/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php b/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php new file mode 100644 index 0000000000..4d0f346ceb --- /dev/null +++ b/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php @@ -0,0 +1,53 @@ +dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/PriorityQueue.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Index_SegmentInfoPriorityQueue extends Zend_Search_Lucene_PriorityQueue +{ + /** + * Compare elements + * + * Returns true if the current term key of $segmentInfo1 sorts before that of $segmentInfo2; false otherwise + * + * @param mixed $segmentInfo1 + * @param mixed $segmentInfo2 + * @return boolean + */ + protected function _less($segmentInfo1, $segmentInfo2) + { + return strcmp($segmentInfo1->currentTerm()->key(), $segmentInfo2->currentTerm()->key()) < 0; + } + +} diff --git a/search/Zend/Search/Lucene/Index/SegmentMerger.php b/search/Zend/Search/Lucene/Index/SegmentMerger.php new file mode 100644 index 0000000000..157489c492 --- /dev/null +++ b/search/Zend/Search/Lucene/Index/SegmentMerger.php @@ -0,0 +1,273 @@ +dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Index_SegmentInfo */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php'; + +/** Zend_Search_Lucene_Index_SegmentWriter_StreamWriter */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php'; + +/** Zend_Search_Lucene_Index_SegmentInfoPriorityQueue */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Index_SegmentMerger +{ + /** + * Target segment writer + * + * @var Zend_Search_Lucene_Index_SegmentWriter_StreamWriter + */ + private $_writer; + + /** + * Number of docs in a new segment + * + * @var integer + */ + private $_docCount; + + /** + * A set of segments to be merged + * + * @var array Zend_Search_Lucene_Index_SegmentInfo + */ + private $_segmentInfos = array(); + + /** + * Flag to signal, that merge is already done + * + * @var boolean + */ + private $_mergeDone = false; + + /** + * Field map + * [segment_name][field_number] => target_field_number + * + * @var array + */ + private $_fieldsMap = array(); + + + + /** + * Object constructor. + * + * Creates new segment merger with $directory as target to merge segments into + * and $name as a name of new segment + * + * @param Zend_Search_Lucene_Storage_Directory $directory + * @param string $name + */ + public function __construct($directory, $name) + { + $this->_writer = new Zend_Search_Lucene_Index_SegmentWriter_StreamWriter($directory, $name); + } + + + /** + * Add segment to a collection of segments to be merged + * + * @param Zend_Search_Lucene_Index_SegmentInfo $segmentInfo + */ + public function addSource(Zend_Search_Lucene_Index_SegmentInfo $segmentInfo) + { + $this->_segmentInfos[$segmentInfo->getName()] = $segmentInfo; + } + + + /** + * Do merge. + * + * Returns number of documents in newly created segment + * + * @return Zend_Search_Lucene_Index_SegmentInfo + * @throws Zend_Search_Lucene_Exception + */ + public function merge() + { + if ($this->_mergeDone) { + throw new Zend_Search_Lucene_Exception('Merge is already done.'); + } + + if (count($this->_segmentInfos) < 1) { + throw new Zend_Search_Lucene_Exception('Wrong number of segments to be merged (' + . count($this->_segmentInfos) + . 
').'); + } + + $this->_mergeFields(); + $this->_mergeNorms(); + $this->_mergeStoredFields(); + $this->_mergeTerms(); + + $this->_mergeDone = true; + + return $this->_writer->close(); + } + + + /** + * Merge fields information + */ + private function _mergeFields() + { + foreach ($this->_segmentInfos as $segName => $segmentInfo) { + foreach ($segmentInfo->getFieldInfos() as $fieldInfo) { + $this->_fieldsMap[$segName][$fieldInfo->number] = $this->_writer->addFieldInfo($fieldInfo); + } + } + } + + /** + * Merge field's normalization factors + */ + private function _mergeNorms() + { + foreach ($this->_writer->getFieldInfos() as $fieldInfo) { + if ($fieldInfo->isIndexed) { + foreach ($this->_segmentInfos as $segName => $segmentInfo) { + if ($segmentInfo->hasDeletions()) { + $srcNorm = $segmentInfo->normVector($fieldInfo->name); + $norm = ''; + $docs = $segmentInfo->count(); + for ($count = 0; $count < $docs; $count++) { + if (!$segmentInfo->isDeleted($count)) { + $norm .= $srcNorm[$count]; + } + } + $this->_writer->addNorm($fieldInfo->name, $norm); + } else { + $this->_writer->addNorm($fieldInfo->name, $segmentInfo->normVector($fieldInfo->name)); + } + } + } + } + } + + /** + * Merge fields information + */ + private function _mergeStoredFields() + { + $this->_docCount = 0; + + foreach ($this->_segmentInfos as $segName => $segmentInfo) { + $fdtFile = $segmentInfo->openCompoundFile('.fdt'); + + for ($count = 0; $count < $segmentInfo->count(); $count++) { + $fieldCount = $fdtFile->readVInt(); + $storedFields = array(); + + for ($count2 = 0; $count2 < $fieldCount; $count2++) { + $fieldNum = $fdtFile->readVInt(); + $bits = $fdtFile->readByte(); + $fieldInfo = $segmentInfo->getField($fieldNum); + + if (!($bits & 2)) { // Text data + $storedFields[] = + new Zend_Search_Lucene_Field($fieldInfo->name, + $fdtFile->readString(), + 'UTF-8', + true, + $fieldInfo->isIndexed, + $bits & 1 ); + } else { // Binary data + $storedFields[] = + new Zend_Search_Lucene_Field($fieldInfo->name, 
+ $fdtFile->readBinary(), + '', + true, + $fieldInfo->isIndexed, + $bits & 1, + true); + } + } + + if (!$segmentInfo->isDeleted($count)) { + $this->_docCount++; + $this->_writer->addStoredFields($storedFields); + } + } + } + } + + + /** + * Merge fields information + */ + private function _mergeTerms() + { + $segmentInfoQueue = new Zend_Search_Lucene_Index_SegmentInfoPriorityQueue(); + + $segmentStartId = 0; + foreach ($this->_segmentInfos as $segName => $segmentInfo) { + $segmentStartId = $segmentInfo->reset($segmentStartId, true); + + // Skip "empty" segments + if ($segmentInfo->currentTerm() !== null) { + $segmentInfoQueue->put($segmentInfo); + } + } + + $this->_writer->initializeDictionaryFiles(); + + $termDocs = array(); + while (($segmentInfo = $segmentInfoQueue->pop()) !== null) { + // Merge positions array + $termDocs += $segmentInfo->currentTermPositions(); + + if ($segmentInfoQueue->top() === null || + $segmentInfoQueue->top()->currentTerm()->key() != + $segmentInfo->currentTerm()->key()) { + // We got new term + ksort($termDocs, SORT_NUMERIC); + + // Add term if it's contained in any document + if (count($termDocs) > 0) { + $this->_writer->addTerm($segmentInfo->currentTerm(), $termDocs); + } + $termDocs = array(); + } + + $segmentInfo->nextTerm(); + // check, if segment dictionary is finished + if ($segmentInfo->currentTerm() !== null) { + // Put segment back into the priority queue + $segmentInfoQueue->put($segmentInfo); + } + } + + $this->_writer->closeDictionaryFiles(); + } +} diff --git a/search/Zend/Search/Lucene/Index/SegmentWriter.php b/search/Zend/Search/Lucene/Index/SegmentWriter.php index 6cb4477f99..2f1a05e322 100644 --- a/search/Zend/Search/Lucene/Index/SegmentWriter.php +++ b/search/Zend/Search/Lucene/Index/SegmentWriter.php @@ -15,29 +15,26 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. 
(http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Exception */ -require_once 'Zend/Search/Lucene/Exception.php'; - -/** Zend_Search_Lucene_Analysis_Analyzer */ -require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; /** Zend_Search_Lucene_Index_SegmentInfo */ -require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ -class Zend_Search_Lucene_Index_SegmentWriter +abstract class Zend_Search_Lucene_Index_SegmentWriter { /** * Expert: The fraction of terms in the "dictionary" which should be stored @@ -48,7 +45,7 @@ class Zend_Search_Lucene_Index_SegmentWriter * * @var integer */ - static public $indexInterval = 128; + public static $indexInterval = 128; /** Expert: The fraction of TermDocs entries stored in skip tables. * Larger values result in smaller indexes, greater acceleration, but fewer @@ -61,28 +58,28 @@ class Zend_Search_Lucene_Index_SegmentWriter * * @var integer */ - static public $skipInterval = 0x7FFFFFFF; + public static $skipInterval = 0x7FFFFFFF; /** * Number of docs in a segment * * @var integer */ - private $_docCount; + protected $_docCount = 0; /** * Segment name * * @var string */ - private $_name; + protected $_name; /** * File system adapter. * * @var Zend_Search_Lucene_Storage_Directory */ - private $_directory; + protected $_directory; /** * List of the index files. 
@@ -90,52 +87,41 @@ class Zend_Search_Lucene_Index_SegmentWriter * * @var unknown_type */ - private $_files; - - /** - * Term Dictionary - * Array of the Zend_Search_Lucene_Index_Term objects - * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos - * - * @var array - */ - private $_termDictionary; - - /** - * Documents, which contain the term - * - * @var array - */ - private $_termDocs; + protected $_files = array(); /** * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment * * @var array */ - private $_fields; + protected $_fields = array(); /** - * Sizes of the indexed fields. - * Used for normalization factors calculation. + * Normalization factors. + * An array fieldName => normVector + * normVector is a binary string. + * Each byte corresponds to an indexed document in a segment and + * encodes normalization factor (float value, encoded by + * Zend_Search_Lucene_Search_Similarity::encodeNorm()) * * @var array */ - private $_fieldLengths; + protected $_norms = array(); + /** * '.fdx' file - Stored Fields, the field index. * * @var Zend_Search_Lucene_Storage_File */ - private $_fdxFile; + protected $_fdxFile = null; /** * '.fdt' file - Stored Fields, the field data. 
* * @var Zend_Search_Lucene_Storage_File */ - private $_fdtFile; + protected $_fdtFile = null; /** @@ -144,132 +130,125 @@ class Zend_Search_Lucene_Index_SegmentWriter * @param Zend_Search_Lucene_Storage_Directory $directory * @param string $name */ - public function __construct($directory, $name) + public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name) { $this->_directory = $directory; $this->_name = $name; - $this->_docCount = 0; - - $this->_fields = array(); - $this->_termDocs = array(); - $this->_files = array(); - $this->_norms = array(); - $this->_fieldLengths = array(); - $this->_termDictionary = array(); - - $this->_fdxFile = null; - $this->_fdtFile = null; } /** * Add field to the segment * + * Returns actual field number + * * @param Zend_Search_Lucene_Field $field + * @return integer */ - private function _addFieldInfo(Zend_Search_Lucene_Field $field) + public function addField(Zend_Search_Lucene_Field $field) { if (!isset($this->_fields[$field->name])) { + $fieldNumber = count($this->_fields); $this->_fields[$field->name] = new Zend_Search_Lucene_Index_FieldInfo($field->name, $field->isIndexed, - count($this->_fields), + $fieldNumber, $field->storeTermVector); + + return $fieldNumber; } else { $this->_fields[$field->name]->isIndexed |= $field->isIndexed; $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector; + + return $this->_fields[$field->name]->number; } } - /** - * Adds a document to this segment. 
+ * Add fieldInfo to the segment * - * @param Zend_Search_Lucene_Document $document - * @throws Zend_Search_Lucene_Exception + * Returns actual field number + * + * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo + * @return integer */ - public function addDocument(Zend_Search_Lucene_Document $document) + public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo) { - $storedFields = array(); + if (!isset($this->_fields[$fieldInfo->name])) { + $fieldNumber = count($this->_fields); + $this->_fields[$fieldInfo->name] = + new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name, + $fieldInfo->isIndexed, + $fieldNumber, + $fieldInfo->storeTermVector); + + return $fieldNumber; + } else { + $this->_fields[$fieldInfo->name]->isIndexed |= $fieldInfo->isIndexed; + $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector; - foreach ($document->getFieldNames() as $fieldName) { - $field = $document->getField($fieldName); - $this->_addFieldInfo($field); + return $this->_fields[$fieldInfo->name]->number; + } + } - if ($field->storeTermVector) { - /** - * @todo term vector storing support - */ - throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.'); - } + /** + * Returns array of FieldInfo objects. 
+ * + * @return array + */ + public function getFieldInfos() + { + return $this->_fields; + } - if ($field->isIndexed) { - if ($field->isTokenized) { - $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue); - } else { - $tokenList = array(); - $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue)); - } - $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList); - - $position = 0; - foreach ($tokenList as $token) { - $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name); - $termKey = $term->key(); - - if (!isset($this->_termDictionary[$termKey])) { - // New term - $this->_termDictionary[$termKey] = $term; - $this->_termDocs[$termKey] = array(); - $this->_termDocs[$termKey][$this->_docCount] = array(); - } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) { - // Existing term, but new term entry - $this->_termDocs[$termKey][$this->_docCount] = array(); - } - $position += $token->getPositionIncrement(); - $this->_termDocs[$termKey][$this->_docCount][] = $position; - } - } + /** + * Add stored fields information + * + * @param array $storedFields array of Zend_Search_Lucene_Field objects + */ + public function addStoredFields($storedFields) + { + if (!isset($this->_fdxFile)) { + $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx'); + $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); - if ($field->isStored) { - $storedFields[] = $field; - } + $this->_files[] = $this->_name . '.fdx'; + $this->_files[] = $this->_name . '.fdt'; } - if (count($storedFields) != 0) { - if (!isset($this->_fdxFile)) { - $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx'); - $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); - - $this->_files[] = $this->_name . '.fdx'; - $this->_files[] = $this->_name . 
'.fdt'; - } - - $this->_fdxFile->writeLong($this->_fdtFile->tell()); - $this->_fdtFile->writeVInt(count($storedFields)); - foreach ($storedFields as $field) { - $this->_fdtFile->writeVInt($this->_fields[$field->name]->number); - $fieldBits = ($field->isTokenized ? 0x01 : 0x00) | - ($field->isBinary ? 0x02 : 0x00) | - 0x00; /* 0x04 - third bit, compressed (ZLIB) */ - $this->_fdtFile->writeByte($fieldBits); - if ($field->isBinary) { - $this->_fdtFile->writeVInt(strlen($field->stringValue)); - $this->_fdtFile->writeBytes($field->stringValue); - } else { - $this->_fdtFile->writeString($field->stringValue); - } + $this->_fdxFile->writeLong($this->_fdtFile->tell()); + $this->_fdtFile->writeVInt(count($storedFields)); + foreach ($storedFields as $field) { + $this->_fdtFile->writeVInt($this->_fields[$field->name]->number); + $fieldBits = ($field->isTokenized ? 0x01 : 0x00) | + ($field->isBinary ? 0x02 : 0x00) | + 0x00; /* 0x04 - third bit, compressed (ZLIB) */ + $this->_fdtFile->writeByte($fieldBits); + if ($field->isBinary) { + $this->_fdtFile->writeVInt(strlen($field->value)); + $this->_fdtFile->writeBytes($field->value); + } else { + $this->_fdtFile->writeString($field->getUtf8Value()); } } $this->_docCount++; } + /** + * Returns the total number of documents in this segment. + * + * @return integer + */ + public function count() + { + return $this->_docCount; + } /** * Dump Field Info (.fnm) segment file */ - private function _dumpFNM() + protected function _dumpFNM() { $fnmFile = $this->_directory->createFile($this->_name . '.fnm'); $fnmFile->writeVInt(count($this->_fields)); @@ -283,20 +262,9 @@ class Zend_Search_Lucene_Index_SegmentWriter ); if ($field->isIndexed) { - $fieldNum = $this->_fields[$field->name]->number; - $fieldName = $field->name; - $similarity = Zend_Search_Lucene_Search_Similarity::getDefault(); - $norm = ''; - - for ($count = 0; $count < $this->_docCount; $count++) { - $numTokens = isset($this->_fieldLengths[$fieldName][$count]) ? 
- $this->_fieldLengths[$fieldName][$count] : 0; - $norm .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, $numTokens))); - } - - $normFileName = $this->_name . '.f' . $fieldNum; + $normFileName = $this->_name . '.f' . $field->number; $fFile = $this->_directory->createFile($normFileName); - $fFile->writeBytes($norm); + $fFile->writeBytes($this->_norms[$field->name]); $this->_files[] = $normFileName; } } @@ -305,6 +273,194 @@ class Zend_Search_Lucene_Index_SegmentWriter } + + /** + * Term Dictionary file + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_tisFile = null; + + /** + * Term Dictionary index file + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_tiiFile = null; + + /** + * Frequencies file + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_frqFile = null; + + /** + * Positions file + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_prxFile = null; + + /** + * Number of written terms + * + * @var integer + */ + private $_termCount; + + + /** + * Last saved term + * + * @var Zend_Search_Lucene_Index_Term + */ + private $_prevTerm; + + /** + * Last saved term info + * + * @var Zend_Search_Lucene_Index_TermInfo + */ + private $_prevTermInfo; + + /** + * Last saved index term + * + * @var Zend_Search_Lucene_Index_Term + */ + private $_prevIndexTerm; + + /** + * Last saved index term info + * + * @var Zend_Search_Lucene_Index_TermInfo + */ + private $_prevIndexTermInfo; + + /** + * Last term dictionary file position + * + * @var integer + */ + private $_lastIndexPosition; + + /** + * Create dictionary, frequency and positions files and write necessary headers + */ + public function initializeDictionaryFiles() + { + $this->_tisFile = $this->_directory->createFile($this->_name . 
'.tis'); + $this->_tisFile->writeInt((int)0xFFFFFFFE); + $this->_tisFile->writeLong(0 /* dummy data for terms count */); + $this->_tisFile->writeInt(self::$indexInterval); + $this->_tisFile->writeInt(self::$skipInterval); + + $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii'); + $this->_tiiFile->writeInt((int)0xFFFFFFFE); + $this->_tiiFile->writeLong(0 /* dummy data for terms count */); + $this->_tiiFile->writeInt(self::$indexInterval); + $this->_tiiFile->writeInt(self::$skipInterval); + + /** Dump dictionary header */ + $this->_tiiFile->writeVInt(0); // prefix length + $this->_tiiFile->writeString(''); // suffix + $this->_tiiFile->writeInt((int)0xFFFFFFFF); // field number + $this->_tiiFile->writeByte((int)0x0F); + $this->_tiiFile->writeVInt(0); // DocFreq + $this->_tiiFile->writeVInt(0); // FreqDelta + $this->_tiiFile->writeVInt(0); // ProxDelta + $this->_tiiFile->writeVInt(20); // IndexDelta + + $this->_frqFile = $this->_directory->createFile($this->_name . '.frq'); + $this->_prxFile = $this->_directory->createFile($this->_name . '.prx'); + + $this->_files[] = $this->_name . '.tis'; + $this->_files[] = $this->_name . '.tii'; + $this->_files[] = $this->_name . '.frq'; + $this->_files[] = $this->_name . '.prx'; + + $this->_prevTerm = null; + $this->_prevTermInfo = null; + $this->_prevIndexTerm = null; + $this->_prevIndexTermInfo = null; + $this->_lastIndexPosition = 20; + $this->_termCount = 0; + + } + + /** + * Add term + * + * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... 
) + * + * @param Zend_Search_Lucene_Index_Term $termEntry + * @param array $termDocs + */ + public function addTerm($termEntry, $termDocs) + { + $freqPointer = $this->_frqFile->tell(); + $proxPointer = $this->_prxFile->tell(); + + $prevDoc = 0; + foreach ($termDocs as $docId => $termPositions) { + $docDelta = ($docId - $prevDoc)*2; + $prevDoc = $docId; + if (count($termPositions) > 1) { + $this->_frqFile->writeVInt($docDelta); + $this->_frqFile->writeVInt(count($termPositions)); + } else { + $this->_frqFile->writeVInt($docDelta + 1); + } + + $prevPosition = 0; + foreach ($termPositions as $position) { + $this->_prxFile->writeVInt($position - $prevPosition); + $prevPosition = $position; + } + } + + if (count($termDocs) >= self::$skipInterval) { + /** + * @todo Write Skip Data to a freq file. + * It's not used now, but make index more optimal + */ + $skipOffset = $this->_frqFile->tell() - $freqPointer; + } else { + $skipOffset = 0; + } + + $term = new Zend_Search_Lucene_Index_Term($termEntry->text, + $this->_fields[$termEntry->field]->number); + $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs), + $freqPointer, $proxPointer, $skipOffset); + + $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo); + + if (($this->_termCount + 1) % self::$indexInterval == 0) { + $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo); + + $indexPosition = $this->_tisFile->tell(); + $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition); + $this->_lastIndexPosition = $indexPosition; + + } + $this->_termCount++; + } + + /** + * Close dictionary + */ + public function closeDictionaryFiles() + { + $this->_tisFile->seek(4); + $this->_tisFile->writeLong($this->_termCount); + + $this->_tiiFile->seek(4); + $this->_tiiFile->writeLong(ceil(($this->_termCount + 2)/self::$indexInterval)); + } + + /** * Dump Term Dictionary segment file entry. 
* Used to write entry to .tis or .tii files @@ -315,22 +471,47 @@ class Zend_Search_Lucene_Index_SegmentWriter * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo * @param Zend_Search_Lucene_Index_TermInfo $termInfo */ - private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile, + protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile, &$prevTerm, Zend_Search_Lucene_Index_Term $term, &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo) { if (isset($prevTerm) && $prevTerm->field == $term->field) { - $prefixLength = 0; - while ($prefixLength < strlen($prevTerm->text) && - $prefixLength < strlen($term->text) && - $prevTerm->text{$prefixLength} == $term->text{$prefixLength} - ) { - $prefixLength++; + $matchedBytes = 0; + $maxBytes = min(strlen($prevTerm->text), strlen($term->text)); + while ($matchedBytes < $maxBytes && + $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) { + $matchedBytes++; + } + + // Calculate actual matched UTF-8 pattern + $prefixBytes = 0; + $prefixChars = 0; + while ($prefixBytes < $matchedBytes) { + $charBytes = 1; + if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) { + $charBytes++; + if (ord($term->text[$prefixBytes]) & 0x20 ) { + $charBytes++; + if (ord($term->text[$prefixBytes]) & 0x10 ) { + $charBytes++; + } + } + } + + if ($prefixBytes + $charBytes > $matchedBytes) { + // char crosses matched bytes boundary + // skip char + break; + } + + $prefixChars++; + $prefixBytes += $charBytes; } + // Write preffix length - $dicFile->writeVInt($prefixLength); + $dicFile->writeVInt($prefixChars); // Write suffix - $dicFile->writeString( substr($term->text, $prefixLength) ); + $dicFile->writeString(substr($term->text, $prefixBytes)); } else { // Write preffix length $dicFile->writeVInt(0); @@ -363,107 +544,11 @@ class Zend_Search_Lucene_Index_SegmentWriter $prevTermInfo = $termInfo; } - /** - * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files - */ - private 
function _dumpDictionary() - { - $termKeys = array_keys($this->_termDictionary); - sort($termKeys, SORT_STRING); - - $tisFile = $this->_directory->createFile($this->_name . '.tis'); - $tisFile->writeInt((int)0xFFFFFFFE); - $tisFile->writeLong(count($termKeys)); - $tisFile->writeInt(self::$indexInterval); - $tisFile->writeInt(self::$skipInterval); - - $tiiFile = $this->_directory->createFile($this->_name . '.tii'); - $tiiFile->writeInt((int)0xFFFFFFFE); - $tiiFile->writeLong(ceil((count($termKeys) + 2)/self::$indexInterval)); - $tiiFile->writeInt(self::$indexInterval); - $tiiFile->writeInt(self::$skipInterval); - - /** Dump dictionary header */ - $tiiFile->writeVInt(0); // preffix length - $tiiFile->writeString(''); // suffix - $tiiFile->writeInt((int)0xFFFFFFFF); // field number - $tiiFile->writeByte((int)0x0F); - $tiiFile->writeVInt(0); // DocFreq - $tiiFile->writeVInt(0); // FreqDelta - $tiiFile->writeVInt(0); // ProxDelta - $tiiFile->writeVInt(20); // IndexDelta - - $frqFile = $this->_directory->createFile($this->_name . '.frq'); - $prxFile = $this->_directory->createFile($this->_name . '.prx'); - - $termCount = 1; - - $prevTerm = null; - $prevTermInfo = null; - $prevIndexTerm = null; - $prevIndexTermInfo = null; - $prevIndexPosition = 20; - - foreach ($termKeys as $termId) { - $freqPointer = $frqFile->tell(); - $proxPointer = $prxFile->tell(); - - $prevDoc = 0; - foreach ($this->_termDocs[$termId] as $docId => $termPositions) { - $docDelta = ($docId - $prevDoc)*2; - $prevDoc = $docId; - if (count($termPositions) > 1) { - $frqFile->writeVInt($docDelta); - $frqFile->writeVInt(count($termPositions)); - } else { - $frqFile->writeVInt($docDelta + 1); - } - - $prevPosition = 0; - foreach ($termPositions as $position) { - $prxFile->writeVInt($position - $prevPosition); - $prevPosition = $position; - } - } - - if (count($this->_termDocs[$termId]) >= self::$skipInterval) { - /** - * @todo Write Skip Data to a freq file. 
- * It's not used now, but make index more optimal - */ - $skipOffset = $frqFile->tell() - $freqPointer; - } else { - $skipOffset = 0; - } - - $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text, - $this->_fields[$this->_termDictionary[$termId]->field]->number); - $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]), - $freqPointer, $proxPointer, $skipOffset); - - $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo); - - if ($termCount % self::$indexInterval == 0) { - $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo); - - $indexPosition = $tisFile->tell(); - $tiiFile->writeVInt($indexPosition - $prevIndexPosition); - $prevIndexPosition = $indexPosition; - } - $termCount++; - } - - $this->_files[] = $this->_name . '.tis'; - $this->_files[] = $this->_name . '.tii'; - $this->_files[] = $this->_name . '.frq'; - $this->_files[] = $this->_name . '.prx'; - } - /** * Generate compound index file */ - private function _generateCFS() + protected function _generateCFS() { $cfsFile = $this->_directory->createFile($this->_name . 
'.cfs'); $cfsFile->writeVInt(count($this->_files)); @@ -486,8 +571,13 @@ class Zend_Search_Lucene_Index_SegmentWriter $cfsFile->seek($dataOffset); $dataFile = $this->_directory->getFileObject($fileName); - $data = $dataFile->readBytes($this->_directory->fileLength($fileName)); - $cfsFile->writeBytes($data); + + $byteCount = $this->_directory->fileLength($fileName); + while ($byteCount > 0) { + $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/)); + $byteCount -= strlen($data); + $cfsFile->writeBytes($data); + } $this->_directory->deleteFile($fileName); } @@ -499,21 +589,6 @@ class Zend_Search_Lucene_Index_SegmentWriter * * @return Zend_Search_Lucene_Index_SegmentInfo */ - public function close() - { - if ($this->_docCount == 0) { - return null; - } - - $this->_dumpFNM(); - $this->_dumpDictionary(); - - $this->_generateCFS(); - - return new Zend_Search_Lucene_Index_SegmentInfo($this->_name, - $this->_docCount, - $this->_directory); - } - + abstract public function close(); } diff --git a/search/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php b/search/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php new file mode 100644 index 0000000000..7dd2bf9158 --- /dev/null +++ b/search/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php @@ -0,0 +1,213 @@ +dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer.php'; + +/** Zend_Search_Lucene_Index_SegmentWriter */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter +{ + /** + * Term Dictionary + * Array of the Zend_Search_Lucene_Index_Term objects + * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos + * + * @var array + */ + protected $_termDictionary; + + /** + * Documents, which contain the term + * + * @var array + */ + protected $_termDocs; + + /** + * Object constructor. + * + * @param Zend_Search_Lucene_Storage_Directory $directory + * @param string $name + */ + public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name) + { + parent::__construct($directory, $name); + + $this->_termDocs = array(); + $this->_termDictionary = array(); + } + + + /** + * Adds a document to this segment. + * + * @param Zend_Search_Lucene_Document $document + * @throws Zend_Search_Lucene_Exception + */ + public function addDocument(Zend_Search_Lucene_Document $document) + { + $storedFields = array(); + $docNorms = array(); + $similarity = Zend_Search_Lucene_Search_Similarity::getDefault(); + + foreach ($document->getFieldNames() as $fieldName) { + $field = $document->getField($fieldName); + $this->addField($field); + + if ($field->storeTermVector) { + /** + * @todo term vector storing support + */ + throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.'); + } + + if ($field->isIndexed) { + if ($field->isTokenized) { + $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault(); + $analyzer->setInput($field->value, $field->encoding); + + $position = 0; + $tokenCounter = 0; + while (($token = $analyzer->nextToken()) !== null) { + $tokenCounter++; + + $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name); + $termKey = $term->key(); + + if (!isset($this->_termDictionary[$termKey])) { + // New term + 
$this->_termDictionary[$termKey] = $term; + $this->_termDocs[$termKey] = array(); + $this->_termDocs[$termKey][$this->_docCount] = array(); + } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) { + // Existing term, but new term entry + $this->_termDocs[$termKey][$this->_docCount] = array(); + } + $position += $token->getPositionIncrement(); + $this->_termDocs[$termKey][$this->_docCount][] = $position; + } + + $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name, + $tokenCounter)* + $document->boost* + $field->boost )); + } else { + $term = new Zend_Search_Lucene_Index_Term($field->getUtf8Value(), $field->name); + $termKey = $term->key(); + + if (!isset($this->_termDictionary[$termKey])) { + // New term + $this->_termDictionary[$termKey] = $term; + $this->_termDocs[$termKey] = array(); + $this->_termDocs[$termKey][$this->_docCount] = array(); + } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) { + // Existing term, but new term entry + $this->_termDocs[$termKey][$this->_docCount] = array(); + } + $this->_termDocs[$termKey][$this->_docCount][] = 0; // position + + $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name, 1)* + $document->boost* + $field->boost )); + } + } + + if ($field->isStored) { + $storedFields[] = $field; + } + } + + + foreach ($this->_fields as $fieldName => $field) { + if (!$field->isIndexed) { + continue; + } + + if (!isset($this->_norms[$fieldName])) { + $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )), + $this->_docCount); + } + + if (isset($docNorms[$fieldName])){ + $this->_norms[$fieldName] .= $docNorms[$fieldName]; + } else { + $this->_norms[$fieldName] .= chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )); + } + } + + $this->addStoredFields($storedFields); + } + + + /** + * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files + */ + 
protected function _dumpDictionary() + { + ksort($this->_termDictionary, SORT_STRING); + + $this->initializeDictionaryFiles(); + + foreach ($this->_termDictionary as $termId => $term) { + $this->addTerm($term, $this->_termDocs[$termId]); + } + + $this->closeDictionaryFiles(); + } + + + /** + * Close segment, write it to disk and return segment info + * + * @return Zend_Search_Lucene_Index_SegmentInfo + */ + public function close() + { + if ($this->_docCount == 0) { + return null; + } + + $this->_dumpFNM(); + $this->_dumpDictionary(); + + $this->_generateCFS(); + + return new Zend_Search_Lucene_Index_SegmentInfo($this->_name, + $this->_docCount, + $this->_directory); + } + +} + diff --git a/search/Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php b/search/Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php new file mode 100644 index 0000000000..ba0e202522 --- /dev/null +++ b/search/Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php @@ -0,0 +1,94 @@ +dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Index_SegmentInfo */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php'; + +/** Zend_Search_Lucene_Index_SegmentWriter */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Index_SegmentWriter_StreamWriter extends Zend_Search_Lucene_Index_SegmentWriter +{ + /** + * Object constructor. 
+ * + * @param Zend_Search_Lucene_Storage_Directory $directory + * @param string $name + */ + public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name) + { + parent::__construct($directory, $name); + } + + + /** + * Create stored fields files and open them for write + */ + public function createStoredFieldsFiles() + { + $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx'); + $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); + + $this->_files[] = $this->_name . '.fdx'; + $this->_files[] = $this->_name . '.fdt'; + } + + public function addNorm($fieldName, $normVector) + { + if (isset($this->_norms[$fieldName])) { + $this->_norms[$fieldName] .= $normVector; + } else { + $this->_norms[$fieldName] = $normVector; + } + } + + /** + * Close segment, write it to disk and return segment info + * + * @return Zend_Search_Lucene_Index_SegmentInfo + */ + public function close() + { + if ($this->_docCount == 0) { + return null; + } + + $this->_dumpFNM(); + $this->_generateCFS(); + + return new Zend_Search_Lucene_Index_SegmentInfo($this->_name, + $this->_docCount, + $this->_directory); + } +} + diff --git a/search/Zend/Search/Lucene/Index/Term.php b/search/Zend/Search/Lucene/Index/Term.php index 3deffa90ab..465b4ef41b 100644 --- a/search/Zend/Search/Lucene/Index/Term.php +++ b/search/Zend/Search/Lucene/Index/Term.php @@ -15,7 +15,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -31,7 +31,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Index_Term @@ -52,21 +52,57 @@ class Zend_Search_Lucene_Index_Term /** - * @todo docblock + * Object constructor */ - public function __construct( $text, $field = 'contents' ) + public function __construct($text, $field = null) { - $this->field = $field; - $this->text = $text; + $this->field = ($field === null)? Zend_Search_Lucene::getDefaultSearchField() : $field; + $this->text = $text; } /** - * @todo docblock + * Returns term key + * + * @return string */ public function key() { return $this->field . chr(0) . $this->text; } + + /** + * Get term prefix + * + * @param integer $length + * @return string + */ + public static function getPrefix($str, $length) + { + $prefixBytes = 0; + $prefixChars = 0; + while ($prefixBytes < strlen($str) && $prefixChars < $length) { + $charBytes = 1; + if ((ord($str[$prefixBytes]) & 0xC0) == 0xC0) { + $charBytes++; + if (ord($str[$prefixBytes]) & 0x20 ) { + $charBytes++; + if (ord($str[$prefixBytes]) & 0x10 ) { + $charBytes++; + } + } + } + + if ($prefixBytes + $charBytes > strlen($str)) { + // wrong character + break; + } + + $prefixChars++; + $prefixBytes += $charBytes; + } + + return substr($str, 0, $prefixBytes); + } } diff --git a/search/Zend/Search/Lucene/Index/TermInfo.php b/search/Zend/Search/Lucene/Index/TermInfo.php index 7dcfcc8a96..95f7cfc65f 100644 --- a/search/Zend/Search/Lucene/Index/TermInfo.php +++ b/search/Zend/Search/Lucene/Index/TermInfo.php @@ -15,7 +15,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -26,7 +26,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Index_TermInfo diff --git a/search/Zend/Search/Lucene/Index/Writer.php b/search/Zend/Search/Lucene/Index/Writer.php index ef6c65526a..8e32f4e1f6 100644 --- a/search/Zend/Search/Lucene/Index/Writer.php +++ b/search/Zend/Search/Lucene/Index/Writer.php @@ -15,30 +15,32 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ -/** Zend_Search_Lucene_Index_SegmentWriter */ -require_once 'Zend/Search/Lucene/Index/SegmentWriter.php'; +/** Zend_Search_Lucene_Index_SegmentWriter_ */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php'; /** Zend_Search_Lucene_Index_SegmentInfo */ -require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php'; + +/** Zend_Search_Lucene_Index_SegmentMerger */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentMerger.php'; + /** * @category Zend * @package Zend_Search_Lucene * @subpackage Index - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Index_Writer { /** - * @todo Implement segment merger - * @todo Implement mergeFactor, minMergeDocs, maxMergeDocs usage. * @todo Implement Analyzer substitution * @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for * temporary index files @@ -46,74 +48,92 @@ class Zend_Search_Lucene_Index_Writer */ /** - * File system adapter. + * Number of documents required before the buffered in-memory + * documents are written into a new Segment * - * @var Zend_Search_Lucene_Storage_Directory + * Default value is 10 + * + * @var integer */ - private $_directory = null; - + public $maxBufferedDocs = 10; /** - * Index version - * Counts how often the index has been changed by adding or deleting docs + * Largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. + * Larger values are best for batched indexing and speedier searches. + * + * Default value is PHP_INT_MAX * * @var integer */ - private $_version; + public $maxMergeDocs = PHP_INT_MAX; /** - * Segment name counter. - * Used to name new segments . + * Determines how often segment indices are merged by addDocument(). + * + * With smaller values, less RAM is used while indexing, + * and searches on unoptimized indices are faster, + * but indexing speed is slower. + * + * With larger values, more RAM is used during indexing, + * and while searches on unoptimized indices are slower, + * indexing is faster. + * + * Thus larger values (> 10) are best for batch index creation, + * and smaller values (< 10) for indices that are interactively maintained. 
+ * + * Default value is 10 * * @var integer */ - private $_segmentNameCounter; + public $mergeFactor = 10; /** - * Number of the segments in the index + * File system adapter. * - * @var inteher + * @var Zend_Search_Lucene_Storage_Directory */ - private $_segments; + private $_directory = null; + /** - * Determines how often segment indices - * are merged by addDocument(). + * Changes counter. * * @var integer */ - public $mergeFactor; + private $_versionUpdate = 0; /** - * Determines the minimal number of documents required before - * the buffered in-memory documents are merging and a new Segment - * is created. + * List of the segments, created by index writer + * Array of Zend_Search_Lucene_Index_SegmentInfo objects * - * @var integer + * @var array */ - public $minMergeDocs; + private $_newSegments = array(); /** - * Determines the largest number of documents ever merged by addDocument(). + * List of segments to be deleted on commit * - * @var integer + * @var array */ - public $maxMergeDocs; + private $_segmentsToDelete = array(); /** - * List of the segments, created by index writer - * Array of Zend_Search_Lucene_Index_SegmentInfo objects + * Current segment to add documents * - * @var array + * @var Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter */ - private $_newSegments; + private $_currentSegment = null; /** - * Current segment to add documents + * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index. 
+ * + * It's a reference to the corresponding Zend_Search_Lucene::$_segmentInfos array * - * @var Zend_Search_Lucene_Index_SegmentWriter + * @var array Zend_Search_Lucene_Index_SegmentInfo */ - private $_currentSegment; + private $_segmentInfos; /** * List of indexfiles extensions @@ -131,7 +151,8 @@ class Zend_Search_Lucene_Index_Writer '.tvx' => '.tvx', '.tvd' => '.tvd', '.tvf' => '.tvf', - '.del' => '.del' ); + '.del' => '.del', + '.sti' => '.sti' ); /** * Opens the index for writing @@ -142,11 +163,13 @@ class Zend_Search_Lucene_Index_Writer * index or overwrite the existing one. * * @param Zend_Search_Lucene_Storage_Directory $directory + * @param array $segmentInfos * @param boolean $create */ - public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $create = false) + public function __construct(Zend_Search_Lucene_Storage_Directory $directory, &$segmentInfos, $create = false) { - $this->_directory = $directory; + $this->_directory = $directory; + $this->_segmentInfos = &$segmentInfos; if ($create) { foreach ($this->_directory->fileList() as $file) { @@ -159,8 +182,13 @@ class Zend_Search_Lucene_Index_Writer } $segmentsFile = $this->_directory->createFile('segments'); $segmentsFile->writeInt((int)0xFFFFFFFF); - // write version - $segmentsFile->writeLong(0); + + // write version (is initialized by current time + // $segmentsFile->writeLong((int)microtime(true)); + $version = microtime(true); + $segmentsFile->writeInt((int)($version/((double)0xFFFFFFFF + 1))); + $segmentsFile->writeInt((int)($version & 0xFFFFFFFF)); + // write name counter $segmentsFile->writeInt(0); // write segment counter @@ -169,27 +197,13 @@ class Zend_Search_Lucene_Index_Writer $deletableFile = $this->_directory->createFile('deletable'); // write counter $deletableFile->writeInt(0); - - $this->_version = 0; - $this->_segmentNameCounter = 0; - $this->_segments = 0; } else { $segmentsFile = $this->_directory->getFileObject('segments'); $format = 
$segmentsFile->readInt(); if ($format != (int)0xFFFFFFFF) { throw new Zend_Search_Lucene_Exception('Wrong segments file format'); } - - // read version - $this->_version = $segmentsFile->readLong(); - // read counter - $this->_segmentNameCounter = $segmentsFile->readInt(); - // read segment counter - $this->_segments = $segmentsFile->readInt(); } - - $this->_newSegments = array(); - $this->_currentSegment = null; } /** @@ -201,49 +215,218 @@ class Zend_Search_Lucene_Index_Writer { if ($this->_currentSegment === null) { $this->_currentSegment = - new Zend_Search_Lucene_Index_SegmentWriter($this->_directory, $this->_newSegmentName()); + new Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter($this->_directory, $this->_newSegmentName()); } $this->_currentSegment->addDocument($document); - $this->_version++; + + if ($this->_currentSegment->count() >= $this->maxBufferedDocs) { + $this->commit(); + } + + $this->_versionUpdate++; + + $this->_maybeMergeSegments(); + } + + + /** + * Merge segments if necessary + */ + private function _maybeMergeSegments() + { + $segmentSizes = array(); + foreach ($this->_segmentInfos as $segId => $segmentInfo) { + $segmentSizes[$segId] = $segmentInfo->count(); + } + + $mergePool = array(); + $poolSize = 0; + $sizeToMerge = $this->maxBufferedDocs; + asort($segmentSizes, SORT_NUMERIC); + foreach ($segmentSizes as $segId => $size) { + // Check, if segment comes into a new merging block + while ($size >= $sizeToMerge) { + // Merge previous block if it's large enough + if ($poolSize >= $sizeToMerge) { + $this->_mergeSegments($mergePool); + } + $mergePool = array(); + $poolSize = 0; + + $sizeToMerge *= $this->mergeFactor; + + if ($sizeToMerge > $this->maxMergeDocs) { + return; + } + } + + $mergePool[] = $this->_segmentInfos[$segId]; + $poolSize += $size; + } + + if ($poolSize >= $sizeToMerge) { + $this->_mergeSegments($mergePool); + } } + /** + * Merge specified segments + * + * $segments is an array of SegmentInfo objects + * + * @param array 
$segments + */ + private function _mergeSegments($segments) + { + // Try to get exclusive non-blocking lock to the 'index.optimization.lock' + // Skip optimization if it's performed by other process right now + $optimizationLock = $this->_directory->createFile('index.optimization.lock'); + if (!$optimizationLock->lock(LOCK_EX,true)) { + return; + } + + $newName = $this->_newSegmentName(); + $merger = new Zend_Search_Lucene_Index_SegmentMerger($this->_directory, + $newName); + foreach ($segments as $segmentInfo) { + $merger->addSource($segmentInfo); + $this->_segmentsToDelete[$segmentInfo->getName()] = $segmentInfo->getName(); + } + + $newSegment = $merger->merge(); + if ($newSegment !== null) { + $this->_newSegments[$newSegment->getName()] = $newSegment; + } + + $this->commit(); + // optimization is finished + $optimizationLock->unlock(); + } /** * Update segments file by adding current segment to a list - * @todo !!!!!Finish the implementation * * @throws Zend_Search_Lucene_Exception */ private function _updateSegments() { - $segmentsFile = $this->_directory->getFileObject('segments'); - $newSegmentFile = $this->_directory->createFile('segments.new'); + // Get an exclusive index lock + // Wait until all parallel searchers and indexers have finished, + // and block any new searchers while we are updating the segments file + $lock = $this->_directory->getFileObject('index.lock'); + if (!$lock->lock(LOCK_EX)) { + throw new Zend_Search_Lucene_Exception('Can\'t obtain exclusive index lock'); + } - $newSegmentFile->writeInt((int)0xFFFFFFFF); - $newSegmentFile->writeLong($this->_version); - $newSegmentFile->writeInt($this->_segmentNameCounter); - $this->_segments += count($this->_newSegments); - $newSegmentFile->writeInt($this->_segments); + // Do not share file handlers to get file updates from other sessions. 
+ $segmentsFile = $this->_directory->getFileObject('segments', false); + $newSegmentFile = $this->_directory->createFile('segments.new', false); - $segmentsFile->seek(20); - $newSegmentFile->writeBytes($segmentsFile->readBytes($this->_directory->fileLength('segments') - 20)); + // Write format marker + $newSegmentFile->writeInt((int)0xFFFFFFFF); - foreach ($this->_newSegments as $segmentName => $segmentInfo) { - $newSegmentFile->writeString($segmentName); + // Write index version + $segmentsFile->seek(4, SEEK_CUR); + // $version = $segmentsFile->readLong() + $this->_versionUpdate; + // Process version on 32-bit platforms + $versionHigh = $segmentsFile->readInt(); + $versionLow = $segmentsFile->readInt(); + $version = $versionHigh * ((double)0xFFFFFFFF + 1) + + (($versionLow < 0)? (double)0xFFFFFFFF - (-1 - $versionLow) : $versionLow); + $version += $this->_versionUpdate; + $this->_versionUpdate = 0; + $newSegmentFile->writeInt((int)($version/((double)0xFFFFFFFF + 1))); + $newSegmentFile->writeInt((int)($version & 0xFFFFFFFF)); + + // Write segment name counter + $newSegmentFile->writeInt($segmentsFile->readInt()); + + // Get number of segments offset + $numOfSegmentsOffset = $newSegmentFile->tell(); + // Write number of segemnts + $segmentsCount = $segmentsFile->readInt(); + $newSegmentFile->writeInt(0); // Write dummy data (segment counter) + + $segments = array(); + for ($count = 0; $count < $segmentsCount; $count++) { + $segName = $segmentsFile->readString(); + $segSize = $segmentsFile->readInt(); + + if (!in_array($segName, $this->_segmentsToDelete)) { + $newSegmentFile->writeString($segName); + $newSegmentFile->writeInt($segSize); + + $segments[$segName] = $segSize; + } + } + $segmentsFile->close(); + + $segmentsCount = count($segments) + count($this->_newSegments); + + // Remove segments, not listed in $segments (deleted) + // Load segments, not listed in $this->_segmentInfos + foreach ($this->_segmentInfos as $segId => $segInfo) { + if 
(isset($segments[$segInfo->getName()])) { + // Segment is already included into $this->_segmentInfos + unset($segments[$segInfo->getName()]); + } else { + // remove deleted segment from a list + unset($this->_segmentInfos[$segId]); + } + } + // $segments contains a list of segments to load + // do it later + + foreach ($this->_newSegments as $segName => $segmentInfo) { + $newSegmentFile->writeString($segName); $newSegmentFile->writeInt($segmentInfo->count()); + + $this->_segmentInfos[] = $segmentInfo; } + $this->_newSegments = array(); + $newSegmentFile->seek($numOfSegmentsOffset); + $newSegmentFile->writeInt($segmentsCount); // Update segments count + $newSegmentFile->close(); $this->_directory->renameFile('segments.new', 'segments'); + + + // Segments file update is finished + // Switch back to shared lock mode + $lock->lock(LOCK_SH); + + + $fileList = $this->_directory->fileList(); + foreach ($this->_segmentsToDelete as $nameToDelete) { + foreach (self::$_indexExtensions as $ext) { + if ($this->_directory->fileExists($nameToDelete . $ext)) { + $this->_directory->deleteFile($nameToDelete . $ext); + } + } + + foreach ($fileList as $file) { + if (substr($file, 0, strlen($nameToDelete) + 2) == ($nameToDelete . 
'.f') && + ctype_digit( substr($file, strlen($nameToDelete) + 2) )) { + $this->_directory->deleteFile($file); + } + } + } + $this->_segmentsToDelete = array(); + + // Load segments, created by other process + foreach ($segments as $segName => $segSize) { + // Load new segments + $this->_segmentInfos[] = new Zend_Search_Lucene_Index_SegmentInfo($segName, + $segSize, + $this->_directory); + } } /** * Commit current changes - * returns array of new segments - * - * @return array */ public function commit() { @@ -255,14 +438,10 @@ class Zend_Search_Lucene_Index_Writer $this->_currentSegment = null; } - if (count($this->_newSegments) != 0) { + if (count($this->_newSegments) != 0 || + count($this->_segmentsToDelete) != 0) { $this->_updateSegments(); } - - $result = $this->_newSegments; - $this->_newSegments = array(); - - return $result; } @@ -279,43 +458,16 @@ class Zend_Search_Lucene_Index_Writer */ } - - /** - * Returns the number of documents currently in this index. - * - * @return integer - */ - public function docCount($readers) - { - /** - * @todo implementation - */ - } - - - /** - * Flushes all changes to an index and closes all associated files. - * - */ - public function close() - { - /** - * @todo implementation - */ - } - - /** * Merges all segments together into a single segment, optimizing * an index for search. + * Input is an array of Zend_Search_Lucene_Index_SegmentInfo objects * - * return void + * @throws Zend_Search_Lucene_Exception */ public function optimize() { - /** - * @todo implementation - */ + $this->_mergeSegments($this->_segmentInfos); } /** @@ -325,7 +477,30 @@ class Zend_Search_Lucene_Index_Writer */ private function _newSegmentName() { - return '_' . base_convert($this->_segmentNameCounter++, 10, 36); + // Do not share file handler to get file updates from other sessions. 
+ $segmentsFile = $this->_directory->getFileObject('segments', false); + + // Get exclusive segments file lock + // We have guarantee, that we will not intersect with _updateSegments() call + // of other process, because it needs exclusive index lock and waits + // until all other searchers won't stop + if (!$segmentsFile->lock(LOCK_EX)) { + throw new Zend_Search_Lucene_Exception('Can\'t obtain exclusive index lock'); + } + + $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version) + $segmentNameCounter = $segmentsFile->readInt(); + + $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version) + $segmentsFile->writeInt($segmentNameCounter + 1); + + // Flash output to guarantee that wrong value will not be loaded between unlock and + // return (which calls $segmentsFile destructor) + $segmentsFile->flush(); + + $segmentsFile->unlock(); + + return '_' . base_convert($segmentNameCounter, 10, 36); } } diff --git a/search/Zend/Search/Lucene/Interface.php b/search/Zend/Search/Lucene/Interface.php new file mode 100644 index 0000000000..58c75b644f --- /dev/null +++ b/search/Zend/Search/Lucene/Interface.php @@ -0,0 +1,330 @@ + 10) are best for batch index creation, + * and smaller values (< 10) for indices that are interactively maintained. + * + * Default value is 10 + * + * @return integer + */ + public function getMergeFactor(); + + /** + * Set index mergeFactor option + * + * mergeFactor determines how often segment indices are merged by addDocument(). + * With smaller values, less RAM is used while indexing, + * and searches on unoptimized indices are faster, + * but indexing speed is slower. + * With larger values, more RAM is used during indexing, + * and while searches on unoptimized indices are slower, + * indexing is faster. + * Thus larger values (> 10) are best for batch index creation, + * and smaller values (< 10) for indices that are interactively maintained. 
+ * + * Default value is 10 + * + * @param integer $maxMergeDocs + */ + public function setMergeFactor($mergeFactor); + + /** + * Performs a query against the index and returns an array + * of Zend_Search_Lucene_Search_QueryHit objects. + * Input is a string or Zend_Search_Lucene_Search_Query. + * + * @param mixed $query + * @return array Zend_Search_Lucene_Search_QueryHit + * @throws Zend_Search_Lucene_Exception + */ + public function find($query); + + /** + * Returns a list of all unique field names that exist in this index. + * + * @param boolean $indexed + * @return array + */ + public function getFieldNames($indexed = false); + + /** + * Returns a Zend_Search_Lucene_Document object for the document + * number $id in this index. + * + * @param integer|Zend_Search_Lucene_Search_QueryHit $id + * @return Zend_Search_Lucene_Document + */ + public function getDocument($id); + + /** + * Returns true if index contain documents with specified term. + * + * Is used for query optimization. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return boolean + */ + public function hasTerm(Zend_Search_Lucene_Index_Term $term); + + /** + * Returns IDs of all the documents containing term. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termDocs(Zend_Search_Lucene_Index_Term $term); + + /** + * Returns an array of all term freqs. + * Return array structure: array( docId => freq, ...) + * + * @param Zend_Search_Lucene_Index_Term $term + * @return integer + */ + public function termFreqs(Zend_Search_Lucene_Index_Term $term); + + /** + * Returns an array of all term positions in the documents. + * Return array structure: array( docId => array( pos1, pos2, ...), ...) + * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termPositions(Zend_Search_Lucene_Index_Term $term); + + /** + * Returns the number of documents in this index containing the $term. 
+ * + * @param Zend_Search_Lucene_Index_Term $term + * @return integer + */ + public function docFreq(Zend_Search_Lucene_Index_Term $term); + + /** + * Retrive similarity used by index reader + * + * @return Zend_Search_Lucene_Search_Similarity + */ + public function getSimilarity(); + + /** + * Returns a normalization factor for "field, document" pair. + * + * @param integer $id + * @param string $fieldName + * @return float + */ + public function norm($id, $fieldName); + + /** + * Returns true if any documents have been deleted from this index. + * + * @return boolean + */ + public function hasDeletions(); + + /** + * Deletes a document from the index. + * $id is an internal document id + * + * @param integer|Zend_Search_Lucene_Search_QueryHit $id + * @throws Zend_Search_Lucene_Exception + */ + public function delete($id); + + /** + * Adds a document to this index. + * + * @param Zend_Search_Lucene_Document $document + */ + public function addDocument(Zend_Search_Lucene_Document $document); + + /** + * Commit changes resulting from delete() or undeleteAll() operations. + */ + public function commit(); + + /** + * Optimize index. + * + * Merges all segments into one + */ + public function optimize(); + + /** + * Returns an array of all terms in this index. + * + * @return array + */ + public function terms(); + + /** + * Undeletes all documents currently marked as deleted in this index. 
+ */ + public function undeleteAll(); + + + /** + * Add reference to the index object + * + * @internal + */ + public function addReference(); + + /** + * Remove reference from the index object + * + * When reference count becomes zero, index is closed and resources are cleaned up + * + * @internal + */ + public function removeReference(); +} diff --git a/search/Zend/Search/Lucene/PriorityQueue.php b/search/Zend/Search/Lucene/PriorityQueue.php new file mode 100644 index 0000000000..4e844b18f6 --- /dev/null +++ b/search/Zend/Search/Lucene/PriorityQueue.php @@ -0,0 +1,170 @@ +_heap); + $parentId = ($nodeId-1) >> 1; // floor( ($nodeId-1)/2 ) + + while ($nodeId != 0 && $this->_less($element, $this->_heap[$parentId])) { + // Move parent node down + $this->_heap[$nodeId] = $this->_heap[$parentId]; + + // Move pointer to the next level of tree + $nodeId = $parentId; + $parentId = ($nodeId-1) >> 1; // floor( ($nodeId-1)/2 ) + } + + // Put new node into the tree + $this->_heap[$nodeId] = $element; + } + + + /** + * Return least element of the queue + * + * Constant time + * + * @return mixed + */ + public function top() + { + if (count($this->_heap) == 0) { + return null; + } + + return $this->_heap[0]; + } + + + /** + * Removes and return least element of the queue + * + * O(log(N)) time + * + * @return mixed + */ + public function pop() + { + if (count($this->_heap) == 0) { + return null; + } + + $top = $this->_heap[0]; + $lastId = count($this->_heap) - 1; + + /** + * Find appropriate position for last node + */ + $nodeId = 0; // Start from a top + $childId = 1; // First child + + // Choose smaller child + if ($lastId > 2 && $this->_less($this->_heap[2], $this->_heap[1])) { + $childId = 2; + } + + while ($childId < $lastId && + $this->_less($this->_heap[$childId], $this->_heap[$lastId]) + ) { + // Move child node up + $this->_heap[$nodeId] = $this->_heap[$childId]; + + $nodeId = $childId; // Go down + $childId = ($nodeId << 1) + 1; // First child + + // Choose smaller 
child + if (($childId+1) < $lastId && + $this->_less($this->_heap[$childId+1], $this->_heap[$childId]) + ) { + $childId++; + } + } + + // Move last element to the new position + $this->_heap[$nodeId] = $this->_heap[$lastId]; + unset($this->_heap[$lastId]); + + return $top; + } + + + /** + * Clear queue + */ + public function clear() + { + $this->_heap = array(); + } + + + /** + * Compare elements + * + * Returns true, if $el1 is less than $el2; else otherwise + * + * @param mixed $el1 + * @param mixed $el2 + * @return boolean + */ + abstract protected function _less($el1, $el2); +} + diff --git a/search/Zend/Search/Lucene/Proxy.php b/search/Zend/Search/Lucene/Proxy.php new file mode 100644 index 0000000000..53fb150bd4 --- /dev/null +++ b/search/Zend/Search/Lucene/Proxy.php @@ -0,0 +1,468 @@ +dirroot.'/search/Zend/Search/Lucene/Interface.php'; + + +/** + * Proxy class intended to be used in userland. + * + * It tracks, when index object goes out of scope and forces ndex closing + * + * @category Zend + * @package Zend_Search_Lucene + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Proxy implements Zend_Search_Lucene_Interface +{ + /** + * Index object + * + * @var Zend_Search_Lucene_Interface + */ + private $_index; + + /** + * Object constructor + * + * @param Zend_Search_Lucene_Interface $index + */ + public function __construct(Zend_Search_Lucene_Interface $index) + { + $this->_index = $index; + $this->_index->addReference(); + } + + /** + * Object destructor + */ + public function __destruct() + { + if ($this->_index !== null) { + // This code is invoked if Zend_Search_Lucene_Interface object constructor throws an exception + $this->_index->removeReference(); + } + $this->_index = null; + } + + /** + * Returns the Zend_Search_Lucene_Storage_Directory instance for this index. 
+ * + * @return Zend_Search_Lucene_Storage_Directory + */ + public function getDirectory() + { + return $this->_index->getDirectory(); + } + + /** + * Returns the total number of documents in this index (including deleted documents). + * + * @return integer + */ + public function count() + { + return $this->_index->count(); + } + + /** + * Returns one greater than the largest possible document number. + * This may be used to, e.g., determine how big to allocate a structure which will have + * an element for every document number in an index. + * + * @return integer + */ + public function maxDoc() + { + return $this->_index->maxDoc(); + } + + /** + * Returns the total number of non-deleted documents in this index. + * + * @return integer + */ + public function numDocs() + { + return $this->_index->numDocs(); + } + + /** + * Checks, that document is deleted + * + * @param integer $id + * @return boolean + * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range + */ + public function isDeleted($id) + { + return $this->_index->isDeleted($id); + } + + /** + * Set default search field. + * + * Null means, that search is performed through all fields by default + * + * Default value is null + * + * @param string $fieldName + */ + public static function setDefaultSearchField($fieldName) + { + Zend_Search_Lucene::setDefaultSearchField($fieldName); + } + + /** + * Get default search field. 
+ * + * Null means, that search is performed through all fields by default + * + * @return string + */ + public static function getDefaultSearchField() + { + return Zend_Search_Lucene::getDefaultSearchField(); + } + + /** + * Retrieve index maxBufferedDocs option + * + * maxBufferedDocs is a minimal number of documents required before + * the buffered in-memory documents are written into a new Segment + * + * Default value is 10 + * + * @return integer + */ + public function getMaxBufferedDocs() + { + return $this->_index->getMaxBufferedDocs(); + } + + /** + * Set index maxBufferedDocs option + * + * maxBufferedDocs is a minimal number of documents required before + * the buffered in-memory documents are written into a new Segment + * + * Default value is 10 + * + * @param integer $maxBufferedDocs + */ + public function setMaxBufferedDocs($maxBufferedDocs) + { + $this->_index->setMaxBufferedDocs($maxBufferedDocs); + } + + + /** + * Retrieve index maxMergeDocs option + * + * maxMergeDocs is a largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. + * Larger values are best for batched indexing and speedier searches. + * + * Default value is PHP_INT_MAX + * + * @return integer + */ + public function getMaxMergeDocs() + { + return $this->_index->getMaxMergeDocs(); + } + + /** + * Set index maxMergeDocs option + * + * maxMergeDocs is a largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. + * Larger values are best for batched indexing and speedier searches. 
+ * + * Default value is PHP_INT_MAX + * + * @param integer $maxMergeDocs + */ + public function setMaxMergeDocs($maxMergeDocs) + { + $this->_index->setMaxMergeDocs($maxMergeDocs); + } + + + /** + * Retrieve index mergeFactor option + * + * mergeFactor determines how often segment indices are merged by addDocument(). + * With smaller values, less RAM is used while indexing, + * and searches on unoptimized indices are faster, + * but indexing speed is slower. + * With larger values, more RAM is used during indexing, + * and while searches on unoptimized indices are slower, + * indexing is faster. + * Thus larger values (> 10) are best for batch index creation, + * and smaller values (< 10) for indices that are interactively maintained. + * + * Default value is 10 + * + * @return integer + */ + public function getMergeFactor() + { + return $this->_index->getMergeFactor(); + } + + /** + * Set index mergeFactor option + * + * mergeFactor determines how often segment indices are merged by addDocument(). + * With smaller values, less RAM is used while indexing, + * and searches on unoptimized indices are faster, + * but indexing speed is slower. + * With larger values, more RAM is used during indexing, + * and while searches on unoptimized indices are slower, + * indexing is faster. + * Thus larger values (> 10) are best for batch index creation, + * and smaller values (< 10) for indices that are interactively maintained. + * + * Default value is 10 + * + * @param integer $maxMergeDocs + */ + public function setMergeFactor($mergeFactor) + { + $this->_index->setMergeFactor($mergeFactor); + } + + /** + * Performs a query against the index and returns an array + * of Zend_Search_Lucene_Search_QueryHit objects. + * Input is a string or Zend_Search_Lucene_Search_Query. 
+ * + * @param mixed $query + * @return array Zend_Search_Lucene_Search_QueryHit + * @throws Zend_Search_Lucene_Exception + */ + public function find($query) + { + // actual parameter list + $parameters = func_get_args(); + + // invoke $this->_index->find() method with specified parameters + return call_user_func_array(array(&$this->_index, 'find'), $parameters); + } + + /** + * Returns a list of all unique field names that exist in this index. + * + * @param boolean $indexed + * @return array + */ + public function getFieldNames($indexed = false) + { + return $this->_index->getFieldNames($indexed); + } + + /** + * Returns a Zend_Search_Lucene_Document object for the document + * number $id in this index. + * + * @param integer|Zend_Search_Lucene_Search_QueryHit $id + * @return Zend_Search_Lucene_Document + */ + public function getDocument($id) + { + return $this->_index->getDocument($id); + } + + /** + * Returns true if index contain documents with specified term. + * + * Is used for query optimization. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return boolean + */ + public function hasTerm(Zend_Search_Lucene_Index_Term $term) + { + return $this->_index->hasTerm($term); + } + + /** + * Returns IDs of all the documents containing term. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termDocs(Zend_Search_Lucene_Index_Term $term) + { + return $this->_index->termDocs($term); + } + + /** + * Returns an array of all term freqs. + * Return array structure: array( docId => freq, ...) + * + * @param Zend_Search_Lucene_Index_Term $term + * @return integer + */ + public function termFreqs(Zend_Search_Lucene_Index_Term $term) + { + return $this->_index->termFreqs($term); + } + + /** + * Returns an array of all term positions in the documents. + * Return array structure: array( docId => array( pos1, pos2, ...), ...) 
+ * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termPositions(Zend_Search_Lucene_Index_Term $term) + { + return $this->_index->termPositions($term); + } + + /** + * Returns the number of documents in this index containing the $term. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return integer + */ + public function docFreq(Zend_Search_Lucene_Index_Term $term) + { + return $this->_index->docFreq($term); + } + + /** + * Retrive similarity used by index reader + * + * @return Zend_Search_Lucene_Search_Similarity + */ + public function getSimilarity() + { + return $this->_index->getSimilarity(); + } + + /** + * Returns a normalization factor for "field, document" pair. + * + * @param integer $id + * @param string $fieldName + * @return float + */ + public function norm($id, $fieldName) + { + return $this->_index->norm($id, $fieldName); + } + + /** + * Returns true if any documents have been deleted from this index. + * + * @return boolean + */ + public function hasDeletions() + { + return $this->_index->hasDeletions(); + } + + /** + * Deletes a document from the index. + * $id is an internal document id + * + * @param integer|Zend_Search_Lucene_Search_QueryHit $id + * @throws Zend_Search_Lucene_Exception + */ + public function delete($id) + { + return $this->_index->delete($id); + } + + /** + * Adds a document to this index. + * + * @param Zend_Search_Lucene_Document $document + */ + public function addDocument(Zend_Search_Lucene_Document $document) + { + $this->_index->addDocument($document); + } + + /** + * Commit changes resulting from delete() or undeleteAll() operations. + */ + public function commit() + { + $this->_index->commit(); + } + + /** + * Optimize index. + * + * Merges all segments into one + */ + public function optimize() + { + $this->_index->optimize(); + } + + /** + * Returns an array of all terms in this index. 
+ * + * @return array + */ + public function terms() + { + return $this->_index->terms(); + } + + /** + * Undeletes all documents currently marked as deleted in this index. + */ + public function undeleteAll() + { + return $this->_index->undeleteAll(); + } + + /** + * Add reference to the index object + * + * @internal + */ + public function addReference() + { + return $this->_index->addReference(); + } + + /** + * Remove reference from the index object + * + * When reference count becomes zero, index is closed and resources are cleaned up + * + * @internal + */ + public function removeReference() + { + return $this->_index->removeReference(); + } +} diff --git a/search/Zend/Search/TODO.txt b/search/Zend/Search/TODO.txt index 06f7b48792..799a19e960 100644 --- a/search/Zend/Search/TODO.txt +++ b/search/Zend/Search/TODO.txt @@ -1,14 +1,7 @@ @todo -- Improve API: fix ZSearchMultiTermQuery($terms, $signs); - -- Analysis and indexing engine - -- Additional queries: phrase, wildcard, proximity, and range +- Additional queries: wildcard, proximity, and range - Better class-level docblocks (most functions okay) -- Some Windows issues(?) during indexing - -- Finish renaming classes to PEAR-like conventions