*
* @category Zend
* @package Zend_Search
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Framework base exception
*/
-require_once 'Zend/Exception.php';
+require_once $CFG->dirroot.'/search/Zend/Exception.php';
/**
* @category Zend
* @package Zend_Search
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Exception extends Zend_Exception
*
* @category Zend
* @package Zend_Search_Lucene
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
-require_once 'Zend/Search/Lucene/Exception.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
/** Zend_Search_Lucene_Document */
-require_once 'Zend/Search/Lucene/Document.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document.php';
+
+/** Zend_Search_Lucene_Document_Html */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document/Html.php';
/** Zend_Search_Lucene_Storage_Directory */
-require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/Directory/Filesystem.php';
+
+/** Zend_Search_Lucene_Storage_File_Memory */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/File/Memory.php';
/** Zend_Search_Lucene_Index_Term */
-require_once 'Zend/Search/Lucene/Index/Term.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Index_TermInfo */
-require_once 'Zend/Search/Lucene/Index/TermInfo.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/TermInfo.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
-require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
/** Zend_Search_Lucene_Index_FieldInfo */
-require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/FieldInfo.php';
/** Zend_Search_Lucene_Index_Writer */
-require_once 'Zend/Search/Lucene/Index/Writer.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Writer.php';
/** Zend_Search_Lucene_Search_QueryParser */
-require_once 'Zend/Search/Lucene/Search/QueryParser.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParser.php';
/** Zend_Search_Lucene_Search_QueryHit */
-require_once 'Zend/Search/Lucene/Search/QueryHit.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryHit.php';
/** Zend_Search_Lucene_Search_Similarity */
-require_once 'Zend/Search/Lucene/Search/Similarity.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Similarity.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfoPriorityQueue */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php';
+
+
+/** Zend_Search_Lucene_Interface */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Interface.php';
+
+/** Zend_Search_Lucene_Proxy */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Proxy.php';
/**
* @category Zend
* @package Zend_Search_Lucene
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
-class Zend_Search_Lucene
+class Zend_Search_Lucene implements Zend_Search_Lucene_Interface
{
+ /**
+ * Default field name for search
+ *
+ * Null means search through all fields
+ *
+ * @var string
+ */
+ private static $_defaultSearchField = null;
+
/**
* File system adapter.
*
*/
private $_hasChanges = false;
+
+ /**
+ * Index lock object
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_lock;
+
+ /**
+ * Flag signalling that the index is already closed: changes are committed and resources are cleaned up
+ *
+ * @var boolean
+ */
+ private $_closed = false;
+
+ /**
+ * Number of references to the index object
+ *
+ * @var integer
+ */
+ private $_refCount = 0;
+
+
+ /**
+ * Create index
+ *
+ * @param mixed $directory
+ * @return Zend_Search_Lucene_Interface
+ */
+ public static function create($directory)
+ {
+ return new Zend_Search_Lucene_Proxy(new Zend_Search_Lucene($directory, true));
+ }
+
+ /**
+ * Open index
+ *
+ * @param mixed $directory
+ * @return Zend_Search_Lucene_Interface
+ */
+ public static function open($directory)
+ {
+ return new Zend_Search_Lucene_Proxy(new Zend_Search_Lucene($directory, false));
+ }
+
/**
* Opens the index.
*
$this->_closeDirOnExit = true;
}
+
+ // Get a shared lock to the index
+ $this->_lock = $this->_directory->createFile('index.lock');
+
+ $this->_segmentInfos = array();
+
if ($create) {
- $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true);
+ // Throw an exception if index is under processing now
+ if (!$this->_lock->lock(LOCK_EX, true)) {
+ throw new Zend_Search_Lucene_Exception('Can\'t create index. It\'s under processing now');
+ }
+
+ // Writer will create segments file for empty segments list
+ $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, $this->_segmentInfos, true);
+
+ if (!$this->_lock->lock(LOCK_SH)) {
+ throw new Zend_Search_Lucene_Exception('Can\'t reduce lock level from Exclusive to Shared');
+ }
} else {
+ // Wait if index is under switching from one set of segments to another (Index_Writer::_updateSegments())
+ if (!$this->_lock->lock(LOCK_SH)) {
+ throw new Zend_Search_Lucene_Exception('Can\'t obtain shared index lock');
+ }
$this->_writer = null;
}
- $this->_segmentInfos = array();
$segmentsFile = $this->_directory->getFileObject('segments');
}
// read version
- $segmentsFile->readLong();
+ // $segmentsFile->readLong();
+ $segmentsFile->readInt(); $segmentsFile->readInt();
- // read counter
+ // read segment name counter
$segmentsFile->readInt();
$segments = $segmentsFile->readInt();
$segSize = $segmentsFile->readInt();
$this->_docCount += $segSize;
- $this->_segmentInfos[$count] =
+ $this->_segmentInfos[] =
new Zend_Search_Lucene_Index_SegmentInfo($segName,
$segSize,
$this->_directory);
}
}
-
/**
- * Object destructor
+ * Close current index and free resources
*/
- public function __destruct()
+ private function _close()
{
+ if ($this->_closed) {
+ // index is already closed and resources are cleaned up
+ return;
+ }
+
$this->commit();
+ // Free shared lock
+ $this->_lock->unlock();
+
if ($this->_closeDirOnExit) {
$this->_directory->close();
}
+
+ $this->_directory = null;
+ $this->_writer = null;
+ $this->_segmentInfos = null;
+
+ $this->_closed = true;
+ }
+
+ /**
+ * Add reference to the index object
+ *
+ * @internal
+ */
+ public function addReference()
+ {
+ $this->_refCount++;
+ }
+
+ /**
+ * Remove reference from the index object
+ *
+ * When reference count becomes zero, index is closed and resources are cleaned up
+ *
+ * @internal
+ */
+ public function removeReference()
+ {
+ $this->_refCount--;
+
+ if ($this->_refCount == 0) {
+ $this->_close();
+ }
+ }
+
+ /**
+ * Object destructor
+ */
+ public function __destruct()
+ {
+ $this->_close();
}
/**
* Returns an instance of Zend_Search_Lucene_Index_Writer for the index
*
+ * @internal
* @return Zend_Search_Lucene_Index_Writer
*/
public function getIndexWriter()
{
if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
- $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
+ $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, $this->_segmentInfos);
}
return $this->_writer;
/**
- * Returns the total number of documents in this index.
+ * Returns the total number of documents in this index (including deleted documents).
*
* @return integer
*/
return $this->_docCount;
}
+ /**
+ * Returns one greater than the largest possible document number.
+ * This may be used to, e.g., determine how big to allocate a structure which will have
+ * an element for every document number in an index.
+ *
+ * @return integer
+ */
+ public function maxDoc()
+ {
+ return $this->count();
+ }
+
+ /**
+ * Returns the total number of non-deleted documents in this index.
+ *
+ * @return integer
+ */
+ public function numDocs()
+ {
+ $numDocs = 0;
+
+ foreach ($this->_segmentInfos as $segmentInfo) {
+ $numDocs += $segmentInfo->numDocs();
+ }
+
+ return $numDocs;
+ }
+
+ /**
+ * Checks whether the document is deleted
+ *
+ * @param integer $id
+ * @return boolean
+ * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
+ */
+ public function isDeleted($id)
+ {
+ if ($id >= $this->_docCount) {
+ throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
+ }
+
+ $segmentStartId = 0;
+ foreach ($this->_segmentInfos as $segmentInfo) {
+ if ($segmentStartId + $segmentInfo->count() > $id) {
+ break;
+ }
+
+ $segmentStartId += $segmentInfo->count();
+ }
+
+ return $segmentInfo->isDeleted($id - $segmentStartId);
+ }
+
+ /**
+ * Set default search field.
+ *
+ * Null means that the search is performed through all fields by default
+ *
+ * Default value is null
+ *
+ * @param string $fieldName
+ */
+ public static function setDefaultSearchField($fieldName)
+ {
+ self::$_defaultSearchField = $fieldName;
+ }
+
+ /**
+ * Get default search field.
+ *
+ * Null means that the search is performed through all fields by default
+ *
+ * @return string
+ */
+ public static function getDefaultSearchField()
+ {
+ return self::$_defaultSearchField;
+ }
+
+ /**
+ * Retrieve index maxBufferedDocs option
+ *
+ * maxBufferedDocs is the minimal number of documents required before
+ * the buffered in-memory documents are written into a new Segment
+ *
+ * Default value is 10
+ *
+ * @return integer
+ */
+ public function getMaxBufferedDocs()
+ {
+ return $this->getIndexWriter()->maxBufferedDocs;
+ }
+
+ /**
+ * Set index maxBufferedDocs option
+ *
+ * maxBufferedDocs is the minimal number of documents required before
+ * the buffered in-memory documents are written into a new Segment
+ *
+ * Default value is 10
+ *
+ * @param integer $maxBufferedDocs
+ */
+ public function setMaxBufferedDocs($maxBufferedDocs)
+ {
+ $this->getIndexWriter()->maxBufferedDocs = $maxBufferedDocs;
+ }
+
+ /**
+ * Retrieve index maxMergeDocs option
+ *
+ * maxMergeDocs is the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ * Default value is PHP_INT_MAX
+ *
+ * @return integer
+ */
+ public function getMaxMergeDocs()
+ {
+ return $this->getIndexWriter()->maxMergeDocs;
+ }
+
+ /**
+ * Set index maxMergeDocs option
+ *
+ * maxMergeDocs is the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ * Default value is PHP_INT_MAX
+ *
+ * @param integer $maxMergeDocs
+ */
+ public function setMaxMergeDocs($maxMergeDocs)
+ {
+ $this->getIndexWriter()->maxMergeDocs = $maxMergeDocs;
+ }
+
+ /**
+ * Retrieve index mergeFactor option
+ *
+ * mergeFactor determines how often segment indices are merged by addDocument().
+ * With smaller values, less RAM is used while indexing,
+ * and searches on unoptimized indices are faster,
+ * but indexing speed is slower.
+ * With larger values, more RAM is used during indexing,
+ * and while searches on unoptimized indices are slower,
+ * indexing is faster.
+ * Thus larger values (> 10) are best for batch index creation,
+ * and smaller values (< 10) for indices that are interactively maintained.
+ *
+ * Default value is 10
+ *
+ * @return integer
+ */
+ public function getMergeFactor()
+ {
+ return $this->getIndexWriter()->mergeFactor;
+ }
+
+ /**
+ * Set index mergeFactor option
+ *
+ * mergeFactor determines how often segment indices are merged by addDocument().
+ * With smaller values, less RAM is used while indexing,
+ * and searches on unoptimized indices are faster,
+ * but indexing speed is slower.
+ * With larger values, more RAM is used during indexing,
+ * and while searches on unoptimized indices are slower,
+ * indexing is faster.
+ * Thus larger values (> 10) are best for batch index creation,
+ * and smaller values (< 10) for indices that are interactively maintained.
+ *
+ * Default value is 10
+ *
+ * @param integer $mergeFactor
+ */
+ public function setMergeFactor($mergeFactor)
+ {
+ $this->getIndexWriter()->mergeFactor = $mergeFactor;
+ }
/**
* Performs a query against the index and returns an array
* Input is a string or Zend_Search_Lucene_Search_Query.
*
* @param mixed $query
- * @return array ZSearchHit
+ * @return array Zend_Search_Lucene_Search_QueryHit
+ * @throws Zend_Search_Lucene_Exception
*/
public function find($query)
{
$this->commit();
- $hits = array();
+ $hits = array();
$scores = array();
+ $ids = array();
+
+ $query = $query->rewrite($this)->optimize($this);
+
+ $query->execute($this);
+
+ $topScore = 0;
- $docNum = $this->count();
- for( $count=0; $count < $docNum; $count++ ) {
- $docScore = $query->score( $count, $this);
+ foreach ($query->matchedDocs() as $id => $num) {
+ $docScore = $query->score($id, $this);
if( $docScore != 0 ) {
$hit = new Zend_Search_Lucene_Search_QueryHit($this);
- $hit->id = $count;
+ $hit->id = $id;
$hit->score = $docScore;
- $hits[] = $hit;
+ $hits[] = $hit;
+ $ids[] = $id;
$scores[] = $docScore;
+
+ if ($docScore > $topScore) {
+ $topScore = $docScore;
+ }
+ }
+ }
+
+ if (count($hits) == 0) {
+ // skip sorting, which may cause an error when there are no hits
+ return array();
+ }
+
+ if ($topScore > 1) {
+ foreach ($hits as $hit) {
+ $hit->score /= $topScore;
+ }
+ }
+
+ if (func_num_args() == 1) {
+ // sort by scores
+ array_multisort($scores, SORT_DESC, SORT_NUMERIC,
+ $ids, SORT_ASC, SORT_NUMERIC,
+ $hits);
+ } else {
+ // sort by given field names
+
+ $argList = func_get_args();
+ $fieldNames = $this->getFieldNames();
+ $sortArgs = array();
+
+ for ($count = 1; $count < count($argList); $count++) {
+ $fieldName = $argList[$count];
+
+ if (!is_string($fieldName)) {
+ throw new Zend_Search_Lucene_Exception('Field name must be a string.');
+ }
+
+ if (!in_array($fieldName, $fieldNames)) {
+ throw new Zend_Search_Lucene_Exception('Wrong field name.');
+ }
+
+ $valuesArray = array();
+ foreach ($hits as $hit) {
+ try {
+ $value = $hit->getDocument()->getFieldValue($fieldName);
+ } catch (Zend_Search_Lucene_Exception $e) {
+ if (strpos($e->getMessage(), 'not found') === false) {
+ throw $e;
+ } else {
+ $value = null;
+ }
+ }
+
+ $valuesArray[] = $value;
+ }
+
+ $sortArgs[] = $valuesArray;
+
+ if ($count + 1 < count($argList) && is_integer($argList[$count+1])) {
+ $count++;
+ $sortArgs[] = $argList[$count];
+
+ if ($count + 1 < count($argList) && is_integer($argList[$count+1])) {
+ $count++;
+ $sortArgs[] = $argList[$count];
+ } else {
+ if ($argList[$count] == SORT_ASC || $argList[$count] == SORT_DESC) {
+ $sortArgs[] = SORT_REGULAR;
+ } else {
+ $sortArgs[] = SORT_ASC;
+ }
+ }
+ } else {
+ $sortArgs[] = SORT_ASC;
+ $sortArgs[] = SORT_REGULAR;
+ }
}
+
+ // Sort by id's if values are equal
+ $sortArgs[] = $ids;
+ $sortArgs[] = SORT_ASC;
+ $sortArgs[] = SORT_NUMERIC;
+
+ // Array to be sorted
+ $sortArgs[] = &$hits;
+
+ // Do sort
+ call_user_func_array('array_multisort', $sortArgs);
}
- array_multisort($scores, SORT_DESC, SORT_REGULAR, $hits);
return $hits;
}
throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
}
- $segCount = 0;
- $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count();
- while( $nextSegmentStartId <= $id ) {
- $segCount++;
- $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count();
+ $segmentStartId = 0;
+ foreach ($this->_segmentInfos as $segmentInfo) {
+ if ($segmentStartId + $segmentInfo->count() > $id) {
+ break;
+ }
+
+ $segmentStartId += $segmentInfo->count();
}
- $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count();
- $fdxFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdx');
+ $fdxFile = $segmentInfo->openCompoundFile('.fdx');
$fdxFile->seek( ($id-$segmentStartId)*8, SEEK_CUR );
$fieldValuesPosition = $fdxFile->readLong();
- $fdtFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdt');
- $fdtFile->seek( $fieldValuesPosition, SEEK_CUR );
+ $fdtFile = $segmentInfo->openCompoundFile('.fdt');
+ $fdtFile->seek($fieldValuesPosition, SEEK_CUR);
$fieldCount = $fdtFile->readVInt();
$doc = new Zend_Search_Lucene_Document();
- for( $count = 0; $count < $fieldCount; $count++ ) {
+ for ($count = 0; $count < $fieldCount; $count++) {
$fieldNum = $fdtFile->readVInt();
$bits = $fdtFile->readByte();
- $fieldInfo = $this->_segmentInfos[ $segCount ]->getField($fieldNum);
+ $fieldInfo = $segmentInfo->getField($fieldNum);
- if( !($bits & 2) ) { // Text data
+ if (!($bits & 2)) { // Text data
$field = new Zend_Search_Lucene_Field($fieldInfo->name,
$fdtFile->readString(),
+ 'UTF-8',
true,
$fieldInfo->isIndexed,
$bits & 1 );
- } else {
+ } else { // Binary data
$field = new Zend_Search_Lucene_Field($fieldInfo->name,
$fdtFile->readBinary(),
+ '',
true,
$fieldInfo->isIndexed,
- $bits & 1 );
+ $bits & 1,
+ true );
}
$doc->addField($field);
/**
- * Returns an array of all the documents which contain term.
+ * Returns true if the index contains documents with the specified term.
+ *
+ * Is used for query optimization.
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return boolean
+ */
+ public function hasTerm(Zend_Search_Lucene_Index_Term $term)
+ {
+ foreach ($this->_segmentInfos as $segInfo) {
+ if ($segInfo->getTermInfo($term) instanceof Zend_Search_Lucene_Index_TermInfo) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Returns IDs of all the documents containing term.
*
* @param Zend_Search_Lucene_Index_Term $term
* @return array
/**
- * Returns an array of all term positions in the documents.
- * Return array structure: array( docId => array( pos1, pos2, ...), ...)
+ * Returns an array of all term freqs.
+ * Result array structure: array(docId => freq, ...)
*
* @param Zend_Search_Lucene_Index_Term $term
- * @return array
+ * @return array
*/
- public function termPositions(Zend_Search_Lucene_Index_Term $term)
+ public function termFreqs(Zend_Search_Lucene_Index_Term $term)
{
$result = array();
$segmentStartDocId = 0;
- foreach( $this->_segmentInfos as $segInfo ) {
- $termInfo = $segInfo->getTermInfo($term);
-
- if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
- $segmentStartDocId += $segInfo->count();
- continue;
- }
-
- $frqFile = $segInfo->openCompoundFile('.frq');
- $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
- $freqs = array();
- $docId = 0;
-
- for( $count = 0; $count < $termInfo->docFreq; $count++ ) {
- $docDelta = $frqFile->readVInt();
- if( $docDelta % 2 == 1 ) {
- $docId += ($docDelta-1)/2;
- $freqs[ $docId ] = 1;
- } else {
- $docId += $docDelta/2;
- $freqs[ $docId ] = $frqFile->readVInt();
- }
- }
+ foreach ($this->_segmentInfos as $segmentInfo) {
+ $result += $segmentInfo->termFreqs($term, $segmentStartDocId);
- $prxFile = $segInfo->openCompoundFile('.prx');
- $prxFile->seek($termInfo->proxPointer,SEEK_CUR);
- foreach ($freqs as $docId => $freq) {
- $termPosition = 0;
- $positions = array();
+ $segmentStartDocId += $segmentInfo->count();
+ }
- for ($count = 0; $count < $freq; $count++ ) {
- $termPosition += $prxFile->readVInt();
- $positions[] = $termPosition;
- }
+ return $result;
+ }
- $result[ $segmentStartDocId + $docId ] = $positions;
- }
+ /**
+ * Returns an array of all term positions in the documents.
+ * Result array structure: array(docId => array(pos1, pos2, ...), ...)
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return array
+ */
+ public function termPositions(Zend_Search_Lucene_Index_Term $term)
+ {
+ $result = array();
+ $segmentStartDocId = 0;
+ foreach ($this->_segmentInfos as $segmentInfo) {
+ $result += $segmentInfo->termPositions($term, $segmentStartDocId);
- $segmentStartDocId += $segInfo->count();
+ $segmentStartDocId += $segmentInfo->count();
}
return $result;
*
* @param integer $id
* @param string $fieldName
- * @return Zend_Search_Lucene_Document
+ * @return float
*/
- public function norm( $id, $fieldName )
+ public function norm($id, $fieldName)
{
if ($id >= $this->_docCount) {
return null;
throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
}
- $segCount = 0;
- $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count();
- while( $nextSegmentStartId <= $id ) {
- $segCount++;
- $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count();
+ $segmentStartId = 0;
+ foreach ($this->_segmentInfos as $segmentInfo) {
+ if ($segmentStartId + $segmentInfo->count() > $id) {
+ break;
+ }
+
+ $segmentStartId += $segmentInfo->count();
}
+ $segmentInfo->delete($id - $segmentStartId);
$this->_hasChanges = true;
- $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count();
- $this->_segmentInfos[ $segCount ]->delete($id - $segmentStartId);
}
*/
public function addDocument(Zend_Search_Lucene_Document $document)
{
- if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
- $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
- }
-
- $this->_writer->addDocument($document);
+ $this->getIndexWriter()->addDocument($document);
+ $this->_docCount++;
}
+ /**
+ * Update document counter
+ */
+ private function _updateDocCount()
+ {
+ $this->_docCount = 0;
+ foreach ($this->_segmentInfos as $segInfo) {
+ $this->_docCount += $segInfo->count();
+ }
+ }
+
/**
* Commit changes resulting from delete() or undeleteAll() operations.
*
- * @todo delete() and undeleteAll processing.
+ * @todo undeleteAll processing.
*/
public function commit()
{
}
if ($this->_writer !== null) {
- foreach ($this->_writer->commit() as $segmentName => $segmentInfo) {
- if ($segmentInfo !== null) {
- $this->_segmentInfos[] = $segmentInfo;
- $this->_docCount += $segmentInfo->count();
- } else {
- foreach ($this->_segmentInfos as $segId => $segInfo) {
- if ($segInfo->getName() == $segmentName) {
- unset($this->_segmentInfos[$segId]);
- }
- }
- }
- }
+ $this->_writer->commit();
+
+ $this->_updateDocCount();
}
}
- /*************************************************************************
- @todo UNIMPLEMENTED
- *************************************************************************/
+ /**
+ * Optimize index.
+ *
+ * Merges all segments into one
+ */
+ public function optimize()
+ {
+ // Commit changes if any changes have been made
+ $this->commit();
+
+ if (count($this->_segmentInfos) > 1 || $this->hasDeletions()) {
+ $this->getIndexWriter()->optimize();
+ $this->_updateDocCount();
+ }
+ }
+
/**
* Returns an array of all terms in this index.
*
- * @todo Implementation
* @return array
*/
public function terms()
{
- return array();
+ $result = array();
+
+ $segmentInfoQueue = new Zend_Search_Lucene_Index_SegmentInfoPriorityQueue();
+
+ foreach ($this->_segmentInfos as $segmentInfo) {
+ $segmentInfo->reset();
+
+ // Skip "empty" segments
+ if ($segmentInfo->currentTerm() !== null) {
+ $segmentInfoQueue->put($segmentInfo);
+ }
+ }
+
+ while (($segmentInfo = $segmentInfoQueue->pop()) !== null) {
+ if ($segmentInfoQueue->top() === null ||
+ $segmentInfoQueue->top()->currentTerm()->key() !=
+ $segmentInfo->currentTerm()->key()) {
+ // We got new term
+ $result[] = $segmentInfo->currentTerm();
+ }
+
+ $segmentInfo->nextTerm();
+ // check, if segment dictionary is finished
+ if ($segmentInfo->currentTerm() !== null) {
+ // Put segment back into the priority queue
+ $segmentInfoQueue->put($segmentInfo);
+ }
+ }
+
+ return $result;
}
+ /*************************************************************************
+ @todo UNIMPLEMENTED
+ *************************************************************************/
/**
* Undeletes all documents currently marked as deleted in this index.
*
*/
public function undeleteAll()
{}
-}
\ No newline at end of file
+}
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Token */
-require_once 'Zend/Search/Lucene/Analysis/Token.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Token.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
+
+/** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';
+/** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
*
* @var Zend_Search_Lucene_Analysis_Analyzer
*/
- static private $_defaultImpl;
+ private static $_defaultImpl;
+
+ /**
+ * Input string
+ *
+ * @var string
+ */
+ protected $_input = null;
+
+ /**
+ * Input string encoding
+ *
+ * @var string
+ */
+ protected $_encoding = '';
/**
* Tokenize text to a terms
* Returns array of Zend_Search_Lucene_Analysis_Token objects
*
+ * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
+ *
* @param string $data
* @return array
*/
- abstract public function tokenize($data);
+ public function tokenize($data, $encoding = '')
+ {
+ $this->setInput($data, $encoding);
+
+ $tokenList = array();
+ while (($nextToken = $this->nextToken()) !== null) {
+ $tokenList[] = $nextToken;
+ }
+
+ return $tokenList;
+ }
+
+
+ /**
+ * Tokenization stream API
+ * Set input
+ *
+ * @param string $data
+ */
+ public function setInput($data, $encoding = '')
+ {
+ $this->_input = $data;
+ $this->_encoding = $encoding;
+ $this->reset();
+ }
+
+ /**
+ * Reset token stream
+ */
+ abstract public function reset();
+
+ /**
+ * Tokenization stream API
+ * Get next token
+ * Returns null at the end of stream
+ *
+ * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
+ *
+ * @return Zend_Search_Lucene_Analysis_Token|null
+ */
+ abstract public function nextToken();
+
+
/**
*
* @param Zend_Search_Lucene_Analysis_Analyzer $similarity
*/
- static public function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
+ public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
{
self::$_defaultImpl = $analyzer;
}
*
* @return Zend_Search_Lucene_Analysis_Analyzer
*/
- static public function getDefault()
+ public static function getDefault()
{
if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
return self::$_defaultImpl;
}
-
}
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_Lucene_Analysis_Analyzer
}
/**
- * Apply filters to the token.
+ * Apply filters to the token. Can return null when the token was removed.
*
* @param Zend_Search_Lucene_Analysis_Token $token
* @return Zend_Search_Lucene_Analysis_Token
{
foreach ($this->_filters as $filter) {
$token = $filter->normalize($token);
+
+ // resulting token can be null if the filter removed it
+ if (is_null($token)) {
+ return null;
+ }
}
return $token;
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Text extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
/**
- * Tokenize text to a terms
- * Returns array of Zend_Search_Lucene_Analysis_Token objects
+ * Current position in a stream
*
- * @param string $data
- * @return array
+ * @var integer
*/
- public function tokenize($data)
+ private $_position;
+
+ /**
+ * Reset token stream
+ */
+ public function reset()
{
- $tokenStream = array();
+ $this->_position = 0;
- $position = 0;
- while ($position < strlen($data)) {
- // skip white space
- while ($position < strlen($data) && !ctype_alpha( $data{$position} )) {
- $position++;
- }
+ if ($this->_input === null) {
+ return;
+ }
- $termStartPosition = $position;
+ // convert input into ascii
+ $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
+ $this->_encoding = 'ASCII';
+ }
- // read token
- while ($position < strlen($data) && ctype_alpha( $data{$position} )) {
- $position++;
- }
+ /**
+ * Tokenization stream API
+ * Get next token
+ * Returns null at the end of stream
+ *
+ * @return Zend_Search_Lucene_Analysis_Token|null
+ */
+ public function nextToken()
+ {
+ if ($this->_input === null) {
+ return null;
+ }
- // Empty token, end of stream.
- if ($position == $termStartPosition) {
- break;
+
+ do {
+ if (! preg_match('/[a-zA-Z]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position)) {
+ // It covers both cases a) there are no matches (preg_match(...) === 0)
+ // b) error occured (preg_match(...) === FALSE)
+ return null;
}
- $token = new Zend_Search_Lucene_Analysis_Token(substr($data,
- $termStartPosition,
- $position-$termStartPosition),
- $termStartPosition,
- $position);
- $tokenStream[] = $this->normalize($token);
- }
+ $str = $match[0][0];
+ $pos = $match[0][1];
+ $endpos = $pos + strlen($str);
+
+ $this->_position = $endpos;
+
+ $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($str, $pos, $endpos));
+ } while ($token === null); // try again if token is skipped
- return $tokenStream;
+ return $token;
}
}
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
-require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
+
+
+/**
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum extends Zend_Search_Lucene_Analysis_Analyzer_Common
+{
+    /**
+     * Offset in the input from which the next token search starts
+     *
+     * @var integer
+     */
+    private $_position;
+
+    /**
+     * Reset token stream
+     *
+     * Rewinds the search offset and, when input is set, transliterates
+     * the input to ASCII so that the tokenizing pattern can be applied to it.
+     */
+    public function reset()
+    {
+        $this->_position = 0;
+
+        if ($this->_input !== null) {
+            // convert input into ascii
+            $this->_input    = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
+            $this->_encoding = 'ASCII';
+        }
+    }
+
+    /**
+     * Tokenization stream API
+     * Get next token (a run of ASCII letters and digits)
+     * Returns null at the end of stream
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function nextToken()
+    {
+        if ($this->_input === null) {
+            return null;
+        }
+
+        while (true) {
+            $found = preg_match('/[a-zA-Z0-9]+/', $this->_input, $hit, PREG_OFFSET_CAPTURE, $this->_position);
+            if (!$found) {
+                // Covers both cases: a) there are no more matches (preg_match(...) === 0)
+                //                    b) an error occurred      (preg_match(...) === FALSE)
+                return null;
+            }
+
+            list($termText, $startOffset) = $hit[0];
+            $endOffset = $startOffset + strlen($termText);
+
+            // the next search continues right after this match
+            $this->_position = $endOffset;
+
+            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($termText, $startOffset, $endOffset));
+            if ($token !== null) {
+                return $token;
+            }
+            // the token was removed by a filter: look for the next one
+        }
+    }
+}
+
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
+
+/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
+
+
+/**
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum
+{
+    /**
+     * Object constructor
+     *
+     * Registers the lower case filter, so every token produced by the
+     * parent TextNum analyzer is passed through it.
+     */
+    public function __construct()
+    {
+        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase();
+        $this->addFilter($lowerCaseFilter);
+    }
+}
+
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
+
+
+/**
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
+{
+    /**
+     * Current char position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_position;
+
+    /**
+     * Current binary position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_bytePosition;
+
+    /**
+     * Stream length (in characters, as reported by iconv_strlen())
+     *
+     * @var integer
+     */
+    private $_streamLength;
+
+    /**
+     * Reset token stream
+     *
+     * Rewinds both positions, converts the input into UTF-8 if it is in
+     * another encoding and measures its length in characters.
+     */
+    public function reset()
+    {
+        $this->_position     = 0;
+        $this->_bytePosition = 0;
+        $this->_streamLength = 0;
+
+        // Nothing to convert or measure while no input is set
+        // (nextToken() returns null in this case)
+        if ($this->_input === null) {
+            return;
+        }
+
+        // convert input into UTF-8
+        if (strcasecmp($this->_encoding, 'utf8' ) != 0  &&
+            strcasecmp($this->_encoding, 'utf-8') != 0 ) {
+            $this->_input = iconv($this->_encoding, 'UTF-8', $this->_input);
+            $this->_encoding = 'UTF-8';
+        }
+
+        // Get UTF-8 string length.
+        // It also checks if it's a correct utf-8 string
+        $this->_streamLength = iconv_strlen($this->_input, 'UTF-8');
+    }
+
+    /**
+     * Check, that character is a letter
+     *
+     * Any multi-byte character is treated as a letter; single-byte
+     * characters are checked with ctype_alpha().
+     *
+     * @param string $char
+     * @return boolean
+     */
+    private static function _isAlpha($char)
+    {
+        if (strlen($char) > 1) {
+            // It's an UTF-8 character
+            return true;
+        }
+
+        return ctype_alpha($char);
+    }
+
+    /**
+     * Get next UTF-8 char
+     *
+     * Reads one character at the current byte position and advances both
+     * the byte position and the char position past it.
+     *
+     * @return string
+     */
+    private function _nextChar()
+    {
+        $char = $this->_input[$this->_bytePosition++];
+
+        // Lead byte of a multi-byte sequence: its high bits tell how many bytes follow
+        if (( ord($char) & 0xC0 ) == 0xC0) {
+            $addBytes = 1;
+            if (ord($char) & 0x20 ) {
+                $addBytes++;
+                if (ord($char) & 0x10 ) {
+                    $addBytes++;
+                }
+            }
+            $char .= substr($this->_input, $this->_bytePosition, $addBytes);
+            $this->_bytePosition += $addBytes;
+        }
+
+        $this->_position++;
+
+        return $char;
+    }
+
+    /**
+     * Tokenization stream API
+     * Get next token
+     * Returns null at the end of stream
+     *
+     * Token offsets are char positions; the end offset is exclusive.
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function nextToken()
+    {
+        if ($this->_input === null) {
+            return null;
+        }
+
+        while ($this->_position < $this->_streamLength) {
+            // skip white space
+            while ($this->_position < $this->_streamLength &&
+                   !self::_isAlpha($char = $this->_nextChar())) {
+                $char = '';
+            }
+
+            $termStartPosition = $this->_position - 1;
+            $termText          = $char;
+            // Track the end explicitly: the read loop below may stop either on a
+            // delimiter (already consumed) or at the end of stream, so it can't
+            // be derived from the current position afterwards.
+            $termEndPosition   = $this->_position;
+
+            // read token
+            while ($this->_position < $this->_streamLength &&
+                   self::_isAlpha($char = $this->_nextChar())) {
+                $termText        .= $char;
+                $termEndPosition  = $this->_position;
+            }
+
+            // Empty token, end of stream.
+            if ($termText === '') {
+                return null;
+            }
+
+            $token = new Zend_Search_Lucene_Analysis_Token(
+                                      $termText,
+                                      $termStartPosition,
+                                      $termEndPosition);
+            $token = $this->normalize($token);
+            if ($token !== null) {
+                return $token;
+            }
+            // Continue if token is skipped
+        }
+
+        return null;
+    }
+}
+
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
+
+
+/**
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num extends Zend_Search_Lucene_Analysis_Analyzer_Common
+{
+    /**
+     * Current char position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_position;
+
+    /**
+     * Current binary position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_bytePosition;
+
+    /**
+     * Stream length (in characters, as reported by iconv_strlen())
+     *
+     * @var integer
+     */
+    private $_streamLength;
+
+    /**
+     * Reset token stream
+     *
+     * Rewinds both positions, converts the input into UTF-8 if it is in
+     * another encoding and measures its length in characters.
+     */
+    public function reset()
+    {
+        $this->_position     = 0;
+        $this->_bytePosition = 0;
+        $this->_streamLength = 0;
+
+        // Nothing to convert or measure while no input is set
+        // (nextToken() returns null in this case)
+        if ($this->_input === null) {
+            return;
+        }
+
+        // convert input into UTF-8
+        if (strcasecmp($this->_encoding, 'utf8' ) != 0  &&
+            strcasecmp($this->_encoding, 'utf-8') != 0 ) {
+            $this->_input = iconv($this->_encoding, 'UTF-8', $this->_input);
+            $this->_encoding = 'UTF-8';
+        }
+
+        // Get UTF-8 string length.
+        // It also checks if it's a correct utf-8 string
+        $this->_streamLength = iconv_strlen($this->_input, 'UTF-8');
+    }
+
+    /**
+     * Check, that character is a letter or a digit
+     *
+     * Any multi-byte character is treated as a letter; single-byte
+     * characters are checked with ctype_alnum().
+     *
+     * @param string $char
+     * @return boolean
+     */
+    private static function _isAlNum($char)
+    {
+        if (strlen($char) > 1) {
+            // It's an UTF-8 character
+            return true;
+        }
+
+        return ctype_alnum($char);
+    }
+
+    /**
+     * Get next UTF-8 char
+     *
+     * Reads one character at the current byte position and advances both
+     * the byte position and the char position past it.
+     *
+     * @return string
+     */
+    private function _nextChar()
+    {
+        $char = $this->_input[$this->_bytePosition++];
+
+        // Lead byte of a multi-byte sequence: its high bits tell how many bytes follow
+        if (( ord($char) & 0xC0 ) == 0xC0) {
+            $addBytes = 1;
+            if (ord($char) & 0x20 ) {
+                $addBytes++;
+                if (ord($char) & 0x10 ) {
+                    $addBytes++;
+                }
+            }
+            $char .= substr($this->_input, $this->_bytePosition, $addBytes);
+            $this->_bytePosition += $addBytes;
+        }
+
+        $this->_position++;
+
+        return $char;
+    }
+
+    /**
+     * Tokenization stream API
+     * Get next token
+     * Returns null at the end of stream
+     *
+     * Token offsets are char positions; the end offset is exclusive.
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function nextToken()
+    {
+        if ($this->_input === null) {
+            return null;
+        }
+
+        while ($this->_position < $this->_streamLength) {
+            // skip white space
+            while ($this->_position < $this->_streamLength &&
+                   !self::_isAlNum($char = $this->_nextChar())) {
+                $char = '';
+            }
+
+            $termStartPosition = $this->_position - 1;
+            $termText          = $char;
+            // Track the end explicitly: the read loop below may stop either on a
+            // delimiter (already consumed) or at the end of stream, so it can't
+            // be derived from the current position afterwards.
+            $termEndPosition   = $this->_position;
+
+            // read token
+            while ($this->_position < $this->_streamLength &&
+                   self::_isAlNum($char = $this->_nextChar())) {
+                $termText        .= $char;
+                $termEndPosition  = $this->_position;
+            }
+
+            // Empty token, end of stream.
+            if ($termText === '') {
+                return null;
+            }
+
+            $token = new Zend_Search_Lucene_Analysis_Token(
+                                      $termText,
+                                      $termStartPosition,
+                                      $termEndPosition);
+            $token = $this->normalize($token);
+            if ($token !== null) {
+                return $token;
+            }
+            // Continue if token is skipped
+        }
+
+        return null;
+    }
+}
+
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Token
*/
private $_endOffset;
- /**
- * Lexical type.
- *
- * @var string
- */
- private $_type;
-
/**
* The position of this token relative to the previous Token.
*
* @param integer $end
* @param string $type
*/
- public function __construct($text, $start, $end, $type = 'word' )
+ public function __construct($text, $start, $end)
{
$this->_termText = $text;
$this->_startOffset = $start;
$this->_endOffset = $end;
- $this->_type = $type;
$this->_positionIncrement = 1;
}
{
return $this->_endOffset;
}
-
- /**
- * Returns this Token's lexical type. Defaults to 'word'.
- *
- * @return string
- */
- public function getType()
- {
- return $this->_type;
- }
}
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Token */
-require_once 'Zend/Search/Lucene/Analysis/Token.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Token.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_TokenFilter */
-require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
*/
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
- $newToken = new Zend_Search_Lucene_Analysis_Token(strtolower( $srcToken->getTermText() ),
+ $newToken = new Zend_Search_Lucene_Analysis_Token(
+ strtolower( $srcToken->getTermText() ),
$srcToken->getStartOffset(),
- $srcToken->getEndOffset(),
- $srcToken->getType());
+ $srcToken->getEndOffset());
$newToken->setPositionIncrement($srcToken->getPositionIncrement());
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_TokenFilter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter.php';
+
+
+/**
+ * Token filter that removes short words. What is short word can be configured with constructor.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_TokenFilter_ShortWords extends Zend_Search_Lucene_Analysis_TokenFilter
+{
+    /**
+     * Minimum allowed term length (compared against strlen() of the term text)
+     * @var integer
+     */
+    private $length;
+
+    /**
+     * Constructs new instance of this filter.
+     *
+     * @param integer $length minimum allowed length of term which passes this filter (default 2)
+     */
+    public function __construct($length = 2) {
+        $this->length = $length;
+    }
+
+    /**
+     * Normalize Token or remove it (if null is returned)
+     *
+     * The token is returned unchanged unless its term text is shorter than the configured minimum.
+     *
+     * @param Zend_Search_Lucene_Analysis_Token $srcToken
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
+        $isTooShort = strlen($srcToken->getTermText()) < $this->length;
+
+        return $isTooShort ? null : $srcToken;
+    }
+}
+
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_TokenFilter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Exception.php';
+
+
+/**
+ * Token filter that removes stop words. These words must be provided as array (set), example:
+ * $stopwords = array('the' => 1, 'an' => '1');
+ *
+ * We do recommend to provide all words in lowercase and concatenate this class after the lowercase filter.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_TokenFilter_StopWords extends Zend_Search_Lucene_Analysis_TokenFilter
+{
+    /**
+     * Stop words set: the words are stored as array keys, the values are not used
+     * @var array
+     */
+    private $_stopSet;
+
+    /**
+     * Constructs new instance of this filter.
+     *
+     * The words are passed as a plain list, e.g. array('the', 'an', 'a');
+     * the list is flipped internally, so that the words become the keys of the set.
+     *
+     * @param array $stopwords list of words that will be filtered out
+     */
+    public function __construct($stopwords = array()) {
+        $this->_stopSet = array_flip($stopwords);
+    }
+
+    /**
+     * Normalize Token or remove it (if null is returned)
+     *
+     * The term text is looked up as is, so the match is case sensitive.
+     *
+     * @param Zend_Search_Lucene_Analysis_Token $srcToken
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
+        if (array_key_exists($srcToken->getTermText(), $this->_stopSet)) {
+            return null;
+        }
+
+        return $srcToken;
+    }
+
+    /**
+     * Fills stopwords set from a text file. Each line contains one stopword, lines with '#' in the first
+     * column are ignored (as comments).
+     *
+     * You can call this method one or more times. New stopwords are always added to current set.
+     *
+     * @param string $filepath full path for text file with stopwords
+     * @throws Zend_Search_Exception When the file doesn't exist, cannot be opened or cannot be closed.
+     */
+    public function loadFromFile($filepath = null) {
+        if (! $filepath || ! file_exists($filepath)) {
+            throw new Zend_Search_Exception('You have to provide valid file path');
+        }
+        $fd = fopen($filepath, "r");
+        if (! $fd) {
+            throw new Zend_Search_Exception('Cannot open file ' . $filepath);
+        }
+        while (!feof ($fd)) {
+            // surrounding whitespace (including the line ending) is not a part of the word
+            $buffer = trim(fgets($fd));
+            if (strlen($buffer) > 0 && $buffer[0] != '#') {
+                $this->_stopSet[$buffer] = 1;
+            }
+        }
+        if (!fclose($fd)) {
+            throw new Zend_Search_Exception('Cannot close file ' . $filepath);
+        }
+    }
+}
+
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Field */
-require_once 'Zend/Search/Lucene/Field.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Field.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Document
*/
public function getField($fieldName)
{
- if (!array_key_exists($fieldName, $this->_fields)) {
- throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document.");
- }
+ if (!array_key_exists($fieldName, $this->_fields)) {
+ throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document.");
+ }
return $this->_fields[$fieldName];
}
*/
public function getFieldValue($fieldName)
{
- return $this->getField($fieldName)->stringValue;
+ return $this->getField($fieldName)->value;
}
+ /**
+ * Returns the string value of a named field in UTF-8 encoding.
+ *
+ * @see __get()
+ * @return string
+ */
+ public function getFieldUtf8Value($fieldName)
+ {
+ return $this->getField($fieldName)->getUtf8Value();
+ }
}
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Document
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Document */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document.php';
+
+
+/**
+ * HTML document.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Document
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
+{
+    /**
+     * List of document links
+     *
+     * @var array
+     */
+    private $_links = array();
+
+    /**
+     * List of document header links
+     *
+     * @var array
+     */
+    private $_headerLinks = array();
+
+    /**
+     * Stored DOM representation
+     *
+     * @var DOMDocument
+     */
+    private $_doc;
+
+    /**
+     * Object constructor
+     *
+     * Parses the HTML and fills the document with 'title' and 'body' fields and
+     * one field per named meta tag. Links are collected into separate lists.
+     * Use loadHTML() or loadHTMLFile() to create an instance.
+     *
+     * @param string  $data         HTML source or a file name (depending on $isFile)
+     * @param boolean $isFile       true if $data is a name of the file to load
+     * @param boolean $storeContent true to store the body text in the index, false to only index it
+     */
+    private function __construct($data, $isFile, $storeContent)
+    {
+        $this->_doc = new DOMDocument();
+        $this->_doc->substituteEntities = true;
+
+        // parser warnings are suppressed
+        if ($isFile) {
+            @$this->_doc->loadHTMLFile($data);
+        } else{
+            @$this->_doc->loadHTML($data);
+        }
+
+        $xpath = new DOMXPath($this->_doc);
+
+        $docTitle = '';
+        $titleNodes = $xpath->query('/html/head/title');
+        foreach ($titleNodes as $titleNode) {
+            // title should always have only one entry, but we process all nodeset entries
+            $docTitle .= $titleNode->nodeValue . ' ';
+        }
+        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, $this->_doc->actualEncoding));
+
+        $metaNodes = $xpath->query('/html/head/meta[@name]');
+        foreach ($metaNodes as $metaNode) {
+            $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
+                                                           $metaNode->getAttribute('content'),
+                                                           $this->_doc->actualEncoding));
+        }
+
+        $docBody = '';
+        $bodyNodes = $xpath->query('/html/body');
+        foreach ($bodyNodes as $bodyNode) {
+            // body should always have only one entry, but we process all nodeset entries
+            $this->_retrieveNodeText($bodyNode, $docBody);
+        }
+        if ($storeContent) {
+            $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, $this->_doc->actualEncoding));
+        } else {
+            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, $this->_doc->actualEncoding));
+        }
+
+        $linkNodes = $this->_doc->getElementsByTagName('a');
+        foreach ($linkNodes as $linkNode) {
+            if (($href = $linkNode->getAttribute('href')) != '') {
+                $this->_links[] = $href;
+            }
+        }
+        $this->_links = array_unique($this->_links);
+
+        $linkNodes = $xpath->query('/html/head/link');
+        foreach ($linkNodes as $linkNode) {
+            if (($href = $linkNode->getAttribute('href')) != '') {
+                $this->_headerLinks[] = $href;
+            }
+        }
+        $this->_headerLinks = array_unique($this->_headerLinks);
+    }
+
+    /**
+     * Get node text
+     *
+     * Appends the content of all text nodes found under $node to $text (each one
+     * followed by a space). Script elements are skipped; other non-element nodes
+     * (comments, CDATA sections) are not included either.
+     *
+     * @param DOMNode $node
+     * @param string &$text
+     */
+    private function _retrieveNodeText(DOMNode $node, &$text)
+    {
+        if ($node->nodeType == XML_TEXT_NODE) {
+            $text .= $node->nodeValue ;
+            $text .= ' ';
+        } else if ($node->nodeType == XML_ELEMENT_NODE  &&  $node->nodeName != 'script') {
+            foreach ($node->childNodes as $childNode) {
+                $this->_retrieveNodeText($childNode, $text);
+            }
+        }
+    }
+
+    /**
+     * Get document HREF links
+     *
+     * @return array
+     */
+    public function getLinks()
+    {
+        return $this->_links;
+    }
+
+    /**
+     * Get document header links
+     *
+     * @return array
+     */
+    public function getHeaderLinks()
+    {
+        return $this->_headerLinks;
+    }
+
+    /**
+     * Load HTML document from a string
+     *
+     * @param string  $data
+     * @param boolean $storeContent
+     * @return Zend_Search_Lucene_Document_Html
+     */
+    public static function loadHTML($data, $storeContent = false)
+    {
+        return new Zend_Search_Lucene_Document_Html($data, false, $storeContent);
+    }
+
+    /**
+     * Load HTML document from a file
+     *
+     * @param string  $file
+     * @param boolean $storeContent
+     * @return Zend_Search_Lucene_Document_Html
+     */
+    public static function loadHTMLFile($file, $storeContent = false)
+    {
+        return new Zend_Search_Lucene_Document_Html($file, true, $storeContent);
+    }
+
+
+    /**
+     * Highlight text in text node
+     *
+     * The node text is tokenized with the default analyzer; each token whose
+     * term text is a key of $wordsToHighlight is wrapped into a styled <b> element.
+     *
+     * @param DOMText $node
+     * @param array   $wordsToHighlight  set of words (the words are the array keys)
+     * @param string  $color
+     */
+    public function _highlightTextNode(DOMText $node, $wordsToHighlight, $color)
+    {
+        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
+        $analyzer->setInput($node->nodeValue, $this->_doc->encoding);
+
+        $matchedTokens = array();
+
+        while (($token = $analyzer->nextToken()) !== null) {
+            if (isset($wordsToHighlight[$token->getTermText()])) {
+                $matchedTokens[] = $token;
+            }
+        }
+
+        if (count($matchedTokens) == 0) {
+            return;
+        }
+
+        // Process the matches from the last one to the first one, so that splitting
+        // the node doesn't shift the offsets of the tokens which are not processed yet
+        $matchedTokens = array_reverse($matchedTokens);
+
+        foreach ($matchedTokens as $token) {
+            // Cut text after matched token
+            $node->splitText($token->getEndOffset());
+
+            // Cut matched node
+            $matchedWordNode = $node->splitText($token->getStartOffset());
+
+            $highlightedNode = $this->_doc->createElement('b', $matchedWordNode->nodeValue);
+            $highlightedNode->setAttribute('style', 'color:black;background-color:' . $color);
+
+            $node->parentNode->replaceChild($highlightedNode, $matchedWordNode);
+        }
+    }
+
+
+    /**
+     * highlight words in content of the specified node
+     *
+     * Walks the subtree recursively (script elements are skipped) and
+     * highlights the words in each of its text nodes.
+     *
+     * @param DOMNode $contextNode
+     * @param array   $wordsToHighlight  set of words (the words are the array keys)
+     * @param string  $color
+     */
+    public function _highlightNode(DOMNode $contextNode, $wordsToHighlight, $color)
+    {
+        $textNodes = array();
+
+        if (!$contextNode->hasChildNodes()) {
+            return;
+        }
+
+        foreach ($contextNode->childNodes as $childNode) {
+            if ($childNode->nodeType == XML_TEXT_NODE) {
+                // process node later to leave childNodes structure untouched
+                $textNodes[] = $childNode;
+            } else {
+                // Skip script nodes
+                if ($childNode->nodeName != 'script') {
+                    $this->_highlightNode($childNode, $wordsToHighlight, $color);
+                }
+            }
+        }
+
+        foreach ($textNodes as $textNode) {
+            $this->_highlightTextNode($textNode, $wordsToHighlight, $color);
+        }
+    }
+
+
+
+    /**
+     * Highlight text with specified color
+     *
+     * The words are tokenized with the default analyzer and each occurrence of
+     * the resulting terms under /html/body/* is highlighted in the stored DOM.
+     *
+     * @param string|array $words
+     * @param string $color
+     * @return string  HTML of the document (highlighted, if any words were given)
+     */
+    public function highlight($words, $color = '#66ffff')
+    {
+        if (!is_array($words)) {
+            $words = array($words);
+        }
+        $wordsToHighlight = array();
+
+        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
+        foreach ($words as $wordString) {
+            $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString));
+        }
+
+        if (count($wordsToHighlight) == 0) {
+            return $this->_doc->saveHTML();
+        }
+
+        // Turn the tokens list into a set keyed by term text for fast lookups
+        $wordsToHighlightFlipped = array();
+        foreach ($wordsToHighlight as $id => $token) {
+            $wordsToHighlightFlipped[$token->getTermText()] = $id;
+        }
+
+        $xpath = new DOMXPath($this->_doc);
+
+        $matchedNodes = $xpath->query("/html/body/*");
+        foreach ($matchedNodes as $matchedNode) {
+            $this->_highlightNode($matchedNode, $wordsToHighlightFlipped, $color);
+        }
+
+        // Return the result on this path too, as the "nothing to highlight" path does
+        return $this->_doc->saveHTML();
+    }
+
+    /**
+     * Get HTML
+     *
+     * @return string
+     */
+    public function getHTML()
+    {
+        return $this->_doc->saveHTML();
+    }
+}
+
*
* @category Zend
* @package Zend_Search_Lucene
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Framework base exception
*/
-require_once 'Zend/Search/Exception.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Exception extends Zend_Search_Exception
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_FSMAction */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/FSMAction.php';
+
+/** Zend_Search_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Exception.php';
+
+
+/**
+ * Abstract Finite State Machine
+ *
+ * See the Wikipedia description of state machines: http://en.wikipedia.org/wiki/Finite_state_machine
+ *
+ * Any type of transducer (Moore machine or Mealy machine) may also be implemented on top of this abstract FSM.
+ * The process() method invokes the registered actions, which may construct the FSM output.
+ * Actions may also be used to signal that an accept state has been reached.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+abstract class Zend_Search_Lucene_FSM
+{
+    /**
+     * Machine states alphabet
+     *
+     * [state] => state (each state is stored under its own value as key)
+     *
+     * @var array
+     */
+    private $_states = array();
+
+    /**
+     * Current state
+     *
+     * @var integer|string
+     */
+    private $_currentState = null;
+
+    /**
+     * Input alphabet
+     *
+     * [inputSymbol] => inputSymbol
+     *
+     * @var array
+     */
+    private $_inputAphabet = array();
+
+    /**
+     * State transition table
+     *
+     * [sourceState][input] => targetState
+     *
+     * @var array
+     */
+    private $_rules = array();
+
+    /**
+     * List of entry actions
+     * Each action executes when entering the state
+     *
+     * [state] => array of actions
+     *
+     * @var array
+     */
+    private $_entryActions = array();
+
+    /**
+     * List of exit actions
+     * Each action executes when exiting the state
+     *
+     * [state] => array of actions
+     *
+     * @var array
+     */
+    private $_exitActions = array();
+
+    /**
+     * List of input actions
+     * Each action executes when the given input is processed in the given state
+     *
+     * [state][input] => array of actions
+     *
+     * @var array
+     */
+    private $_inputActions = array();
+
+    /**
+     * List of transition actions
+     * Each action executes when the machine moves from state1 to state2
+     *
+     * [state1][state2] => array of actions
+     *
+     * @var array
+     */
+    private $_transitionActions = array();
+
+    /**
+     * Finite State machine constructor
+     *
+     * $states is an array of integers or strings with a list of possible machine states.
+     * The constructor treats the first list element as the start state (assigns it to $_currentState).
+     * It may be reassigned by a setState() call.
+     * The states list may be empty and can be extended later by addState() or addStates() calls.
+     *
+     * $inputAphabet is the same as $states, but represents the input alphabet;
+     * it may also be extended later by addInputSymbols() or addInputSymbol() calls.
+     *
+     * $rules parameter describes FSM transitions and has a structure:
+     * array( array(sourceState, input, targetState[, inputAction]),
+     *        array(sourceState, input, targetState[, inputAction]),
+     *        array(sourceState, input, targetState[, inputAction]),
+     *        ...
+     *      )
+     * Rules can also be added later by addRules() and addRule() calls.
+     *
+     * FSM actions are very flexible and may be defined by addEntryAction(), addExitAction(),
+     * addInputAction() and addTransitionAction() calls.
+     *
+     * @param array $states
+     * @param array $inputAphabet
+     * @param array $rules
+     * @throws Zend_Search_Exception  if a rule refers to an unknown state or input symbol
+     */
+    public function __construct($states = array(), $inputAphabet = array(), $rules = array())
+    {
+        $this->addStates($states);
+        $this->addInputSymbols($inputAphabet);
+        $this->addRules($rules);
+    }
+
+    /**
+     * Add states to the state machine
+     *
+     * @param array $states
+     */
+    public function addStates($states)
+    {
+        foreach ($states as $state) {
+            $this->addState($state);
+        }
+    }
+
+    /**
+     * Add state to the state machine
+     *
+     * The first state ever added becomes the current state.
+     *
+     * @param integer|string $state
+     */
+    public function addState($state)
+    {
+        $this->_states[$state] = $state;
+
+        if ($this->_currentState === null) {
+            $this->_currentState = $state;
+        }
+    }
+
+    /**
+     * Set FSM state.
+     * No action is invoked
+     *
+     * @param integer|string $state
+     * @throws Zend_Search_Exception  if $state is not a known state
+     */
+    public function setState($state)
+    {
+        if (!isset($this->_states[$state])) {
+            throw new Zend_Search_Exception('State \'' . $state . '\' is not on of the possible FSM states.');
+        }
+
+        $this->_currentState = $state;
+    }
+
+    /**
+     * Get FSM state.
+     *
+     * @return integer|string|null  current state, or null if no state has been added yet
+     */
+    public function getState()
+    {
+        return $this->_currentState;
+    }
+
+    /**
+     * Add symbols to the input alphabet
+     *
+     * @param array $inputAphabet
+     */
+    public function addInputSymbols($inputAphabet)
+    {
+        foreach ($inputAphabet as $inputSymbol) {
+            $this->addInputSymbol($inputSymbol);
+        }
+    }
+
+    /**
+     * Add symbol to the input alphabet
+     *
+     * @param integer|string $inputSymbol
+     */
+    public function addInputSymbol($inputSymbol)
+    {
+        $this->_inputAphabet[$inputSymbol] = $inputSymbol;
+    }
+
+
+    /**
+     * Add transition rules
+     *
+     * array structure:
+     * array( array(sourceState, input, targetState[, inputAction]),
+     *        array(sourceState, input, targetState[, inputAction]),
+     *        array(sourceState, input, targetState[, inputAction]),
+     *        ...
+     *      )
+     *
+     * @param array $rules
+     * @throws Zend_Search_Exception  see addRule()
+     */
+    public function addRules($rules)
+    {
+        foreach ($rules as $rule) {
+            $this->addRule($rule[0], $rule[1], $rule[2], isset($rule[3]) ? $rule[3] : null);
+        }
+    }
+
+    /**
+     * Add a transition rule
+     *
+     * Only one rule may exist per {source state, input} pair.
+     *
+     * @param integer|string $sourceState
+     * @param integer|string $input
+     * @param integer|string $targetState
+     * @param Zend_Search_Lucene_FSMAction|null $inputAction
+     * @throws Zend_Search_Exception  if a state or the input symbol is unknown,
+     *                                or the pair already has a rule
+     */
+    public function addRule($sourceState, $input, $targetState, $inputAction = null)
+    {
+        if (!isset($this->_states[$sourceState])) {
+            throw new Zend_Search_Exception('Undefined source state (' . $sourceState . ').');
+        }
+        if (!isset($this->_states[$targetState])) {
+            throw new Zend_Search_Exception('Undefined target state (' . $targetState . ').');
+        }
+        if (!isset($this->_inputAphabet[$input])) {
+            throw new Zend_Search_Exception('Undefined input symbol (' . $input . ').');
+        }
+
+        if (!isset($this->_rules[$sourceState])) {
+            $this->_rules[$sourceState] = array();
+        }
+        if (isset($this->_rules[$sourceState][$input])) {
+            throw new Zend_Search_Exception('Rule for {state,input} pair (' . $sourceState . ', '. $input . ') is already defined.');
+        }
+
+        $this->_rules[$sourceState][$input] = $targetState;
+
+
+        if ($inputAction !== null) {
+            $this->addInputAction($sourceState, $input, $inputAction);
+        }
+    }
+
+
+    /**
+     * Add state entry action.
+     * Several entry actions are allowed.
+     * Action execution order is defined by addEntryAction() calls
+     *
+     * @param integer|string $state
+     * @param Zend_Search_Lucene_FSMAction $action
+     * @throws Zend_Search_Exception  if $state is not a known state
+     */
+    public function addEntryAction($state, Zend_Search_Lucene_FSMAction $action)
+    {
+        if (!isset($this->_states[$state])) {
+            throw new Zend_Search_Exception('Undefined state (' . $state. ').');
+        }
+
+        if (!isset($this->_entryActions[$state])) {
+            $this->_entryActions[$state] = array();
+        }
+
+        $this->_entryActions[$state][] = $action;
+    }
+
+    /**
+     * Add state exit action.
+     * Several exit actions are allowed.
+     * Action execution order is defined by addExitAction() calls
+     *
+     * @param integer|string $state
+     * @param Zend_Search_Lucene_FSMAction $action
+     * @throws Zend_Search_Exception  if $state is not a known state
+     */
+    public function addExitAction($state, Zend_Search_Lucene_FSMAction $action)
+    {
+        if (!isset($this->_states[$state])) {
+            throw new Zend_Search_Exception('Undefined state (' . $state. ').');
+        }
+
+        if (!isset($this->_exitActions[$state])) {
+            $this->_exitActions[$state] = array();
+        }
+
+        $this->_exitActions[$state][] = $action;
+    }
+
+    /**
+     * Add input action (defined by {state, input} pair).
+     * Several input actions are allowed.
+     * Action execution order is defined by addInputAction() calls
+     *
+     * @param integer|string $state
+     * @param integer|string $inputSymbol
+     * @param Zend_Search_Lucene_FSMAction $action
+     * @throws Zend_Search_Exception  if the state or the input symbol is unknown
+     */
+    public function addInputAction($state, $inputSymbol, Zend_Search_Lucene_FSMAction $action)
+    {
+        if (!isset($this->_states[$state])) {
+            throw new Zend_Search_Exception('Undefined state (' . $state. ').');
+        }
+        if (!isset($this->_inputAphabet[$inputSymbol])) {
+            throw new Zend_Search_Exception('Undefined input symbol (' . $inputSymbol. ').');
+        }
+
+        if (!isset($this->_inputActions[$state])) {
+            $this->_inputActions[$state] = array();
+        }
+        if (!isset($this->_inputActions[$state][$inputSymbol])) {
+            $this->_inputActions[$state][$inputSymbol] = array();
+        }
+
+        $this->_inputActions[$state][$inputSymbol][] = $action;
+    }
+
+    /**
+     * Add transition action (defined by {source state, target state} pair).
+     * Several transition actions are allowed.
+     * Action execution order is defined by addTransitionAction() calls
+     *
+     * @param integer|string $sourceState
+     * @param integer|string $targetState
+     * @param Zend_Search_Lucene_FSMAction $action
+     * @throws Zend_Search_Exception  if either state is unknown
+     */
+    public function addTransitionAction($sourceState, $targetState, Zend_Search_Lucene_FSMAction $action)
+    {
+        if (!isset($this->_states[$sourceState])) {
+            throw new Zend_Search_Exception('Undefined source state (' . $sourceState. ').');
+        }
+        if (!isset($this->_states[$targetState])) {
+            // Report the target state as such (the message used to say "source")
+            throw new Zend_Search_Exception('Undefined target state (' . $targetState. ').');
+        }
+
+        if (!isset($this->_transitionActions[$sourceState])) {
+            $this->_transitionActions[$sourceState] = array();
+        }
+        if (!isset($this->_transitionActions[$sourceState][$targetState])) {
+            $this->_transitionActions[$sourceState][$targetState] = array();
+        }
+
+        $this->_transitionActions[$sourceState][$targetState][] = $action;
+    }
+
+
+    /**
+     * Process an input
+     *
+     * Looks up the rule for {current state, input} and performs the transition.
+     * Actions run in this order: exit actions of the source state, input actions,
+     * (the current state is switched), transition actions, entry actions of the
+     * target state. Exit and entry actions are skipped when the state does not change.
+     *
+     * @param mixed $input
+     * @throws Zend_Search_Exception  if no rule exists for the current state or for the pair
+     */
+    public function process($input)
+    {
+        if (!isset($this->_rules[$this->_currentState])) {
+            throw new Zend_Search_Exception('There is no any rule for current state (' . $this->_currentState . ').');
+        }
+        if (!isset($this->_rules[$this->_currentState][$input])) {
+            throw new Zend_Search_Exception('There is no any rule for {current state, input} pair (' . $this->_currentState . ', ' . $input . ').');
+        }
+
+        $sourceState = $this->_currentState;
+        $targetState = $this->_rules[$this->_currentState][$input];
+
+        if ($sourceState != $targetState  &&  isset($this->_exitActions[$sourceState])) {
+            foreach ($this->_exitActions[$sourceState] as $action) {
+                $action->doAction();
+            }
+        }
+        if (isset($this->_inputActions[$sourceState]) &&
+            isset($this->_inputActions[$sourceState][$input])) {
+            foreach ($this->_inputActions[$sourceState][$input] as $action) {
+                $action->doAction();
+            }
+        }
+
+
+        $this->_currentState = $targetState;
+
+        if (isset($this->_transitionActions[$sourceState]) &&
+            isset($this->_transitionActions[$sourceState][$targetState])) {
+            foreach ($this->_transitionActions[$sourceState][$targetState] as $action) {
+                $action->doAction();
+            }
+        }
+        if ($sourceState != $targetState  &&  isset($this->_entryActions[$targetState])) {
+            foreach ($this->_entryActions[$targetState] as $action) {
+                $action->doAction();
+            }
+        }
+    }
+
+    /**
+     * Reset the FSM to its start state (the first state that was added).
+     * No action is invoked
+     *
+     * @throws Zend_Search_Exception  if no state has been defined
+     */
+    public function reset()
+    {
+        if (count($this->_states) == 0) {
+            throw new Zend_Search_Exception('There is no any state defined for FSM.');
+        }
+
+        // $_states is keyed by the state values themselves, so index 0 exists only
+        // when a state equal to 0 was added; pick the first registered state instead.
+        $states = array_values($this->_states);
+        $this->_currentState = $states[0];
+    }
+}
+
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/**
+ * Finite State Machine action
+ *
+ * Pairs an object with the name of one of its methods; doAction() calls
+ * that method without arguments.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_FSMAction
+{
+    /**
+     * Object the action method is called on
+     *
+     * @var object
+     */
+    private $_object;
+
+    /**
+     * Name of the method to call
+     *
+     * @var string
+     */
+    private $_method;
+
+    /**
+     * Object constructor
+     *
+     * @param object $object  Receiver of the call
+     * @param string $method  Name of the method to invoke on $object
+     */
+    public function __construct($object, $method)
+    {
+        $this->_method = $method;
+        $this->_object = $object;
+    }
+
+    /**
+     * Invoke the configured method on the configured object (no arguments are passed)
+     */
+    public function doAction()
+    {
+        $this->_object->{$this->_method}();
+    }
+}
+
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Field
{
- public $kind;
+ /**
+ * Field name
+ *
+ * @var string
+ */
+ public $name;
- public $name = 'body';
- public $stringValue = null;
+
+ public $value;
public $isStored = false;
public $isIndexed = true;
public $isTokenized = true;
public $storeTermVector = false;
+ /**
+ * Field boos factor
+ * It's not stored directly in the index, but affects on normalizetion factor
+ *
+ * @var float
+ */
public $boost = 1.0;
- public function __construct($name, $stringValue, $isStored, $isIndexed, $isTokenized, $isBinary = false)
+ /**
+ * Field value encoding.
+ *
+ * @var string
+ */
+ public $encoding;
+
+ /**
+ * Object constructor
+ *
+ * @param string $name
+ * @param string $value
+ * @param string $encoding
+ * @param boolean $isStored
+ * @param boolean $isIndexed
+ * @param boolean $isTokenized
+ * @param boolean $isBinary
+ */
+ public function __construct($name, $value, $encoding, $isStored, $isIndexed, $isTokenized, $isBinary = false)
{
- $this->name = $name;
+ $this->name = $name;
+ $this->value = $value;
if (!$isBinary) {
- /**
- * @todo Correct UTF-8 string should be required in future
- * Until full UTF-8 support is not completed, string should be normalized to ANSII encoding
- */
- $this->stringValue = iconv(mb_detect_encoding($stringValue), 'ASCII//TRANSLIT', $stringValue);
- //$this->stringValue = iconv('', 'ASCII//TRANSLIT', $stringValue);
+ $this->encoding = $encoding;
+ $this->isTokenized = $isTokenized;
} else {
- $this->stringValue = $stringValue;
+ $this->encoding = '';
+ $this->isTokenized = false;
}
- $this->isStored = $isStored;
- $this->isIndexed = $isIndexed;
- $this->isTokenized = $isTokenized;
- $this->isBinary = $isBinary;
+
+ $this->isStored = $isStored;
+ $this->isIndexed = $isIndexed;
+ $this->isBinary = $isBinary;
$this->storeTermVector = false;
$this->boost = 1.0;
*
* @param string $name
* @param string $value
+ * @param string $encoding
* @return Zend_Search_Lucene_Field
*/
- static public function Keyword($name, $value)
+ public static function Keyword($name, $value, $encoding = '')
{
- return new self($name, $value, true, true, false);
+ return new self($name, $value, $encoding, true, true, false);
}
*
* @param string $name
* @param string $value
+ * @param string $encoding
* @return Zend_Search_Lucene_Field
*/
- static public function UnIndexed($name, $value)
+ public static function UnIndexed($name, $value, $encoding = '')
{
- return new self($name, $value, true, false, false);
+ return new self($name, $value, $encoding, true, false, false);
}
*
* @param string $name
* @param string $value
+ * @param string $encoding
* @return Zend_Search_Lucene_Field
*/
- static public function Binary($name, $value)
+ public static function Binary($name, $value)
{
- return new self($name, $value, true, false, false, true);
+ return new self($name, $value, '', true, false, false, true);
}
/**
*
* @param string $name
* @param string $value
+ * @param string $encoding
* @return Zend_Search_Lucene_Field
*/
- static public function Text($name, $value)
+ public static function Text($name, $value, $encoding = '')
{
- return new self($name, $value, true, true, true);
+ return new self($name, $value, $encoding, true, true, true);
}
*
* @param string $name
* @param string $value
+ * @param string $encoding
* @return Zend_Search_Lucene_Field
*/
- static public function UnStored($name, $value)
+ public static function UnStored($name, $value, $encoding = '')
{
- return new self($name, $value, false, true, true);
+ return new self($name, $value, $encoding, false, true, true);
}
+ /**
+  * Get field value in UTF-8 encoding
+  *
+  * The stored value is returned untouched when the field encoding is
+  * 'utf8' or 'utf-8' (case-insensitive); otherwise it is passed through
+  * iconv() from the field encoding to UTF-8.
+  *
+  * @return string
+  */
+ public function getUtf8Value()
+ {
+     $alreadyUtf8 = strcasecmp($this->encoding, 'utf8' ) == 0
+                 || strcasecmp($this->encoding, 'utf-8') == 0;
+
+     if ($alreadyUtf8) {
+         return $this->value;
+     }
+
+     return iconv($this->encoding, 'UTF-8', $this->value);
+ }
}
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
+
+
+/**
+ * Dictionary loader
+ *
+ * It's a dummy class which is created to encapsulate not well structured code.
+ * Manual "method inlining" is performed to speed up the dictionary index loading operation,
+ * which is a major bottleneck for search performance.
+ *
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Index_DictionaryLoader
+{
+    /**
+     * Dictionary index loader.
+     *
+     * It takes a string which is actually <segment_name>.tii index file data and
+     * returns two arrays - term and termInfo lists.
+     *
+     * Returned structure:
+     *   [0] => list of array($termFieldNum, $termValue)
+     *   [1] => list of array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer)
+     * Both lists have one entry per index term, in file order. The field number of the
+     * first entry (special index entry mark) is always reported as -1.
+     *
+     * See Zend_Search_Lucene_Index_SegmentInfo class for details
+     *
+     * @param string $data
+     * @return array
+     * @throws Zend_Search_Lucene_Exception  on a wrong format marker, a term count below 1,
+     *                                       or a term count too large for 32-bit mode
+     */
+    public static function load($data)
+    {
+        $termDictionary = array();
+        $termInfos      = array();
+        $pos = 0;
+
+        // $tiVersion = $tiiFile->readInt();
+        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
+        $pos += 4;
+        if ($tiVersion != (int)0xFFFFFFFE) {
+            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
+        }
+
+        // $indexTermCount = $tiiFile->readLong();
+        if (PHP_INT_SIZE > 4) {
+            $indexTermCount = ord($data[$pos]) << 56  |
+                              ord($data[$pos+1]) << 48  |
+                              ord($data[$pos+2]) << 40  |
+                              ord($data[$pos+3]) << 32  |
+                              ord($data[$pos+4]) << 24  |
+                              ord($data[$pos+5]) << 16  |
+                              ord($data[$pos+6]) << 8   |
+                              ord($data[$pos+7]);
+        } else {
+            // 32-bit integers: the upper four bytes and the sign bit must be clear
+            if ((ord($data[$pos])            != 0) ||
+                (ord($data[$pos+1])          != 0) ||
+                (ord($data[$pos+2])          != 0) ||
+                (ord($data[$pos+3])          != 0) ||
+                ((ord($data[$pos+4]) & 0x80) != 0)) {
+                     throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
+                 }
+
+            $indexTermCount = ord($data[$pos+4]) << 24  |
+                              ord($data[$pos+5]) << 16  |
+                              ord($data[$pos+6]) << 8   |
+                              ord($data[$pos+7]);
+        }
+        $pos += 8;
+
+        //                  $tiiFile->readInt();  // IndexInterval
+        $pos += 4;
+
+        // $skipInterval   = $tiiFile->readInt();
+        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
+        $pos += 4;
+        if ($indexTermCount < 1) {
+            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
+        }
+
+        // Pointers are stored as deltas against the previous entry and accumulated here
+        $prevTerm     = '';
+        $freqPointer  =  0;
+        $proxPointer  =  0;
+        $indexPointer =  0;
+        for ($count = 0; $count < $indexTermCount; $count++) {
+            // Every VInt below is decoded inline: 7 payload bits per byte,
+            // low-order group first, high bit set while more bytes follow.
+
+            //$termPrefixLength = $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $termPrefixLength = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
+            }
+
+            // $termSuffix       = $tiiFile->readString();
+            $nbyte = ord($data[$pos++]);
+            $len = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $len |= ($nbyte & 0x7F) << $shift;
+            }
+            if ($len == 0) {
+                $termSuffix = '';
+            } else {
+                // $len starts out as the stored length; each multi-byte lead byte found
+                // below pulls in 1 or 2 additional bytes and extends $len accordingly
+                $termSuffix = substr($data, $pos, $len);
+                $pos += $len;
+                for ($count1 = 0; $count1 < $len; $count1++ ) {
+                    if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
+                        $addBytes = 1;
+                        if (ord($termSuffix[$count1]) & 0x20 ) {
+                            $addBytes++;
+                        }
+                        $termSuffix .= substr($data, $pos, $addBytes);
+                        $pos += $addBytes;
+                        $len += $addBytes;
+
+                        // Check for null character. Java2 encodes null character
+                        // in two bytes.
+                        // NOTE(review): assigning integer 0 to a string offset stores the
+                        // character '0', not a NUL byte - confirm intent (and any matching
+                        // reader elsewhere) before changing it.
+                        if (ord($termSuffix[$count1]) == 0xC0 &&
+                            ord($termSuffix[$count1+1]) == 0x80   ) {
+                            $termSuffix[$count1] = 0;
+                            $termSuffix = substr($termSuffix,0,$count1+1)
+                                        . substr($termSuffix,$count1+2);
+                        }
+                        $count1 += $addBytes;
+                    }
+                }
+            }
+
+            // $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
+            // $pb counts bytes and $pc counts characters of $prevTerm consumed so far
+            $pb = 0; $pc = 0;
+            while ($pb < strlen($prevTerm)  &&  $pc < $termPrefixLength) {
+                $charBytes = 1;
+                if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
+                    $charBytes++;
+                    if (ord($prevTerm[$pb]) & 0x20 ) {
+                        $charBytes++;
+                        if (ord($prevTerm[$pb]) & 0x10 ) {
+                            $charBytes++;
+                        }
+                    }
+                }
+
+                // The scan runs over $prevTerm, so a character is "wrong" when it would
+                // run past the end of $prevTerm (this used to test against $data, which
+                // made the guard ineffective)
+                if ($pb + $charBytes > strlen($prevTerm)) {
+                    // wrong character
+                    break;
+                }
+
+                $pc++;
+                $pb += $charBytes;
+            }
+            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;
+
+            // $termFieldNum     = $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $termFieldNum = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $termFieldNum |= ($nbyte & 0x7F) << $shift;
+            }
+
+            // $docFreq          = $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $docFreq = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $docFreq |= ($nbyte & 0x7F) << $shift;
+            }
+
+            // $freqPointer     += $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $vint = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $vint |= ($nbyte & 0x7F) << $shift;
+            }
+            $freqPointer += $vint;
+
+            // $proxPointer     += $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $vint = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $vint |= ($nbyte & 0x7F) << $shift;
+            }
+            $proxPointer += $vint;
+
+            // The skip delta is only present in the data when docFreq reaches the skip interval
+            if( $docFreq >= $skipInterval ) {
+                // $skipDelta = $tiiFile->readVInt();
+                $nbyte = ord($data[$pos++]);
+                $vint = $nbyte & 0x7F;
+                for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                    $nbyte = ord($data[$pos++]);
+                    $vint |= ($nbyte & 0x7F) << $shift;
+                }
+                $skipDelta = $vint;
+            } else {
+                $skipDelta = 0;
+            }
+
+            // $indexPointer += $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $vint = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $vint |= ($nbyte & 0x7F) << $shift;
+            }
+            $indexPointer += $vint;
+
+
+            // $this->_termDictionary[] =  new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
+            $termDictionary[] = array($termFieldNum, $termValue);
+
+            $termInfos[] =
+                 // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
+                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
+
+            $prevTerm = $termValue;
+        }
+
+        // Check special index entry mark
+        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
+            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
+        } else if (PHP_INT_SIZE > 4){
+            // Treat 64-bit 0xFFFFFFFF as -1
+            $termDictionary[0][0] = -1;
+        }
+
+        return array(&$termDictionary, &$termInfos);
+    }
+}
+
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_FieldInfo
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
+/** Zend_Search_Lucene_Index_DictionaryLoader */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/DictionaryLoader.php';
+
/** Zend_Search_Lucene_Exception */
-require_once 'Zend/Search/Lucene/Exception.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_SegmentInfo
/**
* Term Dictionary Index
- * Array of the Zend_Search_Lucene_Index_Term objects
+ *
+ * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
+ * of performance considerations)
+ * [0] -> $termFieldNum
+ * [1] -> $termValue
+ *
* Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
*
* @var array
/**
* Term Dictionary Index TermInfos
- * Array of the Zend_Search_Lucene_Index_TermInfo objects
+ *
+ * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
+ * of performance considerations)
+ * [0] -> $docFreq
+ * [1] -> $freqPointer
+ * [2] -> $proxPointer
+ * [3] -> $skipOffset
+ * [4] -> $indexPointer
*
* @var array
*/
*/
private $_segFiles;
+ /**
+ * Associative array where the key is the file name and the value is file size (.csf).
+ *
+ * @var array
+ */
+ private $_segFileSizes;
+
+
/**
* File system adapter.
*
*/
private $_deletedDirty = false;
+
/**
* Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname,
* Documents count and Directory as a parameter.
for ($count = 0; $count < $segFilesCount; $count++) {
$dataOffset = $cfsFile->readLong();
+ if ($count != 0) {
+ $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles);
+ }
$fileName = $cfsFile->readString();
$this->_segFiles[$fileName] = $dataOffset;
}
+ if ($count != 0) {
+ $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset;
+ }
}
$fnmFile = $this->openCompoundFile('.fnm');
}
}
}
-
}
} catch(Zend_Search_Exception $e) {
if (strpos($e->getMessage(), 'compound file doesn\'t contain') !== false ) {
* Opens index file stoted within compound index file
*
* @param string $extension
+ * @param boolean $shareHandler
* @throws Zend_Search_Lucene_Exception
* @return Zend_Search_Lucene_Storage_File
*/
- public function openCompoundFile($extension)
+ public function openCompoundFile($extension, $shareHandler = true)
{
$filename = $this->_name . $extension;
// Try to open common file first
if ($this->_directory->fileExists($filename)) {
- return $this->_directory->getFileObject($filename);
+ return $this->_directory->getFileObject($filename, $shareHandler);
}
if( !isset($this->_segFiles[$filename]) ) {
. $filename . ' file.' );
}
- $file = $this->_directory->getFileObject( $this->_name.".cfs" );
+ $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
$file->seek($this->_segFiles[$filename]);
return $file;
}
+ /**
+  * Get compound file length
+  *
+  * A file of that name existing directly in the directory is measured first;
+  * otherwise the size recorded in $_segFileSizes is returned.
+  *
+  * @param string $extension
+  * @return integer
+  * @throws Zend_Search_Lucene_Exception  if neither source knows the file
+  */
+ public function compoundFileLength($extension)
+ {
+     $filename = $this->_name . $extension;
+
+     // Try to get common file first
+     if ($this->_directory->fileExists($filename)) {
+         return $this->_directory->fileLength($filename);
+     }
+
+     if (isset($this->_segFileSizes[$filename])) {
+         return $this->_segFileSizes[$filename];
+     }
+
+     throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
+                                          . $filename . ' file.' );
+ }
+
/**
* Returns field index or -1 if field is not found
*
* Returns field info for specified field
*
* @param integer $fieldNum
- * @return ZSearchFieldInfo
+ * @return Zend_Search_Lucene_Index_FieldInfo
*/
public function getField($fieldNum)
{
}
/**
- * Returns the total number of documents in this segment.
+ * Returns array of FieldInfo objects.
*
- * @return integer
+ * @return array
*/
- public function count()
+ public function getFieldInfos()
{
- return $this->_docCount;
+ return $this->_fields;
}
/**
- * Get field position in a fields dictionary
+ * Returns the total number of documents in this segment (including deleted documents).
*
- * @param integer $fieldNum
* @return integer
*/
- private function _getFieldPosition($fieldNum) {
- // Treat values which are not in a translation table as a 'direct value'
- return isset($this->_fieldsDicPositions[$fieldNum]) ?
- $this->_fieldsDicPositions[$fieldNum] : $fieldNum;
+ public function count()
+ {
+ return $this->_docCount;
}
/**
- * Loads Term dictionary from TermInfoIndex file
+ * Returns number of deleted documents.
+ *
+ * @return integer
*/
- protected function _loadDictionary()
+ private function _deletedCount()
{
- if ($this->_termDictionary !== null) {
- return;
+ if ($this->_deleted === null) {
+ return 0;
}
- $this->_termDictionary = array();
- $this->_termDictionaryInfos = array();
-
- $tiiFile = $this->openCompoundFile('.tii');
- $tiVersion = $tiiFile->readInt();
- if ($tiVersion != (int)0xFFFFFFFE) {
- throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
- }
-
- $indexTermCount = $tiiFile->readLong();
- $tiiFile->readInt(); // IndexInterval
- $skipInterval = $tiiFile->readInt();
-
- $prevTerm = '';
- $freqPointer = 0;
- $proxPointer = 0;
- $indexPointer = 0;
- for ($count = 0; $count < $indexTermCount; $count++) {
- $termPrefixLength = $tiiFile->readVInt();
- $termSuffix = $tiiFile->readString();
- $termValue = substr( $prevTerm, 0, $termPrefixLength ) . $termSuffix;
-
- $termFieldNum = $tiiFile->readVInt();
- $docFreq = $tiiFile->readVInt();
- $freqPointer += $tiiFile->readVInt();
- $proxPointer += $tiiFile->readVInt();
- if( $docFreq >= $skipInterval ) {
- $skipDelta = $tiiFile->readVInt();
- } else {
- $skipDelta = 0;
- }
-
- $indexPointer += $tiiFile->readVInt();
+ if (extension_loaded('bitset')) {
+ return count(bitset_to_array($this->_deleted));
+ } else {
+ return count($this->_deleted);
+ }
+ }
- $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue,$termFieldNum);
- $this->_termDictionaryInfos[] =
- new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
- $prevTerm = $termValue;
+ /**
+ * Returns the total number of non-deleted documents in this segment.
+ *
+ * @return integer
+ */
+ public function numDocs()
+ {
+ if ($this->hasDeletions()) {
+ return $this->_docCount - $this->_deletedCount();
+ } else {
+ return $this->_docCount;
}
}
+ /**
+  * Translates a field number into its position in the fields dictionary.
+  *
+  * @param integer $fieldNum
+  * @return integer  The mapped position, or $fieldNum itself when no mapping exists
+  */
+ private function _getFieldPosition($fieldNum)
+ {
+     if (isset($this->_fieldsDicPositions[$fieldNum])) {
+         return $this->_fieldsDicPositions[$fieldNum];
+     }
+
+     // Values missing from the translation table are treated as a 'direct value'
+     return $fieldNum;
+ }
/**
* Return segment name
}
+ /**
+ * TermInfo cache
+ *
+ * Size is 1024.
+ * Numbers are used instead of class constants because of performance considerations
+ *
+ * @var array
+ */
+ private $_termInfoCache = array();
+
+ /**
+  * Evicts the oldest TermInfo cache entries so that at most 768 remain.
+  *
+  * Entries are removed in array order, front first. getTermInfo() re-appends
+  * an entry on every cache hit, so the front of the array holds the least
+  * recently used term infos.
+  *
+  * When the cache already holds 768 entries or fewer nothing is removed
+  * (an exact "count == 768" stop condition would otherwise never match and
+  * the whole cache would be wiped).
+  */
+ private function _cleanUpTermInfoCache()
+ {
+     // Number of entries above the 768 that are kept; a literal is used instead
+     // of a class constant for performance, as with the cache size itself
+     $excess = count($this->_termInfoCache) - 768;
+
+     foreach ($this->_termInfoCache as $key => $termInfo) {
+         if ($excess <= 0) {
+             break;
+         }
+
+         unset($this->_termInfoCache[$key]);
+         $excess--;
+     }
+ }
+
/**
* Scans terms dictionary and returns term info
*
* @param Zend_Search_Lucene_Index_Term $term
* @return Zend_Search_Lucene_Index_TermInfo
*/
- public function getTermInfo($term)
+ public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
{
- $this->_loadDictionary();
+ $termKey = $term->key();
+ if (isset($this->_termInfoCache[$termKey])) {
+ $termInfo = $this->_termInfoCache[$termKey];
+
+ // Move termInfo to the end of cache
+ unset($this->_termInfoCache[$termKey]);
+ $this->_termInfoCache[$termKey] = $termInfo;
+
+ return $termInfo;
+ }
+
+
+ if ($this->_termDictionary === null) {
+ // Check, if index is already serialized
+ if ($this->_directory->fileExists($this->_name . '.sti')) {
+ // Prefetch dictionary index data
+ $stiFile = $this->_directory->getFileObject($this->_name . '.sti');
+ $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti'));
+
+ // Load dictionary index data
+ list($this->_termDictionary, $this->_termDictionaryInfos) = unserialize($stiFileData);
+ } else {
+ // Prefetch dictionary index data
+ $tiiFile = $this->openCompoundFile('.tii');
+ $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));
+
+ // Load dictionary index data
+ list($this->_termDictionary, $this->_termDictionaryInfos) =
+ Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);
+
+ $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
+ $stiFile = $this->_directory->createFile($this->_name . '.sti');
+ $stiFile->writeBytes($stiFileData);
+ }
+
+ }
+
+
$searchField = $this->getFieldNum($term->field);
$mid = ($highIndex + $lowIndex) >> 1;
$midTerm = $this->_termDictionary[$mid];
- $fieldNum = $this->_getFieldPosition($midTerm->field);
+ $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
$delta = $searchDicField - $fieldNum;
if ($delta == 0) {
- $delta = strcmp($term->text, $midTerm->text);
+ $delta = strcmp($term->text, $midTerm[1] /* text */);
}
if ($delta < 0) {
} elseif ($delta > 0) {
$lowIndex = $mid+1;
} else {
- return $this->_termDictionaryInfos[$mid]; // We got it!
+ // return $this->_termDictionaryInfos[$mid]; // We got it!
+ $a = $this->_termDictionaryInfos[$mid];
+ $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);
+
+ // Put loaded termInfo into cache
+ $this->_termInfoCache[$termKey] = $termInfo;
+
+ return $termInfo;
}
}
$prevPosition = $highIndex;
$prevTerm = $this->_termDictionary[$prevPosition];
- $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ];
+ $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];
$tisFile = $this->openCompoundFile('.tis');
$tiVersion = $tisFile->readInt();
$indexInterval = $tisFile->readInt();
$skipInterval = $tisFile->readInt();
- $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR);
+ $tisFile->seek($prevTermInfo[4] /* indexPointer */ - 20 /* header size*/, SEEK_CUR);
- $termValue = $prevTerm->text;
- $termFieldNum = $prevTerm->field;
- $freqPointer = $prevTermInfo->freqPointer;
- $proxPointer = $prevTermInfo->proxPointer;
+ $termValue = $prevTerm[1] /* text */;
+ $termFieldNum = $prevTerm[0] /* field */;
+ $freqPointer = $prevTermInfo[1] /* freqPointer */;
+ $proxPointer = $prevTermInfo[2] /* proxPointer */;
for ($count = $prevPosition*$indexInterval + 1;
$count <= $termCount &&
( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
$termPrefixLength = $tisFile->readVInt();
$termSuffix = $tisFile->readString();
$termFieldNum = $tisFile->readVInt();
- $termValue = substr( $termValue, 0, $termPrefixLength ) . $termSuffix;
+ $termValue = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;
$docFreq = $tisFile->readVInt();
$freqPointer += $tisFile->readVInt();
}
if ($termFieldNum == $searchField && $termValue == $term->text) {
- return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
+ $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
} else {
- return null;
+ $termInfo = null;
+ }
+
+ // Put loaded termInfo into cache
+ $this->_termInfoCache[$termKey] = $termInfo;
+
+ if (count($this->_termInfoCache) == 1024) {
+ $this->_cleanUpTermInfoCache();
+ }
+
+ return $termInfo;
+ }
+
+ /**
+  * Returns the frequencies of a term in the documents of this segment.
+  * Result array structure: array(docId => freq, ...)
+  *
+  * @param Zend_Search_Lucene_Index_Term $term
+  * @param integer $shift  Offset added to every document id used as a result key
+  * @return array  Empty if the term is not present in the segment
+  */
+ public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0)
+ {
+     $termInfo = $this->getTermInfo($term);
+
+     // getTermInfo() gives null for a term which is not in the dictionary
+     if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
+         return array();
+     }
+
+     $frqFile = $this->openCompoundFile('.frq');
+     $frqFile->seek($termInfo->freqPointer, SEEK_CUR);
+
+     $freqs     = array();
+     $currentId = 0;
+
+     for ($remaining = $termInfo->docFreq; $remaining > 0; $remaining--) {
+         $docDelta   = $frqFile->readVInt();
+         $singleFreq = ($docDelta % 2 == 1);
+
+         // An odd delta means the frequency is 1 and is not stored separately;
+         // the document id gap is the delta without its lowest bit, halved
+         $currentId += ($singleFreq ? $docDelta - 1 : $docDelta) / 2;
+
+         $freqs[$shift + $currentId] = $singleFreq ? 1 : $frqFile->readVInt();
+     }
+
+     return $freqs;
+ }
+
+ /**
+  * Returns the positions of a term in the documents of this segment.
+  * Result array structure: array(docId => array(pos1, pos2, ...), ...)
+  *
+  * @param Zend_Search_Lucene_Index_Term $term
+  * @param integer $shift  Offset added to every document id used as a result key
+  * @return array  Empty if the term is not present in the segment
+  */
+ public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0)
+ {
+     $termInfo = $this->getTermInfo($term);
+
+     // getTermInfo() gives null for a term which is not in the dictionary
+     if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
+         return array();
+     }
+
+     // Pass 1: decode per-document frequencies from the .frq data
+     $frqFile = $this->openCompoundFile('.frq');
+     $frqFile->seek($termInfo->freqPointer, SEEK_CUR);
+
+     $docFreqs  = array();
+     $currentId = 0;
+
+     for ($remaining = $termInfo->docFreq; $remaining > 0; $remaining--) {
+         $docDelta   = $frqFile->readVInt();
+         $singleFreq = ($docDelta % 2 == 1);
+
+         // An odd delta means the frequency is 1 and is not stored separately
+         $currentId += ($singleFreq ? $docDelta - 1 : $docDelta) / 2;
+
+         $docFreqs[$currentId] = $singleFreq ? 1 : $frqFile->readVInt();
+     }
+
+     // Pass 2: read the positions. The .prx file is opened only after the .frq
+     // data is fully consumed - NOTE(review): with the default shared handler
+     // both may be served by the same underlying file object; confirm before
+     // reordering these reads.
+     $prxFile = $this->openCompoundFile('.prx');
+     $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
+
+     $result = array();
+     foreach ($docFreqs as $docId => $freq) {
+         $positions    = array();
+         $lastPosition = 0;
+
+         // Positions are stored as increments from the previous position
+         for ($read = 0; $read < $freq; $read++) {
+             $lastPosition += $prxFile->readVInt();
+             $positions[]   = $lastPosition;
+         }
+
+         $result[$shift + $docId] = $positions;
+     }
+
+     return $result;
+ }
+
+ /**
+ * Load normalization factors from an index file
+ *
+ * @param integer $fieldNum
+ */
+ private function _loadNorm($fieldNum)
+ {
+ $fFile = $this->openCompoundFile('.f' . $fieldNum);
+ $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
}
/**
*
* @param integer $id
* @param string $fieldName
- * @return string
+ * @return float
*/
public function norm($id, $fieldName)
{
return null;
}
- if ( !isset( $this->_norms[$fieldNum] )) {
- $fFile = $this->openCompoundFile('.f' . $fieldNum);
- $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
+ if (!isset($this->_norms[$fieldNum])) {
+ $this->_loadNorm($fieldNum);
}
return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum]{$id}) );
}
+ /**
+  * Returns the norm vector of a field, encoded in a byte string.
+  *
+  * For an indexed field the vector is loaded on first use and kept in $_norms.
+  * For an unknown or non-indexed field a vector of $_docCount identical bytes is
+  * built from the default similarity's lengthNorm() for a field of length 0.
+  *
+  * @param string $fieldName
+  * @return string
+  */
+ public function normVector($fieldName)
+ {
+     $fieldNum = $this->getFieldNum($fieldName);
+
+     $hasStoredNorms = ($fieldNum != -1) && $this->_fields[$fieldNum]->isIndexed;
+
+     if ($hasStoredNorms) {
+         if (!isset($this->_norms[$fieldNum])) {
+             $this->_loadNorm($fieldNum);
+         }
+
+         return $this->_norms[$fieldNum];
+     }
+
+     // No norms on disk for this field: synthesize a uniform vector
+     $similarity  = Zend_Search_Lucene_Search_Similarity::getDefault();
+     $defaultNorm = $similarity->lengthNorm($fieldName, 0);
+     $normByte    = chr($similarity->encodeNorm($defaultNorm));
+
+     return str_repeat($normByte, $this->_docCount);
+ }
+
/**
* Returns true if any documents have been deleted from this index segment.
$this->_deletedDirty = false;
}
+
+
+
+ /**
+ * Term Dictionary File object for stream like terms reading
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_tisFile = null;
+
+ /**
+ * Frequencies File object for stream like terms reading
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_frqFile = null;
+
+ /**
+ * Offset of the .frq file in the compound file
+ *
+ * @var integer
+ */
+ private $_frqFileOffset;
+
+ /**
+ * Positions File object for stream like terms reading
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_prxFile = null;
+
+ /**
+ * Offset of the .prx file in the compound file
+ *
+ * @var integer
+ */
+ private $_prxFileOffset;
+
+
+ /**
+ * Number of terms in term stream
+ *
+ * @var integer
+ */
+ private $_termCount = 0;
+
+ /**
+ * Segment skip interval
+ *
+ * @var integer
+ */
+ private $_skipInterval;
+
+ /**
+ * Last TermInfo in a terms stream
+ *
+ * @var Zend_Search_Lucene_Index_TermInfo
+ */
+ private $_lastTermInfo = null;
+
+ /**
+ * Last Term in a terms stream
+ *
+ * @var Zend_Search_Lucene_Index_Term
+ */
+ private $_lastTerm = null;
+
+ /**
+ * Map of the document IDs
+ * Used to get new docID after removing deleted documents.
+ * It is not very efficient in terms of memory usage,
+ * but it is much faster than other methods
+ *
+ * @var array|null
+ */
+ private $_docMap = null;
+
+ /**
+ * An array of all term positions in the documents.
+ * Array structure: array( docId => array( pos1, pos2, ...), ...)
+ *
+ * @var array
+ */
+ private $_lastTermPositions;
+
+ /**
+  * Resets the terms stream and positions it on the first term.
+  *
+  * Re-opens the .tis, .frq and .prx files with non-shared handlers, rebuilds
+  * the document id map and loads the first term via nextTerm().
+  *
+  * $startId - id for the first document
+  * $compact - remove deleted documents (surviving documents get consecutive ids)
+  *
+  * Returns start document id for the next segment
+  *
+  * @param integer $startId
+  * @param boolean $compact
+  * @throws Zend_Search_Lucene_Exception  If the .tis file has an unexpected version marker
+  * @return integer
+  */
+ public function reset($startId = 0, $compact = false)
+ {
+     // Drop a previously used handle before a new one is requested
+     $this->_tisFile = null;
+     $this->_tisFile = $this->openCompoundFile('.tis', false);
+
+     $tiVersion = $this->_tisFile->readInt();
+     if ($tiVersion != (int)0xFFFFFFFE) {
+         throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
+     }
+
+     $this->_termCount = $this->_tisFile->readLong();
+     $this->_tisFile->readInt();                          // Index interval (not used here)
+     $this->_skipInterval = $this->_tisFile->readInt();   // Skip interval
+
+     // Remember where the .frq / .prx data starts: nextTerm() seeks with
+     // absolute offsets relative to these positions
+     $this->_frqFile = null;
+     $this->_frqFile = $this->openCompoundFile('.frq', false);
+     $this->_frqFileOffset = $this->_frqFile->tell();
+
+     $this->_prxFile = null;
+     $this->_prxFile = $this->openCompoundFile('.prx', false);
+     $this->_prxFileOffset = $this->_prxFile->tell();
+
+     // Neutral "previous" entries for the prefix/delta decoding of the first term
+     $this->_lastTerm     = new Zend_Search_Lucene_Index_Term('', -1);
+     $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
+
+     // Map old document ids to new ones; deleted documents get no entry
+     $this->_docMap = array();
+     $liveDocs = 0;
+     for ($docId = 0; $docId < $this->_docCount; $docId++) {
+         if ($this->isDeleted($docId)) {
+             continue;
+         }
+
+         $this->_docMap[$docId] = $startId + ($compact ? $liveDocs : $docId);
+         $liveDocs++;
+     }
+
+     $this->nextTerm();
+
+     return $startId + ($compact ? $liveDocs : $this->_docCount);
+ }
+
+
+ /**
+ * Scans terms dictionary and returns next term
+ *
+ * Reads the next term and its TermInfo from the .tis stream, then decodes the
+ * term's frequencies (.frq) and positions (.prx) into $_lastTermPositions,
+ * keyed by the document ids taken from $_docMap.
+ *
+ * Returns null (and releases the stream file objects) when the stream is not
+ * open or has no terms left.
+ *
+ * @return Zend_Search_Lucene_Index_Term|null
+ */
+ public function nextTerm()
+ {
+ if ($this->_tisFile === null || $this->_termCount == 0) {
+ $this->_lastTerm = null;
+ $this->_lastTermInfo = null;
+
+ // may be necessary for "empty" segment
+ $this->_tisFile = null;
+ $this->_frqFile = null;
+ $this->_prxFile = null;
+
+ return null;
+ }
+
+ // The term text is stored as a prefix length shared with the previous term plus a suffix
+ $termPrefixLength = $this->_tisFile->readVInt();
+ $termSuffix = $this->_tisFile->readString();
+ $termFieldNum = $this->_tisFile->readVInt();
+ $termValue = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix;
+
+ // The stream exposes terms with field names, not segment-local field numbers
+ $this->_lastTerm = new Zend_Search_Lucene_Index_Term($termValue, $this->_fields[$termFieldNum]->name);
+
+ // Frequency and proximity pointers are stored as increments from the previous term's pointers
+ $docFreq = $this->_tisFile->readVInt();
+ $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
+ $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
+ // A skip offset is only read when the term occurs in at least $_skipInterval documents
+ if ($docFreq >= $this->_skipInterval) {
+ $skipOffset = $this->_tisFile->readVInt();
+ } else {
+ $skipOffset = 0;
+ }
+
+ $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
+
+
+ $this->_lastTermPositions = array();
+
+ // Decode array(docId => freq): an odd delta means frequency 1 with no separate
+ // frequency value; an even delta is followed by the frequency
+ $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
+ $freqs = array(); $docId = 0;
+ for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) {
+ $docDelta = $this->_frqFile->readVInt();
+ if( $docDelta % 2 == 1 ) {
+ $docId += ($docDelta-1)/2;
+ $freqs[ $docId ] = 1;
+ } else {
+ $docId += $docDelta/2;
+ $freqs[ $docId ] = $this->_frqFile->readVInt();
+ }
+ }
+
+ // Positions are stored as increments from the previous position within a document
+ $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
+ foreach ($freqs as $docId => $freq) {
+ $termPosition = 0; $positions = array();
+
+ // Positions are read even for documents dropped below, to keep the file pointer in sync
+ for ($count = 0; $count < $freq; $count++ ) {
+ $termPosition += $this->_prxFile->readVInt();
+ $positions[] = $termPosition;
+ }
+
+ // Documents without a $_docMap entry (deleted ones, see reset()) are left out
+ if (isset($this->_docMap[$docId])) {
+ $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
+ }
+ }
+
+
+ // Release the file objects as soon as the last term has been read
+ $this->_termCount--;
+ if ($this->_termCount == 0) {
+ $this->_tisFile = null;
+ $this->_frqFile = null;
+ $this->_prxFile = null;
+ }
+
+ return $this->_lastTerm;
+ }
+
+
+ /**
+ * Returns term in current position
+ *
+ * This is the term loaded by the last reset()/nextTerm() call;
+ * null once the terms stream is exhausted.
+ *
+ * @return Zend_Search_Lucene_Index_Term|null
+ */
+ public function currentTerm()
+ {
+ return $this->_lastTerm;
+ }
+
+
+ /**
+ * Returns an array of all term positions in the documents.
+ * Return array structure: array( docId => array( pos1, pos2, ...), ...)
+ *
+ * The array is rebuilt by every nextTerm() call for the term it loads.
+ * Document ids are the ids from the document map built by reset(), so
+ * deleted documents do not appear.
+ *
+ * @return array
+ */
+ public function currentTermPositions()
+ {
+ return $this->_lastTermPositions;
+ }
}
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
+
+/** Zend_Search_Lucene */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/PriorityQueue.php';
+
+
+/**
+ * Priority queue of segment infos, ordered by the key of each segment's
+ * current term.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Index_SegmentInfoPriorityQueue extends Zend_Search_Lucene_PriorityQueue
+{
+    /**
+     * Compare elements
+     *
+     * Returns true if the current term key of $segmentInfo1 sorts before
+     * the current term key of $segmentInfo2 (byte-wise string comparison);
+     * false otherwise.
+     *
+     * @param mixed $segmentInfo1
+     * @param mixed $segmentInfo2
+     * @return boolean
+     */
+    protected function _less($segmentInfo1, $segmentInfo2)
+    {
+        $firstKey  = $segmentInfo1->currentTerm()->key();
+        $secondKey = $segmentInfo2->currentTerm()->key();
+
+        return strcmp($firstKey, $secondKey) < 0;
+    }
+
+}
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfo */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
+
+/** Zend_Search_Lucene_Index_SegmentWriter_StreamWriter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfoPriorityQueue */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php';
+
+
+/**
+ * Merges a set of source segments into one new segment written through a
+ * stream segment writer.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Index_SegmentMerger
+{
+    /**
+     * Target segment writer
+     *
+     * @var Zend_Search_Lucene_Index_SegmentWriter_StreamWriter
+     */
+    private $_writer;
+
+    /**
+     * Number of docs in a new segment
+     *
+     * @var integer
+     */
+    private $_docCount;
+
+    /**
+     * A set of segments to be merged, keyed by segment name
+     *
+     * @var array Zend_Search_Lucene_Index_SegmentInfo
+     */
+    private $_segmentInfos = array();
+
+    /**
+     * Flag to signal, that merge is already done
+     *
+     * @var boolean
+     */
+    private $_mergeDone = false;
+
+    /**
+     * Field map
+     * [<segment_name>][<field_number>] => <target_field_number>
+     *
+     * @var array
+     */
+    private $_fieldsMap = array();
+
+
+
+    /**
+     * Object constructor.
+     *
+     * Creates new segment merger with $directory as target to merge segments into
+     * and $name as a name of new segment
+     *
+     * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @param string $name
+     */
+    public function __construct($directory, $name)
+    {
+        $this->_writer = new Zend_Search_Lucene_Index_SegmentWriter_StreamWriter($directory, $name);
+    }
+
+
+    /**
+     * Add segment to a collection of segments to be merged
+     *
+     * A segment added under an already used name replaces the earlier one.
+     *
+     * @param Zend_Search_Lucene_Index_SegmentInfo $segmentInfo
+     */
+    public function addSource(Zend_Search_Lucene_Index_SegmentInfo $segmentInfo)
+    {
+        $this->_segmentInfos[$segmentInfo->getName()] = $segmentInfo;
+    }
+
+
+    /**
+     * Do merge.
+     *
+     * Merges field infos, norms, stored fields and terms (in this order)
+     * and closes the writer. May be called only once per merger.
+     *
+     * @return Zend_Search_Lucene_Index_SegmentInfo  Result of the writer's close();
+     *         presumably the newly written segment - confirm against StreamWriter::close()
+     * @throws Zend_Search_Lucene_Exception  If the merge was already done or no source segment was added
+     */
+    public function merge()
+    {
+        if ($this->_mergeDone) {
+            throw new Zend_Search_Lucene_Exception('Merge is already done.');
+        }
+
+        $sourceCount = count($this->_segmentInfos);
+        if ($sourceCount < 1) {
+            throw new Zend_Search_Lucene_Exception('Wrong number of segments to be merged ('
+                                                 . $sourceCount
+                                                 . ').');
+        }
+
+        $this->_mergeFields();
+        $this->_mergeNorms();
+        $this->_mergeStoredFields();
+        $this->_mergeTerms();
+
+        $this->_mergeDone = true;
+
+        return $this->_writer->close();
+    }
+
+
+    /**
+     * Merge fields information
+     *
+     * Registers every field of every source segment with the writer and
+     * records the source field number => target field number mapping.
+     */
+    private function _mergeFields()
+    {
+        foreach ($this->_segmentInfos as $sourceName => $source) {
+            foreach ($source->getFieldInfos() as $sourceField) {
+                $targetNumber = $this->_writer->addFieldInfo($sourceField);
+
+                $this->_fieldsMap[$sourceName][$sourceField->number] = $targetNumber;
+            }
+        }
+    }
+
+    /**
+     * Merge field's normalization factors
+     *
+     * For each indexed field of the target segment the norm vectors of all
+     * source segments are passed to the writer in source order. Bytes which
+     * belong to deleted documents are left out.
+     */
+    private function _mergeNorms()
+    {
+        foreach ($this->_writer->getFieldInfos() as $targetField) {
+            if (!$targetField->isIndexed) {
+                continue;
+            }
+
+            foreach ($this->_segmentInfos as $source) {
+                if (!$source->hasDeletions()) {
+                    // Nothing to filter: the vector can be taken as is
+                    $this->_writer->addNorm($targetField->name, $source->normVector($targetField->name));
+                    continue;
+                }
+
+                $sourceNorm = $source->normVector($targetField->name);
+                $liveNorm   = '';
+                $docTotal   = $source->count();
+                for ($docId = 0; $docId < $docTotal; $docId++) {
+                    if ($source->isDeleted($docId)) {
+                        continue;
+                    }
+
+                    $liveNorm .= $sourceNorm[$docId];
+                }
+
+                $this->_writer->addNorm($targetField->name, $liveNorm);
+            }
+        }
+    }
+
+    /**
+     * Merge stored fields
+     *
+     * Reads the '.fdt' data of each source segment sequentially and passes
+     * the stored fields of every non-deleted document to the writer.
+     * Records of deleted documents are still read, which keeps the file
+     * position in sync, but are not written.
+     */
+    private function _mergeStoredFields()
+    {
+        $this->_docCount = 0;
+
+        foreach ($this->_segmentInfos as $source) {
+            $fdtFile  = $source->openCompoundFile('.fdt');
+            $docTotal = $source->count();
+
+            for ($docId = 0; $docId < $docTotal; $docId++) {
+                $fieldTotal = $fdtFile->readVInt();
+                $docFields  = array();
+
+                for ($fieldIndex = 0; $fieldIndex < $fieldTotal; $fieldIndex++) {
+                    $fieldNum  = $fdtFile->readVInt();
+                    $bits      = $fdtFile->readByte();
+                    $fieldInfo = $source->getField($fieldNum);
+
+                    // Bit 2 of the flags byte marks binary data; bit 1 is passed on as the tokenized flag
+                    if ($bits & 2) {
+                        // Binary data
+                        $docFields[] = new Zend_Search_Lucene_Field($fieldInfo->name,
+                                                                    $fdtFile->readBinary(),
+                                                                    '',
+                                                                    true,
+                                                                    $fieldInfo->isIndexed,
+                                                                    $bits & 1,
+                                                                    true);
+                    } else {
+                        // Text data
+                        $docFields[] = new Zend_Search_Lucene_Field($fieldInfo->name,
+                                                                    $fdtFile->readString(),
+                                                                    'UTF-8',
+                                                                    true,
+                                                                    $fieldInfo->isIndexed,
+                                                                    $bits & 1);
+                    }
+                }
+
+                if ($source->isDeleted($docId)) {
+                    continue;
+                }
+
+                $this->_docCount++;
+                $this->_writer->addStoredFields($docFields);
+            }
+        }
+    }
+
+
+    /**
+     * Merge terms
+     *
+     * Every source segment is reset as a term stream with compacted document
+     * ids. A priority queue ordered by current term key then delivers the
+     * streams term by term; positions of equal terms are collected and written
+     * once per term.
+     */
+    private function _mergeTerms()
+    {
+        $queue = new Zend_Search_Lucene_Index_SegmentInfoPriorityQueue();
+
+        // Each segment continues the document numbering where the previous one ended
+        $nextStartId = 0;
+        foreach ($this->_segmentInfos as $source) {
+            $nextStartId = $source->reset($nextStartId, true);
+
+            // Skip "empty" segments
+            if ($source->currentTerm() !== null) {
+                $queue->put($source);
+            }
+        }
+
+        $this->_writer->initializeDictionaryFiles();
+
+        $termDocs = array();
+        while (($segmentInfo = $queue->pop()) !== null) {
+            // Merge positions array
+            $termDocs += $segmentInfo->currentTermPositions();
+
+            // The term is complete when no queued segment is positioned on the same term
+            $following    = $queue->top();
+            $termComplete = $following === null
+                         || $following->currentTerm()->key() != $segmentInfo->currentTerm()->key();
+
+            if ($termComplete) {
+                ksort($termDocs, SORT_NUMERIC);
+
+                // Add term if it's contained in any document
+                if (count($termDocs) > 0) {
+                    $this->_writer->addTerm($segmentInfo->currentTerm(), $termDocs);
+                }
+                $termDocs = array();
+            }
+
+            $segmentInfo->nextTerm();
+            // Put the segment back into the queue unless its dictionary is finished
+            if ($segmentInfo->currentTerm() !== null) {
+                $queue->put($segmentInfo);
+            }
+        }
+
+        $this->_writer->closeDictionaryFiles();
+    }
+}
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
-require_once 'Zend/Search/Lucene/Exception.php';
-
-/** Zend_Search_Lucene_Analysis_Analyzer */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
-require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
-class Zend_Search_Lucene_Index_SegmentWriter
+abstract class Zend_Search_Lucene_Index_SegmentWriter
{
/**
* Expert: The fraction of terms in the "dictionary" which should be stored
*
* @var integer
*/
- static public $indexInterval = 128;
+ public static $indexInterval = 128;
/** Expert: The fraction of TermDocs entries stored in skip tables.
* Larger values result in smaller indexes, greater acceleration, but fewer
*
* @var integer
*/
- static public $skipInterval = 0x7FFFFFFF;
+ public static $skipInterval = 0x7FFFFFFF;
/**
* Number of docs in a segment
*
* @var integer
*/
- private $_docCount;
+ protected $_docCount = 0;
/**
* Segment name
*
* @var string
*/
- private $_name;
+ protected $_name;
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory
*/
- private $_directory;
+ protected $_directory;
/**
* List of the index files.
*
* @var unknown_type
*/
- private $_files;
-
- /**
- * Term Dictionary
- * Array of the Zend_Search_Lucene_Index_Term objects
- * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
- *
- * @var array
- */
- private $_termDictionary;
-
- /**
- * Documents, which contain the term
- *
- * @var array
- */
- private $_termDocs;
+ protected $_files = array();
/**
* Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
*
* @var array
*/
- private $_fields;
+ protected $_fields = array();
/**
- * Sizes of the indexed fields.
- * Used for normalization factors calculation.
+ * Normalization factors.
+ * An array fieldName => normVector
+ * normVector is a binary string.
+ * Each byte corresponds to an indexed document in a segment and
+ * encodes normalization factor (float value, encoded by
+ * Zend_Search_Lucene_Search_Similarity::encodeNorm())
*
* @var array
*/
- private $_fieldLengths;
+ protected $_norms = array();
+
/**
* '.fdx' file - Stored Fields, the field index.
*
* @var Zend_Search_Lucene_Storage_File
*/
- private $_fdxFile;
+ protected $_fdxFile = null;
/**
* '.fdt' file - Stored Fields, the field data.
*
* @var Zend_Search_Lucene_Storage_File
*/
- private $_fdtFile;
+ protected $_fdtFile = null;
/**
* @param Zend_Search_Lucene_Storage_Directory $directory
* @param string $name
*/
- public function __construct($directory, $name)
+ public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
{
$this->_directory = $directory;
$this->_name = $name;
- $this->_docCount = 0;
-
- $this->_fields = array();
- $this->_termDocs = array();
- $this->_files = array();
- $this->_norms = array();
- $this->_fieldLengths = array();
- $this->_termDictionary = array();
-
- $this->_fdxFile = null;
- $this->_fdtFile = null;
}
/**
* Add field to the segment
*
+ * Returns actual field number
+ *
* @param Zend_Search_Lucene_Field $field
+ * @return integer
*/
- private function _addFieldInfo(Zend_Search_Lucene_Field $field)
+ public function addField(Zend_Search_Lucene_Field $field)
{
if (!isset($this->_fields[$field->name])) {
+ $fieldNumber = count($this->_fields);
$this->_fields[$field->name] =
new Zend_Search_Lucene_Index_FieldInfo($field->name,
$field->isIndexed,
- count($this->_fields),
+ $fieldNumber,
$field->storeTermVector);
+
+ return $fieldNumber;
} else {
$this->_fields[$field->name]->isIndexed |= $field->isIndexed;
$this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
+
+ return $this->_fields[$field->name]->number;
}
}
-
/**
- * Adds a document to this segment.
+ * Add fieldInfo to the segment
*
- * @param Zend_Search_Lucene_Document $document
- * @throws Zend_Search_Lucene_Exception
+ * Returns actual field number
+ *
+ * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
+ * @return integer
*/
- public function addDocument(Zend_Search_Lucene_Document $document)
+ public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
{
- $storedFields = array();
+ if (!isset($this->_fields[$fieldInfo->name])) {
+ $fieldNumber = count($this->_fields);
+ $this->_fields[$fieldInfo->name] =
+ new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name,
+ $fieldInfo->isIndexed,
+ $fieldNumber,
+ $fieldInfo->storeTermVector);
+
+ return $fieldNumber;
+ } else {
+ $this->_fields[$fieldInfo->name]->isIndexed |= $fieldInfo->isIndexed;
+ $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector;
- foreach ($document->getFieldNames() as $fieldName) {
- $field = $document->getField($fieldName);
- $this->_addFieldInfo($field);
+ return $this->_fields[$fieldInfo->name]->number;
+ }
+ }
- if ($field->storeTermVector) {
- /**
- * @todo term vector storing support
- */
- throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
- }
+ /**
+ * Returns array of FieldInfo objects.
+ *
+ * @return array
+ */
+ public function getFieldInfos()
+ {
+ return $this->_fields;
+ }
- if ($field->isIndexed) {
- if ($field->isTokenized) {
- $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
- } else {
- $tokenList = array();
- $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
- }
- $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList);
-
- $position = 0;
- foreach ($tokenList as $token) {
- $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
- $termKey = $term->key();
-
- if (!isset($this->_termDictionary[$termKey])) {
- // New term
- $this->_termDictionary[$termKey] = $term;
- $this->_termDocs[$termKey] = array();
- $this->_termDocs[$termKey][$this->_docCount] = array();
- } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
- // Existing term, but new term entry
- $this->_termDocs[$termKey][$this->_docCount] = array();
- }
- $position += $token->getPositionIncrement();
- $this->_termDocs[$termKey][$this->_docCount][] = $position;
- }
- }
+ /**
+ * Add stored fields information
+ *
+ * @param array $storedFields array of Zend_Search_Lucene_Field objects
+ */
+ public function addStoredFields($storedFields)
+ {
+ // The stored-fields files are created lazily on the first call and
+ // registered in $_files.
+ if (!isset($this->_fdxFile)) {
+ $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
+ $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
- if ($field->isStored) {
- $storedFields[] = $field;
- }
+ $this->_files[] = $this->_name . '.fdx';
+ $this->_files[] = $this->_name . '.fdt';
}
- if (count($storedFields) != 0) {
- if (!isset($this->_fdxFile)) {
- $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
- $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
-
- $this->_files[] = $this->_name . '.fdx';
- $this->_files[] = $this->_name . '.fdt';
- }
-
- $this->_fdxFile->writeLong($this->_fdtFile->tell());
- $this->_fdtFile->writeVInt(count($storedFields));
- foreach ($storedFields as $field) {
- $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
- $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
- ($field->isBinary ? 0x02 : 0x00) |
- 0x00; /* 0x04 - third bit, compressed (ZLIB) */
- $this->_fdtFile->writeByte($fieldBits);
- if ($field->isBinary) {
- $this->_fdtFile->writeVInt(strlen($field->stringValue));
- $this->_fdtFile->writeBytes($field->stringValue);
- } else {
- $this->_fdtFile->writeString($field->stringValue);
- }
+ // .fdx gets the current .fdt offset, i.e. where this document's record starts.
+ $this->_fdxFile->writeLong($this->_fdtFile->tell());
+ // .fdt record: field count, then (field number, flag byte, value) per field.
+ // Every field must already be registered in $_fields, because its number is looked up by name.
+ $this->_fdtFile->writeVInt(count($storedFields));
+ foreach ($storedFields as $field) {
+ $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
+ $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
+ ($field->isBinary ? 0x02 : 0x00) |
+ 0x00; /* 0x04 - third bit, compressed (ZLIB) */
+ $this->_fdtFile->writeByte($fieldBits);
+ // Binary values are written as an explicit byte length plus raw bytes;
+ // all other values go through getUtf8Value() and writeString().
+ if ($field->isBinary) {
+ $this->_fdtFile->writeVInt(strlen($field->value));
+ $this->_fdtFile->writeBytes($field->value);
+ } else {
+ $this->_fdtFile->writeString($field->getUtf8Value());
}
}
+ // The document counter advances on every call, including calls with an
+ // empty $storedFields array (such a document still gets an .fdx entry).
$this->_docCount++;
}
+ /**
+ * Returns the number of documents added to this segment so far.
+ *
+ * The value is the $_docCount counter, which addStoredFields()
+ * increments once per call.
+ *
+ * @return integer
+ */
+ public function count()
+ {
+ return $this->_docCount;
+ }
/**
* Dump Field Info (.fnm) segment file
*/
- private function _dumpFNM()
+ protected function _dumpFNM()
{
$fnmFile = $this->_directory->createFile($this->_name . '.fnm');
$fnmFile->writeVInt(count($this->_fields));
);
if ($field->isIndexed) {
- $fieldNum = $this->_fields[$field->name]->number;
- $fieldName = $field->name;
- $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();
- $norm = '';
-
- for ($count = 0; $count < $this->_docCount; $count++) {
- $numTokens = isset($this->_fieldLengths[$fieldName][$count]) ?
- $this->_fieldLengths[$fieldName][$count] : 0;
- $norm .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, $numTokens)));
- }
-
- $normFileName = $this->_name . '.f' . $fieldNum;
+ $normFileName = $this->_name . '.f' . $field->number;
$fFile = $this->_directory->createFile($normFileName);
- $fFile->writeBytes($norm);
+ $fFile->writeBytes($this->_norms[$field->name]);
$this->_files[] = $normFileName;
}
}
}
+
+ /**
+ * Term Dictionary file
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_tisFile = null;
+
+ /**
+ * Term Dictionary index file
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_tiiFile = null;
+
+ /**
+ * Frequencies file
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_frqFile = null;
+
+ /**
+ * Positions file
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_prxFile = null;
+
+ /**
+ * Number of written terms
+ *
+ * @var integer
+ */
+ private $_termCount;
+
+
+ /**
+ * Last saved term
+ *
+ * @var Zend_Search_Lucene_Index_Term
+ */
+ private $_prevTerm;
+
+ /**
+ * Last saved term info
+ *
+ * @var Zend_Search_Lucene_Index_TermInfo
+ */
+ private $_prevTermInfo;
+
+ /**
+ * Last saved index term
+ *
+ * @var Zend_Search_Lucene_Index_Term
+ */
+ private $_prevIndexTerm;
+
+ /**
+ * Last saved index term info
+ *
+ * @var Zend_Search_Lucene_Index_TermInfo
+ */
+ private $_prevIndexTermInfo;
+
+ /**
+ * Last term dictionary file position
+ *
+ * @var integer
+ */
+ private $_lastIndexPosition;
+
+ /**
+ * Create dictionary, frequency and positions files and write necessary headers
+ *
+ * The term counts written into the .tis and .tii headers here are
+ * placeholders; closeDictionaryFiles() seeks back and overwrites them
+ * once the real number of terms is known. The method also resets the
+ * "previous term" state used by addTerm().
+ */
+ public function initializeDictionaryFiles()
+ {
+ $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
+ $this->_tisFile->writeInt((int)0xFFFFFFFE);
+ $this->_tisFile->writeLong(0 /* dummy data for terms count */);
+ $this->_tisFile->writeInt(self::$indexInterval);
+ $this->_tisFile->writeInt(self::$skipInterval);
+
+ $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
+ $this->_tiiFile->writeInt((int)0xFFFFFFFE);
+ $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
+ $this->_tiiFile->writeInt(self::$indexInterval);
+ $this->_tiiFile->writeInt(self::$skipInterval);
+
+ /** Dump dictionary header */
+ $this->_tiiFile->writeVInt(0); // prefix length
+ $this->_tiiFile->writeString(''); // suffix
+ $this->_tiiFile->writeInt((int)0xFFFFFFFF); // field number
+ $this->_tiiFile->writeByte((int)0x0F);
+ $this->_tiiFile->writeVInt(0); // DocFreq
+ $this->_tiiFile->writeVInt(0); // FreqDelta
+ $this->_tiiFile->writeVInt(0); // ProxDelta
+ // NOTE(review): 20 presumably equals the size of the .tis header written above
+ // (int + long + int + int) - confirm against the writeInt()/writeLong() widths.
+ $this->_tiiFile->writeVInt(20); // IndexDelta
+
+ $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
+ $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');
+
+ $this->_files[] = $this->_name . '.tis';
+ $this->_files[] = $this->_name . '.tii';
+ $this->_files[] = $this->_name . '.frq';
+ $this->_files[] = $this->_name . '.prx';
+
+ $this->_prevTerm = null;
+ $this->_prevTermInfo = null;
+ $this->_prevIndexTerm = null;
+ $this->_prevIndexTermInfo = null;
+ // Must stay in sync with the IndexDelta value written to the .tii header above:
+ // addTerm() writes later index deltas relative to this position.
+ $this->_lastIndexPosition = 20;
+ $this->_termCount = 0;
+
+ }
+
+    /**
+     * Append one term, together with its postings, to the dictionary files.
+     *
+     * $termDocs describes where the term occurs:
+     * array( docId => array(pos1, pos2, pos3, ...), ... )
+     *
+     * Document ids and positions are written as deltas against the
+     * previously written value. Every self::$indexInterval-th term is
+     * additionally written to the dictionary index (.tii) file.
+     *
+     * @param Zend_Search_Lucene_Index_Term $termEntry
+     * @param array $termDocs
+     */
+    public function addTerm($termEntry, $termDocs)
+    {
+        $frqStart = $this->_frqFile->tell();
+        $prxStart = $this->_prxFile->tell();
+
+        $lastDocId = 0;
+        foreach ($termDocs as $docId => $positions) {
+            $delta     = ($docId - $lastDocId)*2;
+            $lastDocId = $docId;
+
+            if (count($positions) > 1) {
+                // Even value: an explicit frequency follows
+                $this->_frqFile->writeVInt($delta);
+                $this->_frqFile->writeVInt(count($positions));
+            } else {
+                // Low bit set: no separate frequency value is written
+                $this->_frqFile->writeVInt($delta + 1);
+            }
+
+            $lastPosition = 0;
+            foreach ($positions as $pos) {
+                $this->_prxFile->writeVInt($pos - $lastPosition);
+                $lastPosition = $pos;
+            }
+        }
+
+        $docFreq = count($termDocs);
+
+        /**
+         * @todo Write Skip Data to a freq file.
+         * It's not used now, but make index more optimal
+         */
+        $skipOffset = ($docFreq >= self::$skipInterval)
+                    ? $this->_frqFile->tell() - $frqStart
+                    : 0;
+
+        // The dictionary stores the field number, not the field name
+        $term     = new Zend_Search_Lucene_Index_Term($termEntry->text,
+                                                      $this->_fields[$termEntry->field]->number);
+        $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $frqStart, $prxStart, $skipOffset);
+
+        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);
+
+        if (($this->_termCount + 1) % self::$indexInterval == 0) {
+            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);
+
+            // Index entries also record how far the .tis file advanced since the last index entry
+            $tisPosition = $this->_tisFile->tell();
+            $this->_tiiFile->writeVInt($tisPosition - $this->_lastIndexPosition);
+            $this->_lastIndexPosition = $tisPosition;
+        }
+
+        $this->_termCount++;
+    }
+
+ /**
+ * Close dictionary
+ *
+ * Back-patches the term counts in the .tis and .tii headers, replacing
+ * the dummy values written by initializeDictionaryFiles(). No file
+ * handle is released here; the method only seeks and writes.
+ */
+ public function closeDictionaryFiles()
+ {
+ // Offset 4 is where initializeDictionaryFiles() wrote the dummy term count,
+ // right after the leading writeInt() format marker.
+ // NOTE(review): assumes writeInt() emits 4 bytes - confirm.
+ $this->_tisFile->seek(4);
+ $this->_tisFile->writeLong($this->_termCount);
+
+ $this->_tiiFile->seek(4);
+ // NOTE(review): the "+ 2" presumably accounts for the header entry written to
+ // the .tii file by initializeDictionaryFiles() - confirm against the index format.
+ $this->_tiiFile->writeLong(ceil(($this->_termCount + 2)/self::$indexInterval));
+ }
+
+
/**
* Dump Term Dictionary segment file entry.
* Used to write entry to .tis or .tii files
* @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
* @param Zend_Search_Lucene_Index_TermInfo $termInfo
*/
- private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
+ protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
&$prevTerm, Zend_Search_Lucene_Index_Term $term,
&$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
{
if (isset($prevTerm) && $prevTerm->field == $term->field) {
- $prefixLength = 0;
- while ($prefixLength < strlen($prevTerm->text) &&
- $prefixLength < strlen($term->text) &&
- $prevTerm->text{$prefixLength} == $term->text{$prefixLength}
- ) {
- $prefixLength++;
+ $matchedBytes = 0;
+ $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
+ while ($matchedBytes < $maxBytes &&
+ $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
+ $matchedBytes++;
+ }
+
+ // Calculate actual matched UTF-8 pattern
+ $prefixBytes = 0;
+ $prefixChars = 0;
+ while ($prefixBytes < $matchedBytes) {
+ $charBytes = 1;
+ if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
+ $charBytes++;
+ if (ord($term->text[$prefixBytes]) & 0x20 ) {
+ $charBytes++;
+ if (ord($term->text[$prefixBytes]) & 0x10 ) {
+ $charBytes++;
+ }
+ }
+ }
+
+ if ($prefixBytes + $charBytes > $matchedBytes) {
+ // char crosses matched bytes boundary
+ // skip char
+ break;
+ }
+
+ $prefixChars++;
+ $prefixBytes += $charBytes;
}
+
// Write preffix length
- $dicFile->writeVInt($prefixLength);
+ $dicFile->writeVInt($prefixChars);
// Write suffix
- $dicFile->writeString( substr($term->text, $prefixLength) );
+ $dicFile->writeString(substr($term->text, $prefixBytes));
} else {
// Write preffix length
$dicFile->writeVInt(0);
$prevTermInfo = $termInfo;
}
- /**
- * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
- */
- private function _dumpDictionary()
- {
- $termKeys = array_keys($this->_termDictionary);
- sort($termKeys, SORT_STRING);
-
- $tisFile = $this->_directory->createFile($this->_name . '.tis');
- $tisFile->writeInt((int)0xFFFFFFFE);
- $tisFile->writeLong(count($termKeys));
- $tisFile->writeInt(self::$indexInterval);
- $tisFile->writeInt(self::$skipInterval);
-
- $tiiFile = $this->_directory->createFile($this->_name . '.tii');
- $tiiFile->writeInt((int)0xFFFFFFFE);
- $tiiFile->writeLong(ceil((count($termKeys) + 2)/self::$indexInterval));
- $tiiFile->writeInt(self::$indexInterval);
- $tiiFile->writeInt(self::$skipInterval);
-
- /** Dump dictionary header */
- $tiiFile->writeVInt(0); // preffix length
- $tiiFile->writeString(''); // suffix
- $tiiFile->writeInt((int)0xFFFFFFFF); // field number
- $tiiFile->writeByte((int)0x0F);
- $tiiFile->writeVInt(0); // DocFreq
- $tiiFile->writeVInt(0); // FreqDelta
- $tiiFile->writeVInt(0); // ProxDelta
- $tiiFile->writeVInt(20); // IndexDelta
-
- $frqFile = $this->_directory->createFile($this->_name . '.frq');
- $prxFile = $this->_directory->createFile($this->_name . '.prx');
-
- $termCount = 1;
-
- $prevTerm = null;
- $prevTermInfo = null;
- $prevIndexTerm = null;
- $prevIndexTermInfo = null;
- $prevIndexPosition = 20;
-
- foreach ($termKeys as $termId) {
- $freqPointer = $frqFile->tell();
- $proxPointer = $prxFile->tell();
-
- $prevDoc = 0;
- foreach ($this->_termDocs[$termId] as $docId => $termPositions) {
- $docDelta = ($docId - $prevDoc)*2;
- $prevDoc = $docId;
- if (count($termPositions) > 1) {
- $frqFile->writeVInt($docDelta);
- $frqFile->writeVInt(count($termPositions));
- } else {
- $frqFile->writeVInt($docDelta + 1);
- }
-
- $prevPosition = 0;
- foreach ($termPositions as $position) {
- $prxFile->writeVInt($position - $prevPosition);
- $prevPosition = $position;
- }
- }
-
- if (count($this->_termDocs[$termId]) >= self::$skipInterval) {
- /**
- * @todo Write Skip Data to a freq file.
- * It's not used now, but make index more optimal
- */
- $skipOffset = $frqFile->tell() - $freqPointer;
- } else {
- $skipOffset = 0;
- }
-
- $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text,
- $this->_fields[$this->_termDictionary[$termId]->field]->number);
- $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]),
- $freqPointer, $proxPointer, $skipOffset);
-
- $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo);
-
- if ($termCount % self::$indexInterval == 0) {
- $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo);
-
- $indexPosition = $tisFile->tell();
- $tiiFile->writeVInt($indexPosition - $prevIndexPosition);
- $prevIndexPosition = $indexPosition;
- }
- $termCount++;
- }
-
- $this->_files[] = $this->_name . '.tis';
- $this->_files[] = $this->_name . '.tii';
- $this->_files[] = $this->_name . '.frq';
- $this->_files[] = $this->_name . '.prx';
- }
-
/**
* Generate compound index file
*/
- private function _generateCFS()
+ protected function _generateCFS()
{
$cfsFile = $this->_directory->createFile($this->_name . '.cfs');
$cfsFile->writeVInt(count($this->_files));
$cfsFile->seek($dataOffset);
$dataFile = $this->_directory->getFileObject($fileName);
- $data = $dataFile->readBytes($this->_directory->fileLength($fileName));
- $cfsFile->writeBytes($data);
+
+ $byteCount = $this->_directory->fileLength($fileName);
+ while ($byteCount > 0) {
+ $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
+ $byteCount -= strlen($data);
+ $cfsFile->writeBytes($data);
+ }
$this->_directory->deleteFile($fileName);
}
*
* @return Zend_Search_Lucene_Index_SegmentInfo
*/
- public function close()
- {
- if ($this->_docCount == 0) {
- return null;
- }
-
- $this->_dumpFNM();
- $this->_dumpDictionary();
-
- $this->_generateCFS();
-
- return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
- $this->_docCount,
- $this->_directory);
- }
-
+ abstract public function close();
}
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer.php';
+
+/** Zend_Search_Lucene_Index_SegmentWriter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter.php';
+
+
+/**
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
+{
+    /**
+     * Term Dictionary
+     * Array of the Zend_Search_Lucene_Index_Term objects,
+     * keyed by Zend_Search_Lucene_Index_Term::key()
+     *
+     * @var array
+     */
+    protected $_termDictionary;
+
+    /**
+     * Documents, which contain the term
+     * array( termKey => array( docId => array(pos1, pos2, ...), ... ), ... )
+     *
+     * @var array
+     */
+    protected $_termDocs;
+
+    /**
+     * Object constructor.
+     *
+     * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @param string $name
+     */
+    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
+    {
+        parent::__construct($directory, $name);
+
+        $this->_termDocs       = array();
+        $this->_termDictionary = array();
+    }
+
+
+    /**
+     * Adds a document to this segment.
+     *
+     * The document is validated as a whole before any segment state is
+     * changed, so a rejected document leaves no field infos or postings
+     * behind.
+     *
+     * @param Zend_Search_Lucene_Document $document
+     * @throws Zend_Search_Lucene_Exception if any field requests term vector storing
+     */
+    public function addDocument(Zend_Search_Lucene_Document $document)
+    {
+        // Pass 1: validation only. Throwing from inside the indexing loop would
+        // leave fields registered and postings recorded under the current doc id,
+        // and those postings would then be attributed to the next document.
+        $fields = array();
+        foreach ($document->getFieldNames() as $fieldName) {
+            $field = $document->getField($fieldName);
+
+            if ($field->storeTermVector) {
+                /**
+                 * @todo term vector storing support
+                 */
+                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
+            }
+
+            $fields[] = $field;
+        }
+
+        $storedFields = array();
+        $docNorms     = array();
+        $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();
+
+        // Pass 2: register fields, collect postings and per-field norms
+        foreach ($fields as $field) {
+            $this->addField($field);
+
+            if ($field->isIndexed) {
+                if ($field->isTokenized) {
+                    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
+                    $analyzer->setInput($field->value, $field->encoding);
+
+                    $position     = 0;
+                    $tokenCounter = 0;
+                    while (($token = $analyzer->nextToken()) !== null) {
+                        $tokenCounter++;
+
+                        $term      = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
+                        $position += $token->getPositionIncrement();
+                        $this->_addTermPosition($term, $position);
+                    }
+                } else {
+                    // The whole value is a single term at position 0
+                    $tokenCounter = 1;
+
+                    $term = new Zend_Search_Lucene_Index_Term($field->getUtf8Value(), $field->name);
+                    $this->_addTermPosition($term, 0);
+                }
+
+                $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name,
+                                                                                               $tokenCounter)*
+                                                                       $document->boost*
+                                                                       $field->boost ));
+            }
+
+            if ($field->isStored) {
+                $storedFields[] = $field;
+            }
+        }
+
+
+        // Every indexed field of the segment needs one norm byte per document,
+        // including documents that do not contain the field.
+        foreach ($this->_fields as $fieldName => $fieldInfo) {
+            if (!$fieldInfo->isIndexed) {
+                continue;
+            }
+
+            if (!isset($this->_norms[$fieldName])) {
+                // Field seen for the first time: back-fill norms for all earlier documents
+                $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
+                                                       $this->_docCount);
+            }
+
+            if (isset($docNorms[$fieldName])){
+                $this->_norms[$fieldName] .= $docNorms[$fieldName];
+            } else {
+                $this->_norms[$fieldName] .= chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));
+            }
+        }
+
+        // Writes the stored fields and advances the document counter
+        $this->addStoredFields($storedFields);
+    }
+
+
+    /**
+     * Record one occurrence of a term in the document currently being added.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @param integer $position
+     */
+    private function _addTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
+    {
+        $termKey = $term->key();
+
+        if (!isset($this->_termDictionary[$termKey])) {
+            // New term
+            $this->_termDictionary[$termKey] = $term;
+            $this->_termDocs[$termKey]       = array();
+        }
+
+        if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
+            // First occurrence of the term in this document
+            $this->_termDocs[$termKey][$this->_docCount] = array();
+        }
+
+        $this->_termDocs[$termKey][$this->_docCount][] = $position;
+    }
+
+
+    /**
+     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
+     *
+     * Terms are written in term key order (string sort).
+     */
+    protected function _dumpDictionary()
+    {
+        ksort($this->_termDictionary, SORT_STRING);
+
+        $this->initializeDictionaryFiles();
+
+        foreach ($this->_termDictionary as $termId => $term) {
+            $this->addTerm($term, $this->_termDocs[$termId]);
+        }
+
+        $this->closeDictionaryFiles();
+    }
+
+
+    /**
+     * Close segment, write it to disk and return segment info
+     *
+     * Returns null when no document has been added to the segment.
+     *
+     * @return Zend_Search_Lucene_Index_SegmentInfo
+     */
+    public function close()
+    {
+        if ($this->_docCount == 0) {
+            return null;
+        }
+
+        $this->_dumpFNM();
+        $this->_dumpDictionary();
+
+        $this->_generateCFS();
+
+        return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
+                                                        $this->_docCount,
+                                                        $this->_directory);
+    }
+
+}
+
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfo */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
+
+/** Zend_Search_Lucene_Index_SegmentWriter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter.php';
+
+
+/**
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+class Zend_Search_Lucene_Index_SegmentWriter_StreamWriter extends Zend_Search_Lucene_Index_SegmentWriter
+{
+    /**
+     * Object constructor.
+     *
+     * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @param string $name
+     */
+    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
+    {
+        parent::__construct($directory, $name);
+    }
+
+
+    /**
+     * Create stored fields files and open them for write
+     *
+     * Both files are also registered in the segment file list.
+     */
+    public function createStoredFieldsFiles()
+    {
+        $fdxName = $this->_name . '.fdx';
+        $fdtName = $this->_name . '.fdt';
+
+        $this->_fdxFile = $this->_directory->createFile($fdxName);
+        $this->_fdtFile = $this->_directory->createFile($fdtName);
+
+        $this->_files[] = $fdxName;
+        $this->_files[] = $fdtName;
+    }
+
+    /**
+     * Append a norm vector to the norms collected for a field
+     *
+     * The first vector passed for a field name is stored as is,
+     * later ones are concatenated to it.
+     *
+     * @param string $fieldName
+     * @param string $normVector
+     */
+    public function addNorm($fieldName, $normVector)
+    {
+        $this->_norms[$fieldName] = isset($this->_norms[$fieldName])
+                                  ? $this->_norms[$fieldName] . $normVector
+                                  : $normVector;
+    }
+
+    /**
+     * Close segment, write it to disk and return segment info
+     *
+     * Returns null when the segment contains no documents.
+     *
+     * @return Zend_Search_Lucene_Index_SegmentInfo
+     */
+    public function close()
+    {
+        if ($this->_docCount != 0) {
+            $this->_dumpFNM();
+            $this->_generateCFS();
+
+            return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
+                                                            $this->_docCount,
+                                                            $this->_directory);
+        }
+
+        return null;
+    }
+}
+
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_Term
/**
- * @todo docblock
+ * Object constructor
*/
- public function __construct( $text, $field = 'contents' )
+ public function __construct($text, $field = null)
{
- $this->field = $field;
- $this->text = $text;
+ // A null $field falls back to Zend_Search_Lucene::getDefaultSearchField();
+ // any other value, including an empty string, is used as given.
+ $this->field = ($field === null)? Zend_Search_Lucene::getDefaultSearchField() : $field;
+ $this->text = $text;
}
/**
- * @todo docblock
+ * Returns term key
+ *
+ * @return string
*/
public function key()
{
return $this->field . chr(0) . $this->text;
}
+
+    /**
+     * Get term prefix
+     *
+     * Returns the first $length characters of $str, where a character is
+     * a multi-byte sequence whose size is derived from its lead byte
+     * (1 to 4 bytes). If the last sequence is cut off by the end of the
+     * string, the prefix stops before it. A $length of zero or less
+     * yields an empty string.
+     *
+     * @param string  $str
+     * @param integer $length  prefix length in characters
+     * @return string
+     */
+    public static function getPrefix($str, $length)
+    {
+        $strBytes    = strlen($str);
+        $prefixBytes = 0;
+        $prefixChars = 0;
+        while ($prefixBytes < $strBytes && $prefixChars < $length) {
+            // The high bits of the lead byte give the sequence length:
+            // 11xxxxxx => at least 2 bytes, 111xxxxx => 3, 1111xxxx => 4
+            $leadByte  = ord($str[$prefixBytes]);
+            $charBytes = 1;
+            if (($leadByte & 0xC0) == 0xC0) {
+                $charBytes++;
+                if ($leadByte & 0x20 ) {
+                    $charBytes++;
+                    if ($leadByte & 0x10 ) {
+                        $charBytes++;
+                    }
+                }
+            }
+
+            if ($prefixBytes + $charBytes > $strBytes) {
+                // wrong character
+                break;
+            }
+
+            $prefixChars++;
+            $prefixBytes += $charBytes;
+        }
+
+        return substr($str, 0, $prefixBytes);
+    }
}
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_TermInfo
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
-/** Zend_Search_Lucene_Index_SegmentWriter */
-require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
+/** Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
-require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
+
+/** Zend_Search_Lucene_Index_SegmentMerger */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentMerger.php';
+
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
- * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_Writer
{
/**
- * @todo Implement segment merger
- * @todo Implement mergeFactor, minMergeDocs, maxMergeDocs usage.
* @todo Implement Analyzer substitution
* @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for
* temporary index files
*/
/**
- * File system adapter.
+ * Number of documents required before the buffered in-memory
+ * documents are written into a new Segment
*
- * @var Zend_Search_Lucene_Storage_Directory
+ * Default value is 10
+ *
+ * @var integer
*/
- private $_directory = null;
-
+ public $maxBufferedDocs = 10;
/**
- * Index version
- * Counts how often the index has been changed by adding or deleting docs
+ * Largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ * Default value is PHP_INT_MAX
*
* @var integer
*/
- private $_version;
+ public $maxMergeDocs = PHP_INT_MAX;
/**
- * Segment name counter.
- * Used to name new segments .
+ * Determines how often segment indices are merged by addDocument().
+ *
+ * With smaller values, less RAM is used while indexing,
+ * and searches on unoptimized indices are faster,
+ * but indexing speed is slower.
+ *
+ * With larger values, more RAM is used during indexing,
+ * and while searches on unoptimized indices are slower,
+ * indexing is faster.
+ *
+ * Thus larger values (> 10) are best for batch index creation,
+ * and smaller values (< 10) for indices that are interactively maintained.
+ *
+ * Default value is 10
*
* @var integer
*/
- private $_segmentNameCounter;
+ public $mergeFactor = 10;
/**
- * Number of the segments in the index
+ * File system adapter.
*
- * @var inteher
+ * @var Zend_Search_Lucene_Storage_Directory
*/
- private $_segments;
+ private $_directory = null;
+
/**
- * Determines how often segment indices
- * are merged by addDocument().
+ * Changes counter.
*
* @var integer
*/
- public $mergeFactor;
+ private $_versionUpdate = 0;
/**
- * Determines the minimal number of documents required before
- * the buffered in-memory documents are merging and a new Segment
- * is created.
+ * List of the segments, created by index writer
+ * Array of Zend_Search_Lucene_Index_SegmentInfo objects
*
- * @var integer
+ * @var array
*/
- public $minMergeDocs;
+ private $_newSegments = array();
/**
- * Determines the largest number of documents ever merged by addDocument().
+ * List of segments to be deleted on commit
*
- * @var integer
+ * @var array
*/
- public $maxMergeDocs;
+ private $_segmentsToDelete = array();
/**
- * List of the segments, created by index writer
- * Array of Zend_Search_Lucene_Index_SegmentInfo objects
+ * Current segment to add documents
*
- * @var array
+ * @var Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter
*/
- private $_newSegments;
+ private $_currentSegment = null;
/**
- * Current segment to add documents
+ * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
+ *
+ * It's a reference to the corresponding Zend_Search_Lucene::$_segmentInfos array
*
- * @var Zend_Search_Lucene_Index_SegmentWriter
+ * @var array Zend_Search_Lucene_Index_SegmentInfo
*/
- private $_currentSegment;
+ private $_segmentInfos;
/**
* List of indexfiles extensions
'.tvx' => '.tvx',
'.tvd' => '.tvd',
'.tvf' => '.tvf',
- '.del' => '.del' );
+ '.del' => '.del',
+ '.sti' => '.sti' );
/**
* Opens the index for writing
* index or overwrite the existing one.
*
* @param Zend_Search_Lucene_Storage_Directory $directory
+ * @param array $segmentInfos
* @param boolean $create
*/
- public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $create = false)
+ public function __construct(Zend_Search_Lucene_Storage_Directory $directory, &$segmentInfos, $create = false)
{
- $this->_directory = $directory;
+ $this->_directory = $directory;
+ $this->_segmentInfos = &$segmentInfos;
if ($create) {
foreach ($this->_directory->fileList() as $file) {
}
$segmentsFile = $this->_directory->createFile('segments');
$segmentsFile->writeInt((int)0xFFFFFFFF);
- // write version
- $segmentsFile->writeLong(0);
+
+ // write version (initialized from the current time)
+ // $segmentsFile->writeLong((int)microtime(true));
+ $version = microtime(true);
+ $segmentsFile->writeInt((int)($version/((double)0xFFFFFFFF + 1)));
+ $segmentsFile->writeInt((int)($version & 0xFFFFFFFF));
+
// write name counter
$segmentsFile->writeInt(0);
// write segment counter
$deletableFile = $this->_directory->createFile('deletable');
// write counter
$deletableFile->writeInt(0);
-
- $this->_version = 0;
- $this->_segmentNameCounter = 0;
- $this->_segments = 0;
} else {
$segmentsFile = $this->_directory->getFileObject('segments');
$format = $segmentsFile->readInt();
if ($format != (int)0xFFFFFFFF) {
throw new Zend_Search_Lucene_Exception('Wrong segments file format');
}
-
- // read version
- $this->_version = $segmentsFile->readLong();
- // read counter
- $this->_segmentNameCounter = $segmentsFile->readInt();
- // read segment counter
- $this->_segments = $segmentsFile->readInt();
}
-
- $this->_newSegments = array();
- $this->_currentSegment = null;
}
/**
{
if ($this->_currentSegment === null) {
$this->_currentSegment =
- new Zend_Search_Lucene_Index_SegmentWriter($this->_directory, $this->_newSegmentName());
+ new Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter($this->_directory, $this->_newSegmentName());
}
$this->_currentSegment->addDocument($document);
- $this->_version++;
+
+ if ($this->_currentSegment->count() >= $this->maxBufferedDocs) {
+ $this->commit();
+ }
+
+ $this->_versionUpdate++;
+
+ $this->_maybeMergeSegments();
+ }
+
+
+ /**
+ * Merge segments if necessary
+ *
+ * Segments are visited in ascending order of document count and collected
+ * into size tiers. The first tier limit is $this->maxBufferedDocs and every
+ * following limit is the previous one multiplied by $this->mergeFactor.
+ * When a segment is too large for the current tier, the segments pooled so
+ * far are handed to _mergeSegments() if their total document count reaches
+ * the tier limit, the pool is reset, and the limit is raised. Processing
+ * stops as soon as the limit exceeds $this->maxMergeDocs.
+ */
+ private function _maybeMergeSegments()
+ {
+ $segmentSizes = array();
+ foreach ($this->_segmentInfos as $segId => $segmentInfo) {
+ $segmentSizes[$segId] = $segmentInfo->count();
+ }
+
+ $mergePool = array();
+ $poolSize = 0;
+ $sizeToMerge = $this->maxBufferedDocs;
+ asort($segmentSizes, SORT_NUMERIC);
+ foreach ($segmentSizes as $segId => $size) {
+ // The segment does not fit the current tier: close the tier and move on to the next one
+ while ($size >= $sizeToMerge) {
+ // Merge the pooled segments of the closed tier if together they reach the tier limit
+ if ($poolSize >= $sizeToMerge) {
+ $this->_mergeSegments($mergePool);
+ }
+ $mergePool = array();
+ $poolSize = 0;
+
+ $sizeToMerge *= $this->mergeFactor;
+
+ if ($sizeToMerge > $this->maxMergeDocs) {
+ return;
+ }
+ }
+
+ $mergePool[] = $this->_segmentInfos[$segId];
+ $poolSize += $size;
+ }
+
+ if ($poolSize >= $sizeToMerge) {
+ $this->_mergeSegments($mergePool);
+ }
}
+ /**
+ * Merge specified segments
+ *
+ * $segments is an array of SegmentInfo objects
+ *
+ * The merge is skipped silently when the exclusive lock on
+ * 'index.optimization.lock' cannot be obtained. Otherwise every source
+ * segment is registered in $this->_segmentsToDelete, the merged segment
+ * (if merge() returns one) is registered in $this->_newSegments, and
+ * commit() is called before the lock is released.
+ *
+ * NOTE(review): the optimization lock is released only on the normal path;
+ * nothing here releases it if merge() or commit() throws.
+ *
+ * @param array $segments
+ */
+ private function _mergeSegments($segments)
+ {
+ // Try to get an exclusive non-blocking lock on 'index.optimization.lock'.
+ // Skip optimization if another process is performing it right now.
+ $optimizationLock = $this->_directory->createFile('index.optimization.lock');
+ if (!$optimizationLock->lock(LOCK_EX,true)) {
+ return;
+ }
+
+ $newName = $this->_newSegmentName();
+ $merger = new Zend_Search_Lucene_Index_SegmentMerger($this->_directory,
+ $newName);
+ foreach ($segments as $segmentInfo) {
+ $merger->addSource($segmentInfo);
+ $this->_segmentsToDelete[$segmentInfo->getName()] = $segmentInfo->getName();
+ }
+
+ $newSegment = $merger->merge();
+ if ($newSegment !== null) {
+ $this->_newSegments[$newSegment->getName()] = $newSegment;
+ }
+
+ $this->commit();
+ // Optimization is finished; let other processes optimize again
+ $optimizationLock->unlock();
+ }
/**
* Update segments file by adding current segment to a list
- * @todo !!!!!Finish the implementation
+ *
+ * Rewrites the 'segments' file while holding an exclusive lock on
+ * 'index.lock': segments named in $this->_segmentsToDelete are dropped,
+ * segments from $this->_newSegments are appended, the stored index version
+ * is advanced by $this->_versionUpdate, and $this->_segmentInfos is brought
+ * in line with the rewritten file. The files of the dropped segments are
+ * deleted after the lock has been switched back to shared mode.
*
* @throws Zend_Search_Lucene_Exception
*/
private function _updateSegments()
{
- $segmentsFile = $this->_directory->getFileObject('segments');
- $newSegmentFile = $this->_directory->createFile('segments.new');
+ // Get an exclusive index lock.
+ // This waits until all parallel searchers and indexers have finished
+ // and keeps new searchers out while the segments file is being updated.
+ $lock = $this->_directory->getFileObject('index.lock');
+ if (!$lock->lock(LOCK_EX)) {
+ throw new Zend_Search_Lucene_Exception('Can\'t obtain exclusive index lock');
+ }
- $newSegmentFile->writeInt((int)0xFFFFFFFF);
- $newSegmentFile->writeLong($this->_version);
- $newSegmentFile->writeInt($this->_segmentNameCounter);
- $this->_segments += count($this->_newSegments);
- $newSegmentFile->writeInt($this->_segments);
+ // Do not share file handles, so that updates made by other sessions are seen.
+ $segmentsFile = $this->_directory->getFileObject('segments', false);
+ $newSegmentFile = $this->_directory->createFile('segments.new', false);
- $segmentsFile->seek(20);
- $newSegmentFile->writeBytes($segmentsFile->readBytes($this->_directory->fileLength('segments') - 20));
+ // Write format marker
+ $newSegmentFile->writeInt((int)0xFFFFFFFF);
- foreach ($this->_newSegments as $segmentName => $segmentInfo) {
- $newSegmentFile->writeString($segmentName);
+ // Write index version (skip the 4-byte format marker of the old file first)
+ $segmentsFile->seek(4, SEEK_CUR);
+ // $version = $segmentsFile->readLong() + $this->_versionUpdate;
+ // Read and write the 64-bit version as two 32-bit halves so it also works on 32-bit platforms
+ $versionHigh = $segmentsFile->readInt();
+ $versionLow = $segmentsFile->readInt();
+ $version = $versionHigh * ((double)0xFFFFFFFF + 1) +
+ (($versionLow < 0)? (double)0xFFFFFFFF - (-1 - $versionLow) : $versionLow);
+ $version += $this->_versionUpdate;
+ $this->_versionUpdate = 0;
+ $newSegmentFile->writeInt((int)($version/((double)0xFFFFFFFF + 1)));
+ $newSegmentFile->writeInt((int)($version & 0xFFFFFFFF));
+
+ // Write segment name counter (copied unchanged from the old file)
+ $newSegmentFile->writeInt($segmentsFile->readInt());
+
+ // Remember where the segment count is stored; it is patched in at the end
+ $numOfSegmentsOffset = $newSegmentFile->tell();
+ // Write number of segments
+ $segmentsCount = $segmentsFile->readInt();
+ $newSegmentFile->writeInt(0); // Write dummy data (segment counter)
+
+ $segments = array();
+ for ($count = 0; $count < $segmentsCount; $count++) {
+ $segName = $segmentsFile->readString();
+ $segSize = $segmentsFile->readInt();
+
+ if (!in_array($segName, $this->_segmentsToDelete)) {
+ $newSegmentFile->writeString($segName);
+ $newSegmentFile->writeInt($segSize);
+
+ $segments[$segName] = $segSize;
+ }
+ }
+ $segmentsFile->close();
+
+ $segmentsCount = count($segments) + count($this->_newSegments);
+
+ // Remove segments that are no longer listed in $segments (deleted).
+ // What is left in $segments afterwards is not yet in $this->_segmentInfos.
+ foreach ($this->_segmentInfos as $segId => $segInfo) {
+ if (isset($segments[$segInfo->getName()])) {
+ // Segment is already included in $this->_segmentInfos
+ unset($segments[$segInfo->getName()]);
+ } else {
+ // Remove the deleted segment from the list
+ unset($this->_segmentInfos[$segId]);
+ }
+ }
+ // $segments now contains the list of segments to load;
+ // they are loaded at the end of this method
+
+ foreach ($this->_newSegments as $segName => $segmentInfo) {
+ $newSegmentFile->writeString($segName);
$newSegmentFile->writeInt($segmentInfo->count());
+
+ $this->_segmentInfos[] = $segmentInfo;
}
+ $this->_newSegments = array();
+ $newSegmentFile->seek($numOfSegmentsOffset);
+ $newSegmentFile->writeInt($segmentsCount); // Update segments count
+ $newSegmentFile->close();
$this->_directory->renameFile('segments.new', 'segments');
+
+
+ // The segments file update is finished.
+ // Switch back to shared lock mode.
+ $lock->lock(LOCK_SH);
+
+
+ $fileList = $this->_directory->fileList();
+ foreach ($this->_segmentsToDelete as $nameToDelete) {
+ foreach (self::$_indexExtensions as $ext) {
+ if ($this->_directory->fileExists($nameToDelete . $ext)) {
+ $this->_directory->deleteFile($nameToDelete . $ext);
+ }
+ }
+
+ foreach ($fileList as $file) {
+ if (substr($file, 0, strlen($nameToDelete) + 2) == ($nameToDelete . '.f') &&
+ ctype_digit( substr($file, strlen($nameToDelete) + 2) )) {
+ $this->_directory->deleteFile($file);
+ }
+ }
+ }
+ $this->_segmentsToDelete = array();
+
+ // Load segments that are listed in the segments file but were not known to this object
+ foreach ($segments as $segName => $segSize) {
+ // Load new segments
+ $this->_segmentInfos[] = new Zend_Search_Lucene_Index_SegmentInfo($segName,
+ $segSize,
+ $this->_directory);
+ }
}
/**
* Commit current changes
- * returns array of new segments
- *
- * @return array
*/
public function commit()
{
$this->_currentSegment = null;
}
- if (count($this->_newSegments) != 0) {
+ if (count($this->_newSegments) != 0 ||
+ count($this->_segmentsToDelete) != 0) {
$this->_updateSegments();
}
-
- $result = $this->_newSegments;
- $this->_newSegments = array();
-
- return $result;
}
*/
}
-
- /**
- * Returns the number of documents currently in this index.
- *
- * @return integer
- */
- public function docCount($readers)
- {
- /**
- * @todo implementation
- */
- }
-
-
- /**
- * Flushes all changes to an index and closes all associated files.
- *
- */
- public function close()
- {
- /**
- * @todo implementation
- */
- }
-
-
/**
* Merges all segments together into a single segment, optimizing
* an index for search.
+ * The segments merged are the ones currently listed in $this->_segmentInfos.
*
- * return void
+ * @throws Zend_Search_Lucene_Exception
*/
public function optimize()
{
- /**
- * @todo implementation
- */
+ $this->_mergeSegments($this->_segmentInfos);
}
/**
*/
private function _newSegmentName()
{
- return '_' . base_convert($this->_segmentNameCounter++, 10, 36);
+ // Do not share the file handle, so that updates made by other sessions are seen.
+ $segmentsFile = $this->_directory->getFileObject('segments', false);
+
+ // Get an exclusive lock on the segments file.
+ // This is guaranteed not to overlap with an _updateSegments() call
+ // of another process, because that call needs the exclusive index lock and waits
+ // until all other searchers have finished.
+ if (!$segmentsFile->lock(LOCK_EX)) {
+ throw new Zend_Search_Lucene_Exception('Can\'t obtain exclusive index lock');
+ }
+
+ $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)
+ $segmentNameCounter = $segmentsFile->readInt();
+
+ $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)
+ $segmentsFile->writeInt($segmentNameCounter + 1);
+
+ // Flush output to guarantee that a stale counter cannot be read between unlock and
+ // return (which calls the $segmentsFile destructor)
+ $segmentsFile->flush();
+
+ $segmentsFile->unlock();
+
+ return '_' . base_convert($segmentNameCounter, 10, 36);
}
}
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/**
+ * Public interface of a search index: document counts, term lookups,
+ * searching, and index maintenance (add, delete, commit, optimize).
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+interface Zend_Search_Lucene_Interface
+{
+ /**
+ * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
+ *
+ * @return Zend_Search_Lucene_Storage_Directory
+ */
+ public function getDirectory();
+
+ /**
+ * Returns the total number of documents in this index (including deleted documents).
+ *
+ * @return integer
+ */
+ public function count();
+
+ /**
+ * Returns one greater than the largest possible document number.
+ * This may be used to, e.g., determine how big to allocate a structure which will have
+ * an element for every document number in an index.
+ *
+ * @return integer
+ */
+ public function maxDoc();
+
+ /**
+ * Returns the total number of non-deleted documents in this index.
+ *
+ * @return integer
+ */
+ public function numDocs();
+
+ /**
+ * Checks whether the document is deleted
+ *
+ * @param integer $id
+ * @return boolean
+ * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
+ */
+ public function isDeleted($id);
+
+ /**
+ * Set default search field.
+ *
+ * Null means that the search is performed through all fields by default
+ *
+ * Default value is null
+ *
+ * @param string|null $fieldName
+ */
+ public static function setDefaultSearchField($fieldName);
+
+ /**
+ * Get default search field.
+ *
+ * Null means that the search is performed through all fields by default
+ *
+ * @return string|null
+ */
+ public static function getDefaultSearchField();
+
+ /**
+ * Retrieve index maxBufferedDocs option
+ *
+ * maxBufferedDocs is the minimal number of documents required before
+ * the buffered in-memory documents are written into a new Segment
+ *
+ * Default value is 10
+ *
+ * @return integer
+ */
+ public function getMaxBufferedDocs();
+
+ /**
+ * Set index maxBufferedDocs option
+ *
+ * maxBufferedDocs is the minimal number of documents required before
+ * the buffered in-memory documents are written into a new Segment
+ *
+ * Default value is 10
+ *
+ * @param integer $maxBufferedDocs
+ */
+ public function setMaxBufferedDocs($maxBufferedDocs);
+
+ /**
+ * Retrieve index maxMergeDocs option
+ *
+ * maxMergeDocs is the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ * Default value is PHP_INT_MAX
+ *
+ * @return integer
+ */
+ public function getMaxMergeDocs();
+
+ /**
+ * Set index maxMergeDocs option
+ *
+ * maxMergeDocs is the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ * Default value is PHP_INT_MAX
+ *
+ * @param integer $maxMergeDocs
+ */
+ public function setMaxMergeDocs($maxMergeDocs);
+
+ /**
+ * Retrieve index mergeFactor option
+ *
+ * mergeFactor determines how often segment indices are merged by addDocument().
+ * With smaller values, less RAM is used while indexing,
+ * and searches on unoptimized indices are faster,
+ * but indexing speed is slower.
+ * With larger values, more RAM is used during indexing,
+ * and while searches on unoptimized indices are slower,
+ * indexing is faster.
+ * Thus larger values (> 10) are best for batch index creation,
+ * and smaller values (< 10) for indices that are interactively maintained.
+ *
+ * Default value is 10
+ *
+ * @return integer
+ */
+ public function getMergeFactor();
+
+ /**
+ * Set index mergeFactor option
+ *
+ * mergeFactor determines how often segment indices are merged by addDocument().
+ * With smaller values, less RAM is used while indexing,
+ * and searches on unoptimized indices are faster,
+ * but indexing speed is slower.
+ * With larger values, more RAM is used during indexing,
+ * and while searches on unoptimized indices are slower,
+ * indexing is faster.
+ * Thus larger values (> 10) are best for batch index creation,
+ * and smaller values (< 10) for indices that are interactively maintained.
+ *
+ * Default value is 10
+ *
+ * @param integer $mergeFactor
+ */
+ public function setMergeFactor($mergeFactor);
+
+ /**
+ * Performs a query against the index and returns an array
+ * of Zend_Search_Lucene_Search_QueryHit objects.
+ * Input is a string or Zend_Search_Lucene_Search_Query.
+ *
+ * @param mixed $query
+ * @return array Zend_Search_Lucene_Search_QueryHit
+ * @throws Zend_Search_Lucene_Exception
+ */
+ public function find($query);
+
+ /**
+ * Returns a list of all unique field names that exist in this index.
+ *
+ * @param boolean $indexed
+ * @return array
+ */
+ public function getFieldNames($indexed = false);
+
+ /**
+ * Returns a Zend_Search_Lucene_Document object for the document
+ * number $id in this index.
+ *
+ * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+ * @return Zend_Search_Lucene_Document
+ */
+ public function getDocument($id);
+
+ /**
+ * Returns true if the index contains documents with the specified term.
+ *
+ * Is used for query optimization.
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return boolean
+ */
+ public function hasTerm(Zend_Search_Lucene_Index_Term $term);
+
+ /**
+ * Returns IDs of all the documents containing term.
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return array
+ */
+ public function termDocs(Zend_Search_Lucene_Index_Term $term);
+
+ /**
+ * Returns an array of all term freqs.
+ * Return array structure: array( docId => freq, ...)
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return array
+ */
+ public function termFreqs(Zend_Search_Lucene_Index_Term $term);
+
+ /**
+ * Returns an array of all term positions in the documents.
+ * Return array structure: array( docId => array( pos1, pos2, ...), ...)
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return array
+ */
+ public function termPositions(Zend_Search_Lucene_Index_Term $term);
+
+ /**
+ * Returns the number of documents in this index containing the $term.
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return integer
+ */
+ public function docFreq(Zend_Search_Lucene_Index_Term $term);
+
+ /**
+ * Retrieve similarity used by index reader
+ *
+ * @return Zend_Search_Lucene_Search_Similarity
+ */
+ public function getSimilarity();
+
+ /**
+ * Returns a normalization factor for "field, document" pair.
+ *
+ * @param integer $id
+ * @param string $fieldName
+ * @return float
+ */
+ public function norm($id, $fieldName);
+
+ /**
+ * Returns true if any documents have been deleted from this index.
+ *
+ * @return boolean
+ */
+ public function hasDeletions();
+
+ /**
+ * Deletes a document from the index.
+ * $id is an internal document id
+ *
+ * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+ * @throws Zend_Search_Lucene_Exception
+ */
+ public function delete($id);
+
+ /**
+ * Adds a document to this index.
+ *
+ * @param Zend_Search_Lucene_Document $document
+ */
+ public function addDocument(Zend_Search_Lucene_Document $document);
+
+ /**
+ * Commit changes resulting from delete() or undeleteAll() operations.
+ */
+ public function commit();
+
+ /**
+ * Optimize index.
+ *
+ * Merges all segments into one
+ */
+ public function optimize();
+
+ /**
+ * Returns an array of all terms in this index.
+ *
+ * @return array
+ */
+ public function terms();
+
+ /**
+ * Undeletes all documents currently marked as deleted in this index.
+ */
+ public function undeleteAll();
+
+
+ /**
+ * Add reference to the index object
+ *
+ * @internal
+ */
+ public function addReference();
+
+ /**
+ * Remove reference from the index object
+ *
+ * When reference count becomes zero, index is closed and resources are cleaned up
+ *
+ * @internal
+ */
+ public function removeReference();
+}
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+
+/**
+ * Abstract Priority Queue
+ *
+ * It implements a priority queue as a binary heap stored in an array;
+ * the element that _less() ranks lowest is kept at the top.
+ * Please go to "Data Structures and Algorithms",
+ * Aho, Hopcroft, and Ullman, Addison-Wesley, 1983 (corrected 1987 edition),
+ * for implementation details.
+ *
+ * It provides O(log(N)) time of put/pop operations, where N is the size of the queue
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+abstract class Zend_Search_Lucene_PriorityQueue
+{
+ /**
+ * Queue heap
+ *
+ * Heap contains a balanced, partially ordered binary tree represented as an array
+ * [0] - top of the tree
+ * [1] - first child of [0]
+ * [2] - second child of [0]
+ * ...
+ * [2*n + 1] - first child of [n]
+ * [2*n + 2] - second child of [n]
+ *
+ * @var array
+ */
+ private $_heap = array();
+
+
+ /**
+ * Add element to the queue
+ *
+ * The new element starts at the first free slot at the end of the heap
+ * and moves up while it is less than its parent.
+ *
+ * O(log(N)) time
+ *
+ * @param mixed $element
+ */
+ public function put($element)
+ {
+ $nodeId = count($this->_heap);
+ $parentId = ($nodeId-1) >> 1; // floor( ($nodeId-1)/2 )
+
+ while ($nodeId != 0 && $this->_less($element, $this->_heap[$parentId])) {
+ // Move parent node down
+ $this->_heap[$nodeId] = $this->_heap[$parentId];
+
+ // Move pointer to the next level of tree
+ $nodeId = $parentId;
+ $parentId = ($nodeId-1) >> 1; // floor( ($nodeId-1)/2 )
+ }
+
+ // Put new node into the tree
+ $this->_heap[$nodeId] = $element;
+ }
+
+
+ /**
+ * Return the least element of the queue without removing it
+ *
+ * Returns null if the queue is empty.
+ *
+ * Constant time
+ *
+ * @return mixed
+ */
+ public function top()
+ {
+ if (count($this->_heap) == 0) {
+ return null;
+ }
+
+ return $this->_heap[0];
+ }
+
+
+ /**
+ * Removes and returns the least element of the queue
+ *
+ * Returns null if the queue is empty.
+ *
+ * O(log(N)) time
+ *
+ * @return mixed
+ */
+ public function pop()
+ {
+ if (count($this->_heap) == 0) {
+ return null;
+ }
+
+ $top = $this->_heap[0];
+ $lastId = count($this->_heap) - 1;
+
+ /**
+ * Find appropriate position for last node:
+ * walk down from the top, pulling the smaller child up
+ * while that child is less than the last node
+ */
+ $nodeId = 0; // Start from a top
+ $childId = 1; // First child
+
+ // Choose smaller child
+ if ($lastId > 2 && $this->_less($this->_heap[2], $this->_heap[1])) {
+ $childId = 2;
+ }
+
+ while ($childId < $lastId &&
+ $this->_less($this->_heap[$childId], $this->_heap[$lastId])
+ ) {
+ // Move child node up
+ $this->_heap[$nodeId] = $this->_heap[$childId];
+
+ $nodeId = $childId; // Go down
+ $childId = ($nodeId << 1) + 1; // First child
+
+ // Choose smaller child
+ if (($childId+1) < $lastId &&
+ $this->_less($this->_heap[$childId+1], $this->_heap[$childId])
+ ) {
+ $childId++;
+ }
+ }
+
+ // Move last element to the new position
+ $this->_heap[$nodeId] = $this->_heap[$lastId];
+ unset($this->_heap[$lastId]);
+
+ return $top;
+ }
+
+
+ /**
+ * Clear queue
+ */
+ public function clear()
+ {
+ $this->_heap = array();
+ }
+
+
+ /**
+ * Compare elements
+ *
+ * Returns true if $el1 is less than $el2; false otherwise
+ *
+ * @param mixed $el1
+ * @param mixed $el2
+ * @return boolean
+ */
+ abstract protected function _less($el1, $el2);
+}
+
--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+
+/** Zend_Search_Lucene_Interface */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Interface.php';
+
+
+/**
+ * Proxy class intended to be used in userland.
+ *
+ * It tracks when the index object goes out of scope and forces index closing.
+ * Every instance method is forwarded to the wrapped index object.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+class Zend_Search_Lucene_Proxy implements Zend_Search_Lucene_Interface
+{
+ /**
+ * Index object
+ *
+ * @var Zend_Search_Lucene_Interface
+ */
+ private $_index;
+
+ /**
+ * Object constructor
+ *
+ * Stores the index and registers one reference on it.
+ *
+ * @param Zend_Search_Lucene_Interface $index
+ */
+ public function __construct(Zend_Search_Lucene_Interface $index)
+ {
+ $this->_index = $index;
+ $this->_index->addReference();
+ }
+
+ /**
+ * Object destructor
+ *
+ * Releases the reference registered by the constructor.
+ */
+ public function __destruct()
+ {
+ if ($this->_index !== null) {
+ // Release the reference taken in __construct(); the null check skips this when no index is held
+ $this->_index->removeReference();
+ }
+ $this->_index = null;
+ }
+
+ /**
+ * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
+ *
+ * @return Zend_Search_Lucene_Storage_Directory
+ */
+ public function getDirectory()
+ {
+ return $this->_index->getDirectory();
+ }
+
+ /**
+ * Returns the total number of documents in this index (including deleted documents).
+ *
+ * @return integer
+ */
+ public function count()
+ {
+ return $this->_index->count();
+ }
+
+ /**
+ * Returns one greater than the largest possible document number.
+ * This may be used to, e.g., determine how big to allocate a structure which will have
+ * an element for every document number in an index.
+ *
+ * @return integer
+ */
+ public function maxDoc()
+ {
+ return $this->_index->maxDoc();
+ }
+
+ /**
+ * Returns the total number of non-deleted documents in this index.
+ *
+ * @return integer
+ */
+ public function numDocs()
+ {
+ return $this->_index->numDocs();
+ }
+
+ /**
+ * Checks whether the document is deleted
+ *
+ * @param integer $id
+ * @return boolean
+ * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
+ */
+ public function isDeleted($id)
+ {
+ return $this->_index->isDeleted($id);
+ }
+
+ /**
+ * Set default search field.
+ *
+ * Null means that the search is performed through all fields by default
+ *
+ * Default value is null
+ *
+ * Forwards to the static Zend_Search_Lucene::setDefaultSearchField(),
+ * not to the wrapped index object.
+ *
+ * @param string|null $fieldName
+ */
+ public static function setDefaultSearchField($fieldName)
+ {
+ Zend_Search_Lucene::setDefaultSearchField($fieldName);
+ }
+
+ /**
+ * Get default search field.
+ *
+ * Null means that the search is performed through all fields by default
+ *
+ * Forwards to the static Zend_Search_Lucene::getDefaultSearchField(),
+ * not to the wrapped index object.
+ *
+ * @return string|null
+ */
+ public static function getDefaultSearchField()
+ {
+ return Zend_Search_Lucene::getDefaultSearchField();
+ }
+
+ /**
+ * Retrieve index maxBufferedDocs option
+ *
+ * maxBufferedDocs is the minimal number of documents required before
+ * the buffered in-memory documents are written into a new Segment
+ *
+ * Default value is 10
+ *
+ * @return integer
+ */
+ public function getMaxBufferedDocs()
+ {
+ return $this->_index->getMaxBufferedDocs();
+ }
+
+ /**
+ * Set index maxBufferedDocs option
+ *
+ * maxBufferedDocs is the minimal number of documents required before
+ * the buffered in-memory documents are written into a new Segment
+ *
+ * Default value is 10
+ *
+ * @param integer $maxBufferedDocs
+ */
+ public function setMaxBufferedDocs($maxBufferedDocs)
+ {
+ $this->_index->setMaxBufferedDocs($maxBufferedDocs);
+ }
+
+
+ /**
+ * Retrieve index maxMergeDocs option
+ *
+ * maxMergeDocs is the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ * Default value is PHP_INT_MAX
+ *
+ * @return integer
+ */
+ public function getMaxMergeDocs()
+ {
+ return $this->_index->getMaxMergeDocs();
+ }
+
+ /**
+ * Set index maxMergeDocs option
+ *
+ * maxMergeDocs is the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ * Default value is PHP_INT_MAX
+ *
+ * @param integer $maxMergeDocs
+ */
+ public function setMaxMergeDocs($maxMergeDocs)
+ {
+ $this->_index->setMaxMergeDocs($maxMergeDocs);
+ }
+
+
+ /**
+ * Retrieve index mergeFactor option
+ *
+ * mergeFactor determines how often segment indices are merged by addDocument().
+ * With smaller values, less RAM is used while indexing,
+ * and searches on unoptimized indices are faster,
+ * but indexing speed is slower.
+ * With larger values, more RAM is used during indexing,
+ * and while searches on unoptimized indices are slower,
+ * indexing is faster.
+ * Thus larger values (> 10) are best for batch index creation,
+ * and smaller values (< 10) for indices that are interactively maintained.
+ *
+ * Default value is 10
+ *
+ * @return integer
+ */
+ public function getMergeFactor()
+ {
+ return $this->_index->getMergeFactor();
+ }
+
+ /**
+ * Set index mergeFactor option
+ *
+ * mergeFactor determines how often segment indices are merged by addDocument().
+ * With smaller values, less RAM is used while indexing,
+ * and searches on unoptimized indices are faster,
+ * but indexing speed is slower.
+ * With larger values, more RAM is used during indexing,
+ * and while searches on unoptimized indices are slower,
+ * indexing is faster.
+ * Thus larger values (> 10) are best for batch index creation,
+ * and smaller values (< 10) for indices that are interactively maintained.
+ *
+ * Default value is 10
+ *
+ * @param integer $mergeFactor
+ */
+ public function setMergeFactor($mergeFactor)
+ {
+ $this->_index->setMergeFactor($mergeFactor);
+ }
+
+ /**
+ * Performs a query against the index and returns an array
+ * of Zend_Search_Lucene_Search_QueryHit objects.
+ * Input is a string or Zend_Search_Lucene_Search_Query.
+ *
+ * Any additional arguments passed by the caller are forwarded as well.
+ *
+ * @param mixed $query
+ * @return array Zend_Search_Lucene_Search_QueryHit
+ * @throws Zend_Search_Lucene_Exception
+ */
+ public function find($query)
+ {
+ // All arguments actually passed, including any beyond $query
+ $parameters = func_get_args();
+
+ // Invoke $this->_index->find() with exactly those arguments
+ return call_user_func_array(array(&$this->_index, 'find'), $parameters);
+ }
+
+ /**
+ * Returns a list of all unique field names that exist in this index.
+ *
+ * @param boolean $indexed
+ * @return array
+ */
+ public function getFieldNames($indexed = false)
+ {
+ return $this->_index->getFieldNames($indexed);
+ }
+
+ /**
+ * Returns a Zend_Search_Lucene_Document object for the document
+ * number $id in this index.
+ *
+ * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+ * @return Zend_Search_Lucene_Document
+ */
+ public function getDocument($id)
+ {
+ return $this->_index->getDocument($id);
+ }
+
+ /**
+ * Returns true if the index contains documents with the specified term.
+ *
+ * Is used for query optimization.
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return boolean
+ */
+ public function hasTerm(Zend_Search_Lucene_Index_Term $term)
+ {
+ return $this->_index->hasTerm($term);
+ }
+
+ /**
+ * Returns IDs of all the documents containing term.
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return array
+ */
+ public function termDocs(Zend_Search_Lucene_Index_Term $term)
+ {
+ return $this->_index->termDocs($term);
+ }
+
+ /**
+ * Returns an array of all term freqs.
+ * Return array structure: array( docId => freq, ...)
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return array
+ */
+ public function termFreqs(Zend_Search_Lucene_Index_Term $term)
+ {
+ return $this->_index->termFreqs($term);
+ }
+
+ /**
+ * Returns an array of all term positions in the documents.
+ * Return array structure: array( docId => array( pos1, pos2, ...), ...)
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return array
+ */
+ public function termPositions(Zend_Search_Lucene_Index_Term $term)
+ {
+ return $this->_index->termPositions($term);
+ }
+
+ /**
+ * Returns the number of documents in this index containing the $term.
+ *
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @return integer
+ */
+ public function docFreq(Zend_Search_Lucene_Index_Term $term)
+ {
+ return $this->_index->docFreq($term);
+ }
+
+ /**
+ * Retrieve similarity used by index reader
+ *
+ * @return Zend_Search_Lucene_Search_Similarity
+ */
+ public function getSimilarity()
+ {
+ return $this->_index->getSimilarity();
+ }
+
+ /**
+ * Returns a normalization factor for "field, document" pair.
+ *
+ * @param integer $id
+ * @param string $fieldName
+ * @return float
+ */
+ public function norm($id, $fieldName)
+ {
+ return $this->_index->norm($id, $fieldName);
+ }
+
+ /**
+ * Returns true if any documents have been deleted from this index.
+ *
+ * @return boolean
+ */
+ public function hasDeletions()
+ {
+ return $this->_index->hasDeletions();
+ }
+
+ /**
+ * Deletes a document from the index.
+ * $id is an internal document id
+ *
+ * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+ * @throws Zend_Search_Lucene_Exception
+ */
+ public function delete($id)
+ {
+ return $this->_index->delete($id);
+ }
+
+ /**
+ * Adds a document to this index.
+ *
+ * @param Zend_Search_Lucene_Document $document
+ */
+ public function addDocument(Zend_Search_Lucene_Document $document)
+ {
+ $this->_index->addDocument($document);
+ }
+
+ /**
+ * Commit changes resulting from delete() or undeleteAll() operations.
+ */
+ public function commit()
+ {
+ $this->_index->commit();
+ }
+
+ /**
+ * Optimize index.
+ *
+ * Merges all segments into one
+ */
+ public function optimize()
+ {
+ $this->_index->optimize();
+ }
+
+ /**
+ * Returns an array of all terms in this index.
+ *
+ * @return array
+ */
+ public function terms()
+ {
+ return $this->_index->terms();
+ }
+
+ /**
+ * Undeletes all documents currently marked as deleted in this index.
+ */
+ public function undeleteAll()
+ {
+ return $this->_index->undeleteAll();
+ }
+
+ /**
+ * Add reference to the index object
+ *
+ * @internal
+ */
+ public function addReference()
+ {
+ return $this->_index->addReference();
+ }
+
+ /**
+ * Remove reference from the index object
+ *
+ * When reference count becomes zero, index is closed and resources are cleaned up
+ *
+ * @internal
+ */
+ public function removeReference()
+ {
+ return $this->_index->removeReference();
+ }
+}
@todo
-- Improve API: fix ZSearchMultiTermQuery($terms, $signs);
-
-- Analysis and indexing engine
-
-- Additional queries: phrase, wildcard, proximity, and range
+- Additional queries: wildcard, proximity, and range
- Better class-level docblocks (most functions okay)
-- Some Windows issues(?) during indexing
-
-- Finish renaming classes to PEAR-like conventions