From cf0b12ac8395803edfaaf05eb510e8d49d917e38 Mon Sep 17 00:00:00 2001 From: diml Date: Mon, 9 Jul 2007 21:05:40 +0000 Subject: [PATCH] Lucene Zend Implementation update (better handle of UTF8) --- .../Search/BooleanExpressionRecognizer.php | 280 +++++++ search/Zend/Search/Lucene/Search/Query.php | 149 +++- .../Search/Lucene/Search/Query/Boolean.php | 715 ++++++++++++++++++ .../Zend/Search/Lucene/Search/Query/Empty.php | 139 ++++ .../Search/Lucene/Search/Query/MultiTerm.php | 468 ++++++++---- .../Search/Lucene/Search/Query/Phrase.php | 258 +++++-- .../Zend/Search/Lucene/Search/Query/Term.php | 180 ++++- .../Zend/Search/Lucene/Search/QueryEntry.php | 87 +++ .../Lucene/Search/QueryEntry/Phrase.php | 147 ++++ .../Lucene/Search/QueryEntry/Subquery.php | 86 +++ .../Search/Lucene/Search/QueryEntry/Term.php | 154 ++++ search/Zend/Search/Lucene/Search/QueryHit.php | 16 +- .../Zend/Search/Lucene/Search/QueryLexer.php | 508 +++++++++++++ .../Zend/Search/Lucene/Search/QueryParser.php | 531 +++++++++++-- .../Lucene/Search/QueryParserContext.php | 416 ++++++++++ .../Lucene/Search/QueryParserException.php | 40 + .../Zend/Search/Lucene/Search/QueryToken.php | 194 ++++- .../Search/Lucene/Search/QueryTokenizer.php | 4 +- .../Zend/Search/Lucene/Search/Similarity.php | 22 +- .../Lucene/Search/Similarity/Default.php | 8 +- search/Zend/Search/Lucene/Search/Weight.php | 29 +- .../Search/Lucene/Search/Weight/Boolean.php | 136 ++++ .../Search/Lucene/Search/Weight/Empty.php | 56 ++ .../Search/Lucene/Search/Weight/MultiTerm.php | 26 +- .../Search/Lucene/Search/Weight/Phrase.php | 48 +- .../Zend/Search/Lucene/Search/Weight/Term.php | 42 +- .../Zend/Search/Lucene/Storage/Directory.php | 12 +- .../Lucene/Storage/Directory/Filesystem.php | 65 +- search/Zend/Search/Lucene/Storage/File.php | 31 +- .../Search/Lucene/Storage/File/Filesystem.php | 57 +- .../Search/Lucene/Storage/File/Memory.php | 555 ++++++++++++++ 31 files changed, 4927 insertions(+), 532 deletions(-) create mode 100644 
search/Zend/Search/Lucene/Search/BooleanExpressionRecognizer.php create mode 100644 search/Zend/Search/Lucene/Search/Query/Boolean.php create mode 100644 search/Zend/Search/Lucene/Search/Query/Empty.php create mode 100644 search/Zend/Search/Lucene/Search/QueryEntry.php create mode 100644 search/Zend/Search/Lucene/Search/QueryEntry/Phrase.php create mode 100644 search/Zend/Search/Lucene/Search/QueryEntry/Subquery.php create mode 100644 search/Zend/Search/Lucene/Search/QueryEntry/Term.php create mode 100644 search/Zend/Search/Lucene/Search/QueryLexer.php create mode 100644 search/Zend/Search/Lucene/Search/QueryParserContext.php create mode 100644 search/Zend/Search/Lucene/Search/QueryParserException.php create mode 100644 search/Zend/Search/Lucene/Search/Weight/Boolean.php create mode 100644 search/Zend/Search/Lucene/Search/Weight/Empty.php create mode 100644 search/Zend/Search/Lucene/Storage/File/Memory.php diff --git a/search/Zend/Search/Lucene/Search/BooleanExpressionRecognizer.php b/search/Zend/Search/Lucene/Search/BooleanExpressionRecognizer.php new file mode 100644 index 0000000000..1801bc1cf3 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/BooleanExpressionRecognizer.php @@ -0,0 +1,280 @@ +dirroot.'/search/Zend/Search/Lucene/FSM.php'; + +/** Zend_Search_Lucene_Search_QueryToken */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryToken.php'; + +/** Zend_Search_Lucene_Search_QueryParser */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParser.php'; + + +/** Zend_Search_Lucene_Exception */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Search_BooleanExpressionRecognizer extends Zend_Search_Lucene_FSM +{ + /** State Machine states */ + const ST_START = 0; + const ST_LITERAL = 1; + const ST_NOT_OPERATOR = 2; + const ST_AND_OPERATOR = 3; + const ST_OR_OPERATOR = 4; + + /** Input symbols */ + const IN_LITERAL = 0; + const IN_NOT_OPERATOR = 1; + const IN_AND_OPERATOR = 2; + const IN_OR_OPERATOR = 3; + + + /** + * NOT operator signal + * + * @var boolean + */ + private $_negativeLiteral = false; + + /** + * Current literal + * + * @var mixed + */ + private $_literal; + + + /** + * Set of boolean query conjunctions + * + * Each conjunction is an array of conjunction elements + * Each conjunction element is represented by a two-element array: + * array(<literal>, <sign>), where <sign> is false when the literal is negated by NOT + * + * So, it has a structure: + * array( array( array(<literal>, <sign>), // first literal of first conjunction + * array(<literal>, <sign>), // second literal of first conjunction + * ... + * array(<literal>, <sign>) + * ), // end of first conjunction + * array( array(<literal>, <sign>), // first literal of second conjunction + * array(<literal>, <sign>), // second literal of second conjunction + * ... + * array(<literal>, <sign>) + * ), // end of second conjunction + * ... 
+ * ) // end of structure + * + * @var array + */ + private $_conjunctions = array(); + + /** + * Current conjuction + * + * @var array + */ + private $_currentConjunction = array(); + + + /** + * Object constructor + */ + public function __construct() + { + parent::__construct( array(self::ST_START, + self::ST_LITERAL, + self::ST_NOT_OPERATOR, + self::ST_AND_OPERATOR, + self::ST_OR_OPERATOR), + array(self::IN_LITERAL, + self::IN_NOT_OPERATOR, + self::IN_AND_OPERATOR, + self::IN_OR_OPERATOR)); + + $emptyOperatorAction = new Zend_Search_Lucene_FSMAction($this, 'emptyOperatorAction'); + $emptyNotOperatorAction = new Zend_Search_Lucene_FSMAction($this, 'emptyNotOperatorAction'); + + $this->addRules(array( array(self::ST_START, self::IN_LITERAL, self::ST_LITERAL), + array(self::ST_START, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR), + + array(self::ST_LITERAL, self::IN_AND_OPERATOR, self::ST_AND_OPERATOR), + array(self::ST_LITERAL, self::IN_OR_OPERATOR, self::ST_OR_OPERATOR), + array(self::ST_LITERAL, self::IN_LITERAL, self::ST_LITERAL, $emptyOperatorAction), + array(self::ST_LITERAL, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR, $emptyNotOperatorAction), + + array(self::ST_NOT_OPERATOR, self::IN_LITERAL, self::ST_LITERAL), + + array(self::ST_AND_OPERATOR, self::IN_LITERAL, self::ST_LITERAL), + array(self::ST_AND_OPERATOR, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR), + + array(self::ST_OR_OPERATOR, self::IN_LITERAL, self::ST_LITERAL), + array(self::ST_OR_OPERATOR, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR), + )); + + $notOperatorAction = new Zend_Search_Lucene_FSMAction($this, 'notOperatorAction'); + $orOperatorAction = new Zend_Search_Lucene_FSMAction($this, 'orOperatorAction'); + $literalAction = new Zend_Search_Lucene_FSMAction($this, 'literalAction'); + + + $this->addEntryAction(self::ST_NOT_OPERATOR, $notOperatorAction); + $this->addEntryAction(self::ST_OR_OPERATOR, $orOperatorAction); + $this->addEntryAction(self::ST_LITERAL, $literalAction); + } + + + /** 
+ * Process next operator. + * + * Operators are defined by class constants: IN_AND_OPERATOR, IN_OR_OPERATOR and IN_NOT_OPERATOR + * + * @param integer $operator + */ + public function processOperator($operator) + { + $this->process($operator); + } + + /** + * Process expression literal. + * + * @param mixed $literal + */ + public function processLiteral($literal) + { + $this->_literal = $literal; + + $this->process(self::IN_LITERAL); + } + + /** + * Finish an expression and return result + * + * Result is a set of boolean query conjunctions + * + * Each conjunction is an array of conjunction elements + * Each conjunction element is represented by a two-element array: + * array(<literal>, <sign>), where <sign> is false when the literal is negated by NOT + * + * So, it has a structure: + * array( array( array(<literal>, <sign>), // first literal of first conjunction + * array(<literal>, <sign>), // second literal of first conjunction + * ... + * array(<literal>, <sign>) + * ), // end of first conjunction + * array( array(<literal>, <sign>), // first literal of second conjunction + * array(<literal>, <sign>), // second literal of second conjunction + * ... + * array(<literal>, <sign>) + * ), // end of second conjunction + * ... 
+ * ) // end of structure + * + * @return array + * @throws Zend_Search_Lucene_Exception + */ + public function finishExpression() + { + if ($this->getState() != self::ST_LITERAL) { + throw new Zend_Search_Lucene_Exception('Literal expected.'); + } + + $this->_conjunctions[] = $this->_currentConjunction; + + return $this->_conjunctions; + } + + + + /********************************************************************* + * Actions implementation + *********************************************************************/ + + /** + * default (omitted) operator processing + */ + public function emptyOperatorAction() + { + if (Zend_Search_Lucene_Search_QueryParser::getDefaultOperator() == Zend_Search_Lucene_Search_QueryParser::B_AND) { + // Do nothing + } else { + $this->orOperatorAction(); + } + + // Process literal + $this->literalAction(); + } + + /** + * default (omitted) + NOT operator processing + */ + public function emptyNotOperatorAction() + { + if (Zend_Search_Lucene_Search_QueryParser::getDefaultOperator() == Zend_Search_Lucene_Search_QueryParser::B_AND) { + // Do nothing + } else { + $this->orOperatorAction(); + } + + // Process NOT operator + $this->notOperatorAction(); + } + + + /** + * NOT operator processing + */ + public function notOperatorAction() + { + $this->_negativeLiteral = true; + } + + /** + * OR operator processing + * Close current conjunction + */ + public function orOperatorAction() + { + $this->_conjunctions[] = $this->_currentConjunction; + $this->_currentConjunction = array(); + } + + /** + * Literal processing + */ + public function literalAction() + { + // Add literal to the current conjunction + $this->_currentConjunction[] = array($this->_literal, !$this->_negativeLiteral); + + // Switch off negative signal + $this->_negativeLiteral = false; + } +} diff --git a/search/Zend/Search/Lucene/Search/Query.php b/search/Zend/Search/Lucene/Search/Query.php index bf284970a1..e55c22ce01 100644 --- a/search/Zend/Search/Lucene/Search/Query.php +++ 
b/search/Zend/Search/Lucene/Search/Query.php @@ -15,16 +15,19 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ +/** Zend_Search_Lucene_Document_Html */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document/Html.php'; + /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ abstract class Zend_Search_Lucene_Search_Query @@ -35,14 +38,31 @@ abstract class Zend_Search_Lucene_Search_Query * * @var float */ - private $_boost = 1.0; + private $_boost = 1; /** * Query weight * * @var Zend_Search_Lucene_Search_Weight */ - protected $_weight; + protected $_weight = null; + + /** + * Current highlight color + * + * @var integer + */ + private $_currentColorIndex = 0; + + /** + * List of colors for text highlighting + * + * @var array + */ + private $_highlightColors = array('#66ffff', '#ff66ff', '#ffff66', + '#ff8888', '#88ff88', '#8888ff', + '#88dddd', '#dd88dd', '#dddd88', + '#aaddff', '#aaffdd', '#ddaaff', '#ddffaa', '#ffaadd', '#ffddaa'); /** @@ -71,30 +91,133 @@ abstract class Zend_Search_Lucene_Search_Query * Score specified document * * @param integer $docId - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return float */ - abstract public function score($docId, $reader); + abstract public function score($docId, Zend_Search_Lucene_Interface $reader); + + /** + * Get document ids likely matching the query + * + * It's an array with document ids as keys (performance considerations) + * + * @return array + 
*/ + abstract public function matchedDocs(); + + /** + * Execute query in context of index reader + * It also initializes necessary internal structures + * + * Query specific implementation + * + * @param Zend_Search_Lucene_Interface $reader + */ + abstract public function execute(Zend_Search_Lucene_Interface $reader); /** * Constructs an appropriate Weight implementation for this query. * - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return Zend_Search_Lucene_Search_Weight */ - abstract protected function _createWeight($reader); + abstract public function createWeight(Zend_Search_Lucene_Interface $reader); /** - * Constructs an initializes a Weight for a query. + * Constructs an initializes a Weight for a _top-level_query_. * - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader */ - protected function _initWeight($reader) + protected function _initWeight(Zend_Search_Lucene_Interface $reader) { - $this->_weight = $this->_createWeight($reader); + // Check, that it's a top-level query and query weight is not initialized yet. 
+ if ($this->_weight !== null) { + return $this->_weight; + } + + $this->createWeight($reader); $sum = $this->_weight->sumOfSquaredWeights(); $queryNorm = $reader->getSimilarity()->queryNorm($sum); $this->_weight->normalize($queryNorm); } -} \ No newline at end of file + /** + * Re-write query into primitive queries in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + abstract public function rewrite(Zend_Search_Lucene_Interface $index); + + /** + * Optimize query in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + abstract public function optimize(Zend_Search_Lucene_Interface $index); + + /** + * Reset query, so it can be reused within other queries or + * with other indeces + */ + public function reset() + { + $this->_weight = null; + } + + + /** + * Print a query + * + * @return string + */ + abstract public function __toString(); + + /** + * Return query terms + * + * @return array + */ + abstract public function getQueryTerms(); + + /** + * Get highlight color and shift to next + * + * @param integer &$colorIndex + * @return string + */ + protected function _getHighlightColor(&$colorIndex) + { + $color = $this->_highlightColors[$colorIndex++]; + + $colorIndex %= count($this->_highlightColors); + + return $color; + } + + /** + * Highlight query terms + * + * @param integer &$colorIndex + * @param Zend_Search_Lucene_Document_Html $doc + */ + abstract public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex); + + /** + * Highlight matches in $inputHTML + * + * @param string $inputHTML + * @return string + */ + public function highlightMatches($inputHTML) + { + $doc = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML); + + $colorIndex = 0; + $this->highlightMatchesDOM($doc, $colorIndex); + + return $doc->getHTML(); + } +} + diff --git 
a/search/Zend/Search/Lucene/Search/Query/Boolean.php b/search/Zend/Search/Lucene/Search/Query/Boolean.php new file mode 100644 index 0000000000..710a41f381 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/Query/Boolean.php @@ -0,0 +1,715 @@ +dirroot.'/search/Zend/Search/Lucene/Search/Query.php'; + +/** Zend_Search_Lucene_Search_Weight_Boolean */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Weight/Boolean.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Search_Query_Boolean extends Zend_Search_Lucene_Search_Query +{ + + /** + * Subqueries + * Array of Zend_Search_Lucene_Query + * + * @var array + */ + private $_subqueries = array(); + + /** + * Subqueries signs. + * If true then subquery is required. + * If false then subquery is prohibited. + * If null then subquery is neither prohibited, nor required + * + * If array is null then all subqueries are required + * + * @var array + */ + private $_signs = array(); + + /** + * Result vector. + * + * @var array + */ + private $_resVector = null; + + /** + * A score factor based on the fraction of all query subqueries + * that a document contains. + * float for conjunction queries + * array of float for non conjunction queries + * + * @var mixed + */ + private $_coord = null; + + + /** + * Class constructor. Create a new Boolean query object. + * + * if $signs array is omitted then all subqueries are required + * it differs from addSubquery() behavior, but should never be used + * + * @param array $subqueries Array of Zend_Search_Search_Query objects + * @param array $signs Array of signs. Sign is boolean|null. 
+ * @return void + */ + public function __construct($subqueries = null, $signs = null) + { + if (is_array($subqueries)) { + $this->_subqueries = $subqueries; + + $this->_signs = null; + // Check if all subqueries are required + if (is_array($signs)) { + foreach ($signs as $sign ) { + if ($sign !== true) { + $this->_signs = $signs; + break; + } + } + } + } + } + + + /** + * Add a $subquery (Zend_Search_Lucene_Query) to this query. + * + * The sign is specified as: + * TRUE - subquery is required + * FALSE - subquery is prohibited + * NULL - subquery is neither prohibited, nor required + * + * @param Zend_Search_Lucene_Search_Query $subquery + * @param boolean|null $sign + * @return void + */ + public function addSubquery(Zend_Search_Lucene_Search_Query $subquery, $sign=null) { + if ($sign !== true || $this->_signs !== null) { // Skip, if all subqueries are required + if ($this->_signs === null) { // Check, If all previous subqueries are required + foreach ($this->_subqueries as $prevSubquery) { + $this->_signs[] = true; + } + } + $this->_signs[] = $sign; + } + + $this->_subqueries[] = $subquery; + } + + /** + * Re-write queries into primitive queries + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + public function rewrite(Zend_Search_Lucene_Interface $index) + { + $query = new Zend_Search_Lucene_Search_Query_Boolean(); + $query->setBoost($this->getBoost()); + + foreach ($this->_subqueries as $subqueryId => $subquery) { + $query->addSubquery($subquery->rewrite($index), + ($this->_signs === null)? 
true : $this->_signs[$subqueryId]); + } + + return $query; + } + + /** + * Optimize query in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + public function optimize(Zend_Search_Lucene_Interface $index) + { + $subqueries = array(); + $signs = array(); + + // Optimize all subqueries + foreach ($this->_subqueries as $id => $subquery) { + $subqueries[] = $subquery->optimize($index); + $signs[] = ($this->_signs === null)? true : $this->_signs[$id]; + } + + // Check for empty subqueries + foreach ($subqueries as $id => $subquery) { + if ($subquery instanceof Zend_Search_Lucene_Search_Query_Empty) { + if ($signs[$id] === true) { + // Matching is required, but is actually empty + return new Zend_Search_Lucene_Search_Query_Empty(); + } else { + // Matching is optional or prohibited, but is empty + // Remove it from subqueries and signs list + unset($subqueries[$id]); + unset($signs[$id]); + } + } + } + + + // Check if all non-empty subqueries are prohibited + $allProhibited = true; + foreach ($signs as $sign) { + if ($sign !== false) { + $allProhibited = false; + break; + } + } + if ($allProhibited) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + + + // Check, if reduced subqueries list has only one entry + if (count($subqueries) == 1) { + // It's a query with only one required or optional clause + // (it's already checked, that it's not a prohibited clause) + + if ($this->getBoost() == 1) { + return reset($subqueries); + } + + $optimizedQuery = clone reset($subqueries); + $optimizedQuery->setBoost($optimizedQuery->getBoost()*$this->getBoost()); + + return $optimizedQuery; + } + + + // Check, if reduced subqueries list is empty + if (count($subqueries) == 0) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + + + // Prepare first candidate for optimized query + $optimizedQuery = new Zend_Search_Lucene_Search_Query_Boolean($subqueries, $signs); + 
$optimizedQuery->setBoost($this->getBoost()); + + + $terms = array(); + $tsigns = array(); + $boostFactors = array(); + + // Try to decompose term and multi-term subqueries + foreach ($subqueries as $id => $subquery) { + if ($subquery instanceof Zend_Search_Lucene_Search_Query_Term) { + $terms[] = $subquery->getTerm(); + $tsigns[] = $signs[$id]; + $boostFactors[] = $subquery->getBoost(); + + // remove subquery from a subqueries list + unset($subqueries[$id]); + unset($signs[$id]); + } else if ($subquery instanceof Zend_Search_Lucene_Search_Query_MultiTerm) { + $subTerms = $subquery->getTerms(); + $subSigns = $subquery->getSigns(); + + if ($signs[$id] === true) { + // It's a required multi-term subquery. + // Something like '... +(+term1 -term2 term3 ...) ...' + + // Multi-term required subquery can be decomposed only if it contains + // required terms and doesn't contain prohibited terms: + // ... +(+term1 term2 ...) ... => ... +term1 term2 ... + // + // Check this + $hasRequired = false; + $hasProhibited = false; + if ($subSigns === null) { + // All subterms are required + $hasRequired = true; + } else { + foreach ($subSigns as $sign) { + if ($sign === true) { + $hasRequired = true; + } else if ($sign === false) { + $hasProhibited = true; + break; + } + } + } + // Continue if subquery has prohibited terms or doesn't have required terms + if ($hasProhibited || !$hasRequired) { + continue; + } + + foreach ($subTerms as $termId => $term) { + $terms[] = $term; + $tsigns[] = ($subSigns === null)? true : $subSigns[$termId]; + $boostFactors[] = $subquery->getBoost(); + } + + // remove subquery from a subqueries list + unset($subqueries[$id]); + unset($signs[$id]); + + } else { // $signs[$id] === null || $signs[$id] === false + // It's an optional or prohibited multi-term subquery. + // Something like '... (+term1 -term2 term3 ...) ...' + // or + // something like '... -(+term1 -term2 term3 ...) ...' 
+ + // Multi-term optional and required subqueries can be decomposed + // only if all terms are optional. + // + // Check if all terms are optional. + $onlyOptional = true; + if ($subSigns === null) { + // All subterms are required + $onlyOptional = false; + } else { + foreach ($subSigns as $sign) { + if ($sign !== null) { + $onlyOptional = false; + break; + } + } + } + + // Continue if non-optional terms are presented in this multi-term subquery + if (!$onlyOptional) { + continue; + } + + foreach ($subTerms as $termId => $term) { + $terms[] = $term; + $tsigns[] = ($signs[$id] === null)? null /* optional */ : + false /* prohibited */; + $boostFactors[] = $subquery->getBoost(); + } + + // remove subquery from a subqueries list + unset($subqueries[$id]); + unset($signs[$id]); + } + } + } + + + // Check, if there are no decomposed subqueries + if (count($terms) == 0 ) { + // return prepared candidate + return $optimizedQuery; + } + + + // Check, if all subqueries have been decomposed and all terms has the same boost factor + if (count($subqueries) == 0 && count(array_unique($boostFactors)) == 1) { + $optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $tsigns); + $optimizedQuery->setBoost(reset($boostFactors)*$this->getBoost()); + + return $optimizedQuery; + } + + + // This boolean query can't be transformed to Term/MultiTerm query and still contains + // several subqueries + + // Separate prohibited terms + $prohibitedTerms = array(); + foreach ($terms as $id => $term) { + if ($tsigns[$id] === false) { + $prohibitedTerms[] = $term; + + unset($terms[$id]); + unset($tsigns[$id]); + unset($boostFactors[$id]); + } + } + + if (count($terms) == 1) { + $clause = new Zend_Search_Lucene_Search_Query_Term(reset($terms)); + $clause->setBoost(reset($boostFactors)); + + $subqueries[] = $clause; + $signs[] = reset($tsigns); + + // Clear terms list + $terms = array(); + } else if (count($terms) > 1 && count(array_unique($boostFactors)) == 1) { + $clause = new 
Zend_Search_Lucene_Search_Query_MultiTerm($terms, $tsigns); + $clause->setBoost(reset($boostFactors)); + + $subqueries[] = $clause; + // Clause sign is 'required' if clause contains required terms. 'Optional' otherwise. + $signs[] = (in_array(true, $tsigns))? true : null; + + // Clear terms list + $terms = array(); + } + + if (count($prohibitedTerms) == 1) { + // (boost factors are not significant for prohibited clauses) + $subqueries[] = new Zend_Search_Lucene_Search_Query_Term(reset($prohibitedTerms)); + $signs[] = false; + + // Clear prohibited terms list + $prohibitedTerms = array(); + } else if (count($prohibitedTerms) > 1) { + // prepare signs array + $prohibitedSigns = array(); + foreach ($prohibitedTerms as $id => $term) { + // all prohibited term are grouped as optional into multi-term query + $prohibitedSigns[$id] = null; + } + + // (boost factors are not significant for prohibited clauses) + $subqueries[] = new Zend_Search_Lucene_Search_Query_MultiTerm($prohibitedTerms, $prohibitedSigns); + // Clause sign is 'prohibited' + $signs[] = false; + + // Clear terms list + $prohibitedTerms = array(); + } + + /** @todo Group terms with the same boost factors together */ + + // Check, that all terms are processed + // Replace candidate for optimized query + if (count($terms) == 0 && count($prohibitedTerms) == 0) { + $optimizedQuery = new Zend_Search_Lucene_Search_Query_Boolean($subqueries, $signs); + $optimizedQuery->setBoost($this->getBoost()); + } + + return $optimizedQuery; + } + + /** + * Returns subqueries + * + * @return array + */ + public function getSubqueries() + { + return $this->_subqueries; + } + + + /** + * Return subqueries signs + * + * @return array + */ + public function getSigns() + { + return $this->_signs; + } + + + /** + * Constructs an appropriate Weight implementation for this query. 
+ * + * @param Zend_Search_Lucene_Interface $reader + * @return Zend_Search_Lucene_Search_Weight + */ + public function createWeight(Zend_Search_Lucene_Interface $reader) + { + $this->_weight = new Zend_Search_Lucene_Search_Weight_Boolean($this, $reader); + return $this->_weight; + } + + + /** + * Calculate result vector for Conjunction query + * (like ' AND AND ') + */ + private function _calculateConjunctionResult() + { + $this->_resVector = null; + + if (count($this->_subqueries) == 0) { + $this->_resVector = array(); + } + + foreach ($this->_subqueries as $subquery) { + if($this->_resVector === null) { + $this->_resVector = $subquery->matchedDocs(); + } else { + $this->_resVector = array_intersect_key($this->_resVector, $subquery->matchedDocs()); + } + + if (count($this->_resVector) == 0) { + // Empty result set, we don't need to check other terms + break; + } + } + + ksort($this->_resVector, SORT_NUMERIC); + } + + + /** + * Calculate result vector for non Conjunction query + * (like ' AND AND NOT OR ') + */ + private function _calculateNonConjunctionResult() + { + $required = null; + $optional = array(); + + foreach ($this->_subqueries as $subqueryId => $subquery) { + $docs = $subquery->matchedDocs(); + + if ($this->_signs[$subqueryId] === true) { + // required + if ($required !== null) { + // array intersection + $required = array_intersect_key($required, $docs); + } else { + $required = $docs; + } + } elseif ($this->_signs[$subqueryId] === false) { + // prohibited + // Do nothing. 
matchedDocs() may include non-matching id's + } else { + // neither required, nor prohibited + // array union + $optional += $docs; + } + } + + if ($required !== null) { + $this->_resVector = &$required; + } else { + $this->_resVector = &$optional; + } + + ksort($this->_resVector, SORT_NUMERIC); + } + + + /** + * Score calculator for conjunction queries (all subqueries are required) + * + * @param integer $docId + * @param Zend_Search_Lucene_Interface $reader + * @return float + */ + public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader) + { + if ($this->_coord === null) { + $this->_coord = $reader->getSimilarity()->coord(count($this->_subqueries), + count($this->_subqueries) ); + } + + $score = 0; + + foreach ($this->_subqueries as $subquery) { + $subscore = $subquery->score($docId, $reader); + + if ($subscore == 0) { + return 0; + } + + $score += $subquery->score($docId, $reader) * $this->_coord; + } + + return $score * $this->_coord * $this->getBoost(); + } + + + /** + * Score calculator for non conjunction queries (not all subqueries are required) + * + * @param integer $docId + * @param Zend_Search_Lucene_Interface $reader + * @return float + */ + public function _nonConjunctionScore($docId, Zend_Search_Lucene_Interface $reader) + { + if ($this->_coord === null) { + $this->_coord = array(); + + $maxCoord = 0; + foreach ($this->_signs as $sign) { + if ($sign !== false /* not prohibited */) { + $maxCoord++; + } + } + + for ($count = 0; $count <= $maxCoord; $count++) { + $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord); + } + } + + $score = 0; + $matchedSubqueries = 0; + foreach ($this->_subqueries as $subqueryId => $subquery) { + $subscore = $subquery->score($docId, $reader); + + // Prohibited + if ($this->_signs[$subqueryId] === false && $subscore != 0) { + return 0; + } + + // is required, but doen't match + if ($this->_signs[$subqueryId] === true && $subscore == 0) { + return 0; + } + + if ($subscore != 0) { 
+ $matchedSubqueries++; + $score += $subscore; + } + } + + return $score * $this->_coord[$matchedSubqueries] * $this->getBoost(); + } + + /** + * Execute query in context of index reader + * It also initializes necessary internal structures + * + * @param Zend_Search_Lucene_Interface $reader + */ + public function execute(Zend_Search_Lucene_Interface $reader) + { + // Initialize weight if it's not done yet + $this->_initWeight($reader); + + foreach ($this->_subqueries as $subquery) { + $subquery->execute($reader); + } + + if ($this->_signs === null) { + $this->_calculateConjunctionResult(); + } else { + $this->_calculateNonConjunctionResult(); + } + } + + + + /** + * Get document ids likely matching the query + * + * It's an array with document ids as keys (performance considerations) + * + * @return array + */ + public function matchedDocs() + { + return $this->_resVector; + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene_Interface $reader + * @return float + */ + public function score($docId, Zend_Search_Lucene_Interface $reader) + { + if (isset($this->_resVector[$docId])) { + if ($this->_signs === null) { + return $this->_conjunctionScore($docId, $reader); + } else { + return $this->_nonConjunctionScore($docId, $reader); + } + } else { + return 0; + } + } + + /** + * Return query terms + * + * @return array + */ + public function getQueryTerms() + { + $terms = array(); + + foreach ($this->_subqueries as $id => $subquery) { + if ($this->_signs === null || $this->_signs[$id] !== false) { + $terms = array_merge($terms, $subquery->getQueryTerms()); + } + } + + return $terms; + } + + /** + * Highlight query terms + * + * @param integer &$colorIndex + * @param Zend_Search_Lucene_Document_Html $doc + */ + public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex) + { + foreach ($this->_subqueries as $id => $subquery) { + if ($this->_signs === null || $this->_signs[$id] !== false) { + 
$subquery->highlightMatchesDOM($doc, $colorIndex); + } + } + } + + /** + * Print a query + * + * @return string + */ + public function __toString() + { + // It's used only for query visualisation, so we don't care about characters escaping + + $query = ''; + + foreach ($this->_subqueries as $id => $subquery) { + if ($id != 0) { + $query .= ' '; + } + + if ($this->_signs === null || $this->_signs[$id] === true) { + $query .= '+'; + } else if ($this->_signs[$id] === false) { + $query .= '-'; + } + + $query .= '(' . $subquery->__toString() . ')'; + + if ($subquery->getBoost() != 1) { + $query .= '^' . $subquery->getBoost(); + } + } + + return $query; + } +} + diff --git a/search/Zend/Search/Lucene/Search/Query/Empty.php b/search/Zend/Search/Lucene/Search/Query/Empty.php new file mode 100644 index 0000000000..738e33232f --- /dev/null +++ b/search/Zend/Search/Lucene/Search/Query/Empty.php @@ -0,0 +1,139 @@ +dirroot.'/search/Zend/Search/Lucene/Search/Query.php'; + +/** Zend_Search_Lucene_Search_Weight_Empty */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Weight/Empty.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Search_Query_Empty extends Zend_Search_Lucene_Search_Query +{ + /** + * Re-write query into primitive queries in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + public function rewrite(Zend_Search_Lucene_Interface $index) + { + return $this; + } + + /** + * Optimize query in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + public function optimize(Zend_Search_Lucene_Interface $index) + { + // "Empty" query is a primitive query and don't need to be optimized + return $this; + } + + /** + * Constructs an appropriate Weight implementation for this query. + * + * @param Zend_Search_Lucene_Interface $reader + * @return Zend_Search_Lucene_Search_Weight + */ + public function createWeight(Zend_Search_Lucene_Interface $reader) + { + return new Zend_Search_Lucene_Search_Weight_Empty(); + } + + /** + * Execute query in context of index reader + * It also initializes necessary internal structures + * + * @param Zend_Search_Lucene_Interface $reader + */ + public function execute(Zend_Search_Lucene_Interface $reader) + { + // Do nothing + } + + /** + * Get document ids likely matching the query + * + * It's an array with document ids as keys (performance considerations) + * + * @return array + */ + public function matchedDocs() + { + return array(); + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene_Interface $reader + * @return float + */ + public function score($docId, Zend_Search_Lucene_Interface $reader) + { + return 0; + } + + /** + * Return query terms + * + * @return array + */ + public function getQueryTerms() + { + return array(); + } + + /** + * Highlight query terms + * + * @param integer &$colorIndex + * @param 
Zend_Search_Lucene_Document_Html $doc + */ + public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex) + { + // Do nothing + } + + /** + * Print a query + * + * @return string + */ + public function __toString() + { + return ''; + } +} + diff --git a/search/Zend/Search/Lucene/Search/Query/MultiTerm.php b/search/Zend/Search/Lucene/Search/Query/MultiTerm.php index d3ec761bc6..9258279a00 100644 --- a/search/Zend/Search/Lucene/Search/Query/MultiTerm.php +++ b/search/Zend/Search/Lucene/Search/Query/MultiTerm.php @@ -15,23 +15,23 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Search_Query */ -require_once 'Zend/Search/Lucene/Search/Query.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query.php'; /** Zend_Search_Lucene_Search_Weight_MultiTerm */ -require_once 'Zend/Search/Lucene/Search/Weight/MultiTerm.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Weight/MultiTerm.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Search_Query @@ -55,27 +55,24 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc * * @var array */ - - private $_signs = array(); + private $_signs; /** * Result vector. - * Bitset or array of document IDs - * (depending from Bitset extension availability). 
* - * @var mixed + * @var array */ private $_resVector = null; /** * Terms positions vectors. * Array of Arrays: - * term1Id => (docId => array( pos1, pos2, ... ), ...) - * term2Id => (docId => array( pos1, pos2, ... ), ...) + * term1Id => (docId => freq, ...) + * term2Id => (docId => freq, ...) * * @var array */ - private $_termsPositions = array(); + private $_termsFreqs = array(); /** @@ -101,15 +98,15 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc /** * Class constructor. Create a new multi-term query object. * + * if $signs array is omitted then all terms are required + * it differs from addTerm() behavior, but should never be used + * * @param array $terms Array of Zend_Search_Lucene_Index_Term objects * @param array $signs Array of signs. Sign is boolean|null. * @return void */ public function __construct($terms = null, $signs = null) { - /** - * @todo Check contents of $terms and $signs before adding them. - */ if (is_array($terms)) { $this->_terms = $terms; @@ -119,7 +116,7 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc foreach ($signs as $sign ) { if ($sign !== true) { $this->_signs = $signs; - continue; + break; } } } @@ -139,25 +136,122 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc * @param boolean|null $sign * @return void */ - public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign=null) { + public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign = null) { + if ($sign !== true || $this->_signs !== null) { // Skip, if all terms are required + if ($this->_signs === null) { // Check, If all previous terms are required + foreach ($this->_terms as $prevTerm) { + $this->_signs[] = true; + } + } + $this->_signs[] = $sign; + } + $this->_terms[] = $term; + } - /** - * @todo This is not good. Sometimes $this->_signs is an array, sometimes - * it is null, even when there are terms. 
It will be changed so that - * it is always an array. - */ - if ($this->_signs === null) { - if ($sign !== null) { - $this->_signs = array(); - foreach ($this->_terms as $term) { - $this->_signs[] = null; + + /** + * Re-write query into primitive queries in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + public function rewrite(Zend_Search_Lucene_Interface $index) + { + if (count($this->_terms) == 0) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + + // Check, that all fields are qualified + $allQualified = true; + foreach ($this->_terms as $term) { + if ($term->field === null) { + $allQualified = false; + break; + } + } + + if ($allQualified) { + return $this; + } else { + /** transform multiterm query to boolean and apply rewrite() method to subqueries. */ + $query = new Zend_Search_Lucene_Search_Query_Boolean(); + $query->setBoost($this->getBoost()); + + foreach ($this->_terms as $termId => $term) { + $subquery = new Zend_Search_Lucene_Search_Query_Term($term); + + $query->addSubquery($subquery->rewrite($index), + ($this->_signs === null)? 
true : $this->_signs[$termId]); + } + + return $query; + } + } + + /** + * Optimize query in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + public function optimize(Zend_Search_Lucene_Interface $index) + { + $terms = $this->_terms; + $signs = $this->_signs; + + foreach ($terms as $id => $term) { + if (!$index->hasTerm($term)) { + if ($signs === null || $signs[$id] === true) { + // Term is required + return new Zend_Search_Lucene_Search_Query_Empty(); + } else { + // Term is optional or prohibited + // Remove it from terms and signs list + unset($terms[$id]); + unset($signs[$id]); } - $this->_signs[] = $sign; } + } + + // Check if all presented terms are prohibited + $allProhibited = true; + if ($signs === null) { + $allProhibited = false; } else { - $this->_signs[] = $sign; + foreach ($signs as $sign) { + if ($sign !== false) { + $allProhibited = false; + break; + } + } + } + if ($allProhibited) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + + /** + * @todo make an optimization for repeated terms + * (they may have different signs) + */ + + if (count($terms) == 1) { + // It's already checked, that it's not a prohibited term + + // It's one term query with one required or optional element + $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($terms)); + $optimizedQuery->setBoost($this->getBoost()); + + return $optimizedQuery; } + + if (count($terms) == 0) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + + $optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $signs); + $optimizedQuery->setBoost($this->getBoost()); + return $optimizedQuery; } @@ -198,12 +292,13 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc /** * Constructs an appropriate Weight implementation for this query. 
* - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return Zend_Search_Lucene_Search_Weight */ - protected function _createWeight($reader) + public function createWeight(Zend_Search_Lucene_Interface $reader) { - return new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader); + $this->_weight = new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader); + return $this->_weight; } @@ -211,38 +306,32 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc * Calculate result vector for Conjunction query * (like '+something +another') * - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader */ - private function _calculateConjunctionResult($reader) + private function _calculateConjunctionResult(Zend_Search_Lucene_Interface $reader) { - if (extension_loaded('bitset')) { - foreach( $this->_terms as $termId=>$term ) { - if($this->_resVector === null) { - $this->_resVector = bitset_from_array($reader->termDocs($term)); - } else { - $this->_resVector = bitset_intersection( - $this->_resVector, - bitset_from_array($reader->termDocs($term)) ); - } + $this->_resVector = null; - $this->_termsPositions[$termId] = $reader->termPositions($term); + if (count($this->_terms) == 0) { + $this->_resVector = array(); + } + + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = array_flip($reader->termDocs($term)); + } else { + $this->_resVector = array_intersect_key($this->_resVector, array_flip($reader->termDocs($term))); } - } else { - foreach( $this->_terms as $termId=>$term ) { - if($this->_resVector === null) { - $this->_resVector = array_flip($reader->termDocs($term)); - } else { - $termDocs = array_flip($reader->termDocs($term)); - foreach($this->_resVector as $key=>$value) { - if (!isset( $termDocs[$key] )) { - unset( $this->_resVector[$key] ); - } - } - } - $this->_termsPositions[$termId] = $reader->termPositions($term); + 
if (count($this->_resVector) == 0) { + // Empty result set, we don't need to check other terms + break; } + + $this->_termsFreqs[$termId] = $reader->termFreqs($term); } + + ksort($this->_resVector, SORT_NUMERIC); } @@ -250,89 +339,49 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc * Calculate result vector for non Conjunction query * (like '+something -another') * - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader */ - private function _calculateNonConjunctionResult($reader) + private function _calculateNonConjunctionResult(Zend_Search_Lucene_Interface $reader) { - if (extension_loaded('bitset')) { - $required = null; - $neither = bitset_empty(); - $prohibited = bitset_empty(); - - foreach ($this->_terms as $termId => $term) { - $termDocs = bitset_from_array($reader->termDocs($term)); - - if ($this->_signs[$termId] === true) { - // required - if ($required !== null) { - $required = bitset_intersection($required, $termDocs); - } else { - $required = $termDocs; - } - } elseif ($this->_signs[$termId] === false) { - // prohibited - $prohibited = bitset_union($prohibited, $termDocs); + $required = null; + $optional = array(); + $prohibited = array(); + + foreach ($this->_terms as $termId => $term) { + $termDocs = array_flip($reader->termDocs($term)); + + if ($this->_signs[$termId] === true) { + // required + if ($required !== null) { + // array intersection + $required = array_intersect_key($required, $termDocs); } else { - // neither required, nor prohibited - $neither = bitset_union($neither, $termDocs); + $required = $termDocs; } - - $this->_termsPositions[$termId] = $reader->termPositions($term); - } - - if ($required === null) { - $required = $neither; - } - $this->_resVector = bitset_intersection( $required, - bitset_invert($prohibited, $reader->count()) ); - } else { - $required = null; - $neither = array(); - $prohibited = array(); - - foreach ($this->_terms as $termId => $term) { - $termDocs 
= array_flip($reader->termDocs($term)); - - if ($this->_signs[$termId] === true) { - // required - if ($required !== null) { - // substitute for bitset_intersection - foreach ($required as $key => $value) { - if (!isset( $termDocs[$key] )) { - unset($required[$key]); - } - } - } else { - $required = $termDocs; - } - } elseif ($this->_signs[$termId] === false) { - // prohibited - // substitute for bitset_union - foreach ($termDocs as $key => $value) { - $prohibited[$key] = $value; - } - } else { - // neither required, nor prohibited - // substitute for bitset_union - foreach ($termDocs as $key => $value) { - $neither[$key] = $value; - } - } - - $this->_termsPositions[$termId] = $reader->termPositions($term); + } elseif ($this->_signs[$termId] === false) { + // prohibited + // array union + $prohibited += $termDocs; + } else { + // neither required, nor prohibited + // array union + $optional += $termDocs; } - if ($required === null) { - $required = $neither; - } + $this->_termsFreqs[$termId] = $reader->termFreqs($term); + } - foreach ($required as $key=>$value) { - if (isset( $prohibited[$key] )) { - unset($required[$key]); - } - } - $this->_resVector = $required; + if ($required !== null) { + $this->_resVector = (count($prohibited) > 0) ? + array_diff_key($required, $prohibited) : + $required; + } else { + $this->_resVector = (count($prohibited) > 0) ? 
+ array_diff_key($optional, $prohibited) : + $optional; } + + ksort($this->_resVector, SORT_NUMERIC); } @@ -340,10 +389,10 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc * Score calculator for conjunction queries (all terms are required) * * @param integer $docId - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return float */ - public function _conjunctionScore($docId, $reader) + public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader) { if ($this->_coord === null) { $this->_coord = $reader->getSimilarity()->coord(count($this->_terms), @@ -353,12 +402,16 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc $score = 0.0; foreach ($this->_terms as $termId=>$term) { - $score += $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) ) * + /** + * We don't need to check that term freq is not 0 + * Score calculation is performed only for matched docs + */ + $score += $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) * $this->_weights[$termId]->getValue() * $reader->norm($docId, $term->field); } - return $score * $this->_coord; + return $score * $this->_coord * $this->getBoost(); } @@ -366,7 +419,7 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc * Score calculator for non conjunction queries (not all terms are required) * * @param integer $docId - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return float */ public function _nonConjunctionScore($docId, $reader) @@ -390,50 +443,155 @@ class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Searc $matchedTerms = 0; foreach ($this->_terms as $termId=>$term) { // Check if term is - if ($this->_signs[$termId] !== false && // not prohibited - isset($this->_termsPositions[$termId][$docId]) // matched + if ($this->_signs[$termId] !== false && // not prohibited + 
isset($this->_termsFreqs[$termId][$docId]) // matched ) { $matchedTerms++; + + /** + * We don't need to check that term freq is not 0 + * Score calculation is performed only for matched docs + */ $score += - $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) ) * + $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) * $this->_weights[$termId]->getValue() * $reader->norm($docId, $term->field); } } - return $score * $this->_coord[$matchedTerms]; + return $score * $this->_coord[$matchedTerms] * $this->getBoost(); + } + + /** + * Execute query in context of index reader + * It also initializes necessary internal structures + * + * @param Zend_Search_Lucene_Interface $reader + */ + public function execute(Zend_Search_Lucene_Interface $reader) + { + if ($this->_signs === null) { + $this->_calculateConjunctionResult($reader); + } else { + $this->_calculateNonConjunctionResult($reader); + } + + // Initialize weight if it's not done yet + $this->_initWeight($reader); + } + + /** + * Get document ids likely matching the query + * + * It's an array with document ids as keys (performance considerations) + * + * @return array + */ + public function matchedDocs() + { + return $this->_resVector; } /** * Score specified document * * @param integer $docId - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return float */ - public function score($docId, $reader) + public function score($docId, Zend_Search_Lucene_Interface $reader) { - if($this->_resVector === null) { + if (isset($this->_resVector[$docId])) { if ($this->_signs === null) { - $this->_calculateConjunctionResult($reader); + return $this->_conjunctionScore($docId, $reader); } else { - $this->_calculateNonConjunctionResult($reader); + return $this->_nonConjunctionScore($docId, $reader); } + } else { + return 0; + } + } + + /** + * Return query terms + * + * @return array + */ + public function getQueryTerms() + { + if ($this->_signs === null) { + 
return $this->_terms; + } + + $terms = array(); - $this->_initWeight($reader); + foreach ($this->_signs as $id => $sign) { + if ($sign !== false) { + $terms[] = $this->_terms[$id]; + } } - if ( (extension_loaded('bitset')) ? - bitset_in($this->_resVector, $docId) : - isset($this->_resVector[$docId]) ) { - if ($this->_signs === null) { - return $this->_conjunctionScore($docId, $reader); - } else { - return $this->_nonConjunctionScore($docId, $reader); + return $terms; + } + + /** + * Highlight query terms + * + * @param integer &$colorIndex + * @param Zend_Search_Lucene_Document_Html $doc + */ + public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex) + { + $words = array(); + + if ($this->_signs === null) { + foreach ($this->_terms as $term) { + $words[] = $term->text; } } else { - return 0; + foreach ($this->_signs as $id => $sign) { + if ($sign !== false) { + $words[] = $this->_terms[$id]->text; + } + } } + + $doc->highlight($words, $this->_getHighlightColor($colorIndex)); + } + + /** + * Print a query + * + * @return string + */ + public function __toString() + { + // It's used only for query visualisation, so we don't care about characters escaping + + $query = ''; + + foreach ($this->_terms as $id => $term) { + if ($id != 0) { + $query .= ' '; + } + + if ($this->_signs === null || $this->_signs[$id] === true) { + $query .= '+'; + } else if ($this->_signs[$id] === false) { + $query .= '-'; + } + + if ($term->field !== null) { + $query .= $term->field . ':'; + } + $query .= $term->text; + } + + if ($this->getBoost() != 1) { + $query = '(' . $query . ')^' . 
$this->getBoost(); + } + + return $query; } } diff --git a/search/Zend/Search/Lucene/Search/Query/Phrase.php b/search/Zend/Search/Lucene/Search/Query/Phrase.php index b1d40b4bea..3e7b782792 100644 --- a/search/Zend/Search/Lucene/Search/Query/Phrase.php +++ b/search/Zend/Search/Lucene/Search/Query/Phrase.php @@ -15,7 +15,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -23,12 +23,12 @@ /** * Zend_Search_Lucene_Search_Query */ -require_once 'Zend/Search/Lucene/Search/Query.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query.php'; /** * Zend_Search_Lucene_Search_Weight_MultiTerm */ -require_once 'Zend/Search/Lucene/Search/Weight/Phrase.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Weight/Phrase.php'; /** @@ -37,7 +37,7 @@ require_once 'Zend/Search/Lucene/Search/Weight/Phrase.php'; * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Query @@ -73,16 +73,14 @@ class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Q * * The slop is zero by default, requiring exact matches. * - * @var unknown_type + * @var integer */ private $_slop; /** * Result vector. - * Bitset or array of document IDs - * (depending from Bitset extension availability). 
* - * @var mixed + * @var array */ private $_resVector = null; @@ -183,6 +181,70 @@ class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Q } + /** + * Re-write query into primitive queries in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + public function rewrite(Zend_Search_Lucene_Interface $index) + { + if (count($this->_terms) == 0) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } else if ($this->_terms[0]->field !== null) { + return $this; + } else { + $query = new Zend_Search_Lucene_Search_Query_Boolean(); + $query->setBoost($this->getBoost()); + + foreach ($index->getFieldNames(true) as $fieldName) { + $subquery = new Zend_Search_Lucene_Search_Query_Phrase(); + $subquery->setSlop($this->getSlop()); + + foreach ($this->_terms as $termId => $term) { + $qualifiedTerm = new Zend_Search_Lucene_Index_Term($term->text, $fieldName); + + $subquery->addTerm($qualifiedTerm, $this->_offsets[$termId]); + } + + $query->addSubquery($subquery); + } + + return $query; + } + } + + /** + * Optimize query in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + public function optimize(Zend_Search_Lucene_Interface $index) + { + // Check, that index contains all phrase terms + foreach ($this->_terms as $term) { + if (!$index->hasTerm($term)) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + } + + if (count($this->_terms) == 1) { + // It's one term query + $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($this->_terms)); + $optimizedQuery->setBoost($this->getBoost()); + + return $optimizedQuery; + } + + if (count($this->_terms) == 0) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + + + return $this; + } + /** * Returns query term * @@ -209,50 +271,13 @@ class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Q /** * 
Constructs an appropriate Weight implementation for this query. * - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return Zend_Search_Lucene_Search_Weight */ - protected function _createWeight($reader) + public function createWeight(Zend_Search_Lucene_Interface $reader) { - return new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader); - } - - - /** - * Calculate result vector - * - * @param Zend_Search_Lucene $reader - */ - private function _calculateResult($reader) - { - if (extension_loaded('bitset')) { - foreach( $this->_terms as $termId=>$term ) { - if($this->_resVector === null) { - $this->_resVector = bitset_from_array($reader->termDocs($term)); - } else { - $this->_resVector = bitset_intersection( - $this->_resVector, - bitset_from_array($reader->termDocs($term)) ); - } - - $this->_termsPositions[$termId] = $reader->termPositions($term); - } - } else { - foreach( $this->_terms as $termId=>$term ) { - if($this->_resVector === null) { - $this->_resVector = array_flip($reader->termDocs($term)); - } else { - $termDocs = array_flip($reader->termDocs($term)); - foreach($this->_resVector as $key=>$value) { - if (!isset( $termDocs[$key] )) { - unset( $this->_resVector[$key] ); - } - } - } - - $this->_termsPositions[$termId] = $reader->termPositions($term); - } - } + $this->_weight = new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader); + return $this->_weight; } @@ -305,10 +330,10 @@ class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Q * Score calculator for sloppy phrase queries (terms sequence is fixed) * * @param integer $docId - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return float */ - public function _sloppyPhraseFreq($docId, Zend_Search_Lucene $reader) + public function _sloppyPhraseFreq($docId, Zend_Search_Lucene_Interface $reader) { $freq = 0; @@ -377,50 +402,141 @@ class Zend_Search_Lucene_Search_Query_Phrase extends 
Zend_Search_Lucene_Search_Q return $freq; } - /** - * Score specified document + * Execute query in context of index reader + * It also initializes necessary internal structures * - * @param integer $docId - * @param Zend_Search_Lucene $reader - * @return float + * @param Zend_Search_Lucene_Interface $reader */ - public function score($docId, $reader) + public function execute(Zend_Search_Lucene_Interface $reader) { - // optimize zero-term case + $this->_resVector = null; + if (count($this->_terms) == 0) { - return 0; + $this->_resVector = array(); } - if($this->_resVector === null) { - $this->_calculateResult($reader); - $this->_initWeight($reader); + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = array_flip($reader->termDocs($term)); + } else { + $this->_resVector = array_intersect_key($this->_resVector, array_flip($reader->termDocs($term))); + } + + if (count($this->_resVector) == 0) { + // Empty result set, we don't need to check other terms + break; + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); } - if ( (extension_loaded('bitset')) ? 
- bitset_in($this->_resVector, $docId) : - isset($this->_resVector[$docId]) ) { + ksort($this->_resVector, SORT_NUMERIC); + + // Initialize weight if it's not done yet + $this->_initWeight($reader); + } + + /** + * Get document ids likely matching the query + * + * It's an array with document ids as keys (performance considerations) + * + * @return array + */ + public function matchedDocs() + { + return $this->_resVector; + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene_Interface $reader + * @return float + */ + public function score($docId, Zend_Search_Lucene_Interface $reader) + { + if (isset($this->_resVector[$docId])) { if ($this->_slop == 0) { $freq = $this->_exactPhraseFreq($docId); } else { $freq = $this->_sloppyPhraseFreq($docId, $reader); } -/* - return $reader->getSimilarity()->tf($freq) * - $this->_weight->getValue() * - $reader->norm($docId, reset($this->_terms)->field); -*/ if ($freq != 0) { $tf = $reader->getSimilarity()->tf($freq); $weight = $this->_weight->getValue(); $norm = $reader->norm($docId, reset($this->_terms)->field); - return $tf*$weight*$norm; + return $tf * $weight * $norm * $this->getBoost(); } + + // Included in result, but culculated freq is zero + return 0; } else { return 0; } } + + /** + * Return query terms + * + * @return array + */ + public function getQueryTerms() + { + return $this->_terms; + } + + /** + * Highlight query terms + * + * @param integer &$colorIndex + * @param Zend_Search_Lucene_Document_Html $doc + */ + public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex) + { + $words = array(); + foreach ($this->_terms as $term) { + $words[] = $term->text; + } + + $doc->highlight($words, $this->_getHighlightColor($colorIndex)); + } + + /** + * Print a query + * + * @return string + */ + public function __toString() + { + // It's used only for query visualisation, so we don't care about characters escaping + + $query = ''; + + if 
(isset($this->_terms[0]) && $this->_terms[0]->field !== null) { + $query .= $this->_terms[0]->field . ':'; + } + + $query .= '"'; + + foreach ($this->_terms as $id => $term) { + if ($id != 0) { + $query .= ' '; + } + $query .= $term->text; + } + + $query .= '"'; + + if ($this->_slop != 0) { + $query .= '~' . $this->_slop; + } + + return $query; + } } diff --git a/search/Zend/Search/Lucene/Search/Query/Term.php b/search/Zend/Search/Lucene/Search/Query/Term.php index b0baf0f5ac..0240104e32 100644 --- a/search/Zend/Search/Lucene/Search/Query/Term.php +++ b/search/Zend/Search/Lucene/Search/Query/Term.php @@ -15,23 +15,23 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Search_Query */ -require_once 'Zend/Search/Lucene/Search/Query.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query.php'; /** Zend_Search_Lucene_Search_Weight_Term */ -require_once 'Zend/Search/Lucene/Search/Weight/Term.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Weight/Term.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Query @@ -43,31 +43,20 @@ class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Que */ private $_term; - /** - * Term sign. - * If true then term is required - * If false then term is prohibited. - * - * @var bool - */ - private $_sign; - /** * Documents vector. 
- * Bitset or array of document IDs - * (depending from Bitset extension availability). * - * @var mixed + * @var array */ private $_docVector = null; /** - * Term positions vector. - * Array: docId => array( pos1, pos2, ... ) + * Term freqs vector. + * array(docId => freq, ...) * * @var array */ - private $_termPositions; + private $_termFreqs; /** @@ -76,53 +65,160 @@ class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Que * @param Zend_Search_Lucene_Index_Term $term * @param boolean $sign */ - public function __construct( $term, $sign = true ) + public function __construct($term) { $this->_term = $term; - $this->_sign = $sign; + } + + /** + * Re-write query into primitive queries in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + public function rewrite(Zend_Search_Lucene_Interface $index) + { + if ($this->_term->field != null) { + return $this; + } else { + $query = new Zend_Search_Lucene_Search_Query_MultiTerm(); + $query->setBoost($this->getBoost()); + + foreach ($index->getFieldNames(true) as $fieldName) { + $term = new Zend_Search_Lucene_Index_Term($this->_term->text, $fieldName); + + $query->addTerm($term); + } + + return $query->rewrite($index); + } + } + + /** + * Optimize query in the context of specified index + * + * @param Zend_Search_Lucene_Interface $index + * @return Zend_Search_Lucene_Search_Query + */ + public function optimize(Zend_Search_Lucene_Interface $index) + { + // Check, that index contains specified term + if (!$index->hasTerm($this->_term)) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + + return $this; } /** * Constructs an appropriate Weight implementation for this query. 
* - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return Zend_Search_Lucene_Search_Weight */ - protected function _createWeight($reader) + public function createWeight(Zend_Search_Lucene_Interface $reader) + { + $this->_weight = new Zend_Search_Lucene_Search_Weight_Term($this->_term, $this, $reader); + return $this->_weight; + } + + /** + * Execute query in context of index reader + * It also initializes necessary internal structures + * + * @param Zend_Search_Lucene_Interface $reader + */ + public function execute(Zend_Search_Lucene_Interface $reader) { - return new Zend_Search_Lucene_Search_Weight_Term($this->_term, $this, $reader); + $this->_docVector = array_flip($reader->termDocs($this->_term)); + $this->_termFreqs = $reader->termFreqs($this->_term); + + // Initialize weight if it's not done yet + $this->_initWeight($reader); + } + + /** + * Get document ids likely matching the query + * + * It's an array with document ids as keys (performance considerations) + * + * @return array + */ + public function matchedDocs() + { + return $this->_docVector; } /** * Score specified document * * @param integer $docId - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return float */ - public function score( $docId, $reader ) + public function score($docId, Zend_Search_Lucene_Interface $reader) { - if($this->_docVector===null) { - if (extension_loaded('bitset')) { - $this->_docVector = bitset_from_array( $reader->termDocs($this->_term) ); - } else { - $this->_docVector = array_flip($reader->termDocs($this->_term)); - } - - $this->_termPositions = $reader->termPositions($this->_term); - $this->_initWeight($reader); - } - - $match = extension_loaded('bitset') ? 
bitset_in($this->_docVector, $docId) : - isset($this->_docVector[$docId]); - if ($this->_sign && $match) { - return $reader->getSimilarity()->tf(count($this->_termPositions[$docId]) ) * + if (isset($this->_docVector[$docId])) { + return $reader->getSimilarity()->tf($this->_termFreqs[$docId]) * $this->_weight->getValue() * - $reader->norm($docId, $this->_term->field); + $reader->norm($docId, $this->_term->field) * + $this->getBoost(); } else { return 0; } } + + /** + * Return query terms + * + * @return array + */ + public function getQueryTerms() + { + return array($this->_term); + } + + /** + * Return query term + * + * @return Zend_Search_Lucene_Index_Term + */ + public function getTerm() + { + return $this->_term; + } + + /** + * Returns query term + * + * @return array + */ + public function getTerms() + { + return $this->_terms; + } + + /** + * Highlight query terms + * + * @param integer &$colorIndex + * @param Zend_Search_Lucene_Document_Html $doc + */ + public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex) + { + $doc->highlight($this->_term->text, $this->_getHighlightColor($colorIndex)); + } + + /** + * Print a query + * + * @return string + */ + public function __toString() + { + // It's used only for query visualisation, so we don't care about characters escaping + return (($this->_term->field === null)? '':$this->_term->field . ':') . 
$this->_term->text; + } } diff --git a/search/Zend/Search/Lucene/Search/QueryEntry.php b/search/Zend/Search/Lucene/Search/QueryEntry.php new file mode 100644 index 0000000000..53777c8570 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryEntry.php @@ -0,0 +1,87 @@ +dirroot.'/search/Zend/Search/Lucene/Index/Term.php'; + +/** Zend_Search_Lucene_Exception */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Search_QueryEntry_Term */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryEntry/Term.php'; + +/** Zend_Search_Lucene_Search_QueryEntry_Phrase */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryEntry/Phrase.php'; + +/** Zend_Search_Lucene_Search_QueryEntry_Subquery */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryEntry/Subquery.php'; + + +/** Zend_Search_Lucene_Search_QueryParserException */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParserException.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +abstract class Zend_Search_Lucene_Search_QueryEntry +{ + /** + * Query entry boost factor + * + * @var float + */ + protected $_boost = 1.0; + + + /** + * Process modifier ('~') + * + * @param mixed $parameter + */ + abstract public function processFuzzyProximityModifier($parameter = null); + + + /** + * Transform entry to a subquery + * + * @param string $encoding + * @return Zend_Search_Lucene_Search_Query + */ + abstract public function getQuery($encoding); + + /** + * Boost query entry + * + * @param float $boostFactor + */ + public function boost($boostFactor) + { + $this->_boost *= $boostFactor; + } + + +} diff --git a/search/Zend/Search/Lucene/Search/QueryEntry/Phrase.php b/search/Zend/Search/Lucene/Search/QueryEntry/Phrase.php new file mode 100644 index 0000000000..e90a58b497 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryEntry/Phrase.php @@ -0,0 +1,147 @@ +dirroot.'/search/Zend/Search/Lucene/Index/Term.php'; + +/** Zend_Search_Lucene_Exception */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Search_QueryEntry */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryEntry.php'; + +/** Zend_Search_Lucene_Search_QueryParserException */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParserException.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer.php'; + + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Search_QueryEntry_Phrase extends Zend_Search_Lucene_Search_QueryEntry +{ + /** + * Phrase value + * + * @var string + */ + private $_phrase; + + /** + * Field + * + * @var string|null + */ + private $_field; + + + /** + * Proximity phrase query + * + * @var boolean + */ + private $_proximityQuery = false; + + /** + * Words distance, used for proximiti queries + * + * @var integer + */ + private $_wordsDistance = 0; + + + /** + * Object constractor + * + * @param string $phrase + * @param string $field + */ + public function __construct($phrase, $field) + { + $this->_phrase = $phrase; + $this->_field = $field; + } + + /** + * Process modifier ('~') + * + * @param mixed $parameter + */ + public function processFuzzyProximityModifier($parameter = null) + { + $this->_proximityQuery = true; + + if ($parameter !== null) { + $this->_wordsDistance = $parameter; + } + } + + /** + * Transform entry to a subquery + * + * @param string $encoding + * @return Zend_Search_Lucene_Search_Query + * @throws Zend_Search_Lucene_Search_QueryParserException + */ + public function getQuery($encoding) + { + if (strpos($this->_phrase, '?') !== false || strpos($this->_phrase, '*') !== false) { + throw new Zend_Search_Lucene_Search_QueryParserException('Wildcards are only allowed in a single terms.'); + } + + $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $encoding); + + if (count($tokens) == 0) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + + if (count($tokens) == 1) { + $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field); + $query = new Zend_Search_Lucene_Search_Query_Term($term); + $query->setBoost($this->_boost); + + return $query; + } + + //It's not empty or one term query + $query = new Zend_Search_Lucene_Search_Query_Phrase(); + foreach ($tokens as $token) { + $term = new 
Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field); + $query->addTerm($term); + } + + if ($this->_proximityQuery) { + $query->setSlop($this->_wordsDistance); + } + + $query->setBoost($this->_boost); + + return $query; + } +} diff --git a/search/Zend/Search/Lucene/Search/QueryEntry/Subquery.php b/search/Zend/Search/Lucene/Search/QueryEntry/Subquery.php new file mode 100644 index 0000000000..8d6e64fc63 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryEntry/Subquery.php @@ -0,0 +1,86 @@ +dirroot.'/search/Zend/Search/Lucene/Index/Term.php'; + +/** Zend_Search_Lucene_Exception */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Search_QueryEntry */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryEntry.php'; + +/** Zend_Search_Lucene_Search_QueryParserException */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParserException.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Search_QueryEntry_Subquery extends Zend_Search_Lucene_Search_QueryEntry +{ + /** + * Query + * + * @var Zend_Search_Lucene_Search_Query + */ + private $_query; + + /** + * Object constractor + * + * @param Zend_Search_Lucene_Search_Query $query + */ + public function __construct(Zend_Search_Lucene_Search_Query $query) + { + $this->_query = $query; + } + + /** + * Process modifier ('~') + * + * @param mixed $parameter + * @throws Zend_Search_Lucene_Search_QueryParserException + */ + public function processFuzzyProximityModifier($parameter = null) + { + throw new Zend_Search_Lucene_Search_QueryParserException('\'~\' sign must follow term or phrase'); + } + + + /** + * Transform entry to a subquery + * + * @param string $encoding + * @return Zend_Search_Lucene_Search_Query + */ + public function getQuery($encoding) + { + $this->_query->setBoost($this->_boost); + + return $this->_query; + } +} diff --git a/search/Zend/Search/Lucene/Search/QueryEntry/Term.php b/search/Zend/Search/Lucene/Search/QueryEntry/Term.php new file mode 100644 index 0000000000..f9b3d9746e --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryEntry/Term.php @@ -0,0 +1,154 @@ +dirroot.'/search/Zend/Search/Lucene/Index/Term.php'; + +/** Zend_Search_Lucene_Exception */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Search_QueryEntry */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryEntry.php'; + +/** Zend_Search_Lucene_Search_QueryParserException */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParserException.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer.php'; + + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies 
USA Inc. (http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Search_QueryEntry_Term extends Zend_Search_Lucene_Search_QueryEntry +{ + /** + * Term value + * + * @var string + */ + private $_term; + + /** + * Field + * + * @var string|null + */ + private $_field; + + + /** + * Fuzzy search query + * + * @var boolean + */ + private $_fuzzyQuery = false; + + /** + * Similarity + * + * @var float + */ + private $_similarity = 1.; + + + /** + * Object constractor + * + * @param string $term + * @param string $field + */ + public function __construct($term, $field) + { + $this->_term = $term; + $this->_field = $field; + } + + /** + * Process modifier ('~') + * + * @param mixed $parameter + */ + public function processFuzzyProximityModifier($parameter = null) + { + $this->_fuzzyQuery = true; + + if ($parameter !== null) { + $this->_similarity = $parameter; + } else { + $this->_similarity = 0.5; + } + } + + /** + * Transform entry to a subquery + * + * @param string $encoding + * @return Zend_Search_Lucene_Search_Query + * @throws Zend_Search_Lucene_Search_QueryParserException + */ + public function getQuery($encoding) + { + if ($this->_fuzzyQuery) { + throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is not supported yet.'); + } + + if (strpos($this->_term, '?') !== false || strpos($this->_term, '*') !== false) { + throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard queries are not supported yet.'); + } + + $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_term, $encoding); + + if (count($tokens) == 0) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + + if (count($tokens) == 1) { + $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field); + $query = new Zend_Search_Lucene_Search_Query_Term($term); + $query->setBoost($this->_boost); + + return $query; + } + + //It's not empty or one term query + 
$query = new Zend_Search_Lucene_Search_Query_MultiTerm(); + + /** + * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other + * analizer design features + */ + foreach ($tokens as $token) { + $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field); + $query->addTerm($term, true); // all subterms are required + } + + $query->setBoost($this->_boost); + + return $query; + } +} diff --git a/search/Zend/Search/Lucene/Search/QueryHit.php b/search/Zend/Search/Lucene/Search/QueryHit.php index 19ab381fe4..12278ea4ad 100644 --- a/search/Zend/Search/Lucene/Search/QueryHit.php +++ b/search/Zend/Search/Lucene/Search/QueryHit.php @@ -15,7 +15,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -24,14 +24,14 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_QueryHit { /** * Object handle of the index - * @var Zend_Search_Lucene + * @var Zend_Search_Lucene_Interface */ protected $_index = null; @@ -55,15 +55,15 @@ class Zend_Search_Lucene_Search_QueryHit /** - * Constructor - pass object handle of Zend_Search_Lucene index that produced + * Constructor - pass object handle of Zend_Search_Lucene_Interface index that produced * the hit so the document can be retrieved easily from the hit. 
* - * @param Zend_Search_Lucene $index + * @param Zend_Search_Lucene_Interface $index */ - public function __construct(Zend_Search_Lucene $index) + public function __construct(Zend_Search_Lucene_Interface $index) { - $this->_index = $index; + $this->_index = new Zend_Search_Lucene_Proxy($index); } @@ -98,7 +98,7 @@ class Zend_Search_Lucene_Search_QueryHit /** * Return the index object for this hit * - * @return Zend_Search_Lucene + * @return Zend_Search_Lucene_Interface */ public function getIndex() { diff --git a/search/Zend/Search/Lucene/Search/QueryLexer.php b/search/Zend/Search/Lucene/Search/QueryLexer.php new file mode 100644 index 0000000000..6b72110bfd --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryLexer.php @@ -0,0 +1,508 @@ +dirroot.'/search/Zend/Search/Lucene/FSM.php'; + +/** Zend_Search_Lucene_Search_QueryParser */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryToken.php'; + +/** Zend_Search_Lucene_Exception */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Search_QueryParserException */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParserException.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Search_QueryLexer extends Zend_Search_Lucene_FSM +{ + /** State Machine states */ + const ST_WHITE_SPACE = 0; + const ST_SYNT_LEXEME = 1; + const ST_LEXEME = 2; + const ST_QUOTED_LEXEME = 3; + const ST_ESCAPED_CHAR = 4; + const ST_ESCAPED_QCHAR = 5; + const ST_LEXEME_MODIFIER = 6; + const ST_NUMBER = 7; + const ST_MANTISSA = 8; + const ST_ERROR = 9; + + /** Input symbols */ + const IN_WHITE_SPACE = 0; + const IN_SYNT_CHAR = 1; + const IN_LEXEME_MODIFIER = 2; + const IN_ESCAPE_CHAR = 3; + const IN_QUOTE = 4; + const IN_DECIMAL_POINT = 5; + const IN_ASCII_DIGIT = 6; + const IN_CHAR = 7; + const IN_MUTABLE_CHAR = 8; + + const QUERY_WHITE_SPACE_CHARS = " \n\r\t"; + const QUERY_SYNT_CHARS = ':()[]{}!|&'; + const QUERY_MUTABLE_CHARS = '+-'; + const QUERY_DOUBLECHARLEXEME_CHARS = '|&'; + const QUERY_LEXEMEMODIFIER_CHARS = '~^'; + const QUERY_ASCIIDIGITS_CHARS = '0123456789'; + + /** + * List of recognized lexemes + * + * @var array + */ + private $_lexemes; + + /** + * Query string (array of single- or non single-byte characters) + * + * @var array + */ + private $_queryString; + + /** + * Current position within a query string + * Used to create appropriate error messages + * + * @var integer + */ + private $_queryStringPosition; + + /** + * Recognized part of current lexeme + * + * @var string + */ + private $_currentLexeme; + + public function __construct() + { + parent::__construct( array(self::ST_WHITE_SPACE, + self::ST_SYNT_LEXEME, + self::ST_LEXEME, + self::ST_QUOTED_LEXEME, + self::ST_ESCAPED_CHAR, + self::ST_ESCAPED_QCHAR, + self::ST_LEXEME_MODIFIER, + self::ST_NUMBER, + self::ST_MANTISSA, + self::ST_ERROR), + array(self::IN_WHITE_SPACE, + self::IN_SYNT_CHAR, + self::IN_MUTABLE_CHAR, + self::IN_LEXEME_MODIFIER, + self::IN_ESCAPE_CHAR, + self::IN_QUOTE, + self::IN_DECIMAL_POINT, + self::IN_ASCII_DIGIT, + self::IN_CHAR)); + + + 
$lexemeModifierErrorAction = new Zend_Search_Lucene_FSMAction($this, 'lexModifierErrException'); + $quoteWithinLexemeErrorAction = new Zend_Search_Lucene_FSMAction($this, 'quoteWithinLexemeErrException'); + $wrongNumberErrorAction = new Zend_Search_Lucene_FSMAction($this, 'wrongNumberErrException'); + + + + $this->addRules(array( array(self::ST_WHITE_SPACE, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), + array(self::ST_WHITE_SPACE, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_WHITE_SPACE, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_WHITE_SPACE, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), + array(self::ST_WHITE_SPACE, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR), + array(self::ST_WHITE_SPACE, self::IN_QUOTE, self::ST_QUOTED_LEXEME), + array(self::ST_WHITE_SPACE, self::IN_DECIMAL_POINT, self::ST_LEXEME), + array(self::ST_WHITE_SPACE, self::IN_ASCII_DIGIT, self::ST_LEXEME), + array(self::ST_WHITE_SPACE, self::IN_CHAR, self::ST_LEXEME) + )); + $this->addRules(array( array(self::ST_SYNT_LEXEME, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), + array(self::ST_SYNT_LEXEME, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_SYNT_LEXEME, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_SYNT_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), + array(self::ST_SYNT_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR), + array(self::ST_SYNT_LEXEME, self::IN_QUOTE, self::ST_QUOTED_LEXEME), + array(self::ST_SYNT_LEXEME, self::IN_DECIMAL_POINT, self::ST_LEXEME), + array(self::ST_SYNT_LEXEME, self::IN_ASCII_DIGIT, self::ST_LEXEME), + array(self::ST_SYNT_LEXEME, self::IN_CHAR, self::ST_LEXEME) + )); + $this->addRules(array( array(self::ST_LEXEME, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), + array(self::ST_LEXEME, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_LEXEME, self::IN_MUTABLE_CHAR, self::ST_LEXEME), + array(self::ST_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), + 
array(self::ST_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR), + + // IN_QUOTE not allowed + array(self::ST_LEXEME, self::IN_QUOTE, self::ST_ERROR, $quoteWithinLexemeErrorAction), + + array(self::ST_LEXEME, self::IN_DECIMAL_POINT, self::ST_LEXEME), + array(self::ST_LEXEME, self::IN_ASCII_DIGIT, self::ST_LEXEME), + array(self::ST_LEXEME, self::IN_CHAR, self::ST_LEXEME) + )); + $this->addRules(array( array(self::ST_QUOTED_LEXEME, self::IN_WHITE_SPACE, self::ST_QUOTED_LEXEME), + array(self::ST_QUOTED_LEXEME, self::IN_SYNT_CHAR, self::ST_QUOTED_LEXEME), + array(self::ST_QUOTED_LEXEME, self::IN_MUTABLE_CHAR, self::ST_QUOTED_LEXEME), + array(self::ST_QUOTED_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME), + array(self::ST_QUOTED_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_QCHAR), + array(self::ST_QUOTED_LEXEME, self::IN_QUOTE, self::ST_WHITE_SPACE), + array(self::ST_QUOTED_LEXEME, self::IN_DECIMAL_POINT, self::ST_QUOTED_LEXEME), + array(self::ST_QUOTED_LEXEME, self::IN_ASCII_DIGIT, self::ST_QUOTED_LEXEME), + array(self::ST_QUOTED_LEXEME, self::IN_CHAR, self::ST_QUOTED_LEXEME) + )); + $this->addRules(array( array(self::ST_ESCAPED_CHAR, self::IN_WHITE_SPACE, self::ST_LEXEME), + array(self::ST_ESCAPED_CHAR, self::IN_SYNT_CHAR, self::ST_LEXEME), + array(self::ST_ESCAPED_CHAR, self::IN_MUTABLE_CHAR, self::ST_LEXEME), + array(self::ST_ESCAPED_CHAR, self::IN_LEXEME_MODIFIER, self::ST_LEXEME), + array(self::ST_ESCAPED_CHAR, self::IN_ESCAPE_CHAR, self::ST_LEXEME), + array(self::ST_ESCAPED_CHAR, self::IN_QUOTE, self::ST_LEXEME), + array(self::ST_ESCAPED_CHAR, self::IN_DECIMAL_POINT, self::ST_LEXEME), + array(self::ST_ESCAPED_CHAR, self::IN_ASCII_DIGIT, self::ST_LEXEME), + array(self::ST_ESCAPED_CHAR, self::IN_CHAR, self::ST_LEXEME) + )); + $this->addRules(array( array(self::ST_ESCAPED_QCHAR, self::IN_WHITE_SPACE, self::ST_QUOTED_LEXEME), + array(self::ST_ESCAPED_QCHAR, self::IN_SYNT_CHAR, self::ST_QUOTED_LEXEME), + array(self::ST_ESCAPED_QCHAR, 
self::IN_MUTABLE_CHAR, self::ST_QUOTED_LEXEME), + array(self::ST_ESCAPED_QCHAR, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME), + array(self::ST_ESCAPED_QCHAR, self::IN_ESCAPE_CHAR, self::ST_QUOTED_LEXEME), + array(self::ST_ESCAPED_QCHAR, self::IN_QUOTE, self::ST_QUOTED_LEXEME), + array(self::ST_ESCAPED_QCHAR, self::IN_DECIMAL_POINT, self::ST_QUOTED_LEXEME), + array(self::ST_ESCAPED_QCHAR, self::IN_ASCII_DIGIT, self::ST_QUOTED_LEXEME), + array(self::ST_ESCAPED_QCHAR, self::IN_CHAR, self::ST_QUOTED_LEXEME) + )); + $this->addRules(array( array(self::ST_LEXEME_MODIFIER, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), + array(self::ST_LEXEME_MODIFIER, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_LEXEME_MODIFIER, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_LEXEME_MODIFIER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), + + // IN_ESCAPE_CHAR not allowed + array(self::ST_LEXEME_MODIFIER, self::IN_ESCAPE_CHAR, self::ST_ERROR, $lexemeModifierErrorAction), + + // IN_QUOTE not allowed + array(self::ST_LEXEME_MODIFIER, self::IN_QUOTE, self::ST_ERROR, $lexemeModifierErrorAction), + + + array(self::ST_LEXEME_MODIFIER, self::IN_DECIMAL_POINT, self::ST_MANTISSA), + array(self::ST_LEXEME_MODIFIER, self::IN_ASCII_DIGIT, self::ST_NUMBER), + + // IN_CHAR not allowed + array(self::ST_LEXEME_MODIFIER, self::IN_CHAR, self::ST_ERROR, $lexemeModifierErrorAction), + )); + $this->addRules(array( array(self::ST_NUMBER, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), + array(self::ST_NUMBER, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_NUMBER, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_NUMBER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), + + // IN_ESCAPE_CHAR not allowed + array(self::ST_NUMBER, self::IN_ESCAPE_CHAR, self::ST_ERROR, $wrongNumberErrorAction), + + // IN_QUOTE not allowed + array(self::ST_NUMBER, self::IN_QUOTE, self::ST_ERROR, $wrongNumberErrorAction), + + array(self::ST_NUMBER, 
self::IN_DECIMAL_POINT, self::ST_MANTISSA), + array(self::ST_NUMBER, self::IN_ASCII_DIGIT, self::ST_NUMBER), + + // IN_CHAR not allowed + array(self::ST_NUMBER, self::IN_CHAR, self::ST_ERROR, $wrongNumberErrorAction), + )); + $this->addRules(array( array(self::ST_MANTISSA, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), + array(self::ST_MANTISSA, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_MANTISSA, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME), + array(self::ST_MANTISSA, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), + + // IN_ESCAPE_CHAR not allowed + array(self::ST_MANTISSA, self::IN_ESCAPE_CHAR, self::ST_ERROR, $wrongNumberErrorAction), + + // IN_QUOTE not allowed + array(self::ST_MANTISSA, self::IN_QUOTE, self::ST_ERROR, $wrongNumberErrorAction), + + // IN_DECIMAL_POINT not allowed + array(self::ST_MANTISSA, self::IN_DECIMAL_POINT, self::ST_ERROR, $wrongNumberErrorAction), + + array(self::ST_MANTISSA, self::IN_ASCII_DIGIT, self::ST_MANTISSA), + + // IN_CHAR not allowed + array(self::ST_MANTISSA, self::IN_CHAR, self::ST_ERROR, $wrongNumberErrorAction), + )); + + + /** Actions */ + $syntaxLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addQuerySyntaxLexeme'); + $lexemeModifierAction = new Zend_Search_Lucene_FSMAction($this, 'addLexemeModifier'); + $addLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addLexeme'); + $addQuotedLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addQuotedLexeme'); + $addNumberLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addNumberLexeme'); + $addLexemeCharAction = new Zend_Search_Lucene_FSMAction($this, 'addLexemeChar'); + + + /** Syntax lexeme */ + $this->addEntryAction(self::ST_SYNT_LEXEME, $syntaxLexemeAction); + // Two lexemes in succession + $this->addTransitionAction(self::ST_SYNT_LEXEME, self::ST_SYNT_LEXEME, $syntaxLexemeAction); + + + /** Lexeme */ + $this->addEntryAction(self::ST_LEXEME, $addLexemeCharAction); + $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME, 
$addLexemeCharAction); + // ST_ESCAPED_CHAR => ST_LEXEME transition is covered by ST_LEXEME entry action + + $this->addTransitionAction(self::ST_LEXEME, self::ST_WHITE_SPACE, $addLexemeAction); + $this->addTransitionAction(self::ST_LEXEME, self::ST_SYNT_LEXEME, $addLexemeAction); + $this->addTransitionAction(self::ST_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeAction); + $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME_MODIFIER, $addLexemeAction); + $this->addTransitionAction(self::ST_LEXEME, self::ST_NUMBER, $addLexemeAction); + $this->addTransitionAction(self::ST_LEXEME, self::ST_MANTISSA, $addLexemeAction); + + + /** Quoted lexeme */ + // We don't need entry action (skeep quote) + $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeCharAction); + $this->addTransitionAction(self::ST_ESCAPED_QCHAR, self::ST_QUOTED_LEXEME, $addLexemeCharAction); + // Closing quote changes state to the ST_WHITE_SPACE other states are not used + $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_WHITE_SPACE, $addQuotedLexemeAction); + + + /** Lexeme modifier */ + $this->addEntryAction(self::ST_LEXEME_MODIFIER, $lexemeModifierAction); + + + /** Number */ + $this->addEntryAction(self::ST_NUMBER, $addLexemeCharAction); + $this->addEntryAction(self::ST_MANTISSA, $addLexemeCharAction); + $this->addTransitionAction(self::ST_NUMBER, self::ST_NUMBER, $addLexemeCharAction); + // ST_NUMBER => ST_MANTISSA transition is covered by ST_MANTISSA entry action + $this->addTransitionAction(self::ST_MANTISSA, self::ST_MANTISSA, $addLexemeCharAction); + + $this->addTransitionAction(self::ST_NUMBER, self::ST_WHITE_SPACE, $addNumberLexemeAction); + $this->addTransitionAction(self::ST_NUMBER, self::ST_SYNT_LEXEME, $addNumberLexemeAction); + $this->addTransitionAction(self::ST_NUMBER, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction); + $this->addTransitionAction(self::ST_MANTISSA, self::ST_WHITE_SPACE, $addNumberLexemeAction); + 
$this->addTransitionAction(self::ST_MANTISSA, self::ST_SYNT_LEXEME, $addNumberLexemeAction); + $this->addTransitionAction(self::ST_MANTISSA, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction); + } + + + + + /** + * Translate input char to an input symbol of state machine + * + * @param string $char + * @return integer + */ + private function _translateInput($char) + { + if (strpos(self::QUERY_WHITE_SPACE_CHARS, $char) !== false) { return self::IN_WHITE_SPACE; + } else if (strpos(self::QUERY_SYNT_CHARS, $char) !== false) { return self::IN_SYNT_CHAR; + } else if (strpos(self::QUERY_MUTABLE_CHARS, $char) !== false) { return self::IN_MUTABLE_CHAR; + } else if (strpos(self::QUERY_LEXEMEMODIFIER_CHARS, $char) !== false) { return self::IN_LEXEME_MODIFIER; + } else if (strpos(self::QUERY_ASCIIDIGITS_CHARS, $char) !== false) { return self::IN_ASCII_DIGIT; + } else if ($char === '"' ) { return self::IN_QUOTE; + } else if ($char === '.' ) { return self::IN_DECIMAL_POINT; + } else if ($char === '\\') { return self::IN_ESCAPE_CHAR; + } else { return self::IN_CHAR; + } + } + + + /** + * This method is used to tokenize query string into lexemes + * + * @param string $inputString + * @param string $encoding + * @return array + * @throws Zend_Search_Lucene_Search_QueryParserException + */ + public function tokenize($inputString, $encoding) + { + $this->reset(); + + $this->_lexemes = array(); + $this->_queryString = array(); + + $strLength = iconv_strlen($inputString, $encoding); + + // Workaround for iconv_substr bug + $inputString .= ' '; + + for ($count = 0; $count < $strLength; $count++) { + $this->_queryString[$count] = iconv_substr($inputString, $count, 1, $encoding); + } + + for ($this->_queryStringPosition = 0; + $this->_queryStringPosition < count($this->_queryString); + $this->_queryStringPosition++) { + $this->process($this->_translateInput($this->_queryString[$this->_queryStringPosition])); + } + + $this->process(self::IN_WHITE_SPACE); + + if ($this->getState() != 
self::ST_WHITE_SPACE) { + throw new Zend_Search_Lucene_Search_QueryParserException('Unexpected end of query'); + } + + $this->_queryString = null; + + return $this->_lexemes; + } + + + + /********************************************************************* + * Actions implementation + * + * Actions affect on recognized lexemes list + *********************************************************************/ + + /** + * Add query syntax lexeme + * + * @throws Zend_Search_Lucene_Search_QueryParserException + */ + public function addQuerySyntaxLexeme() + { + $lexeme = $this->_queryString[$this->_queryStringPosition]; + + // Process two char lexemes + if (strpos(self::QUERY_DOUBLECHARLEXEME_CHARS, $lexeme) !== false) { + // increase current position in a query string + $this->_queryStringPosition++; + + // check, + if ($this->_queryStringPosition == count($this->_queryString) || + $this->_queryString[$this->_queryStringPosition] != $lexeme) { + throw new Zend_Search_Lucene_Search_QueryParserException('Two chars lexeme expected. ' . $this->_positionMsg()); + } + + // duplicate character + $lexeme .= $lexeme; + } + + $token = new Zend_Search_Lucene_Search_QueryToken( + Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT, + $lexeme, + $this->_queryStringPosition); + + // Skip this lexeme if it's a field indicator ':' and treat previous as 'field' instead of 'word' + if ($token->type == Zend_Search_Lucene_Search_QueryToken::TT_FIELD_INDICATOR) { + $token = array_pop($this->_lexemes); + if ($token === null || $token->type != Zend_Search_Lucene_Search_QueryToken::TT_WORD) { + throw new Zend_Search_Lucene_Search_QueryParserException('Field mark \':\' must follow field name. ' . 
$this->_positionMsg()); + } + + $token->type = Zend_Search_Lucene_Search_QueryToken::TT_FIELD; + } + + $this->_lexemes[] = $token; + } + + /** + * Add lexeme modifier + */ + public function addLexemeModifier() + { + $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken( + Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT, + $this->_queryString[$this->_queryStringPosition], + $this->_queryStringPosition); + } + + + /** + * Add lexeme + */ + public function addLexeme() + { + $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken( + Zend_Search_Lucene_Search_QueryToken::TC_WORD, + $this->_currentLexeme, + $this->_queryStringPosition - 1); + + $this->_currentLexeme = ''; + } + + /** + * Add quoted lexeme + */ + public function addQuotedLexeme() + { + $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken( + Zend_Search_Lucene_Search_QueryToken::TC_PHRASE, + $this->_currentLexeme, + $this->_queryStringPosition); + + $this->_currentLexeme = ''; + } + + /** + * Add number lexeme + */ + public function addNumberLexeme() + { + $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken( + Zend_Search_Lucene_Search_QueryToken::TC_NUMBER, + $this->_currentLexeme, + $this->_queryStringPosition - 1); + $this->_currentLexeme = ''; + } + + /** + * Extend lexeme by one char + */ + public function addLexemeChar() + { + $this->_currentLexeme .= $this->_queryString[$this->_queryStringPosition]; + } + + + /** + * Position message + * + * @return string + */ + private function _positionMsg() + { + return 'Position is ' . $this->_queryStringPosition . '.'; + } + + + /********************************************************************* + * Syntax errors actions + *********************************************************************/ + public function lexModifierErrException() + { + throw new Zend_Search_Lucene_Search_QueryParserException('Lexeme modifier character can be followed only by number, white space or query syntax element. ' . 
$this->_positionMsg()); + } + public function quoteWithinLexemeErrException() + { + throw new Zend_Search_Lucene_Search_QueryParserException('Quote within lexeme must be escaped by \'\\\' char. ' . $this->_positionMsg()); + } + public function wrongNumberErrException() + { + throw new Zend_Search_Lucene_Search_QueryParserException('Wrong number syntax. ' . $this->_positionMsg()); + } +} + diff --git a/search/Zend/Search/Lucene/Search/QueryParser.php b/search/Zend/Search/Lucene/Search/QueryParser.php index 63b6497e05..1a3d5712de 100644 --- a/search/Zend/Search/Lucene/Search/QueryParser.php +++ b/search/Zend/Search/Lucene/Search/QueryParser.php @@ -15,128 +15,507 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ -/** Zend_Search_Lucene_Search_QueryTokenizer */ -require_once 'Zend/Search/Lucene/Search/QueryTokenizer.php'; - /** Zend_Search_Lucene_Index_Term */ -require_once 'Zend/Search/Lucene/Index/Term.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Term.php'; /** Zend_Search_Lucene_Search_Query_Term */ -require_once 'Zend/Search/Lucene/Search/Query/Term.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Term.php'; /** Zend_Search_Lucene_Search_Query_MultiTerm */ -require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/MultiTerm.php'; + +/** Zend_Search_Lucene_Search_Query_Boolean */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Boolean.php'; /** Zend_Search_Lucene_Search_Query_Phrase */ -require_once 'Zend/Search/Lucene/Search/Query/Phrase.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Phrase.php'; + +/** 
Zend_Search_Lucene_Search_Query_Empty */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Empty.php'; + + +/** Zend_Search_Lucene_Search_QueryLexer */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryLexer.php'; + +/** Zend_Search_Lucene_Search_QueryParserContext */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParserContext.php'; + +/** Zend_Search_Lucene_FSM */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/FSM.php'; /** Zend_Search_Lucene_Exception */ -require_once 'Zend/Search/Lucene/Exception.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Search_QueryParserException */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParserException.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ -class Zend_Search_Lucene_Search_QueryParser +class Zend_Search_Lucene_Search_QueryParser extends Zend_Search_Lucene_FSM { + /** + * Parser instance + * + * @var Zend_Search_Lucene_Search_QueryParser + */ + private static $_instance = null; + /** - * Parses a query string, returning a Zend_Search_Lucene_Search_Query + * Query lexer + * + * @var Zend_Search_Lucene_Search_QueryLexer + */ + private $_lexer; + + /** + * Tokens list + * Array of Zend_Search_Lucene_Search_QueryToken objects + * + * @var array + */ + private $_tokens; + + /** + * Current token + * + * @var Zend_Search_Lucene_Search_QueryToken + */ + private $_currentToken; + + /** + * Last token + * + * It can be processed within FSM states, but this additional state simplifies FSM + * + * @var Zend_Search_Lucene_Search_QueryToken + */ + private $_lastToken = null; + + /** + * Range query first term + * + * @var string + */ + private $_rqFirstTerm = null; + + /** + * Current query parser context + * + * @var Zend_Search_Lucene_Search_QueryParserContext + */ + private $_context; + + /** + * Context stack + * + * @var array + */ + private $_contextStack; + + /** + * Query string encoding + * + * @var string + */ + private $_encoding; + + /** + * Query string default encoding + * + * @var string + */ + private $_defaultEncoding = ''; + + + /** + * Boolean operators constants + */ + const B_OR = 0; + const B_AND = 1; + + /** + * Default boolean queries operator + * + * @var integer + */ + private $_defaultOperator = self::B_OR; + + + /** Query parser State Machine states */ + const ST_COMMON_QUERY_ELEMENT = 0; // Terms, phrases, operators + const ST_CLOSEDINT_RQ_START = 1; // Range query start (closed interval) - '[' + const ST_CLOSEDINT_RQ_FIRST_TERM = 2; // First term in '[term1 to term2]' construction + const ST_CLOSEDINT_RQ_TO_TERM = 3; // 'TO' lexeme in '[term1 to term2]' construction + const ST_CLOSEDINT_RQ_LAST_TERM = 4; // Second 
term in '[term1 to term2]' construction + const ST_CLOSEDINT_RQ_END = 5; // Range query end (closed interval) - ']' + const ST_OPENEDINT_RQ_START = 6; // Range query start (opened interval) - '{' + const ST_OPENEDINT_RQ_FIRST_TERM = 7; // First term in '{term1 to term2}' construction + const ST_OPENEDINT_RQ_TO_TERM = 8; // 'TO' lexeme in '{term1 to term2}' construction + const ST_OPENEDINT_RQ_LAST_TERM = 9; // Second term in '{term1 to term2}' construction + const ST_OPENEDINT_RQ_END = 10; // Range query end (opened interval) - '}' + + /** + * Parser constructor + */ + public function __construct() + { + parent::__construct(array(self::ST_COMMON_QUERY_ELEMENT, + self::ST_CLOSEDINT_RQ_START, + self::ST_CLOSEDINT_RQ_FIRST_TERM, + self::ST_CLOSEDINT_RQ_TO_TERM, + self::ST_CLOSEDINT_RQ_LAST_TERM, + self::ST_CLOSEDINT_RQ_END, + self::ST_OPENEDINT_RQ_START, + self::ST_OPENEDINT_RQ_FIRST_TERM, + self::ST_OPENEDINT_RQ_TO_TERM, + self::ST_OPENEDINT_RQ_LAST_TERM, + self::ST_OPENEDINT_RQ_END + ), + Zend_Search_Lucene_Search_QueryToken::getTypes()); + + $this->addRules( + array(array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PHRASE, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FIELD, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PROHIBITED, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FUZZY_PROX_MARK, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_BOOSTING_MARK, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, 
Zend_Search_Lucene_Search_QueryToken::TT_RANGE_INCL_START, self::ST_CLOSEDINT_RQ_START), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_EXCL_START, self::ST_OPENEDINT_RQ_START), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_START, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_END, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_AND_LEXEME, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_OR_LEXEME, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NOT_LEXEME, self::ST_COMMON_QUERY_ELEMENT), + array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NUMBER, self::ST_COMMON_QUERY_ELEMENT) + )); + $this->addRules( + array(array(self::ST_CLOSEDINT_RQ_START, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_CLOSEDINT_RQ_FIRST_TERM), + array(self::ST_CLOSEDINT_RQ_FIRST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_TO_LEXEME, self::ST_CLOSEDINT_RQ_TO_TERM), + array(self::ST_CLOSEDINT_RQ_TO_TERM, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_CLOSEDINT_RQ_LAST_TERM), + array(self::ST_CLOSEDINT_RQ_LAST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_INCL_END, self::ST_COMMON_QUERY_ELEMENT) + )); + $this->addRules( + array(array(self::ST_OPENEDINT_RQ_START, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_OPENEDINT_RQ_FIRST_TERM), + array(self::ST_OPENEDINT_RQ_FIRST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_TO_LEXEME, self::ST_OPENEDINT_RQ_TO_TERM), + array(self::ST_OPENEDINT_RQ_TO_TERM, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_OPENEDINT_RQ_LAST_TERM), + array(self::ST_OPENEDINT_RQ_LAST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_EXCL_END, 
self::ST_COMMON_QUERY_ELEMENT) + )); + + + + $addTermEntryAction = new Zend_Search_Lucene_FSMAction($this, 'addTermEntry'); + $addPhraseEntryAction = new Zend_Search_Lucene_FSMAction($this, 'addPhraseEntry'); + $setFieldAction = new Zend_Search_Lucene_FSMAction($this, 'setField'); + $setSignAction = new Zend_Search_Lucene_FSMAction($this, 'setSign'); + $setFuzzyProxAction = new Zend_Search_Lucene_FSMAction($this, 'processFuzzyProximityModifier'); + $processModifierParameterAction = new Zend_Search_Lucene_FSMAction($this, 'processModifierParameter'); + $subqueryStartAction = new Zend_Search_Lucene_FSMAction($this, 'subqueryStart'); + $subqueryEndAction = new Zend_Search_Lucene_FSMAction($this, 'subqueryEnd'); + $logicalOperatorAction = new Zend_Search_Lucene_FSMAction($this, 'logicalOperator'); + $openedRQFirstTermAction = new Zend_Search_Lucene_FSMAction($this, 'openedRQFirstTerm'); + $openedRQLastTermAction = new Zend_Search_Lucene_FSMAction($this, 'openedRQLastTerm'); + $closedRQFirstTermAction = new Zend_Search_Lucene_FSMAction($this, 'closedRQFirstTerm'); + $closedRQLastTermAction = new Zend_Search_Lucene_FSMAction($this, 'closedRQLastTerm'); + + + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_WORD, $addTermEntryAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PHRASE, $addPhraseEntryAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FIELD, $setFieldAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED, $setSignAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PROHIBITED, $setSignAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FUZZY_PROX_MARK, $setFuzzyProxAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, 
Zend_Search_Lucene_Search_QueryToken::TT_NUMBER, $processModifierParameterAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_START, $subqueryStartAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_END, $subqueryEndAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_AND_LEXEME, $logicalOperatorAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_OR_LEXEME, $logicalOperatorAction); + $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NOT_LEXEME, $logicalOperatorAction); + + $this->addEntryAction(self::ST_OPENEDINT_RQ_FIRST_TERM, $openedRQFirstTermAction); + $this->addEntryAction(self::ST_OPENEDINT_RQ_LAST_TERM, $openedRQLastTermAction); + $this->addEntryAction(self::ST_CLOSEDINT_RQ_FIRST_TERM, $closedRQFirstTermAction); + $this->addEntryAction(self::ST_CLOSEDINT_RQ_LAST_TERM, $closedRQLastTermAction); + + + + $this->_lexer = new Zend_Search_Lucene_Search_QueryLexer(); + } + + + /** + * Set query string default encoding + * + * @param string $encoding + */ + public static function setDefaultEncoding($encoding) + { + if (self::$_instance === null) { + self::$_instance = new Zend_Search_Lucene_Search_QueryParser(); + } + + self::$_instance->_defaultEncoding = $encoding; + } + + /** + * Get query string default encoding + * + * @return string + */ + public static function getDefaultEncoding() + { + if (self::$_instance === null) { + self::$_instance = new Zend_Search_Lucene_Search_QueryParser(); + } + + return self::$_instance->_defaultEncoding; + } + + /** + * Set default boolean operator + * + * @param integer $operator + */ + public static function setDefaultOperator($operator) + { + if (self::$_instance === null) { + self::$_instance = new Zend_Search_Lucene_Search_QueryParser(); + } + + 
self::$_instance->_defaultOperator = $operator; + } + + /** + * Get default boolean operator + * + * @return integer + */ + public static function getDefaultOperator() + { + if (self::$_instance === null) { + self::$_instance = new Zend_Search_Lucene_Search_QueryParser(); + } + + return self::$_instance->_defaultOperator; + } + + /** + * Parses a query string * * @param string $strQuery + * @param string $encoding * @return Zend_Search_Lucene_Search_Query + * @throws Zend_Search_Lucene_Search_QueryParserException */ - static public function parse($strQuery) + public static function parse($strQuery, $encoding = null) { - $tokens = new Zend_Search_Lucene_Search_QueryTokenizer($strQuery); + if (self::$_instance === null) { + self::$_instance = new Zend_Search_Lucene_Search_QueryParser(); + } + + self::$_instance->_encoding = ($encoding !== null) ? $encoding : self::$_instance->_defaultEncoding; + self::$_instance->_lastToken = null; + self::$_instance->_context = new Zend_Search_Lucene_Search_QueryParserContext(self::$_instance->_encoding); + self::$_instance->_contextStack = array(); + self::$_instance->_tokens = self::$_instance->_lexer->tokenize($strQuery, self::$_instance->_encoding); // Empty query - if (!$tokens->count()) { - throw new Zend_Search_Lucene_Exception('Syntax error: query string cannot be empty.'); + if (count(self::$_instance->_tokens) == 0) { + return new Zend_Search_Lucene_Search_Query_Empty(); } - // Term query - if ($tokens->count() == 1) { - if ($tokens->current()->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) { - return new Zend_Search_Lucene_Search_Query_Term(new Zend_Search_Lucene_Index_Term($tokens->current()->text, 'contents')); - } else { - throw new Zend_Search_Lucene_Exception('Syntax error: query string must contain at least one word.'); - } - } + foreach (self::$_instance->_tokens as $token) { + try { + self::$_instance->_currentToken = $token; + self::$_instance->process($token->type); + + self::$_instance->_lastToken 
= $token; + } catch (Exception $e) { + if (strpos($e->getMessage(), 'There is no any rule for') !== false) { + throw new Zend_Search_Lucene_Search_QueryParserException( 'Syntax error at char position ' . $token->position . '.' ); + } - /** - * MultiTerm Query - * - * Process each token that was returned by the tokenizer. - */ - $terms = array(); - $signs = array(); - $prevToken = null; - $openBrackets = 0; - $field = 'contents'; - foreach ($tokens as $token) { - switch ($token->type) { - case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD: - $terms[] = new Zend_Search_Lucene_Index_Term($token->text, $field); - $field = 'contents'; - if ($prevToken !== null && - $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) { - if ($prevToken->text == "+") { - $signs[] = true; - } else { - $signs[] = false; - } - } else { - $signs[] = null; - } - break; - case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN: - if ($prevToken !== null && - $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) { - throw new Zend_Search_Lucene_Exception('Syntax error: sign operator must be followed by a word.'); - } - break; - case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD: - $field = $token->text; - // let previous token to be signed as next $prevToken - $token = $prevToken; - break; - case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET: - $token->text=='(' ? $openBrackets++ : $openBrackets--; + throw $e; } - $prevToken = $token; } - // Finish up parsing: check the last token in the query for an opening sign or parenthesis. - if ($prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) { - throw new Zend_Search_Lucene_Exception('Syntax Error: sign operator must be followed by a word.'); + if (count(self::$_instance->_contextStack) != 0) { + throw new Zend_Search_Lucene_Search_QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing.' 
); } - // Finish up parsing: check that every opening bracket has a matching closing bracket. - if ($openBrackets != 0) { - throw new Zend_Search_Lucene_Exception('Syntax Error: mismatched parentheses, every opening must have closing.'); + return self::$_instance->_context->getQuery(); + } + + + /********************************************************************* + * Actions implementation + * + * Actions affect on recognized lexemes list + *********************************************************************/ + + /** + * Add term to a query + */ + public function addTermEntry() + { + $entry = new Zend_Search_Lucene_Search_QueryEntry_Term($this->_currentToken->text, $this->_context->getField()); + $this->_context->addEntry($entry); + } + + /** + * Add phrase to a query + */ + public function addPhraseEntry() + { + $entry = new Zend_Search_Lucene_Search_QueryEntry_Phrase($this->_currentToken->text, $this->_context->getField()); + $this->_context->addEntry($entry); + } + + /** + * Set entry field + */ + public function setField() + { + $this->_context->setNextEntryField($this->_currentToken->text); + } + + /** + * Set entry sign + */ + public function setSign() + { + $this->_context->setNextEntrySign($this->_currentToken->type); + } + + + /** + * Process fuzzy search/proximity modifier - '~' + */ + public function processFuzzyProximityModifier() + { + $this->_context->processFuzzyProximityModifier(); + } + + /** + * Process modifier parameter + * + * @throws Zend_Search_Lucene_Exception + */ + public function processModifierParameter() + { + if ($this->_lastToken === null) { + throw new Zend_Search_Lucene_Search_QueryParserException('Lexeme modifier parameter must follow lexeme modifier. Char position 0.' 
); } - switch (count($terms)) { - case 0: - throw new Zend_Search_Lucene_Exception('Syntax error: bad term count.'); - case 1: - return new Zend_Search_Lucene_Search_Query_Term($terms[0],$signs[0] !== false); + switch ($this->_lastToken->type) { + case Zend_Search_Lucene_Search_QueryToken::TT_FUZZY_PROX_MARK: + $this->_context->processFuzzyProximityModifier($this->_currentToken->text); + break; + + case Zend_Search_Lucene_Search_QueryToken::TT_BOOSTING_MARK: + $this->_context->boost($this->_currentToken->text); + break; + default: - return new Zend_Search_Lucene_Search_Query_MultiTerm($terms,$signs); + // It's not a user input exception + throw new Zend_Search_Lucene_Exception('Lexeme modifier parameter must follow lexeme modifier. Char position ' . $this->_currentToken->position . '.' ); + } + } + + + /** + * Start subquery + */ + public function subqueryStart() + { + $this->_contextStack[] = $this->_context; + $this->_context = new Zend_Search_Lucene_Search_QueryParserContext($this->_encoding, $this->_context->getField()); + } + + /** + * End subquery + */ + public function subqueryEnd() + { + if (count($this->_contextStack) == 0) { + throw new Zend_Search_Lucene_Search_QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing. Char position ' . $this->_currentToken->position . '.' 
); } + + $query = $this->_context->getQuery(); + $this->_context = array_pop($this->_contextStack); + + $this->_context->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Subquery($query)); + } + + /** + * Process logical operator + */ + public function logicalOperator() + { + $this->_context->addLogicalOperator($this->_currentToken->type); + } + + /** + * Process first range query term (opened interval) + */ + public function openedRQFirstTerm() + { + $this->_rqFirstTerm = $this->_currentToken->text; + } + + /** + * Process last range query term (opened interval) + * + * @throws Zend_Search_Lucene_Search_QueryParserException + */ + public function openedRQLastTerm() + { + throw new Zend_Search_Lucene_Search_QueryParserException('Range queries are not supported yet.'); + + // $firstTerm = new Zend_Search_Lucene_Index_Term($this->_rqFirstTerm, $this->_context->getField()); + // $lastTerm = new Zend_Search_Lucene_Index_Term($this->_currentToken->text, $this->_context->getField()); + + // $query = new Zend_Search_Lucene_Search_Query_Range($firstTerm, $lastTerm, false); + // $this->_context->addentry($query); } + /** + * Process first range query term (closed interval) + */ + public function closedRQFirstTerm() + { + $this->_rqFirstTerm = $this->_currentToken->text; + } + + /** + * Process last range query term (closed interval) + * + * @throws Zend_Search_Lucene_Search_QueryParserException + */ + public function closedRQLastTerm() + { + throw new Zend_Search_Lucene_Search_QueryParserException('Range queries are not supported yet.'); + + // $firstTerm = new Zend_Search_Lucene_Index_Term($this->_rqFirstTerm, $this->_context->getField()); + // $lastTerm = new Zend_Search_Lucene_Index_Term($this->_currentToken->text, $this->_context->getField()); + + // $query = new Zend_Search_Lucene_Search_Query_Range($firstTerm, $lastTerm, true); + // $this->_context->addentry($query); + } } diff --git a/search/Zend/Search/Lucene/Search/QueryParserContext.php 
b/search/Zend/Search/Lucene/Search/QueryParserContext.php new file mode 100644 index 0000000000..9d172c69fa --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryParserContext.php @@ -0,0 +1,416 @@ +dirroot.'/search/Zend/Search/Lucene/FSM.php'; + + +/** Zend_Search_Lucene_Index_Term */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Term.php'; + +/** Zend_Search_Lucene_Search_QueryToken */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryToken.php'; + +/** Zend_Search_Lucene_Search_Query_Term */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Term.php'; + +/** Zend_Search_Lucene_Search_Query_MultiTerm */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/MultiTerm.php'; + +/** Zend_Search_Lucene_Search_Query_Boolean */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Boolean.php'; + +/** Zend_Search_Lucene_Search_Query_Phrase */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Phrase.php'; + +/** Zend_Search_Lucene_Exception */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Search_QueryParserException */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParserException.php'; + +/** Zend_Search_Lucene_Search_BooleanExpressionRecognizer */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/BooleanExpressionRecognizer.php'; + +/** Zend_Search_Lucene_Search_QueryEntry */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryEntry.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Search_QueryParserContext +{ + /** + * Default field for the context. 
+ * + * null means, that term should be searched through all fields + * Zend_Search_Lucene_Search_Query::rewriteQuery($index) translates such queries to several + * + * @var string|null + */ + private $_defaultField; + + /** + * Field specified for next entry + * + * @var string|null + */ + private $_nextEntryField = null; + + /** + * True means, that term is required. + * False means, that term is prohibited. + * null means, that term is neither prohibited, nor required + * + * @var boolean|null + */ + private $_nextEntrySign = null; + + + /** + * Entries grouping mode + */ + const GM_SIGNS = 0; // Signs mode: '+term1 term2 -term3 +(subquery1) -(subquery2)' + const GM_BOOLEAN = 1; // Boolean operators mode: 'term1 and term2 or (subquery1) and not (subquery2)' + + /** + * Grouping mode + * + * @var integer|null + */ + private $_mode = null; + + /** + * Entries signs. + * Used in GM_SIGNS grouping mode + * + * @var array + */ + private $_signs = array(); + + /** + * Query entries + * Each entry is a Zend_Search_Lucene_Search_QueryEntry object or + * boolean operator (Zend_Search_Lucene_Search_QueryToken class constant) + * + * @var array + */ + private $_entries = array(); + + /** + * Query string encoding + * + * @var string + */ + private $_encoding; + + + /** + * Context object constructor + * + * @param string $encoding + * @param string|null $defaultField + */ + public function __construct($encoding, $defaultField = null) + { + $this->_encoding = $encoding; + $this->_defaultField = $defaultField; + } + + + /** + * Get field for the next entry (the context default field if none was set) + * + * @return string|null + */ + public function getField() + { + return ($this->_nextEntryField !== null) ? 
$this->_nextEntryField : $this->_defaultField; + } + + /** + * Set field for next entry + * + * @param string $field + */ + public function setNextEntryField($field) + { + $this->_nextEntryField = $field; + } + + + /** + * Set sign for next entry + * + * @param integer $sign + * @throws Zend_Search_Lucene_Exception + */ + public function setNextEntrySign($sign) + { + if ($this->_mode === self::GM_BOOLEAN) { + throw new Zend_Search_Lucene_Search_QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.'); + } + + $this->_mode = self::GM_SIGNS; + + if ($sign == Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED) { + $this->_nextEntrySign = true; + } else if ($sign == Zend_Search_Lucene_Search_QueryToken::TT_PROHIBITED) { + $this->_nextEntrySign = false; + } else { + throw new Zend_Search_Lucene_Exception('Unrecognized sign type.'); + } + } + + + /** + * Add entry to a query + * + * @param Zend_Search_Lucene_Search_QueryEntry $entry + */ + public function addEntry(Zend_Search_Lucene_Search_QueryEntry $entry) + { + if ($this->_mode !== self::GM_BOOLEAN) { + $this->_signs[] = $this->_nextEntrySign; + } + + $this->_entries[] = $entry; + + $this->_nextEntryField = null; + $this->_nextEntrySign = null; + } + + + /** + * Process fuzzy search or proximity search modifier + * + * @throws Zend_Search_Lucene_Search_QueryParserException + */ + public function processFuzzyProximityModifier($parameter = null) + { + // Check, that modifier has came just after word or phrase + if ($this->_nextEntryField !== null || $this->_nextEntrySign !== null) { + throw new Zend_Search_Lucene_Search_QueryParserException('\'~\' modifier must follow word or phrase.'); + } + + $lastEntry = array_pop($this->_entries); + + if (!$lastEntry instanceof Zend_Search_Lucene_Search_QueryEntry) { + // there are no entries or last entry is boolean operator + throw new Zend_Search_Lucene_Search_QueryParserException('\'~\' modifier must follow word or phrase.'); + } + + 
$lastEntry->processFuzzyProximityModifier($parameter); + + $this->_entries[] = $lastEntry; + } + + /** + * Set boost factor to the entry + * + * @param float $boostFactor + */ + public function boost($boostFactor) + { + // Check, that modifier has came just after word or phrase + if ($this->_nextEntryField !== null || $this->_nextEntrySign !== null) { + throw new Zend_Search_Lucene_Search_QueryParserException('\'^\' modifier must follow word, phrase or subquery.'); + } + + $lastEntry = array_pop($this->_entries); + + if (!$lastEntry instanceof Zend_Search_Lucene_Search_QueryEntry) { + // there are no entries or last entry is boolean operator + throw new Zend_Search_Lucene_Search_QueryParserException('\'^\' modifier must follow word, phrase or subquery.'); + } + + $lastEntry->boost($boostFactor); + + $this->_entries[] = $lastEntry; + } + + /** + * Process logical operator + * + * @param integer $operator + */ + public function addLogicalOperator($operator) + { + if ($this->_mode === self::GM_SIGNS) { + throw new Zend_Search_Lucene_Search_QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.'); + } + + $this->_mode = self::GM_BOOLEAN; + + $this->_entries[] = $operator; + } + + + /** + * Generate 'signs style' query from the context + * '+term1 term2 -term3 +() ...' + * + * @return Zend_Search_Lucene_Search_Query + */ + public function _signStyleExpressionQuery() + { + $query = new Zend_Search_Lucene_Search_Query_Boolean(); + + if (Zend_Search_Lucene_Search_QueryParser::getDefaultOperator() == Zend_Search_Lucene_Search_QueryParser::B_AND) { + $defaultSign = true; // required + } else { + // Zend_Search_Lucene_Search_QueryParser::B_OR + $defaultSign = null; // optional + } + + foreach ($this->_entries as $entryId => $entry) { + $sign = ($this->_signs[$entryId] !== null) ? 
$this->_signs[$entryId] : $defaultSign; + $query->addSubquery($entry->getQuery($this->_encoding), $sign); + } + + return $query; + } + + + /** + * Generate 'boolean style' query from the context + * 'term1 and term2 or term3 and () and not ()' + * + * @return Zend_Search_Lucene_Search_Query + * @throws Zend_Search_Lucene_Exception + */ + private function _booleanExpressionQuery() + { + /** + * We treat each level of an expression as a boolean expression in + * a Disjunctive Normal Form + * + * AND operator has higher precedence than OR + * + * Thus logical query is a disjunction of one or more conjunctions of + * one or more query entries + */ + + $expressionRecognizer = new Zend_Search_Lucene_Search_BooleanExpressionRecognizer(); + + try { + foreach ($this->_entries as $entry) { + if ($entry instanceof Zend_Search_Lucene_Search_QueryEntry) { + $expressionRecognizer->processLiteral($entry); + } else { + switch ($entry) { + case Zend_Search_Lucene_Search_QueryToken::TT_AND_LEXEME: + $expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_AND_OPERATOR); + break; + + case Zend_Search_Lucene_Search_QueryToken::TT_OR_LEXEME: + $expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_OR_OPERATOR); + break; + + case Zend_Search_Lucene_Search_QueryToken::TT_NOT_LEXEME: + $expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_NOT_OPERATOR); + break; + + default: + throw new Zend_Search_Lucene_Exception('Boolean expression error. Unknown operator type.'); + } + } + } + + $conjuctions = $expressionRecognizer->finishExpression(); + } catch (Zend_Search_Exception $e) { + // throw new Zend_Search_Lucene_Search_QueryParserException('Boolean expression error. Error message: \'' . + // $e->getMessage() . '\'.' ); + // It's query syntax error message and it should be user friendly. 
So FSM message is omitted + throw new Zend_Search_Lucene_Search_QueryParserException('Boolean expression error.'); + } + + // Remove 'only negative' conjunctions + foreach ($conjuctions as $conjuctionId => $conjuction) { + $nonNegativeEntryFound = false; + + foreach ($conjuction as $conjuctionEntry) { + if ($conjuctionEntry[1]) { + $nonNegativeEntryFound = true; + break; + } + } + + if (!$nonNegativeEntryFound) { + unset($conjuctions[$conjuctionId]); + } + } + + + $subqueries = array(); + foreach ($conjuctions as $conjuction) { + // Check, if it's a one term conjuction + if (count($conjuction) == 1) { + $subqueries[] = $conjuction[0][0]->getQuery($this->_encoding); + } else { + $subquery = new Zend_Search_Lucene_Search_Query_Boolean(); + + foreach ($conjuction as $conjuctionEntry) { + $subquery->addSubquery($conjuctionEntry[0]->getQuery($this->_encoding), $conjuctionEntry[1]); + } + + $subqueries[] = $subquery; + } + } + + if (count($subqueries) == 0) { + return new Zend_Search_Lucene_Search_Query_Empty(); + } + + if (count($subqueries) == 1) { + return $subqueries[0]; + } + + + $query = new Zend_Search_Lucene_Search_Query_Boolean(); + + foreach ($subqueries as $subquery) { + // Non-requirered entry/subquery + $query->addSubquery($subquery); + } + + return $query; + } + + /** + * Generate query from current context + * + * @return Zend_Search_Lucene_Search_Query + */ + public function getQuery() + { + if ($this->_mode === self::GM_BOOLEAN) { + return $this->_booleanExpressionQuery(); + } else { + return $this->_signStyleExpressionQuery(); + } + } +} diff --git a/search/Zend/Search/Lucene/Search/QueryParserException.php b/search/Zend/Search/Lucene/Search/QueryParserException.php new file mode 100644 index 0000000000..8ca791ffb7 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/QueryParserException.php @@ -0,0 +1,40 @@ +dirroot.'/search/Zend/Search/Lucene/Exception.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * 
@copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + * + * Special exception type, which may be used to intercept wrong user input + */ +class Zend_Search_Lucene_Search_QueryParserException extends Zend_Search_Lucene_Exception +{} + diff --git a/search/Zend/Search/Lucene/Search/QueryToken.php b/search/Zend/Search/Lucene/Search/QueryToken.php index 56d3522c71..cf153096d2 100644 --- a/search/Zend/Search/Lucene/Search/QueryToken.php +++ b/search/Zend/Search/Lucene/Search/QueryToken.php @@ -15,46 +15,86 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Exception */ -require_once 'Zend/Search/Lucene/Exception.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_QueryToken { /** - * Token type Word. + * Token types. 
*/ - const TOKTYPE_WORD = 0; + const TT_WORD = 0; // Word + const TT_PHRASE = 1; // Phrase (one or several quoted words) + const TT_FIELD = 2; // Field name in 'field:word', field: or field:() pairs + const TT_FIELD_INDICATOR = 3; // ':' + const TT_REQUIRED = 4; // '+' + const TT_PROHIBITED = 5; // '-' + const TT_FUZZY_PROX_MARK = 6; // '~' + const TT_BOOSTING_MARK = 7; // '^' + const TT_RANGE_INCL_START = 8; // '[' + const TT_RANGE_INCL_END = 9; // ']' + const TT_RANGE_EXCL_START = 10; // '{' + const TT_RANGE_EXCL_END = 11; // '}' + const TT_SUBQUERY_START = 12; // '(' + const TT_SUBQUERY_END = 13; // ')' + const TT_AND_LEXEME = 14; // 'AND' or 'and' + const TT_OR_LEXEME = 15; // 'OR' or 'or' + const TT_NOT_LEXEME = 16; // 'NOT' or 'not' + const TT_TO_LEXEME = 17; // 'TO' or 'to' + const TT_NUMBER = 18; // Number, like: 10, 0.8, .64, .... - /** - * Token type Field. - * Field indicator in 'field:word' pair - */ - const TOKTYPE_FIELD = 1; /** - * Token type Sign. - * '+' (required) or '-' (absentee) sign + * Returns all possible lexeme types. + * It's used for syntax analyzer state machine initialization + * + * @return array */ - const TOKTYPE_SIGN = 2; + public static function getTypes() + { + return array( self::TT_WORD, + self::TT_PHRASE, + self::TT_FIELD, + self::TT_FIELD_INDICATOR, + self::TT_REQUIRED, + self::TT_PROHIBITED, + self::TT_FUZZY_PROX_MARK, + self::TT_BOOSTING_MARK, + self::TT_RANGE_INCL_START, + self::TT_RANGE_INCL_END, + self::TT_RANGE_EXCL_START, + self::TT_RANGE_EXCL_END, + self::TT_SUBQUERY_START, + self::TT_SUBQUERY_END, + self::TT_AND_LEXEME, + self::TT_OR_LEXEME, + self::TT_NOT_LEXEME, + self::TT_TO_LEXEME, + self::TT_NUMBER + ); + } + /** - * Token type Bracket. - * '(' or ')' + * TokenCategories */ - const TOKTYPE_BRACKET = 3; + const TC_WORD = 0; // Word + const TC_PHRASE = 1; // Phrase (one or several quoted words) + const TC_NUMBER = 2; // Nubers, which are used with syntax elements. Ex. 
roam~0.8 + const TC_SYNTAX_ELEMENT = 3; // + - ( ) [ ] { } ! || && ~ ^ /** @@ -71,34 +111,118 @@ class Zend_Search_Lucene_Search_QueryToken */ public $text; + /** + * Token position within query. + * + * @var integer + */ + public $position; + /** * IndexReader constructor needs token type and token text as a parameters. * - * @param $tokType integer - * @param $tokText string + * @param integer $tokenCategory + * @param string $tokText + * @param integer $position */ - public function __construct($tokType, $tokText) + public function __construct($tokenCategory, $tokenText, $position) { - switch ($tokType) { - case self::TOKTYPE_BRACKET: - // fall through to the next case - case self::TOKTYPE_FIELD: - // fall through to the next case - case self::TOKTYPE_SIGN: - // fall through to the next case - case self::TOKTYPE_WORD: + $this->text = $tokenText; + $this->position = $position + 1; // Start from 1 + + switch ($tokenCategory) { + case self::TC_WORD: + if ( strtolower($tokenText) == 'and') { + $this->type = self::TT_AND_LEXEME; + } else if (strtolower($tokenText) == 'or') { + $this->type = self::TT_OR_LEXEME; + } else if (strtolower($tokenText) == 'not') { + $this->type = self::TT_NOT_LEXEME; + } else if (strtolower($tokenText) == 'to') { + $this->type = self::TT_TO_LEXEME; + } else { + $this->type = self::TT_WORD; + } break; - default: - throw new Zend_Search_Lucene_Exception("Unrecognized token type \"$tokType\"."); - } - if (!strlen($tokText)) { - throw new Zend_Search_Lucene_Exception('Token text must be supplied.'); - } + case self::TC_PHRASE: + $this->type = self::TT_PHRASE; + break; + + case self::TC_NUMBER: + $this->type = self::TT_NUMBER; + break; + + case self::TC_SYNTAX_ELEMENT: + switch ($tokenText) { + case ':': + $this->type = self::TT_FIELD_INDICATOR; + break; - $this->type = $tokType; - $this->text = $tokText; + case '+': + $this->type = self::TT_REQUIRED; + break; + + case '-': + $this->type = self::TT_PROHIBITED; + break; + + case '~': + 
$this->type = self::TT_FUZZY_PROX_MARK; + break; + + case '^': + $this->type = self::TT_BOOSTING_MARK; + break; + + case '[': + $this->type = self::TT_RANGE_INCL_START; + break; + + case ']': + $this->type = self::TT_RANGE_INCL_END; + break; + + case '{': + $this->type = self::TT_RANGE_EXCL_START; + break; + + case '}': + $this->type = self::TT_RANGE_EXCL_END; + break; + + case '(': + $this->type = self::TT_SUBQUERY_START; + break; + + case ')': + $this->type = self::TT_SUBQUERY_END; + break; + + case '!': + $this->type = self::TT_NOT_LEXEME; + break; + + case '&&': + $this->type = self::TT_AND_LEXEME; + break; + + case '||': + $this->type = self::TT_OR_LEXEME; + break; + + default: + throw new Zend_Search_Lucene_Exception('Unrecognized query syntax lexeme: \'' . $tokenText . '\''); + } + break; + + case self::TC_NUMBER: + $this->type = self::TT_NUMBER; + + default: + throw new Zend_Search_Lucene_Exception('Unrecognized lexeme type: \'' . $tokenCategory . '\''); + } } } diff --git a/search/Zend/Search/Lucene/Search/QueryTokenizer.php b/search/Zend/Search/Lucene/Search/QueryTokenizer.php index 4fe870bedc..2965eaa753 100644 --- a/search/Zend/Search/Lucene/Search/QueryTokenizer.php +++ b/search/Zend/Search/Lucene/Search/QueryTokenizer.php @@ -21,10 +21,10 @@ /** Zend_Search_Lucene_Search_QueryToken */ -require_once 'Zend/Search/Lucene/Search/QueryToken.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryToken.php'; /** Zend_Search_Lucene_Exception */ -require_once 'Zend/Search/Lucene/Exception.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; /** diff --git a/search/Zend/Search/Lucene/Search/Similarity.php b/search/Zend/Search/Lucene/Search/Similarity.php index 74ecb1dda6..016d232a24 100644 --- a/search/Zend/Search/Lucene/Search/Similarity.php +++ b/search/Zend/Search/Lucene/Search/Similarity.php @@ -15,20 +15,20 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 
Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Search_Similarity_Default */ -require_once 'Zend/Search/Lucene/Search/Similarity/Default.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Similarity/Default.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ abstract class Zend_Search_Lucene_Search_Similarity @@ -38,7 +38,7 @@ abstract class Zend_Search_Lucene_Search_Similarity * * @var Zend_Search_Lucene_Search_Similarity */ - static private $_defaultImpl; + private static $_defaultImpl; /** * Cache of decoded bytes. @@ -46,7 +46,7 @@ abstract class Zend_Search_Lucene_Search_Similarity * * @var array */ - static private $_normTable = array( 0 => 0.0, + private static $_normTable = array( 0 => 0.0, 1 => 5.820766E-10, 2 => 6.9849193E-10, 3 => 8.1490725E-10, @@ -310,7 +310,7 @@ abstract class Zend_Search_Lucene_Search_Similarity * * @param Zend_Search_Lucene_Search_Similarity $similarity */ - static public function setDefault(Zend_Search_Lucene_Search_Similarity $similarity) + public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity) { self::$_defaultImpl = $similarity; } @@ -322,7 +322,7 @@ abstract class Zend_Search_Lucene_Search_Similarity * * @return Zend_Search_Lucene_Search_Similarity */ - static public function getDefault() + public static function getDefault() { if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) { self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default(); @@ -381,7 +381,7 @@ abstract class 
Zend_Search_Lucene_Search_Similarity * @param integer $byte * @return float */ - static public function decodeNorm($byte) + public static function decodeNorm($byte) { return self::$_normTable[$byte & 0xFF]; } @@ -412,7 +412,7 @@ abstract class Zend_Search_Lucene_Search_Similarity * @param integer $b * @return float */ - static private function _floatToByte($f) + private static function _floatToByte($f) { // round negatives up to zero if ($f <= 0.0) { @@ -495,10 +495,10 @@ abstract class Zend_Search_Lucene_Search_Similarity * Returns a score factor for the term * * @param mixed $input - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader * @return a score factor for the term */ - public function idf($input, $reader) + public function idf($input, Zend_Search_Lucene_Interface $reader) { if (!is_array($input)) { return $this->idfFreq($reader->docFreq($input), $reader->count()); diff --git a/search/Zend/Search/Lucene/Search/Similarity/Default.php b/search/Zend/Search/Lucene/Search/Similarity/Default.php index 6cafb59668..8263f2a72f 100644 --- a/search/Zend/Search/Lucene/Search/Similarity/Default.php +++ b/search/Zend/Search/Lucene/Search/Similarity/Default.php @@ -15,16 +15,20 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ +/** Zend_Search_Lucene_Search_Similarity */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Similarity.php'; + + /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_Similarity_Default extends Zend_Search_Lucene_Search_Similarity diff --git a/search/Zend/Search/Lucene/Search/Weight.php b/search/Zend/Search/Lucene/Search/Weight.php index 248f5cb2b4..2faba2861b 100644 --- a/search/Zend/Search/Lucene/Search/Weight.php +++ b/search/Zend/Search/Lucene/Search/Weight.php @@ -15,7 +15,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -32,17 +32,40 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ abstract class Zend_Search_Lucene_Search_Weight { + /** + * Normalization factor. + * This value is stored only for query expanation purpose and not used in any other place + * + * @var float + */ + protected $_queryNorm; + + /** + * Weight value + * + * Weight value may be initialized in sumOfSquaredWeights() or normalize() + * because they both are invoked either in Query::_initWeight (for top-level query) or + * in corresponding methods of parent query's weights + * + * @var float + */ + protected $_value; + + /** * The weight for this query. * * @return float */ - abstract public function getValue(); + public function getValue() + { + return $this->_value; + } /** * The sum of squared weights of contained query clauses. 
diff --git a/search/Zend/Search/Lucene/Search/Weight/Boolean.php b/search/Zend/Search/Lucene/Search/Weight/Boolean.php new file mode 100644 index 0000000000..7a42ed2826 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/Weight/Boolean.php @@ -0,0 +1,136 @@ +dirroot.'/search/Zend/Search/Lucene/Search/Weight.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Search_Weight_Boolean extends Zend_Search_Lucene_Search_Weight +{ + /** + * IndexReader. + * + * @var Zend_Search_Lucene_Interface + */ + private $_reader; + + /** + * The query that this concerns. + * + * @var Zend_Search_Lucene_Search_Query + */ + private $_query; + + /** + * Queries weights + * Array of Zend_Search_Lucene_Search_Weight + * + * @var array + */ + private $_weights; + + + /** + * Zend_Search_Lucene_Search_Weight_Boolean constructor + * query - the query that this concerns. + * reader - index reader + * + * @param Zend_Search_Lucene_Search_Query $query + * @param Zend_Search_Lucene_Interface $reader + */ + public function __construct(Zend_Search_Lucene_Search_Query $query, + Zend_Search_Lucene_Interface $reader) + { + $this->_query = $query; + $this->_reader = $reader; + $this->_weights = array(); + + $signs = $query->getSigns(); + + foreach ($query->getSubqueries() as $num => $subquery) { + if ($signs === null || $signs[$num] === null || $signs[$num]) { + $this->_weights[$num] = $subquery->createWeight($reader); + } + } + } + + + /** + * The weight for this query + * Standard Weight::$_value is not used for boolean queries + * + * @return float + */ + public function getValue() + { + return $this->_query->getBoost(); + } + + + /** + * The sum of squared weights of contained query clauses. 
+ * + * @return float + */ + public function sumOfSquaredWeights() + { + $sum = 0; + foreach ($this->_weights as $weight) { + // sum sub weights + $sum += $weight->sumOfSquaredWeights(); + } + + // boost each sub-weight + $sum *= $this->_query->getBoost() * $this->_query->getBoost(); + + // check for empty query (like '-something -another') + if ($sum == 0) { + $sum = 1.0; + } + return $sum; + } + + + /** + * Assigns the query normalization factor to this. + * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + // incorporate boost + $queryNorm *= $this->_query->getBoost(); + + foreach ($this->_weights as $weight) { + $weight->normalize($queryNorm); + } + } +} + + diff --git a/search/Zend/Search/Lucene/Search/Weight/Empty.php b/search/Zend/Search/Lucene/Search/Weight/Empty.php new file mode 100644 index 0000000000..00c1bce451 --- /dev/null +++ b/search/Zend/Search/Lucene/Search/Weight/Empty.php @@ -0,0 +1,56 @@ +dirroot.'/search/Zend/Search/Lucene/Search/Weight.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ +class Zend_Search_Lucene_Search_Weight_Empty extends Zend_Search_Lucene_Search_Weight +{ + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + public function sumOfSquaredWeights() + { + return 1; + } + + + /** + * Assigns the query normalization factor to this. 
+ * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + } +} + diff --git a/search/Zend/Search/Lucene/Search/Weight/MultiTerm.php b/search/Zend/Search/Lucene/Search/Weight/MultiTerm.php index 448bb064eb..122cfa4035 100644 --- a/search/Zend/Search/Lucene/Search/Weight/MultiTerm.php +++ b/search/Zend/Search/Lucene/Search/Weight/MultiTerm.php @@ -15,20 +15,20 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Search_Weight */ -require_once 'Zend/Search/Lucene/Search/Weight.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Weight.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Search_Weight @@ -36,14 +36,14 @@ class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Sear /** * IndexReader. * - * @var Zend_Search_Lucene + * @var Zend_Search_Lucene_Interface */ private $_reader; /** * The query that this concerns. * - * @var Zend_Search_Lucene_Search_Query_MultiTerm + * @var Zend_Search_Lucene_Search_Query */ private $_query; @@ -61,10 +61,11 @@ class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Sear * query - the query that this concerns. 
* reader - index reader * - * @param Zend_Search_Lucene_Search_Query_MultiTerm $query - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Search_Query $query + * @param Zend_Search_Lucene_Interface $reader */ - public function __construct($query, $reader) + public function __construct(Zend_Search_Lucene_Search_Query $query, + Zend_Search_Lucene_Interface $reader) { $this->_query = $query; $this->_reader = $reader; @@ -72,10 +73,10 @@ class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Sear $signs = $query->getSigns(); - foreach ($query->getTerms() as $num => $term) { - if ($signs === null || $signs[$num] === null || $signs[$num]) { - $this->_weights[$num] = new Zend_Search_Lucene_Search_Weight_Term($term, $query, $reader); - $query->setWeight($num, $this->_weights[$num]); + foreach ($query->getTerms() as $id => $term) { + if ($signs === null || $signs[$id] === null || $signs[$id]) { + $this->_weights[$id] = new Zend_Search_Lucene_Search_Weight_Term($term, $query, $reader); + $query->setWeight($id, $this->_weights[$id]); } } } @@ -83,6 +84,7 @@ class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Sear /** * The weight for this query + * Standard Weight::$_value is not used for boolean queries * * @return float */ diff --git a/search/Zend/Search/Lucene/Search/Weight/Phrase.php b/search/Zend/Search/Lucene/Search/Weight/Phrase.php index 536659614c..7faaa7ad13 100644 --- a/search/Zend/Search/Lucene/Search/Weight/Phrase.php +++ b/search/Zend/Search/Lucene/Search/Weight/Phrase.php @@ -15,7 +15,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -23,14 +23,14 @@ /** * Zend_Search_Lucene_Search_Weight */ -require_once 'Zend/Search/Lucene/Search/Weight.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Weight.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_Weight_Phrase extends Zend_Search_Lucene_Search_Weight @@ -38,7 +38,7 @@ class Zend_Search_Lucene_Search_Weight_Phrase extends Zend_Search_Lucene_Search_ /** * IndexReader. * - * @var Zend_Search_Lucene + * @var Zend_Search_Lucene_Interface */ private $_reader; @@ -49,13 +49,6 @@ class Zend_Search_Lucene_Search_Weight_Phrase extends Zend_Search_Lucene_Search_ */ private $_query; - /** - * Weight value - * - * @var float - */ - private $_value; - /** * Score factor * @@ -63,46 +56,19 @@ class Zend_Search_Lucene_Search_Weight_Phrase extends Zend_Search_Lucene_Search_ */ private $_idf; - /** - * Normalization factor - * - * @var float - */ - private $_queryNorm; - - - /** - * Query weight - * - * @var float - */ - private $_queryWeight; - - /** * Zend_Search_Lucene_Search_Weight_Phrase constructor * * @param Zend_Search_Lucene_Search_Query_Phrase $query - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Interface $reader */ - public function __construct(Zend_Search_Lucene_Search_Query_Phrase $query, Zend_Search_Lucene $reader) + public function __construct(Zend_Search_Lucene_Search_Query_Phrase $query, + Zend_Search_Lucene_Interface $reader) { $this->_query = $query; $this->_reader = $reader; } - - /** - * The weight for this query - * - * @return float - */ - public function getValue() - { - return $this->_value; - } - - /** * 
The sum of squared weights of contained query clauses. * diff --git a/search/Zend/Search/Lucene/Search/Weight/Term.php b/search/Zend/Search/Lucene/Search/Weight/Term.php index d502896a5b..478cb2a805 100644 --- a/search/Zend/Search/Lucene/Search/Weight/Term.php +++ b/search/Zend/Search/Lucene/Search/Weight/Term.php @@ -15,20 +15,20 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Search_Weight */ -require_once 'Zend/Search/Lucene/Search/Weight.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Weight.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_Weight @@ -36,7 +36,7 @@ class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_We /** * IndexReader. 
* - * @var Zend_Search_Lucene + * @var Zend_Search_Lucene_Interface */ private $_reader; @@ -54,13 +54,6 @@ class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_We */ private $_query; - /** - * Weight value - * - * @var float - */ - private $_value; - /** * Score factor * @@ -68,14 +61,6 @@ class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_We */ private $_idf; - /** - * Normalization factor - * - * @var float - */ - private $_queryNorm; - - /** * Query weight * @@ -88,9 +73,13 @@ class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_We * Zend_Search_Lucene_Search_Weight_Term constructor * reader - index reader * - * @param Zend_Search_Lucene $reader + * @param Zend_Search_Lucene_Index_Term $term + * @param Zend_Search_Lucene_Search_Query $query + * @param Zend_Search_Lucene_Interface $reader */ - public function __construct($term, $query, $reader) + public function __construct(Zend_Search_Lucene_Index_Term $term, + Zend_Search_Lucene_Search_Query $query, + Zend_Search_Lucene_Interface $reader) { $this->_term = $term; $this->_query = $query; @@ -98,17 +87,6 @@ class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_We } - /** - * The weight for this query - * - * @return float - */ - public function getValue() - { - return $this->_value; - } - - /** * The sum of squared weights of contained query clauses. * diff --git a/search/Zend/Search/Lucene/Storage/Directory.php b/search/Zend/Search/Lucene/Storage/Directory.php index 01ea380e3c..67e2b295f5 100644 --- a/search/Zend/Search/Lucene/Storage/Directory.php +++ b/search/Zend/Search/Lucene/Storage/Directory.php @@ -15,7 +15,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Storage - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -24,7 +24,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Storage - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ abstract class Zend_Search_Lucene_Storage_Directory @@ -111,10 +111,16 @@ abstract class Zend_Search_Lucene_Storage_Directory /** * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory. * + * If $shareHandler option is true, then file handler can be shared between File Object + * requests. It speed-ups performance, but makes problems with file position. + * Shared handler are good for short atomic requests. + * Non-shared handlers are useful for stream file reading (especial for compound files). + * * @param string $filename + * @param boolean $shareHandler * @return Zend_Search_Lucene_Storage_File */ - abstract public function getFileObject($filename); + abstract public function getFileObject($filename, $shareHandler = true); } diff --git a/search/Zend/Search/Lucene/Storage/Directory/Filesystem.php b/search/Zend/Search/Lucene/Storage/Directory/Filesystem.php index 8d675c35b5..7ac76810ab 100644 --- a/search/Zend/Search/Lucene/Storage/Directory/Filesystem.php +++ b/search/Zend/Search/Lucene/Storage/Directory/Filesystem.php @@ -15,16 +15,16 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Storage - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Storage_Directory */ -require_once 'Zend/Search/Lucene/Storage/Directory.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/Directory.php'; /** Zend_Search_Lucene_Storage_File_Filesystem */ -require_once 'Zend/Search/Lucene/Storage/File/Filesystem.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/File/Filesystem.php'; /** @@ -33,7 +33,7 @@ require_once 'Zend/Search/Lucene/Storage/File/Filesystem.php'; * @category Zend * @package Zend_Search_Lucene * @subpackage Storage - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene_Storage_Directory @@ -64,7 +64,7 @@ class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene * @return boolean */ - static public function mkdirs($dir, $mode = 0777, $recursive = true) + public static function mkdirs($dir, $mode = 0777, $recursive = true) { if (is_null($dir) || $dir === '') { return false; @@ -113,7 +113,7 @@ class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene $fileObject->close(); } - unset($this->_fileHandlers); + $this->_fileHandlers = array(); } @@ -127,15 +127,14 @@ class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene $result = array(); $dirContent = opendir( $this->_dirPath ); - while ($file = readdir($dirContent)) { + while (($file = readdir($dirContent)) !== false) { if (($file == '..')||($file == '.')) continue; - $fullName = $this->_dirPath . '/' . $file; - if( !is_dir($this->_dirPath . '/' . 
$file) ) { $result[] = $file; } } + closedir($dirContent); return $result; } @@ -165,11 +164,17 @@ class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene */ public function deleteFile($filename) { + /** + * @todo add support of "deletable" file + * "deletable" is used on Windows systems if file can't be deleted + * (while it is still open). + */ + if (isset($this->_fileHandlers[$filename])) { $this->_fileHandlers[$filename]->close(); } unset($this->_fileHandlers[$filename]); - unlink($this->_dirPath .'/'. $filename); + unlink($this->_dirPath . '/' . $filename); } @@ -219,24 +224,40 @@ class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene * @param string $from * @param string $to * @return void + * @throws Zend_Search_Lucene_Exception */ public function renameFile($from, $to) { - if ($this->_fileHandlers[$from] !== null) { + global $php_errormsg; + + if (isset($this->_fileHandlers[$from])) { $this->_fileHandlers[$from]->close(); } unset($this->_fileHandlers[$from]); - if ($this->_fileHandlers[$to] !== null) { + if (isset($this->_fileHandlers[$to])) { $this->_fileHandlers[$to]->close(); } unset($this->_fileHandlers[$to]); if (file_exists($this->_dirPath . '/' . $to)) { - unlink($this->_dirPath . '/' . $to); + if (!unlink($this->_dirPath . '/' . $to)) { + throw new Zend_Search_Lucene_Exception('Delete operation failed'); + } + } + + $trackErrors = ini_get('track_errors'); + ini_set('track_errors', '1'); + + $success = @rename($this->_dirPath . '/' . $from, $this->_dirPath . '/' . $to); + if (!$success) { + ini_set('track_errors', $trackErrors); + throw new Zend_Search_Lucene_Exception($php_errormsg); } - return @rename($this->_dirPath . '/' . $from, $this->_dirPath . '/' . 
$to); + ini_set('track_errors', $trackErrors); + + return $success; } @@ -255,17 +276,29 @@ class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene /** * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory. * + * If $shareHandler option is true, then file handler can be shared between File Object + * requests. It speed-ups performance, but makes problems with file position. + * Shared handler are good for short atomic requests. + * Non-shared handlers are useful for stream file reading (especial for compound files). + * * @param string $filename + * @param boolean $shareHandler * @return Zend_Search_Lucene_Storage_File */ - public function getFileObject($filename) + public function getFileObject($filename, $shareHandler = true) { + $fullFilename = $this->_dirPath . '/' . $filename; + + if (!$shareHandler) { + return new Zend_Search_Lucene_Storage_File_Filesystem($fullFilename); + } + if (isset( $this->_fileHandlers[$filename] )) { $this->_fileHandlers[$filename]->seek(0); return $this->_fileHandlers[$filename]; } - $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename); + $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($fullFilename); return $this->_fileHandlers[$filename]; } } diff --git a/search/Zend/Search/Lucene/Storage/File.php b/search/Zend/Search/Lucene/Storage/File.php index 5a195ae85f..8370c719db 100644 --- a/search/Zend/Search/Lucene/Storage/File.php +++ b/search/Zend/Search/Lucene/Storage/File.php @@ -15,21 +15,21 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Storage - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Exception */ -require_once 'Zend/Search/Lucene/Exception.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Storage - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ abstract class Zend_Search_Lucene_Storage_File @@ -69,6 +69,15 @@ abstract class Zend_Search_Lucene_Storage_File */ abstract public function tell(); + /** + * Flush output. + * + * Returns true on success or false on failure. + * + * @return boolean + */ + abstract public function flush(); + /** * Writes $length number of bytes (all, if $length===null) to the end * of the file. @@ -78,6 +87,20 @@ abstract class Zend_Search_Lucene_Storage_File */ abstract protected function _fwrite($data, $length=null); + /** + * Lock file + * + * Lock type may be a LOCK_SH (shared lock) or a LOCK_EX (exclusive lock) + * + * @param integer $lockType + * @return boolean + */ + abstract public function lock($lockType, $nonBlockinLock = false); + + /** + * Unlock file + */ + abstract public function unlock(); /** * Reads a byte from the current position in the file @@ -401,4 +424,4 @@ abstract class Zend_Search_Lucene_Storage_File { return $this->_fread($this->readVInt()); } -} \ No newline at end of file +} diff --git a/search/Zend/Search/Lucene/Storage/File/Filesystem.php b/search/Zend/Search/Lucene/Storage/File/Filesystem.php index 7c33543dd6..1f2097eb4f 100644 --- a/search/Zend/Search/Lucene/Storage/File/Filesystem.php +++ b/search/Zend/Search/Lucene/Storage/File/Filesystem.php @@ -15,23 +15,23 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Storage - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. 
(http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Storage_File */ -require_once 'Zend/Search/Lucene/Storage/File.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/File.php'; /** Zend_Search_Lucene_Exception */ -require_once 'Zend/Search/Lucene/Exception.php'; +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Storage - * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Storage_File_Filesystem extends Zend_Search_Lucene_Storage_File @@ -53,12 +53,12 @@ class Zend_Search_Lucene_Storage_File_Filesystem extends Zend_Search_Lucene_Stor { global $php_errormsg; - $trackErrors = ini_get( "track_errors"); + $trackErrors = ini_get('track_errors'); ini_set('track_errors', '1'); $this->_fileHandle = @fopen($filename, $mode); - if ($this->_fileHandle===false) { + if ($this->_fileHandle === false) { ini_set('track_errors', $trackErrors); throw new Zend_Search_Lucene_Exception($php_errormsg); } @@ -100,6 +100,17 @@ class Zend_Search_Lucene_Storage_File_Filesystem extends Zend_Search_Lucene_Stor return ftell($this->_fileHandle); } + /** + * Flush output. + * + * Returns true on success or false on failure. 
+ * + * @return boolean + */ + public function flush() + { + return fflush($this->_fileHandle); + } /** * Close File object @@ -167,5 +178,39 @@ class Zend_Search_Lucene_Storage_File_Filesystem extends Zend_Search_Lucene_Stor fwrite($this->_fileHandle, $data, $length); } } + + /** + * Lock file + * + * Lock type may be a LOCK_SH (shared lock) or a LOCK_EX (exclusive lock) + * + * @param integer $lockType + * @param boolean $nonBlockinLock + * @return boolean + */ + public function lock($lockType, $nonBlockinLock = false) + { + if ($nonBlockinLock) { + return flock($this->_fileHandle, $lockType | LOCK_NB); + } else { + return flock($this->_fileHandle, $lockType); + } + } + + /** + * Unlock file + * + * Returns true on success + * + * @return boolean + */ + public function unlock() + { + if ($this->_fileHandle !== null ) { + return flock($this->_fileHandle, LOCK_UN); + } else { + return true; + } + } } diff --git a/search/Zend/Search/Lucene/Storage/File/Memory.php b/search/Zend/Search/Lucene/Storage/File/Memory.php new file mode 100644 index 0000000000..e3567c66da --- /dev/null +++ b/search/Zend/Search/Lucene/Storage/File/Memory.php @@ -0,0 +1,555 @@ +dirroot.'/search/Zend/Search/Lucene/Storage/File.php'; + +/** Zend_Search_Lucene_Exception */ +require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; + + +/** + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Storage + * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Storage_File_Memory extends Zend_Search_Lucene_Storage_File
+{
+    /**
+     * File contents, kept as one raw byte string.
+     *
+     * @var string
+     */
+    private $_data;
+
+    /**
+     * Current file position (byte offset into $_data).
+     *
+     * @var integer
+     */
+    private $_position = 0;
+
+
+    /**
+     * Object constructor
+     *
+     * @param string $data  initial contents of the in-memory file
+     */
+    public function __construct($data)
+    {
+        $this->_data = $data;
+    }
+
+    /**
+     * Reads $length number of bytes at the current position in the
+     * file and advances the file pointer.
+     *
+     * The pointer is advanced by $length even when fewer bytes are available.
+     *
+     * @param integer $length
+     * @return string
+     */
+    protected function _fread($length = 1)
+    {
+        $returnValue = substr($this->_data, $this->_position, $length);
+        $this->_position += $length;
+        return $returnValue;
+    }
+
+
+    /**
+     * Sets the file position indicator and advances the file pointer.
+     * The new position, measured in bytes from the beginning of the file,
+     * is obtained by adding offset to the position specified by whence,
+     * whose values are defined as follows:
+     * SEEK_SET - Set position equal to offset bytes.
+     * SEEK_CUR - Set position to current location plus offset.
+     * SEEK_END - Set position to end-of-file plus offset. (To move to
+     * a position before the end-of-file, you need to pass a negative value
+     * in offset.)
+     * Upon success, returns 0; otherwise, returns -1
+     *
+     * The call fails (and leaves the position untouched) when $whence is not
+     * one of the three constants above or when the resulting position would
+     * be negative.
+     *
+     * @param integer $offset
+     * @param integer $whence
+     * @return integer
+     */
+    public function seek($offset, $whence=SEEK_SET)
+    {
+        switch ($whence) {
+            case SEEK_SET:
+                $newPosition = $offset;
+                break;
+
+            case SEEK_CUR:
+                $newPosition = $this->_position + $offset;
+                break;
+
+            case SEEK_END:
+                $newPosition = strlen($this->_data) + $offset;
+                break;
+
+            default:
+                // Unknown $whence value
+                return -1;
+        }
+
+        if ($newPosition < 0) {
+            // A negative position would make substr() read from the end of the data
+            return -1;
+        }
+
+        $this->_position = $newPosition;
+
+        return 0;
+    }
+
+    /**
+     * Get file position.
+     *
+     * @return integer
+     */
+    public function tell()
+    {
+        return $this->_position;
+    }
+
+    /**
+     * Flush output.
+     *
+     * Returns true on success or false on failure.
+     * Nothing is buffered for a memory file, so this always returns true.
+     *
+     * @return boolean
+     */
+    public function flush()
+    {
+        // Do nothing
+
+        return true;
+    }
+
+    /**
+     * Writes $length number of bytes (all, if $length===null) to the end
+     * of the file.
+     *
+     * @param string $data
+     * @param integer $length
+     */
+    protected function _fwrite($data, $length=null)
+    {
+        // We do not need to check if file position points to the end of "file".
+        // Only append operation is supported now
+
+        if ($length !== null) {
+            $this->_data .= substr($data, 0, $length);
+        } else {
+            $this->_data .= $data;
+        }
+
+        $this->_position = strlen($this->_data);
+    }
+
+    /**
+     * Lock file
+     *
+     * Lock type may be a LOCK_SH (shared lock) or a LOCK_EX (exclusive lock)
+     *
+     * Both arguments are ignored here; the call always succeeds.
+     *
+     * @param integer $lockType
+     * @param boolean $nonBlockinLock
+     * @return boolean
+     */
+    public function lock($lockType, $nonBlockinLock = false)
+    {
+        // Memory files can't be shared
+        // do nothing
+
+        return true;
+    }
+
+    /**
+     * Unlock file
+     *
+     * Returns true on success
+     *
+     * @return boolean
+     */
+    public function unlock()
+    {
+        // Memory files can't be shared
+        // do nothing
+
+        return true;
+    }
+
+    /**
+     * Reads a byte from the current position in the file
+     * and advances the file pointer.
+     *
+     * @return integer
+     */
+    public function readByte()
+    {
+        return ord($this->_data[$this->_position++]);
+    }
+
+    /**
+     * Writes a byte to the end of the file.
+     *
+     * @param integer $byte
+     * @return integer  always 1
+     */
+    public function writeByte($byte)
+    {
+        // We do not need to check if file position points to the end of "file".
+        // Only append operation is supported now
+
+        $this->_data .= chr($byte);
+        $this->_position = strlen($this->_data);
+
+        return 1;
+    }
+
+    /**
+     * Read num bytes from the current position in the file
+     * and advances the file pointer.
+     *
+     * @param integer $num
+     * @return string
+     */
+    public function readBytes($num)
+    {
+        $returnValue = substr($this->_data, $this->_position, $num);
+        $this->_position += $num;
+
+        return $returnValue;
+    }
+
+    /**
+     * Writes num bytes of data (all, if $num===null) to the end
+     * of the string.
+     *
+     * @param string $data
+     * @param integer $num
+     */
+    public function writeBytes($data, $num=null)
+    {
+        // We do not need to check if file position points to the end of "file".
+        // Only append operation is supported now
+
+        if ($num !== null) {
+            $this->_data .= substr($data, 0, $num);
+        } else {
+            $this->_data .= $data;
+        }
+
+        $this->_position = strlen($this->_data);
+    }
+
+
+    /**
+     * Reads an integer from the current position in the file
+     * and advances the file pointer.
+     *
+     * The four bytes are combined most significant byte first.
+     *
+     * @return integer
+     */
+    public function readInt()
+    {
+        $str = substr($this->_data, $this->_position, 4);
+        $this->_position += 4;
+
+        return  ord($str[0]) << 24 |
+                ord($str[1]) << 16 |
+                ord($str[2]) << 8  |
+                ord($str[3]);
+    }
+
+
+    /**
+     * Writes an integer to the end of file.
+     *
+     * The value is stored as four bytes, most significant byte first.
+     *
+     * @param integer $value
+     */
+    public function writeInt($value)
+    {
+        // We do not need to check if file position points to the end of "file".
+        // Only append operation is supported now
+
+        settype($value, 'integer');
+        $this->_data .= chr($value>>24 & 0xFF) .
+                        chr($value>>16 & 0xFF) .
+                        chr($value>>8  & 0xFF) .
+                        chr($value     & 0xFF);
+
+        $this->_position = strlen($this->_data);
+    }
+
+
+    /**
+     * Returns a long integer from the current position in the file
+     * and advances the file pointer.
+     *
+     * @return integer
+     * @throws Zend_Search_Lucene_Exception
+     */
+    public function readLong()
+    {
+        $str = substr($this->_data, $this->_position, 8);
+        $this->_position += 8;
+
+        /**
+         * Check, that we work in 64-bit mode.
+         * fseek() uses long for offset. Thus, largest index segment file size in 32bit mode is 2Gb
+         */
+        if (PHP_INT_SIZE > 4) {
+            return  ord($str[0]) << 56  |
+                    ord($str[1]) << 48  |
+                    ord($str[2]) << 40  |
+                    ord($str[3]) << 32  |
+                    ord($str[4]) << 24  |
+                    ord($str[5]) << 16  |
+                    ord($str[6]) << 8   |
+                    ord($str[7]);
+        } else {
+            // The upper four bytes must be zero, and the top bit of byte 4
+            // must be clear so the result fits into a signed 32-bit integer
+            // (the same 0x7FFFFFFF limit writeLong() enforces).
+            if ((ord($str[0]) != 0) ||
+                (ord($str[1]) != 0) ||
+                (ord($str[2]) != 0) ||
+                (ord($str[3]) != 0) ||
+                ((ord($str[4]) & 0x80) != 0)) {
+                throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
+            }
+
+            return  ord($str[4]) << 24  |
+                    ord($str[5]) << 16  |
+                    ord($str[6]) << 8   |
+                    ord($str[7]);
+        }
+    }
+
+    /**
+     * Writes long integer to the end of file
+     *
+     * The value is stored as eight bytes, most significant byte first.
+     *
+     * @param integer $value
+     * @throws Zend_Search_Lucene_Exception
+     */
+    public function writeLong($value)
+    {
+        // We do not need to check if file position points to the end of "file".
+        // Only append operation is supported now
+
+        /**
+         * Check, that we work in 64-bit mode.
+         * fseek() and ftell() use long for offset. Thus, largest index segment file size in 32bit mode is 2Gb
+         */
+        if (PHP_INT_SIZE > 4) {
+            settype($value, 'integer');
+            $this->_data .= chr($value>>56 & 0xFF) .
+                            chr($value>>48 & 0xFF) .
+                            chr($value>>40 & 0xFF) .
+                            chr($value>>32 & 0xFF) .
+                            chr($value>>24 & 0xFF) .
+                            chr($value>>16 & 0xFF) .
+                            chr($value>>8  & 0xFF) .
+                            chr($value     & 0xFF);
+        } else {
+            if ($value > 0x7FFFFFFF) {
+                throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
+            }
+
+            $this->_data .= chr(0) . chr(0) . chr(0) . chr(0) .
+                            chr($value>>24 & 0xFF) .
+                            chr($value>>16 & 0xFF) .
+                            chr($value>>8  & 0xFF) .
+                            chr($value     & 0xFF);
+        }
+
+        $this->_position = strlen($this->_data);
+    }
+
+
+
+    /**
+     * Returns a variable-length integer from the current
+     * position in the file and advances the file pointer.
+     *
+     * Each byte contributes its low seven bits, least significant group
+     * first; a set high bit means that another byte follows.
+     *
+     * @return integer
+     */
+    public function readVInt()
+    {
+        $nextByte = ord($this->_data[$this->_position++]);
+        $val = $nextByte & 0x7F;
+
+        for ($shift=7; ($nextByte & 0x80) != 0; $shift += 7) {
+            $nextByte = ord($this->_data[$this->_position++]);
+            $val |= ($nextByte & 0x7F) << $shift;
+        }
+        return $val;
+    }
+
+    /**
+     * Writes a variable-length integer to the end of file.
+     *
+     * @param integer $value
+     */
+    public function writeVInt($value)
+    {
+        // We do not need to check if file position points to the end of "file".
+        // Only append operation is supported now
+
+        settype($value, 'integer');
+        while ($value > 0x7F) {
+            $this->_data .= chr( ($value & 0x7F)|0x80 );
+            $value >>= 7;
+        }
+        $this->_data .= chr($value);
+
+        $this->_position = strlen($this->_data);
+    }
+
+
+    /**
+     * Reads a string from the current position in the file
+     * and advances the file pointer.
+     *
+     * The stored length prefix counts characters, not bytes. The two-byte
+     * sequence "\xC0\x80" is returned as a single NUL byte.
+     *
+     * @return string
+     */
+    public function readString()
+    {
+        $strlen = $this->readVInt();
+        if ($strlen == 0) {
+            return '';
+        } else {
+            /**
+             * This implementation supports only Basic Multilingual Plane
+             * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
+             * "supplementary characters" (characters whose code points are
+             * greater than 0xFFFF)
+             * Java 2 represents these characters as a pair of char (16-bit)
+             * values, the first from the high-surrogates range (0xD800-0xDBFF),
+             * the second from the low-surrogates range (0xDC00-0xDFFF). Then
+             * they are encoded as usual UTF-8 characters in six bytes.
+             * Standard UTF-8 representation uses four bytes for supplementary
+             * characters.
+             */
+
+            // Start with one byte per character; every multi-byte lead byte
+            // found below pulls in the continuation bytes it needs.
+            $str_val = substr($this->_data, $this->_position, $strlen);
+            $this->_position += $strlen;
+
+            for ($count = 0; $count < $strlen; $count++ ) {
+                $leadByte = ord($str_val[$count]);
+
+                if (($leadByte & 0xC0) == 0xC0) {
+                    $addBytes = 1;
+                    if ($leadByte & 0x20) {
+                        $addBytes++;
+
+                        // Never used. Java2 doesn't encode strings in four bytes
+                        if ($leadByte & 0x10) {
+                            $addBytes++;
+                        }
+                    }
+                    $str_val .= substr($this->_data, $this->_position, $addBytes);
+                    $this->_position += $addBytes;
+                    $strlen += $addBytes;
+
+                    // Check for null character. Java2 encodes null character
+                    // in two bytes.
+                    if ($leadByte == 0xC0 && substr($str_val, $count + 1, 1) === "\x80") {
+                        // Replace the pair by one real NUL byte. The string is
+                        // now one byte shorter, so $strlen shrinks with it and
+                        // $count must not skip ahead: the next loop step has to
+                        // look at the byte that follows the NUL.
+                        $str_val = substr($str_val, 0, $count)
+                                 . "\x00"
+                                 . substr($str_val, $count + 2);
+                        $strlen--;
+                    } else {
+                        // Skip the continuation bytes of this character
+                        $count += $addBytes;
+                    }
+                }
+            }
+
+            return $str_val;
+        }
+    }
+
+    /**
+     * Writes a string to the end of file.
+     *
+     * A character count is written first, followed by the string bytes
+     * with every NUL byte replaced by "\xC0\x80".
+     *
+     * @param string $str
+     * @throws Zend_Search_Lucene_Exception
+     */
+    public function writeString($str)
+    {
+        /**
+         * This implementation supports only Basic Multilingual Plane
+         * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
+         * "supplementary characters" (characters whose code points are
+         * greater than 0xFFFF)
+         * Java 2 represents these characters as a pair of char (16-bit)
+         * values, the first from the high-surrogates range (0xD800-0xDBFF),
+         * the second from the low-surrogates range (0xDC00-0xDFFF). Then
+         * they are encoded as usual UTF-8 characters in six bytes.
+         * Standard UTF-8 representation uses four bytes for supplementary
+         * characters.
+         */
+
+        // We do not need to check if file position points to the end of "file".
+        // Only append operation is supported now
+
+        // convert input to a string before iterating string characters
+        settype($str, 'string');
+
+        $chars = $strlen = strlen($str);
+
+        for ($count = 0; $count < $strlen; $count++ ) {
+            /**
+             * String is already in Java 2 representation.
+             * We should only calculate actual string length and replace
+             * \x00 by \xC0\x80
+             */
+            $leadByte = ord($str[$count]);
+
+            if (($leadByte & 0xC0) == 0xC0) {
+                $addBytes = 1;
+                if ($leadByte & 0x20) {
+                    $addBytes++;
+
+                    // Never used. Java2 doesn't encode strings in four bytes
+                    // and we don't support non-BMP characters
+                    if ($leadByte & 0x10) {
+                        $addBytes++;
+                    }
+                }
+                // Continuation bytes are not characters of their own
+                $chars -= $addBytes;
+                $count += $addBytes;
+            }
+        }
+
+        if ($chars < 0) {
+            throw new Zend_Search_Lucene_Exception('Invalid UTF-8 string');
+        }
+
+        $this->writeVInt($chars);
+
+        // A NUL byte is one character but is stored as the two bytes
+        // "\xC0\x80". str_replace() takes (search, replace, subject) and
+        // returns the subject unchanged when it contains no NUL.
+        $this->_data .= str_replace("\x00", "\xC0\x80", $str);
+
+        $this->_position = strlen($this->_data);
+    }
+
+
+    /**
+     * Reads binary data from the current position in the file
+     * and advances the file pointer.
+     *
+     * The data is preceded by its byte length, stored as a
+     * variable-length integer.
+     *
+     * @return string
+     */
+    public function readBinary()
+    {
+        $length = $this->readVInt();
+        $returnValue = substr($this->_data, $this->_position, $length);
+        $this->_position += $length;
+        return $returnValue;
+    }
+}
+
-- 
2.39.5