From 792a590da7318d41bd33c71c97b9743504c921d1 Mon Sep 17 00:00:00 2001 From: moodler Date: Mon, 31 Jan 2005 07:39:03 +0000 Subject: [PATCH] New libraries to support lexer-based searching. From Tony Hursh - thanks! --- lib/lexer.php | 393 ++++++++++++++++++++++++++++++++++++++++++++++ lib/searchlib.php | 262 +++++++++++++++++++++++++++++++ 2 files changed, 655 insertions(+) create mode 100644 lib/lexer.php create mode 100644 lib/searchlib.php diff --git a/lib/lexer.php b/lib/lexer.php new file mode 100644 index 0000000000..fe01255157 --- /dev/null +++ b/lib/lexer.php @@ -0,0 +1,393 @@ +_case = $case; + $this->_patterns = array(); + $this->_labels = array(); + $this->_regex = null; + } + + /** + * Adds a pattern with an optional label. + * @param $pattern Perl style regex, but ( and ) + * lose the usual meaning. + * @param $label Label of regex to be returned + * on a match. + * @public + */ + function addPattern($pattern, $label = true) { + $count = count($this->_patterns); + $this->_patterns[$count] = $pattern; + $this->_labels[$count] = $label; + $this->_regex = null; + } + + /** + * Attempts to match all patterns at once against + * a string. + * @param $subject String to match against. + * @param $match First matched portion of + * subject. + * @return True on success. + * @public + */ + function match($subject, &$match) { + if (count($this->_patterns) == 0) { + return false; + } + if (!preg_match($this->_getCompoundedRegex(), $subject, $matches)) { + $match = ""; + return false; + } + $match = $matches[0]; + for ($i = 1; $i < count($matches); $i++) { + if ($matches[$i]) { + return $this->_labels[$i - 1]; + } + } + return true; + } + + /** + * Compounds the patterns into a single + * regular expression separated with the + * "or" operator. Caches the regex. + * Will automatically escape (, ) and / tokens. + * @param $patterns List of patterns in order. + * @private + */ + function _getCompoundedRegex() { + if ($this->_regex == null) { + for ($i = 0; $i < count($this->_patterns); $i++) { + $this->_patterns[$i] = '(' . str_replace( + array('/', '(', ')'), + array('\/', '\(', '\)'), + $this->_patterns[$i]) . ')'; + } + $this->_regex = "/" . implode("|", $this->_patterns) . "/" . $this->_getPerlMatchingFlags(); + } + return $this->_regex; + } + + /** + * Accessor for perl regex mode flags to use. + * @return Flags as string. + * @private + */ + function _getPerlMatchingFlags() { + return ($this->_case ? "msS" : "msSi"); + } + } + + /** + * States for a stack machine. + */ + class StateStack { + var $_stack; + + /** + * Constructor. Starts in named state. + * @param $start Starting state name. + * @public + */ + function StateStack($start) { + $this->_stack = array($start); + } + + /** + * Accessor for current state. + * @return State as string. + * @public + */ + function getCurrent() { + return $this->_stack[count($this->_stack) - 1]; + } + + /** + * Adds a state to the stack and sets it + * to be the current state. + * @param $state New state. + * @public + */ + function enter($state) { + array_push($this->_stack, $state); + } + + /** + * Leaves the current state and reverts + * to the previous one. + * @return False if we drop off + * the bottom of the list. + * @public + */ + function leave() { + if (count($this->_stack) == 1) { + return false; + } + array_pop($this->_stack); + return true; + } + } + + /** + * Accepts text and breaks it into tokens. + * Some optimisation to make the sure the + * content is only scanned by the PHP regex + * parser once. Lexer modes must not start + * with leading underscores. + */ + class Lexer { + var $_regexes; + var $_parser; + var $_mode; + var $_mode_handlers; + var $_case; + + /** + * Sets up the lexer in case insensitive matching + * by default. + * @param $parser Handling strategy by + * reference. + * @param $start Starting handler. + * @param $case True for case sensitive. + * @public + */ + function Lexer(&$parser, $start = "accept", $case = false) { + $this->_case = $case; + $this->_regexes = array(); + $this->_parser = &$parser; + $this->_mode = new StateStack($start); + $this->_mode_handlers = array(); + } + + /** + * Adds a token search pattern for a particular + * parsing mode. The pattern does not change the + * current mode. + * @param $pattern Perl style regex, but ( and ) + * lose the usual meaning. + * @param $mode Should only apply this + * pattern when dealing with + * this type of input. + * @public + */ + function addPattern($pattern, $mode = "accept") { + if (!isset($this->_regexes[$mode])) { + $this->_regexes[$mode] = new ParallelRegex($this->_case); + } + $this->_regexes[$mode]->addPattern($pattern); + } + + /** + * Adds a pattern that will enter a new parsing + * mode. Useful for entering parenthesis, strings, + * tags, etc. + * @param $pattern Perl style regex, but ( and ) + * lose the usual meaning. + * @param $mode Should only apply this + * pattern when dealing with + * this type of input. + * @param $new_mode Change parsing to this new + * nested mode. + * @public + */ + function addEntryPattern($pattern, $mode, $new_mode) { + if (!isset($this->_regexes[$mode])) { + $this->_regexes[$mode] = new ParallelRegex($this->_case); + } + $this->_regexes[$mode]->addPattern($pattern, $new_mode); + } + + /** + * Adds a pattern that will exit the current mode + * and re-enter the previous one. + * @param $pattern Perl style regex, but ( and ) + * lose the usual meaning. + * @param $mode Mode to leave. + * @public + */ + function addExitPattern($pattern, $mode) { + if (!isset($this->_regexes[$mode])) { + $this->_regexes[$mode] = new ParallelRegex($this->_case); + } + $this->_regexes[$mode]->addPattern($pattern, "__exit"); + } + + /** + * Adds a pattern that has a special mode. + * Acts as an entry and exit pattern in one go. + * @param $pattern Perl style regex, but ( and ) + * lose the usual meaning. + * @param $mode Should only apply this + * pattern when dealing with + * this type of input. + * @param $special Use this mode for this one token. + * @public + */ + function addSpecialPattern($pattern, $mode, $special) { + if (!isset($this->_regexes[$mode])) { + $this->_regexes[$mode] = new ParallelRegex($this->_case); + } + $this->_regexes[$mode]->addPattern($pattern, "_$special"); + } + + /** + * Adds a mapping from a mode to another handler. + * @param $mode Mode to be remapped. + * @param $handler New target handler. + * @public + */ + function mapHandler($mode, $handler) { + $this->_mode_handlers[$mode] = $handler; + } + + /** + * Splits the page text into tokens. Will fail + * if the handlers report an error or if no + * content is consumed. If successful then each + * unparsed and parsed token invokes a call to the + * held listener. + * @param $raw Raw HTML text. + * @return True on success, else false. + * @public + */ + function parse($raw) { + if (!isset($this->_parser)) { + return false; + } + $length = strlen($raw); + while (is_array($parsed = $this->_reduce($raw))) { + list($unmatched, $matched, $mode) = $parsed; + if (!$this->_dispatchTokens($unmatched, $matched, $mode)) { + return false; + } + if (strlen($raw) == $length) { + return false; + } + $length = strlen($raw); + } + if (!$parsed) { + return false; + } + return $this->_invokeParser($raw, LEXER_UNMATCHED); + } + + /** + * Sends the matched token and any leading unmatched + * text to the parser changing the lexer to a new + * mode if one is listed. + * @param $unmatched Unmatched leading portion. + * @param $matched Actual token match. + * @param $mode Mode after match. The "_exit" + * mode causes a stack pop. An + * false mode causes no change. + * @return False if there was any error + * from the parser. + * @private + */ + function _dispatchTokens($unmatched, $matched, $mode = false) { + if (!$this->_invokeParser($unmatched, LEXER_UNMATCHED)) { + return false; + } + if ($mode === "__exit") { + if (!$this->_invokeParser($matched, LEXER_EXIT)) { + return false; + } + return $this->_mode->leave(); + } + if (strncmp($mode, "_", 1) == 0) { + $mode = substr($mode, 1); + $this->_mode->enter($mode); + if (!$this->_invokeParser($matched, LEXER_SPECIAL)) { + return false; + } + return $this->_mode->leave(); + } + if (is_string($mode)) { + $this->_mode->enter($mode); + return $this->_invokeParser($matched, LEXER_ENTER); + } + return $this->_invokeParser($matched, LEXER_MATCHED); + } + + /** + * Calls the parser method named after the current + * mode. Empty content will be ignored. + * @param $content Text parsed. + * @param $is_match Token is recognised rather + * than unparsed data. + * @private + */ + function _invokeParser($content, $is_match) { + if (($content === "") || ($content === false)) { + return true; + } + $handler = $this->_mode->getCurrent(); + if (isset($this->_mode_handlers[$handler])) { + $handler = $this->_mode_handlers[$handler]; + } + return $this->_parser->$handler($content, $is_match); + } + + /** + * Tries to match a chunk of text and if successful + * removes the recognised chunk and any leading + * unparsed data. Empty strings will not be matched. + * @param $raw The subject to parse. This is the + * content that will be eaten. + * @return Three item list of unparsed + * content followed by the + * recognised token and finally the + * action the parser is to take. + * True if no match, false if there + * is a parsing error. + * @private + */ + function _reduce(&$raw) { + if (!isset($this->_regexes[$this->_mode->getCurrent()])) { + return false; + } + if ($raw === "") { + return true; + } + if ($action = $this->_regexes[$this->_mode->getCurrent()]->match($raw, $match)) { + $count = strpos($raw, $match); + $unparsed = substr($raw, 0, $count); + $raw = substr($raw, $count + strlen($match)); + return array($unparsed, $match, $action); + } + return true; + } + } +?> diff --git a/lib/searchlib.php b/lib/searchlib.php new file mode 100644 index 0000000000..47c9aac924 --- /dev/null +++ b/lib/searchlib.php @@ -0,0 +1,262 @@ +libdir.'/lexer.php'); + +// Constants for the various types of tokens + +define("T_USER","0"); +define("T_META","1"); +define("T_EXACT","2"); +define("T_NEGATE","3"); +define("T_STRING","4"); + +// Class to hold token/value pairs after they're parsed. + +class search_token { + var $value; + var $type; + function search_token($type,$value){ + $this->type = $type; + $this->value = $this->sanitize($value); + + } + + // Try to clean up user input to avoid potential security issues. + // Need to think about this some more. + + function sanitize($userstring){ + return htmlentities(addslashes($userstring)); + } + function getValue(){ + return $this->value; + } + function getType(){ + return $this->type; + } +} + + + +// This class does the heavy lifting of lexing the search string into tokens. +// Using a full-blown lexer is probably overkill for this application, but +// might be useful for other tasks. + +class search_lexer extends Lexer{ + + function search_lexer($parser){ + + // Call parent constructor. + $this->Lexer($parser); + + //Set up the state machine and pattern matches for transitions. + + // Patterns to handle strings of the form user:foo + + // If we see the string user: while in the base accept state, start + // parsing a username and go to the inusername state. + $this->addEntryPattern("user:\S+","accept","inusername"); + + // Snarf everything into the username until we see whitespace, then exit + // back to the base accept state. + $this->addExitPattern("\s","inusername"); + + // Patterns to handle strings of the form meta:foo + + // If we see the string meta: while in the base accept state, start + // parsing a username and go to the inmeta state. + $this->addEntryPattern("subject:\S+","accept","inmeta"); + + // Snarf everything into the meta token until we see whitespace, then exit + // back to the base accept state. + $this->addExitPattern("\s","inmeta"); + + + // Patterns to handle required exact match strings (+foo) . + + // If we see a + sign while in the base accept state, start + // parsing an exact match string and enter the inrequired state + $this->addEntryPattern("\+\S+","accept","inrequired"); + // When we see white space, exit back to accept state. + $this->addExitPattern("\s","inrequired"); + + // Handle excluded strings (-foo) + + // If we see a - sign while in the base accept state, start + // parsing an excluded string and enter the inexcluded state + $this->addEntryPattern("\-\S+","accept","inexcluded"); + // When we see white space, exit back to accept state. + $this->addExitPattern("\s","inexcluded"); + + + // Patterns to handle quoted strings. + + // If we see a quote while in the base accept state, start + // parsing a quoted string and enter the inquotedstring state. + // Grab everything until we see the closing quote. + + $this->addEntryPattern("\"[^\"]+","accept","inquotedstring"); + + // When we see a closing quote, reenter the base accept state. + $this->addExitPattern("\"","inquotedstring"); + + // Patterns to handle ordinary, nonquoted words. + + // When we see non-whitespace, snarf everything into the nonquoted word + // until we see whitespace again. + $this->addEntryPattern("\S+","accept","plainstring"); + + // Once we see whitespace, reenter the base accept state. + $this->addExitPattern("\s","plainstring"); + + } +} + + + + +// This class takes care of sticking the proper token type/value pairs into +// the parsed token array. +// Most functions in this class should only be called by the lexer, the +// one exception being getParseArray() which returns the result. + +class search_parser { + var $tokens; + + + // This function is called by the code that's interested in the result of the parse operation. + function get_parsed_array(){ + return $this->tokens; + } + + /* + * Functions below this are part of the state machine for the parse + * operation and should not be called directly. + */ + + // Base state. No output emitted. + function accept() { + return true; + } + + + // State for handling user:foo constructs. Potentially emits a token. + function inusername($content){ + if(strlen($content) < 6) // State exit or missing parameter. + return true; + // Strip off the user: part and add the reminder to the parsed token array + $param = trim(substr($content,5)); + $this->tokens[] = new search_token(T_USER,$param); + return true; + } + + + // State for handling meta:foo constructs. Potentially emits a token. + function inmeta($content){ + if(strlen($content) < 9) // Missing parameter. + return true; + // Strip off the meta: part and add the reminder to the parsed token array. + $param = trim(substr($content,8)); + $this->tokens[] = new search_token(T_META,$param); + return true; + } + + + // State entered when we've seen a required string (+foo). Potentially + // emits a token. + function inrequired($content){ + if(strlen($content) < 2) // State exit or missing parameter, don't emit. + return true; + // Strip off the + sign and add the reminder to the parsed token array. + $this->tokens[] = new search_token(T_EXACT,substr($content,1)); + return true; + } + + // State entered when we've seen an excluded string (-foo). Potentially + // emits a token. + function inexcluded($content){ + if(strlen($content) < 2) // State exit or missing parameter. + return true; + // Strip off the -sign and add the reminder to the parsed token array. + $this->tokens[] = new search_token(T_NEGATE,substr($content,1)); + return true; + } + + + // State entered when we've seen a quoted string. Potentially emits a token. + function inquotedstring($content){ + if(strlen($content) < 2) // State exit or missing parameter. + return true; + // Strip off the opening quote and add the reminder to the parsed token array. + $this->tokens[] = new search_token(T_STRING,substr($content,1)); + return true; + } + + // State entered when we've seen an ordinary, non-quoted word. Potentially + // emits a token. + function plainstring($content){ + if(ctype_space($content)) // State exit + return true; + // Add the string to the parsed token array. + $this->tokens[] = new search_token(T_STRING,$content); + return true; + } + + + +} + + +// Primitive function to generate a SQL string from a parse tree. +// Parameters: +// +// $parsetree should be a parse tree generated by a +// search_lexer/search_parser combination. +// Other fields are database table names to search. + +function search_generate_SQL($parsetree,$datafield,$metafield,$mainidfield,$useridfield,$userfirstnamefield,$userlastnamefield){ + global $CFG; + if ($CFG->dbtype == "postgres7") { + $LIKE = "ILIKE"; // case-insensitive + $NOTLIKE = "NOT ILIKE"; // case-insensitive + $REGEXP = "~*"; + $NOTREGEXP = "!~*"; + } else { + $LIKE = "LIKE"; + $NOTLIKE = "NOT LIKE"; + $REGEXP = "REGEXP"; + $NOTREGEXP = "NOT REGEXP"; + } + + $ntokens = count($parsetree); + if($ntokens == 0) + return ""; + + for($i = 0; $i < $ntokens; $i++){ + if($i > 0) // We have more than one clause, need to tack on AND + $SQLString .= " AND "; + + $type = $parsetree[$i]->getType(); + $value = $parsetree[$i]->getValue(); + switch($type){ + case T_STRING : $SQLString .= "(($datafield $LIKE '%$value%') OR ($metafield $LIKE '%$value%') )"; + break; + case T_EXACT: + $SQLString .= "(($datafield $REGEXP '[[:<:]]".$value."[[:>:]]') OR ($metafield $REGEXP '[[:<:]]".$value."[[:>:]]'))"; + break; + case T_META : if($metafield != "") + $SQLString .= "($metafield $LIKE '%$value%')"; + break; + case T_USER : $SQLString .= "(($mainidfield = $useridfield) AND (($userfirstnamefield $LIKE '%$value%') OR ($userlastnamefield $LIKE '%$value%')))"; + break; + case T_NEGATE: $SQLString .= "(NOT (($datafield $LIKE '%$value%') OR ($metafield $LIKE '%$value%')))"; + break; + default: + return ""; + + } + } + return $SQLString; +} + + +?> -- 2.39.5