PageRenderTime 53ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/projects/netbeans-7.3/php.editor/test/unit/data/testfiles/TextSearchQuery.php

https://gitlab.com/essere.lab.public/qualitas.class-corpus
PHP | 1155 lines | 665 code | 99 blank | 391 comment | 96 complexity | b12b4e562e40c95b5b404c8a5eeb6d91 MD5 | raw file
  1. <?php rcs_id('$Id: TextSearchQuery.php,v 1.28 2007/03/18 17:35:26 rurban Exp $');
  2. /**
  3. * A text search query, converting queries to PCRE and SQL matchers.
  4. *
  5. * This represents an enhanced "Google-like" text search query:
  6. * <dl>
  7. * <dt> default: case-insensitive glob-style search with special operators OR AND NOT -
  8. * <dt> wiki -test
  9. * <dd> Match strings containing the substring 'wiki', and NOT containing the
  10. * substring 'test'.
  11. * <dt> wiki word or page
  12. * <dd> Match strings containing the substring 'wiki' AND either the substring
  13. * 'word' OR the substring 'page'.
  14. * <dt> auto-detect regex hints, glob-style or regex-style, and converts them
  15. * to PCRE and SQL matchers:
  16. * <dd> "^word$" => EXACT(word)
  17. * <dd> "^word" => STARTS_WITH(word)
  18. * <dd> "word*" => STARTS_WITH(word)
  19. * <dd> "*word" => ENDS_WITH(word)
  20. * <dd> "/^word.* /" => REGEX(^word.*)
  21. * <dd> "word*word" => REGEX(word.*word)
  22. * </dl>
  23. *
  24. * The full query syntax, in order of precedence, is roughly:
  25. *
  26. * The unary 'NOT' or '-' operator (they are equivalent) negates the
  27. * following search clause.
  28. *
  29. * Search clauses may be joined with the (left-associative) binary operators
  30. * 'AND' and 'OR'. (case-insensitive)
  31. *
  32. * Two adjoining search clauses are joined with an implicit 'AND'. This has
  33. * lower precedence than either an explicit 'AND' or 'OR', so "a b OR c"
  34. * parses as "a AND ( b OR c )", while "a AND b OR c" parses as
  35. * "( a AND b ) OR c" (due to the left-associativity of 'AND' and 'OR'.)
  36. *
  37. * Search clauses can be grouped with parentheses.
  38. *
  39. * Phrases (or other things which don't look like words) can be forced to
  40. * be interpreted as words by quoting them, either with single (') or double (")
  41. * quotes. If you want to include the quote character within a quoted string,
  42. * double-up on the quote character: 'I''m hungry' is equivalent to
  43. * "I'm hungry".
  44. *
  45. * Force regex on "re:word" => posix-style, "/word/" => pcre-style
  46. * or use regex='glob' to use file wildcard-like matching. (not yet)
  47. *
  48. * The parsed tree is then converted to the needed PCRE (highlight,
  49. * simple backends) or SQL functions.
  50. *
  51. * @author: Jeff Dairiki
  52. * @author: Reini Urban (case and regex detection, enhanced sql callbacks)
  53. */
  // Regex styles understood by TextSearchQuery: 'auto', 'none', 'glob', 'posix', 'pcre', 'sql'.
  // Every style except NONE occupies its own bit.
  define('TSQ_REGEX_NONE',  0);
  define('TSQ_REGEX_AUTO',  1 << 0);
  define('TSQ_REGEX_POSIX', 1 << 1);
  define('TSQ_REGEX_GLOB',  1 << 2);
  define('TSQ_REGEX_PCRE',  1 << 3);
  define('TSQ_REGEX_SQL',   1 << 4);
  class TextSearchQuery {
      // Regex style as one of the TSQ_REGEX_* values (0 == none).
      var $_regex;
      // Whether matching is case sensitive.
      var $_case_exact;
      // Root node of the parsed (and optimized) query tree.
      var $_tree;
      // Stop-word alternation; only set when the query went through the parser.
      var $_stoplist;
      // PCRE modifiers; taken from a "/.../mods" query or derived lazily in asRegexp().
      var $_regex_modifier;
      // Cache for asRegexp().
      var $_regexp;
      // Cache for getHighlightRegexp().
      var $_hilight_regexp;
      // Callback object used by the make*SqlClause* methods.
      var $_sql_clause_cb;

      /**
       * Create a new query.
       *
       * @param $search_query string The query. Syntax is as described above.
       *                      Note that an empty $search_query will match anything.
       * @param $case_exact boolean
       * @param $regex string one of 'auto', 'none', 'glob', 'posix', 'pcre', 'sql'
       * @see TextSearchQuery
       */
      function TextSearchQuery($search_query, $case_exact=false, $regex='auto') {
          if ($regex == 'none' or !$regex)
              $this->_regex = 0;
          elseif (defined("TSQ_REGEX_".strtoupper($regex)))
              $this->_regex = constant("TSQ_REGEX_".strtoupper($regex));
          else {
              trigger_error(fmt("Unsupported argument: %s=%s", 'regex', $regex));
              $this->_regex = 0;
          }
          $this->_case_exact = $case_exact;
          if ($regex != 'pcre') {
              $parser = new TextSearchQuery_Parser;
              $this->_tree = $parser->parse($search_query, $case_exact, $this->_regex);
              $this->_optimize();
              if (defined("FULLTEXTSEARCH_STOPLIST"))
                  $this->_stoplist = FULLTEXTSEARCH_STOPLIST;
              else // default stoplist, localizable.
                  $this->_stoplist = _("(A|An|And|But|By|For|From|In|Is|It|Of|On|Or|The|To|With)");
          }
          else {
              // A pcre query bypasses the parser: the whole string is one regex node.
              $this->_tree = new TextSearchQuery_node_regex_pcre($search_query);
              if (preg_match("/^\/(.*)\/(\w*)$/", $search_query, $m)) {
                  $this->_tree->word = $m[1];
                  $this->_regex_modifier = $m[2]; // overrides case_exact
              }
          }
      }

      function getType() { return 'text'; }

      /** Replace the parse tree by its optimized form. */
      function _optimize() {
          $this->_tree = $this->_tree->optimize();
      }

      /**
       * Get a PCRE regexp which matches the query.
       *
       * The result is cached. Without a regex style the pattern is anchored
       * at the start of the subject ('^').
       */
      function asRegexp() {
          if (!isset($this->_regexp)) {
              if (!isset($this->_regex_modifier))
                  $this->_regex_modifier = ($this->_case_exact?'':'i').'sS';
              if ($this->_regex)
                  $this->_regexp = '/' . $this->_tree->regexp() . '/'.$this->_regex_modifier;
              else
                  $this->_regexp = '/^' . $this->_tree->regexp() . '/'.$this->_regex_modifier;
          }
          return $this->_regexp;
      }

      /**
       * Match query against string.
       *
       * @param $string string The string to match.
       * @return mixed The preg_match() result: truthy if the string matches the query.
       */
      function match($string) {
          return preg_match($this->asRegexp(), $string);
      }

      /**
       * Get a regular expression suitable for highlighting matched words.
       *
       * This returns a PCRE regular expression (without delimiters) which
       * matches any non-negated word in the query, or false if there is none.
       *
       * @return mixed The PCRE regexp, or false.
       */
      function getHighlightRegexp() {
          if (!isset($this->_hilight_regexp)) {
              $words = array_unique($this->_tree->highlight_words());
              if (!$words) {
                  $this->_hilight_regexp = false;
              } else {
                  foreach ($words as $key => $word)
                      $words[$key] = preg_quote($word, '/');
                  $this->_hilight_regexp = '(?:' . join('|', $words) . ')';
              }
          }
          return $this->_hilight_regexp;
      }

      /**
       * Make an SQL clause which matches the query.
       * Deprecated, use makeSqlClauseObj instead.
       *
       * @param $sql_clause_cb WikiCallback
       * A callback whose call() method takes a single word as an argument and
       * returns an SQL clause which will match exactly those records
       * containing the word.
       *
       * Old example usage:
       * <pre>
       *     function sql_title_match($word) {
       *         return sprintf("LOWER(title) like '%s'",
       *                        addslashes($word));
       *     }
       *
       *     ...
       *
       *     $query = new TextSearchQuery("wiki -page");
       *     $cb = new WikiFunctionCb('sql_title_match');
       *     $sql_clause = $query->makeSqlClause($cb);
       * </pre>
       * This will result in $sql_clause containing something like
       * "(LOWER(title) like 'wiki') AND NOT (LOWER(title) like 'page')".
       *
       * @return string The SQL clause.
       */
      function makeSqlClause($sql_clause_cb) {
          $this->_sql_clause_cb = $sql_clause_cb;
          return $this->_sql_clause($this->_tree);
      }

      // deprecated: use _sql_clause_obj now.
      function _sql_clause($node) {
          switch ($node->op) {
          case 'WORD': // word => %word%
              return $this->_sql_clause_cb->call($node->word);
          case 'NOT':
              return "NOT (" . $this->_sql_clause($node->leaves[0]) . ")";
          case 'AND':
          case 'OR':
              $subclauses = array();
              foreach ($node->leaves as $leaf)
                  $subclauses[] = "(" . $this->_sql_clause($leaf) . ")";
              return join(" $node->op ", $subclauses);
          default:
              assert($node->op == 'VOID');
              return '1=1';
          }
      }

      /**
       * Build an SQL clause by handing each non-structural node to the
       * callback object's call() method.
       *
       * @param $sql_search_cb object Callback object; call($node) must return SQL.
       * @return string The SQL clause.
       * @see WikiDB_backend_PearDB_search
       */
      function makeSqlClauseObj(&$sql_search_cb) {
          $this->_sql_clause_cb = $sql_search_cb;
          return $this->_sql_clause_obj($this->_tree);
      }

      function _sql_clause_obj($node) {
          switch ($node->op) {
          case 'NOT':
              // Recurse instead of passing the leaf straight to the callback:
              // the AND optimizer produces NOT(OR(...)) for "word -a -b", and
              // a compound node is not something the callback can render.
              return "NOT (" . $this->_sql_clause_obj($node->leaves[0]) . ")";
          case 'AND':
          case 'OR':
              $subclauses = array();
              foreach ($node->leaves as $leaf)
                  $subclauses[] = "(" . $this->_sql_clause_obj($leaf) . ")";
              return join(" $node->op ", $subclauses);
          case 'VOID':
              return '0=1';
          case 'ALL':
              return '1=1';
          default:
              return $this->_sql_clause_cb->call($node);
          }
      }

      /*
       postgresql tsearch2 uses no WHERE operators, just & | and ! in the searchstring
       */
      function makeTsearch2SqlClauseObj(&$sql_search_cb) {
          $this->_sql_clause_cb = $sql_search_cb;
          return $this->_Tsearch2Sql_clause_obj($this->_tree);
      }

      function _Tsearch2Sql_clause_obj($node) {
          // TODO: "such a phrase"
          switch ($node->op) {
          case 'NOT':
              // The leaf is a node object, so it has to be rendered recursively.
              return "!" . $this->_Tsearch2Sql_operand($node->leaves[0]);
          case 'AND':
          case 'OR':
              $glue = ($node->op == 'AND') ? "&" : "|";
              $subclauses = array();
              foreach ($node->leaves as $leaf)
                  $subclauses[] = $this->_Tsearch2Sql_operand($leaf);
              return join($glue, $subclauses);
          case 'VOID':
              return '';
          case 'ALL':
              return '1';
          default:
              return $this->_sql_clause_cb->call($node);
          }
      }

      /**
       * Render $node for use as an operand of !, & or |.
       * Compound (AND/OR) operands are parenthesised so the nesting of the
       * parse tree survives in the flat search string.
       */
      function _Tsearch2Sql_operand($node) {
          $clause = $this->_Tsearch2Sql_clause_obj($node);
          if ($node->op == 'AND' or $node->op == 'OR')
              return "(" . $clause . ")";
          return $clause;
      }

      /**
       * SQL pattern for the query.
       *
       * NOTE(review): this method previously called $this->_sql_quote() and read
       * $this->word, neither of which exists on this class, so it could only
       * fail. It now delegates to the root node; confirm this is what any
       * caller expects.
       */
      function sql() { return $this->_tree->sql(); }

      /**
       * Get printable representation of the parse tree.
       *
       * This is for debugging only.
       * @return string Printable parse tree.
       */
      function asString() {
          return $this->_as_string($this->_tree);
      }

      function _as_string($node, $indent = '') {
          switch ($node->op) {
          case 'WORD':
              return $indent . "WORD: $node->word";
          case 'VOID':
              return $indent . "VOID";
          case 'ALL':
              return $indent . "ALL";
          default:
              $lines = array($indent . $node->op . ":");
              $indent .= " ";
              foreach ($node->leaves as $leaf)
                  $lines[] = $this->_as_string($leaf, $indent);
              return join("\n", $lines);
          }
      }
  }
  /**
   * A TextSearchQuery that never matches.
   *
   * Each overridden method returns a fixed "no match" value and never
   * touches a parse tree.
   */
  class NullTextSearchQuery extends TextSearchQuery {
      /**
       * Create a new query. Takes no arguments and initialises nothing.
       *
       * @see TextSearchQuery
       */
      function NullTextSearchQuery() {
      }

      // A negative lookahead for 'a' followed by a literal 'a' can never match.
      function asRegexp() {
          return '/^(?!a)a/x';
      }

      function match($string) {
          return false;
      }

      function getHighlightRegexp() {
          return "";
      }

      function makeSqlClause($make_sql_clause_cb) {
          return "(1 = 0)";
      }

      function asString() {
          return "NullTextSearchQuery";
      }
  }
  /**
   * A simple algebraic matcher for numeric attributes.
   * NumericSearchQuery can do ("population < 20000 and area > 1000000", array("population", "area"))
   * ->match(array('population' => 100000, 'area' => 10000000))
   *
   * Supports the PHP comparison operators listed in the constructor, plus ':=' for equality.
   * "(x < 2000000 and x >= 10000) or (x >= 100 and x < 2000)"
   * "x := 100000" is the same as "x == 100000"
   *
   * Since this is basic numerics only, we simply try to get away with
   * replacing the variable values at the right positions and do an eval then.
   *
   * @package NumericSearchQuery
   * @author Reini Urban
   * @see SemanticAttributeSearchQuery
   */
  class NumericSearchQuery
  {
      // The checked symbolic query; '' if check_query() rejected it.
      var $_query;
      // Placeholder name (string) or list of names (array).
      var $_placeholders;
      var $_allowed_functions;
      var $_allowed_operators;
      // Hash of every token string that may appear in a query.
      var $_parser_check;
      // Copy of $_query with placeholders replaced by values during match().
      var $_workquery;
      // Bindings made by the current match() call.
      var $_bound = array();

      /**
       * Create a new query.
       * NumericSearchQuery("population > 20000 or population < 200", "population")
       * NumericSearchQuery("population < 20000 and area > 1000000", array("population", "area"))
       *
       * With a single variable it is easy: The valid name must be matched elsewhere, just
       * replace the given number in match in the query.
       * ->match(2000)
       *
       * With matching a struct we need strict names, no * as name is allowed.
       * So always when the placeholder is an array, the names of the target struct must match
       * and all vars be defined. Use the method can_match($struct) therefore.
       *
       * @access public
       * @param $search_query string A numerical query with placeholders as variable.
       * @param $placeholders array or string All placeholders in the query must be defined
       *                      here, and will be replaced by the matcher.
       */
      function NumericSearchQuery($search_query, $placeholders) {
          // added some basic security checks against user input
          $this->_query = $search_query;
          $this->_placeholders = $placeholders;

          // we should also allow the M_ constants
          $this->_allowed_functions = explode(':','abs:acos:acosh:asin:asinh:atan2:atan:atanh:base_convert:bindec:ceil:cos:cosh:decbin:dechex:decoct:deg2rad:exp:expm1:floor:fmod:getrandmax:hexdec:hypot:is_finite:is_infinite:is_nan:lcg_value:log10:log1p:log:max:min:mt_getrandmax:mt_rand:mt_srand:octdec:pi:pow:rad2deg:rand:round:sin:sinh:sqrt:srand:tan:tanh');
          $this->_allowed_operators = explode(',', '-,<,<=,>,>=,==,!=,*,+,/,(,),%,and,or,xor,<<,>>,===,!==,&,^,|,&&,||');
          $this->_parser_check = array();
          // check should be fast, so make a hash
          foreach ($this->_allowed_functions as $f)
              $this->_parser_check[$f] = 1;
          foreach ($this->_allowed_operators as $f)
              $this->_parser_check[$f] = 1;
          // The argument separator cannot be part of the comma-delimited list
          // above, yet max(), min(), pow(), atan2() ... need it.
          $this->_parser_check[','] = 1;
          if (is_array($placeholders))
              foreach ($placeholders as $f)
                  $this->_parser_check[$f] = 1;
          else $this->_parser_check[$placeholders] = 1;

          // This is a speciality: := looks like the attribute definition and is
          // therefore a dummy check for this definition.
          // php-4.2.2 has a problem with /\b:=\b/ matching "population := 1223400"
          $this->_query = preg_replace("/:=/", "==", $this->_query);
          $this->_query = $this->check_query($this->_query);
      }

      function getType() { return 'numeric'; }

      /**
       * Check the symbolic definition query against unwanted functions and characters.
       * "population < 20000 and area > 1000000" vs
       * "area > 1000000 and mail($me,file("/etc/passwd"),...)"
       * http://localhost/wikicvs/SemanticSearch?attribute=*&attr_op=<0 and find(1)>&s=-0.01&start_debug=1
       *
       * @param $query string The symbolic query.
       * @return string The query if it passed all checks, '' otherwise
       *                (an E_USER_WARNING is raised for each violation).
       */
      function check_query ($query) {
          $tmp = $query; // check for all function calls, in case the tokenizer is not available.
          // Capture the complete identifier in front of each '(' (including
          // underscores and one-letter names); 's' lets the remainder span newlines.
          while (preg_match('/\b([a-z_][a-z0-9_]*)\s*\((.*)$/is', $tmp, $m)) {
              if (!in_array($m[1], $this->_allowed_functions)
                  and !in_array($m[1], $this->_allowed_operators))
              {
                  trigger_error("Illegal function in query: ".$m[1], E_USER_WARNING);
                  return '';
              }
              $tmp = $m[2];
          }

          // Strictly check for illegal functions and operators, which are no placeholders.
          if (function_exists('token_get_all')) {
              // Use the long open tag: the short form is only tokenized as PHP
              // when short_open_tag is enabled.
              $parsed = token_get_all("<?php $query ?>");
              foreach ($parsed as $x) { // flat, non-recursive array
                  if (is_string($x)) {
                      // single char op or name
                      if (!isset($this->_parser_check[$x])) {
                          trigger_error("Illegal string or operator in query: \"$x\"", E_USER_WARNING);
                          $query = '';
                      }
                      continue;
                  }
                  $n = token_name($x[0]);
                  // NOTE(review): T_CONST is the token of the `const` keyword,
                  // not of a constant name; kept as in the original.
                  if ($n == 'T_OPEN_TAG' or $n == 'T_WHITESPACE'
                      or $n == 'T_CLOSE_TAG' or $n == 'T_LNUMBER'
                      or $n == 'T_CONST' or $n == 'T_DNUMBER' ) continue;
                  if ($n == 'T_VARIABLE') {
                      trigger_error("Illegal variable in query: \"$x[1]\"", E_USER_WARNING);
                      $query = '';
                      continue; // already reported, don't report it a second time below
                  }
                  if (!isset($this->_parser_check[$x[1]])) {
                      // multi-char char op or name
                      trigger_error("Illegal $n in query: \"$x[1]\"", E_USER_WARNING);
                      $query = '';
                  }
              }
              /*
               "_x > 0" =>
               { T_OPEN_TAG "<?php "} { T_STRING "_x"} { T_WHITESPACE " "} ">" { T_WHITESPACE " "} { T_LNUMBER "0"} { T_CLOSE_TAG "?>"}
               Interesting: one-char ops, as ">" are not tokenized.
               "_x <= 0"
               { T_OPEN_TAG "<?php " } { T_STRING "_x" } { T_WHITESPACE " " } { T_IS_SMALLER_OR_EQUAL "<=" } { T_WHITESPACE " " } { T_LNUMBER "0" } { T_CLOSE_TAG "?>" }
               */
          } else {
              // Detect illegal characters besides nums, words and ops.
              // So attribute names can not be utf-8
              $c = "/([^\d\w.,\s".preg_quote(join("",$this->_allowed_operators),"/")."])/";
              if (preg_match($c, $query, $m)) {
                  trigger_error("Illegal character in query: ".$m[1], E_USER_WARNING);
                  return '';
              }
          }
          return $query;
      }

      /**
       * Check the bound, numeric-only query against unwanted functions and sideeffects.
       * "4560000 < 20000 and 1456022 > 1000000"
       *
       * Currently this only verifies that the work query is not empty.
       */
      function _live_check () {
          // TODO: check $this->_workquery again?
          return !empty($this->_workquery);
      }

      /**
       * A numeric query can only operate with predefined variables. "x < 0 and y < 1"
       *
       * @return array The names as array of strings. => ('x', 'y') the placeholders.
       */
      function getVars() {
          if(is_array($this->_placeholders)) return $this->_placeholders;
          else return array($this->_placeholders);
      }

      /**
       * Replace the placeholder $x in the work query by $value.
       *
       * Integers, floats and numeric strings are normalised to a PHP number
       * literal, so exponent notation ("9.636e+08") and leading zeros ("08")
       * survive intact. Anything else is reduced to the characters
       * "-+0123456789.," as before.
       *
       * @access private
       * @param $value number A numerical value: integer, float or string.
       * @param $x string The variable name to be replaced in the query.
       * @return string The work query after the replacement.
       */
      function _bind($value, $x) {
          $this->_bound[] = array('linkname' => $x,
                                  'linkvalue' => $value);
          if (is_string($value) and is_numeric(trim($value)))
              $value = trim($value) + 0;
          if (is_int($value) or is_float($value))
              $value = var_export($value, true);
          else
              $value = preg_replace("/[^-+0123456789.,]/", "", $value);
          $this->_workquery = preg_replace("/\b".preg_quote($x,"/")."\b/", $value, $this->_workquery);
          // FIXME: do again a final check. now only numbers and some operators are allowed.
          return $this->_workquery;
      }

      /* array of the vars bound by the last match() call
       */
      function _bound() {
          return $this->_bound;
      }

      /**
       * With an array of placeholders we need a hash to check against, if all required names are given.
       * Purpose: Be silent about missing vars, just return false.
       *
       * @access public
       * @param $variables string or hash of name => value  The keys must satisfy all placeholders in the definition.
       * We want the full hash and not just the keys because a hash check is faster than the array of keys check.
       * @return boolean
       */
      function can_match(&$variables) {
          if (empty($this->_query))
              return false;
          $p =& $this->_placeholders;
          if (!is_array($variables) and !is_array($p))
              return $variables == $p; // This was easy.
          // Check if all placeholders have definitions. can be overdefined but not underdefined.
          if (!is_array($p)) {
              if (!isset($variables[$p])) return false;
          } else {
              foreach ($p as $x) {
                  if (!isset($variables[$x])) return false;
              }
          }
          return true;
      }

      /**
       * We can match against a single variable or against a hash of variables.
       * With one placeholder we need just a number.
       * With an array of placeholders we need a hash.
       *
       * @access public
       * @param $variable number or array of name => value  The keys must satisfy all placeholders in the definition.
       * @return mixed false if the query does not match (or is invalid); on a
       *               match, true for a single placeholder or the list of
       *               bindings for an array of placeholders.
       */
      function match(&$variable) {
          $p =& $this->_placeholders;
          $this->_workquery = $this->_query;
          // Start from a clean slate so bindings of earlier calls are not reported again.
          $this->_bound = array();
          if (!is_array($p)) {
              if (is_array($variable)) { // which var to match? we cannot decide this here
                  if (!isset($variable[$p])) {
                      trigger_error("Required NumericSearchQuery->match variable $p not defined.", E_USER_ERROR);
                      return false; // only reached if an error handler resumes
                  }
                  $this->_bind($variable[$p], $p);
              } else {
                  $this->_bind($variable, $p);
              }
          } else {
              foreach ($p as $x) {
                  if (!isset($variable[$x])) {
                      trigger_error("Required NumericSearchQuery->match variable $x not defined.", E_USER_ERROR);
                      return false; // only reached if an error handler resumes
                  }
                  $this->_bind($variable[$x], $x);
              }
          }
          if (!$this->_live_check()) // check returned an error
              return false;
          $search = $this->_workquery;
          $result = false;
          // We might have a numerical problem:
          // php-4.2.2 eval'ed as module: "9.636e+08 > 1000" false;
          // php-5.1.2 cgi true, 4.2.2 cgi true
          // SECURITY(review): eval() on a user-supplied expression. The only
          // protection is check_query(); any gap in that whitelist is code execution.
          eval("\$result = (bool)($search);");
          if ($result and is_array($p)) {
              return $this->_bound();
          }
          return $result;
      }
  }
  531. ////////////////////////////////////////////////////////////////
  532. //
  533. // Remaining classes are private.
  534. //
  535. ////////////////////////////////////////////////////////////////
  /**
   * Common ancestor of all nodes in a TextSearchQuery parse tree.
   *
   * Used directly, it is the 'VOID' node: a node without content.
   */
  class TextSearchQuery_node
  {
      var $op = 'VOID';

      /**
       * Optimize this node; the base node is returned as it is.
       * @return object Optimized node.
       */
      function optimize() {
          $optimized = $this;
          return $optimized;
      }

      /**
       * @return string Regexp fragment matching this node (empty for the base node).
       */
      function regexp() {
          $pattern = '';
          return $pattern;
      }

      /**
       * @param bool True if this node has been negated (higher in the parse tree.)
       * @return array A list of all non-negated words contained by this node;
       *               always empty for the base node.
       */
      function highlight_words($negated = false) {
          $words = array();
          return $words;
      }

      // Reads $this->word, which the base class itself never sets.
      function sql() {
          return $this->word;
      }
  }
  /**
   * A single word (substring match).
   */
  class TextSearchQuery_node_word
      extends TextSearchQuery_node
  {
      var $op = "WORD";

      function TextSearchQuery_node_word($word) {
          $this->word = $word;
      }

      // Lookahead that succeeds when the word occurs anywhere ahead.
      function regexp() {
          $quoted = preg_quote($this->word, '/');
          return '(?=.*' . $quoted . ')';
      }

      // A negated word is never highlighted.
      function highlight_words ($negated = false) {
          if ($negated)
              return array();
          return array($this->word);
      }

      // Put a backslash in front of %, _ and \ and then let the backend
      // (reached through the global request object) quote the string.
      function _sql_quote() {
          $escaped = preg_replace('/(?=[%_\\\\])/', "\\", $this->word);
          return $GLOBALS['request']->_dbi->_backend->qstr($escaped);
      }

      function sql() {
          $quoted = $this->_sql_quote($this->word);
          return '%' . $quoted . '%';
      }
  }
  // The 'ALL' node.
  class TextSearchQuery_node_all
      extends TextSearchQuery_node
  {
      var $op = "ALL";

      function regexp() {
          return '(?=.*)';
      }

      function sql() {
          return '%';
      }
  }
  // Word that must begin at a word boundary; the SQL pattern is "word%".
  class TextSearchQuery_node_starts_with
      extends TextSearchQuery_node_word
  {
      var $op = "STARTS_WITH";

      function regexp() {
          $quoted = preg_quote($this->word, '/');
          return '(?=.*\b' . $quoted . ')';
      }

      function sql () {
          $quoted = $this->_sql_quote($this->word);
          return $quoted . '%';
      }
  }
  // Word that must end at a word boundary; the SQL pattern is "%word".
  class TextSearchQuery_node_ends_with
      extends TextSearchQuery_node_word
  {
      var $op = "ENDS_WITH";

      function regexp() {
          $quoted = preg_quote($this->word, '/');
          return '(?=.*' . $quoted . '\b)';
      }

      function sql () {
          $quoted = $this->_sql_quote($this->word);
          return '%' . $quoted;
      }
  }
  // Word delimited by word boundaries on both sides; the SQL pattern is the
  // quoted word without wildcards.
  class TextSearchQuery_node_exact
      extends TextSearchQuery_node_word
  {
      var $op = "EXACT";

      function regexp() {
          return '(?=\b' . preg_quote($this->word, '/') . '\b)';
      }

      // Was calling _sql_squote(), which is not defined in the visible
      // hierarchy; the inherited quoting helper is _sql_quote().
      function sql () {
          return $this->_sql_quote($this->word);
      }
  }
  // posix regex. FIXME!
  // The word is placed into the pattern unquoted.
  class TextSearchQuery_node_regex
      extends TextSearchQuery_node_word
  {
      var $op = "REGEX"; // using REGEXP or ~ extension

      function regexp() {
          $pattern = $this->word;
          return '(?=.*\b' . $pattern . '\b)';
      }

      function sql () {
          $quoted = $this->_sql_quote($this->word);
          return $quoted;
      }
  }
  // Glob pattern; glob_to_pcre() (defined elsewhere) converts it for PCRE use.
  class TextSearchQuery_node_regex_glob
      extends TextSearchQuery_node_regex
  {
      var $op = "REGEX_GLOB";

      function regexp() {
          $pattern = glob_to_pcre($this->word);
          return '(?=.*\b' . $pattern . '\b)';
      }
  }
  // how to handle pcre modifiers? /i
  // The word already is the PCRE pattern and is returned untouched.
  class TextSearchQuery_node_regex_pcre
      extends TextSearchQuery_node_regex
  {
      var $op = "REGEX_PCRE";

      function regexp() {
          $pattern = $this->word;
          return $pattern;
      }
  }
  // SQL LIKE pattern: used verbatim for SQL, and translated for PCRE.
  class TextSearchQuery_node_regex_sql
      extends TextSearchQuery_node_regex
  {
      var $op = "REGEX_SQL"; // using LIKE

      // Translate the LIKE wildcards: % => .* and _ => .
      // str_replace() takes literal needles; the former "/%/" and "/_/"
      // (regex-delimited) never matched, so nothing was translated.
      function regexp() {
          return str_replace(array("%", "_"), array(".*", "."), $this->word);
      }

      function sql() {
          return $this->word;
      }
  }
  /**
   * Negation of a single clause, which is kept in leaves[0].
   */
  class TextSearchQuery_node_not
      extends TextSearchQuery_node
  {
      var $op = "NOT";

      function TextSearchQuery_node_not($leaf) {
          $this->leaves = array($leaf);
      }

      // Optimize the operand first, then collapse a double negation.
      function optimize() {
          $this->leaves[0] = $this->leaves[0]->optimize();
          $inner = $this->leaves[0];
          if ($inner->op != 'NOT')
              return $this;
          return $inner->leaves[0]; // ( NOT ( NOT x ) ) -> x
      }

      // Wrap the operand's pattern into a negative lookahead.
      function regexp() {
          $inner = $this->leaves[0]->regexp();
          return '(?!' . $inner . ')';
      }

      // Below a NOT the negation state flips.
      function highlight_words ($negated = false) {
          $flipped = !$negated;
          return $this->leaves[0]->highlight_words($flipped);
      }
  }
  /**
   * Virtual base class for 'AND' and 'OR' conjoins.
   */
  class TextSearchQuery_node_binop
      extends TextSearchQuery_node
  {
      function TextSearchQuery_node_binop($leaves) {
          $this->leaves = $leaves;
      }

      // Optimize every child and pull the children of same-op children up, e.g.
      // (AND (AND a b) (OR c d) e) becomes (AND a b (OR c d) e).
      function _flatten() {
          $merged = array();
          foreach ($this->leaves as $child) {
              $optimized = $child->optimize();
              if ($optimized->op != $this->op) {
                  $merged[] = $optimized;
                  continue;
              }
              $merged = array_merge($merged, $optimized->leaves);
          }
          $this->leaves = $merged;
      }

      function optimize() {
          $this->_flatten();
          assert(!empty($this->leaves));
          // (AND x) -> x
          return (count($this->leaves) == 1) ? $this->leaves[0] : $this;
      }

      // Collect the words of all children; each child's words are put in
      // front of those collected so far.
      function highlight_words($negated = false) {
          $collected = array();
          foreach ($this->leaves as $child) {
              $collected = array_merge($child->highlight_words($negated), $collected);
          }
          return $collected;
      }
  }
  /**
   * An 'AND' conjoin of one or more clauses.
   */
  class TextSearchQuery_node_and
      extends TextSearchQuery_node_binop
  {
      var $op = "AND";

      // Flatten, then turn (AND (NOT a) (NOT b) c d) into (AND (NOT (OR a b)) c d),
      // because one alternation is cheaper for regexp matching:
      //   (?!.*a)(?!.*b)  vs  (?!.*(?:a|b))
      function optimize() {
          $this->_flatten();

          // Split the children into negated operands and everything else.
          $negated = array();
          $others = array();
          foreach ($this->leaves as $child) {
              if ($child->op == 'NOT')
                  $negated[] = $child->leaves[0];
              else
                  $others[] = $child;
          }

          // The combined negation goes first.
          if ($negated) {
              $combined = new TextSearchQuery_node_not(
                  new TextSearchQuery_node_or($negated));
              $this->leaves = array_merge(array($combined->optimize()), $others);
          }

          assert(!empty($this->leaves));
          if (count($this->leaves) != 1)
              return $this;
          return $this->leaves[0]; // (AND x) -> x
      }

      /* FIXME!
       * Either we need all combinations of all words to be position independent,
       * or we have to use multiple match calls for each AND
       * (AND x y) => /(?(:x)(:y))|(?(:y)(:x))/
       */
      // Concatenate the patterns of all children.
      function regexp() {
          $parts = array();
          foreach ($this->leaves as $child)
              $parts[] = $child->regexp();
          return join('', $parts);
      }
  }
  /**
   * An 'OR' conjoin of one or more clauses.
   */
  class TextSearchQuery_node_or
      extends TextSearchQuery_node_binop
  {
      var $op = "OR";

      // Direct WORD children are merged into one (?=.*(?:word1|word2|...))
      // lookahead, which is placed in front of the patterns of all other children.
      function regexp() {
          $word_alternatives = array();
          $other_patterns = array();
          foreach ($this->leaves as $child) {
              if ($child->op != 'WORD') {
                  $other_patterns[] = $child->regexp();
                  continue;
              }
              $word_alternatives[] = preg_quote($child->word, '/');
          }
          if (!$word_alternatives)
              return $this->_join($other_patterns);
          $combined = '(?=.*' . $this->_join($word_alternatives) . ')';
          return $this->_join(array_merge(array($combined), $other_patterns));
      }

      // Build a non-capturing alternation; a single pattern is returned bare.
      function _join($regexps) {
          $total = count($regexps);
          assert($total > 0);
          if ($total <= 1)
              return $regexps[0];
          return '(?:' . join('|', $regexps) . ')';
      }
  }
  770. ////////////////////////////////////////////////////////////////
  771. //
  772. // Parser:
  773. // op's (and, or, not) are forced to lowercase in the tokenizer.
  774. //
  775. ////////////////////////////////////////////////////////////////
  // Token types, one bit each.
  define('TSQ_TOK_BINOP',       1 << 0);
  define('TSQ_TOK_NOT',         1 << 1);
  define('TSQ_TOK_LPAREN',      1 << 2);
  define('TSQ_TOK_RPAREN',      1 << 3);
  define('TSQ_TOK_WORD',        1 << 4);
  define('TSQ_TOK_STARTS_WITH', 1 << 5);
  define('TSQ_TOK_ENDS_WITH',   1 << 6);
  define('TSQ_TOK_EXACT',       1 << 7);
  define('TSQ_TOK_REGEX',       1 << 8);
  define('TSQ_TOK_REGEX_GLOB',  1 << 9);
  define('TSQ_TOK_REGEX_PCRE',  1 << 10);
  define('TSQ_TOK_REGEX_SQL',   1 << 11);
  define('TSQ_TOK_ALL',         1 << 12);
  // all bits from word to the last.
  define('TSQ_ALLWORDS', (TSQ_TOK_ALL << 1) - TSQ_TOK_WORD);
  class TextSearchQuery_Parser
  {
      /*
       * This is a simple recursive descent parser, based on the following grammar:
       *
       * toplist :
       *         | toplist expr
       *         ;
       *
       *
       * list    : expr
       *         | list expr
       *         ;
       *
       * expr    : atom
       *         | expr BINOP atom
       *         ;
       *
       * atom    : '(' list ')'
       *         | NOT atom
       *         | WORD
       *         ;
       *
       * The terminal tokens are:
       *
       *
       * and|or                BINOP
       * -|not                 NOT
       * (                     LPAREN
       * )                     RPAREN
       * /[^-()\s][^()\s]*     WORD
       * /"[^"]*"/             WORD
       * /'[^']*'/             WORD
       *
       * ^WORD                 STARTS_WITH
       * WORD*                 STARTS_WITH
       * *WORD                 ENDS_WITH
       * ^WORD$                EXACT
       * *                     ALL
       */

      /**
       * Parse a search expression into a node tree.
       *
       * @param $search_expr string The query string.
       * @param $case_exact boolean Passed on to the lexer.
       * @param $regex integer One of the TSQ_REGEX_* values, passed on to the lexer.
       * @return object Root node of the (not yet optimized) parse tree.
       */
      function parse ($search_expr, $case_exact=false, $regex=TSQ_REGEX_AUTO) {
          $this->lexer = new TextSearchQuery_Lexer($search_expr, $case_exact, $regex);
          $this->_regex = $regex;
          $tree = $this->get_list('toplevel');
          assert($this->lexer->eof());
          unset($this->lexer);
          return $tree;
      }

      /**
       * list : a run of expr's, joined into one AND node.
       *
       * @param $is_toplevel mixed Truthy for the outermost list, where stray
       *        parentheses are also accepted as words.
       * @return mixed AND node; for an empty list a VOID node at top level,
       *         false otherwise.
       */
      function get_list ($is_toplevel = false) {
          $list = array();
          // token types we'll accept as words (and thus expr's) for the
          // purpose of error recovery:
          $accept_as_words = TSQ_TOK_NOT | TSQ_TOK_BINOP;
          if ($is_toplevel)
              $accept_as_words |= TSQ_TOK_LPAREN | TSQ_TOK_RPAREN;
          while ( ($expr = $this->get_expr())
                  || ($expr = $this->get_word($accept_as_words)) ) {
              $list[] = $expr;
          }
          if (!$list) {
              if ($is_toplevel)
                  return new TextSearchQuery_node;
              else
                  return false;
          }
          return new TextSearchQuery_node_and($list);
      }

      /**
       * expr : atom (BINOP atom)*, left-associative.
       * A trailing operator without right operand is left in the token stream.
       */
      function get_expr () {
          if ( !($expr = $this->get_atom()) )
              return false;
          $savedpos = $this->lexer->tell();
          while ( ($op = $this->lexer->get(TSQ_TOK_BINOP)) ) {
              if ( ! ($right = $this->get_atom()) ) {
                  break;
              }
              if ($op == 'and')
                  $expr = new TextSearchQuery_node_and(array($expr, $right));
              else {
                  assert($op == 'or');
                  $expr = new TextSearchQuery_node_or(array($expr, $right));
              }
              $savedpos = $this->lexer->tell();
          }
          // Undo the read of an operator that had no right-hand atom.
          $this->lexer->seek($savedpos);
          return $expr;
      }

      /**
       * atom : word | '(' list ')' | NOT atom.
       * On failure the lexer position is restored and false is returned.
       */
      function get_atom() {
          if ($word = $this->get_word(TSQ_ALLWORDS))
              return $word;
          $savedpos = $this->lexer->tell();
          if ( $this->lexer->get(TSQ_TOK_LPAREN) ) {
              if ( ($list = $this->get_list()) && $this->lexer->get(TSQ_TOK_RPAREN) )
                  return $list;
          }
          elseif ( $this->lexer->get(TSQ_TOK_NOT) ) {
              if ( ($atom = $this->get_atom()) )
                  return new TextSearchQuery_node_not($atom);
          }
          $this->lexer->seek($savedpos);
          return false;
      }

      /**
       * Read one token of an accepted type and wrap it into its node class.
       *
       * @param $accept integer Bit mask of TSQ_TOK_* types to accept.
       * @return mixed The node, or false if the next token is not accepted.
       */
      function get_word($accept = TSQ_ALLWORDS) {
          // REGEX_SQL was missing from this list although the token constant
          // and the node class exist.
          // NOTE(review): presumably the lexer can emit TSQ_TOK_REGEX_SQL - confirm.
          foreach (array("WORD","STARTS_WITH","ENDS_WITH","EXACT",
                         "REGEX","REGEX_GLOB","REGEX_PCRE","REGEX_SQL","ALL") as $tok) {
              $const = constant("TSQ_TOK_".$tok);
              if ( $accept & $const and ($word = $this->lexer->get($const)) ) {
                  $classname = "TextSearchQuery_node_".strtolower($tok);
                  return new $classname($word);
              }
          }
          // Error recovery: get_list() asks for misplaced operators and
          // parentheses to be taken as plain words. The loop above never looks
          // at those bits, so such tokens used to end the parse early.
          foreach (array(TSQ_TOK_NOT, TSQ_TOK_BINOP,
                         TSQ_TOK_LPAREN, TSQ_TOK_RPAREN) as $const) {
              if ( $accept & $const and ($word = $this->lexer->get($const)) )
                  return new TextSearchQuery_node_word($word);
          }
          return false;
      }
  }
  904. class TextSearchQuery_Lexer {
  905. function TextSearchQuery_Lexer ($query_str, $case_exact=false, $regex=TSQ_REGEX_AUTO) {
  906. $this->tokens = $this->tokenize($query_str, $case_exact, $regex);
  907. $this->pos = 0;
  908. }
  909. function tell() {
  910. return $this->pos;
  911. }
  912. function seek($pos) {
  913. $this->pos = $pos;
  914. }
  915. function eof() {
  916. return $this->pos == count($this->tokens);
  917. }
  918. /**
  919. * TODO: support more regex styles, esp. prefer the forced ones over auto
  920. * re: and // stuff
  921. */
  922. function tokenize($string, $case_exact=false, $regex=TSQ_REGEX_AUTO) {
  923. $tokens = array();
  924. $buf = $case_exact ? ltrim($string) : strtolower(ltrim($string));
  925. while (!empty($buf)) {
  926. if (preg_match('/^(and|or)\b\s*/i', $buf, $m)) {
  927. $val = strtolower($m[1]);
  928. $type = TSQ_TOK_BINOP;
  929. }
  930. elseif (preg_match('/^(-|not\b)\s*/i', $buf, $m)) {
  931. $val = strtolower($m[1]);
  932. $type = TSQ_TOK_NOT;
  933. }
  934. elseif (preg_match('/^([()])\s*/', $buf, $m)) {
  935. $val = $m[1];
  936. $type = $m[1] == '(' ? TSQ_TOK_LPAREN : TSQ_TOK_RPAREN;
  937. }
  938. // * => ALL
  939. elseif ($regex & (TSQ_REGEX_AUTO|TSQ_REGEX_POSIX|TSQ_REGEX_GLOB)
  940. and preg_match('/^\*\s*/', $buf, $m)) {
  941. $val = "*";
  942. $type = TSQ_TOK_ALL;
  943. }
  944. // .* => ALL
  945. elseif ($regex & (TSQ_REGEX_PCRE)
  946. and preg_match('/^\.\*\s*/', $buf, $m)) {
  947. $val = ".*";
  948. $type = TSQ_TOK_ALL;
  949. }
  950. // % => ALL
  951. elseif ($regex & (TSQ_REGEX_SQL)
  952. and preg_match('/^%\s*/', $buf, $m)) {
  953. $val = "%";
  954. $type = TSQ_TOK_ALL;
  955. }
  956. // ^word
  957. elseif ($regex & (TSQ_REGEX_AUTO|TSQ_REGEX_POSIX|TSQ_REGEX_PCRE)
  958. and preg_match('/^\^([^-()][^()\s]*)\s*/', $buf, $m)) {
  959. $val = $m[1];
  960. $type = TSQ_TOK_STARTS_WITH;
  961. }
  962. // word*
  963. elseif ($regex & (TSQ_REGEX_AUTO|TSQ_REGEX_POSIX|TSQ_REGEX_GLOB)
  964. and preg_match('/^([^-()][^()\s]*)\*\s*/', $buf, $m)) {
  965. $val = $m[1];
  966. $type = TSQ_TOK_STARTS_WITH;
  967. }
  968. // *word
  969. elseif ($regex & (TSQ_REGEX_AUTO|TSQ_REGEX_POSIX|TSQ_REGEX_GLOB)
  970. and preg_match('/^\*([^-()][^()\s]*)\s*/', $buf, $m)) {
  971. $val = $m[1];
  972. $type = TSQ_TOK_ENDS_WITH;
  973. }
  974. // word$
  975. elseif ($regex & (TSQ_REGEX_AUTO|TSQ_REGEX_POSIX|TSQ_REGEX_PCRE)
  976. and preg_match('/^([^-()][^()\s]*)\$\s*/', $buf, $m)) {
  977. $val = $m[1];
  978. $type = TSQ_TOK_ENDS_WITH;
  979. }
  980. // ^word$
  981. elseif ($regex & (TSQ_REGEX_AUTO|TSQ_REGEX_POSIX|TSQ_REGEX_PCRE)
  982. and preg_match('/^\^([^-()][^()\s]*)\$\s*/', $buf, $m)) {
  983. $val = $m[1];
  984. $type = TSQ_TOK_EXACT;
  985. }
  986. // "words "
  987. elseif (preg_match('/^ " ( (?: [^"]+ | "" )* ) " \s*/x', $buf, $m)) {
  988. $val = str_replace('""', '"', $m[1]);
  989. $type = TSQ_TOK_WORD;
  990. }
  991. // 'words '
  992. elseif (preg_match("/^ ' ( (?:[^']+|'')* ) ' \s*/x", $buf, $m)) {
  993. $val = str_replace("''", "'", $m[1]);
  994. $type = TSQ_TOK_WORD;
  995. }
  996. // word
  997. elseif (preg_match('/^([^-()][^()\s]*)\s*/', $buf, $m)) {
  998. $val = $m[1];
  999. $type = TSQ_TOK_WORD;
  1000. }
  1001. else {
  1002. assert(empty($buf));
  1003. break;
  1004. }
  1005. $buf = substr($buf, strlen($m[0]));
  1006. /* refine the simple parsing from above: bla*bla, bla?bla, ...
  1007. if ($regex and $type == TSQ_TOK_WORD) {
  1008. if (substr($val,0,1) == "^")
  1009. $type = TSQ_TOK_STARTS_WITH;
  1010. elseif (substr($val,0,1) == "*")
  1011. $type = TSQ_TOK_ENDS_WITH;
  1012. elseif (substr($val,-1,1) == "*")
  1013. $type = TSQ_TOK_STARTS_WITH;
  1014. }
  1015. */
  1016. $tokens[] = array($type, $val);
  1017. }
  1018. return $tokens;
  1019. }
  1020. function get($accept) {
  1021. if ($this->pos >= count($this->tokens))
  1022. return false;
  1023. list ($type, $val) = $this->tokens[$this->pos];
  1024. if (($type & $accept) == 0)
  1025. return false;
  1026. $this->pos++;
  1027. return $val;
  1028. }
  1029. }
  1030. // $Log: TextSearchQuery.php,v $
  1031. // Revision 1.28 2007/03/18 17:35:26 rurban
  1032. // Improve comments
  1033. //
  1034. // Revision 1.27 2007/01/21 23:27:32 rurban
  1035. // Fix ->_backend->qstr()
  1036. //
  1037. // Revision 1.26 2007/01/04 16:41:52 rurban
  1038. // Improve error description. Fix the function parser for illegal functions, when the tokenizer cannot be used.
  1039. //
  1040. // Revision 1.25 2007/01/03 21:22:34 rurban
  1041. // add getType(). NumericSearchQuery::check Improve hacker detection using token_get_all(). Better support for multiple attributes. Add getVars().
  1042. //
  1043. // Revision 1.24 2007/01/02 13:19:05 rurban
  1044. // add NumericSearchQuery. change on pcre: no parsing done, detect modifiers
  1045. //
  1046. // Revision 1.23 2006/04/13 19:30:44 rurban
  1047. // make TextSearchQuery->_stoplist localizable and overridable within config.ini
  1048. //
  1049. // Local Variables:
  1050. // mode: php
  1051. // tab-width: 8
  1052. // c-basic-offset: 4
  1053. // c-hanging-comment-ender-p: nil
  1054. // indent-tabs-mode: nil
  1055. // End:
  1056. ?>