PageRenderTime 1167ms CodeModel.GetById 36ms RepoModel.GetById 10ms app.codeStats 0ms

/wwwroot/phpbb/phpbb/search/fulltext_native.php

https://github.com/spring/spring-website
PHP | 1920 lines | 1343 code | 209 blank | 368 comment | 173 complexity | 38d86ec76d4da1798fcc987429553569 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, Apache-2.0, LGPL-3.0, BSD-3-Clause
  1. <?php
  2. /**
  3. *
  4. * This file is part of the phpBB Forum Software package.
  5. *
  6. * @copyright (c) phpBB Limited <https://www.phpbb.com>
  7. * @license GNU General Public License, version 2 (GPL-2.0)
  8. *
  9. * For full copyright and license information, please see
  10. * the docs/CREDITS.txt file.
  11. *
  12. */
  13. namespace phpbb\search;
  14. /**
  15. * phpBB's own db driven fulltext search, version 2
  16. */
  17. class fulltext_native extends \phpbb\search\base
  18. {
  19. /**
  20. * Associative array holding index stats
  21. * @var array
  22. */
  23. protected $stats = array();
  24. /**
  25. * Associative array stores the min and max word length to be searched
  26. * @var array
  27. */
  28. protected $word_length = array();
  29. /**
  30. * Contains tidied search query.
  31. * Operators are prefixed in search query and common words excluded
  32. * @var string
  33. */
  34. protected $search_query;
  35. /**
  36. * Contains common words.
  37. * Common words are words with length less/more than min/max length
  38. * @var array
  39. */
  40. protected $common_words = array();
  41. /**
  42. * Post ids of posts containing words that are to be included
  43. * @var array
  44. */
  45. protected $must_contain_ids = array();
  46. /**
  47. * Post ids of posts containing words that should not be included
  48. * @var array
  49. */
  50. protected $must_not_contain_ids = array();
  51. /**
  52. * Post ids of posts containing atleast one word that needs to be excluded
  53. * @var array
  54. */
  55. protected $must_exclude_one_ids = array();
  56. /**
  57. * Relative path to board root
  58. * @var string
  59. */
  60. protected $phpbb_root_path;
  61. /**
  62. * PHP Extension
  63. * @var string
  64. */
  65. protected $php_ext;
  66. /**
  67. * Config object
  68. * @var \phpbb\config\config
  69. */
  70. protected $config;
  71. /**
  72. * Database connection
  73. * @var \phpbb\db\driver\driver_interface
  74. */
  75. protected $db;
  76. /**
  77. * phpBB event dispatcher object
  78. * @var \phpbb\event\dispatcher_interface
  79. */
  80. protected $phpbb_dispatcher;
  81. /**
  82. * User object
  83. * @var \phpbb\user
  84. */
  85. protected $user;
  86. /**
  87. * Initialises the fulltext_native search backend with min/max word length and makes sure the UTF-8 normalizer is loaded
  88. *
  89. * @param boolean|string &$error is passed by reference and should either be set to false on success or an error message on failure
  90. * @param \phpbb\event\dispatcher_interface $phpbb_dispatcher Event dispatcher object
  91. */
  92. public function __construct(&$error, $phpbb_root_path, $phpEx, $auth, $config, $db, $user, $phpbb_dispatcher)
  93. {
  94. $this->phpbb_root_path = $phpbb_root_path;
  95. $this->php_ext = $phpEx;
  96. $this->config = $config;
  97. $this->db = $db;
  98. $this->phpbb_dispatcher = $phpbb_dispatcher;
  99. $this->user = $user;
  100. $this->word_length = array('min' => $this->config['fulltext_native_min_chars'], 'max' => $this->config['fulltext_native_max_chars']);
  101. /**
  102. * Load the UTF tools
  103. */
  104. if (!class_exists('utf_normalizer'))
  105. {
  106. include($this->phpbb_root_path . 'includes/utf/utf_normalizer.' . $this->php_ext);
  107. }
  108. if (!function_exists('utf8_decode_ncr'))
  109. {
  110. include($this->phpbb_root_path . 'includes/utf/utf_tools.' . $this->php_ext);
  111. }
  112. $error = false;
  113. }
  114. /**
  115. * Returns the name of this search backend to be displayed to administrators
  116. *
  117. * @return string Name
  118. */
  119. public function get_name()
  120. {
  121. return 'phpBB Native Fulltext';
  122. }
  123. /**
  124. * Returns the search_query
  125. *
  126. * @return string search query
  127. */
  128. public function get_search_query()
  129. {
  130. return $this->search_query;
  131. }
  132. /**
  133. * Returns the common_words array
  134. *
  135. * @return array common words that are ignored by search backend
  136. */
  137. public function get_common_words()
  138. {
  139. return $this->common_words;
  140. }
  141. /**
  142. * Returns the word_length array
  143. *
  144. * @return array min and max word length for searching
  145. */
  146. public function get_word_length()
  147. {
  148. return $this->word_length;
  149. }
  150. /**
  151. * This function fills $this->search_query with the cleaned user search query
  152. *
  153. * If $terms is 'any' then the words will be extracted from the search query
  154. * and combined with | inside brackets. They will afterwards be treated like
  155. * an standard search query.
  156. *
  157. * Then it analyses the query and fills the internal arrays $must_not_contain_ids,
  158. * $must_contain_ids and $must_exclude_one_ids which are later used by keyword_search()
  159. *
  160. * @param string $keywords contains the search query string as entered by the user
  161. * @param string $terms is either 'all' (use search query as entered, default words to 'must be contained in post')
  162. * or 'any' (find all posts containing at least one of the given words)
  163. * @return boolean false if no valid keywords were found and otherwise true
  164. */
  165. public function split_keywords($keywords, $terms)
  166. {
  167. $tokens = '+-|()*';
  168. $keywords = trim($this->cleanup($keywords, $tokens));
  169. // allow word|word|word without brackets
  170. if ((strpos($keywords, ' ') === false) && (strpos($keywords, '|') !== false) && (strpos($keywords, '(') === false))
  171. {
  172. $keywords = '(' . $keywords . ')';
  173. }
  174. $open_bracket = $space = false;
  175. for ($i = 0, $n = strlen($keywords); $i < $n; $i++)
  176. {
  177. if ($open_bracket !== false)
  178. {
  179. switch ($keywords[$i])
  180. {
  181. case ')':
  182. if ($open_bracket + 1 == $i)
  183. {
  184. $keywords[$i - 1] = '|';
  185. $keywords[$i] = '|';
  186. }
  187. $open_bracket = false;
  188. break;
  189. case '(':
  190. $keywords[$i] = '|';
  191. break;
  192. case '+':
  193. case '-':
  194. case ' ':
  195. $keywords[$i] = '|';
  196. break;
  197. case '*':
  198. if ($i === 0 || ($keywords[$i - 1] !== '*' && strcspn($keywords[$i - 1], $tokens) === 0))
  199. {
  200. if ($i === $n - 1 || ($keywords[$i + 1] !== '*' && strcspn($keywords[$i + 1], $tokens) === 0))
  201. {
  202. $keywords = substr($keywords, 0, $i) . substr($keywords, $i + 1);
  203. }
  204. }
  205. break;
  206. }
  207. }
  208. else
  209. {
  210. switch ($keywords[$i])
  211. {
  212. case ')':
  213. $keywords[$i] = ' ';
  214. break;
  215. case '(':
  216. $open_bracket = $i;
  217. $space = false;
  218. break;
  219. case '|':
  220. $keywords[$i] = ' ';
  221. break;
  222. case '-':
  223. case '+':
  224. $space = $keywords[$i];
  225. break;
  226. case ' ':
  227. if ($space !== false)
  228. {
  229. $keywords[$i] = $space;
  230. }
  231. break;
  232. default:
  233. $space = false;
  234. }
  235. }
  236. }
  237. if ($open_bracket)
  238. {
  239. $keywords .= ')';
  240. }
  241. $match = array(
  242. '# +#',
  243. '#\|\|+#',
  244. '#(\+|\-)(?:\+|\-)+#',
  245. '#\(\|#',
  246. '#\|\)#',
  247. );
  248. $replace = array(
  249. ' ',
  250. '|',
  251. '$1',
  252. '(',
  253. ')',
  254. );
  255. $keywords = preg_replace($match, $replace, $keywords);
  256. $num_keywords = sizeof(explode(' ', $keywords));
  257. // We limit the number of allowed keywords to minimize load on the database
  258. if ($this->config['max_num_search_keywords'] && $num_keywords > $this->config['max_num_search_keywords'])
  259. {
  260. trigger_error($this->user->lang('MAX_NUM_SEARCH_KEYWORDS_REFINE', (int) $this->config['max_num_search_keywords'], $num_keywords));
  261. }
  262. // $keywords input format: each word separated by a space, words in a bracket are not separated
  263. // the user wants to search for any word, convert the search query
  264. if ($terms == 'any')
  265. {
  266. $words = array();
  267. preg_match_all('#([^\\s+\\-|()]+)(?:$|[\\s+\\-|()])#u', $keywords, $words);
  268. if (sizeof($words[1]))
  269. {
  270. $keywords = '(' . implode('|', $words[1]) . ')';
  271. }
  272. }
  273. // set the search_query which is shown to the user
  274. $this->search_query = $keywords;
  275. $exact_words = array();
  276. preg_match_all('#([^\\s+\\-|()]+)(?:$|[\\s+\\-|()])#u', $keywords, $exact_words);
  277. $exact_words = $exact_words[1];
  278. $common_ids = $words = array();
  279. if (sizeof($exact_words))
  280. {
  281. $sql = 'SELECT word_id, word_text, word_common
  282. FROM ' . SEARCH_WORDLIST_TABLE . '
  283. WHERE ' . $this->db->sql_in_set('word_text', $exact_words) . '
  284. ORDER BY word_count ASC';
  285. $result = $this->db->sql_query($sql);
  286. // store an array of words and ids, remove common words
  287. while ($row = $this->db->sql_fetchrow($result))
  288. {
  289. if ($row['word_common'])
  290. {
  291. $this->common_words[] = $row['word_text'];
  292. $common_ids[$row['word_text']] = (int) $row['word_id'];
  293. continue;
  294. }
  295. $words[$row['word_text']] = (int) $row['word_id'];
  296. }
  297. $this->db->sql_freeresult($result);
  298. }
  299. // Handle +, - without preceeding whitespace character
  300. $match = array('#(\S)\+#', '#(\S)-#');
  301. $replace = array('$1 +', '$1 +');
  302. $keywords = preg_replace($match, $replace, $keywords);
  303. // now analyse the search query, first split it using the spaces
  304. $query = explode(' ', $keywords);
  305. $this->must_contain_ids = array();
  306. $this->must_not_contain_ids = array();
  307. $this->must_exclude_one_ids = array();
  308. $mode = '';
  309. $ignore_no_id = true;
  310. foreach ($query as $word)
  311. {
  312. if (empty($word))
  313. {
  314. continue;
  315. }
  316. // words which should not be included
  317. if ($word[0] == '-')
  318. {
  319. $word = substr($word, 1);
  320. // a group of which at least one may not be in the resulting posts
  321. if ($word[0] == '(')
  322. {
  323. $word = array_unique(explode('|', substr($word, 1, -1)));
  324. $mode = 'must_exclude_one';
  325. }
  326. // one word which should not be in the resulting posts
  327. else
  328. {
  329. $mode = 'must_not_contain';
  330. }
  331. $ignore_no_id = true;
  332. }
  333. // words which have to be included
  334. else
  335. {
  336. // no prefix is the same as a +prefix
  337. if ($word[0] == '+')
  338. {
  339. $word = substr($word, 1);
  340. }
  341. // a group of words of which at least one word should be in every resulting post
  342. if ($word[0] == '(')
  343. {
  344. $word = array_unique(explode('|', substr($word, 1, -1)));
  345. }
  346. $ignore_no_id = false;
  347. $mode = 'must_contain';
  348. }
  349. if (empty($word))
  350. {
  351. continue;
  352. }
  353. // if this is an array of words then retrieve an id for each
  354. if (is_array($word))
  355. {
  356. $non_common_words = array();
  357. $id_words = array();
  358. foreach ($word as $i => $word_part)
  359. {
  360. if (strpos($word_part, '*') !== false)
  361. {
  362. $id_words[] = '\'' . $this->db->sql_escape(str_replace('*', '%', $word_part)) . '\'';
  363. $non_common_words[] = $word_part;
  364. }
  365. else if (isset($words[$word_part]))
  366. {
  367. $id_words[] = $words[$word_part];
  368. $non_common_words[] = $word_part;
  369. }
  370. else
  371. {
  372. $len = utf8_strlen($word_part);
  373. if ($len < $this->word_length['min'] || $len > $this->word_length['max'])
  374. {
  375. $this->common_words[] = $word_part;
  376. }
  377. }
  378. }
  379. if (sizeof($id_words))
  380. {
  381. sort($id_words);
  382. if (sizeof($id_words) > 1)
  383. {
  384. $this->{$mode . '_ids'}[] = $id_words;
  385. }
  386. else
  387. {
  388. $mode = ($mode == 'must_exclude_one') ? 'must_not_contain' : $mode;
  389. $this->{$mode . '_ids'}[] = $id_words[0];
  390. }
  391. }
  392. // throw an error if we shall not ignore unexistant words
  393. else if (!$ignore_no_id && sizeof($non_common_words))
  394. {
  395. trigger_error(sprintf($this->user->lang['WORDS_IN_NO_POST'], implode($this->user->lang['COMMA_SEPARATOR'], $non_common_words)));
  396. }
  397. unset($non_common_words);
  398. }
  399. // else we only need one id
  400. else if (($wildcard = strpos($word, '*') !== false) || isset($words[$word]))
  401. {
  402. if ($wildcard)
  403. {
  404. $len = utf8_strlen(str_replace('*', '', $word));
  405. if ($len >= $this->word_length['min'] && $len <= $this->word_length['max'])
  406. {
  407. $this->{$mode . '_ids'}[] = '\'' . $this->db->sql_escape(str_replace('*', '%', $word)) . '\'';
  408. }
  409. else
  410. {
  411. $this->common_words[] = $word;
  412. }
  413. }
  414. else
  415. {
  416. $this->{$mode . '_ids'}[] = $words[$word];
  417. }
  418. }
  419. else
  420. {
  421. if (!isset($common_ids[$word]))
  422. {
  423. $len = utf8_strlen($word);
  424. if ($len < $this->word_length['min'] || $len > $this->word_length['max'])
  425. {
  426. $this->common_words[] = $word;
  427. }
  428. }
  429. }
  430. }
  431. // Return true if all words are not common words
  432. if (sizeof($exact_words) - sizeof($this->common_words) > 0)
  433. {
  434. return true;
  435. }
  436. return false;
  437. }
  438. /**
  439. * Performs a search on keywords depending on display specific params. You have to run split_keywords() first
  440. *
  441. * @param string $type contains either posts or topics depending on what should be searched for
  442. * @param string $fields contains either titleonly (topic titles should be searched), msgonly (only message bodies should be searched), firstpost (only subject and body of the first post should be searched) or all (all post bodies and subjects should be searched)
  443. * @param string $terms is either 'all' (use query as entered, words without prefix should default to "have to be in field") or 'any' (ignore search query parts and just return all posts that contain any of the specified words)
  444. * @param array $sort_by_sql contains SQL code for the ORDER BY part of a query
  445. * @param string $sort_key is the key of $sort_by_sql for the selected sorting
  446. * @param string $sort_dir is either a or d representing ASC and DESC
  447. * @param string $sort_days specifies the maximum amount of days a post may be old
  448. * @param array $ex_fid_ary specifies an array of forum ids which should not be searched
  449. * @param string $post_visibility specifies which types of posts the user can view in which forums
  450. * @param int $topic_id is set to 0 or a topic id, if it is not 0 then only posts in this topic should be searched
  451. * @param array $author_ary an array of author ids if the author should be ignored during the search the array is empty
  452. * @param string $author_name specifies the author match, when ANONYMOUS is also a search-match
  453. * @param array &$id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered
  454. * @param int $start indicates the first index of the page
  455. * @param int $per_page number of ids each page is supposed to contain
  456. * @return boolean|int total number of results
  457. */
  458. public function keyword_search($type, $fields, $terms, $sort_by_sql, $sort_key, $sort_dir, $sort_days, $ex_fid_ary, $post_visibility, $topic_id, $author_ary, $author_name, &$id_ary, &$start, $per_page)
  459. {
  460. // No keywords? No posts.
  461. if (empty($this->search_query))
  462. {
  463. return false;
  464. }
  465. // we can't search for negatives only
  466. if (empty($this->must_contain_ids))
  467. {
  468. return false;
  469. }
  470. $must_contain_ids = $this->must_contain_ids;
  471. $must_not_contain_ids = $this->must_not_contain_ids;
  472. $must_exclude_one_ids = $this->must_exclude_one_ids;
  473. sort($must_contain_ids);
  474. sort($must_not_contain_ids);
  475. sort($must_exclude_one_ids);
  476. // generate a search_key from all the options to identify the results
  477. $search_key = md5(implode('#', array(
  478. serialize($must_contain_ids),
  479. serialize($must_not_contain_ids),
  480. serialize($must_exclude_one_ids),
  481. $type,
  482. $fields,
  483. $terms,
  484. $sort_days,
  485. $sort_key,
  486. $topic_id,
  487. implode(',', $ex_fid_ary),
  488. $post_visibility,
  489. implode(',', $author_ary),
  490. $author_name,
  491. )));
  492. // try reading the results from cache
  493. $total_results = 0;
  494. if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE)
  495. {
  496. return $total_results;
  497. }
  498. $id_ary = array();
  499. $sql_where = array();
  500. $group_by = false;
  501. $m_num = 0;
  502. $w_num = 0;
  503. $sql_array = array(
  504. 'SELECT' => ($type == 'posts') ? 'p.post_id' : 'p.topic_id',
  505. 'FROM' => array(
  506. SEARCH_WORDMATCH_TABLE => array(),
  507. SEARCH_WORDLIST_TABLE => array(),
  508. ),
  509. 'LEFT_JOIN' => array(array(
  510. 'FROM' => array(POSTS_TABLE => 'p'),
  511. 'ON' => 'm0.post_id = p.post_id',
  512. )),
  513. );
  514. $title_match = '';
  515. $left_join_topics = false;
  516. $group_by = true;
  517. // Build some display specific sql strings
  518. switch ($fields)
  519. {
  520. case 'titleonly':
  521. $title_match = 'title_match = 1';
  522. $group_by = false;
  523. // no break
  524. case 'firstpost':
  525. $left_join_topics = true;
  526. $sql_where[] = 'p.post_id = t.topic_first_post_id';
  527. break;
  528. case 'msgonly':
  529. $title_match = 'title_match = 0';
  530. $group_by = false;
  531. break;
  532. }
  533. if ($type == 'topics')
  534. {
  535. $left_join_topics = true;
  536. $group_by = true;
  537. }
  538. /**
  539. * @todo Add a query optimizer (handle stuff like "+(4|3) +4")
  540. */
  541. foreach ($this->must_contain_ids as $subquery)
  542. {
  543. if (is_array($subquery))
  544. {
  545. $group_by = true;
  546. $word_id_sql = array();
  547. $word_ids = array();
  548. foreach ($subquery as $id)
  549. {
  550. if (is_string($id))
  551. {
  552. $sql_array['LEFT_JOIN'][] = array(
  553. 'FROM' => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
  554. 'ON' => "w$w_num.word_text LIKE $id"
  555. );
  556. $word_ids[] = "w$w_num.word_id";
  557. $w_num++;
  558. }
  559. else
  560. {
  561. $word_ids[] = $id;
  562. }
  563. }
  564. $sql_where[] = $this->db->sql_in_set("m$m_num.word_id", $word_ids);
  565. unset($word_id_sql);
  566. unset($word_ids);
  567. }
  568. else if (is_string($subquery))
  569. {
  570. $sql_array['FROM'][SEARCH_WORDLIST_TABLE][] = 'w' . $w_num;
  571. $sql_where[] = "w$w_num.word_text LIKE $subquery";
  572. $sql_where[] = "m$m_num.word_id = w$w_num.word_id";
  573. $group_by = true;
  574. $w_num++;
  575. }
  576. else
  577. {
  578. $sql_where[] = "m$m_num.word_id = $subquery";
  579. }
  580. $sql_array['FROM'][SEARCH_WORDMATCH_TABLE][] = 'm' . $m_num;
  581. if ($title_match)
  582. {
  583. $sql_where[] = "m$m_num.$title_match";
  584. }
  585. if ($m_num != 0)
  586. {
  587. $sql_where[] = "m$m_num.post_id = m0.post_id";
  588. }
  589. $m_num++;
  590. }
  591. foreach ($this->must_not_contain_ids as $key => $subquery)
  592. {
  593. if (is_string($subquery))
  594. {
  595. $sql_array['LEFT_JOIN'][] = array(
  596. 'FROM' => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
  597. 'ON' => "w$w_num.word_text LIKE $subquery"
  598. );
  599. $this->must_not_contain_ids[$key] = "w$w_num.word_id";
  600. $group_by = true;
  601. $w_num++;
  602. }
  603. }
  604. if (sizeof($this->must_not_contain_ids))
  605. {
  606. $sql_array['LEFT_JOIN'][] = array(
  607. 'FROM' => array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
  608. 'ON' => $this->db->sql_in_set("m$m_num.word_id", $this->must_not_contain_ids) . (($title_match) ? " AND m$m_num.$title_match" : '') . " AND m$m_num.post_id = m0.post_id"
  609. );
  610. $sql_where[] = "m$m_num.word_id IS NULL";
  611. $m_num++;
  612. }
  613. foreach ($this->must_exclude_one_ids as $ids)
  614. {
  615. $is_null_joins = array();
  616. foreach ($ids as $id)
  617. {
  618. if (is_string($id))
  619. {
  620. $sql_array['LEFT_JOIN'][] = array(
  621. 'FROM' => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
  622. 'ON' => "w$w_num.word_text LIKE $id"
  623. );
  624. $id = "w$w_num.word_id";
  625. $group_by = true;
  626. $w_num++;
  627. }
  628. $sql_array['LEFT_JOIN'][] = array(
  629. 'FROM' => array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
  630. 'ON' => "m$m_num.word_id = $id AND m$m_num.post_id = m0.post_id" . (($title_match) ? " AND m$m_num.$title_match" : '')
  631. );
  632. $is_null_joins[] = "m$m_num.word_id IS NULL";
  633. $m_num++;
  634. }
  635. $sql_where[] = '(' . implode(' OR ', $is_null_joins) . ')';
  636. }
  637. $sql_where[] = $post_visibility;
  638. $search_query = $this->search_query;
  639. $must_exclude_one_ids = $this->must_exclude_one_ids;
  640. $must_not_contain_ids = $this->must_not_contain_ids;
  641. $must_contain_ids = $this->must_contain_ids;
  642. /**
  643. * Allow changing the query used for counting for posts using fulltext_native
  644. *
  645. * @event core.search_native_keywords_count_query_before
  646. * @var string search_query The parsed keywords used for this search
  647. * @var array must_not_contain_ids Ids that cannot be taken into account for the results
  648. * @var array must_exclude_one_ids Ids that cannot be on the results
  649. * @var array must_contain_ids Ids that must be on the results
  650. * @var int result_count The previous result count for the format of the query
  651. * Set to 0 to force a re-count
  652. * @var bool join_topic Weather or not TOPICS_TABLE should be CROSS JOIN'ED
  653. * @var array author_ary Array of user_id containing the users to filter the results to
  654. * @var string author_name An extra username to search on (!empty(author_ary) must be true, to be relevant)
  655. * @var array ex_fid_ary Which forums not to search on
  656. * @var int topic_id Limit the search to this topic_id only
  657. * @var string sql_sort_table Extra tables to include in the SQL query.
  658. * Used in conjunction with sql_sort_join
  659. * @var string sql_sort_join SQL conditions to join all the tables used together.
  660. * Used in conjunction with sql_sort_table
  661. * @var int sort_days Time, in days, of the oldest possible post to list
  662. * @var string sql_where An array of the current WHERE clause conditions
  663. * @var string sql_match Which columns to do the search on
  664. * @var string sql_match_where Extra conditions to use to properly filter the matching process
  665. * @var string group_by Whether or not the SQL query requires a GROUP BY for the elements in the SELECT clause
  666. * @var string sort_by_sql The possible predefined sort types
  667. * @var string sort_key The sort type used from the possible sort types
  668. * @var string sort_dir "a" for ASC or "d" dor DESC for the sort order used
  669. * @var string sql_sort The result SQL when processing sort_by_sql + sort_key + sort_dir
  670. * @var int start How many posts to skip in the search results (used for pagination)
  671. * @since 3.1.5-RC1
  672. */
  673. $vars = array(
  674. 'search_query',
  675. 'must_not_contain_ids',
  676. 'must_exclude_one_ids',
  677. 'must_contain_ids',
  678. 'result_count',
  679. 'join_topic',
  680. 'author_ary',
  681. 'author_name',
  682. 'ex_fid_ary',
  683. 'topic_id',
  684. 'sql_sort_table',
  685. 'sql_sort_join',
  686. 'sort_days',
  687. 'sql_where',
  688. 'sql_match',
  689. 'sql_match_where',
  690. 'group_by',
  691. 'sort_by_sql',
  692. 'sort_key',
  693. 'sort_dir',
  694. 'sql_sort',
  695. 'start',
  696. );
  697. extract($this->phpbb_dispatcher->trigger_event('core.search_native_keywords_count_query_before', compact($vars)));
  698. if ($topic_id)
  699. {
  700. $sql_where[] = 'p.topic_id = ' . $topic_id;
  701. }
  702. if (sizeof($author_ary))
  703. {
  704. if ($author_name)
  705. {
  706. // first one matches post of registered users, second one guests and deleted users
  707. $sql_author = '(' . $this->db->sql_in_set('p.poster_id', array_diff($author_ary, array(ANONYMOUS)), false, true) . ' OR p.post_username ' . $author_name . ')';
  708. }
  709. else
  710. {
  711. $sql_author = $this->db->sql_in_set('p.poster_id', $author_ary);
  712. }
  713. $sql_where[] = $sql_author;
  714. }
  715. if (sizeof($ex_fid_ary))
  716. {
  717. $sql_where[] = $this->db->sql_in_set('p.forum_id', $ex_fid_ary, true);
  718. }
  719. if ($sort_days)
  720. {
  721. $sql_where[] = 'p.post_time >= ' . (time() - ($sort_days * 86400));
  722. }
  723. $sql_array['WHERE'] = implode(' AND ', $sql_where);
  724. $is_mysql = false;
  725. // if the total result count is not cached yet, retrieve it from the db
  726. if (!$total_results)
  727. {
  728. $sql = '';
  729. $sql_array_count = $sql_array;
  730. if ($left_join_topics)
  731. {
  732. $sql_array_count['LEFT_JOIN'][] = array(
  733. 'FROM' => array(TOPICS_TABLE => 't'),
  734. 'ON' => 'p.topic_id = t.topic_id'
  735. );
  736. }
  737. switch ($this->db->get_sql_layer())
  738. {
  739. case 'mysql4':
  740. case 'mysqli':
  741. // 3.x does not support SQL_CALC_FOUND_ROWS
  742. // $sql_array['SELECT'] = 'SQL_CALC_FOUND_ROWS ' . $sql_array['SELECT'];
  743. $is_mysql = true;
  744. break;
  745. case 'sqlite':
  746. case 'sqlite3':
  747. $sql_array_count['SELECT'] = ($type == 'posts') ? 'DISTINCT p.post_id' : 'DISTINCT p.topic_id';
  748. $sql = 'SELECT COUNT(' . (($type == 'posts') ? 'post_id' : 'topic_id') . ') as total_results
  749. FROM (' . $this->db->sql_build_query('SELECT', $sql_array_count) . ')';
  750. // no break
  751. default:
  752. $sql_array_count['SELECT'] = ($type == 'posts') ? 'COUNT(DISTINCT p.post_id) AS total_results' : 'COUNT(DISTINCT p.topic_id) AS total_results';
  753. $sql = (!$sql) ? $this->db->sql_build_query('SELECT', $sql_array_count) : $sql;
  754. $result = $this->db->sql_query($sql);
  755. $total_results = (int) $this->db->sql_fetchfield('total_results');
  756. $this->db->sql_freeresult($result);
  757. if (!$total_results)
  758. {
  759. return false;
  760. }
  761. break;
  762. }
  763. unset($sql_array_count, $sql);
  764. }
  765. // Build sql strings for sorting
  766. $sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC');
  767. switch ($sql_sort[0])
  768. {
  769. case 'u':
  770. $sql_array['FROM'][USERS_TABLE] = 'u';
  771. $sql_where[] = 'u.user_id = p.poster_id ';
  772. break;
  773. case 't':
  774. $left_join_topics = true;
  775. break;
  776. case 'f':
  777. $sql_array['FROM'][FORUMS_TABLE] = 'f';
  778. $sql_where[] = 'f.forum_id = p.forum_id';
  779. break;
  780. }
  781. if ($left_join_topics)
  782. {
  783. $sql_array['LEFT_JOIN'][] = array(
  784. 'FROM' => array(TOPICS_TABLE => 't'),
  785. 'ON' => 'p.topic_id = t.topic_id'
  786. );
  787. }
  788. // if using mysql and the total result count is not calculated yet, get it from the db
  789. if (!$total_results && $is_mysql)
  790. {
  791. // Also count rows for the query as if there was not LIMIT. Add SQL_CALC_FOUND_ROWS to SQL
  792. $sql_array['SELECT'] = 'SQL_CALC_FOUND_ROWS ' . $sql_array['SELECT'];
  793. }
  794. $sql_array['WHERE'] = implode(' AND ', $sql_where);
  795. $sql_array['GROUP_BY'] = ($group_by) ? (($type == 'posts') ? 'p.post_id' : 'p.topic_id') . ', ' . $sort_by_sql[$sort_key] : '';
  796. $sql_array['ORDER_BY'] = $sql_sort;
  797. unset($sql_where, $sql_sort, $group_by);
  798. $sql = $this->db->sql_build_query('SELECT', $sql_array);
  799. $result = $this->db->sql_query_limit($sql, $this->config['search_block_size'], $start);
  800. while ($row = $this->db->sql_fetchrow($result))
  801. {
  802. $id_ary[] = (int) $row[(($type == 'posts') ? 'post_id' : 'topic_id')];
  803. }
  804. $this->db->sql_freeresult($result);
  805. if (!$total_results && $is_mysql)
  806. {
  807. // Get the number of results as calculated by MySQL
  808. $sql_count = 'SELECT FOUND_ROWS() as total_results';
  809. $result = $this->db->sql_query($sql_count);
  810. $total_results = (int) $this->db->sql_fetchfield('total_results');
  811. $this->db->sql_freeresult($result);
  812. if (!$total_results)
  813. {
  814. return false;
  815. }
  816. }
  817. if ($start >= $total_results)
  818. {
  819. $start = floor(($total_results - 1) / $per_page) * $per_page;
  820. $result = $this->db->sql_query_limit($sql, $this->config['search_block_size'], $start);
  821. while ($row = $this->db->sql_fetchrow($result))
  822. {
  823. $id_ary[] = (int) $row[(($type == 'posts') ? 'post_id' : 'topic_id')];
  824. }
  825. $this->db->sql_freeresult($result);
  826. }
  827. // store the ids, from start on then delete anything that isn't on the current page because we only need ids for one page
  828. $this->save_ids($search_key, $this->search_query, $author_ary, $total_results, $id_ary, $start, $sort_dir);
  829. $id_ary = array_slice($id_ary, 0, (int) $per_page);
  830. return $total_results;
  831. }
  832. /**
  833. * Performs a search on an author's posts without caring about message contents. Depends on display specific params
  834. *
  835. * @param string $type contains either posts or topics depending on what should be searched for
  836. * @param boolean $firstpost_only if true, only topic starting posts will be considered
  837. * @param array $sort_by_sql contains SQL code for the ORDER BY part of a query
  838. * @param string $sort_key is the key of $sort_by_sql for the selected sorting
  839. * @param string $sort_dir is either a or d representing ASC and DESC
  840. * @param string $sort_days specifies the maximum amount of days a post may be old
  841. * @param array $ex_fid_ary specifies an array of forum ids which should not be searched
  842. * @param string $post_visibility specifies which types of posts the user can view in which forums
  843. * @param int $topic_id is set to 0 or a topic id, if it is not 0 then only posts in this topic should be searched
  844. * @param array $author_ary an array of author ids
  845. * @param string $author_name specifies the author match, when ANONYMOUS is also a search-match
  846. * @param array &$id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered
  847. * @param int $start indicates the first index of the page
  848. * @param int $per_page number of ids each page is supposed to contain
  849. * @return boolean|int total number of results
  850. */
  851. public function author_search($type, $firstpost_only, $sort_by_sql, $sort_key, $sort_dir, $sort_days, $ex_fid_ary, $post_visibility, $topic_id, $author_ary, $author_name, &$id_ary, &$start, $per_page)
  852. {
  853. // No author? No posts
  854. if (!sizeof($author_ary))
  855. {
  856. return 0;
  857. }
  858. // generate a search_key from all the options to identify the results
  859. $search_key = md5(implode('#', array(
  860. '',
  861. $type,
  862. ($firstpost_only) ? 'firstpost' : '',
  863. '',
  864. '',
  865. $sort_days,
  866. $sort_key,
  867. $topic_id,
  868. implode(',', $ex_fid_ary),
  869. $post_visibility,
  870. implode(',', $author_ary),
  871. $author_name,
  872. )));
  873. // try reading the results from cache
  874. $total_results = 0;
  875. if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE)
  876. {
  877. return $total_results;
  878. }
  879. $id_ary = array();
  880. // Create some display specific sql strings
  881. if ($author_name)
  882. {
  883. // first one matches post of registered users, second one guests and deleted users
  884. $sql_author = '(' . $this->db->sql_in_set('p.poster_id', array_diff($author_ary, array(ANONYMOUS)), false, true) . ' OR p.post_username ' . $author_name . ')';
  885. }
  886. else
  887. {
  888. $sql_author = $this->db->sql_in_set('p.poster_id', $author_ary);
  889. }
  890. $sql_fora = (sizeof($ex_fid_ary)) ? ' AND ' . $this->db->sql_in_set('p.forum_id', $ex_fid_ary, true) : '';
  891. $sql_time = ($sort_days) ? ' AND p.post_time >= ' . (time() - ($sort_days * 86400)) : '';
  892. $sql_topic_id = ($topic_id) ? ' AND p.topic_id = ' . (int) $topic_id : '';
  893. $sql_firstpost = ($firstpost_only) ? ' AND p.post_id = t.topic_first_post_id' : '';
  894. $post_visibility = ($post_visibility) ? ' AND ' . $post_visibility : '';
  895. // Build sql strings for sorting
  896. $sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC');
  897. $sql_sort_table = $sql_sort_join = '';
  898. switch ($sql_sort[0])
  899. {
  900. case 'u':
  901. $sql_sort_table = USERS_TABLE . ' u, ';
  902. $sql_sort_join = ' AND u.user_id = p.poster_id ';
  903. break;
  904. case 't':
  905. $sql_sort_table = ($type == 'posts' && !$firstpost_only) ? TOPICS_TABLE . ' t, ' : '';
  906. $sql_sort_join = ($type == 'posts' && !$firstpost_only) ? ' AND t.topic_id = p.topic_id ' : '';
  907. break;
  908. case 'f':
  909. $sql_sort_table = FORUMS_TABLE . ' f, ';
  910. $sql_sort_join = ' AND f.forum_id = p.forum_id ';
  911. break;
  912. }
  913. $select = ($type == 'posts') ? 'p.post_id' : 't.topic_id';
  914. $is_mysql = false;
  915. /**
  916. * Allow changing the query used to search for posts by author in fulltext_native
  917. *
  918. * @event core.search_native_author_count_query_before
  919. * @var int total_results The previous result count for the format of the query.
  920. * Set to 0 to force a re-count
  921. * @var string select SQL SELECT clause for what to get
  922. * @var string sql_sort_table CROSS JOIN'ed table to allow doing the sort chosen
  923. * @var string sql_sort_join Condition to define how to join the CROSS JOIN'ed table specifyed in sql_sort_table
  924. * @var array sql_author SQL WHERE condition for the post author ids
  925. * @var int topic_id Limit the search to this topic_id only
  926. * @var string sort_by_sql The possible predefined sort types
  927. * @var string sort_key The sort type used from the possible sort types
  928. * @var string sort_dir "a" for ASC or "d" dor DESC for the sort order used
  929. * @var string sql_sort The result SQL when processing sort_by_sql + sort_key + sort_dir
  930. * @var string sort_days Time, in days, that the oldest post showing can have
  931. * @var string sql_time The SQL to search on the time specifyed by sort_days
  932. * @var bool firstpost_only Wether or not to search only on the first post of the topics
  933. * @var array ex_fid_ary Forum ids that must not be searched on
  934. * @var array sql_fora SQL query for ex_fid_ary
  935. * @var int start How many posts to skip in the search results (used for pagination)
  936. * @since 3.1.5-RC1
  937. */
  938. $vars = array(
  939. 'total_results',
  940. 'select',
  941. 'sql_sort_table',
  942. 'sql_sort_join',
  943. 'sql_author',
  944. 'topic_id',
  945. 'sort_by_sql',
  946. 'sort_key',
  947. 'sort_dir',
  948. 'sql_sort',
  949. 'sort_days',
  950. 'sql_time',
  951. 'firstpost_only',
  952. 'ex_fid_ary',
  953. 'sql_fora',
  954. 'start',
  955. );
  956. extract($this->phpbb_dispatcher->trigger_event('core.search_native_author_count_query_before', compact($vars)));
  957. // If the cache was completely empty count the results
  958. if (!$total_results)
  959. {
  960. switch ($this->db->get_sql_layer())
  961. {
  962. case 'mysql4':
  963. case 'mysqli':
  964. // $select = 'SQL_CALC_FOUND_ROWS ' . $select;
  965. $is_mysql = true;
  966. break;
  967. default:
  968. if ($type == 'posts')
  969. {
  970. $sql = 'SELECT COUNT(p.post_id) as total_results
  971. FROM ' . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t ' : ' ') . "
  972. WHERE $sql_author
  973. $sql_topic_id
  974. $sql_firstpost
  975. $post_visibility
  976. $sql_fora
  977. $sql_time";
  978. }
  979. else
  980. {
  981. if ($this->db->get_sql_layer() == 'sqlite' || $this->db->get_sql_layer() == 'sqlite3')
  982. {
  983. $sql = 'SELECT COUNT(topic_id) as total_results
  984. FROM (SELECT DISTINCT t.topic_id';
  985. }
  986. else
  987. {
  988. $sql = 'SELECT COUNT(DISTINCT t.topic_id) as total_results';
  989. }
  990. $sql .= ' FROM ' . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p
  991. WHERE $sql_author
  992. $sql_topic_id
  993. $sql_firstpost
  994. $post_visibility
  995. $sql_fora
  996. AND t.topic_id = p.topic_id
  997. $sql_time" . (($this->db->get_sql_layer() == 'sqlite' || $this->db->get_sql_layer() == 'sqlite3') ? ')' : '');
  998. }
  999. $result = $this->db->sql_query($sql);
  1000. $total_results = (int) $this->db->sql_fetchfield('total_results');
  1001. $this->db->sql_freeresult($result);
  1002. if (!$total_results)
  1003. {
  1004. return false;
  1005. }
  1006. break;
  1007. }
  1008. }
  1009. // Build the query for really selecting the post_ids
  1010. if ($type == 'posts')
  1011. {
  1012. $sql = "SELECT $select
  1013. FROM " . $sql_sort_table . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t' : '') . "
  1014. WHERE $sql_author
  1015. $sql_topic_id
  1016. $sql_firstpost
  1017. $post_visibility
  1018. $sql_fora
  1019. $sql_sort_join
  1020. $sql_time
  1021. ORDER BY $sql_sort";
  1022. $field = 'post_id';
  1023. }
  1024. else
  1025. {
  1026. $sql = "SELECT $select
  1027. FROM " . $sql_sort_table . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p
  1028. WHERE $sql_author
  1029. $sql_topic_id
  1030. $sql_firstpost
  1031. $post_visibility
  1032. $sql_fora
  1033. AND t.topic_id = p.topic_id
  1034. $sql_sort_join
  1035. $sql_time
  1036. GROUP BY t.topic_id, " . $sort_by_sql[$sort_key] . '
  1037. ORDER BY ' . $sql_sort;
  1038. $field = 'topic_id';
  1039. }
  1040. // Only read one block of posts from the db and then cache it
  1041. $result = $this->db->sql_query_limit($sql, $this->config['search_block_size'], $start);
  1042. while ($row = $this->db->sql_fetchrow($result))
  1043. {
  1044. $id_ary[] = (int) $row[$field];
  1045. }
  1046. $this->db->sql_freeresult($result);
  1047. if (!$total_results && $is_mysql)
  1048. {
  1049. // Count rows for the executed queries. Replace $select within $sql with SQL_CALC_FOUND_ROWS, and run it.
  1050. $sql_calc = str_replace('SELECT ' . $select, 'SELECT DISTINCT SQL_CALC_FOUND_ROWS p.post_id', $sql);
  1051. $result = $this->db->sql_query($sql_calc);
  1052. $this->db->sql_freeresult($result);
  1053. $sql_count = 'SELECT FOUND_ROWS() as total_results';
  1054. $result = $this->db->sql_query($sql_count);
  1055. $total_results = (int) $this->db->sql_fetchfield('total_results');
  1056. $this->db->sql_freeresult($result);
  1057. if (!$total_results)
  1058. {
  1059. return false;
  1060. }
  1061. }
  1062. if ($start >= $total_results)
  1063. {
  1064. $start = floor(($total_results - 1) / $per_page) * $per_page;
  1065. $result = $this->db->sql_query_limit($sql, $this->config['search_block_size'], $start);
  1066. while ($row = $this->db->sql_fetchrow($result))
  1067. {
  1068. $id_ary[] = (int) $row[$field];
  1069. }
  1070. $this->db->sql_freeresult($result);
  1071. }
  1072. if (sizeof($id_ary))
  1073. {
  1074. $this->save_ids($search_key, '', $author_ary, $total_results, $id_ary, $start, $sort_dir);
  1075. $id_ary = array_slice($id_ary, 0, $per_page);
  1076. return $total_results;
  1077. }
  1078. return false;
  1079. }
  1080. /**
  1081. * Split a text into words of a given length
  1082. *
  1083. * The text is converted to UTF-8, cleaned up, and split. Then, words that
  1084. * conform to the defined length range are returned in an array.
  1085. *
  1086. * NOTE: duplicates are NOT removed from the return array
  1087. *
  1088. * @param string $text Text to split, encoded in UTF-8
  1089. * @return array Array of UTF-8 words
  1090. */
  1091. public function split_message($text)
  1092. {
  1093. $match = $words = array();
  1094. /**
  1095. * Taken from the original code
  1096. */
  1097. // Do not index code
  1098. $match[] = '#\[code(?:=.*?)?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is';
  1099. // BBcode
  1100. $match[] = '#\[\/?[a-z0-9\*\+\-]+(?:=.*?)?(?::[a-z])?(\:?[0-9a-z]{5,})\]#';
  1101. $min = $this->word_length['min'];
  1102. $max = $this->word_length['max'];
  1103. $isset_min = $min - 1;
  1104. /**
  1105. * Clean up the string, remove HTML tags, remove BBCodes
  1106. */
  1107. $word = strtok($this->cleanup(preg_replace($match, ' ', strip_tags($text)), -1), ' ');
  1108. while (strlen($word))
  1109. {
  1110. if (strlen($word) > 255 || strlen($word) <= $isset_min)
  1111. {
  1112. /**
  1113. * Words longer than 255 bytes are ignored. This will have to be
  1114. * changed whenever we change the length of search_wordlist.word_text
  1115. *
  1116. * Words shorter than $isset_min bytes are ignored, too
  1117. */
  1118. $word = strtok(' ');
  1119. continue;
  1120. }
  1121. $len = utf8_strlen($word);
  1122. /**
  1123. * Test whether the word is too short to be indexed.
  1124. *
  1125. * Note that this limit does NOT apply to CJK and Hangul
  1126. */
  1127. if ($len < $min)
  1128. {
  1129. /**
  1130. * Note: this could be optimized. If the codepoint is lower than Hangul's range
  1131. * we know that it will also be lower than CJK ranges
  1132. */
  1133. if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
  1134. && (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
  1135. && (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
  1136. {
  1137. $word = strtok(' ');
  1138. continue;
  1139. }
  1140. }
  1141. $words[] = $word;
  1142. $word = strtok(' ');
  1143. }
  1144. return $words;
  1145. }
  1146. /**
  1147. * Updates wordlist and wordmatch tables when a message is posted or changed
  1148. *
  1149. * @param string $mode Contains the post mode: edit, post, reply, quote
  1150. * @param int $post_id The id of the post which is modified/created
  1151. * @param string &$message New or updated post content
  1152. * @param string &$subject New or updated post subject
  1153. * @param int $poster_id Post author's user id
  1154. * @param int $forum_id The id of the forum in which the post is located
  1155. */
  1156. public function index($mode, $post_id, &$message, &$subject, $poster_id, $forum_id)
  1157. {
  1158. if (!$this->config['fulltext_native_load_upd'])
  1159. {
  1160. /**
  1161. * The search indexer is disabled, return
  1162. */
  1163. return;
  1164. }
  1165. // Split old and new post/subject to obtain array of 'words'
  1166. $split_text = $this->split_message($message);
  1167. $split_title = $this->split_message($subject);
  1168. $cur_words = array('post' => array(), 'title' => array());
  1169. $words = array();
  1170. if ($mode == 'edit')
  1171. {
  1172. $words['add']['post'] = array();
  1173. $words['add']['title'] = array();
  1174. $words['del']['post'] = array();
  1175. $words['del']['title'] = array();
  1176. $sql = 'SELECT w.word_id, w.word_text, m.title_match
  1177. FROM ' . SEARCH_WORDLIST_TABLE . ' w, ' . SEARCH_WORDMATCH_TABLE . " m
  1178. WHERE m.post_id = $post_id
  1179. AND w.word_id = m.word_id";
  1180. $result = $this->db->sql_query($sql);
  1181. while ($row = $this->db->sql_fetchrow($result))
  1182. {
  1183. $which = ($row['title_match']) ? 'title' : 'post';
  1184. $cur_words[$which][$row['word_text']] = $row['word_id'];
  1185. }
  1186. $this->db->sql_freeresult($result);
  1187. $words['add']['post'] = array_diff($split_text, array_keys($cur_words['post']));
  1188. $words['add']['title'] = array_diff($split_title, array_keys($cur_words['title']));
  1189. $words['del']['post'] = array_diff(array_keys($cur_words['post']), $split_text);
  1190. $words['del']['title'] = array_diff(array_keys($cur_words['title']), $split_title);
  1191. }
  1192. else
  1193. {
  1194. $words['add']['post'] = $split_text;
  1195. $words['add']['title'] = $split_title;
  1196. $words['del']['post'] = array();
  1197. $words['del']['title'] = array();
  1198. }
  1199. unset($split_text);
  1200. unset($split_title);
  1201. // Get unique words from the above arrays
  1202. $unique_add_words = array_unique(array_merge($words['add']['post'], $words['add']['title']));
  1203. // We now have unique arrays of all words to be added and removed and
  1204. // individual arrays of added and removed words for text and title. What
  1205. // we need to do now is add the new words (if they don't already exist)
  1206. // and then add (or remove) matches between the words and this post
  1207. if (sizeof($unique_add_words))
  1208. {
  1209. $sql = 'SELECT word_id, word_text
  1210. FROM ' . SEARCH_WORDLIST_TABLE . '
  1211. WHERE ' . $this->db->sql_in_set('word_text', $unique_add_words);
  1212. $result = $this->db->sql_query($sql);
  1213. $word_ids = array();
  1214. while ($row = $this->db->sql_fetchrow($result))
  1215. {
  1216. $word_ids[$row['word_text']] = $row['word_id'];
  1217. }
  1218. $this->db->sql_freeresult($result);
  1219. $new_words = array_diff($unique_add_words, array_keys($word_ids));
  1220. $this->db->sql_transaction('begin');
  1221. if (sizeof($new_words))
  1222. {
  1223. $sql_ary = array();
  1224. foreach ($new_words as $word)
  1225. {
  1226. $sql_ary[] = array('word_text' => (string) $word, 'word_count' => 0);
  1227. }
  1228. $this->db->sql_return_on_error(true);
  1229. $this->db->sql_multi_insert(SEARCH_WORDLIST_TABLE, $sql_ary);
  1230. $this->db->sql_return_on_error(false);
  1231. }
  1232. unset($new_words, $sql_ary);
  1233. }
  1234. else
  1235. {
  1236. $this->db->sql_transaction('begin');
  1237. }
  1238. // now update the search match table, remove links to removed words and add links to new words
  1239. foreach ($words['del'] as $word_in => $word_ary)
  1240. {
  1241. $title_match = ($word_in == 'title') ? 1 : 0;
  1242. if (sizeof($word_ary))
  1243. {
  1244. $sql_in = array();
  1245. foreach ($word_ary as $word)
  1246. {
  1247. $sql_in[] = $cur_words[$word_in][$word];
  1248. }
  1249. $sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
  1250. WHERE ' . $this->db->sql_in_set('word_id', $sql_in) . '
  1251. AND post_id = ' . intval($post_id) . "
  1252. AND title_match = $title_match";
  1253. $this->db->sql_query($sql);
  1254. $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
  1255. SET word_count = word_count - 1
  1256. WHERE ' . $this->db->sql_in_set('word_id', $sql_in) . '
  1257. AND word_count > 0';
  1258. $this->db->sql_query($sql);
  1259. unset($sql_in);
  1260. }
  1261. }
  1262. $this->db->sql_return_on_error(true);
  1263. foreach ($words['add'] as $word_in => $word_ary)
  1264. {
  1265. $title_match = ($word_in == 'title') ? 1 : 0;
  1266. if (sizeof($word_ary))
  1267. {
  1268. $sql = 'INSERT INTO ' . SEARCH_WORDMATCH_TABLE . ' (post_id, word_id, title_match)
  1269. SELECT ' . (int) $post_id . ', word_id, ' . (int) $title_match . '
  1270. FROM ' . SEARCH_WORDLIST_TABLE . '
  1271. WHERE ' . $this->db->sql_in_set('word_text', $word_ary);
  1272. $this->db->sql_query($sql);
  1273. $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
  1274. SET word_count = word_count + 1
  1275. WHERE ' . $this->db->sql_in_set('word_text', $word_ary);
  1276. $this->db->sql_query($sql);
  1277. }
  1278. }
  1279. $this->db->sql_return_on_error(false);
  1280. $this->db->sql_transaction('commit');
  1281. // destroy cached search results containing any of the words removed or added
  1282. $this->destroy_cache(array_unique(array_merge($words['add']['post'], $words['add']['title'], $words['del']['post'], $words['del']['title'])), array($poster_id));
  1283. unset($unique_add_words);
  1284. unset($words);
  1285. unset($cur_words);
  1286. }
  1287. /**
  1288. * Removes entries from the wordmatch table for the specified post_ids
  1289. */
  1290. public function index_remove($post_ids, $author_ids, $forum_ids)
  1291. {
  1292. if (sizeof($post_ids))
  1293. {
  1294. $sql = 'SELECT w.word_id, w.word_text, m.title_match
  1295. FROM ' . SEARCH_WORDMATCH_TABLE . ' m, ' . SEARCH_WORDLIST_TABLE . ' w
  1296. WHERE ' . $this->db->sql_in_set('m.post_id', $post_ids) . '
  1297. AND w.word_id = m.word_id';
  1298. $result = $this->db->sql_query($sql);
  1299. $message_word_ids = $title_word_ids = $word_texts = array();
  1300. while ($row = $this->db->sql_fetchrow($result))
  1301. {
  1302. if ($row['title_match'])
  1303. {
  1304. $title_word_ids[] = $row['word_id'];
  1305. }
  1306. else
  1307. {
  1308. $message_word_ids[] = $row['word_id'];
  1309. }
  1310. $word_texts[] = $row['word_text'];
  1311. }
  1312. $this->db->sql_freeresult($result);
  1313. if (sizeof($title_word_ids))
  1314. {
  1315. $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
  1316. SET word_count = word_count - 1
  1317. WHERE ' . $this->db->sql_in_set('word_id', $title_word_ids) . '
  1318. AND word_count > 0';
  1319. $this->db->sql_query($sql);
  1320. }
  1321. if (sizeof($message_word_ids))
  1322. {
  1323. $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
  1324. SET word_count = word_count - 1
  1325. WHERE ' . $this->db->sql_in_set('word_id', $message_word_ids) . '
  1326. AND word_count > 0';
  1327. $this->db->sql_query($sql);
  1328. }
  1329. unset($title_word_ids);
  1330. unset($message_word_ids);
  1331. $sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
  1332. WHERE ' . $this->db->sql_in_set('post_id', $post_ids);
  1333. $this->db->sql_query($sql);
  1334. }
  1335. $this->destroy_cache(array_unique($word_texts), array_unique($author_ids));
  1336. }
  1337. /**
  1338. * Tidy up indexes: Tag 'common words' and remove
  1339. * words no longer referenced in the match table
  1340. */
  1341. public function tidy()
  1342. {
  1343. // Is the fulltext indexer disabled? If yes then we need not
  1344. // carry on ... it's okay ... I know when I'm not wanted boo hoo
  1345. if (!$this->config['fulltext_native_load_upd'])
  1346. {
  1347. set_config('search_last_gc', time(), true);
  1348. return;
  1349. }
  1350. $destroy_cache_words = array();
  1351. // Remove common words
  1352. if ($this->config['num_posts'] >= 100 && $this->config['fulltext_native_common_thres'])
  1353. {
  1354. $common_threshold = ((double) $this->config['fulltext_native_common_thres']) / 100.0;
  1355. // First, get the IDs of common words
  1356. $sql = 'SELECT word_id, word_text
  1357. FROM ' . SEARCH_WORDLIST_TABLE . '
  1358. WHERE word_count > ' . floor($this->config['num_posts'] * $common_threshold) . '
  1359. OR word_common = 1';
  1360. $result = $this->db->sql_query($sql);
  1361. $sql_in = array();
  1362. while ($row = $this->db->sql_fetchrow($result))
  1363. {
  1364. $sql_in[] = $row['word_id'];
  1365. $destroy_cache_words[] = $row['word_text'];
  1366. }
  1367. $this->db->sql_freeresult($result);
  1368. if (sizeof($sql_in))
  1369. {
  1370. // Flag the words
  1371. $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
  1372. SET word_common = 1
  1373. WHERE ' . $this->db->sql_in_set('word_id', $sql_in);
  1374. $this->db->sql_query($sql);
  1375. // by setting search_last_gc to the new time here we make sure that if a user reloads because the
  1376. // following query takes too long, he won't run into it again
  1377. set_config('search_last_gc', time(), true);
  1378. // Delete the matches
  1379. $sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
  1380. WHERE ' . $this->db->sql_in_set('word_id', $sql_in);
  1381. $this->db->sql_query($sql);
  1382. }
  1383. unset($sql_in);
  1384. }
  1385. if (sizeof($destroy_cache_words))
  1386. {
  1387. // destroy cached search results containing any of the words that are now common or were removed
  1388. $this->destroy_cache(array_unique($destroy_cache_words));
  1389. }
  1390. set_config('search_last_gc', time(), true);
  1391. }
  1392. /**
  1393. * Deletes all words from the index
  1394. */
  1395. public function delete_index($acp_module, $u_action)
  1396. {
  1397. switch ($this->db->get_sql_layer())
  1398. {
  1399. case 'sqlite':
  1400. case 'sqlite3':
  1401. $this->db->sql_query('DELETE FROM ' . SEARCH_WORDLIST_TABLE);
  1402. $this->db->sql_query('DELETE FROM ' . SEARCH_WORDMATCH_TABLE);
  1403. $this->db->sql_query('DELETE FROM ' . SEARCH_RESULTS_TABLE);
  1404. break;
  1405. default:
  1406. $this->db->sql_query('TRUNCATE TABLE ' . SEARCH_WORDLIST_TABLE);
  1407. $this->db->sql_query('TRUNCATE TABLE ' . SEARCH_WORDMATCH_TABLE);
  1408. $this->db->sql_query('TRUNCATE TABLE ' . SEARCH_RESULTS_TABLE);
  1409. break;
  1410. }
  1411. }
  1412. /**
  1413. * Returns true if both FULLTEXT indexes exist
  1414. */
  1415. public function index_created()
  1416. {
  1417. if (!sizeof($this->stats))
  1418. {
  1419. $this->get_stats();
  1420. }
  1421. return ($this->stats['total_words'] && $this->stats['total_matches']) ? true : false;
  1422. }
  1423. /**
  1424. * Returns an associative array containing information about the indexes
  1425. */
  1426. public function index_stats()
  1427. {
  1428. if (!sizeof($this->stats))
  1429. {
  1430. $this->get_stats();
  1431. }
  1432. return array(
  1433. $this->user->lang['TOTAL_WORDS'] => $this->stats['total_words'],
  1434. $this->user->lang['TOTAL_MATCHES'] => $this->stats['total_matches']);
  1435. }
  1436. protected function get_stats()
  1437. {
  1438. $this->stats['total_words'] = $this->db->get_estimated_row_count(SEARCH_WORDLIST_TABLE);
  1439. $this->stats['total_matches'] = $this->db->get_estimated_row_count(SEARCH_WORDMATCH_TABLE);
  1440. }
  1441. /**
  1442. * Clean up a text to remove non-alphanumeric characters
  1443. *
  1444. * This method receives a UTF-8 string, normalizes and validates it, replaces all
  1445. * non-alphanumeric characters with strings then returns the result.
  1446. *
  1447. * Any number of "allowed chars" can be passed as a UTF-8 string in NFC.
  1448. *
  1449. * @param string $text Text to split, in UTF-8 (not normalized or sanitized)
  1450. * @param string $allowed_chars String of special chars to allow
  1451. * @param string $encoding Text encoding
  1452. * @return string Cleaned up text, only alphanumeric chars are left
  1453. *
  1454. * @todo \normalizer::cleanup being able to be used?
  1455. */
  1456. protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
  1457. {
  1458. static $conv = array(), $conv_loaded = array();
  1459. $words = $allow = array();
  1460. // Convert the text to UTF-8
  1461. $encoding = strtolower($encoding);
  1462. if ($encoding != 'utf-8')
  1463. {
  1464. $text = utf8_recode($text, $encoding);
  1465. }
  1466. $utf_len_mask = array(
  1467. "\xC0" => 2,
  1468. "\xD0" => 2,
  1469. "\xE0" => 3,
  1470. "\xF0" => 4
  1471. );
  1472. /**
  1473. * Replace HTML entities and NCRs
  1474. */
  1475. $text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);
  1476. /**
  1477. * Load the UTF-8 normalizer
  1478. *
  1479. * If we use it more widely, an instance of that class should be held in a
  1480. * a global variable instead
  1481. */
  1482. \utf_normalizer::nfc($text);
  1483. /**
  1484. * The first thing we do is:
  1485. *
  1486. * - convert ASCII-7 letters to lowercase
  1487. * - remove the ASCII-7 non-alpha characters
  1488. * - remove the bytes that should not appear in a valid UTF-8 string: 0xC0,
  1489. * 0xC1 and 0xF5-0xFF
  1490. *
  1491. * @todo in theory, the third one is already taken care of during normalization and those chars should have been replaced by Unicode replacement chars
  1492. */
  1493. $sb_match = "ISTCPAMELRDOJBNHFGVWUQKYXZ\r\n\t!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\xC0\xC1\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
  1494. $sb_replace = 'istcpamelrdojbnhfgvwuqkyxz ';
  1495. /**
  1496. * This is the list of legal ASCII chars, it is automatically extended
  1497. * with ASCII chars from $allowed_chars
  1498. */
  1499. $legal_ascii = ' eaisntroludcpmghbfvq10xy2j9kw354867z';
  1500. /**
  1501. * Prepare an array containing the extra chars to allow
  1502. */
  1503. if (isset($allowed_chars[0]))
  1504. {
  1505. $pos = 0;
  1506. $len = strlen($allowed_chars);
  1507. do
  1508. {
  1509. $c = $allowed_chars[$pos];
  1510. if ($c < "\x80")
  1511. {
  1512. /**
  1513. * ASCII char
  1514. */
  1515. $sb_pos = strpos($sb_match, $c);
  1516. if (is_int($sb_pos))
  1517. {
  1518. /**
  1519. * Remove the char from $sb_match and its corresponding
  1520. * replacement in $sb_replace
  1521. */
  1522. $sb_match = substr($sb_match, 0, $sb_pos) . substr($sb_match, $sb_pos + 1);
  1523. $sb_replace = substr($sb_replace, 0, $sb_pos) . substr($sb_replace, $sb_pos + 1);
  1524. $legal_ascii .= $c;
  1525. }
  1526. ++$pos;
  1527. }
  1528. else
  1529. {
  1530. /**
  1531. * UTF-8 char
  1532. */
  1533. $utf_len = $utf_len_mask[$c & "\xF0"];
  1534. $allow[substr($allowed_chars, $pos, $utf_len)] = 1;
  1535. $pos += $utf_len;
  1536. }
  1537. }
  1538. while ($pos < $len);
  1539. }
  1540. $text = strtr($text, $sb_match, $sb_replace);
  1541. $ret = '';
  1542. $pos = 0;
  1543. $len = strlen($text);
  1544. do
  1545. {
  1546. /**
  1547. * Do all consecutive ASCII chars at once
  1548. */
  1549. if ($spn = strspn($text, $legal_ascii, $pos))
  1550. {
  1551. $ret .= substr($text, $pos, $spn);
  1552. $pos += $spn;
  1553. }
  1554. if ($pos >= $len)
  1555. {
  1556. return $ret;
  1557. }
  1558. /**
  1559. * Capture the UTF char
  1560. */
  1561. $utf_len = $utf_len_mask[$text[$pos] & "\xF0"];
  1562. $utf_char = substr($text, $pos, $utf_len);
  1563. $pos += $utf_len;
  1564. if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
  1565. || ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
  1566. || ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
  1567. {
  1568. /**
  1569. * All characters within these ranges are valid
  1570. *
  1571. * We separate them with a space in order to index each character
  1572. * individually
  1573. */
  1574. $ret .= ' ' . $utf_char . ' ';
  1575. continue;
  1576. }
  1577. if (isset($allow[$utf_char]))
  1578. {
  1579. /**
  1580. * The char is explicitly allowed
  1581. */
  1582. $ret .= $utf_char;
  1583. continue;
  1584. }
  1585. if (isset($conv[$utf_char]))
  1586. {
  1587. /**
  1588. * The char is mapped to something, maybe to itself actually
  1589. */
  1590. $ret .= $conv[$utf_char];
  1591. continue;
  1592. }
  1593. /**
  1594. * The char isn't mapped, but did we load its conversion table?
  1595. *
  1596. * The search indexer table is split into blocks. The block number of
  1597. * each char is equal to its codepoint right-shifted for 11 bits. It
  1598. * means that out of the 11, 16 or 21 meaningful bits of a 2-, 3- or
  1599. * 4- byte sequence we only keep the leftmost 0, 5 or 10 bits. Thus,
  1600. * all UTF chars encoded in 2 bytes are in the same first block.
  1601. */
  1602. if (isset($utf_char[2]))
  1603. {
  1604. if (isset($utf_char[3]))
  1605. {
  1606. /**
  1607. * 1111 0nnn 10nn nnnn 10nx xxxx 10xx xxxx
  1608. * 0000 0111 0011 1111 0010 0000
  1609. */
  1610. $idx = ((ord($utf_char[0]) & 0x07) << 7) | ((ord($utf_char[1]) & 0x3F) << 1) | ((ord($utf_char[2]) & 0x20) >> 5);
  1611. }
  1612. else
  1613. {
  1614. /**
  1615. * 1110 nnnn 10nx xxxx 10xx xxxx
  1616. * 0000 0111 0010 0000
  1617. */
  1618. $idx = ((ord($utf_char[0]) & 0x07) << 1) | ((ord($utf_char[1]) & 0x20) >> 5);
  1619. }
  1620. }
  1621. else
  1622. {
  1623. /**
  1624. * 110x xxxx 10xx xxxx
  1625. * 0000 0000 0000 0000
  1626. */
  1627. $idx = 0;
  1628. }
  1629. /**
  1630. * Check if the required conv table has been loaded already
  1631. */
  1632. if (!isset($conv_loaded[$idx]))
  1633. {
  1634. $conv_loaded[$idx] = 1;
  1635. $file = $this->phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $this->php_ext;
  1636. if (file_exists($file))
  1637. {
  1638. $conv += include($file);
  1639. }
  1640. }
  1641. if (isset($conv[$utf_char]))
  1642. {
  1643. $ret .= $conv[$utf_char];
  1644. }
  1645. else
  1646. {
  1647. /**
  1648. * We add an entry to the conversion table so that we
  1649. * don't have to convert to codepoint and perform the checks
  1650. * that are above this block
  1651. */
  1652. $conv[$utf_char] = ' ';
  1653. $ret .= ' ';
  1654. }
  1655. }
  1656. while (1);
  1657. return $ret;
  1658. }
  1659. /**
  1660. * Returns a list of options for the ACP to display
  1661. */
  1662. public function acp()
  1663. {
  1664. /**
  1665. * if we need any options, copied from fulltext_native for now, will have to be adjusted or removed
  1666. */
  1667. $tpl = '
  1668. <dl>
  1669. <dt><label for="fulltext_native_load_upd">' . $this->user->lang['YES_SEARCH_UPDATE'] . $this->user->lang['COLON'] . '</label><br /><span>' . $this->user->lang['YES_SEARCH_UPDATE_EXPLAIN'] . '</span></dt>
  1670. <dd><label><input type="radio" id="fulltext_native_load_upd" name="config[fulltext_native_load_upd]" value="1"' . (($this->config['fulltext_native_load_upd']) ? ' checked="checked"' : '') . ' class="radio" /> ' . $this->user->lang['YES'] . '</label><label><input type="radio" name="config[fulltext_native_load_upd]" value="0"' . ((!$this->config['fulltext_native_load_upd']) ? ' checked="checked"' : '') . ' class="radio" /> ' . $this->user->lang['NO'] . '</label></dd>
  1671. </dl>
  1672. <dl>
  1673. <dt><label for="fulltext_native_min_chars">' . $this->user->lang['MIN_SEARCH_CHARS'] . $this->user->lang['COLON'] . '</label><br /><span>' . $this->user->lang['MIN_SEARCH_CHARS_EXPLAIN'] . '</span></dt>
  1674. <dd><input id="fulltext_native_min_chars" type="number" size="3" maxlength="3" min="0" max="255" name="config[fulltext_native_min_chars]" value="' . (int) $this->config['fulltext_native_min_chars'] . '" /></dd>
  1675. </dl>
  1676. <dl>
  1677. <dt><label for="fulltext_native_max_chars">' . $this->user->lang['MAX_SEARCH_CHARS'] . $this->user->lang['COLON'] . '</label><br /><span>' . $this->user->lang['MAX_SEARCH_CHARS_EXPLAIN'] . '</span></dt>
  1678. <dd><input id="fulltext_native_max_chars" type="number" size="3" maxlength="3" min="0" max="255" name="config[fulltext_native_max_chars]" value="' . (int) $this->config['fulltext_native_max_chars'] . '" /></dd>
  1679. </dl>
  1680. <dl>
  1681. <dt><label for="fulltext_native_common_thres">' . $this->user->lang['COMMON_WORD_THRESHOLD'] . $this->user->lang['COLON'] . '</label><br /><span>' . $this->user->lang['COMMON_WORD_THRESHOLD_EXPLAIN'] . '</span></dt>
  1682. <dd><input id="fulltext_native_common_thres" type="text" size="3" maxlength="3" name="config[fulltext_native_common_thres]" value="' . (double) $this->config['fulltext_native_common_thres'] . '" /> %</dd>
  1683. </dl>
  1684. ';
  1685. // These are fields required in the config table
  1686. return array(
  1687. 'tpl' => $tpl,
  1688. 'config' => array('fulltext_native_load_upd' => 'bool', 'fulltext_native_min_chars' => 'integer:0:255', 'fulltext_native_max_chars' => 'integer:0:255', 'fulltext_native_common_thres' => 'double:0:100')
  1689. );
  1690. }
  1691. }