PageRenderTime 59ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/phpBB/includes/search/fulltext_native.php

http://github.com/phpbb/phpbb3
PHP | 1757 lines | 1306 code | 198 blank | 253 comment | 174 complexity | dbfa3c9ac1b0c553850f3e2661a3f416 MD5 | raw file
Possible License(s): AGPL-1.0
  1. <?php
  2. /**
  3. *
  4. * @package search
  5. * @copyright (c) 2005 phpBB Group
  6. * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License v2
  7. *
  8. */
  9. /**
  10. * @ignore
  11. */
  12. if (!defined('IN_PHPBB'))
  13. {
  14. exit;
  15. }
  16. /**
  17. * fulltext_native
  18. * phpBB's own db driven fulltext search, version 2
  19. * @package search
  20. */
  21. class phpbb_search_fulltext_native extends phpbb_search_base
  22. {
  23. var $stats = array();
  24. var $word_length = array();
  25. var $search_query;
  26. var $common_words = array();
  27. var $must_contain_ids = array();
  28. var $must_not_contain_ids = array();
  29. var $must_exclude_one_ids = array();
  30. /**
  31. * Initialises the fulltext_native search backend with min/max word length and makes sure the UTF-8 normalizer is loaded.
  32. *
  33. * @param boolean|string &$error is passed by reference and should either be set to false on success or an error message on failure.
  34. */
  35. public function __construct(&$error)
  36. {
  37. global $phpbb_root_path, $phpEx, $config;
  38. $this->word_length = array('min' => $config['fulltext_native_min_chars'], 'max' => $config['fulltext_native_max_chars']);
  39. /**
  40. * Load the UTF tools
  41. */
  42. if (!class_exists('utf_normalizer'))
  43. {
  44. include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
  45. }
  46. $error = false;
  47. }
  48. /**
  49. * Returns the name of this search backend to be displayed to administrators
  50. *
  51. * @return string Name
  52. */
  53. public function get_name()
  54. {
  55. return 'phpBB Native Fulltext';
  56. }
  57. /**
  58. * This function fills $this->search_query with the cleaned user search query.
  59. *
  60. * If $terms is 'any' then the words will be extracted from the search query
  61. * and combined with | inside brackets. They will afterwards be treated like
  62. * an standard search query.
  63. *
  64. * Then it analyses the query and fills the internal arrays $must_not_contain_ids,
  65. * $must_contain_ids and $must_exclude_one_ids which are later used by keyword_search().
  66. *
  67. * @param string $keywords contains the search query string as entered by the user
  68. * @param string $terms is either 'all' (use search query as entered, default words to 'must be contained in post')
  69. * or 'any' (find all posts containing at least one of the given words)
  70. * @return boolean false if no valid keywords were found and otherwise true
  71. *
  72. * @access public
  73. */
  74. function split_keywords($keywords, $terms)
  75. {
  76. global $db, $user, $config;
  77. $tokens = '+-|()*';
  78. $keywords = trim($this->cleanup($keywords, $tokens));
  79. // allow word|word|word without brackets
  80. if ((strpos($keywords, ' ') === false) && (strpos($keywords, '|') !== false) && (strpos($keywords, '(') === false))
  81. {
  82. $keywords = '(' . $keywords . ')';
  83. }
  84. $open_bracket = $space = false;
  85. for ($i = 0, $n = strlen($keywords); $i < $n; $i++)
  86. {
  87. if ($open_bracket !== false)
  88. {
  89. switch ($keywords[$i])
  90. {
  91. case ')':
  92. if ($open_bracket + 1 == $i)
  93. {
  94. $keywords[$i - 1] = '|';
  95. $keywords[$i] = '|';
  96. }
  97. $open_bracket = false;
  98. break;
  99. case '(':
  100. $keywords[$i] = '|';
  101. break;
  102. case '+':
  103. case '-':
  104. case ' ':
  105. $keywords[$i] = '|';
  106. break;
  107. case '*':
  108. if ($i === 0 || ($keywords[$i - 1] !== '*' && strcspn($keywords[$i - 1], $tokens) === 0))
  109. {
  110. if ($i === $n - 1 || ($keywords[$i + 1] !== '*' && strcspn($keywords[$i + 1], $tokens) === 0))
  111. {
  112. $keywords = substr($keywords, 0, $i) . substr($keywords, $i + 1);
  113. }
  114. }
  115. break;
  116. }
  117. }
  118. else
  119. {
  120. switch ($keywords[$i])
  121. {
  122. case ')':
  123. $keywords[$i] = ' ';
  124. break;
  125. case '(':
  126. $open_bracket = $i;
  127. $space = false;
  128. break;
  129. case '|':
  130. $keywords[$i] = ' ';
  131. break;
  132. case '-':
  133. case '+':
  134. $space = $keywords[$i];
  135. break;
  136. case ' ':
  137. if ($space !== false)
  138. {
  139. $keywords[$i] = $space;
  140. }
  141. break;
  142. default:
  143. $space = false;
  144. }
  145. }
  146. }
  147. if ($open_bracket)
  148. {
  149. $keywords .= ')';
  150. }
  151. $match = array(
  152. '# +#',
  153. '#\|\|+#',
  154. '#(\+|\-)(?:\+|\-)+#',
  155. '#\(\|#',
  156. '#\|\)#',
  157. );
  158. $replace = array(
  159. ' ',
  160. '|',
  161. '$1',
  162. '(',
  163. ')',
  164. );
  165. $keywords = preg_replace($match, $replace, $keywords);
  166. $num_keywords = sizeof(explode(' ', $keywords));
  167. // We limit the number of allowed keywords to minimize load on the database
  168. if ($config['max_num_search_keywords'] && $num_keywords > $config['max_num_search_keywords'])
  169. {
  170. trigger_error($user->lang('MAX_NUM_SEARCH_KEYWORDS_REFINE', $config['max_num_search_keywords'], $num_keywords));
  171. }
  172. // $keywords input format: each word separated by a space, words in a bracket are not separated
  173. // the user wants to search for any word, convert the search query
  174. if ($terms == 'any')
  175. {
  176. $words = array();
  177. preg_match_all('#([^\\s+\\-|()]+)(?:$|[\\s+\\-|()])#u', $keywords, $words);
  178. if (sizeof($words[1]))
  179. {
  180. $keywords = '(' . implode('|', $words[1]) . ')';
  181. }
  182. }
  183. // set the search_query which is shown to the user
  184. $this->search_query = $keywords;
  185. $exact_words = array();
  186. preg_match_all('#([^\\s+\\-|*()]+)(?:$|[\\s+\\-|()])#u', $keywords, $exact_words);
  187. $exact_words = $exact_words[1];
  188. $common_ids = $words = array();
  189. if (sizeof($exact_words))
  190. {
  191. $sql = 'SELECT word_id, word_text, word_common
  192. FROM ' . SEARCH_WORDLIST_TABLE . '
  193. WHERE ' . $db->sql_in_set('word_text', $exact_words) . '
  194. ORDER BY word_count ASC';
  195. $result = $db->sql_query($sql);
  196. // store an array of words and ids, remove common words
  197. while ($row = $db->sql_fetchrow($result))
  198. {
  199. if ($row['word_common'])
  200. {
  201. $this->common_words[] = $row['word_text'];
  202. $common_ids[$row['word_text']] = (int) $row['word_id'];
  203. continue;
  204. }
  205. $words[$row['word_text']] = (int) $row['word_id'];
  206. }
  207. $db->sql_freeresult($result);
  208. }
  209. unset($exact_words);
  210. // now analyse the search query, first split it using the spaces
  211. $query = explode(' ', $keywords);
  212. $this->must_contain_ids = array();
  213. $this->must_not_contain_ids = array();
  214. $this->must_exclude_one_ids = array();
  215. $mode = '';
  216. $ignore_no_id = true;
  217. foreach ($query as $word)
  218. {
  219. if (empty($word))
  220. {
  221. continue;
  222. }
  223. // words which should not be included
  224. if ($word[0] == '-')
  225. {
  226. $word = substr($word, 1);
  227. // a group of which at least one may not be in the resulting posts
  228. if ($word[0] == '(')
  229. {
  230. $word = array_unique(explode('|', substr($word, 1, -1)));
  231. $mode = 'must_exclude_one';
  232. }
  233. // one word which should not be in the resulting posts
  234. else
  235. {
  236. $mode = 'must_not_contain';
  237. }
  238. $ignore_no_id = true;
  239. }
  240. // words which have to be included
  241. else
  242. {
  243. // no prefix is the same as a +prefix
  244. if ($word[0] == '+')
  245. {
  246. $word = substr($word, 1);
  247. }
  248. // a group of words of which at least one word should be in every resulting post
  249. if ($word[0] == '(')
  250. {
  251. $word = array_unique(explode('|', substr($word, 1, -1)));
  252. }
  253. $ignore_no_id = false;
  254. $mode = 'must_contain';
  255. }
  256. if (empty($word))
  257. {
  258. continue;
  259. }
  260. // if this is an array of words then retrieve an id for each
  261. if (is_array($word))
  262. {
  263. $non_common_words = array();
  264. $id_words = array();
  265. foreach ($word as $i => $word_part)
  266. {
  267. if (strpos($word_part, '*') !== false)
  268. {
  269. $id_words[] = '\'' . $db->sql_escape(str_replace('*', '%', $word_part)) . '\'';
  270. $non_common_words[] = $word_part;
  271. }
  272. else if (isset($words[$word_part]))
  273. {
  274. $id_words[] = $words[$word_part];
  275. $non_common_words[] = $word_part;
  276. }
  277. else
  278. {
  279. $len = utf8_strlen($word_part);
  280. if ($len < $this->word_length['min'] || $len > $this->word_length['max'])
  281. {
  282. $this->common_words[] = $word_part;
  283. }
  284. }
  285. }
  286. if (sizeof($id_words))
  287. {
  288. sort($id_words);
  289. if (sizeof($id_words) > 1)
  290. {
  291. $this->{$mode . '_ids'}[] = $id_words;
  292. }
  293. else
  294. {
  295. $mode = ($mode == 'must_exclude_one') ? 'must_not_contain' : $mode;
  296. $this->{$mode . '_ids'}[] = $id_words[0];
  297. }
  298. }
  299. // throw an error if we shall not ignore unexistant words
  300. else if (!$ignore_no_id && sizeof($non_common_words))
  301. {
  302. trigger_error(sprintf($user->lang['WORDS_IN_NO_POST'], implode(', ', $non_common_words)));
  303. }
  304. unset($non_common_words);
  305. }
  306. // else we only need one id
  307. else if (($wildcard = strpos($word, '*') !== false) || isset($words[$word]))
  308. {
  309. if ($wildcard)
  310. {
  311. $len = utf8_strlen(str_replace('*', '', $word));
  312. if ($len >= $this->word_length['min'] && $len <= $this->word_length['max'])
  313. {
  314. $this->{$mode . '_ids'}[] = '\'' . $db->sql_escape(str_replace('*', '%', $word)) . '\'';
  315. }
  316. else
  317. {
  318. $this->common_words[] = $word;
  319. }
  320. }
  321. else
  322. {
  323. $this->{$mode . '_ids'}[] = $words[$word];
  324. }
  325. }
  326. // throw an error if we shall not ignore unexistant words
  327. else if (!$ignore_no_id)
  328. {
  329. if (!isset($common_ids[$word]))
  330. {
  331. $len = utf8_strlen($word);
  332. if ($len >= $this->word_length['min'] && $len <= $this->word_length['max'])
  333. {
  334. trigger_error(sprintf($user->lang['WORD_IN_NO_POST'], $word));
  335. }
  336. else
  337. {
  338. $this->common_words[] = $word;
  339. }
  340. }
  341. }
  342. else
  343. {
  344. $len = utf8_strlen($word);
  345. if ($len < $this->word_length['min'] || $len > $this->word_length['max'])
  346. {
  347. $this->common_words[] = $word;
  348. }
  349. }
  350. }
  351. // we can't search for negatives only
  352. if (!sizeof($this->must_contain_ids))
  353. {
  354. return false;
  355. }
  356. if (!empty($this->search_query))
  357. {
  358. return true;
  359. }
  360. return false;
  361. }
  362. /**
  363. * Performs a search on keywords depending on display specific params. You have to run split_keywords() first.
  364. *
  365. * @param string $type contains either posts or topics depending on what should be searched for
  366. * @param string $fields contains either titleonly (topic titles should be searched), msgonly (only message bodies should be searched), firstpost (only subject and body of the first post should be searched) or all (all post bodies and subjects should be searched)
  367. * @param string $terms is either 'all' (use query as entered, words without prefix should default to "have to be in field") or 'any' (ignore search query parts and just return all posts that contain any of the specified words)
  368. * @param array $sort_by_sql contains SQL code for the ORDER BY part of a query
  369. * @param string $sort_key is the key of $sort_by_sql for the selected sorting
  370. * @param string $sort_dir is either a or d representing ASC and DESC
  371. * @param string $sort_days specifies the maximum amount of days a post may be old
  372. * @param array $ex_fid_ary specifies an array of forum ids which should not be searched
  373. * @param array $m_approve_fid_ary specifies an array of forum ids in which the searcher is allowed to view unapproved posts
  374. * @param int $topic_id is set to 0 or a topic id, if it is not 0 then only posts in this topic should be searched
  375. * @param array $author_ary an array of author ids if the author should be ignored during the search the array is empty
  376. * @param string $author_name specifies the author match, when ANONYMOUS is also a search-match
  377. * @param array &$id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered
  378. * @param int $start indicates the first index of the page
  379. * @param int $per_page number of ids each page is supposed to contain
  380. * @return boolean|int total number of results
  381. *
  382. * @access public
  383. */
  384. function keyword_search($type, $fields, $terms, $sort_by_sql, $sort_key, $sort_dir, $sort_days, $ex_fid_ary, $m_approve_fid_ary, $topic_id, $author_ary, $author_name, &$id_ary, $start, $per_page)
  385. {
  386. global $config, $db;
  387. // No keywords? No posts.
  388. if (empty($this->search_query))
  389. {
  390. return false;
  391. }
  392. $must_contain_ids = $this->must_contain_ids;
  393. $must_not_contain_ids = $this->must_not_contain_ids;
  394. $must_exclude_one_ids = $this->must_exclude_one_ids;
  395. sort($must_contain_ids);
  396. sort($must_not_contain_ids);
  397. sort($must_exclude_one_ids);
  398. // generate a search_key from all the options to identify the results
  399. $search_key = md5(implode('#', array(
  400. serialize($must_contain_ids),
  401. serialize($must_not_contain_ids),
  402. serialize($must_exclude_one_ids),
  403. $type,
  404. $fields,
  405. $terms,
  406. $sort_days,
  407. $sort_key,
  408. $topic_id,
  409. implode(',', $ex_fid_ary),
  410. implode(',', $m_approve_fid_ary),
  411. implode(',', $author_ary),
  412. $author_name,
  413. )));
  414. // try reading the results from cache
  415. $total_results = 0;
  416. if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE)
  417. {
  418. return $total_results;
  419. }
  420. $id_ary = array();
  421. $sql_where = array();
  422. $group_by = false;
  423. $m_num = 0;
  424. $w_num = 0;
  425. $sql_array = array(
  426. 'SELECT' => ($type == 'posts') ? 'p.post_id' : 'p.topic_id',
  427. 'FROM' => array(
  428. SEARCH_WORDMATCH_TABLE => array(),
  429. SEARCH_WORDLIST_TABLE => array(),
  430. ),
  431. 'LEFT_JOIN' => array(array(
  432. 'FROM' => array(POSTS_TABLE => 'p'),
  433. 'ON' => 'm0.post_id = p.post_id',
  434. )),
  435. );
  436. $title_match = '';
  437. $left_join_topics = false;
  438. $group_by = true;
  439. // Build some display specific sql strings
  440. switch ($fields)
  441. {
  442. case 'titleonly':
  443. $title_match = 'title_match = 1';
  444. $group_by = false;
  445. // no break
  446. case 'firstpost':
  447. $left_join_topics = true;
  448. $sql_where[] = 'p.post_id = t.topic_first_post_id';
  449. break;
  450. case 'msgonly':
  451. $title_match = 'title_match = 0';
  452. $group_by = false;
  453. break;
  454. }
  455. if ($type == 'topics')
  456. {
  457. $left_join_topics = true;
  458. $group_by = true;
  459. }
  460. /**
  461. * @todo Add a query optimizer (handle stuff like "+(4|3) +4")
  462. */
  463. foreach ($this->must_contain_ids as $subquery)
  464. {
  465. if (is_array($subquery))
  466. {
  467. $group_by = true;
  468. $word_id_sql = array();
  469. $word_ids = array();
  470. foreach ($subquery as $id)
  471. {
  472. if (is_string($id))
  473. {
  474. $sql_array['LEFT_JOIN'][] = array(
  475. 'FROM' => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
  476. 'ON' => "w$w_num.word_text LIKE $id"
  477. );
  478. $word_ids[] = "w$w_num.word_id";
  479. $w_num++;
  480. }
  481. else
  482. {
  483. $word_ids[] = $id;
  484. }
  485. }
  486. $sql_where[] = $db->sql_in_set("m$m_num.word_id", $word_ids);
  487. unset($word_id_sql);
  488. unset($word_ids);
  489. }
  490. else if (is_string($subquery))
  491. {
  492. $sql_array['FROM'][SEARCH_WORDLIST_TABLE][] = 'w' . $w_num;
  493. $sql_where[] = "w$w_num.word_text LIKE $subquery";
  494. $sql_where[] = "m$m_num.word_id = w$w_num.word_id";
  495. $group_by = true;
  496. $w_num++;
  497. }
  498. else
  499. {
  500. $sql_where[] = "m$m_num.word_id = $subquery";
  501. }
  502. $sql_array['FROM'][SEARCH_WORDMATCH_TABLE][] = 'm' . $m_num;
  503. if ($title_match)
  504. {
  505. $sql_where[] = "m$m_num.$title_match";
  506. }
  507. if ($m_num != 0)
  508. {
  509. $sql_where[] = "m$m_num.post_id = m0.post_id";
  510. }
  511. $m_num++;
  512. }
  513. foreach ($this->must_not_contain_ids as $key => $subquery)
  514. {
  515. if (is_string($subquery))
  516. {
  517. $sql_array['LEFT_JOIN'][] = array(
  518. 'FROM' => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
  519. 'ON' => "w$w_num.word_text LIKE $subquery"
  520. );
  521. $this->must_not_contain_ids[$key] = "w$w_num.word_id";
  522. $group_by = true;
  523. $w_num++;
  524. }
  525. }
  526. if (sizeof($this->must_not_contain_ids))
  527. {
  528. $sql_array['LEFT_JOIN'][] = array(
  529. 'FROM' => array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
  530. 'ON' => $db->sql_in_set("m$m_num.word_id", $this->must_not_contain_ids) . (($title_match) ? " AND m$m_num.$title_match" : '') . " AND m$m_num.post_id = m0.post_id"
  531. );
  532. $sql_where[] = "m$m_num.word_id IS NULL";
  533. $m_num++;
  534. }
  535. foreach ($this->must_exclude_one_ids as $ids)
  536. {
  537. $is_null_joins = array();
  538. foreach ($ids as $id)
  539. {
  540. if (is_string($id))
  541. {
  542. $sql_array['LEFT_JOIN'][] = array(
  543. 'FROM' => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
  544. 'ON' => "w$w_num.word_text LIKE $id"
  545. );
  546. $id = "w$w_num.word_id";
  547. $group_by = true;
  548. $w_num++;
  549. }
  550. $sql_array['LEFT_JOIN'][] = array(
  551. 'FROM' => array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
  552. 'ON' => "m$m_num.word_id = $id AND m$m_num.post_id = m0.post_id" . (($title_match) ? " AND m$m_num.$title_match" : '')
  553. );
  554. $is_null_joins[] = "m$m_num.word_id IS NULL";
  555. $m_num++;
  556. }
  557. $sql_where[] = '(' . implode(' OR ', $is_null_joins) . ')';
  558. }
  559. if (!sizeof($m_approve_fid_ary))
  560. {
  561. $sql_where[] = 'p.post_approved = 1';
  562. }
  563. else if ($m_approve_fid_ary !== array(-1))
  564. {
  565. $sql_where[] = '(p.post_approved = 1 OR ' . $db->sql_in_set('p.forum_id', $m_approve_fid_ary, true) . ')';
  566. }
  567. if ($topic_id)
  568. {
  569. $sql_where[] = 'p.topic_id = ' . $topic_id;
  570. }
  571. if (sizeof($author_ary))
  572. {
  573. if ($author_name)
  574. {
  575. // first one matches post of registered users, second one guests and deleted users
  576. $sql_author = '(' . $db->sql_in_set('p.poster_id', array_diff($author_ary, array(ANONYMOUS)), false, true) . ' OR p.post_username ' . $author_name . ')';
  577. }
  578. else
  579. {
  580. $sql_author = $db->sql_in_set('p.poster_id', $author_ary);
  581. }
  582. $sql_where[] = $sql_author;
  583. }
  584. if (sizeof($ex_fid_ary))
  585. {
  586. $sql_where[] = $db->sql_in_set('p.forum_id', $ex_fid_ary, true);
  587. }
  588. if ($sort_days)
  589. {
  590. $sql_where[] = 'p.post_time >= ' . (time() - ($sort_days * 86400));
  591. }
  592. $sql_array['WHERE'] = implode(' AND ', $sql_where);
  593. $is_mysql = false;
  594. // if the total result count is not cached yet, retrieve it from the db
  595. if (!$total_results)
  596. {
  597. $sql = '';
  598. $sql_array_count = $sql_array;
  599. if ($left_join_topics)
  600. {
  601. $sql_array_count['LEFT_JOIN'][] = array(
  602. 'FROM' => array(TOPICS_TABLE => 't'),
  603. 'ON' => 'p.topic_id = t.topic_id'
  604. );
  605. }
  606. switch ($db->sql_layer)
  607. {
  608. case 'mysql4':
  609. case 'mysqli':
  610. // 3.x does not support SQL_CALC_FOUND_ROWS
  611. // $sql_array['SELECT'] = 'SQL_CALC_FOUND_ROWS ' . $sql_array['SELECT'];
  612. $is_mysql = true;
  613. break;
  614. case 'sqlite':
  615. $sql_array_count['SELECT'] = ($type == 'posts') ? 'DISTINCT p.post_id' : 'DISTINCT p.topic_id';
  616. $sql = 'SELECT COUNT(' . (($type == 'posts') ? 'post_id' : 'topic_id') . ') as total_results
  617. FROM (' . $db->sql_build_query('SELECT', $sql_array_count) . ')';
  618. // no break
  619. default:
  620. $sql_array_count['SELECT'] = ($type == 'posts') ? 'COUNT(DISTINCT p.post_id) AS total_results' : 'COUNT(DISTINCT p.topic_id) AS total_results';
  621. $sql = (!$sql) ? $db->sql_build_query('SELECT', $sql_array_count) : $sql;
  622. $result = $db->sql_query($sql);
  623. $total_results = (int) $db->sql_fetchfield('total_results');
  624. $db->sql_freeresult($result);
  625. if (!$total_results)
  626. {
  627. return false;
  628. }
  629. break;
  630. }
  631. unset($sql_array_count, $sql);
  632. }
  633. // Build sql strings for sorting
  634. $sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC');
  635. switch ($sql_sort[0])
  636. {
  637. case 'u':
  638. $sql_array['FROM'][USERS_TABLE] = 'u';
  639. $sql_where[] = 'u.user_id = p.poster_id ';
  640. break;
  641. case 't':
  642. $left_join_topics = true;
  643. break;
  644. case 'f':
  645. $sql_array['FROM'][FORUMS_TABLE] = 'f';
  646. $sql_where[] = 'f.forum_id = p.forum_id';
  647. break;
  648. }
  649. if ($left_join_topics)
  650. {
  651. $sql_array['LEFT_JOIN'][] = array(
  652. 'FROM' => array(TOPICS_TABLE => 't'),
  653. 'ON' => 'p.topic_id = t.topic_id'
  654. );
  655. }
  656. $sql_array['WHERE'] = implode(' AND ', $sql_where);
  657. $sql_array['GROUP_BY'] = ($group_by) ? (($type == 'posts') ? 'p.post_id' : 'p.topic_id') . ', ' . $sort_by_sql[$sort_key] : '';
  658. $sql_array['ORDER_BY'] = $sql_sort;
  659. unset($sql_where, $sql_sort, $group_by);
  660. $sql = $db->sql_build_query('SELECT', $sql_array);
  661. $result = $db->sql_query_limit($sql, $config['search_block_size'], $start);
  662. while ($row = $db->sql_fetchrow($result))
  663. {
  664. $id_ary[] = (int) $row[(($type == 'posts') ? 'post_id' : 'topic_id')];
  665. }
  666. $db->sql_freeresult($result);
  667. if (!sizeof($id_ary))
  668. {
  669. return false;
  670. }
  671. // if we use mysql and the total result count is not cached yet, retrieve it from the db
  672. if (!$total_results && $is_mysql)
  673. {
  674. // Count rows for the executed queries. Replace $select within $sql with SQL_CALC_FOUND_ROWS, and run it.
  675. $sql_array_copy = $sql_array;
  676. $sql_array_copy['SELECT'] = 'SQL_CALC_FOUND_ROWS p.post_id ';
  677. $sql = $db->sql_build_query('SELECT', $sql_array_copy);
  678. unset($sql_array_copy);
  679. $db->sql_query($sql);
  680. $db->sql_freeresult($result);
  681. $sql = 'SELECT FOUND_ROWS() as total_results';
  682. $result = $db->sql_query($sql);
  683. $total_results = (int) $db->sql_fetchfield('total_results');
  684. $db->sql_freeresult($result);
  685. if (!$total_results)
  686. {
  687. return false;
  688. }
  689. }
  690. // store the ids, from start on then delete anything that isn't on the current page because we only need ids for one page
  691. $this->save_ids($search_key, $this->search_query, $author_ary, $total_results, $id_ary, $start, $sort_dir);
  692. $id_ary = array_slice($id_ary, 0, (int) $per_page);
  693. return $total_results;
  694. }
  695. /**
  696. * Performs a search on an author's posts without caring about message contents. Depends on display specific params
  697. *
  698. * @param string $type contains either posts or topics depending on what should be searched for
  699. * @param boolean $firstpost_only if true, only topic starting posts will be considered
  700. * @param array $sort_by_sql contains SQL code for the ORDER BY part of a query
  701. * @param string $sort_key is the key of $sort_by_sql for the selected sorting
  702. * @param string $sort_dir is either a or d representing ASC and DESC
  703. * @param string $sort_days specifies the maximum amount of days a post may be old
  704. * @param array $ex_fid_ary specifies an array of forum ids which should not be searched
  705. * @param array $m_approve_fid_ary specifies an array of forum ids in which the searcher is allowed to view unapproved posts
  706. * @param int $topic_id is set to 0 or a topic id, if it is not 0 then only posts in this topic should be searched
  707. * @param array $author_ary an array of author ids
  708. * @param string $author_name specifies the author match, when ANONYMOUS is also a search-match
  709. * @param array &$id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered
  710. * @param int $start indicates the first index of the page
  711. * @param int $per_page number of ids each page is supposed to contain
  712. * @return boolean|int total number of results
  713. *
  714. * @access public
  715. */
  716. function author_search($type, $firstpost_only, $sort_by_sql, $sort_key, $sort_dir, $sort_days, $ex_fid_ary, $m_approve_fid_ary, $topic_id, $author_ary, $author_name, &$id_ary, $start, $per_page)
  717. {
  718. global $config, $db;
  719. // No author? No posts.
  720. if (!sizeof($author_ary))
  721. {
  722. return 0;
  723. }
  724. // generate a search_key from all the options to identify the results
  725. $search_key = md5(implode('#', array(
  726. '',
  727. $type,
  728. ($firstpost_only) ? 'firstpost' : '',
  729. '',
  730. '',
  731. $sort_days,
  732. $sort_key,
  733. $topic_id,
  734. implode(',', $ex_fid_ary),
  735. implode(',', $m_approve_fid_ary),
  736. implode(',', $author_ary),
  737. $author_name,
  738. )));
  739. // try reading the results from cache
  740. $total_results = 0;
  741. if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE)
  742. {
  743. return $total_results;
  744. }
  745. $id_ary = array();
  746. // Create some display specific sql strings
  747. if ($author_name)
  748. {
  749. // first one matches post of registered users, second one guests and deleted users
  750. $sql_author = '(' . $db->sql_in_set('p.poster_id', array_diff($author_ary, array(ANONYMOUS)), false, true) . ' OR p.post_username ' . $author_name . ')';
  751. }
  752. else
  753. {
  754. $sql_author = $db->sql_in_set('p.poster_id', $author_ary);
  755. }
  756. $sql_fora = (sizeof($ex_fid_ary)) ? ' AND ' . $db->sql_in_set('p.forum_id', $ex_fid_ary, true) : '';
  757. $sql_time = ($sort_days) ? ' AND p.post_time >= ' . (time() - ($sort_days * 86400)) : '';
  758. $sql_topic_id = ($topic_id) ? ' AND p.topic_id = ' . (int) $topic_id : '';
  759. $sql_firstpost = ($firstpost_only) ? ' AND p.post_id = t.topic_first_post_id' : '';
  760. // Build sql strings for sorting
  761. $sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC');
  762. $sql_sort_table = $sql_sort_join = '';
  763. switch ($sql_sort[0])
  764. {
  765. case 'u':
  766. $sql_sort_table = USERS_TABLE . ' u, ';
  767. $sql_sort_join = ' AND u.user_id = p.poster_id ';
  768. break;
  769. case 't':
  770. $sql_sort_table = ($type == 'posts' && !$firstpost_only) ? TOPICS_TABLE . ' t, ' : '';
  771. $sql_sort_join = ($type == 'posts' && !$firstpost_only) ? ' AND t.topic_id = p.topic_id ' : '';
  772. break;
  773. case 'f':
  774. $sql_sort_table = FORUMS_TABLE . ' f, ';
  775. $sql_sort_join = ' AND f.forum_id = p.forum_id ';
  776. break;
  777. }
  778. if (!sizeof($m_approve_fid_ary))
  779. {
  780. $m_approve_fid_sql = ' AND p.post_approved = 1';
  781. }
  782. else if ($m_approve_fid_ary == array(-1))
  783. {
  784. $m_approve_fid_sql = '';
  785. }
  786. else
  787. {
  788. $m_approve_fid_sql = ' AND (p.post_approved = 1 OR ' . $db->sql_in_set('p.forum_id', $m_approve_fid_ary, true) . ')';
  789. }
  790. $select = ($type == 'posts') ? 'p.post_id' : 't.topic_id';
  791. $is_mysql = false;
  792. // If the cache was completely empty count the results
  793. if (!$total_results)
  794. {
  795. switch ($db->sql_layer)
  796. {
  797. case 'mysql4':
  798. case 'mysqli':
  799. // $select = 'SQL_CALC_FOUND_ROWS ' . $select;
  800. $is_mysql = true;
  801. break;
  802. default:
  803. if ($type == 'posts')
  804. {
  805. $sql = 'SELECT COUNT(p.post_id) as total_results
  806. FROM ' . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t ' : ' ') . "
  807. WHERE $sql_author
  808. $sql_topic_id
  809. $sql_firstpost
  810. $m_approve_fid_sql
  811. $sql_fora
  812. $sql_time";
  813. }
  814. else
  815. {
  816. if ($db->sql_layer == 'sqlite')
  817. {
  818. $sql = 'SELECT COUNT(topic_id) as total_results
  819. FROM (SELECT DISTINCT t.topic_id';
  820. }
  821. else
  822. {
  823. $sql = 'SELECT COUNT(DISTINCT t.topic_id) as total_results';
  824. }
  825. $sql .= ' FROM ' . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p
  826. WHERE $sql_author
  827. $sql_topic_id
  828. $sql_firstpost
  829. $m_approve_fid_sql
  830. $sql_fora
  831. AND t.topic_id = p.topic_id
  832. $sql_time" . (($db->sql_layer == 'sqlite') ? ')' : '');
  833. }
  834. $result = $db->sql_query($sql);
  835. $total_results = (int) $db->sql_fetchfield('total_results');
  836. $db->sql_freeresult($result);
  837. if (!$total_results)
  838. {
  839. return false;
  840. }
  841. break;
  842. }
  843. }
  844. // Build the query for really selecting the post_ids
  845. if ($type == 'posts')
  846. {
  847. $sql = "SELECT $select
  848. FROM " . $sql_sort_table . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t' : '') . "
  849. WHERE $sql_author
  850. $sql_topic_id
  851. $sql_firstpost
  852. $m_approve_fid_sql
  853. $sql_fora
  854. $sql_sort_join
  855. $sql_time
  856. ORDER BY $sql_sort";
  857. $field = 'post_id';
  858. }
  859. else
  860. {
  861. $sql = "SELECT $select
  862. FROM " . $sql_sort_table . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p
  863. WHERE $sql_author
  864. $sql_topic_id
  865. $sql_firstpost
  866. $m_approve_fid_sql
  867. $sql_fora
  868. AND t.topic_id = p.topic_id
  869. $sql_sort_join
  870. $sql_time
  871. GROUP BY t.topic_id, " . $sort_by_sql[$sort_key] . '
  872. ORDER BY ' . $sql_sort;
  873. $field = 'topic_id';
  874. }
  875. // Only read one block of posts from the db and then cache it
  876. $result = $db->sql_query_limit($sql, $config['search_block_size'], $start);
  877. while ($row = $db->sql_fetchrow($result))
  878. {
  879. $id_ary[] = (int) $row[$field];
  880. }
  881. $db->sql_freeresult($result);
  882. if (!$total_results && $is_mysql)
  883. {
  884. // Count rows for the executed queries. Replace $select within $sql with SQL_CALC_FOUND_ROWS, and run it.
  885. $sql = str_replace('SELECT ' . $select, 'SELECT DISTINCT SQL_CALC_FOUND_ROWS p.post_id', $sql);
  886. $db->sql_query($sql);
  887. $db->sql_freeresult($result);
  888. $sql = 'SELECT FOUND_ROWS() as total_results';
  889. $result = $db->sql_query($sql);
  890. $total_results = (int) $db->sql_fetchfield('total_results');
  891. $db->sql_freeresult($result);
  892. if (!$total_results)
  893. {
  894. return false;
  895. }
  896. }
  897. if (sizeof($id_ary))
  898. {
  899. $this->save_ids($search_key, '', $author_ary, $total_results, $id_ary, $start, $sort_dir);
  900. $id_ary = array_slice($id_ary, 0, $per_page);
  901. return $total_results;
  902. }
  903. return false;
  904. }
  905. /**
  906. * Split a text into words of a given length
  907. *
  908. * The text is converted to UTF-8, cleaned up, and split. Then, words that
  909. * conform to the defined length range are returned in an array.
  910. *
  911. * NOTE: duplicates are NOT removed from the return array
  912. *
  913. * @param string $text Text to split, encoded in UTF-8
  914. * @return array Array of UTF-8 words
  915. *
  916. * @access private
  917. */
  918. function split_message($text)
  919. {
  920. global $phpbb_root_path, $phpEx, $user;
  921. $match = $words = array();
  922. /**
  923. * Taken from the original code
  924. */
  925. // Do not index code
  926. $match[] = '#\[code(?:=.*?)?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is';
  927. // BBcode
  928. $match[] = '#\[\/?[a-z0-9\*\+\-]+(?:=.*?)?(?::[a-z])?(\:?[0-9a-z]{5,})\]#';
  929. $min = $this->word_length['min'];
  930. $max = $this->word_length['max'];
  931. $isset_min = $min - 1;
  932. /**
  933. * Clean up the string, remove HTML tags, remove BBCodes
  934. */
  935. $word = strtok($this->cleanup(preg_replace($match, ' ', strip_tags($text)), -1), ' ');
  936. while (strlen($word))
  937. {
  938. if (strlen($word) > 255 || strlen($word) <= $isset_min)
  939. {
  940. /**
  941. * Words longer than 255 bytes are ignored. This will have to be
  942. * changed whenever we change the length of search_wordlist.word_text
  943. *
  944. * Words shorter than $isset_min bytes are ignored, too
  945. */
  946. $word = strtok(' ');
  947. continue;
  948. }
  949. $len = utf8_strlen($word);
  950. /**
  951. * Test whether the word is too short to be indexed.
  952. *
  953. * Note that this limit does NOT apply to CJK and Hangul
  954. */
  955. if ($len < $min)
  956. {
  957. /**
  958. * Note: this could be optimized. If the codepoint is lower than Hangul's range
  959. * we know that it will also be lower than CJK ranges
  960. */
  961. if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
  962. && (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
  963. && (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
  964. {
  965. $word = strtok(' ');
  966. continue;
  967. }
  968. }
  969. $words[] = $word;
  970. $word = strtok(' ');
  971. }
  972. return $words;
  973. }
  974. /**
  975. * Updates wordlist and wordmatch tables when a message is posted or changed
  976. *
  977. * @param string $mode Contains the post mode: edit, post, reply, quote
  978. * @param int $post_id The id of the post which is modified/created
  979. * @param string &$message New or updated post content
  980. * @param string &$subject New or updated post subject
  981. * @param int $poster_id Post author's user id
  982. * @param int $forum_id The id of the forum in which the post is located
  983. *
  984. * @access public
  985. */
  986. function index($mode, $post_id, &$message, &$subject, $poster_id, $forum_id)
  987. {
  988. global $config, $db, $user;
  989. if (!$config['fulltext_native_load_upd'])
  990. {
  991. /**
  992. * The search indexer is disabled, return
  993. */
  994. return;
  995. }
  996. // Split old and new post/subject to obtain array of 'words'
  997. $split_text = $this->split_message($message);
  998. $split_title = $this->split_message($subject);
  999. $cur_words = array('post' => array(), 'title' => array());
  1000. $words = array();
  1001. if ($mode == 'edit')
  1002. {
  1003. $words['add']['post'] = array();
  1004. $words['add']['title'] = array();
  1005. $words['del']['post'] = array();
  1006. $words['del']['title'] = array();
  1007. $sql = 'SELECT w.word_id, w.word_text, m.title_match
  1008. FROM ' . SEARCH_WORDLIST_TABLE . ' w, ' . SEARCH_WORDMATCH_TABLE . " m
  1009. WHERE m.post_id = $post_id
  1010. AND w.word_id = m.word_id";
  1011. $result = $db->sql_query($sql);
  1012. while ($row = $db->sql_fetchrow($result))
  1013. {
  1014. $which = ($row['title_match']) ? 'title' : 'post';
  1015. $cur_words[$which][$row['word_text']] = $row['word_id'];
  1016. }
  1017. $db->sql_freeresult($result);
  1018. $words['add']['post'] = array_diff($split_text, array_keys($cur_words['post']));
  1019. $words['add']['title'] = array_diff($split_title, array_keys($cur_words['title']));
  1020. $words['del']['post'] = array_diff(array_keys($cur_words['post']), $split_text);
  1021. $words['del']['title'] = array_diff(array_keys($cur_words['title']), $split_title);
  1022. }
  1023. else
  1024. {
  1025. $words['add']['post'] = $split_text;
  1026. $words['add']['title'] = $split_title;
  1027. $words['del']['post'] = array();
  1028. $words['del']['title'] = array();
  1029. }
  1030. unset($split_text);
  1031. unset($split_title);
  1032. // Get unique words from the above arrays
  1033. $unique_add_words = array_unique(array_merge($words['add']['post'], $words['add']['title']));
  1034. // We now have unique arrays of all words to be added and removed and
  1035. // individual arrays of added and removed words for text and title. What
  1036. // we need to do now is add the new words (if they don't already exist)
  1037. // and then add (or remove) matches between the words and this post
  1038. if (sizeof($unique_add_words))
  1039. {
  1040. $sql = 'SELECT word_id, word_text
  1041. FROM ' . SEARCH_WORDLIST_TABLE . '
  1042. WHERE ' . $db->sql_in_set('word_text', $unique_add_words);
  1043. $result = $db->sql_query($sql);
  1044. $word_ids = array();
  1045. while ($row = $db->sql_fetchrow($result))
  1046. {
  1047. $word_ids[$row['word_text']] = $row['word_id'];
  1048. }
  1049. $db->sql_freeresult($result);
  1050. $new_words = array_diff($unique_add_words, array_keys($word_ids));
  1051. $db->sql_transaction('begin');
  1052. if (sizeof($new_words))
  1053. {
  1054. $sql_ary = array();
  1055. foreach ($new_words as $word)
  1056. {
  1057. $sql_ary[] = array('word_text' => (string) $word, 'word_count' => 0);
  1058. }
  1059. $db->sql_return_on_error(true);
  1060. $db->sql_multi_insert(SEARCH_WORDLIST_TABLE, $sql_ary);
  1061. $db->sql_return_on_error(false);
  1062. }
  1063. unset($new_words, $sql_ary);
  1064. }
  1065. else
  1066. {
  1067. $db->sql_transaction('begin');
  1068. }
  1069. // now update the search match table, remove links to removed words and add links to new words
  1070. foreach ($words['del'] as $word_in => $word_ary)
  1071. {
  1072. $title_match = ($word_in == 'title') ? 1 : 0;
  1073. if (sizeof($word_ary))
  1074. {
  1075. $sql_in = array();
  1076. foreach ($word_ary as $word)
  1077. {
  1078. $sql_in[] = $cur_words[$word_in][$word];
  1079. }
  1080. $sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
  1081. WHERE ' . $db->sql_in_set('word_id', $sql_in) . '
  1082. AND post_id = ' . intval($post_id) . "
  1083. AND title_match = $title_match";
  1084. $db->sql_query($sql);
  1085. $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
  1086. SET word_count = word_count - 1
  1087. WHERE ' . $db->sql_in_set('word_id', $sql_in) . '
  1088. AND word_count > 0';
  1089. $db->sql_query($sql);
  1090. unset($sql_in);
  1091. }
  1092. }
  1093. $db->sql_return_on_error(true);
  1094. foreach ($words['add'] as $word_in => $word_ary)
  1095. {
  1096. $title_match = ($word_in == 'title') ? 1 : 0;
  1097. if (sizeof($word_ary))
  1098. {
  1099. $sql = 'INSERT INTO ' . SEARCH_WORDMATCH_TABLE . ' (post_id, word_id, title_match)
  1100. SELECT ' . (int) $post_id . ', word_id, ' . (int) $title_match . '
  1101. FROM ' . SEARCH_WORDLIST_TABLE . '
  1102. WHERE ' . $db->sql_in_set('word_text', $word_ary);
  1103. $db->sql_query($sql);
  1104. $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
  1105. SET word_count = word_count + 1
  1106. WHERE ' . $db->sql_in_set('word_text', $word_ary);
  1107. $db->sql_query($sql);
  1108. }
  1109. }
  1110. $db->sql_return_on_error(false);
  1111. $db->sql_transaction('commit');
  1112. // destroy cached search results containing any of the words removed or added
  1113. $this->destroy_cache(array_unique(array_merge($words['add']['post'], $words['add']['title'], $words['del']['post'], $words['del']['title'])), array($poster_id));
  1114. unset($unique_add_words);
  1115. unset($words);
  1116. unset($cur_words);
  1117. }
  1118. /**
  1119. * Removes entries from the wordmatch table for the specified post_ids
  1120. */
  1121. function index_remove($post_ids, $author_ids, $forum_ids)
  1122. {
  1123. global $db;
  1124. if (sizeof($post_ids))
  1125. {
  1126. $sql = 'SELECT w.word_id, w.word_text, m.title_match
  1127. FROM ' . SEARCH_WORDMATCH_TABLE . ' m, ' . SEARCH_WORDLIST_TABLE . ' w
  1128. WHERE ' . $db->sql_in_set('m.post_id', $post_ids) . '
  1129. AND w.word_id = m.word_id';
  1130. $result = $db->sql_query($sql);
  1131. $message_word_ids = $title_word_ids = $word_texts = array();
  1132. while ($row = $db->sql_fetchrow($result))
  1133. {
  1134. if ($row['title_match'])
  1135. {
  1136. $title_word_ids[] = $row['word_id'];
  1137. }
  1138. else
  1139. {
  1140. $message_word_ids[] = $row['word_id'];
  1141. }
  1142. $word_texts[] = $row['word_text'];
  1143. }
  1144. $db->sql_freeresult($result);
  1145. if (sizeof($title_word_ids))
  1146. {
  1147. $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
  1148. SET word_count = word_count - 1
  1149. WHERE ' . $db->sql_in_set('word_id', $title_word_ids) . '
  1150. AND word_count > 0';
  1151. $db->sql_query($sql);
  1152. }
  1153. if (sizeof($message_word_ids))
  1154. {
  1155. $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
  1156. SET word_count = word_count - 1
  1157. WHERE ' . $db->sql_in_set('word_id', $message_word_ids) . '
  1158. AND word_count > 0';
  1159. $db->sql_query($sql);
  1160. }
  1161. unset($title_word_ids);
  1162. unset($message_word_ids);
  1163. $sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
  1164. WHERE ' . $db->sql_in_set('post_id', $post_ids);
  1165. $db->sql_query($sql);
  1166. }
  1167. $this->destroy_cache(array_unique($word_texts), $author_ids);
  1168. }
  1169. /**
  1170. * Tidy up indexes: Tag 'common words' and remove
  1171. * words no longer referenced in the match table
  1172. */
  1173. function tidy()
  1174. {
  1175. global $db, $config;
  1176. // Is the fulltext indexer disabled? If yes then we need not
  1177. // carry on ... it's okay ... I know when I'm not wanted boo hoo
  1178. if (!$config['fulltext_native_load_upd'])
  1179. {
  1180. set_config('search_last_gc', time(), true);
  1181. return;
  1182. }
  1183. $destroy_cache_words = array();
  1184. // Remove common words
  1185. if ($config['num_posts'] >= 100 && $config['fulltext_native_common_thres'])
  1186. {
  1187. $common_threshold = ((double) $config['fulltext_native_common_thres']) / 100.0;
  1188. // First, get the IDs of common words
  1189. $sql = 'SELECT word_id, word_text
  1190. FROM ' . SEARCH_WORDLIST_TABLE . '
  1191. WHERE word_count > ' . floor($config['num_posts'] * $common_threshold) . '
  1192. OR word_common = 1';
  1193. $result = $db->sql_query($sql);
  1194. $sql_in = array();
  1195. while ($row = $db->sql_fetchrow($result))
  1196. {
  1197. $sql_in[] = $row['word_id'];
  1198. $destroy_cache_words[] = $row['word_text'];
  1199. }
  1200. $db->sql_freeresult($result);
  1201. if (sizeof($sql_in))
  1202. {
  1203. // Flag the words
  1204. $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
  1205. SET word_common = 1
  1206. WHERE ' . $db->sql_in_set('word_id', $sql_in);
  1207. $db->sql_query($sql);
  1208. // by setting search_last_gc to the new time here we make sure that if a user reloads because the
  1209. // following query takes too long, he won't run into it again
  1210. set_config('search_last_gc', time(), true);
  1211. // Delete the matches
  1212. $sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
  1213. WHERE ' . $db->sql_in_set('word_id', $sql_in);
  1214. $db->sql_query($sql);
  1215. }
  1216. unset($sql_in);
  1217. }
  1218. if (sizeof($destroy_cache_words))
  1219. {
  1220. // destroy cached search results containing any of the words that are now common or were removed
  1221. $this->destroy_cache(array_unique($destroy_cache_words));
  1222. }
  1223. set_config('search_last_gc', time(), true);
  1224. }
  1225. /**
  1226. * Deletes all words from the index
  1227. */
  1228. function delete_index($acp_module, $u_action)
  1229. {
  1230. global $db;
  1231. switch ($db->sql_layer)
  1232. {
  1233. case 'sqlite':
  1234. case 'firebird':
  1235. $db->sql_query('DELETE FROM ' . SEARCH_WORDLIST_TABLE);
  1236. $db->sql_query('DELETE FROM ' . SEARCH_WORDMATCH_TABLE);
  1237. $db->sql_query('DELETE FROM ' . SEARCH_RESULTS_TABLE);
  1238. break;
  1239. default:
  1240. $db->sql_query('TRUNCATE TABLE ' . SEARCH_WORDLIST_TABLE);
  1241. $db->sql_query('TRUNCATE TABLE ' . SEARCH_WORDMATCH_TABLE);
  1242. $db->sql_query('TRUNCATE TABLE ' . SEARCH_RESULTS_TABLE);
  1243. break;
  1244. }
  1245. }
  1246. /**
  1247. * Returns true if both FULLTEXT indexes exist
  1248. */
  1249. function index_created()
  1250. {
  1251. if (!sizeof($this->stats))
  1252. {
  1253. $this->get_stats();
  1254. }
  1255. return ($this->stats['total_words'] && $this->stats['total_matches']) ? true : false;
  1256. }
  1257. /**
  1258. * Returns an associative array containing information about the indexes
  1259. */
  1260. function index_stats()
  1261. {
  1262. global $user;
  1263. if (!sizeof($this->stats))
  1264. {
  1265. $this->get_stats();
  1266. }
  1267. return array(
  1268. $user->lang['TOTAL_WORDS'] => $this->stats['total_words'],
  1269. $user->lang['TOTAL_MATCHES'] => $this->stats['total_matches']);
  1270. }
  1271. function get_stats()
  1272. {
  1273. global $db;
  1274. $sql = 'SELECT COUNT(*) as total_words
  1275. FROM ' . SEARCH_WORDLIST_TABLE;
  1276. $result = $db->sql_query($sql);
  1277. $this->stats['total_words'] = (int) $db->sql_fetchfield('total_words');
  1278. $db->sql_freeresult($result);
  1279. $sql = 'SELECT COUNT(*) as total_matches
  1280. FROM ' . SEARCH_WORDMATCH_TABLE;
  1281. $result = $db->sql_query($sql);
  1282. $this->stats['total_matches'] = (int) $db->sql_fetchfield('total_matches');
  1283. $db->sql_freeresult($result);
  1284. }
  1285. /**
  1286. * Clean up a text to remove non-alphanumeric characters
  1287. *
  1288. * This method receives a UTF-8 string, normalizes and validates it, replaces all
  1289. * non-alphanumeric characters with strings then returns the result.
  1290. *
  1291. * Any number of "allowed chars" can be passed as a UTF-8 string in NFC.
  1292. *
  1293. * @param string $text Text to split, in UTF-8 (not normalized or sanitized)
  1294. * @param string $allowed_chars String of special chars to allow
  1295. * @param string $encoding Text encoding
  1296. * @return string Cleaned up text, only alphanumeric chars are left
  1297. *
  1298. * @todo normalizer::cleanup being able to be used?
  1299. */
  1300. function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
  1301. {
  1302. global $phpbb_root_path, $phpEx;
  1303. static $conv = array(), $conv_loaded = array();
  1304. $words = $allow = array();
  1305. // Convert the text to UTF-8
  1306. $encoding = strtolower($encoding);
  1307. if ($encoding != 'utf-8')
  1308. {
  1309. $text = utf8_recode($text, $encoding);
  1310. }
  1311. $utf_len_mask = array(
  1312. "\xC0" => 2,
  1313. "\xD0" => 2,
  1314. "\xE0" => 3,
  1315. "\xF0" => 4
  1316. );
  1317. /**
  1318. * Replace HTML entities and NCRs
  1319. */
  1320. $text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);
  1321. /**
  1322. * Load the UTF-8 normalizer
  1323. *
  1324. * If we use it more widely, an instance of that class should be held in a
  1325. * a global variable instead
  1326. */
  1327. utf_normalizer::nfc($text);
  1328. /**
  1329. * The first thing we do is:
  1330. *
  1331. * - convert ASCII-7 letters to lowercase
  1332. * - remove the ASCII-7 non-alpha characters
  1333. * - remove the bytes that should not appear in a valid UTF-8 string: 0xC0,
  1334. * 0xC1 and 0xF5-0xFF
  1335. *
  1336. * @todo in theory, the third one is already taken care of during normalization and those chars should have been replaced by Unicode replacement chars
  1337. */
  1338. $sb_match = "ISTCPAMELRDOJBNHFGVWUQKYXZ\r\n\t!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\xC0\xC1\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
  1339. $sb_replace = 'istcpamelrdojbnhfgvwuqkyxz ';
  1340. /**
  1341. * This is the list of legal ASCII chars, it is automatically extended
  1342. * with ASCII chars from $allowed_chars
  1343. */
  1344. $legal_ascii = ' eaisntroludcpmghbfvq10xy2j9kw354867z';
  1345. /**
  1346. * Prepare an array containing the extra chars to allow
  1347. */
  1348. if (isset($allowed_chars[0]))
  1349. {
  1350. $pos = 0;
  1351. $len = strlen($allowed_chars);
  1352. do
  1353. {
  1354. $c = $allowed_chars[$pos];
  1355. if ($c < "\x80")
  1356. {
  1357. /**
  1358. * ASCII char
  1359. */
  1360. $sb_pos = strpos($sb_match, $c);
  1361. if (is_int($sb_pos))
  1362. {
  1363. /**
  1364. * Remove the char from $sb_match and its corresponding
  1365. * replacement in $sb_replace
  1366. */
  1367. $sb_match = substr($sb_match, 0, $sb_pos) . substr($sb_match, $sb_pos + 1);
  1368. $sb_replace = substr($sb_replace, 0, $sb_pos) . substr($sb_replace, $sb_pos + 1);
  1369. $legal_ascii .= $c;
  1370. }
  1371. ++$pos;
  1372. }
  1373. else
  1374. {
  1375. /**
  1376. * UTF-8 char
  1377. */
  1378. $utf_len = $utf_len_mask[$c & "\xF0"];
  1379. $allow[substr($allowed_chars, $pos, $utf_len)] = 1;
  1380. $pos += $utf_len;
  1381. }
  1382. }
  1383. while ($pos < $len);
  1384. }
  1385. $text = strtr($text, $sb_match, $sb_replace);
  1386. $ret = '';
  1387. $pos = 0;
  1388. $len = strlen($text);
  1389. do
  1390. {
  1391. /**
  1392. * Do all consecutive ASCII chars at once
  1393. */
  1394. if ($spn = strspn($text, $legal_ascii, $pos))
  1395. {
  1396. $ret .= substr($text, $pos, $spn);
  1397. $pos += $spn;
  1398. }
  1399. if ($pos >= $len)
  1400. {
  1401. return $ret;
  1402. }
  1403. /**
  1404. * Capture the UTF char
  1405. */
  1406. $utf_len = $utf_len_mask[$text[$pos] & "\xF0"];
  1407. $utf_char = substr($text, $pos, $utf_len);
  1408. $pos += $utf_len;
  1409. if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
  1410. || ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
  1411. || ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
  1412. {
  1413. /**
  1414. * All characters within these ranges are valid
  1415. *
  1416. * We separate them with a space in order to index each character
  1417. * individually
  1418. */
  1419. $ret .= ' ' . $utf_char . ' ';
  1420. continue;
  1421. }
  1422. if (isset($allow[$utf_char]))
  1423. {
  1424. /**
  1425. * The char is explicitly allowed
  1426. */
  1427. $ret .= $utf_char;
  1428. continue;
  1429. }
  1430. if (isset($conv[$utf_char]))
  1431. {
  1432. /**
  1433. * The char is mapped to something, maybe to itself actually
  1434. */
  1435. $ret .= $conv[$utf_char];
  1436. continue;
  1437. }
  1438. /**
  1439. * The char isn't mapped, but did we load its conversion table?
  1440. *
  1441. * The search indexer table is split into blocks. The block number of
  1442. * each char is equal to its codepoint right-shifted for 11 bits. It
  1443. * means that out of the 11, 16 or 21 meaningful bits of a 2-, 3- or
  1444. * 4- byte sequence we only keep the leftmost 0, 5 or 10 bits. Thus,
  1445. * all UTF chars encoded in 2 bytes are in the same first block.
  1446. */
  1447. if (isset($utf_char[2]))
  1448. {
  1449. if (isset($utf_char[3]))
  1450. {
  1451. /**
  1452. * 1111 0nnn 10nn nnnn 10nx xxxx 10xx xxxx
  1453. * 0000 0111 0011 1111 0010 0000
  1454. */
  1455. $idx = ((ord($utf_char[0]) & 0x07) << 7) | ((ord($utf_char[1]) & 0x3F) << 1) | ((ord($utf_char[2]) & 0x20) >> 5);
  1456. }
  1457. else
  1458. {
  1459. /**
  1460. * 1110 nnnn 10nx xxxx 10xx xxxx
  1461. * 0000 0111 0010 0000
  1462. */
  1463. $idx = ((ord($utf_char[0]) & 0x07) << 1) | ((ord($utf_char[1]) & 0x20) >> 5);
  1464. }
  1465. }
  1466. else
  1467. {
  1468. /**
  1469. * 110x xxxx 10xx xxxx
  1470. * 0000 0000 0000 0000
  1471. */
  1472. $idx = 0;
  1473. }
  1474. /**
  1475. * Check if the required conv table has been loaded already
  1476. */
  1477. if (!isset($conv_loaded[$idx]))
  1478. {
  1479. $conv_loaded[$idx] = 1;
  1480. $file = $phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx;
  1481. if (file_exists($file))
  1482. {
  1483. $conv += include($file);
  1484. }
  1485. }
  1486. if (isset($conv[$utf_char]))
  1487. {
  1488. $ret .= $conv[$utf_char];
  1489. }
  1490. else
  1491. {
  1492. /**
  1493. * We add an entry to the conversion table so that we
  1494. * don't have to convert to codepoint and perform the checks
  1495. * that are above this block
  1496. */
  1497. $conv[$utf_char] = ' ';
  1498. $ret .= ' ';
  1499. }
  1500. }
  1501. while (1);
  1502. return $ret;
  1503. }
  1504. /**
  1505. * Returns a list of options for the ACP to display
  1506. */
  1507. function acp()
  1508. {
  1509. global $user, $config;
  1510. /**
  1511. * if we need any options, copied from fulltext_native for now, will have to be adjusted or removed
  1512. */
  1513. $tpl = '
  1514. <dl>
  1515. <dt><label for="fulltext_native_load_upd">' . $user->lang['YES_SEARCH_UPDATE'] . ':</label><br /><span>' . $user->lang['YES_SEARCH_UPDATE_EXPLAIN'] . '</span></dt>
  1516. <dd><label><input type="radio" id="fulltext_native_load_upd" name="config[fulltext_native_load_upd]" value="1"' . (($config['fulltext_native_load_upd']) ? ' checked="checked"' : '') . ' class="radio" /> ' . $user->lang['YES'] . '</label><label><input type="radio" name="config[fulltext_native_load_upd]" value="0"' . ((!$config['fulltext_native_load_upd']) ? ' checked="checked"' : '') . ' class="radio" /> ' . $user->lang['NO'] . '</label></dd>
  1517. </dl>
  1518. <dl>
  1519. <dt><label for="fulltext_native_min_chars">' . $user->lang['MIN_SEARCH_CHARS'] . ':</label><br /><span>' . $user->lang['MIN_SEARCH_CHARS_EXPLAIN'] . '</span></dt>
  1520. <dd><input id="fulltext_native_min_chars" type="text" size="3" maxlength="3" name="config[fulltext_native_min_chars]" value="' . (int) $config['fulltext_native_min_chars'] . '" /></dd>
  1521. </dl>
  1522. <dl>
  1523. <dt><label for="fulltext_native_max_chars">' . $user->lang['MAX_SEARCH_CHARS'] . ':</label><br /><span>' . $user->lang['MAX_SEARCH_CHARS_EXPLAIN'] . '</span></dt>
  1524. <dd><input id="fulltext_native_max_chars" type="text" size="3" maxlength="3" name="config[fulltext_native_max_chars]" value="' . (int) $config['fulltext_native_max_chars'] . '" /></dd>
  1525. </dl>
  1526. <dl>
  1527. <dt><label for="fulltext_native_common_thres">' . $user->lang['COMMON_WORD_THRESHOLD'] . ':</label><br /><span>' . $user->lang['COMMON_WORD_THRESHOLD_EXPLAIN'] . '</span></dt>
  1528. <dd><input id="fulltext_native_common_thres" type="text" size="3" maxlength="3" name="config[fulltext_native_common_thres]" value="' . (double) $config['fulltext_native_common_thres'] . '" /> %</dd>
  1529. </dl>
  1530. ';
  1531. // These are fields required in the config table
  1532. return array(
  1533. 'tpl' => $tpl,
  1534. 'config' => array('fulltext_native_load_upd' => 'bool', 'fulltext_native_min_chars' => 'integer:0:255', 'fulltext_native_max_chars' => 'integer:0:255', 'fulltext_native_common_thres' => 'double:0:100')
  1535. );
  1536. }
  1537. }