PageRenderTime 51ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 1ms

/administrator/components/com_finder/helpers/indexer/indexer.php

https://bitbucket.org/izubizarreta/https-bitbucket.org-bityvip
PHP | 1336 lines | 656 code | 167 blank | 513 comment | 74 complexity | 15f3abd22bc020b078640696724105c8 MD5 | raw file
Possible License(s): LGPL-3.0, LGPL-2.0, JSON, GPL-2.0, BSD-3-Clause, LGPL-2.1, MIT
  1. <?php
  2. /**
  3. * @package Joomla.Administrator
  4. * @subpackage com_finder
  5. *
  6. * @copyright Copyright (C) 2005 - 2012 Open Source Matters, Inc. All rights reserved.
  7. * @license GNU General Public License version 2 or later; see LICENSE
  8. */
  9. defined('_JEXEC') or die;
  10. // Register dependent classes.
  11. JLoader::register('FinderIndexerHelper', dirname(__FILE__) . '/helper.php');
  12. JLoader::register('FinderIndexerParser', dirname(__FILE__) . '/parser.php');
  13. JLoader::register('FinderIndexerStemmer', dirname(__FILE__) . '/stemmer.php');
  14. JLoader::register('FinderIndexerTaxonomy', dirname(__FILE__) . '/taxonomy.php');
  15. JLoader::register('FinderIndexerToken', dirname(__FILE__) . '/token.php');
  16. jimport('joomla.filesystem.file');
  17. /**
  18. * Main indexer class for the Finder indexer package.
  19. *
  20. * The indexer class provides the core functionality of the Finder
  21. * search engine. It is responsible for adding and updating the
  22. * content links table; extracting and scoring tokens; and maintaining
  23. * all referential information for the content.
  24. *
  25. * Note: All exceptions thrown from within this class should be caught
  26. * by the controller.
  27. *
  28. * @package Joomla.Administrator
  29. * @subpackage com_finder
  30. * @since 2.5
  31. */
  32. class FinderIndexer
  33. {
  34. /**
  35. * The title context identifier.
  36. *
  37. * @var integer
  38. * @since 2.5
  39. */
  40. const TITLE_CONTEXT = 1;
  41. /**
  42. * The text context identifier.
  43. *
  44. * @var integer
  45. * @since 2.5
  46. */
  47. const TEXT_CONTEXT = 2;
  48. /**
  49. * The meta context identifier.
  50. *
  51. * @var integer
  52. * @since 2.5
  53. */
  54. const META_CONTEXT = 3;
  55. /**
  56. * The path context identifier.
  57. *
  58. * @var integer
  59. * @since 2.5
  60. */
  61. const PATH_CONTEXT = 4;
  62. /**
  63. * The misc context identifier.
  64. *
  65. * @var integer
  66. * @since 2.5
  67. */
  68. const MISC_CONTEXT = 5;
  69. /**
  70. * The indexer state object.
  71. *
  72. * @var object
  73. * @since 2.5
  74. */
  75. public static $state;
  76. /**
  77. * The indexer profiler object.
  78. *
  79. * @var object
  80. * @since 2.5
  81. */
  82. public static $profiler;
  83. /**
  84. * Method to get the indexer state.
  85. *
  86. * @return object The indexer state object.
  87. *
  88. * @since 2.5
  89. */
  public static function getState()
  {
      // Fast path: the state has already been resolved during this request.
      if (!empty(self::$state))
      {
          return self::$state;
      }

      // Otherwise fall back to whatever a previous request stored in the session.
      $state = JFactory::getSession()->get('_finder.state', null);

      // Nothing stored yet: build a fresh state object from the component configuration.
      if (empty($state))
      {
          $state = new JObject;

          // Load the default configuration options.
          $params = JComponentHelper::getParams('com_finder');
          $state->options = $params;

          // Context => (configuration key, fallback multiplier) used to build the weight table.
          $multipliers = array(
              self::TITLE_CONTEXT => array('title_multiplier', 1.7),
              self::TEXT_CONTEXT  => array('text_multiplier', 0.7),
              self::META_CONTEXT  => array('meta_multiplier', 1.2),
              self::PATH_CONTEXT  => array('path_multiplier', 2.0),
              self::MISC_CONTEXT  => array('misc_multiplier', 0.3)
          );

          $weights = array();

          foreach ($multipliers as $context => $spec)
          {
              // Each multiplier is rounded to two decimal places.
              $weights[$context] = round($params->get($spec[0], $spec[1]), 2);
          }

          $state->weights = $weights;

          // The moment the state is created is recorded as the start time.
          $state->startTime = JFactory::getDate()->toSQL();

          // Remaining defaults.
          $state->batchSize   = (int) $params->get('batch_size', 50);
          $state->batchOffset = 0;
          $state->totalItems  = 0;
          $state->pluginState = array();
      }

      // The profiler is only created when the application runs in debug mode.
      if (JFactory::getApplication()->getCfg('debug'))
      {
          jimport('joomla.error.profiler');
          self::$profiler = JProfiler::getInstance('FinderIndexer');
      }

      // Install the stemmer when stemming is enabled and a stemmer name is configured.
      if ($state->options->get('stem', 1))
      {
          $stemmerName = $state->options->get('stemmer', 'porter_en');

          if ($stemmerName)
          {
              FinderIndexerHelper::$stemmer = FinderIndexerStemmer::getInstance($stemmerName);
          }
      }

      // Cache the state for the rest of the request and hand it back.
      self::$state = $state;

      return self::$state;
  }
  137. /**
  138. * Method to set the indexer state.
  139. *
  140. * @param object $data A new indexer state object.
  141. *
  142. * @return boolean True on success, false on failure.
  143. *
  144. * @since 2.5
  145. */
  public static function setState($data)
  {
      // Only a non-empty JObject is accepted as a replacement state.
      if (!empty($data) && $data instanceof JObject)
      {
          // Keep the in-memory copy and the session copy in step.
          self::$state = $data;
          JFactory::getSession()->set('_finder.state', $data);

          return true;
      }

      // Anything else is rejected without touching the current state.
      return false;
  }
  160. /**
  161. * Method to reset the indexer state.
  162. *
  163. * @return void
  164. *
  165. * @since 2.5
  166. */
  public static function resetState()
  {
      // Forget the in-memory copy of the state...
      self::$state = null;

      // ...and overwrite the copy stored in the session with null as well.
      JFactory::getSession()->set('_finder.state', null);
  }
  175. /**
  176. * Method to index a content item.
  177. *
  178. * @param FinderIndexerResult $item The content item to index.
  179. * @param string $format The format of the content. [optional]
  180. *
  181. * @return integer The ID of the record in the links table.
  182. *
  183. * @since 2.5
  184. * @throws Exception on database error.
  185. */
  public static function index($item, $format = 'html')
  {
      // Mark beforeIndexing in the profiler. The profiler is created inside
      // getState(), which is only called further down, so this mark is skipped
      // when no earlier call has initialised the state in this request.
      self::$profiler ? self::$profiler->mark('beforeIndexing') : null;

      $db = JFactory::getDBO();
      $nd = $db->getNullDate();

      // Check if the item is in the database.
      $query = $db->getQuery(true);
      $query->select($db->quoteName('link_id') . ', ' . $db->quoteName('md5sum'));
      $query->from($db->quoteName('#__finder_links'));
      $query->where($db->quoteName('url') . ' = ' . $db->quote($item->url));

      // Load the item from the database.
      $db->setQuery($query);
      $link = $db->loadObject();

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      // Get the indexer state.
      $state = FinderIndexer::getState();

      // Row-count threshold above which the token tables are toggled via
      // toggleTables(false).
      $memoryLimit = $state->options->get('memory_table_limit', 30000);

      // Get the signatures of the item.
      $curSig = self::getSignature($item);
      $oldSig = isset($link->md5sum) ? $link->md5sum : null;

      // Get the other item information.
      $linkId = empty($link->link_id) ? null : $link->link_id;
      $isNew  = empty($link->link_id);

      // Check the signatures. If they match, the item is up to date.
      if (!$isNew && $curSig == $oldSig)
      {
          return $linkId;
      }

      /*
       * If the link already exists, flush all the term maps for the item.
       * Maps are stored in 16 tables so we need to iterate through and flush
       * each table one at a time.
       */
      if (!$isNew)
      {
          for ($i = 0; $i <= 15; $i++)
          {
              // Flush the maps for the link.
              $query->clear();
              $query->delete();
              $query->from($db->quoteName('#__finder_links_terms' . dechex($i)));
              $query->where($db->quoteName('link_id') . ' = ' . (int) $linkId);
              $db->setQuery($query);
              $db->query();

              // Check for a database error.
              if ($db->getErrorNum())
              {
                  throw new Exception($db->getErrorMsg(), 500);
              }
          }

          // Remove the taxonomy maps.
          FinderIndexerTaxonomy::removeMaps($linkId);
      }

      // Mark afterUnmapping in the profiler.
      self::$profiler ? self::$profiler->mark('afterUnmapping') : null;

      // Perform cleanup on the item data: dates that evaluate to 0 become the null date.
      $item->publish_start_date = intval($item->publish_start_date) != 0 ? $item->publish_start_date : $nd;
      $item->publish_end_date   = intval($item->publish_end_date) != 0 ? $item->publish_end_date : $nd;
      $item->start_date         = intval($item->start_date) != 0 ? $item->start_date : $nd;
      $item->end_date           = intval($item->end_date) != 0 ? $item->end_date : $nd;

      // Prepare the item description.
      $item->description = FinderIndexerHelper::parse($item->summary);

      /*
       * Now, we need to enter the item into the links table. If the item
       * already exists in the database, we need to use an UPDATE query.
       * Otherwise, we need to use an INSERT to get the link id back.
       */
      if ($isNew)
      {
          $columnsArray = array(
              $db->quoteName('url'), $db->quoteName('route'), $db->quoteName('title'), $db->quoteName('description'),
              $db->quoteName('indexdate'), $db->quoteName('published'), $db->quoteName('state'), $db->quoteName('access'),
              $db->quoteName('language'), $db->quoteName('type_id'), $db->quoteName('object'), $db->quoteName('publish_start_date'),
              $db->quoteName('publish_end_date'), $db->quoteName('start_date'), $db->quoteName('end_date'), $db->quoteName('list_price'),
              $db->quoteName('sale_price')
          );

          // Insert the link.
          $query->clear();
          $query->insert($db->quoteName('#__finder_links'));
          $query->columns($columnsArray);
          $query->values(
              $db->quote($item->url) . ', '
              . $db->quote($item->route) . ', '
              . $db->quote($item->title) . ', '
              . $db->quote($item->description) . ', '
              . $query->currentTimestamp() . ', '
              . '1, '
              . (int) $item->state . ', '
              . (int) $item->access . ', '
              . $db->quote($item->language) . ', '
              . (int) $item->type_id . ', '
              . $db->quote(serialize($item)) . ', '
              . $db->quote($item->publish_start_date) . ', '
              . $db->quote($item->publish_end_date) . ', '
              . $db->quote($item->start_date) . ', '
              . $db->quote($item->end_date) . ', '
              . $db->quote($item->list_price) . ', '
              . $db->quote($item->sale_price)
          );
          $db->setQuery($query);
          $db->query();

          // Check for a database error.
          if ($db->getErrorNum())
          {
              throw new Exception($db->getErrorMsg(), 500);
          }

          // Get the link id.
          $linkId = (int) $db->insertid();
      }
      else
      {
          // Update the existing link row in place.
          $query->clear();
          $query->update($db->quoteName('#__finder_links'));
          $query->set($db->quoteName('route') . ' = ' . $db->quote($item->route));
          $query->set($db->quoteName('title') . ' = ' . $db->quote($item->title));
          $query->set($db->quoteName('description') . ' = ' . $db->quote($item->description));
          $query->set($db->quoteName('indexdate') . ' = ' . $query->currentTimestamp());
          $query->set($db->quoteName('state') . ' = ' . (int) $item->state);
          $query->set($db->quoteName('access') . ' = ' . (int) $item->access);
          $query->set($db->quoteName('language') . ' = ' . $db->quote($item->language));
          $query->set($db->quoteName('type_id') . ' = ' . (int) $item->type_id);
          $query->set($db->quoteName('object') . ' = ' . $db->quote(serialize($item)));
          $query->set($db->quoteName('publish_start_date') . ' = ' . $db->quote($item->publish_start_date));
          $query->set($db->quoteName('publish_end_date') . ' = ' . $db->quote($item->publish_end_date));
          $query->set($db->quoteName('start_date') . ' = ' . $db->quote($item->start_date));
          $query->set($db->quoteName('end_date') . ' = ' . $db->quote($item->end_date));
          $query->set($db->quoteName('list_price') . ' = ' . $db->quote($item->list_price));
          $query->set($db->quoteName('sale_price') . ' = ' . $db->quote($item->sale_price));
          $query->where($db->quoteName('link_id') . ' = ' . (int) $linkId);
          $db->setQuery($query);
          $db->query();

          // Check for a database error.
          if ($db->getErrorNum())
          {
              throw new Exception($db->getErrorMsg(), 500);
          }
      }

      // Running total of the values returned by tokenizeToDB() for this item.
      $count = 0;

      // Mark afterLinking in the profiler.
      self::$profiler ? self::$profiler->mark('afterLinking') : null;

      // Truncate the tokens table.
      $db->truncateTable('#__finder_tokens');

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      // Truncate the tokens aggregate table.
      $db->truncateTable('#__finder_tokens_aggregate');

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      /*
       * Process the item's content. The items can customize their
       * processing instructions to define extra properties to process
       * or rearrange how properties are weighted.
       */
      foreach ($item->getInstructions() as $group => $properties)
      {
          // Iterate through the properties of the group.
          foreach ($properties as $property)
          {
              // Skip properties that are missing or empty on the item.
              if (empty($item->$property))
              {
                  continue;
              }

              if (is_array($item->$property))
              {
                  // Tokenize an array of content and add it to the database.
                  foreach ($item->$property as $ip)
                  {
                      // If the group is path, we need a few extra processing
                      // steps to strip the extension and convert slashes and
                      // dashes to spaces.
                      if ($group === self::PATH_CONTEXT)
                      {
                          $ip = JFile::stripExt($ip);
                          $ip = str_replace('/', ' ', $ip);
                          $ip = str_replace('-', ' ', $ip);
                      }

                      // Tokenize a string of content and add it to the database.
                      $count += FinderIndexer::tokenizeToDB($ip, $group, $item->language, $format);

                      // Check if we're approaching the memory limit of the token table.
                      if ($count > $memoryLimit)
                      {
                          FinderIndexer::toggleTables(false);
                      }
                  }
              }
              else
              {
                  // Same path clean-up as above; here the cleaned value is
                  // written back onto the item itself.
                  if ($group === self::PATH_CONTEXT)
                  {
                      $item->$property = JFile::stripExt($item->$property);
                      $item->$property = str_replace('/', ' ', $item->$property);
                      $item->$property = str_replace('-', ' ', $item->$property);
                  }

                  // Tokenize a string of content and add it to the database.
                  $count += FinderIndexer::tokenizeToDB($item->$property, $group, $item->language, $format);

                  // Check if we're approaching the memory limit of the token table.
                  if ($count > $memoryLimit)
                  {
                      FinderIndexer::toggleTables(false);
                  }
              }
          }
      }

      /*
       * Process the item's taxonomy. The items can customize their
       * taxonomy mappings to define extra properties to map.
       */
      foreach ($item->getTaxonomy() as $branch => $nodes)
      {
          // Iterate through the nodes and map them to the branch.
          foreach ($nodes as $node)
          {
              // Add the node to the tree.
              $nodeId = FinderIndexerTaxonomy::addNode($branch, $node->title, $node->state, $node->access);

              // Add the link => node map.
              FinderIndexerTaxonomy::addMap($linkId, $nodeId);

              // Tokenize the node title and add the tokens to the database.
              $count += FinderIndexer::tokenizeToDB($node->title, self::META_CONTEXT, $item->language, $format);

              // Node titles add to the token table too, so apply the same
              // memory-limit guard used for the item properties above.
              if ($count > $memoryLimit)
              {
                  FinderIndexer::toggleTables(false);
              }
          }
      }

      // Mark afterProcessing in the profiler.
      self::$profiler ? self::$profiler->mark('afterProcessing') : null;

      /*
       * At this point, all of the item's content has been parsed, tokenized
       * and inserted into the #__finder_tokens table. Now, we need to
       * aggregate all the data in that table into a more usable form. The
       * aggregated data will be inserted into the #__finder_tokens_aggregate
       * table. The string below is a sprintf() template: %F receives the
       * context multiplier and both %d placeholders receive the context id.
       */
      $query = 'INSERT INTO ' . $db->quoteName('#__finder_tokens_aggregate') .
          ' (' . $db->quoteName('term_id') .
          ', ' . $db->quoteName('term') .
          ', ' . $db->quoteName('stem') .
          ', ' . $db->quoteName('common') .
          ', ' . $db->quoteName('phrase') .
          ', ' . $db->quoteName('term_weight') .
          ', ' . $db->quoteName('context') .
          ', ' . $db->quoteName('context_weight') . ')' .
          ' SELECT' .
          ' t.term_id, t1.term, t1.stem, t1.common, t1.phrase, t1.weight, t1.context,' .
          ' ROUND( t1.weight * COUNT( t2.term ) * %F, 8 ) AS context_weight' .
          ' FROM (' .
          ' SELECT DISTINCT t1.term, t1.stem, t1.common, t1.phrase, t1.weight, t1.context' .
          ' FROM ' . $db->quoteName('#__finder_tokens') . ' AS t1' .
          ' WHERE t1.context = %d' .
          ' ) AS t1' .
          ' JOIN ' . $db->quoteName('#__finder_tokens') . ' AS t2 ON t2.term = t1.term' .
          ' LEFT JOIN ' . $db->quoteName('#__finder_terms') . ' AS t ON t.term = t1.term' .
          ' WHERE t2.context = %d' .
          ' GROUP BY t1.term' .
          ' ORDER BY t1.term DESC';

      // Iterate through the contexts and aggregate the tokens per context.
      foreach ($state->weights as $context => $multiplier)
      {
          // Run the query to aggregate the tokens for this context.
          $db->setQuery(sprintf($query, $multiplier, $context, $context));
          $db->query();

          // Check for a database error.
          if ($db->getErrorNum())
          {
              throw new Exception($db->getErrorMsg(), 500);
          }
      }

      // Mark afterAggregating in the profiler.
      self::$profiler ? self::$profiler->mark('afterAggregating') : null;

      /*
       * When we pulled down all of the aggregate data, we did a LEFT JOIN
       * over the terms table to try to find all the term ids that
       * already exist for our tokens. If any of the rows in the aggregate
       * table have a term id of 0, then no term record exists for that
       * term so we need to add it to the terms table.
       */
      //@TODO: PostgreSQL doesn't support INSERT IGNORE INTO
      //@TODO: PostgreSQL doesn't support SOUNDEX out of the box
      $db->setQuery(
          'INSERT IGNORE INTO ' . $db->quoteName('#__finder_terms') .
          ' (' . $db->quoteName('term') .
          ', ' . $db->quoteName('stem') .
          ', ' . $db->quoteName('common') .
          ', ' . $db->quoteName('phrase') .
          ', ' . $db->quoteName('weight') .
          ', ' . $db->quoteName('soundex') . ')' .
          ' SELECT ta.term, ta.stem, ta.common, ta.phrase, ta.term_weight, SOUNDEX(ta.term)' .
          ' FROM ' . $db->quoteName('#__finder_tokens_aggregate') . ' AS ta' .
          ' WHERE ta.term_id = 0' .
          ' GROUP BY ta.term'
      );
      $db->query();

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      /*
       * Now, we just inserted a bunch of new records into the terms table
       * so we need to go back and update the aggregate table with all the
       * new term ids.
       */
      $query = $db->getQuery(true);
      $query->update($db->quoteName('#__finder_tokens_aggregate') . ' AS ta');
      $query->join('INNER', $db->quoteName('#__finder_terms') . ' AS t ON t.term = ta.term');
      $query->set('ta.term_id = t.term_id');
      $query->where('ta.term_id = 0');
      $db->setQuery($query);
      $db->query();

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      // Mark afterTerms in the profiler.
      self::$profiler ? self::$profiler->mark('afterTerms') : null;

      /*
       * After we've made sure that all of the terms are in the terms table
       * and the aggregate table has the correct term ids, we need to update
       * the links counter for each term by one.
       *
       * NOTE(review): when an existing link is re-indexed, its maps are
       * flushed above without decrementing the counter, and the counter is
       * incremented again here. Confirm whether the counts can drift.
       */
      $query->clear();
      $query->update($db->quoteName('#__finder_terms') . ' AS t');
      $query->join('INNER', $db->quoteName('#__finder_tokens_aggregate') . ' AS ta ON ta.term_id = t.term_id');
      $query->set('t.' . $db->quoteName('links') . ' = t.links + 1');
      $db->setQuery($query);
      $db->query();

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      // Mark the end of the link counter update. The 'afterTerms' label is
      // reused here, exactly as before, so profiler output is unchanged.
      self::$profiler ? self::$profiler->mark('afterTerms') : null;

      /*
       * Before we can insert all of the mapping rows, we have to figure out
       * which mapping table the rows need to be inserted into. The mapping
       * table for each term is based on the first character of the md5 of
       * the first character of the term. In php, it would be expressed as
       * substr(md5(substr($token, 0, 1)), 0, 1)
       */
      $query->clear();
      $query->update($db->quoteName('#__finder_tokens_aggregate'));
      $query->set($db->quoteName('map_suffix') . ' = SUBSTR(MD5(SUBSTR(' . $db->quoteName('term') . ', 1, 1)), 1, 1)');
      $db->setQuery($query);
      $db->query();

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      /*
       * At this point, the aggregate table contains a record for each
       * term in each context. So, we're going to pull down all of that
       * data while grouping the records by term and add all of the
       * sub-totals together to arrive at the final total for each token for
       * this link. Then, we insert all of that data into the appropriate
       * mapping table.
       */
      for ($i = 0; $i <= 15; $i++)
      {
          // Get the mapping table suffix.
          $suffix = dechex($i);

          /*
           * We have to run this query 16 times, one for each link => term
           * mapping table.
           */
          //@TODO: Convert to JDatabaseQuery
          $db->setQuery(
              'INSERT INTO ' . $db->quoteName('#__finder_links_terms' . $suffix) .
              ' (' . $db->quoteName('link_id') .
              ', ' . $db->quoteName('term_id') .
              ', ' . $db->quoteName('weight') . ')' .
              ' SELECT ' . (int) $linkId . ', ' . $db->quoteName('term_id') . ',' .
              ' ROUND(SUM(' . $db->quoteName('context_weight') . '), 8)' .
              ' FROM ' . $db->quoteName('#__finder_tokens_aggregate') .
              ' WHERE ' . $db->quoteName('map_suffix') . ' = ' . $db->quote($suffix) .
              ' GROUP BY ' . $db->quoteName('term') .
              ' ORDER BY ' . $db->quoteName('term') . ' DESC'
          );
          $db->query();

          // Check for a database error.
          if ($db->getErrorNum())
          {
              throw new Exception($db->getErrorMsg(), 500);
          }
      }

      // Mark afterMapping in the profiler.
      self::$profiler ? self::$profiler->mark('afterMapping') : null;

      // Store the new signature so an unchanged item is skipped next time.
      $query->clear();
      $query->update($db->quoteName('#__finder_links'));
      $query->set($db->quoteName('md5sum') . ' = ' . $db->quote($curSig));
      $query->where($db->quoteName('link_id') . ' = ' . $db->quote($linkId));
      $db->setQuery($query);
      $db->query();

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      // Mark afterSigning in the profiler.
      self::$profiler ? self::$profiler->mark('afterSigning') : null;

      // Truncate the tokens table.
      $db->truncateTable('#__finder_tokens');

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      // Truncate the tokens aggregate table.
      $db->truncateTable('#__finder_tokens_aggregate');

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      // Toggle the token tables back to memory tables.
      FinderIndexer::toggleTables(true);

      // Mark afterTruncating in the profiler.
      self::$profiler ? self::$profiler->mark('afterTruncating') : null;

      return $linkId;
  }
  715. /**
  716. * Method to remove a link from the index.
  717. *
  718. * @param integer $linkId The id of the link.
  719. *
  720. * @return boolean True on success.
  721. *
  722. * @since 2.5
  723. * @throws Exception on database error.
  724. */
  public static function remove($linkId)
  {
      $db    = JFactory::getDBO();
      $query = $db->getQuery(true);

      // Load the indexer state. The returned object is not needed here, but
      // getState() also sets up the profiler and the stemmer.
      FinderIndexer::getState();

      // Update the link counts and remove the mapping records, one pass per
      // mapping table (suffixes 0-f).
      for ($i = 0; $i <= 15; $i++)
      {
          $mapTable = $db->quoteName('#__finder_links_terms' . dechex($i));

          // Start from an empty builder on every pass. Without this, the
          // DELETE parts from the previous pass would still be attached to
          // the query object when the UPDATE below is built.
          $query->clear();

          // Decrement the link count of every term mapped to this link.
          $query->update($db->quoteName('#__finder_terms') . ' AS t');
          $query->join('INNER', $mapTable . ' AS m ON m.term_id = t.term_id');
          $query->set(
              $db->quoteName('t') . '.' . $db->quoteName('links')
              . ' = ' . $db->quoteName('t') . '.' . $db->quoteName('links') . ' - 1'
          );
          $query->where($db->quoteName('m') . '.' . $db->quoteName('link_id') . ' = ' . $db->quote((int) $linkId));
          $db->setQuery($query);
          $db->query();

          // Check for a database error.
          if ($db->getErrorNum())
          {
              throw new Exception($db->getErrorMsg(), 500);
          }

          // Remove all records for this link from the mapping table.
          $query->clear();
          $query->delete();
          $query->from($mapTable);
          $query->where($db->quoteName('link_id') . ' = ' . (int) $linkId);
          $db->setQuery($query);
          $db->query();

          // Check for a database error.
          if ($db->getErrorNum())
          {
              throw new Exception($db->getErrorMsg(), 500);
          }
      }

      // Delete all orphaned terms (terms no link refers to any more).
      $query->clear();
      $query->delete();
      $query->from($db->quoteName('#__finder_terms'));
      $query->where($db->quoteName('links') . ' <= 0');
      $db->setQuery($query);
      $db->query();

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      // Delete the link from the index.
      $query->clear();
      $query->delete();
      $query->from($db->quoteName('#__finder_links'));
      $query->where($db->quoteName('link_id') . ' = ' . $db->quote((int) $linkId));
      $db->setQuery($query);
      $db->query();

      // Check for a database error.
      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      // Remove the taxonomy maps.
      FinderIndexerTaxonomy::removeMaps($linkId);

      // Remove the orphaned taxonomy nodes.
      FinderIndexerTaxonomy::removeOrphanNodes();

      return true;
  }
  /**
   * Method to optimize the index.
   *
   * Deletes terms that are no longer referenced by any link, removes
   * orphaned taxonomy nodes and, when the database driver name starts with
   * "mysql", runs OPTIMIZE TABLE on the links, term mapping and taxonomy
   * mapping tables.
   *
   * @return  boolean  True on success.
   *
   * @since   2.5
   * @throws  Exception on database error.
   */
  public static function optimize()
  {
      // Get the indexer state (the returned object is not needed here).
      FinderIndexer::getState();

      // Get the database object.
      $db = JFactory::getDBO();

      // Delete all orphaned terms.
      $purge = $db->getQuery(true);
      $purge->delete();
      $purge->from($db->quoteName('#__finder_terms'));
      $purge->where($db->quoteName('links') . ' <= 0');
      $db->setQuery($purge);
      $db->query();

      if ($db->getErrorNum())
      {
          throw new Exception($db->getErrorMsg(), 500);
      }

      // @TODO: PostgreSQL doesn't support OPTIMIZE TABLE.
      // Temporary workaround for non-MySQL solutions: only optimize on MySQL drivers.
      $isMysql = (strpos($db->name, 'mysql') === 0);

      if ($isMysql)
      {
          // The links table first, then the sixteen hex-suffixed term
          // mapping tables (0..f), then the un-suffixed mapping table.
          $tables = array('#__finder_links');

          for ($suffix = 0; $suffix <= 15; $suffix++)
          {
              $tables[] = '#__finder_links_terms' . dechex($suffix);
          }

          $tables[] = '#__finder_links_terms';

          foreach ($tables as $table)
          {
              $db->setQuery('OPTIMIZE TABLE ' . $db->quoteName($table));
              $db->query();

              if ($db->getErrorNum())
              {
                  throw new Exception($db->getErrorMsg(), 500);
              }
          }
      }

      // Remove the orphaned taxonomy nodes.
      FinderIndexerTaxonomy::removeOrphanNodes();

      // Optimize the taxonomy mapping table once the orphans are gone.
      if ($isMysql)
      {
          $db->setQuery('OPTIMIZE TABLE ' . $db->quoteName('#__finder_taxonomy_map'));
          $db->query();

          if ($db->getErrorNum())
          {
              throw new Exception($db->getErrorMsg(), 500);
          }
      }

      return true;
  }
  /**
   * Method to get a content item's signature.
   *
   * The signature is the MD5 hash of the serialized item together with the
   * indexer settings read here (the weights, the "stem" flag and the
   * "stemmer" name), so it changes when either the item or those settings
   * change.
   *
   * @param   object  $item  The content item to index.
   *
   * @return  string  The content item's signature.
   *
   * @since   2.5
   */
  protected static function getSignature($item)
  {
      // Get the indexer state.
      $indexerState = FinderIndexer::getState();
      $options = $indexerState->options;

      // Collect the configuration values that take part in the signature.
      $settings = array(
          $indexerState->weights,
          $options->get('stem', 1),
          $options->get('stemmer', 'porter_en')
      );

      $payload = array($item, $settings);

      return md5(serialize($payload));
  }
  /**
   * Method to parse input, tokenize it, and then add it to the database.
   *
   * Three input shapes are handled:
   *  - a stream resource, read in 2048-byte chunks;
   *  - a string longer than 2048 bytes, cut into chunks of at most 2048 bytes;
   *  - a shorter string, processed in one pass.
   *
   * Chunks are cut at the last space character where one exists so that a
   * term is not split across two chunks.
   *
   * @param   mixed    $input    String or resource to use as input. A resource
   *                             input will automatically be chunked to conserve
   *                             memory. Strings will be chunked if longer than
   *                             2K in size.
   * @param   integer  $context  The context of the input. See context constants.
   * @param   string   $lang     The language of the input.
   * @param   string   $format   The format of the input.
   *
   * @return  integer  The number of tokens extracted from the input.
   *
   * @since   2.5
   */
  protected static function tokenizeToDB($input, $context, $lang, $format)
  {
      $count = 0;
      $buffer = null;

      // If the input is a resource, batch the process out.
      if (is_resource($input))
      {
          // Batch the process out to avoid memory limits.
          while (!feof($input))
          {
              // Read into the buffer.
              $buffer .= fread($input, 2048);

              // If we haven't reached the end of the file, seek to the last
              // space character and hold back whatever is after that to make
              // sure we didn't truncate a term while reading the input.
              if (!feof($input))
              {
                  // Find the last space character.
                  $ls = strrpos($buffer, ' ');

                  // Adjust string based on the last space character.
                  if ($ls)
                  {
                      // Truncate the string to the last space character.
                      $string = substr($buffer, 0, $ls);

                      // Carry the remainder over to the next iteration.
                      $buffer = JString::trim(substr($buffer, $ls));
                  }
                  // No usable space character was found.
                  else
                  {
                      $string = $buffer;

                      // The whole buffer is consumed in this pass, so it must
                      // be cleared; otherwise the next read is appended to
                      // the same bytes and they are tokenized a second time.
                      $buffer = null;
                  }
              }
              // We've reached the end of the file, so parse whatever remains.
              else
              {
                  $string = $buffer;
              }

              // Parse the input.
              $string = FinderIndexerHelper::parse($string, $format);

              // Check the input.
              if (empty($string))
              {
                  continue;
              }

              // Tokenize the input.
              $tokens = FinderIndexerHelper::tokenize($string, $lang);

              // Add the tokens to the database.
              $count += FinderIndexer::addTokensToDB($tokens, $context);

              // Check if we're approaching the memory limit of the token table.
              if ($count > self::$state->options->get('memory_table_limit', 30000))
              {
                  FinderIndexer::toggleTables(false);
              }

              unset($string);
              unset($tokens);
          }
      }
      // If the input is greater than 2K in size, it is more efficient to
      // batch out the operation into smaller chunks of work.
      elseif (strlen($input) > 2048)
      {
          $start = 0;
          $end = strlen($input);
          $chunk = 2048;

          // As it turns out, the complex regular expressions we use for
          // sanitizing input are not very efficient when given large
          // strings. It is much faster to process lots of short strings.
          while ($start < $end)
          {
              // Setup the string.
              $string = substr($input, $start, $chunk);

              // Find the last space character if we aren't at the end.
              $ls = (($start + $chunk) < $end ? strrpos($string, ' ') : false);

              if ($ls !== false)
              {
                  // Truncate to the last space character and resume just
                  // after that space on the next iteration.
                  $string = substr($string, 0, $ls);
                  $start += $ls + 1;
              }
              else
              {
                  // No space (or final chunk): consume the whole chunk.
                  $start += $chunk;
              }

              // Parse the input.
              $string = FinderIndexerHelper::parse($string, $format);

              // Check the input.
              if (empty($string))
              {
                  continue;
              }

              // Tokenize the input.
              $tokens = FinderIndexerHelper::tokenize($string, $lang);

              // Add the tokens to the database.
              $count += FinderIndexer::addTokensToDB($tokens, $context);

              // Check if we're approaching the memory limit of the token table.
              if ($count > self::$state->options->get('memory_table_limit', 30000))
              {
                  FinderIndexer::toggleTables(false);
              }
          }
      }
      else
      {
          // Parse the input.
          $input = FinderIndexerHelper::parse($input, $format);

          // Check the input.
          if (empty($input))
          {
              return $count;
          }

          // Tokenize the input.
          $tokens = FinderIndexerHelper::tokenize($input, $lang);

          // Add the tokens to the database.
          $count = FinderIndexer::addTokensToDB($tokens, $context);
      }

      return $count;
  }
  /**
   * Method to add a set of tokens to the database.
   *
   * All tokens are written to the #__finder_tokens table with a single
   * multi-row INSERT. Term and stem are quoted through the database driver;
   * the remaining columns are cast to numeric types before being placed in
   * the statement.
   *
   * @param   mixed  $tokens   An array or single FinderIndexerToken object.
   * @param   mixed  $context  The context of the tokens. See context constants. [optional]
   *
   * @return  integer  The number of tokens inserted into the database.
   *
   * @since   2.5
   * @throws  Exception on database error.
   */
  protected static function addTokensToDB($tokens, $context = '')
  {
      // Force tokens to an array.
      $tokens = is_array($tokens) ? $tokens : array($tokens);

      // Nothing to insert: skip the query altogether. Without this guard an
      // INSERT with no value sets would be sent to the database.
      if ($tokens === array())
      {
          return 0;
      }

      // Get the database object.
      $db = JFactory::getDBO();
      $query = $db->getQuery(true);

      // Count the number of token values.
      $values = 0;

      // Iterate through the tokens to create SQL value sets.
      foreach ($tokens as $token)
      {
          $query->values(
              $db->quote($token->term) . ', '
              . $db->quote($token->stem) . ', '
              . (int) $token->common . ', '
              . (int) $token->phrase . ', '
              . (float) $token->weight . ', '
              . (int) $context
          );
          $values++;
      }

      // Insert the tokens into the database.
      $query->insert($db->quoteName('#__finder_tokens'));
      $query->columns(
          array(
              $db->quoteName('term'),
              $db->quoteName('stem'),
              $db->quoteName('common'),
              $db->quoteName('phrase'),
              $db->quoteName('weight'),
              $db->quoteName('context')
          )
      );
      $db->setQuery($query);
      $db->query();

      // Check for a database error.
      if ($db->getErrorNum())
      {
          // Throw database error exception.
          throw new Exception($db->getErrorMsg(), 500);
      }

      return $values;
  }
  /**
   * Method to switch the token tables (#__finder_tokens and
   * #__finder_tokens_aggregate) between the MEMORY and MyISAM engines.
   *
   * The last engine applied is remembered in a function-static variable so
   * that repeated requests for the same engine issue no further ALTER TABLE
   * statements. On drivers whose name does not start with "mysql" the
   * method does nothing and reports success.
   *
   * @param   boolean  $memory  True to switch to MEMORY, false to switch to
   *                            MyISAM. Any other value is ignored.
   *
   * @return  boolean  True on success.
   *
   * @since   2.5
   * @throws  Exception on database error.
   * @todo    PostgreSQL doesn't support setting ENGINEs, determine how to handle setting tables
   */
  protected static function toggleTables($memory)
  {
      // Engine mode applied by the most recent successful toggle.
      static $appliedMode;

      // Get the database adapter.
      $db = JFactory::getDBO();

      // Temporary workaround for non-MySQL solutions.
      if (strpos($db->name, 'mysql') !== 0)
      {
          return true;
      }

      // Work out which engine is wanted, if a change is needed at all.
      if ($memory === true && $appliedMode !== true)
      {
          $engineClause = ' ENGINE = MEMORY';
      }
      elseif ($memory === false && $appliedMode !== false)
      {
          $engineClause = ' ENGINE = MYISAM';
      }
      else
      {
          // Already in the requested mode, or not a strict boolean.
          return true;
      }

      // Switch the tokens table first, then the tokens aggregate table.
      foreach (array('#__finder_tokens', '#__finder_tokens_aggregate') as $table)
      {
          $db->setQuery('ALTER TABLE ' . $db->quoteName($table) . $engineClause);
          $db->query();

          // Check for a database error.
          if ($db->getErrorNum())
          {
              throw new Exception($db->getErrorMsg(), 500);
          }
      }

      // Remember the mode only after both tables were altered.
      $appliedMode = $memory;

      return true;
  }
  1162. }