PageRenderTime 60ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/home/administrator/components/com_finder/helpers/indexer/indexer.php

https://bitbucket.org/rubbystar/carimod
PHP | 479 lines | 187 code | 65 blank | 227 comment | 25 complexity | 9a00b13f1d523339eb6cecdf5faad00f MD5 | raw file
Possible License(s): LGPL-2.1, GPL-2.0, GPL-3.0
  1. <?php
  2. /**
  3. * @package Joomla.Administrator
  4. * @subpackage com_finder
  5. *
  6. * @copyright Copyright (C) 2005 - 2016 Open Source Matters, Inc. All rights reserved.
  7. * @license GNU General Public License version 2 or later; see LICENSE
  8. */
  9. defined('_JEXEC') or die;
  10. JLoader::register('FinderIndexerHelper', __DIR__ . '/helper.php');
  11. JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
  12. JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
  13. JLoader::register('FinderIndexerTaxonomy', __DIR__ . '/taxonomy.php');
  14. JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');
  15. jimport('joomla.filesystem.file');
  /**
   * Main indexer class for the Finder indexer package.
   *
   * The indexer class provides the core functionality of the Finder
   * search engine. It is responsible for adding and updating the
   * content links table; extracting and scoring tokens; and maintaining
   * all referential information for the content.
   *
   * Note: All exceptions thrown from within this class should be caught
   * by the controller.
   *
   * @since  2.5
   */
  abstract class FinderIndexer
  {
      /**
       * The title context identifier.
       *
       * @var    integer
       * @since  2.5
       */
      const TITLE_CONTEXT = 1;

      /**
       * The text context identifier.
       *
       * @var    integer
       * @since  2.5
       */
      const TEXT_CONTEXT = 2;

      /**
       * The meta context identifier.
       *
       * @var    integer
       * @since  2.5
       */
      const META_CONTEXT = 3;

      /**
       * The path context identifier.
       *
       * @var    integer
       * @since  2.5
       */
      const PATH_CONTEXT = 4;

      /**
       * The misc context identifier.
       *
       * @var    integer
       * @since  2.5
       */
      const MISC_CONTEXT = 5;

      /**
       * The indexer state object.
       *
       * @var    JObject
       * @since  2.5
       */
      public static $state;

      /**
       * The indexer profiler object.
       *
       * @var    JProfiler
       * @since  2.5
       */
      public static $profiler;

      /**
       * Returns the FinderIndexer driver matching the active database driver.
       *
       * The driver is looked up as driver/<format>.php next to this file and
       * must declare the class FinderIndexerDriver<Format>.
       *
       * @return  FinderIndexer  instance based on the database driver
       *
       * @since   3.0
       * @throws  RuntimeException if driver class for indexer not present.
       */
      public static function getInstance()
      {
          // Database driver names that share an indexer implementation.
          $aliases = array(
              'mysqli'   => 'mysql',
              'pdomysql' => 'mysql',
              'sqlazure' => 'sqlsrv'
          );

          // Setup the adapter for the indexer.
          $format = JFactory::getDbo()->name;

          if (isset($aliases[$format]))
          {
              $format = $aliases[$format];
          }

          $path  = __DIR__ . '/driver/' . $format . '.php';
          $class = 'FinderIndexerDriver' . ucfirst($format);

          // Check if a driver file exists for the format.
          if (file_exists($path))
          {
              include_once $path;

              // Only instantiate when the file really declared the class, so a
              // broken driver file ends in the documented exception instead of
              // a fatal "class not found" error.
              if (class_exists($class))
              {
                  return new $class;
              }
          }

          // Throw invalid format exception.
          throw new RuntimeException(JText::sprintf('COM_FINDER_INDEXER_INVALID_DRIVER', $format));
      }

      /**
       * Method to get the indexer state.
       *
       * The state is taken from the static cache when set, otherwise from the
       * session key "_finder.state", otherwise it is built from the com_finder
       * component parameters. Loading the state also sets up the profiler
       * (when the application is in debug mode) and the stemmer.
       *
       * @return  object  The indexer state object.
       *
       * @since   2.5
       */
      public static function getState()
      {
          // First, try to load from the internal state.
          if (!empty(static::$state))
          {
              return static::$state;
          }

          // If we couldn't load from the internal state, try the session.
          $data = JFactory::getSession()->get('_finder.state', null);

          // If the state is empty, load the values for the first time.
          if (empty($data))
          {
              $data = new JObject;

              // Load the default configuration options.
              $data->options = JComponentHelper::getParams('com_finder');

              // Setup the weight lookup information, one multiplier per context.
              $data->weights = array(
                  self::TITLE_CONTEXT => round($data->options->get('title_multiplier', 1.7), 2),
                  self::TEXT_CONTEXT  => round($data->options->get('text_multiplier', 0.7), 2),
                  self::META_CONTEXT  => round($data->options->get('meta_multiplier', 1.2), 2),
                  self::PATH_CONTEXT  => round($data->options->get('path_multiplier', 2.0), 2),
                  self::MISC_CONTEXT  => round($data->options->get('misc_multiplier', 0.3), 2)
              );

              // Set the current time as the start time.
              $data->startTime = JFactory::getDate()->toSql();

              // Set the remaining default values.
              $data->batchSize   = (int) $data->options->get('batch_size', 50);
              $data->batchOffset = 0;
              $data->totalItems  = 0;
              $data->pluginState = array();
          }

          // Setup the profiler if debugging is enabled.
          if (JFactory::getApplication()->get('debug'))
          {
              static::$profiler = JProfiler::getInstance('FinderIndexer');
          }

          // Setup the stemmer.
          if ($data->options->get('stem', 1) && $data->options->get('stemmer', 'porter_en'))
          {
              FinderIndexerHelper::$stemmer = FinderIndexerStemmer::getInstance($data->options->get('stemmer', 'porter_en'));
          }

          // Set the state.
          static::$state = $data;

          return static::$state;
      }

      /**
       * Method to set the indexer state.
       *
       * The state is stored both in the static cache and in the session under
       * the key "_finder.state".
       *
       * @param   object  $data  A new indexer state object.
       *
       * @return  boolean  True on success, false when $data is not a JObject.
       *
       * @since   2.5
       */
      public static function setState($data)
      {
          // Anything that is not a JObject (including null) is rejected.
          if (!($data instanceof JObject))
          {
              return false;
          }

          // Set the new internal state.
          static::$state = $data;

          // Set the new session state.
          JFactory::getSession()->set('_finder.state', $data);

          return true;
      }

      /**
       * Method to reset the indexer state.
       *
       * @return  void
       *
       * @since   2.5
       */
      public static function resetState()
      {
          // Reset the internal state to null (static:: to match getState/setState).
          static::$state = null;

          // Reset the session state to null.
          JFactory::getSession()->set('_finder.state', null);
      }

      /**
       * Method to index a content item.
       *
       * @param   FinderIndexerResult  $item    The content item to index.
       * @param   string               $format  The format of the content. [optional]
       *
       * @return  integer  The ID of the record in the links table.
       *
       * @since   2.5
       * @throws  Exception on database error.
       */
      abstract public function index($item, $format = 'html');

      /**
       * Method to remove a link from the index.
       *
       * @param   integer  $linkId  The id of the link.
       *
       * @return  boolean  True on success.
       *
       * @since   2.5
       * @throws  Exception on database error.
       */
      abstract public function remove($linkId);

      /**
       * Method to optimize the index. We use this method to remove unused terms
       * and any other optimizations that might be necessary.
       *
       * @return  boolean  True on success.
       *
       * @since   2.5
       * @throws  Exception on database error.
       */
      abstract public function optimize();

      /**
       * Method to get a content item's signature.
       *
       * The signature is the MD5 hash of the serialized item together with the
       * configuration values that influence indexing (context weights and the
       * stemmer settings).
       *
       * @param   object  $item  The content item to index.
       *
       * @return  string  The content item's signature.
       *
       * @since   2.5
       */
      protected static function getSignature($item)
      {
          // Get the indexer state.
          $state = static::getState();

          // Get the relevant configuration variables.
          $config = array(
              $state->weights,
              $state->options->get('stem', 1),
              $state->options->get('stemmer', 'porter_en')
          );

          return md5(serialize(array($item, $config)));
      }

      /**
       * Method to parse input, tokenize it, and then add it to the database.
       *
       * @param   mixed    $input    String or resource to use as input. A resource input will automatically be chunked to conserve
       *                             memory. Strings will be chunked if longer than 2K in size.
       * @param   integer  $context  The context of the input. See context constants.
       * @param   string   $lang     The language of the input.
       * @param   string   $format   The format of the input.
       *
       * @return  integer  The number of tokens extracted from the input.
       *
       * @since   2.5
       */
      protected function tokenizeToDb($input, $context, $lang, $format)
      {
          $count = 0;

          if (empty($input))
          {
              return $count;
          }

          // Size in bytes of one unit of work.
          $chunkSize = 2048;

          // If the input is a resource, batch the process out to avoid memory limits.
          if (is_resource($input))
          {
              // Loop-invariant; getState() also covers a state that was never loaded.
              $limit  = static::getState()->options->get('memory_table_limit', 30000);
              $buffer = '';

              while (!feof($input))
              {
                  $read = fread($input, $chunkSize);

                  // A failed read would never reach end-of-file, so stop here;
                  // whatever is buffered is flushed after the loop.
                  if ($read === false)
                  {
                      break;
                  }

                  $buffer .= $read;

                  /*
                   * If we haven't reached the end of the file, cut at the last
                   * space character and carry the rest over, to make sure we
                   * didn't truncate a term while reading the input.
                   */
                  $ls = feof($input) ? false : strrpos($buffer, ' ');

                  if ($ls !== false && $ls > 0)
                  {
                      $string = substr($buffer, 0, $ls);
                      $buffer = JString::trim(substr($buffer, $ls));
                  }
                  else
                  {
                      // End of input or no usable space: hand over everything and
                      // empty the buffer so the same bytes are not indexed again
                      // on the next iteration.
                      $string = $buffer;
                      $buffer = '';
                  }

                  $count += $this->tokenizeChunk($string, $context, $lang, $format);

                  // Check if we're approaching the memory limit of the token table.
                  if ($count > $limit)
                  {
                      $this->toggleTables(false);
                  }
              }

              // Flush text carried over from the last window if the loop ended
              // without passing through the end-of-file branch.
              if ($buffer !== '')
              {
                  $count += $this->tokenizeChunk($buffer, $context, $lang, $format);

                  if ($count > $limit)
                  {
                      $this->toggleTables(false);
                  }
              }

              return $count;
          }

          /*
           * As it turns out, the complex regular expressions we use for
           * sanitizing input are not very efficient when given large
           * strings. It is much faster to process lots of short strings.
           */
          $length = strlen($input);

          if ($length > $chunkSize)
          {
              $limit  = static::getState()->options->get('memory_table_limit', 30000);
              $offset = 0;

              while ($offset < $length)
              {
                  $string = substr($input, $offset, $chunkSize);

                  // Only look for a break point when more input follows this window.
                  $ls = ($offset + $chunkSize < $length) ? strrpos($string, ' ') : false;

                  if ($ls !== false)
                  {
                      // Cut at the last space and resume right after it.
                      $string  = substr($string, 0, $ls);
                      $offset += $ls + 1;
                  }
                  else
                  {
                      $offset += $chunkSize;
                  }

                  $count += $this->tokenizeChunk($string, $context, $lang, $format);

                  // Check if we're approaching the memory limit of the token table.
                  if ($count > $limit)
                  {
                      $this->toggleTables(false);
                  }
              }

              return $count;
          }

          // Short input: a single pass is enough.
          return $this->tokenizeChunk($input, $context, $lang, $format);
      }

      /**
       * Parses one chunk of text, tokenizes it and stores the tokens.
       *
       * @param   string   $string   The raw text of the chunk.
       * @param   integer  $context  The context of the input. See context constants.
       * @param   string   $lang     The language of the input.
       * @param   string   $format   The format of the input.
       *
       * @return  integer  The value returned by addTokensToDb(), or 0 when the
       *                   parsed chunk is empty.
       *
       * @since   __DEPLOY_VERSION__
       */
      private function tokenizeChunk($string, $context, $lang, $format)
      {
          // Parse the input.
          $string = FinderIndexerHelper::parse($string, $format);

          // Nothing left after parsing, nothing to store.
          if (empty($string))
          {
              return 0;
          }

          // Tokenize the input.
          $tokens = FinderIndexerHelper::tokenize($string, $lang);

          // Add the tokens to the database.
          return $this->addTokensToDb($tokens, $context);
      }

      /**
       * Method to add a set of tokens to the database.
       *
       * @param   mixed  $tokens   An array or single FinderIndexerToken object.
       * @param   mixed  $context  The context of the tokens. See context constants. [optional]
       *
       * @return  integer  The number of tokens inserted into the database.
       *
       * @since   2.5
       * @throws  Exception on database error.
       */
      abstract protected function addTokensToDb($tokens, $context = '');

      /**
       * Method to switch the token tables from Memory tables to MyISAM tables
       * when they are close to running out of memory.
       *
       * @param   boolean  $memory  Flag to control how they should be toggled.
       *
       * @return  boolean  True on success.
       *
       * @since   2.5
       * @throws  Exception on database error.
       */
      abstract protected function toggleTables($memory);
  }