PageRenderTime 44ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/joomla/administrator/components/com_finder/helpers/indexer/indexer.php

https://gitlab.com/ricardosanchez/prueba
PHP | 484 lines | 191 code | 64 blank | 229 comment | 25 complexity | f8ef6d9c91b2d3b864817d8708a54e6a MD5 | raw file
  1. <?php
  2. /**
  3. * @package Joomla.Administrator
  4. * @subpackage com_finder
  5. *
  6. * @copyright Copyright (C) 2005 - 2015 Open Source Matters, Inc. All rights reserved.
  7. * @license GNU General Public License version 2 or later; see LICENSE
  8. */
  9. defined('_JEXEC') or die;
  10. JLoader::register('FinderIndexerHelper', __DIR__ . '/helper.php');
  11. JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
  12. JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
  13. JLoader::register('FinderIndexerTaxonomy', __DIR__ . '/taxonomy.php');
  14. JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');
  15. jimport('joomla.filesystem.file');
  16. /**
  17. * Main indexer class for the Finder indexer package.
  18. *
  19. * The indexer class provides the core functionality of the Finder
  20. * search engine. It is responsible for adding and updating the
  21. * content links table; extracting and scoring tokens; and maintaining
  22. * all referential information for the content.
  23. *
  24. * Note: All exceptions thrown from within this class should be caught
  25. * by the controller.
  26. *
  27. * @since 2.5
  28. */
  29. abstract class FinderIndexer
  30. {
  31. /**
  32. * The title context identifier.
  33. *
  34. * @var integer
  35. * @since 2.5
  36. */
  37. const TITLE_CONTEXT = 1;
  38. /**
  39. * The text context identifier.
  40. *
  41. * @var integer
  42. * @since 2.5
  43. */
  44. const TEXT_CONTEXT = 2;
  45. /**
  46. * The meta context identifier.
  47. *
  48. * @var integer
  49. * @since 2.5
  50. */
  51. const META_CONTEXT = 3;
  52. /**
  53. * The path context identifier.
  54. *
  55. * @var integer
  56. * @since 2.5
  57. */
  58. const PATH_CONTEXT = 4;
  59. /**
  60. * The misc context identifier.
  61. *
  62. * @var integer
  63. * @since 2.5
  64. */
  65. const MISC_CONTEXT = 5;
  66. /**
  67. * The indexer state object.
  68. *
  69. * @var object
  70. * @since 2.5
  71. */
  72. public static $state;
  73. /**
  74. * The indexer profiler object.
  75. *
  76. * @var object
  77. * @since 2.5
  78. */
  79. public static $profiler;
  80. /**
  81. * Returns a reference to the FinderIndexer object.
  82. *
  83. * @return FinderIndexer instance based on the database driver
  84. *
  85. * @since 3.0
  86. * @throws RuntimeException if driver class for indexer not present.
  87. */
  88. public static function getInstance()
  89. {
  90. // Setup the adapter for the indexer.
  91. $format = JFactory::getDbo()->name;
  92. if ($format == 'mysqli' || $format == 'pdomysql')
  93. {
  94. $format = 'mysql';
  95. }
  96. elseif ($format == 'sqlazure')
  97. {
  98. $format = 'sqlsrv';
  99. }
  100. $path = __DIR__ . '/driver/' . $format . '.php';
  101. $class = 'FinderIndexerDriver' . ucfirst($format);
  102. // Check if a parser exists for the format.
  103. if (file_exists($path))
  104. {
  105. // Instantiate the parser.
  106. include_once $path;
  107. return new $class;
  108. }
  109. else
  110. {
  111. // Throw invalid format exception.
  112. throw new RuntimeException(JText::sprintf('COM_FINDER_INDEXER_INVALID_DRIVER', $format));
  113. }
  114. }
  115. /**
  116. * Method to get the indexer state.
  117. *
  118. * @return object The indexer state object.
  119. *
  120. * @since 2.5
  121. */
  122. public static function getState()
  123. {
  124. // First, try to load from the internal state.
  125. if (!empty(self::$state))
  126. {
  127. return self::$state;
  128. }
  129. // If we couldn't load from the internal state, try the session.
  130. $session = JFactory::getSession();
  131. $data = $session->get('_finder.state', null);
  132. // If the state is empty, load the values for the first time.
  133. if (empty($data))
  134. {
  135. $data = new JObject;
  136. // Load the default configuration options.
  137. $data->options = JComponentHelper::getParams('com_finder');
  138. // Setup the weight lookup information.
  139. $data->weights = array(
  140. self::TITLE_CONTEXT => round($data->options->get('title_multiplier', 1.7), 2),
  141. self::TEXT_CONTEXT => round($data->options->get('text_multiplier', 0.7), 2),
  142. self::META_CONTEXT => round($data->options->get('meta_multiplier', 1.2), 2),
  143. self::PATH_CONTEXT => round($data->options->get('path_multiplier', 2.0), 2),
  144. self::MISC_CONTEXT => round($data->options->get('misc_multiplier', 0.3), 2)
  145. );
  146. // Set the current time as the start time.
  147. $data->startTime = JFactory::getDate()->toSql();
  148. // Set the remaining default values.
  149. $data->batchSize = (int) $data->options->get('batch_size', 50);
  150. $data->batchOffset = 0;
  151. $data->totalItems = 0;
  152. $data->pluginState = array();
  153. }
  154. // Setup the profiler if debugging is enabled.
  155. if (JFactory::getApplication()->get('debug'))
  156. {
  157. self::$profiler = JProfiler::getInstance('FinderIndexer');
  158. }
  159. // Setup the stemmer.
  160. if ($data->options->get('stem', 1) && $data->options->get('stemmer', 'porter_en'))
  161. {
  162. FinderIndexerHelper::$stemmer = FinderIndexerStemmer::getInstance($data->options->get('stemmer', 'porter_en'));
  163. }
  164. // Set the state.
  165. self::$state = $data;
  166. return self::$state;
  167. }
  168. /**
  169. * Method to set the indexer state.
  170. *
  171. * @param object $data A new indexer state object.
  172. *
  173. * @return boolean True on success, false on failure.
  174. *
  175. * @since 2.5
  176. */
  177. public static function setState($data)
  178. {
  179. // Check the state object.
  180. if (empty($data) || !$data instanceof JObject)
  181. {
  182. return false;
  183. }
  184. // Set the new internal state.
  185. self::$state = $data;
  186. // Set the new session state.
  187. $session = JFactory::getSession();
  188. $session->set('_finder.state', $data);
  189. return true;
  190. }
  191. /**
  192. * Method to reset the indexer state.
  193. *
  194. * @return void
  195. *
  196. * @since 2.5
  197. */
  198. public static function resetState()
  199. {
  200. // Reset the internal state to null.
  201. self::$state = null;
  202. // Reset the session state to null.
  203. $session = JFactory::getSession();
  204. $session->set('_finder.state', null);
  205. }
  206. /**
  207. * Method to index a content item.
  208. *
  209. * @param FinderIndexerResult $item The content item to index.
  210. * @param string $format The format of the content. [optional]
  211. *
  212. * @return integer The ID of the record in the links table.
  213. *
  214. * @since 2.5
  215. * @throws Exception on database error.
  216. */
  217. abstract public function index($item, $format = 'html');
  218. /**
  219. * Method to remove a link from the index.
  220. *
  221. * @param integer $linkId The id of the link.
  222. *
  223. * @return boolean True on success.
  224. *
  225. * @since 2.5
  226. * @throws Exception on database error.
  227. */
  228. abstract public function remove($linkId);
  229. /**
  230. * Method to optimize the index. We use this method to remove unused terms
  231. * and any other optimizations that might be necessary.
  232. *
  233. * @return boolean True on success.
  234. *
  235. * @since 2.5
  236. * @throws Exception on database error.
  237. */
  238. abstract public function optimize();
  239. /**
  240. * Method to get a content item's signature.
  241. *
  242. * @param object $item The content item to index.
  243. *
  244. * @return string The content item's signature.
  245. *
  246. * @since 2.5
  247. */
  248. protected static function getSignature($item)
  249. {
  250. // Get the indexer state.
  251. $state = self::getState();
  252. // Get the relevant configuration variables.
  253. $config = array();
  254. $config[] = $state->weights;
  255. $config[] = $state->options->get('stem', 1);
  256. $config[] = $state->options->get('stemmer', 'porter_en');
  257. return md5(serialize(array($item, $config)));
  258. }
  259. /**
  260. * Method to parse input, tokenize it, and then add it to the database.
  261. *
  262. * @param mixed $input String or resource to use as input. A resource
  263. * input will automatically be chunked to conserve
  264. * memory. Strings will be chunked if longer than
  265. * 2K in size.
  266. * @param integer $context The context of the input. See context constants.
  267. * @param string $lang The language of the input.
  268. * @param string $format The format of the input.
  269. *
  270. * @return integer The number of tokens extracted from the input.
  271. *
  272. * @since 2.5
  273. */
  274. protected function tokenizeToDb($input, $context, $lang, $format)
  275. {
  276. $count = 0;
  277. $buffer = null;
  278. if (!empty($input))
  279. {
  280. // If the input is a resource, batch the process out.
  281. if (is_resource($input))
  282. {
  283. // Batch the process out to avoid memory limits.
  284. while (!feof($input))
  285. {
  286. // Read into the buffer.
  287. $buffer .= fread($input, 2048);
  288. /*
  289. * If we haven't reached the end of the file, seek to the last
  290. * space character and drop whatever is after that to make sure
  291. * we didn't truncate a term while reading the input.
  292. */
  293. if (!feof($input))
  294. {
  295. // Find the last space character.
  296. $ls = strrpos($buffer, ' ');
  297. // Adjust string based on the last space character.
  298. if ($ls)
  299. {
  300. // Truncate the string to the last space character.
  301. $string = substr($buffer, 0, $ls);
  302. // Adjust the buffer based on the last space for the next iteration and trim.
  303. $buffer = JString::trim(substr($buffer, $ls));
  304. }
  305. // No space character was found.
  306. else
  307. {
  308. $string = $buffer;
  309. }
  310. }
  311. // We've reached the end of the file, so parse whatever remains.
  312. else
  313. {
  314. $string = $buffer;
  315. }
  316. // Parse the input.
  317. $string = FinderIndexerHelper::parse($string, $format);
  318. // Check the input.
  319. if (empty($string))
  320. {
  321. continue;
  322. }
  323. // Tokenize the input.
  324. $tokens = FinderIndexerHelper::tokenize($string, $lang);
  325. // Add the tokens to the database.
  326. $count += $this->addTokensToDb($tokens, $context);
  327. // Check if we're approaching the memory limit of the token table.
  328. if ($count > self::$state->options->get('memory_table_limit', 30000))
  329. {
  330. $this->toggleTables(false);
  331. }
  332. unset($string);
  333. unset($tokens);
  334. }
  335. }
  336. // If the input is greater than 2K in size, it is more efficient to
  337. // batch out the operation into smaller chunks of work.
  338. elseif (strlen($input) > 2048)
  339. {
  340. $start = 0;
  341. $end = strlen($input);
  342. $chunk = 2048;
  343. /*
  344. * As it turns out, the complex regular expressions we use for
  345. * sanitizing input are not very efficient when given large
  346. * strings. It is much faster to process lots of short strings.
  347. */
  348. while ($start < $end)
  349. {
  350. // Setup the string.
  351. $string = substr($input, $start, $chunk);
  352. // Find the last space character if we aren't at the end.
  353. $ls = (($start + $chunk) < $end ? strrpos($string, ' ') : false);
  354. // Truncate to the last space character.
  355. if ($ls !== false)
  356. {
  357. $string = substr($string, 0, $ls);
  358. }
  359. // Adjust the start position for the next iteration.
  360. $start += ($ls !== false ? ($ls + 1 - $chunk) + $chunk : $chunk);
  361. // Parse the input.
  362. $string = FinderIndexerHelper::parse($string, $format);
  363. // Check the input.
  364. if (empty($string))
  365. {
  366. continue;
  367. }
  368. // Tokenize the input.
  369. $tokens = FinderIndexerHelper::tokenize($string, $lang);
  370. // Add the tokens to the database.
  371. $count += $this->addTokensToDb($tokens, $context);
  372. // Check if we're approaching the memory limit of the token table.
  373. if ($count > self::$state->options->get('memory_table_limit', 30000))
  374. {
  375. $this->toggleTables(false);
  376. }
  377. }
  378. }
  379. else
  380. {
  381. // Parse the input.
  382. $input = FinderIndexerHelper::parse($input, $format);
  383. // Check the input.
  384. if (empty($input))
  385. {
  386. return $count;
  387. }
  388. // Tokenize the input.
  389. $tokens = FinderIndexerHelper::tokenize($input, $lang);
  390. // Add the tokens to the database.
  391. $count = $this->addTokensToDb($tokens, $context);
  392. }
  393. }
  394. return $count;
  395. }
  396. /**
  397. * Method to add a set of tokens to the database.
  398. *
  399. * @param mixed $tokens An array or single FinderIndexerToken object.
  400. * @param mixed $context The context of the tokens. See context constants. [optional]
  401. *
  402. * @return integer The number of tokens inserted into the database.
  403. *
  404. * @since 2.5
  405. * @throws Exception on database error.
  406. */
  407. abstract protected function addTokensToDb($tokens, $context = '');
  408. /**
  409. * Method to switch the token tables from Memory tables to MyISAM tables
  410. * when they are close to running out of memory.
  411. *
  412. * @param boolean $memory Flag to control how they should be toggled.
  413. *
  414. * @return boolean True on success.
  415. *
  416. * @since 2.5
  417. * @throws Exception on database error.
  418. */
  419. abstract protected function toggleTables($memory);
  420. }