PageRenderTime 44ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/administrator/components/com_finder/helpers/indexer/helper.php

https://bitbucket.org/izubizarreta/https-bitbucket.org-bityvip
PHP | 528 lines | 253 code | 64 blank | 211 comment | 31 complexity | 32937869f0c08106d3bb409f969dc23d MD5 | raw file
Possible License(s): LGPL-3.0, LGPL-2.0, JSON, GPL-2.0, BSD-3-Clause, LGPL-2.1, MIT
  1. <?php
  2. /**
  3. * @package Joomla.Administrator
  4. * @subpackage com_finder
  5. *
  6. * @copyright Copyright (C) 2005 - 2012 Open Source Matters, Inc. All rights reserved.
  7. * @license GNU General Public License version 2 or later; see LICENSE
  8. */
  9. defined('_JEXEC') or die;
  10. // Register dependent classes.
  11. JLoader::register('FinderIndexerStemmer', dirname(__FILE__) . '/stemmer.php');
  12. JLoader::register('FinderIndexerToken', dirname(__FILE__) . '/token.php');
  13. /**
  14. * Helper class for the Finder indexer package.
  15. *
  16. * @package Joomla.Administrator
  17. * @subpackage com_finder
  18. * @since 2.5
  19. */
  20. class FinderIndexerHelper
  21. {
  22. /**
  23. * The token stemmer object. The stemmer is set by whatever class
  24. * wishes to use it but it must be an instance of FinderIndexerStemmer.
  25. *
  26. * @var FinderIndexerStemmer
  27. * @since 2.5
  28. */
  29. public static $stemmer;
  /**
   * Convert raw input of a given format into plain text.
   *
   * The work is delegated to the parser object that
   * FinderIndexerParser::getInstance() hands back for the requested format.
   *
   * @param   string  $input   The raw input.
   * @param   string  $format  The format of the input. [optional]
   *
   * @return  string  The parsed input.
   *
   * @since   2.5
   * @throws  Exception on invalid parser.
   */
  public static function parse($input, $format = 'html')
  {
    // Resolve the parser first, then let it do the conversion.
    $parser = FinderIndexerParser::getInstance($format);

    return $parser->parse($input);
  }
  /**
   * Method to tokenize a text string.
   *
   * The input is lower-cased, stripped of stray punctuation and split on
   * spaces. Unless the input is treated as a phrase, every term becomes a
   * token and additional two and three term tokens (flagged as derived) are
   * built from neighbouring terms. Results for inputs shorter than 128
   * characters are memoised for the lifetime of the request.
   *
   * @param   string   $input   The input to tokenize.
   * @param   string   $lang    The language of the input.
   * @param   boolean  $phrase  Flag to indicate whether input could be a phrase. [optional]
   *
   * @return  mixed  An array of FinderIndexerToken objects when more than one
   *                 token was produced, otherwise the single FinderIndexerToken.
   *
   * @since   2.5
   */
  public static function tokenize($input, $lang, $phrase = false)
  {
    static $cache;

    // Only short inputs are cached; the key is built from the arguments exactly as passed in.
    $store = JString::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;

    // Check if the string has been tokenized already.
    if ($store && isset($cache[$store]))
    {
      return $cache[$store];
    }

    // Get the simple language key.
    $lang = self::getPrimaryLanguage($lang);

    // Explode the normalized string to get the terms.
    $terms = explode(' ', self::normalizeTokenInput($input));

    /*
     * Chinese has to be handled specially because there are not necessarily
     * any spaces between the "words", so groups of Han characters are
     * exploded into single glyphs.
     */
    if ($lang === 'zh')
    {
      $terms = self::splitHanTerms($terms);
    }

    $tokens = array();

    /*
     * If we have to handle the input as a phrase, that means we don't
     * tokenize the individual terms and we do not create the two and three
     * term combinations. The phrase must contain more than one word!
     */
    if ($phrase === true && count($terms) > 1)
    {
      $tokens[] = new FinderIndexerToken($terms, $lang);
    }
    else
    {
      // Create tokens from the terms.
      foreach ($terms as $term)
      {
        $tokens[] = new FinderIndexerToken($term, $lang);
      }

      // Third constructor argument: '' for Chinese, a space otherwise
      // (presumably the glue used to join the terms - confirm against FinderIndexerToken).
      $glue = $lang === 'zh' ? '' : ' ';

      // Freeze the count so the derived tokens appended below are never used as phrase parts.
      $n = count($tokens);

      // Create two and three word phrase tokens from the individual words.
      for ($i = 0; $i < $n; $i++)
      {
        $i2 = $i + 1;
        $i3 = $i + 2;

        // Create the two word phrase.
        if ($i2 < $n)
        {
          $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $glue);
          $token->derived = true;
          $tokens[] = $token;
        }

        // Create the three word phrase.
        if ($i3 < $n)
        {
          $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $glue);
          $token->derived = true;
          $tokens[] = $token;
        }
      }
    }

    // Callers receive a bare token rather than a one element array.
    $result = count($tokens) > 1 ? $tokens : array_shift($tokens);

    if ($store)
    {
      $cache[$store] = $result;
    }

    return $result;
  }

  /**
   * Lower-case the input and reduce it to space separated terms.
   *
   * Regexes, in order:
   *  1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
   *  2. Remove plus, dash, period, and comma characters located before letter characters.
   *  3. Remove plus, dash, period, and comma characters located after other characters.
   *  4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
   *  5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
   *  6. Remove orphaned quote characters.
   *  7. Replace the assorted single quotation marks with the ASCII standard single quotation.
   *  8. Remove multiple space characters and replace them with a single space.
   *
   * @param   string  $input  The text to normalize.
   *
   * @return  string  The normalized text.
   */
  private static function normalizeTokenInput($input)
  {
    $quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');

    // The dash is escaped inside the character classes so it is read as a
    // literal instead of forming a "+" to "." range; the matched set is unchanged.
    $input = JString::strtolower($input);
    $input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+\-.,]+#mui', ' ', $input);

    // Keep the word ($2) and drop only the punctuation in front of it. Substituting
    // $1 here would put back the leading whitespace and throw the word itself away.
    $input = preg_replace('#(^|\s)[+\-.,]+([\pL\pM]+)#mui', ' $2', $input);
    $input = preg_replace('#([\pL\pM\pN]+)[+\-.,]+(\s|$)#mui', '$1 ', $input);
    $input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input);
    $input = preg_replace('#(^|\s)[\'+\-.,]+(\s|$)#mui', ' ', $input);
    $input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
    $input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
    $input = preg_replace('#\s+#mui', ' ', $input);

    return JString::trim($input);
  }

  /**
   * Break every Han character out of the given terms into a term of its own.
   *
   * What is left of a term once its Han characters are removed stays in place;
   * the individual glyphs are appended after all remaining terms.
   *
   * @param   array  $terms  The terms to inspect.
   *
   * @return  array  The re-indexed list of terms.
   */
  private static function splitHanTerms(array $terms)
  {
    $kept   = array();
    $glyphs = array();

    foreach ($terms as $term)
    {
      $matches = array();

      // No Han characters (or the term could not be matched): keep the term untouched.
      if (!preg_match_all('#\p{Han}#u', $term, $matches))
      {
        $kept[] = $term;

        continue;
      }

      // Compare against '' explicitly so a remainder such as "0" is not discarded.
      $rest = preg_replace('#\p{Han}#u', '', $term);

      if ($rest !== null && $rest !== '')
      {
        $kept[] = $rest;
      }

      foreach ($matches[0] as $glyph)
      {
        $glyphs[] = $glyph;
      }
    }

    return array_merge($kept, $glyphs);
  }
  /**
   * Reduce a token to its base word.
   *
   * Apostrophes are trimmed from both ends and anything from the first
   * remaining apostrophe onwards is cut off. The result is handed to the
   * public {@link FinderIndexerHelper::$stemmer} object when one is set;
   * without a stemmer the cleaned token is returned as is.
   *
   * @param   string  $token  The token to stem.
   * @param   string  $lang   The language of the token.
   *
   * @return  string  The root token.
   *
   * @since   2.5
   */
  public static function stem($token, $lang)
  {
    // Drop apostrophes wrapped around the token.
    $token = JString::trim($token, '\'');

    // Cut the token at the first apostrophe still inside it.
    $cut = JString::strpos($token, '\'');

    if ($cut !== false)
    {
      $token = JString::substr($token, 0, $cut);
    }

    // Without a usable stemmer there is nothing more to do.
    if (!(self::$stemmer instanceof FinderIndexerStemmer))
    {
      return $token;
    }

    return self::$stemmer->stem($token, $lang);
  }
  /**
   * Method to add a content type to the database.
   *
   * The known types are loaded once per request and indexed by title. A title
   * that is already known is answered from that list; otherwise a new row is
   * inserted and remembered, so repeated calls with the same title do not
   * issue a second INSERT.
   *
   * @param   string  $title  The type of content. For example: PDF
   * @param   string  $mime   The mime type of the content. For example: PDF [optional]
   *
   * @return  integer  The id of the content type.
   *
   * @since   2.5
   * @throws  Exception on database error.
   */
  public static function addContentType($title, $mime = null)
  {
    static $types;

    $db    = JFactory::getDBO();
    $query = $db->getQuery(true);

    // Check if the types are loaded.
    if (empty($types))
    {
      // Build the query to get the types.
      $query->select('*');
      $query->from($db->quoteName('#__finder_types'));

      // Get the types, keyed by title.
      $db->setQuery($query);
      $types = $db->loadObjectList('title');

      // Check for a database error.
      if ($db->getErrorNum())
      {
        throw new Exception($db->getErrorMsg(), 500);
      }
    }

    // Check if the type already exists.
    if (isset($types[$title]))
    {
      return (int) $types[$title]->id;
    }

    // Add the type.
    $query->clear();
    $query->insert($db->quoteName('#__finder_types'));
    $query->columns(array($db->quoteName('title'), $db->quoteName('mime')));
    $query->values($db->quote($title) . ', ' . $db->quote($mime));
    $db->setQuery($query);
    $db->query();

    // Check for a database error.
    if ($db->getErrorNum())
    {
      throw new Exception($db->getErrorMsg(), 500);
    }

    // Remember the new type so the next call for this title is served from the cache.
    $type        = new stdClass;
    $type->id    = (int) $db->insertid();
    $type->title = $title;
    $type->mime  = $mime;

    $types[$title] = $type;

    // Return the new id.
    return $type->id;
  }
  /**
   * Method to check if a token is common in a language.
   *
   * The common words of each language are fetched once per request through
   * getCommonWords() and kept as a lookup table keyed by term.
   *
   * @param   string  $token  The token to test.
   * @param   string  $lang   The language to reference.
   *
   * @return  boolean  True if common, false otherwise.
   *
   * @since   2.5
   */
  public static function isCommon($token, $lang)
  {
    static $data;

    // Load the common tokens for the language if necessary.
    if (!isset($data[$lang]))
    {
      // Flip the list so the terms become keys. A key lookup compares the term
      // as text, whereas a loose in_array() compares numeric strings by value
      // ('1e1' would match '10').
      $data[$lang] = array_flip((array) self::getCommonWords($lang));
    }

    return isset($data[$lang][(string) $token]);
  }
  /**
   * Fetch the common terms stored for a language.
   *
   * Reads the "term" column of #__finder_terms_common for every row whose
   * "language" column equals the given language.
   *
   * @param   string  $lang  The language to use.
   *
   * @return  array  Array of common terms.
   *
   * @since   2.5
   * @throws  Exception on database error.
   */
  public static function getCommonWords($lang)
  {
    $db = JFactory::getDBO();

    // Select every common term recorded for the language.
    $query = $db->getQuery(true)
      ->select($db->quoteName('term'))
      ->from($db->quoteName('#__finder_terms_common'))
      ->where($db->quoteName('language') . ' = ' . $db->quote($lang));

    $db->setQuery($query);
    $terms = $db->loadColumn();

    // Surface a failed query as an exception.
    if ($db->getErrorNum())
    {
      throw new Exception($db->getErrorMsg(), 500);
    }

    return $terms;
  }
  /**
   * Get the default language for the site.
   *
   * The value is read from the "site" parameter of com_languages (falling
   * back to en-GB) and kept for the rest of the request.
   *
   * @return  string  The default language string.
   *
   * @since   2.5
   */
  public static function getDefaultLanguage()
  {
    static $lang;

    // Already looked up earlier in this request.
    if (!empty($lang))
    {
      return $lang;
    }

    $languageParams = JComponentHelper::getParams('com_languages');
    $lang           = $languageParams->get('site', 'en-GB');

    return $lang;
  }
  /**
   * Method to parse a language/locale key and return a simple language string.
   *
   * Uses Locale::getPrimaryLanguage() when it is callable. Otherwise the part
   * of the key in front of the first "-" is used; a key without a "-" is
   * returned unchanged. Results are kept per key for the rest of the request.
   *
   * @param   string  $lang  The language/locale key. For example: en-GB
   *
   * @return  string  The simple language string. For example: en
   *
   * @since   2.5
   */
  public static function getPrimaryLanguage($lang)
  {
    static $data;

    // Only parse the identifier if necessary.
    if (!isset($data[$lang]))
    {
      if (is_callable(array('Locale', 'getPrimaryLanguage')))
      {
        // Get the language key using the Locale package.
        $data[$lang] = Locale::getPrimaryLanguage($lang);
      }
      else
      {
        // Get the language key using string position. A key with no region
        // part is handled explicitly rather than by passing boolean false
        // to substr() as the length.
        $dash = JString::strpos($lang, '-');

        $data[$lang] = $dash === false ? $lang : JString::substr($lang, 0, $dash);
      }
    }

    return $data[$lang];
  }
  /**
   * Method to get the path (SEF route) for a content item.
   *
   * The route is built with the site router (configured once per request with
   * the "sef" configuration value) and returned relative to the base path of
   * the current application.
   *
   * @param   string  $url  The non-SEF route to the content item.
   *
   * @return  string  The path for the content item.
   *
   * @since   2.5
   */
  public static function getContentPath($url)
  {
    static $router;

    // Only get the router once.
    if (!($router instanceof JRouter))
    {
      jimport('joomla.application.router');
      include_once JPATH_SITE . '/includes/application.php';

      // Get and configure the site router.
      $config = JFactory::getConfig();
      $router = JRouter::getInstance('site');
      $router->setMode($config->get('sef', 1));
    }

    // Build the route.
    $uri   = $router->build($url);
    $route = $uri->toString(array('path', 'query', 'fragment'));

    // Make the route relative by removing the base path from its start only.
    // A plain str_replace() removes the needle wherever it occurs, and if the
    // base path is empty the needle is just "/" and every slash would be removed.
    $base = JURI::base(true) . '/';

    if (strpos($route, $base) === 0)
    {
      // Cast because substr() can return false when nothing is left over.
      $route = (string) substr($route, strlen($base));
    }

    return $route;
  }
  /**
   * Method to get extra data for a content before being indexed. This is how
   * we add Comments, Tags, Labels, etc. that should be available to Finder.
   *
   * The finder plugin group is imported and the onPrepareFinderContent event
   * is triggered with the item. Exceptions thrown by handlers propagate to
   * the caller unchanged.
   *
   * @param   FinderIndexerResult  &$item  The item to index as an FinderIndexerResult object.
   *
   * @return  boolean  True on success.
   *
   * @since   2.5
   * @throws  Exception when a handler throws or explicitly returns false.
   */
  public static function getContentExtras(FinderIndexerResult &$item)
  {
    // Get the event dispatcher.
    $dispatcher = JDispatcher::getInstance();

    // Load the finder plugin group.
    JPluginHelper::importPlugin('finder');

    // Trigger the event.
    $results = $dispatcher->trigger('onPrepareFinderContent', array(&$item));

    // Check the returned results. This is for plugins that don't throw
    // exceptions when they encounter serious errors. The comparison is strict
    // so that only an explicit false counts as failure; a handler that returns
    // nothing (null) is not treated as an error.
    if (in_array(false, $results, true))
    {
      throw new Exception($dispatcher->getError(), 500);
    }

    return true;
  }
  /**
   * Run content text through the onContentPrepare event.
   *
   * The text is placed on a mock content table object, the content plugins
   * are imported (once per request) and the event is fired with the context
   * "com_finder.indexer". Whatever the handlers leave in the object's text
   * property is returned.
   *
   * @param   string     $text    The content to process.
   * @param   JRegistry  $params  The parameters object. [optional]
   *
   * @return  string  The processed content.
   *
   * @since   2.5
   */
  public static function prepareContent($text, $params = null)
  {
    static $loaded;

    $dispatcher = JDispatcher::getInstance();

    // Import the content plugins the first time through only.
    if (!$loaded)
    {
      JPluginHelper::importPlugin('content');
      $loaded = true;
    }

    // Anything that is not already a registry is loaded into a fresh one.
    if (!($params instanceof JRegistry))
    {
      $source = $params;
      $params = new JRegistry;
      $params->loadString($source);
    }

    // Stand-in content object that carries the text through the event.
    $article       = JTable::getInstance('Content');
    $article->text = $text;

    // Fire the onContentPrepare event.
    $dispatcher->trigger('onContentPrepare', array('com_finder.indexer', &$article, &$params, 0));

    return $article->text;
  }
  464. }