PageRenderTime 28ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/web/core/modules/search/src/SearchIndex.php

https://gitlab.com/mohamed_hussein/prodt
PHP | 330 lines | 198 code | 25 blank | 107 comment | 36 complexity | 8e0a5a941ee83d42be5dcb03ed9e11d5 MD5 | raw file
  1. <?php
  2. namespace Drupal\search;
  3. use Drupal\Core\Cache\CacheTagsInvalidatorInterface;
  4. use Drupal\Core\Config\ConfigFactoryInterface;
  5. use Drupal\Core\Database\Connection;
  6. use Drupal\search\Exception\SearchIndexException;
  7. /**
  8. * Provides search index management functions.
  9. */
  10. class SearchIndex implements SearchIndexInterface {
  11. /**
  12. * The config factory.
  13. *
  14. * @var \Drupal\Core\Config\ConfigFactoryInterface
  15. */
  16. protected $configFactory;
  17. /**
  18. * The database connection.
  19. *
  20. * @var \Drupal\Core\Database\Connection
  21. */
  22. protected $connection;
  23. /**
  24. * The database replica connection.
  25. *
  26. * @var \Drupal\Core\Database\Connection
  27. */
  28. protected $replica;
  29. /**
  30. * The cache tags invalidator.
  31. *
  32. * @var \Drupal\Core\Cache\CacheTagsInvalidatorInterface
  33. */
  34. protected $cacheTagsInvalidator;
  35. /**
  36. * The text processor.
  37. *
  38. * @var \Drupal\search\SearchTextProcessorInterface
  39. */
  40. protected $textProcessor;
  /**
   * Constructs a SearchIndex object.
   *
   * @param \Drupal\Core\Config\ConfigFactoryInterface $config_factory
   *   The config factory.
   * @param \Drupal\Core\Database\Connection $connection
   *   The database connection.
   * @param \Drupal\Core\Database\Connection $replica
   *   The database replica connection.
   * @param \Drupal\Core\Cache\CacheTagsInvalidatorInterface $cache_tags_invalidator
   *   The cache tags invalidator.
   * @param \Drupal\search\SearchTextProcessorInterface|null $text_processor
   *   (optional) The text processor. Omitting it is deprecated; when NULL, the
   *   'search.text_processor' service is fetched from the container instead.
   */
  public function __construct(ConfigFactoryInterface $config_factory, Connection $connection, Connection $replica, CacheTagsInvalidatorInterface $cache_tags_invalidator, ?SearchTextProcessorInterface $text_processor = NULL) {
    $this->configFactory = $config_factory;
    $this->connection = $connection;
    $this->replica = $replica;
    $this->cacheTagsInvalidator = $cache_tags_invalidator;

    // The nullable type is declared explicitly: relying on "= NULL" to make a
    // typed parameter nullable is deprecated by newer PHP versions.
    if ($text_processor === NULL) {
      // Deprecation notices are raised silenced so that they only surface
      // through a registered error handler.
      @trigger_error('Calling ' . __METHOD__ . ' without $text_processor argument is deprecated in drupal:9.1.0 and will be required in drupal:10.0.0. See https://www.drupal.org/node/3078162', E_USER_DEPRECATED);
      $text_processor = \Drupal::service('search.text_processor');
    }
    $this->textProcessor = $text_processor;
  }
  /**
   * {@inheritdoc}
   */
  public function index($type, $sid, $langcode, $text, $update_weights = TRUE) {
    $settings = $this->configFactory->get('search.settings');
    $minimum_word_size = $settings->get('index.minimum_word_size');

    // Keep track of the words that need to have their weights updated. This is
    // also the return value: a map of indexed word => TRUE.
    $current_words = [];

    // Multipliers for scores of words inside certain HTML tags. The weights are
    // stored in config so that modules can overwrite the default weights.
    // Note: 'a' must be included for link ranking to work.
    $tags = $settings->get('index.tag_weights');

    // Strip off all ignored tags to speed up processing, but insert space
    // before and after them to keep word boundaries.
    $text = str_replace(['<', '>'], [' <', '> '], $text);
    $text = strip_tags($text, '<' . implode('><', array_keys($tags)) . '>');

    // Split HTML tags from plain text. With PREG_SPLIT_DELIM_CAPTURE the
    // result alternates between literal text and captured tag contents, and
    // begins and ends with a (possibly empty) literal.
    $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

    // Odd/even counter. Tag or no tag.
    $tag = FALSE;
    // Starting score per word.
    $score = 1;
    // Accumulator for cleaned up data.
    $accum = ' ';
    // Stack with open tags; the most recently opened tag is at index 0.
    $tagstack = [];
    // Counter for consecutive words since the last tag change.
    $tagwords = 0;
    // Focus state.
    $focus = 1;
    // Accumulator for words for index, keyed by word with the summed score.
    $scored_words = [];

    foreach ($split as $value) {
      if ($tag) {
        // Increase or decrease score per word based on tag. The tag name ends
        // at the first whitespace character of any kind (attributes may be
        // separated by a tab or newline, not only a space) and may itself be
        // preceded by whitespace, so trim and split on \s+.
        $tagname = mb_strtolower(preg_split('/\s+/', trim($value), 2)[0]);
        // Closing or opening tag? strncmp() is safe on an empty tag name.
        if (strncmp($tagname, '/', 1) === 0) {
          $tagname = substr($tagname, 1);
          // If we encounter unexpected tags, reset score to avoid incorrect
          // boosting.
          if (!count($tagstack) || $tagstack[0] != $tagname) {
            $tagstack = [];
            $score = 1;
          }
          else {
            // Remove from tag stack and decrement score, never going below
            // the base score of 1. Tags without a configured weight count
            // as 0.
            $score = max(1, $score - ($tags[array_shift($tagstack)] ?? 0));
          }
        }
        else {
          if (isset($tagstack[0]) && $tagstack[0] == $tagname) {
            // None of the tags we look for make sense when nested identically.
            // If they are, it's probably broken HTML.
            $tagstack = [];
            $score = 1;
          }
          else {
            // Add to open tag stack and increment score. A tag name without a
            // configured weight does not change the score.
            array_unshift($tagstack, $tagname);
            $score += $tags[$tagname] ?? 0;
          }
        }
        // A tag change occurred, reset counter.
        $tagwords = 0;
      }
      else {
        // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty
        // values.
        if ($value != '') {
          $words = $this->textProcessor->process($value, $langcode);
          foreach ($words as $word) {
            // Add word to accumulator.
            $accum .= $word . ' ';
            // Only numeric words and words of at least the minimum length are
            // scored.
            if (is_numeric($word) || mb_strlen($word) >= $minimum_word_size) {
              if (!isset($scored_words[$word])) {
                $scored_words[$word] = 0;
              }
              $scored_words[$word] += $score * $focus;
              // Focus is a decaying value in terms of the amount of unique
              // words up to this point. It stays at 1 up to roughly 100 unique
              // words and then decays, to about 0.38 at 500 words and about
              // 0.22 at 1000 words.
              $focus = min(1, .01 + 3.5 / (2 + count($scored_words) * .015));
            }
            $tagwords++;
            // Too many words inside a single tag probably mean a tag was
            // accidentally left open.
            if (count($tagstack) && $tagwords >= 15) {
              $tagstack = [];
              $score = 1;
            }
          }
        }
      }
      $tag = !$tag;
    }

    // Remove the item $sid from the search index, and invalidate the relevant
    // cache tags.
    $this->clear($type, $sid, $langcode);

    try {
      // Insert cleaned up data into dataset.
      $this->connection->insert('search_dataset')
        ->fields([
          'sid' => $sid,
          'langcode' => $langcode,
          'type' => $type,
          'data' => $accum,
          'reindex' => 0,
        ])
        ->execute();

      // Insert results into search index.
      foreach ($scored_words as $word => $word_score) {
        // If a word already exists in the database, its score gets increased
        // appropriately. If not, we create a new record with the appropriate
        // starting score.
        $this->connection->merge('search_index')
          ->keys([
            'word' => $word,
            'sid' => $sid,
            'langcode' => $langcode,
            'type' => $type,
          ])
          ->fields(['score' => $word_score])
          ->expression('score', '[score] + :score', [':score' => $word_score])
          ->execute();
        $current_words[$word] = TRUE;
      }
    }
    catch (\Exception $e) {
      throw new SearchIndexException("Failed to insert dataset in index for type '$type', sid '$sid' and langcode '$langcode'", 0, $e);
    }
    finally {
      // Runs on failure too, so that words merged before an error still get
      // their totals refreshed.
      if ($update_weights) {
        $this->updateWordWeights($current_words);
      }
    }

    return $current_words;
  }
  /**
   * {@inheritdoc}
   */
  public function clear($type = NULL, $sid = NULL, $langcode = NULL) {
    try {
      // Both tables are purged with the same filters: the word index first,
      // then the dataset.
      $deletions = [
        $this->connection->delete('search_index'),
        $this->connection->delete('search_dataset'),
      ];

      // The filters are hierarchical: $sid is only applied when $type is
      // given, and $langcode only when $sid is given, so stop at the first
      // empty one.
      $filters = ['type' => $type, 'sid' => $sid, 'langcode' => $langcode];
      foreach ($filters as $column => $value) {
        if (!$value) {
          break;
        }
        foreach ($deletions as $deletion) {
          $deletion->condition($column, $value);
        }
      }

      foreach ($deletions as $deletion) {
        $deletion->execute();
      }
    }
    catch (\Exception $e) {
      throw new SearchIndexException("Failed to clear index for type '$type', sid '$sid' and langcode '$langcode'", 0, $e);
    }

    // Invalidate all render cache items that contain data from this index, or
    // from any index when no type was given.
    $cache_tag = $type ? 'search_index:' . $type : 'search_index';
    $this->cacheTagsInvalidator->invalidateTags([$cache_tag]);
  }
  /**
   * {@inheritdoc}
   */
  public function markForReindex($type = NULL, $sid = NULL, $langcode = NULL) {
    try {
      // Stamp matching rows with the request time. Rows that already carry a
      // non-zero reindex value are skipped, so that previously marked items
      // maintain their priority by request time.
      // NOTE(review): REQUEST_TIME is a global constant defined outside this
      // file; newer Drupal releases reportedly deprecate it in favor of the
      // time service - confirm before upgrading.
      $update = $this->connection->update('search_dataset')
        ->fields(['reindex' => REQUEST_TIME])
        ->condition('reindex', 0);

      // The filters are hierarchical: $sid is only applied when $type is
      // given, and $langcode only when $sid is given, so stop at the first
      // empty one.
      $filters = ['type' => $type, 'sid' => $sid, 'langcode' => $langcode];
      foreach ($filters as $column => $value) {
        if (!$value) {
          break;
        }
        $update->condition($column, $value);
      }

      $update->execute();
    }
    catch (\Exception $e) {
      throw new SearchIndexException("Failed to mark index for re-indexing for type '$type', sid '$sid' and langcode '$langcode'", 0, $e);
    }
  }
  /**
   * {@inheritdoc}
   */
  public function updateWordWeights(array $words) {
    try {
      // Refresh the IDF (Inverse Document Frequency) count of every new or
      // changed word. The words are the keys of the given array.
      foreach (array_keys($words) as $term) {
        // Total score of this word across the whole index, read from the
        // replica connection.
        $score_sum = $this->replica
          ->query("SELECT SUM([score]) FROM {search_index} WHERE [word] = :word", [':word' => $term])
          ->fetchField();

        // Apply Zipf's law to equalize the probability distribution. Totals
        // below 1 (including a missing total) are treated as 1.
        $weight = log10(1 + 1 / (max(1, $score_sum)));

        $this->connection->merge('search_total')
          ->key('word', $term)
          ->fields(['count' => $weight])
          ->execute();
      }

      // Find words that were deleted from search_index, but are still in
      // search_total. A LEFT JOIN between the two tables keeps only the rows
      // which fail to join.
      $orphans = $this->replica->query("SELECT [t].[word] AS [realword], [i].[word] FROM {search_total} [t] LEFT JOIN {search_index} [i] ON [t].[word] = [i].[word] WHERE [i].[word] IS NULL");

      // Collect the orphaned words into a single OR condition group.
      $stale = $this->replica->condition('OR');
      foreach ($orphans as $row) {
        $stale->condition('word', $row->realword);
      }

      // Only issue the delete when at least one orphaned word was found.
      if (count($stale) > 0) {
        $this->connection->delete('search_total')
          ->condition($stale)
          ->execute();
      }
    }
    catch (\Exception $e) {
      throw new SearchIndexException("Failed to update totals for index words.", 0, $e);
    }
  }
  305. }