PageRenderTime 52ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/search/engine/solr/classes/engine.php

http://github.com/moodle/moodle
PHP | 1484 lines | 779 code | 208 blank | 497 comment | 156 complexity | e0904e964dd9e5d3926948108cd60f1f MD5 | raw file
Possible License(s): MIT, AGPL-3.0, MPL-2.0-no-copyleft-exception, LGPL-3.0, GPL-3.0, Apache-2.0, LGPL-2.1, BSD-3-Clause

Large files are truncated, but you can click here to view the full file

  1. <?php
  2. // This file is part of Moodle - http://moodle.org/
  3. //
  4. // Moodle is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // Moodle is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU General Public License
  15. // along with Moodle. If not, see <http://www.gnu.org/licenses/>.
  16. /**
  17. * Solr engine.
  18. *
  19. * @package search_solr
  20. * @copyright 2015 Daniel Neis Araujo
  21. * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  22. */
  23. namespace search_solr;
  24. defined('MOODLE_INTERNAL') || die();
  25. /**
  26. * Solr engine.
  27. *
  28. * @package search_solr
  29. * @copyright 2015 Daniel Neis Araujo
  30. * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  31. */
  32. class engine extends \core_search\engine {
  33. /**
  34. * @var string The date format used by solr.
  35. */
  36. const DATE_FORMAT = 'Y-m-d\TH:i:s\Z';
  37. /**
  38. * @var int Commit documents interval (number of miliseconds).
  39. */
  40. const AUTOCOMMIT_WITHIN = 15000;
  41. /**
  42. * The maximum number of results to fetch at a time.
  43. */
  44. const QUERY_SIZE = 120;
  45. /**
  46. * Highlighting fragsize. Slightly larger than output size (500) to allow for ... appending.
  47. */
  48. const FRAG_SIZE = 510;
  49. /**
  50. * Marker for the start of a highlight.
  51. */
  52. const HIGHLIGHT_START = '@@HI_S@@';
  53. /**
  54. * Marker for the end of a highlight.
  55. */
  56. const HIGHLIGHT_END = '@@HI_E@@';
  57. /** @var float Boost value for matching course in location-ordered searches */
  58. const COURSE_BOOST = 1;
  59. /** @var float Boost value for matching context (in addition to course boost) */
  60. const CONTEXT_BOOST = 0.5;
  61. /**
  62. * @var \SolrClient
  63. */
  64. protected $client = null;
  65. /**
  66. * @var bool True if we should reuse SolrClients, false if not.
  67. */
  68. protected $cacheclient = true;
  69. /**
  70. * @var \curl Direct curl object.
  71. */
  72. protected $curl = null;
  73. /**
  74. * @var array Fields that can be highlighted.
  75. */
  76. protected $highlightfields = array('title', 'content', 'description1', 'description2');
  77. /**
  78. * @var int Number of total docs reported by Sorl for the last query.
  79. */
  80. protected $totalenginedocs = 0;
  81. /**
  82. * @var int Number of docs we have processed for the last query.
  83. */
  84. protected $processeddocs = 0;
  85. /**
  86. * @var int Number of docs that have been skipped while processing the last query.
  87. */
  88. protected $skippeddocs = 0;
  89. /**
  90. * Solr server major version.
  91. *
  92. * @var int
  93. */
  94. protected $solrmajorversion = null;
  /**
   * Builds the engine and decides whether Solr clients may be kept for reuse.
   *
   * @return void
   */
  public function __construct() {
      parent::__construct();

      // Any curl version string starting with "7.35." is treated as affected by the
      // client-reuse flaw, so client caching is switched off for those.
      $curlinfo = curl_version();
      $reportedversion = isset($curlinfo['version']) ? $curlinfo['version'] : null;
      if ($reportedversion !== null && stripos($reportedversion, '7.35.') === 0) {
          $this->cacheclient = false;
      }
  }
  /**
   * Prepares a Solr query, applies filters and executes it, returning the accepted documents.
   *
   * Results are fetched page by page: a small first page, then pages of QUERY_SIZE, until
   * $limit documents have been accepted or the engine reports nothing further. The counters
   * read by get_query_total_count() are reset at the start of every call.
   *
   * @throws \core_search\engine_exception
   * @param \stdClass $filters Containing query and filters.
   * @param \stdClass $accessinfo Information about areas user can access.
   * @param int $limit The maximum number of results to return; an empty value means
   *                   \core_search\manager::MAX_RESULTS.
   * @return \core_search\document[] Results, or an empty array when there are none.
   */
  public function execute_query($filters, $accessinfo, $limit = 0) {
      if (empty($limit)) {
          $limit = \core_search\manager::MAX_RESULTS;
      }

      // Fetch the client up front purely so that any problem is raised as soon as possible;
      // the returned client itself is not needed here.
      $this->get_search_client();

      // Create the query object. A falsy value means the query cannot have results.
      $query = $this->create_user_query($filters, $accessinfo);
      if (!$query) {
          return [];
      }

      // We expect good match rates, so the first request asks for a small number of records.
      // This significantly speeds up the Solr response time for the first few pages.
      $query->setRows(min($limit * 3, static::QUERY_SIZE));
      $response = $this->get_query_response($query);

      // Get count data out of the response, and reset our counters.
      list($included, $found) = $this->get_response_counts($response);
      $this->totalenginedocs = $found;
      $this->processeddocs = 0;
      $this->skippeddocs = 0;
      if ($included == 0 || $this->totalenginedocs == 0) {
          // No results.
          return array();
      }

      // Get valid documents out of the response.
      $results = $this->process_response($response, $limit);

      // Every doc in the response has been looked at by now.
      $this->processeddocs += $included;

      // If we haven't reached the limit, and there are more docs left in Solr, keep trying.
      while (count($results) < $limit && ($this->totalenginedocs - $this->processeddocs) > 0) {
          // Offset the start of the query, and since we are making another call, get more per call.
          $query->setStart($this->processeddocs);
          $query->setRows(static::QUERY_SIZE);

          $response = $this->get_query_response($query);
          list($included, $found) = $this->get_response_counts($response);
          if ($included == 0 || $found == 0) {
              // No new results were found. Found being empty would be weird, so we just return.
              return $results;
          }
          $this->totalenginedocs = $found;

          // Process the new docs, limited to what is still needed, and append them.
          $newdocs = $this->process_response($response, $limit - count($results));
          $results = array_merge($results, $newdocs);

          // Add to our processed docs count.
          $this->processeddocs += $included;
      }

      return $results;
  }
  /**
   * Runs a query against the search client and returns the raw response.
   *
   * Solr client and server exceptions are not propagated: the message is reported through
   * debugging(), stored in $this->queryerror, and false is returned.
   *
   * @param \SolrQuery $query Solr query object.
   * @return \SolrObject|false Response document or false on error.
   */
  protected function get_query_response($query) {
      try {
          $client = $this->get_search_client();
          $queryresponse = $client->query($query);
          return $queryresponse->getResponse();
      } catch (\SolrClientException $clienterror) {
          $message = $clienterror->getMessage();
      } catch (\SolrServerException $servererror) {
          $message = $servererror->getMessage();
      }

      // Only reached when one of the two exceptions above was caught.
      debugging('Error executing the provided query: ' . $message, DEBUG_DEVELOPER);
      $this->queryerror = $message;
      return false;
  }
  /**
   * Returns the total number of documents available for the most recent call to execute_query.
   *
   * @return int
   */
  public function get_query_total_count() {
      // Start from what the engine reported and leave out the docs we skipped.
      $available = $this->totalenginedocs - $this->skippeddocs;
      return $available;
  }
  /**
   * Extracts count information from a response. Yields 0, 0 for invalid or empty responses.
   *
   * @param \SolrObject|false $response The response from Solr.
   * @return array A two part array. First, how many docs (or groups) are in this response.
   *               Second, how many results are available in the engine.
   */
  protected function get_response_counts($response) {
      // File-grouped queries report their totals per group.
      if (isset($response->grouped->solr_filegroupingid->ngroups)) {
          $grouping = $response->grouped->solr_filegroupingid;
          $total = $grouping->ngroups;
          return array(count($grouping->groups), $total);
      }

      // Anything that is not a standard response either counts as empty.
      if (!isset($response->response->numFound)) {
          return array(0, 0);
      }

      $total = $response->response->numFound;
      $returned = 0;
      if ($total > 0 && is_array($response->response->docs)) {
          $returned = count($response->response->docs);
      }
      return array($returned, $total);
  }
  /**
   * Builds the query object for a user search: search text, returned fields, the
   * search-specific filters, the access restrictions and the optional location boost.
   *
   * @param \stdClass $filters Containing query and filters.
   * @param \stdClass $accessinfo Information about contexts the user can access
   * @return \SolrDisMaxQuery|null Query object or null if they can't get any results
   */
  protected function create_user_query($filters, $accessinfo) {
      global $USER;

      // Work on a copy so that nothing done here leaks back into the caller's object.
      $data = clone $filters;

      $query = new \SolrDisMaxQuery();
      $this->set_query($query, self::replace_underlines($data->q));
      $this->add_fields($query);

      // The search-specific filters below are flagged cache=false so temporary filters do not
      // pollute the cache; the context filters further down are the ones worth caching.
      if (!empty($data->title)) {
          $query->addFilterQuery('{!field cache=false f=title}' . $data->title);
      }

      // Each id list, when present, becomes a "match any of these" filter on its field.
      $idfilters = array(
          'areaids' => '{!cache=false}areaid:(',
          'courseids' => '{!cache=false}courseid:(',
          'groupids' => '{!cache=false}groupid:(',
          'userids' => '{!cache=false}userid:(',
      );
      foreach ($idfilters as $property => $prefix) {
          if (!empty($data->$property)) {
              $query->addFilterQuery($prefix . implode(' OR ', $data->$property) . ')');
          }
      }

      // Modification time range; a missing bound is left open with '*'.
      $hasstart = !empty($data->timestart);
      $hasend = !empty($data->timeend);
      if ($hasstart || $hasend) {
          $from = $hasstart ? \search_solr\document::format_time_for_engine($data->timestart) : '*';
          $to = $hasend ? \search_solr\document::format_time_for_engine($data->timeend) : '*';
          $query->addFilterQuery('{!cache=false}modified:[' . $from . ' TO ' . $to . ']');
      }

      // Restrict to users who are supposed to be able to see a particular result.
      $query->addFilterQuery('owneruserid:(' . \core_search\manager::NO_OWNER_ID . ' OR ' . $USER->id . ')');

      $restricted = !$accessinfo->everything;

      // Restrict to the contexts the user can access (this filter is left cacheable). Nothing
      // is added when the user can access everything or usercontexts is not an array.
      if ($restricted && is_array($accessinfo->usercontexts)) {
          $contextids = array();
          foreach ($accessinfo->usercontexts as $areaid => $areacontexts) {
              // Areas excluded by the areaids filter are not needed.
              if (empty($data->areaids) || in_array($areaid, $data->areaids)) {
                  foreach ($areacontexts as $contextid) {
                      // Keyed by id so each context appears once.
                      $contextids[$contextid] = $contextid;
                  }
              }
          }
          if (empty($contextids)) {
              // No valid contexts for this user, so they get no results.
              return null;
          }
          $query->addFilterQuery('contextid:(' . implode(' OR ', $contextids) . ')');
      }

      // Group restriction: in contexts using separate groups, a document with a groupid is
      // only shown to members of that group.
      if ($restricted && $accessinfo->separategroupscontexts) {
          // Exceptions cover modules with several search areas where one uses separate
          // groups and another uses visible groups within the same context.
          $exceptions = '';
          if ($accessinfo->visiblegroupscontextsareas) {
              foreach ($accessinfo->visiblegroupscontextsareas as $contextid => $areaids) {
                  $exceptions .= ' OR (contextid:' . $contextid . ' AND areaid:(' .
                          implode(' OR ', $areaids) . '))';
              }
          }

          // Always allowed: documents without a groupid.
          $groupfilter = '(*:* -groupid:[* TO *]) OR ';
          if ($accessinfo->usergroups) {
              // Also allowed: documents in one of the user's own groups.
              $groupfilter .= 'groupid:(' . implode(' OR ', $accessinfo->usergroups) . ') OR ';
          }
          // Also allowed: documents outside the separate-groups contexts, plus the exceptions.
          $groupfilter .= '(*:* -contextid:(' . implode(' OR ', $accessinfo->separategroupscontexts) . '))' .
                  $exceptions;
          $query->addFilterQuery($groupfilter);
      }

      if ($this->file_indexing_enabled()) {
          // Group records by solr_filegroupingid, at most 3 results per group.
          $query->setGroup(true);
          $query->setGroupLimit(3);
          $query->setGroupNGroups(true);
          $query->addGroupField('solr_filegroupingid');
      } else {
          // Only return text documents, in case the index has pre-existing files.
          $query->addFilterQuery('type:'.\core_search\manager::TYPE_TEXT);
      }

      // When ordering by location, boost the relevant course and, where applicable, context.
      if (!empty($filters->order) && $filters->order === 'location') {
          $searchcontext = $filters->context;
          $coursecontext = $searchcontext->get_course_context();
          $query->addBoostQuery('courseid', $coursecontext->instanceid, self::COURSE_BOOST);
          if ($searchcontext->contextlevel !== CONTEXT_COURSE) {
              // Not a course context itself, so boost the specific context id as well.
              $query->addBoostQuery('contextid', $searchcontext->id, self::CONTEXT_BOOST);
          }
      }

      return $query;
  }
  /**
   * Configures highlighting on a query, then sets its query string and a default row limit.
   *
   * @param \SolrQuery $query
   * @param string $q The query string to run.
   */
  protected function set_query($query, $q) {
      // Highlighting: switched on for every highlightable field.
      $query->setHighlight(true);
      $highlightable = $this->highlightfields;
      foreach ($highlightable as $fieldname) {
          $query->addHighlightField($fieldname);
      }

      // Fragment size and the markers placed around each highlighted match.
      $query->setHighlightFragsize(static::FRAG_SIZE);
      $query->setHighlightSimplePre(self::HIGHLIGHT_START);
      $query->setHighlightSimplePost(self::HIGHLIGHT_END);
      $query->setHighlightMergeContiguous(true);

      $query->setQuery($q);

      // A reasonable max; callers may set a different row count afterwards.
      $query->setRows(static::QUERY_SIZE);
  }
  /**
   * Sets the fields to be returned in the result and, for DisMax queries, the fields the
   * main query is run against.
   *
   * @param \SolrDisMaxQuery|\SolrQuery $query object.
   */
  public function add_fields($query) {
      $documentclass = $this->get_document_classname();
      $isdismax = ($query instanceof \SolrDisMaxQuery);

      foreach ($documentclass::get_default_fields_definition() as $fieldname => $definition) {
          $query->addField($fieldname);

          if (!$isdismax || empty($definition['mainquery'])) {
              continue;
          }

          // This field takes part in the main query.
          // Due to a regression in the PECL solr extension, https://bugs.php.net/bug.php?id=72740,
          // a boost value is required even though it should be optional. Every field gets the
          // same explicit value so that none is boosted over the others.
          $query->addQueryField($fieldname, 1);
      }
  }
  /**
   * Merges highlighting into the response docs, matching each doc to its highlighting
   * entry by document id.
   *
   * @param object $response containing results.
   */
  public function add_highlight_content($response) {
      if (isset($response->highlighting)) {
          $highlighting = $response->highlighting;
          foreach ($response->response->docs as $doc) {
              // The highlighting object is keyed by document id.
              $docid = $doc->id;
              $this->merge_highlight_field_values($doc, $highlighting->$docid);
          }
      }
  }
  /**
   * Overwrites the highlightable fields of a doc with their highlighted versions.
   *
   * Fields that are empty on the doc, or that have no highlighted value, are left untouched.
   *
   * @throws \core_search\engine_exception When a highlightable field on the doc is an array.
   * @param object $doc containing the results.
   * @param object $highlighteddoc containing the highlighted results values.
   */
  public function merge_highlight_field_values($doc, $highlighteddoc) {
      foreach ($this->highlightfields as $fieldname) {
          if (empty($doc->$fieldname)) {
              continue;
          }

          // Multivalued Solr fields cannot be handled here.
          if (is_array($doc->$fieldname)) {
              throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $fieldname);
          }

          if (empty($highlighteddoc->$fieldname)) {
              continue;
          }

          // Use the first highlighted value in place of the stored one.
          $doc->$fieldname = reset($highlighteddoc->$fieldname);
      }
  }
  /**
   * Filters the response on the Moodle side.
   *
   * Grouped responses are handed to grouped_files_process_response(); note that
   * $skipaccesscheck is not forwarded on that path. For standard responses every doc is
   * checked for ownership and access before being converted into a document.
   *
   * @param \SolrObject|false $response Solr object containing the response returned from the solr server.
   * @param int $limit The maximum number of results to return. 0 for all.
   * @param bool $skipaccesscheck Don't use check_access() on results. Only to be used when results have known access.
   * @return array $results containing final results to be displayed.
   */
  protected function process_response($response, $limit = 0, $skipaccesscheck = false) {
      global $USER;

      if (empty($response)) {
          return array();
      }

      if (isset($response->grouped)) {
          return $this->grouped_files_process_response($response, $limit);
      }

      $userid = $USER->id;
      $noownerid = \core_search\manager::NO_OWNER_ID;

      if (!$docs = $response->response->docs) {
          return array();
      }

      $out = array();
      if (!empty($response->response->numFound)) {
          $this->add_highlight_content($response);

          // Go through the results, checking that each still exists and is available to the user.
          foreach ($docs as $docdata) {
              if ($docdata['owneruserid'] != $noownerid && $docdata['owneruserid'] != $userid) {
                  // If owneruserid is set, no other user should be able to access this record.
                  continue;
              }

              if (!$searcharea = $this->get_search_area($docdata->areaid)) {
                  continue;
              }

              // From here on the doc is handled as a plain array.
              $docdata = $this->standarize_solr_obj($docdata);

              if ($skipaccesscheck) {
                  $access = \core_search\manager::ACCESS_GRANTED;
              } else {
                  $access = $searcharea->check_access($docdata['itemid']);
              }

              switch ($access) {
                  case \core_search\manager::ACCESS_DELETED:
                      $this->delete_by_id($docdata['id']);
                      // Remove one from our processed and total counters, since we promptly deleted.
                      $this->processeddocs--;
                      $this->totalenginedocs--;
                      break;
                  case \core_search\manager::ACCESS_DENIED:
                      $this->skippeddocs++;
                      break;
                  case \core_search\manager::ACCESS_GRANTED:
                      // Add the doc.
                      $out[] = $this->to_document($searcharea, $docdata);
                      break;
              }

              // Stop when we hit our limit.
              if (!empty($limit) && count($out) >= $limit) {
                  break;
              }
          }
      }

      return $out;
  }
  /**
   * Processes grouped file results into documents, with attached matching files.
   *
   * Each group stands for one "master document". Access is checked against the first doc
   * of the group. Groups whose main document is not part of the response are completed
   * afterwards through get_missing_docs(). The returned order follows the response order.
   *
   * @param \SolrObject $response The response returned from solr server
   * @param int $limit The maximum number of results to return. 0 for all.
   * @return array Final results to be displayed.
   */
  protected function grouped_files_process_response($response, $limit = 0) {
      // If we can't find the grouping, or there are no matches in the grouping, return empty.
      if (!isset($response->grouped->solr_filegroupingid) || empty($response->grouped->solr_filegroupingid->matches)) {
          return array();
      }

      $numgranted = 0;
      $orderedids = array();
      $completedocs = array();
      $incompletedocs = array();

      // Highlighting may be absent from the response; it is only consulted through isset() below.
      $highlightingobj = isset($response->highlighting) ? $response->highlighting : null;

      // Each group represents a "master document".
      $groups = $response->grouped->solr_filegroupingid->groups;
      foreach ($groups as $group) {
          $groupid = $group->groupValue;
          $groupdocs = $group->doclist->docs;
          $firstdoc = reset($groupdocs);

          if (!$firstdoc) {
              // A group without docs gives us nothing to check access against.
              continue;
          }

          if (!$searcharea = $this->get_search_area($firstdoc->areaid)) {
              // Well, this is a problem.
              continue;
          }

          // Check for access.
          $access = $searcharea->check_access($firstdoc->itemid);
          switch ($access) {
              case \core_search\manager::ACCESS_DELETED:
                  // If deleted from Moodle, delete from index and move on to the next group.
                  $this->delete_by_id($firstdoc->id);
                  // Remove one from our processed and total counters, since we promptly deleted.
                  $this->processeddocs--;
                  $this->totalenginedocs--;
                  continue 2;
              case \core_search\manager::ACCESS_DENIED:
                  // This means we should just skip for the current user.
                  $this->skippeddocs++;
                  continue 2;
          }
          $numgranted++;

          $maindoc = false;
          $fileids = array();
          // Separate the main document from any files returned with it.
          foreach ($groupdocs as $groupdoc) {
              if ($groupdoc->id == $groupid) {
                  $maindoc = $groupdoc;
              } else if (isset($groupdoc->solr_fileid)) {
                  $fileids[] = $groupdoc->solr_fileid;
              }
          }

          // Store the id of this group, in order, for later merging.
          $orderedids[] = $groupid;

          if (!$maindoc) {
              // We don't have the main doc, store what we know for later building.
              $incompletedocs[$groupid] = $fileids;
          } else {
              if (isset($highlightingobj->$groupid)) {
                  // Merge the highlighting for this doc.
                  $this->merge_highlight_field_values($maindoc, $highlightingobj->$groupid);
              }
              $docdata = $this->standarize_solr_obj($maindoc);
              $doc = $this->to_document($searcharea, $docdata);
              // Now we need to attach the result files to the doc.
              foreach ($fileids as $fileid) {
                  $doc->add_stored_file($fileid);
              }
              $completedocs[$groupid] = $doc;
          }

          if (!empty($limit) && $numgranted >= $limit) {
              // We have hit the max results, we will just ignore the rest.
              break;
          }
      }

      // Fetch the main documents that were not included in the response.
      $incompletedocs = $this->get_missing_docs($incompletedocs);

      $out = array();
      // Now merge the complete and incomplete documents, in results order.
      foreach ($orderedids as $docid) {
          if (isset($completedocs[$docid])) {
              $out[] = $completedocs[$docid];
          } else if (isset($incompletedocs[$docid])) {
              $out[] = $incompletedocs[$docid];
          }
      }

      return $out;
  }
  /**
   * Retrieves missing main documents and attaches the provided files to them.
   *
   * $missingdocs is indexed by document id; each value is an array of stored_files or stored
   * file ids to attach to the retrieved document. The returned array is indexed by document
   * id as well, and only holds the documents that were actually retrieved.
   *
   * @param array $missingdocs An array, indexed by document id, with arrays of files/ids to attach.
   * @return document[]
   */
  protected function get_missing_docs($missingdocs) {
      if (empty($missingdocs)) {
          return array();
      }

      $wantedids = array_keys($missingdocs);

      // One custom query that matches everything, narrowed to exactly the wanted ids.
      $query = new \SolrQuery();
      $this->set_query($query, '*');
      $this->add_fields($query);
      $query->setRows(count($wantedids));
      $query->addFilterQuery('{!cache=false}id:(' . implode(' OR ', $wantedids) . ')');

      // The missing docs have already been checked for access, so the check is skipped here.
      $retrieved = $this->process_response($this->get_query_response($query), 0, true);

      $bydocid = array();
      foreach ($retrieved as $founddoc) {
          $docid = $founddoc->get('id');
          if (isset($missingdocs[$docid])) {
              // Attach the files.
              foreach ($missingdocs[$docid] as $filedoc) {
                  $founddoc->add_stored_file($filedoc);
              }
              $bydocid[$docid] = $founddoc;
          }
          // Anything we did not ask for is left out.
      }

      return $bydocid;
  }
  /**
   * Converts a \SolrObject instance into a standard php array keyed by property name.
   *
   * @param \SolrObject $obj
   * @return array The returned document as an array.
   */
  public function standarize_solr_obj(\SolrObject $obj) {
      $asarray = array();
      foreach ($obj->getPropertyNames() as $rawname) {
          // Names are trimmed before use, see
          // http://php.net/manual/en/solrobject.getpropertynames.php#98018.
          $cleanname = trim($rawname);
          $asarray[$cleanname] = $obj->offsetGet($cleanname);
      }
      return $asarray;
  }
  /**
   * Adds a document to the search engine.
   *
   * This does not commit to the search engine.
   *
   * @param document $document
   * @param bool $fileindexing True if file indexing is to be used
   * @return bool False when the document itself could not be added, true otherwise.
   */
  public function add_document($document, $fileindexing = false) {
      $added = $this->add_solr_document($document->export_for_engine());

      if ($added && $fileindexing) {
          // This will take care of updating all attached files in the index.
          $this->process_document_files($document);
      }

      return (bool) $added;
  }
  /**
   * Removes underlines found at the edges of words.
   *
   * For example '_frogs_' will become 'frogs', '_frogs and toads_' will become 'frogs and toads',
   * and 'frogs_and_toads' will be left as 'frogs_and_toads'.
   *
   * The reason for this is that for italic content_to_text puts _italic_ underlines at the start
   * and end of the italicised phrase (not between words). Solr treats underlines as part of the
   * word, which means that if you search for a word in italic then you can't find it.
   *
   * Word boundaries are evaluated in Unicode mode so that an underline next to a non-ASCII
   * letter (e.g. 'café_au_lait') counts as being inside a word. If the string is not valid
   * UTF-8 the byte-wise pattern is used instead.
   *
   * @param string $str String to replace
   * @return string Replaced string
   */
  protected static function replace_underlines(string $str): string {
      // The 'u' modifier makes \b Unicode-aware; preg_replace gives null on invalid UTF-8.
      $replaced = preg_replace('~\b_|_\b~u', '', $str);
      if ($replaced === null) {
          // Fall back to byte-wise matching rather than failing on malformed input.
          $replaced = preg_replace('~\b_|_\b~', '', $str);
      }
      // As a last resort hand back the input unchanged, keeping the string return type honest.
      return $replaced ?? $str;
  }
  /**
   * Adds a text document to the search engine.
   *
   * Solr client and server errors are reported through debugging() and turned into a false
   * return value.
   *
   * @param array $doc Field name => value pairs.
   * @return bool True if the document was added, false on error.
   */
  protected function add_solr_document($doc) {
      $solrdoc = new \SolrInputDocument();

      // Strip word-edge underlines from the content. For italic text, content_to_text puts
      // _italic_ underlines, and Solr treats underlines as part of the word, which means a
      // word in italic could not be found otherwise.
      if (array_key_exists('content', $doc)) {
          $doc['content'] = self::replace_underlines($doc['content']);
      }

      foreach ($doc as $fieldname => $fieldvalue) {
          $solrdoc->addField($fieldname, $fieldvalue);
      }

      try {
          $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN);
          return true;
      } catch (\SolrClientException $clienterror) {
          debugging('Solr client error adding document with id ' . $doc['id'] . ': ' . $clienterror->getMessage(),
                  DEBUG_DEVELOPER);
      } catch (\SolrServerException $servererror) {
          // Only the first line of the message is used, as a full java stacktrace follows it.
          $firstline = strtok($servererror->getMessage(), "\n");
          debugging('Solr server error adding document with id ' . $doc['id'] . ': ' . $firstline, DEBUG_DEVELOPER);
      }

      return false;
  }
  /**
   * Indexes the files attached to the document, so that the index matches the current files.
   *
   * For documents that aren't known to be new, the index is checked for existing files.
   * - New files are added.
   * - Existing and unchanged files are skipped.
   * - Files that are in the index but no longer on the document are deleted from the index.
   * - Files that have changed are re-indexed.
   *
   * Does nothing when file indexing is not enabled.
   *
   * @param document $document
   */
  protected function process_document_files($document) {
      if (!$this->file_indexing_enabled()) {
          return;
      }

      // Maximum rows to process at a time.
      $batchsize = 500;

      // Files still to be indexed; entries are removed as they turn out to be up to date.
      $pending = $document->get_files();

      // If this isn't a new document, compare against the files already in the index.
      if (!$document->get_is_new()) {
          $stale = array();
          $offset = 0;

          // Work through the indexed files batch by batch, so lots of files are handled cleanly.
          list($total, $batch) = $this->get_indexed_files($document, 0, $batchsize);
          do {
              foreach ($batch as $indexed) {
                  $fileid = $indexed->solr_fileid;

                  if (!isset($pending[$fileid])) {
                      // No longer attached to the document. Deletion happens after the loop,
                      // since deleting while paging could reorder results.
                      $stale[] = $indexed->id;
                      continue;
                  }

                  $current = $pending[$fileid];

                  // Filelib does not guarantee that time modified is updated, so the other
                  // important values are compared too. The last condition covers a file that
                  // filtering blocked last time but that current settings say is indexable.
                  $needsreindex = $indexed->modified != $current->get_timemodified()
                          || strcmp($indexed->title, $current->get_filename()) !== 0
                          || $indexed->solr_filecontenthash != $current->get_contenthash()
                          || ($indexed->solr_fileindexstatus == document::INDEXED_FILE_FALSE
                                  && $this->file_is_indexable($current));

                  if (!$needsreindex) {
                      // Already indexed and unchanged, so there is nothing to do for this file.
                      unset($pending[$fileid]);
                  }
              }

              $offset += $batchsize;
              if ($offset < $total) {
                  // Not at the total count yet, so fetch the next batch.
                  list($total, $batch) = $this->get_indexed_files($document, $offset, $batchsize);
              }
          } while ($offset < $total);

          // Delete files that are no longer attached.
          foreach ($stale as $staleid) {
              // The client is used directly, as the engine delete_by_id won't work on file docs.
              $this->get_search_client()->deleteById($staleid);
          }
      }

      // Now index everything that is left.
      foreach ($pending as $file) {
          $this->add_stored_file($document, $file);
      }
  }
  /**
   * Fetches one page of the file records currently indexed for a document.
   *
   * The query is restricted to records of type file whose solr_filegroupingid equals the document id,
   * is sorted by id, and only requests the fields needed to compare against the attached files.
   *
   * @param document $document
   * @param int $start Zero-based row to start the page on.
   * @param int $rows Maximum number of rows to fetch.
   * @return array Two elements: the total number of matching records, then the array of records in this page.
   */
  protected function get_indexed_files($document, $start = 0, $rows = 500) {
      // A deliberately stripped down query: match everything, then narrow it with filter queries.
      $filequery = new \SolrQuery();
      $filequery->setQuery('*');
      $filequery->setRows($rows);
      $filequery->setStart($start);

      // Sort by id so the ordering is consistent between pages.
      $filequery->addSortField('id');

      // Request only the bare minimum of fields.
      $wantedfields = array('id', 'modified', 'title', 'solr_fileid', 'solr_filecontenthash', 'solr_fileindexstatus');
      foreach ($wantedfields as $fieldname) {
          $filequery->addField($fieldname);
      }

      // Limit the results to file records grouped under this document.
      $filequery->addFilterQuery('{!cache=false}solr_filegroupingid:(' . $document->get('id') . ')');
      $filequery->addFilterQuery('type:' . \core_search\manager::TYPE_FILE);

      $response = $this->get_query_response($filequery);
      if (empty($response->response->numFound)) {
          return array(0, array());
      }

      $total = $response->response->numFound;
      return array($total, $this->convert_file_results($response));
  }
  /**
   * Lightweight conversion of a Solr response into plain objects describing already indexed files.
   *
   * @param \SolrObject $responsedoc A Solr response document
   * @return \stdClass[] One object per returned doc with id, modified, title, solr_fileid,
   *                     solr_filecontenthash and solr_fileindexstatus. Empty when there are no docs.
   */
  protected function convert_file_results($responsedoc) {
      $docs = $responsedoc->response->docs;
      if (!$docs) {
          return array();
      }

      $converted = array();
      foreach ($docs as $doc) {
          // Copy only the minimum information needed for file processing.
          $converted[] = (object) array(
              'id' => $doc->id,
              // The stored time is passed through the document class' import helper.
              'modified' => document::import_time_from_engine($doc->modified),
              'title' => $doc->title,
              'solr_fileid' => $doc->solr_fileid,
              'solr_filecontenthash' => $doc->solr_filecontenthash,
              'solr_fileindexstatus' => $doc->solr_fileindexstatus,
          );
      }

      return $converted;
  }
  /**
   * Sends a stored file to the search engine.
   *
   * Note about Solr and Tika indexing: the mime type is not sent, only the filename. Tika has much
   * better content type detection than Moodle, and sending mime types would cause many more failures.
   *
   * Files that are not considered indexable, and files whose extraction request fails, are still
   * recorded in the index using only their base data and a matching solr_fileindexstatus.
   *
   * @param document $document
   * @param \stored_file $storedfile
   * @return void
   */
  protected function add_stored_file($document, $storedfile) {
      $filedoc = $document->export_file_for_engine($storedfile);

      if (!$this->file_is_indexable($storedfile)) {
          // Keep a reference to the file in the index even though its content will not be indexed.
          $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_FALSE;
          $this->add_solr_document($filedoc);
          return;
      }

      $curl = $this->get_curl_object();
      $url = $this->get_connection_url('/update/extract');

      $fixedparams = array(
          // Return results as XML.
          'wt' => 'xml',
          // Prevents Solr from automatically making fields for every Tika output.
          'uprefix' => 'ignored_',
          // Controls how content is captured, keeping the file content clean of non-important metadata.
          'captureAttr' => 'true',
          // Move the content to a field for indexing.
          'fmap.content' => 'solr_filecontent',
          // Common fields that match the standard *_point dynamic field and would cause an error.
          'fmap.media_white_point' => 'ignored_mwp',
          'fmap.media_black_point' => 'ignored_mbp',
      );
      foreach ($fixedparams as $paramname => $paramvalue) {
          $url->param($paramname, $paramvalue);
      }

      // Each document field is passed as a literal under a temporary name and then mapped back to its
      // real name, which prevents errors or Tika overwriting common field names.
      foreach ($filedoc as $field => $fieldvalue) {
          // Discard any Tika field that matches one of ours, so it does not overwrite it.
          $url->param('fmap.'.$field, 'ignored_'.$field);
          // Place the data in a temporary field.
          $url->param('literal.mdltmp_'.$field, $fieldvalue);
          // Then move it to the final field.
          $url->param('fmap.mdltmp_'.$field, $field);
      }

      // This sets the true filename for Tika.
      $url->param('resource.name', $storedfile->get_filename());

      // The remainder is mostly error checking around the curl request. Errors are informational only,
      // since per file/doc results are not tracked.
      try {
          $response = $curl->post($url->out(false), array('myfile' => $storedfile));
          $errno = $curl->get_errno();
          $info = $curl->get_info();

          if ($errno != 0) {
              // An internal cURL error occurred; the error text is in the response.
              debugging('Curl error '.$errno.' while indexing file with document id '.$filedoc['id'].': '.$response.'.',
                      DEBUG_DEVELOPER);
          } else if (isset($info['http_code']) && ($info['http_code'] !== 200)) {
              // Unexpected HTTP response code.
              $notice = 'Error while indexing file with document id '.$filedoc['id'];
              // Try to get the error message out of msg or title if it exists.
              if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $response, $found)) {
                  $notice .= ': '.$found[1];
              } else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $response, $found)) {
                  $notice .= ': '.$found[1];
              }
              // This happens whenever a file fails to index for any reason, so it is reported quietly.
              if (CLI_SCRIPT && !PHPUNIT_TEST) {
                  mtrace($notice);
              }
          } else if (!preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $response, $found)) {
              // The expected status field is missing, so the response cannot be processed.
              $notice = 'Unexpected Solr response while indexing file with document id '.$filedoc['id'].': ';
              debugging($notice.strtok($response, "\n"), DEBUG_DEVELOPER);
          } else if ((int)$found[1] !== 0) {
              // A status other than 0 was reported.
              $notice = 'Unexpected Solr status code '.(int)$found[1];
              debugging($notice.' while indexing file with document id '.$filedoc['id'].'.', DEBUG_DEVELOPER);
          } else {
              // The document was successfully indexed.
              return;
          }
      } catch (\Exception $e) {
          // There was an error, but per-file success is not tracked, so processing just continues.
          debugging('Unknown exception while indexing file "'.$storedfile->get_filename().'".', DEBUG_DEVELOPER);
      }

      // Reaching this point means the file was not indexed due to an error, so only the base info is stored.
      $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR;
      $this->add_solr_document($filedoc);
  }
  /**
   * Decides whether the content of a file should be sent for indexing.
   *
   * A file is rejected when a maximum size is configured (maxindexfilekb) and the file exceeds it,
   * or when it is a Moodle backup file.
   *
   * @param \stored_file $file The file to check
   * @return bool True if the file can be indexed
   */
  protected function file_is_indexable($file) {
      if (!empty($this->config->maxindexfilekb)) {
          // The setting is expressed in kilobytes.
          $maxbytes = $this->config->maxindexfilekb * 1024;
          if ($file->get_filesize() > $maxbytes) {
              // The file is too big to index.
              return false;
          }
      }

      // Moodle backup files are not indexed; there is nothing usefully indexable in them.
      return $file->get_mimetype() != 'application/vnd.moodle.backup';
  }
  /**
   * Asks the Solr client to commit all pending changes.
   *
   * @return void
   */
  protected function commit() {
      $client = $this->get_search_client();
      $client->commit();
  }
  /**
   * Called once a search area has finished indexing; commits the pending changes.
   *
   * Returning false would prevent the search area completed time and stats from being updated.
   * This implementation always reports success.
   *
   * @param \core_search\base $searcharea The search area that was complete
   * @param int $numdocs The number of documents that were added to the index
   * @param bool $fullindex True if a full index is being performed
   * @return bool True means that data is considered indexed
   */
  public function area_index_complete($searcharea, $numdocs = 0, $fullindex = false) {
      // None of the arguments are needed here: a commit is all that is performed.
      $this->commit();

      return true;
  }
  /**
   * Return true if file indexing is supported and enabled. False otherwise.
   *
   * @return bool
   */
  public function file_indexing_enabled() {
      // Using !empty() yields the same boolean as a (bool) cast for every stored value, but does not
      // raise an undefined property warning when the fileindexing setting has never been saved.
      return !empty($this->config->fileindexing);
  }
  /**
   * Defragments the index by sending an optimize request through the Solr client.
   *
   * @return void
   */
  public function optimize() {
      // Argument names follow the PHP Solr extension's SolrClient::optimize() signature.
      $maxsegments = 1;
      $softcommit = true;
      $waitsearcher = false;

      $this->get_search_client()->optimize($maxsegments, $softcommit, $waitsearcher);
  }
  /**
   * Deletes the specified document together with its related file records, then commits.
   *
   * @param string $id The document id to delete
   * @return void
   */
  public function delete_by_id($id) {
      // The item and all of its related files are matched through solr_filegroupingid.
      $groupquery = 'solr_filegroupingid:' . $id;

      $this->get_search_client()->deleteByQuery($groupquery);
      $this->commit();
  }
  /**
   * Deletes the documents of one search area, or every document when no area is given, then commits.
   *
   * @param string $areaid Search area id; when empty the whole index is cleared.
   * @return void
   */
  public function delete($areaid = null) {
      // Without an area id the match-all query is used.
      $deletequery = $areaid ? 'areaid:' . $areaid : '*:*';

      $this->get_search_client()->deleteByQuery($deletequery);
      $this->commit();
  }
  /**
   * Checks that the Solr server described by the search_solr config is ready to be used.
   *
   * The checks run in order: server configuration, schema up to date, schema set up. The first
   * failure is returned as an error string.
   *
   * @return true|string Returns true if all good or an error string.
   */
  public function is_server_ready() {
      $status = $this->is_server_configured();
      if ($status !== true) {
          return $status;
      }

      // The configuration check has already contacted the server. For pages where performance is
      // important, the full schema check is skipped.
      if ($this->should_skip_schema_check()) {
          return true;
      }

      // Update schema if required/possible.
      $status = $this->check_latest_schema();
      if ($status !== true) {
          return $status;
      }

      // Check that the schema is already set up.
      try {
          (new \search_solr\schema())->validate_setup();
      } catch (\moodle_exception $e) {
          return $e->getMessage();
      }

      return true;
  }
  /**
   * Checks whether the Solr server is properly configured.
   *
   * Requires a hostname and index name in the config, an obtainable search client, and a server
   * reporting at least Solr 4. Client and server exceptions raised by the version lookup are logged
   * at developer debug level and reported as the generic server status error.
   *
   * @return true|string Returns true if all good or an error string.
   */
  public function is_server_configured() {
      $hashostname = !empty($this->config->server_hostname);
      $hasindexname = !empty($this->config->indexname);
      if (!($hashostname && $hasindexname)) {
          return 'No solr configuration found';
      }

      // Passing false requests that no exception is triggered when the client is unavailable.
      if (!$this->get_search_client(false)) {
          return get_string('engineserverstatus', 'search');
      }

      try {
          $tooold = $this->get_solr_major_version() < 4;
          if ($tooold) {
              // Minimum solr 4.0.
              return get_string('minimumsolr4', 'search_solr');
          }
      } catch (\SolrClientException $ex) {
          debugging('Solr client error: ' . html_to_text($ex->getMessage()), DEBUG_DEVELOPER);
          return get_string('engineserverstatus', 'search');
      } catch (\SolrServerException $ex) {
          debugging('Solr server error: ' . html_to_text($ex->getMessage()), DEBUG_DEVELOPER);
          return get_string('engineserverstatus', 'search');
      }

      return true;
  }
  /**
   * Returns the solr server major version.
   *
   * The value is read from the 'solr-spec-version' entry of the server's system information and is
   * cached on the instance, so the server is only queried on the first call.
   *
   * @return int
   */
  public function get_solr_major_version() {
      if ($this->solrmajorversion !== null) {
          return $this->solrmajorversion;
      }

      // We should really ping first the server to see if the specified indexname is valid but
      // we want to minimise solr server requests as they are expensive. system() emits a warning
      // if it can not connect to the configured index in the configured server.
      $systemdata = @$this->get_search_client()->system();
      $solrversion = $systemdata->getResponse()->offsetGet('lucene')->offsetGet('solr-spec-version');

      // intval() reads the leading integer of the version string ("8.11.2" gives 8). Unlike cutting
      // the string at the first '.', it also works when the version contains no dot at all ("9"),
      // which previously produced 0.
      $this->solrmajorversion = intval($solrversion);

      return $this->solrmajorversion;
  }
  /**
   * Checks if the PHP Solr extension is available.
   *
   * @return bool True when the solr_get_version() function exists.
   */
  public function is_installed() {
      $extensionloaded = function_exists('solr_get_version');

      return $extensionloaded;
  }
  1079. /**
  1080. * Returns the solr client instance.
  1081. *
  1082. * We don't reuse SolrClient if we are on libcurl 7.35.0, due to a bug in that version of curl.
  1083. *
  1084. * @throws \core_search\engine_exception
  1085. * @param bool $triggerexception
  1086. * @return \SolrClient
  1087. */
  1088. protected function get_search_client($triggerexception = true) {
  1089. global $CFG;
  1090. // Type comparison as it is set to false if not available.
  1091. if ($this->client !== null) {
  1092. return $this->client;
  1093. }
  1094. $options = array(
  1095. 'hostname' => $this->config->server_hostname,
  1096. 'path' => '/solr/' . $this->config->indexname,
  1097. 'login' => !empty($this->config->server_username) ? $this->config->server_username : '',
  1098. 'password' => !empty($this->config->server_password) ? $this->config->server_password : '',
  1099. 'port' => !empty($this->config->server_port) ? $this->config->server_port : '',
  1100. 'secure' => !empt

Large files files are truncated, but you can click here to view the full file