PageRenderTime 25ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/includes/deferred/SearchUpdate.php

https://gitlab.com/coco-codeselfstudy/mediawiki-core
PHP | 204 lines | 108 code | 24 blank | 72 comment | 11 complexity | 3221c8acd3b7e60249d36057f7940cd1 MD5 | raw file
  1. <?php
  2. /**
  3. * Search index updater
  4. *
  5. * See deferred.txt
  6. *
  7. * This program is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * This program is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License along
  18. * with this program; if not, write to the Free Software Foundation, Inc.,
  19. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20. * http://www.gnu.org/copyleft/gpl.html
  21. *
  22. * @file
  23. * @ingroup Search
  24. */
  25. /**
  26. * Database independent search index updater
  27. *
  28. * @ingroup Search
  29. */
  class SearchUpdate implements DeferrableUpdate {
      /** @var int Page id being updated; remains 0 when the constructor rejected the title */
      private $id = 0;

      /** @var Title Title we're updating; only set when the constructor accepted the title */
      private $title;

      /** @var Content|bool Content of the page (not text); false means "update the title only" */
      private $content;

      /**
       * Constructor
       *
       * When the title cannot be resolved, a debug line is logged and the object
       * is left with id 0, which makes doUpdate() a no-op.
       *
       * @param int $id Page id to update
       * @param Title|string $title Title of page to update
       * @param Content|string|bool $c Content of the page to update. Default: false.
       *  If a Content object, text will be gotten from it. String is for back-compat.
       *  Passing false tells the backend to just update the title, not the content
       */
      public function __construct( $id, $title, $c = false ) {
          $nt = is_string( $title ) ? Title::newFromText( $title ) : $title;

          if ( !$nt ) {
              wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
              return;
          }

          $this->id = $id;
          // is_string() check is back-compat for ApprovedRevs
          if ( is_string( $c ) ) {
              $this->content = new TextContent( $c );
          } else {
              $this->content = $c ?: false;
          }
          $this->title = $nt;
      }

      /**
       * Perform actual update for the entry
       *
       * Does nothing when search updates are disabled through
       * $wgDisableSearchUpdate or when no page id was recorded. Otherwise every
       * configured search type that supports 'search-update' is handled:
       *  - no page found for the id: the index entry is deleted;
       *  - content is false: only the title is updated;
       *  - otherwise: title and normalised text are updated.
       */
      public function doUpdate() {
          global $wgDisableSearchUpdate;

          if ( $wgDisableSearchUpdate || !$this->id ) {
              return;
          }

          $page = WikiPage::newFromID( $this->id, WikiPage::READ_LATEST );

          foreach ( SearchEngine::getSearchTypes() as $type ) {
              $search = SearchEngine::create( $type );

              // Check support first so that the title clean-up below is not
              // done for engines that will never receive the update.
              if ( !$search->supports( 'search-update' ) ) {
                  continue;
              }

              $normalTitle = $search->normalizeText( $this->indexTitle( $search ) );

              if ( $page === null ) {
                  $search->delete( $this->id, $normalTitle );
                  continue;
              }

              if ( $this->content === false ) {
                  $search->updateTitle( $this->id, $normalTitle );
                  continue;
              }

              $text = $search->getTextFromContent( $this->title, $this->content );
              if ( !$search->textAlreadyUpdatedForIndex() ) {
                  // Pass the engine along so the text is cleaned with the same
                  // legal character set that indexTitle() uses for the title.
                  $text = self::updateText( $text, $search );
              }

              # Perform the actual update
              $search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
          }
      }

      /**
       * Clean text for indexing. Only really suitable for indexing in databases.
       * If you're using a real search engine, you'll probably want to override
       * this behavior and do something nicer with the original wikitext.
       *
       * @param string $text
       * @param SearchEngine|null $se Engine whose legalSearchChars() decides which
       *  characters survive the clean-up. Optional for backward compatibility:
       *  when omitted, SearchEngine::legalSearchChars() is used.
       * @return string
       */
      public static function updateText( $text, $se = null ) {
          global $wgContLang;

          # Language-specific strip/conversion
          $text = $wgContLang->normalizeForSearch( $text );

          $legalChars = $se !== null
              ? $se->legalSearchChars()
              : SearchEngine::legalSearchChars();
          $lc = $legalChars . '&#;';

          $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
              ' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup
          $text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
              "\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

          # Strip external URLs
          $uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
          $protos = "http|https|ftp|mailto|news|gopher";
          $pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
          $text = preg_replace( $pat, "\\1 \\3", $text );

          # Bracketed external links, without and with a label
          $p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
          $p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
          $text = preg_replace( $p1, "\\1 ", $text );
          $text = preg_replace( $p2, "\\1 \\3 ", $text );

          # Internal image links
          $pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
          $text = preg_replace( $pat2, " \\1 \\3", $text );

          $text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
              "\\1\\2 \\2\\3", $text ); # Handle [[game]]s

          # Strip all remaining non-search characters
          $text = preg_replace( "/[^{$lc}]+/", " ", $text );

          # Handle 's, s'
          #
          # $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
          # $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
          #
          # These tail-anchored regexps are insanely slow. The worst case comes
          # when Japanese or Chinese text (ie, no word spacing) is written on
          # a wiki configured for Western UTF-8 mode. The Unicode characters are
          # expanded to hex codes and the "words" are very long paragraph-length
          # monstrosities. On a large page the above regexps may take over 20
          # seconds *each* on a 1GHz-level processor.
          #
          # Following are reversed versions which are consistently fast
          # (about 3 milliseconds on 1GHz-level processor).
          #
          $text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
          $text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

          # Strip wiki '' and '''
          $text = preg_replace( "/''[']*/", " ", $text );

          return $text;
      }

      /**
       * Get a string representation of a title suitable for
       * including in a search index
       *
       * The title text is normalised for search by the content language, reduced
       * to the engine's legal search characters, lower-cased, and has its
       * whitespace collapsed. For titles in NS_FILE a trailing
       * png/gif/jpg/jpeg/ogg word is removed.
       *
       * @param SearchEngine $search
       * @return string A stripped-down title string ready for the search index
       */
      private function indexTitle( SearchEngine $search ) {
          global $wgContLang;

          $ns = $this->title->getNamespace();
          $title = $this->title->getText();

          $lc = $search->legalSearchChars() . '&#;';
          $t = $wgContLang->normalizeForSearch( $title );
          $t = preg_replace( "/[^{$lc}]+/", ' ', $t );
          $t = $wgContLang->lc( $t );

          # Handle 's, s'
          $t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
          $t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

          $t = preg_replace( "/\\s+/", ' ', $t );

          if ( $ns == NS_FILE ) {
              $t = preg_replace( "/ (png|gif|jpg|jpeg|ogg)$/", "", $t );
          }

          return trim( $t );
      }
  }