PageRenderTime 28ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/extensions/SphinxSearch/SphinxMWSearch.php

https://github.com/ChuguluGames/mediawiki-svn
PHP | 636 lines | 459 code | 63 blank | 114 comment | 67 complexity | a80a32f3529be77aeca046913e22c36c MD5 | raw file
  1. <?php
  2. /**
  3. * Class file for the SphinxMWSearch extension
  4. *
  5. * http://www.mediawiki.org/wiki/Extension:SphinxSearch
  6. *
  7. * Released under GNU General Public License (see http://www.fsf.org/licenses/gpl.html)
  8. *
  9. * @file
  10. * @ingroup Extensions
  11. * @author Svemir Brkic <svemir@deveblog.com>
  12. */
  13. class SphinxMWSearch extends SearchEngine {
  14. var $categories = array();
  15. var $exc_categories = array();
  16. var $db;
  17. var $sphinx_client = null;
  18. var $prefix_handlers = array(
  19. 'intitle' => 'filterByTitle',
  20. 'incategory' => 'filterByCategory',
  21. 'prefix' => 'filterByPrefix',
  22. );
  /**
   * Do not go to a near match if the query is prefixed with ~
   *
   * @param $searchterm String
   * @return Title|null null when the term starts with ~, otherwise whatever
   *   the parent implementation returns
   */
  public static function getNearMatch( $searchterm ) {
      // Compare through substr() so that an empty term does not raise an
      // "uninitialized string offset" notice the way $searchterm[0] would.
      if ( substr( (string)$searchterm, 0, 1 ) === '~' ) {
          return null;
      }
      return parent::getNearMatch( $searchterm );
  }
  /**
   * PrefixSearchBackend override for OpenSearch results.
   *
   * Runs a title-prefix query through a fresh SphinxMWSearch instance and
   * stores the prefixed text of every hit in $results.
   *
   * @param $namespaces Array namespaces assigned to the search engine
   * @param $term String prefix typed by the user
   * @param $limit Integer maximum number of results
   * @param $results Array output parameter, overwritten with the titles found
   * @return Boolean always false
   */
  static function prefixSearch( $namespaces, $term, $limit, &$results ) {
      $engine = new SphinxMWSearch( wfGetDB( DB_SLAVE ) );
      $engine->namespaces = $namespaces;
      $engine->setLimitOffset( $limit, 0 );
      $hits = $engine->searchText( '@page_title: ^' . $term . '*' );

      $results = array();
      if ( !$hits ) {
          return false;
      }
      for ( $hit = $hits->next(); $hit; $hit = $hits->next() ) {
          $results[] = $hit->getTitle()->getPrefixedText();
      }
      return false;
  }
  /**
   * Perform a full text search query and return a result set.
   *
   * Before the query is sent, '/' is always backslash-escaped, and the
   * grouping characters ( ) [ ] " are backslash-escaped whenever they are
   * unbalanced (an odd number of quotes, or unequal open/close counts).
   * Characters the user already escaped are left exactly as typed.
   *
   * @param string $term - Raw search term
   * @return SphinxMWSearchResultSet|null null when no client could be
   *   prepared or when the Sphinx query returned false
   * @access public
   */
  function searchText( $term ) {
      global $wgSphinxSearch_index_list;

      if ( !$this->sphinx_client ) {
          $this->sphinx_client = $this->prepareSphinxClient( $term );
      }
      if ( !$this->sphinx_client ) {
          return null;
      }

      $this->searchTerms = $term;
      $escape = '/';
      $delims = array(
          '(' => ')',
          '[' => ']',
          '"' => '',
      );
      // temporarily replace already escaped characters so they are neither
      // counted as delimiters nor escaped a second time
      $placeholders = array(
          '\\(' => '_PLC_O_PAR_',
          '\\)' => '_PLC_C_PAR_',
          '\\[' => '_PLC_O_BRA_',
          '\\]' => '_PLC_C_BRA_',
          '\\"' => '_PLC_QUOTE_',
          '\\/' => '_PLC_SLASH_',
      );
      $term = str_replace( array_keys( $placeholders ), $placeholders, $term );
      foreach ( $delims as $open => $close ) {
          $open_cnt = substr_count( $term, $open );
          if ( $close ) {
              // if counts do not match, escape them all
              $close_cnt = substr_count( $term, $close );
              if ( $open_cnt != $close_cnt ) {
                  $escape .= $open . $close;
              }
          } elseif ( $open_cnt % 2 == 1 ) {
              // if there is no closing symbol, count should be even
              $escape .= $open;
          }
      }
      // Escape while the placeholders are still in place: restoring them
      // first would turn a user-supplied "\(" into "\\(", i.e. an escaped
      // backslash followed by an unescaped parenthesis.
      $term = addcslashes( $term, $escape );
      $term = str_replace( $placeholders, array_keys( $placeholders ), $term );

      wfDebug( "SphinxSearch query: $term\n" );
      $resultSet = $this->sphinx_client->Query(
          $term,
          $wgSphinxSearch_index_list
      );
      if ( $resultSet === false ) {
          // Leave a trace of the failure instead of dropping it silently.
          wfDebug( "SphinxSearch query failed: " . $this->sphinx_client->GetLastError() . "\n" );
          return null;
      }
      return new SphinxMWSearchResultSet( $resultSet, $term, $this->sphinx_client, $this->db );
  }
  /**
   * Build a SphinxClient configured from the global settings and from the
   * namespace, redirect and category restrictions stored on this object.
   *
   * The term is passed by reference to the SphinxSearchBeforeResults and
   * SphinxSearchBeforeQuery hooks.
   *
   * @param string $term search term
   * @return SphinxClient: ready to run or false if term is empty
   */
  function prepareSphinxClient( &$term ) {
      global $wgSphinxSearch_sortmode, $wgSphinxSearch_sortby, $wgSphinxSearch_host,
          $wgSphinxSearch_port, $wgSphinxSearch_index_weights,
          $wgSphinxSearch_mode, $wgSphinxSearch_maxmatches,
          $wgSphinxSearch_cutoff, $wgSphinxSearch_weights;

      // blank searches never reach the daemon
      if ( trim( $term ) === '' ) {
          return false;
      }

      $hookArgs = array(
          &$term,
          &$this->offset,
          &$this->namespaces,
          &$this->categories,
          &$this->exc_categories
      );
      wfRunHooks( 'SphinxSearchBeforeResults', $hookArgs );

      $client = new SphinxClient();
      $client->SetServer( $wgSphinxSearch_host, $wgSphinxSearch_port );

      // optional weighting and match mode
      if ( $wgSphinxSearch_weights && count( $wgSphinxSearch_weights ) ) {
          $client->SetFieldWeights( $wgSphinxSearch_weights );
      }
      if ( is_array( $wgSphinxSearch_index_weights ) ) {
          $client->SetIndexWeights( $wgSphinxSearch_index_weights );
      }
      if ( $wgSphinxSearch_mode ) {
          $client->SetMatchMode( $wgSphinxSearch_mode );
      }

      // attribute filters
      if ( $this->namespaces && count( $this->namespaces ) ) {
          $client->SetFilter( 'page_namespace', $this->namespaces );
      }
      if ( !$this->showRedirects ) {
          $client->SetFilter( 'page_is_redirect', array( 0 ) );
      }
      if ( $this->categories && count( $this->categories ) ) {
          $client->SetFilter( 'category', $this->categories );
          $included = join( ', ', $this->categories );
          wfDebug( "SphinxSearch included categories: " . $included . "\n" );
      }
      if ( $this->exc_categories && count( $this->exc_categories ) ) {
          // third argument true: filter for the excluded categories
          $client->SetFilter( 'category', $this->exc_categories, true );
          $excluded = join( ', ', $this->exc_categories );
          wfDebug( "SphinxSearch excluded categories: " . $excluded . "\n" );
      }

      $client->SetSortMode( $wgSphinxSearch_sortmode, $wgSphinxSearch_sortby );
      $client->SetLimits(
          $this->offset,
          $this->limit,
          $wgSphinxSearch_maxmatches,
          $wgSphinxSearch_cutoff
      );

      wfRunHooks( 'SphinxSearchBeforeQuery', array( &$term, &$client ) );
      return $client;
  }
  /**
   * Look up the snippet highlight settings of a user.
   *
   * Reads the 'contextlines' option (default 2) and the 'contextchars'
   * option (default 75) from the given user.
   *
   * @param $user User
   * @return Array contextlines, contextchars
   */
  public static function userHighlightPrefs( &$user ) {
      return array(
          $user->getOption( 'contextlines', 2 ),
          $user->getOption( 'contextchars', 75 )
      );
  }
  /**
   * Prepare query for sphinx search daemon
   *
   * Strips a leading ~, rewrites "prefix:value" tokens through
   * replaceQueryPrefix() and turns " OR " / " AND " into " | " / " & ".
   * Text inside double quotes is copied through unchanged.
   *
   * @param string $query
   * @return string rewritten query
   */
  function replacePrefixes( $query ) {
      // ~ prefix is used to avoid near-term search, remove it now.
      // substr() is used instead of $query[0] so that an empty query does
      // not raise an "uninitialized string offset" notice.
      if ( substr( $query, 0, 1 ) === '~' ) {
          $query = (string)substr( $query, 1 );
      }

      // The prefix pattern is the same for every part, so build it once and
      // only when the query can contain a prefix at all.
      $pattern = null;
      if ( strpos( $query, ':' ) !== false ) {
          $pattern = '/(^|[| :]|-)(' . $this->preparePrefixRegexp() . '):([^ ]+)/i';
      }

      $parts = preg_split( '/(")/', $query, -1, PREG_SPLIT_DELIM_CAPTURE );
      $inquotes = false;
      $rewritten = '';
      foreach ( $parts as $part ) {
          if ( $part === '"' ) {
              // stuff in quotes doesn't get rewritten
              $rewritten .= $part;
              $inquotes = !$inquotes;
              continue;
          }
          if ( $inquotes ) {
              $rewritten .= $part;
              continue;
          }
          if ( $pattern !== null ) {
              $part = preg_replace_callback(
                  $pattern,
                  array( $this, 'replaceQueryPrefix' ),
                  $part
              );
          }
          $rewritten .= str_replace(
              array( ' OR ', ' AND ' ),
              array( ' | ', ' & ' ),
              $part
          );
      }
      return $rewritten;
  }
  /**
   * Build the alternation used to recognise "something:" prefixes.
   *
   * Also registers the content-language 'searchall' message as a prefix
   * handled by searchAllNamespaces().
   *
   * @return string Regexp to match namespaces and other prefixes
   */
  function preparePrefixRegexp() {
      global $wgContLang, $wgCanonicalNamespaceNames, $wgNamespaceAliases;

      // "search everything" keyword
      $this->prefix_handlers[ wfMsgForContent( 'searchall' ) ] = 'searchAllNamespaces';

      $localNames = $wgContLang->getNamespaces();
      $aliases = array_merge( $wgNamespaceAliases, $wgContLang->getNamespaceAliases() );
      $candidates = array_merge(
          $localNames,
          $wgCanonicalNamespaceNames,
          array_keys( $aliases ),
          array_keys( $this->prefix_handlers )
      );

      $quoted = array();
      foreach ( $candidates as $candidate ) {
          if ( $candidate == '' ) {
              continue;
          }
          // spaces are matched as underscores; '/' is the pattern delimiter
          $quoted[] = preg_quote( str_replace( ' ', '_', $candidate ), '/' );
      }
      return implode( '|', array_unique( $quoted ) );
  }
  /**
   * preg callback to process foo: prefixes in the query.
   *
   * Dispatches to the method registered in $this->prefix_handlers for the
   * matched prefix ($matches[2]); any other prefix is passed to
   * filterByNamespace().
   *
   * @param array $matches
   * @return string
   */
  function replaceQueryPrefix( $matches ) {
      $prefix = $matches[ 2 ];
      if ( !isset( $this->prefix_handlers[ $prefix ] ) ) {
          return $this->filterByNamespace( $matches );
      }
      $handler = $this->prefix_handlers[ $prefix ];
      return $this->$handler( $matches );
  }
  /**
   * Handle a "Namespace:term" prefix.
   *
   * When $matches[2] resolves to a namespace index, that index is appended
   * to $this->namespaces and the prefix is removed from the query;
   * otherwise the matched text is returned unchanged.
   *
   * @param array $matches regexp groups: 1 = leading separator,
   *   2 = prefix, 3 = text after the colon
   * @return string replacement for the whole match
   */
  function filterByNamespace( $matches ) {
      global $wgContLang;
      $inx = $wgContLang->getNsIndex( str_replace( ' ', '_', $matches[ 2 ] ) );
      if ( $inx === false ) {
          return $matches[ 0 ];
      }
      $this->namespaces[] = $inx;
      // The match consumed the separator in front of the prefix. Put a
      // space or | back so "foo help:bar" does not collapse into "foobar".
      $lead = ( $matches[ 1 ] === ' ' || $matches[ 1 ] === '|' ) ? $matches[ 1 ] : '';
      return $lead . $matches[ 3 ];
  }
  /**
   * Handle the "search everything" prefix: clears the namespace
   * restriction and removes the prefix from the query.
   *
   * @param array $matches regexp groups: 1 = leading separator,
   *   2 = prefix, 3 = text after the colon
   * @return string replacement for the whole match
   */
  function searchAllNamespaces( $matches ) {
      $this->namespaces = null;
      // Keep a consumed space or | so neighbouring words stay separated.
      $lead = ( $matches[ 1 ] === ' ' || $matches[ 1 ] === '|' ) ? $matches[ 1 ] : '';
      return $lead . $matches[ 3 ];
  }
  /**
   * Handle the "intitle:term" prefix by limiting the term to the
   * page_title field.
   *
   * @param array $matches regexp groups: 1 = leading separator,
   *   2 = prefix, 3 = text after the colon
   * @return string replacement for the whole match
   */
  function filterByTitle( $matches ) {
      // The match consumed the separator in front of the prefix. Put a
      // space or | back so the field operator is not glued to the word
      // that precedes it.
      $lead = ( $matches[ 1 ] === ' ' || $matches[ 1 ] === '|' ) ? $matches[ 1 ] : '';
      return $lead . '@page_title ' . $matches[ 3 ];
  }
  /**
   * Handle the "prefix:Title" prefix by turning it into a title-prefix
   * query on the page_title field.
   *
   * When the text before the first colon of the value resolves to a
   * namespace, the search is restricted to that namespace and only the
   * remainder is used as the title prefix.
   *
   * @param array $matches regexp groups: 1 = leading separator,
   *   2 = prefix, 3 = text after the colon
   * @return string replacement for the whole match
   */
  function filterByPrefix( $matches ) {
      global $wgContLang;

      $prefix = $matches[ 3 ];
      $colon = strpos( $prefix, ':' );
      if ( $colon !== false ) {
          $ns = substr( $prefix, 0, $colon );
          $inx = $wgContLang->getNsIndex( str_replace( ' ', '_', $ns ) );
          if ( $inx !== false ) {
              $this->namespaces = array( $inx );
              // Split on the first colon only, so the rest of a title that
              // itself contains colons is kept.
              $prefix = (string)substr( $prefix, $colon + 1 );
          }
          // When the part before the colon is not a namespace, it belongs
          // to the title, so $prefix is left untouched.
      }

      // Keep a consumed space or | so neighbouring words stay separated.
      $lead = ( $matches[ 1 ] === ' ' || $matches[ 1 ] === '|' ) ? $matches[ 1 ] : '';
      return $lead . '@page_title ^' . $prefix . '*';
  }
  /**
   * Handle the "incategory:Name" prefix.
   *
   * Looks up the page_id of the category page named by $matches[3] and
   * records it (as an integer) in the excluded list when the prefix was
   * preceded by '-', otherwise in the included list. The prefix is
   * removed from the query text.
   *
   * @param array $matches regexp groups: 1 = leading separator,
   *   2 = prefix, 3 = text after the colon
   * @return string always the empty string
   */
  function filterByCategory( $matches ) {
      $conds = array(
          'page_title' => $matches[ 3 ],
          'page_namespace' => NS_CATEGORY
      );
      $category = intval(
          $this->db->selectField( 'page', 'page_id', $conds, __METHOD__ )
      );

      $negated = ( $matches[ 1 ] === '-' );
      if ( $negated ) {
          $this->exc_categories[] = $category;
      } else {
          $this->categories[] = $category;
      }
      return '';
  }
  294. }
  295. class SphinxMWSearchResultSet extends SearchResultSet {
  296. var $mNdx = 0;
  297. var $sphinx_client;
  298. var $mSuggestion = '';
  299. var $db;
  300. var $total_hits = 0;
  /**
   * Build the result set from a raw Sphinx response.
   *
   * The keys of $resultSet['matches'] are used as page ids; the matching
   * rows of the page table are loaded and kept in the order Sphinx
   * returned them. Ids with no page row are skipped.
   *
   * @param $resultSet Array|mixed value returned by the Sphinx query
   * @param $terms String search term, split into $this->mTerms
   * @param $sphinx_client SphinxClient client that ran the query
   * @param $dbr Database connection; wfGetDB( DB_SLAVE ) is used when empty
   */
  function __construct( $resultSet, $terms, $sphinx_client, $dbr ) {
      global $wgSearchHighlightBoundaries;

      $this->sphinx_client = $sphinx_client;
      $this->mResultSet = array();
      $this->db = $dbr ? $dbr : wfGetDB( DB_SLAVE );

      if ( is_array( $resultSet ) && isset( $resultSet['matches'] )
          && is_array( $resultSet['matches'] )
      ) {
          if ( isset( $resultSet['total_found'] ) ) {
              $this->total_hits = $resultSet['total_found'];
          }
          $ids = array_keys( $resultSet['matches'] );
          // An empty id list must not reach the database layer.
          if ( count( $ids ) ) {
              // One query for all matches instead of one query per match.
              $res = $this->db->select(
                  'page',
                  array( 'page_id', 'page_title', 'page_namespace' ),
                  array( 'page_id' => $ids ),
                  __METHOD__
              );
              $rows = array();
              while ( $row = $this->db->fetchObject( $res ) ) {
                  $rows[ $row->page_id ] = $row;
              }
              // Restore the order of the Sphinx matches, which the
              // database does not guarantee.
              foreach ( $ids as $id ) {
                  if ( isset( $rows[ $id ] ) ) {
                      $this->mResultSet[] = $rows[ $id ];
                  }
              }
          }
      }

      $this->mNdx = 0;
      $this->mTerms = preg_split( "/$wgSearchHighlightBoundaries+/ui", $terms );
  }
  /**
   * Some search modes return a suggested alternate term if there are
   * no exact hits. Returns true if there is one on this set.
   *
   * The suggestion is recomputed on every call with the backend selected
   * by $wgSphinxSuggestMode ('enchant', 'soundex' or 'aspell').
   *
   * @return Boolean
   */
  function hasSuggestion() {
      global $wgSphinxSuggestMode;

      if ( !$wgSphinxSuggestMode ) {
          return false;
      }

      $this->mSuggestion = '';
      if ( $wgSphinxSuggestMode === 'aspell' ) {
          $this->suggestWithAspell();
      } elseif ( $wgSphinxSuggestMode === 'soundex' ) {
          $this->suggestWithSoundex();
      } elseif ( $wgSphinxSuggestMode === 'enchant' ) {
          $this->suggestWithEnchant();
      }

      return $this->mSuggestion ? true : false;
  }
  /**
   * Wiki-specific search suggestions using enchant library.
   * Use SphinxSearch_setup.php to create the dictionary
   *
   * Every term the 'sphinx' dictionary rejects is replaced by the first
   * suggestion that differs from it when lower-cased. $this->mSuggestion
   * is set only when at least one term was replaced.
   */
  function suggestWithEnchant() {
      if ( !function_exists( 'enchant_broker_init' ) ) {
          return;
      }

      $broker = enchant_broker_init();
      enchant_broker_set_dict_path( $broker, ENCHANT_MYSPELL, dirname( __FILE__ ) );
      if ( !enchant_broker_dict_exists( $broker, 'sphinx' ) ) {
          enchant_broker_free( $broker );
          return;
      }

      $dict = enchant_broker_request_dict( $broker, 'sphinx' );
      $replaced = false;
      $words = array();
      foreach ( $this->mTerms as $term ) {
          if ( !enchant_dict_check( $dict, $term ) ) {
              $alternatives = enchant_dict_suggest( $dict, $term );
              while ( count( $alternatives ) ) {
                  $alternative = array_shift( $alternatives );
                  if ( strtolower( $alternative ) != strtolower( $term ) ) {
                      $term = $alternative;
                      $replaced = true;
                      break;
                  }
              }
          }
          $words[] = $term;
      }
      enchant_broker_free_dict( $dict );

      if ( $replaced ) {
          $this->mSuggestion = trim( implode( ' ', $words ) );
      }
      enchant_broker_free( $broker );
  }
  /**
   * Default (weak) suggestions implementation relies on MySQL soundex
   *
   * Picks a single page title that SOUNDS LIKE the joined search terms,
   * ordered by page_counter descending, and stores it in $this->mSuggestion.
   */
  function suggestWithSoundex() {
      $quoted = $this->db->addQuotes( join( ' ', $this->mTerms ) );

      $conds = array(
          "page_title SOUNDS LIKE " . $quoted,
          // avoid (re)recommending the search string
          "page_title NOT LIKE " . $quoted
      );
      $options = array(
          'ORDER BY' => 'page_counter desc',
          'LIMIT' => 1
      );
      $res = $this->db->select(
          array( 'page' ),
          array( 'page_title' ),
          $conds,
          __METHOD__,
          $options
      );

      $row = $this->db->fetchObject( $res );
      if ( is_object( $row ) ) {
          $this->mSuggestion = trim( $row->page_title );
      }
  }
  /**
   * Search suggestions using the aspell command line program.
   *
   * Pipes the search terms into aspell, takes the first suggestion aspell
   * offers for every word it flags, and stores the corrected phrase in
   * $this->mSuggestion when at least one word changed.
   */
  function suggestWithAspell() {
      global $wgLanguageCode, $wgSphinxSearchPersonalDictionary, $wgSphinxSearchAspellPath;

      // aspell will only return mis-spelled words, so remember all here
      $words = $this->mTerms;
      $word_suggestions = array();
      foreach ( $words as $word ) {
          $word_suggestions[ $word ] = $word;
      }

      // prepare the system call with optional dictionary; every value
      // placed on the command line is shell-escaped
      $aspellcommand = 'echo ' . escapeshellarg( join( ' ', $words ) ) .
          ' | ' . escapeshellarg( $wgSphinxSearchAspellPath ) .
          ' -a --ignore-accents --ignore-case --lang=' . escapeshellarg( $wgLanguageCode );
      if ( $wgSphinxSearchPersonalDictionary ) {
          $aspellcommand .= ' --home-dir=' .
              escapeshellarg( dirname( $wgSphinxSearchPersonalDictionary ) );
          $aspellcommand .= ' -p ' .
              escapeshellarg( basename( $wgSphinxSearchPersonalDictionary ) );
      }

      // run aspell; shell_exec() gives no string when the command fails
      // or prints nothing
      $shell_return = shell_exec( $aspellcommand );
      if ( !is_string( $shell_return ) || $shell_return === '' ) {
          return;
      }

      // parse return line by line
      $suggestion_needed = false;
      foreach ( explode( "\n", $shell_return ) as $line ) {
          // lines with suggestions start with &; blank lines are skipped
          // before the first character is inspected
          if ( $line === '' || $line[0] !== '&' ) {
              continue;
          }
          $colon = strpos( $line, ':' );
          $correction = explode( ' ', $line );
          if ( $colon === false || !isset( $correction[ 1 ] ) ) {
              continue;
          }
          $word = $correction[ 1 ];
          $suggestions = explode( ', ', (string)substr( $line, $colon + 2 ) );
          $guess = array_shift( $suggestions );
          if ( strtolower( $word ) != strtolower( $guess ) ) {
              $word_suggestions[ $word ] = $guess;
              $suggestion_needed = true;
          }
      }

      if ( $suggestion_needed ) {
          $this->mSuggestion = join( ' ', $word_suggestions );
      }
  }
  /**
   * @return String: suggested query as stored in $this->mSuggestion
   *   ('' when none has been set)
   */
  function getSuggestionQuery() {
      $suggested = $this->mSuggestion;
      return $suggested;
  }
  /**
   * @return String: HTML suggested query, '' if none. The suggestion
   *   is plain text, so it is HTML-escaped here; no highlighting is added.
   */
  function getSuggestionSnippet() {
      return htmlspecialchars( (string)$this->mSuggestion );
  }
  /**
   * @return Array: search terms, as split by the constructor
   */
  function termMatches() {
      $terms = $this->mTerms;
      return $terms;
  }
  /**
   * @return Integer: number of rows held in this result set
   */
  function numRows() {
      $rows = $this->mResultSet;
      return count( $rows );
  }
  /**
   * Some search modes return a total hit count for the query
   * in the entire article database. This may include pages
   * in namespaces that would not be matched on the given
   * settings.
   *
   * Here the value is the 'total_found' figure recorded by the
   * constructor (0 when none was recorded).
   *
   * @return Integer
   */
  function getTotalHits() {
      $total = $this->total_hits;
      return $total;
  }
  /**
   * Return information about how and from where the results were fetched.
   *
   * @return string the 'sphinxPowered' message with the Sphinx URL
   *   as its parameter
   */
  function getInfo() {
      $url = "http://www.sphinxsearch.com";
      return wfMsg( 'sphinxPowered', $url );
  }
  /**
   * Advance the internal cursor and wrap the row it pointed at.
   *
   * @return SphinxMWSearchResult: next result, false if none
   */
  function next() {
      if ( !isset( $this->mResultSet[ $this->mNdx ] ) ) {
          return false;
      }
      $row = $this->mResultSet[ $this->mNdx++ ];
      return new SphinxMWSearchResult( $row, $this->sphinx_client );
  }
  /**
   * Release the stored rows by unsetting the mResultSet property.
   */
  function free() {
      unset( $this->mResultSet );
  }
  507. }
  508. class SphinxMWSearchResult extends SearchResult {
  509. var $sphinx_client = null;
  /**
   * Remember the Sphinx client (used later to build excerpts), then let
   * the parent class initialise itself from the row.
   *
   * @param $row Object row handed to the parent constructor
   * @param $sphinx_client SphinxClient
   */
  function __construct( $row, $sphinx_client ) {
      $this->sphinx_client = $sphinx_client;
      parent::__construct( $row );
  }
  /**
   * Emulates SearchEngine getTextSnippet so that we can use our own userHighlightPrefs
   * (only needed until userHighlightPrefs in SearchEngine is fixed)
   *
   * When $wgSphinxSearchMWHighlighter is set, the snippet is produced by
   * SearchHighlighter; otherwise it is built from Sphinx excerpts, with
   * some wiki markup removed and the text HTML-escaped.
   *
   * @param $terms array of terms to highlight
   * @return string highlighted text snippet (HTML)
   */
  function getTextSnippet( $terms ) {
      global $wgUser, $wgAdvancedSearchHighlighting;
      global $wgSphinxSearchMWHighlighter, $wgSphinxSearch_index;

      $this->initText();
      list( $contextlines, $contextchars ) = SphinxMWSearch::userHighlightPrefs( $wgUser );

      if ( $wgSphinxSearchMWHighlighter ) {
          $h = new SearchHighlighter();
          if ( $wgAdvancedSearchHighlighting ) {
              return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
          }
          return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
      }

      // Plain-text markers are used around matches so that they survive
      // HTML escaping and can be turned into real markup afterwards.
      $excerpts_opt = array(
          "before_match" => "(searchmatch)",
          "after_match" => "(/searchmatch)",
          "chunk_separator" => " ... ",
          "limit" => $contextlines * $contextchars,
          "around" => $contextchars,
      );
      $excerpts = $this->sphinx_client->BuildExcerpts(
          array( $this->mText ),
          $wgSphinxSearch_index,
          join( ' ', $terms ),
          $excerpts_opt
      );

      if ( !is_array( $excerpts ) ) {
          // The error text comes from the search daemon; escape it before
          // it is placed into the message.
          return wfMsg(
              'internalerror_info',
              htmlspecialchars( (string)$this->sphinx_client->GetLastError() )
          );
      }

      $ret = '';
      foreach ( $excerpts as $entry ) {
          // remove some wiki markup
          $entry = preg_replace(
              '/([\[\]\{\}\*\#\|\!]+|==+|<br ?\/?>)/',
              ' ',
              $entry
          );
          // Escape everything HTML-significant (&, <, >, "), not only the
          // angle brackets.
          $entry = htmlspecialchars( $entry );
          $entry = str_replace(
              array( "(searchmatch)", "(/searchmatch)" ),
              array( "<span class='searchmatch'>", "</span>" ),
              $entry
          );
          $ret .= "<div style='margin: 0.2em 1em 0.2em 1em;'>$entry</div>\n";
      }
      return $ret;
  }
  573. }