PageRenderTime 26ms CodeModel.GetById 31ms RepoModel.GetById 0ms app.codeStats 1ms

/inc/search/lucene.php

https://bitbucket.org/wez/mtrack/
PHP | 704 lines | 487 code | 106 blank | 111 comment | 76 complexity | aef6b75e1a299a3b00ae6c72f06871fc MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. <?php # vim:ts=2:sw=2:et:
  2. /* For licensing and copyright terms, see the file named LICENSE */
  3. require_once 'Zend/Search/Lucene.php';
  4. require_once 'Zend/Search/Lucene/Search/Highlighter/Interface.php';
  5. /**
  6. * Copyright (c) 2005 Richard Heyes (http://www.phpguru.org/)
  7. * PHP5 Implementation of the Porter Stemmer algorithm. Certain elements
  8. * were borrowed from the (broken) implementation by Jon Abernathy.
  9. */
  10. class PorterStemmer {
  11. /**
  12. * Regex for matching a consonant
  13. * @var string
  14. */
  15. private static $regex_consonant =
  16. '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';
  17. /**
  18. * Regex for matching a vowel
  19. * @var string
  20. */
  21. private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
  22. /**
  23. * Stems a word. Simple huh?
  24. *
  25. * @param string $word Word to stem
  26. * @return string Stemmed word
  27. */
  28. public static function Stem($word)
  29. {
  30. if (strlen($word) <= 2) {
  31. return $word;
  32. }
  33. $word = self::step1ab($word);
  34. $word = self::step1c($word);
  35. $word = self::step2($word);
  36. $word = self::step3($word);
  37. $word = self::step4($word);
  38. $word = self::step5($word);
  39. return $word;
  40. }
  41. /**
  42. * Step 1
  43. */
  44. private static function step1ab($word)
  45. {
  46. // Part a
  47. if (substr($word, -1) == 's') {
  48. self::replace($word, 'sses', 'ss')
  49. OR self::replace($word, 'ies', 'i')
  50. OR self::replace($word, 'ss', 'ss')
  51. OR self::replace($word, 's', '');
  52. }
  53. // Part b
  54. if (substr($word, -2, 1) != 'e' OR !self::replace($word, 'eed', 'ee', 0)) { // First rule
  55. $v = self::$regex_vowel;
  56. // ing and ed
  57. if ( preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', '')
  58. OR preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', '')) { // Note use of && and OR, for precedence reasons
  59. // If one of above two test successful
  60. if ( !self::replace($word, 'at', 'ate')
  61. AND !self::replace($word, 'bl', 'ble')
  62. AND !self::replace($word, 'iz', 'ize')) {
  63. // Double consonant ending
  64. if ( self::doubleConsonant($word)
  65. AND substr($word, -2) != 'll'
  66. AND substr($word, -2) != 'ss'
  67. AND substr($word, -2) != 'zz') {
  68. $word = substr($word, 0, -1);
  69. } else if (self::m($word) == 1 AND self::cvc($word)) {
  70. $word .= 'e';
  71. }
  72. }
  73. }
  74. }
  75. return $word;
  76. }
  77. /**
  78. * Step 1c
  79. *
  80. * @param string $word Word to stem
  81. */
  82. private static function step1c($word)
  83. {
  84. $v = self::$regex_vowel;
  85. if (substr($word, -1) == 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
  86. self::replace($word, 'y', 'i');
  87. }
  88. return $word;
  89. }
  90. /**
  91. * Step 2
  92. *
  93. * @param string $word Word to stem
  94. */
  95. private static function step2($word)
  96. {
  97. switch (substr($word, -2, 1)) {
  98. case 'a':
  99. self::replace($word, 'ational', 'ate', 0)
  100. OR self::replace($word, 'tional', 'tion', 0);
  101. break;
  102. case 'c':
  103. self::replace($word, 'enci', 'ence', 0)
  104. OR self::replace($word, 'anci', 'ance', 0);
  105. break;
  106. case 'e':
  107. self::replace($word, 'izer', 'ize', 0);
  108. break;
  109. case 'g':
  110. self::replace($word, 'logi', 'log', 0);
  111. break;
  112. case 'l':
  113. self::replace($word, 'entli', 'ent', 0)
  114. OR self::replace($word, 'ousli', 'ous', 0)
  115. OR self::replace($word, 'alli', 'al', 0)
  116. OR self::replace($word, 'bli', 'ble', 0)
  117. OR self::replace($word, 'eli', 'e', 0);
  118. break;
  119. case 'o':
  120. self::replace($word, 'ization', 'ize', 0)
  121. OR self::replace($word, 'ation', 'ate', 0)
  122. OR self::replace($word, 'ator', 'ate', 0);
  123. break;
  124. case 's':
  125. self::replace($word, 'iveness', 'ive', 0)
  126. OR self::replace($word, 'fulness', 'ful', 0)
  127. OR self::replace($word, 'ousness', 'ous', 0)
  128. OR self::replace($word, 'alism', 'al', 0);
  129. break;
  130. case 't':
  131. self::replace($word, 'biliti', 'ble', 0)
  132. OR self::replace($word, 'aliti', 'al', 0)
  133. OR self::replace($word, 'iviti', 'ive', 0);
  134. break;
  135. }
  136. return $word;
  137. }
  138. /**
  139. * Step 3
  140. *
  141. * @param string $word String to stem
  142. */
  143. private static function step3($word)
  144. {
  145. switch (substr($word, -2, 1)) {
  146. case 'a':
  147. self::replace($word, 'ical', 'ic', 0);
  148. break;
  149. case 's':
  150. self::replace($word, 'ness', '', 0);
  151. break;
  152. case 't':
  153. self::replace($word, 'icate', 'ic', 0)
  154. OR self::replace($word, 'iciti', 'ic', 0);
  155. break;
  156. case 'u':
  157. self::replace($word, 'ful', '', 0);
  158. break;
  159. case 'v':
  160. self::replace($word, 'ative', '', 0);
  161. break;
  162. case 'z':
  163. self::replace($word, 'alize', 'al', 0);
  164. break;
  165. }
  166. return $word;
  167. }
  168. /**
  169. * Step 4
  170. *
  171. * @param string $word Word to stem
  172. */
  173. private static function step4($word)
  174. {
  175. switch (substr($word, -2, 1)) {
  176. case 'a':
  177. self::replace($word, 'al', '', 1);
  178. break;
  179. case 'c':
  180. self::replace($word, 'ance', '', 1)
  181. OR self::replace($word, 'ence', '', 1);
  182. break;
  183. case 'e':
  184. self::replace($word, 'er', '', 1);
  185. break;
  186. case 'i':
  187. self::replace($word, 'ic', '', 1);
  188. break;
  189. case 'l':
  190. self::replace($word, 'able', '', 1)
  191. OR self::replace($word, 'ible', '', 1);
  192. break;
  193. case 'n':
  194. self::replace($word, 'ant', '', 1)
  195. OR self::replace($word, 'ement', '', 1)
  196. OR self::replace($word, 'ment', '', 1)
  197. OR self::replace($word, 'ent', '', 1);
  198. break;
  199. case 'o':
  200. if (substr($word, -4) == 'tion' OR substr($word, -4) == 'sion') {
  201. self::replace($word, 'ion', '', 1);
  202. } else {
  203. self::replace($word, 'ou', '', 1);
  204. }
  205. break;
  206. case 's':
  207. self::replace($word, 'ism', '', 1);
  208. break;
  209. case 't':
  210. self::replace($word, 'ate', '', 1)
  211. OR self::replace($word, 'iti', '', 1);
  212. break;
  213. case 'u':
  214. self::replace($word, 'ous', '', 1);
  215. break;
  216. case 'v':
  217. self::replace($word, 'ive', '', 1);
  218. break;
  219. case 'z':
  220. self::replace($word, 'ize', '', 1);
  221. break;
  222. }
  223. return $word;
  224. }
  225. /**
  226. * Step 5
  227. *
  228. * @param string $word Word to stem
  229. */
  230. private static function step5($word)
  231. {
  232. // Part a
  233. if (substr($word, -1) == 'e') {
  234. if (self::m(substr($word, 0, -1)) > 1) {
  235. self::replace($word, 'e', '');
  236. } else if (self::m(substr($word, 0, -1)) == 1) {
  237. if (!self::cvc(substr($word, 0, -1))) {
  238. self::replace($word, 'e', '');
  239. }
  240. }
  241. }
  242. // Part b
  243. if (self::m($word) > 1 AND
  244. self::doubleConsonant($word) AND substr($word, -1) == 'l') {
  245. $word = substr($word, 0, -1);
  246. }
  247. return $word;
  248. }
  249. /**
  250. * Replaces the first string with the second, at the end of the string. If third
  251. * arg is given, then the preceding string must match that m count at least.
  252. *
  253. * @param string $str String to check
  254. * @param string $check Ending to check for
  255. * @param string $repl Replacement string
  256. * @param int $m Optional minimum number of m() to meet
  257. * @return bool Whether the $check string was at the end
  258. * of the $str string. True does not necessarily mean
  259. * that it was replaced.
  260. */
  261. private static function replace(&$str, $check, $repl, $m = null)
  262. {
  263. $len = 0 - strlen($check);
  264. if (substr($str, $len) == $check) {
  265. $substr = substr($str, 0, $len);
  266. if (is_null($m) OR self::m($substr) > $m) {
  267. $str = $substr . $repl;
  268. }
  269. return true;
  270. }
  271. return false;
  272. }
  273. /**
  274. * What, you mean it's not obvious from the name?
  275. *
  276. * m() measures the number of consonant sequences in $str. if c is
  277. * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
  278. * presence,
  279. *
  280. * <c><v> gives 0
  281. * <c>vc<v> gives 1
  282. * <c>vcvc<v> gives 2
  283. * <c>vcvcvc<v> gives 3
  284. *
  285. * @param string $str The string to return the m count for
  286. * @return int The m count
  287. */
  288. private static function m($str)
  289. {
  290. $c = self::$regex_consonant;
  291. $v = self::$regex_vowel;
  292. $str = preg_replace("#^$c+#", '', $str);
  293. $str = preg_replace("#$v+$#", '', $str);
  294. preg_match_all("#($v+$c+)#", $str, $matches);
  295. return count($matches[1]);
  296. }
  297. /**
  298. * Returns true/false as to whether the given string contains two
  299. * of the same consonant next to each other at the end of the string.
  300. *
  301. * @param string $str String to check
  302. * @return bool Result
  303. */
  304. private static function doubleConsonant($str)
  305. {
  306. $c = self::$regex_consonant;
  307. return preg_match("#$c{2}$#", $str, $matches)
  308. AND $matches[0]{0} == $matches[0]{1};
  309. }
  310. /**
  311. * Checks for ending CVC sequence where second C is not W, X or Y
  312. *
  313. * @param string $str String to check
  314. * @return bool Result
  315. */
  316. private static function cvc($str)
  317. {
  318. $c = self::$regex_consonant;
  319. $v = self::$regex_vowel;
  320. return preg_match("#($c$v$c)$#", $str, $matches)
  321. AND strlen($matches[1]) == 3
  322. AND $matches[1]{2} != 'w'
  323. AND $matches[1]{2} != 'x'
  324. AND $matches[1]{2} != 'y';
  325. }
  326. }
  327. class MTrackSearchStemmer extends
  328. Zend_Search_Lucene_Analysis_TokenFilter {
  329. public function normalize(Zend_Search_Lucene_Analysis_Token $tok)
  330. {
  331. $text = $tok->getTermText();
  332. $text = PorterStemmer::Stem($text);
  333. $ntok = new Zend_Search_Lucene_Analysis_Token($text,
  334. $tok->getStartOffset(),
  335. $tok->getEndOffset());
  336. $ntok->setPositionIncrement($tok->getPositionIncrement());
  337. return $tok;
  338. }
  339. }
  340. class MTrackSearchDateToken extends Zend_Search_Lucene_Analysis_Token {
  341. }
  342. class MTrackSearchAnalyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
  343. {
  344. private $_position;
  345. private $_bytePosition;
  346. private $_moreTokens = array();
  347. function reset()
  348. {
  349. $this->_position = 0;
  350. $this->_bytePosition = 0;
  351. }
  352. function nextToken()
  353. {
  354. if (count($this->_moreTokens)) {
  355. $tok = array_shift($this->_moreTokens);
  356. return $tok;
  357. }
  358. if ($this->_input == null) {
  359. return null;
  360. }
  361. do {
  362. /* first check for date fields */
  363. $is_date = false;
  364. // 2008-12-22T05:42:42.285445Z
  365. if (preg_match('/\d{4}-\d\d-\d\d(?:T\d\d:\d\d:\d\d(?:\.\d+)?Z?)?/u',
  366. $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
  367. $is_date = true;
  368. } else if (!preg_match('/[\p{L}\p{N}_]+/u',
  369. $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
  370. return null;
  371. }
  372. if (!function_exists('mb_strtolower')) {
  373. $matchedWord = strtolower($match[0][0]);
  374. } else {
  375. $matchedWord = mb_strtolower($match[0][0], 'UTF-8');
  376. }
  377. $binStartPos = $match[0][1];
  378. $startPos = $this->_position +
  379. iconv_strlen(substr($this->_input, $this->_bytePosition,
  380. $binStartPos - $this->_bytePosition),
  381. 'UTF-8');
  382. $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8');
  383. $this->_bytePosition = $binStartPos + strlen($matchedWord);
  384. $this->_position = $endPos;
  385. if ($is_date) {
  386. // $this->_moreTokens[] = new MTrackSearchDateToken($matchedWord,
  387. // $startPos, $endPos);
  388. /* Seems very difficult to allow range searching on strings
  389. * of the form "2009-10-10", so we just smush it together */
  390. $no_sep = str_replace(array('-', ':'), array('', ''), $matchedWord);
  391. list($no_sep) = explode('.', $no_sep);
  392. /* full date and time */
  393. // $this->_moreTokens[] = new MTrackSearchDateToken(
  394. // $no_sep, $startPos, $endPos);
  395. /* date only */
  396. $date = substr($no_sep, 0, 8);
  397. $this->_moreTokens[] = new MTrackSearchDateToken(
  398. $date, $startPos, $endPos);
  399. } else {
  400. $token = new Zend_Search_Lucene_Analysis_Token(
  401. $matchedWord, $startPos, $endPos);
  402. $token = $this->normalize($token);
  403. if ($token !== null) {
  404. $this->_moreTokens[] = $token;
  405. }
  406. }
  407. if (!$is_date) {
  408. /* split by underscores and add those tokens too */
  409. foreach (explode('_', $matchedWord) as $ele) {
  410. $token = new Zend_Search_Lucene_Analysis_Token(
  411. $ele, $startPos, $endPos);
  412. $token = $this->normalize($token);
  413. if ($token !== null) {
  414. $this->_moreTokens[] = $token;
  415. }
  416. }
  417. }
  418. } while (count($this->_moreTokens) == 0);
  419. return array_shift($this->_moreTokens);
  420. }
  421. function normalize(Zend_Search_Lucene_Analysis_Token $tok)
  422. {
  423. if ($tok instanceof MTrackSearchDateToken) {
  424. return $tok;
  425. }
  426. return parent::normalize($tok);
  427. }
  428. }
  429. /* the highlighter insists on using html document things,
  430. * so we force in our own dummy so that we can present the
  431. * same text we used initially */
  432. class MTrackSearchLuceneDummyDocument {
  433. public $text;
  434. function __construct($text) {
  435. $this->text = $text;
  436. }
  437. function getFieldUtf8Value($name) {
  438. return $this->text;
  439. }
  440. }
  441. class MTrackHLText
  442. implements Zend_Search_Lucene_Search_Highlighter_Interface {
  443. public $doc;
  444. public $context = array();
  445. public $text;
  446. public $matched = array();
  447. function setDocument(Zend_Search_Lucene_Document_Html $doc)
  448. {
  449. /* sure, I'll get right on that... */
  450. }
  451. function getDocument() {
  452. /* we just return our dummy doc instead */
  453. return $this->doc;
  454. }
  455. function highlight($words) {
  456. if (!is_array($words)) {
  457. $words = array($words);
  458. }
  459. foreach ($words as $word) {
  460. foreach ($this->text as $line) {
  461. $x = stripos($line, $word);
  462. if ($x !== false) {
  463. if (isset($this->matched[$word])) {
  464. $this->matched[$word]++;
  465. } else {
  466. $this->matched[$word] = 1;
  467. }
  468. if (isset($this->context[$line])) {
  469. $this->context[$line]++;
  470. } else {
  471. $this->context[$line] = 1;
  472. }
  473. }
  474. }
  475. }
  476. }
  477. function __construct($text, $query)
  478. {
  479. $this->doc = new MTrackSearchLuceneDummyDocument($text);
  480. $text = wordwrap($text);
  481. $this->text = preg_split("/\r?\n/", $text);
  482. $query->htmlFragmenthighlightMatches($text, 'utf-8', $this);
  483. }
  484. }
  485. class MTrackSearchResultLucene extends MTrackSearchResult {
  486. var $_query;
  487. function getExcerpt($text) {
  488. $hl = new MTrackHLText($text, $this->_query);
  489. $lines = array();
  490. foreach ($hl->context as $line => $count) {
  491. $line = trim($line);
  492. if (!strlen($line)) continue;
  493. foreach ($hl->matched as $word => $wcount) {
  494. $line = preg_replace("/($word)/i",
  495. "<span class='hl'>\\1</span>", $line);
  496. }
  497. $lines[] = $line;
  498. if (count($lines) > 6) {
  499. break;
  500. }
  501. }
  502. $ex = join(" &hellip; ", $lines);
  503. if (strlen($ex)) {
  504. return "<div class='excerpt'>$ex</div>";
  505. }
  506. return '';
  507. }
  508. }
  509. class MTrackSearchEngineLucene implements IMTrackSearchEngine
  510. {
  511. var $idx = null;
  512. function getIdx() {
  513. if ($this->idx) return $this->idx;
  514. $ana = new MTrackSearchAnalyzer;
  515. $ana->addFilter(new MTrackSearchStemmer);
  516. Zend_Search_Lucene_Analysis_Analyzer::setDefault($ana);
  517. $p = MTrackConfig::get('core', 'searchdb');
  518. if (!is_dir($p)) {
  519. $idx = Zend_Search_Lucene::create($p);
  520. if (!is_dir($p)) {
  521. throw new Exception("unable to initialize search db in '$p', check permissions and ensure that the web server user is able to create files and directories in its parent");
  522. }
  523. chmod($p, 0777);
  524. } else {
  525. $idx = Zend_Search_Lucene::open($p);
  526. }
  527. $this->index = $idx;
  528. return $idx;
  529. }
  530. public function setBatchMode()
  531. {
  532. $idx = $this->getIdx();
  533. $idx->setMaxBufferedDocs(64);
  534. $idx->setMergeFactor(15);
  535. }
  536. public function commit($optimize = false)
  537. {
  538. $idx = $this->getIdx();
  539. if ($optimize) {
  540. $idx->optimize();
  541. }
  542. $idx->commit();
  543. $this->idx = null;
  544. }
  545. public function remove($object)
  546. {
  547. $idx = $this->getIdx();
  548. foreach ($idx->find("object:\"$object\"") as $hit) {
  549. $res = $idx->delete($hit->id);
  550. }
  551. $idx->commit();
  552. }
  553. public function add($object, $fields, $replace = false)
  554. {
  555. echo "lucene: add($object)\n";
  556. $idx = $this->getIdx();
  557. if ($replace) {
  558. foreach ($idx->find("object:\"$object\"") as $hit) {
  559. $idx->delete($hit->id);
  560. }
  561. }
  562. $doc = new Zend_Search_Lucene_Document();
  563. $doc->addField(Zend_Search_Lucene_Field::Text('object', $object, 'utf-8'));
  564. foreach ($fields as $key => $value) {
  565. if (!strlen($value)) continue;
  566. if (!strncmp($key, 'stored:', 7)) {
  567. $key = substr($key, 7);
  568. $F = Zend_Search_Lucene_Field::Text($key, $value, 'utf-8');
  569. } else {
  570. $F = Zend_Search_Lucene_Field::UnStored($key, $value, 'utf-8');
  571. }
  572. $doc->addField($F);
  573. }
  574. $idx->addDocument($doc);
  575. }
  576. public function search($query) {
  577. Zend_Search_Lucene::setTermsPerQueryLimit(150);
  578. Zend_Search_Lucene::setResultSetLimit(250);
  579. $q = Zend_Search_Lucene_Search_QueryParser::parse($query);
  580. $idx = $this->getIdx();
  581. $hits = $idx->find($q);
  582. $result = array();
  583. foreach ($hits as $hit) {
  584. if ($idx->isDeleted($hit->id)) {
  585. continue;
  586. }
  587. $r = new MTrackSearchResultLucene;
  588. $r->_query = $q;
  589. $r->objectid = $hit->object;
  590. $r->score = $hit->score;
  591. $result[] = $r;
  592. }
  593. return $result;
  594. }
  595. public function highlighterNeedsContext() {
  596. return true;
  597. }
  598. }