PageRenderTime 50ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 1ms

/src/site/tmp/install_4a925da139185/admin/plugins/dokuwiki/doku_search.php

https://bitbucket.org/manchas/jrobotz
PHP | 676 lines | 468 code | 95 blank | 113 comment | 121 complexity | de0bdfd50296a971d68052c24c6c930c MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.1, GPL-2.0, Apache-2.0
  1. <?php
  2. /**
  3. * @package JFusion_dokuwiki
  4. * @subpackage dokuwiki
  5. * @author Andreas Gohr <andi@splitbrain.org>
  6. * @author JFusion development team
  7. * @copyright Copyright (C) 2008 JFusion. All rights reserved.
  8. * @license http://www.gnu.org/copyleft/gpl.html GNU/GPL
  9. */
  10. function ft_pageSearch($query,&$highlight){
  11. $q = ft_queryParser($query);
  12. $highlight = array();
  13. // remember for hilighting later
  14. foreach($q['words'] as $wrd){
  15. $highlight[] = str_replace('*','',$wrd);
  16. }
  17. // lookup all words found in the query
  18. $words = array_merge($q['and'],$q['not']);
  19. if(!count($words)) return array();
  20. $result = idx_lookup($words);
  21. if(!count($result)) return array();
  22. // merge search results with query
  23. foreach($q['and'] as $pos => $w){
  24. $q['and'][$pos] = $result[$w];
  25. }
  26. // create a list of unwanted docs
  27. $not = array();
  28. foreach($q['not'] as $pos => $w){
  29. $not = array_merge($not,array_keys($result[$w]));
  30. }
  31. // combine and-words
  32. if(count($q['and']) > 1){
  33. $docs = ft_resultCombine($q['and']);
  34. }else{
  35. $docs = $q['and'][0];
  36. }
  37. if(!count($docs)) return array();
  38. // create a list of hidden pages in the result
  39. $hidden = array();
  40. $hidden = array_filter(array_keys($docs),'isHiddenPage');
  41. $not = array_merge($not,$hidden);
  42. // filter unmatched namespaces
  43. if(!empty($q['ns'])) {
  44. $pattern = implode('|^',$q['ns']);
  45. foreach($docs as $key => $val) {
  46. if(!preg_match('/^'.$pattern.'/',$key)) {
  47. unset($docs[$key]);
  48. }
  49. }
  50. }
  51. // remove negative matches
  52. foreach($not as $n){
  53. unset($docs[$n]);
  54. }
  55. if(!count($docs)) return array();
  56. // handle phrases
  57. if(count($q['phrases'])){
  58. $q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
  59. // use this for higlighting later:
  60. $highlight = array_merge($highlight,$q['phrases']);
  61. $q['phrases'] = array_map('preg_quote_cb',$q['phrases']);
  62. // check the source of all documents for the exact phrases
  63. foreach(array_keys($docs) as $id){
  64. $text = utf8_strtolower(rawWiki($id));
  65. foreach($q['phrases'] as $phrase){
  66. if(!preg_match('/'.$phrase.'/usi',$text)){
  67. unset($docs[$id]); // no hit - remove
  68. break;
  69. }
  70. }
  71. }
  72. }
  73. if(!count($docs)) return array();
  74. // check ACL permissions
  75. /*
  76. foreach(array_keys($docs) as $doc){
  77. if(auth_quickaclcheck($doc) < AUTH_READ){
  78. unset($docs[$doc]);
  79. }
  80. }
  81. */
  82. if(!count($docs)) return array();
  83. // if there are any hits left, sort them by count
  84. arsort($docs);
  85. return $docs;
  86. }
  87. /**
  88. * Returns the backlinks for a given page
  89. *
  90. * Does a quick lookup with the fulltext index, then
  91. * evaluates the instructions of the found pages
  92. */
  93. function ft_backlinks($id){
  94. global $conf;
  95. $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
  96. $stopwords = @file_exists($swfile) ? file($swfile) : array();
  97. $result = array();
  98. // quick lookup of the pagename
  99. $page = noNS($id);
  100. $matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .)
  101. $docs = array_keys(ft_resultCombine(array_values($matches)));
  102. $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages
  103. if(!count($docs)) return $result;
  104. require_once(DOKU_INC.'inc/parserutils.php');
  105. // check metadata for matching links
  106. foreach($docs as $match){
  107. // metadata relation reference links are already resolved
  108. $links = p_get_metadata($match,'relation references');
  109. if (isset($links[$id])) $result[] = $match;
  110. }
  111. if(!count($result)) return $result;
  112. // check ACL permissions
  113. foreach(array_keys($result) as $idx){
  114. if(auth_quickaclcheck($result[$idx]) < AUTH_READ){
  115. unset($result[$idx]);
  116. }
  117. }
  118. sort($result);
  119. return $result;
  120. }
  121. /**
  122. * Returns the pages that use a given media file
  123. *
  124. * Does a quick lookup with the fulltext index, then
  125. * evaluates the instructions of the found pages
  126. *
  127. * Aborts after $max found results
  128. */
  129. function ft_mediause($id,$max){
  130. global $conf;
  131. $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
  132. $stopwords = @file_exists($swfile) ? file($swfile) : array();
  133. if(!$max) $max = 1; // need to find at least one
  134. $result = array();
  135. // quick lookup of the mediafile
  136. $media = noNS($id);
  137. $matches = idx_lookup(idx_tokenizer($media,$stopwords));
  138. $docs = array_keys(ft_resultCombine(array_values($matches)));
  139. if(!count($docs)) return $result;
  140. // go through all found pages
  141. $found = 0;
  142. $pcre = preg_quote($media,'/');
  143. foreach($docs as $doc){
  144. $ns = getNS($doc);
  145. preg_match_all('/\{\{([^|}]*'.$pcre.'[^|}]*)(|[^}]+)?\}\}/i',rawWiki($doc),$matches);
  146. foreach($matches[1] as $img){
  147. $img = trim($img);
  148. if(preg_match('/^https?:\/\//i',$img)) continue; // skip external images
  149. list($img) = explode('?',$img); // remove any parameters
  150. resolve_mediaid($ns,$img,$exists); // resolve the possibly relative img
  151. if($img == $id){ // we have a match
  152. $result[] = $doc;
  153. $found++;
  154. break;
  155. }
  156. }
  157. if($found >= $max) break;
  158. }
  159. sort($result);
  160. return $result;
  161. }
  162. /**
  163. * Quicksearch for pagenames
  164. *
  165. * By default it only matches the pagename and ignores the
  166. * namespace. This can be changed with the second parameter
  167. *
  168. * @author Andreas Gohr <andi@splitbrain.org>
  169. */
  170. function ft_pageLookup($id,$pageonly=true){
  171. global $conf, $rootFolder;
  172. $id = preg_quote($id,'/');
  173. $pages = file($rootFolder . '/data/index'.'/page.idx');
  174. if($id) $pages = array_values(preg_grep('/'.$id.'/',$pages));
  175. $cnt = count($pages);
  176. for($i=0; $i<$cnt; $i++){
  177. if($pageonly){
  178. if(!preg_match('/'.$id.'/',noNS($pages[$i]))){
  179. unset($pages[$i]);
  180. continue;
  181. }
  182. }
  183. if(!page_exists($pages[$i])){
  184. unset($pages[$i]);
  185. continue;
  186. }
  187. }
  188. $pages = array_filter($pages,'isVisiblePage'); // discard hidden pages
  189. if(!count($pages)) return array();
  190. // check ACL permissions
  191. foreach(array_keys($pages) as $idx){
  192. if(auth_quickaclcheck($pages[$idx]) < AUTH_READ){
  193. unset($pages[$idx]);
  194. }
  195. }
  196. $pages = array_map('trim',$pages);
  197. sort($pages);
  198. return $pages;
  199. }
/**
 * Creates a snippet extract
 *
 * Looks for up to three matches of the highlight terms in the raw text of
 * the page, cuts out some context around each match and returns the pieces
 * joined with '... ', with every hit wrapped in
 * <strong class="search_hit">.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string       $id        the page to create the snippet for
 * @param array|string $highlight the terms to highlight; empty entries
 *                                are ignored
 * @return string the snippet markup
 */
function ft_snippet($id,$highlight){
    $text = rawWiki($id);
    $match = array();
    $snippets = array();
    // $offset is a byte offset (for preg_match), $utf8_offset and $end are
    // character offsets into $text
    $utf8_offset = $offset = $end = 0;
    $len = utf8_strlen($text);
    // build a regexp from the phrases to highlight
    $re = join('|',array_map('preg_quote_cb',array_filter((array) $highlight)));
    // the counter limits the search to three matches
    for ($cnt=3; $cnt--;) {
        if (!preg_match('#('.$re.')#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
        // with PREG_OFFSET_CAPTURE each match is array(matched string, byte offset)
        list($str,$idx) = $match[0];
        // convert $idx (a byte offset) into a utf8 character offset
        $utf8_idx = utf8_strlen(substr($text,0,$idx));
        $utf8_len = utf8_strlen($str);
        // establish context, up to 100 characters surrounding the match string
        // first look to see if we can go 100 either side,
        // then drop to 50 adding any excess if the other side can't go to 50,
        $pre = min($utf8_idx-$utf8_offset,100);
        $post = min($len-$utf8_idx-$utf8_len,100);
        if ($pre>50 && $post>50) {
            $pre = $post = 50;
        } else if ($pre>50) {
            $pre = min($pre,100-$post);
        } else if ($post>50) {
            $post = min($post, 100-$pre);
        } else {
            // both are less than 50, means the context is the whole string
            // make it so and break out of this loop - there is no need for the
            // complex snippet calculations
            $snippets = array($text);
            break;
        }
        // establish context start and end points, try to append to previous
        // context if possible
        $start = $utf8_idx - $pre;
        $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
        $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context
        if ($append) {
            // overlaps the previous snippet: extend that one instead of starting a new one
            $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
        } else {
            $snippets[] = utf8_substr($text,$start,$end-$start);
        }
        // set $offset for next match attempt
        // subtract strlen to avoid splitting a potential search success,
        // this is an approximation as the search pattern may match strings
        // of varying length and it will fail if the context snippet
        // boundary breaks a matching string longer than the current match
        $utf8_offset = $utf8_idx + $post;
        $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
        $offset = utf8_correctIdx($text,$offset);
    }
    // hits are first wrapped in a \1 control character, the joined text is
    // passed through hsc() (presumably HTML escaping - confirm against its
    // definition) and only then the markers are turned into <strong> tags
    $m = "\1";
    $snippets = preg_replace('#('.$re.')#iu',$m.'$1'.$m,$snippets);
    $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets)));
    return $snippet;
}
  261. /**
  262. * Combine found documents and sum up their scores
  263. *
  264. * This function is used to combine searched words with a logical
  265. * AND. Only documents available in all arrays are returned.
  266. *
  267. * based upon PEAR's PHP_Compat function for array_intersect_key()
  268. *
  269. * @param array $args An array of page arrays
  270. */
  271. function ft_resultCombine($args){
  272. $array_count = count($args);
  273. if($array_count == 1){
  274. return $args[0];
  275. }
  276. $result = array();
  277. if ($array_count > 1) {
  278. foreach ($args[0] as $key => $value) {
  279. $result[$key] = $value;
  280. for ($i = 1; $i !== $array_count; $i++) {
  281. if (!isset($args[$i][$key])) {
  282. unset($result[$key]);
  283. break;
  284. }
  285. $result[$key] += $args[$i][$key];
  286. }
  287. }
  288. }
  289. return $result;
  290. }
  291. /**
  292. * Builds an array of search words from a query
  293. *
  294. * @todo support OR and parenthesises?
  295. */
  296. function ft_queryParser($query){
  297. global $conf;
  298. $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
  299. if(@file_exists($swfile)){
  300. $stopwords = file($swfile);
  301. }else{
  302. $stopwords = array();
  303. }
  304. $q = array();
  305. $q['query'] = $query;
  306. $q['ns'] = array();
  307. $q['phrases'] = array();
  308. $q['words'] = array();
  309. $q['and'] = array();
  310. $q['not'] = array();
  311. // strip namespace from query
  312. if(preg_match('/([^@]*)@(.*)/',$query,$match)) {
  313. $query = $match[1];
  314. $q['ns'] = explode('@',preg_replace("/ /",'',$match[2]));
  315. }
  316. // handle phrase searches
  317. while(preg_match('/"(.*?)"/',$query,$match)){
  318. $q['phrases'][] = $match[1];
  319. $q['and'] = array_merge($q['and'], idx_tokenizer($match[0],$stopwords));
  320. $query = preg_replace('/"(.*?)"/','',$query,1);
  321. }
  322. $words = explode(' ',$query);
  323. foreach($words as $w){
  324. if($w{0} == '-'){
  325. $token = idx_tokenizer($w,$stopwords,true);
  326. if(count($token)) $q['not'] = array_merge($q['not'],$token);
  327. }else{
  328. // asian "words" need to be searched as phrases
  329. if(@preg_match_all('/(('.IDX_ASIAN.')+)/u',$w,$matches)){
  330. $q['phrases'] = array_merge($q['phrases'],$matches[1]);
  331. }
  332. $token = idx_tokenizer($w,$stopwords,true);
  333. if(count($token)){
  334. $q['and'] = array_merge($q['and'],$token);
  335. $q['words'] = array_merge($q['words'],$token);
  336. }
  337. }
  338. }
  339. return $q;
  340. }
  341. function idx_tokenizer($string,&$stopwords,$wc=false){
  342. $words = array();
  343. $wc = ($wc) ? '' : $wc = '\*';
  344. if(preg_match('/[^0-9A-Za-z]/u', $string)){
  345. // handle asian chars as single words (may fail on older PHP version)
  346. $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
  347. if(!is_null($asia)) $string = $asia; //recover from regexp failure
  348. $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
  349. foreach ($arr as $w) {
  350. if (!is_numeric($w) && strlen($w) < 3) continue;
  351. $w = utf8_strtolower($w);
  352. if($stopwords && is_int(array_search("$w\n",$stopwords))) continue;
  353. $words[] = $w;
  354. }
  355. }else{
  356. $w = $string;
  357. if (!is_numeric($w) && strlen($w) < 3) return $words;
  358. $w = strtolower($w);
  359. if(is_int(array_search("$w\n",$stopwords))) return $words;
  360. $words[] = $w;
  361. }
  362. return $words;
  363. }
  364. function idx_lookup($words){
  365. global $conf;
  366. $result = array();
  367. $wids = idx_getIndexWordsSorted($words, $result);
  368. if(empty($wids)) return array();
  369. // load known words and documents
  370. $page_idx = idx_getIndex('page','');
  371. $docs = array(); // hold docs found
  372. foreach(array_keys($wids) as $wlen){
  373. $wids[$wlen] = array_unique($wids[$wlen]);
  374. $index = idx_getIndex('i',$wlen);
  375. foreach($wids[$wlen] as $ixid){
  376. if($ixid < count($index))
  377. $docs["$wlen*$ixid"] = idx_parseIndexLine($page_idx,$index[$ixid]);
  378. }
  379. }
  380. // merge found pages into final result array
  381. $final = array();
  382. foreach(array_keys($result) as $word){
  383. $final[$word] = array();
  384. foreach($result[$word] as $wid){
  385. $hits = &$docs[$wid];
  386. foreach ($hits as $hitkey => $hitcnt) {
  387. $final[$word][$hitkey] = $hitcnt + $final[$word][$hitkey];
  388. }
  389. }
  390. }
  391. return $final;
  392. }
/**
 * Finds the word ids of the given query words in the word indexes
 *
 * @param array $words   the query words; a leading and/or trailing '*'
 *                       marks a wildcard
 * @param array &$result receives, for every query word, a list of
 *                       "<index length>*<word id>" strings
 * @return array index length => list of word ids found in that index
 */
function idx_getIndexWordsSorted($words,&$result){
    // parse and sort tokens
    $tokens = array();
    $tokenlength = array();
    $tokenwild = array();
    foreach($words as $word){
        $result[$word] = array();
        // bit 1 is set for a leading wildcard, bit 2 for a trailing one
        $wild = 0;
        $xword = $word;
        $wlen = wordlen($word);
        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild |= 1;
            $wlen -= 1;
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild |= 2;
            $wlen -= 1;
        }
        // too short to be searched, unless it is a number or a wildcard term
        if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;
        if(!isset($tokens[$xword])){
            $tokenlength[$wlen][] = $xword;
        }
        if($wild){
            // anchor the pattern on every side that carries no wildcard
            $ptn = preg_quote($xword,'/');
            if(($wild&1) == 0) $ptn = '^'.$ptn;
            if(($wild&2) == 0) $ptn = $ptn.'$';
            $tokens[$xword][] = array($word, '/'.$ptn.'/');
            if(!isset($tokenwild[$xword])) $tokenwild[$xword] = $wlen;
        }else
            $tokens[$xword][] = array($word, null);
    }
    asort($tokenwild);
    // $tokens = array( base word => array( [ query word , grep pattern ] ... ) ... )
    // $tokenlength = array( base word length => base word ... )
    // $tokenwild = array( base word => base word length ... )

    // without wildcards only the indexes of the exact lengths are needed,
    // otherwise the shortest length is passed as lower bound
    $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
    $indexes_known = idx_indexLengths($length_filter);
    if(!empty($tokenwild)) sort($indexes_known);
    // get word IDs
    $wids = array();
    foreach($indexes_known as $ixlen){
        $word_idx = idx_getIndex('w',$ixlen);
        // handle exact search
        if(isset($tokenlength[$ixlen])){
            foreach($tokenlength[$ixlen] as $xword){
                // the index lines still end with their line break
                $wid = array_search("$xword\n",$word_idx);
                if(is_int($wid)){
                    $wids[$ixlen][] = $wid;
                    foreach($tokens[$xword] as $w)
                        $result[$w[0]][] = "$ixlen*$wid";
                }
            }
        }
        // handle wildcard search
        foreach($tokenwild as $xword => $wlen){
            // $tokenwild is sorted by length, so all remaining base words
            // are too long for this index as well
            if($wlen >= $ixlen) break;
            foreach($tokens[$xword] as $w){
                if(is_null($w[1])) continue;
                foreach(array_keys(preg_grep($w[1],$word_idx)) as $wid){
                    $wids[$ixlen][] = $wid;
                    $result[$w[0]][] = "$ixlen*$wid";
                }
            }
        }
    }
    return $wids;
}
  463. function wordlen($w){
  464. $l = strlen($w);
  465. // If left alone, all chinese "words" will get put into w3.idx
  466. // So the "length" of a "word" is faked
  467. if(preg_match('/'.IDX_ASIAN2.'/u',$w))
  468. $l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF
  469. return $l;
  470. }
  471. function idx_indexLengths(&$filter){
  472. global $conf, $rootFolder;
  473. $dir = @opendir($rootFolder . '/data/index');
  474. if($dir===false)
  475. return array();
  476. $idx = array();
  477. if(is_array($filter)){
  478. while (($f = readdir($dir)) !== false) {
  479. if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
  480. $i = substr($f,1,-4);
  481. if (is_numeric($i) && isset($filter[(int)$i]))
  482. $idx[] = (int)$i;
  483. }
  484. }
  485. }else{
  486. // Exact match first.
  487. if(@file_exists($rootFolder . '/data/index'."/i$filter.idx"))
  488. $idx[] = $filter;
  489. while (($f = readdir($dir)) !== false) {
  490. if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
  491. $i = substr($f,1,-4);
  492. if (is_numeric($i) && $i > $filter)
  493. $idx[] = (int)$i;
  494. }
  495. }
  496. }
  497. closedir($dir);
  498. return $idx;
  499. }
  500. function utf8_stripspecials($string,$repl='',$additional=''){
  501. global $UTF8_SPECIAL_CHARS;
  502. global $UTF8_SPECIAL_CHARS2;
  503. static $specials = null;
  504. if(is_null($specials)){
  505. # $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
  506. $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  507. }
  508. return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
  509. }
  510. function idx_getIndex($pre, $wlen){
  511. global $conf, $rootFolder;
  512. $fn = $rootFolder . '/data/index'.'/'.$pre.$wlen.'.idx';
  513. if(!@file_exists($fn)) return array();
  514. return file($fn);
  515. }
  516. function idx_parseIndexLine(&$page_idx,$line){
  517. $result = array();
  518. $line = trim($line);
  519. if($line == '') return $result;
  520. $parts = explode(':',$line);
  521. foreach($parts as $part){
  522. if($part == '') continue;
  523. list($doc,$cnt) = explode('*',$part);
  524. if(!$cnt) continue;
  525. $doc = trim($page_idx[$doc]);
  526. if(!$doc) continue;
  527. // make sure the document still exists
  528. if(!page_exists($doc,'',false)) continue;
  529. $result[$doc] = $cnt;
  530. }
  531. return $result;
  532. }
  533. function page_exists($id,$rev='',$clean=true) {
  534. return @file_exists(wikiFN($id,$rev,$clean));
  535. }
  536. function wikiFN($raw_id,$rev='',$clean=true){
  537. global $conf, $rootFolder;
  538. global $cache_wikifn;
  539. $cache = & $cache_wikifn;
  540. if (isset($cache[$raw_id]) && isset($cache[$raw_id][$rev])) {
  541. return $cache[$raw_id][$rev];
  542. }
  543. $id = $raw_id;
  544. if ($clean) $id = cleanID($id);
  545. $id = str_replace(':','/',$id);
  546. if(empty($rev)){
  547. $fn = $rootFolder . '/data/pages'.'/'.utf8_encodeFN($id).'.txt';
  548. }else{
  549. $fn = $conf['olddir'].'/'.utf8_encodeFN($id).'.'.$rev.'.txt';
  550. if($conf['compression']){
  551. //test for extensions here, we want to read both compressions
  552. if (@file_exists($fn . '.gz')){
  553. $fn .= '.gz';
  554. }else if(@file_exists($fn . '.bz2')){
  555. $fn .= '.bz2';
  556. }else{
  557. //file doesnt exist yet, so we take the configured extension
  558. $fn .= '.' . $conf['compression'];
  559. }
  560. }
  561. }
  562. if (!isset($cache[$raw_id])) { $cache[$raw_id] = array(); }
  563. $cache[$raw_id][$rev] = $fn;
  564. return $fn;
  565. }
  566. function utf8_encodeFN($file,$safe=true){
  567. if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
  568. return $file;
  569. }
  570. $file = urlencode($file);
  571. $file = str_replace('%2F','/',$file);
  572. return $file;
  573. }
  574. function isHiddenPage($id){
  575. global $conf;
  576. if(empty($conf['hidepages'])) return false;
  577. if(preg_match('/'.$conf['hidepages'].'/ui',':'.$id)){
  578. return true;
  579. }
  580. return false;
  581. }