PageRenderTime 46ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/inc/indexer.php

http://github.com/splitbrain/dokuwiki
PHP | 369 lines | 212 code | 27 blank | 130 comment | 60 complexity | 98cbc3a494df63a8eabe2e7237f70ff7 MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, GPL-2.0
  1. <?php
  2. /**
  3. * Functions to create the fulltext search index
  4. *
  5. * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
  6. * @author Andreas Gohr <andi@splitbrain.org>
  7. * @author Tom N Harris <tnharris@whoopdedo.org>
  8. */
  9. use dokuwiki\Extension\Event;
  10. use dokuwiki\Search\Indexer;
  // Version tag of the index format; changing it forces a rebuild on upgrade.
  define('INDEXER_VERSION', 8);

  // Minimum token length to use in the index (this does not apply to numeric
  // tokens). Only set the default when nothing defined the constant earlier.
  defined('IDX_MINWORDLENGTH') || define('IDX_MINWORDLENGTH', 2);
  /**
   * Version of the indexer taking into consideration the external tokenizer.
   * The indexer is only compatible with data written by the same version.
   *
   * The result is computed once per request and cached in a static variable.
   * It starts as INDEXER_VERSION; every entry that event handlers add to the
   * event data is appended as "+name=version", sorted by name.
   *
   * @triggers INDEXER_VERSION_GET
   * Plugins that modify what gets indexed should hook this event and
   * add their version info to the event data like so:
   *     $data[$plugin_name] = $plugin_version;
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   * @author Michael Hamann <michael@content-space.de>
   *
   * @return int|string INDEXER_VERSION alone, or a string with plugin versions appended
   */
  function idx_get_version()
  {
      static $indexer_version = null;

      // Strict check: only "not computed yet" triggers the event, never a
      // falsy-but-valid cached version.
      if ($indexer_version === null) {
          $version = INDEXER_VERSION;

          // DokuWiki version is included for the convenience of plugins
          $data = array('dokuwiki' => $version);
          Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

          // the core version is already the first component of $version,
          // so drop it before sorting the plugin entries
          unset($data['dokuwiki']);
          ksort($data);
          foreach ($data as $plugin => $vers) {
              $version .= '+' . $plugin . '=' . $vers;
          }

          $indexer_version = $version;
      }

      return $indexer_version;
  }
  /**
   * Measure the length of a string.
   *
   * Differs from strlen in handling of asian characters: starting from the
   * byte length, every byte in the range 0xE2-0xEF adds (byte value - 0xE1)
   * to the result.
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   *
   * @param string $w
   * @return int
   */
  function wordlen($w)
  {
      $length = strlen($w);

      // If left alone, all chinese "words" will get put into w3.idx
      // So the "length" of a "word" is faked
      $hits = preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes);
      if (!$hits) {
          return $length;
      }

      foreach ($leadbytes[0] as $byte) {
          $length += ord($byte) - 0xE1;
      }
      return $length;
  }
  /**
   * Create an instance of the indexer.
   *
   * The object is created on the first call and the same object is handed
   * out on every later call.
   *
   * @return Indexer an Indexer
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   */
  function idx_get_indexer()
  {
      static $instance = null;

      if ($instance === null) {
          $instance = new Indexer();
      }

      return $instance;
  }
  /**
   * Returns words that will be ignored.
   *
   * Reads inc/lang/<configured language>/stopwords.txt below DOKU_INC, one
   * word per line. The list is loaded once and cached in a static variable;
   * it is returned by reference. A missing or unreadable file yields an
   * empty list.
   *
   * @return array list of stop words
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   */
  function & idx_get_stopwords()
  {
      static $stopwords = null;

      if ($stopwords === null) {
          global $conf;

          $stopwords = array();
          $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
          if (is_file($swfile) && is_readable($swfile)) {
              $lines = file($swfile, FILE_IGNORE_NEW_LINES);
              // file() returns false on failure; never cache that as the list
              if ($lines !== false) {
                  $stopwords = $lines;
              }
          }
      }

      return $stopwords;
  }
  /**
   * Adds/updates the search index for the given page
   *
   * Locking is handled internally.
   *
   * Steps, in order:
   *  - a page that no longer exists is removed from the index (if it has an
   *    ".indexed" tag file) and the tag file is deleted
   *  - unless $force is set, a page whose tag file holds the current indexer
   *    version and is newer than the page file is skipped
   *  - a page whose 'internal index' metadata is false is removed from the index
   *  - otherwise the page text and selected metadata are handed to the indexer
   *    and the tag file is written with the current indexer version
   *
   * @triggers INDEXER_PAGE_ADD with the keys 'page', 'body', 'metadata', 'pid'
   *
   * @param string  $page    name of the page to index
   * @param boolean $verbose print status messages
   * @param boolean $force   force reindexing even when the index is up to date
   * @return string|boolean the function completed successfully; in verbose mode
   *                        true is returned once indexing was attempted, whatever
   *                        the indexer result was
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   */
  function idx_addPage($page, $verbose=false, $force=false)
  {
      $idxtag = metaFN($page, '.indexed');

      // check if page was deleted but is still in the index
      if (!page_exists($page)) {
          if (!file_exists($idxtag)) {
              if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
              return false;
          }
          $Indexer = idx_get_indexer();
          $result = $Indexer->deletePage($page);
          if ($result === "locked") {
              if ($verbose) print("Indexer: locked".DOKU_LF);
              return false;
          }
          @unlink($idxtag);
          return $result;
      }

      // check if indexing needed
      if (!$force && file_exists($idxtag)) {
          // loose comparison on purpose: the tag file holds a string while
          // idx_get_version() may return an integer
          if (trim(io_readFile($idxtag)) == idx_get_version()) {
              $last = @filemtime($idxtag);
              if ($last > @filemtime(wikiFN($page))) {
                  if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                  return false;
              }
          }
      }

      // indexing explicitly disabled for this page: make sure it is not in the index
      $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
      if ($indexenabled === false) {
          $result = false;
          if (file_exists($idxtag)) {
              $Indexer = idx_get_indexer();
              $result = $Indexer->deletePage($page);
              if ($result === "locked") {
                  if ($verbose) print("Indexer: locked".DOKU_LF);
                  return false;
              }
              @unlink($idxtag);
          }
          if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
          return $result;
      }

      $Indexer = idx_get_indexer();
      $pid = $Indexer->getPID($page);
      if ($pid === false) {
          if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
          return false;
      }

      // collect what gets indexed; anything that is not an array of
      // references/media is treated as "none" instead of being passed to array_keys()
      $body = '';
      $metadata = array();
      $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

      $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
      $metadata['relation_references'] = is_array($references) ? array_keys($references) : array();

      $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
      $metadata['relation_media'] = is_array($media) ? array_keys($media) : array();

      $data = compact('page', 'body', 'metadata', 'pid');
      $evt = new Event('INDEXER_PAGE_ADD', $data);
      if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
      $evt->advise_after();
      unset($evt);

      // Read back only the values used below. extract() would let any extra
      // key added by an event handler overwrite unrelated locals such as
      // $Indexer or $verbose.
      if (array_key_exists('page', $data))     $page     = $data['page'];
      if (array_key_exists('body', $data))     $body     = $data['body'];
      if (array_key_exists('metadata', $data)) $metadata = $data['metadata'];

      $result = $Indexer->addPageWords($page, $body);
      if ($result === "locked") {
          if ($verbose) print("Indexer: locked".DOKU_LF);
          return false;
      }

      if ($result) {
          $result = $Indexer->addMetaKeys($page, $metadata);
          if ($result === "locked") {
              if ($verbose) print("Indexer: locked".DOKU_LF);
              return false;
          }
      }

      // remember which indexer version produced the index for this page
      if ($result) {
          io_saveFile(metaFN($page, '.indexed'), idx_get_version());
      }

      // NOTE(review): verbose mode reports true even when $result is falsy.
      // Kept unchanged because callers may rely on it - confirm before changing.
      if ($verbose) {
          print("Indexer: finished".DOKU_LF);
          return true;
      }
      return $result;
  }
  /**
   * Find tokens in the fulltext index
   *
   * Takes an array of words and will return a list of matching
   * pages for each one. The work is delegated to the shared indexer
   * object's lookup() method.
   *
   * Important: No ACL checking is done here! All results are
   * returned, regardless of permissions
   *
   * @param array $words list of words to search for (passed by reference)
   * @return array list of pages found, associated with the search terms
   */
  function idx_lookup(&$words)
  {
      return idx_get_indexer()->lookup($words);
  }
  /**
   * Split a string into tokens
   *
   * Thin wrapper around the tokenizer() method of the shared indexer object;
   * both arguments are passed through unchanged.
   *
   * @param string $string
   * @param bool $wc
   *
   * @return array
   */
  function idx_tokenizer($string, $wc=false)
  {
      return idx_get_indexer()->tokenizer($string, $wc);
  }
  222. /* For compatibility */
  /**
   * Read the list of words in an index (if it exists).
   *
   * The file read is <indexdir>/<idx><suffix>.idx. Lines are returned as
   * read by file() without flags, so they keep their line endings.
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   *
   * @param string $idx
   * @param string $suffix
   * @return array lines of the index file, empty when the file does not exist
   */
  function idx_getIndex($idx, $suffix)
  {
      global $conf;

      $path = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';

      return file_exists($path) ? file($path) : array();
  }
  /**
   * Get the list of lengths indexed in the wiki.
   *
   * Read the index directory or a cache file and returns
   * a sorted array of lengths of the words used in the wiki.
   *
   * A length is taken from every file named i<number>.idx in the index
   * directory. When $conf['readdircache'] is non-zero the result is cached in
   * <indexdir>/lengths.idx and reused while that file is younger than
   * $conf['readdircache'] seconds. Failing to write the cache file is not an
   * error; the freshly scanned list is returned anyway.
   *
   * @author YoBoY <yoboy.leguesh@gmail.com>
   *
   * @return array sorted list of integers, empty when the directory cannot be opened
   */
  function idx_listIndexLengths()
  {
      global $conf;

      $cachefile = $conf['indexdir'] . '/lengths.idx';
      $docache = ($conf['readdircache'] != 0);

      // try the cache file first, as long as it is still fresh
      if ($docache) {
          clearstatcache();
          if (file_exists($cachefile)
              && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
              $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
              if ($lengths !== false) {
                  return array_map('intval', $lengths);
              }
          }
      }

      // no usable cache: scan the index directory for i<number>.idx files
      $dir = @opendir($conf['indexdir']);
      if ($dir === false) {
          return array();
      }

      $idx = array();
      while (($f = readdir($dir)) !== false) {
          if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
              $i = substr($f, 1, -4);
              if (is_numeric($i)) {
                  $idx[] = (int)$i;
              }
          }
      }
      closedir($dir);
      sort($idx);

      // save this in a file; only touch the handle when fopen() succeeded,
      // passing false to fwrite()/fclose() is a TypeError on PHP 8
      if ($docache) {
          $handle = @fopen($cachefile, 'w');
          if ($handle !== false) {
              @fwrite($handle, implode("\n", $idx));
              @fclose($handle);
          }
      }

      return $idx;
  }
  /**
   * Get the word lengths that have been indexed.
   *
   * Reads the index directory and returns an array of lengths
   * that there are indices for.
   *
   * With an array filter, its keys are the candidate lengths and only those
   * with an existing <indexdir>/i<length>.idx file are returned. With any
   * other filter, all lengths reported by idx_listIndexLengths() that are
   * greater than or equal to the filter (compared as integers) are returned.
   *
   * @author YoBoY <yoboy.leguesh@gmail.com>
   *
   * @param array|int $filter
   * @return array
   */
  function idx_indexLengths($filter)
  {
      global $conf;

      if (!is_array($filter)) {
          // keep all the values equal or superior
          $matching = array();
          foreach (idx_listIndexLengths() as $length) {
              if ((int)$length >= (int)$filter) {
                  $matching[] = $length;
              }
          }
          return $matching;
      }

      // testing if index files exist only
      $prefix = $conf['indexdir'] . "/i";
      $present = array();
      foreach (array_keys($filter) as $length) {
          if (file_exists($prefix . $length . '.idx')) {
              $present[] = $length;
          }
      }
      return $present;
  }
  /**
   * Clean a name of a key for use as a file name.
   *
   * Romanizes non-latin characters, then strips away anything that's
   * not a letter, number, or underscore.
   *
   * In detail: the name is cast to string, trimmed and romanized; each run of
   * spaces, dots, slashes, colons and hyphens becomes a single underscore;
   * every remaining character outside A-Z, a-z, 0-9 and "_" is removed; the
   * result is lower-cased.
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   *
   * @param string $name
   * @return string
   */
  function idx_cleanName($name)
  {
      $romanized = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));

      // Inside the single-quoted pattern "\\" is one backslash that escapes
      // the colon, so a backslash in the name is not a separator here; it is
      // removed by the second replacement instead.
      $underscored = preg_replace('#[ \./\\:-]+#', '_', $romanized);
      $stripped = preg_replace('/[^A-Za-z0-9_]/', '', $underscored);

      return strtolower($stripped);
  }
  342. //Setup VIM: ex: et ts=4 :