PageRenderTime 44ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/exe/indexer.php

https://github.com/caillou/dokuwiki-jQuery
PHP | 449 lines | 289 code | 60 blank | 100 comment | 61 complexity | d2f63ca92d6034df72a1fe6c7113629c MD5 | raw file
  1. <?php
  2. /**
  3. * DokuWiki indexer
  4. *
  5. * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
  6. * @author Andreas Gohr <andi@splitbrain.org>
  7. */
  8. if(!defined('DOKU_INC')) define('DOKU_INC',dirname(__FILE__).'/../../');
  9. define('DOKU_DISABLE_GZIP_OUTPUT',1);
  10. require_once(DOKU_INC.'inc/init.php');
  11. session_write_close(); //close session
  12. if(!defined('NL')) define('NL',"\n");
  13. // Version tag used to force rebuild on upgrade
  14. define('INDEXER_VERSION', 2);
  15. // keep running after browser closes connection
  16. @ignore_user_abort(true);
  17. // check if user abort worked, if yes send output early
  18. $defer = !@ignore_user_abort() || $conf['broken_iua'];
  19. if(!$defer){
  20. sendGIF(); // send gif
  21. }
  22. $ID = cleanID($_REQUEST['id']);
  23. // Catch any possible output (e.g. errors)
  24. if(!isset($_REQUEST['debug'])) ob_start();
  25. // run one of the jobs
  26. $tmp = array(); // No event data
  27. $evt = new Doku_Event('INDEXER_TASKS_RUN', $tmp);
  28. if ($evt->advise_before()) {
  29. runIndexer() or
  30. metaUpdate() or
  31. runSitemapper() or
  32. sendDigest() or
  33. runTrimRecentChanges() or
  34. runTrimRecentChanges(true) or
  35. $evt->advise_after();
  36. }
  37. if($defer) sendGIF();
  38. if(!isset($_REQUEST['debug'])) ob_end_clean();
  39. exit;
  40. // --------------------------------------------------------------------
  41. /**
  42. * Trims the recent changes cache (or imports the old changelog) as needed.
  43. *
  44. * @param media_changes If the media changelog shall be trimmed instead of
  45. * the page changelog
  46. *
  47. * @author Ben Coburn <btcoburn@silicodon.net>
  48. */
  49. function runTrimRecentChanges($media_changes = false) {
  50. global $conf;
  51. $fn = ($media_changes ? $conf['media_changelog'] : $conf['changelog']);
  52. // Trim the Recent Changes
  53. // Trims the recent changes cache to the last $conf['changes_days'] recent
  54. // changes or $conf['recent'] items, which ever is larger.
  55. // The trimming is only done once a day.
  56. if (@file_exists($fn) &&
  57. (@filemtime($fn.'.trimmed')+86400)<time() &&
  58. !@file_exists($fn.'_tmp')) {
  59. @touch($fn.'.trimmed');
  60. io_lock($fn);
  61. $lines = file($fn);
  62. if (count($lines)<=$conf['recent']) {
  63. // nothing to trim
  64. io_unlock($fn);
  65. return false;
  66. }
  67. io_saveFile($fn.'_tmp', ''); // presave tmp as 2nd lock
  68. $trim_time = time() - $conf['recent_days']*86400;
  69. $out_lines = array();
  70. for ($i=0; $i<count($lines); $i++) {
  71. $log = parseChangelogLine($lines[$i]);
  72. if ($log === false) continue; // discard junk
  73. if ($log['date'] < $trim_time) {
  74. $old_lines[$log['date'].".$i"] = $lines[$i]; // keep old lines for now (append .$i to prevent key collisions)
  75. } else {
  76. $out_lines[$log['date'].".$i"] = $lines[$i]; // definitely keep these lines
  77. }
  78. }
  79. if (count($lines)==count($out_lines)) {
  80. // nothing to trim
  81. @unlink($fn.'_tmp');
  82. io_unlock($fn);
  83. return false;
  84. }
  85. // sort the final result, it shouldn't be necessary,
  86. // however the extra robustness in making the changelog cache self-correcting is worth it
  87. ksort($out_lines);
  88. $extra = $conf['recent'] - count($out_lines); // do we need extra lines do bring us up to minimum
  89. if ($extra > 0) {
  90. ksort($old_lines);
  91. $out_lines = array_merge(array_slice($old_lines,-$extra),$out_lines);
  92. }
  93. // save trimmed changelog
  94. io_saveFile($fn.'_tmp', implode('', $out_lines));
  95. @unlink($fn);
  96. if (!rename($fn.'_tmp', $fn)) {
  97. // rename failed so try another way...
  98. io_unlock($fn);
  99. io_saveFile($fn, implode('', $out_lines));
  100. @unlink($fn.'_tmp');
  101. } else {
  102. io_unlock($fn);
  103. }
  104. return true;
  105. }
  106. // nothing done
  107. return false;
  108. }
  109. /**
  110. * Runs the indexer for the current page
  111. *
  112. * @author Andreas Gohr <andi@splitbrain.org>
  113. */
  114. function runIndexer(){
  115. global $ID;
  116. global $conf;
  117. print "runIndexer(): started".NL;
  118. if(!$ID) return false;
  119. // check if indexing needed
  120. $idxtag = metaFN($ID,'.indexed');
  121. if(@file_exists($idxtag)){
  122. if(io_readFile($idxtag) >= INDEXER_VERSION){
  123. $last = @filemtime($idxtag);
  124. if($last > @filemtime(wikiFN($ID))){
  125. print "runIndexer(): index for $ID up to date".NL;
  126. return false;
  127. }
  128. }
  129. }
  130. // try to aquire a lock
  131. $lock = $conf['lockdir'].'/_indexer.lock';
  132. while(!@mkdir($lock,$conf['dmode'])){
  133. usleep(50);
  134. if(time()-@filemtime($lock) > 60*5){
  135. // looks like a stale lock - remove it
  136. @rmdir($lock);
  137. print "runIndexer(): stale lock removed".NL;
  138. }else{
  139. print "runIndexer(): indexer locked".NL;
  140. return false;
  141. }
  142. }
  143. if($conf['dperm']) chmod($lock, $conf['dperm']);
  144. // upgrade to version 2
  145. if (!@file_exists($conf['indexdir'].'/pageword.idx'))
  146. idx_upgradePageWords();
  147. // do the work
  148. idx_addPage($ID);
  149. // we're finished - save and free lock
  150. io_saveFile(metaFN($ID,'.indexed'),INDEXER_VERSION);
  151. @rmdir($lock);
  152. print "runIndexer(): finished".NL;
  153. return true;
  154. }
  155. /**
  156. * Will render the metadata for the page if not exists yet
  157. *
  158. * This makes sure pages which are created from outside DokuWiki will
  159. * gain their data when viewed for the first time.
  160. */
  161. function metaUpdate(){
  162. global $ID;
  163. print "metaUpdate(): started".NL;
  164. if(!$ID) return false;
  165. $file = metaFN($ID, '.meta');
  166. echo "meta file: $file".NL;
  167. // rendering needed?
  168. if (@file_exists($file)) return false;
  169. if (!@file_exists(wikiFN($ID))) return false;
  170. global $conf;
  171. // gather some additional info from changelog
  172. $info = io_grep($conf['changelog'],
  173. '/^(\d+)\t(\d+\.\d+\.\d+\.\d+)\t'.preg_quote($ID,'/').'\t([^\t]+)\t([^\t\n]+)/',
  174. 0,true);
  175. $meta = array();
  176. if(!empty($info)){
  177. $meta['date']['created'] = $info[0][1];
  178. foreach($info as $item){
  179. if($item[4] != '*'){
  180. $meta['date']['modified'] = $item[1];
  181. if($item[3]){
  182. $meta['contributor'][$item[3]] = $item[3];
  183. }
  184. }
  185. }
  186. }
  187. $meta = p_render_metadata($ID, $meta);
  188. io_saveFile($file, serialize($meta));
  189. echo "metaUpdate(): finished".NL;
  190. return true;
  191. }
  192. /**
  193. * Builds a Google Sitemap of all public pages known to the indexer
  194. *
  195. * The map is placed in the root directory named sitemap.xml.gz - This
  196. * file needs to be writable!
  197. *
  198. * @author Andreas Gohr
  199. * @link https://www.google.com/webmasters/sitemaps/docs/en/about.html
  200. */
  201. function runSitemapper(){
  202. global $conf;
  203. print "runSitemapper(): started".NL;
  204. if(!$conf['sitemap']) return false;
  205. if($conf['compression'] == 'bz2' || $conf['compression'] == 'gz'){
  206. $sitemap = 'sitemap.xml.gz';
  207. }else{
  208. $sitemap = 'sitemap.xml';
  209. }
  210. print "runSitemapper(): using $sitemap".NL;
  211. if(@file_exists(DOKU_INC.$sitemap)){
  212. if(!is_writable(DOKU_INC.$sitemap)) return false;
  213. }else{
  214. if(!is_writable(DOKU_INC)) return false;
  215. }
  216. if(@filesize(DOKU_INC.$sitemap) &&
  217. @filemtime(DOKU_INC.$sitemap) > (time()-($conf['sitemap']*60*60*24))){
  218. print 'runSitemapper(): Sitemap up to date'.NL;
  219. return false;
  220. }
  221. $pages = idx_getIndex('page', '');
  222. print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL;
  223. // build the sitemap
  224. ob_start();
  225. print '<?xml version="1.0" encoding="UTF-8"?>'.NL;
  226. print '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'.NL;
  227. foreach($pages as $id){
  228. $id = trim($id);
  229. $file = wikiFN($id);
  230. //skip hidden, non existing and restricted files
  231. if(isHiddenPage($id)) continue;
  232. $date = @filemtime($file);
  233. if(!$date) continue;
  234. if(auth_aclcheck($id,'','') < AUTH_READ) continue;
  235. print ' <url>'.NL;
  236. print ' <loc>'.wl($id,'',true).'</loc>'.NL;
  237. print ' <lastmod>'.date_iso8601($date).'</lastmod>'.NL;
  238. print ' </url>'.NL;
  239. }
  240. print '</urlset>'.NL;
  241. $data = ob_get_contents();
  242. ob_end_clean();
  243. //save the new sitemap
  244. io_saveFile(DOKU_INC.$sitemap,$data);
  245. //ping search engines...
  246. $http = new DokuHTTPClient();
  247. $http->timeout = 8;
  248. //ping google
  249. print 'runSitemapper(): pinging google'.NL;
  250. $url = 'http://www.google.com/webmasters/sitemaps/ping?sitemap=';
  251. $url .= urlencode(DOKU_URL.$sitemap);
  252. $resp = $http->get($url);
  253. if($http->error) print 'runSitemapper(): '.$http->error.NL;
  254. print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL;
  255. //ping yahoo
  256. print 'runSitemapper(): pinging yahoo'.NL;
  257. $url = 'http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=dokuwiki&url=';
  258. $url .= urlencode(DOKU_URL.$sitemap);
  259. $resp = $http->get($url);
  260. if($http->error) print 'runSitemapper(): '.$http->error.NL;
  261. print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL;
  262. //ping microsoft
  263. print 'runSitemapper(): pinging microsoft'.NL;
  264. $url = 'http://www.bing.com/webmaster/ping.aspx?siteMap=';
  265. $url .= urlencode(DOKU_URL.$sitemap);
  266. $resp = $http->get($url);
  267. if($http->error) print 'runSitemapper(): '.$http->error.NL;
  268. print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL;
  269. print 'runSitemapper(): finished'.NL;
  270. return true;
  271. }
  272. /**
  273. * Send digest and list mails for all subscriptions which are in effect for the
  274. * current page
  275. *
  276. * @author Adrian Lang <lang@cosmocode.de>
  277. */
  278. function sendDigest() {
  279. echo 'sendDigest(): start'.NL;
  280. global $ID;
  281. global $conf;
  282. if (!$conf['subscribers']) {
  283. return;
  284. }
  285. $subscriptions = subscription_find($ID, array('style' => '(digest|list)',
  286. 'escaped' => true));
  287. global $auth;
  288. global $lang;
  289. global $conf;
  290. global $USERINFO;
  291. // remember current user info
  292. $olduinfo = $USERINFO;
  293. $olduser = $_SERVER['REMOTE_USER'];
  294. foreach($subscriptions as $id => $users) {
  295. if (!subscription_lock($id)) {
  296. continue;
  297. }
  298. foreach($users as $data) {
  299. list($user, $style, $lastupdate) = $data;
  300. $lastupdate = (int) $lastupdate;
  301. if ($lastupdate + $conf['subscribe_time'] > time()) {
  302. // Less than the configured time period passed since last
  303. // update.
  304. continue;
  305. }
  306. // Work as the user to make sure ACLs apply correctly
  307. $USERINFO = $auth->getUserData($user);
  308. $_SERVER['REMOTE_USER'] = $user;
  309. if ($USERINFO === false) {
  310. continue;
  311. }
  312. if (substr($id, -1, 1) === ':') {
  313. // The subscription target is a namespace
  314. $changes = getRecentsSince($lastupdate, null, getNS($id));
  315. } else {
  316. if(auth_quickaclcheck($id) < AUTH_READ) continue;
  317. $meta = p_get_metadata($id);
  318. $changes = array($meta['last_change']);
  319. }
  320. // Filter out pages only changed in small and own edits
  321. $change_ids = array();
  322. foreach($changes as $rev) {
  323. $n = 0;
  324. while (!is_null($rev) && $rev['date'] >= $lastupdate &&
  325. ($_SERVER['REMOTE_USER'] === $rev['user'] ||
  326. $rev['type'] === DOKU_CHANGE_TYPE_MINOR_EDIT)) {
  327. $rev = getRevisions($rev['id'], $n++, 1);
  328. $rev = (count($rev) > 0) ? $rev[0] : null;
  329. }
  330. if (!is_null($rev) && $rev['date'] >= $lastupdate) {
  331. // Some change was not a minor one and not by myself
  332. $change_ids[] = $rev['id'];
  333. }
  334. }
  335. if ($style === 'digest') {
  336. foreach($change_ids as $change_id) {
  337. subscription_send_digest($USERINFO['mail'], $change_id,
  338. $lastupdate);
  339. }
  340. } elseif ($style === 'list') {
  341. subscription_send_list($USERINFO['mail'], $change_ids, $id);
  342. }
  343. // TODO: Handle duplicate subscriptions.
  344. // Update notification time.
  345. subscription_set($user, $id, $style, time(), true);
  346. }
  347. subscription_unlock($id);
  348. }
  349. // restore current user info
  350. $USERINFO = $olduinfo;
  351. $_SERVER['REMOTE_USER'] = $olduser;
  352. }
  353. /**
  354. * Formats a timestamp as ISO 8601 date
  355. *
  356. * @author <ungu at terong dot com>
  357. * @link http://www.php.net/manual/en/function.date.php#54072
  358. */
  359. function date_iso8601($int_date) {
  360. //$int_date: current date in UNIX timestamp
  361. $date_mod = date('Y-m-d\TH:i:s', $int_date);
  362. $pre_timezone = date('O', $int_date);
  363. $time_zone = substr($pre_timezone, 0, 3).":".substr($pre_timezone, 3, 2);
  364. $date_mod .= $time_zone;
  365. return $date_mod;
  366. }
  367. /**
  368. * Just send a 1x1 pixel blank gif to the browser
  369. *
  370. * @author Andreas Gohr <andi@splitbrain.org>
  371. * @author Harry Fuecks <fuecks@gmail.com>
  372. */
  373. function sendGIF(){
  374. if(isset($_REQUEST['debug'])){
  375. header('Content-Type: text/plain');
  376. return;
  377. }
  378. $img = base64_decode('R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7');
  379. header('Content-Type: image/gif');
  380. header('Content-Length: '.strlen($img));
  381. header('Connection: Close');
  382. print $img;
  383. flush();
  384. // Browser should drop connection after this
  385. // Thinks it's got the whole image
  386. }
  387. //Setup VIM: ex: et ts=4 enc=utf-8 :
  388. // No trailing PHP closing tag - no output please!
  389. // See Note at http://www.php.net/manual/en/language.basic-syntax.instruction-separation.php