PageRenderTime 52ms CodeModel.GetById 11ms RepoModel.GetById 1ms app.codeStats 0ms

/WikiZam/maintenance/generateSitemap.php

https://github.com/Seizam/seizamcore
PHP | 466 lines | 218 code | 46 blank | 202 comment | 17 complexity | cc1b898c96a94e8de858bc1a0e5a0e83 MD5 | raw file
  1. <?php
  2. /**
  3. * Creates a sitemap for the site
  4. *
  5. * Copyright © 2005, Ævar Arnfjörð Bjarmason, Jens Frank <jeluf@gmx.de> and
  6. * Brion Vibber <brion@pobox.com>
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation; either version 2 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License along
  19. * with this program; if not, write to the Free Software Foundation, Inc.,
  20. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21. * http://www.gnu.org/copyleft/gpl.html
  22. *
  23. * @file
  24. * @ingroup Maintenance
  25. * @see http://www.sitemaps.org/
  26. * @see http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
  27. */
  28. require_once( dirname( __FILE__ ) . '/Maintenance.php' );
  29. class GenerateSitemap extends Maintenance {
  30. const GS_MAIN = -2;
  31. const GS_TALK = -1;
  32. /**
  33. * The maximum amount of urls in a sitemap file
  34. *
  35. * @link http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
  36. *
  37. * @var int
  38. */
  39. var $url_limit;
  40. /**
  41. * The maximum size of a sitemap file
  42. *
  43. * @link http://www.sitemaps.org/faq.php#faq_sitemap_size
  44. *
  45. * @var int
  46. */
  47. var $size_limit;
  48. /**
  49. * The path to prepend to the filename
  50. *
  51. * @var string
  52. */
  53. var $fspath;
  54. /**
  55. * The URL path to prepend to filenames in the index; should resolve to the same directory as $fspath
  56. *
  57. * @var string
  58. */
  59. var $urlpath;
  60. /**
  61. * Whether or not to use compression
  62. *
  63. * @var bool
  64. */
  65. var $compress;
  66. /**
  67. * The number of entries to save in each sitemap file
  68. *
  69. * @var array
  70. */
  71. var $limit = array();
  72. /**
  73. * Key => value entries of namespaces and their priorities
  74. *
  75. * @var array
  76. */
  77. var $priorities = array();
  78. /**
  79. * A one-dimensional array of namespaces in the wiki
  80. *
  81. * @var array
  82. */
  83. var $namespaces = array();
  84. /**
  85. * When this sitemap batch was generated
  86. *
  87. * @var string
  88. */
  89. var $timestamp;
  90. /**
  91. * A database slave object
  92. *
  93. * @var object
  94. */
  95. var $dbr;
  96. /**
  97. * A resource pointing to the sitemap index file
  98. *
  99. * @var resource
  100. */
  101. var $findex;
  102. /**
  103. * A resource pointing to a sitemap file
  104. *
  105. * @var resource
  106. */
  107. var $file;
  108. /**
  109. * Identifier to use in filenames, default $wgDBname
  110. *
  111. * @var string
  112. */
  113. private $identifier;
  114. /**
  115. * Constructor
  116. */
  117. public function __construct() {
  118. parent::__construct();
  119. $this->mDescription = "Creates a sitemap for the site";
  120. $this->addOption( 'fspath', 'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory', false, true );
  121. $this->addOption( 'urlpath', 'The URL path corresponding to --fspath, prepended to filenames in the index; defaults to an empty string', false, true );
  122. $this->addOption( 'compress', 'Compress the sitemap files, can take value yes|no, default yes', false, true );
  123. $this->addOption( 'identifier', 'What site identifier to use for the wiki, defaults to $wgDBname', false, true );
  124. }
  125. /**
  126. * Execute
  127. */
  128. public function execute() {
  129. $this->setNamespacePriorities();
  130. $this->url_limit = 50000;
  131. $this->size_limit = pow( 2, 20 ) * 10;
  132. $this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );
  133. $this->urlpath = $this->getOption( 'urlpath', "" );
  134. if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
  135. $this->urlpath .= '/';
  136. }
  137. $this->identifier = $this->getOption( 'identifier', wfWikiID() );
  138. $this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
  139. $this->dbr = wfGetDB( DB_SLAVE );
  140. $this->generateNamespaces();
  141. $this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );
  142. $this->findex = fopen( "{$this->fspath}sitemap-index-{$this->identifier}.xml", 'wb' );
  143. $this->main();
  144. }
  145. private function setNamespacePriorities() {
  146. // Custom main namespaces
  147. $this->priorities[self::GS_MAIN] = '0.5';
  148. // Custom talk namesspaces
  149. $this->priorities[self::GS_TALK] = '0.1';
  150. // MediaWiki standard namespaces
  151. $this->priorities[NS_MAIN] = '1.0';
  152. $this->priorities[NS_TALK] = '0.1';
  153. $this->priorities[NS_USER] = '0.5';
  154. $this->priorities[NS_USER_TALK] = '0.1';
  155. $this->priorities[NS_PROJECT] = '0.5';
  156. $this->priorities[NS_PROJECT_TALK] = '0.1';
  157. $this->priorities[NS_FILE] = '0.5';
  158. $this->priorities[NS_FILE_TALK] = '0.1';
  159. $this->priorities[NS_MEDIAWIKI] = '0.0';
  160. $this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
  161. $this->priorities[NS_TEMPLATE] = '0.0';
  162. $this->priorities[NS_TEMPLATE_TALK] = '0.1';
  163. $this->priorities[NS_HELP] = '0.5';
  164. $this->priorities[NS_HELP_TALK] = '0.1';
  165. $this->priorities[NS_CATEGORY] = '0.5';
  166. $this->priorities[NS_CATEGORY_TALK] = '0.1';
  167. }
  168. /**
  169. * Create directory if it does not exist and return pathname with a trailing slash
  170. */
  171. private static function init_path( $fspath ) {
  172. if ( !isset( $fspath ) ) {
  173. return null;
  174. }
  175. # Create directory if needed
  176. if ( $fspath && !is_dir( $fspath ) ) {
  177. wfMkdirParents( $fspath ) or die( "Can not create directory $fspath.\n" );
  178. }
  179. return realpath( $fspath ) . DIRECTORY_SEPARATOR ;
  180. }
  181. /**
  182. * Generate a one-dimensional array of existing namespaces
  183. */
  184. function generateNamespaces() {
  185. // Only generate for specific namespaces if $wgSitemapNamespaces is an array.
  186. global $wgSitemapNamespaces;
  187. if ( is_array( $wgSitemapNamespaces ) ) {
  188. $this->namespaces = $wgSitemapNamespaces;
  189. return;
  190. }
  191. $res = $this->dbr->select( 'page',
  192. array( 'page_namespace' ),
  193. array(),
  194. __METHOD__,
  195. array(
  196. 'GROUP BY' => 'page_namespace',
  197. 'ORDER BY' => 'page_namespace',
  198. )
  199. );
  200. foreach ( $res as $row )
  201. $this->namespaces[] = $row->page_namespace;
  202. }
  203. /**
  204. * Get the priority of a given namespace
  205. *
  206. * @param $namespace Integer: the namespace to get the priority for
  207. * @return String
  208. */
  209. function priority( $namespace ) {
  210. return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace );
  211. }
  212. /**
  213. * If the namespace isn't listed on the priority list return the
  214. * default priority for the namespace, varies depending on whether it's
  215. * a talkpage or not.
  216. *
  217. * @param $namespace Integer: the namespace to get the priority for
  218. * @return String
  219. */
  220. function guessPriority( $namespace ) {
  221. return MWNamespace::isMain( $namespace ) ? $this->priorities[self::GS_MAIN] : $this->priorities[self::GS_TALK];
  222. }
  223. /**
  224. * Return a database resolution of all the pages in a given namespace
  225. *
  226. * @param $namespace Integer: limit the query to this namespace
  227. * @return Resource
  228. */
  229. function getPageRes( $namespace ) {
  230. return $this->dbr->select( 'page',
  231. array(
  232. 'page_namespace',
  233. 'page_title',
  234. 'page_touched',
  235. ),
  236. array( 'page_namespace' => $namespace ),
  237. __METHOD__
  238. );
  239. }
  240. /**
  241. * Main loop
  242. */
  243. public function main() {
  244. global $wgContLang;
  245. fwrite( $this->findex, $this->openIndex() );
  246. foreach ( $this->namespaces as $namespace ) {
  247. $res = $this->getPageRes( $namespace );
  248. $this->file = false;
  249. $this->generateLimit( $namespace );
  250. $length = $this->limit[0];
  251. $i = $smcount = 0;
  252. $fns = $wgContLang->getFormattedNsText( $namespace );
  253. $this->output( "$namespace ($fns)\n" );
  254. foreach ( $res as $row ) {
  255. if ( $i++ === 0 || $i === $this->url_limit + 1 || $length + $this->limit[1] + $this->limit[2] > $this->size_limit ) {
  256. if ( $this->file !== false ) {
  257. $this->write( $this->file, $this->closeFile() );
  258. $this->close( $this->file );
  259. }
  260. $filename = $this->sitemapFilename( $namespace, $smcount++ );
  261. $this->file = $this->open( $this->fspath . $filename, 'wb' );
  262. $this->write( $this->file, $this->openFile() );
  263. fwrite( $this->findex, $this->indexEntry( $filename ) );
  264. $this->output( "\t$this->fspath$filename\n" );
  265. $length = $this->limit[0];
  266. $i = 1;
  267. }
  268. $title = Title::makeTitle( $row->page_namespace, $row->page_title );
  269. $date = wfTimestamp( TS_ISO_8601, $row->page_touched );
  270. $entry = $this->fileEntry( $title->getCanonicalURL(), $date, $this->priority( $namespace ) );
  271. $length += strlen( $entry );
  272. $this->write( $this->file, $entry );
  273. // generate pages for language variants
  274. if ( $wgContLang->hasVariants() ) {
  275. $variants = $wgContLang->getVariants();
  276. foreach ( $variants as $vCode ) {
  277. if ( $vCode == $wgContLang->getCode() ) continue; // we don't want default variant
  278. $entry = $this->fileEntry( $title->getCanonicalURL( '', $vCode ), $date, $this->priority( $namespace ) );
  279. $length += strlen( $entry );
  280. $this->write( $this->file, $entry );
  281. }
  282. }
  283. }
  284. if ( $this->file ) {
  285. $this->write( $this->file, $this->closeFile() );
  286. $this->close( $this->file );
  287. }
  288. }
  289. fwrite( $this->findex, $this->closeIndex() );
  290. fclose( $this->findex );
  291. }
  292. /**
  293. * gzopen() / fopen() wrapper
  294. *
  295. * @return Resource
  296. */
  297. function open( $file, $flags ) {
  298. return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
  299. }
  300. /**
  301. * gzwrite() / fwrite() wrapper
  302. */
  303. function write( &$handle, $str ) {
  304. if ( $this->compress )
  305. gzwrite( $handle, $str );
  306. else
  307. fwrite( $handle, $str );
  308. }
  309. /**
  310. * gzclose() / fclose() wrapper
  311. */
  312. function close( &$handle ) {
  313. if ( $this->compress )
  314. gzclose( $handle );
  315. else
  316. fclose( $handle );
  317. }
  318. /**
  319. * Get a sitemap filename
  320. *
  321. * @param $namespace Integer: the namespace
  322. * @param $count Integer: the count
  323. * @return String
  324. */
  325. function sitemapFilename( $namespace, $count ) {
  326. $ext = $this->compress ? '.gz' : '';
  327. return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
  328. }
  329. /**
  330. * Return the XML required to open an XML file
  331. *
  332. * @return string
  333. */
  334. function xmlHead() {
  335. return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
  336. }
  337. /**
  338. * Return the XML schema being used
  339. *
  340. * @return String
  341. */
  342. function xmlSchema() {
  343. return 'http://www.sitemaps.org/schemas/sitemap/0.9';
  344. }
  345. /**
  346. * Return the XML required to open a sitemap index file
  347. *
  348. * @return String
  349. */
  350. function openIndex() {
  351. return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
  352. }
  353. /**
  354. * Return the XML for a single sitemap indexfile entry
  355. *
  356. * @param $filename String: the filename of the sitemap file
  357. * @return String
  358. */
  359. function indexEntry( $filename ) {
  360. return
  361. "\t<sitemap>\n" .
  362. "\t\t<loc>{$this->urlpath}$filename</loc>\n" .
  363. "\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
  364. "\t</sitemap>\n";
  365. }
  366. /**
  367. * Return the XML required to close a sitemap index file
  368. *
  369. * @return String
  370. */
  371. function closeIndex() {
  372. return "</sitemapindex>\n";
  373. }
  374. /**
  375. * Return the XML required to open a sitemap file
  376. *
  377. * @return String
  378. */
  379. function openFile() {
  380. return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
  381. }
  382. /**
  383. * Return the XML for a single sitemap entry
  384. *
  385. * @param $url String: an RFC 2396 compliant URL
  386. * @param $date String: a ISO 8601 date
  387. * @param $priority String: a priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
  388. * @return String
  389. */
  390. function fileEntry( $url, $date, $priority ) {
  391. return
  392. "\t<url>\n" .
  393. "\t\t<loc>$url</loc>\n" .
  394. "\t\t<lastmod>$date</lastmod>\n" .
  395. "\t\t<priority>$priority</priority>\n" .
  396. "\t</url>\n";
  397. }
  398. /**
  399. * Return the XML required to close sitemap file
  400. *
  401. * @return String
  402. */
  403. function closeFile() {
  404. return "</urlset>\n";
  405. }
  406. /**
  407. * Populate $this->limit
  408. */
  409. function generateLimit( $namespace ) {
  410. // bug 17961: make a title with the longest possible URL in this namespace
  411. $title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );
  412. $this->limit = array(
  413. strlen( $this->openFile() ),
  414. strlen( $this->fileEntry( $title->getCanonicalURL(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
  415. strlen( $this->closeFile() )
  416. );
  417. }
  418. }
  419. $maintClass = "GenerateSitemap";
  420. require_once( RUN_MAINTENANCE_IF_MAIN );