PageRenderTime 46ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/maintenance/dumpInterwiki.php

https://github.com/daevid/MWFork
PHP | 251 lines | 162 code | 37 blank | 52 comment | 24 complexity | fe1a10fbf4fc0c6466e616d070d0230f MD5 | raw file
  1. <?php
  2. /**
  3. * Build constant slightly compact database of interwiki prefixes
  4. * Wikimedia specific!
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License along
  17. * with this program; if not, write to the Free Software Foundation, Inc.,
  18. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19. * http://www.gnu.org/copyleft/gpl.html
  20. *
  21. * @file
  22. * @todo document
  23. * @ingroup Maintenance
  24. * @ingroup Wikimedia
  25. */
  26. require_once( dirname( __FILE__ ) . '/Site.php' );
  27. require_once( dirname( __FILE__ ) . '/Maintenance.php' );
  /**
   * Maintenance script that builds a compact key/value dump of interwiki
   * prefixes, either into a Cdb file (option 'o') or as plain text output.
   *
   * Keys have the form "<source>:<prefix>"; a "__list:<source>" key holds the
   * space-separated, sorted list of all prefixes emitted for that source.
   *
   * @ingroup Maintenance
   * @ingroup Wikimedia
   */
  class DumpInterwiki extends Maintenance {

      /** @var array Language codes read from the 'langlist' file */
      public $langlist = array();

      /** @var array Database names read from the 'dblist' file */
      public $dblist = array();

      /** @var array Special-case database names, flipped so that the names are the keys */
      public $specials = array();

      /** @var mixed Writer returned by CdbWriter::open(), or false to print entries instead */
      public $dbFile = false;

      /** @var array Alias language code => real language code */
      public $languageAliases = array();

      /** @var array Source => ( original prefix => replacement prefix ) */
      public $prefixRewrites = array();

      /** @var array Source => ( prefix => 1 ); every prefix emitted so far, grouped by source */
      public $prefixLists = array();

      /**
       * Register the description and the command line options of the script.
       */
      public function __construct() {
          parent::__construct();
          $this->mDescription = "Build constant slightly compact database of interwiki prefixes.";
          $this->addOption( 'langlist', 'File with one language code per line', false, true );
          $this->addOption( 'dblist', 'File with one db per line', false, true );
          $this->addOption( 'specialdbs', "File with one 'special' db per line", false, true );
          $this->addOption( 'o', 'Cdb output file', false, true );
      }

      /**
       * Entry point: load the three list files, open the optional Cdb writer
       * and generate the dump.
       */
      public function execute() {
          # List of language prefixes likely to be found in multi-language sites
          $this->langlist = $this->readListFile( 'langlist', "/home/wikipedia/common/langlist" );
          # List of all database names
          $this->dblist = $this->readListFile( 'dblist', "/home/wikipedia/common/all.dblist" );
          # Special-case databases, flipped so membership can be tested with isset()
          $this->specials = array_flip(
              $this->readListFile( 'specialdbs', "/home/wikipedia/common/special.dblist" )
          );

          # Without an output file every entry is printed instead of stored.
          # NOTE(review): the writer is never closed explicitly in this script;
          # presumably it finalises the file on destruction -- confirm.
          $this->dbFile = $this->hasOption( 'o' )
              ? CdbWriter::open( $this->getOption( 'o' ) )
              : false;

          $this->getRebuildInterwikiDump();
      }

      /**
       * Read a "one item per line" file named by a command line option.
       *
       * @param $option String: name of the option holding the file path
       * @param $default String: path used when the option was not given
       * @return Array of trimmed lines; empty if the file could not be read
       */
      private function readListFile( $option, $default ) {
          $path = $this->getOption( $option, $default );
          $lines = file( $path );
          if ( $lines === false ) {
              # Same call shape as the "m:Interwiki_map not found" failure below
              $this->error( "Unable to read $option file $path", true );
              $lines = array();
          }
          return array_map( 'trim', $lines );
      }

      /**
       * Generate every entry of the dump: the global interwiki map fetched from
       * meta, the interlanguage links of each multi-language site, the per-database
       * site and lateral links, the extra links, and finally the prefix lists.
       */
      public function getRebuildInterwikiDump() {
          global $wgContLang;

          # Multi-language sites
          # db suffix => db suffix, iw prefix, hostname
          $sites = array(
              'wiki' => new Site( 'wiki', 'w', 'wikipedia.org' ),
              'wiktionary' => new Site( 'wiktionary', 'wikt', 'wiktionary.org' ),
              'wikiquote' => new Site( 'wikiquote', 'q', 'wikiquote.org' ),
              'wikibooks' => new Site( 'wikibooks', 'b', 'wikibooks.org' ),
              'wikinews' => new Site( 'wikinews', 'n', 'wikinews.org' ),
              'wikisource' => new Site( 'wikisource', 's', 'wikisource.org' ),
              'wikimedia' => new Site( 'wikimedia', 'chapter', 'wikimedia.org' ),
              'wikiversity' => new Site( 'wikiversity', 'v', 'wikiversity.org' ),
          );

          # Extra interwiki links that can't be in the intermap for some reason
          # (positional form: prefix, url, local)
          $extraLinks = array(
              array( 'm', 'http://meta.wikimedia.org/wiki/$1', 1 ),
              array( 'meta', 'http://meta.wikimedia.org/wiki/$1', 1 ),
              array( 'sep11', 'http://sep11.wikipedia.org/wiki/$1', 1 ),
          );

          # Language aliases, usually configured as redirects to the real wiki in apache
          # Interlanguage links are made directly to the real wiki
          $this->languageAliases = array(
              'zh-cn' => 'zh',
              'zh-tw' => 'zh',
              'dk' => 'da',
              'nb' => 'no',
          );

          # Special case prefix rewrites, for the benefit of Swedish which uses s:t
          # as an abbreviation for saint
          $this->prefixRewrites = array(
              'svwiki' => array( 's' => 'src' ),
          );

          # Reserved prefixes: language codes, language aliases and the lateral
          # prefixes of the sites. The global map must not override any of them.
          $reserved = array();
          foreach ( $this->langlist as $lang ) {
              $reserved[$lang] = 1;
          }
          foreach ( $this->languageAliases as $alias => $lang ) {
              $reserved[$alias] = 1;
          }
          foreach ( $sites as $site ) {
              $reserved[$site->lateral] = 1;
          }

          # Extract the intermap from meta
          $intermap = Http::get( 'http://meta.wikimedia.org/w/index.php?title=Interwiki_map&action=raw', 30 );
          $lines = array_map( 'trim', explode( "\n", trim( $intermap ) ) );
          if ( !$lines || count( $lines ) < 2 ) {
              $this->error( "m:Interwiki_map not found", true );
          }

          # Global interwiki map: rows of the form "| prefix || url"
          foreach ( $lines as $line ) {
              if ( !preg_match( '/^\|\s*(.*?)\s*\|\|\s*(.*?)\s*$/', $line, $row ) ) {
                  continue;
              }
              $prefix = str_replace( ' ', '_', $wgContLang->lc( $row[1] ) );
              $url = $row[2];
              # NOTE(review): wikinews and wikiversity are sites above but are not
              # matched here -- confirm whether that is intended.
              $local = preg_match( '/(wikipedia|wiktionary|wikisource|wikiquote|wikibooks|wikimedia)\.org/', $url )
                  ? 1
                  : 0;
              if ( empty( $reserved[$prefix] ) ) {
                  $this->makeLink(
                      array( 'iw_prefix' => $prefix, 'iw_url' => $url, 'iw_local' => $local ),
                      "__global"
                  );
              }
          }

          # Exclude Wikipedia for Wikipedia: a blank entry for the prefix
          $this->makeLink(
              array( 'iw_prefix' => 'wikipedia', 'iw_url' => null, 'iw_local' => null ),
              "_wiki"
          );

          # Multilanguage sites
          foreach ( $sites as $site ) {
              $this->makeLanguageLinks( $site, "_" . $site->suffix );
          }

          foreach ( $this->dblist as $db ) {
              if ( isset( $this->specials[$db] ) ) {
                  # Special wiki
                  # Has interwiki links and interlanguage links to wikipedia
                  $this->makeLink( array( 'iw_prefix' => $db, 'iw_url' => "wiki" ), "__sites" );
                  # Links to multilanguage sites
                  foreach ( $sites as $targetSite ) {
                      $this->makeLink(
                          array(
                              'iw_prefix' => $targetSite->lateral,
                              'iw_url' => $targetSite->getURL( 'en' ),
                              'iw_local' => 1,
                          ),
                          $db
                      );
                  }
                  continue;
              }

              # Find out which site this DB belongs to; whatever precedes the
              # matching suffix is taken as the language code
              $site = false;
              $dbParts = array();
              foreach ( $sites as $candidateSite ) {
                  $pattern = '/(.*)' . preg_quote( $candidateSite->suffix, '/' ) . '$/';
                  if ( preg_match( $pattern, $db, $dbParts ) ) {
                      $site = $candidateSite;
                      break;
                  }
              }
              # Validate before touching $site so that an unknown database is
              # reported and skipped instead of producing an empty site entry
              if ( !$site ) {
                  $this->error( "Invalid database $db\n" );
                  continue;
              }
              $this->makeLink( array( 'iw_prefix' => $db, 'iw_url' => $site->suffix ), "__sites" );
              $lang = $dbParts[1];

              # Lateral links
              foreach ( $sites as $targetSite ) {
                  if ( $targetSite->suffix != $site->suffix ) {
                      $this->makeLink(
                          array(
                              'iw_prefix' => $targetSite->lateral,
                              'iw_url' => $targetSite->getURL( $lang ),
                              'iw_local' => 1,
                          ),
                          $db
                      );
                  }
              }

              if ( $site->suffix == "wiki" ) {
                  $this->makeLink(
                      array(
                          'iw_prefix' => 'w',
                          'iw_url' => "http://en.wikipedia.org/wiki/$1",
                          'iw_local' => 1,
                      ),
                      $db
                  );
              }
          }

          foreach ( $extraLinks as $link ) {
              $this->makeLink( $link, "__global" );
          }

          # List prefixes for each source
          foreach ( $this->prefixLists as $source => $hash ) {
              $list = array_keys( $hash );
              sort( $list );
              $joined = implode( ' ', $list );
              if ( $this->dbFile ) {
                  $this->dbFile->set( "__list:{$source}", $joined );
              } else {
                  # Same output channel as makeLink() uses for the entries
                  $this->output( "__list:{$source} {$joined}\n" );
              }
          }
      }

      /**
       * Emit the interlanguage links of one site: one link per language code in
       * the language list and one per language alias (pointing at the URL of
       * the real language).
       *
       * @param $site Site: object providing getURL( $lang )
       * @param $source String: source key the links are stored under
       */
      public function makeLanguageLinks( &$site, $source ) {
          # Actual languages with their own databases
          foreach ( $this->langlist as $targetLang ) {
              $this->makeLink( array( $targetLang, $site->getURL( $targetLang ), 1 ), $source );
          }
          # Language aliases
          foreach ( $this->languageAliases as $alias => $lang ) {
              $this->makeLink( array( $alias, $site->getURL( $lang ), 1 ), $source );
          }
      }

      /**
       * Store or print a single link and remember its prefix for the
       * "__list:<source>" entry.
       *
       * @param $entry Array: either associative ( iw_prefix, iw_url, iw_local )
       *   or positional ( prefix, url, local ). Missing url/local fields are
       *   treated as blank, so e.g. a "__sites" entry holds just its url value.
       * @param $source String: source key, the part before the colon
       */
      public function makeLink( $entry, $source ) {
          # Normalise the positional form to the associative one
          if ( !array_key_exists( 'iw_prefix', $entry ) ) {
              $entry = array(
                  'iw_prefix' => isset( $entry[0] ) ? $entry[0] : '',
                  'iw_url' => isset( $entry[1] ) ? $entry[1] : '',
                  'iw_local' => isset( $entry[2] ) ? $entry[2] : '',
              );
          }
          # Blank defaults keep the emitted text identical while avoiding
          # undefined index notices for callers that omit a field
          $entry += array( 'iw_url' => '', 'iw_local' => '' );

          # Per-source prefix rewrite
          $prefix = $entry['iw_prefix'];
          if ( isset( $this->prefixRewrites[$source][$prefix] ) ) {
              $prefix = $this->prefixRewrites[$source][$prefix];
              $entry['iw_prefix'] = $prefix;
          }

          if ( $this->dbFile ) {
              $this->dbFile->set( "{$source}:{$prefix}", trim( "{$entry['iw_local']} {$entry['iw_url']}" ) );
          } else {
              $this->output( "{$source}:{$prefix} {$entry['iw_url']} {$entry['iw_local']}\n" );
          }

          # Add to list of prefixes
          $this->prefixLists[$source][$prefix] = 1;
      }
  }
  # Name of the maintenance class defined in this file
  $maintClass = 'DumpInterwiki';
  # Include the file named by RUN_MAINTENANCE_IF_MAIN (defined outside this file)
  require_once RUN_MAINTENANCE_IF_MAIN;