PageRenderTime 64ms CodeModel.GetById 34ms RepoModel.GetById 0ms app.codeStats 0ms

/includes/cache/BacklinkCache.php

https://gitlab.com/qiusct/mediawiki-i
PHP | 493 lines | 253 code | 56 blank | 184 comment | 49 complexity | 3fa031e2adae76c04d0793921d2dfe19 MD5 | raw file
Possible License(s): Apache-2.0, MIT, GPL-2.0
  1. <?php
  2. /**
  3. * Class for fetching backlink lists, approximate backlink counts and
  4. * partitions.
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License along
  17. * with this program; if not, write to the Free Software Foundation, Inc.,
  18. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19. * http://www.gnu.org/copyleft/gpl.html
  20. *
  21. * @file
  22. * @author Tim Starling
  23. * @author Aaron Schulz
  24. * @copyright © 2009, Tim Starling, Domas Mituzas
  25. * @copyright © 2010, Max Sem
  26. * @copyright © 2011, Antoine Musso
  27. */
  /**
   * Class for fetching backlink lists, approximate backlink counts and
   * partitions. This is a shared cache.
   *
   * Instances of this class should typically be fetched with the method
   * $title->getBacklinkCache().
   *
   * Ideally you should only get your backlinks from here when you think
   * there is some advantage in caching them. Otherwise it's just a waste
   * of memory.
   *
   * Introduced by r47317
   *
   * @internal documentation reviewed on 18 Mar 2011 by hashar
   */
  class BacklinkCache {
      /** @var ProcessCacheLRU Holder used by BacklinkCache::get() to reuse instances */
      protected static $cache;

      /**
       * Multi dimensions array representing batches. Keys are:
       *  > (string) links table name
       *  > (int) batch size
       *  > 'numRows' : Number of rows for this link table
       *  > 'batches' : array( $start, $end )
       *
       * An entry is only stored once it has been fully computed, so every
       * value found here is a complete array( 'numRows' => ..., 'batches' => ... ).
       *
       * @see BacklinkCache::partitionResult()
       *
       * Cleared with BacklinkCache::clear()
       */
      protected $partitionCache = array();

      /**
       * Contains the whole links from a database result, keyed by table name.
       * This is raw data that will be partitioned in $partitionCache
       *
       * Initialized with BacklinkCache::getLinks()
       * Cleared with BacklinkCache::clear()
       */
      protected $fullResultCache = array();

      /**
       * Local copy of a database object.
       *
       * Accessor: BacklinkCache::getDB()
       * Mutator : BacklinkCache::setDB()
       * Cleared with BacklinkCache::clear()
       */
      protected $db;

      /**
       * Local copy of a Title object
       */
      protected $title;

      /** Expiry passed to $wgMemc->set() and to the process cache lookup in get() */
      const CACHE_EXPIRY = 3600;

      /**
       * Create a new BacklinkCache
       *
       * @param Title $title : Title object to create a backlink cache for
       */
      public function __construct( Title $title ) {
          $this->title = $title;
      }

      /**
       * Create a new BacklinkCache or reuse any existing one.
       * Currently, only one cache instance can exist; callers that
       * need multiple backlink cache objects should keep them in scope.
       *
       * @param Title $title Title object to get a backlink cache for
       * @return BacklinkCache
       */
      public static function get( Title $title ) {
          if ( !self::$cache ) {
              // Lazily create the holder; it is sized for a single entry
              self::$cache = new ProcessCacheLRU( 1 );
          }

          $dbKey = $title->getPrefixedDBkey();
          // Use the class constant rather than a second hard-coded 3600
          if ( !self::$cache->has( $dbKey, 'obj', self::CACHE_EXPIRY ) ) {
              self::$cache->set( $dbKey, 'obj', new self( $title ) );
          }

          return self::$cache->get( $dbKey, 'obj' );
      }

      /**
       * Serialization handler, disallows to serialize the database to prevent
       * failures after this class is deserialized from cache with dead DB
       * connection.
       *
       * @return array Names of the properties to serialize ($db is left out)
       */
      public function __sleep() {
          return array( 'partitionCache', 'fullResultCache', 'title' );
      }

      /**
       * Clear locally stored data and database object.
       */
      public function clear() {
          $this->partitionCache = array();
          $this->fullResultCache = array();
          // Reset instead of unset(): the declared property stays defined and
          // getDB() still sees it as "not set" because isset( null ) is false.
          $this->db = null;
      }

      /**
       * Set the Database object to use
       *
       * @param DatabaseBase $db
       */
      public function setDB( $db ) {
          $this->db = $db;
      }

      /**
       * Get the slave connection to the database
       * When non existing, will initialize the connection.
       * @return DatabaseBase
       */
      protected function getDB() {
          if ( !isset( $this->db ) ) {
              $this->db = wfGetDB( DB_SLAVE );
          }

          return $this->db;
      }

      /**
       * Get the backlinks for a given table. Cached in process memory only.
       * @param string $table
       * @param int|bool $startId
       * @param int|bool $endId
       * @param int|INF $max
       * @return TitleArrayFromResult
       */
      public function getLinks( $table, $startId = false, $endId = false, $max = INF ) {
          return TitleArray::newFromResult( $this->queryLinks( $table, $startId, $endId, $max ) );
      }

      /**
       * Get the backlinks for a given table. Cached in process memory only.
       *
       * A result is kept in $fullResultCache only when it was selected with
       * $select = 'all', without a start or end ID, and when it returned fewer
       * rows than $max (i.e. it is known to be the complete list).
       *
       * @param string $table
       * @param int|bool $startId Lowest "from" page ID to include, or false
       * @param int|bool $endId Highest "from" page ID to include, or false
       * @param int|INF $max
       * @param string $select 'all' or 'ids'
       * @throws MWException If $table is not a known links table
       * @return ResultWrapper
       */
      protected function queryLinks( $table, $startId, $endId, $max, $select = 'all' ) {
          wfProfileIn( __METHOD__ );

          $fromField = $this->getPrefix( $table ) . '_from';
          $isFullRange = !$startId && !$endId;

          if ( $isFullRange && is_infinite( $max ) && isset( $this->fullResultCache[$table] ) ) {
              wfDebug( __METHOD__ . ": got results from cache\n" );
              $res = $this->fullResultCache[$table];
          } else {
              wfDebug( __METHOD__ . ": got results from DB\n" );
              $conds = $this->getConditions( $table );
              // Use the from field in the condition rather than the joined page_id,
              // because databases are stupid and don't necessarily propagate indexes.
              if ( $startId ) {
                  $conds[] = "$fromField >= " . intval( $startId );
              }
              if ( $endId ) {
                  $conds[] = "$fromField <= " . intval( $endId );
              }

              $options = array( 'ORDER BY' => $fromField );
              if ( is_finite( $max ) && $max > 0 ) {
                  $options['LIMIT'] = $max;
              }

              if ( $select === 'ids' ) {
                  // Just select from the backlink table and ignore the page JOIN.
                  // The join clause is stripped from the conditions by pattern.
                  $idConds = array_filter( $conds, function ( $clause ) { // kind of janky
                      return !preg_match( '/(\b|=)page_id(\b|=)/', $clause );
                  } );
                  $res = $this->getDB()->select(
                      $table,
                      array( "$fromField AS page_id" ),
                      $idConds,
                      __METHOD__,
                      $options
                  );
              } else {
                  // Select from the backlink table and JOIN with page title information
                  $res = $this->getDB()->select(
                      array( $table, 'page' ),
                      array( 'page_namespace', 'page_title', 'page_id' ),
                      $conds,
                      __METHOD__,
                      array_merge( array( 'STRAIGHT_JOIN' ), $options )
                  );
              }

              if ( $select === 'all' && $isFullRange && $res->numRows() < $max ) {
                  // The full results fit within the limit, so cache them
                  $this->fullResultCache[$table] = $res;
              } else {
                  wfDebug( __METHOD__ . ": results from DB were uncacheable\n" );
              }
          }

          wfProfileOut( __METHOD__ );

          return $res;
      }

      /**
       * Get the field name prefix for a given table
       *
       * Tables not in the built-in list are passed to the
       * 'BacklinkCacheGetPrefix' hook, which may fill in a prefix.
       *
       * @param string $table
       * @throws MWException If neither the built-in list nor the hook knows $table
       * @return string
       */
      protected function getPrefix( $table ) {
          static $prefixes = array(
              'pagelinks' => 'pl',
              'imagelinks' => 'il',
              'categorylinks' => 'cl',
              'templatelinks' => 'tl',
              'redirect' => 'rd',
          );

          if ( isset( $prefixes[$table] ) ) {
              return $prefixes[$table];
          }

          $prefix = null;
          wfRunHooks( 'BacklinkCacheGetPrefix', array( $table, &$prefix ) );
          if ( !$prefix ) {
              throw new MWException( "Invalid table \"$table\" in " . __CLASS__ );
          }

          return $prefix;
      }

      /**
       * Get the SQL condition array for selecting backlinks, with a join
       * on the page table.
       *
       * Tables without a built-in case are passed to the
       * 'BacklinkCacheGetConditions' hook, which may fill in the conditions.
       *
       * @param string $table
       * @throws MWException If no conditions can be produced for $table
       * @return array
       */
      protected function getConditions( $table ) {
          $prefix = $this->getPrefix( $table );
          // Join clause shared by every built-in case
          $joinCond = "page_id={$prefix}_from";

          switch ( $table ) {
              case 'pagelinks':
              case 'templatelinks':
                  $conds = array(
                      "{$prefix}_namespace" => $this->title->getNamespace(),
                      "{$prefix}_title" => $this->title->getDBkey(),
                      $joinCond
                  );
                  break;
              case 'redirect':
                  $conds = array(
                      "{$prefix}_namespace" => $this->title->getNamespace(),
                      "{$prefix}_title" => $this->title->getDBkey(),
                      // Interwiki field must be empty or NULL
                      $this->getDB()->makeList( array(
                          "{$prefix}_interwiki" => '',
                          "{$prefix}_interwiki IS NULL",
                      ), LIST_OR ),
                      $joinCond
                  );
                  break;
              case 'imagelinks':
              case 'categorylinks':
                  $conds = array(
                      "{$prefix}_to" => $this->title->getDBkey(),
                      $joinCond
                  );
                  break;
              default:
                  $conds = null;
                  wfRunHooks( 'BacklinkCacheGetConditions', array( $table, $this->title, &$conds ) );
                  if ( !$conds ) {
                      throw new MWException( "Invalid table \"$table\" in " . __CLASS__ );
                  }
          }

          return $conds;
      }

      /**
       * Check if there are any backlinks
       * @param string $table
       * @return bool
       */
      public function hasLinks( $table ) {
          return ( $this->getNumLinks( $table, 1 ) > 0 );
      }

      /**
       * Get the approximate number of backlinks
       *
       * Sources are tried in this order: partition cache, full result cache,
       * $wgMemc, and finally the database.
       *
       * @param string $table
       * @param int|INF $max Only count up to this many backlinks
       * @throws MWException If $table is invalid, or (unlimited count only)
       *   if $wgUpdateRowsPerJob is lower than 1
       * @return int
       */
      public function getNumLinks( $table, $max = INF ) {
          global $wgMemc, $wgUpdateRowsPerJob;

          // 1) try partition cache ...
          if ( isset( $this->partitionCache[$table] ) ) {
              // Any batch size will do; every entry carries the same kind of row count
              $entry = reset( $this->partitionCache[$table] );

              return min( $max, $entry['numRows'] );
          }

          // 2) ... then try full result cache ...
          if ( isset( $this->fullResultCache[$table] ) ) {
              return min( $max, $this->fullResultCache[$table]->numRows() );
          }

          $memcKey = wfMemcKey( 'numbacklinks', md5( $this->title->getPrefixedDBkey() ), $table );

          // 3) ... fallback to memcached ...
          // Note: this is a truthiness test, so a stored count of 0 is treated
          // like a miss and falls through to the database.
          $count = $wgMemc->get( $memcKey );
          if ( $count ) {
              return min( $max, $count );
          }

          // 4) fetch from the database ...
          if ( is_infinite( $max ) ) { // no limit at all
              // Use partition() since it will batch the query and skip the JOIN.
              // Use $wgUpdateRowsPerJob just to encourage cache reuse for jobs.
              $this->partition( $table, $wgUpdateRowsPerJob ); // updates $this->partitionCache

              return $this->partitionCache[$table][$wgUpdateRowsPerJob]['numRows'];
          }

          // probably some sane limit
          // Fetch the full title info, since the caller will likely need it next
          $count = $this->getLinks( $table, false, false, $max )->count();
          if ( $count < $max ) { // full count
              $wgMemc->set( $memcKey, $count, self::CACHE_EXPIRY );
          }

          return min( $max, $count );
      }

      /**
       * Partition the backlinks into batches.
       * Returns an array giving the start and end of each range. The first
       * batch has a start of false, and the last batch has an end of false.
       *
       * The computed entry is written to $partitionCache only when it is
       * complete, so an exception thrown while querying leaves no partial
       * entry behind.
       *
       * @param string $table The links table name
       * @param int $batchSize Must be 1 or greater
       * @throws MWException If $batchSize is lower than 1 or $table is invalid
       * @return array
       */
      public function partition( $table, $batchSize ) {
          global $wgMemc;

          // A batch size below 1 would cause a modulo/division by zero below
          if ( $batchSize < 1 ) {
              throw new MWException( __METHOD__ . ': batch size must be 1 or greater' );
          }

          // 1) try partition cache ...
          if ( isset( $this->partitionCache[$table][$batchSize] ) ) {
              wfDebug( __METHOD__ . ": got from partition cache\n" );

              return $this->partitionCache[$table][$batchSize]['batches'];
          }

          // 2) ... then try full result cache ...
          if ( isset( $this->fullResultCache[$table] ) ) {
              $entry = $this->partitionResult( $this->fullResultCache[$table], $batchSize );
              $this->partitionCache[$table][$batchSize] = $entry;
              wfDebug( __METHOD__ . ": got from full result cache\n" );

              return $entry['batches'];
          }

          $memcKey = wfMemcKey(
              'backlinks',
              md5( $this->title->getPrefixedDBkey() ),
              $table,
              $batchSize
          );

          // 3) ... fallback to memcached ...
          $memcValue = $wgMemc->get( $memcKey );
          if ( is_array( $memcValue ) ) {
              $this->partitionCache[$table][$batchSize] = $memcValue;
              wfDebug( __METHOD__ . ": got from memcached $memcKey\n" );

              return $memcValue['batches'];
          }

          // 4) ... finally fetch from the slow database :(
          $entry = array( 'numRows' => 0, 'batches' => array() ); // final result
          // Do the selects in batches to avoid client-side OOMs (bug 43452).
          // Use a LIMIT that plays well with $batchSize to keep equal sized partitions.
          $selectSize = max( $batchSize, 200000 - ( 200000 % $batchSize ) );
          $start = false;
          do {
              $res = $this->queryLinks( $table, $start, false, $selectSize, 'ids' );
              $chunk = $this->partitionResult( $res, $batchSize, false );
              // Merge the link count and range partitions for this chunk
              $entry['numRows'] += $chunk['numRows'];
              $entry['batches'] = array_merge( $entry['batches'], $chunk['batches'] );
              if ( count( $chunk['batches'] ) ) {
                  $lastRange = end( $chunk['batches'] );
                  $start = $lastRange[1] + 1; // pick up after this inclusive range
              }
          } while ( $chunk['numRows'] >= $selectSize ); // a short chunk means we reached the end

          // Make sure the first range has start=false and the last one has end=false
          $batchCount = count( $entry['batches'] );
          if ( $batchCount ) {
              $entry['batches'][0][0] = false;
              $entry['batches'][$batchCount - 1][1] = false;
          }

          // Only now publish the finished entry to the in-process cache
          $this->partitionCache[$table][$batchSize] = $entry;

          // Save partitions to memcached
          $wgMemc->set( $memcKey, $entry, self::CACHE_EXPIRY );

          // Save backlink count to memcached
          $countKey = wfMemcKey( 'numbacklinks', md5( $this->title->getPrefixedDBkey() ), $table );
          $wgMemc->set( $countKey, $entry['numRows'], self::CACHE_EXPIRY );

          wfDebug( __METHOD__ . ": got from database\n" );

          return $entry['batches'];
      }

      /**
       * Partition a DB result with backlinks in it into batches
       *
       * Each batch is an array( $start, $end ) of page IDs read from the
       * page_id field of the first and last row of that batch. When
       * $isComplete is true, the first start and the last end are false.
       *
       * @param ResultWrapper $res Database result
       * @param int $batchSize Must be 1 or greater
       * @param bool $isComplete Whether $res includes all the backlinks
       * @throws MWException If $batchSize is lower than 1 or the rows are out of order
       * @return array array( 'numRows' => int, 'batches' => array )
       */
      protected function partitionResult( $res, $batchSize, $isComplete = true ) {
          // Guard the division below against a zero or negative batch size
          if ( $batchSize < 1 ) {
              throw new MWException( __METHOD__ . ': batch size must be 1 or greater' );
          }

          $batches = array();
          $numRows = $res->numRows();
          $numBatches = (int)ceil( $numRows / $batchSize );
          $lastBatch = $numBatches - 1;

          for ( $i = 0; $i < $numBatches; $i++ ) {
              if ( $i == 0 && $isComplete ) {
                  $start = false;
              } else {
                  // First row of this batch
                  $res->seek( $i * $batchSize );
                  $row = $res->fetchObject();
                  $start = (int)$row->page_id;
              }

              if ( $i == $lastBatch && $isComplete ) {
                  $end = false;
              } else {
                  // Last row of this batch, clamped to the last row of the result
                  $res->seek( min( $numRows - 1, ( $i + 1 ) * $batchSize - 1 ) );
                  $row = $res->fetchObject();
                  $end = (int)$row->page_id;
              }

              # Sanity check order
              if ( $start && $end && $start > $end ) {
                  throw new MWException( __METHOD__ . ': Internal error: query result out of order' );
              }

              $batches[] = array( $start, $end );
          }

          return array( 'numRows' => $numRows, 'batches' => $batches );
      }
  }