PageRenderTime 24ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 0ms

/maintenance/storage/trackBlobs.php

https://github.com/daevid/MWFork
PHP | 396 lines | 318 code | 35 blank | 43 comment | 30 complexity | c69b334754ea471abec8d74918a2d585 MD5 | raw file
  1. <?php
  2. /**
  3. * Adds blobs from a given external storage cluster to the blob_tracking table.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup Maintenance
  22. * @see wfWaitForSlaves()
  23. */
  // Load the command-line maintenance environment. $args is not defined in
  // this file; presumably commandLine.inc fills it with the positional
  // arguments -- TODO confirm.
  require( dirname( __FILE__ ) . '/../commandLine.inc' );

  // At least one cluster name is required; otherwise print usage and fail.
  if ( count( $args ) === 0 ) {
      echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
          "Adds blobs from a given ES cluster to the blob_tracking table\n" .
          "Automatically deletes the tracking table and starts from the start again when restarted.\n";
      exit( 1 );
  }

  // Every positional argument is treated as a cluster name.
  $tracker = new TrackBlobs( $args );
  $tracker->run();

  echo "All done.\n";
  /**
   * Rebuilds the blob_tracking table (and, when the gmp extension is loaded,
   * the blob_orphans table) for a list of external storage clusters.
   *
   * run() executes the passes in order: integrity check, tracking table
   * re-creation, revision scan, orphan text scan and finally orphan blob
   * detection.
   */
  class TrackBlobs {
      /** Cluster names to scan, as passed to the constructor */
      public $clusters;

      /** Cached SQL condition matching old_text pointers into any tracked cluster */
      public $textClause;

      /** True when the gmp extension is loaded and orphan blobs can be searched for */
      public $doBlobOrphans;

      /** Map of cluster name => GMP bitfield with one bit set per referenced blob ID */
      public $trackedBlobs = array();

      /** Maximum number of rows fetched per SELECT */
      public $batchSize = 1000;

      /** Number of batches between two progress reports */
      public $reportingInterval = 10;

      /**
       * @param $clusters Array of external storage cluster names
       */
      public function __construct( $clusters ) {
          $this->clusters = $clusters;
          if ( extension_loaded( 'gmp' ) ) {
              $this->doBlobOrphans = true;
              foreach ( $clusters as $cluster ) {
                  $this->trackedBlobs[$cluster] = gmp_init( 0 );
              }
          } else {
              echo "Warning: the gmp extension is needed to find orphan blobs\n";
          }
      }

      /**
       * Run all passes. checkIntegrity() terminates the process with exit
       * status 1 if it finds data it refuses to work on.
       */
      public function run() {
          $this->checkIntegrity();
          $this->initTrackingTable();
          $this->trackRevisions();
          $this->trackOrphanText();
          if ( $this->doBlobOrphans ) {
              $this->findOrphanBlobs();
          }
      }

      /**
       * Refuse to continue (exit status 1) when the text table contains
       * HistoryBlobStub objects, or when the archive table contains external
       * storage pointers or HistoryBlobStub objects.
       */
      public function checkIntegrity() {
          echo "Doing integrity check...\n";
          $dbr = wfGetDB( DB_SLAVE );

          // Scan for HistoryBlobStub objects in the text table (bug 20757)
          $exists = $dbr->selectField( 'text', 1,
              'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
              'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
              __METHOD__
          );

          if ( $exists ) {
              echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
                  "This script could destroy these objects if it continued. Run resolveStubs.php\n" .
                  "to fix this.\n";
              exit( 1 );
          }

          // Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
          $flags = $dbr->selectField( 'archive', 'ar_flags',
              'ar_flags LIKE \'%external%\' OR (' .
              'ar_flags LIKE \'%object%\' ' .
              'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
              __METHOD__
          );

          // The query matches either condition; the returned flags tell us
          // which of the two problems the matching row has.
          if ( strpos( $flags, 'external' ) !== false ) {
              echo "Integrity check failed: found external storage pointers in your archive table.\n" .
                  "Run normaliseArchiveTable.php to fix this.\n";
              exit( 1 );
          } elseif ( $flags ) {
              echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
                  "These objects are probably already broken, continuing would make them\n" .
                  "unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
              exit( 1 );
          }

          echo "Integrity check OK\n";
      }

      /**
       * Drop any existing tracking tables and re-create them from
       * blob_tracking.sql (located next to this script).
       *
       * Each table is checked on its own, so a partially initialised schema
       * (only one of the two tables present) is cleaned up instead of making
       * the DROP fail or leaving a stale table behind.
       */
      public function initTrackingTable() {
          $dbw = wfGetDB( DB_MASTER );
          foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
              if ( $dbw->tableExists( $table ) ) {
                  $dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
              }
          }
          $dbw->sourceFile( dirname( __FILE__ ) . '/blob_tracking.sql' );
      }

      /**
       * Build (once) and return the SQL condition that matches old_text
       * values starting with "DB://<cluster>/" for any configured cluster.
       *
       * @return String
       */
      public function getTextClause() {
          if ( $this->textClause === null ) {
              $dbr = wfGetDB( DB_SLAVE );
              $likes = array();
              foreach ( $this->clusters as $cluster ) {
                  $likes[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
              }
              $this->textClause = implode( ' OR ', $likes );
          }
          return $this->textClause;
      }

      /**
       * Split a "DB://<cluster>/<id>[/<hex hash>]" pointer into its parts.
       *
       * @param $text String: the pointer
       * @return Array with keys 'cluster', 'id' (integer) and 'hash' (null
       *   when the pointer has no hash part), or false if $text does not
       *   have the expected form
       */
      public function interpretPointer( $text ) {
          if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
              return false;
          }
          return array(
              'cluster' => $m[1],
              'id' => intval( $m[2] ),
              // Group 3 is absent from $m when the optional hash did not match
              'hash' => isset( $m[3] ) ? $m[3] : null
          );
      }

      /**
       * Scan the revision table for rows stored in the specified clusters
       */
      public function trackRevisions() {
          $dbw = wfGetDB( DB_MASTER );
          $dbr = wfGetDB( DB_SLAVE );

          $textClause = $this->getTextClause();
          $startId = 0;
          // Only used for the progress display
          $endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
          $batchesDone = 0;
          $rowsInserted = 0;

          echo "Finding revisions...\n";

          while ( true ) {
              // Next batch of revisions whose text row is an external pointer
              // into one of the tracked clusters, paged by rev_id
              $res = $dbr->select( array( 'revision', 'text' ),
                  array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
                  array(
                      'rev_id > ' . $dbr->addQuotes( $startId ),
                      'rev_text_id=old_id',
                      $textClause,
                      'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
                  ),
                  __METHOD__,
                  array(
                      'ORDER BY' => 'rev_id',
                      'LIMIT' => $this->batchSize
                  )
              );
              if ( !$res->numRows() ) {
                  break;
              }

              $insertBatch = array();
              foreach ( $res as $row ) {
                  // Advance the cursor even for rows that get skipped below
                  $startId = $row->rev_id;
                  $info = $this->interpretPointer( $row->old_text );
                  if ( !$info ) {
                      echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
                      continue;
                  }
                  if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
                      echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
                      continue;
                  }
                  $insertBatch[] = array(
                      'bt_page' => $row->rev_page,
                      'bt_rev_id' => $row->rev_id,
                      'bt_text_id' => $row->old_id,
                      'bt_cluster' => $info['cluster'],
                      'bt_blob_id' => $info['id'],
                      'bt_cgz_hash' => $info['hash']
                  );
                  if ( $this->doBlobOrphans ) {
                      // Remember the blob as referenced for findOrphanBlobs()
                      gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
                  }
              }
              if ( $insertBatch ) {
                  $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
              }
              $rowsInserted += count( $insertBatch );

              ++$batchesDone;
              if ( $batchesDone >= $this->reportingInterval ) {
                  $batchesDone = 0;
                  echo "$startId / $endId\n";
                  wfWaitForSlaves();
              }
          }
          echo "Found $rowsInserted revisions\n";
      }

      /**
       * Scan the text table for orphan text
       * Orphan text here does not imply DB corruption -- deleted text tracked by the
       * archive table counts as orphan for our purposes.
       */
      public function trackOrphanText() {
          # Wait until the blob_tracking table is available in the slave
          $dbw = wfGetDB( DB_MASTER );
          $dbr = wfGetDB( DB_SLAVE );
          $pos = $dbw->getMasterPos();
          $dbr->masterPosWait( $pos, 100000 );

          $textClause = $this->getTextClause();
          $startId = 0;
          // Only used for the progress display
          $endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
          $rowsInserted = 0;
          $batchesDone = 0;

          echo "Finding orphan text...\n";

          # Scan the text table for orphan text
          while ( true ) {
              // External text rows in the tracked clusters that have no
              // blob_tracking row yet (LEFT JOIN with bt_text_id IS NULL)
              $res = $dbr->select( array( 'text', 'blob_tracking' ),
                  array( 'old_id', 'old_flags', 'old_text' ),
                  array(
                      'old_id>' . $dbr->addQuotes( $startId ),
                      $textClause,
                      'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
                      'bt_text_id IS NULL'
                  ),
                  __METHOD__,
                  array(
                      'ORDER BY' => 'old_id',
                      'LIMIT' => $this->batchSize
                  ),
                  array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
              );

              if ( !$res->numRows() ) {
                  break;
              }

              $insertBatch = array();
              foreach ( $res as $row ) {
                  // Advance the cursor even for rows that get skipped below
                  $startId = $row->old_id;
                  $info = $this->interpretPointer( $row->old_text );
                  if ( !$info ) {
                      echo "Invalid DB:// URL in old_id {$row->old_id}\n";
                      continue;
                  }
                  if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
                      echo "Invalid cluster returned in SQL query\n";
                      continue;
                  }

                  // Orphan text has no page or revision, hence the zeroes
                  $insertBatch[] = array(
                      'bt_page' => 0,
                      'bt_rev_id' => 0,
                      'bt_text_id' => $row->old_id,
                      'bt_cluster' => $info['cluster'],
                      'bt_blob_id' => $info['id'],
                      'bt_cgz_hash' => $info['hash']
                  );
                  if ( $this->doBlobOrphans ) {
                      // Remember the blob as referenced for findOrphanBlobs()
                      gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
                  }
              }
              if ( $insertBatch ) {
                  $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
              }
              $rowsInserted += count( $insertBatch );

              ++$batchesDone;
              if ( $batchesDone >= $this->reportingInterval ) {
                  $batchesDone = 0;
                  echo "$startId / $endId\n";
                  wfWaitForSlaves();
              }
          }
          echo "Found $rowsInserted orphan text rows\n";
      }

      /**
       * Scan the blobs table for rows not registered in blob_tracking (and thus not
       * registered in the text table).
       *
       * Orphan blobs are indicative of DB corruption. They are inaccessible and
       * should probably be deleted.
       */
      public function findOrphanBlobs() {
          if ( !extension_loaded( 'gmp' ) ) {
              echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
              return;
          }

          $dbw = wfGetDB( DB_MASTER );

          foreach ( $this->clusters as $cluster ) {
              echo "Searching for orphan blobs in $cluster...\n";
              $lb = wfGetLBFactory()->getExternalLB( $cluster );
              try {
                  $extDB = $lb->getConnection( DB_SLAVE );
              } catch ( DBConnectionError $e ) {
                  // A cluster that cannot be reached is reported and skipped
                  if ( strpos( $e->error, 'Unknown database' ) !== false ) {
                      echo "No database on $cluster\n";
                  } else {
                      echo "Error on $cluster: " . $e->getMessage() . "\n";
                  }
                  continue;
              }
              // Fall back to the default table name when the connection
              // carries no 'blobs table' setting
              $table = $extDB->getLBInfo( 'blobs table' );
              if ( is_null( $table ) ) {
                  $table = 'blobs';
              }
              if ( !$extDB->tableExists( $table ) ) {
                  echo "No blobs table on cluster $cluster\n";
                  continue;
              }
              $startId = 0;
              $batchesDone = 0;
              $actualBlobs = gmp_init( 0 );
              // Only used for the progress display
              $endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

              // Build a bitmap of actual blob rows
              while ( true ) {
                  $res = $extDB->select( $table,
                      array( 'blob_id' ),
                      array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
                      __METHOD__,
                      array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
                  );

                  if ( !$res->numRows() ) {
                      break;
                  }

                  foreach ( $res as $row ) {
                      gmp_setbit( $actualBlobs, $row->blob_id );
                      // Rows arrive ordered by blob_id, so the last one seen
                      // is where the next batch starts
                      $startId = $row->blob_id;
                  }

                  ++$batchesDone;
                  if ( $batchesDone >= $this->reportingInterval ) {
                      $batchesDone = 0;
                      echo "$startId / $endId\n";
                  }
              }

              // Find actual blobs that weren't tracked by the previous passes
              // This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
              $orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

              // Traverse the orphan list
              $insertBatch = array();
              $id = 0;
              $numOrphans = 0;
              while ( true ) {
                  // Index of the next set bit at or after $id; -1 ends the scan
                  $id = gmp_scan1( $orphans, $id );
                  if ( $id == -1 ) {
                      break;
                  }
                  $insertBatch[] = array(
                      'bo_cluster' => $cluster,
                      'bo_blob_id' => $id
                  );
                  // Flush as soon as the batch is full so that no insert
                  // exceeds batchSize rows
                  if ( count( $insertBatch ) >= $this->batchSize ) {
                      $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
                      $insertBatch = array();
                  }

                  ++$id;
                  ++$numOrphans;
              }
              if ( $insertBatch ) {
                  $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
              }
              echo "Found $numOrphans orphan(s) in $cluster\n";
          }
      }
  }