PageRenderTime 6709ms CodeModel.GetById 61ms RepoModel.GetById 1ms app.codeStats 1ms

/src/mongo/s/d_migrate.cpp

http://github.com/mongodb/mongo
C++ | 2942 lines | 2084 code | 487 blank | 371 comment | 273 complexity | a46ce05711deb0377b47f46d4360984d MD5 | raw file
Possible License(s): BSD-3-Clause-No-Nuclear-License-2014, GPL-2.0, Apache-2.0, BSD-3-Clause, WTFPL

Large files are truncated, but you can click here to view the full file

  1. // d_migrate.cpp
  2. /**
  3. * Copyright (C) 2008-2014 MongoDB Inc.
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License, version 3,
  7. * as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU Affero General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Affero General Public License
  15. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. *
  17. * As a special exception, the copyright holders give permission to link the
  18. * code of portions of this program with the OpenSSL library under certain
  19. * conditions as described in each individual source file and distribute
  20. * linked combinations including the program with the OpenSSL library. You
  21. * must comply with the GNU Affero General Public License in all respects
  22. * for all of the code used other than as permitted herein. If you modify
  23. * file(s) with this exception, you may extend this exception to your
  24. * version of the file(s), but you are not obligated to do so. If you do not
  25. * wish to do so, delete this exception statement from your version. If you
  26. * delete this exception statement from all source files in the program,
  27. * then also delete it in the license file.
  28. */
  29. /**
  30. these are commands that live in mongod
  31. mostly around shard management and checking
  32. */
  33. #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kSharding
  34. #include "mongo/platform/basic.h"
  35. #include <algorithm>
  36. #include <boost/scoped_ptr.hpp>
  37. #include <boost/thread/thread.hpp>
  38. #include <map>
  39. #include <string>
  40. #include <vector>
  41. #include "mongo/client/connpool.h"
  42. #include "mongo/client/dbclientcursor.h"
  43. #include "mongo/db/auth/action_set.h"
  44. #include "mongo/db/auth/action_type.h"
  45. #include "mongo/db/auth/authorization_manager.h"
  46. #include "mongo/db/auth/authorization_manager_global.h"
  47. #include "mongo/db/auth/authorization_session.h"
  48. #include "mongo/db/auth/privilege.h"
  49. #include "mongo/db/catalog/document_validation.h"
  50. #include "mongo/db/catalog/index_create.h"
  51. #include "mongo/db/clientcursor.h"
  52. #include "mongo/db/commands.h"
  53. #include "mongo/db/concurrency/lock_state.h"
  54. #include "mongo/db/db_raii.h"
  55. #include "mongo/db/dbhelpers.h"
  56. #include "mongo/db/exec/plan_stage.h"
  57. #include "mongo/db/field_parser.h"
  58. #include "mongo/db/service_context.h"
  59. #include "mongo/db/hasher.h"
  60. #include "mongo/db/jsobj.h"
  61. #include "mongo/db/op_observer.h"
  62. #include "mongo/db/operation_context_impl.h"
  63. #include "mongo/db/ops/delete.h"
  64. #include "mongo/db/query/internal_plans.h"
  65. #include "mongo/db/query/query_knobs.h"
  66. #include "mongo/db/range_deleter_service.h"
  67. #include "mongo/db/repl/repl_client_info.h"
  68. #include "mongo/db/repl/replication_coordinator_global.h"
  69. #include "mongo/db/storage/mmap_v1/dur.h"
  70. #include "mongo/db/write_concern.h"
  71. #include "mongo/logger/ramlog.h"
  72. #include "mongo/s/catalog/catalog_manager.h"
  73. #include "mongo/s/catalog/type_chunk.h"
  74. #include "mongo/s/chunk.h"
  75. #include "mongo/s/chunk_version.h"
  76. #include "mongo/s/config.h"
  77. #include "mongo/s/d_state.h"
  78. #include "mongo/s/catalog/dist_lock_manager.h"
  79. #include "mongo/s/grid.h"
  80. #include "mongo/s/client/shard.h"
  81. #include "mongo/util/assert_util.h"
  82. #include "mongo/util/elapsed_tracker.h"
  83. #include "mongo/util/exit.h"
  84. #include "mongo/util/fail_point_service.h"
  85. #include "mongo/util/log.h"
  86. #include "mongo/util/processinfo.h"
  87. #include "mongo/util/queue.h"
  88. #include "mongo/util/startup_test.h"
  89. // Pause while a fail point is enabled.
  90. #define MONGO_FP_PAUSE_WHILE(symbol) while (MONGO_FAIL_POINT(symbol)) { sleepmillis(100); }
  91. using namespace std;
  92. namespace {
  93. using boost::scoped_ptr;
  94. using mongo::WriteConcernOptions;
  95. using mongo::repl::ReplicationCoordinator;
  96. using mongo::repl::OpTime;
  97. const int kDefaultWTimeoutMs = 60 * 1000;
  98. const WriteConcernOptions DefaultWriteConcern(2, WriteConcernOptions::NONE, kDefaultWTimeoutMs);
  99. /**
  100. * Returns the default write concern for migration cleanup (at donor shard) and
  101. * cloning documents (at recipient shard).
  102. */
  103. WriteConcernOptions getDefaultWriteConcern() {
  104. ReplicationCoordinator* replCoordinator =
  105. mongo::repl::getGlobalReplicationCoordinator();
  106. if (replCoordinator->getReplicationMode() ==
  107. mongo::repl::ReplicationCoordinator::modeReplSet) {
  108. mongo::Status status =
  109. replCoordinator->checkIfWriteConcernCanBeSatisfied(DefaultWriteConcern);
  110. if (status.isOK()) {
  111. return DefaultWriteConcern;
  112. }
  113. }
  114. return WriteConcernOptions(1, WriteConcernOptions::NONE, 0);
  115. }
  116. }
  117. namespace mongo {
// Fail points that let tests force failures at specific stages of the
// migration commit protocol.
MONGO_FP_DECLARE(failMigrationCommit);
MONGO_FP_DECLARE(failMigrationConfigWritePrepare);
MONGO_FP_DECLARE(failMigrationApplyOps);

// In-memory "migrate" ram log; migration-related log lines below are tee'd
// into it (by streaming 'migrateLog' at the end of each log statement).
Tee* migrateLog = RamLog::get("migrate");
/**
 * Scoped timer for the numbered steps of a chunk migration. Each call to
 * done(step) appends the elapsed time of that step to an accumulating BSON
 * document; the destructor writes the document to the config server's
 * changelog as a "moveChunk.<where>" entry, noting success or abort.
 */
class MoveTimingHelper {
public:
    MoveTimingHelper(OperationContext* txn,
                     const string& where,
                     const string& ns,
                     BSONObj min,
                     BSONObj max ,
                     int total,
                     string* cmdErrmsg,
                     string toShard,
                     string fromShard)
        : _txn(txn),
          _where(where),
          _ns(ns),
          _to(toShard),
          _from(fromShard),
          _next(0),
          _total(total),
          _cmdErrmsg(cmdErrmsg) {
        _b.append( "min" , min );
        _b.append( "max" , max );
    }

    ~MoveTimingHelper() {
        // even if logChange doesn't throw, bson does
        // sigh
        try {
            if ( !_to.empty() ){
                _b.append( "to", _to );
            }
            if ( !_from.empty() ){
                _b.append( "from", _from );
            }
            if ( _next != _total ) {
                // Not every expected step completed: record the migration as aborted.
                _b.append( "note" , "aborted" );
            }
            else {
                _b.append( "note" , "success" );
            }
            if ( !_cmdErrmsg->empty() ) {
                _b.append( "errmsg" , *_cmdErrmsg );
            }

            grid.catalogManager()->logChange(_txn,
                                             (string)"moveChunk." + _where,
                                             _ns,
                                             _b.obj());
        }
        catch ( const std::exception& e ) {
            // Destructors must not throw; warn and drop the changelog entry.
            warning() << "couldn't record timing for moveChunk '" << _where << "': " << e.what() << migrateLog;
        }
    }

    /**
     * Marks step 'step' as finished, recording its elapsed time (ms) in the
     * changelog document. Steps must be reported in order, exactly once each.
     */
    void done(int step) {
        verify( step == ++_next );
        verify( step <= _total );

        stringstream ss;
        ss << "step " << step << " of " << _total;
        string s = ss.str();

        // Surface progress in currentOp, if a CurOp is attached to this context.
        CurOp * op = _txn->getCurOp();
        if ( op )
            op->setMessage( s.c_str() );
        else
            warning() << "op is null in MoveTimingHelper::done" << migrateLog;

        // Record elapsed time for this step, then restart the timer for the next.
        _b.appendNumber( s , _t.millis() );
        _t.reset();

#if 0
        // debugging for memory leak?
        ProcessInfo pi;
        ss << " v:" << pi.getVirtualMemorySize()
           << " r:" << pi.getResidentSize();
        log() << ss.str() << migrateLog;
#endif
    }

private:
    OperationContext* const _txn;
    Timer _t;   // times the current step; reset by done()

    string _where;
    string _ns;
    string _to;
    string _from;

    int _next;  // number of the last step reported via done()
    int _total; // expected # of steps

    const string* _cmdErrmsg;  // owned by the caller; appended to the changelog entry

    BSONObjBuilder _b;  // accumulates the changelog document
};
  205. class ChunkCommandHelper : public Command {
  206. public:
  207. ChunkCommandHelper( const char * name )
  208. : Command( name ) {
  209. }
  210. virtual void help( stringstream& help ) const {
  211. help << "internal - should not be called directly";
  212. }
  213. virtual bool slaveOk() const { return false; }
  214. virtual bool adminOnly() const { return true; }
  215. virtual bool isWriteCommandForConfigServer() const { return false; }
  216. };
  217. bool isInRange( const BSONObj& obj ,
  218. const BSONObj& min ,
  219. const BSONObj& max ,
  220. const BSONObj& shardKeyPattern ) {
  221. ShardKeyPattern shardKey( shardKeyPattern );
  222. BSONObj k = shardKey.extractShardKeyFromDoc( obj );
  223. return k.woCompare( min ) >= 0 && k.woCompare( max ) < 0;
  224. }
/**
 * Donor-side state for an in-progress chunk migration. A single global
 * instance ('migrateFromStatus', declared at the closing brace) exists per
 * mongod. Tracks the chunk range being moved, the set of RecordIds still to
 * be cloned, and the _ids of documents modified/deleted while the migration
 * runs so they can be re-transferred via the _transferMods command.
 *
 * See the member-variable section at the bottom for the locking rules:
 * Global Lock -> _mutex -> _cloneLocsMutex.
 */
class MigrateFromStatus {
public:
    MigrateFromStatus():
        _inCriticalSection(false),
        _memoryUsed(0),
        _active(false) {
    }

    /**
     * @return false if cannot start. One of the reason for not being able to
     * start is there is already an existing migration in progress.
     */
    bool start(OperationContext* txn,
               const std::string& ns,
               const BSONObj& min,
               const BSONObj& max,
               const BSONObj& shardKeyPattern) {
        verify(!min.isEmpty());
        verify(!max.isEmpty());
        verify(!ns.empty());

        // Get global shared to synchronize with logOp. Also see comments in the class
        // members declaration for more details.
        Lock::GlobalRead globalShared(txn->lockState());
        boost::lock_guard<boost::mutex> lk(_mutex);

        if (_active) {
            // Only one donor-side migration may run at a time.
            return false;
        }

        _ns = ns;
        _min = min;
        _max = max;
        _shardKeyPattern = shardKeyPattern;

        // A previous migration must have fully cleaned up (see done()).
        verify(_deleted.size() == 0);
        verify(_reload.size() == 0);
        verify(_memoryUsed == 0);

        _active = true;

        boost::lock_guard<boost::mutex> tLock(_cloneLocsMutex);
        verify(_cloneLocs.size() == 0);

        return true;
    }

    /**
     * Ends the current migration and resets all state so a new migration
     * can be started. Also exits the critical section, waking any waiters.
     */
    void done(OperationContext* txn) {
        log() << "MigrateFromStatus::done About to acquire global lock to exit critical "
            "section" << endl;

        // Get global shared to synchronize with logOp. Also see comments in the class
        // members declaration for more details.
        Lock::GlobalRead globalShared(txn->lockState());
        boost::lock_guard<boost::mutex> lk(_mutex);

        _active = false;
        _deleteNotifyExec.reset( NULL );
        _inCriticalSection = false;
        _inCriticalSectionCV.notify_all();

        _deleted.clear();
        _reload.clear();
        _memoryUsed = 0;

        boost::lock_guard<boost::mutex> cloneLock(_cloneLocsMutex);
        _cloneLocs.clear();
    }

    /**
     * Called for each replicated write on this shard (via logOpForSharding).
     * If the write touches the chunk being migrated, registers a RecoveryUnit
     * change that, on commit, queues the document's _id for re-transfer
     * (_reload for inserts/updates, _deleted for deletes).
     *
     * 'opstr' is the oplog op type ('i', 'u', 'd', 'n', 'c', ...); 'patt',
     * when non-NULL, carries the _id of the affected document.
     */
    void logOp(OperationContext* txn,
               const char* opstr,
               const char* ns,
               const BSONObj& obj,
               BSONObj* patt,
               bool notInActiveChunk) {
        ensureShardVersionOKOrThrow(ns);

        const char op = opstr[0];

        if (notInActiveChunk) {
            // Ignore writes that came from the migration process like cleanup so they
            // won't be transferred to the recipient shard. Also ignore ops from
            // _migrateClone and _transferMods since it is impossible to move a chunk
            // to self.
            return;
        }

        dassert(txn->lockState()->isWriteLocked()); // Must have Global IX.

        if (!_active)
            return;

        if (_ns != ns)
            return;

        // no need to log if this is not an insertion, an update, or an actual deletion
        // note: opstr 'db' isn't a deletion but a mention that a database exists
        // (for replication machinery mostly).
        if (op == 'n' || op == 'c' || (op == 'd' && opstr[1] == 'b'))
            return;

        BSONElement ide;
        if (patt)
            ide = patt->getField("_id");
        else
            ide = obj["_id"];

        if (ide.eoo()) {
            warning() << "logOpForSharding got mod with no _id, ignoring obj: "
                      << obj << migrateLog;
            return;
        }

        // Inserts outside the migrating range are irrelevant to the recipient.
        if (op == 'i' && (!isInRange(obj, _min, _max, _shardKeyPattern))) {
            return;
        }

        BSONObj idObj(ide.wrap());

        if (op == 'u') {
            // For updates, the op may only carry the _id; fetch the full document
            // to decide whether it (still) lies within the migrating range.
            BSONObj fullDoc;
            OldClientContext ctx(txn, _ns, false);
            if (!Helpers::findById(txn, ctx.db(), _ns.c_str(), idObj, fullDoc)) {
                warning() << "logOpForSharding couldn't find: " << idObj
                          << " even though should have" << migrateLog;
                dassert(false); // TODO: Abort the migration.
                return;
            }

            if (!isInRange(fullDoc, _min, _max, _shardKeyPattern)) {
                return;
            }
        }

        // Note: can't check if delete is in active chunk since the document is gone!
        txn->recoveryUnit()->registerChange(new LogOpForShardingHandler(this, idObj, op));
    }

    /**
     * Insert items from docIdList to a new array with the given fieldName in the given
     * builder. If explode is true, the inserted object will be the full version of the
     * document. Note that the whenever an item from the docList is inserted to the array,
     * it will also be removed from docList.
     *
     * Should be holding the collection lock for ns if explode is true.
     */
    void xfer(OperationContext* txn,
              const string& ns,
              Database* db,
              list<BSONObj> *docIdList,
              BSONObjBuilder& builder,
              const char* fieldName,
              long long& size,
              bool explode) {
        const long long maxSize = 1024 * 1024;

        // 'size' accumulates across calls; stop once roughly 1MB is reached.
        if (docIdList->size() == 0 || size > maxSize)
            return;

        BSONArrayBuilder arr(builder.subarrayStart(fieldName));

        list<BSONObj>::iterator docIdIter = docIdList->begin();
        while (docIdIter != docIdList->end() && size < maxSize) {
            BSONObj idDoc = *docIdIter;
            if (explode) {
                BSONObj fullDoc;
                // Documents deleted since being queued are silently skipped.
                if (Helpers::findById(txn, db, ns.c_str(), idDoc, fullDoc)) {
                    arr.append( fullDoc );
                    size += fullDoc.objsize();
                }
            }
            else {
                arr.append(idDoc);
                size += idDoc.objsize();
            }

            // Transferred (or skipped) entries are consumed from the queue.
            docIdIter = docIdList->erase(docIdIter);
        }

        arr.done();
    }

    /**
     * called from the dest of a migrate
     * transfers mods from src to dest
     */
    bool transferMods(OperationContext* txn, string& errmsg, BSONObjBuilder& b) {
        long long size = 0;

        {
            AutoGetCollectionForRead ctx(txn, getNS());

            boost::lock_guard<boost::mutex> sl(_mutex);
            if (!_active) {
                errmsg = "no active migration!";
                return false;
            }

            // TODO: fix SERVER-16540 race

            // Deleted _ids are sent as-is; reloaded docs are sent in full (explode).
            xfer(txn, _ns, ctx.getDb(), &_deleted, b, "deleted", size, false);
            xfer(txn, _ns, ctx.getDb(), &_reload, b, "reload", size, true);
        }

        b.append( "size" , size );

        return true;
    }

    /**
     * Get the disklocs that belong to the chunk migrated and sort them in _cloneLocs
     * (to avoid seeking disk later).
     *
     * @param maxChunkSize number of bytes beyond which a chunk's base data (no indices)
     * is considered too large to move.
     * @param errmsg filled with textual description of error if this call return false.
     * @return false if approximate chunk size is too big to move or true otherwise.
     */
    bool storeCurrentLocs(OperationContext* txn,
                          long long maxChunkSize,
                          string& errmsg,
                          BSONObjBuilder& result ) {
        AutoGetCollectionForRead ctx(txn, getNS());
        Collection* collection = ctx.getCollection();
        if ( !collection ) {
            errmsg = "ns not found, should be impossible";
            return false;
        }

        // Allow multiKey based on the invariant that shard keys must be single-valued.
        // Therefore, any multi-key index prefixed by shard key cannot be multikey over
        // the shard key fields.
        IndexDescriptor *idx =
            collection->getIndexCatalog()->findShardKeyPrefixedIndex(txn,
                                                                     _shardKeyPattern ,
                                                                     false); // requireSingleKey

        if (idx == NULL) {
            errmsg = str::stream() << "can't find index with prefix " << _shardKeyPattern
                                   << " in storeCurrentLocs for " << _ns;
            return false;
        }

        // Assume both min and max non-empty, append MinKey's to make them fit chosen index
        BSONObj min;
        BSONObj max;
        KeyPattern kp(idx->keyPattern());

        {
            // It's alright not to lock _mutex all the way through based on the assumption
            // that this is only called by the main thread that drives the migration and
            // only it can start and stop the current migration.
            boost::lock_guard<boost::mutex> sl(_mutex);

            invariant( _deleteNotifyExec.get() == NULL );
            WorkingSet* ws = new WorkingSet();
            DeleteNotificationStage* dns = new DeleteNotificationStage();
            PlanExecutor* deleteNotifyExec;
            // Takes ownership of 'ws' and 'dns'.
            Status execStatus = PlanExecutor::make(txn,
                                                   ws,
                                                   dns,
                                                   collection,
                                                   PlanExecutor::YIELD_MANUAL,
                                                   &deleteNotifyExec);
            invariant(execStatus.isOK());
            // Registering keeps this executor notified of deletions (see
            // DeleteNotificationStage::invalidate) for the migration's duration.
            deleteNotifyExec->registerExec();
            _deleteNotifyExec.reset(deleteNotifyExec);

            min = Helpers::toKeyFormat(kp.extendRangeBound(_min, false));
            max = Helpers::toKeyFormat(kp.extendRangeBound(_max, false));
        }

        auto_ptr<PlanExecutor> exec(
            InternalPlanner::indexScan(txn, collection, idx, min, max, false));
        // We can afford to yield here because any change to the base data that we might
        // miss is already being queued and will migrate in the 'transferMods' stage.
        exec->setYieldPolicy(PlanExecutor::YIELD_AUTO);

        // use the average object size to estimate how many objects a full chunk would carry
        // do that while traversing the chunk's range using the sharding index, below
        // there's a fair amount of slack before we determine a chunk is too large because object sizes will vary
        unsigned long long maxRecsWhenFull;
        long long avgRecSize;
        const long long totalRecs = collection->numRecords(txn);
        if ( totalRecs > 0 ) {
            avgRecSize = collection->dataSize(txn) / totalRecs;
            maxRecsWhenFull = maxChunkSize / avgRecSize;
            maxRecsWhenFull = std::min((unsigned long long)(Chunk::MaxObjectPerChunk + 1) , 130 * maxRecsWhenFull / 100 /* slack */ );
        }
        else {
            // Empty collection: fall back to the per-chunk object-count cap.
            avgRecSize = 0;
            maxRecsWhenFull = Chunk::MaxObjectPerChunk + 1;
        }

        // do a full traversal of the chunk and don't stop even if we think it is a large chunk
        // we want the number of records to better report, in that case
        bool isLargeChunk = false;
        unsigned long long recCount = 0;;
        RecordId dl;
        while (PlanExecutor::ADVANCED == exec->getNext(NULL, &dl)) {
            if ( ! isLargeChunk ) {
                boost::lock_guard<boost::mutex> lk(_cloneLocsMutex);
                _cloneLocs.insert( dl );
            }

            if ( ++recCount > maxRecsWhenFull ) {
                isLargeChunk = true;
                // continue on despite knowing that it will fail,
                // just to get the correct value for recCount.
            }
        }
        exec.reset();

        if ( isLargeChunk ) {
            boost::lock_guard<boost::mutex> sl(_mutex);
            warning() << "cannot move chunk: the maximum number of documents for a chunk is "
                      << maxRecsWhenFull << " , the maximum chunk size is " << maxChunkSize
                      << " , average document size is " << avgRecSize
                      << ". Found " << recCount << " documents in chunk "
                      << " ns: " << _ns << " "
                      << _min << " -> " << _max << migrateLog;

            result.appendBool( "chunkTooBig" , true );
            result.appendNumber( "estimatedChunkSize" , (long long)(recCount * avgRecSize) );
            errmsg = "chunk too big to move";
            return false;
        }

        log() << "moveChunk number of documents: " << cloneLocsRemaining() << migrateLog;

        txn->recoveryUnit()->commitAndRestart();
        return true;
    }

    /**
     * Builds one batch of full documents (in 'objects') from the RecordIds
     * gathered by storeCurrentLocs, consuming them from _cloneLocs. Batches
     * are capped near BSONObjMaxUserSize; the recipient calls repeatedly
     * (via _migrateClone) until _cloneLocs is drained.
     */
    bool clone(OperationContext* txn, string& errmsg , BSONObjBuilder& result ) {
        ElapsedTracker tracker(internalQueryExecYieldIterations,
                               internalQueryExecYieldPeriodMS);

        int allocSize = 0;
        {
            AutoGetCollectionForRead ctx(txn, getNS());

            boost::lock_guard<boost::mutex> sl(_mutex);
            if (!_active) {
                errmsg = "not active";
                return false;
            }

            Collection* collection = ctx.getCollection();
            if (!collection) {
                errmsg = str::stream() << "collection " << _ns << " does not exist";
                return false;
            }

            // Pre-size the builder from an estimate: ~12 bytes of array
            // overhead plus the average object size per remaining record.
            allocSize =
                std::min(BSONObjMaxUserSize,
                         static_cast<int>((12 + collection->averageObjectSize(txn)) *
                                          cloneLocsRemaining()));
        }

        bool isBufferFilled = false;
        BSONArrayBuilder clonedDocsArrayBuilder(allocSize);
        while (!isBufferFilled) {
            // Locks are reacquired each pass so the collection lock can be
            // yielded between scan intervals (see tracker below).
            AutoGetCollectionForRead ctx(txn, getNS());

            boost::lock_guard<boost::mutex> sl(_mutex);
            if (!_active) {
                errmsg = "not active";
                return false;
            }

            // TODO: fix SERVER-16540 race

            Collection* collection = ctx.getCollection();
            if (!collection) {
                errmsg = str::stream() << "collection " << _ns << " does not exist";
                return false;
            }

            boost::lock_guard<boost::mutex> lk(_cloneLocsMutex);
            set<RecordId>::iterator cloneLocsIter = _cloneLocs.begin();
            for ( ; cloneLocsIter != _cloneLocs.end(); ++cloneLocsIter) {
                if (tracker.intervalHasElapsed()) // should I yield?
                    break;

                RecordId dl = *cloneLocsIter;
                Snapshotted<BSONObj> doc;
                if (!collection->findDoc(txn, dl, &doc)) {
                    // doc was deleted
                    continue;
                }

                // Use the builder size instead of accumulating 'doc's size so that we take
                // into consideration the overhead of BSONArray indices, and *always*
                // append one doc.
                if (clonedDocsArrayBuilder.arrSize() != 0 &&
                    (clonedDocsArrayBuilder.len() + doc.value().objsize() + 1024)
                    > BSONObjMaxUserSize) {
                    isBufferFilled = true; // break out of outer while loop
                    break;
                }

                clonedDocsArrayBuilder.append(doc.value());
            }

            _cloneLocs.erase(_cloneLocs.begin(), cloneLocsIter);

            // Note: must be holding _cloneLocsMutex, don't move this inside while condition!
            if (_cloneLocs.empty()) {
                break;
            }
        }

        result.appendArray("objects", clonedDocsArrayBuilder.arr());
        return true;
    }

    void aboutToDelete( const RecordId& dl ) {
        // Even though above we call findDoc to check for existance
        // that check only works for non-mmapv1 engines, and this is needed
        // for mmapv1.

        boost::lock_guard<boost::mutex> lk(_cloneLocsMutex);
        _cloneLocs.erase( dl );
    }

    // Number of records still awaiting clone to the recipient.
    std::size_t cloneLocsRemaining() {
        boost::lock_guard<boost::mutex> lk(_cloneLocsMutex);
        return _cloneLocs.size();
    }

    // Megabytes consumed by the queued _reload/_deleted id lists.
    long long mbUsed() const {
        boost::lock_guard<boost::mutex> lk(_mutex);
        return _memoryUsed / ( 1024 * 1024 );
    }

    bool getInCriticalSection() const {
        boost::lock_guard<boost::mutex> lk(_mutex);
        return _inCriticalSection;
    }

    void setInCriticalSection( bool b ) {
        boost::lock_guard<boost::mutex> lk(_mutex);
        _inCriticalSection = b;
        _inCriticalSectionCV.notify_all();
    }

    std::string getNS() const {
        boost::lock_guard<boost::mutex> sl(_mutex);
        return _ns;
    }

    /**
     * @return true if we are NOT in the critical section
     */
    bool waitTillNotInCriticalSection( int maxSecondsToWait ) {
        boost::xtime xt;
        boost::xtime_get(&xt, MONGO_BOOST_TIME_UTC);
        xt.sec += maxSecondsToWait;

        boost::unique_lock<boost::mutex> lk(_mutex);
        while (_inCriticalSection) {
            // timed_wait returns false on timeout; give up rather than loop forever.
            if (!_inCriticalSectionCV.timed_wait(lk, xt))
                return false;
        }

        return true;
    }

    bool isActive() const { return _getActive(); }

private:
    bool _getActive() const { boost::lock_guard<boost::mutex> lk(_mutex); return _active; }
    void _setActive( bool b ) { boost::lock_guard<boost::mutex> lk(_mutex); _active = b; }

    /**
     * Used to commit work for LogOpForSharding. Used to keep track of changes in documents
     * that are part of a chunk being migrated.
     */
    class LogOpForShardingHandler : public RecoveryUnit::Change {
    public:
        /**
         * Invariant: idObj should belong to a document that is part of the active chunk
         * being migrated.
         */
        LogOpForShardingHandler(MigrateFromStatus* migrateFromStatus,
                                const BSONObj& idObj,
                                const char op):
            _migrateFromStatus(migrateFromStatus),
            _idObj(idObj.getOwned()),
            _op(op) {
        }

        // Runs only when the write actually commits: queue the _id for
        // _transferMods ('d' -> _deleted, 'i'/'u' -> _reload).
        virtual void commit() {
            switch (_op) {
            case 'd': {
                boost::lock_guard<boost::mutex> sl(_migrateFromStatus->_mutex);
                _migrateFromStatus->_deleted.push_back(_idObj);
                _migrateFromStatus->_memoryUsed += _idObj.firstElement().size() + 5;
                break;
            }

            case 'i':
            case 'u':
            {
                boost::lock_guard<boost::mutex> sl(_migrateFromStatus->_mutex);
                _migrateFromStatus->_reload.push_back(_idObj);
                _migrateFromStatus->_memoryUsed += _idObj.firstElement().size() + 5;
                break;
            }

            default:
                invariant(false);

            }
        }

        // Nothing to undo: the queues are only touched on commit.
        virtual void rollback() { }

    private:
        MigrateFromStatus* _migrateFromStatus;
        const BSONObj _idObj;
        const char _op;
    };

    /**
     * Used to receive invalidation notifications.
     *
     * XXX: move to the exec/ directory.
     */
    class DeleteNotificationStage : public PlanStage {
    public:
        virtual void invalidate(OperationContext* txn,
                                const RecordId& dl,
                                InvalidationType type);

        // This stage only exists to receive invalidate() callbacks; it is
        // never actually executed, so all PlanStage methods are unreachable.
        virtual StageState work(WorkingSetID* out) {
            invariant( false );
        }

        virtual bool isEOF() {
            invariant( false );
            return false;
        }

        virtual void kill() {
        }

        virtual void saveState() {
            invariant( false );
        }

        virtual void restoreState(OperationContext* opCtx) {
            invariant( false );
        }

        virtual PlanStageStats* getStats() {
            invariant( false );
            return NULL;
        }

        virtual CommonStats* getCommonStats() const {
            invariant( false );
            return NULL;
        }

        virtual SpecificStats* getSpecificStats() const {
            invariant( false );
            return NULL;
        }

        virtual std::vector<PlanStage*> getChildren() const {
            vector<PlanStage*> empty;
            return empty;
        }

        virtual StageType stageType() const {
            return STAGE_NOTIFY_DELETE;
        }
    };

    //
    // All member variables are labeled with one of the following codes indicating the
    // synchronization rules for accessing them.
    //
    // (M)  Must hold _mutex for access.
    // (MG) For reads, _mutex *OR* Global IX Lock must be held.
    //      For writes, the _mutex *AND* (Global Shared or Exclusive Lock) must be held.
    // (C)  Must hold _cloneLocsMutex for access.
    //
    // Locking order:
    //
    // Global Lock -> _mutex -> _cloneLocsMutex

    mutable mongo::mutex _mutex;

    boost::condition _inCriticalSectionCV;                                   // (M)

    // Is migration currently in critical section. This can be used to block new writes.
    bool _inCriticalSection;                                                 // (M)

    scoped_ptr<PlanExecutor> _deleteNotifyExec;                              // (M)

    // List of _id of documents that were modified that must be re-cloned.
    list<BSONObj> _reload;                                                   // (M)

    // List of _id of documents that were deleted during clone that should be deleted later.
    list<BSONObj> _deleted;                                                  // (M)

    // bytes in _reload + _deleted
    long long _memoryUsed;                                                   // (M)

    // If a migration is currently active.
    bool _active;                                                            // (MG)

    string _ns;                                                              // (MG)
    BSONObj _min;                                                            // (MG)
    BSONObj _max;                                                            // (MG)
    BSONObj _shardKeyPattern;                                                // (MG)

    mutable mongo::mutex _cloneLocsMutex;

    // List of record id that needs to be transferred from here to the other side.
    set<RecordId> _cloneLocs;                                                // (C)

} migrateFromStatus;
  738. void MigrateFromStatus::DeleteNotificationStage::invalidate(OperationContext *txn,
  739. const RecordId& dl,
  740. InvalidationType type) {
  741. if ( type == INVALIDATION_DELETION ) {
  742. migrateFromStatus.aboutToDelete( dl );
  743. }
  744. }
  745. struct MigrateStatusHolder {
  746. MigrateStatusHolder( OperationContext* txn,
  747. const std::string& ns ,
  748. const BSONObj& min ,
  749. const BSONObj& max ,
  750. const BSONObj& shardKeyPattern )
  751. : _txn(txn) {
  752. _isAnotherMigrationActive =
  753. !migrateFromStatus.start(txn, ns, min, max, shardKeyPattern);
  754. }
  755. ~MigrateStatusHolder() {
  756. if (!_isAnotherMigrationActive) {
  757. migrateFromStatus.done(_txn);
  758. }
  759. }
  760. bool isAnotherMigrationActive() const {
  761. return _isAnotherMigrationActive;
  762. }
  763. private:
  764. OperationContext* _txn;
  765. bool _isAnotherMigrationActive;
  766. };
/**
 * Free-function entry point that forwards a logged write to the global
 * migration state (see MigrateFromStatus::logOp), which queues documents
 * in the migrating chunk for re-transfer. Ops flagged 'notInActiveChunk'
 * are ignored by the callee.
 */
void logOpForSharding(OperationContext* txn,
                      const char * opstr,
                      const char * ns,
                      const BSONObj& obj,
                      BSONObj * patt,
                      bool notInActiveChunk) {
    migrateFromStatus.logOp(txn, opstr, ns, obj, patt, notInActiveChunk);
}
  775. class TransferModsCommand : public ChunkCommandHelper {
  776. public:
  777. void help(stringstream& h) const { h << "internal"; }
  778. TransferModsCommand() : ChunkCommandHelper( "_transferMods" ) {}
  779. virtual void addRequiredPrivileges(const std::string& dbname,
  780. const BSONObj& cmdObj,
  781. std::vector<Privilege>* out) {
  782. ActionSet actions;
  783. actions.addAction(ActionType::internal);
  784. out->push_back(Privilege(ResourcePattern::forClusterResource(), actions));
  785. }
  786. bool run(OperationContext* txn,
  787. const string&,
  788. BSONObj& cmdObj,
  789. int,
  790. string& errmsg,
  791. BSONObjBuilder& result) {
  792. return migrateFromStatus.transferMods(txn, errmsg, result);
  793. }
  794. } transferModsCommand;
  795. class InitialCloneCommand : public ChunkCommandHelper {
  796. public:
  797. void help(stringstream& h) const { h << "internal"; }
  798. InitialCloneCommand() : ChunkCommandHelper( "_migrateClone" ) {}
  799. virtual void addRequiredPrivileges(const std::string& dbname,
  800. const BSONObj& cmdObj,
  801. std::vector<Privilege>* out) {
  802. ActionSet actions;
  803. actions.addAction(ActionType::internal);
  804. out->push_back(Privilege(ResourcePattern::forClusterResource(), actions));
  805. }
  806. bool run(OperationContext* txn,
  807. const string&,
  808. BSONObj& cmdObj,
  809. int,
  810. string& errmsg,
  811. BSONObjBuilder& result) {
  812. return migrateFromStatus.clone(txn, errmsg, result);
  813. }
  814. } initialCloneCommand;
// Tests can pause / resume moveChunk's progress at each step by enabling / disabling each fail point.
// One fail point per numbered step of the donor-side moveChunk sequence (steps 1-6).
MONGO_FP_DECLARE(moveChunkHangAtStep1);
MONGO_FP_DECLARE(moveChunkHangAtStep2);
MONGO_FP_DECLARE(moveChunkHangAtStep3);
MONGO_FP_DECLARE(moveChunkHangAtStep4);
MONGO_FP_DECLARE(moveChunkHangAtStep5);
MONGO_FP_DECLARE(moveChunkHangAtStep6);
  822. /**
  823. * this is the main entry for moveChunk
  824. * called to initial a move
  825. * usually by a mongos
  826. * this is called on the "from" side
  827. *
  828. * Format:
  829. * {
  830. * moveChunk: "namespace",
  831. * from: "hostAndPort",
  832. * fromShard: "shardName",
  833. * to: "hostAndPort",
  834. * toShard: "shardName",
  835. * min: {},
  836. * max: {},
  837. * maxChunkBytes: numeric,
  838. * configdb: "hostAndPort",
  839. *
  840. * // optional
  841. * secondaryThrottle: bool, //defaults to true.
  842. * writeConcern: {} // applies to individual writes.
  843. * }
  844. */
  845. class MoveChunkCommand : public Command {
  846. public:
MoveChunkCommand() : Command( "moveChunk" ) {}
virtual void help( stringstream& help ) const {
    help << "should not be calling this directly";
}
// Must run on a primary: migration mutates both data and shard metadata.
virtual bool slaveOk() const { return false; }
// Only accepted through the admin database.
virtual bool adminOnly() const { return true; }
virtual bool isWriteCommandForConfigServer() const { return false; }
  854. virtual Status checkAuthForCommand(ClientBasic* client,
  855. const std::string& dbname,
  856. const BSONObj& cmdObj) {
  857. if (!AuthorizationSession::get(client)->isAuthorizedForActionsOnResource(
  858. ResourcePattern::forExactNamespace(NamespaceString(parseNs(dbname, cmdObj))),
  859. ActionType::moveChunk)) {
  860. return Status(ErrorCodes::Unauthorized, "Unauthorized");
  861. }
  862. return Status::OK();
  863. }
// moveChunk is admin-only, so the target namespace arrives fully qualified
// in the command object rather than being derived from dbname.
virtual std::string parseNs(const std::string& dbname, const BSONObj& cmdObj) const {
    return parseNsFullyQualified(dbname, cmdObj);
}
  867. bool run(OperationContext* txn,
  868. const string& dbname,
  869. BSONObj& cmdObj,
  870. int,
  871. string& errmsg,
  872. BSONObjBuilder& result) {
  873. // 1. Parse options
  874. // 2. Make sure my view is complete and lock the distributed lock to ensure shard
  875. // metadata stability.
  876. // 3. Migration
  877. // Retrieve all RecordIds, which need to be migrated in order to do as little seeking
  878. // as possible during transfer. Retrieval of the RecordIds happens under a collection
  879. // lock, but then the collection lock is dropped. This opens up an opportunity for
  880. // repair or compact to invalidate these RecordIds, because these commands do not
  881. // synchronized with migration. Note that data modifications are not a problem,
  882. // because we are registered for change notifications.
  883. //
  884. // 4. pause till migrate caught up
  885. // 5. LOCK
  886. // a) update my config, essentially locking
  887. // b) finish migrate
  888. // c) update config server
  889. // d) logChange to config server
  890. // 6. wait for all current cursors to expire
  891. // 7. remove data locally
  892. // -------------------------------
  893. // 1.
  894. string ns = parseNs(dbname, cmdObj);
  895. // The shard addresses, redundant, but allows for validation
  896. string toShardHost = cmdObj["to"].str();
  897. string fromShardHost = cmdObj["from"].str();
  898. // The shard names
  899. string toShardName = cmdObj["toShard"].str();
  900. string fromShardName = cmdObj["fromShard"].str();
  901. // Process secondary throttle settings and assign defaults if necessary.
  902. BSONObj secThrottleObj;
  903. WriteConcernOptions writeConcern;
  904. Status status = writeConcern.parseSecondaryThrottle(cmdObj, &secThrottleObj);
  905. if (!status.isOK()){
  906. if (status.code() != ErrorCodes::WriteConcernNotDefined) {
  907. warning() << status.toString() << endl;
  908. return appendCommandStatus(result, status);
  909. }
  910. writeConcern = getDefaultWriteConcern();
  911. }
  912. else {
  913. repl::ReplicationCoordinator* replCoordinator =
  914. repl::getGlobalReplicationCoordinator();
  915. if (replCoordinator->getReplicationMode() ==
  916. repl::ReplicationCoordinator::modeMasterSlave &&
  917. writeConcern.shouldWaitForOtherNodes()) {
  918. warning() << "moveChunk cannot check if secondary throttle setting "
  919. << writeConcern.toBSON()
  920. << " can be enforced in a master slave configuration";
  921. }
  922. Status status = replCoordinator->checkIfWriteConcernCanBeSatisfied(writeConcern);
  923. if (!status.isOK() && status != ErrorCodes::NoReplicationEnabled) {
  924. warning() << status.toString() << endl;
  925. return appendCommandStatus(result, status);
  926. }
  927. }
  928. if (writeConcern.shouldWaitForOtherNodes() &&
  929. writeConcern.wTimeout == WriteConcernOptions::kNoTimeout) {
  930. // Don't allow no timeout.
  931. writeConcern.wTimeout = kDefaultWTimeoutMs;
  932. }
  933. // Do inline deletion
  934. bool waitForDelete = cmdObj["waitForDelete"].trueValue();
  935. if (waitForDelete) {
  936. log() << "moveChunk waiting for full cleanup after move" << endl;
  937. }
  938. BSONObj min = cmdObj["min"].Obj();
  939. BSONObj max = cmdObj["max"].Obj();
  940. BSONElement maxSizeElem = cmdObj["maxChunkSizeBytes"];
  941. if ( ns.empty() ) {
  942. errmsg = "need to specify namespace in command";
  943. return false;
  944. }
  945. if ( toShardName.empty() ) {
  946. errmsg = "need to specify shard to move chunk to";
  947. return false;
  948. }
  949. if ( fromShardName.empty() ) {
  950. errmsg = "need to specify shard to move chunk from";
  951. return false;
  952. }
  953. if ( min.isEmpty() ) {
  954. errmsg = "need to specify a min";
  955. return false;
  956. }
  957. if ( max.isEmpty() ) {
  958. errmsg = "need to specify a max";
  959. return false;
  960. }
  961. if ( maxSizeElem.eoo() || ! maxSizeElem.isNumber() ) {
  962. errmsg = "need to specify maxChunkSizeBytes";
  963. return false;
  964. }
  965. const long long maxChunkSize = maxSizeElem.numberLong(); // in bytes
  966. // This could be the first call that enables sharding - make sure we initialize the
  967. // sharding state for this shard.
  968. if ( ! shardingState.enabled() ) {
  969. if ( cmdObj["configdb"].type() != String ) {
  970. errmsg = "sharding not enabled";
  971. warning() << errmsg << endl;
  972. return false;
  973. }
  974. string configdb = cmdObj["configdb"].String();
  975. ShardingState::initialize(configdb);
  976. }
  977. // Initialize our current shard name in the shard state if needed
  978. shardingState.gotShardName(fromShardName);
  979. // Make sure we're as up-to-date as possible with shard information
  980. // This catches the case where we had to previously changed a shard's host by
  981. // removing/adding a shard with the same name
  982. Shard::reloadShardInfo();
  983. Shard toShard(toShardName);
  984. Shard fromShard(fromShardName);
  985. ConnectionString configLoc = ConnectionString::parse(shardingState.getConfigServer(),
  986. errmsg);
  987. if (!configLoc.isValid()) {
  988. warning() << errmsg;
  989. return false;
  990. }
  991. MoveTimingHelper timing(txn, "from" , ns , min , max , 6 /* steps */ , &errmsg,
  992. toShardName, fromShardName );
  993. log() << "received moveChunk request: " << cmdObj << migrateLog;
  994. timing.done(1);
  995. MONGO_FP_PAUSE_WHILE(moveChunkHangAtStep1);
  996. // 2.
  997. if ( migrateFromStatus.isActive() ) {
  998. errmsg = "migration already in progress";
  999. warning() << errmsg << endl;
  1000. return false;
  1001. }
  1002. //
  1003. // Get the distributed lock
  1004. //
  1005. string whyMessage(str::stream() << "migrating chunk [" << minKey << ", " << maxKey
  1006. << ") in " << ns);
  1007. auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock(
  1008. ns, whyMessage);
  1009. if (!scopedDistLock.isOK()) {
  1010. errmsg = stream() << "could not acquire collection lock for " << ns
  1011. << " to migrate chunk [" << minKey << "," << maxKey << ")"
  1012. << causedBy(scopedDistLock.getStatus());
  1013. warning() << errmsg << endl;
  1014. return false;
  1015. }
  1016. BSONObj chunkInfo =
  1017. BSON("min" << min << "max" << max <<
  1018. "from" << fromShard.getName() << "to" << toShard.getName());
  1019. grid.catalogManager()->logChange(txn, "moveChunk.start", ns, chunkInfo);
  1020. // Always refresh our metadata remotely
  1021. ChunkVersion origShardVersion;
  1022. Status refreshStatus = shardingState.refreshMetadataNow(txn, ns, &origShardVersion);
  1023. if (!refreshStatus.isOK()) {
  1024. errmsg = str::stream() << "moveChunk cannot start migrate of chunk "
  1025. << "[" << minKey << "," << maxKey << ")"
  1026. << causedBy(refreshStatus.reason());
  1027. warning() << errmsg;
  1028. return false;
  1029. }
  1030. if (origShardVersion.majorVersion() == 0) {
  1031. // It makes no sense to migrate if our version is zero and we have no chunks
  1032. errmsg = str::stream() << "moveChunk cannot start migrate of chunk "
  1033. << "[" << minKey << "," << maxKey << ")"
  1034. << " with zero shard version";
  1035. warning() << errmsg;
  1036. return false;
  1037. }
  1038. // From mongos >= v3.0.
  1039. BSONElement epochElem(cmdObj["epoch"]);
  1040. if (epochElem.type() == jstOID) {
  1041. OID cmdEpoch = epochElem.OID();
  1042. if (cmdEpoch != origShardVersion.epoch()) {
  1043. errmsg = str::stream() << "moveChunk cannot move chunk "
  1044. << "[" << minKey << ","
  1045. << maxKey << "), "
  1046. << "collection may have been dropped. "
  1047. << "current epoch: " << origShardVersion.epoch()
  1048. << ", cmd epoch: " << cmdEpoch;
  1049. warning() << errmsg;
  1050. return false;
  1051. }
  1052. }
  1053. // Get collection metadata
  1054. const CollectionMetadataPtr origCollMetadata(shardingState.getCollectionMetadata(ns));
  1055. // With nonzero shard version, we must have metadata
  1056. invariant(NULL != origCollMetadata);
  1057. ChunkVersion origCollVersion = origCollMetadata->getCollVersion();
  1058. BSONObj shardKeyPattern = origCollMetadata->getKeyPattern();
  1059. // With nonzero shard version, we must have a coll version >= our shard version
  1060. invariant(origCollVersion >= origShardVersion);
  1061. // With nonzero shard version, we must have a shard key
  1062. invariant(!shardKeyPattern.isEmpty());
  1063. ChunkType origChunk;
  1064. if (!origCollMetadata->getNextChunk(min, &origChunk)
  1065. || origChunk.getMin().woCompare(min) || origChunk.getMax().woCompare(max)) {
  1066. // Our boundaries are different from those passed in
  1067. errmsg = str::stream() << "moveChunk cannot find chunk "
  1068. << "[" << minKey << "," << maxKey << ")"
  1069. << " to migrate, the chunk boundaries may be stale";
  1070. warning() << errmsg;
  1071. return false;
  1072. }
  1073. log() << "moveChunk request accepted at version " << origShardVersion;
  1074. timing.done(2);
  1075. MONGO_FP_PAUSE_WHILE(moveChunkHangAtStep2);
  1076. // 3.
  1077. MigrateStatusHolder statusHolder(txn, ns, min, max, shardKeyPattern);
  1078. if (statusHolder.isAnotherMigrationActive()) {
  1079. errmsg = "moveChunk is already in progress from this shard";
  1080. warning() << errmsg << endl;
  1081. return false;
  1082. }

Large files files are truncated, but you can click here to view the full file