PageRenderTime 452ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 1ms

/src/mongo/s/d_migrate.cpp

http://github.com/mongodb/mongo
C++ | 2942 lines | 2084 code | 487 blank | 371 comment | 273 complexity | a46ce05711deb0377b47f46d4360984d MD5 | raw file
Possible License(s): BSD-3-Clause-No-Nuclear-License-2014, GPL-2.0, Apache-2.0, BSD-3-Clause, WTFPL
  1. // d_migrate.cpp
  2. /**
  3. * Copyright (C) 2008-2014 MongoDB Inc.
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License, version 3,
  7. * as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU Affero General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Affero General Public License
  15. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. *
  17. * As a special exception, the copyright holders give permission to link the
  18. * code of portions of this program with the OpenSSL library under certain
  19. * conditions as described in each individual source file and distribute
  20. * linked combinations including the program with the OpenSSL library. You
  21. * must comply with the GNU Affero General Public License in all respects
  22. * for all of the code used other than as permitted herein. If you modify
  23. * file(s) with this exception, you may extend this exception to your
  24. * version of the file(s), but you are not obligated to do so. If you do not
  25. * wish to do so, delete this exception statement from your version. If you
  26. * delete this exception statement from all source files in the program,
  27. * then also delete it in the license file.
  28. */
  29. /**
  30. these are commands that live in mongod
  31. mostly around shard management and checking
  32. */
  33. #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kSharding
  34. #include "mongo/platform/basic.h"
  35. #include <algorithm>
  36. #include <boost/scoped_ptr.hpp>
  37. #include <boost/thread/thread.hpp>
  38. #include <map>
  39. #include <string>
  40. #include <vector>
  41. #include "mongo/client/connpool.h"
  42. #include "mongo/client/dbclientcursor.h"
  43. #include "mongo/db/auth/action_set.h"
  44. #include "mongo/db/auth/action_type.h"
  45. #include "mongo/db/auth/authorization_manager.h"
  46. #include "mongo/db/auth/authorization_manager_global.h"
  47. #include "mongo/db/auth/authorization_session.h"
  48. #include "mongo/db/auth/privilege.h"
  49. #include "mongo/db/catalog/document_validation.h"
  50. #include "mongo/db/catalog/index_create.h"
  51. #include "mongo/db/clientcursor.h"
  52. #include "mongo/db/commands.h"
  53. #include "mongo/db/concurrency/lock_state.h"
  54. #include "mongo/db/db_raii.h"
  55. #include "mongo/db/dbhelpers.h"
  56. #include "mongo/db/exec/plan_stage.h"
  57. #include "mongo/db/field_parser.h"
  58. #include "mongo/db/service_context.h"
  59. #include "mongo/db/hasher.h"
  60. #include "mongo/db/jsobj.h"
  61. #include "mongo/db/op_observer.h"
  62. #include "mongo/db/operation_context_impl.h"
  63. #include "mongo/db/ops/delete.h"
  64. #include "mongo/db/query/internal_plans.h"
  65. #include "mongo/db/query/query_knobs.h"
  66. #include "mongo/db/range_deleter_service.h"
  67. #include "mongo/db/repl/repl_client_info.h"
  68. #include "mongo/db/repl/replication_coordinator_global.h"
  69. #include "mongo/db/storage/mmap_v1/dur.h"
  70. #include "mongo/db/write_concern.h"
  71. #include "mongo/logger/ramlog.h"
  72. #include "mongo/s/catalog/catalog_manager.h"
  73. #include "mongo/s/catalog/type_chunk.h"
  74. #include "mongo/s/chunk.h"
  75. #include "mongo/s/chunk_version.h"
  76. #include "mongo/s/config.h"
  77. #include "mongo/s/d_state.h"
  78. #include "mongo/s/catalog/dist_lock_manager.h"
  79. #include "mongo/s/grid.h"
  80. #include "mongo/s/client/shard.h"
  81. #include "mongo/util/assert_util.h"
  82. #include "mongo/util/elapsed_tracker.h"
  83. #include "mongo/util/exit.h"
  84. #include "mongo/util/fail_point_service.h"
  85. #include "mongo/util/log.h"
  86. #include "mongo/util/processinfo.h"
  87. #include "mongo/util/queue.h"
  88. #include "mongo/util/startup_test.h"
  // Blocks the calling thread, re-checking every 100ms, for as long as the named
  // fail point stays enabled.
  #define MONGO_FP_PAUSE_WHILE(symbol)       \
      while (MONGO_FAIL_POINT(symbol)) {     \
          sleepmillis(100);                  \
      }
  91. using namespace std;
  92. namespace {
  93. using boost::scoped_ptr;
  94. using mongo::WriteConcernOptions;
  95. using mongo::repl::ReplicationCoordinator;
  96. using mongo::repl::OpTime;
  97. const int kDefaultWTimeoutMs = 60 * 1000;
  98. const WriteConcernOptions DefaultWriteConcern(2, WriteConcernOptions::NONE, kDefaultWTimeoutMs);
  99. /**
  100. * Returns the default write concern for migration cleanup (at donor shard) and
  101. * cloning documents (at recipient shard).
  102. */
  103. WriteConcernOptions getDefaultWriteConcern() {
  104. ReplicationCoordinator* replCoordinator =
  105. mongo::repl::getGlobalReplicationCoordinator();
  106. if (replCoordinator->getReplicationMode() ==
  107. mongo::repl::ReplicationCoordinator::modeReplSet) {
  108. mongo::Status status =
  109. replCoordinator->checkIfWriteConcernCanBeSatisfied(DefaultWriteConcern);
  110. if (status.isOK()) {
  111. return DefaultWriteConcern;
  112. }
  113. }
  114. return WriteConcernOptions(1, WriteConcernOptions::NONE, 0);
  115. }
  116. }
  117. namespace mongo {
  // Fail points declared for the migration code in this file. Their names suggest they
  // force failures at commit, config-write preparation and applyOps time respectively
  // (NOTE(review): the code that consults them is not visible in this chunk - confirm).
  MONGO_FP_DECLARE(failMigrationCommit);
  MONGO_FP_DECLARE(failMigrationConfigWritePrepare);
  MONGO_FP_DECLARE(failMigrationApplyOps);

  // Tee backed by the RamLog named "migrate"; log statements in this file stream it
  // (`<< migrateLog`) at the end of a message.
  Tee* migrateLog = RamLog::get("migrate");
  122. class MoveTimingHelper {
  123. public:
  124. MoveTimingHelper(OperationContext* txn,
  125. const string& where,
  126. const string& ns,
  127. BSONObj min,
  128. BSONObj max ,
  129. int total,
  130. string* cmdErrmsg,
  131. string toShard,
  132. string fromShard)
  133. : _txn(txn),
  134. _where(where),
  135. _ns(ns),
  136. _to(toShard),
  137. _from(fromShard),
  138. _next(0),
  139. _total(total),
  140. _cmdErrmsg(cmdErrmsg) {
  141. _b.append( "min" , min );
  142. _b.append( "max" , max );
  143. }
  144. ~MoveTimingHelper() {
  145. // even if logChange doesn't throw, bson does
  146. // sigh
  147. try {
  148. if ( !_to.empty() ){
  149. _b.append( "to", _to );
  150. }
  151. if ( !_from.empty() ){
  152. _b.append( "from", _from );
  153. }
  154. if ( _next != _total ) {
  155. _b.append( "note" , "aborted" );
  156. }
  157. else {
  158. _b.append( "note" , "success" );
  159. }
  160. if ( !_cmdErrmsg->empty() ) {
  161. _b.append( "errmsg" , *_cmdErrmsg );
  162. }
  163. grid.catalogManager()->logChange(_txn,
  164. (string)"moveChunk." + _where,
  165. _ns,
  166. _b.obj());
  167. }
  168. catch ( const std::exception& e ) {
  169. warning() << "couldn't record timing for moveChunk '" << _where << "': " << e.what() << migrateLog;
  170. }
  171. }
  172. void done(int step) {
  173. verify( step == ++_next );
  174. verify( step <= _total );
  175. stringstream ss;
  176. ss << "step " << step << " of " << _total;
  177. string s = ss.str();
  178. CurOp * op = _txn->getCurOp();
  179. if ( op )
  180. op->setMessage( s.c_str() );
  181. else
  182. warning() << "op is null in MoveTimingHelper::done" << migrateLog;
  183. _b.appendNumber( s , _t.millis() );
  184. _t.reset();
  185. #if 0
  186. // debugging for memory leak?
  187. ProcessInfo pi;
  188. ss << " v:" << pi.getVirtualMemorySize()
  189. << " r:" << pi.getResidentSize();
  190. log() << ss.str() << migrateLog;
  191. #endif
  192. }
  193. private:
  194. OperationContext* const _txn;
  195. Timer _t;
  196. string _where;
  197. string _ns;
  198. string _to;
  199. string _from;
  200. int _next;
  201. int _total; // expected # of steps
  202. const string* _cmdErrmsg;
  203. BSONObjBuilder _b;
  204. };
  205. class ChunkCommandHelper : public Command {
  206. public:
  207. ChunkCommandHelper( const char * name )
  208. : Command( name ) {
  209. }
  210. virtual void help( stringstream& help ) const {
  211. help << "internal - should not be called directly";
  212. }
  213. virtual bool slaveOk() const { return false; }
  214. virtual bool adminOnly() const { return true; }
  215. virtual bool isWriteCommandForConfigServer() const { return false; }
  216. };
  217. bool isInRange( const BSONObj& obj ,
  218. const BSONObj& min ,
  219. const BSONObj& max ,
  220. const BSONObj& shardKeyPattern ) {
  221. ShardKeyPattern shardKey( shardKeyPattern );
  222. BSONObj k = shardKey.extractShardKeyFromDoc( obj );
  223. return k.woCompare( min ) >= 0 && k.woCompare( max ) < 0;
  224. }
  225. class MigrateFromStatus {
  226. public:
  227. MigrateFromStatus():
  228. _inCriticalSection(false),
  229. _memoryUsed(0),
  230. _active(false) {
  231. }
  232. /**
  233. * @return false if cannot start. One of the reason for not being able to
  234. * start is there is already an existing migration in progress.
  235. */
  236. bool start(OperationContext* txn,
  237. const std::string& ns,
  238. const BSONObj& min,
  239. const BSONObj& max,
  240. const BSONObj& shardKeyPattern) {
  241. verify(!min.isEmpty());
  242. verify(!max.isEmpty());
  243. verify(!ns.empty());
  244. // Get global shared to synchronize with logOp. Also see comments in the class
  245. // members declaration for more details.
  246. Lock::GlobalRead globalShared(txn->lockState());
  247. boost::lock_guard<boost::mutex> lk(_mutex);
  248. if (_active) {
  249. return false;
  250. }
  251. _ns = ns;
  252. _min = min;
  253. _max = max;
  254. _shardKeyPattern = shardKeyPattern;
  255. verify(_deleted.size() == 0);
  256. verify(_reload.size() == 0);
  257. verify(_memoryUsed == 0);
  258. _active = true;
  259. boost::lock_guard<boost::mutex> tLock(_cloneLocsMutex);
  260. verify(_cloneLocs.size() == 0);
  261. return true;
  262. }
  263. void done(OperationContext* txn) {
  264. log() << "MigrateFromStatus::done About to acquire global lock to exit critical "
  265. "section" << endl;
  266. // Get global shared to synchronize with logOp. Also see comments in the class
  267. // members declaration for more details.
  268. Lock::GlobalRead globalShared(txn->lockState());
  269. boost::lock_guard<boost::mutex> lk(_mutex);
  270. _active = false;
  271. _deleteNotifyExec.reset( NULL );
  272. _inCriticalSection = false;
  273. _inCriticalSectionCV.notify_all();
  274. _deleted.clear();
  275. _reload.clear();
  276. _memoryUsed = 0;
  277. boost::lock_guard<boost::mutex> cloneLock(_cloneLocsMutex);
  278. _cloneLocs.clear();
  279. }
  280. void logOp(OperationContext* txn,
  281. const char* opstr,
  282. const char* ns,
  283. const BSONObj& obj,
  284. BSONObj* patt,
  285. bool notInActiveChunk) {
  286. ensureShardVersionOKOrThrow(ns);
  287. const char op = opstr[0];
  288. if (notInActiveChunk) {
  289. // Ignore writes that came from the migration process like cleanup so they
  290. // won't be transferred to the recipient shard. Also ignore ops from
  291. // _migrateClone and _transferMods since it is impossible to move a chunk
  292. // to self.
  293. return;
  294. }
  295. dassert(txn->lockState()->isWriteLocked()); // Must have Global IX.
  296. if (!_active)
  297. return;
  298. if (_ns != ns)
  299. return;
  300. // no need to log if this is not an insertion, an update, or an actual deletion
  301. // note: opstr 'db' isn't a deletion but a mention that a database exists
  302. // (for replication machinery mostly).
  303. if (op == 'n' || op == 'c' || (op == 'd' && opstr[1] == 'b'))
  304. return;
  305. BSONElement ide;
  306. if (patt)
  307. ide = patt->getField("_id");
  308. else
  309. ide = obj["_id"];
  310. if (ide.eoo()) {
  311. warning() << "logOpForSharding got mod with no _id, ignoring obj: "
  312. << obj << migrateLog;
  313. return;
  314. }
  315. if (op == 'i' && (!isInRange(obj, _min, _max, _shardKeyPattern))) {
  316. return;
  317. }
  318. BSONObj idObj(ide.wrap());
  319. if (op == 'u') {
  320. BSONObj fullDoc;
  321. OldClientContext ctx(txn, _ns, false);
  322. if (!Helpers::findById(txn, ctx.db(), _ns.c_str(), idObj, fullDoc)) {
  323. warning() << "logOpForSharding couldn't find: " << idObj
  324. << " even though should have" << migrateLog;
  325. dassert(false); // TODO: Abort the migration.
  326. return;
  327. }
  328. if (!isInRange(fullDoc, _min, _max, _shardKeyPattern)) {
  329. return;
  330. }
  331. }
  332. // Note: can't check if delete is in active chunk since the document is gone!
  333. txn->recoveryUnit()->registerChange(new LogOpForShardingHandler(this, idObj, op));
  334. }
  335. /**
  336. * Insert items from docIdList to a new array with the given fieldName in the given
  337. * builder. If explode is true, the inserted object will be the full version of the
  338. * document. Note that the whenever an item from the docList is inserted to the array,
  339. * it will also be removed from docList.
  340. *
  341. * Should be holding the collection lock for ns if explode is true.
  342. */
  343. void xfer(OperationContext* txn,
  344. const string& ns,
  345. Database* db,
  346. list<BSONObj> *docIdList,
  347. BSONObjBuilder& builder,
  348. const char* fieldName,
  349. long long& size,
  350. bool explode) {
  351. const long long maxSize = 1024 * 1024;
  352. if (docIdList->size() == 0 || size > maxSize)
  353. return;
  354. BSONArrayBuilder arr(builder.subarrayStart(fieldName));
  355. list<BSONObj>::iterator docIdIter = docIdList->begin();
  356. while (docIdIter != docIdList->end() && size < maxSize) {
  357. BSONObj idDoc = *docIdIter;
  358. if (explode) {
  359. BSONObj fullDoc;
  360. if (Helpers::findById(txn, db, ns.c_str(), idDoc, fullDoc)) {
  361. arr.append( fullDoc );
  362. size += fullDoc.objsize();
  363. }
  364. }
  365. else {
  366. arr.append(idDoc);
  367. size += idDoc.objsize();
  368. }
  369. docIdIter = docIdList->erase(docIdIter);
  370. }
  371. arr.done();
  372. }
  373. /**
  374. * called from the dest of a migrate
  375. * transfers mods from src to dest
  376. */
  377. bool transferMods(OperationContext* txn, string& errmsg, BSONObjBuilder& b) {
  378. long long size = 0;
  379. {
  380. AutoGetCollectionForRead ctx(txn, getNS());
  381. boost::lock_guard<boost::mutex> sl(_mutex);
  382. if (!_active) {
  383. errmsg = "no active migration!";
  384. return false;
  385. }
  386. // TODO: fix SERVER-16540 race
  387. xfer(txn, _ns, ctx.getDb(), &_deleted, b, "deleted", size, false);
  388. xfer(txn, _ns, ctx.getDb(), &_reload, b, "reload", size, true);
  389. }
  390. b.append( "size" , size );
  391. return true;
  392. }
  393. /**
  394. * Get the disklocs that belong to the chunk migrated and sort them in _cloneLocs
  395. * (to avoid seeking disk later).
  396. *
  397. * @param maxChunkSize number of bytes beyond which a chunk's base data (no indices)
  398. * is considered too large to move.
  399. * @param errmsg filled with textual description of error if this call return false.
  400. * @return false if approximate chunk size is too big to move or true otherwise.
  401. */
  402. bool storeCurrentLocs(OperationContext* txn,
  403. long long maxChunkSize,
  404. string& errmsg,
  405. BSONObjBuilder& result ) {
  406. AutoGetCollectionForRead ctx(txn, getNS());
  407. Collection* collection = ctx.getCollection();
  408. if ( !collection ) {
  409. errmsg = "ns not found, should be impossible";
  410. return false;
  411. }
  412. // Allow multiKey based on the invariant that shard keys must be single-valued.
  413. // Therefore, any multi-key index prefixed by shard key cannot be multikey over
  414. // the shard key fields.
  415. IndexDescriptor *idx =
  416. collection->getIndexCatalog()->findShardKeyPrefixedIndex(txn,
  417. _shardKeyPattern ,
  418. false); // requireSingleKey
  419. if (idx == NULL) {
  420. errmsg = str::stream() << "can't find index with prefix " << _shardKeyPattern
  421. << " in storeCurrentLocs for " << _ns;
  422. return false;
  423. }
  424. // Assume both min and max non-empty, append MinKey's to make them fit chosen index
  425. BSONObj min;
  426. BSONObj max;
  427. KeyPattern kp(idx->keyPattern());
  428. {
  429. // It's alright not to lock _mutex all the way through based on the assumption
  430. // that this is only called by the main thread that drives the migration and
  431. // only it can start and stop the current migration.
  432. boost::lock_guard<boost::mutex> sl(_mutex);
  433. invariant( _deleteNotifyExec.get() == NULL );
  434. WorkingSet* ws = new WorkingSet();
  435. DeleteNotificationStage* dns = new DeleteNotificationStage();
  436. PlanExecutor* deleteNotifyExec;
  437. // Takes ownership of 'ws' and 'dns'.
  438. Status execStatus = PlanExecutor::make(txn,
  439. ws,
  440. dns,
  441. collection,
  442. PlanExecutor::YIELD_MANUAL,
  443. &deleteNotifyExec);
  444. invariant(execStatus.isOK());
  445. deleteNotifyExec->registerExec();
  446. _deleteNotifyExec.reset(deleteNotifyExec);
  447. min = Helpers::toKeyFormat(kp.extendRangeBound(_min, false));
  448. max = Helpers::toKeyFormat(kp.extendRangeBound(_max, false));
  449. }
  450. auto_ptr<PlanExecutor> exec(
  451. InternalPlanner::indexScan(txn, collection, idx, min, max, false));
  452. // We can afford to yield here because any change to the base data that we might
  453. // miss is already being queued and will migrate in the 'transferMods' stage.
  454. exec->setYieldPolicy(PlanExecutor::YIELD_AUTO);
  455. // use the average object size to estimate how many objects a full chunk would carry
  456. // do that while traversing the chunk's range using the sharding index, below
  457. // there's a fair amount of slack before we determine a chunk is too large because object sizes will vary
  458. unsigned long long maxRecsWhenFull;
  459. long long avgRecSize;
  460. const long long totalRecs = collection->numRecords(txn);
  461. if ( totalRecs > 0 ) {
  462. avgRecSize = collection->dataSize(txn) / totalRecs;
  463. maxRecsWhenFull = maxChunkSize / avgRecSize;
  464. maxRecsWhenFull = std::min((unsigned long long)(Chunk::MaxObjectPerChunk + 1) , 130 * maxRecsWhenFull / 100 /* slack */ );
  465. }
  466. else {
  467. avgRecSize = 0;
  468. maxRecsWhenFull = Chunk::MaxObjectPerChunk + 1;
  469. }
  470. // do a full traversal of the chunk and don't stop even if we think it is a large chunk
  471. // we want the number of records to better report, in that case
  472. bool isLargeChunk = false;
  473. unsigned long long recCount = 0;;
  474. RecordId dl;
  475. while (PlanExecutor::ADVANCED == exec->getNext(NULL, &dl)) {
  476. if ( ! isLargeChunk ) {
  477. boost::lock_guard<boost::mutex> lk(_cloneLocsMutex);
  478. _cloneLocs.insert( dl );
  479. }
  480. if ( ++recCount > maxRecsWhenFull ) {
  481. isLargeChunk = true;
  482. // continue on despite knowing that it will fail,
  483. // just to get the correct value for recCount.
  484. }
  485. }
  486. exec.reset();
  487. if ( isLargeChunk ) {
  488. boost::lock_guard<boost::mutex> sl(_mutex);
  489. warning() << "cannot move chunk: the maximum number of documents for a chunk is "
  490. << maxRecsWhenFull << " , the maximum chunk size is " << maxChunkSize
  491. << " , average document size is " << avgRecSize
  492. << ". Found " << recCount << " documents in chunk "
  493. << " ns: " << _ns << " "
  494. << _min << " -> " << _max << migrateLog;
  495. result.appendBool( "chunkTooBig" , true );
  496. result.appendNumber( "estimatedChunkSize" , (long long)(recCount * avgRecSize) );
  497. errmsg = "chunk too big to move";
  498. return false;
  499. }
  500. log() << "moveChunk number of documents: " << cloneLocsRemaining() << migrateLog;
  501. txn->recoveryUnit()->commitAndRestart();
  502. return true;
  503. }
  504. bool clone(OperationContext* txn, string& errmsg , BSONObjBuilder& result ) {
  505. ElapsedTracker tracker(internalQueryExecYieldIterations,
  506. internalQueryExecYieldPeriodMS);
  507. int allocSize = 0;
  508. {
  509. AutoGetCollectionForRead ctx(txn, getNS());
  510. boost::lock_guard<boost::mutex> sl(_mutex);
  511. if (!_active) {
  512. errmsg = "not active";
  513. return false;
  514. }
  515. Collection* collection = ctx.getCollection();
  516. if (!collection) {
  517. errmsg = str::stream() << "collection " << _ns << " does not exist";
  518. return false;
  519. }
  520. allocSize =
  521. std::min(BSONObjMaxUserSize,
  522. static_cast<int>((12 + collection->averageObjectSize(txn)) *
  523. cloneLocsRemaining()));
  524. }
  525. bool isBufferFilled = false;
  526. BSONArrayBuilder clonedDocsArrayBuilder(allocSize);
  527. while (!isBufferFilled) {
  528. AutoGetCollectionForRead ctx(txn, getNS());
  529. boost::lock_guard<boost::mutex> sl(_mutex);
  530. if (!_active) {
  531. errmsg = "not active";
  532. return false;
  533. }
  534. // TODO: fix SERVER-16540 race
  535. Collection* collection = ctx.getCollection();
  536. if (!collection) {
  537. errmsg = str::stream() << "collection " << _ns << " does not exist";
  538. return false;
  539. }
  540. boost::lock_guard<boost::mutex> lk(_cloneLocsMutex);
  541. set<RecordId>::iterator cloneLocsIter = _cloneLocs.begin();
  542. for ( ; cloneLocsIter != _cloneLocs.end(); ++cloneLocsIter) {
  543. if (tracker.intervalHasElapsed()) // should I yield?
  544. break;
  545. RecordId dl = *cloneLocsIter;
  546. Snapshotted<BSONObj> doc;
  547. if (!collection->findDoc(txn, dl, &doc)) {
  548. // doc was deleted
  549. continue;
  550. }
  551. // Use the builder size instead of accumulating 'doc's size so that we take
  552. // into consideration the overhead of BSONArray indices, and *always*
  553. // append one doc.
  554. if (clonedDocsArrayBuilder.arrSize() != 0 &&
  555. (clonedDocsArrayBuilder.len() + doc.value().objsize() + 1024)
  556. > BSONObjMaxUserSize) {
  557. isBufferFilled = true; // break out of outer while loop
  558. break;
  559. }
  560. clonedDocsArrayBuilder.append(doc.value());
  561. }
  562. _cloneLocs.erase(_cloneLocs.begin(), cloneLocsIter);
  563. // Note: must be holding _cloneLocsMutex, don't move this inside while condition!
  564. if (_cloneLocs.empty()) {
  565. break;
  566. }
  567. }
  568. result.appendArray("objects", clonedDocsArrayBuilder.arr());
  569. return true;
  570. }
  571. void aboutToDelete( const RecordId& dl ) {
  572. // Even though above we call findDoc to check for existance
  573. // that check only works for non-mmapv1 engines, and this is needed
  574. // for mmapv1.
  575. boost::lock_guard<boost::mutex> lk(_cloneLocsMutex);
  576. _cloneLocs.erase( dl );
  577. }
  578. std::size_t cloneLocsRemaining() {
  579. boost::lock_guard<boost::mutex> lk(_cloneLocsMutex);
  580. return _cloneLocs.size();
  581. }
  582. long long mbUsed() const {
  583. boost::lock_guard<boost::mutex> lk(_mutex);
  584. return _memoryUsed / ( 1024 * 1024 );
  585. }
  586. bool getInCriticalSection() const {
  587. boost::lock_guard<boost::mutex> lk(_mutex);
  588. return _inCriticalSection;
  589. }
  590. void setInCriticalSection( bool b ) {
  591. boost::lock_guard<boost::mutex> lk(_mutex);
  592. _inCriticalSection = b;
  593. _inCriticalSectionCV.notify_all();
  594. }
  595. std::string getNS() const {
  596. boost::lock_guard<boost::mutex> sl(_mutex);
  597. return _ns;
  598. }
  599. /**
  600. * @return true if we are NOT in the critical section
  601. */
  602. bool waitTillNotInCriticalSection( int maxSecondsToWait ) {
  603. boost::xtime xt;
  604. boost::xtime_get(&xt, MONGO_BOOST_TIME_UTC);
  605. xt.sec += maxSecondsToWait;
  606. boost::unique_lock<boost::mutex> lk(_mutex);
  607. while (_inCriticalSection) {
  608. if (!_inCriticalSectionCV.timed_wait(lk, xt))
  609. return false;
  610. }
  611. return true;
  612. }
  613. bool isActive() const { return _getActive(); }
  614. private:
  615. bool _getActive() const { boost::lock_guard<boost::mutex> lk(_mutex); return _active; }
  616. void _setActive( bool b ) { boost::lock_guard<boost::mutex> lk(_mutex); _active = b; }
  617. /**
  618. * Used to commit work for LogOpForSharding. Used to keep track of changes in documents
  619. * that are part of a chunk being migrated.
  620. */
  621. class LogOpForShardingHandler : public RecoveryUnit::Change {
  622. public:
  623. /**
  624. * Invariant: idObj should belong to a document that is part of the active chunk
  625. * being migrated.
  626. */
  627. LogOpForShardingHandler(MigrateFromStatus* migrateFromStatus,
  628. const BSONObj& idObj,
  629. const char op):
  630. _migrateFromStatus(migrateFromStatus),
  631. _idObj(idObj.getOwned()),
  632. _op(op) {
  633. }
  634. virtual void commit() {
  635. switch (_op) {
  636. case 'd': {
  637. boost::lock_guard<boost::mutex> sl(_migrateFromStatus->_mutex);
  638. _migrateFromStatus->_deleted.push_back(_idObj);
  639. _migrateFromStatus->_memoryUsed += _idObj.firstElement().size() + 5;
  640. break;
  641. }
  642. case 'i':
  643. case 'u':
  644. {
  645. boost::lock_guard<boost::mutex> sl(_migrateFromStatus->_mutex);
  646. _migrateFromStatus->_reload.push_back(_idObj);
  647. _migrateFromStatus->_memoryUsed += _idObj.firstElement().size() + 5;
  648. break;
  649. }
  650. default:
  651. invariant(false);
  652. }
  653. }
  654. virtual void rollback() { }
  655. private:
  656. MigrateFromStatus* _migrateFromStatus;
  657. const BSONObj _idObj;
  658. const char _op;
  659. };
  660. /**
  661. * Used to receive invalidation notifications.
  662. *
  663. * XXX: move to the exec/ directory.
  664. */
  665. class DeleteNotificationStage : public PlanStage {
  666. public:
  667. virtual void invalidate(OperationContext* txn,
  668. const RecordId& dl,
  669. InvalidationType type);
  670. virtual StageState work(WorkingSetID* out) {
  671. invariant( false );
  672. }
  673. virtual bool isEOF() {
  674. invariant( false );
  675. return false;
  676. }
  677. virtual void kill() {
  678. }
  679. virtual void saveState() {
  680. invariant( false );
  681. }
  682. virtual void restoreState(OperationContext* opCtx) {
  683. invariant( false );
  684. }
  685. virtual PlanStageStats* getStats() {
  686. invariant( false );
  687. return NULL;
  688. }
  689. virtual CommonStats* getCommonStats() const {
  690. invariant( false );
  691. return NULL;
  692. }
  693. virtual SpecificStats* getSpecificStats() const {
  694. invariant( false );
  695. return NULL;
  696. }
  697. virtual std::vector<PlanStage*> getChildren() const {
  698. vector<PlanStage*> empty;
  699. return empty;
  700. }
  701. virtual StageType stageType() const {
  702. return STAGE_NOTIFY_DELETE;
  703. }
  704. };
  //
  // All member variables are labeled with one of the following codes indicating the
  // synchronization rules for accessing them.
  //
  // (M)  Must hold _mutex for access.
  // (MG) For reads, _mutex *OR* Global IX Lock must be held.
  //      For writes, the _mutex *AND* (Global Shared or Exclusive Lock) must be held.
  // (C)  Must hold _cloneLocsMutex for access.
  //
  // Locking order:
  //
  // Global Lock -> _mutex -> _cloneLocsMutex

  // Declared mutable so that the const accessors of this class can lock it.
  mutable mongo::mutex _mutex;

  // Notified whenever _inCriticalSection is written; waited on by
  // waitTillNotInCriticalSection().
  boost::condition _inCriticalSectionCV; // (M)

  // Is migration currently in critical section. This can be used to block new writes.
  bool _inCriticalSection; // (M)

  // Executor wrapping a DeleteNotificationStage; created in storeCurrentLocs() and
  // released in done().
  scoped_ptr<PlanExecutor> _deleteNotifyExec; // (M)

  // List of _id of documents that were modified that must be re-cloned.
  list<BSONObj> _reload; // (M)

  // List of _id of documents that were deleted during clone that should be deleted later.
  list<BSONObj> _deleted; // (M)

  // bytes in _reload + _deleted
  long long _memoryUsed; // (M)

  // If a migration is currently active.
  bool _active; // (MG)

  // Namespace, chunk bounds and shard key pattern of the active migration, as passed
  // to start().
  string _ns; // (MG)
  BSONObj _min; // (MG)
  BSONObj _max; // (MG)
  BSONObj _shardKeyPattern; // (MG)

  mutable mongo::mutex _cloneLocsMutex;

  // List of record id that needs to be transferred from here to the other side.
  set<RecordId> _cloneLocs; // (C)
  737. } migrateFromStatus;
  738. void MigrateFromStatus::DeleteNotificationStage::invalidate(OperationContext *txn,
  739. const RecordId& dl,
  740. InvalidationType type) {
  741. if ( type == INVALIDATION_DELETION ) {
  742. migrateFromStatus.aboutToDelete( dl );
  743. }
  744. }
  745. struct MigrateStatusHolder {
  746. MigrateStatusHolder( OperationContext* txn,
  747. const std::string& ns ,
  748. const BSONObj& min ,
  749. const BSONObj& max ,
  750. const BSONObj& shardKeyPattern )
  751. : _txn(txn) {
  752. _isAnotherMigrationActive =
  753. !migrateFromStatus.start(txn, ns, min, max, shardKeyPattern);
  754. }
  755. ~MigrateStatusHolder() {
  756. if (!_isAnotherMigrationActive) {
  757. migrateFromStatus.done(_txn);
  758. }
  759. }
  760. bool isAnotherMigrationActive() const {
  761. return _isAnotherMigrationActive;
  762. }
  763. private:
  764. OperationContext* _txn;
  765. bool _isAnotherMigrationActive;
  766. };
  767. void logOpForSharding(OperationContext* txn,
  768. const char * opstr,
  769. const char * ns,
  770. const BSONObj& obj,
  771. BSONObj * patt,
  772. bool notInActiveChunk) {
  773. migrateFromStatus.logOp(txn, opstr, ns, obj, patt, notInActiveChunk);
  774. }
  775. class TransferModsCommand : public ChunkCommandHelper {
  776. public:
  777. void help(stringstream& h) const { h << "internal"; }
  778. TransferModsCommand() : ChunkCommandHelper( "_transferMods" ) {}
  779. virtual void addRequiredPrivileges(const std::string& dbname,
  780. const BSONObj& cmdObj,
  781. std::vector<Privilege>* out) {
  782. ActionSet actions;
  783. actions.addAction(ActionType::internal);
  784. out->push_back(Privilege(ResourcePattern::forClusterResource(), actions));
  785. }
  786. bool run(OperationContext* txn,
  787. const string&,
  788. BSONObj& cmdObj,
  789. int,
  790. string& errmsg,
  791. BSONObjBuilder& result) {
  792. return migrateFromStatus.transferMods(txn, errmsg, result);
  793. }
  794. } transferModsCommand;
  795. class InitialCloneCommand : public ChunkCommandHelper {
  796. public:
  797. void help(stringstream& h) const { h << "internal"; }
  798. InitialCloneCommand() : ChunkCommandHelper( "_migrateClone" ) {}
  799. virtual void addRequiredPrivileges(const std::string& dbname,
  800. const BSONObj& cmdObj,
  801. std::vector<Privilege>* out) {
  802. ActionSet actions;
  803. actions.addAction(ActionType::internal);
  804. out->push_back(Privilege(ResourcePattern::forClusterResource(), actions));
  805. }
  806. bool run(OperationContext* txn,
  807. const string&,
  808. BSONObj& cmdObj,
  809. int,
  810. string& errmsg,
  811. BSONObjBuilder& result) {
  812. return migrateFromStatus.clone(txn, errmsg, result);
  813. }
  814. } initialCloneCommand;
  // Tests can pause / resume moveChunk's progress at each step by enabling / disabling
  // each fail point. One fail point is declared per step, 1 through 6.
  // NOTE(review): presumably consumed through MONGO_FP_PAUSE_WHILE - the call sites
  // are not visible in this chunk, confirm.
  MONGO_FP_DECLARE(moveChunkHangAtStep1);
  MONGO_FP_DECLARE(moveChunkHangAtStep2);
  MONGO_FP_DECLARE(moveChunkHangAtStep3);
  MONGO_FP_DECLARE(moveChunkHangAtStep4);
  MONGO_FP_DECLARE(moveChunkHangAtStep5);
  MONGO_FP_DECLARE(moveChunkHangAtStep6);
  822. /**
  823. * this is the main entry for moveChunk
  824. * called to initiate a move
  825. * usually by a mongos
  826. * this is called on the "from" side
  827. *
  828. * Format:
  829. * {
  830. * moveChunk: "namespace",
  831. * from: "hostAndPort",
  832. * fromShard: "shardName",
  833. * to: "hostAndPort",
  834. * toShard: "shardName",
  835. * min: {},
  836. * max: {},
  837. * maxChunkBytes: numeric,
  838. * configdb: "hostAndPort",
  839. *
  840. * // optional
  841. * secondaryThrottle: bool, //defaults to true.
  842. * writeConcern: {} // applies to individual writes.
  843. * }
  844. */
  845. class MoveChunkCommand : public Command {
  846. public:
  847. MoveChunkCommand() : Command( "moveChunk" ) {}
  848. virtual void help( stringstream& help ) const {
  849. help << "should not be calling this directly";
  850. }
  851. virtual bool slaveOk() const { return false; }
  852. virtual bool adminOnly() const { return true; }
  853. virtual bool isWriteCommandForConfigServer() const { return false; }
  854. virtual Status checkAuthForCommand(ClientBasic* client,
  855. const std::string& dbname,
  856. const BSONObj& cmdObj) {
  857. if (!AuthorizationSession::get(client)->isAuthorizedForActionsOnResource(
  858. ResourcePattern::forExactNamespace(NamespaceString(parseNs(dbname, cmdObj))),
  859. ActionType::moveChunk)) {
  860. return Status(ErrorCodes::Unauthorized, "Unauthorized");
  861. }
  862. return Status::OK();
  863. }
  864. virtual std::string parseNs(const std::string& dbname, const BSONObj& cmdObj) const {
  865. return parseNsFullyQualified(dbname, cmdObj);
  866. }
  867. bool run(OperationContext* txn,
  868. const string& dbname,
  869. BSONObj& cmdObj,
  870. int,
  871. string& errmsg,
  872. BSONObjBuilder& result) {
  873. // 1. Parse options
  874. // 2. Make sure my view is complete and lock the distributed lock to ensure shard
  875. // metadata stability.
  876. // 3. Migration
  877. // Retrieve all RecordIds, which need to be migrated in order to do as little seeking
  878. // as possible during transfer. Retrieval of the RecordIds happens under a collection
  879. // lock, but then the collection lock is dropped. This opens up an opportunity for
  880. // repair or compact to invalidate these RecordIds, because these commands do not
  881. // synchronized with migration. Note that data modifications are not a problem,
  882. // because we are registered for change notifications.
  883. //
  884. // 4. pause till migrate caught up
  885. // 5. LOCK
  886. // a) update my config, essentially locking
  887. // b) finish migrate
  888. // c) update config server
  889. // d) logChange to config server
  890. // 6. wait for all current cursors to expire
  891. // 7. remove data locally
  892. // -------------------------------
  893. // 1.
  894. string ns = parseNs(dbname, cmdObj);
  895. // The shard addresses, redundant, but allows for validation
  896. string toShardHost = cmdObj["to"].str();
  897. string fromShardHost = cmdObj["from"].str();
  898. // The shard names
  899. string toShardName = cmdObj["toShard"].str();
  900. string fromShardName = cmdObj["fromShard"].str();
  901. // Process secondary throttle settings and assign defaults if necessary.
  902. BSONObj secThrottleObj;
  903. WriteConcernOptions writeConcern;
  904. Status status = writeConcern.parseSecondaryThrottle(cmdObj, &secThrottleObj);
  905. if (!status.isOK()){
  906. if (status.code() != ErrorCodes::WriteConcernNotDefined) {
  907. warning() << status.toString() << endl;
  908. return appendCommandStatus(result, status);
  909. }
  910. writeConcern = getDefaultWriteConcern();
  911. }
  912. else {
  913. repl::ReplicationCoordinator* replCoordinator =
  914. repl::getGlobalReplicationCoordinator();
  915. if (replCoordinator->getReplicationMode() ==
  916. repl::ReplicationCoordinator::modeMasterSlave &&
  917. writeConcern.shouldWaitForOtherNodes()) {
  918. warning() << "moveChunk cannot check if secondary throttle setting "
  919. << writeConcern.toBSON()
  920. << " can be enforced in a master slave configuration";
  921. }
  922. Status status = replCoordinator->checkIfWriteConcernCanBeSatisfied(writeConcern);
  923. if (!status.isOK() && status != ErrorCodes::NoReplicationEnabled) {
  924. warning() << status.toString() << endl;
  925. return appendCommandStatus(result, status);
  926. }
  927. }
  928. if (writeConcern.shouldWaitForOtherNodes() &&
  929. writeConcern.wTimeout == WriteConcernOptions::kNoTimeout) {
  930. // Don't allow no timeout.
  931. writeConcern.wTimeout = kDefaultWTimeoutMs;
  932. }
  933. // Do inline deletion
  934. bool waitForDelete = cmdObj["waitForDelete"].trueValue();
  935. if (waitForDelete) {
  936. log() << "moveChunk waiting for full cleanup after move" << endl;
  937. }
  938. BSONObj min = cmdObj["min"].Obj();
  939. BSONObj max = cmdObj["max"].Obj();
  940. BSONElement maxSizeElem = cmdObj["maxChunkSizeBytes"];
  941. if ( ns.empty() ) {
  942. errmsg = "need to specify namespace in command";
  943. return false;
  944. }
  945. if ( toShardName.empty() ) {
  946. errmsg = "need to specify shard to move chunk to";
  947. return false;
  948. }
  949. if ( fromShardName.empty() ) {
  950. errmsg = "need to specify shard to move chunk from";
  951. return false;
  952. }
  953. if ( min.isEmpty() ) {
  954. errmsg = "need to specify a min";
  955. return false;
  956. }
  957. if ( max.isEmpty() ) {
  958. errmsg = "need to specify a max";
  959. return false;
  960. }
  961. if ( maxSizeElem.eoo() || ! maxSizeElem.isNumber() ) {
  962. errmsg = "need to specify maxChunkSizeBytes";
  963. return false;
  964. }
  965. const long long maxChunkSize = maxSizeElem.numberLong(); // in bytes
  966. // This could be the first call that enables sharding - make sure we initialize the
  967. // sharding state for this shard.
  968. if ( ! shardingState.enabled() ) {
  969. if ( cmdObj["configdb"].type() != String ) {
  970. errmsg = "sharding not enabled";
  971. warning() << errmsg << endl;
  972. return false;
  973. }
  974. string configdb = cmdObj["configdb"].String();
  975. ShardingState::initialize(configdb);
  976. }
  977. // Initialize our current shard name in the shard state if needed
  978. shardingState.gotShardName(fromShardName);
  979. // Make sure we're as up-to-date as possible with shard information
  980. // This catches the case where we had to previously changed a shard's host by
  981. // removing/adding a shard with the same name
  982. Shard::reloadShardInfo();
  983. Shard toShard(toShardName);
  984. Shard fromShard(fromShardName);
  985. ConnectionString configLoc = ConnectionString::parse(shardingState.getConfigServer(),
  986. errmsg);
  987. if (!configLoc.isValid()) {
  988. warning() << errmsg;
  989. return false;
  990. }
  991. MoveTimingHelper timing(txn, "from" , ns , min , max , 6 /* steps */ , &errmsg,
  992. toShardName, fromShardName );
  993. log() << "received moveChunk request: " << cmdObj << migrateLog;
  994. timing.done(1);
  995. MONGO_FP_PAUSE_WHILE(moveChunkHangAtStep1);
  996. // 2.
  997. if ( migrateFromStatus.isActive() ) {
  998. errmsg = "migration already in progress";
  999. warning() << errmsg << endl;
  1000. return false;
  1001. }
  1002. //
  1003. // Get the distributed lock
  1004. //
  1005. string whyMessage(str::stream() << "migrating chunk [" << minKey << ", " << maxKey
  1006. << ") in " << ns);
  1007. auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock(
  1008. ns, whyMessage);
  1009. if (!scopedDistLock.isOK()) {
  1010. errmsg = stream() << "could not acquire collection lock for " << ns
  1011. << " to migrate chunk [" << minKey << "," << maxKey << ")"
  1012. << causedBy(scopedDistLock.getStatus());
  1013. warning() << errmsg << endl;
  1014. return false;
  1015. }
  1016. BSONObj chunkInfo =
  1017. BSON("min" << min << "max" << max <<
  1018. "from" << fromShard.getName() << "to" << toShard.getName());
  1019. grid.catalogManager()->logChange(txn, "moveChunk.start", ns, chunkInfo);
  1020. // Always refresh our metadata remotely
  1021. ChunkVersion origShardVersion;
  1022. Status refreshStatus = shardingState.refreshMetadataNow(txn, ns, &origShardVersion);
  1023. if (!refreshStatus.isOK()) {
  1024. errmsg = str::stream() << "moveChunk cannot start migrate of chunk "
  1025. << "[" << minKey << "," << maxKey << ")"
  1026. << causedBy(refreshStatus.reason());
  1027. warning() << errmsg;
  1028. return false;
  1029. }
  1030. if (origShardVersion.majorVersion() == 0) {
  1031. // It makes no sense to migrate if our version is zero and we have no chunks
  1032. errmsg = str::stream() << "moveChunk cannot start migrate of chunk "
  1033. << "[" << minKey << "," << maxKey << ")"
  1034. << " with zero shard version";
  1035. warning() << errmsg;
  1036. return false;
  1037. }
  1038. // From mongos >= v3.0.
  1039. BSONElement epochElem(cmdObj["epoch"]);
  1040. if (epochElem.type() == jstOID) {
  1041. OID cmdEpoch = epochElem.OID();
  1042. if (cmdEpoch != origShardVersion.epoch()) {
  1043. errmsg = str::stream() << "moveChunk cannot move chunk "
  1044. << "[" << minKey << ","
  1045. << maxKey << "), "
  1046. << "collection may have been dropped. "
  1047. << "current epoch: " << origShardVersion.epoch()
  1048. << ", cmd epoch: " << cmdEpoch;
  1049. warning() << errmsg;
  1050. return false;
  1051. }
  1052. }
  1053. // Get collection metadata
  1054. const CollectionMetadataPtr origCollMetadata(shardingState.getCollectionMetadata(ns));
  1055. // With nonzero shard version, we must have metadata
  1056. invariant(NULL != origCollMetadata);
  1057. ChunkVersion origCollVersion = origCollMetadata->getCollVersion();
  1058. BSONObj shardKeyPattern = origCollMetadata->getKeyPattern();
  1059. // With nonzero shard version, we must have a coll version >= our shard version
  1060. invariant(origCollVersion >= origShardVersion);
  1061. // With nonzero shard version, we must have a shard key
  1062. invariant(!shardKeyPattern.isEmpty());
  1063. ChunkType origChunk;
  1064. if (!origCollMetadata->getNextChunk(min, &origChunk)
  1065. || origChunk.getMin().woCompare(min) || origChunk.getMax().woCompare(max)) {
  1066. // Our boundaries are different from those passed in
  1067. errmsg = str::stream() << "moveChunk cannot find chunk "
  1068. << "[" << minKey << "," << maxKey << ")"
  1069. << " to migrate, the chunk boundaries may be stale";
  1070. warning() << errmsg;
  1071. return false;
  1072. }
  1073. log() << "moveChunk request accepted at version " << origShardVersion;
  1074. timing.done(2);
  1075. MONGO_FP_PAUSE_WHILE(moveChunkHangAtStep2);
  1076. // 3.
  1077. MigrateStatusHolder statusHolder(txn, ns, min, max, shardKeyPattern);
  1078. if (statusHolder.isAnotherMigrationActive()) {
  1079. errmsg = "moveChunk is already in progress from this shard";
  1080. warning() << errmsg << endl;
  1081. return false;
  1082. }
  1083. {
  1084. // See comment at the top of the function for more information on what
  1085. // synchronization is used here.
  1086. if (!migrateFromStatus.storeCurrentLocs(txn, maxChunkSize, errmsg, result)) {
  1087. warning() << errmsg << endl;
  1088. return false;
  1089. }
  1090. ScopedDbConnection connTo(toShard.getConnString());
  1091. BSONObj res;
  1092. bool ok;
  1093. const bool isSecondaryThrottle(writeConcern.shouldWaitForOtherNodes());
  1094. BSONObjBuilder recvChunkStartBuilder;
  1095. recvChunkStartBuilder.append("_recvChunkStart", ns);
  1096. recvChunkStartBuilder.append("from", fromShard.getConnString());
  1097. recvChunkStartBuilder.append("fromShardName", fromShard.getName());
  1098. recvChunkStartBuilder.append("toShardName", toShard.getName());
  1099. recvChunkStartBuilder.append("min", min);
  1100. recvChunkStartBuilder.append("max", max);
  1101. recvChunkStartBuilder.append("shardKeyPattern", shardKeyPattern);
  1102. recvChunkStartBuilder.append("configServer", configServer.modelServer());
  1103. recvChunkStartBuilder.append("secondaryThrottle", isSecondaryThrottle);
  1104. // Follow the same convention in moveChunk.
  1105. if (isSecondaryThrottle && !secThrottleObj.isEmpty()) {
  1106. recvChunkStartBuilder.append("writeConcern", secThrottleObj);
  1107. }
  1108. try{
  1109. ok = connTo->runCommand("admin", recvChunkStartBuilder.done(), res);
  1110. }
  1111. catch( DBException& e ){
  1112. errmsg = str::stream() << "moveChunk could not contact to: shard "
  1113. << toShardName << " to start transfer" << causedBy( e );
  1114. warning() << errmsg << endl;
  1115. return false;
  1116. }
  1117. connTo.done();
  1118. if ( ! ok ) {
  1119. errmsg = "moveChunk failed to engage TO-shard in the data transfer: ";
  1120. verify( res["errmsg"].type() );
  1121. errmsg += res["errmsg"].String();
  1122. result.append( "cause" , res );
  1123. warning() << errmsg << endl;
  1124. return false;
  1125. }
  1126. }
  1127. timing.done( 3 );
  1128. MONGO_FP_PAUSE_WHILE(moveChunkHangAtStep3);
  1129. // 4.
  1130. // Track last result from TO shard for sanity check
  1131. BSONObj res;
  1132. for ( int i=0; i<86400; i++ ) { // don't want a single chunk move to take more than a day
  1133. invariant(!txn->lockState()->isLocked());
  1134. // Exponential sleep backoff, up to 1024ms. Don't sleep much on the first few
  1135. // iterations, since we want empty chunk migrations to be fast.
  1136. sleepmillis( 1 << std::min( i , 10 ) );
  1137. ScopedDbConnection conn(toShard.getConnString());
  1138. bool ok;
  1139. res = BSONObj();
  1140. try {
  1141. ok = conn->runCommand( "admin" , BSON( "_recvChunkStatus" << 1 ) , res );
  1142. res = res.getOwned();
  1143. }
  1144. catch( DBException& e ){
  1145. errmsg = str::stream() << "moveChunk could not contact to: shard " << toShardName << " to monitor transfer" << causedBy( e );
  1146. warning() << errmsg << endl;
  1147. return false;
  1148. }
  1149. conn.done();
  1150. if ( res["ns"].str() != ns ||
  1151. res["from"].str() != fromShard.getConnString() ||
  1152. !res["min"].isABSONObj() ||
  1153. res["min"].Obj().woCompare(min) != 0 ||
  1154. !res["max"].isABSONObj() ||
  1155. res["max"].Obj().woCompare(max) != 0 ) {
  1156. // This can happen when the destination aborted the migration and
  1157. // received another recvChunk before this thread sees the transition
  1158. // to the abort state. This is currently possible only if multiple migrations
  1159. // are happening at once. This is an unfortunate consequence of the shards not
  1160. // being able to keep track of multiple incoming and outgoing migrations.
  1161. errmsg = str::stream() << "Destination shard aborted migration, "
  1162. "now running a new one: " << res;
  1163. warning() << errmsg << endl;
  1164. return false;
  1165. }
  1166. LOG(0) << "moveChunk data transfer progress: " << res << " my mem used: " << migrateFromStatus.mbUsed() << migrateLog;
  1167. if ( ! ok || res["state"].String() == "fail" ) {
  1168. warning() << "moveChunk error transferring data caused migration abort: " << res << migrateLog;
  1169. errmsg = "data transfer error";
  1170. result.append( "cause" , res );
  1171. return false;
  1172. }
  1173. if ( res["state"].String() == "steady" )
  1174. break;
  1175. if ( migrateFromStatus.mbUsed() > (500 * 1024 * 1024) ) {
  1176. // this is too much memory for us to use for this
  1177. // so we're going to abort the migrate
  1178. ScopedDbConnection conn(toShard.getConnString());
  1179. BSONObj res;
  1180. if (!conn->runCommand( "admin", BSON( "_recvChunkAbort" << 1 ), res )) {
  1181. warning() << "Error encountered while trying to abort migration on "
  1182. << "destination shard" << toShard.getConnString() << endl;
  1183. }
  1184. res = res.getOwned();
  1185. conn.done();
  1186. error() << "aborting migrate because too much memory used res: " << res << migrateLog;
  1187. errmsg = "aborting migrate because too much memory used";
  1188. result.appendBool( "split" , true );
  1189. return false;
  1190. }
  1191. txn->checkForInterrupt();
  1192. }
  1193. timing.done(4);
  1194. MONGO_FP_PAUSE_WHILE(moveChunkHangAtStep4);
  1195. // 5.
  1196. // Before we get into the critical section of the migration, let's double check
  1197. // that the docs have been cloned, the config servers are reachable,
  1198. // and the lock is in place.
  1199. log() << "About to check if it is safe to enter critical section" << endl;
  1200. // Ensure all cloned docs have actually been transferred
  1201. std::size_t locsRemaining = migrateFromStatus.cloneLocsRemaining();
  1202. if ( locsRemaining != 0 ) {
  1203. errmsg =
  1204. str::stream() << "moveChunk cannot enter critical section before all data is"
  1205. << " cloned, " << locsRemaining << " locs were not transferred"
  1206. << " but to-shard reported " << res;
  1207. // Should never happen, but safe to abort before critical section
  1208. error() << errmsg << migrateLog;
  1209. dassert( false );
  1210. return false;
  1211. }
  1212. // Ensure distributed lock still held
  1213. Status lockStatus = scopedDistLock.getValue().checkStatus();
  1214. if (!lockStatus.isOK()) {
  1215. errmsg = str::stream() << "not entering migrate critical section because "
  1216. << lockStatus.toString();
  1217. warning() << errmsg << endl;
  1218. return false;
  1219. }
  1220. log() << "About to enter migrate critical section" << endl;
  1221. {
  1222. // 5.a
  1223. // we're under the collection lock here, so no other migrate can change maxVersion
  1224. // or CollectionMetadata state
  1225. migrateFromStatus.setInCriticalSection( true );
  1226. ChunkVersion myVersion = origCollVersion;
  1227. myVersion.incMajor();
  1228. {
  1229. ScopedTransaction transaction(txn, MODE_IX);
  1230. Lock::DBLock lk(txn->lockState(), nsToDatabaseSubstring(ns), MODE_IX);
  1231. Lock::CollectionLock collLock(txn->lockState(), ns, MODE_X);
  1232. verify( myVersion > shardingState.getVersion( ns ) );
  1233. // bump the metadata's version up and "forget" about the chunk being moved
  1234. // this is not the commit point but in practice the state in this shard won't
  1235. // until the commit it done
  1236. shardingState.donateChunk(txn, ns, min, max, myVersion);
  1237. }
  1238. log() << "moveChunk setting version to: " << myVersion << migrateLog;
  1239. // 5.b
  1240. // we're under the collection lock here, too, so we can undo the chunk donation because no other state change
  1241. // could be ongoing
  1242. BSONObj res;
  1243. bool ok;
  1244. try {
  1245. ScopedDbConnection connTo( toShard.getConnString(), 35.0 );
  1246. ok = connTo->runCommand( "admin", BSON( "_recvChunkCommit" << 1 ), res );
  1247. connTo.done();
  1248. }
  1249. catch ( DBException& e ) {
  1250. errmsg = str::stream() << "moveChunk could not contact to: shard "
  1251. << toShard.getConnString() << " to commit transfer"
  1252. << causedBy( e );
  1253. warning() << errmsg << endl;
  1254. ok = false;
  1255. }
  1256. if ( !ok || MONGO_FAIL_POINT(failMigrationCommit) ) {
  1257. log() << "moveChunk migrate commit not accepted by TO-shard: " << res
  1258. << " resetting shard version to: " << origShardVersion << migrateLog;
  1259. {
  1260. ScopedTransaction transaction(txn, MODE_IX);
  1261. Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_IX);
  1262. Lock::CollectionLock collLock(txn->lockState(), ns, MODE_X);
  1263. log() << "moveChunk collection lock acquired to reset shard version from "
  1264. "failed migration"
  1265. << endl;
  1266. // revert the chunk manager back to the state before "forgetting" about the
  1267. // chunk
  1268. shardingState.undoDonateChunk(txn, ns, origCollMetadata);
  1269. }
  1270. log() << "Shard version successfully reset to clean up failed migration"
  1271. << endl;
  1272. errmsg = "_recvChunkCommit failed!";
  1273. result.append( "cause", res );
  1274. return false;
  1275. }
  1276. log() << "moveChunk migrate commit accepted by TO-shard: " << res << migrateLog;
  1277. // 5.c
  1278. // version at which the next highest lastmod will be set
  1279. // if the chunk being moved is the last in the shard, nextVersion is that chunk's lastmod
  1280. // otherwise the highest version is from the chunk being bumped on the FROM-shard
  1281. ChunkVersion nextVersion;
  1282. // we want to go only once to the configDB but perhaps change two chunks, the one being migrated and another
  1283. // local one (so to bump version for the entire shard)
  1284. // we use the 'applyOps' mechanism to group the two updates and make them safer
  1285. // TODO pull config update code to a module
  1286. BSONArrayBuilder updates;
  1287. {
  1288. // update for the chunk being moved
  1289. BSONObjBuilder op;
  1290. op.append( "op" , "u" );
  1291. op.appendBool( "b" , false /* no upserting */ );
  1292. op.append( "ns" , ChunkType::ConfigNS );
  1293. BSONObjBuilder n( op.subobjStart( "o" ) );
  1294. n.append(ChunkType::name(), Chunk::genID(ns, min));
  1295. myVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
  1296. n.append(ChunkType::ns(), ns);
  1297. n.append(ChunkType::min(), min);
  1298. n.append(ChunkType::max(), max);
  1299. n.append(ChunkType::shard(), toShard.getName());
  1300. n.done();
  1301. BSONObjBuilder q( op.subobjStart( "o2" ) );
  1302. q.append(ChunkType::name(), Chunk::genID(ns, min));
  1303. q.done();
  1304. updates.append( op.obj() );
  1305. }
  1306. nextVersion = myVersion;
  1307. // if we have chunks left on the FROM shard, update the version of one of them as
  1308. // well. we can figure that out by grabbing the metadata installed on 5.a
  1309. const CollectionMetadataPtr bumpedCollMetadata( shardingState.getCollectionMetadata( ns ) );
  1310. if( bumpedCollMetadata->getNumChunks() > 0 ) {
  1311. // get another chunk on that shard
  1312. ChunkType bumpChunk;
  1313. bool chunkRes =
  1314. bumpedCollMetadata->getNextChunk(bumpedCollMetadata->getMinKey(),
  1315. &bumpChunk);
  1316. BSONObj bumpMin = bumpChunk.getMin();
  1317. BSONObj bumpMax = bumpChunk.getMax();
  1318. (void)chunkRes; // for compile warning on non-debug
  1319. dassert(chunkRes);
  1320. dassert( bumpMin.woCompare( min ) != 0 );
  1321. BSONObjBuilder op;
  1322. op.append( "op" , "u" );
  1323. op.appendBool( "b" , false );
  1324. op.append( "ns" , ChunkType::ConfigNS );
  1325. nextVersion.incMinor(); // same as used on donateChunk
  1326. BSONObjBuilder n( op.subobjStart( "o" ) );
  1327. n.append(ChunkType::name(), Chunk::genID(ns, bumpMin));
  1328. nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
  1329. n.append(ChunkType::ns(), ns);
  1330. n.append(ChunkType::min(), bumpMin);
  1331. n.append(ChunkType::max(), bumpMax);
  1332. n.append(ChunkType::shard(), fromShard.getName());
  1333. n.done();
  1334. BSONObjBuilder q( op.subobjStart( "o2" ) );
  1335. q.append(ChunkType::name(), Chunk::genID(ns, bumpMin));
  1336. q.done();
  1337. updates.append( op.obj() );
  1338. log() << "moveChunk updating self version to: " << nextVersion << " through "
  1339. << bumpMin << " -> " << bumpMax << " for collection '" << ns << "'" << migrateLog;
  1340. }
  1341. else {
  1342. log() << "moveChunk moved last chunk out for collection '" << ns << "'" << migrateLog;
  1343. }
  1344. BSONArrayBuilder preCond;
  1345. {
  1346. BSONObjBuilder b;
  1347. b.append("ns", ChunkType::ConfigNS);
  1348. b.append("q", BSON("query" << BSON(ChunkType::ns(ns)) <<
  1349. "orderby" << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
  1350. {
  1351. BSONObjBuilder bb( b.subobjStart( "res" ) );
  1352. // TODO: For backwards compatibility, we can't yet require an epoch here
  1353. bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(), origCollVersion.toLong());
  1354. bb.done();
  1355. }
  1356. preCond.append( b.obj() );
  1357. }
  1358. int exceptionCode = OkCode;
  1359. ok = false;
  1360. try {
  1361. // For testing migration failures
  1362. if ( MONGO_FAIL_POINT(failMigrationConfigWritePrepare) ) {
  1363. throw DBException( "mock migration failure before config write",
  1364. PrepareConfigsFailedCode );
  1365. }
  1366. Status status = grid.catalogManager()->applyChunkOpsDeprecated(updates.arr(),
  1367. preCond.arr());
  1368. ok = status.isOK();
  1369. exceptionCode = status.code();
  1370. if (MONGO_FAIL_POINT(failMigrationApplyOps)) {
  1371. throw SocketException(SocketException::RECV_ERROR,
  1372. shardingState.getConfigServer());
  1373. }
  1374. }
  1375. catch (const DBException& e) {
  1376. warning() << e << migrateLog;
  1377. ok = false;
  1378. exceptionCode = e.getCode();
  1379. }
  1380. if ( exceptionCode == PrepareConfigsFailedCode ) {
  1381. // In the process of issuing the migrate commit, the SyncClusterConnection
  1382. // checks that the config servers are reachable. If they are not, we are
  1383. // sure that the applyOps command was not sent to any of the configs, so we
  1384. // can safely back out of the migration here, by resetting the shard
  1385. // version that we bumped up to in the donateChunk() call above.
  1386. log() << "About to acquire moveChunk coll lock to reset shard version from "
  1387. << "failed migration" << endl;
  1388. {
  1389. ScopedTransaction transaction(txn, MODE_IX);
  1390. Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_IX);
  1391. Lock::CollectionLock collLock(txn->lockState(), ns, MODE_X);
  1392. // Revert the metadata back to the state before "forgetting"
  1393. // about the chunk.
  1394. shardingState.undoDonateChunk(txn, ns, origCollMetadata);
  1395. }
  1396. log() << "Shard version successfully reset to clean up failed migration" << endl;
  1397. errmsg = "Failed to send migrate commit to configs because " + errmsg;
  1398. return false;
  1399. }
  1400. else if ( ! ok || exceptionCode != OkCode ) {
  1401. // this could be a blip in the connectivity
  1402. // wait out a few seconds and check if the commit request made it
  1403. //
  1404. // if the commit made it to the config, we'll see the chunk in the new shard and there's no action
  1405. // if the commit did not make it, currently the only way to fix this state is to bounce the mongod so
  1406. // that the old state (before migrating) be brought in
  1407. warning() << "moveChunk commit outcome ongoing" << migrateLog;
  1408. sleepsecs( 10 );
  1409. try {
  1410. ScopedDbConnection conn(shardingState.getConfigServer(), 10.0);
  1411. // look for the chunk in this shard whose version got bumped
  1412. // we assume that if that mod made it to the config, the applyOps was successful
  1413. BSONObj doc = conn->findOne(ChunkType::ConfigNS,
  1414. Query(BSON(ChunkType::ns(ns)))
  1415. .sort(BSON(ChunkType::DEPRECATED_lastmod() << -1)));
  1416. ChunkVersion checkVersion(ChunkVersion::fromBSON(doc));
  1417. if ( checkVersion.equals( nextVersion ) ) {
  1418. log() << "moveChunk commit confirmed" << migrateLog;
  1419. errmsg.clear();
  1420. }
  1421. else {
  1422. error() << "moveChunk commit failed: version is at "
  1423. << checkVersion << " instead of " << nextVersion << migrateLog;
  1424. error() << "TERMINATING" << migrateLog;
  1425. dbexit( EXIT_SHARDING_ERROR );
  1426. }
  1427. conn.done();
  1428. }
  1429. catch ( ... ) {
  1430. error() << "moveChunk failed to get confirmation of commit" << migrateLog;
  1431. error() << "TERMINATING" << migrateLog;
  1432. dbexit( EXIT_SHARDING_ERROR );
  1433. }
  1434. }
  1435. migrateFromStatus.setInCriticalSection( false );
  1436. // 5.d
  1437. BSONObjBuilder commitInfo;
  1438. commitInfo.appendElements( chunkInfo );
  1439. if (res["counts"].type() == Object) {
  1440. commitInfo.appendElements(res["counts"].Obj());
  1441. }
  1442. grid.catalogManager()->logChange(txn, "moveChunk.commit", ns, commitInfo.obj());
  1443. }
  1444. migrateFromStatus.done(txn);
  1445. timing.done(5);
  1446. MONGO_FP_PAUSE_WHILE(moveChunkHangAtStep5);
  1447. // 6.
  1448. // NOTE: It is important that the distributed collection lock be held for this step.
  1449. RangeDeleter* deleter = getDeleter();
  1450. RangeDeleterOptions deleterOptions(KeyRange(ns,
  1451. min.getOwned(),
  1452. max.getOwned(),
  1453. shardKeyPattern));
  1454. deleterOptions.writeConcern = writeConcern;
  1455. deleterOptions.waitForOpenCursors = true;
  1456. deleterOptions.fromMigrate = true;
  1457. deleterOptions.onlyRemoveOrphanedDocs = true;
  1458. deleterOptions.removeSaverReason = "post-cleanup";
  1459. if (waitForDelete) {
  1460. log() << "doing delete inline for cleanup of chunk data" << migrateLog;
  1461. string errMsg;
  1462. // This is an immediate delete, and as a consequence, there could be more
  1463. // deletes happening simultaneously than there are deleter worker threads.
  1464. if (!deleter->deleteNow(txn,
  1465. deleterOptions,
  1466. &errMsg)) {
  1467. log() << "Error occured while performing cleanup: " << errMsg << endl;
  1468. }
  1469. }
  1470. else {
  1471. log() << "forking for cleanup of chunk data" << migrateLog;
  1472. string errMsg;
  1473. if (!deleter->queueDelete(txn,
  1474. deleterOptions,
  1475. NULL, // Don't want to be notified.
  1476. &errMsg)) {
  1477. log() << "could not queue migration cleanup: " << errMsg << endl;
  1478. }
  1479. }
  1480. timing.done(6);
  1481. MONGO_FP_PAUSE_WHILE(moveChunkHangAtStep6);
  1482. return true;
  1483. }
  1484. } moveChunkCmd;
  1485. bool ShardingState::inCriticalMigrateSection() {
  1486. return migrateFromStatus.getInCriticalSection();
  1487. }
  1488. bool ShardingState::waitTillNotInCriticalSection( int maxSecondsToWait ) {
  1489. return migrateFromStatus.waitTillNotInCriticalSection( maxSecondsToWait );
  1490. }
  1491. /* -----
  1492. below this are the "to" side commands
  1493. command to initiate
  1494. worker thread
  1495. does initial clone
  1496. pulls initial change set
  1497. keeps pulling
  1498. keeps state
  1499. command to get state
  1500. command to "commit"
  1501. */
  1502. // Enabling / disabling these fail points pauses / resumes MigrateStatus::_go(), the thread
  1503. // that receives a chunk migration from the donor.
  1504. MONGO_FP_DECLARE(migrateThreadHangAtStep1);
  1505. MONGO_FP_DECLARE(migrateThreadHangAtStep2);
  1506. MONGO_FP_DECLARE(migrateThreadHangAtStep3);
  1507. MONGO_FP_DECLARE(migrateThreadHangAtStep4);
  1508. MONGO_FP_DECLARE(migrateThreadHangAtStep5);
  1509. class MigrateStatus {
  1510. public:
  // Lifecycle states of an incoming migration.
  // NOTE(review): the meaning of the intermediate states is inferred from their names -
  // confirm against _go() and the _recvChunk* commands.
  enum State {
      READY,         // set by prepare(); _go() verifies this state on entry
      CLONE,
      CATCHUP,
      STEADY,
      COMMIT_START,
      DONE,          // go() skips its cleanup of the pending range only in this state
      FAIL,          // set by go() when _go() throws
      ABORT
  };
  1521. MigrateStatus():
  1522. _active(false),
  1523. _numCloned(0),
  1524. _clonedBytes(0),
  1525. _numCatchup(0),
  1526. _numSteady(0),
  1527. _state(READY) {
  1528. }
  1529. void setState(State newState) {
  1530. boost::lock_guard<boost::mutex> sl(_mutex);
  1531. _state = newState;
  1532. }
  1533. State getState() const {
  1534. boost::lock_guard<boost::mutex> sl(_mutex);
  1535. return _state;
  1536. }
  1537. /**
  1538. * Returns OK if preparation was successful.
  1539. */
  1540. Status prepare(const std::string& ns,
  1541. const std::string& fromShard,
  1542. const BSONObj& min,
  1543. const BSONObj& max,
  1544. const BSONObj& shardKeyPattern) {
  1545. boost::lock_guard<boost::mutex> lk(_mutex);
  1546. if (_active) {
  1547. return Status(ErrorCodes::ConflictingOperationInProgress,
  1548. str::stream() << "Active migration already in progress "
  1549. << "ns: " << _ns
  1550. << ", from: " << _from
  1551. << ", min: " << _min
  1552. << ", max: " << _max);
  1553. }
  1554. _state = READY;
  1555. _errmsg = "";
  1556. _ns = ns;
  1557. _from = fromShard;
  1558. _min = min;
  1559. _max = max;
  1560. _shardKeyPattern = shardKeyPattern;
  1561. _numCloned = 0;
  1562. _clonedBytes = 0;
  1563. _numCatchup = 0;
  1564. _numSteady = 0;
  1565. _active = true;
  1566. return Status::OK();
  1567. }
  1568. void go(OperationContext* txn,
  1569. const std::string& ns,
  1570. BSONObj min,
  1571. BSONObj max,
  1572. BSONObj shardKeyPattern,
  1573. const std::string& fromShard,
  1574. const OID& epoch,
  1575. const WriteConcernOptions& writeConcern) {
  1576. try {
  1577. _go(txn, ns, min, max, shardKeyPattern, fromShard, epoch, writeConcern);
  1578. }
  1579. catch ( std::exception& e ) {
  1580. {
  1581. boost::lock_guard<boost::mutex> sl(_mutex);
  1582. _state = FAIL;
  1583. _errmsg = e.what();
  1584. }
  1585. error() << "migrate failed: " << e.what() << migrateLog;
  1586. }
  1587. catch ( ... ) {
  1588. {
  1589. boost::lock_guard<boost::mutex> sl(_mutex);
  1590. _state = FAIL;
  1591. _errmsg = "UNKNOWN ERROR";
  1592. }
  1593. error() << "migrate failed with unknown exception" << migrateLog;
  1594. }
  1595. if ( getState() != DONE ) {
  1596. // Unprotect the range if needed/possible on unsuccessful TO migration
  1597. ScopedTransaction transaction(txn, MODE_IX);
  1598. Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_IX);
  1599. Lock::CollectionLock collLock(txn->lockState(), ns, MODE_X);
  1600. string errMsg;
  1601. if (!shardingState.forgetPending(txn, ns, min, max, epoch, &errMsg)) {
  1602. warning() << errMsg << endl;
  1603. }
  1604. }
  1605. setActive( false );
  1606. }
  /**
   * Body of the receiving-side migration; go() wraps it and turns exceptions into FAIL.
   *
   * Steps, in order: (0) create the collection locally if it is missing, (1) copy indexes,
   * (2) delete any data already in [min, max) and register the range as pending, (3) bulk
   * clone via _migrateClone, (4) catch up via _transferMods, then wait for replication,
   * (5) keep transferring mods in STEADY/COMMIT_START until the commit is flushed, and finally
   * set DONE.
   *
   * Failure is reported by setting the state to FAIL (or leaving ABORT in place) and
   * returning early; 'errmsg' is the string handed to the MoveTimingHelper.
   *
   * NOTE(review): conn.done() is called on some early-return paths but not on others (e.g. the
   * ABORT returns) - confirm that relying on the ScopedDbConnection destructor there is
   * intended.
   */
  1607. void _go(OperationContext* txn,
  1608. const std::string& ns,
  1609. BSONObj min,
  1610. BSONObj max,
  1611. BSONObj shardKeyPattern,
  1612. const std::string& fromShard,
  1613. const OID& epoch,
  1614. const WriteConcernOptions& writeConcern) {
  // Preconditions: prepare() must have marked us active and READY, and the range must be given.
  1615. verify( getActive() );
  1616. verify( getState() == READY );
  1617. verify( ! min.isEmpty() );
  1618. verify( ! max.isEmpty() );
  1619. DisableDocumentValidation validationDisabler(txn);
  1620. log() << "starting receiving-end of migration of chunk " << min << " -> " << max <<
  1621. " for collection " << ns << " from " << fromShard
  1622. << " at epoch " << epoch.toString() << endl;
  1623. string errmsg;
  // The helper is told about 5 steps; each timing.done(N) below marks one of them.
  1624. MoveTimingHelper timing(txn, "to", ns, min, max, 5 /* steps */, &errmsg, "", "");
  1625. ScopedDbConnection conn(fromShard);
  1626. conn->getLastError(); // just test connection
  1627. {
  1628. // 0. copy system.namespaces entry if collection doesn't already exist
  1629. OldClientWriteContext ctx(txn, ns );
  1630. if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(
  1631. nsToDatabaseSubstring(ns))) {
  1632. errmsg = str::stream() << "Not primary during migration: " << ns
  1633. << ": checking if collection exists";
  1634. warning() << errmsg;
  1635. setState(FAIL);
  1636. return;
  1637. }
  1638. // Only copy if ns doesn't already exist
  1639. Database* db = ctx.db();
  1640. Collection* collection = db->getCollection( ns );
  1641. if ( !collection ) {
  // Ask the donor for the collection's creation options; 'options' stays empty if the donor
  // returns no entry or the entry has no "options" object.
  1642. list<BSONObj> infos =
  1643. conn->getCollectionInfos(nsToDatabase(ns),
  1644. BSON("name" << nsToCollectionSubstring(ns)));
  1645. BSONObj options;
  1646. if (infos.size() > 0) {
  1647. BSONObj entry = infos.front();
  1648. if (entry["options"].isABSONObj()) {
  1649. options = entry["options"].Obj();
  1650. }
  1651. }
  1652. WriteUnitOfWork wuow(txn);
  1653. Status status = userCreateNS(txn, db, ns, options, false);
  // A creation failure is only logged here; step 1 fails the migration if the collection
  // still does not exist.
  1654. if ( !status.isOK() ) {
  1655. warning() << "failed to create collection [" << ns << "] "
  1656. << " with options " << options << ": " << status;
  1657. }
  1658. wuow.commit();
  1659. }
  1660. }
  1661. {
  1662. // 1. copy indexes
  // Fetch the donor's index specs before taking the local database lock.
  1663. vector<BSONObj> indexSpecs;
  1664. {
  1665. const std::list<BSONObj> indexes = conn->getIndexSpecs(ns);
  1666. indexSpecs.insert(indexSpecs.begin(), indexes.begin(), indexes.end());
  1667. }
  1668. ScopedTransaction transaction(txn, MODE_IX);
  1669. Lock::DBLock lk(txn->lockState(), nsToDatabaseSubstring(ns), MODE_X);
  1670. OldClientContext ctx(txn, ns);
  1671. if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(
  1672. nsToDatabaseSubstring(ns))) {
  1673. errmsg = str::stream() << "Not primary during migration: " << ns;
  1674. warning() << errmsg;
  1675. setState(FAIL);
  1676. return;
  1677. }
  1678. Database* db = ctx.db();
  1679. Collection* collection = db->getCollection( ns );
  1680. if ( !collection ) {
  1681. errmsg = str::stream() << "collection dropped during migration: " << ns;
  1682. warning() << errmsg;
  1683. setState(FAIL);
  1684. return;
  1685. }
  1686. MultiIndexBlock indexer(txn, collection);
  // After this call indexSpecs is treated as the list of indexes still missing locally.
  1687. indexer.removeExistingIndexes(&indexSpecs);
  1688. if (!indexSpecs.empty()) {
  1689. // Only copy indexes if the collection does not have any documents.
  1690. if (collection->numRecords(txn) > 0) {
  1691. errmsg = str::stream() << "aborting migration, shard is missing "
  1692. << indexSpecs.size() << " indexes and "
  1693. << "collection is not empty. Non-trivial "
  1694. << "index creation should be scheduled manually";
  1695. warning() << errmsg;
  1696. setState(FAIL);
  1697. return;
  1698. }
  1699. Status status = indexer.init(indexSpecs);
  1700. if ( !status.isOK() ) {
  1701. errmsg = str::stream() << "failed to create index before migrating data. "
  1702. << " error: " << status.toString();
  1703. warning() << errmsg;
  1704. setState(FAIL);
  1705. return;
  1706. }
  1707. status = indexer.insertAllDocumentsInCollection();
  1708. if ( !status.isOK() ) {
  1709. errmsg = str::stream() << "failed to create index before migrating data. "
  1710. << " error: " << status.toString();
  1711. warning() << errmsg;
  1712. setState(FAIL);
  1713. return;
  1714. }
  1715. WriteUnitOfWork wunit(txn);
  1716. indexer.commit();
  1717. for (size_t i = 0; i < indexSpecs.size(); i++) {
  1718. // make sure to create index on secondaries as well
  1719. getGlobalServiceContext()->getOpObserver()->onCreateIndex(
  1720. txn,
  1721. db->getSystemIndexesName(),
  1722. indexSpecs[i],
  1723. true /* fromMigrate */);
  1724. }
  1725. wunit.commit();
  1726. }
  1727. timing.done(1);
  1728. MONGO_FP_PAUSE_WHILE(migrateThreadHangAtStep1);
  1729. }
  1730. {
  1731. // 2. delete any data already in range
  1732. RangeDeleterOptions deleterOptions(KeyRange(ns,
  1733. min.getOwned(),
  1734. max.getOwned(),
  1735. shardKeyPattern));
  1736. deleterOptions.writeConcern = writeConcern;
  1737. // No need to wait since all existing cursors will filter out this range when
  1738. // returning the results.
  1739. deleterOptions.waitForOpenCursors = false;
  1740. deleterOptions.fromMigrate = true;
  1741. deleterOptions.onlyRemoveOrphanedDocs = true;
  1742. deleterOptions.removeSaverReason = "preCleanup";
  1743. string errMsg;
  // NOTE(review): this is the synchronous pre-cleanup (deleteNow), yet the warning text talks
  // about queueing a delete "for migrate abort" - it looks copied from the abort path below.
  // Also note that only the local 'errMsg' is filled in here, not 'errmsg'.
  1744. if (!getDeleter()->deleteNow(txn, deleterOptions, &errMsg)) {
  1745. warning() << "Failed to queue delete for migrate abort: " << errMsg << endl;
  1746. setState(FAIL);
  1747. return;
  1748. }
  1749. {
  1750. // Protect the range by noting that we're now starting a migration to it
  1751. ScopedTransaction transaction(txn, MODE_IX);
  1752. Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_IX);
  1753. Lock::CollectionLock collLock(txn->lockState(), ns, MODE_X);
  1754. if (!shardingState.notePending(txn, ns, min, max, epoch, &errmsg)) {
  1755. warning() << errmsg << endl;
  1756. setState(FAIL);
  1757. return;
  1758. }
  1759. }
  1760. timing.done(2);
  1761. MONGO_FP_PAUSE_WHILE(migrateThreadHangAtStep2);
  1762. }
  // If the state is already FAIL or ABORT at this point, queue an asynchronous delete of the
  // range. Execution then continues with step 3 regardless, which overwrites the state with
  // CLONE.
  // NOTE(review): confirm that continuing (rather than returning) here is intended.
  1763. State currentState = getState();
  1764. if (currentState == FAIL || currentState == ABORT) {
  1765. string errMsg;
  1766. RangeDeleterOptions deleterOptions(KeyRange(ns,
  1767. min.getOwned(),
  1768. max.getOwned(),
  1769. shardKeyPattern));
  1770. deleterOptions.writeConcern = writeConcern;
  1771. // No need to wait since all existing cursors will filter out this range when
  1772. // returning the results.
  1773. deleterOptions.waitForOpenCursors = false;
  1774. deleterOptions.fromMigrate = true;
  1775. deleterOptions.onlyRemoveOrphanedDocs = true;
  1776. if (!getDeleter()->queueDelete(txn, deleterOptions, NULL /* notifier */, &errMsg)) {
  1777. warning() << "Failed to queue delete for migrate abort: " << errMsg << endl;
  1778. }
  1779. }
  1780. {
  1781. // 3. initial bulk clone
  1782. setState(CLONE);
  // Keep requesting batches from the donor until a batch contains no documents.
  1783. while ( true ) {
  1784. BSONObj res;
  1785. if ( ! conn->runCommand( "admin" , BSON( "_migrateClone" << 1 ) , res ) ) { // gets array of objects to copy, in disk order
  1786. setState(FAIL);
  1787. errmsg = "_migrateClone failed: ";
  1788. errmsg += res.toString();
  1789. error() << errmsg << migrateLog;
  1790. conn.done();
  1791. return;
  1792. }
  1793. BSONObj arr = res["objects"].Obj();
  // Number of documents copied from this batch; zero ends the clone loop.
  1794. int thisTime = 0;
  1795. BSONObjIterator i( arr );
  1796. while( i.more() ) {
  1797. txn->checkForInterrupt();
  1798. if ( getState() == ABORT ) {
  1799. errmsg = str::stream() << "Migration abort requested while "
  1800. << "copying documents";
  1801. error() << errmsg << migrateLog;
  1802. return;
  1803. }
  1804. BSONObj docToClone = i.next().Obj();
  1805. {
  1806. OldClientWriteContext cx(txn, ns );
  1807. BSONObj localDoc;
  // Refuse to overwrite a local document that has the same _id but lies outside the range
  // being migrated.
  1808. if (willOverrideLocalId(txn,
  1809. ns,
  1810. min,
  1811. max,
  1812. shardKeyPattern,
  1813. cx.db(),
  1814. docToClone,
  1815. &localDoc)) {
  1816. string errMsg =
  1817. str::stream() << "cannot migrate chunk, local document "
  1818. << localDoc
  1819. << " has same _id as cloned "
  1820. << "remote document " << docToClone;
  1821. warning() << errMsg << endl;
  1822. // Exception will abort migration cleanly
  1823. uasserted( 16976, errMsg );
  1824. }
  1825. Helpers::upsert( txn, ns, docToClone, true );
  1826. }
  1827. thisTime++;
  1828. {
  // Progress counters are reported by status(), so update them under _mutex.
  1829. boost::lock_guard<boost::mutex> statsLock(_mutex);
  1830. _numCloned++;
  1831. _clonedBytes += docToClone.objsize();
  1832. }
  // When the write concern involves other nodes, wait for each cloned document to replicate;
  // a timeout is only logged, any other failure raises via massertStatusOK.
  1833. if (writeConcern.shouldWaitForOtherNodes()) {
  1834. repl::ReplicationCoordinator::StatusAndDuration replStatus =
  1835. repl::getGlobalReplicationCoordinator()->awaitReplication(
  1836. txn,
  1837. repl::ReplClientInfo::forClient(
  1838. txn->getClient()).getLastOp(),
  1839. writeConcern);
  1840. if (replStatus.status.code() == ErrorCodes::ExceededTimeLimit) {
  1841. warning() << "secondaryThrottle on, but doc insert timed out; "
  1842. "continuing";
  1843. }
  1844. else {
  1845. massertStatusOK(replStatus.status);
  1846. }
  1847. }
  1848. }
  1849. if ( thisTime == 0 )
  1850. break;
  1851. }
  1852. timing.done(3);
  1853. MONGO_FP_PAUSE_WHILE(migrateThreadHangAtStep3);
  1854. }
  1855. // if running on a replicated system, we'll need to flush the docs we cloned to the secondaries
  1856. OpTime lastOpApplied = repl::ReplClientInfo::forClient(txn->getClient()).getLastOp();
  1857. {
  1858. // 4. do bulk of mods
  1859. setState(CATCHUP);
  // Pull and apply mods until the donor reports a batch of size 0.
  1860. while ( true ) {
  1861. BSONObj res;
  1862. if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) {
  1863. setState(FAIL);
  1864. errmsg = "_transferMods failed: ";
  1865. errmsg += res.toString();
  1866. error() << "_transferMods failed: " << res << migrateLog;
  1867. conn.done();
  1868. return;
  1869. }
  1870. if ( res["size"].number() == 0 )
  1871. break;
  1872. apply(txn, ns, min, max, shardKeyPattern, res, &lastOpApplied);
  // Poll for replication of the applied batch: up to 3600*50 iterations with a 20 ms sleep
  // each, i.e. about one hour of sleeping in total.
  1873. const int maxIterations = 3600*50;
  1874. int i;
  1875. for ( i=0;i<maxIterations; i++) {
  1876. txn->checkForInterrupt();
  1877. if ( getState() == ABORT ) {
  1878. errmsg = str::stream() << "Migration abort requested while waiting "
  1879. << "for replication at catch up stage";
  1880. error() << errmsg << migrateLog;
  1881. return;
  1882. }
  1883. if (opReplicatedEnough(txn, lastOpApplied, writeConcern))
  1884. break;
  // Once past 100 iterations this warning is logged on every further iteration.
  1885. if ( i > 100 ) {
  1886. warning() << "secondaries having hard time keeping up with migrate" << migrateLog;
  1887. }
  1888. sleepmillis( 20 );
  1889. }
  1890. if ( i == maxIterations ) {
  1891. errmsg = "secondary can't keep up with migrate";
  1892. error() << errmsg << migrateLog;
  1893. conn.done();
  1894. setState(FAIL);
  1895. return;
  1896. }
  1897. }
  1898. timing.done(4);
  1899. MONGO_FP_PAUSE_WHILE(migrateThreadHangAtStep4);
  1900. }
  1901. {
  1902. // pause to wait for replication
  1903. // this will prevent us from going into critical section until we're ready
  // Retries once per second for as long as the timer reads less than 600 minutes.
  1904. Timer t;
  1905. while ( t.minutes() < 600 ) {
  1906. txn->checkForInterrupt();
  1907. if (getState() == ABORT) {
  1908. errmsg = "Migration abort requested while waiting for replication";
  1909. error() << errmsg << migrateLog;
  1910. return;
  1911. }
  1912. log() << "Waiting for replication to catch up before entering critical section"
  1913. << endl;
  1914. if (flushPendingWrites(txn, ns, min, max, lastOpApplied, writeConcern))
  1915. break;
  1916. sleepsecs(1);
  1917. }
  1918. if (t.minutes() >= 600) {
  1919. setState(FAIL);
  1920. errmsg = "Cannot go to critical section because secondaries cannot keep up";
  1921. error() << errmsg << migrateLog;
  1922. return;
  1923. }
  1924. }
  1925. {
  1926. // 5. wait for commit
  1927. setState(STEADY);
  1928. bool transferAfterCommit = false;
  // The loop ends when the state leaves STEADY/COMMIT_START (e.g. FAIL set by a startCommit()
  // timeout) or when the final flush after the commit succeeds.
  1929. while ( getState() == STEADY || getState() == COMMIT_START ) {
  1930. txn->checkForInterrupt();
  1931. // Make sure we do at least one transfer after recv'ing the commit message
  1932. // If we aren't sure that at least one transfer happens *after* our state
  1933. // changes to COMMIT_START, there could be mods still on the FROM shard that
  1934. // got logged *after* our _transferMods but *before* the critical section.
  1935. if ( getState() == COMMIT_START ) transferAfterCommit = true;
  1936. BSONObj res;
  1937. if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) {
  1938. log() << "_transferMods failed in STEADY state: " << res << migrateLog;
  1939. errmsg = res.toString();
  1940. setState(FAIL);
  1941. conn.done();
  1942. return;
  1943. }
  // If this batch changed anything, immediately ask for more before evaluating the exit
  // conditions below.
  1944. if (res["size"].number() > 0 &&
  1945. apply(txn, ns, min, max, shardKeyPattern, res, &lastOpApplied)) {
  1946. continue;
  1947. }
  1948. if ( getState() == ABORT ) {
  1949. return;
  1950. }
  1951. // We know we're finished when:
  1952. // 1) The from side has told us that it has locked writes (COMMIT_START)
  1953. // 2) We've checked at least one more time for un-transmitted mods
  1954. if ( getState() == COMMIT_START && transferAfterCommit == true ) {
  1955. if (flushPendingWrites(txn, ns, min, max, lastOpApplied, writeConcern))
  1956. break;
  1957. }
  1958. // Only sleep if we aren't committing
  1959. if ( getState() == STEADY ) sleepmillis( 10 );
  1960. }
  // startCommit() sets FAIL when its wait times out, which is what this message refers to.
  1961. if ( getState() == FAIL ) {
  1962. errmsg = "timed out waiting for commit";
  1963. return;
  1964. }
  1965. timing.done(5);
  1966. MONGO_FP_PAUSE_WHILE(migrateThreadHangAtStep5);
  1967. }
  1968. setState(DONE);
  1969. conn.done();
  1970. }
  1971. void status(BSONObjBuilder& b) {
  1972. boost::lock_guard<boost::mutex> sl(_mutex);
  1973. b.appendBool("active", _active);
  1974. b.append("ns", _ns);
  1975. b.append("from", _from);
  1976. b.append("min", _min);
  1977. b.append("max", _max);
  1978. b.append("shardKeyPattern", _shardKeyPattern);
  1979. b.append("state", stateToString(_state));
  1980. if (_state == FAIL) {
  1981. b.append("errmsg", _errmsg);
  1982. }
  1983. BSONObjBuilder bb(b.subobjStart("counts"));
  1984. bb.append("cloned", _numCloned);
  1985. bb.append("clonedBytes", _clonedBytes);
  1986. bb.append("catchup", _numCatchup);
  1987. bb.append("steady", _numSteady);
  1988. bb.done();
  1989. }
  1990. bool apply(OperationContext* txn,
  1991. const string& ns,
  1992. BSONObj min,
  1993. BSONObj max,
  1994. BSONObj shardKeyPattern,
  1995. const BSONObj& xfer,
  1996. OpTime* lastOpApplied) {
  1997. OpTime dummy;
  1998. if ( lastOpApplied == NULL ) {
  1999. lastOpApplied = &dummy;
  2000. }
  2001. bool didAnything = false;
  2002. if ( xfer["deleted"].isABSONObj() ) {
  2003. ScopedTransaction transaction(txn, MODE_IX);
  2004. Lock::DBLock dlk(txn->lockState(), nsToDatabaseSubstring(ns), MODE_IX);
  2005. Helpers::RemoveSaver rs( "moveChunk" , ns , "removedDuring" );
  2006. BSONObjIterator i( xfer["deleted"].Obj() );
  2007. while ( i.more() ) {
  2008. Lock::CollectionLock clk(txn->lockState(), ns, MODE_X);
  2009. OldClientContext ctx(txn, ns);
  2010. BSONObj id = i.next().Obj();
  2011. // do not apply deletes if they do not belong to the chunk being migrated
  2012. BSONObj fullObj;
  2013. if (Helpers::findById(txn, ctx.db(), ns.c_str(), id, fullObj)) {
  2014. if (!isInRange(fullObj , min , max , shardKeyPattern)) {
  2015. log() << "not applying out of range deletion: " << fullObj << migrateLog;
  2016. continue;
  2017. }
  2018. }
  2019. if (serverGlobalParams.moveParanoia) {
  2020. rs.goingToDelete(fullObj);
  2021. }
  2022. deleteObjects(txn,
  2023. ctx.db(),
  2024. ns,
  2025. id,
  2026. PlanExecutor::YIELD_MANUAL,
  2027. true /* justOne */,
  2028. false /* god */,
  2029. true /* fromMigrate */);
  2030. *lastOpApplied = repl::ReplClientInfo::forClient(txn->getClient()).getLastOp();
  2031. didAnything = true;
  2032. }
  2033. }
  2034. if ( xfer["reload"].isABSONObj() ) {
  2035. BSONObjIterator i( xfer["reload"].Obj() );
  2036. while ( i.more() ) {
  2037. OldClientWriteContext cx(txn, ns);
  2038. BSONObj updatedDoc = i.next().Obj();
  2039. BSONObj localDoc;
  2040. if (willOverrideLocalId(txn,
  2041. ns,
  2042. min,
  2043. max,
  2044. shardKeyPattern,
  2045. cx.db(),
  2046. updatedDoc,
  2047. &localDoc)) {
  2048. string errMsg =
  2049. str::stream() << "cannot migrate chunk, local document "
  2050. << localDoc
  2051. << " has same _id as reloaded remote document "
  2052. << updatedDoc;
  2053. warning() << errMsg << endl;
  2054. // Exception will abort migration cleanly
  2055. uasserted( 16977, errMsg );
  2056. }
  2057. // We are in write lock here, so sure we aren't killing
  2058. Helpers::upsert( txn, ns , updatedDoc , true );
  2059. *lastOpApplied = repl::ReplClientInfo::forClient(txn->getClient()).getLastOp();
  2060. didAnything = true;
  2061. }
  2062. }
  2063. return didAnything;
  2064. }
  2065. /**
  2066. * Checks if an upsert of a remote document will override a local document with the same _id
  2067. * but in a different range on this shard.
  2068. * Must be in WriteContext to avoid races and DBHelper errors.
  2069. * TODO: Could optimize this check out if sharding on _id.
  2070. */
  2071. bool willOverrideLocalId(OperationContext* txn,
  2072. const string& ns,
  2073. BSONObj min,
  2074. BSONObj max,
  2075. BSONObj shardKeyPattern,
  2076. Database* db,
  2077. BSONObj remoteDoc,
  2078. BSONObj* localDoc) {
  2079. *localDoc = BSONObj();
  2080. if ( Helpers::findById( txn, db, ns.c_str(), remoteDoc, *localDoc ) ) {
  2081. return !isInRange( *localDoc , min , max , shardKeyPattern );
  2082. }
  2083. return false;
  2084. }
  2085. /**
  2086. * Returns true if the majority of the nodes and the nodes corresponding to the given
  2087. * writeConcern (if not empty) have applied till the specified lastOp.
  2088. */
  2089. bool opReplicatedEnough(const OperationContext* txn,
  2090. const OpTime& lastOpApplied,
  2091. const WriteConcernOptions& writeConcern) {
  2092. WriteConcernOptions majorityWriteConcern;
  2093. majorityWriteConcern.wTimeout = -1;
  2094. majorityWriteConcern.wMode = WriteConcernOptions::kMajority;
  2095. Status majorityStatus = repl::getGlobalReplicationCoordinator()->awaitReplication(
  2096. txn, lastOpApplied, majorityWriteConcern).status;
  2097. if (!writeConcern.shouldWaitForOtherNodes()) {
  2098. return majorityStatus.isOK();
  2099. }
  2100. // Also enforce the user specified write concern after "majority" so it covers
  2101. // the union of the 2 write concerns.
  2102. WriteConcernOptions userWriteConcern(writeConcern);
  2103. userWriteConcern.wTimeout = -1;
  2104. Status userStatus = repl::getGlobalReplicationCoordinator()->awaitReplication(
  2105. txn, lastOpApplied, userWriteConcern).status;
  2106. return majorityStatus.isOK() && userStatus.isOK();
  2107. }
  2108. bool flushPendingWrites(OperationContext* txn,
  2109. const std::string& ns,
  2110. BSONObj min,
  2111. BSONObj max,
  2112. const OpTime& lastOpApplied,
  2113. const WriteConcernOptions& writeConcern) {
  2114. if (!opReplicatedEnough(txn, lastOpApplied, writeConcern)) {
  2115. OpTime op( lastOpApplied );
  2116. OCCASIONALLY warning() << "migrate commit waiting for a majority of slaves for '"
  2117. << ns << "' " << min << " -> " << max
  2118. << " waiting for: " << op
  2119. << migrateLog;
  2120. return false;
  2121. }
  2122. log() << "migrate commit succeeded flushing to secondaries for '" << ns << "' " << min << " -> " << max << migrateLog;
  2123. {
  2124. // Get global lock to wait for write to be commited to journal.
  2125. ScopedTransaction transaction(txn, MODE_S);
  2126. Lock::GlobalRead lk(txn->lockState());
  2127. // if durability is on, force a write to journal
  2128. if (getDur().commitNow(txn)) {
  2129. log() << "migrate commit flushed to journal for '" << ns << "' " << min << " -> " << max << migrateLog;
  2130. }
  2131. }
  2132. return true;
  2133. }
  2134. static string stateToString(State state) {
  2135. switch (state) {
  2136. case READY: return "ready";
  2137. case CLONE: return "clone";
  2138. case CATCHUP: return "catchup";
  2139. case STEADY: return "steady";
  2140. case COMMIT_START: return "commitStart";
  2141. case DONE: return "done";
  2142. case FAIL: return "fail";
  2143. case ABORT: return "abort";
  2144. }
  2145. verify(0);
  2146. return "";
  2147. }
  2148. bool startCommit() {
  2149. boost::unique_lock<boost::mutex> lock(_mutex);
  2150. if (_state != STEADY) {
  2151. return false;
  2152. }
  2153. boost::xtime xt;
  2154. boost::xtime_get(&xt, MONGO_BOOST_TIME_UTC);
  2155. xt.sec += 30;
  2156. _state = COMMIT_START;
  2157. while (_active) {
  2158. if ( ! isActiveCV.timed_wait( lock, xt ) ){
  2159. // TIMEOUT
  2160. _state = FAIL;
  2161. log() << "startCommit never finished!" << migrateLog;
  2162. return false;
  2163. }
  2164. }
  2165. if (_state == DONE) {
  2166. return true;
  2167. }
  2168. log() << "startCommit failed, final data failed to transfer" << migrateLog;
  2169. return false;
  2170. }
  2171. void abort() {
  2172. boost::lock_guard<boost::mutex> sl(_mutex);
  2173. _state = ABORT;
  2174. _errmsg = "aborted";
  2175. }
  2176. bool getActive() const { boost::lock_guard<boost::mutex> lk(_mutex); return _active; }
  2177. void setActive( bool b ) {
  2178. boost::lock_guard<boost::mutex> lk(_mutex);
  2179. _active = b;
  2180. isActiveCV.notify_all();
  2181. }
  2182. // Guards all fields.
  2183. mutable mongo::mutex _mutex;
  // True from a successful prepare() until go() calls setActive(false).
  2184. bool _active;
  // Notified by setActive(); startCommit() waits on it while _active is true.
  2185. boost::condition isActiveCV;
  // Parameters of the current (or most recent) migration, recorded by prepare().
  2186. std::string _ns;
  2187. std::string _from;
  2188. BSONObj _min;
  2189. BSONObj _max;
  2190. BSONObj _shardKeyPattern;
  // Progress counters reported by status(). _numCloned and _clonedBytes are advanced during
  // the bulk clone in _go(); nothing in this class increments _numCatchup or _numSteady.
  2191. long long _numCloned;
  2192. long long _clonedBytes;
  2193. long long _numCatchup;
  2194. long long _numSteady;
  // Current phase of the migration.
  2195. State _state;
  // Failure description; status() only reports it when _state is FAIL.
  2196. std::string _errmsg;
  2197. } migrateStatus;
  2198. void migrateThread(std::string ns,
  2199. BSONObj min,
  2200. BSONObj max,
  2201. BSONObj shardKeyPattern,
  2202. std::string fromShard,
  2203. OID epoch,
  2204. WriteConcernOptions writeConcern) {
  2205. Client::initThread( "migrateThread" );
  2206. OperationContextImpl txn;
  2207. if (getGlobalAuthorizationManager()->isAuthEnabled()) {
  2208. ShardedConnectionInfo::addHook();
  2209. AuthorizationSession::get(txn.getClient())->grantInternalAuthorization();
  2210. }
  2211. // Make curop active so this will show up in currOp.
  2212. txn.getCurOp()->reset();
  2213. migrateStatus.go(&txn, ns, min, max, shardKeyPattern, fromShard, epoch, writeConcern);
  2214. }
  2215. /**
  2216. * Command for initiating the recipient side of the migration to start copying data
  2217. * from the donor shard.
  2218. *
  2219. * {
  2220. * _recvChunkStart: "namespace",
  2221. * congfigServer: "hostAndPort",
  2222. * from: "hostAndPort",
  2223. * fromShardName: "shardName",
  2224. * toShardName: "shardName",
  2225. * min: {},
  2226. * max: {},
  2227. * shardKeyPattern: {},
  2228. *
  2229. * // optional
  2230. * secondaryThrottle: bool, // defaults to true
  2231. * writeConcern: {} // applies to individual writes.
  2232. * }
  2233. */
  2234. class RecvChunkStartCommand : public ChunkCommandHelper {
  2235. public:
  2236. void help(stringstream& h) const { h << "internal"; }
  2237. RecvChunkStartCommand() : ChunkCommandHelper( "_recvChunkStart" ) {}
  2238. virtual bool isWriteCommandForConfigServer() const { return false; }
  2239. virtual void addRequiredPrivileges(const std::string& dbname,
  2240. const BSONObj& cmdObj,
  2241. std::vector<Privilege>* out) {
  2242. ActionSet actions;
  2243. actions.addAction(ActionType::internal);
  2244. out->push_back(Privilege(ResourcePattern::forClusterResource(), actions));
  2245. }
  2246. bool run(OperationContext* txn,
  2247. const string&,
  2248. BSONObj& cmdObj,
  2249. int,
  2250. string& errmsg,
  2251. BSONObjBuilder& result) {
  2252. // Active state of TO-side migrations (MigrateStatus) is serialized by distributed
  2253. // collection lock.
  2254. if ( migrateStatus.getActive() ) {
  2255. errmsg = "migrate already in progress";
  2256. return false;
  2257. }
  2258. // Pending deletes (for migrations) are serialized by the distributed collection lock,
  2259. // we are sure we registered a delete for a range *before* we can migrate-in a
  2260. // subrange.
  2261. const size_t numDeletes = getDeleter()->getTotalDeletes();
  2262. if (numDeletes > 0) {
  2263. errmsg = str::stream() << "can't accept new chunks because "
  2264. << " there are still " << numDeletes
  2265. << " deletes from previous migration";
  2266. warning() << errmsg;
  2267. return false;
  2268. }
  2269. if (!shardingState.enabled()) {
  2270. if (!cmdObj["configServer"].eoo()) {
  2271. dassert(cmdObj["configServer"].type() == String);
  2272. ShardingState::initialize(cmdObj["configServer"].String());
  2273. }
  2274. else {
  2275. errmsg = str::stream()
  2276. << "cannot start recv'ing chunk, "
  2277. << "sharding is not enabled and no config server was provided";
  2278. warning() << errmsg;
  2279. return false;
  2280. }
  2281. }
  2282. if ( !cmdObj["toShardName"].eoo() ) {
  2283. dassert( cmdObj["toShardName"].type() == String );
  2284. shardingState.gotShardName( cmdObj["toShardName"].String() );
  2285. }
  2286. string ns = cmdObj.firstElement().String();
  2287. BSONObj min = cmdObj["min"].Obj().getOwned();
  2288. BSONObj max = cmdObj["max"].Obj().getOwned();
  2289. // Refresh our collection manager from the config server, we need a collection manager
  2290. // to start registering pending chunks.
  2291. // We force the remote refresh here to make the behavior consistent and predictable,
  2292. // generally we'd refresh anyway, and to be paranoid.
  2293. ChunkVersion currentVersion;
  2294. Status status = shardingState.refreshMetadataNow(txn, ns, &currentVersion );
  2295. if ( !status.isOK() ) {
  2296. errmsg = str::stream() << "cannot start recv'ing chunk "
  2297. << "[" << min << "," << max << ")"
  2298. << causedBy( status.reason() );
  2299. warning() << errmsg << endl;
  2300. return false;
  2301. }
  2302. // Process secondary throttle settings and assign defaults if necessary.
  2303. WriteConcernOptions writeConcern;
  2304. status = writeConcern.parseSecondaryThrottle(cmdObj, NULL);
  2305. if (!status.isOK()){
  2306. if (status.code() != ErrorCodes::WriteConcernNotDefined) {
  2307. warning() << status.toString() << endl;
  2308. return appendCommandStatus(result, status);
  2309. }
  2310. writeConcern = getDefaultWriteConcern();
  2311. }
  2312. else {
  2313. repl::ReplicationCoordinator* replCoordinator =
  2314. repl::getGlobalReplicationCoordinator();
  2315. if (replCoordinator->getReplicationMode() ==
  2316. repl::ReplicationCoordinator::modeMasterSlave &&
  2317. writeConcern.shouldWaitForOtherNodes()) {
  2318. warning() << "recvChunk cannot check if secondary throttle setting "
  2319. << writeConcern.toBSON()
  2320. << " can be enforced in a master slave configuration";
  2321. }
  2322. Status status = replCoordinator->checkIfWriteConcernCanBeSatisfied(writeConcern);
  2323. if (!status.isOK() && status != ErrorCodes::NoReplicationEnabled) {
  2324. warning() << status.toString() << endl;
  2325. return appendCommandStatus(result, status);
  2326. }
  2327. }
  2328. if (writeConcern.shouldWaitForOtherNodes() &&
  2329. writeConcern.wTimeout == WriteConcernOptions::kNoTimeout) {
  2330. // Don't allow no timeout.
  2331. writeConcern.wTimeout = kDefaultWTimeoutMs;
  2332. }
  2333. BSONObj shardKeyPattern;
  2334. if (cmdObj.hasField("shardKeyPattern")) {
  2335. shardKeyPattern = cmdObj["shardKeyPattern"].Obj().getOwned();
  2336. } else {
  2337. // shardKeyPattern may not be provided if another shard is from pre 2.2
  2338. // In that case, assume the shard key pattern is the same as the range
  2339. // specifiers provided.
  2340. BSONObj keya = Helpers::inferKeyPattern(min);
  2341. BSONObj keyb = Helpers::inferKeyPattern(max);
  2342. verify( keya == keyb );
  2343. warning() << "No shard key pattern provided by source shard for migration."
  2344. " This is likely because the source shard is running a version prior to 2.2."
  2345. " Falling back to assuming the shard key matches the pattern of the min and max"
  2346. " chunk range specifiers. Inferred shard key: " << keya << endl;
  2347. shardKeyPattern = keya.getOwned();
  2348. }
  2349. const string fromShard(cmdObj["from"].String());
  2350. // Set the TO-side migration to active
  2351. Status prepareStatus = migrateStatus.prepare(ns, fromShard, min, max, shardKeyPattern);
  2352. if (!prepareStatus.isOK()) {
  2353. return appendCommandStatus(result, prepareStatus);
  2354. }
  2355. boost::thread m(migrateThread,
  2356. ns,
  2357. min,
  2358. max,
  2359. shardKeyPattern,
  2360. fromShard,
  2361. currentVersion.epoch(),
  2362. writeConcern);
  2363. result.appendBool( "started" , true );
  2364. return true;
  2365. }
  2366. } recvChunkStartCmd;
  2367. class RecvChunkStatusCommand : public ChunkCommandHelper {
  2368. public:
  2369. void help(stringstream& h) const { h << "internal"; }
  2370. RecvChunkStatusCommand() : ChunkCommandHelper( "_recvChunkStatus" ) {}
  2371. virtual void addRequiredPrivileges(const std::string& dbname,
  2372. const BSONObj& cmdObj,
  2373. std::vector<Privilege>* out) {
  2374. ActionSet actions;
  2375. actions.addAction(ActionType::internal);
  2376. out->push_back(Privilege(ResourcePattern::forClusterResource(), actions));
  2377. }
  2378. bool run(OperationContext* txn,
  2379. const string&,
  2380. BSONObj& cmdObj,
  2381. int,
  2382. string& errmsg,
  2383. BSONObjBuilder& result) {
  2384. migrateStatus.status( result );
  2385. return 1;
  2386. }
  2387. } recvChunkStatusCommand;
  2388. class RecvChunkCommitCommand : public ChunkCommandHelper {
  2389. public:
  2390. void help(stringstream& h) const { h << "internal"; }
  2391. RecvChunkCommitCommand() : ChunkCommandHelper( "_recvChunkCommit" ) {}
  2392. virtual void addRequiredPrivileges(const std::string& dbname,
  2393. const BSONObj& cmdObj,
  2394. std::vector<Privilege>* out) {
  2395. ActionSet actions;
  2396. actions.addAction(ActionType::internal);
  2397. out->push_back(Privilege(ResourcePattern::forClusterResource(), actions));
  2398. }
  2399. bool run(OperationContext* txn,
  2400. const string&,
  2401. BSONObj& cmdObj,
  2402. int,
  2403. string& errmsg,
  2404. BSONObjBuilder& result) {
  2405. bool ok = migrateStatus.startCommit();
  2406. migrateStatus.status( result );
  2407. return ok;
  2408. }
  2409. } recvChunkCommitCommand;
  2410. class RecvChunkAbortCommand : public ChunkCommandHelper {
  2411. public:
  2412. void help(stringstream& h) const { h << "internal"; }
  2413. RecvChunkAbortCommand() : ChunkCommandHelper( "_recvChunkAbort" ) {}
  2414. virtual void addRequiredPrivileges(const std::string& dbname,
  2415. const BSONObj& cmdObj,
  2416. std::vector<Privilege>* out) {
  2417. ActionSet actions;
  2418. actions.addAction(ActionType::internal);
  2419. out->push_back(Privilege(ResourcePattern::forClusterResource(), actions));
  2420. }
  2421. bool run(OperationContext* txn,
  2422. const string&,
  2423. BSONObj& cmdObj,
  2424. int,
  2425. string& errmsg,
  2426. BSONObjBuilder& result) {
  2427. migrateStatus.abort();
  2428. migrateStatus.status( result );
  2429. return true;
  2430. }
  2431. } recvChunkAboortCommand;
  2432. class IsInRangeTest : public StartupTest {
  2433. public:
  2434. void run() {
  2435. BSONObj min = BSON( "x" << 1 );
  2436. BSONObj max = BSON( "x" << 5 );
  2437. BSONObj skey = BSON( "x" << 1 );
  2438. verify( ! isInRange( BSON( "x" << 0 ) , min , max , skey ) );
  2439. verify( isInRange( BSON( "x" << 1 ) , min , max , skey ) );
  2440. verify( isInRange( BSON( "x" << 3 ) , min , max , skey ) );
  2441. verify( isInRange( BSON( "x" << 4 ) , min , max , skey ) );
  2442. verify( ! isInRange( BSON( "x" << 5 ) , min , max , skey ) );
  2443. verify( ! isInRange( BSON( "x" << 6 ) , min , max , skey ) );
  2444. BSONObj obj = BSON( "n" << 3 );
  2445. BSONObj min2 = BSON( "x" << BSONElementHasher::hash64( obj.firstElement() , 0 ) - 2 );
  2446. BSONObj max2 = BSON( "x" << BSONElementHasher::hash64( obj.firstElement() , 0 ) + 2 );
  2447. BSONObj hashedKey = BSON( "x" << "hashed" );
  2448. verify( isInRange( BSON( "x" << 3 ) , min2 , max2 , hashedKey ) );
  2449. verify( ! isInRange( BSON( "x" << 3 ) , min , max , hashedKey ) );
  2450. verify( ! isInRange( BSON( "x" << 4 ) , min2 , max2 , hashedKey ) );
  2451. LOG(1) << "isInRangeTest passed" << migrateLog;
  2452. }
  2453. } isInRangeTest;
  2454. }