/src/mongo/s/d_migrate.cpp
C++ | 2942 lines | 2084 code | 487 blank | 371 comment | 273 complexity | a46ce05711deb0377b47f46d4360984d MD5 | raw file
Possible License(s): BSD-3-Clause-No-Nuclear-License-2014, GPL-2.0, Apache-2.0, BSD-3-Clause, WTFPL
Large files are truncated, but you can click here to view the full file
- // d_migrate.cpp
- /**
- * Copyright (C) 2008-2014 MongoDB Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects
- * for all of the code used other than as permitted herein. If you modify
- * file(s) with this exception, you may extend this exception to your
- * version of the file(s), but you are not obligated to do so. If you do not
- * wish to do so, delete this exception statement from your version. If you
- * delete this exception statement from all source files in the program,
- * then also delete it in the license file.
- */
- /**
- these are commands that live in mongod
- mostly around shard management and checking
- */
- #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kSharding
- #include "mongo/platform/basic.h"
- #include <algorithm>
- #include <boost/scoped_ptr.hpp>
- #include <boost/thread/thread.hpp>
- #include <map>
- #include <string>
- #include <vector>
- #include "mongo/client/connpool.h"
- #include "mongo/client/dbclientcursor.h"
- #include "mongo/db/auth/action_set.h"
- #include "mongo/db/auth/action_type.h"
- #include "mongo/db/auth/authorization_manager.h"
- #include "mongo/db/auth/authorization_manager_global.h"
- #include "mongo/db/auth/authorization_session.h"
- #include "mongo/db/auth/privilege.h"
- #include "mongo/db/catalog/document_validation.h"
- #include "mongo/db/catalog/index_create.h"
- #include "mongo/db/clientcursor.h"
- #include "mongo/db/commands.h"
- #include "mongo/db/concurrency/lock_state.h"
- #include "mongo/db/db_raii.h"
- #include "mongo/db/dbhelpers.h"
- #include "mongo/db/exec/plan_stage.h"
- #include "mongo/db/field_parser.h"
- #include "mongo/db/service_context.h"
- #include "mongo/db/hasher.h"
- #include "mongo/db/jsobj.h"
- #include "mongo/db/op_observer.h"
- #include "mongo/db/operation_context_impl.h"
- #include "mongo/db/ops/delete.h"
- #include "mongo/db/query/internal_plans.h"
- #include "mongo/db/query/query_knobs.h"
- #include "mongo/db/range_deleter_service.h"
- #include "mongo/db/repl/repl_client_info.h"
- #include "mongo/db/repl/replication_coordinator_global.h"
- #include "mongo/db/storage/mmap_v1/dur.h"
- #include "mongo/db/write_concern.h"
- #include "mongo/logger/ramlog.h"
- #include "mongo/s/catalog/catalog_manager.h"
- #include "mongo/s/catalog/type_chunk.h"
- #include "mongo/s/chunk.h"
- #include "mongo/s/chunk_version.h"
- #include "mongo/s/config.h"
- #include "mongo/s/d_state.h"
- #include "mongo/s/catalog/dist_lock_manager.h"
- #include "mongo/s/grid.h"
- #include "mongo/s/client/shard.h"
- #include "mongo/util/assert_util.h"
- #include "mongo/util/elapsed_tracker.h"
- #include "mongo/util/exit.h"
- #include "mongo/util/fail_point_service.h"
- #include "mongo/util/log.h"
- #include "mongo/util/processinfo.h"
- #include "mongo/util/queue.h"
- #include "mongo/util/startup_test.h"
- // Pause while a fail point is enabled.
- #define MONGO_FP_PAUSE_WHILE(symbol) while (MONGO_FAIL_POINT(symbol)) { sleepmillis(100); }
- using namespace std;
- namespace {
- using boost::scoped_ptr;
- using mongo::WriteConcernOptions;
- using mongo::repl::ReplicationCoordinator;
- using mongo::repl::OpTime;
- const int kDefaultWTimeoutMs = 60 * 1000;
- const WriteConcernOptions DefaultWriteConcern(2, WriteConcernOptions::NONE, kDefaultWTimeoutMs);
- /**
-  * Chooses the write concern applied to migration cleanup (on the donor shard) and to
-  * document cloning (on the recipient shard).
-  *
-  * DefaultWriteConcern is used only when this node runs in replica set mode and the
-  * replication coordinator reports that it can be satisfied. In every other case the
-  * result is w:1 with a wtimeout of 0.
-  */
- WriteConcernOptions getDefaultWriteConcern() {
-     ReplicationCoordinator* const coordinator =
-         mongo::repl::getGlobalReplicationCoordinator();
-     const bool inReplSetMode =
-         coordinator->getReplicationMode() ==
-             mongo::repl::ReplicationCoordinator::modeReplSet;
-     // The satisfiability check is only consulted in replica set mode (short-circuit).
-     if (inReplSetMode &&
-         coordinator->checkIfWriteConcernCanBeSatisfied(DefaultWriteConcern).isOK()) {
-         return DefaultWriteConcern;
-     }
-     return WriteConcernOptions(1, WriteConcernOptions::NONE, 0);
- }
- }
- namespace mongo {
- MONGO_FP_DECLARE(failMigrationCommit);
- MONGO_FP_DECLARE(failMigrationConfigWritePrepare);
- MONGO_FP_DECLARE(failMigrationApplyOps);
- Tee* migrateLog = RamLog::get("migrate");
- /**
-  * Times the individual steps of a moveChunk and, when destroyed, writes the collected
-  * timings to the config change log as a "moveChunk.<where>" event.
-  *
-  * The event is noted "success" only if done() was called for every one of the expected
-  * steps; otherwise it is noted "aborted".
-  */
- class MoveTimingHelper {
- public:
-     /**
-      * @param where suffix of the change log event name ("moveChunk." + where).
-      * @param total number of steps expected to be reported through done().
-      * @param cmdErrmsg error string of the enclosing command. It is dereferenced
-      *        unconditionally in the destructor, so it must stay valid until then.
-      */
-     MoveTimingHelper(OperationContext* txn,
-                      const string& where,
-                      const string& ns,
-                      BSONObj min,
-                      BSONObj max ,
-                      int total,
-                      string* cmdErrmsg,
-                      string toShard,
-                      string fromShard)
-         : _txn(txn),
-           _where(where),
-           _ns(ns),
-           _to(toShard),
-           _from(fromShard),
-           _next(0),
-           _total(total),
-           _cmdErrmsg(cmdErrmsg) {
-         _b.append( "min" , min );
-         _b.append( "max" , max );
-     }
-     ~MoveTimingHelper() {
-         // Nothing may escape a destructor: logChange might not throw, but the BSON
-         // builder can.
-         try {
-             if ( !_to.empty() ) {
-                 _b.append( "to", _to );
-             }
-             if ( !_from.empty() ) {
-                 _b.append( "from", _from );
-             }
-             const bool allStepsReported = ( _next == _total );
-             if ( allStepsReported ) {
-                 _b.append( "note" , "success" );
-             }
-             else {
-                 _b.append( "note" , "aborted" );
-             }
-             if ( !_cmdErrmsg->empty() ) {
-                 _b.append( "errmsg" , *_cmdErrmsg );
-             }
-             const string eventName = string("moveChunk.") + _where;
-             grid.catalogManager()->logChange(_txn, eventName, _ns, _b.obj());
-         }
-         catch ( const std::exception& e ) {
-             warning() << "couldn't record timing for moveChunk '" << _where << "': " << e.what() << migrateLog;
-         }
-     }
-     /**
-      * Marks 'step' as finished. Steps must be reported in order starting at 1 and may
-      * not exceed the total given at construction. Records the time elapsed since the
-      * previous step and restarts the timer.
-      */
-     void done(int step) {
-         verify( step == ++_next );
-         verify( step <= _total );
-         stringstream ss;
-         ss << "step " << step << " of " << _total;
-         const string stepLabel = ss.str();
-         if ( CurOp* const curOp = _txn->getCurOp() ) {
-             curOp->setMessage( stepLabel.c_str() );
-         }
-         else {
-             warning() << "op is null in MoveTimingHelper::done" << migrateLog;
-         }
-         _b.appendNumber( stepLabel , _t.millis() );
-         _t.reset();
- #if 0
-         // debugging for memory leak?
-         ProcessInfo pi;
-         ss << " v:" << pi.getVirtualMemorySize()
-            << " r:" << pi.getResidentSize();
-         log() << ss.str() << migrateLog;
- #endif
-     }
- private:
-     OperationContext* const _txn;
-     Timer _t;               // time since construction or since the last done()
-     string _where;
-     string _ns;
-     string _to;
-     string _from;
-     int _next;              // number of steps reported so far
-     int _total;             // expected # of steps
-     const string* _cmdErrmsg;
-     BSONObjBuilder _b;      // accumulates the change log document
- };
- /**
-  * Shared base for the internal chunk-migration commands: admin-only, not allowed on
-  * secondaries (slaveOk() is false) and not a write command for the config server.
-  */
- class ChunkCommandHelper : public Command {
- public:
-     ChunkCommandHelper( const char * name ) : Command( name ) {}
-     virtual void help( stringstream& help ) const {
-         help << "internal - should not be called directly";
-     }
-     virtual bool slaveOk() const {
-         return false;
-     }
-     virtual bool adminOnly() const {
-         return true;
-     }
-     virtual bool isWriteCommandForConfigServer() const {
-         return false;
-     }
- };
- /**
-  * Returns true when the shard key extracted from 'obj' (according to 'shardKeyPattern')
-  * lies in the half-open range [min, max).
-  */
- bool isInRange( const BSONObj& obj ,
-                 const BSONObj& min ,
-                 const BSONObj& max ,
-                 const BSONObj& shardKeyPattern ) {
-     ShardKeyPattern pattern( shardKeyPattern );
-     const BSONObj key = pattern.extractShardKeyFromDoc( obj );
-     if ( key.woCompare( min ) < 0 ) {
-         // Below the inclusive lower bound.
-         return false;
-     }
-     // The upper bound is exclusive.
-     return key.woCompare( max ) < 0;
- }
- class MigrateFromStatus {
- public:
-     // Starts idle: no active migration, outside the critical section, nothing queued.
-     MigrateFromStatus()
-         : _inCriticalSection(false),
-           _memoryUsed(0),
-           _active(false) {}
-     /**
-      * Registers a new outgoing migration of the range [min, max) of 'ns'.
-      *
-      * @return false if the migration cannot start because another one is already
-      *         active; true once this object has been marked active.
-      */
-     bool start(OperationContext* txn,
-                const std::string& ns,
-                const BSONObj& min,
-                const BSONObj& max,
-                const BSONObj& shardKeyPattern) {
-         verify(!min.isEmpty());
-         verify(!max.isEmpty());
-         verify(!ns.empty());
-         // Global shared lock synchronizes with logOp; see the locking notes next to the
-         // member declarations.
-         Lock::GlobalRead globalShared(txn->lockState());
-         boost::lock_guard<boost::mutex> stateGuard(_mutex);
-         if (_active) {
-             return false;
-         }
-         _ns = ns;
-         _min = min;
-         _max = max;
-         _shardKeyPattern = shardKeyPattern;
-         // A previous migration must have been fully cleaned up by done().
-         verify(_deleted.size() == 0);
-         verify(_reload.size() == 0);
-         verify(_memoryUsed == 0);
-         _active = true;
-         // Lock order: _mutex before _cloneLocsMutex.
-         boost::lock_guard<boost::mutex> cloneGuard(_cloneLocsMutex);
-         verify(_cloneLocs.size() == 0);
-         return true;
-     }
-     /**
-      * Ends the current migration: marks this object inactive, leaves the critical
-      * section (waking any waiters) and drops all queued modifications and clone
-      * locations.
-      */
-     void done(OperationContext* txn) {
-         log() << "MigrateFromStatus::done About to acquire global lock to exit critical section"
-               << endl;
-         // Global shared lock synchronizes with logOp; see the locking notes next to the
-         // member declarations.
-         Lock::GlobalRead globalShared(txn->lockState());
-         boost::lock_guard<boost::mutex> stateGuard(_mutex);
-         _active = false;
-         _deleteNotifyExec.reset();
-         _inCriticalSection = false;
-         _inCriticalSectionCV.notify_all();
-         _deleted.clear();
-         _reload.clear();
-         _memoryUsed = 0;
-         // Lock order: _mutex before _cloneLocsMutex.
-         boost::lock_guard<boost::mutex> cloneGuard(_cloneLocsMutex);
-         _cloneLocs.clear();
-     }
-     /**
-      * Called for each logged write. If the write touches a document of the chunk that
-      * is currently being migrated, registers a change with the recovery unit so that
-      * the document's _id is queued (on commit) for transfer to the recipient.
-      *
-      * @param opstr operation type string; only its first character (and the second,
-      *        to tell 'db' apart from a delete) is inspected.
-      * @param patt if non-NULL, the _id is taken from here instead of from 'obj'.
-      * @param notInActiveChunk true for writes that must not be forwarded.
-      */
-     void logOp(OperationContext* txn,
-                const char* opstr,
-                const char* ns,
-                const BSONObj& obj,
-                BSONObj* patt,
-                bool notInActiveChunk) {
-         ensureShardVersionOKOrThrow(ns);
-         const char opType = opstr[0];
-         if (notInActiveChunk) {
-             // Writes issued by the migration machinery itself (e.g. cleanup) must not be
-             // sent to the recipient. Ops from _migrateClone and _transferMods are also
-             // skipped, since a chunk cannot be moved to the shard it already lives on.
-             return;
-         }
-         dassert(txn->lockState()->isWriteLocked()); // Must have Global IX.
-         if (!_active || _ns != ns) {
-             return;
-         }
-         // Only inserts, updates and real deletes are of interest. The 'db' opstr is not
-         // a delete; it merely announces that a database exists (for replication).
-         const bool isDbAnnouncement = (opType == 'd' && opstr[1] == 'b');
-         if (opType == 'n' || opType == 'c' || isDbAnnouncement) {
-             return;
-         }
-         const BSONElement idElem = patt ? patt->getField("_id") : obj["_id"];
-         if (idElem.eoo()) {
-             warning() << "logOpForSharding got mod with no _id, ignoring obj: "
-                       << obj << migrateLog;
-             return;
-         }
-         if (opType == 'i' && !isInRange(obj, _min, _max, _shardKeyPattern)) {
-             return;
-         }
-         BSONObj idObj(idElem.wrap());
-         if (opType == 'u') {
-             // The update op does not carry the full document, so look it up to decide
-             // whether it belongs to the migrating range.
-             BSONObj fullDoc;
-             OldClientContext ctx(txn, _ns, false);
-             const bool found =
-                 Helpers::findById(txn, ctx.db(), _ns.c_str(), idObj, fullDoc);
-             if (!found) {
-                 warning() << "logOpForSharding couldn't find: " << idObj
-                           << " even though should have" << migrateLog;
-                 dassert(false); // TODO: Abort the migration.
-                 return;
-             }
-             if (!isInRange(fullDoc, _min, _max, _shardKeyPattern)) {
-                 return;
-             }
-         }
-         // Deletes cannot be range-checked: the document is already gone.
-         txn->recoveryUnit()->registerChange(
-             new LogOpForShardingHandler(this, idObj, opType));
-     }
-     /**
-      * Moves entries from 'docIdList' into a new array named 'fieldName' inside
-      * 'builder', until the list is exhausted or 'size' reaches 1MB. Every entry that is
-      * visited is removed from 'docIdList'.
-      *
-      * With 'explode' set, each _id is replaced by the full document looked up in 'db';
-      * ids whose document cannot be found are dropped without being appended. 'size' is
-      * advanced by the byte size of whatever was appended.
-      *
-      * Nothing is appended (not even an empty array) if the list is empty or 'size'
-      * already exceeds the limit on entry.
-      *
-      * Should be holding the collection lock for ns if explode is true.
-      */
-     void xfer(OperationContext* txn,
-               const string& ns,
-               Database* db,
-               list<BSONObj> *docIdList,
-               BSONObjBuilder& builder,
-               const char* fieldName,
-               long long& size,
-               bool explode) {
-         const long long maxSize = 1024 * 1024;
-         if (docIdList->size() == 0 || size > maxSize) {
-             return;
-         }
-         BSONArrayBuilder arr(builder.subarrayStart(fieldName));
-         for (list<BSONObj>::iterator it = docIdList->begin();
-              it != docIdList->end() && size < maxSize;
-              it = docIdList->erase(it)) {
-             BSONObj idDoc = *it;
-             if (!explode) {
-                 arr.append(idDoc);
-                 size += idDoc.objsize();
-                 continue;
-             }
-             BSONObj fullDoc;
-             if (Helpers::findById(txn, db, ns.c_str(), idDoc, fullDoc)) {
-                 arr.append( fullDoc );
-                 size += fullDoc.objsize();
-             }
-         }
-         arr.done();
-     }
-     /**
-      * Invoked on behalf of the recipient of a migration: drains the queued deletions
-      * (as _ids, field "deleted") and the queued inserts/updates (as full documents,
-      * field "reload") into 'b', followed by the total "size" in bytes.
-      *
-      * @return false with 'errmsg' set if no migration is active.
-      */
-     bool transferMods(OperationContext* txn, string& errmsg, BSONObjBuilder& b) {
-         long long bytesAppended = 0;
-         {
-             AutoGetCollectionForRead ctx(txn, getNS());
-             boost::lock_guard<boost::mutex> stateGuard(_mutex);
-             if (!_active) {
-                 errmsg = "no active migration!";
-                 return false;
-             }
-             // TODO: fix SERVER-16540 race
-             Database* const db = ctx.getDb();
-             xfer(txn, _ns, db, &_deleted, b, "deleted", bytesAppended, false);
-             xfer(txn, _ns, db, &_reload, b, "reload", bytesAppended, true);
-         }
-         b.append( "size" , bytesAppended );
-         return true;
-     }
-     /**
-      * Scans the shard key index over [_min, _max) and stores the RecordId of every
-      * document of the migrating chunk in _cloneLocs (an ordered set, so the later
-      * clone visits them in RecordId order). Also installs the delete-notification
-      * executor so that records deleted afterwards are removed from _cloneLocs.
-      *
-      * @param maxChunkSize number of bytes beyond which a chunk's base data (no indices)
-      *        is considered too large to move.
-      * @param errmsg filled with textual description of error if this call returns false.
-      * @param result on a too-big chunk, receives "chunkTooBig" and "estimatedChunkSize".
-      * @return false if the collection or a suitable index is missing, or if the
-      *         approximate chunk size is too big to move; true otherwise.
-      */
-     bool storeCurrentLocs(OperationContext* txn,
-                           long long maxChunkSize,
-                           string& errmsg,
-                           BSONObjBuilder& result ) {
-         AutoGetCollectionForRead ctx(txn, getNS());
-         Collection* collection = ctx.getCollection();
-         if ( !collection ) {
-             errmsg = "ns not found, should be impossible";
-             return false;
-         }
-         // Allow multiKey based on the invariant that shard keys must be single-valued.
-         // Therefore, any multi-key index prefixed by shard key cannot be multikey over
-         // the shard key fields.
-         IndexDescriptor *idx =
-             collection->getIndexCatalog()->findShardKeyPrefixedIndex(txn,
-                                                                      _shardKeyPattern ,
-                                                                      false); // requireSingleKey
-         if (idx == NULL) {
-             errmsg = str::stream() << "can't find index with prefix " << _shardKeyPattern
-                                    << " in storeCurrentLocs for " << _ns;
-             return false;
-         }
-         // Assume both min and max non-empty, append MinKey's to make them fit chosen index
-         BSONObj min;
-         BSONObj max;
-         KeyPattern kp(idx->keyPattern());
-         {
-             // It's alright not to lock _mutex all the way through based on the assumption
-             // that this is only called by the main thread that drives the migration and
-             // only it can start and stop the current migration.
-             boost::lock_guard<boost::mutex> sl(_mutex);
-             invariant( _deleteNotifyExec.get() == NULL );
-             WorkingSet* ws = new WorkingSet();
-             DeleteNotificationStage* dns = new DeleteNotificationStage();
-             PlanExecutor* deleteNotifyExec;
-             // Takes ownership of 'ws' and 'dns'.
-             Status execStatus = PlanExecutor::make(txn,
-                                                    ws,
-                                                    dns,
-                                                    collection,
-                                                    PlanExecutor::YIELD_MANUAL,
-                                                    &deleteNotifyExec);
-             invariant(execStatus.isOK());
-             deleteNotifyExec->registerExec();
-             _deleteNotifyExec.reset(deleteNotifyExec);
-             min = Helpers::toKeyFormat(kp.extendRangeBound(_min, false));
-             max = Helpers::toKeyFormat(kp.extendRangeBound(_max, false));
-         }
-         auto_ptr<PlanExecutor> exec(
-             InternalPlanner::indexScan(txn, collection, idx, min, max, false));
-         // We can afford to yield here because any change to the base data that we might
-         // miss is already being queued and will migrate in the 'transferMods' stage.
-         exec->setYieldPolicy(PlanExecutor::YIELD_AUTO);
-         // Use the average object size to estimate how many objects a full chunk would
-         // carry, and compare against that while traversing the chunk's range below.
-         // There is a fair amount of slack before a chunk is declared too large because
-         // object sizes will vary.
-         unsigned long long maxRecsWhenFull;
-         long long avgRecSize;
-         const long long totalRecs = collection->numRecords(txn);
-         if ( totalRecs > 0 ) {
-             // Smallest possible BSON document: int32 length + terminating 0x00.
-             const long long kMinRecSize = 5;
-             avgRecSize = collection->dataSize(txn) / totalRecs;
-             // numRecords() and dataSize() are separate reads, so the integer average can
-             // come out as 0 when they disagree. Clamp it so the division below cannot
-             // divide by zero.
-             if ( avgRecSize < 1 ) {
-                 avgRecSize = kMinRecSize;
-             }
-             maxRecsWhenFull = maxChunkSize / avgRecSize;
-             maxRecsWhenFull = std::min((unsigned long long)(Chunk::MaxObjectPerChunk + 1) , 130 * maxRecsWhenFull / 100 /* slack */ );
-         }
-         else {
-             avgRecSize = 0;
-             maxRecsWhenFull = Chunk::MaxObjectPerChunk + 1;
-         }
-
-         // Do a full traversal of the chunk and don't stop even if we think it is a large
-         // chunk: in that case we still want the number of records for better reporting.
-         bool isLargeChunk = false;
-         unsigned long long recCount = 0;
-         RecordId dl;
-         while (PlanExecutor::ADVANCED == exec->getNext(NULL, &dl)) {
-             if ( ! isLargeChunk ) {
-                 boost::lock_guard<boost::mutex> lk(_cloneLocsMutex);
-                 _cloneLocs.insert( dl );
-             }
-             if ( ++recCount > maxRecsWhenFull ) {
-                 isLargeChunk = true;
-                 // continue on despite knowing that it will fail,
-                 // just to get the correct value for recCount.
-             }
-         }
-         exec.reset();
-         if ( isLargeChunk ) {
-             boost::lock_guard<boost::mutex> sl(_mutex);
-             warning() << "cannot move chunk: the maximum number of documents for a chunk is "
-                       << maxRecsWhenFull << " , the maximum chunk size is " << maxChunkSize
-                       << " , average document size is " << avgRecSize
-                       << ". Found " << recCount << " documents in chunk "
-                       << " ns: " << _ns << " "
-                       << _min << " -> " << _max << migrateLog;
-             result.appendBool( "chunkTooBig" , true );
-             result.appendNumber( "estimatedChunkSize" , (long long)(recCount * avgRecSize) );
-             errmsg = "chunk too big to move";
-             return false;
-         }
-         log() << "moveChunk number of documents: " << cloneLocsRemaining() << migrateLog;
-         txn->recoveryUnit()->commitAndRestart();
-         return true;
-     }
-     /**
-      * Appends the next batch of documents of the migrating chunk to 'result' as the
-      * array "objects", consuming the corresponding entries of _cloneLocs. A batch ends
-      * when _cloneLocs is exhausted or when adding another document would bring the
-      * array within 1KB of BSONObjMaxUserSize; at least one document is always appended
-      * if any remains.
-      *
-      * @return false with 'errmsg' set if no migration is active or the collection no
-      *         longer exists.
-      */
-     bool clone(OperationContext* txn, string& errmsg , BSONObjBuilder& result ) {
-         ElapsedTracker tracker(internalQueryExecYieldIterations,
-                                internalQueryExecYieldPeriodMS);
-         int allocSize = 0;
-         {
-             AutoGetCollectionForRead ctx(txn, getNS());
-             boost::lock_guard<boost::mutex> sl(_mutex);
-             if (!_active) {
-                 errmsg = "not active";
-                 return false;
-             }
-             Collection* collection = ctx.getCollection();
-             if (!collection) {
-                 errmsg = str::stream() << "collection " << _ns << " does not exist";
-                 return false;
-             }
-             // Estimate the buffer size in 64 bits and clamp it to the BSON limit BEFORE
-             // narrowing to int. Casting the raw product to int first would truncate (or
-             // go negative) for chunks whose estimated size exceeds INT_MAX.
-             const unsigned long long estimatedBytes =
-                 static_cast<unsigned long long>(12 + collection->averageObjectSize(txn)) *
-                 cloneLocsRemaining();
-             allocSize = static_cast<int>(
-                 std::min(static_cast<unsigned long long>(BSONObjMaxUserSize),
-                          estimatedBytes));
-         }
-         bool isBufferFilled = false;
-         BSONArrayBuilder clonedDocsArrayBuilder(allocSize);
-         while (!isBufferFilled) {
-             // Locks are re-acquired on every pass so that leaving the inner loop when
-             // the tracker's interval elapses releases them between passes.
-             AutoGetCollectionForRead ctx(txn, getNS());
-             boost::lock_guard<boost::mutex> sl(_mutex);
-             if (!_active) {
-                 errmsg = "not active";
-                 return false;
-             }
-             // TODO: fix SERVER-16540 race
-             Collection* collection = ctx.getCollection();
-             if (!collection) {
-                 errmsg = str::stream() << "collection " << _ns << " does not exist";
-                 return false;
-             }
-             boost::lock_guard<boost::mutex> lk(_cloneLocsMutex);
-             set<RecordId>::iterator cloneLocsIter = _cloneLocs.begin();
-             for ( ; cloneLocsIter != _cloneLocs.end(); ++cloneLocsIter) {
-                 if (tracker.intervalHasElapsed()) // should I yield?
-                     break;
-                 RecordId dl = *cloneLocsIter;
-                 Snapshotted<BSONObj> doc;
-                 if (!collection->findDoc(txn, dl, &doc)) {
-                     // doc was deleted
-                     continue;
-                 }
-                 // Use the builder size instead of accumulating 'doc's size so that we take
-                 // into consideration the overhead of BSONArray indices, and *always*
-                 // append one doc.
-                 if (clonedDocsArrayBuilder.arrSize() != 0 &&
-                     (clonedDocsArrayBuilder.len() + doc.value().objsize() + 1024)
-                     > BSONObjMaxUserSize) {
-                     isBufferFilled = true; // break out of outer while loop
-                     break;
-                 }
-                 clonedDocsArrayBuilder.append(doc.value());
-             }
-             // Everything before the iterator has been handled (appended or found deleted).
-             _cloneLocs.erase(_cloneLocs.begin(), cloneLocsIter);
-             // Note: must be holding _cloneLocsMutex, don't move this inside while condition!
-             if (_cloneLocs.empty()) {
-                 break;
-             }
-         }
-         result.appendArray("objects", clonedDocsArrayBuilder.arr());
-         return true;
-     }
-     /**
-      * Drops 'dl' from the set of records still to be cloned.
-      *
-      * clone() also detects deleted documents through findDoc, but that check only
-      * works for non-mmapv1 engines; this explicit removal is what mmapv1 needs.
-      */
-     void aboutToDelete( const RecordId& dl ) {
-         boost::lock_guard<boost::mutex> cloneGuard(_cloneLocsMutex);
-         _cloneLocs.erase( dl );
-     }
-     // Number of record ids that still have to be sent to the recipient.
-     std::size_t cloneLocsRemaining() {
-         boost::lock_guard<boost::mutex> cloneGuard(_cloneLocsMutex);
-         const std::size_t remaining = _cloneLocs.size();
-         return remaining;
-     }
-     // _memoryUsed converted from bytes to whole megabytes (rounded down).
-     long long mbUsed() const {
-         const long long bytesPerMB = 1024 * 1024;
-         boost::lock_guard<boost::mutex> stateGuard(_mutex);
-         return _memoryUsed / bytesPerMB;
-     }
-     // Whether the migration is currently in its critical section (read under _mutex).
-     bool getInCriticalSection() const {
-         boost::lock_guard<boost::mutex> stateGuard(_mutex);
-         const bool inCriticalSection = _inCriticalSection;
-         return inCriticalSection;
-     }
-     // Enters (true) or leaves (false) the critical section and wakes every thread
-     // blocked in waitTillNotInCriticalSection() so it can re-check the flag.
-     void setInCriticalSection( bool b ) {
-         boost::lock_guard<boost::mutex> stateGuard(_mutex);
-         _inCriticalSection = b;
-         _inCriticalSectionCV.notify_all();
-     }
-     // Copy of the namespace given to the most recent start(), taken under _mutex.
-     std::string getNS() const {
-         boost::lock_guard<boost::mutex> stateGuard(_mutex);
-         std::string nsCopy = _ns;
-         return nsCopy;
-     }
-     /**
-      * Blocks until the critical section has been left or 'maxSecondsToWait' seconds
-      * have passed, whichever comes first.
-      *
-      * @return true if we are NOT in the critical section, false if the wait timed out.
-      */
-     bool waitTillNotInCriticalSection( int maxSecondsToWait ) {
-         // Absolute deadline, so repeated wake-ups do not extend the total wait.
-         boost::xtime deadline;
-         boost::xtime_get(&deadline, MONGO_BOOST_TIME_UTC);
-         deadline.sec += maxSecondsToWait;
-         boost::unique_lock<boost::mutex> stateLock(_mutex);
-         for (;;) {
-             if (!_inCriticalSection) {
-                 return true;
-             }
-             if (!_inCriticalSectionCV.timed_wait(stateLock, deadline)) {
-                 return false;
-             }
-         }
-     }
-     // Whether a migration is currently active (read under _mutex).
-     bool isActive() const {
-         boost::lock_guard<boost::mutex> stateGuard(_mutex);
-         return _active;
-     }
- private:
-     // Reads _active while holding _mutex.
-     bool _getActive() const {
-         boost::lock_guard<boost::mutex> stateGuard(_mutex);
-         return _active;
-     }
-     // Writes _active while holding _mutex.
-     void _setActive( bool b ) {
-         boost::lock_guard<boost::mutex> stateGuard(_mutex);
-         _active = b;
-     }
-     /**
-      * Recovery-unit change registered by logOp. When the enclosing write commits, the
-      * affected document's _id is queued on the owning MigrateFromStatus so that the
-      * change reaches the recipient shard; a rollback queues nothing.
-      */
-     class LogOpForShardingHandler : public RecoveryUnit::Change {
-     public:
-         /**
-          * Invariant: idObj should belong to a document that is part of the active chunk
-          * being migrated. An owned copy of idObj is kept.
-          *
-          * @param op 'd' for a delete, 'i' or 'u' for an insert or update.
-          */
-         LogOpForShardingHandler(MigrateFromStatus* migrateFromStatus,
-                                 const BSONObj& idObj,
-                                 const char op)
-             : _migrateFromStatus(migrateFromStatus),
-               _idObj(idObj.getOwned()),
-               _op(op) {}
-         virtual void commit() {
-             const bool isDelete = (_op == 'd');
-             const bool isUpsertLike = (_op == 'i' || _op == 'u');
-             if (!isDelete && !isUpsertLike) {
-                 invariant(false);
-             }
-             boost::lock_guard<boost::mutex> sl(_migrateFromStatus->_mutex);
-             // Deletes go to _deleted; inserts and updates must be re-sent, so they go
-             // to _reload.
-             list<BSONObj>& queue =
-                 isDelete ? _migrateFromStatus->_deleted : _migrateFromStatus->_reload;
-             queue.push_back(_idObj);
-             _migrateFromStatus->_memoryUsed += _idObj.firstElement().size() + 5;
-         }
-         virtual void rollback() {
-             // Nothing was queued before commit, so there is nothing to undo.
-         }
-     private:
-         MigrateFromStatus* _migrateFromStatus;
-         const BSONObj _idObj;
-         const char _op;
-     };
-     /**
-      * Plan stage whose only purpose is to receive invalidation notifications; see
-      * invalidate(). It is never executed: work(), isEOF(), saveState(), restoreState()
-      * and the stats accessors all trip an invariant when called.
-      *
-      * XXX: move to the exec/ directory.
-      */
-     class DeleteNotificationStage : public PlanStage {
-     public:
-         virtual void invalidate(OperationContext* txn,
-                                 const RecordId& dl,
-                                 InvalidationType type);
-         virtual StageState work(WorkingSetID* out) {
-             invariant( false );
-         }
-         virtual bool isEOF() {
-             invariant( false );
-             return false;
-         }
-         virtual void kill() {
-             // Nothing to release.
-         }
-         virtual void saveState() {
-             invariant( false );
-         }
-         virtual void restoreState(OperationContext* opCtx) {
-             invariant( false );
-         }
-         virtual PlanStageStats* getStats() {
-             invariant( false );
-             return NULL;
-         }
-         virtual CommonStats* getCommonStats() const {
-             invariant( false );
-             return NULL;
-         }
-         virtual SpecificStats* getSpecificStats() const {
-             invariant( false );
-             return NULL;
-         }
-         virtual std::vector<PlanStage*> getChildren() const {
-             // Leaf stage: no children.
-             return vector<PlanStage*>();
-         }
-         virtual StageType stageType() const {
-             return STAGE_NOTIFY_DELETE;
-         }
-     };
- //
- // All member variables are labeled with one of the following codes indicating the
- // synchronization rules for accessing them.
- //
- // (M) Must hold _mutex for access.
- // (MG) For reads, _mutex *OR* Global IX Lock must be held.
- // For writes, the _mutex *AND* (Global Shared or Exclusive Lock) must be held.
- // (C) Must hold _cloneLocsMutex for access.
- //
- // Locking order:
- //
- // Global Lock -> _mutex -> _cloneLocsMutex
- mutable mongo::mutex _mutex; // Guards the members labeled (M) and (MG) below.
- boost::condition _inCriticalSectionCV; // (M) Notified whenever _inCriticalSection is written.
- // Whether the migration is currently in its critical section. This can be used to block
- // new writes; see waitTillNotInCriticalSection().
- bool _inCriticalSection; // (M)
- scoped_ptr<PlanExecutor> _deleteNotifyExec; // (M) Created in storeCurrentLocs(), released in done().
- // List of _id of documents that were inserted or updated and must be re-cloned.
- list<BSONObj> _reload; // (M)
- // List of _id of documents that were deleted during clone that should be deleted later.
- list<BSONObj> _deleted; // (M)
- // Bytes accounted for entries pushed onto _reload + _deleted. Only reset by done();
- // xfer() does not subtract the entries it drains.
- long long _memoryUsed; // (M)
- // True while a migration is active: set by start(), cleared by done().
- bool _active; // (MG)
- string _ns; // (MG)
- BSONObj _min; // (MG)
- BSONObj _max; // (MG)
- BSONObj _shardKeyPattern; // (MG)
- mutable mongo::mutex _cloneLocsMutex; // Guards _cloneLocs; acquired after _mutex.
- // Record ids that still need to be transferred from here to the other side.
- set<RecordId> _cloneLocs; // (C)
- } migrateFromStatus;
- // Forwards record deletions to the global migrateFromStatus so the record is no longer
- // scheduled for cloning; every other invalidation type is ignored.
- void MigrateFromStatus::DeleteNotificationStage::invalidate(OperationContext *txn,
-                                                             const RecordId& dl,
-                                                             InvalidationType type) {
-     if ( type != INVALIDATION_DELETION ) {
-         return;
-     }
-     migrateFromStatus.aboutToDelete( dl );
- }
- /**
-  * Scoped guard around the global migrateFromStatus: tries to start a migration on
-  * construction and, if that succeeded, ends it on destruction. If another migration
-  * was already active the destructor leaves that migration alone.
-  */
- struct MigrateStatusHolder {
-     MigrateStatusHolder( OperationContext* txn,
-                          const std::string& ns ,
-                          const BSONObj& min ,
-                          const BSONObj& max ,
-                          const BSONObj& shardKeyPattern )
-         : _txn(txn) {
-         const bool started = migrateFromStatus.start(txn, ns, min, max, shardKeyPattern);
-         _isAnotherMigrationActive = !started;
-     }
-     ~MigrateStatusHolder() {
-         if (_isAnotherMigrationActive) {
-             // Not ours to finish.
-             return;
-         }
-         migrateFromStatus.done(_txn);
-     }
-     // True when start() was refused because a migration was already in progress.
-     bool isAnotherMigrationActive() const {
-         return _isAnotherMigrationActive;
-     }
- private:
-     OperationContext* _txn;
-     bool _isAnotherMigrationActive;
- };
- // Free-function entry point that hands a logged write to the global migrateFromStatus;
- // all arguments are passed through unchanged to MigrateFromStatus::logOp.
- void logOpForSharding(OperationContext* txn,
-                       const char * opstr,
-                       const char * ns,
-                       const BSONObj& obj,
-                       BSONObj * patt,
-                       bool notInActiveChunk) {
-     MigrateFromStatus& status = migrateFromStatus;
-     status.logOp(txn, opstr, ns, obj, patt, notInActiveChunk);
- }
- /**
-  * Internal "_transferMods" command: returns the modifications queued by the active
-  * migration (see MigrateFromStatus::transferMods). Requires the 'internal' action on
-  * the cluster resource.
-  */
- class TransferModsCommand : public ChunkCommandHelper {
- public:
-     void help(stringstream& h) const {
-         h << "internal";
-     }
-     TransferModsCommand() : ChunkCommandHelper( "_transferMods" ) {}
-     virtual void addRequiredPrivileges(const std::string& dbname,
-                                        const BSONObj& cmdObj,
-                                        std::vector<Privilege>* out) {
-         ActionSet requiredActions;
-         requiredActions.addAction(ActionType::internal);
-         const Privilege clusterPrivilege(ResourcePattern::forClusterResource(),
-                                          requiredActions);
-         out->push_back(clusterPrivilege);
-     }
-     bool run(OperationContext* txn,
-              const string&,
-              BSONObj& cmdObj,
-              int,
-              string& errmsg,
-              BSONObjBuilder& result) {
-         const bool ok = migrateFromStatus.transferMods(txn, errmsg, result);
-         return ok;
-     }
- } transferModsCommand;
- /**
-  * Internal "_migrateClone" command: returns the next batch of documents of the chunk
-  * being migrated (see MigrateFromStatus::clone). Requires the 'internal' action on the
-  * cluster resource.
-  */
- class InitialCloneCommand : public ChunkCommandHelper {
- public:
-     void help(stringstream& h) const {
-         h << "internal";
-     }
-     InitialCloneCommand() : ChunkCommandHelper( "_migrateClone" ) {}
-     virtual void addRequiredPrivileges(const std::string& dbname,
-                                        const BSONObj& cmdObj,
-                                        std::vector<Privilege>* out) {
-         ActionSet requiredActions;
-         requiredActions.addAction(ActionType::internal);
-         const Privilege clusterPrivilege(ResourcePattern::forClusterResource(),
-                                          requiredActions);
-         out->push_back(clusterPrivilege);
-     }
-     bool run(OperationContext* txn,
-              const string&,
-              BSONObj& cmdObj,
-              int,
-              string& errmsg,
-              BSONObjBuilder& result) {
-         const bool ok = migrateFromStatus.clone(txn, errmsg, result);
-         return ok;
-     }
- } initialCloneCommand;
- // Tests can pause / resume moveChunk's progress at each step by enabling / disabling each fail point.
- MONGO_FP_DECLARE(moveChunkHangAtStep1);
- MONGO_FP_DECLARE(moveChunkHangAtStep2);
- MONGO_FP_DECLARE(moveChunkHangAtStep3);
- MONGO_FP_DECLARE(moveChunkHangAtStep4);
- MONGO_FP_DECLARE(moveChunkHangAtStep5);
- MONGO_FP_DECLARE(moveChunkHangAtStep6);
- /**
- * This is the main entry point for moveChunk.
- * It is called to initiate a move,
- * usually by a mongos.
- * This is called on the "from" side.
- *
- * Format:
- * {
- * moveChunk: "namespace",
- * from: "hostAndPort",
- * fromShard: "shardName",
- * to: "hostAndPort",
- * toShard: "shardName",
- * min: {},
- * max: {},
- * maxChunkSizeBytes: numeric,
- * configdb: "hostAndPort",
- *
- * // optional
- * secondaryThrottle: bool, //defaults to true.
- * writeConcern: {} // applies to individual writes.
- * }
- */
- class MoveChunkCommand : public Command {
- public:
- MoveChunkCommand() : Command( "moveChunk" ) {}
- virtual void help( stringstream& help ) const {
- help << "should not be calling this directly";
- }
- virtual bool slaveOk() const { return false; }
- virtual bool adminOnly() const { return true; }
- virtual bool isWriteCommandForConfigServer() const { return false; }
-     // Authorized only if the client holds the moveChunk action on exactly the namespace
-     // named by the command.
-     virtual Status checkAuthForCommand(ClientBasic* client,
-                                        const std::string& dbname,
-                                        const BSONObj& cmdObj) {
-         const NamespaceString targetNs(parseNs(dbname, cmdObj));
-         const bool authorized =
-             AuthorizationSession::get(client)->isAuthorizedForActionsOnResource(
-                 ResourcePattern::forExactNamespace(targetNs),
-                 ActionType::moveChunk);
-         if (authorized) {
-             return Status::OK();
-         }
-         return Status(ErrorCodes::Unauthorized, "Unauthorized");
-     }
-     // The target namespace is obtained from the command via parseNsFullyQualified.
-     virtual std::string parseNs(const std::string& dbname, const BSONObj& cmdObj) const {
-         const std::string fullNs = parseNsFullyQualified(dbname, cmdObj);
-         return fullNs;
-     }
- bool run(OperationContext* txn,
- const string& dbname,
- BSONObj& cmdObj,
- int,
- string& errmsg,
- BSONObjBuilder& result) {
- // 1. Parse options
- // 2. Make sure my view is complete and lock the distributed lock to ensure shard
- // metadata stability.
- // 3. Migration
- // Retrieve all RecordIds, which need to be migrated in order to do as little seeking
- // as possible during transfer. Retrieval of the RecordIds happens under a collection
- // lock, but then the collection lock is dropped. This opens up an opportunity for
- // repair or compact to invalidate these RecordIds, because these commands do not
- // synchronized with migration. Note that data modifications are not a problem,
- // because we are registered for change notifications.
- //
- // 4. pause till migrate caught up
- // 5. LOCK
- // a) update my config, essentially locking
- // b) finish migrate
- // c) update config server
- // d) logChange to config server
- // 6. wait for all current cursors to expire
- // 7. remove data locally
- // -------------------------------
- // 1.
- string ns = parseNs(dbname, cmdObj);
- // The shard addresses, redundant, but allows for validation
- string toShardHost = cmdObj["to"].str();
- string fromShardHost = cmdObj["from"].str();
- // The shard names
- string toShardName = cmdObj["toShard"].str();
- string fromShardName = cmdObj["fromShard"].str();
- // Process secondary throttle settings and assign defaults if necessary.
- BSONObj secThrottleObj;
- WriteConcernOptions writeConcern;
- Status status = writeConcern.parseSecondaryThrottle(cmdObj, &secThrottleObj);
- if (!status.isOK()){
- if (status.code() != ErrorCodes::WriteConcernNotDefined) {
- warning() << status.toString() << endl;
- return appendCommandStatus(result, status);
- }
- writeConcern = getDefaultWriteConcern();
- }
- else {
- repl::ReplicationCoordinator* replCoordinator =
- repl::getGlobalReplicationCoordinator();
- if (replCoordinator->getReplicationMode() ==
- repl::ReplicationCoordinator::modeMasterSlave &&
- writeConcern.shouldWaitForOtherNodes()) {
- warning() << "moveChunk cannot check if secondary throttle setting "
- << writeConcern.toBSON()
- << " can be enforced in a master slave configuration";
- }
- Status status = replCoordinator->checkIfWriteConcernCanBeSatisfied(writeConcern);
- if (!status.isOK() && status != ErrorCodes::NoReplicationEnabled) {
- warning() << status.toString() << endl;
- return appendCommandStatus(result, status);
- }
- }
- if (writeConcern.shouldWaitForOtherNodes() &&
- writeConcern.wTimeout == WriteConcernOptions::kNoTimeout) {
- // Don't allow no timeout.
- writeConcern.wTimeout = kDefaultWTimeoutMs;
- }
- // Do inline deletion
- bool waitForDelete = cmdObj["waitForDelete"].trueValue();
- if (waitForDelete) {
- log() << "moveChunk waiting for full cleanup after move" << endl;
- }
- BSONObj min = cmdObj["min"].Obj();
- BSONObj max = cmdObj["max"].Obj();
- BSONElement maxSizeElem = cmdObj["maxChunkSizeBytes"];
- if ( ns.empty() ) {
- errmsg = "need to specify namespace in command";
- return false;
- }
- if ( toShardName.empty() ) {
- errmsg = "need to specify shard to move chunk to";
- return false;
- }
- if ( fromShardName.empty() ) {
- errmsg = "need to specify shard to move chunk from";
- return false;
- }
- if ( min.isEmpty() ) {
- errmsg = "need to specify a min";
- return false;
- }
- if ( max.isEmpty() ) {
- errmsg = "need to specify a max";
- return false;
- }
- if ( maxSizeElem.eoo() || ! maxSizeElem.isNumber() ) {
- errmsg = "need to specify maxChunkSizeBytes";
- return false;
- }
- const long long maxChunkSize = maxSizeElem.numberLong(); // in bytes
- // This could be the first call that enables sharding - make sure we initialize the
- // sharding state for this shard.
- if ( ! shardingState.enabled() ) {
- if ( cmdObj["configdb"].type() != String ) {
- errmsg = "sharding not enabled";
- warning() << errmsg << endl;
- return false;
- }
- string configdb = cmdObj["configdb"].String();
- ShardingState::initialize(configdb);
- }
- // Initialize our current shard name in the shard state if needed
- shardingState.gotShardName(fromShardName);
- // Make sure we're as up-to-date as possible with shard information
- // This catches the case where we had previously changed a shard's host by
- // removing/adding a shard with the same name
- Shard::reloadShardInfo();
- Shard toShard(toShardName);
- Shard fromShard(fromShardName);
- ConnectionString configLoc = ConnectionString::parse(shardingState.getConfigServer(),
- errmsg);
- if (!configLoc.isValid()) {
- warning() << errmsg;
- return false;
- }
- MoveTimingHelper timing(txn, "from" , ns , min , max , 6 /* steps */ , &errmsg,
- toShardName, fromShardName );
- log() << "received moveChunk request: " << cmdObj << migrateLog;
- timing.done(1);
- MONGO_FP_PAUSE_WHILE(moveChunkHangAtStep1);
- // 2.
-
- if ( migrateFromStatus.isActive() ) {
- errmsg = "migration already in progress";
- warning() << errmsg << endl;
- return false;
- }
- //
- // Get the distributed lock
- //
- string whyMessage(str::stream() << "migrating chunk [" << minKey << ", " << maxKey
- << ") in " << ns);
- auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock(
- ns, whyMessage);
- if (!scopedDistLock.isOK()) {
- errmsg = stream() << "could not acquire collection lock for " << ns
- << " to migrate chunk [" << minKey << "," << maxKey << ")"
- << causedBy(scopedDistLock.getStatus());
- warning() << errmsg << endl;
- return false;
- }
- BSONObj chunkInfo =
- BSON("min" << min << "max" << max <<
- "from" << fromShard.getName() << "to" << toShard.getName());
- grid.catalogManager()->logChange(txn, "moveChunk.start", ns, chunkInfo);
- // Always refresh our metadata remotely
- ChunkVersion origShardVersion;
- Status refreshStatus = shardingState.refreshMetadataNow(txn, ns, &origShardVersion);
- if (!refreshStatus.isOK()) {
- errmsg = str::stream() << "moveChunk cannot start migrate of chunk "
- << "[" << minKey << "," << maxKey << ")"
- << causedBy(refreshStatus.reason());
- warning() << errmsg;
- return false;
- }
- if (origShardVersion.majorVersion() == 0) {
- // It makes no sense to migrate if our version is zero and we have no chunks
- errmsg = str::stream() << "moveChunk cannot start migrate of chunk "
- << "[" << minKey << "," << maxKey << ")"
- << " with zero shard version";
- warning() << errmsg;
- return false;
- }
- // From mongos >= v3.0.
- BSONElement epochElem(cmdObj["epoch"]);
- if (epochElem.type() == jstOID) {
- OID cmdEpoch = epochElem.OID();
- if (cmdEpoch != origShardVersion.epoch()) {
- errmsg = str::stream() << "moveChunk cannot move chunk "
- << "[" << minKey << ","
- << maxKey << "), "
- << "collection may have been dropped. "
- << "current epoch: " << origShardVersion.epoch()
- << ", cmd epoch: " << cmdEpoch;
- warning() << errmsg;
- return false;
- }
- }
- // Get collection metadata
- const CollectionMetadataPtr origCollMetadata(shardingState.getCollectionMetadata(ns));
- // With nonzero shard version, we must have metadata
- invariant(NULL != origCollMetadata);
- ChunkVersion origCollVersion = origCollMetadata->getCollVersion();
- BSONObj shardKeyPattern = origCollMetadata->getKeyPattern();
- // With nonzero shard version, we must have a coll version >= our shard version
- invariant(origCollVersion >= origShardVersion);
- // With nonzero shard version, we must have a shard key
- invariant(!shardKeyPattern.isEmpty());
- ChunkType origChunk;
- if (!origCollMetadata->getNextChunk(min, &origChunk)
- || origChunk.getMin().woCompare(min) || origChunk.getMax().woCompare(max)) {
- // Our boundaries are different from those passed in
- errmsg = str::stream() << "moveChunk cannot find chunk "
- << "[" << minKey << "," << maxKey << ")"
- << " to migrate, the chunk boundaries may be stale";
- warning() << errmsg;
- return false;
- }
- log() << "moveChunk request accepted at version " << origShardVersion;
- timing.done(2);
- MONGO_FP_PAUSE_WHILE(moveChunkHangAtStep2);
- // 3.
- MigrateStatusHolder statusHolder(txn, ns, min, max, shardKeyPattern);
-
- if (statusHolder.isAnotherMigrationActive()) {
- errmsg = "moveChunk is already in progress from this shard";
- warning() << errmsg << endl;
- return false;
- }
- …
Large files are truncated, but you can click here to view the full file