PageRenderTime 43ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 1ms

/src/mongo/db/ops/query.cpp

http://github.com/mongodb/mongo
C++ | 1053 lines | 817 code | 151 blank | 85 comment | 142 complexity | 1234885867e5d7c0a72833b72d04dfb5 MD5 | raw file
Possible License(s): BSD-3-Clause-No-Nuclear-License-2014, GPL-2.0, Apache-2.0, BSD-3-Clause, WTFPL
  1. // query.cpp
  2. /**
  3. * Copyright (C) 2008 10gen Inc.
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License, version 3,
  7. * as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU Affero General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Affero General Public License
  15. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. */
  17. #include "pch.h"
  18. #include "query.h"
  19. #include "../pdfile.h"
  20. #include "../clientcursor.h"
  21. #include "../oplog.h"
  22. #include "../../bson/util/builder.h"
  23. #include "../replutil.h"
  24. #include "../scanandorder.h"
  25. #include "../commands.h"
  26. #include "../queryoptimizer.h"
  27. #include "../../s/d_logic.h"
  28. #include "../../server.h"
  29. #include "../queryoptimizercursor.h"
  30. #include "../pagefault.h"
  31. namespace mongo {
  /**
   * Byte threshold after which no further objects are added to a reply.
   * A reply may end up slightly larger than this value: it is a threshold
   * rather than a hard limit.
   */
  const int MaxBytesToReturnToClientAtOnce = 4 << 20;
  36. bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
  37. try {
  38. return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions);
  39. }
  40. catch( SendStaleConfigException& ){
  41. throw;
  42. }
  43. catch ( AssertionException& e ) {
  44. verify( e.getCode() != SendStaleConfigCode && e.getCode() != RecvStaleConfigCode );
  45. e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" );
  46. curop.debug().exceptionInfo = e.getInfo();
  47. }
  48. anObjBuilder.append("errmsg", "db assertion failure");
  49. anObjBuilder.append("ok", 0.0);
  50. BSONObj x = anObjBuilder.done();
  51. b.appendBuf((void*) x.objdata(), x.objsize());
  52. return true;
  53. }
  54. BSONObj id_obj = fromjson("{\"_id\":1}");
  55. BSONObj empty_obj = fromjson("{}");
  56. //int dump = 0;
  57. /* empty result for error conditions */
  58. QueryResult* emptyMoreResult(long long cursorid) {
  59. BufBuilder b(32768);
  60. b.skip(sizeof(QueryResult));
  61. QueryResult *qr = (QueryResult *) b.buf();
  62. qr->cursorId = 0; // 0 indicates no more data to retrieve.
  63. qr->startingFrom = 0;
  64. qr->len = b.len();
  65. qr->setOperation(opReply);
  66. qr->initializeResultFlags();
  67. qr->nReturned = 0;
  68. b.decouple();
  69. return qr;
  70. }
  71. QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
  72. exhaust = false;
  73. int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;
  74. BufBuilder b( bufSize );
  75. b.skip(sizeof(QueryResult));
  76. int resultFlags = ResultFlag_AwaitCapable;
  77. int start = 0;
  78. int n = 0;
  79. Client::ReadContext ctx(ns);
  80. // call this readlocked so state can't change
  81. replVerifyReadsOk();
  82. ClientCursor::Pin p(cursorid);
  83. ClientCursor *cc = p.c();
  84. if ( unlikely(!cc) ) {
  85. LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
  86. cursorid = 0;
  87. resultFlags = ResultFlag_CursorNotFound;
  88. }
  89. else {
  90. // check for spoofing of the ns such that it does not match the one originally there for the cursor
  91. uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));
  92. if ( pass == 0 )
  93. cc->updateSlaveLocation( curop );
  94. int queryOptions = cc->queryOptions();
  95. curop.debug().query = cc->query();
  96. curop.setQuery( cc->query() );
  97. start = cc->pos();
  98. Cursor *c = cc->c();
  99. c->recoverFromYield();
  100. DiskLoc last;
  101. // This manager may be stale, but it's the state of chunking when the cursor was created.
  102. ShardChunkManagerPtr manager = cc->getChunkManager();
  103. while ( 1 ) {
  104. if ( !c->ok() ) {
  105. if ( c->tailable() ) {
  106. /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however
  107. advance() can still be retries as a reactivation attempt. when there is new data, it will
  108. return true. that's what we are doing here.
  109. */
  110. if ( c->advance() )
  111. continue;
  112. if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
  113. return 0;
  114. }
  115. break;
  116. }
  117. p.release();
  118. bool ok = ClientCursor::erase(cursorid);
  119. verify(ok);
  120. cursorid = 0;
  121. cc = 0;
  122. break;
  123. }
  124. MatchDetails details;
  125. if ( cc->fields && cc->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
  126. // field projection specified, and contains an array operator
  127. details.requestElemMatchKey();
  128. }
  129. // in some cases (clone collection) there won't be a matcher
  130. if ( !c->currentMatches( &details ) ) {
  131. }
  132. else if ( manager && ! manager->belongsToMe( cc ) ){
  133. LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
  134. }
  135. else {
  136. if( c->getsetdup(c->currLoc()) ) {
  137. //out() << " but it's a dup \n";
  138. }
  139. else {
  140. last = c->currLoc();
  141. n++;
  142. cc->fillQueryResultFromObj( b, &details );
  143. if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
  144. c->advance();
  145. cc->incPos( n );
  146. break;
  147. }
  148. }
  149. }
  150. c->advance();
  151. if ( ! cc->yieldSometimes( ( c->ok() && c->keyFieldsOnly() ) ?
  152. ClientCursor::DontNeed : ClientCursor::WillNeed ) ) {
  153. ClientCursor::erase(cursorid);
  154. cursorid = 0;
  155. cc = 0;
  156. break;
  157. }
  158. }
  159. if ( cc ) {
  160. if ( c->supportYields() ) {
  161. ClientCursor::YieldData data;
  162. verify( cc->prepareToYield( data ) );
  163. }
  164. else {
  165. cc->c()->noteLocation();
  166. }
  167. cc->mayUpgradeStorage();
  168. cc->storeOpForSlave( last );
  169. exhaust = cc->queryOptions() & QueryOption_Exhaust;
  170. }
  171. }
  172. QueryResult *qr = (QueryResult *) b.buf();
  173. qr->len = b.len();
  174. qr->setOperation(opReply);
  175. qr->_resultFlags() = resultFlags;
  176. qr->cursorId = cursorid;
  177. qr->startingFrom = start;
  178. qr->nReturned = n;
  179. b.decouple();
  180. return qr;
  181. }
  182. ResultDetails::ResultDetails() :
  183. match(),
  184. orderedMatch(),
  185. loadedRecord(),
  186. chunkSkip() {
  187. }
  188. ExplainRecordingStrategy::ExplainRecordingStrategy
  189. ( const ExplainQueryInfo::AncillaryInfo &ancillaryInfo ) :
  190. _ancillaryInfo( ancillaryInfo ) {
  191. }
  192. shared_ptr<ExplainQueryInfo> ExplainRecordingStrategy::doneQueryInfo() {
  193. shared_ptr<ExplainQueryInfo> ret = _doneQueryInfo();
  194. ret->setAncillaryInfo( _ancillaryInfo );
  195. return ret;
  196. }
  197. NoExplainStrategy::NoExplainStrategy() :
  198. ExplainRecordingStrategy( ExplainQueryInfo::AncillaryInfo() ) {
  199. }
  200. shared_ptr<ExplainQueryInfo> NoExplainStrategy::_doneQueryInfo() {
  201. verify( false );
  202. return shared_ptr<ExplainQueryInfo>();
  203. }
  204. MatchCountingExplainStrategy::MatchCountingExplainStrategy
  205. ( const ExplainQueryInfo::AncillaryInfo &ancillaryInfo ) :
  206. ExplainRecordingStrategy( ancillaryInfo ),
  207. _orderedMatches() {
  208. }
  209. void MatchCountingExplainStrategy::noteIterate( const ResultDetails& resultDetails ) {
  210. _noteIterate( resultDetails );
  211. if ( resultDetails.orderedMatch ) {
  212. ++_orderedMatches;
  213. }
  214. }
  215. SimpleCursorExplainStrategy::SimpleCursorExplainStrategy
  216. ( const ExplainQueryInfo::AncillaryInfo &ancillaryInfo,
  217. const shared_ptr<Cursor> &cursor ) :
  218. MatchCountingExplainStrategy( ancillaryInfo ),
  219. _cursor( cursor ),
  220. _explainInfo( new ExplainSinglePlanQueryInfo() ) {
  221. }
  222. void SimpleCursorExplainStrategy::notePlan( bool scanAndOrder, bool indexOnly ) {
  223. _explainInfo->notePlan( *_cursor, scanAndOrder, indexOnly );
  224. }
  225. void SimpleCursorExplainStrategy::_noteIterate( const ResultDetails& resultDetails ) {
  226. _explainInfo->noteIterate( resultDetails.match,
  227. resultDetails.loadedRecord,
  228. resultDetails.chunkSkip,
  229. *_cursor );
  230. }
  231. void SimpleCursorExplainStrategy::noteYield() {
  232. _explainInfo->noteYield();
  233. }
  234. shared_ptr<ExplainQueryInfo> SimpleCursorExplainStrategy::_doneQueryInfo() {
  235. _explainInfo->noteDone( *_cursor );
  236. return _explainInfo->queryInfo();
  237. }
  238. QueryOptimizerCursorExplainStrategy::QueryOptimizerCursorExplainStrategy
  239. ( const ExplainQueryInfo::AncillaryInfo &ancillaryInfo,
  240. const shared_ptr<QueryOptimizerCursor> &cursor ) :
  241. MatchCountingExplainStrategy( ancillaryInfo ),
  242. _cursor( cursor ) {
  243. }
  244. void QueryOptimizerCursorExplainStrategy::_noteIterate( const ResultDetails& resultDetails ) {
  245. // Note ordered matches only; if an unordered plan is selected, the explain result will
  246. // be updated with reviseN().
  247. _cursor->noteIterate( resultDetails.orderedMatch,
  248. resultDetails.loadedRecord,
  249. resultDetails.chunkSkip );
  250. }
  251. void QueryOptimizerCursorExplainStrategy::noteYield() {
  252. _cursor->noteYield();
  253. }
  254. shared_ptr<ExplainQueryInfo> QueryOptimizerCursorExplainStrategy::_doneQueryInfo() {
  255. return _cursor->explainQueryInfo();
  256. }
  257. ResponseBuildStrategy::ResponseBuildStrategy( const ParsedQuery &parsedQuery,
  258. const shared_ptr<Cursor> &cursor,
  259. BufBuilder &buf ) :
  260. _parsedQuery( parsedQuery ),
  261. _cursor( cursor ),
  262. _queryOptimizerCursor( dynamic_pointer_cast<QueryOptimizerCursor>( _cursor ) ),
  263. _buf( buf ) {
  264. }
  265. void ResponseBuildStrategy::resetBuf() {
  266. _buf.reset();
  267. _buf.skip( sizeof( QueryResult ) );
  268. }
  269. BSONObj ResponseBuildStrategy::current( bool allowCovered,
  270. ResultDetails* resultDetails ) const {
  271. if ( _parsedQuery.returnKey() ) {
  272. BSONObjBuilder bob;
  273. bob.appendKeys( _cursor->indexKeyPattern(), _cursor->currKey() );
  274. return bob.obj();
  275. }
  276. if ( allowCovered ) {
  277. const Projection::KeyOnly *keyFieldsOnly = _cursor->keyFieldsOnly();
  278. if ( keyFieldsOnly ) {
  279. return keyFieldsOnly->hydrate( _cursor->currKey() );
  280. }
  281. }
  282. resultDetails->loadedRecord = true;
  283. BSONObj ret = _cursor->current();
  284. verify( ret.isValid() );
  285. return ret;
  286. }
  287. OrderedBuildStrategy::OrderedBuildStrategy( const ParsedQuery &parsedQuery,
  288. const shared_ptr<Cursor> &cursor,
  289. BufBuilder &buf ) :
  290. ResponseBuildStrategy( parsedQuery, cursor, buf ),
  291. _skip( _parsedQuery.getSkip() ),
  292. _bufferedMatches() {
  293. }
  294. bool OrderedBuildStrategy::handleMatch( ResultDetails* resultDetails ) {
  295. DiskLoc loc = _cursor->currLoc();
  296. if ( _cursor->getsetdup( loc ) ) {
  297. return false;
  298. }
  299. if ( _skip > 0 ) {
  300. --_skip;
  301. return false;
  302. }
  303. BSONObj currentDocument = current( true, resultDetails );
  304. // Explain does not obey soft limits, so matches should not be buffered.
  305. if ( !_parsedQuery.isExplain() ) {
  306. fillQueryResultFromObj( _buf, _parsedQuery.getFields(),
  307. currentDocument, &resultDetails->matchDetails,
  308. ( _parsedQuery.showDiskLoc() ? &loc : 0 ) );
  309. ++_bufferedMatches;
  310. }
  311. resultDetails->match = true;
  312. resultDetails->orderedMatch = true;
  313. return true;
  314. }
  315. ReorderBuildStrategy* ReorderBuildStrategy::make( const ParsedQuery& parsedQuery,
  316. const shared_ptr<Cursor>& cursor,
  317. BufBuilder& buf,
  318. const QueryPlanSummary& queryPlan ) {
  319. auto_ptr<ReorderBuildStrategy> ret( new ReorderBuildStrategy( parsedQuery, cursor, buf ) );
  320. ret->init( queryPlan );
  321. return ret.release();
  322. }
  323. ReorderBuildStrategy::ReorderBuildStrategy( const ParsedQuery &parsedQuery,
  324. const shared_ptr<Cursor> &cursor,
  325. BufBuilder &buf ) :
  326. ResponseBuildStrategy( parsedQuery, cursor, buf ),
  327. _bufferedMatches() {
  328. }
  329. void ReorderBuildStrategy::init( const QueryPlanSummary &queryPlan ) {
  330. _scanAndOrder.reset( newScanAndOrder( queryPlan ) );
  331. }
  332. bool ReorderBuildStrategy::handleMatch( ResultDetails* resultDetails ) {
  333. if ( _cursor->getsetdup( _cursor->currLoc() ) ) {
  334. return false;
  335. }
  336. _handleMatchNoDedup( resultDetails );
  337. resultDetails->match = true;
  338. return true;
  339. }
  340. void ReorderBuildStrategy::_handleMatchNoDedup( ResultDetails* resultDetails ) {
  341. DiskLoc loc = _cursor->currLoc();
  342. _scanAndOrder->add( current( false, resultDetails ),
  343. _parsedQuery.showDiskLoc() ? &loc : 0 );
  344. }
  345. int ReorderBuildStrategy::rewriteMatches() {
  346. cc().curop()->debug().scanAndOrder = true;
  347. int ret = 0;
  348. _scanAndOrder->fill( _buf, &_parsedQuery, ret );
  349. _bufferedMatches = ret;
  350. return ret;
  351. }
  352. ScanAndOrder *
  353. ReorderBuildStrategy::newScanAndOrder( const QueryPlanSummary &queryPlan ) const {
  354. verify( !_parsedQuery.getOrder().isEmpty() );
  355. verify( _cursor->ok() );
  356. const FieldRangeSet *fieldRangeSet = 0;
  357. if ( queryPlan.valid() ) {
  358. fieldRangeSet = queryPlan._fieldRangeSetMulti.get();
  359. }
  360. else {
  361. verify( _queryOptimizerCursor );
  362. fieldRangeSet = _queryOptimizerCursor->initialFieldRangeSet();
  363. }
  364. verify( fieldRangeSet );
  365. return new ScanAndOrder( _parsedQuery.getSkip(),
  366. _parsedQuery.getNumToReturn(),
  367. _parsedQuery.getOrder(),
  368. *fieldRangeSet );
  369. }
  370. HybridBuildStrategy* HybridBuildStrategy::make( const ParsedQuery& parsedQuery,
  371. const shared_ptr<QueryOptimizerCursor>& cursor,
  372. BufBuilder& buf ) {
  373. auto_ptr<HybridBuildStrategy> ret( new HybridBuildStrategy( parsedQuery, cursor, buf ) );
  374. ret->init();
  375. return ret.release();
  376. }
  377. HybridBuildStrategy::HybridBuildStrategy( const ParsedQuery &parsedQuery,
  378. const shared_ptr<QueryOptimizerCursor> &cursor,
  379. BufBuilder &buf ) :
  380. ResponseBuildStrategy( parsedQuery, cursor, buf ),
  381. _orderedBuild( _parsedQuery, _cursor, _buf ),
  382. _reorderedMatches() {
  383. }
  384. void HybridBuildStrategy::init() {
  385. _reorderBuild.reset( ReorderBuildStrategy::make( _parsedQuery, _cursor, _buf,
  386. QueryPlanSummary() ) );
  387. }
  388. bool HybridBuildStrategy::handleMatch( ResultDetails* resultDetails ) {
  389. if ( !_queryOptimizerCursor->currentPlanScanAndOrderRequired() ) {
  390. return _orderedBuild.handleMatch( resultDetails );
  391. }
  392. return handleReorderMatch( resultDetails );
  393. }
  394. bool HybridBuildStrategy::handleReorderMatch( ResultDetails* resultDetails ) {
  395. DiskLoc loc = _cursor->currLoc();
  396. if ( _scanAndOrderDups.getsetdup( loc ) ) {
  397. return false;
  398. }
  399. resultDetails->match = true;
  400. try {
  401. _reorderBuild->_handleMatchNoDedup( resultDetails );
  402. } catch ( const UserException &e ) {
  403. if ( e.getCode() == ScanAndOrderMemoryLimitExceededAssertionCode ) {
  404. if ( _queryOptimizerCursor->hasPossiblyExcludedPlans() ) {
  405. _queryOptimizerCursor->clearIndexesForPatterns();
  406. throw QueryRetryException();
  407. }
  408. else if ( _queryOptimizerCursor->runningInitialInOrderPlan() ) {
  409. _queryOptimizerCursor->abortOutOfOrderPlans();
  410. return true;
  411. }
  412. }
  413. throw;
  414. }
  415. return true;
  416. }
  417. int HybridBuildStrategy::rewriteMatches() {
  418. if ( !_queryOptimizerCursor->completePlanOfHybridSetScanAndOrderRequired() ) {
  419. return _orderedBuild.rewriteMatches();
  420. }
  421. _reorderedMatches = true;
  422. resetBuf();
  423. return _reorderBuild->rewriteMatches();
  424. }
  425. int HybridBuildStrategy::bufferedMatches() const {
  426. return _reorderedMatches ?
  427. _reorderBuild->bufferedMatches() :
  428. _orderedBuild.bufferedMatches();
  429. }
  430. void HybridBuildStrategy::finishedFirstBatch() {
  431. _queryOptimizerCursor->abortOutOfOrderPlans();
  432. }
  433. QueryResponseBuilder *QueryResponseBuilder::make( const ParsedQuery &parsedQuery,
  434. const shared_ptr<Cursor> &cursor,
  435. const QueryPlanSummary &queryPlan,
  436. const BSONObj &oldPlan ) {
  437. auto_ptr<QueryResponseBuilder> ret( new QueryResponseBuilder( parsedQuery, cursor ) );
  438. ret->init( queryPlan, oldPlan );
  439. return ret.release();
  440. }
  441. QueryResponseBuilder::QueryResponseBuilder( const ParsedQuery &parsedQuery,
  442. const shared_ptr<Cursor> &cursor ) :
  443. _parsedQuery( parsedQuery ),
  444. _cursor( cursor ),
  445. _queryOptimizerCursor( dynamic_pointer_cast<QueryOptimizerCursor>( _cursor ) ),
  446. _buf( 32768 ) { // TODO be smarter here
  447. }
  448. void QueryResponseBuilder::init( const QueryPlanSummary &queryPlan, const BSONObj &oldPlan ) {
  449. _chunkManager = newChunkManager();
  450. _explain = newExplainRecordingStrategy( queryPlan, oldPlan );
  451. _builder = newResponseBuildStrategy( queryPlan );
  452. _builder->resetBuf();
  453. }
  454. bool QueryResponseBuilder::addMatch() {
  455. ResultDetails resultDetails;
  456. if ( _parsedQuery.getFields() && _parsedQuery.getFields()->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
  457. // field projection specified, and contains an array operator
  458. resultDetails.matchDetails.requestElemMatchKey();
  459. }
  460. bool match =
  461. currentMatches( &resultDetails ) &&
  462. chunkMatches( &resultDetails ) &&
  463. _builder->handleMatch( &resultDetails );
  464. _explain->noteIterate( resultDetails );
  465. return match;
  466. }
  467. void QueryResponseBuilder::noteYield() {
  468. _explain->noteYield();
  469. }
  470. bool QueryResponseBuilder::enoughForFirstBatch() const {
  471. return _parsedQuery.enoughForFirstBatch( _builder->bufferedMatches(), _buf.len() );
  472. }
  473. bool QueryResponseBuilder::enoughTotalResults() const {
  474. if ( _parsedQuery.isExplain() ) {
  475. return _parsedQuery.enoughForExplain( _explain->orderedMatches() );
  476. }
  477. return ( _parsedQuery.enough( _builder->bufferedMatches() ) ||
  478. _buf.len() >= MaxBytesToReturnToClientAtOnce );
  479. }
  480. void QueryResponseBuilder::finishedFirstBatch() {
  481. _builder->finishedFirstBatch();
  482. }
  483. int QueryResponseBuilder::handoff( Message &result ) {
  484. int rewriteCount = _builder->rewriteMatches();
  485. if ( _parsedQuery.isExplain() ) {
  486. shared_ptr<ExplainQueryInfo> explainInfo = _explain->doneQueryInfo();
  487. if ( rewriteCount != -1 ) {
  488. explainInfo->reviseN( rewriteCount );
  489. }
  490. _builder->resetBuf();
  491. fillQueryResultFromObj( _buf, 0, explainInfo->bson() );
  492. result.appendData( _buf.buf(), _buf.len() );
  493. _buf.decouple();
  494. return 1;
  495. }
  496. if ( _buf.len() > 0 ) {
  497. result.appendData( _buf.buf(), _buf.len() );
  498. _buf.decouple();
  499. }
  500. return _builder->bufferedMatches();
  501. }
  502. ShardChunkManagerPtr QueryResponseBuilder::newChunkManager() const {
  503. if ( !shardingState.needShardChunkManager( _parsedQuery.ns() ) ) {
  504. return ShardChunkManagerPtr();
  505. }
  506. return shardingState.getShardChunkManager( _parsedQuery.ns() );
  507. }
  508. shared_ptr<ExplainRecordingStrategy> QueryResponseBuilder::newExplainRecordingStrategy
  509. ( const QueryPlanSummary &queryPlan, const BSONObj &oldPlan ) const {
  510. if ( !_parsedQuery.isExplain() ) {
  511. return shared_ptr<ExplainRecordingStrategy>( new NoExplainStrategy() );
  512. }
  513. ExplainQueryInfo::AncillaryInfo ancillaryInfo;
  514. ancillaryInfo._oldPlan = oldPlan;
  515. if ( _queryOptimizerCursor ) {
  516. return shared_ptr<ExplainRecordingStrategy>
  517. ( new QueryOptimizerCursorExplainStrategy( ancillaryInfo, _queryOptimizerCursor ) );
  518. }
  519. shared_ptr<ExplainRecordingStrategy> ret
  520. ( new SimpleCursorExplainStrategy( ancillaryInfo, _cursor ) );
  521. ret->notePlan( queryPlan.valid() && queryPlan._scanAndOrderRequired,
  522. queryPlan._keyFieldsOnly );
  523. return ret;
  524. }
  525. shared_ptr<ResponseBuildStrategy> QueryResponseBuilder::newResponseBuildStrategy
  526. ( const QueryPlanSummary &queryPlan ) {
  527. bool unordered = _parsedQuery.getOrder().isEmpty();
  528. bool empty = !_cursor->ok();
  529. bool singlePlan = !_queryOptimizerCursor;
  530. bool singleOrderedPlan =
  531. singlePlan && ( !queryPlan.valid() || !queryPlan._scanAndOrderRequired );
  532. CandidatePlanCharacter queryOptimizerPlans;
  533. if ( _queryOptimizerCursor ) {
  534. queryOptimizerPlans = _queryOptimizerCursor->initialCandidatePlans();
  535. }
  536. if ( unordered ||
  537. empty ||
  538. singleOrderedPlan ||
  539. ( !singlePlan && !queryOptimizerPlans.mayRunOutOfOrderPlan() ) ) {
  540. return shared_ptr<ResponseBuildStrategy>
  541. ( new OrderedBuildStrategy( _parsedQuery, _cursor, _buf ) );
  542. }
  543. if ( singlePlan ||
  544. !queryOptimizerPlans.mayRunInOrderPlan() ) {
  545. return shared_ptr<ResponseBuildStrategy>
  546. ( ReorderBuildStrategy::make( _parsedQuery, _cursor, _buf, queryPlan ) );
  547. }
  548. return shared_ptr<ResponseBuildStrategy>
  549. ( HybridBuildStrategy::make( _parsedQuery, _queryOptimizerCursor, _buf ) );
  550. }
  551. bool QueryResponseBuilder::currentMatches( ResultDetails* resultDetails ) {
  552. bool matches = _cursor->currentMatches( &resultDetails->matchDetails );
  553. if ( resultDetails->matchDetails.hasLoadedRecord() ) {
  554. resultDetails->loadedRecord = true;
  555. }
  556. return matches;
  557. }
  558. bool QueryResponseBuilder::chunkMatches( ResultDetails* resultDetails ) {
  559. if ( !_chunkManager ) {
  560. return true;
  561. }
  562. // TODO: should make this covered at some point
  563. resultDetails->loadedRecord = true;
  564. if ( _chunkManager->belongsToMe( _cursor->current() ) ) {
  565. return true;
  566. }
  567. resultDetails->chunkSkip = true;
  568. return false;
  569. }
  570. /**
  571. * Run a query with a cursor provided by the query optimizer, or FindingStartCursor.
  572. * @yields the db lock.
  573. */
  574. string queryWithQueryOptimizer( int queryOptions, const string& ns,
  575. const BSONObj &jsobj, CurOp& curop,
  576. const BSONObj &query, const BSONObj &order,
  577. const shared_ptr<ParsedQuery> &pq_shared,
  578. const BSONObj &oldPlan,
  579. const ConfigVersion &shardingVersionAtStart,
  580. scoped_ptr<PageFaultRetryableSection>& parentPageFaultSection,
  581. scoped_ptr<NoPageFaultsAllowed>& noPageFault,
  582. Message &result ) {
  583. const ParsedQuery &pq( *pq_shared );
  584. shared_ptr<Cursor> cursor;
  585. QueryPlanSummary queryPlan;
  586. if ( pq.hasOption( QueryOption_OplogReplay ) ) {
  587. cursor = FindingStartCursor::getCursor( ns.c_str(), query, order );
  588. }
  589. else {
  590. cursor =
  591. NamespaceDetailsTransient::getCursor( ns.c_str(),
  592. query,
  593. order,
  594. QueryPlanSelectionPolicy::any(),
  595. pq_shared,
  596. false,
  597. &queryPlan );
  598. }
  599. verify( cursor );
  600. scoped_ptr<QueryResponseBuilder> queryResponseBuilder
  601. ( QueryResponseBuilder::make( pq, cursor, queryPlan, oldPlan ) );
  602. bool saveClientCursor = false;
  603. OpTime slaveReadTill;
  604. ClientCursor::Holder ccPointer( new ClientCursor( QueryOption_NoCursorTimeout, cursor,
  605. ns ) );
  606. for( ; cursor->ok(); cursor->advance() ) {
  607. bool yielded = false;
  608. if ( !ccPointer->yieldSometimes( ClientCursor::MaybeCovered, &yielded ) ||
  609. !cursor->ok() ) {
  610. cursor.reset();
  611. queryResponseBuilder->noteYield();
  612. // !!! TODO The queryResponseBuilder still holds cursor. Currently it will not do
  613. // anything unsafe with the cursor in handoff(), but this is very fragile.
  614. //
  615. // We don't fail the query since we're fine with returning partial data if the
  616. // collection was dropped.
  617. // NOTE see SERVER-2454.
  618. // TODO This is wrong. The cursor could be gone if the closeAllDatabases command
  619. // just ran.
  620. break;
  621. }
  622. if ( yielded ) {
  623. queryResponseBuilder->noteYield();
  624. }
  625. if ( pq.getMaxScan() && cursor->nscanned() > pq.getMaxScan() ) {
  626. break;
  627. }
  628. if ( !queryResponseBuilder->addMatch() ) {
  629. continue;
  630. }
  631. // Note slave's position in the oplog.
  632. if ( pq.hasOption( QueryOption_OplogReplay ) ) {
  633. BSONObj current = cursor->current();
  634. BSONElement e = current["ts"];
  635. if ( e.type() == Date || e.type() == Timestamp ) {
  636. slaveReadTill = e._opTime();
  637. }
  638. }
  639. if ( !cursor->supportGetMore() || pq.isExplain() ) {
  640. if ( queryResponseBuilder->enoughTotalResults() ) {
  641. break;
  642. }
  643. }
  644. else if ( queryResponseBuilder->enoughForFirstBatch() ) {
  645. // if only 1 requested, no cursor saved for efficiency...we assume it is findOne()
  646. if ( pq.wantMore() && pq.getNumToReturn() != 1 ) {
  647. queryResponseBuilder->finishedFirstBatch();
  648. if ( cursor->advance() ) {
  649. saveClientCursor = true;
  650. }
  651. }
  652. break;
  653. }
  654. }
  655. if ( cursor ) {
  656. if ( pq.hasOption( QueryOption_CursorTailable ) && pq.getNumToReturn() != 1 ) {
  657. cursor->setTailable();
  658. }
  659. // If the tailing request succeeded.
  660. if ( cursor->tailable() ) {
  661. saveClientCursor = true;
  662. }
  663. }
  664. if ( ! shardingState.getVersion( ns ).isWriteCompatibleWith( shardingVersionAtStart ) ) {
  665. // if the version changed during the query
  666. // we might be missing some data
  667. // and its safe to send this as mongos can resend
  668. // at this point
  669. throw SendStaleConfigException( ns , "version changed during initial query", shardingVersionAtStart, shardingState.getVersion( ns ) );
  670. }
  671. parentPageFaultSection.reset(0);
  672. noPageFault.reset( new NoPageFaultsAllowed() );
  673. int nReturned = queryResponseBuilder->handoff( result );
  674. ccPointer.reset();
  675. long long cursorid = 0;
  676. if ( saveClientCursor ) {
  677. // Create a new ClientCursor, with a default timeout.
  678. ccPointer.reset( new ClientCursor( queryOptions, cursor, ns,
  679. jsobj.getOwned() ) );
  680. cursorid = ccPointer->cursorid();
  681. DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
  682. if ( cursor->supportYields() ) {
  683. ClientCursor::YieldData data;
  684. ccPointer->prepareToYield( data );
  685. }
  686. else {
  687. ccPointer->c()->noteLocation();
  688. }
  689. // Save slave's position in the oplog.
  690. if ( pq.hasOption( QueryOption_OplogReplay ) && !slaveReadTill.isNull() ) {
  691. ccPointer->slaveReadTill( slaveReadTill );
  692. }
  693. if ( !ccPointer->ok() && ccPointer->c()->tailable() ) {
  694. DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
  695. }
  696. if( queryOptions & QueryOption_Exhaust ) {
  697. curop.debug().exhaust = true;
  698. }
  699. // Set attributes for getMore.
  700. ccPointer->setChunkManager( queryResponseBuilder->chunkManager() );
  701. ccPointer->setPos( nReturned );
  702. ccPointer->pq = pq_shared;
  703. ccPointer->fields = pq.getFieldPtr();
  704. ccPointer.release();
  705. }
  706. QueryResult *qr = (QueryResult *) result.header();
  707. qr->cursorId = cursorid;
  708. curop.debug().cursorid = ( cursorid == 0 ? -1 : qr->cursorId );
  709. qr->setResultFlagsToOk();
  710. // qr->len is updated automatically by appendData()
  711. curop.debug().responseLength = qr->len;
  712. qr->setOperation(opReply);
  713. qr->startingFrom = 0;
  714. qr->nReturned = nReturned;
  715. int duration = curop.elapsedMillis();
  716. bool dbprofile = curop.shouldDBProfile( duration );
  717. if ( dbprofile || duration >= cmdLine.slowMS ) {
  718. curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
  719. curop.debug().ntoskip = pq.getSkip();
  720. }
  721. curop.debug().nreturned = nReturned;
  722. return curop.debug().exhaust ? ns : "";
  723. }
  724. bool queryIdHack( const char* ns, const BSONObj& query, const ParsedQuery& pq, CurOp& curop, Message& result ) {
  725. // notes:
  726. // do not touch result inside of PageFaultRetryableSection area
  727. Client& currentClient = cc(); // only here since its safe and takes time
  728. auto_ptr< QueryResult > qr;
  729. {
  730. // this extra bracing is not strictly needed
  731. // but makes it clear what the rules are in different spots
  732. scoped_ptr<PageFaultRetryableSection> pgfs;
  733. if ( ! currentClient.getPageFaultRetryableSection() )
  734. pgfs.reset( new PageFaultRetryableSection() );
  735. while ( 1 ) {
  736. try {
  737. int n = 0;
  738. bool nsFound = false;
  739. bool indexFound = false;
  740. BSONObj resObject; // put inside since we don't own the memory
  741. Client::ReadContext ctx( ns , dbpath ); // read locks
  742. replVerifyReadsOk(&pq);
  743. bool found = Helpers::findById( currentClient, ns, query, resObject, &nsFound, &indexFound );
  744. if ( nsFound && ! indexFound ) {
  745. // we have to resort to a table scan
  746. return false;
  747. }
  748. if ( shardingState.needShardChunkManager( ns ) ) {
  749. ShardChunkManagerPtr m = shardingState.getShardChunkManager( ns );
  750. if ( m && ! m->belongsToMe( resObject ) ) {
  751. // I have something this _id
  752. // but it doesn't belong to me
  753. // so return nothing
  754. resObject = BSONObj();
  755. found = false;
  756. }
  757. }
  758. BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
  759. bb.skip(sizeof(QueryResult));
  760. curop.debug().idhack = true;
  761. if ( found ) {
  762. n = 1;
  763. fillQueryResultFromObj( bb , pq.getFields() , resObject );
  764. }
  765. qr.reset( (QueryResult *) bb.buf() );
  766. bb.decouple();
  767. qr->setResultFlagsToOk();
  768. qr->len = bb.len();
  769. curop.debug().responseLength = bb.len();
  770. qr->setOperation(opReply);
  771. qr->cursorId = 0;
  772. qr->startingFrom = 0;
  773. qr->nReturned = n;
  774. break;
  775. }
  776. catch ( PageFaultException& e ) {
  777. e.touch();
  778. }
  779. }
  780. }
  781. result.setData( qr.release(), true );
  782. return true;
  783. }
  784. /**
  785. * Run a query -- includes checking for and running a Command.
  786. * @return points to ns if exhaust mode. 0=normal mode
  787. * @locks the db mutex for reading (and potentially for writing temporarily to create a new db).
  788. * @yields the db mutex periodically after acquiring it.
  789. * @asserts on scan and order memory exhaustion and other cases.
  790. */
  791. string runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
  792. shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
  793. ParsedQuery& pq( *pq_shared );
  794. BSONObj jsobj = q.query;
  795. int queryOptions = q.queryOptions;
  796. const char *ns = q.ns;
  797. uassert( 16332 , "can't have an empty ns" , ns[0] );
  798. if( logLevel >= 2 )
  799. log() << "runQuery called " << ns << " " << jsobj << endl;
  800. curop.debug().ns = ns;
  801. curop.debug().ntoreturn = pq.getNumToReturn();
  802. curop.debug().query = jsobj;
  803. curop.setQuery(jsobj);
  804. const NamespaceString nsString( ns );
  805. uassert( 16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid() );
  806. // Run a command.
  807. if ( pq.couldBeCommand() ) {
  808. curop.markCommand();
  809. BufBuilder bb;
  810. bb.skip(sizeof(QueryResult));
  811. BSONObjBuilder cmdResBuf;
  812. if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
  813. curop.debug().iscommand = true;
  814. curop.debug().query = jsobj;
  815. auto_ptr< QueryResult > qr;
  816. qr.reset( (QueryResult *) bb.buf() );
  817. bb.decouple();
  818. qr->setResultFlagsToOk();
  819. qr->len = bb.len();
  820. curop.debug().responseLength = bb.len();
  821. qr->setOperation(opReply);
  822. qr->cursorId = 0;
  823. qr->startingFrom = 0;
  824. qr->nReturned = 1;
  825. result.setData( qr.release(), true );
  826. }
  827. else {
  828. uasserted(13530, "bad or malformed command request?");
  829. }
  830. return "";
  831. }
  832. bool explain = pq.isExplain();
  833. BSONObj order = pq.getOrder();
  834. BSONObj query = pq.getFilter();
  835. /* The ElemIter will not be happy if this isn't really an object. So throw exception
  836. here when that is true.
  837. (Which may indicate bad data from client.)
  838. */
  839. if ( query.objsize() == 0 ) {
  840. out() << "Bad query object?\n jsobj:";
  841. out() << jsobj.toString() << "\n query:";
  842. out() << query.toString() << endl;
  843. uassert( 10110 , "bad query object", false);
  844. }
  845. // Run a simple id query.
  846. if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {
  847. if ( queryIdHack( ns, query, pq, curop, result ) ) {
  848. return "";
  849. }
  850. }
  851. // sanity check the query and projection
  852. if ( pq.getFields() != NULL )
  853. pq.getFields()->validateQuery( query );
  854. // these now may stored in a ClientCursor or somewhere else,
  855. // so make sure we use a real copy
  856. jsobj = jsobj.getOwned();
  857. query = query.getOwned();
  858. order = order.getOwned();
  859. bool hasRetried = false;
  860. scoped_ptr<PageFaultRetryableSection> pgfs;
  861. scoped_ptr<NoPageFaultsAllowed> npfe;
  862. while ( 1 ) {
  863. if ( ! cc().getPageFaultRetryableSection() ) {
  864. verify( ! pgfs );
  865. pgfs.reset( new PageFaultRetryableSection() );
  866. }
  867. try {
  868. Client::ReadContext ctx( ns , dbpath ); // read locks
  869. const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns );
  870. replVerifyReadsOk(&pq);
  871. if ( pq.hasOption( QueryOption_CursorTailable ) ) {
  872. NamespaceDetails *d = nsdetails( ns );
  873. uassert( 13051, "tailable cursor requested on non capped collection", d && d->isCapped() );
  874. const BSONObj nat1 = BSON( "$natural" << 1 );
  875. if ( order.isEmpty() ) {
  876. order = nat1;
  877. }
  878. else {
  879. uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
  880. }
  881. }
  882. // Run a regular query.
  883. BSONObj oldPlan;
  884. if ( ! hasRetried && explain && ! pq.hasIndexSpecifier() ) {
  885. scoped_ptr<MultiPlanScanner> mps( MultiPlanScanner::make( ns, query, order ) );
  886. oldPlan = mps->cachedPlanExplainSummary();
  887. }
  888. return queryWithQueryOptimizer( queryOptions, ns, jsobj, curop, query, order,
  889. pq_shared, oldPlan, shardingVersionAtStart,
  890. pgfs, npfe, result );
  891. }
  892. catch ( PageFaultException& e ) {
  893. e.touch();
  894. }
  895. catch ( const QueryRetryException & ) {
  896. // In some cases the query may be retried if there is an in memory sort size assertion.
  897. verify( ! hasRetried );
  898. hasRetried = true;
  899. }
  900. }
  901. }
  902. } // namespace mongo