PageRenderTime 52ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/mongodb-1.8.3/db/query.cpp

https://bitbucket.org/wesc/debian-mongodb
C++ | 1208 lines | 959 code | 152 blank | 97 comment | 250 complexity | 0021872897dec5e3c9d3dde6b972f8eb MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
  1. // query.cpp
  2. /**
  3. * Copyright (C) 2008 10gen Inc.
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License, version 3,
  7. * as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU Affero General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Affero General Public License
  15. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. */
  17. #include "pch.h"
  18. #include "query.h"
  19. #include "pdfile.h"
  20. #include "jsobjmanipulator.h"
  21. #include "../bson/util/builder.h"
  22. #include <time.h>
  23. #include "introspect.h"
  24. #include "btree.h"
  25. #include "../util/lruishmap.h"
  26. #include "json.h"
  27. #include "repl.h"
  28. #include "replpair.h"
  29. #include "scanandorder.h"
  30. #include "security.h"
  31. #include "curop-inl.h"
  32. #include "commands.h"
  33. #include "queryoptimizer.h"
  34. #include "lasterror.h"
  35. #include "../s/d_logic.h"
  36. #include "repl_block.h"
  37. namespace mongo {
  38. /* We cut off further objects once we cross this threshold; thus, you might get
  39. a little bit more than this, it is a threshold rather than a limit.
  40. */
  // 4 MB. Compared against the reply buffer length while results are being batched in this file.
  41. const int MaxBytesToReturnToClientAtOnce = 4 * 1024 * 1024;
  42. //ns->query->DiskLoc
  43. // LRUishMap<BSONObj,DiskLoc,5> lrutest(123);
  // Defined elsewhere. When false, the query path in this file never saves a client cursor after the first batch.
  44. extern bool useCursors;
  // Defined elsewhere. When false, a hint supplied with the query is ignored.
  45. extern bool useHints;
  46. // Just try to identify best plan.
  47. class DeleteOp : public MultiCursor::CursorOp {
  48. public:
  49. DeleteOp( bool justOne, int& bestCount ) :
  50. justOne_( justOne ),
  51. count_(),
  52. bestCount_( bestCount ),
  53. _nscanned() {
  54. }
  55. virtual void _init() {
  56. c_ = qp().newCursor();
  57. }
  58. virtual bool prepareToYield() {
  59. if ( ! _cc ) {
  60. _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c_ , qp().ns() ) );
  61. }
  62. return _cc->prepareToYield( _yieldData );
  63. }
  64. virtual void recoverFromYield() {
  65. if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
  66. _cc.reset();
  67. c_.reset();
  68. massert( 13340, "cursor dropped during delete", false );
  69. }
  70. }
  71. virtual long long nscanned() {
  72. return c_.get() ? c_->nscanned() : _nscanned;
  73. }
  74. virtual void next() {
  75. if ( !c_->ok() ) {
  76. setComplete();
  77. return;
  78. }
  79. DiskLoc rloc = c_->currLoc();
  80. if ( matcher()->matches(c_->currKey(), rloc ) ) {
  81. if ( !c_->getsetdup(rloc) )
  82. ++count_;
  83. }
  84. c_->advance();
  85. _nscanned = c_->nscanned();
  86. if ( count_ > bestCount_ )
  87. bestCount_ = count_;
  88. if ( count_ > 0 ) {
  89. if ( justOne_ )
  90. setComplete();
  91. else if ( _nscanned >= 100 && count_ == bestCount_ )
  92. setComplete();
  93. }
  94. }
  95. virtual bool mayRecordPlan() const { return !justOne_; }
  96. virtual QueryOp *_createChild() const {
  97. bestCount_ = 0; // should be safe to reset this in contexts where createChild() is called
  98. return new DeleteOp( justOne_, bestCount_ );
  99. }
  100. virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); }
  101. private:
  102. bool justOne_;
  103. int count_;
  104. int &bestCount_;
  105. long long _nscanned;
  106. shared_ptr<Cursor> c_;
  107. ClientCursor::CleanupPointer _cc;
  108. ClientCursor::YieldData _yieldData;
  109. };
  110. /* ns: namespace, e.g. <database>.<collection>
  111. pattern: the "where" clause / criteria
  112. justOne: stop after 1 match
  113. god: allow access to system namespaces, and don't yield
  114. */
  115. long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
  116. if( !god ) {
  117. if ( strstr(ns, ".system.") ) {
  118. /* note a delete from system.indexes would corrupt the db
  119. if done here, as there are pointers into those objects in
  120. NamespaceDetails.
  121. */
  122. uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
  123. }
  124. if ( strchr( ns , '$' ) ) {
  125. log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
  126. uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
  127. }
  128. }
  129. NamespaceDetails *d = nsdetails( ns );
  130. if ( ! d )
  131. return 0;
  132. uassert( 10101 , "can't remove from a capped collection" , ! d->capped );
  133. long long nDeleted = 0;
  134. int best = 0;
  135. shared_ptr< MultiCursor::CursorOp > opPtr( new DeleteOp( justOneOrig, best ) );
  136. shared_ptr< MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, !god ) );
  137. if( !creal->ok() )
  138. return nDeleted;
  139. shared_ptr< Cursor > cPtr = creal;
  140. auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
  141. cc->setDoingDeletes( true );
  142. CursorId id = cc->cursorid();
  143. bool justOne = justOneOrig;
  144. bool canYield = !god && !creal->matcher()->docMatcher().atomic();
  145. do {
  146. if ( canYield && ! cc->yieldSometimes() ) {
  147. cc.release(); // has already been deleted elsewhere
  148. // TODO should we assert or something?
  149. break;
  150. }
  151. if ( !cc->ok() ) {
  152. break; // if we yielded, could have hit the end
  153. }
  154. // this way we can avoid calling updateLocation() every time (expensive)
  155. // as well as some other nuances handled
  156. cc->setDoingDeletes( true );
  157. DiskLoc rloc = cc->currLoc();
  158. BSONObj key = cc->currKey();
  159. // NOTE Calling advance() may change the matcher, so it's important
  160. // to try to match first.
  161. bool match = creal->matcher()->matches( key , rloc );
  162. if ( ! cc->advance() )
  163. justOne = true;
  164. if ( ! match )
  165. continue;
  166. assert( !cc->c()->getsetdup(rloc) ); // can't be a dup, we deleted it!
  167. if ( !justOne ) {
  168. /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore
  169. blocks. here we might call millions of times which would be bad.
  170. */
  171. cc->c()->noteLocation();
  172. }
  173. if ( logop ) {
  174. BSONElement e;
  175. if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
  176. BSONObjBuilder b;
  177. b.append( e );
  178. bool replJustOne = true;
  179. logOp( "d", ns, b.done(), 0, &replJustOne );
  180. }
  181. else {
  182. problem() << "deleted object without id, not logging" << endl;
  183. }
  184. }
  185. if ( rs )
  186. rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );
  187. theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
  188. nDeleted++;
  189. if ( justOne ) {
  190. break;
  191. }
  192. cc->c()->checkLocation();
  193. if( !god )
  194. getDur().commitIfNeeded();
  195. if( debug && god && nDeleted == 100 )
  196. log() << "warning high number of deletes with god=true which could use significant memory" << endl;
  197. }
  198. while ( cc->ok() );
  199. if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
  200. cc.release();
  201. }
  202. return nDeleted;
  203. }
  // NOTE(review): not referenced in the visible part of this file; presumably read from other translation units -- confirm.
  204. int otherTraceLevel = 0;
  // Forward declaration only; the definition is not in the visible part of this file.
  205. int initialExtentSize(int len);
  206. bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
  207. try {
  208. return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions);
  209. }
  210. catch ( AssertionException& e ) {
  211. e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" );
  212. }
  213. curop.debug().str << " assertion ";
  214. anObjBuilder.append("errmsg", "db assertion failure");
  215. anObjBuilder.append("ok", 0.0);
  216. BSONObj x = anObjBuilder.done();
  217. b.appendBuf((void*) x.objdata(), x.objsize());
  218. return true;
  219. }
  // NOTE(review): not referenced in the visible part of this file -- confirm it is still used elsewhere.
  220. int nCaught = 0;
  // The object {_id: 1}, built from JSON text by fromjson().
  221. BSONObj id_obj = fromjson("{\"_id\":1}");
  // The empty object {}, built from JSON text by fromjson().
  222. BSONObj empty_obj = fromjson("{}");
  223. //int dump = 0;
  224. /* empty result for error conditions */
  225. QueryResult* emptyMoreResult(long long cursorid) {
  226. BufBuilder b(32768);
  227. b.skip(sizeof(QueryResult));
  228. QueryResult *qr = (QueryResult *) b.buf();
  229. qr->cursorId = 0; // 0 indicates no more data to retrieve.
  230. qr->startingFrom = 0;
  231. qr->len = b.len();
  232. qr->setOperation(opReply);
  233. qr->nReturned = 0;
  234. b.decouple();
  235. return qr;
  236. }
  237. QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
  238. exhaust = false;
  239. ClientCursor::Pointer p(cursorid);
  240. ClientCursor *cc = p.c();
  241. int bufSize = 512;
  242. if ( cc ) {
  243. bufSize += sizeof( QueryResult );
  244. bufSize += MaxBytesToReturnToClientAtOnce;
  245. }
  246. BufBuilder b( bufSize );
  247. b.skip(sizeof(QueryResult));
  248. int resultFlags = ResultFlag_AwaitCapable;
  249. int start = 0;
  250. int n = 0;
  251. if ( !cc ) {
  252. log() << "getMore: cursorid not found " << ns << " " << cursorid << endl;
  253. cursorid = 0;
  254. resultFlags = ResultFlag_CursorNotFound;
  255. }
  256. else {
  257. if ( pass == 0 )
  258. cc->updateSlaveLocation( curop );
  259. int queryOptions = cc->queryOptions();
  260. if( pass == 0 ) {
  261. StringBuilder& ss = curop.debug().str;
  262. ss << " getMore: " << cc->query().toString() << " ";
  263. }
  264. start = cc->pos();
  265. Cursor *c = cc->c();
  266. c->checkLocation();
  267. DiskLoc last;
  268. scoped_ptr<Projection::KeyOnly> keyFieldsOnly;
  269. if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
  270. keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );
  271. while ( 1 ) {
  272. if ( !c->ok() ) {
  273. if ( c->tailable() ) {
  274. /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however
  275. advance() can still be retries as a reactivation attempt. when there is new data, it will
  276. return true. that's what we are doing here.
  277. */
  278. if ( c->advance() )
  279. continue;
  280. if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
  281. throw GetMoreWaitException();
  282. }
  283. break;
  284. }
  285. p.release();
  286. bool ok = ClientCursor::erase(cursorid);
  287. assert(ok);
  288. cursorid = 0;
  289. cc = 0;
  290. break;
  291. }
  292. // in some cases (clone collection) there won't be a matcher
  293. if ( c->matcher() && !c->matcher()->matches(c->currKey(), c->currLoc() ) ) {
  294. }
  295. /*
  296. TODO
  297. else if ( _chunkMatcher && ! _chunkMatcher->belongsToMe( c->currKey(), c->currLoc() ) ){
  298. cout << "TEMP skipping un-owned chunk: " << c->current() << endl;
  299. }
  300. */
  301. else {
  302. if( c->getsetdup(c->currLoc()) ) {
  303. //out() << " but it's a dup \n";
  304. }
  305. else {
  306. last = c->currLoc();
  307. n++;
  308. if ( keyFieldsOnly ) {
  309. fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) );
  310. }
  311. else {
  312. BSONObj js = c->current();
  313. // show disk loc should be part of the main query, not in an $or clause, so this should be ok
  314. fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
  315. }
  316. if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
  317. c->advance();
  318. cc->incPos( n );
  319. break;
  320. }
  321. }
  322. }
  323. c->advance();
  324. if ( ! cc->yieldSometimes() ) {
  325. ClientCursor::erase(cursorid);
  326. cursorid = 0;
  327. cc = 0;
  328. p.deleted();
  329. break;
  330. }
  331. }
  332. if ( cc ) {
  333. cc->updateLocation();
  334. cc->mayUpgradeStorage();
  335. cc->storeOpForSlave( last );
  336. exhaust = cc->queryOptions() & QueryOption_Exhaust;
  337. }
  338. }
  339. QueryResult *qr = (QueryResult *) b.buf();
  340. qr->len = b.len();
  341. qr->setOperation(opReply);
  342. qr->_resultFlags() = resultFlags;
  343. qr->cursorId = cursorid;
  344. qr->startingFrom = start;
  345. qr->nReturned = n;
  346. b.decouple();
  347. return qr;
  348. }
  349. class CountOp : public QueryOp {
  350. public:
  351. CountOp( const string& ns , const BSONObj &spec ) :
  352. _ns(ns), _capped(false), _count(), _myCount(),
  353. _skip( spec["skip"].numberLong() ),
  354. _limit( spec["limit"].numberLong() ),
  355. _nscanned(),
  356. _bc() {
  357. }
  358. virtual void _init() {
  359. _c = qp().newCursor();
  360. _capped = _c->capped();
  361. if ( qp().exactKeyMatch() && ! matcher()->needRecord() ) {
  362. _query = qp().simplifiedQuery( qp().indexKey() );
  363. _bc = dynamic_cast< BtreeCursor* >( _c.get() );
  364. _bc->forgetEndKey();
  365. }
  366. }
  367. virtual long long nscanned() {
  368. return _c.get() ? _c->nscanned() : _nscanned;
  369. }
  370. virtual bool prepareToYield() {
  371. if ( _c && !_cc ) {
  372. _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _ns.c_str() ) );
  373. }
  374. if ( _cc ) {
  375. return _cc->prepareToYield( _yieldData );
  376. }
  377. // no active cursor - ok to yield
  378. return true;
  379. }
  380. virtual void recoverFromYield() {
  381. if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
  382. _c.reset();
  383. _cc.reset();
  384. if ( _capped ) {
  385. msgassertedNoTrace( 13337, str::stream() << "capped cursor overrun during count: " << _ns );
  386. }
  387. else {
  388. // we don't fail query since we're fine with returning partial data if collection dropped
  389. }
  390. }
  391. }
  392. virtual void next() {
  393. if ( ! _c || !_c->ok() ) {
  394. setComplete();
  395. return;
  396. }
  397. _nscanned = _c->nscanned();
  398. if ( _bc ) {
  399. if ( _firstMatch.isEmpty() ) {
  400. _firstMatch = _bc->currKeyNode().key.copy();
  401. // if not match
  402. if ( _query.woCompare( _firstMatch, BSONObj(), false ) ) {
  403. setComplete();
  404. return;
  405. }
  406. _gotOne();
  407. }
  408. else {
  409. if ( ! _firstMatch.woEqual( _bc->currKeyNode().key ) ) {
  410. setComplete();
  411. return;
  412. }
  413. _gotOne();
  414. }
  415. }
  416. else {
  417. if ( !matcher()->matches(_c->currKey(), _c->currLoc() ) ) {
  418. }
  419. else if( !_c->getsetdup(_c->currLoc()) ) {
  420. _gotOne();
  421. }
  422. }
  423. _c->advance();
  424. }
  425. virtual QueryOp *_createChild() const {
  426. CountOp *ret = new CountOp( _ns , BSONObj() );
  427. ret->_count = _count;
  428. ret->_skip = _skip;
  429. ret->_limit = _limit;
  430. return ret;
  431. }
  432. long long count() const { return _count; }
  433. virtual bool mayRecordPlan() const {
  434. return ( _myCount > _limit / 2 ) || ( complete() && !stopRequested() );
  435. }
  436. private:
  437. void _gotOne() {
  438. if ( _skip ) {
  439. _skip--;
  440. return;
  441. }
  442. if ( _limit > 0 && _count >= _limit ) {
  443. setStop();
  444. return;
  445. }
  446. _count++;
  447. _myCount++;
  448. }
  449. string _ns;
  450. bool _capped;
  451. long long _count;
  452. long long _myCount;
  453. long long _skip;
  454. long long _limit;
  455. long long _nscanned;
  456. shared_ptr<Cursor> _c;
  457. BSONObj _query;
  458. BtreeCursor * _bc;
  459. BSONObj _firstMatch;
  460. ClientCursor::CleanupPointer _cc;
  461. ClientCursor::YieldData _yieldData;
  462. };
  463. /* { count: "collectionname"[, query: <query>] }
  464. returns -1 on ns does not exist error.
  465. */
  466. long long runCount( const char *ns, const BSONObj &cmd, string &err ) {
  467. Client::Context cx(ns);
  468. NamespaceDetails *d = nsdetails( ns );
  469. if ( !d ) {
  470. err = "ns missing";
  471. return -1;
  472. }
  473. BSONObj query = cmd.getObjectField("query");
  474. // count of all objects
  475. if ( query.isEmpty() ) {
  476. return applySkipLimit( d->stats.nrecords , cmd );
  477. }
  478. MultiPlanScanner mps( ns, query, BSONObj(), 0, true, BSONObj(), BSONObj(), false, true );
  479. CountOp original( ns , cmd );
  480. shared_ptr< CountOp > res = mps.runOp( original );
  481. if ( !res->complete() ) {
  482. log() << "Count with ns: " << ns << " and query: " << query
  483. << " failed with exception: " << res->exception()
  484. << endl;
  485. return 0;
  486. }
  487. return res->count();
  488. }
  489. class ExplainBuilder {
  490. // Note: by default we filter out allPlans and oldPlan in the shell's
  491. // explain() function. If you add any recursive structures, make sure to
  492. // edit the JS to make sure everything gets filtered.
  493. public:
  494. ExplainBuilder() : _i() {}
  495. void ensureStartScan() {
  496. if ( !_a.get() ) {
  497. _a.reset( new BSONArrayBuilder() );
  498. }
  499. }
  500. void noteCursor( Cursor *c ) {
  501. BSONObjBuilder b( _a->subobjStart() );
  502. b << "cursor" << c->toString() << "indexBounds" << c->prettyIndexBounds();
  503. b.done();
  504. }
  505. void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder,
  506. int millis, bool hint, int nYields , int nChunkSkips , bool indexOnly ) {
  507. if ( _i == 1 ) {
  508. _c.reset( new BSONArrayBuilder() );
  509. *_c << _b->obj();
  510. }
  511. if ( _i == 0 ) {
  512. _b.reset( new BSONObjBuilder() );
  513. }
  514. else {
  515. _b.reset( new BSONObjBuilder( _c->subobjStart() ) );
  516. }
  517. *_b << "cursor" << c->toString();
  518. _b->appendNumber( "nscanned", nscanned );
  519. _b->appendNumber( "nscannedObjects", nscannedObjects );
  520. *_b << "n" << n;
  521. if ( scanAndOrder )
  522. *_b << "scanAndOrder" << true;
  523. *_b << "millis" << millis;
  524. *_b << "nYields" << nYields;
  525. *_b << "nChunkSkips" << nChunkSkips;
  526. *_b << "isMultiKey" << c->isMultiKey();
  527. *_b << "indexOnly" << indexOnly;
  528. *_b << "indexBounds" << c->prettyIndexBounds();
  529. if ( !hint ) {
  530. *_b << "allPlans" << _a->arr();
  531. }
  532. if ( _i != 0 ) {
  533. _b->done();
  534. }
  535. _a.reset( 0 );
  536. ++_i;
  537. }
  538. BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) {
  539. if ( _i > 1 ) {
  540. BSONObjBuilder b;
  541. b << "clauses" << _c->arr();
  542. b.appendNumber( "nscanned", nscanned );
  543. b.appendNumber( "nscannedObjects", nscannedObjects );
  544. b << "n" << n;
  545. b << "millis" << millis;
  546. b.appendElements( suffix );
  547. return b.obj();
  548. }
  549. else {
  550. _b->appendElements( suffix );
  551. return _b->obj();
  552. }
  553. }
  554. private:
  555. auto_ptr< BSONArrayBuilder > _a;
  556. auto_ptr< BSONObjBuilder > _b;
  557. auto_ptr< BSONArrayBuilder > _c;
  558. int _i;
  559. };
  560. // Implements database 'query' requests using the query optimizer's QueryOp interface
  561. class UserQueryOp : public QueryOp {
  562. public:
  563. UserQueryOp( const ParsedQuery& pq, Message &response, ExplainBuilder &eb, CurOp &curop ) :
  564. _buf( 32768 ) , // TODO be smarter here
  565. _pq( pq ) ,
  566. _ntoskip( pq.getSkip() ) ,
  567. _nscanned(0), _oldNscanned(0), _nscannedObjects(0), _oldNscannedObjects(0),
  568. _n(0),
  569. _oldN(0),
  570. _nYields(),
  571. _nChunkSkips(),
  572. _chunkManager( shardingState.needShardChunkManager(pq.ns()) ?
  573. shardingState.getShardChunkManager(pq.ns()) : ShardChunkManagerPtr() ),
  574. _inMemSort(false),
  575. _capped(false),
  576. _saveClientCursor(false),
  577. _wouldSaveClientCursor(false),
  578. _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ),
  579. _response( response ),
  580. _eb( eb ),
  581. _curop( curop )
  582. {}
  583. virtual void _init() {
  584. // only need to put the QueryResult fields there if we're building the first buffer in the message.
  585. if ( _response.empty() ) {
  586. _buf.skip( sizeof( QueryResult ) );
  587. }
  588. if ( _oplogReplay ) {
  589. _findingStartCursor.reset( new FindingStartCursor( qp() ) );
  590. _capped = true;
  591. }
  592. else {
  593. _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() );
  594. _capped = _c->capped();
  595. // setup check for if we can only use index to extract
  596. if ( _c->modifiedKeys() == false && _c->isMultiKey() == false && _pq.getFields() ) {
  597. _keyFieldsOnly.reset( _pq.getFields()->checkKey( _c->indexKeyPattern() ) );
  598. }
  599. }
  600. if ( qp().scanAndOrderRequired() ) {
  601. _inMemSort = true;
  602. _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder() ) );
  603. }
  604. if ( _pq.isExplain() ) {
  605. _eb.noteCursor( _c.get() );
  606. }
  607. }
  608. virtual bool prepareToYield() {
  609. if ( _findingStartCursor.get() ) {
  610. return _findingStartCursor->prepareToYield();
  611. }
  612. else {
  613. if ( _c && !_cc ) {
  614. _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _pq.ns() ) );
  615. }
  616. if ( _cc ) {
  617. return _cc->prepareToYield( _yieldData );
  618. }
  619. }
  620. // no active cursor - ok to yield
  621. return true;
  622. }
  623. virtual void recoverFromYield() {
  624. _nYields++;
  625. if ( _findingStartCursor.get() ) {
  626. _findingStartCursor->recoverFromYield();
  627. }
  628. else if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
  629. _c.reset();
  630. _cc.reset();
  631. _so.reset();
  632. if ( _capped ) {
  633. msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() );
  634. }
  635. else {
  636. // we don't fail query since we're fine with returning partial data if collection dropped
  637. // todo: this is wrong. the cursor could be gone if closeAllDatabases command just ran
  638. }
  639. }
  640. }
  641. virtual long long nscanned() {
  642. if ( _findingStartCursor.get() ) {
  643. return 0; // should only be one query plan, so value doesn't really matter.
  644. }
  645. return _c.get() ? _c->nscanned() : _nscanned;
  646. }
  647. virtual void next() {
  648. if ( _findingStartCursor.get() ) {
  649. if ( _findingStartCursor->done() ) {
  650. _c = _findingStartCursor->cRelease();
  651. _findingStartCursor.reset( 0 );
  652. }
  653. else {
  654. _findingStartCursor->next();
  655. }
  656. _capped = true;
  657. return;
  658. }
  659. if ( !_c || !_c->ok() ) {
  660. finish( false );
  661. return;
  662. }
  663. bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors;
  664. if( 0 ) {
  665. cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl;
  666. }
  667. if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ) {
  668. finish( true ); //?
  669. return;
  670. }
  671. _nscanned = _c->nscanned();
  672. if ( !matcher()->matches(_c->currKey(), _c->currLoc() , &_details ) ) {
  673. // not a match, continue onward
  674. if ( _details.loadedObject )
  675. _nscannedObjects++;
  676. }
  677. else {
  678. _nscannedObjects++;
  679. DiskLoc cl = _c->currLoc();
  680. if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) {
  681. _nChunkSkips++;
  682. // log() << "TEMP skipping un-owned chunk: " << _c->current() << endl;
  683. }
  684. else if( _c->getsetdup(cl) ) {
  685. // dup
  686. }
  687. else {
  688. // got a match.
  689. if ( _inMemSort ) {
  690. // note: no cursors for non-indexed, ordered results. results must be fairly small.
  691. _so->add( _pq.returnKey() ? _c->currKey() : _c->current(), _pq.showDiskLoc() ? &cl : 0 );
  692. }
  693. else if ( _ntoskip > 0 ) {
  694. _ntoskip--;
  695. }
  696. else {
  697. if ( _pq.isExplain() ) {
  698. _n++;
  699. if ( n() >= _pq.getNumToReturn() && !_pq.wantMore() ) {
  700. // .limit() was used, show just that much.
  701. finish( true ); //?
  702. return;
  703. }
  704. }
  705. else {
  706. if ( _pq.returnKey() ) {
  707. BSONObjBuilder bb( _buf );
  708. bb.appendKeys( _c->indexKeyPattern() , _c->currKey() );
  709. bb.done();
  710. }
  711. else if ( _keyFieldsOnly ) {
  712. fillQueryResultFromObj( _buf , 0 , _keyFieldsOnly->hydrate( _c->currKey() ) );
  713. }
  714. else {
  715. BSONObj js = _c->current();
  716. assert( js.isValid() );
  717. if ( _oplogReplay ) {
  718. BSONElement e = js["ts"];
  719. if ( e.type() == Date || e.type() == Timestamp )
  720. _slaveReadTill = e._opTime();
  721. }
  722. fillQueryResultFromObj( _buf , _pq.getFields() , js , (_pq.showDiskLoc() ? &cl : 0));
  723. }
  724. _n++;
  725. if ( ! _c->supportGetMore() ) {
  726. if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ) {
  727. finish( true );
  728. return;
  729. }
  730. }
  731. else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ) {
  732. /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */
  733. if ( mayCreateCursor1 ) {
  734. _wouldSaveClientCursor = true;
  735. if ( _c->advance() ) {
  736. // more...so save a cursor
  737. _saveClientCursor = true;
  738. }
  739. }
  740. finish( true );
  741. return;
  742. }
  743. }
  744. }
  745. }
  746. }
  747. _c->advance();
  748. }
  749. // this plan won, so set data for response broadly
  750. void finish( bool stop ) {
  751. massert( 13638, "client cursor dropped during explain query yield", !_pq.isExplain() || _c.get() );
  752. if ( _pq.isExplain() ) {
  753. _n = _inMemSort ? _so->size() : _n;
  754. }
  755. else if ( _inMemSort ) {
  756. if( _so.get() )
  757. _so->fill( _buf, _pq.getFields() , _n );
  758. }
  759. if ( _c.get() ) {
  760. _nscanned = _c->nscanned();
  761. if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 )
  762. _c->setTailable();
  763. // If the tailing request succeeded.
  764. if ( _c->tailable() )
  765. _saveClientCursor = true;
  766. }
  767. if ( _pq.isExplain() ) {
  768. _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(),
  769. _curop.elapsedMillis(), useHints && !_pq.getHint().eoo(), _nYields ,
  770. _nChunkSkips, _keyFieldsOnly.get() > 0 );
  771. }
  772. else {
  773. if ( _buf.len() ) {
  774. _response.appendData( _buf.buf(), _buf.len() );
  775. _buf.decouple();
  776. }
  777. }
  778. if ( stop ) {
  779. setStop();
  780. }
  781. else {
  782. setComplete();
  783. }
  784. }
  785. void finishExplain( const BSONObj &suffix ) {
  786. BSONObj obj = _eb.finishWithSuffix( totalNscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix);
  787. fillQueryResultFromObj(_buf, 0, obj);
  788. _n = 1;
  789. _oldN = 0;
  790. _response.appendData( _buf.buf(), _buf.len() );
  791. _buf.decouple();
  792. }
  793. virtual bool mayRecordPlan() const {
  794. return ( _pq.getNumToReturn() != 1 ) && ( ( _n > _pq.getNumToReturn() / 2 ) || ( complete() && !stopRequested() ) );
  795. }
  796. virtual QueryOp *_createChild() const {
  797. if ( _pq.isExplain() ) {
  798. _eb.ensureStartScan();
  799. }
  800. UserQueryOp *ret = new UserQueryOp( _pq, _response, _eb, _curop );
  801. ret->_oldN = n();
  802. ret->_oldNscanned = totalNscanned();
  803. ret->_oldNscannedObjects = nscannedObjects();
  804. ret->_ntoskip = _ntoskip;
  805. return ret;
  806. }
  807. bool scanAndOrderRequired() const { return _inMemSort; }
  808. shared_ptr<Cursor> cursor() { return _c; }
  809. int n() const { return _oldN + _n; }
  810. long long totalNscanned() const { return _nscanned + _oldNscanned; }
  811. long long nscannedObjects() const { return _nscannedObjects + _oldNscannedObjects; }
  812. bool saveClientCursor() const { return _saveClientCursor; }
  813. bool wouldSaveClientCursor() const { return _wouldSaveClientCursor; }
  814. void finishForOplogReplay( ClientCursor * cc ) {
  815. if ( _oplogReplay && ! _slaveReadTill.isNull() )
  816. cc->slaveReadTill( _slaveReadTill );
  817. }
  818. private:
  819. BufBuilder _buf;
  820. const ParsedQuery& _pq;
  821. scoped_ptr<Projection::KeyOnly> _keyFieldsOnly;
  822. long long _ntoskip;
  823. long long _nscanned;
  824. long long _oldNscanned;
  825. long long _nscannedObjects;
  826. long long _oldNscannedObjects;
  827. int _n; // found so far
  828. int _oldN;
  829. int _nYields;
  830. int _nChunkSkips;
  831. MatchDetails _details;
  832. ShardChunkManagerPtr _chunkManager;
  833. bool _inMemSort;
  834. auto_ptr< ScanAndOrder > _so;
  835. shared_ptr<Cursor> _c;
  836. ClientCursor::CleanupPointer _cc;
  837. ClientCursor::YieldData _yieldData;
  838. bool _capped;
  839. bool _saveClientCursor;
  840. bool _wouldSaveClientCursor;
  841. bool _oplogReplay;
  842. auto_ptr< FindingStartCursor > _findingStartCursor;
  843. Message &_response;
  844. ExplainBuilder &_eb;
  845. CurOp &_curop;
  846. OpTime _slaveReadTill;
  847. };
  848. /* run a query -- includes checking for and running a Command \
  849. @return points to ns if exhaust mode. 0=normal mode
  850. */
  851. const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
  852. StringBuilder& ss = curop.debug().str;
  853. shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
  854. ParsedQuery& pq( *pq_shared );
  855. int ntoskip = q.ntoskip;
  856. BSONObj jsobj = q.query;
  857. int queryOptions = q.queryOptions;
  858. const char *ns = q.ns;
  859. if( logLevel >= 2 )
  860. log() << "query: " << ns << jsobj << endl;
  861. ss << ns;
  862. {
  863. // only say ntoreturn if nonzero.
  864. int n = pq.getNumToReturn();
  865. if( n )
  866. ss << " ntoreturn:" << n;
  867. }
  868. curop.setQuery(jsobj);
  869. if ( pq.couldBeCommand() ) {
  870. BufBuilder bb;
  871. bb.skip(sizeof(QueryResult));
  872. BSONObjBuilder cmdResBuf;
  873. if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
  874. ss << " command: ";
  875. jsobj.toString( ss );
  876. curop.markCommand();
  877. auto_ptr< QueryResult > qr;
  878. qr.reset( (QueryResult *) bb.buf() );
  879. bb.decouple();
  880. qr->setResultFlagsToOk();
  881. qr->len = bb.len();
  882. ss << " reslen:" << bb.len();
  883. qr->setOperation(opReply);
  884. qr->cursorId = 0;
  885. qr->startingFrom = 0;
  886. qr->nReturned = 1;
  887. result.setData( qr.release(), true );
  888. }
  889. else {
  890. uasserted(13530, "bad or malformed command request?");
  891. }
  892. return 0;
  893. }
  894. /* --- regular query --- */
  895. int n = 0;
  896. BSONElement hint = useHints ? pq.getHint() : BSONElement();
  897. bool explain = pq.isExplain();
  898. bool snapshot = pq.isSnapshot();
  899. BSONObj order = pq.getOrder();
  900. BSONObj query = pq.getFilter();
  901. /* The ElemIter will not be happy if this isn't really an object. So throw exception
  902. here when that is true.
  903. (Which may indicate bad data from client.)
  904. */
  905. if ( query.objsize() == 0 ) {
  906. out() << "Bad query object?\n jsobj:";
  907. out() << jsobj.toString() << "\n query:";
  908. out() << query.toString() << endl;
  909. uassert( 10110 , "bad query object", false);
  910. }
  911. /* --- read lock --- */
  912. mongolock lk(false);
  913. Client::Context ctx( ns , dbpath , &lk );
  914. replVerifyReadsOk(pq);
  915. if ( pq.hasOption( QueryOption_CursorTailable ) ) {
  916. NamespaceDetails *d = nsdetails( ns );
  917. uassert( 13051, "tailable cursor requested on non capped collection", d && d->capped );
  918. const BSONObj nat1 = BSON( "$natural" << 1 );
  919. if ( order.isEmpty() ) {
  920. order = nat1;
  921. }
  922. else {
  923. uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
  924. }
  925. }
  926. BSONObj snapshotHint; // put here to keep the data in scope
  927. if( snapshot ) {
  928. NamespaceDetails *d = nsdetails(ns);
  929. if ( d ) {
  930. int i = d->findIdIndex();
  931. if( i < 0 ) {
  932. if ( strstr( ns , ".system." ) == 0 )
  933. log() << "warning: no _id index on $snapshot query, ns:" << ns << endl;
  934. }
  935. else {
  936. /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here.
  937. probably need a better way to specify "use the _id index" as a hint. if someone is
  938. in the query optimizer please fix this then!
  939. */
  940. BSONObjBuilder b;
  941. b.append("$hint", d->idx(i).indexName());
  942. snapshotHint = b.obj();
  943. hint = snapshotHint.firstElement();
  944. }
  945. }
  946. }
  947. if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {
  948. bool nsFound = false;
  949. bool indexFound = false;
  950. BSONObj resObject;
  951. Client& c = cc();
  952. bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
  953. if ( nsFound == false || indexFound == true ) {
  954. BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
  955. bb.skip(sizeof(QueryResult));
  956. ss << " idhack ";
  957. if ( found ) {
  958. n = 1;
  959. fillQueryResultFromObj( bb , pq.getFields() , resObject );
  960. }
  961. auto_ptr< QueryResult > qr;
  962. qr.reset( (QueryResult *) bb.buf() );
  963. bb.decouple();
  964. qr->setResultFlagsToOk();
  965. qr->len = bb.len();
  966. ss << " reslen:" << bb.len();
  967. qr->setOperation(opReply);
  968. qr->cursorId = 0;
  969. qr->startingFrom = 0;
  970. qr->nReturned = n;
  971. result.setData( qr.release(), true );
  972. return false;
  973. }
  974. }
  975. // regular, not QO bypass query
  976. BSONObj oldPlan;
  977. if ( explain && ! pq.hasIndexSpecifier() ) {
  978. MultiPlanScanner mps( ns, query, order );
  979. if ( mps.usingPrerecordedPlan() )
  980. oldPlan = mps.oldExplain();
  981. }
  982. auto_ptr< MultiPlanScanner > mps( new MultiPlanScanner( ns, query, order, &hint, !explain, pq.getMin(), pq.getMax(), false, true ) );
  983. BSONObj explainSuffix;
  984. if ( explain ) {
  985. BSONObjBuilder bb;
  986. if ( !oldPlan.isEmpty() )
  987. bb.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() );
  988. explainSuffix = bb.obj();
  989. }
  990. ExplainBuilder eb;
  991. UserQueryOp original( pq, result, eb, curop );
  992. shared_ptr< UserQueryOp > o = mps->runOp( original );
  993. UserQueryOp &dqo = *o;
  994. if ( ! dqo.complete() )
  995. throw MsgAssertionException( dqo.exception() );
  996. if ( explain ) {
  997. dqo.finishExplain( explainSuffix );
  998. }
  999. n = dqo.n();
  1000. long long nscanned = dqo.totalNscanned();
  1001. if ( dqo.scanAndOrderRequired() )
  1002. ss << " scanAndOrder ";
  1003. shared_ptr<Cursor> cursor = dqo.cursor();
  1004. if( logLevel >= 5 )
  1005. log() << " used cursor: " << cursor.get() << endl;
  1006. long long cursorid = 0;
  1007. const char * exhaust = 0;
  1008. if ( dqo.saveClientCursor() || ( dqo.wouldSaveClientCursor() && mps->mayRunMore() ) ) {
  1009. ClientCursor *cc;
  1010. bool moreClauses = mps->mayRunMore();
  1011. if ( moreClauses ) {
  1012. // this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield
  1013. shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher(), dqo ) );
  1014. cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned());
  1015. }
  1016. else {
  1017. if( ! cursor->matcher() ) cursor->setMatcher( dqo.matcher() );
  1018. cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() );
  1019. }
  1020. cursorid = cc->cursorid();
  1021. DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
  1022. cc->setPos( n );
  1023. cc->pq = pq_shared;
  1024. cc->fields = pq.getFieldPtr();
  1025. cc->originalMessage = m;
  1026. cc->updateLocation();
  1027. if ( !cc->ok() && cc->c()->tailable() )
  1028. DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
  1029. if( queryOptions & QueryOption_Exhaust ) {
  1030. exhaust = ns;
  1031. ss << " exhaust ";
  1032. }
  1033. dqo.finishForOplogReplay(cc);
  1034. }
  1035. QueryResult *qr = (QueryResult *) result.header();
  1036. qr->cursorId = cursorid;
  1037. qr->setResultFlagsToOk();
  1038. // qr->len is updated automatically by appendData()
  1039. ss << " reslen:" << qr->len;
  1040. qr->setOperation(opReply);
  1041. qr->startingFrom = 0;
  1042. qr->nReturned = n;
  1043. int duration = curop.elapsedMillis();
  1044. bool dbprofile = curop.shouldDBProfile( duration );
  1045. if ( dbprofile || duration >= cmdLine.slowMS ) {
  1046. ss << " nscanned:" << nscanned << ' ';
  1047. if ( ntoskip )
  1048. ss << " ntoskip:" << ntoskip;
  1049. if ( dbprofile )
  1050. ss << " \nquery: ";
  1051. ss << jsobj.toString() << ' ';
  1052. }
  1053. ss << " nreturned:" << n;
  1054. return exhaust;
  1055. }
  1056. } // namespace mongo