PageRenderTime 68ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/db/query.cpp

https://github.com/paul/mongo
C++ | 1191 lines | 945 code | 151 blank | 95 comment | 241 complexity | 62368f0bdea0f9eaa8df2cd58c75212c MD5 | raw file
Possible License(s): Apache-2.0
  1. // query.cpp
  2. /**
  3. * Copyright (C) 2008 10gen Inc.
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License, version 3,
  7. * as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU Affero General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Affero General Public License
  15. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. */
  17. #include "pch.h"
  18. #include "query.h"
  19. #include "pdfile.h"
  20. #include "jsobjmanipulator.h"
  21. #include "../bson/util/builder.h"
  22. #include <time.h>
  23. #include "introspect.h"
  24. #include "btree.h"
  25. #include "../util/lruishmap.h"
  26. #include "json.h"
  27. #include "repl.h"
  28. #include "replpair.h"
  29. #include "scanandorder.h"
  30. #include "security.h"
  31. #include "curop-inl.h"
  32. #include "commands.h"
  33. #include "queryoptimizer.h"
  34. #include "lasterror.h"
  35. #include "../s/d_logic.h"
  36. #include "repl_block.h"
  37. namespace mongo {
  /* Soft cap, in bytes, on the data placed in a single reply. No further objects
     are added once the buffer has crossed this value, so a reply can come out a
     little larger than this: it is a threshold rather than a limit.
  */
  const int MaxBytesToReturnToClientAtOnce = 4 * 1024 * 1024;

  //ns->query->DiskLoc
  // LRUishMap<BSONObj,DiskLoc,5> lrutest(123);

  // Declared here only; the definitions are not in this file.
  extern bool useCursors, useHints;
  46. // Just try to identify best plan.
  47. class DeleteOp : public MultiCursor::CursorOp {
  48. public:
  49. DeleteOp( bool justOne, int& bestCount ) :
  50. justOne_( justOne ),
  51. count_(),
  52. bestCount_( bestCount ),
  53. _nscanned() {
  54. }
  55. virtual void _init() {
  56. c_ = qp().newCursor();
  57. }
  58. virtual bool prepareToYield() {
  59. if ( ! _cc ) {
  60. _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c_ , qp().ns() ) );
  61. }
  62. return _cc->prepareToYield( _yieldData );
  63. }
  64. virtual void recoverFromYield() {
  65. if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
  66. _cc.reset();
  67. c_.reset();
  68. massert( 13340, "cursor dropped during delete", false );
  69. }
  70. }
  71. virtual long long nscanned() {
  72. assert( c_.get() );
  73. return c_->nscanned();
  74. }
  75. virtual void next() {
  76. if ( !c_->ok() ) {
  77. setComplete();
  78. return;
  79. }
  80. DiskLoc rloc = c_->currLoc();
  81. if ( matcher()->matches(c_->currKey(), rloc ) ) {
  82. if ( !c_->getsetdup(rloc) )
  83. ++count_;
  84. }
  85. c_->advance();
  86. _nscanned = c_->nscanned();
  87. if ( count_ > bestCount_ )
  88. bestCount_ = count_;
  89. if ( count_ > 0 ) {
  90. if ( justOne_ )
  91. setComplete();
  92. else if ( _nscanned >= 100 && count_ == bestCount_ )
  93. setComplete();
  94. }
  95. }
  96. virtual bool mayRecordPlan() const { return !justOne_; }
  97. virtual QueryOp *_createChild() const {
  98. bestCount_ = 0; // should be safe to reset this in contexts where createChild() is called
  99. return new DeleteOp( justOne_, bestCount_ );
  100. }
  101. virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); }
  102. private:
  103. bool justOne_;
  104. int count_;
  105. int &bestCount_;
  106. long long _nscanned;
  107. shared_ptr<Cursor> c_;
  108. ClientCursor::CleanupPointer _cc;
  109. ClientCursor::YieldData _yieldData;
  110. };
  111. /* ns: namespace, e.g. <database>.<collection>
  112. pattern: the "where" clause / criteria
  113. justOne: stop after 1 match
  114. god: allow access to system namespaces, and don't yield
  115. */
  116. long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
  117. if( !god ) {
  118. if ( strstr(ns, ".system.") ) {
  119. /* note a delete from system.indexes would corrupt the db
  120. if done here, as there are pointers into those objects in
  121. NamespaceDetails.
  122. */
  123. uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
  124. }
  125. if ( strchr( ns , '$' ) ) {
  126. log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
  127. uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
  128. }
  129. }
  130. NamespaceDetails *d = nsdetails( ns );
  131. if ( ! d )
  132. return 0;
  133. uassert( 10101 , "can't remove from a capped collection" , ! d->capped );
  134. long long nDeleted = 0;
  135. int best = 0;
  136. shared_ptr< MultiCursor::CursorOp > opPtr( new DeleteOp( justOneOrig, best ) );
  137. shared_ptr< MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, !god ) );
  138. if( !creal->ok() )
  139. return nDeleted;
  140. shared_ptr< Cursor > cPtr = creal;
  141. auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
  142. cc->setDoingDeletes( true );
  143. CursorId id = cc->cursorid();
  144. bool justOne = justOneOrig;
  145. bool canYield = !god && !creal->matcher()->docMatcher().atomic();
  146. do {
  147. if ( canYield && ! cc->yieldSometimes() ) {
  148. cc.release(); // has already been deleted elsewhere
  149. // TODO should we assert or something?
  150. break;
  151. }
  152. if ( !cc->ok() ) {
  153. break; // if we yielded, could have hit the end
  154. }
  155. // this way we can avoid calling updateLocation() every time (expensive)
  156. // as well as some other nuances handled
  157. cc->setDoingDeletes( true );
  158. DiskLoc rloc = cc->currLoc();
  159. BSONObj key = cc->currKey();
  160. // NOTE Calling advance() may change the matcher, so it's important
  161. // to try to match first.
  162. bool match = creal->matcher()->matches( key , rloc );
  163. if ( ! cc->advance() )
  164. justOne = true;
  165. if ( ! match )
  166. continue;
  167. assert( !cc->c()->getsetdup(rloc) ); // can't be a dup, we deleted it!
  168. if ( !justOne ) {
  169. /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore
  170. blocks. here we might call millions of times which would be bad.
  171. */
  172. cc->c()->noteLocation();
  173. }
  174. if ( logop ) {
  175. BSONElement e;
  176. if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
  177. BSONObjBuilder b;
  178. b.append( e );
  179. bool replJustOne = true;
  180. logOp( "d", ns, b.done(), 0, &replJustOne );
  181. }
  182. else {
  183. problem() << "deleted object without id, not logging" << endl;
  184. }
  185. }
  186. if ( rs )
  187. rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );
  188. theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
  189. nDeleted++;
  190. if ( justOne ) {
  191. break;
  192. }
  193. cc->c()->checkLocation();
  194. if( !god )
  195. getDur().commitIfNeeded();
  196. if( debug && god && nDeleted == 100 )
  197. log() << "warning high number of deletes with god=true which could use significant memory" << endl;
  198. }
  199. while ( cc->ok() );
  200. if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
  201. cc.release();
  202. }
  203. return nDeleted;
  204. }
  // Forward declaration; the definition is not in this part of the file.
  int initialExtentSize(int len);

  // Trace level, 0 at startup. NOTE(review): its readers are outside this chunk.
  int otherTraceLevel = 0;
  207. bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
  208. try {
  209. return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions);
  210. }
  211. catch ( AssertionException& e ) {
  212. e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" );
  213. }
  214. curop.debug().str << " assertion ";
  215. anObjBuilder.append("errmsg", "db assertion failure");
  216. anObjBuilder.append("ok", 0.0);
  217. BSONObj x = anObjBuilder.done();
  218. b.appendBuf((void*) x.objdata(), x.objsize());
  219. return true;
  220. }
  221. int nCaught = 0;
  222. BSONObj id_obj = fromjson("{\"_id\":1}");
  223. BSONObj empty_obj = fromjson("{}");
  224. //int dump = 0;
  225. /* empty result for error conditions */
  226. QueryResult* emptyMoreResult(long long cursorid) {
  227. BufBuilder b(32768);
  228. b.skip(sizeof(QueryResult));
  229. QueryResult *qr = (QueryResult *) b.buf();
  230. qr->cursorId = 0; // 0 indicates no more data to retrieve.
  231. qr->startingFrom = 0;
  232. qr->len = b.len();
  233. qr->setOperation(opReply);
  234. qr->nReturned = 0;
  235. b.decouple();
  236. return qr;
  237. }
  238. QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
  239. exhaust = false;
  240. ClientCursor::Pointer p(cursorid);
  241. ClientCursor *cc = p.c();
  242. int bufSize = 512;
  243. if ( cc ) {
  244. bufSize += sizeof( QueryResult );
  245. bufSize += MaxBytesToReturnToClientAtOnce;
  246. }
  247. BufBuilder b( bufSize );
  248. b.skip(sizeof(QueryResult));
  249. int resultFlags = ResultFlag_AwaitCapable;
  250. int start = 0;
  251. int n = 0;
  252. if ( !cc ) {
  253. log() << "getMore: cursorid not found " << ns << " " << cursorid << endl;
  254. cursorid = 0;
  255. resultFlags = ResultFlag_CursorNotFound;
  256. }
  257. else {
  258. if ( pass == 0 )
  259. cc->updateSlaveLocation( curop );
  260. int queryOptions = cc->queryOptions();
  261. if( pass == 0 ) {
  262. StringBuilder& ss = curop.debug().str;
  263. ss << " getMore: " << cc->query().toString() << " ";
  264. }
  265. start = cc->pos();
  266. Cursor *c = cc->c();
  267. c->checkLocation();
  268. DiskLoc last;
  269. scoped_ptr<Projection::KeyOnly> keyFieldsOnly;
  270. if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
  271. keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );
  272. while ( 1 ) {
  273. if ( !c->ok() ) {
  274. if ( c->tailable() ) {
  275. /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however
  276. advance() can still be retries as a reactivation attempt. when there is new data, it will
  277. return true. that's what we are doing here.
  278. */
  279. if ( c->advance() )
  280. continue;
  281. if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
  282. throw GetMoreWaitException();
  283. }
  284. break;
  285. }
  286. p.release();
  287. bool ok = ClientCursor::erase(cursorid);
  288. assert(ok);
  289. cursorid = 0;
  290. cc = 0;
  291. break;
  292. }
  293. // in some cases (clone collection) there won't be a matcher
  294. if ( c->matcher() && !c->matcher()->matches(c->currKey(), c->currLoc() ) ) {
  295. }
  296. /*
  297. TODO
  298. else if ( _chunkMatcher && ! _chunkMatcher->belongsToMe( c->currKey(), c->currLoc() ) ){
  299. cout << "TEMP skipping un-owned chunk: " << c->current() << endl;
  300. }
  301. */
  302. else {
  303. if( c->getsetdup(c->currLoc()) ) {
  304. //out() << " but it's a dup \n";
  305. }
  306. else {
  307. last = c->currLoc();
  308. n++;
  309. if ( keyFieldsOnly ) {
  310. fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) );
  311. }
  312. else {
  313. BSONObj js = c->current();
  314. // show disk loc should be part of the main query, not in an $or clause, so this should be ok
  315. fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
  316. }
  317. if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
  318. c->advance();
  319. cc->incPos( n );
  320. break;
  321. }
  322. }
  323. }
  324. c->advance();
  325. }
  326. if ( cc ) {
  327. cc->updateLocation();
  328. cc->mayUpgradeStorage();
  329. cc->storeOpForSlave( last );
  330. exhaust = cc->queryOptions() & QueryOption_Exhaust;
  331. }
  332. }
  333. QueryResult *qr = (QueryResult *) b.buf();
  334. qr->len = b.len();
  335. qr->setOperation(opReply);
  336. qr->_resultFlags() = resultFlags;
  337. qr->cursorId = cursorid;
  338. qr->startingFrom = start;
  339. qr->nReturned = n;
  340. b.decouple();
  341. return qr;
  342. }
  343. class CountOp : public QueryOp {
  344. public:
  345. CountOp( const string& ns , const BSONObj &spec ) :
  346. _ns(ns), _capped(false), _count(), _myCount(),
  347. _skip( spec["skip"].numberLong() ),
  348. _limit( spec["limit"].numberLong() ),
  349. _bc() {
  350. }
  351. virtual void _init() {
  352. _c = qp().newCursor();
  353. _capped = _c->capped();
  354. if ( qp().exactKeyMatch() && ! matcher()->needRecord() ) {
  355. _query = qp().simplifiedQuery( qp().indexKey() );
  356. _bc = dynamic_cast< BtreeCursor* >( _c.get() );
  357. _bc->forgetEndKey();
  358. }
  359. }
  360. virtual long long nscanned() {
  361. assert( _c.get() );
  362. return _c->nscanned();
  363. }
  364. virtual bool prepareToYield() {
  365. if ( ! _cc ) {
  366. _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _ns.c_str() ) );
  367. }
  368. return _cc->prepareToYield( _yieldData );
  369. }
  370. virtual void recoverFromYield() {
  371. if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
  372. _c.reset();
  373. _cc.reset();
  374. if ( _capped ) {
  375. msgassertedNoTrace( 13337, str::stream() << "capped cursor overrun during count: " << _ns );
  376. }
  377. else {
  378. // we don't fail query since we're fine with returning partial data if collection dropped
  379. }
  380. }
  381. }
  382. virtual void next() {
  383. if ( ! _c || !_c->ok() ) {
  384. setComplete();
  385. return;
  386. }
  387. if ( _bc ) {
  388. if ( _firstMatch.isEmpty() ) {
  389. _firstMatch = _bc->currKeyNode().key.copy();
  390. // if not match
  391. if ( _query.woCompare( _firstMatch, BSONObj(), false ) ) {
  392. setComplete();
  393. return;
  394. }
  395. _gotOne();
  396. }
  397. else {
  398. if ( ! _firstMatch.woEqual( _bc->currKeyNode().key ) ) {
  399. setComplete();
  400. return;
  401. }
  402. _gotOne();
  403. }
  404. }
  405. else {
  406. if ( !matcher()->matches(_c->currKey(), _c->currLoc() ) ) {
  407. }
  408. else if( !_c->getsetdup(_c->currLoc()) ) {
  409. _gotOne();
  410. }
  411. }
  412. _c->advance();
  413. }
  414. virtual QueryOp *_createChild() const {
  415. CountOp *ret = new CountOp( _ns , BSONObj() );
  416. ret->_count = _count;
  417. ret->_skip = _skip;
  418. ret->_limit = _limit;
  419. return ret;
  420. }
  421. long long count() const { return _count; }
  422. virtual bool mayRecordPlan() const {
  423. return ( _myCount > _limit / 2 ) || ( complete() && !stopRequested() );
  424. }
  425. private:
  426. void _gotOne() {
  427. if ( _skip ) {
  428. _skip--;
  429. return;
  430. }
  431. if ( _limit > 0 && _count >= _limit ) {
  432. setStop();
  433. return;
  434. }
  435. _count++;
  436. _myCount++;
  437. }
  438. string _ns;
  439. bool _capped;
  440. long long _count;
  441. long long _myCount;
  442. long long _skip;
  443. long long _limit;
  444. shared_ptr<Cursor> _c;
  445. BSONObj _query;
  446. BtreeCursor * _bc;
  447. BSONObj _firstMatch;
  448. ClientCursor::CleanupPointer _cc;
  449. ClientCursor::YieldData _yieldData;
  450. };
  451. /* { count: "collectionname"[, query: <query>] }
  452. returns -1 on ns does not exist error.
  453. */
  454. long long runCount( const char *ns, const BSONObj &cmd, string &err ) {
  455. Client::Context cx(ns);
  456. NamespaceDetails *d = nsdetails( ns );
  457. if ( !d ) {
  458. err = "ns missing";
  459. return -1;
  460. }
  461. BSONObj query = cmd.getObjectField("query");
  462. // count of all objects
  463. if ( query.isEmpty() ) {
  464. return applySkipLimit( d->stats.nrecords , cmd );
  465. }
  466. MultiPlanScanner mps( ns, query, BSONObj(), 0, true, BSONObj(), BSONObj(), false, true );
  467. CountOp original( ns , cmd );
  468. shared_ptr< CountOp > res = mps.runOp( original );
  469. if ( !res->complete() ) {
  470. log() << "Count with ns: " << ns << " and query: " << query
  471. << " failed with exception: " << res->exception()
  472. << endl;
  473. return 0;
  474. }
  475. return res->count();
  476. }
  477. class ExplainBuilder {
  478. // Note: by default we filter out allPlans and oldPlan in the shell's
  479. // explain() function. If you add any recursive structures, make sure to
  480. // edit the JS to make sure everything gets filtered.
  481. public:
  482. ExplainBuilder() : _i() {}
  483. void ensureStartScan() {
  484. if ( !_a.get() ) {
  485. _a.reset( new BSONArrayBuilder() );
  486. }
  487. }
  488. void noteCursor( Cursor *c ) {
  489. BSONObjBuilder b( _a->subobjStart() );
  490. b << "cursor" << c->toString() << "indexBounds" << c->prettyIndexBounds();
  491. b.done();
  492. }
  493. void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder,
  494. int millis, bool hint, int nYields , int nChunkSkips , bool indexOnly ) {
  495. if ( _i == 1 ) {
  496. _c.reset( new BSONArrayBuilder() );
  497. *_c << _b->obj();
  498. }
  499. if ( _i == 0 ) {
  500. _b.reset( new BSONObjBuilder() );
  501. }
  502. else {
  503. _b.reset( new BSONObjBuilder( _c->subobjStart() ) );
  504. }
  505. *_b << "cursor" << c->toString();
  506. _b->appendNumber( "nscanned", nscanned );
  507. _b->appendNumber( "nscannedObjects", nscannedObjects );
  508. *_b << "n" << n;
  509. if ( scanAndOrder )
  510. *_b << "scanAndOrder" << true;
  511. *_b << "millis" << millis;
  512. *_b << "nYields" << nYields;
  513. *_b << "nChunkSkips" << nChunkSkips;
  514. *_b << "isMultiKey" << c->isMultiKey();
  515. *_b << "indexOnly" << indexOnly;
  516. *_b << "indexBounds" << c->prettyIndexBounds();
  517. if ( !hint ) {
  518. *_b << "allPlans" << _a->arr();
  519. }
  520. if ( _i != 0 ) {
  521. _b->done();
  522. }
  523. _a.reset( 0 );
  524. ++_i;
  525. }
  526. BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) {
  527. if ( _i > 1 ) {
  528. BSONObjBuilder b;
  529. b << "clauses" << _c->arr();
  530. b.appendNumber( "nscanned", nscanned );
  531. b.appendNumber( "nscannedObjects", nscannedObjects );
  532. b << "n" << n;
  533. b << "millis" << millis;
  534. b.appendElements( suffix );
  535. return b.obj();
  536. }
  537. else {
  538. _b->appendElements( suffix );
  539. return _b->obj();
  540. }
  541. }
  542. private:
  543. auto_ptr< BSONArrayBuilder > _a;
  544. auto_ptr< BSONObjBuilder > _b;
  545. auto_ptr< BSONArrayBuilder > _c;
  546. int _i;
  547. };
  548. // Implements database 'query' requests using the query optimizer's QueryOp interface
  549. class UserQueryOp : public QueryOp {
  550. public:
  551. UserQueryOp( const ParsedQuery& pq, Message &response, ExplainBuilder &eb, CurOp &curop ) :
  552. _buf( 32768 ) , // TODO be smarter here
  553. _pq( pq ) ,
  554. _ntoskip( pq.getSkip() ) ,
  555. _nscanned(0), _oldNscanned(0), _nscannedObjects(0), _oldNscannedObjects(0),
  556. _n(0),
  557. _oldN(0),
  558. _nYields(),
  559. _nChunkSkips(),
  560. _chunkManager( shardingState.needShardChunkManager(pq.ns()) ?
  561. shardingState.getShardChunkManager(pq.ns()) : ShardChunkManagerPtr() ),
  562. _inMemSort(false),
  563. _capped(false),
  564. _saveClientCursor(false),
  565. _wouldSaveClientCursor(false),
  566. _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ),
  567. _response( response ),
  568. _eb( eb ),
  569. _curop( curop )
  570. {}
  571. virtual void _init() {
  572. // only need to put the QueryResult fields there if we're building the first buffer in the message.
  573. if ( _response.empty() ) {
  574. _buf.skip( sizeof( QueryResult ) );
  575. }
  576. if ( _oplogReplay ) {
  577. _findingStartCursor.reset( new FindingStartCursor( qp() ) );
  578. _capped = true;
  579. }
  580. else {
  581. _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() );
  582. _capped = _c->capped();
  583. // setup check for if we can only use index to extract
  584. if ( _c->modifiedKeys() == false && _c->isMultiKey() == false && _pq.getFields() ) {
  585. _keyFieldsOnly.reset( _pq.getFields()->checkKey( _c->indexKeyPattern() ) );
  586. }
  587. }
  588. if ( qp().scanAndOrderRequired() ) {
  589. _inMemSort = true;
  590. _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder() ) );
  591. }
  592. if ( _pq.isExplain() ) {
  593. _eb.noteCursor( _c.get() );
  594. }
  595. }
  596. virtual bool prepareToYield() {
  597. if ( _findingStartCursor.get() ) {
  598. return _findingStartCursor->prepareToYield();
  599. }
  600. else {
  601. if ( ! _cc ) {
  602. _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _pq.ns() ) );
  603. }
  604. return _cc->prepareToYield( _yieldData );
  605. }
  606. }
  607. virtual void recoverFromYield() {
  608. _nYields++;
  609. if ( _findingStartCursor.get() ) {
  610. _findingStartCursor->recoverFromYield();
  611. }
  612. else if ( ! ClientCursor::recoverFromYield( _yieldData ) ) {
  613. _c.reset();
  614. _cc.reset();
  615. _so.reset();
  616. if ( _capped ) {
  617. msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() );
  618. }
  619. else {
  620. // we don't fail query since we're fine with returning partial data if collection dropped
  621. // todo: this is wrong. the cursor could be gone if closeAllDatabases command just ran
  622. }
  623. }
  624. }
  625. virtual long long nscanned() {
  626. if ( _findingStartCursor.get() ) {
  627. return 0; // should only be one query plan, so value doesn't really matter.
  628. }
  629. assert( _c.get() );
  630. return _c->nscanned();
  631. }
  632. virtual void next() {
  633. if ( _findingStartCursor.get() ) {
  634. if ( _findingStartCursor->done() ) {
  635. _c = _findingStartCursor->cRelease();
  636. _findingStartCursor.reset( 0 );
  637. }
  638. else {
  639. _findingStartCursor->next();
  640. }
  641. _capped = true;
  642. return;
  643. }
  644. if ( !_c || !_c->ok() ) {
  645. finish( false );
  646. return;
  647. }
  648. bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors;
  649. if( 0 ) {
  650. cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl;
  651. }
  652. if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ) {
  653. finish( true ); //?
  654. return;
  655. }
  656. _nscanned = _c->nscanned();
  657. if ( !matcher()->matches(_c->currKey(), _c->currLoc() , &_details ) ) {
  658. // not a match, continue onward
  659. if ( _details.loadedObject )
  660. _nscannedObjects++;
  661. }
  662. else {
  663. _nscannedObjects++;
  664. DiskLoc cl = _c->currLoc();
  665. if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) {
  666. _nChunkSkips++;
  667. // log() << "TEMP skipping un-owned chunk: " << _c->current() << endl;
  668. }
  669. else if( _c->getsetdup(cl) ) {
  670. // dup
  671. }
  672. else {
  673. // got a match.
  674. if ( _inMemSort ) {
  675. // note: no cursors for non-indexed, ordered results. results must be fairly small.
  676. _so->add( _pq.returnKey() ? _c->currKey() : _c->current(), _pq.showDiskLoc() ? &cl : 0 );
  677. }
  678. else if ( _ntoskip > 0 ) {
  679. _ntoskip--;
  680. }
  681. else {
  682. if ( _pq.isExplain() ) {
  683. _n++;
  684. if ( n() >= _pq.getNumToReturn() && !_pq.wantMore() ) {
  685. // .limit() was used, show just that much.
  686. finish( true ); //?
  687. return;
  688. }
  689. }
  690. else {
  691. if ( _pq.returnKey() ) {
  692. BSONObjBuilder bb( _buf );
  693. bb.appendKeys( _c->indexKeyPattern() , _c->currKey() );
  694. bb.done();
  695. }
  696. else if ( _keyFieldsOnly ) {
  697. fillQueryResultFromObj( _buf , 0 , _keyFieldsOnly->hydrate( _c->currKey() ) );
  698. }
  699. else {
  700. BSONObj js = _c->current();
  701. assert( js.isValid() );
  702. if ( _oplogReplay ) {
  703. BSONElement e = js["ts"];
  704. if ( e.type() == Date || e.type() == Timestamp )
  705. _slaveReadTill = e._opTime();
  706. }
  707. fillQueryResultFromObj( _buf , _pq.getFields() , js , (_pq.showDiskLoc() ? &cl : 0));
  708. }
  709. _n++;
  710. if ( ! _c->supportGetMore() ) {
  711. if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ) {
  712. finish( true );
  713. return;
  714. }
  715. }
  716. else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ) {
  717. /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */
  718. if ( mayCreateCursor1 ) {
  719. _wouldSaveClientCursor = true;
  720. if ( _c->advance() ) {
  721. // more...so save a cursor
  722. _saveClientCursor = true;
  723. }
  724. }
  725. finish( true );
  726. return;
  727. }
  728. }
  729. }
  730. }
  731. }
  732. _c->advance();
  733. }
  734. // this plan won, so set data for response broadly
  735. void finish( bool stop ) {
  736. if ( _pq.isExplain() ) {
  737. _n = _inMemSort ? _so->size() : _n;
  738. }
  739. else if ( _inMemSort ) {
  740. if( _so.get() )
  741. _so->fill( _buf, _pq.getFields() , _n );
  742. }
  743. if ( _c.get() ) {
  744. _nscanned = _c->nscanned();
  745. if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 )
  746. _c->setTailable();
  747. // If the tailing request succeeded.
  748. if ( _c->tailable() )
  749. _saveClientCursor = true;
  750. }
  751. if ( _pq.isExplain() ) {
  752. _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(),
  753. _curop.elapsedMillis(), useHints && !_pq.getHint().eoo(), _nYields ,
  754. _nChunkSkips, _keyFieldsOnly.get() > 0 );
  755. }
  756. else {
  757. if ( _buf.len() ) {
  758. _response.appendData( _buf.buf(), _buf.len() );
  759. _buf.decouple();
  760. }
  761. }
  762. if ( stop ) {
  763. setStop();
  764. }
  765. else {
  766. setComplete();
  767. }
  768. }
  769. void finishExplain( const BSONObj &suffix ) {
  770. BSONObj obj = _eb.finishWithSuffix( totalNscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix);
  771. fillQueryResultFromObj(_buf, 0, obj);
  772. _n = 1;
  773. _oldN = 0;
  774. _response.appendData( _buf.buf(), _buf.len() );
  775. _buf.decouple();
  776. }
  777. virtual bool mayRecordPlan() const {
  778. return ( _pq.getNumToReturn() != 1 ) && ( ( _n > _pq.getNumToReturn() / 2 ) || ( complete() && !stopRequested() ) );
  779. }
  780. virtual QueryOp *_createChild() const {
  781. if ( _pq.isExplain() ) {
  782. _eb.ensureStartScan();
  783. }
  784. UserQueryOp *ret = new UserQueryOp( _pq, _response, _eb, _curop );
  785. ret->_oldN = n();
  786. ret->_oldNscanned = totalNscanned();
  787. ret->_oldNscannedObjects = nscannedObjects();
  788. ret->_ntoskip = _ntoskip;
  789. return ret;
  790. }
  791. bool scanAndOrderRequired() const { return _inMemSort; }
  792. shared_ptr<Cursor> cursor() { return _c; }
  793. int n() const { return _oldN + _n; }
  794. long long totalNscanned() const { return _nscanned + _oldNscanned; }
  795. long long nscannedObjects() const { return _nscannedObjects + _oldNscannedObjects; }
  796. bool saveClientCursor() const { return _saveClientCursor; }
  797. bool wouldSaveClientCursor() const { return _wouldSaveClientCursor; }
  798. void finishForOplogReplay( ClientCursor * cc ) {
  799. if ( _oplogReplay && ! _slaveReadTill.isNull() )
  800. cc->slaveReadTill( _slaveReadTill );
  801. }
  802. private:
  803. BufBuilder _buf;
  804. const ParsedQuery& _pq;
  805. scoped_ptr<Projection::KeyOnly> _keyFieldsOnly;
  806. long long _ntoskip;
  807. long long _nscanned;
  808. long long _oldNscanned;
  809. long long _nscannedObjects;
  810. long long _oldNscannedObjects;
  811. int _n; // found so far
  812. int _oldN;
  813. int _nYields;
  814. int _nChunkSkips;
  815. MatchDetails _details;
  816. ShardChunkManagerPtr _chunkManager;
  817. bool _inMemSort;
  818. auto_ptr< ScanAndOrder > _so;
  819. shared_ptr<Cursor> _c;
  820. ClientCursor::CleanupPointer _cc;
  821. ClientCursor::YieldData _yieldData;
  822. bool _capped;
  823. bool _saveClientCursor;
  824. bool _wouldSaveClientCursor;
  825. bool _oplogReplay;
  826. auto_ptr< FindingStartCursor > _findingStartCursor;
  827. Message &_response;
  828. ExplainBuilder &_eb;
  829. CurOp &_curop;
  830. OpTime _slaveReadTill;
  831. };
  832. /* run a query -- includes checking for and running a Command \
  833. @return points to ns if exhaust mode. 0=normal mode
  834. */
  835. const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
  836. StringBuilder& ss = curop.debug().str;
  837. shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
  838. ParsedQuery& pq( *pq_shared );
  839. int ntoskip = q.ntoskip;
  840. BSONObj jsobj = q.query;
  841. int queryOptions = q.queryOptions;
  842. const char *ns = q.ns;
  843. if( logLevel >= 2 )
  844. log() << "query: " << ns << jsobj << endl;
  845. ss << ns;
  846. {
  847. // only say ntoreturn if nonzero.
  848. int n = pq.getNumToReturn();
  849. if( n )
  850. ss << " ntoreturn:" << n;
  851. }
  852. curop.setQuery(jsobj);
  853. if ( pq.couldBeCommand() ) {
  854. BufBuilder bb;
  855. bb.skip(sizeof(QueryResult));
  856. BSONObjBuilder cmdResBuf;
  857. if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
  858. ss << " command: ";
  859. jsobj.toString( ss );
  860. curop.markCommand();
  861. auto_ptr< QueryResult > qr;
  862. qr.reset( (QueryResult *) bb.buf() );
  863. bb.decouple();
  864. qr->setResultFlagsToOk();
  865. qr->len = bb.len();
  866. ss << " reslen:" << bb.len();
  867. qr->setOperation(opReply);
  868. qr->cursorId = 0;
  869. qr->startingFrom = 0;
  870. qr->nReturned = 1;
  871. result.setData( qr.release(), true );
  872. }
  873. else {
  874. uasserted(13530, "bad or malformed command request?");
  875. }
  876. return 0;
  877. }
  878. /* --- regular query --- */
  879. int n = 0;
  880. BSONElement hint = useHints ? pq.getHint() : BSONElement();
  881. bool explain = pq.isExplain();
  882. bool snapshot = pq.isSnapshot();
  883. BSONObj order = pq.getOrder();
  884. BSONObj query = pq.getFilter();
  885. /* The ElemIter will not be happy if this isn't really an object. So throw exception
  886. here when that is true.
  887. (Which may indicate bad data from client.)
  888. */
  889. if ( query.objsize() == 0 ) {
  890. out() << "Bad query object?\n jsobj:";
  891. out() << jsobj.toString() << "\n query:";
  892. out() << query.toString() << endl;
  893. uassert( 10110 , "bad query object", false);
  894. }
  895. /* --- read lock --- */
  896. mongolock lk(false);
  897. Client::Context ctx( ns , dbpath , &lk );
  898. replVerifyReadsOk(pq);
  899. if ( pq.hasOption( QueryOption_CursorTailable ) ) {
  900. NamespaceDetails *d = nsdetails( ns );
  901. uassert( 13051, "tailable cursor requested on non capped collection", d && d->capped );
  902. const BSONObj nat1 = BSON( "$natural" << 1 );
  903. if ( order.isEmpty() ) {
  904. order = nat1;
  905. }
  906. else {
  907. uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
  908. }
  909. }
  910. BSONObj snapshotHint; // put here to keep the data in scope
  911. if( snapshot ) {
  912. NamespaceDetails *d = nsdetails(ns);
  913. if ( d ) {
  914. int i = d->findIdIndex();
  915. if( i < 0 ) {
  916. if ( strstr( ns , ".system." ) == 0 )
  917. log() << "warning: no _id index on $snapshot query, ns:" << ns << endl;
  918. }
  919. else {
  920. /* [dm] The name of an _id index tends to vary, so we build the hint the hard way here.
  921. We probably need a better way to specify "use the _id index" as a hint. If someone is
  922. working in the query optimizer, please fix this then!
  923. */
  924. BSONObjBuilder b;
  925. b.append("$hint", d->idx(i).indexName());
  926. snapshotHint = b.obj();
  927. hint = snapshotHint.firstElement();
  928. }
  929. }
  930. }
  931. if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {
  932. bool nsFound = false;
  933. bool indexFound = false;
  934. BSONObj resObject;
  935. Client& c = cc();
  936. bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
  937. if ( nsFound == false || indexFound == true ) {
  938. BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
  939. bb.skip(sizeof(QueryResult));
  940. ss << " idhack ";
  941. if ( found ) {
  942. n = 1;
  943. fillQueryResultFromObj( bb , pq.getFields() , resObject );
  944. }
  945. auto_ptr< QueryResult > qr;
  946. qr.reset( (QueryResult *) bb.buf() );
  947. bb.decouple();
  948. qr->setResultFlagsToOk();
  949. qr->len = bb.len();
  950. ss << " reslen:" << bb.len();
  951. qr->setOperation(opReply);
  952. qr->cursorId = 0;
  953. qr->startingFrom = 0;
  954. qr->nReturned = n;
  955. result.setData( qr.release(), true );
  956. return false;
  957. }
  958. }
  959. // regular, not QO bypass query
  960. BSONObj oldPlan;
  961. if ( explain && ! pq.hasIndexSpecifier() ) {
  962. MultiPlanScanner mps( ns, query, order );
  963. if ( mps.usingPrerecordedPlan() )
  964. oldPlan = mps.oldExplain();
  965. }
  966. auto_ptr< MultiPlanScanner > mps( new MultiPlanScanner( ns, query, order, &hint, !explain, pq.getMin(), pq.getMax(), false, true ) );
  967. BSONObj explainSuffix;
  968. if ( explain ) {
  969. BSONObjBuilder bb;
  970. if ( !oldPlan.isEmpty() )
  971. bb.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() );
  972. explainSuffix = bb.obj();
  973. }
  974. ExplainBuilder eb;
  975. UserQueryOp original( pq, result, eb, curop );
  976. shared_ptr< UserQueryOp > o = mps->runOp( original );
  977. UserQueryOp &dqo = *o;
  978. if ( ! dqo.complete() )
  979. throw MsgAssertionException( dqo.exception() );
  980. if ( explain ) {
  981. dqo.finishExplain( explainSuffix );
  982. }
  983. n = dqo.n();
  984. long long nscanned = dqo.totalNscanned();
  985. if ( dqo.scanAndOrderRequired() )
  986. ss << " scanAndOrder ";
  987. shared_ptr<Cursor> cursor = dqo.cursor();
  988. if( logLevel >= 5 )
  989. log() << " used cursor: " << cursor.get() << endl;
  990. long long cursorid = 0;
  991. const char * exhaust = 0;
  992. if ( dqo.saveClientCursor() || ( dqo.wouldSaveClientCursor() && mps->mayRunMore() ) ) {
  993. ClientCursor *cc;
  994. bool moreClauses = mps->mayRunMore();
  995. if ( moreClauses ) {
  996. // this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield
  997. shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher(), dqo ) );
  998. cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned());
  999. }
  1000. else {
  1001. cursor->setMatcher( dqo.matcher() );
  1002. cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() );
  1003. }
  1004. cursorid = cc->cursorid();
  1005. DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
  1006. cc->setPos( n );
  1007. cc->pq = pq_shared;
  1008. cc->fields = pq.getFieldPtr();
  1009. cc->originalMessage = m;
  1010. cc->updateLocation();
  1011. if ( !cc->ok() && cc->c()->tailable() )
  1012. DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
  1013. if( queryOptions & QueryOption_Exhaust ) {
  1014. exhaust = ns;
  1015. ss << " exhaust ";
  1016. }
  1017. dqo.finishForOplogReplay(cc);
  1018. }
  1019. QueryResult *qr = (QueryResult *) result.header();
  1020. qr->cursorId = cursorid;
  1021. qr->setResultFlagsToOk();
  1022. // qr->len is updated automatically by appendData()
  1023. ss << " reslen:" << qr->len;
  1024. qr->setOperation(opReply);
  1025. qr->startingFrom = 0;
  1026. qr->nReturned = n;
  1027. int duration = curop.elapsedMillis();
  1028. bool dbprofile = curop.shouldDBProfile( duration );
  1029. if ( dbprofile || duration >= cmdLine.slowMS ) {
  1030. ss << " nscanned:" << nscanned << ' ';
  1031. if ( ntoskip )
  1032. ss << " ntoskip:" << ntoskip;
  1033. if ( dbprofile )
  1034. ss << " \nquery: ";
  1035. ss << jsobj.toString() << ' ';
  1036. }
  1037. ss << " nreturned:" << n;
  1038. return exhaust;
  1039. }
  1040. } // namespace mongo