PageRenderTime 78ms CodeModel.GetById 30ms RepoModel.GetById 1ms app.codeStats 0ms

/db/query.cpp

https://github.com/IlyaM/mongo
C++ | 984 lines | 802 code | 92 blank | 90 comment | 179 complexity | 59259be18206986b71249a0325f44796 MD5 | raw file
  1. // query.cpp
  2. /**
  3. * Copyright (C) 2008 10gen Inc.
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License, version 3,
  7. * as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU Affero General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Affero General Public License
  15. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. */
  17. #include "stdafx.h"
  18. #include "query.h"
  19. #include "pdfile.h"
  20. #include "jsobjmanipulator.h"
  21. #include "../util/builder.h"
  22. #include <time.h>
  23. #include "introspect.h"
  24. #include "btree.h"
  25. #include "../util/lruishmap.h"
  26. #include "json.h"
  27. #include "repl.h"
  28. #include "replset.h"
  29. #include "scanandorder.h"
  30. #include "security.h"
  31. #include "curop.h"
  32. #include "commands.h"
  33. #include "queryoptimizer.h"
  34. #include "lasterror.h"
  35. namespace mongo {
  36. /* We cut off further objects once we cross this threshold; thus, you might get
  37. a little bit more than this, it is a threshold rather than a limit.
  38. */
  39. const int MaxBytesToReturnToClientAtOnce = 4 * 1024 * 1024;
  40. //ns->query->DiskLoc
  41. // LRUishMap<BSONObj,DiskLoc,5> lrutest(123);
  42. extern bool useCursors;
  43. extern bool useHints;
  44. // Just try to identify best plan.
  45. class DeleteOp : public QueryOp {
  46. public:
  47. DeleteOp( bool justOne, int& bestCount ) :
  48. justOne_( justOne ),
  49. count_(),
  50. bestCount_( bestCount ),
  51. nScanned_() {
  52. }
  53. virtual void init() {
  54. c_ = qp().newCursor();
  55. matcher_.reset( new CoveredIndexMatcher( qp().query(), qp().indexKey() ) );
  56. }
  57. virtual void next() {
  58. if ( !c_->ok() ) {
  59. setComplete();
  60. return;
  61. }
  62. DiskLoc rloc = c_->currLoc();
  63. if ( matcher_->matches(c_->currKey(), rloc ) ) {
  64. if ( !c_->getsetdup(rloc) )
  65. ++count_;
  66. }
  67. c_->advance();
  68. ++nScanned_;
  69. if ( count_ > bestCount_ )
  70. bestCount_ = count_;
  71. if ( count_ > 0 ) {
  72. if ( justOne_ )
  73. setComplete();
  74. else if ( nScanned_ >= 100 && count_ == bestCount_ )
  75. setComplete();
  76. }
  77. }
  78. virtual bool mayRecordPlan() const { return !justOne_; }
  79. virtual QueryOp *clone() const {
  80. return new DeleteOp( justOne_, bestCount_ );
  81. }
  82. auto_ptr< Cursor > newCursor() const { return qp().newCursor(); }
  83. private:
  84. bool justOne_;
  85. int count_;
  86. int &bestCount_;
  87. long long nScanned_;
  88. auto_ptr< Cursor > c_;
  89. auto_ptr< CoveredIndexMatcher > matcher_;
  90. };
  91. /* ns: namespace, e.g. <database>.<collection>
  92. pattern: the "where" clause / criteria
  93. justOne: stop after 1 match
  94. */
  95. long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop, bool god) {
  96. if( !god ) {
  97. if ( strstr(ns, ".system.") ) {
  98. /* note a delete from system.indexes would corrupt the db
  99. if done here, as there are pointers into those objects in
  100. NamespaceDetails.
  101. */
  102. uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
  103. }
  104. if ( strchr( ns , '$' ) ){
  105. log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
  106. uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
  107. }
  108. }
  109. NamespaceDetails *d = nsdetails( ns );
  110. if ( ! d )
  111. return 0;
  112. uassert( 10101 , "can't remove from a capped collection" , ! d->capped );
  113. long long nDeleted = 0;
  114. QueryPlanSet s( ns, pattern, BSONObj() );
  115. int best = 0;
  116. DeleteOp original( justOne, best );
  117. shared_ptr< DeleteOp > bestOp = s.runOp( original );
  118. auto_ptr< Cursor > creal = bestOp->newCursor();
  119. if( !creal->ok() )
  120. return nDeleted;
  121. CoveredIndexMatcher matcher(pattern, creal->indexKeyPattern());
  122. auto_ptr<ClientCursor> cc( new ClientCursor(creal, ns, false) );
  123. cc->setDoingDeletes( true );
  124. CursorId id = cc->cursorid;
  125. unsigned long long nScanned = 0;
  126. do {
  127. if ( ++nScanned % 128 == 0 && !matcher.docMatcher().atomic() ) {
  128. if ( ! cc->yield() ){
  129. cc.release(); // has already been deleted elsewhere
  130. break;
  131. }
  132. }
  133. // this way we can avoid calling updateLocation() every time (expensive)
  134. // as well as some other nuances handled
  135. cc->setDoingDeletes( true );
  136. DiskLoc rloc = cc->c->currLoc();
  137. BSONObj key = cc->c->currKey();
  138. cc->c->advance();
  139. if ( ! matcher.matches( key , rloc ) )
  140. continue;
  141. assert( !cc->c->getsetdup(rloc) ); // can't be a dup, we deleted it!
  142. if ( !justOne ) {
  143. /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore
  144. blocks. here we might call millions of times which would be bad.
  145. */
  146. cc->c->noteLocation();
  147. }
  148. if ( logop ) {
  149. BSONElement e;
  150. if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
  151. BSONObjBuilder b;
  152. b.append( e );
  153. bool replJustOne = true;
  154. logOp( "d", ns, b.done(), 0, &replJustOne );
  155. } else {
  156. problem() << "deleted object without id, not logging" << endl;
  157. }
  158. }
  159. theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
  160. nDeleted++;
  161. if ( justOne )
  162. break;
  163. cc->c->checkLocation();
  164. } while ( cc->c->ok() );
  165. if ( cc.get() && ClientCursor::find( id , false ) == 0 ){
  166. cc.release();
  167. }
  168. return nDeleted;
  169. }
  170. int otherTraceLevel = 0;
  171. int initialExtentSize(int len);
  172. bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
  173. try {
  174. return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions);
  175. }
  176. catch ( AssertionException& e ) {
  177. if ( !e.msg.empty() )
  178. anObjBuilder.append("assertion", e.msg);
  179. }
  180. curop.debug().str << " assertion ";
  181. anObjBuilder.append("errmsg", "db assertion failure");
  182. anObjBuilder.append("ok", 0.0);
  183. BSONObj x = anObjBuilder.done();
  184. b.append((void*) x.objdata(), x.objsize());
  185. return true;
  186. }
  187. int nCaught = 0;
  188. void killCursors(int n, long long *ids) {
  189. int k = 0;
  190. for ( int i = 0; i < n; i++ ) {
  191. if ( ClientCursor::erase(ids[i]) )
  192. k++;
  193. }
  194. log( k == n ) << "killcursors: found " << k << " of " << n << '\n';
  195. }
  196. BSONObj id_obj = fromjson("{\"_id\":1}");
  197. BSONObj empty_obj = fromjson("{}");
  198. /* This is for languages whose "objects" are not well ordered (JSON is well ordered).
  199. [ { a : ... } , { b : ... } ] -> { a : ..., b : ... }
  200. */
  201. inline BSONObj transformOrderFromArrayFormat(BSONObj order) {
  202. /* note: this is slow, but that is ok as order will have very few pieces */
  203. BSONObjBuilder b;
  204. char p[2] = "0";
  205. while ( 1 ) {
  206. BSONObj j = order.getObjectField(p);
  207. if ( j.isEmpty() )
  208. break;
  209. BSONElement e = j.firstElement();
  210. uassert( 10102 , "bad order array", !e.eoo());
  211. uassert( 10103 , "bad order array [2]", e.isNumber());
  212. b.append(e);
  213. (*p)++;
  214. uassert( 10104 , "too many ordering elements", *p <= '9');
  215. }
  216. return b.obj();
  217. }
  218. //int dump = 0;
  219. /* empty result for error conditions */
  220. QueryResult* emptyMoreResult(long long cursorid) {
  221. BufBuilder b(32768);
  222. b.skip(sizeof(QueryResult));
  223. QueryResult *qr = (QueryResult *) b.buf();
  224. qr->cursorId = 0; // 0 indicates no more data to retrieve.
  225. qr->startingFrom = 0;
  226. qr->len = b.len();
  227. qr->setOperation(opReply);
  228. qr->nReturned = 0;
  229. b.decouple();
  230. return qr;
  231. }
  232. QueryResult* getMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop ) {
  233. StringBuilder& ss = curop.debug().str;
  234. ClientCursor::Pointer p(cursorid);
  235. ClientCursor *cc = p._c;
  236. int bufSize = 512;
  237. if ( cc ){
  238. bufSize += sizeof( QueryResult );
  239. bufSize += ( ntoreturn ? 4 : 1 ) * 1024 * 1024;
  240. }
  241. BufBuilder b( bufSize );
  242. b.skip(sizeof(QueryResult));
  243. int resultFlags = 0; //QueryResult::ResultFlag_AwaitCapable;
  244. int start = 0;
  245. int n = 0;
  246. if ( !cc ) {
  247. log() << "getMore: cursorid not found " << ns << " " << cursorid << endl;
  248. cursorid = 0;
  249. resultFlags = QueryResult::ResultFlag_CursorNotFound;
  250. }
  251. else {
  252. ss << " query: " << cc->query << " ";
  253. start = cc->pos;
  254. Cursor *c = cc->c.get();
  255. c->checkLocation();
  256. while ( 1 ) {
  257. if ( !c->ok() ) {
  258. if ( c->tailable() ) {
  259. if ( c->advance() ) {
  260. continue;
  261. }
  262. break;
  263. }
  264. p.release();
  265. bool ok = ClientCursor::erase(cursorid);
  266. assert(ok);
  267. cursorid = 0;
  268. cc = 0;
  269. break;
  270. }
  271. if ( !cc->matcher->matches(c->currKey(), c->currLoc() ) ) {
  272. }
  273. else {
  274. //out() << "matches " << c->currLoc().toString() << '\n';
  275. if( c->getsetdup(c->currLoc()) ) {
  276. //out() << " but it's a dup \n";
  277. }
  278. else {
  279. BSONObj js = c->current();
  280. fillQueryResultFromObj(b, cc->filter.get(), js);
  281. n++;
  282. if ( (ntoreturn>0 && (n >= ntoreturn || b.len() > MaxBytesToReturnToClientAtOnce)) ||
  283. (ntoreturn==0 && b.len()>1*1024*1024) ) {
  284. c->advance();
  285. cc->pos += n;
  286. //cc->updateLocation();
  287. break;
  288. }
  289. }
  290. }
  291. c->advance();
  292. }
  293. if ( cc ) {
  294. cc->updateLocation();
  295. cc->mayUpgradeStorage();
  296. }
  297. }
  298. QueryResult *qr = (QueryResult *) b.buf();
  299. qr->len = b.len();
  300. qr->setOperation(opReply);
  301. qr->_resultFlags() = resultFlags;
  302. qr->cursorId = cursorid;
  303. qr->startingFrom = start;
  304. qr->nReturned = n;
  305. b.decouple();
  306. return qr;
  307. }
  308. class CountOp : public QueryOp {
  309. public:
  310. CountOp( const BSONObj &spec ) : spec_( spec ), count_(), bc_() {}
  311. virtual void init() {
  312. query_ = spec_.getObjectField( "query" );
  313. c_ = qp().newCursor();
  314. matcher_.reset( new CoveredIndexMatcher( query_, c_->indexKeyPattern() ) );
  315. if ( qp().exactKeyMatch() && ! matcher_->needRecord() ) {
  316. query_ = qp().simplifiedQuery( qp().indexKey() );
  317. bc_ = dynamic_cast< BtreeCursor* >( c_.get() );
  318. bc_->forgetEndKey();
  319. }
  320. skip_ = spec_["skip"].numberLong();
  321. limit_ = spec_["limit"].numberLong();
  322. }
  323. virtual void next() {
  324. if ( !c_->ok() ) {
  325. setComplete();
  326. return;
  327. }
  328. if ( bc_ ) {
  329. if ( firstMatch_.isEmpty() ) {
  330. firstMatch_ = bc_->currKeyNode().key;
  331. // if not match
  332. if ( query_.woCompare( firstMatch_, BSONObj(), false ) ) {
  333. setComplete();
  334. return;
  335. }
  336. _gotOne();
  337. } else {
  338. if ( !firstMatch_.woEqual( bc_->currKeyNode().key ) ) {
  339. setComplete();
  340. return;
  341. }
  342. _gotOne();
  343. }
  344. } else {
  345. if ( !matcher_->matches(c_->currKey(), c_->currLoc() ) ) {
  346. }
  347. else if( !c_->getsetdup(c_->currLoc()) ) {
  348. _gotOne();
  349. }
  350. }
  351. c_->advance();
  352. }
  353. virtual QueryOp *clone() const {
  354. return new CountOp( spec_ );
  355. }
  356. long long count() const { return count_; }
  357. virtual bool mayRecordPlan() const { return true; }
  358. private:
  359. void _gotOne(){
  360. if ( skip_ ){
  361. skip_--;
  362. return;
  363. }
  364. if ( limit_ > 0 && count_ >= limit_ ){
  365. setComplete();
  366. return;
  367. }
  368. count_++;
  369. }
  370. BSONObj spec_;
  371. long long count_;
  372. long long skip_;
  373. long long limit_;
  374. auto_ptr< Cursor > c_;
  375. BSONObj query_;
  376. BtreeCursor *bc_;
  377. auto_ptr< CoveredIndexMatcher > matcher_;
  378. BSONObj firstMatch_;
  379. };
  380. /* { count: "collectionname"[, query: <query>] }
  381. returns -1 on ns does not exist error.
  382. */
  383. long long runCount( const char *ns, const BSONObj &cmd, string &err ) {
  384. NamespaceDetails *d = nsdetails( ns );
  385. if ( !d ) {
  386. err = "ns missing";
  387. return -1;
  388. }
  389. BSONObj query = cmd.getObjectField("query");
  390. // count of all objects
  391. if ( query.isEmpty() ){
  392. long long num = d->nrecords;
  393. num = num - cmd["skip"].numberLong();
  394. if ( num < 0 ) {
  395. num = 0;
  396. }
  397. if ( cmd["limit"].isNumber() ){
  398. long long limit = cmd["limit"].numberLong();
  399. if ( limit < num ){
  400. num = limit;
  401. }
  402. }
  403. return num;
  404. }
  405. QueryPlanSet qps( ns, query, BSONObj() );
  406. CountOp original( cmd );
  407. shared_ptr< CountOp > res = qps.runOp( original );
  408. if ( !res->complete() ) {
  409. log() << "Count with ns: " << ns << " and query: " << query
  410. << " failed with exception: " << res->exceptionMessage()
  411. << endl;
  412. return 0;
  413. }
  414. return res->count();
  415. }
  // Number of seconds the Initial find-start mode may run (compared against
  // findingStartTimer_.seconds() in UserQueryOp::next()) before the op
  // switches to the FindExtent mode.
  416. int _findingStartInitialTimeout = 5; // configurable for testing
  417. // Implements database 'query' requests using the query optimizer's QueryOp interface
  //
  // The op appends matching objects to its own reply buffer (b_), or hands them
  // to a ScanAndOrder when the plan requires sorting. When the
  // QueryOption_OplogReplay bit is set it first runs a "finding start" phase
  // (see next()) that looks for the position where the normal scan should
  // begin.
  418. class UserQueryOp : public QueryOp {
  419. public:
  // Phases of the finding-start search; see the switch in next().
  420. enum FindingStartMode { Initial, FindExtent, InExtent };
  // ntoskip:      matches to discard before returning any (uasserts 10105 if negative)
  // ntoreturn:    batch size / limit; 0 selects the default batch rule in next()
  // order:        requested sort order, passed to ScanAndOrder when needed
  // wantMore:     false when the caller does not want a cursor kept for more data
  // explain:      when true matches are only counted, not written to the buffer
  // filter:       field filter applied to returned objects; stored as a raw
  //               pointer and not deleted by this class
  // queryOptions: option bits; OplogReplay and CursorTailable are read here
  421. UserQueryOp( int ntoskip, int ntoreturn, const BSONObj &order, bool wantMore,
  422. bool explain, FieldMatcher *filter, int queryOptions ) :
  423. b_( 32768 ),
  424. ntoskip_( ntoskip ),
  425. ntoreturn_( ntoreturn ),
  426. order_( order ),
  427. wantMore_( wantMore ),
  428. explain_( explain ),
  429. filter_( filter ),
  430. ordering_(),
  431. nscanned_(),
  432. queryOptions_( queryOptions ),
  433. n_(),
  434. soSize_(),
  435. saveClientCursor_(),
  436. findingStart_( (queryOptions & QueryOption_OplogReplay) != 0 ),
  437. findingStartCursor_(),
  438. findingStartMode_()
  439. {
  440. uassert( 10105 , "bad skip value in query", ntoskip >= 0);
  441. }
  // Reserve room for the QueryResult header, open the cursor for this plan (a
  // reverse cursor wrapped in a ClientCursor when in finding-start mode), build
  // the matcher, and set up ScanAndOrder if the plan cannot deliver the order.
  442. virtual void init() {
  443. b_.skip( sizeof( QueryResult ) );
  444. if ( findingStart_ ) {
  445. // Use a ClientCursor here so we can release db mutex while scanning
  446. // oplog (can take quite a while with large oplogs).
  447. auto_ptr<Cursor> c = qp().newReverseCursor();
  448. findingStartCursor_ = new ClientCursor(c, qp().ns(), false);
  449. findingStartTimer_.reset();
  450. findingStartMode_ = Initial;
  451. } else {
  452. c_ = qp().newCursor();
  453. }
  454. matcher_.reset(new CoveredIndexMatcher(qp().query(), qp().indexKey()));
  455. if ( qp().scanAndOrderRequired() ) {
  456. ordering_ = true;
  457. so_.reset( new ScanAndOrder( ntoskip_, ntoreturn_, order_ ) );
  // sorted results are produced in one go, so no further batches are offered
  458. wantMore_ = false;
  459. }
  460. }
  // Location at which to start scanning the extent that contains rec: the
  // extent's first record, or capFirstNewRecord when rec lies in capExtent.
  461. DiskLoc startLoc( const DiskLoc &rec ) {
  462. Extent *e = rec.rec()->myExtent( rec );
  463. if ( e->myLoc != qp().nsd()->capExtent )
  464. return e->firstRecord;
  465. // Likely we are on the fresh side of capExtent, so return first fresh record.
  466. // If we are on the stale side of capExtent, then the collection is small and it
  467. // doesn't matter if we start the extent scan with capFirstNewRecord.
  468. return qp().nsd()->capFirstNewRecord;
  469. }
  // First record of the extent preceding the one that contains rec, wrapping to
  // lastExtent when there is no previous extent. Returns a null DiskLoc when
  // that preceding extent is capExtent.
  470. DiskLoc prevLoc( const DiskLoc &rec ) {
  471. Extent *e = rec.rec()->myExtent( rec );
  472. if ( e->xprev.isNull() )
  473. e = qp().nsd()->lastExtent.ext();
  474. else
  475. e = e->xprev.ext();
  476. if ( e->myLoc != qp().nsd()->capExtent )
  477. return e->firstRecord;
  478. return DiskLoc(); // reached beginning of collection
  479. }
  // Point findingStartCursor_ at a new ClientCursor over a forward cursor that
  // starts at startLoc.
  // NOTE(review): the previous findingStartCursor_ value is overwritten without
  // a ClientCursor::erase() call -- confirm whether the old ClientCursor is
  // cleaned up elsewhere or is leaked here.
  480. void createClientCursor( const DiskLoc &startLoc = DiskLoc() ) {
  481. auto_ptr<Cursor> c = qp().newCursor( startLoc );
  482. findingStartCursor_ = new ClientCursor(c, qp().ns(), false);
  483. }
  // Inside a RARELY block (presumably: only on some calls -- confirm the macro):
  // save the cursor position, construct and immediately destroy a dbtemprelease
  // (presumably releasing the db lock for a moment, per the comment in init()),
  // then look the client cursor up again by id. The lookup result is stored
  // as-is, so findingStartCursor_ may be null afterwards; next() checks for that.
  484. void maybeRelease() {
  485. RARELY {
  486. CursorId id = findingStartCursor_->cursorid;
  487. findingStartCursor_->updateLocation();
  488. {
  489. dbtemprelease t;
  490. }
  491. findingStartCursor_ = ClientCursor::find( id, false );
  492. }
  493. }
  // Perform one step of the operation. While findingStart_ is set, each call
  // performs one step of the finding-start search and returns. Afterwards each
  // call examines one position of the real cursor c_.
  494. virtual void next() {
  495. if ( findingStart_ ) {
  496. if ( !findingStartCursor_ || !findingStartCursor_->c->ok() ) {
  497. findingStart_ = false;
  498. c_ = qp().newCursor(); // on error, start from beginning
  499. return;
  500. }
  501. switch( findingStartMode_ ) {
  // Initial: walk the reverse cursor while records still match; switch to
  // FindExtent if this takes longer than _findingStartInitialTimeout seconds.
  502. case Initial: {
  503. if ( !matcher_->matches( findingStartCursor_->c->currKey(), findingStartCursor_->c->currLoc() ) ) {
  504. findingStart_ = false; // found first record out of query range, so scan normally
  505. c_ = qp().newCursor( findingStartCursor_->c->currLoc() );
  506. return;
  507. }
  508. findingStartCursor_->c->advance();
  509. RARELY {
  510. if ( findingStartTimer_.seconds() >= _findingStartInitialTimeout ) {
  511. createClientCursor( startLoc( findingStartCursor_->c->currLoc() ) );
  512. findingStartMode_ = FindExtent;
  513. return;
  514. }
  515. }
  516. maybeRelease();
  517. return;
  518. }
  // FindExtent: step backwards one extent per call (via prevLoc) until the
  // record under the cursor no longer matches, then continue in InExtent.
  519. case FindExtent: {
  520. if ( !matcher_->matches( findingStartCursor_->c->currKey(), findingStartCursor_->c->currLoc() ) ) {
  521. findingStartMode_ = InExtent;
  522. return;
  523. }
  524. DiskLoc prev = prevLoc( findingStartCursor_->c->currLoc() );
  525. if ( prev.isNull() ) { // hit beginning, so start scanning from here
  526. createClientCursor();
  527. findingStartMode_ = InExtent;
  528. return;
  529. }
  530. // There might be a more efficient implementation than creating new cursor & client cursor each time,
  531. // not worrying about that for now
  532. createClientCursor( prev );
  533. maybeRelease();
  534. return;
  535. }
  // InExtent: scan forward until the first matching record and start the
  // real cursor c_ there.
  536. case InExtent: {
  537. if ( matcher_->matches( findingStartCursor_->c->currKey(), findingStartCursor_->c->currLoc() ) ) {
  538. findingStart_ = false; // found first record in query range, so scan normally
  539. c_ = qp().newCursor( findingStartCursor_->c->currLoc() );
  540. return;
  541. }
  542. findingStartCursor_->c->advance();
  543. maybeRelease();
  544. return;
  545. }
  546. default: {
  547. massert( 12600, "invalid findingStartMode_", false );
  548. }
  549. }
  550. }
  // Finding-start is over: drop the helper client cursor, if one is still around.
  551. if ( findingStartCursor_ ) {
  552. ClientCursor::erase( findingStartCursor_->cursorid );
  553. findingStartCursor_ = 0;
  554. }
  555. if ( !c_->ok() ) {
  556. finish();
  557. return;
  558. }
  559. bool mayCreateCursor1 = wantMore_ && ntoreturn_ != 1 && useCursors;
  // disabled debug output
  560. if( 0 ) {
  561. BSONObj js = c_->current();
  562. cout << "SCANNING " << js << endl;
  563. }
  564. nscanned_++;
  565. if ( !matcher_->matches(c_->currKey(), c_->currLoc() ) ) {
  566. ;
  567. }
  568. else {
  569. DiskLoc cl = c_->currLoc();
  570. if( !c_->getsetdup(cl) ) {
  571. BSONObj js = c_->current();
  572. // got a match.
  573. assert( js.objsize() >= 0 ); //defensive for segfaults
  574. if ( ordering_ ) {
  575. // note: no cursors for non-indexed, ordered results. results must be fairly small.
  576. so_->add(js);
  577. }
  578. else if ( ntoskip_ > 0 ) {
  579. ntoskip_--;
  580. } else {
  581. if ( explain_ ) {
  582. n_++;
  583. if ( n_ >= ntoreturn_ && !wantMore_ ) {
  584. // .limit() was used, show just that much.
  585. finish();
  586. return;
  587. }
  588. }
  589. else {
  590. fillQueryResultFromObj(b_, filter_, js);
  591. n_++;
  592. if ( (ntoreturn_>0 && (n_ >= ntoreturn_ || b_.len() > MaxBytesToReturnToClientAtOnce)) ||
  593. (ntoreturn_==0 && (b_.len()>1*1024*1024 || n_>=101)) ) {
  594. /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there
  595. is only a size limit. The idea is that on a find() where one doesn't use much results,
  596. we don't return much, but once getmore kicks in, we start pushing significant quantities.
  597. The n limit (vs. size) is important when someone fetches only one small field from big
  598. objects, which causes massive scanning server-side.
  599. */
  600. /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */
  601. if ( mayCreateCursor1 ) {
  602. c_->advance();
  603. if ( c_->ok() ) {
  604. // more...so save a cursor
  605. saveClientCursor_ = true;
  606. }
  607. }
  608. finish();
  609. return;
  610. }
  611. }
  612. }
  613. }
  614. }
  615. c_->advance();
  616. }
  // Finalise the result: for explain, n_ becomes the ScanAndOrder size when
  // ordering; otherwise ordered results are written into b_. The cursor is
  // marked tailable when requested, and a cursor that reports tailable() is
  // flagged to be saved. Marks the op complete.
  617. void finish() {
  618. if ( explain_ ) {
  619. n_ = ordering_ ? so_->size() : n_;
  620. } else if ( ordering_ ) {
  621. so_->fill(b_, filter_, n_);
  622. }
  623. if ( mayCreateCursor2() ) {
  624. c_->setTailable();
  625. }
  626. // If the tailing request succeeded.
  627. if ( c_->tailable() ) {
  628. saveClientCursor_ = true;
  629. }
  630. setComplete();
  631. }
  // Plans are not recorded for single-result queries.
  632. virtual bool mayRecordPlan() const { return ntoreturn_ != 1; }
  // The clone is built from the constructor arguments only; scan state (cursor,
  // counters, buffer contents) is not copied.
  633. virtual QueryOp *clone() const {
  634. return new UserQueryOp( ntoskip_, ntoreturn_, order_, wantMore_, explain_, filter_, queryOptions_ );
  635. }
  636. BufBuilder &builder() { return b_; }
  637. bool scanAndOrderRequired() const { return ordering_; }
  // cursor() and matcher() return their auto_ptr member by value, which hands
  // ownership to the caller and leaves the member empty.
  638. auto_ptr< Cursor > cursor() { return c_; }
  639. auto_ptr< CoveredIndexMatcher > matcher() { return matcher_; }
  640. int n() const { return n_; }
  641. long long nscanned() const { return nscanned_; }
  642. bool saveClientCursor() const { return saveClientCursor_; }
  // True when a tailable cursor was requested and more than one result is wanted.
  643. bool mayCreateCursor2() const { return ( queryOptions_ & QueryOption_CursorTailable ) && ntoreturn_ != 1; }
  644. private:
  645. BufBuilder b_;
  646. int ntoskip_;
  647. int ntoreturn_;
  648. BSONObj order_;
  649. bool wantMore_;
  650. bool explain_;
  651. FieldMatcher *filter_;
  652. bool ordering_;
  653. auto_ptr< Cursor > c_;
  654. long long nscanned_;
  655. int queryOptions_;
  656. auto_ptr< CoveredIndexMatcher > matcher_;
  657. int n_;
  // soSize_ is zero-initialised by the constructor and not read anywhere in this class.
  658. int soSize_;
  659. bool saveClientCursor_;
  660. auto_ptr< ScanAndOrder > so_;
  661. bool findingStart_;
  662. ClientCursor * findingStartCursor_;
  663. Timer findingStartTimer_;
  664. FindingStartMode findingStartMode_;
  665. };
  666. /* run a query -- includes checking for and running a Command */
  667. auto_ptr< QueryResult > runQuery(Message& m, QueryMessage& q, CurOp& curop ) {
  668. StringBuilder& ss = curop.debug().str;
  669. const char *ns = q.ns;
  670. int ntoskip = q.ntoskip;
  671. int _ntoreturn = q.ntoreturn;
  672. BSONObj jsobj = q.query;
  673. auto_ptr< FieldMatcher > filter = q.fields; // what fields to return (unspecified = full object)
  674. int queryOptions = q.queryOptions;
  675. BSONObj snapshotHint;
  676. if( logLevel >= 2 )
  677. log() << "runQuery: " << ns << jsobj << endl;
  678. long long nscanned = 0;
  679. bool wantMore = true;
  680. int ntoreturn = _ntoreturn;
  681. if ( _ntoreturn < 0 ) {
  682. /* _ntoreturn greater than zero is simply a hint on how many objects to send back per
  683. "cursor batch".
  684. A negative number indicates a hard limit.
  685. */
  686. ntoreturn = -_ntoreturn;
  687. wantMore = false;
  688. }
  689. ss << "query " << ns << " ntoreturn:" << ntoreturn;
  690. curop.setQuery(jsobj);
  691. BufBuilder bb;
  692. BSONObjBuilder cmdResBuf;
  693. long long cursorid = 0;
  694. bb.skip(sizeof(QueryResult));
  695. auto_ptr< QueryResult > qr;
  696. int n = 0;
  697. Client& c = cc();
  698. /* we assume you are using findOne() for running a cmd... */
  699. if ( ntoreturn == 1 && runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
  700. n = 1;
  701. qr.reset( (QueryResult *) bb.buf() );
  702. bb.decouple();
  703. qr->setResultFlagsToOk();
  704. qr->len = bb.len();
  705. ss << " reslen:" << bb.len();
  706. // qr->channel = 0;
  707. qr->setOperation(opReply);
  708. qr->cursorId = cursorid;
  709. qr->startingFrom = 0;
  710. qr->nReturned = n;
  711. }
  712. else { /* regular query */
  713. mongolock lk(false); // read lock
  714. Client::Context ctx( ns , dbpath , &lk );
  715. /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair
  716. so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to
  717. query the nonmaster member of a replica pair.
  718. */
  719. uassert( 10107 , "not master", isMaster() || (queryOptions & QueryOption_SlaveOk) || replSettings.slave == SimpleSlave );
  720. BSONElement hint;
  721. BSONObj min;
  722. BSONObj max;
  723. bool explain = false;
  724. bool _gotquery = false;
  725. bool snapshot = false;
  726. BSONObj query;
  727. {
  728. BSONElement e = jsobj.getField("$query");
  729. if ( e.eoo() )
  730. e = jsobj.getField("query");
  731. if ( !e.eoo() && (e.type() == Object || e.type() == Array) ) {
  732. query = e.embeddedObject();
  733. _gotquery = true;
  734. }
  735. }
  736. BSONObj order;
  737. {
  738. BSONElement e = jsobj.getField("$orderby");
  739. if ( e.eoo() )
  740. e = jsobj.getField("orderby");
  741. if ( !e.eoo() ) {
  742. order = e.embeddedObjectUserCheck();
  743. if ( e.type() == Array )
  744. order = transformOrderFromArrayFormat(order);
  745. }
  746. }
  747. if ( !_gotquery && order.isEmpty() )
  748. query = jsobj;
  749. else {
  750. explain = jsobj.getBoolField("$explain");
  751. if ( useHints )
  752. hint = jsobj.getField("$hint");
  753. min = jsobj.getObjectField("$min");
  754. max = jsobj.getObjectField("$max");
  755. BSONElement e = jsobj.getField("$snapshot");
  756. snapshot = !e.eoo() && e.trueValue();
  757. if( snapshot ) {
  758. uassert( 12001 , "E12001 can't sort with $snapshot", order.isEmpty());
  759. uassert( 12002 , "E12002 can't use hint with $snapshot", hint.eoo());
  760. NamespaceDetails *d = nsdetails(ns);
  761. if ( d ){
  762. int i = d->findIdIndex();
  763. if( i < 0 ) {
  764. if ( strstr( ns , ".system." ) == 0 )
  765. log() << "warning: no _id index on $snapshot query, ns:" << ns << endl;
  766. }
  767. else {
  768. /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here.
  769. probably need a better way to specify "use the _id index" as a hint. if someone is
  770. in the query optimizer please fix this then!
  771. */
  772. BSONObjBuilder b;
  773. b.append("$hint", d->idx(i).indexName());
  774. snapshotHint = b.obj();
  775. hint = snapshotHint.firstElement();
  776. }
  777. }
  778. }
  779. }
  780. /* The ElemIter will not be happy if this isn't really an object. So throw exception
  781. here when that is true.
  782. (Which may indicate bad data from client.)
  783. */
  784. if ( query.objsize() == 0 ) {
  785. out() << "Bad query object?\n jsobj:";
  786. out() << jsobj.toString() << "\n query:";
  787. out() << query.toString() << endl;
  788. uassert( 10110 , "bad query object", false);
  789. }
  790. bool idHackWorked = false;
  791. if ( isSimpleIdQuery( query ) ){
  792. nscanned = 1;
  793. bool nsFound = false;
  794. bool indexFound = false;
  795. BSONObj resObject;
  796. bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
  797. if ( nsFound == false || indexFound == true ){
  798. idHackWorked = true;
  799. if ( found ){
  800. n = 1;
  801. fillQueryResultFromObj( bb , filter.get() , resObject );
  802. }
  803. qr.reset( (QueryResult *) bb.buf() );
  804. bb.decouple();
  805. qr->setResultFlagsToOk();
  806. qr->len = bb.len();
  807. ss << " reslen:" << bb.len();
  808. qr->setOperation(opReply);
  809. qr->cursorId = cursorid;
  810. qr->startingFrom = 0;
  811. qr->nReturned = n;
  812. }
  813. }
  814. if ( ! idHackWorked ){ // non-simple _id lookup
  815. BSONObj oldPlan;
  816. if ( explain && hint.eoo() && min.isEmpty() && max.isEmpty() ) {
  817. QueryPlanSet qps( ns, query, order );
  818. if ( qps.usingPrerecordedPlan() )
  819. oldPlan = qps.explain();
  820. }
  821. QueryPlanSet qps( ns, query, order, &hint, !explain, min, max );
  822. UserQueryOp original( ntoskip, ntoreturn, order, wantMore, explain, filter.get(), queryOptions );
  823. shared_ptr< UserQueryOp > o = qps.runOp( original );
  824. UserQueryOp &dqo = *o;
  825. massert( 10362 , dqo.exceptionMessage(), dqo.complete() );
  826. n = dqo.n();
  827. nscanned = dqo.nscanned();
  828. if ( dqo.scanAndOrderRequired() )
  829. ss << " scanAndOrder ";
  830. auto_ptr< Cursor > c = dqo.cursor();
  831. log( 5 ) << " used cursor: " << c.get() << endl;
  832. if ( dqo.saveClientCursor() ) {
  833. // the clientcursor now owns the Cursor* and 'c' is released:
  834. ClientCursor *cc = new ClientCursor(c, ns, !(queryOptions & QueryOption_NoCursorTimeout));
  835. cursorid = cc->cursorid;
  836. cc->query = jsobj.getOwned();
  837. DEV out() << " query has more, cursorid: " << cursorid << endl;
  838. cc->matcher = dqo.matcher();
  839. cc->pos = n;
  840. cc->filter = filter;
  841. cc->originalMessage = m;
  842. cc->updateLocation();
  843. if ( !cc->c->ok() && cc->c->tailable() ) {
  844. DEV out() << " query has no more but tailable, cursorid: " << cursorid << endl;
  845. } else {
  846. DEV out() << " query has more, cursorid: " << cursorid << endl;
  847. }
  848. }
  849. if ( explain ) {
  850. BSONObjBuilder builder;
  851. builder.append("cursor", c->toString());
  852. builder.append("startKey", c->prettyStartKey());
  853. builder.append("endKey", c->prettyEndKey());
  854. builder.append("nscanned", double( dqo.nscanned() ) );
  855. builder.append("n", n);
  856. if ( dqo.scanAndOrderRequired() )
  857. builder.append("scanAndOrder", true);
  858. builder.append("millis", curop.elapsedMillis());
  859. if ( !oldPlan.isEmpty() )
  860. builder.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() );
  861. if ( hint.eoo() )
  862. builder.appendElements(qps.explain());
  863. BSONObj obj = builder.done();
  864. fillQueryResultFromObj(dqo.builder(), 0, obj);
  865. n = 1;
  866. }
  867. qr.reset( (QueryResult *) dqo.builder().buf() );
  868. dqo.builder().decouple();
  869. qr->cursorId = cursorid;
  870. qr->setResultFlagsToOk();
  871. qr->len = dqo.builder().len();
  872. ss << " reslen:" << qr->len;
  873. qr->setOperation(opReply);
  874. qr->startingFrom = 0;
  875. qr->nReturned = n;
  876. }
  877. } // end else for regular query
  878. int duration = curop.elapsedMillis();
  879. bool dbprofile = curop.shouldDBProfile( duration );
  880. if ( dbprofile || duration >= cmdLine.slowMS ) {
  881. ss << " nscanned:" << nscanned << ' ';
  882. if ( ntoskip )
  883. ss << " ntoskip:" << ntoskip;
  884. if ( dbprofile )
  885. ss << " \nquery: ";
  886. ss << jsobj << ' ';
  887. }
  888. ss << " nreturned:" << n;
  889. return qr;
  890. }
  891. } // namespace mongo