
/be/src/runtime/disk-io-mgr.cc

https://gitlab.com/s9perween/Impala
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "runtime/disk-io-mgr.h"
#include "runtime/disk-io-mgr-internal.h"

using namespace boost;
using namespace impala;
using namespace std;

// Control the number of disks on the machine. If 0, this comes from the system
// settings.
DEFINE_int32(num_disks, 0, "Number of disks on data node.");

// Default IoMgr configs.
// The maximum number of threads per disk is also the max queue depth per disk.
// The read size is the size of the reads sent to hdfs/os.
// There is a trade off of latency and throughput, trying to keep disks busy but
// not introduce seeks. The literature seems to agree that with 8 MB reads, random
// io and sequential io perform similarly.
DEFINE_int32(num_threads_per_disk, 0, "number of threads per disk");
DEFINE_int32(read_size, 8 * 1024 * 1024, "Read Size (in bytes)");
DEFINE_int32(min_buffer_size, 1024, "The minimum read buffer size (in bytes)");

// Turning this to false will make asan much more effective for IO buffer related
// bugs.
DEFINE_bool(reuse_io_buffers, true, "(Advanced) If true, IoMgr will reuse IoBuffers "
    "across queries.");

// Rotational disks should have 1 thread per disk to minimize seeks. Non-rotational
// disks don't have this penalty and benefit from multiple concurrent IO requests.
static const int THREADS_PER_ROTATIONAL_DISK = 1;
static const int THREADS_PER_FLASH_DISK = 8;

// The IoMgr is able to run with a wide range of memory usage. If a query has memory
// remaining less than this value, the IoMgr will stop all buffering regardless of the
// current queue size.
static const int LOW_MEMORY = 64 * 1024 * 1024;

const int DiskIoMgr::DEFAULT_QUEUE_CAPACITY = 2;

// This class provides a cache of ReaderContext objects. ReaderContexts are recycled.
// This is good for locality as well as lock contention. The cache has the property that
// regardless of how many clients get added/removed, the memory locations for
// existing clients do not change (not the case with std::vector), minimizing the locks
// we have to take across all readers.
// All functions on this object are thread safe.
class DiskIoMgr::ReaderCache {
 public:
  ReaderCache(DiskIoMgr* io_mgr) : io_mgr_(io_mgr) {}

  // Returns reader to the cache. This reader object can now be reused.
  void ReturnReader(ReaderContext* reader) {
    DCHECK(reader->state_ != ReaderContext::Inactive);
    reader->state_ = ReaderContext::Inactive;
    lock_guard<mutex> l(lock_);
    inactive_readers_.push_back(reader);
  }

  // Returns a new reader object. Allocates a new reader context if necessary.
  ReaderContext* GetNewReader() {
    lock_guard<mutex> l(lock_);
    if (!inactive_readers_.empty()) {
      ReaderContext* reader = inactive_readers_.front();
      inactive_readers_.pop_front();
      return reader;
    } else {
      ReaderContext* reader = new ReaderContext(io_mgr_, io_mgr_->num_disks());
      all_readers_.push_back(reader);
      return reader;
    }
  }

  // This object has the same lifetime as the disk IoMgr.
  ~ReaderCache() {
    for (list<ReaderContext*>::iterator it = all_readers_.begin();
        it != all_readers_.end(); ++it) {
      delete *it;
    }
  }

  // Validates that all readers are cleaned up and in the inactive state. No locks
  // are taken since this is only called from the disk IoMgr destructor.
  bool ValidateAllInactive() {
    for (list<ReaderContext*>::iterator it = all_readers_.begin();
        it != all_readers_.end(); ++it) {
      if ((*it)->state_ != ReaderContext::Inactive) {
        return false;
      }
    }
    DCHECK_EQ(all_readers_.size(), inactive_readers_.size());
    return all_readers_.size() == inactive_readers_.size();
  }

  string DebugString();

 private:
  DiskIoMgr* io_mgr_;

  // Lock to protect all members below.
  mutex lock_;

  // List of all readers created. Used for debugging.
  list<ReaderContext*> all_readers_;

  // List of inactive readers. These objects can be used for a new reader.
  list<ReaderContext*> inactive_readers_;
};
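
// Illustrative use of the cache (a sketch of the call pattern already used by
// RegisterReader()/UnregisterReader() below, not a new code path):
//
//   ReaderContext* reader = reader_cache_->GetNewReader();  // recycled or newly allocated
//   reader->Reset(hdfs, mem_tracker);                       // re-initialize for this query
//   ...                                                     // disk threads work on the reader
//   reader_cache_->ReturnReader(reader);                    // marked Inactive, kept for reuse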

string DiskIoMgr::ReaderCache::DebugString() {
  lock_guard<mutex> l(lock_);
  stringstream ss;
  for (list<ReaderContext*>::iterator it = all_readers_.begin();
      it != all_readers_.end(); ++it) {
    unique_lock<mutex> lock((*it)->lock_);
    ss << (*it)->DebugString() << endl;
  }
  return ss.str();
}

string DiskIoMgr::DebugString() {
  stringstream ss;
  ss << "Readers: " << endl << reader_cache_->DebugString() << endl;
  ss << "Disks: " << endl;
  for (int i = 0; i < disk_queues_.size(); ++i) {
    unique_lock<mutex> lock(disk_queues_[i]->lock);
    ss << " " << (void*) disk_queues_[i] << ":";
    if (!disk_queues_[i]->readers.empty()) {
      ss << " Readers: ";
      for (list<ReaderContext*>::iterator it = disk_queues_[i]->readers.begin();
          it != disk_queues_[i]->readers.end(); ++it) {
        ss << (void*)*it;
      }
    }
    ss << endl;
  }
  return ss.str();
}

DiskIoMgr::BufferDescriptor::BufferDescriptor(DiskIoMgr* io_mgr) :
  io_mgr_(io_mgr), reader_(NULL), buffer_(NULL) {
}

void DiskIoMgr::BufferDescriptor::Reset(ReaderContext* reader,
    ScanRange* range, char* buffer, int64_t buffer_len) {
  DCHECK(io_mgr_ != NULL);
  DCHECK(buffer_ == NULL);
  DCHECK(range != NULL);
  DCHECK(buffer != NULL);
  DCHECK_GE(buffer_len, 0);
  reader_ = reader;
  scan_range_ = range;
  buffer_ = buffer;
  buffer_len_ = buffer_len;
  len_ = 0;
  eosr_ = false;
  status_ = Status::OK;
  mem_tracker_ = NULL;
}

void DiskIoMgr::BufferDescriptor::Return() {
  DCHECK(io_mgr_ != NULL);
  io_mgr_->ReturnBuffer(this);
}

void DiskIoMgr::BufferDescriptor::SetMemTracker(MemTracker* tracker) {
  // Cached buffers don't count towards mem usage.
  if (scan_range_->cached_buffer_ != NULL) return;
  if (mem_tracker_ == tracker) return;
  if (mem_tracker_ != NULL) mem_tracker_->Release(buffer_len_);
  mem_tracker_ = tracker;
  if (mem_tracker_ != NULL) mem_tracker_->Consume(buffer_len_);
}

static void CheckSseSupport() {
  if (!CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
    LOG(WARNING) << "This machine does not support sse4_2. The default IO system "
        "configurations are suboptimal for this hardware. Consider "
        "increasing the number of threads per disk by restarting impalad "
        "using the --num_threads_per_disk flag with a higher value";
  }
}

DiskIoMgr::DiskIoMgr() :
    num_threads_per_disk_(FLAGS_num_threads_per_disk),
    max_buffer_size_(FLAGS_read_size),
    min_buffer_size_(FLAGS_min_buffer_size),
    cached_read_options_(NULL),
    shut_down_(false),
    total_bytes_read_counter_(TCounterType::BYTES),
    read_timer_(TCounterType::TIME_NS) {
  int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
  free_buffers_.resize(BitUtil::Log2(max_buffer_size_scaled) + 1);
  int num_disks = FLAGS_num_disks;
  if (num_disks == 0) num_disks = DiskInfo::num_disks();
  disk_queues_.resize(num_disks);
  CheckSseSupport();
}

DiskIoMgr::DiskIoMgr(int num_disks, int threads_per_disk, int min_buffer_size,
    int max_buffer_size) :
    num_threads_per_disk_(threads_per_disk),
    max_buffer_size_(max_buffer_size),
    min_buffer_size_(min_buffer_size),
    cached_read_options_(NULL),
    shut_down_(false),
    total_bytes_read_counter_(TCounterType::BYTES),
    read_timer_(TCounterType::TIME_NS) {
  int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
  free_buffers_.resize(BitUtil::Log2(max_buffer_size_scaled) + 1);
  if (num_disks == 0) num_disks = DiskInfo::num_disks();
  disk_queues_.resize(num_disks);
  CheckSseSupport();
}
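
// Worked example of the sizing above (not additional code): with the default flags,
// read_size = 8 * 1024 * 1024 and min_buffer_size = 1024, so
// max_buffer_size_scaled = 8192 and BitUtil::Log2(8192) = 13. free_buffers_ therefore
// holds 14 free lists, one per power-of-2 buffer size from 1 KB up to 8 MB.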

DiskIoMgr::~DiskIoMgr() {
  shut_down_ = true;
  // Notify all worker threads and shut them down.
  for (int i = 0; i < disk_queues_.size(); ++i) {
    if (disk_queues_[i] == NULL) continue;
    {
      // This lock is necessary to properly use the condition var to notify
      // the disk worker threads. The readers also grab this lock so updates
      // to shut_down_ are protected.
      unique_lock<mutex> disk_lock(disk_queues_[i]->lock);
    }
    disk_queues_[i]->work_available.notify_all();
  }
  disk_thread_group_.JoinAll();

  for (int i = 0; i < disk_queues_.size(); ++i) {
    if (disk_queues_[i] == NULL) continue;
    int disk_id = disk_queues_[i]->disk_id;
    for (list<ReaderContext*>::iterator it = disk_queues_[i]->readers.begin();
        it != disk_queues_[i]->readers.end(); ++it) {
      DCHECK_EQ((*it)->disk_states_[disk_id].num_threads_in_read(), 0);
      DCHECK((*it)->disk_states_[disk_id].done());
      (*it)->DecrementDiskRefCount();
    }
  }

  DCHECK(reader_cache_.get() == NULL || reader_cache_->ValidateAllInactive())
      << endl << DebugString();
  DCHECK_EQ(num_buffers_in_readers_, 0);

  // Delete all allocated buffers.
  int num_free_buffers = 0;
  for (int idx = 0; idx < free_buffers_.size(); ++idx) {
    num_free_buffers += free_buffers_[idx].size();
  }
  DCHECK_EQ(num_allocated_buffers_, num_free_buffers);
  GcIoBuffers();

  for (int i = 0; i < disk_queues_.size(); ++i) {
    delete disk_queues_[i];
  }
}

Status DiskIoMgr::Init(MemTracker* process_mem_tracker) {
  DCHECK(process_mem_tracker != NULL);
  process_mem_tracker_ = process_mem_tracker;
  // If we hit the process limit, see if we can reclaim some memory by removing
  // previously allocated (but unused) io buffers.
  process_mem_tracker->AddGcFunction(boost::bind(&DiskIoMgr::GcIoBuffers, this));

  for (int i = 0; i < disk_queues_.size(); ++i) {
    disk_queues_[i] = new DiskQueue(i);
    int num_threads_per_disk = num_threads_per_disk_;
    if (num_threads_per_disk == 0) {
      if (DiskInfo::is_rotational(i)) {
        num_threads_per_disk = THREADS_PER_ROTATIONAL_DISK;
      } else {
        num_threads_per_disk = THREADS_PER_FLASH_DISK;
      }
    }
    for (int j = 0; j < num_threads_per_disk; ++j) {
      stringstream ss;
      ss << "read-loop(Disk: " << i << ", Thread: " << j << ")";
      disk_thread_group_.AddThread(new Thread("disk-io-mgr", ss.str(),
          &DiskIoMgr::ReadLoop, this, disk_queues_[i]));
    }
  }
  reader_cache_.reset(new ReaderCache(this));
  return Status::OK;
}

Status DiskIoMgr::RegisterReader(hdfsFS hdfs, ReaderContext** reader,
    MemTracker* mem_tracker) {
  DCHECK(reader_cache_.get() != NULL) << "Must call Init() first.";
  *reader = reader_cache_->GetNewReader();
  (*reader)->Reset(hdfs, mem_tracker);
  return Status::OK;
}

void DiskIoMgr::WaitForDisksCompletion(ReaderContext* reader) {
  // First cancel the reader. This is more or less a no-op if the reader is
  // complete (common case).
  reader->Cancel(Status::CANCELLED);

  unique_lock<mutex> reader_lock(reader->lock_);
  DCHECK(reader->Validate()) << endl << reader->DebugString();
  while (reader->num_disks_with_ranges_ > 0) {
    reader->disks_complete_cond_var_.wait(reader_lock);
  }
}

void DiskIoMgr::UnregisterReader(ReaderContext* reader) {
  WaitForDisksCompletion(reader);
  // All the disks are done with cleanup; validate that nothing is leaking.
  unique_lock<mutex> reader_lock(reader->lock_);
  DCHECK_EQ(reader->num_buffers_in_reader_, 0) << endl << reader->DebugString();
  DCHECK_EQ(reader->num_used_buffers_, 0) << endl << reader->DebugString();
  DCHECK(reader->Validate()) << endl << reader->DebugString();
  reader_cache_->ReturnReader(reader);
}
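
// Typical caller lifecycle (an illustrative sketch using only the entry points in this
// file; 'io_mgr', 'hdfs_connection', 'mem_tracker' and 'ranges' are hypothetical names,
// and scan range construction/error handling are elided):
//
//   DiskIoMgr::ReaderContext* reader;
//   RETURN_IF_ERROR(io_mgr->RegisterReader(hdfs_connection, &reader, mem_tracker));
//   RETURN_IF_ERROR(io_mgr->AddScanRanges(reader, ranges, false));
//   ... consume ranges via GetNextRange() and ScanRange::GetNext() ...
//   io_mgr->UnregisterReader(reader);  // cancels if needed, waits for the disk threads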

// Cancellation requires coordination from multiple threads. Each thread that currently
// has a reference to the reader must notice the cancel and remove it from its tracking
// structures. The last thread to touch the reader should deallocate (aka recycle) the
// reader context object. Potential threads are:
//  1. Disk threads that are currently reading for this reader.
//  2. Caller threads that are waiting in GetNext.
//
// The steps are:
//  1. Cancel will immediately set the reader in the Cancelled state. This prevents any
//     other thread from adding more ready buffers to this reader (they all take a lock
//     and check the state before doing so).
//  2. Cancel will call Cancel on each ScanRange that is not yet complete, unblocking
//     any threads in GetNext(). The reader will see the cancelled Status returned.
//  3. Disk threads notice the reader is cancelled either when picking the next reader
//     to read for or when they try to enqueue a ready buffer. Upon noticing the
//     cancelled state, the disk thread removes the reader from the disk queue. The last
//     thread per disk with an outstanding reference to the reader decrements the number
//     of disk queues the reader is on.
void DiskIoMgr::CancelReader(ReaderContext* reader) {
  reader->Cancel(Status::CANCELLED);
}
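
// Caller-side view of the protocol above (illustrative sketch; 'io_mgr' and the thread
// split are hypothetical):
//
//   // Thread A (e.g. a coordinator noticing the query is being torn down):
//   io_mgr->CancelReader(reader);                 // reader enters the Cancelled state
//
//   // Thread B (a scanner blocked in GetNextRange() or ScanRange::GetNext()):
//   Status status = io_mgr->GetNextRange(reader, &range);
//   // status now carries CANCELLED; the scanner stops issuing ranges and returns
//   // any buffers it still holds.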

void DiskIoMgr::set_read_timer(ReaderContext* r, RuntimeProfile::Counter* c) {
  r->read_timer_ = c;
}

void DiskIoMgr::set_bytes_read_counter(ReaderContext* r, RuntimeProfile::Counter* c) {
  r->bytes_read_counter_ = c;
}

void DiskIoMgr::set_active_read_thread_counter(ReaderContext* r,
    RuntimeProfile::Counter* c) {
  r->active_read_thread_counter_ = c;
}

void DiskIoMgr::set_disks_access_bitmap(ReaderContext* r,
    RuntimeProfile::Counter* c) {
  r->disks_accessed_bitmap_ = c;
}

int64_t DiskIoMgr::queue_size(ReaderContext* reader) const {
  return reader->num_ready_buffers_;
}

Status DiskIoMgr::reader_status(ReaderContext* reader) const {
  unique_lock<mutex> reader_lock(reader->lock_);
  return reader->status_;
}

int DiskIoMgr::num_unstarted_ranges(ReaderContext* reader) const {
  return reader->num_unstarted_ranges_;
}

int64_t DiskIoMgr::bytes_read_local(ReaderContext* reader) const {
  return reader->bytes_read_local_;
}

int64_t DiskIoMgr::bytes_read_short_circuit(ReaderContext* reader) const {
  return reader->bytes_read_short_circuit_;
}

int64_t DiskIoMgr::bytes_read_dn_cache(ReaderContext* reader) const {
  return reader->bytes_read_dn_cache_;
}

int64_t DiskIoMgr::GetReadThroughput() {
  return RuntimeProfile::UnitsPerSecond(&total_bytes_read_counter_, &read_timer_);
}

Status DiskIoMgr::ValidateScanRange(ScanRange* range) {
  int disk_id = range->disk_id_;
  if (disk_id < 0 || disk_id >= disk_queues_.size()) {
    stringstream ss;
    ss << "Invalid scan range. Bad disk id: " << disk_id;
    DCHECK(false) << ss.str();
    return Status(ss.str());
  }
  return Status::OK;
}

Status DiskIoMgr::AddScanRanges(ReaderContext* reader,
    const vector<ScanRange*>& ranges, bool schedule_immediately) {
  if (ranges.empty()) return Status::OK;

  // Validate and initialize all ranges.
  for (int i = 0; i < ranges.size(); ++i) {
    RETURN_IF_ERROR(ValidateScanRange(ranges[i]));
    ranges[i]->InitInternal(this, reader);
  }

  // disks that this reader needs to be scheduled on.
  unique_lock<mutex> reader_lock(reader->lock_);
  DCHECK(reader->Validate()) << endl << reader->DebugString();

  if (reader->state_ == ReaderContext::Cancelled) {
    DCHECK(!reader->status_.ok());
    return reader->status_;
  }

  // Add each range to the queue of the disk the range is on.
  for (int i = 0; i < ranges.size(); ++i) {
    // Don't add empty ranges.
    DCHECK_NE(ranges[i]->len(), 0);
    ScanRange* range = ranges[i];
    if (range->try_cache_) {
      if (schedule_immediately) {
        bool cached_read_succeeded;
        RETURN_IF_ERROR(range->ReadFromCache(&cached_read_succeeded));
        if (cached_read_succeeded) continue;
        // Cached read failed, fall back to AddScanRange() below.
      } else {
        reader->cached_ranges_.Enqueue(range);
        continue;
      }
    }
    reader->AddScanRange(range, schedule_immediately);
  }
  DCHECK(reader->Validate()) << endl << reader->DebugString();
  return Status::OK;
}

// This function returns the next scan range the reader should work on, checking
// for eos and error cases. If there isn't already a cached scan range or a scan
// range prepared by the disk threads, the caller waits on the disk threads.
Status DiskIoMgr::GetNextRange(ReaderContext* reader, ScanRange** range) {
  DCHECK(reader != NULL);
  DCHECK(range != NULL);
  *range = NULL;
  Status status;

  unique_lock<mutex> reader_lock(reader->lock_);
  DCHECK(reader->Validate()) << endl << reader->DebugString();

  while (true) {
    if (reader->state_ == ReaderContext::Cancelled) {
      DCHECK(!reader->status_.ok());
      status = reader->status_;
      break;
    }

    if (reader->num_unstarted_ranges_ == 0 && reader->ready_to_start_ranges_.empty() &&
        reader->cached_ranges_.empty()) {
      // All ranges are done, just return.
      break;
    }

    if (!reader->cached_ranges_.empty()) {
      // We have a cached range.
      *range = reader->cached_ranges_.Dequeue();
      DCHECK((*range)->try_cache_);
      bool cached_read_succeeded;
      RETURN_IF_ERROR((*range)->ReadFromCache(&cached_read_succeeded));
      if (cached_read_succeeded) return Status::OK;

      // This range ended up not being cached. Loop again and pick up a new range.
      reader->AddScanRange(*range, false);
      DCHECK(reader->Validate()) << endl << reader->DebugString();
      *range = NULL;
      continue;
    }

    if (reader->ready_to_start_ranges_.empty()) {
      reader->ready_to_start_ranges_cv_.wait(reader_lock);
    } else {
      *range = reader->ready_to_start_ranges_.Dequeue();
      DCHECK(*range != NULL);
      int disk_id = (*range)->disk_id();
      DCHECK(*range == reader->disk_states_[disk_id].next_range_to_start());
      // Set this to NULL, the next time this disk runs for this reader, it will
      // get another range ready.
      reader->disk_states_[disk_id].set_next_range_to_start(NULL);
      reader->ScheduleScanRange(*range);
      break;
    }
  }
  return status;
}
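
// Typical consumption loop built on GetNextRange (an illustrative sketch; how the
// buffer contents are processed is up to the caller):
//
//   while (true) {
//     DiskIoMgr::ScanRange* range;
//     RETURN_IF_ERROR(io_mgr->GetNextRange(reader, &range));
//     if (range == NULL) break;              // all ranges are done
//     while (true) {
//       DiskIoMgr::BufferDescriptor* buffer;
//       RETURN_IF_ERROR(range->GetNext(&buffer));
//       ... process the bytes described by the buffer ...
//       bool eosr = buffer->eosr();
//       buffer->Return();                    // hand the io buffer back to the IoMgr
//       if (eosr) break;                     // end of this scan range
//     }
//   }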

Status DiskIoMgr::Read(ReaderContext* reader,
    ScanRange* range, BufferDescriptor** buffer) {
  DCHECK(range != NULL);
  DCHECK(buffer != NULL);
  *buffer = NULL;

  if (range->len() > max_buffer_size_) {
    stringstream ss;
    ss << "Cannot perform sync read larger than " << max_buffer_size_
       << ". Request was " << range->len();
    return Status(ss.str());
  }

  vector<DiskIoMgr::ScanRange*> ranges;
  ranges.push_back(range);
  RETURN_IF_ERROR(AddScanRanges(reader, ranges, true));
  RETURN_IF_ERROR(range->GetNext(buffer));
  DCHECK((*buffer) != NULL);
  DCHECK((*buffer)->eosr());
  return Status::OK;
}
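
// Sketch of the synchronous path above: the range is added with
// schedule_immediately = true and the single resulting buffer (which must be eosr)
// is handed back to the caller. Illustrative only; 'small_range' is a hypothetical
// ScanRange no larger than max_buffer_size_:
//
//   DiskIoMgr::BufferDescriptor* buffer;
//   RETURN_IF_ERROR(io_mgr->Read(reader, small_range, &buffer));
//   ... use the buffer's contents ...
//   buffer->Return();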

void DiskIoMgr::ReturnBuffer(BufferDescriptor* buffer_desc) {
  DCHECK(buffer_desc != NULL);
  if (!buffer_desc->status_.ok()) DCHECK(buffer_desc->buffer_ == NULL);

  // A NULL buffer means there was an error or the buffer desc is from a cached
  // read. In either of those cases, we only need to return the descriptor object.
  if (buffer_desc->scan_range_->cached_buffer_ != NULL) {
    buffer_desc->buffer_ = NULL;
    // Returning the cached buffer means we're done with the scan range. It is
    // safe to close it (this is the only buffer that will be used for this range).
    buffer_desc->scan_range_->Close();
  }

  if (buffer_desc->buffer_ == NULL) {
    ReturnBufferDesc(buffer_desc);
    return;
  }

  ReaderContext* reader = buffer_desc->reader_;
  ReturnFreeBuffer(buffer_desc->buffer_, buffer_desc->buffer_len_);
  buffer_desc->SetMemTracker(NULL);
  buffer_desc->buffer_ = NULL;
  ReturnBufferDesc(buffer_desc);

  --num_buffers_in_readers_;
  --reader->num_buffers_in_reader_;
}

void DiskIoMgr::ReturnBufferDesc(BufferDescriptor* desc) {
  DCHECK(desc != NULL);
  unique_lock<mutex> lock(free_buffers_lock_);
  DCHECK(find(free_buffer_descs_.begin(), free_buffer_descs_.end(), desc)
      == free_buffer_descs_.end());
  free_buffer_descs_.push_back(desc);
}

DiskIoMgr::BufferDescriptor* DiskIoMgr::GetBufferDesc(
    ReaderContext* reader, ScanRange* range, char* buffer, int64_t buffer_size) {
  BufferDescriptor* buffer_desc;
  {
    unique_lock<mutex> lock(free_buffers_lock_);
    if (free_buffer_descs_.empty()) {
      buffer_desc = pool_.Add(new BufferDescriptor(this));
    } else {
      buffer_desc = free_buffer_descs_.front();
      free_buffer_descs_.pop_front();
    }
  }
  buffer_desc->Reset(reader, range, buffer, buffer_size);
  buffer_desc->SetMemTracker(reader->mem_tracker_);
  return buffer_desc;
}

char* DiskIoMgr::GetFreeBuffer(int64_t* buffer_size) {
  DCHECK_LE(*buffer_size, max_buffer_size_);
  DCHECK_GT(*buffer_size, 0);
  *buffer_size = min(static_cast<int64_t>(max_buffer_size_), *buffer_size);
  int idx = free_buffers_idx(*buffer_size);
  // Quantize the requested size up to the nearest power-of-2 multiple of
  // min_buffer_size_ and convert back to bytes.
  *buffer_size = (1 << idx) * min_buffer_size_;

  unique_lock<mutex> lock(free_buffers_lock_);
  char* buffer = NULL;
  if (free_buffers_[idx].empty()) {
    ++num_allocated_buffers_;
    if (ImpaladMetrics::IO_MGR_NUM_BUFFERS != NULL) {
      ImpaladMetrics::IO_MGR_NUM_BUFFERS->Increment(1L);
    }
    if (ImpaladMetrics::IO_MGR_TOTAL_BYTES != NULL) {
      ImpaladMetrics::IO_MGR_TOTAL_BYTES->Increment(*buffer_size);
    }
    // Update the process mem usage. This is checked the next time we start
    // a read for the next reader (DiskIoMgr::GetNextScanRange).
    process_mem_tracker_->Consume(*buffer_size);
    buffer = new char[*buffer_size];
  } else {
    if (ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS != NULL) {
      ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS->Increment(-1L);
    }
    buffer = free_buffers_[idx].front();
    free_buffers_[idx].pop_front();
  }
  DCHECK(buffer != NULL);
  return buffer;
}
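
// Worked example of the quantization above (illustrative; assumes the default
// min_buffer_size_ = 1024 and that BitUtil::Log2 rounds up, which the quantization
// comment implies): a request for 5000 bytes gives BitUtil::Ceil(5000, 1024) = 5 and
// BitUtil::Log2(5) = 3, so idx = 3 and the caller receives an 8192-byte buffer.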

void DiskIoMgr::GcIoBuffers() {
  unique_lock<mutex> lock(free_buffers_lock_);
  int buffers_freed = 0;
  int bytes_freed = 0;
  for (int idx = 0; idx < free_buffers_.size(); ++idx) {
    for (list<char*>::iterator iter = free_buffers_[idx].begin();
        iter != free_buffers_[idx].end(); ++iter) {
      int64_t buffer_size = (1 << idx) * min_buffer_size_;
      process_mem_tracker_->Release(buffer_size);
      --num_allocated_buffers_;
      delete[] *iter;
      ++buffers_freed;
      bytes_freed += buffer_size;
    }
    free_buffers_[idx].clear();
  }
  if (ImpaladMetrics::IO_MGR_NUM_BUFFERS != NULL) {
    ImpaladMetrics::IO_MGR_NUM_BUFFERS->Increment(-buffers_freed);
  }
  if (ImpaladMetrics::IO_MGR_TOTAL_BYTES != NULL) {
    ImpaladMetrics::IO_MGR_TOTAL_BYTES->Increment(-bytes_freed);
  }
  if (ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS != NULL) {
    ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS->Update(0);
  }
}

void DiskIoMgr::ReturnFreeBuffer(char* buffer, int64_t buffer_size) {
  DCHECK(buffer != NULL);
  int idx = free_buffers_idx(buffer_size);
  DCHECK_EQ(BitUtil::Ceil(buffer_size, min_buffer_size_) & ~(1 << idx), 0)
      << "buffer_size_ / min_buffer_size_ should be power of 2, got buffer_size = "
      << buffer_size << ", min_buffer_size_ = " << min_buffer_size_;
  if (FLAGS_reuse_io_buffers) {
    unique_lock<mutex> lock(free_buffers_lock_);
    free_buffers_[idx].push_back(buffer);
  } else {
    process_mem_tracker_->Release(buffer_size);
    --num_allocated_buffers_;
    delete[] buffer;
  }
  if (ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS != NULL) {
    ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS->Increment(1L);
  }
}

// This function gets the next scan range to work on.
//  - Wait until there is a reader with work and an available buffer, or the thread
//    should terminate.
//  - Remove the scan range and available buffer, and cycle to the next reader.
// There are a few guarantees this makes which cause some complications.
//  1) Readers are round-robined.
//  2) Multiple threads (including per disk) can work on the same reader.
//  3) Scan ranges within a reader are round-robined.
bool DiskIoMgr::GetNextScanRange(DiskQueue* disk_queue, ScanRange** range,
    ReaderContext** reader) {
  int disk_id = disk_queue->disk_id;
  *range = NULL;

  // This loop returns either with work to do or when the disk IoMgr shuts down.
  while (true) {
    *reader = NULL;
    ReaderContext::PerDiskState* reader_disk_state = NULL;
    {
      unique_lock<mutex> disk_lock(disk_queue->lock);

      while (!shut_down_ && disk_queue->readers.empty()) {
        // Wait if there are no readers on the queue.
        disk_queue->work_available.wait(disk_lock);
      }
      if (shut_down_) break;
      DCHECK(!disk_queue->readers.empty());

      // Get the next reader and remove the reader so that another disk thread
      // can't pick it up. It will be enqueued before issuing the read to HDFS
      // so this is not a big deal (i.e. multiple disk threads can read for the
      // same reader).
      // TODO: revisit.
      *reader = disk_queue->readers.front();
      disk_queue->readers.pop_front();
      DCHECK(*reader != NULL);
      reader_disk_state = &((*reader)->disk_states_[disk_id]);
      reader_disk_state->IncrementReadThreadAndDequeue();
    }

    // NOTE: no locks were taken in between. We need to be careful about what state
    // could have changed for the reader and disk in between.
    // There are some invariants here. Only one disk thread can have the
    // same reader here (the reader is removed from the queue). There can be
    // other disk threads operating on this reader in other functions though.

    // We just picked a reader, check the mem limits.
    // TODO: we can do a lot better here. The reader can likely make progress
    // with fewer io buffers.
    bool process_limit_exceeded = process_mem_tracker_->LimitExceeded();
    bool reader_limit_exceeded = (*reader)->mem_tracker_ != NULL
        ? (*reader)->mem_tracker_->AnyLimitExceeded() : false;

    if (process_limit_exceeded || reader_limit_exceeded) {
      (*reader)->Cancel(Status::MEM_LIMIT_EXCEEDED);
    }

    unique_lock<mutex> reader_lock((*reader)->lock_);
    VLOG_FILE << "Disk (id=" << disk_id << ") reading for " << (*reader)->DebugString();

    // Check if the reader has been cancelled.
    if ((*reader)->state_ == ReaderContext::Cancelled) {
      reader_disk_state->DecrementReadThreadAndCheckDone(*reader);
      continue;
    }
    DCHECK_EQ((*reader)->state_, ReaderContext::Active) << (*reader)->DebugString();

    if (reader_disk_state->next_range_to_start() == NULL &&
        !reader_disk_state->unstarted_ranges()->empty()) {
      // We don't have a range queued for this disk for what the caller should
      // read next. Populate that. We want to have one range waiting to minimize
      // wait time in GetNextRange.
      ScanRange* new_range = reader_disk_state->unstarted_ranges()->Dequeue();
      --(*reader)->num_unstarted_ranges_;
      (*reader)->ready_to_start_ranges_.Enqueue(new_range);
      reader_disk_state->set_next_range_to_start(new_range);

      if ((*reader)->num_unstarted_ranges_ == 0) {
        // All the ranges have been started, notify everyone blocked on GetNextRange.
        // Only one of them will get work so make sure to return NULL to the other
        // caller threads.
        (*reader)->ready_to_start_ranges_cv_.notify_all();
      } else {
        (*reader)->ready_to_start_ranges_cv_.notify_one();
      }
    }

    // Get the next scan range to work on from the reader. Only in_flight_ranges
    // are eligible since the disk threads do not start new ranges on their own.
    // If there are no in-flight ranges, there is nothing to do.
    if (reader_disk_state->in_flight_ranges()->empty()) {
      reader_disk_state->DecrementReadThread();
      continue;
    }
    DCHECK_GT(reader_disk_state->num_remaining_ranges(), 0);
    *range = reader_disk_state->in_flight_ranges()->Dequeue();
    DCHECK(*range != NULL);
    DCHECK_LT((*range)->bytes_read_, (*range)->len_);

    // Now that we've picked a scan range, put the reader back on the queue so
    // another thread can pick up another scan range for this reader.
    reader_disk_state->ScheduleReader(*reader, disk_id);
    DCHECK((*reader)->Validate()) << endl << (*reader)->DebugString();
    return true;
  }

  DCHECK(shut_down_);
  return false;
}

void DiskIoMgr::HandleReadFinished(DiskQueue* disk_queue, ReaderContext* reader,
    BufferDescriptor* buffer) {
  unique_lock<mutex> reader_lock(reader->lock_);

  ReaderContext::PerDiskState& state = reader->disk_states_[disk_queue->disk_id];
  DCHECK(reader->Validate()) << endl << reader->DebugString();
  DCHECK_GT(state.num_threads_in_read(), 0);
  DCHECK(buffer->buffer_ != NULL);

  if (reader->state_ == ReaderContext::Cancelled) {
    state.DecrementReadThreadAndCheckDone(reader);
    DCHECK(reader->Validate()) << endl << reader->DebugString();
    ReturnFreeBuffer(buffer->buffer_, buffer->buffer_len_);
    buffer->SetMemTracker(NULL);
    buffer->buffer_ = NULL;
    buffer->scan_range_->Cancel(reader->status_);
    // Enqueue the buffer to use the scan range's buffer cleanup path.
    buffer->scan_range_->EnqueueBuffer(buffer);
    return;
  }

  DCHECK_EQ(reader->state_, ReaderContext::Active);
  DCHECK(buffer->buffer_ != NULL);

  // Update the reader's scan ranges. There are three cases here:
  //  1. Read error
  //  2. End of scan range
  //  3. Middle of scan range
  if (!buffer->status_.ok()) {
    // Error case.
    ReturnFreeBuffer(buffer->buffer_, buffer->buffer_len_);
    buffer->SetMemTracker(NULL);
    buffer->buffer_ = NULL;
    buffer->eosr_ = true;
    --state.num_remaining_ranges();
    buffer->scan_range_->Cancel(buffer->status_);
  } else if (buffer->eosr_) {
    buffer->scan_range_->Close();
    --state.num_remaining_ranges();
  }

  bool queue_full = buffer->scan_range_->EnqueueBuffer(buffer);
  if (!buffer->eosr_) {
    if (queue_full) {
      reader->blocked_ranges_.Enqueue(buffer->scan_range_);
    } else {
      reader->ScheduleScanRange(buffer->scan_range_);
    }
  }
  state.DecrementReadThread();
}

// The thread waits until there is work or the entire system is being shut down.
// If there is work, it reads the next chunk of the next scan range for the first
// reader in the queue and round robins across the readers.
// Locks are not taken when reading from disk. The main loop has three parts:
//  1. GetNextScanRange(): Take locks and figure out what the next scan range to read is.
//  2. Open/Read the scan range. No locks are taken.
//  3. HandleReadFinished(): Take locks and update the disk and reader with the
//     results of the io.
// Cancellation checking needs to happen in both steps 1 and 3.
void DiskIoMgr::ReadLoop(DiskQueue* disk_queue) {
  int64_t disk_bit = 1 << disk_queue->disk_id;
  while (true) {
    char* buffer = NULL;
    ReaderContext* reader = NULL;
    ScanRange* range = NULL;

    // Get the next scan range to read.
    if (!GetNextScanRange(disk_queue, &range, &reader)) {
      DCHECK(shut_down_);
      break;
    }

    int64_t bytes_remaining = range->len_ - range->bytes_read_;
    int64_t buffer_size = ::min(bytes_remaining, static_cast<int64_t>(max_buffer_size_));

    bool enough_memory = true;
    if (reader->mem_tracker_ != NULL) {
      enough_memory = reader->mem_tracker_->SpareCapacity() > LOW_MEMORY;
      if (!enough_memory) {
        // Low memory, GC and try again.
        GcIoBuffers();
        enough_memory = reader->mem_tracker_->SpareCapacity() > LOW_MEMORY;
      }
    }

    if (!enough_memory) {
      ReaderContext::PerDiskState& state = reader->disk_states_[disk_queue->disk_id];
      unique_lock<mutex> reader_lock(reader->lock_);
      if (!range->ready_buffers_.empty()) {
        // We have memory pressure and this range doesn't need another buffer
        // (it already has one queued). Skip this range and pick it up later.
        range->blocked_on_queue_ = true;
        reader->blocked_ranges_.Enqueue(range);
        state.DecrementReadThread();
        continue;
      } else {
        // We need to get a buffer anyway since there are none queued. The query
        // is likely to fail due to mem limits but there's nothing we can do about
        // that now.
      }
    }

    buffer = GetFreeBuffer(&buffer_size);
    ++reader->num_used_buffers_;

    // Validate more invariants.
    DCHECK_GT(reader->num_used_buffers_, 0);
    DCHECK(range != NULL);
    DCHECK(reader != NULL);
    DCHECK(buffer != NULL);

    BufferDescriptor* buffer_desc = GetBufferDesc(reader, range, buffer, buffer_size);
    DCHECK(buffer_desc != NULL);

    // No locks in this section. Only working on local vars. We don't want to hold a
    // lock across the read call.
    buffer_desc->status_ = range->Open();
    if (buffer_desc->status_.ok()) {
      // Update counters.
      if (reader->active_read_thread_counter_) {
        reader->active_read_thread_counter_->Update(1L);
      }
      if (reader->disks_accessed_bitmap_) {
        reader->disks_accessed_bitmap_->BitOr(disk_bit);
      }
      SCOPED_TIMER(&read_timer_);
      SCOPED_TIMER(reader->read_timer_);

      buffer_desc->status_ = range->Read(buffer, &buffer_desc->len_, &buffer_desc->eosr_);
      buffer_desc->scan_range_offset_ = range->bytes_read_ - buffer_desc->len_;

      if (reader->bytes_read_counter_ != NULL) {
        COUNTER_UPDATE(reader->bytes_read_counter_, buffer_desc->len_);
      }
      COUNTER_UPDATE(&total_bytes_read_counter_, buffer_desc->len_);
      if (reader->active_read_thread_counter_) {
        reader->active_read_thread_counter_->Update(-1L);
      }
    }

    // Finished the read, update the reader/disk based on the results.
    HandleReadFinished(disk_queue, reader, buffer_desc);
  }

  DCHECK(shut_down_);
}

int DiskIoMgr::free_buffers_idx(int64_t buffer_size) {
  int64_t buffer_size_scaled = BitUtil::Ceil(buffer_size, min_buffer_size_);
  int idx = BitUtil::Log2(buffer_size_scaled);
  DCHECK_GE(idx, 0);
  DCHECK_LT(idx, free_buffers_.size());
  return idx;
}
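
// Resulting size-class mapping with the default min_buffer_size_ = 1024 (illustrative):
//   buffer_size 1024          -> idx 0
//   buffer_size 2048          -> idx 1
//   buffer_size 4096          -> idx 2
//   ...
//   buffer_size 8 * 1024 * 1024 -> idx 13 (the last free list with the default read_size)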