PageRenderTime 53ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/deps/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h

https://gitlab.com/barrel-db/erlang-rocksdb
C Header | 327 lines | 219 code | 43 blank | 65 comment | 13 complexity | 6b76cb13f4b09514892529da6927e886 MD5 | raw file
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #pragma once
  6. #ifndef ROCKSDB_LITE
  7. #include <limits>
  8. #include <string>
  9. #include <vector>
  10. #include "db/merge_context.h"
  11. #include "memtable/skiplist.h"
  12. #include "options/db_options.h"
  13. #include "port/port.h"
  14. #include "rocksdb/comparator.h"
  15. #include "rocksdb/iterator.h"
  16. #include "rocksdb/slice.h"
  17. #include "rocksdb/status.h"
  18. #include "rocksdb/utilities/write_batch_with_index.h"
  19. namespace ROCKSDB_NAMESPACE {
  20. class MergeContext;
  21. class WBWIIteratorImpl;
  22. class WriteBatchWithIndexInternal;
  23. struct Options;
  24. // when direction == forward
  25. // * current_at_base_ <=> base_iterator > delta_iterator
  26. // when direction == backwards
  27. // * current_at_base_ <=> base_iterator < delta_iterator
  28. // always:
  29. // * equal_keys_ <=> base_iterator == delta_iterator
  30. class BaseDeltaIterator : public Iterator {
  31. public:
  32. BaseDeltaIterator(ColumnFamilyHandle* column_family, Iterator* base_iterator,
  33. WBWIIteratorImpl* delta_iterator,
  34. const Comparator* comparator,
  35. const ReadOptions* read_options = nullptr);
  36. ~BaseDeltaIterator() override {}
  37. bool Valid() const override;
  38. void SeekToFirst() override;
  39. void SeekToLast() override;
  40. void Seek(const Slice& k) override;
  41. void SeekForPrev(const Slice& k) override;
  42. void Next() override;
  43. void Prev() override;
  44. Slice key() const override;
  45. Slice value() const override;
  46. Status status() const override;
  47. void Invalidate(Status s);
  48. private:
  49. void AssertInvariants();
  50. void Advance();
  51. void AdvanceDelta();
  52. void AdvanceBase();
  53. bool BaseValid() const;
  54. bool DeltaValid() const;
  55. void UpdateCurrent();
  56. std::unique_ptr<WriteBatchWithIndexInternal> wbwii_;
  57. bool forward_;
  58. bool current_at_base_;
  59. bool equal_keys_;
  60. mutable Status status_;
  61. std::unique_ptr<Iterator> base_iterator_;
  62. std::unique_ptr<WBWIIteratorImpl> delta_iterator_;
  63. const Comparator* comparator_; // not owned
  64. const Slice* iterate_upper_bound_;
  65. mutable PinnableSlice merge_result_;
  66. };
  67. // Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
  68. struct WriteBatchIndexEntry {
  69. WriteBatchIndexEntry(size_t o, uint32_t c, size_t ko, size_t ksz)
  70. : offset(o),
  71. column_family(c),
  72. key_offset(ko),
  73. key_size(ksz),
  74. search_key(nullptr) {}
  75. // Create a dummy entry as the search key. This index entry won't be backed
  76. // by an entry from the write batch, but a pointer to the search key. Or a
  77. // special flag of offset can indicate we are seek to first.
  78. // @_search_key: the search key
  79. // @_column_family: column family
  80. // @is_forward_direction: true for Seek(). False for SeekForPrev()
  81. // @is_seek_to_first: true if we seek to the beginning of the column family
  82. // _search_key should be null in this case.
  83. WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
  84. bool is_forward_direction, bool is_seek_to_first)
  85. // For SeekForPrev(), we need to make the dummy entry larger than any
  86. // entry who has the same search key. Otherwise, we'll miss those entries.
  87. : offset(is_forward_direction ? 0 : port::kMaxSizet),
  88. column_family(_column_family),
  89. key_offset(0),
  90. key_size(is_seek_to_first ? kFlagMinInCf : 0),
  91. search_key(_search_key) {
  92. assert(_search_key != nullptr || is_seek_to_first);
  93. }
  94. // If this flag appears in the key_size, it indicates a
  95. // key that is smaller than any other entry for the same column family.
  96. static const size_t kFlagMinInCf = port::kMaxSizet;
  97. bool is_min_in_cf() const {
  98. assert(key_size != kFlagMinInCf ||
  99. (key_offset == 0 && search_key == nullptr));
  100. return key_size == kFlagMinInCf;
  101. }
  102. // offset of an entry in write batch's string buffer. If this is a dummy
  103. // lookup key, in which case search_key != nullptr, offset is set to either
  104. // 0 or max, only for comparison purpose. Because when entries have the same
  105. // key, the entry with larger offset is larger, offset = 0 will make a seek
  106. // key small or equal than all the entries with the seek key, so that Seek()
  107. // will find all the entries of the same key. Similarly, offset = MAX will
  108. // make the entry just larger than all entries with the search key so
  109. // SeekForPrev() will see all the keys with the same key.
  110. size_t offset;
  111. uint32_t column_family; // c1olumn family of the entry.
  112. size_t key_offset; // offset of the key in write batch's string buffer.
  113. size_t key_size; // size of the key. kFlagMinInCf indicates
  114. // that this is a dummy look up entry for
  115. // SeekToFirst() to the beginning of the column
  116. // family. We use the flag here to save a boolean
  117. // in the struct.
  118. const Slice* search_key; // if not null, instead of reading keys from
  119. // write batch, use it to compare. This is used
  120. // for lookup key.
  121. };
  122. class ReadableWriteBatch : public WriteBatch {
  123. public:
  124. explicit ReadableWriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0)
  125. : WriteBatch(reserved_bytes, max_bytes) {}
  126. // Retrieve some information from a write entry in the write batch, given
  127. // the start offset of the write entry.
  128. Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
  129. Slice* value, Slice* blob, Slice* xid) const;
  130. };
  131. class WriteBatchEntryComparator {
  132. public:
  133. WriteBatchEntryComparator(const Comparator* _default_comparator,
  134. const ReadableWriteBatch* write_batch)
  135. : default_comparator_(_default_comparator), write_batch_(write_batch) {}
  136. // Compare a and b. Return a negative value if a is less than b, 0 if they
  137. // are equal, and a positive value if a is greater than b
  138. int operator()(const WriteBatchIndexEntry* entry1,
  139. const WriteBatchIndexEntry* entry2) const;
  140. int CompareKey(uint32_t column_family, const Slice& key1,
  141. const Slice& key2) const;
  142. void SetComparatorForCF(uint32_t column_family_id,
  143. const Comparator* comparator) {
  144. if (column_family_id >= cf_comparators_.size()) {
  145. cf_comparators_.resize(column_family_id + 1, nullptr);
  146. }
  147. cf_comparators_[column_family_id] = comparator;
  148. }
  149. const Comparator* default_comparator() { return default_comparator_; }
  150. private:
  151. const Comparator* default_comparator_;
  152. std::vector<const Comparator*> cf_comparators_;
  153. const ReadableWriteBatch* write_batch_;
  154. };
  155. using WriteBatchEntrySkipList =
  156. SkipList<WriteBatchIndexEntry*, const WriteBatchEntryComparator&>;
  157. class WBWIIteratorImpl : public WBWIIterator {
  158. public:
  159. enum Result { kFound, kDeleted, kNotFound, kMergeInProgress, kError };
  160. WBWIIteratorImpl(uint32_t column_family_id,
  161. WriteBatchEntrySkipList* skip_list,
  162. const ReadableWriteBatch* write_batch,
  163. WriteBatchEntryComparator* comparator)
  164. : column_family_id_(column_family_id),
  165. skip_list_iter_(skip_list),
  166. write_batch_(write_batch),
  167. comparator_(comparator) {}
  168. ~WBWIIteratorImpl() override {}
  169. bool Valid() const override {
  170. if (!skip_list_iter_.Valid()) {
  171. return false;
  172. }
  173. const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
  174. return (iter_entry != nullptr &&
  175. iter_entry->column_family == column_family_id_);
  176. }
  177. void SeekToFirst() override {
  178. WriteBatchIndexEntry search_entry(
  179. nullptr /* search_key */, column_family_id_,
  180. true /* is_forward_direction */, true /* is_seek_to_first */);
  181. skip_list_iter_.Seek(&search_entry);
  182. }
  183. void SeekToLast() override {
  184. WriteBatchIndexEntry search_entry(
  185. nullptr /* search_key */, column_family_id_ + 1,
  186. true /* is_forward_direction */, true /* is_seek_to_first */);
  187. skip_list_iter_.Seek(&search_entry);
  188. if (!skip_list_iter_.Valid()) {
  189. skip_list_iter_.SeekToLast();
  190. } else {
  191. skip_list_iter_.Prev();
  192. }
  193. }
  194. void Seek(const Slice& key) override {
  195. WriteBatchIndexEntry search_entry(&key, column_family_id_,
  196. true /* is_forward_direction */,
  197. false /* is_seek_to_first */);
  198. skip_list_iter_.Seek(&search_entry);
  199. }
  200. void SeekForPrev(const Slice& key) override {
  201. WriteBatchIndexEntry search_entry(&key, column_family_id_,
  202. false /* is_forward_direction */,
  203. false /* is_seek_to_first */);
  204. skip_list_iter_.SeekForPrev(&search_entry);
  205. }
  206. void Next() override { skip_list_iter_.Next(); }
  207. void Prev() override { skip_list_iter_.Prev(); }
  208. WriteEntry Entry() const override;
  209. Status status() const override {
  210. // this is in-memory data structure, so the only way status can be non-ok is
  211. // through memory corruption
  212. return Status::OK();
  213. }
  214. const WriteBatchIndexEntry* GetRawEntry() const {
  215. return skip_list_iter_.key();
  216. }
  217. bool MatchesKey(uint32_t cf_id, const Slice& key);
  218. // Moves the to first entry of the previous key.
  219. void PrevKey();
  220. // Moves the to first entry of the next key.
  221. void NextKey();
  222. // Moves the iterator to the Update (Put or Delete) for the current key
  223. // If there are no Put/Delete, the Iterator will point to the first entry for
  224. // this key
  225. // @return kFound if a Put was found for the key
  226. // @return kDeleted if a delete was found for the key
  227. // @return kMergeInProgress if only merges were fouund for the key
  228. // @return kError if an unsupported operation was found for the key
  229. // @return kNotFound if no operations were found for this key
  230. //
  231. Result FindLatestUpdate(const Slice& key, MergeContext* merge_context);
  232. Result FindLatestUpdate(MergeContext* merge_context);
  233. protected:
  234. void AdvanceKey(bool forward);
  235. private:
  236. uint32_t column_family_id_;
  237. WriteBatchEntrySkipList::Iterator skip_list_iter_;
  238. const ReadableWriteBatch* write_batch_;
  239. WriteBatchEntryComparator* comparator_;
  240. };
  241. class WriteBatchWithIndexInternal {
  242. public:
  243. // For GetFromBatchAndDB or similar
  244. explicit WriteBatchWithIndexInternal(DB* db,
  245. ColumnFamilyHandle* column_family);
  246. // For GetFromBatchAndDB or similar
  247. explicit WriteBatchWithIndexInternal(ColumnFamilyHandle* column_family);
  248. // For GetFromBatch or similar
  249. explicit WriteBatchWithIndexInternal(const DBOptions* db_options,
  250. ColumnFamilyHandle* column_family);
  251. // If batch contains a value for key, store it in *value and return kFound.
  252. // If batch contains a deletion for key, return Deleted.
  253. // If batch contains Merge operations as the most recent entry for a key,
  254. // and the merge process does not stop (not reaching a value or delete),
  255. // prepend the current merge operands to *operands,
  256. // and return kMergeInProgress
  257. // If batch does not contain this key, return kNotFound
  258. // Else, return kError on error with error Status stored in *s.
  259. WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch,
  260. const Slice& key, std::string* value,
  261. Status* s) {
  262. return GetFromBatch(batch, key, &merge_context_, value, s);
  263. }
  264. WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch,
  265. const Slice& key,
  266. MergeContext* merge_context,
  267. std::string* value, Status* s);
  268. Status MergeKey(const Slice& key, const Slice* value,
  269. std::string* result) const {
  270. return MergeKey(key, value, merge_context_, result);
  271. }
  272. Status MergeKey(const Slice& key, const Slice* value,
  273. const MergeContext& context, std::string* result) const;
  274. size_t GetNumOperands() const { return merge_context_.GetNumOperands(); }
  275. MergeContext* GetMergeContext() { return &merge_context_; }
  276. Slice GetOperand(int index) const { return merge_context_.GetOperand(index); }
  277. private:
  278. DB* db_;
  279. const DBOptions* db_options_;
  280. ColumnFamilyHandle* column_family_;
  281. MergeContext merge_context_;
  282. };
  283. } // namespace ROCKSDB_NAMESPACE
  284. #endif // !ROCKSDB_LITE