PageRenderTime 21ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/bloom_filter.hpp

http://libbloom.codeplex.com
C++ Header | 543 lines | 418 code | 67 blank | 58 comment | 49 complexity | 4a94ad8deb2a7ead31622cf329f2513f MD5 | raw file
  1. /*
  2. *********************************************************************
  3. * *
  4. * Open Bloom Filter *
  5. * *
  6. * Author: Arash Partow - 2000 *
  7. * URL: http://www.partow.net *
  8. * URL: http://www.partow.net/programming/hashfunctions/index.html *
  9. * *
  10. * Copyright notice: *
  11. * Free use of the Open Bloom Filter Library is permitted under the *
  12. * guidelines and in accordance with the most current version of the *
  13. * Common Public License. *
  14. * http://www.opensource.org/licenses/cpl1.0.php *
  15. * *
  16. *********************************************************************
  17. */
  18. #ifndef INCLUDE_BLOOM_FILTER_HPP
  19. #define INCLUDE_BLOOM_FILTER_HPP
  #include <algorithm>
  #include <cmath>
  #include <cstddef>
  #include <cstdlib>
  #include <iterator>
  #include <limits>
  #include <string>
  #include <vector>
  // Number of bits held by one table cell (one unsigned char).
  static const std::size_t bits_per_char = 0x08;

  // bit_mask[i] has only bit i set; indexed by (bit position % bits_per_char).
  static const unsigned char bit_mask[bits_per_char] = {
    1 << 0, // 00000001
    1 << 1, // 00000010
    1 << 2, // 00000100
    1 << 3, // 00001000
    1 << 4, // 00010000
    1 << 5, // 00100000
    1 << 6, // 01000000
    1 << 7  // 10000000
  };
  37. class bloom_filter
  38. {
  39. protected:
  40. typedef unsigned int bloom_type;
  41. typedef unsigned char cell_type;
  42. public:
  43. bloom_filter(const std::size_t& predicted_inserted_element_count,
  44. const double& false_positive_probability,
  45. const std::size_t& random_seed)
  46. : bit_table_(0),
  47. predicted_inserted_element_count_(predicted_inserted_element_count),
  48. inserted_element_count_(0),
  49. random_seed_((random_seed) ? random_seed : 0xA5A5A5A5),
  50. desired_false_positive_probability_(false_positive_probability)
  51. {
  52. find_optimal_parameters();
  53. generate_unique_salt();
  54. raw_table_size_ = table_size_ / bits_per_char;
  55. bit_table_ = new cell_type[raw_table_size_];
  56. std::fill_n(bit_table_,raw_table_size_,0x00);
  57. }
  58. bloom_filter(const bloom_filter& filter)
  59. {
  60. this->operator=(filter);
  61. }
  62. bloom_filter& operator = (const bloom_filter& filter)
  63. {
  64. salt_count_ = filter.salt_count_;
  65. table_size_ = filter.table_size_;
  66. raw_table_size_ = filter.raw_table_size_;
  67. predicted_inserted_element_count_ = filter.predicted_inserted_element_count_;
  68. inserted_element_count_ = filter.inserted_element_count_;
  69. random_seed_ = filter.random_seed_;
  70. desired_false_positive_probability_ = filter.desired_false_positive_probability_;
  71. delete[] bit_table_;
  72. bit_table_ = new cell_type[raw_table_size_];
  73. std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_);
  74. salt_ = filter.salt_;
  75. return *this;
  76. }
  77. virtual ~bloom_filter()
  78. {
  79. delete[] bit_table_;
  80. }
  81. inline bool operator!() const
  82. {
  83. return (0 == table_size_);
  84. }
  85. inline void clear()
  86. {
  87. std::fill_n(bit_table_,raw_table_size_,0x00);
  88. inserted_element_count_ = 0;
  89. }
  90. inline void insert(const unsigned char* key_begin, const std::size_t& length)
  91. {
  92. std::size_t bit_index = 0;
  93. std::size_t bit = 0;
  94. for (std::size_t i = 0; i < salt_.size(); ++i)
  95. {
  96. compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
  97. bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
  98. }
  99. ++inserted_element_count_;
  100. }
  101. template<typename T>
  102. inline void insert(const T& t)
  103. {
  104. // Note: T must be a C++ POD type.
  105. insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
  106. }
  107. inline void insert(const std::string& key)
  108. {
  109. insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
  110. }
  111. inline void insert(const char* data, const std::size_t& length)
  112. {
  113. insert(reinterpret_cast<const unsigned char*>(data),length);
  114. }
  115. template<typename InputIterator>
  116. inline void insert(const InputIterator begin, const InputIterator end)
  117. {
  118. InputIterator itr = begin;
  119. while (end != itr)
  120. {
  121. insert(*(itr++));
  122. }
  123. }
  124. inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
  125. {
  126. std::size_t bit_index = 0;
  127. std::size_t bit = 0;
  128. for (std::size_t i = 0; i < salt_.size(); ++i)
  129. {
  130. compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
  131. if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
  132. {
  133. return false;
  134. }
  135. }
  136. return true;
  137. }
  138. template<typename T>
  139. inline bool contains(const T& t) const
  140. {
  141. return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
  142. }
  143. inline bool contains(const std::string& key) const
  144. {
  145. return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
  146. }
  147. inline bool contains(const char* data, const std::size_t& length) const
  148. {
  149. return contains(reinterpret_cast<const unsigned char*>(data),length);
  150. }
  151. template<typename InputIterator>
  152. inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
  153. {
  154. InputIterator itr = begin;
  155. while (end != itr)
  156. {
  157. if (!contains(*itr))
  158. {
  159. return itr;
  160. }
  161. ++itr;
  162. }
  163. return end;
  164. }
  165. template<typename InputIterator>
  166. inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
  167. {
  168. InputIterator itr = begin;
  169. while (end != itr)
  170. {
  171. if (contains(*itr))
  172. {
  173. return itr;
  174. }
  175. ++itr;
  176. }
  177. return end;
  178. }
  179. inline virtual std::size_t size() const
  180. {
  181. return table_size_;
  182. }
  183. inline std::size_t element_count() const
  184. {
  185. return inserted_element_count_;
  186. }
  187. inline double effective_fpp() const
  188. {
  189. /*
  190. Note:
  191. The effective false positive probability is calculated using the
  192. designated table size and hash function count in conjunction with
  193. the current number of inserted elements - not the user defined
  194. predicated/expected number of inserted elements.
  195. */
  196. return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
  197. }
  198. inline bloom_filter& operator &= (const bloom_filter& filter)
  199. {
  200. /* intersection */
  201. if (
  202. (salt_count_ == filter.salt_count_) &&
  203. (table_size_ == filter.table_size_) &&
  204. (random_seed_ == filter.random_seed_)
  205. )
  206. {
  207. for (std::size_t i = 0; i < raw_table_size_; ++i)
  208. {
  209. bit_table_[i] &= filter.bit_table_[i];
  210. }
  211. }
  212. return *this;
  213. }
  214. inline bloom_filter& operator |= (const bloom_filter& filter)
  215. {
  216. /* union */
  217. if (
  218. (salt_count_ == filter.salt_count_) &&
  219. (table_size_ == filter.table_size_) &&
  220. (random_seed_ == filter.random_seed_)
  221. )
  222. {
  223. for (std::size_t i = 0; i < raw_table_size_; ++i)
  224. {
  225. bit_table_[i] |= filter.bit_table_[i];
  226. }
  227. }
  228. return *this;
  229. }
  230. inline bloom_filter& operator ^= (const bloom_filter& filter)
  231. {
  232. /* difference */
  233. if (
  234. (salt_count_ == filter.salt_count_) &&
  235. (table_size_ == filter.table_size_) &&
  236. (random_seed_ == filter.random_seed_)
  237. )
  238. {
  239. for (std::size_t i = 0; i < raw_table_size_; ++i)
  240. {
  241. bit_table_[i] ^= filter.bit_table_[i];
  242. }
  243. }
  244. return *this;
  245. }
  246. inline const cell_type* table() const
  247. {
  248. return bit_table_;
  249. }
  250. protected:
  251. inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
  252. {
  253. bit_index = hash % table_size_;
  254. bit = bit_index % bits_per_char;
  255. }
  256. void generate_unique_salt()
  257. {
  258. /*
  259. Note:
  260. A distinct hash function need not be implementation-wise
  261. distinct. In the current implementation "seeding" a common
  262. hash function with different values seems to be adequate.
  263. */
  264. const unsigned int predef_salt_count = 128;
  265. static const bloom_type predef_salt[predef_salt_count] =
  266. {
  267. 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
  268. 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
  269. 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
  270. 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
  271. 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
  272. 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
  273. 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
  274. 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
  275. 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
  276. 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
  277. 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
  278. 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
  279. 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
  280. 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
  281. 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
  282. 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
  283. 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
  284. 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
  285. 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
  286. 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
  287. 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
  288. 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
  289. 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
  290. 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
  291. 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
  292. 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
  293. 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
  294. 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
  295. 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
  296. 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
  297. 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
  298. 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
  299. };
  300. if (salt_count_ <= predef_salt_count)
  301. {
  302. std::copy(predef_salt,
  303. predef_salt + salt_count_,
  304. std::back_inserter(salt_));
  305. for (unsigned int i = 0; i < salt_.size(); ++i)
  306. {
  307. /*
  308. Note:
  309. This is done to integrate the user defined random seed,
  310. so as to allow for the generation of unique bloom filter
  311. instances.
  312. */
  313. salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
  314. }
  315. }
  316. else
  317. {
  318. std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
  319. srand(static_cast<unsigned int>(random_seed_));
  320. while (salt_.size() < salt_count_)
  321. {
  322. bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
  323. if (0 == current_salt) continue;
  324. if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
  325. {
  326. salt_.push_back(current_salt);
  327. }
  328. }
  329. }
  330. }
  331. void find_optimal_parameters()
  332. {
  333. /*
  334. Note:
  335. The following will attempt to find the number of hash functions
  336. and minimum amount of storage bits required to construct a bloom
  337. filter consistent with the user defined false positive probability
  338. and estimated element insertion count.
  339. */
  340. double min_m = std::numeric_limits<double>::infinity();
  341. double min_k = 0.0;
  342. double curr_m = 0.0;
  343. double k = 1.0;
  344. while (k < 1000.0)
  345. {
  346. double numerator = (- k * predicted_inserted_element_count_);
  347. double denominator = std::log(1.0 - std::pow(desired_false_positive_probability_, 1.0 / k));
  348. curr_m = numerator / denominator;
  349. if (curr_m < min_m)
  350. {
  351. min_m = curr_m;
  352. min_k = k;
  353. }
  354. k += 1.0;
  355. }
  356. salt_count_ = static_cast<std::size_t>(min_k);
  357. table_size_ = static_cast<std::size_t>(min_m);
  358. table_size_ += (((table_size_ % bits_per_char) != 0) ? (bits_per_char - (table_size_ % bits_per_char)) : 0);
  359. }
  360. inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
  361. {
  362. const unsigned char* itr = begin;
  363. while (remaining_length >= 4)
  364. {
  365. hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
  366. hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
  367. hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
  368. hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
  369. remaining_length -= 4;
  370. }
  371. while (remaining_length >= 2)
  372. {
  373. hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
  374. hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
  375. remaining_length -= 2;
  376. }
  377. if (remaining_length)
  378. {
  379. hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
  380. }
  381. return hash;
  382. }
  383. std::vector<bloom_type> salt_;
  384. unsigned char* bit_table_;
  385. std::size_t salt_count_;
  386. std::size_t table_size_;
  387. std::size_t raw_table_size_;
  388. std::size_t predicted_inserted_element_count_;
  389. std::size_t inserted_element_count_;
  390. std::size_t random_seed_;
  391. double desired_false_positive_probability_;
  392. };
  393. inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
  394. {
  395. bloom_filter result = a;
  396. result &= b;
  397. return result;
  398. }
  399. inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
  400. {
  401. bloom_filter result = a;
  402. result |= b;
  403. return result;
  404. }
  405. inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
  406. {
  407. bloom_filter result = a;
  408. result ^= b;
  409. return result;
  410. }
  411. class compressible_bloom_filter : public bloom_filter
  412. {
  413. public:
  414. compressible_bloom_filter(const std::size_t& predicted_element_count,
  415. const double& false_positive_probability,
  416. const std::size_t& random_seed)
  417. : bloom_filter(predicted_element_count,false_positive_probability,random_seed)
  418. {
  419. size_list.push_back(table_size_);
  420. }
  421. inline virtual std::size_t size() const
  422. {
  423. return size_list.back();
  424. }
  425. inline bool compress(const double& percentage)
  426. {
  427. if ((0.0 >= percentage) || (percentage >= 100.0))
  428. {
  429. return false;
  430. }
  431. std::size_t original_table_size = size_list.back();
  432. std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0))));
  433. new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0);
  434. if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size))
  435. {
  436. return false;
  437. }
  438. desired_false_positive_probability_ = effective_fpp();
  439. cell_type* tmp = new cell_type[new_table_size / bits_per_char];
  440. std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp);
  441. cell_type* itr = bit_table_ + (new_table_size / bits_per_char);
  442. cell_type* end = bit_table_ + (original_table_size / bits_per_char);
  443. cell_type* itr_tmp = tmp;
  444. while (end != itr)
  445. {
  446. *(itr_tmp++) |= (*itr++);
  447. }
  448. delete[] bit_table_;
  449. bit_table_ = tmp;
  450. size_list.push_back(new_table_size);
  451. return true;
  452. }
  453. private:
  454. inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
  455. {
  456. bit_index = hash;
  457. for (std::size_t i = 0; i < size_list.size(); ++i)
  458. {
  459. bit_index %= size_list[i];
  460. }
  461. bit = bit_index % bits_per_char;
  462. }
  463. std::vector<std::size_t> size_list;
  464. };
  465. #endif
  466. /*
  467. Note 1:
  468. If it can be guaranteed that bits_per_char will be of the form 2^n then
  469. the following optimization can be used:
  470. hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
  471. Note 2:
  472. For performance reasons where possible when allocating memory it should
  473. be aligned (aligned_alloc) according to the architecture being used.
  474. */