/moses/src/CompactPT/BlockHashIndex.h

https://bitbucket.org/xwd/moses-csp · C Header · 182 lines · 124 code · 38 blank · 20 comment · 2 complexity · 79cd3d26e14619f47630c3adf8d8b3f9 MD5 · raw file

  1. // $Id$
  2. // vim:tabstop=2
  3. /***********************************************************************
  4. Moses - factored phrase-based language decoder
  5. Copyright (C) 2006 University of Edinburgh
  6. This library is free software; you can redistribute it and/or
  7. modify it under the terms of the GNU Lesser General Public
  8. License as published by the Free Software Foundation; either
  9. version 2.1 of the License, or (at your option) any later version.
  10. This library is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. Lesser General Public License for more details.
  14. You should have received a copy of the GNU Lesser General Public
  15. License along with this library; if not, write to the Free Software
  16. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17. ***********************************************************************/
  18. #ifndef moses_BlockHashIndex_h
  19. #define moses_BlockHashIndex_h
#include <iostream>
#include <string>
#include <vector>
#include <queue>
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctime>

#include "MurmurHash3.h"
#include "StringVector.h"
#include "PackedArray.h"

#ifdef WITH_THREADS
#include "ThreadPool.h"
#endif
  32. namespace Moses
  33. {
  34. class BlockHashIndex
  35. {
  36. private:
  37. std::priority_queue<int> m_queue;
  38. size_t m_orderBits;
  39. size_t m_fingerPrintBits;
  40. std::FILE* m_fileHandle;
  41. size_t m_fileHandleStart;
  42. StringVector<unsigned char, unsigned long> m_landmarks;
  43. std::vector<void*> m_hashes;
  44. std::vector<clock_t> m_clocks;
  45. std::vector<PairedPackedArray<>*> m_arrays;
  46. std::vector<size_t> m_seekIndex;
  47. size_t m_size;
  48. int m_lastSaved;
  49. int m_lastDropped;
  50. size_t m_numLoadedRanges;
  51. #ifdef WITH_THREADS
  52. ThreadPool m_threadPool;
  53. boost::mutex m_mutex;
  54. template <typename Keys>
  55. class HashTask : public Task
  56. {
  57. public:
  58. HashTask(int id, BlockHashIndex& hash, Keys& keys)
  59. : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {}
  60. virtual void Run()
  61. {
  62. m_hash.CalcHash(m_id, *m_keys);
  63. }
  64. virtual ~HashTask()
  65. {
  66. delete m_keys;
  67. }
  68. private:
  69. int m_id;
  70. BlockHashIndex& m_hash;
  71. Keys* m_keys;
  72. };
  73. #endif
  74. size_t GetFprint(const char* key) const;
  75. size_t GetHash(size_t i, const char* key);
  76. public:
  77. #ifdef WITH_THREADS
  78. BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
  79. size_t threadsNum = 2);
  80. #else
  81. BlockHashIndex(size_t orderBits, size_t fingerPrintBits);
  82. #endif
  83. ~BlockHashIndex();
  84. size_t GetHash(const char* key);
  85. size_t GetHash(std::string key);
  86. size_t operator[](std::string key);
  87. size_t operator[](char* key);
  88. void BeginSave(std::FILE* mphf);
  89. void SaveRange(size_t i);
  90. void SaveLastRange();
  91. size_t FinalizeSave();
  92. #ifdef WITH_THREADS
  93. void WaitAll();
  94. #endif
  95. void DropRange(size_t i);
  96. void DropLastRange();
  97. size_t LoadIndex(std::FILE* mphf);
  98. void LoadRange(size_t i);
  99. size_t Save(std::string filename);
  100. size_t Save(std::FILE * mphf);
  101. size_t Load(std::string filename);
  102. size_t Load(std::FILE * mphf);
  103. size_t GetSize() const;
  104. void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1);
  105. template <typename Keys>
  106. void AddRange(Keys &keys)
  107. {
  108. size_t current = m_landmarks.size();
  109. if(m_landmarks.size() && m_landmarks.back().str() >= keys[0])
  110. {
  111. std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl;
  112. std::cerr << "1: " << m_landmarks.back().str() << std::endl;
  113. std::cerr << "2: " << keys[0] << std::endl;
  114. abort();
  115. }
  116. m_landmarks.push_back(keys[0]);
  117. m_size += keys.size();
  118. #ifdef WITH_THREADS
  119. HashTask<Keys>* ht = new HashTask<Keys>(current, *this, keys);
  120. m_threadPool.Submit(ht);
  121. #else
  122. CalcHash(current, keys);
  123. #endif
  124. }
  125. template <typename Keys>
  126. void CalcHash(size_t current, Keys &keys)
  127. {
  128. #ifdef HAVE_CMPH
  129. void* source = vectorAdapter(keys);
  130. CalcHash(current, source);
  131. #endif
  132. }
  133. void CalcHash(size_t current, void* source);
  134. #ifdef HAVE_CMPH
  135. void* vectorAdapter(std::vector<std::string>& v);
  136. void* vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv);
  137. void* vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv);
  138. #endif
  139. };
  140. }
  141. #endif