/src/im/gpinyin/include/userdict.h

http://ftk.googlecode.com/ · C++ Header · 427 lines · 242 code · 98 blank · 87 comment · 0 complexity · 177fcee9c60da03be8a2475e75ec4711 MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef PINYINIME_INCLUDE_USERDICT_H__
  17. #define PINYINIME_INCLUDE_USERDICT_H__
  18. #define ___CACHE_ENABLED___  // Compiles in UserDict's cache and miss-cache types, members and helpers
  19. #define ___SYNC_ENABLED___  // Compiles in UserDict's sync API, sync list members and sync counters
  20. #define ___PREDICT_ENABLED___  // Compiles in UserDict's predicts_ list and its locate/remove helpers
  21. // Debug performance for operations (disabled: the define below is commented out)
  22. // #define ___DEBUG_PERF___
  23. #include "atomdictbase.h"
  24. namespace ime_pinyin {
  25. class UserDict : public AtomDictBase {  // User dictionary: declares lemma add/update/remove, size limits with reclaim/defragment, optional sync export, and write-back of the dictionary file
  26. public:
  27. UserDict();
  28. ~UserDict();
  29. bool load_dict(const char *file_name, LemmaIdType start_id,
  30. LemmaIdType end_id);
  31. bool close_dict();
  32. unsigned number_of_lemmas();
  33. void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
  34. MileStoneHandle extend_dict(MileStoneHandle from_handle,
  35. const DictExtPara *dep, LmaPsbItem *lpi_items,
  36. unsigned lpi_max, unsigned *lpi_num);
  37. unsigned get_lpis(const uint16 *splid_str, uint16 splid_str_len,
  38. LmaPsbItem *lpi_items, unsigned lpi_max);
  39. uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
  40. uint16 str_max);
  41. uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
  42. uint16 splids_max, bool arg_valid);
  43. unsigned predict(const char16 last_hzs[], uint16 hzs_len,
  44. NPredictItem *npre_items, unsigned npre_max,
  45. unsigned b4_used);
  46. // Full spelling ids are required
  47. LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
  48. uint16 lemma_len, uint16 count);
  49. LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
  50. bool selected);
  51. LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
  52. uint16 lemma_len);
  53. LmaScoreType get_lemma_score(LemmaIdType lemma_id);
  54. LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
  55. uint16 lemma_len);
  56. bool remove_lemma(LemmaIdType lemma_id);
  57. unsigned get_total_lemma_count();
  58. void set_total_lemma_count_of_others(unsigned count);
  59. void flush_cache();
  60. void set_limit(uint32 max_lemma_count, uint32 max_lemma_size,
  61. uint32 reclaim_ratio);
  62. void reclaim();
  63. void defragment();
  64. #ifdef ___SYNC_ENABLED___
  65. void clear_sync_lemmas(unsigned int start, unsigned int end);
  66. int get_sync_count();
  67. LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
  68. uint16 lemma_len, uint16 count, uint64 lmt);
  69. /**
  70. * Add lemmas encoded in UTF-16LE to the dictionary without setting the sync flag.
  71. *
  72. * @param lemmas string in the format 'wo men,WM,0.32;da jia,DJ,0.12'
  73. * @param len length of the lemmas string in UTF-16LE
  74. * @return number of newly added lemmas
  75. */
  76. int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len);
  77. /**
  78. * Get the lemmas that need sync as a UTF-16LE string in the format above.
  79. * Note: the input buffer (str) must not be too small. If str is too small to
  80. * hold even a single lemma, an infinite loop may result.
  81. *
  82. * @param str buffer to write lemmas into
  83. * @param size buffer size in UTF-16LE
  84. * @param count output: number of lemmas returned
  85. * @return UTF-16LE string length
  86. */
  87. int get_sync_lemmas_in_utf16le_string_from_beginning(
  88. char16 * str, int size, int * count);
  89. #endif
  90. struct UserDictStat {  // Record passed to state() below
  91. uint32 version;
  92. const char * file_name;
  93. struct timeval load_time;  // NOTE(review): struct timeval needs <sys/time.h>; this header only includes atomdictbase.h - confirm it is provided transitively
  94. struct timeval last_update;
  95. uint32 disk_size;
  96. uint32 lemma_count;
  97. uint32 lemma_size;
  98. uint32 delete_count;
  99. uint32 delete_size;
  100. #ifdef ___SYNC_ENABLED___
  101. uint32 sync_count;
  102. #endif
  103. uint32 reclaim_ratio;
  104. uint32 limit_lemma_count;
  105. uint32 limit_lemma_size;
  106. };
  107. bool state(UserDictStat * stat);
  108. private:
  109. uint32 total_other_nfreq_;
  110. struct timeval load_time_;
  111. LemmaIdType start_id_;
  112. uint32 version_;
  113. uint8 * lemmas_;
  114. // In-memory-only flag for each lemma
  115. static const uint8 kUserDictLemmaFlagRemove = 1;
  116. // Offsets of the in-use lemmas
  117. uint32 * offsets_;
  118. // Highest bit in offset tells whether corresponding lemma is removed (unsigned literal: 1 << 31 would shift into the sign bit of int)
  119. static const uint32 kUserDictOffsetFlagRemove = (1U << 31);
  120. // Mask selecting the offset bits, i.e. the maximum possible offset
  121. static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
  122. // Bit width for last modified time, from 1 to 16
  123. static const uint32 kUserDictLMTBitWidth = 16;
  124. // Granularity for last modified time, in seconds (one week)
  125. static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
  126. // Maximum frequency count
  127. static const uint16 kUserDictMaxFrequency = 0xFFFF;
  128. #define COARSE_UTC(year, month, day, hour, minute, second) \
  129. ( \
  130. ((year) - 1970) * 365 * 24 * 60 * 60 + \
  131. ((month) - 1) * 30 * 24 * 60 * 60 + \
  132. ((day) - 1) * 24 * 60 * 60 + \
  133. ((hour) - 0) * 60 * 60 + \
  134. ((minute) - 0) * 60 + \
  135. ((second) - 0) \
  136. )
  137. static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);  // Approximate seconds since 1970 for 2009-01-01; COARSE_UTC treats every year as 365 days and every month as 30 days
  138. // Scores, corresponding to offsets_
  139. uint32 * scores_;
  140. // Following two fields are only valid in memory
  141. uint32 * ids_;
  142. #ifdef ___PREDICT_ENABLED___
  143. uint32 * predicts_;
  144. #endif
  145. #ifdef ___SYNC_ENABLED___
  146. uint32 * syncs_;
  147. unsigned sync_count_size_;
  148. #endif
  149. uint32 * offsets_by_id_;
  150. unsigned lemma_count_left_;
  151. unsigned lemma_size_left_;
  152. const char * dict_file_;
  153. // Be sure the size of this struct is 4xN (see "Dict Info (4x)" in the on-disk format below)
  154. struct UserDictInfo {
  155. // Percentage that will be reclaimed when a limit is reached (1 ~ 100)
  156. uint32 reclaim_ratio;
  157. // Maximum lemma count; 0 means no limit
  158. uint32 limit_lemma_count;
  159. // Maximum lemma size; it is different from the
  160. // whole disk file size or the in-memory dict size.
  161. // 0 means no limit
  162. uint32 limit_lemma_size;
  163. // Total lemma count, including deleted and in-use lemmas.
  164. // Also indicates the size of offsets_
  165. uint32 lemma_count;
  166. // Total size of lemmas, including used and freed
  167. uint32 lemma_size;
  168. // Freed lemma count
  169. uint32 free_count;
  170. // Freed lemma size in bytes
  171. uint32 free_size;
  172. #ifdef ___SYNC_ENABLED___
  173. uint32 sync_count;
  174. #endif
  175. int32 total_nfreq;
  176. } dict_info_;
  177. static const uint32 kUserDictVersion = 0x0ABCDEF0;
  178. static const uint32 kUserDictPreAlloc = 32;
  179. static const uint32 kUserDictAverageNchar = 8;
  180. enum UserDictState {
  181. // Keep in order
  182. USER_DICT_NONE = 0,
  183. USER_DICT_SYNC,
  184. #ifdef ___SYNC_ENABLED___
  185. USER_DICT_SYNC_DIRTY,
  186. #endif
  187. USER_DICT_SCORE_DIRTY,
  188. USER_DICT_OFFSET_DIRTY,
  189. USER_DICT_LEMMA_DIRTY,
  190. USER_DICT_DEFRAGMENTED,
  191. } state_;
  192. struct UserDictSearchable {
  193. uint16 splids_len;
  194. uint16 splid_start[kMaxLemmaSize];
  195. uint16 splid_count[kMaxLemmaSize];
  196. // Compact initial letters, for both fuzzy spell id comparison and the cache system
  197. uint32 signature[kMaxLemmaSize / 4];
  198. };
  199. #ifdef ___CACHE_ENABLED___
  200. enum UserDictCacheType {
  201. USER_DICT_CACHE,
  202. USER_DICT_MISS_CACHE,
  203. };
  204. static const int kUserDictCacheSize = 4;
  205. static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
  206. struct UserDictMissCache {
  207. uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
  208. uint16 head, tail;
  209. } miss_caches_[kMaxLemmaSize];
  210. struct UserDictCache {
  211. uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
  212. uint32 offsets[kUserDictCacheSize];
  213. uint32 lengths[kUserDictCacheSize];
  214. // Ring buffer indices
  215. uint16 head, tail;
  216. } caches_[kMaxLemmaSize];
  217. void cache_init();
  218. void cache_push(UserDictCacheType type,
  219. UserDictSearchable *searchable,
  220. uint32 offset, uint32 length);
  221. bool cache_hit(UserDictSearchable *searchable,
  222. uint32 *offset, uint32 *length);
  223. bool load_cache(UserDictSearchable *searchable,
  224. uint32 *offset, uint32 *length);
  225. void save_cache(UserDictSearchable *searchable,
  226. uint32 offset, uint32 length);
  227. void reset_cache();
  228. bool load_miss_cache(UserDictSearchable *searchable);
  229. void save_miss_cache(UserDictSearchable *searchable);
  230. void reset_miss_cache();
  231. #endif
  232. LmaScoreType translate_score(int f);
  233. int extract_score_freq(int raw_score);
  234. uint64 extract_score_lmt(int raw_score);
  235. inline int build_score(uint64 lmt, int freq);
  236. inline int64 utf16le_atoll(uint16 *s, int len);
  237. inline int utf16le_lltoa(int64 v, uint16 *s, int size);
  238. LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[],
  239. uint16 lemma_len, uint16 count, uint64 lmt);
  240. unsigned _get_lpis(const uint16 *splid_str, uint16 splid_str_len,
  241. LmaPsbItem *lpi_items, unsigned lpi_max, bool * need_extend);
  242. int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
  243. int _get_lemma_score(LemmaIdType lemma_id);
  244. int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1,
  245. const UserDictSearchable *searchable);
  246. bool is_prefix_spell_id(const uint16 * fullids,
  247. uint16 fulllen, const UserDictSearchable *searchable);
  248. uint32 get_dict_file_size(UserDictInfo * info);
  249. bool reset(const char *file);
  250. bool validate(const char *file);
  251. bool load(const char *file, LemmaIdType start_id);
  252. bool is_valid_state();
  253. bool is_valid_lemma_id(LemmaIdType id);
  254. LemmaIdType get_max_lemma_id();
  255. void set_lemma_flag(uint32 offset, uint8 flag);
  256. char get_lemma_flag(uint32 offset);
  257. char get_lemma_nchar(uint32 offset);
  258. uint16 * get_lemma_spell_ids(uint32 offset);
  259. uint16 * get_lemma_word(uint32 offset);
  260. // Prepare searchable to speed up the locate process
  261. void prepare_locate(UserDictSearchable *searchable,
  262. const uint16 * splids, uint16 len);
  263. // Compare initial letters only
  264. int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1,
  265. const UserDictSearchable *searchable);
  266. // Compare two spell id strings exactly.
  267. // The first argument must hold full spell ids
  268. bool equal_spell_id(const uint16 * fullids,
  269. uint16 fulllen, const UserDictSearchable *searchable);
  270. // Find first item by initial letters
  271. int32 locate_first_in_offsets(const UserDictSearchable *searchable);
  272. LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[],
  273. uint16 lemma_len, uint16 count, uint64 lmt);
  274. // Check if a lemma is in the dictionary
  275. int32 locate_in_offsets(char16 lemma_str[],
  276. uint16 splid_str[], uint16 lemma_len);
  277. bool remove_lemma_by_offset_index(int offset_index);
  278. #ifdef ___PREDICT_ENABLED___
  279. uint32 locate_where_to_insert_in_predicts(const uint16 * words,
  280. int lemma_len);
  281. int32 locate_first_in_predicts(const uint16 * words, int lemma_len);
  282. void remove_lemma_from_predict_list(uint32 offset);
  283. #endif
  284. #ifdef ___SYNC_ENABLED___
  285. void queue_lemma_for_sync(LemmaIdType id);
  286. void remove_lemma_from_sync_list(uint32 offset);
  287. void write_back_sync(int fd);
  288. #endif
  289. void write_back_score(int fd);
  290. void write_back_offset(int fd);
  291. void write_back_lemma(int fd);
  292. void write_back_all(int fd);
  293. void write_back();
  294. struct UserDictScoreOffsetPair {
  295. int score;
  296. uint32 offset_index;
  297. };
  298. inline void swap(UserDictScoreOffsetPair * sop, int i, int j);
  299. void shift_down(UserDictScoreOffsetPair * sop, int i, int n);
  300. // On-disk format for each lemma
  301. // +-------------+
  302. // | Version (4) |
  303. // +-------------+
  304. // +-----------+-----------+--------------------+-------------------+
  305. // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
  306. // +-----------+-----------+--------------------+-------------------+
  307. // ...
  308. // +-----------------------+ +-------------+ <---Offset of offset
  309. // | Offset1 by_splids (4) | ... | OffsetN (4) |
  310. // +-----------------------+ +-------------+
  311. #ifdef ___PREDICT_ENABLED___
  312. // +----------------------+ +-------------+
  313. // | Offset1 by_lemma (4) | ... | OffsetN (4) |
  314. // +----------------------+ +-------------+
  315. #endif
  316. // +------------+ +------------+
  317. // | Score1 (4) | ... | ScoreN (4) |
  318. // +------------+ +------------+
  319. #ifdef ___SYNC_ENABLED___
  320. // +-------------+ +-------------+
  321. // | NewAdd1 (4) | ... | NewAddN (4) |
  322. // +-------------+ +-------------+
  323. #endif
  324. // +----------------+
  325. // | Dict Info (4x) |
  326. // +----------------+
  327. };
  328. }
  329. #endif