PageRenderTime 165ms CodeModel.GetById 110ms app.highlight 31ms RepoModel.GetById 20ms app.codeStats 0ms

/src/im/gpinyin/include/userdict.h

http://ftk.googlecode.com/
C++ Header | 427 lines | 242 code | 98 blank | 87 comment | 0 complexity | 177fcee9c60da03be8a2475e75ec4711 MD5 | raw file
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 16
 17#ifndef PINYINIME_INCLUDE_USERDICT_H__
 18#define PINYINIME_INCLUDE_USERDICT_H__
 19
 20#define ___CACHE_ENABLED___
 21#define ___SYNC_ENABLED___
 22#define ___PREDICT_ENABLED___
 23
 24// Debug performance for operations
 25// #define ___DEBUG_PERF___
 26
 27#include "atomdictbase.h"
 28
 29namespace ime_pinyin {
 30
 31class UserDict : public AtomDictBase {
 32 public:
 33  UserDict();
 34  ~UserDict();
 35
 36  bool load_dict(const char *file_name, LemmaIdType start_id,
 37                 LemmaIdType end_id);
 38
 39  bool close_dict();
 40
 41  unsigned number_of_lemmas();
 42
 43  void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
 44
 45  MileStoneHandle extend_dict(MileStoneHandle from_handle,
 46                              const DictExtPara *dep, LmaPsbItem *lpi_items,
 47                              unsigned lpi_max, unsigned *lpi_num);
 48
 49  unsigned get_lpis(const uint16 *splid_str, uint16 splid_str_len,
 50                  LmaPsbItem *lpi_items, unsigned lpi_max);
 51
 52  uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
 53                       uint16 str_max);
 54
 55  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
 56                          uint16 splids_max, bool arg_valid);
 57
 58  unsigned predict(const char16 last_hzs[], uint16 hzs_len,
 59                 NPredictItem *npre_items, unsigned npre_max,
 60                 unsigned b4_used);
 61
 62  // Full spelling ids are required
 63  LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
 64                        uint16 lemma_len, uint16 count);
 65
 66  LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
 67                           bool selected);
 68
 69  LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
 70                           uint16 lemma_len);
 71
 72  LmaScoreType get_lemma_score(LemmaIdType lemma_id);
 73
 74  LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
 75                        uint16 lemma_len);
 76
 77  bool remove_lemma(LemmaIdType lemma_id);
 78
 79  unsigned get_total_lemma_count();
 80  void set_total_lemma_count_of_others(unsigned count);
 81
 82  void flush_cache();
 83
 84  void set_limit(uint32 max_lemma_count, uint32 max_lemma_size,
 85                 uint32 reclaim_ratio);
 86
 87  void reclaim();
 88
 89  void defragment();
 90
 91#ifdef ___SYNC_ENABLED___
 92  void clear_sync_lemmas(unsigned int start, unsigned int end);
 93
 94  int get_sync_count();
 95
 96  LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
 97                        uint16 lemma_len, uint16 count, uint64 lmt);
 98   /**
 99    * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag.
100    *
101    * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12'
102    * @param len length of lemmas string in UTF-16LE
103    * @return newly added lemma count
104    */
105  int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len);
106
107  /**
108   * Get lemmas need sync to a UTF-16LE string of above format.
109   * Note: input buffer (str) must not be too small. If str is too small to 
110   *       contain single one lemma, there might be a dead loop.
111   *
112   * @param str buffer to write lemmas
113   * @param size buffer size in UTF-16LE
114   * @param count output value of lemma returned
115   * @return UTF-16LE string length
116   */
117  int get_sync_lemmas_in_utf16le_string_from_beginning(
118      char16 * str, int size, int * count);
119
120#endif
121
122  struct UserDictStat {
123    uint32 version;
124    const char * file_name;
125    struct timeval load_time;
126    struct timeval last_update;
127    uint32 disk_size;
128    uint32 lemma_count;
129    uint32 lemma_size;
130    uint32 delete_count;
131    uint32 delete_size;
132#ifdef ___SYNC_ENABLED___
133    uint32 sync_count;
134#endif
135    uint32 reclaim_ratio;
136    uint32 limit_lemma_count;
137    uint32 limit_lemma_size;
138  };
139
140  bool state(UserDictStat * stat);
141
142 private:
143  uint32 total_other_nfreq_;
144  struct timeval load_time_;
145  LemmaIdType start_id_;
146  uint32 version_;
147  uint8 * lemmas_;
148
149  // In-Memory-Only flag for each lemma
150  static const uint8 kUserDictLemmaFlagRemove = 1;
151  // Inuse lemmas' offset
152  uint32 * offsets_;
153  // Highest bit in offset tells whether corresponding lemma is removed
154  static const uint32 kUserDictOffsetFlagRemove = (1 << 31);
155  // Maximum possible for the offset
156  static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
157  // Bit width for last modified time, from 1 to 16
158  static const uint32 kUserDictLMTBitWidth = 16;
159  // Granularity for last modified time in second
160  static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
161  // Maximum frequency count
162  static const uint16 kUserDictMaxFrequency = 0xFFFF;
163
164#define COARSE_UTC(year, month, day, hour, minute, second) \
165  ( \
166    (year - 1970) * 365 * 24 * 60 * 60 + \
167    (month - 1) * 30 * 24 * 60 * 60 + \
168    (day - 1) * 24 * 60 * 60 + \
169    (hour - 0) * 60 * 60 + \
170    (minute - 0) * 60 + \
171    (second - 0) \
172  )
173  static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);
174
175  // Correspond to offsets_
176  uint32 * scores_;
177  // Following two fields are only valid in memory
178  uint32 * ids_;
179#ifdef ___PREDICT_ENABLED___
180  uint32 * predicts_;
181#endif
182#ifdef ___SYNC_ENABLED___
183  uint32 * syncs_;
184  unsigned sync_count_size_;
185#endif
186  uint32 * offsets_by_id_;
187
188  unsigned lemma_count_left_;
189  unsigned lemma_size_left_;
190
191  const char * dict_file_;
192
193  // Be sure size is 4xN
194  struct UserDictInfo {
195    // When limitation reached, how much percentage will be reclaimed (1 ~ 100)
196    uint32 reclaim_ratio;
197    // maximum lemma count, 0 means no limitation
198    uint32 limit_lemma_count;
199    // Maximum lemma size, it's different from
200    // whole disk file size or in-mem dict size
201    // 0 means no limitation
202    uint32 limit_lemma_size;
203    // Total lemma count including deleted and inuse
204    // Also indicate offsets_ size
205    uint32 lemma_count;
206    // Total size of lemmas including used and freed
207    uint32 lemma_size;
208    // Freed lemma count
209    uint32 free_count;
210    // Freed lemma size in byte
211    uint32 free_size;
212#ifdef ___SYNC_ENABLED___
213    uint32 sync_count;
214#endif
215    int32 total_nfreq;
216  } dict_info_;
217
218  static const uint32 kUserDictVersion = 0x0ABCDEF0;
219
220  static const uint32 kUserDictPreAlloc = 32;
221  static const uint32 kUserDictAverageNchar = 8;
222
223  enum UserDictState {
224    // Keep in order
225    USER_DICT_NONE = 0,
226    USER_DICT_SYNC,
227#ifdef ___SYNC_ENABLED___
228    USER_DICT_SYNC_DIRTY,
229#endif
230    USER_DICT_SCORE_DIRTY,
231    USER_DICT_OFFSET_DIRTY,
232    USER_DICT_LEMMA_DIRTY,
233
234    USER_DICT_DEFRAGMENTED,
235  } state_;
236
237  struct UserDictSearchable {
238    uint16 splids_len;
239    uint16 splid_start[kMaxLemmaSize];
240    uint16 splid_count[kMaxLemmaSize];
241    // Compact inital letters for both FuzzyCompareSpellId and cache system
242    uint32 signature[kMaxLemmaSize / 4];
243  };
244
245#ifdef ___CACHE_ENABLED___
246  enum UserDictCacheType {
247    USER_DICT_CACHE,
248    USER_DICT_MISS_CACHE,
249  };
250
251  static const int kUserDictCacheSize = 4;
252  static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
253
254  struct UserDictMissCache {
255    uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
256    uint16 head, tail;
257  } miss_caches_[kMaxLemmaSize];
258
259  struct UserDictCache {
260    uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
261    uint32 offsets[kUserDictCacheSize];
262    uint32 lengths[kUserDictCacheSize];
263    // Ring buffer
264    uint16 head, tail;
265  } caches_[kMaxLemmaSize];
266
267  void cache_init();
268
269  void cache_push(UserDictCacheType type,
270                 UserDictSearchable *searchable,
271                 uint32 offset, uint32 length);
272
273  bool cache_hit(UserDictSearchable *searchable,
274                 uint32 *offset, uint32 *length);
275
276  bool load_cache(UserDictSearchable *searchable,
277                  uint32 *offset, uint32 *length);
278
279  void save_cache(UserDictSearchable *searchable,
280                  uint32 offset, uint32 length);
281
282  void reset_cache();
283
284  bool load_miss_cache(UserDictSearchable *searchable);
285
286  void save_miss_cache(UserDictSearchable *searchable);
287
288  void reset_miss_cache();
289#endif
290
291  LmaScoreType translate_score(int f);
292
293  int extract_score_freq(int raw_score);
294
295  uint64 extract_score_lmt(int raw_score);
296
297  inline int build_score(uint64 lmt, int freq);
298
299  inline int64 utf16le_atoll(uint16 *s, int len);
300
301  inline int utf16le_lltoa(int64 v, uint16 *s, int size);
302
303  LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[],
304                        uint16 lemma_len, uint16 count, uint64 lmt);
305
306  unsigned _get_lpis(const uint16 *splid_str, uint16 splid_str_len,
307                   LmaPsbItem *lpi_items, unsigned lpi_max, bool * need_extend);
308
309  int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
310
311  int _get_lemma_score(LemmaIdType lemma_id);
312
313  int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1,
314                               const UserDictSearchable *searchable);
315
316  bool is_prefix_spell_id(const uint16 * fullids,
317                          uint16 fulllen, const UserDictSearchable *searchable);
318
319  uint32 get_dict_file_size(UserDictInfo * info);
320
321  bool reset(const char *file);
322
323  bool validate(const char *file);
324
325  bool load(const char *file, LemmaIdType start_id);
326
327  bool is_valid_state();
328
329  bool is_valid_lemma_id(LemmaIdType id);
330
331  LemmaIdType get_max_lemma_id();
332
333  void set_lemma_flag(uint32 offset, uint8 flag);
334
335  char get_lemma_flag(uint32 offset);
336
337  char get_lemma_nchar(uint32 offset);
338
339  uint16 * get_lemma_spell_ids(uint32 offset);
340
341  uint16 * get_lemma_word(uint32 offset);
342
343  // Prepare searchable to fasten locate process
344  void prepare_locate(UserDictSearchable *searchable,
345                      const uint16 * splids, uint16 len);
346
347  // Compare initial letters only
348  int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1,
349                               const UserDictSearchable *searchable);
350
351  // Compare exactly two spell ids
352  // First argument must be a full id spell id
353  bool equal_spell_id(const uint16 * fullids,
354                      uint16 fulllen, const UserDictSearchable *searchable);
355
356  // Find first item by initial letters
357  int32 locate_first_in_offsets(const UserDictSearchable *searchable);
358
359  LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[],
360                           uint16 lemma_len, uint16 count, uint64 lmt);
361
362  // Check if a lemma is in dictionary
363  int32 locate_in_offsets(char16 lemma_str[],
364                          uint16 splid_str[], uint16 lemma_len);
365
366  bool remove_lemma_by_offset_index(int offset_index);
367#ifdef ___PREDICT_ENABLED___
368  uint32 locate_where_to_insert_in_predicts(const uint16 * words,
369                                            int lemma_len);
370
371  int32 locate_first_in_predicts(const uint16 * words, int lemma_len);
372
373  void remove_lemma_from_predict_list(uint32 offset);
374#endif
375#ifdef ___SYNC_ENABLED___
376  void queue_lemma_for_sync(LemmaIdType id);
377
378  void remove_lemma_from_sync_list(uint32 offset);
379
380  void write_back_sync(int fd);
381#endif
382  void write_back_score(int fd);
383  void write_back_offset(int fd);
384  void write_back_lemma(int fd);
385  void write_back_all(int fd);
386  void write_back();
387
388  struct UserDictScoreOffsetPair {
389    int score;
390    uint32 offset_index;
391  };
392
393  inline void swap(UserDictScoreOffsetPair * sop, int i, int j);
394
395  void shift_down(UserDictScoreOffsetPair * sop, int i, int n);
396
397  // On-disk format for each lemma
398  // +-------------+
399  // | Version (4) |
400  // +-------------+
401  // +-----------+-----------+--------------------+-------------------+
402  // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
403  // +-----------+-----------+--------------------+-------------------+
404  // ...
405  // +-----------------------+     +-------------+      <---Offset of offset
406  // | Offset1 by_splids (4) | ... | OffsetN (4) |
407  // +-----------------------+     +-------------+
408#ifdef ___PREDICT_ENABLED___
409  // +----------------------+     +-------------+
410  // | Offset1 by_lemma (4) | ... | OffsetN (4) |
411  // +----------------------+     +-------------+
412#endif
413  // +------------+     +------------+
414  // | Score1 (4) | ... | ScoreN (4) |
415  // +------------+     +------------+
416#ifdef ___SYNC_ENABLED___
417  // +-------------+     +-------------+
418  // | NewAdd1 (4) | ... | NewAddN (4) |
419  // +-------------+     +-------------+
420#endif
421  // +----------------+
422  // | Dict Info (4x) |
423  // +----------------+
424};
425}
426
427#endif