PageRenderTime 21ms CodeModel.GetById 2ms app.highlight 15ms RepoModel.GetById 1ms app.codeStats 0ms

/src/im/gpinyin/include/dicttrie.h

http://ftk.googlecode.com/
C++ Header | 233 lines | 108 code | 51 blank | 74 comment | 0 complexity | d7a90675ef774efafc4cb965fb86a0ab MD5 | raw file
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 16
 17#ifndef PINYINIME_INCLUDE_DICTTRIE_H__
 18#define PINYINIME_INCLUDE_DICTTRIE_H__
 19
 20#include <stdlib.h>
 21#include "./atomdictbase.h"
 22#include "./dictdef.h"
 23#include "./dictlist.h"
 24#include "./searchutility.h"
 25
 26namespace ime_pinyin {
 27
 28class DictTrie : AtomDictBase {
 29 private:
 30  struct ParsingMark {
 31    unsigned node_offset:24;
 32    unsigned node_num:8;           // Number of nodes with this spelling id given
 33                                 // by spl_id. If spl_id is a Shengmu, for nodes
 34                                 // in the first layer of DictTrie, it equals to
 35                                 // SpellingTrie::shm2full_num(); but for those
 36                                 // nodes which are not in the first layer,
 37                                 // node_num < SpellingTrie::shm2full_num().
 38                                 // For a full spelling id, node_num = 1;
 39  };
 40
 41  // Used to indicate an extended mile stone.
 42  // An extended mile stone is used to mark a partial match in the dictionary
 43  // trie to speed up further potential extending.
 44  // For example, when the user inputs "w", a mile stone is created to mark the
 45  // partial match status, so that when user inputs another char 'm', it will be
 46  // faster to extend search space based on this mile stone.
 47  //
 48  // For partial match status of "wm", there can be more than one sub mile
 49  // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so
 50  // there may be more one parsing mark used to mark these partial matchings.
 51  // A mile stone records the starting position in the mark list and number of
 52  // marks.
 53  struct MileStone {
 54    uint16 mark_start;
 55    uint16 mark_num;
 56  };
 57
 58  DictList* dict_list_;
 59
 60  const SpellingTrie *spl_trie_;
 61
 62  LmaNodeLE0* root_;        // Nodes for root and the first layer.
 63  LmaNodeGE1* nodes_ge1_;   // Nodes for other layers.
 64
 65  // An quick index from spelling id to the LmaNodeLE0 node buffer, or
 66  // to the root_ buffer.
 67  // Index length:
 68  // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used
 69  // to get the end.
 70  // All Shengmu ids are not indexed because they will be converted into
 71  // corresponding full ids.
 72  // So, given an id splid, the son is:
 73  // root_[splid_le0_index_[splid - kFullSplIdStart]]
 74  uint16 *splid_le0_index_;
 75
 76  unsigned lma_node_num_le0_;
 77  unsigned lma_node_num_ge1_;
 78
 79  // The first part is for homophnies, and the last  top_lma_num_ items are
 80  // lemmas with highest scores.
 81  unsigned char *lma_idx_buf_;
 82  unsigned lma_idx_buf_len_;  // The total size of lma_idx_buf_ in byte.
 83  unsigned total_lma_num_;    // Total number of lemmas in this dictionary.
 84  unsigned top_lmas_num_;     // Number of lemma with highest scores.
 85
 86  // Parsing mark list used to mark the detailed extended statuses.
 87  ParsingMark *parsing_marks_;
 88  // The position for next available mark.
 89  uint16 parsing_marks_pos_;
 90
 91  // Mile stone list used to mark the extended status.
 92  MileStone *mile_stones_;
 93  // The position for the next available mile stone. We use positions (except 0)
 94  // as handles.
 95  MileStoneHandle mile_stones_pos_;
 96
 97  // Get the offset of sons for a node.
 98  inline unsigned get_son_offset(const LmaNodeGE1 *node);
 99
100  // Get the offset of homonious ids for a node.
101  inline unsigned get_homo_idx_buf_offset(const LmaNodeGE1 *node);
102
103  // Get the lemma id by the offset.
104  inline LemmaIdType get_lemma_id(unsigned id_offset);
105
106  void free_resource(bool free_dict_list);
107
108  bool load_dict(FILE *fp);
109
110  // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
111  // them into the lpi_items buffer.
112  // This function is called by the search engine.
113  unsigned fill_lpi_buffer(LmaPsbItem lpi_items[], unsigned max_size,
114                         LmaNodeLE0 *node);
115
116  // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
117  // them into the lpi_items buffer.
118  // This function is called by inner functions extend_dict0(), extend_dict1()
119  // and extend_dict2().
120  unsigned fill_lpi_buffer(LmaPsbItem lpi_items[], unsigned max_size,
121                         unsigned homo_buf_off, LmaNodeGE1 *node,
122                         uint16 lma_len);
123
124  // Extend in the trie from level 0.
125  MileStoneHandle extend_dict0(MileStoneHandle from_handle,
126                               const DictExtPara *dep, LmaPsbItem *lpi_items,
127                               unsigned lpi_max, unsigned *lpi_num);
128
129  // Extend in the trie from level 1.
130  MileStoneHandle extend_dict1(MileStoneHandle from_handle,
131                               const DictExtPara *dep, LmaPsbItem *lpi_items,
132                               unsigned lpi_max, unsigned *lpi_num);
133
134  // Extend in the trie from level 2.
135  MileStoneHandle extend_dict2(MileStoneHandle from_handle,
136                               const DictExtPara *dep, LmaPsbItem *lpi_items,
137                               unsigned lpi_max, unsigned *lpi_num);
138
139  // Try to extend the given spelling id buffer, and if the given id_lemma can
140  // be successfully gotten, return true;
141  // The given spelling ids are all valid full ids.
142  bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);
143
144#ifdef ___BUILD_MODEL___
145  bool save_dict(FILE *fp);
146#endif  // ___BUILD_MODEL___
147
148  static const int kMaxMileStone = 100;
149  static const int kMaxParsingMark = 600;
150  static const MileStoneHandle kFirstValidMileStoneHandle = 1;
151
152  friend class DictParser;
153  friend class DictBuilder;
154
155 public:
156
157  DictTrie();
158  ~DictTrie();
159
160#ifdef ___BUILD_MODEL___
161  // Construct the tree from the file fn_raw.
162  // fn_validhzs provide the valid hanzi list. If fn_validhzs is
163  // NULL, only chars in GB2312 will be included.
164  bool build_dict(const char *fn_raw, const char *fn_validhzs);
165
166  // Save the binary dictionary
167  // Actually, the SpellingTrie/DictList instance will be also saved.
168  bool save_dict(const char *filename);
169#endif  // ___BUILD_MODEL___
170
171  void convert_to_hanzis(char16 *str, uint16 str_len);
172
173  void convert_to_scis_ids(char16 *str, uint16 str_len);
174
175  // Load a binary dictionary
176  // The SpellingTrie instance/DictList will be also loaded
177  bool load_dict(const char *filename, LemmaIdType start_id,
178                 LemmaIdType end_id);
179  bool load_dict_fd(int sys_fd, long start_offset, long length,
180                    LemmaIdType start_id, LemmaIdType end_id);
181  bool close_dict() {return true;}
182  unsigned number_of_lemmas() {return 0;}
183
184  void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
185
186  MileStoneHandle extend_dict(MileStoneHandle from_handle,
187                              const DictExtPara *dep,
188                              LmaPsbItem *lpi_items,
189                              unsigned lpi_max, unsigned *lpi_num);
190
191  unsigned get_lpis(const uint16 *splid_str, uint16 splid_str_len,
192                  LmaPsbItem *lpi_items, unsigned lpi_max);
193
194  uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
195
196  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
197                          uint16 splids_max, bool arg_valid);
198
199  unsigned predict(const char16 *last_hzs, uint16 hzs_len,
200                 NPredictItem *npre_items, unsigned npre_max,
201                 unsigned b4_used);
202
203  LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
204                        uint16 lemma_len, uint16 count) {return 0;}
205
206  LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
207                           bool selected) {return 0;}
208
209  LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
210                           uint16 lemma_len) {return 0;}
211
212  LmaScoreType get_lemma_score(LemmaIdType lemma_id) {return 0;}
213
214  LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
215                        uint16 lemma_len) {return 0;}
216
217  bool remove_lemma(LemmaIdType lemma_id) {return false;}
218
219  unsigned get_total_lemma_count() {return 0;}
220  void set_total_lemma_count_of_others(unsigned count);
221
222  void flush_cache() {}
223
224  LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);
225
226  // Fill the lemmas with highest scores to the prediction buffer.
227  // his_len is the history length to fill in the prediction buffer.
228  unsigned predict_top_lmas(unsigned his_len, NPredictItem *npre_items,
229                          unsigned npre_max, unsigned b4_used);
230};
231}
232
233#endif  // PINYINIME_INCLUDE_DICTTRIE_H__