/src/im/gpinyin/include/dicttrie.h

http://ftk.googlecode.com/ · C++ Header · 233 lines · 108 code · 51 blank · 74 comment · 0 complexity · d7a90675ef774efafc4cb965fb86a0ab MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef PINYINIME_INCLUDE_DICTTRIE_H__
  17. #define PINYINIME_INCLUDE_DICTTRIE_H__
  18. #include <stdlib.h>
  19. #include "./atomdictbase.h"
  20. #include "./dictdef.h"
  21. #include "./dictlist.h"
  22. #include "./searchutility.h"
  23. namespace ime_pinyin {
      // DictTrie: a dictionary organized as a trie, derived from AtomDictBase.
      // The public section declares dictionary loading (load_dict, load_dict_fd),
      // step-by-step extension identified by mile stone handles (extend_dict,
      // reset_milestones), lookup by spelling ids (get_lpis), lemma string and
      // spelling retrieval, and prediction.
      // NOTE(review): the base class is listed without an access specifier, so
      // for a `class` this is private inheritance -- confirm that is intended.
  24. class DictTrie : AtomDictBase {
  25. private:
      // A parsing mark describes a group of node_num nodes. The two fields are
      // bit-fields of 24 and 8 bits, so node_num cannot exceed 255.
      // node_offset is presumably the offset of the first node of the group --
      // TODO confirm against the implementation.
  26. struct ParsingMark {
  27. unsigned node_offset:24;
  28. unsigned node_num:8; // Number of nodes with this spelling id given
  29. // by spl_id. If spl_id is a Shengmu, for nodes
  30. // in the first layer of DictTrie, it equals
  31. // SpellingTrie::shm2full_num(); but for those
  32. // nodes which are not in the first layer,
  33. // node_num < SpellingTrie::shm2full_num().
  34. // For a full spelling id, node_num = 1;
  35. };
  36. // Used to indicate an extended mile stone.
  37. // An extended mile stone is used to mark a partial match in the dictionary
  38. // trie to speed up further potential extending.
  39. // For example, when the user inputs "w", a mile stone is created to mark the
  40. // partial match status, so that when the user inputs another char 'm', it
  41. // will be faster to extend the search space based on this mile stone.
  42. //
  43. // For partial match status of "wm", there can be more than one sub mile
  44. // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so
  45. // there may be more than one parsing mark used to mark these partial matches.
  46. // A mile stone records the starting position in the mark list and the number
  47. // of marks.
  48. struct MileStone {
  49. uint16 mark_start;
  50. uint16 mark_num;
  51. };
  52. DictList* dict_list_;
  53. const SpellingTrie *spl_trie_;
  54. LmaNodeLE0* root_; // Nodes for root and the first layer.
  55. LmaNodeGE1* nodes_ge1_; // Nodes for other layers.
  56. // A quick index from spelling id to the LmaNodeLE0 node buffer, or
  57. // to the root_ buffer.
  58. // Index length:
  59. // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used
  60. // to get the end.
  61. // All Shengmu ids are not indexed because they will be converted into
  62. // corresponding full ids.
  63. // So, given an id splid, the son is:
  64. // root_[splid_le0_index_[splid - kFullSplIdStart]]
  65. uint16 *splid_le0_index_;
  66. unsigned lma_node_num_le0_; // Presumably the LmaNodeLE0 node count -- confirm.
  67. unsigned lma_node_num_ge1_; // Presumably the LmaNodeGE1 node count -- confirm.
  68. // The first part is for homophones, and the last top_lmas_num_ items are
  69. // lemmas with the highest scores.
  70. unsigned char *lma_idx_buf_;
  71. unsigned lma_idx_buf_len_; // The total size of lma_idx_buf_ in bytes.
  72. unsigned total_lma_num_; // Total number of lemmas in this dictionary.
  73. unsigned top_lmas_num_; // Number of lemmas with the highest scores.
  74. // Parsing mark list used to mark the detailed extended statuses.
  75. ParsingMark *parsing_marks_;
  76. // The position for the next available mark.
  77. uint16 parsing_marks_pos_;
  78. // Mile stone list used to mark the extended status.
  79. MileStone *mile_stones_;
  80. // The position for the next available mile stone. We use positions (except 0)
  81. // as handles.
  82. MileStoneHandle mile_stones_pos_;
  83. // Get the offset of sons for a node.
  84. inline unsigned get_son_offset(const LmaNodeGE1 *node);
  85. // Get the offset of homophone ids for a node.
  86. inline unsigned get_homo_idx_buf_offset(const LmaNodeGE1 *node);
  87. // Get the lemma id by the offset.
  88. inline LemmaIdType get_lemma_id(unsigned id_offset);
      // Release resources held by this object; free_dict_list presumably selects
      // whether dict_list_ is released as well -- TODO confirm.
  89. void free_resource(bool free_dict_list);
      // Load the dictionary from an already opened stream.
      // NOTE(review): FILE is declared in <stdio.h>, but this header only
      // includes <stdlib.h> directly; it presumably relies on one of the project
      // headers to provide the declaration -- confirm.
  90. bool load_dict(FILE *fp);
  91. // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
  92. // them into the lpi_items buffer.
  93. // This function is called by the search engine.
  94. unsigned fill_lpi_buffer(LmaPsbItem lpi_items[], unsigned max_size,
  95. LmaNodeLE0 *node);
  96. // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
  97. // them into the lpi_items buffer.
  98. // This function is called by inner functions extend_dict0(), extend_dict1()
  99. // and extend_dict2().
  100. unsigned fill_lpi_buffer(LmaPsbItem lpi_items[], unsigned max_size,
  101. unsigned homo_buf_off, LmaNodeGE1 *node,
  102. uint16 lma_len);
  103. // Extend in the trie from level 0.
  104. MileStoneHandle extend_dict0(MileStoneHandle from_handle,
  105. const DictExtPara *dep, LmaPsbItem *lpi_items,
  106. unsigned lpi_max, unsigned *lpi_num);
  107. // Extend in the trie from level 1.
  108. MileStoneHandle extend_dict1(MileStoneHandle from_handle,
  109. const DictExtPara *dep, LmaPsbItem *lpi_items,
  110. unsigned lpi_max, unsigned *lpi_num);
  111. // Extend in the trie from level 2.
  112. MileStoneHandle extend_dict2(MileStoneHandle from_handle,
  113. const DictExtPara *dep, LmaPsbItem *lpi_items,
  114. unsigned lpi_max, unsigned *lpi_num);
  115. // Try to extend the given spelling id buffer, and if the given id_lemma can
  116. // be successfully gotten, return true;
  117. // The given spelling ids are all valid full ids.
  118. bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);
       // Stream-based save; only declared when ___BUILD_MODEL___ is defined.
  119. #ifdef ___BUILD_MODEL___
  120. bool save_dict(FILE *fp);
  121. #endif // ___BUILD_MODEL___
       // Presumably the capacities of mile_stones_ and parsing_marks_ -- TODO
       // confirm where the buffers are allocated.
  122. static const int kMaxMileStone = 100;
  123. static const int kMaxParsingMark = 600;
       // Handle value 1 is the first valid one; position 0 is not used as a
       // handle (see the comment on mile_stones_pos_).
  124. static const MileStoneHandle kFirstValidMileStoneHandle = 1;
       // These classes get access to the private members declared above.
  125. friend class DictParser;
  126. friend class DictBuilder;
  127. public:
  128. DictTrie();
  129. ~DictTrie();
  130. #ifdef ___BUILD_MODEL___
  131. // Construct the tree from the file fn_raw.
  132. // fn_validhzs provides the valid hanzi list. If fn_validhzs is
  133. // NULL, only chars in GB2312 will be included.
  134. bool build_dict(const char *fn_raw, const char *fn_validhzs);
  135. // Save the binary dictionary.
  136. // Actually, the SpellingTrie/DictList instance will also be saved.
  137. bool save_dict(const char *filename);
  138. #endif // ___BUILD_MODEL___
       // Both take a mutable buffer of str_len char16 items; presumably they
       // convert it in place between hanzi and an internal id form -- TODO
       // confirm in the implementation.
  139. void convert_to_hanzis(char16 *str, uint16 str_len);
  140. void convert_to_scis_ids(char16 *str, uint16 str_len);
  141. // Load a binary dictionary.
  142. // The SpellingTrie instance/DictList will also be loaded.
       // start_id and end_id presumably bound the lemma ids used by this
       // dictionary -- TODO confirm.
  143. bool load_dict(const char *filename, LemmaIdType start_id,
  144. LemmaIdType end_id);
       // Variant of load_dict taking a file descriptor plus start_offset and
       // length, presumably the region of the file to read -- TODO confirm.
  145. bool load_dict_fd(int sys_fd, long start_offset, long length,
  146. LemmaIdType start_id, LemmaIdType end_id);
       // Inline stub: always reports success.
  147. bool close_dict() {return true;}
       // Inline stub: always returns 0.
  148. unsigned number_of_lemmas() {return 0;}
       // Declared only; presumably rolls the mile stone list back to from_handle
       // -- TODO confirm.
  149. void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
       // Public extension entry point. It has the same parameter list as the
       // private extend_dict0/1/2; how it relates to them is defined in the
       // implementation, not in this header.
  150. MileStoneHandle extend_dict(MileStoneHandle from_handle,
  151. const DictExtPara *dep,
  152. LmaPsbItem *lpi_items,
  153. unsigned lpi_max, unsigned *lpi_num);
       // Look up items for a spelling id string of splid_str_len ids and write
       // them to lpi_items (room for lpi_max items); the return value is
       // presumably the number of items written -- TODO confirm.
  154. unsigned get_lpis(const uint16 *splid_str, uint16 splid_str_len,
  155. LmaPsbItem *lpi_items, unsigned lpi_max);
       // Retrieve the string of lemma id_lemma into str_buf (room for str_max
       // items); the return value is presumably the length -- TODO confirm.
  156. uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
       // Retrieve the spelling ids of lemma id_lemma into splids (room for
       // splids_max items). The meaning of arg_valid is not visible here.
  157. uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
  158. uint16 splids_max, bool arg_valid);
       // Prediction based on the last hzs_len characters in last_hzs; results go
       // to npre_items (room for npre_max items). b4_used is presumably the
       // number of items already used before this call -- TODO confirm.
  159. unsigned predict(const char16 *last_hzs, uint16 hzs_len,
  160. NPredictItem *npre_items, unsigned npre_max,
  161. unsigned b4_used);
       // The inline bodies below ignore their arguments and return a constant
       // (0 or false) or do nothing. Presumably these are operations this
       // dictionary does not support -- TODO confirm against AtomDictBase.
  162. LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
  163. uint16 lemma_len, uint16 count) {return 0;}
  164. LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
  165. bool selected) {return 0;}
  166. LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
  167. uint16 lemma_len) {return 0;}
  168. LmaScoreType get_lemma_score(LemmaIdType lemma_id) {return 0;}
  169. LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
  170. uint16 lemma_len) {return 0;}
  171. bool remove_lemma(LemmaIdType lemma_id) {return false;}
  172. unsigned get_total_lemma_count() {return 0;}
       // Declared only; defined in the implementation file.
  173. void set_total_lemma_count_of_others(unsigned count);
  174. void flush_cache() {}
       // Unlike the three-argument get_lemma_id stub above, this overload is
       // only declared here and takes no spelling ids.
  175. LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);
  176. // Fill the lemmas with the highest scores into the prediction buffer.
  177. // his_len is the history length to fill in the prediction buffer.
  178. unsigned predict_top_lmas(unsigned his_len, NPredictItem *npre_items,
  179. unsigned npre_max, unsigned b4_used);
  180. };
  181. } // namespace ime_pinyin
  182. #endif // PINYINIME_INCLUDE_DICTTRIE_H__