/src/im/gpinyin/include/spellingtrie.h

http://ftk.googlecode.com/ · C++ Header · 258 lines · 95 code · 59 blank · 104 comment · 8 complexity · d5a3845f9ac80951346be6626dc67297 MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__
  17. #define PINYINIME_INCLUDE_SPELLINGTRIE_H__
  18. #include <stdio.h>
  19. #include <stdlib.h>
  20. #include "./dictdef.h"
  21. namespace ime_pinyin {
  22. static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1;  // First full spelling id; ids in [1, kFullSplIdStart - 1] are half ids. kHalfSpellingIdNum is presumably declared in dictdef.h - confirm.
  23. // Node of the spelling trie (one node per character of a spelling).
  24. struct SpellingNode {
  25. SpellingNode *first_son;  // First son of this node; num_of_son tells how many sons there are (presumably stored contiguously - confirm in the .cpp).
  26. // The spelling id for each node. If you need more bits to store the
  27. // spelling id, please adjust this structure.
  28. uint16 spelling_idx:11;  // 11-bit field: holds values 0..2047.
  29. uint16 num_of_son:5;  // 5-bit field: at most 31 sons per node.
  30. char char_this_node;  // The character this node stands for.
  31. unsigned char score;  // Score of this node; per the SpellingTrie comments, a lower score means a higher probability.
  32. };
  33. class SpellingTrie {  // Trie of Pinyin spellings. The constructor is private; use get_instance()/get_cpinstance().
  34. private:
  35. static const int kMaxYmNum = 64;  // Presumably the upper bound on the number of Yunmus - confirm in the .cpp.
  36. static const unsigned kValidSplCharNum = 26;  // Number of slots in level1_sons_ (presumably one per letter A..Z).
  37. static const uint16 kHalfIdShengmuMask = 0x01;  // Bit flag; presumably stored in char_flags_ - confirm.
  38. static const uint16 kHalfIdYunmuMask = 0x02;  // Bit flag; presumably stored in char_flags_ - confirm.
  39. static const uint16 kHalfIdSzmMask = 0x04;  // Bit flag; presumably stored in char_flags_ - confirm.
  40. // Map from half spelling id to single char.
  41. // For half ids of Zh/Ch/Sh, map to z/c/s (lower case) respectively.
  42. // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ...,
  43. // 28 to 'Z', 29 to 'z'.
  44. // [0] is not used, to achieve better efficiency.
  45. static const char kHalfId2Sc_[kFullSplIdStart + 1];
  46. static unsigned char char_flags_[];  // Presumably per-character flags built from the kHalfId*Mask bits - TODO confirm.
  47. static SpellingTrie* instance_;  // Presumably the instance returned by get_instance()/get_cpinstance() - confirm in the .cpp.
  48. // The spelling table
  49. char *spelling_buf_;
  50. // The size of the longest spelling string, including '\0' and an extra char to
  51. // store the score. For example, "zhuang" is the longest item in the Pinyin list,
  52. // so spelling_size_ is 8.
  53. // Structure: The string ended with '\0' + score char.
  54. // An item with a lower score has a higher probability.
  55. unsigned spelling_size_;
  56. // Number of full spelling ids.
  57. unsigned spelling_num_;
  58. float score_amplifier_;  // Presumably the score_amplifier passed to construct(): converts a probability value into a score.
  59. unsigned char average_score_;  // Presumably the average_score passed to construct(): average score of all spellings.
  60. // The Yunmu id list for the spelling ids (for half ids of Shengmu,
  61. // the Yunmu id is 0).
  62. // The length of the list is spelling_num_ + kFullSplIdStart,
  63. // so that spl_ym_ids_[splid] is the Yunmu id of the splid.
  64. uint8 *spl_ym_ids_;
  65. // The Yunmu table.
  66. // Each Yunmu is assigned a Yunmu id, starting from 1.
  67. char *ym_buf_;
  68. unsigned ym_size_; // The size of the longest Yunmu string, '\0' included.
  69. unsigned ym_num_;  // Number of Yunmus (presumably the number of entries in ym_buf_ - confirm).
  70. // The spelling string just queried
  71. char *splstr_queried_;
  72. // The spelling string just queried, as a char16 string
  73. char16 *splstr16_queried_;
  74. // The root node of the spelling tree
  75. SpellingNode* root_;
  76. // If a non-QWERTY key, such as a function key like ENTER, is given, this node
  77. // will be used to indicate that this is not a QWERTY node.
  78. SpellingNode* dumb_node_;
  79. // If a splitter key is pressed, this node will be used to indicate that this
  80. // is a splitter key.
  81. SpellingNode* splitter_node_;
  82. // Used to get the first level sons.
  83. SpellingNode* level1_sons_[kValidSplCharNum];
  84. // The full spl_id range for a specific half id.
  85. // h2f means half to full.
  86. // A half id can be a ShouZiMu id (id to represent the first char of a full
  87. // spelling, including Shengmu and Yunmu), or id of zh/ch/sh.
  88. // [1..kFullSplIdStart-1] is the range of half ids.
  89. uint16 h2f_start_[kFullSplIdStart];
  90. uint16 h2f_num_[kFullSplIdStart];
  91. // Map from full id to half id.
  92. uint16 *f2h_;
  93. #ifdef ___BUILD_MODEL___
  94. // How many nodes are used to build the trie.
  95. unsigned node_num_;
  96. #endif
  97. SpellingTrie();  // Private constructor.
  98. void free_son_trie(SpellingNode* node);  // Presumably frees the subtree below node - confirm in the .cpp.
  99. // Construct a subtree using a subset of the spelling array (from
  100. // item_start to item_end).
  101. // Members spelling_buf_ and spelling_size_ should be valid.
  102. // parent is used to update its num_of_son and score.
  103. SpellingNode* construct_spellings_subset(unsigned item_start, unsigned item_end,
  104. unsigned level, SpellingNode *parent);
  105. bool build_f2h();  // Presumably builds f2h_ (the full id to half id map) and returns false on failure - confirm.
  106. // The caller should guarantee ch >= 'A' && ch <= 'Z'
  107. bool is_shengmu_char(char ch) const;
  108. // The caller should guarantee ch >= 'A' && ch <= 'Z'
  109. bool is_yunmu_char(char ch) const;
  110. #ifdef ___BUILD_MODEL___
  111. // Given a spelling string, return its Yunmu string.
  112. // The caller guarantees spl_str is valid.
  113. const char* get_ym_str(const char *spl_str);
  114. // Build the Yunmu list, and the mapping relation between the full ids and the
  115. // Yunmu ids. This function is called after the spelling trie is built.
  116. bool build_ym_info();
  117. #endif
  118. friend class SpellingParser;
  119. friend class SmartSplParser;
  120. friend class SmartSplParser2;
  121. public:
  122. ~SpellingTrie();
  123. inline static bool is_valid_spl_char(char ch) {  // True when ch is an ASCII letter of either case.
  124. return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
  125. }
  126. // The caller guarantees that the two chars are valid spelling chars.
  127. inline static bool is_same_spl_char(char ch1, char ch2) {
  128. return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A';  // Equal, or differing exactly by the ASCII upper/lower case offset.
  129. }
  130. // Construct the tree from the input pinyin array.
  131. // The given string list should have been sorted.
  132. // score_amplifier is used to convert a probability value into a score.
  133. // average_score is the average score of all spellings. The dumb node is
  134. // assigned this score.
  135. bool construct(const char* spelling_arr, unsigned item_size, unsigned item_num,
  136. float score_amplifier, unsigned char average_score);
  137. // Test if the given id is a valid spelling id.
  138. // If the function returns true, the given splid may be updated like this:
  139. // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is
  140. // first given as a half id 1, but because 'A' is a one-char Yunmu and
  141. // it is a valid id, it needs to be updated to its corresponding full id.
  142. bool if_valid_id_update(uint16 *splid) const;
  143. // Test if the given id is a half id.
  144. bool is_half_id(uint16 splid) const;
  145. bool is_full_id(uint16 splid) const;  // Test if the given id is a full id.
  146. // Test if the given id is a one-char Yunmu id (obviously, it is also a half
  147. // id), such as 'A', 'E' and 'O'.
  148. bool is_half_id_yunmu(uint16 splid) const;
  149. // Test if this char is a ShouZiMu char. This ShouZiMu char may not be enabled.
  150. // For Pinyin, only i/u/v are not ShouZiMu chars.
  151. // The caller should guarantee that ch >= 'A' && ch <= 'Z'
  152. bool is_szm_char(char ch) const;
  153. // Test if this char is enabled in ShouZiMu mode.
  154. // The caller should guarantee that ch >= 'A' && ch <= 'Z'
  155. bool szm_is_enabled(char ch) const;
  156. // Enable/disable Shengmus in ShouZiMu mode (using the first char of a spelling
  157. // to input).
  158. void szm_enable_shm(bool enable);
  159. // Enable/disable Yunmus in ShouZiMu mode.
  160. void szm_enable_ym(bool enable);
  161. // Test if this char is enabled in ShouZiMu mode.
  162. // The caller should guarantee ch >= 'A' && ch <= 'Z'
  163. bool is_szm_enabled(char ch) const;  // NOTE(review): documented contract matches szm_is_enabled() above - confirm whether both are needed.
  164. // Return the number of full ids for the given half id.
  165. uint16 half2full_num(uint16 half_id) const;
  166. // Return the number of full ids for the given half id, and fill spl_id_start
  167. // to return the first full id.
  168. uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const;
  169. // Return the corresponding half id for the given full id.
  170. // Not frequently used; inefficient.
  171. // Returns 0 on failure.
  172. uint16 full_to_half(uint16 full_id) const;
  173. // To test whether a half id is compatible with a full id.
  174. // Generally, when half_id == full_to_half(full_id), return true.
  175. // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible
  176. // with a full id like "Zhe". (Fussy mode is not ready).
  177. bool half_full_compatible(uint16 half_id, uint16 full_id) const;
  178. static const SpellingTrie* get_cpinstance();  // Returns the instance as a pointer to const.
  179. static SpellingTrie& get_instance();  // Returns the instance as a mutable reference.
  180. // Save to the file stream
  181. bool save_spl_trie(FILE *fp);
  182. // Load from the file stream
  183. bool load_spl_trie(FILE *fp);
  184. // Get the number of spellings
  185. unsigned get_spelling_num();
  186. // Return the Yunmu id for the given Yunmu string.
  187. // If the string is not valid, return 0.
  188. uint8 get_ym_id(const char* ym_str);
  189. // Get the read-only Pinyin string for a given spelling id
  190. const char* get_spelling_str(uint16 splid);
  191. // Get the read-only Pinyin string for a given spelling id (char16 version)
  192. const char16* get_spelling_str16(uint16 splid);
  193. // Get the Pinyin string for a given spelling id. Return the length of the
  194. // string, and fill in '\0' at the end.
  195. unsigned get_spelling_str16(uint16 splid, char16 *splstr16,
  196. unsigned splstr16_len);
  197. };
  198. }
  199. #endif // PINYINIME_INCLUDE_SPELLINGTRIE_H__