PageRenderTime 491ms CodeModel.GetById 195ms app.highlight 14ms RepoModel.GetById 202ms app.codeStats 0ms

/src/im/gpinyin/include/spellingtrie.h

http://ftk.googlecode.com/
C++ Header | 258 lines | 95 code | 59 blank | 104 comment | 8 complexity | d5a3845f9ac80951346be6626dc67297 MD5 | raw file
  1/*
  2 * Copyright (C) 2009 The Android Open Source Project
  3 *
  4 * Licensed under the Apache License, Version 2.0 (the "License");
  5 * you may not use this file except in compliance with the License.
  6 * You may obtain a copy of the License at
  7 *
  8 *      http://www.apache.org/licenses/LICENSE-2.0
  9 *
 10 * Unless required by applicable law or agreed to in writing, software
 11 * distributed under the License is distributed on an "AS IS" BASIS,
 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 * See the License for the specific language governing permissions and
 14 * limitations under the License.
 15 */
 16
 17#ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__
 18#define PINYINIME_INCLUDE_SPELLINGTRIE_H__
 19
 20#include <stdio.h>
 21#include <stdlib.h>
 22#include "./dictdef.h"
 23
 24namespace ime_pinyin {
 25
 26static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1;
 27
 28// Node used for the trie of spellings
 29struct SpellingNode {
 30  SpellingNode *first_son;
 31  // The spelling id for each node. If you need more bits to store
 32  // spelling id, please adjust this structure.
 33  uint16 spelling_idx:11;
 34  uint16  num_of_son:5;
 35  char char_this_node;
 36  unsigned char score;
 37};
 38
 39class SpellingTrie {
 40 private:
 41  static const int kMaxYmNum = 64;
 42  static const unsigned kValidSplCharNum = 26;
 43
 44  static const uint16 kHalfIdShengmuMask = 0x01;
 45  static const uint16 kHalfIdYunmuMask = 0x02;
 46  static const uint16 kHalfIdSzmMask = 0x04;
 47
 48  // Map from half spelling id to single char.
 49  // For half ids of Zh/Ch/Sh, map to z/c/s (low case) respectively.
 50  // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ...,
 51  // 28 to 'Z', 29 to 'z'.
 52  // [0] is not used to achieve better efficiency.
 53  static const char kHalfId2Sc_[kFullSplIdStart + 1];
 54
 55  static unsigned char char_flags_[];
 56  static SpellingTrie* instance_;
 57
 58  // The spelling table
 59  char *spelling_buf_;
 60
 61  // The size of longest spelling string, includes '\0' and an extra char to
 62  // store score. For example, "zhuang" is the longgest item in Pinyin list,
 63  // so spelling_size_ is 8.
 64  // Structure: The string ended with '\0' + score char.
 65  // An item with a lower score has a higher probability.
 66  unsigned spelling_size_;
 67
 68  // Number of full spelling ids.
 69  unsigned spelling_num_;
 70
 71  float score_amplifier_;
 72  unsigned char average_score_;
 73
 74  // The Yunmu id list for the spelling ids (for half ids of Shengmu,
 75  // the Yunmu id is 0).
 76  // The length of the list is spelling_num_ + kFullSplIdStart,
 77  // so that spl_ym_ids_[splid] is the Yunmu id of the splid.
 78  uint8 *spl_ym_ids_;
 79
 80  // The Yunmu table.
 81  // Each Yunmu will be assigned with Yunmu id from 1.
 82  char *ym_buf_;
 83  unsigned ym_size_;  // The size of longest Yunmu string, '\0'included.
 84  unsigned ym_num_;
 85
 86  // The spelling string just queried
 87  char *splstr_queried_;
 88
 89  // The spelling string just queried
 90  char16 *splstr16_queried_;
 91
 92  // The root node of the spelling tree
 93  SpellingNode* root_;
 94
 95  // If a none qwerty key such as a fnction key like ENTER is given, this node
 96  // will be used to indicate that this is not a QWERTY node.
 97  SpellingNode* dumb_node_;
 98
 99  // If a splitter key is pressed, this node will be used to indicate that this
100  // is a splitter key.
101  SpellingNode* splitter_node_;
102
103  // Used to get the first level sons.
104  SpellingNode* level1_sons_[kValidSplCharNum];
105
106  // The full spl_id range for specific half id.
107  // h2f means half to full.
108  // A half id can be a ShouZiMu id (id to represent the first char of a full
109  // spelling, including Shengmu and Yunmu), or id of zh/ch/sh.
110  // [1..kFullSplIdStart-1] is the arrange of half id.
111  uint16 h2f_start_[kFullSplIdStart];
112  uint16 h2f_num_[kFullSplIdStart];
113
114  // Map from full id to half id.
115  uint16 *f2h_;
116
117#ifdef ___BUILD_MODEL___
118  // How many node used to build the trie.
119  unsigned node_num_;
120#endif
121
122  SpellingTrie();
123
124  void free_son_trie(SpellingNode* node);
125
126  // Construct a subtree using a subset of the spelling array (from
127  // item_star to item_end).
128  // Member spelliing_buf_ and spelling_size_ should be valid.
129  // parent is used to update its num_of_son and score.
130  SpellingNode* construct_spellings_subset(unsigned item_start, unsigned item_end,
131                                           unsigned level, SpellingNode *parent);
132  bool build_f2h();
133
134  // The caller should guarantee ch >= 'A' && ch <= 'Z'
135  bool is_shengmu_char(char ch) const;
136
137  // The caller should guarantee ch >= 'A' && ch <= 'Z'
138  bool is_yunmu_char(char ch) const;
139
140#ifdef ___BUILD_MODEL___
141  // Given a spelling string, return its Yunmu string.
142  // The caller guaratees spl_str is valid.
143  const char* get_ym_str(const char *spl_str);
144
145  // Build the Yunmu list, and the mapping relation between the full ids and the
146  // Yunmu ids. This functin is called after the spelling trie is built.
147  bool build_ym_info();
148#endif
149
150  friend class SpellingParser;
151  friend class SmartSplParser;
152  friend class SmartSplParser2;
153
154 public:
155  ~SpellingTrie();
156
157  inline static bool is_valid_spl_char(char ch) {
158    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
159  }
160
161  // The caller guarantees that the two chars are valid spelling chars.
162  inline static bool is_same_spl_char(char ch1, char ch2) {
163    return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A';
164  }
165
166  // Construct the tree from the input pinyin array
167  // The given string list should have been sorted.
168  // score_amplifier is used to convert a possibility value into score.
169  // average_score is the average_score of all spellings. The dumb node is
170  // assigned with this score.
171  bool construct(const char* spelling_arr, unsigned item_size, unsigned item_num,
172                 float score_amplifier, unsigned char average_score);
173
174  // Test if the given id is a valid spelling id.
175  // If function returns true, the given splid may be updated like this:
176  // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is
177  // first given as a half id 1, but because 'A' is a one-char Yunmu and
178  // it is a valid id, it needs to updated to its corresponding full id.
179  bool if_valid_id_update(uint16 *splid) const;
180
181  // Test if the given id is a half id.
182  bool is_half_id(uint16 splid) const;
183
184  bool is_full_id(uint16 splid) const;
185
186  // Test if the given id is a one-char Yunmu id (obviously, it is also a half
187  // id), such as 'A', 'E' and 'O'.
188  bool is_half_id_yunmu(uint16 splid) const;
189
190  // Test if this char is a ShouZiMu char. This ShouZiMu char may be not enabled.
191  // For Pinyin, only i/u/v is not a ShouZiMu char.
192  // The caller should guarantee that ch >= 'A' && ch <= 'Z'
193  bool is_szm_char(char ch) const;
194
195  // Test If this char is enabled in ShouZiMu mode.
196  // The caller should guarantee that ch >= 'A' && ch <= 'Z'
197  bool szm_is_enabled(char ch) const;
198
199  // Enable/disable Shengmus in ShouZiMu mode(using the first char of a spelling
200  // to input).
201  void szm_enable_shm(bool enable);
202
203  // Enable/disable Yunmus in ShouZiMu mode.
204  void szm_enable_ym(bool enable);
205
206  // Test if this char is enabled in ShouZiMu mode.
207  // The caller should guarantee ch >= 'A' && ch <= 'Z'
208  bool is_szm_enabled(char ch) const;
209
210  // Return the number of full ids for the given half id.
211  uint16 half2full_num(uint16 half_id) const;
212
213  // Return the number of full ids for the given half id, and fill spl_id_start
214  // to return the first full id.
215  uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const;
216
217  // Return the corresponding half id for the given full id.
218  // Not frequently used, low efficient.
219  // Return 0 if fails.
220  uint16 full_to_half(uint16 full_id) const;
221
222  // To test whether a half id is compatible with a full id.
223  // Generally, when half_id == full_to_half(full_id), return true.
224  // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible
225  // with a full id like "Zhe". (Fussy mode is not ready).
226  bool half_full_compatible(uint16 half_id, uint16 full_id) const;
227
228  static const SpellingTrie* get_cpinstance();
229
230  static SpellingTrie& get_instance();
231
232  // Save to the file stream
233  bool save_spl_trie(FILE *fp);
234
235  // Load from the file stream
236  bool load_spl_trie(FILE *fp);
237
238  // Get the number of spellings
239  unsigned get_spelling_num();
240
241  // Return the Yunmu id for the given Yunmu string.
242  // If the string is not valid, return 0;
243  uint8 get_ym_id(const char* ym_str);
244
245  // Get the readonly Pinyin string for a given spelling id
246  const char* get_spelling_str(uint16 splid);
247
248  // Get the readonly Pinyin string for a given spelling id
249  const char16* get_spelling_str16(uint16 splid);
250
251  // Get Pinyin string for a given spelling id. Return the length of the
252  // string, and fill-in '\0' at the end.
253  unsigned get_spelling_str16(uint16 splid, char16 *splstr16,
254                            unsigned splstr16_len);
255};
256}
257
258#endif  // PINYINIME_INCLUDE_SPELLINGTRIE_H__