PageRenderTime 570ms CodeModel.GetById 212ms app.highlight 153ms RepoModel.GetById 199ms app.codeStats 0ms

/src/im/gpinyin/include/matrixsearch.h

http://ftk.googlecode.com/
C++ Header | 456 lines | 152 code | 85 blank | 219 comment | 0 complexity | c0d25ecd3e7b3b40d36f859f8d8dc6e1 MD5 | raw file
  1/*
  2 * Copyright (C) 2009 The Android Open Source Project
  3 *
  4 * Licensed under the Apache License, Version 2.0 (the "License");
  5 * you may not use this file except in compliance with the License.
  6 * You may obtain a copy of the License at
  7 *
  8 *      http://www.apache.org/licenses/LICENSE-2.0
  9 *
 10 * Unless required by applicable law or agreed to in writing, software
 11 * distributed under the License is distributed on an "AS IS" BASIS,
 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 * See the License for the specific language governing permissions and
 14 * limitations under the License.
 15 */
 16
 17#ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
 18#define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
 19
 20#include <stdlib.h>
 21#include "./atomdictbase.h"
 22#include "./dicttrie.h"
 23#include "./searchutility.h"
 24#include "./spellingtrie.h"
 25#include "./splparser.h"
 26
 27namespace ime_pinyin {
 28
 29static const unsigned kMaxRowNum = kMaxSearchSteps;
 30
 31typedef struct {
 32  // MileStoneHandle objects for the system and user dictionaries.
 33  MileStoneHandle dict_handles[2];
 34  // From which DMI node. -1 means it's from root.
 35  PoolPosType dmi_fr;
 36  // The spelling id for the Pinyin string from the previous DMI to this node.
 37  // If it is a half id like Shengmu, the node pointed by dict_node is the first
 38  // node with this Shengmu,
 39  uint16 spl_id;
 40  // What's the level of the dict node. Level of root is 0, but root is never
 41  // recorded by dict_node.
 42  unsigned char dict_level:7;
 43  // If this node is for composing phrase, this bit is 1.
 44  unsigned char c_phrase:1;
 45  // Whether the spl_id is parsed with a split character at the end.
 46  unsigned char splid_end_split:1;
 47  // What's the length of the spelling string for this match, for the whole
 48  // word.
 49  unsigned char splstr_len:7;
 50  // Used to indicate whether all spelling ids from the root are full spelling
 51  // ids. This information is useful for keymapping mode(not finished). Because
 52  // in this mode, there is no clear boundaries, we prefer those results which
 53  // have full spelling ids.
 54  unsigned char all_full_id:1;
 55} DictMatchInfo, *PDictMatchInfo;
 56
 57typedef struct MatrixNode {
 58  LemmaIdType id;
 59  float score;
 60  MatrixNode *from;
 61  // From which DMI node. Used to trace the spelling segmentation.
 62  PoolPosType dmi_fr;
 63  uint16 step;
 64} MatrixNode, *PMatrixNode;
 65
 66typedef struct {
 67  // The MatrixNode position in the matrix pool
 68  PoolPosType mtrx_nd_pos;
 69  // The DictMatchInfo position in the DictMatchInfo pool.
 70  PoolPosType dmi_pos;
 71  uint16 mtrx_nd_num;
 72  uint16 dmi_num:15;
 73  // Used to indicate whether there are dmi nodes in this step with full
 74  // spelling id. This information is used to decide whether a substring of a
 75  // valid Pinyin should be extended.
 76  //
 77  // Example1: shoudao
 78  // When the last char 'o' is added, the parser will find "dao" is a valid
 79  // Pinyin, and because all dmi nodes at location 'd' (including those for
 80  // "shoud", and those for "d") have Shengmu id only, so it is not necessary
 81  // to extend "ao", otherwise the result may be "shoud ao", that is not
 82  // reasonable.
 83  //
 84  // Example2: hengao
 85  // When the last 'o' is added, the parser finds "gao" is a valid Pinyin.
 86  // Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi
 87  // nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus
 88  // "heng ao" can also be the result.
 89  //
 90  // Similarly, "ganga" is expanded to "gang a".
 91  //
 92  // For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi
 93  // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it
 94  // is not valid either). If the parser uses break in the loop, the result
 95  // always be "xian"; but if the parser uses continue in the loop, "xi an" will
 96  // also be tried. This behaviour can be set via the function
 97  // set_xi_an_switch().
 98  uint16 dmi_has_full_id:1;
 99  // Points to a MatrixNode of the current step to indicate which choice the
100  // user selects.
101  MatrixNode *mtrx_nd_fixed;
102} MatrixRow, *PMatrixRow;
103
104// When user inputs and selects candidates, the fixed lemma ids are stored in
105// lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many
106// lemmas from the beginning are fixed. If user deletes Pinyin characters one
107// by one from the end, these fixed lemmas can be unlocked one by one when
108// necessary. Whenever user deletes a Chinese character and its spelling string
109// in these fixed lemmas, all fixed lemmas will be merged together into a unit
110// named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing
111// phrase will be the first lemma in the sentence. Because it contains some
112// modified lemmas (by deleting a character), these merged lemmas are called
113// sub lemmas (sublma), and each of them are represented individually, so that
114// when user deletes Pinyin characters from the end, these sub lemmas can also
115// be unlocked one by one.
116typedef struct {
117  uint16 spl_ids[kMaxRowNum];
118  uint16 spl_start[kMaxRowNum];
119  char16 chn_str[kMaxRowNum];       // Chinese string.
120  uint16 sublma_start[kMaxRowNum];  // Counted in Chinese characters.
121  unsigned sublma_num;
122  uint16 length;                    // Counted in Chinese characters.
123} ComposingPhrase, *TComposingPhrase;
124
125class MatrixSearch {
126 private:
127  // If it is true, prediction list by string whose length is greater than 1
128  // will be limited to a reasonable number.
129  static const bool kPredictLimitGt1 = false;
130
131  // If it is true, the engine will prefer long history based prediction,
132  // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are
133  // based on the two-character history.
134  static const bool kPreferLongHistoryPredict = true;
135
136  // If it is true, prediction will only be based on user dictionary. this flag
137  // is for debug purpose.
138  static const bool kOnlyUserDictPredict = false;
139
140  // The maximum buffer to store LmaPsbItems.
141  static const unsigned kMaxLmaPsbItems = 1450;
142
143  // How many rows for each step.
144  static const unsigned kMaxNodeARow = 5;
145
146  // The maximum length of the sentence candidates counted in chinese
147  // characters
148  static const unsigned kMaxSentenceLength = 16;
149
150  // The size of the matrix node pool.
151  static const unsigned kMtrxNdPoolSize = 200;
152
153  // The size of the DMI node pool.
154  static const unsigned kDmiPoolSize = 800;
155
156  // Used to indicate whether this object has been initialized.
157  bool inited_;
158
159  // Spelling trie.
160  const SpellingTrie *spl_trie_;
161
162  // Used to indicate this switcher status: when "xian" is parseed, should
163  // "xi an" also be extended. Default is false.
164  // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string
165  // should be valid for a FULL spelling, or a combination of two spellings,
166  // first of which is a FULL id too. So even it is true, "da" will never be
167  // split into "d a", because "d" is not a full spelling id.
168  bool xi_an_enabled_;
169
170  // System dictionary.
171  DictTrie* dict_trie_;
172
173  // User dictionary.
174  AtomDictBase* user_dict_;
175
176  // Spelling parser.
177  SpellingParser* spl_parser_;
178
179  // The maximum allowed length of spelling string (such as a Pinyin string).
180  unsigned max_sps_len_;
181
182  // The maximum allowed length of a result Chinese string.
183  unsigned max_hzs_len_;
184
185  // Pinyin string. Max length: kMaxRowNum - 1
186  char pys_[kMaxRowNum];
187
188  // The length of the string that has been decoded successfully.
189  unsigned pys_decoded_len_;
190
191  // Shared buffer for multiple purposes.
192  unsigned *share_buf_;
193
194  MatrixNode *mtrx_nd_pool_;
195  PoolPosType mtrx_nd_pool_used_;    // How many nodes used in the pool
196  DictMatchInfo *dmi_pool_;
197  PoolPosType dmi_pool_used_;        // How many items used in the pool
198
199  MatrixRow *matrix_;                // The first row is for starting
200
201  DictExtPara *dep_;                 // Parameter used to extend DMI nodes.
202
203  NPredictItem *npre_items_;         // Used to do prediction
204  unsigned npre_items_len_;
205
206  // The starting positions and lemma ids for the full sentence candidate.
207  unsigned lma_id_num_;
208  uint16 lma_start_[kMaxRowNum];     // Counted in spelling ids.
209  LemmaIdType lma_id_[kMaxRowNum];
210  unsigned fixed_lmas_;
211
212  // If fixed_lmas_ is bigger than i,  Element i is used to indicate whether
213  // the i'th lemma id in lma_id_ is the first candidate for that step.
214  // If all candidates are the first one for that step, the whole string can be
215  // decoded by the engine automatically, so no need to add it to user
216  // dictionary. (We are considering to add it to user dictionary in the
217  // future).
218  uint8 fixed_lmas_no1_[kMaxRowNum];
219
220  // Composing phrase
221  ComposingPhrase c_phrase_;
222
223  // If dmi_c_phrase_ is true, the decoder will try to match the
224  // composing phrase (And definitely it will match successfully). If it
225  // is false, the decoder will try to match lemmas items in dictionaries.
226  bool dmi_c_phrase_;
227
228  // The starting positions and spelling ids for the first full sentence
229  // candidate.
230  unsigned spl_id_num_;                // Number of splling ids
231  uint16 spl_start_[kMaxRowNum];     // Starting positions
232  uint16 spl_id_[kMaxRowNum];        // Spelling ids
233  // Used to remember the last fixed position, counted in Hanzi.
234  unsigned fixed_hzs_;
235
236  // Lemma Items with possibility score, two purposes:
237  // 1. In Viterbi decoding, this buffer is used to get all possible candidates
238  // for current step;
239  // 2. When the search is done, this buffer is used to get candiates from the
240  // first un-fixed step and show them to the user.
241  LmaPsbItem lpi_items_[kMaxLmaPsbItems];
242  unsigned lpi_total_;
243
244  // Assign the pointers with NULL. The caller makes sure that all pointers are
245  // not valid before calling it. This function only will be called in the
246  // construction function and free_resource().
247  void reset_pointers_to_null();
248
249  bool alloc_resource();
250
251  void free_resource();
252
253  // Reset the search space totally.
254  bool reset_search0();
255
256  // Reset the search space from ch_pos step. For example, if the original
257  // input Pinyin is "an", reset_search(1) will reset the search space to the
258  // result of "a". If the given position is out of range, return false.
259  // if clear_fixed_this_step is true, and the ch_pos step is a fixed step,
260  // clear its fixed status. if clear_dmi_his_step is true, clear the DMI nodes.
261  // If clear_mtrx_this_sTep is true, clear the mtrx nodes of this step.
262  // The DMI nodes will be kept.
263  //
264  // Note: this function should not destroy content of pys_.
265  bool reset_search(unsigned ch_pos, bool clear_fixed_this_step,
266                    bool clear_dmi_this_step, bool clear_mtrx_this_step);
267
268  // Delete a part of the content in pys_.
269  void del_in_pys(unsigned start, unsigned len);
270
271  // Delete a spelling id and its corresponding Chinese character, and merge
272  // the fixed lemmas into the composing phrase.
273  // del_spl_pos indicates which spelling id needs to be delete.
274  // This function will update the lemma and spelling segmentation information.
275  // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within
276  // the fixed lemmas.
277  void merge_fixed_lmas(unsigned del_spl_pos);
278
279  // Get spelling start posistions and ids. The result will be stored in
280  // spl_id_num_, spl_start_[], spl_id_[].
281  // fixed_hzs_ will be also assigned.
282  void get_spl_start_id();
283
284  // Get all lemma ids with match the given spelling id stream(shorter than the
285  // maximum length of a word).
286  // If pfullsent is not NULL, means the full sentence candidate may be the
287  // same with the coming lemma string, if so, remove that lemma.
288  // The result is sorted in descendant order by the frequency score.
289  unsigned get_lpis(const uint16* splid_str, unsigned splid_str_len,
290                  LmaPsbItem* lma_buf, unsigned max_lma_buf,
291                  const char16 *pfullsent, bool sort_by_psb);
292
293  uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
294
295  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
296                          uint16 splids_max, bool arg_valid);
297
298
299  // Extend a DMI node with a spelling id. ext_len is the length of the rows
300  // to extend, actually, it is the size of the spelling string of splid.
301  // return value can be 1 or 0.
302  // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in
303  // the pool).
304  // 0 means either the dmi node can not be extended with splid, or the splid
305  // is a Shengmu id, which is only used to get lpi_items, or the result node
306  // in DictTrie has no son, it is not nccessary to keep the new DMI.
307  //
308  // This function modifies the content of lpi_items_ and lpi_total_.
309  // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size.
310  // The function's returned value has no relation with the value of lpi_num.
311  //
312  // If dmi == NULL, this function will extend the root node of DictTrie
313  //
314  // This function will not change dmi_nd_pool_used_. Please change it after
315  // calling this function if necessary.
316  //
317  // The caller should guarantees that NULL != dep.
318  unsigned extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s);
319
320  // Extend dmi for the composing phrase.
321  unsigned extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s);
322
323  // Extend a MatrixNode with the give LmaPsbItem list.
324  // res_row is the destination row number.
325  // This function does not change mtrx_nd_pool_used_. Please change it after
326  // calling this function if necessary.
327  // return 0 always.
328  unsigned extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
329                        unsigned lpi_num, PoolPosType dmi_fr, unsigned res_row);
330
331
332  // Try to find a dmi node at step_to position, and the found dmi node should
333  // match the given spelling id strings.
334  PoolPosType match_dmi(unsigned step_to, uint16 spl_ids[], uint16 spl_id_num);
335
336  bool add_char(char ch);
337  bool prepare_add_char(char ch);
338
339  // Called after prepare_add_char, so the input char has been saved.
340  bool add_char_qwerty();
341
342  // Prepare candidates from the last fixed hanzi position.
343  void prepare_candidates();
344
345  // Is the character in step pos a splitter character?
346  // The caller guarantees that the position is valid.
347  bool is_split_at(uint16 pos);
348
349  void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
350                PoolPosType dmi_fr,
351                uint16 spl_id, uint16 node_num, unsigned char dict_level,
352                bool splid_end_split, unsigned char splstr_len,
353                unsigned char all_full_id);
354
355  unsigned inner_predict(const char16 fixed_scis_ids[], uint16 scis_num,
356                       char16 predict_buf[][kMaxPredictSize + 1],
357                       unsigned buf_len);
358
359  // Add the first candidate to the user dictionary.
360  bool try_add_cand0_to_userdict();
361
362  // Add a user lemma to the user dictionary. This lemma is a subset of
363  // candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the
364  // number of lemmas to be combined together as a new lemma. The caller
365  // gurantees that the combined new lemma's length is less or equal to
366  // kMaxLemmaSize.
367  bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score);
368
369  // Update dictionary frequencies.
370  void update_dict_freq();
371
372  void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level);
373
374 public:
375  MatrixSearch();
376  ~MatrixSearch();
377
378  bool init(const char *fn_sys_dict, const char *fn_usr_dict);
379
380  bool init_fd(int sys_fd, long start_offset, long length,
381               const char *fn_usr_dict);
382
383  void set_max_lens(unsigned max_sps_len, unsigned max_hzs_len);
384
385  void close();
386
387  void flush_cache();
388
389  void set_xi_an_switch(bool xi_an_enabled);
390
391  bool get_xi_an_switch();
392
393  // Reset the search space. Equivalent to reset_search(0).
394  // If inited, always return true;
395  bool reset_search();
396
397  // Search a Pinyin string.
398  // Return value is the position successfully parsed.
399  unsigned search(const char *py, unsigned py_len);
400
401  // Used to delete something in the Pinyin string kept by the engine, and do
402  // a re-search.
403  // Return value is the new length of Pinyin string kept by the engine which
404  // is parsed successfully.
405  // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin
406  // character needs to be deleted. If is_pos_in_splid is true, all Pinyin
407  // characters for pos-th spelling id needs to be deleted.
408  // If the deleted character(s) is just after a fixed lemma or sub lemma in
409  // composing phrase, clear_fixed_this_step indicates whether we needs to
410  // unlock the last fixed lemma or sub lemma.
411  // If is_pos_in_splid is false, and pos-th character is in the range for the
412  // fixed lemmas or composing string, this function will do nothing and just
413  // return the result of the previous search.
414  unsigned delsearch(unsigned pos, bool is_pos_in_splid,
415                   bool clear_fixed_this_step);
416
417  // Get the number of candiates, called after search().
418  unsigned get_candidate_num();
419
420  // Get the Pinyin string stored by the engine.
421  // *decoded_len returns the length of the successfully decoded string.
422  const char* get_pystr(unsigned *decoded_len);
423
424  // Get the spelling boundaries for the first sentence candidate.
425  // Number of spellings will be returned. The number of valid elements in
426  // spl_start is one more than the return value because the last one is used
427  // to indicate the beginning of the next un-input speling.
428  // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] .
429  unsigned get_spl_start(const uint16 *&spl_start);
430
431  // Get one candiate string. If full sentence candidate is available, it will
432  // be the first one.
433  char16* get_candidate(unsigned cand_id, char16 *cand_str, unsigned max_len);
434
435  // Get the first candiate, which is a "full sentence".
436  // retstr_len is not NULL, it will be used to return the string length.
437  // If only_unfixed is true, only unfixed part will be fetched.
438  char16* get_candidate0(char16* cand_str, unsigned max_len,
439                         uint16 *retstr_len, bool only_unfixed);
440
441  // Choose a candidate. The decoder will do a search after the fixed position.
442  unsigned choose(unsigned cand_id);
443
444  // Cancel the last choosing operation, and return the new number of choices.
445  unsigned cancel_last_choice();
446
447  // Get the length of fixed Hanzis.
448  unsigned get_fixedlen();
449
450  unsigned get_predicts(const char16 fixed_buf[],
451                      char16 predict_buf[][kMaxPredictSize + 1],
452                      unsigned buf_len);
453};
454}
455
456#endif  // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__