PageRenderTime 27ms CodeModel.GetById 13ms app.highlight 9ms RepoModel.GetById 2ms app.codeStats 0ms

/src/im/gpinyin/include/dictbuilder.h

http://ftk.googlecode.com/
C++ Header | 171 lines | 72 code | 39 blank | 60 comment | 0 complexity | 2b5cb657f4a536af363a367c834631f0 MD5 | raw file
  1/*
  2 * Copyright (C) 2009 The Android Open Source Project
  3 *
  4 * Licensed under the Apache License, Version 2.0 (the "License");
  5 * you may not use this file except in compliance with the License.
  6 * You may obtain a copy of the License at
  7 *
  8 *      http://www.apache.org/licenses/LICENSE-2.0
  9 *
 10 * Unless required by applicable law or agreed to in writing, software
 11 * distributed under the License is distributed on an "AS IS" BASIS,
 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 * See the License for the specific language governing permissions and
 14 * limitations under the License.
 15 */
 16
 17#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__
 18#define PINYINIME_INCLUDE_DICTBUILDER_H__
 19
 20#include <stdlib.h>
 21#include "./utf16char.h"
 22#include "./dictdef.h"
 23#include "./dictlist.h"
 24#include "./spellingtable.h"
 25#include "./spellingtrie.h"
 26#include "./splparser.h"
 27
 28namespace ime_pinyin {
 29
 30#ifdef ___BUILD_MODEL___
 31
 32#define ___DO_STATISTICS___
 33
 34class DictTrie;
 35
 36class DictBuilder {
     // Builds a dictionary trie (DictTrie) from a raw dictionary file through
     // build_dict(). Everything else in the class is private working state and
     // helper steps. The class is only declared when ___BUILD_MODEL___ is
     // defined.
 37 private:
 38  // The raw lemma array buffer.
 39  LemmaEntry *lemma_arr_;
 40  unsigned lemma_num_;
 41
 42  // Used to store all possible single char items.
 43  // Two items may have the same Hanzi while their spelling ids are different.
 44  SingleCharItem *scis_;
 45  unsigned scis_num_;
 46
 47  // In the tree, the root's level is -1.
 48  // Lemma nodes for the root and for level 0.
 49  LmaNodeLE0 *lma_nodes_le0_;
 50
 51  // Lemma nodes for layers whose levels are deeper than 0.
 52  LmaNodeGE1 *lma_nodes_ge1_;
 53
 54  // Number of used lemma nodes in lma_nodes_le0_ and lma_nodes_ge1_.
 55  unsigned lma_nds_used_num_le0_;
 56  unsigned lma_nds_used_num_ge1_;
 57
 58  // Used to store homophones' ids.
 59  LemmaIdType *homo_idx_buf_;
 60  // Number of homophones each of which only contains one Chinese character.
 61  unsigned homo_idx_num_eq1_;
 62  // Number of homophones each of which contains more than one character.
 63  unsigned homo_idx_num_gt1_;
 64
 65  // The items with the highest scores.
 66  LemmaEntry *top_lmas_;
 67  unsigned top_lmas_num_;
 68
 69  SpellingTable *spl_table_;
 70  SpellingParser *spl_parser_;
 71
 72#ifdef ___DO_STATISTICS___
     // Statistics counters, present only when ___DO_STATISTICS___ is defined.
     // Each array below has kMaxLemmaSize entries.
     // NOTE(review): presumably one entry per trie level -- confirm against
     // the implementation.
 73  unsigned max_sonbuf_len_[kMaxLemmaSize];
 74  unsigned max_homobuf_len_[kMaxLemmaSize];
 75
 76  unsigned total_son_num_[kMaxLemmaSize];
 77  unsigned total_node_hasson_[kMaxLemmaSize];
 78  unsigned total_sonbuf_num_[kMaxLemmaSize];
 79  unsigned total_sonbuf_allnoson_[kMaxLemmaSize];
 80  unsigned total_node_in_sonbuf_allnoson_[kMaxLemmaSize];
 81  unsigned total_homo_num_[kMaxLemmaSize];
 82
 83  unsigned sonbufs_num1_;     // Number of son buffers with only 1 son.
 84  unsigned sonbufs_numgt1_;   // Number of son buffers with more than 1 son.
 85
 86  unsigned total_lma_node_num_;
 87
 88  void stat_init();
 89  void stat_print();
 90#endif
 91
 92 public:
 93
 94  DictBuilder();
 95  ~DictBuilder();
 96
 97  // Build dictionary trie from the file fn_raw. File fn_validhzs provides
 98  // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be
 99  // included.
     // NOTE(review): presumably returns true on success -- confirm against the
     // implementation.
100  bool build_dict(const char* fn_raw, const char* fn_validhzs,
101                  DictTrie *dict_trie);
102
103 private:
104  // Fill in the buffer with id. The caller guarantees that the parameters are
105  // valid.
106  void id_to_charbuf(unsigned char *buf, LemmaIdType id);
107
108  // Update the offset of sons for a node.
109  void set_son_offset(LmaNodeGE1 *node, unsigned offset);
110
111  // Update the offset of homophones' ids for a node.
112  void set_homo_id_buf_offset(LmaNodeGE1 *node, unsigned offset);
113
114  // Format a spelling string.
115  void format_spelling_str(char *spl_str);
116
117  // Sort the lemma_arr by the Hanzi string, and give each unique item
118  // an id. The lemma list is sorted by Hanzi string so that items starting
119  // with a given prefix string can be found to do prediction.
120  // Actually, the single char items are in another order, for example,
121  // in spelling id order, etc.
122  // Return value is the next un-allocated idx available.
123  LemmaIdType sort_lemmas_by_hz();
124
125  // Build the SingleCharItem list, and fill the hanzi_scis_ids in the
126  // lemma buffer lemma_arr_.
127  // This function should be called after the lemma array is ready.
128  // Return the number of unique SingleCharItem elements.
129  unsigned build_scis();
130
131  // Construct a subtree using a subset of the spelling array (from
132  // item_start to item_end).
133  // parent is the parent node whose information is updated as needed;
134  // parent can be a member of LmaNodeLE0 or LmaNodeGE1.
135  bool construct_subset(void* parent, LemmaEntry* lemma_arr,
136                        unsigned item_start, unsigned item_end, unsigned level);
137
138
139  // Read valid Chinese Hanzis from the given file.
140  // num is used to return the number of chars.
141  // The returned buffer is sorted and the caller needs to free it.
142  char16* read_valid_hanzis(const char *fn_validhzs, unsigned *num);
143
144
145  // Read a raw dictionary. max_item is the maximum number of items. If there
146  // are more items in the dictionary, only the first max_item will be read.
147  // Returned value is the number of items successfully read from the file.
148  unsigned read_raw_dict(const char* fn_raw, const char *fn_validhzs,
149                       unsigned max_item);
150
151  // Try to find if a character is in the hzs buffer.
152  bool hz_in_hanzis_list(const char16 *hzs, unsigned hzs_len, char16 hz);
153
154  // Try to find if all characters in str are in the hzs buffer.
155  bool str_in_hanzis_list(const char16 *hzs, unsigned hzs_len,
156                          const char16 *str, unsigned str_len);
157
158  // Get the lemmas with the highest scores.
159  void get_top_lemmas();
160
161  // Allocate resources to build the dictionary.
162  // lma_num is the number of items to be loaded.
163  bool alloc_resource(unsigned lma_num);
164
165  // Free resources.
166  void free_resource();
167};
168#endif  // ___BUILD_MODEL___
169}
170
171#endif  // PINYINIME_INCLUDE_DICTBUILDER_H__