/src/im/gpinyin/include/dictbuilder.h

http://ftk.googlecode.com/ · C++ Header · 171 lines · 72 code · 39 blank · 60 comment · 0 complexity · 2b5cb657f4a536af363a367c834631f0 MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef PINYINIME_INCLUDE_DICTBUILDER_H__
  17. #define PINYINIME_INCLUDE_DICTBUILDER_H__
  18. #include <stdlib.h>
  19. #include "./utf16char.h"
  20. #include "./dictdef.h"
  21. #include "./dictlist.h"
  22. #include "./spellingtable.h"
  23. #include "./spellingtrie.h"
  24. #include "./splparser.h"
  25. namespace ime_pinyin {
  26. #ifdef ___BUILD_MODEL___
  27. #define ___DO_STATISTICS___
  28. class DictTrie;
  29. class DictBuilder {  // Builds a dictionary trie from a raw lemma file; entry point is build_dict().
  30. private:
  31. // The raw lemma array buffer.
  32. LemmaEntry *lemma_arr_;
  33. unsigned lemma_num_;  // NOTE(review): presumably the number of entries in lemma_arr_ -- confirm.
  34. // Used to store all possible single char items.
  35. // Two items may have the same Hanzi while their spelling ids are different.
  36. SingleCharItem *scis_;
  37. unsigned scis_num_;  // NOTE(review): presumably the number of entries in scis_ -- confirm.
  38. // In the tree, root's level is -1.
  39. // Lemma nodes for root, and level 0
  40. LmaNodeLE0 *lma_nodes_le0_;
  41. // Lemma nodes for layers whose levels are deeper than 0
  42. LmaNodeGE1 *lma_nodes_ge1_;
  43. // Number of used lemma nodes (one counter per node buffer above).
  44. unsigned lma_nds_used_num_le0_;
  45. unsigned lma_nds_used_num_ge1_;
  46. // Used to store homophones' ids.
  47. LemmaIdType *homo_idx_buf_;
  48. // Number of homophones each of which only contains one Chinese character.
  49. unsigned homo_idx_num_eq1_;
  50. // Number of homophones each of which contains more than one character.
  51. unsigned homo_idx_num_gt1_;
  52. // The items with highest scores.
  53. LemmaEntry *top_lmas_;
  54. unsigned top_lmas_num_;  // NOTE(review): presumably the number of entries in top_lmas_ -- confirm.
  55. SpellingTable *spl_table_;  // NOTE(review): spelling helpers; how they are used is not visible in this header.
  56. SpellingParser *spl_parser_;
  57. #ifdef ___DO_STATISTICS___
  58. unsigned max_sonbuf_len_[kMaxLemmaSize];  // Statistics counters, compiled in only when ___DO_STATISTICS___ is defined.
  59. unsigned max_homobuf_len_[kMaxLemmaSize];  // NOTE(review): arrays have kMaxLemmaSize slots, presumably one per tree level -- confirm.
  60. unsigned total_son_num_[kMaxLemmaSize];
  61. unsigned total_node_hasson_[kMaxLemmaSize];
  62. unsigned total_sonbuf_num_[kMaxLemmaSize];
  63. unsigned total_sonbuf_allnoson_[kMaxLemmaSize];
  64. unsigned total_node_in_sonbuf_allnoson_[kMaxLemmaSize];
  65. unsigned total_homo_num_[kMaxLemmaSize];
  66. unsigned sonbufs_num1_; // Number of son buffers with only 1 son
  67. unsigned sonbufs_numgt1_; // Number of son buffers with more than 1 son
  68. unsigned total_lma_node_num_;
  69. void stat_init();  // NOTE(review): presumably resets the statistics counters above -- confirm.
  70. void stat_print();  // NOTE(review): presumably prints the statistics counters above -- confirm.
  71. #endif
  72. public:
  73. DictBuilder();
  74. ~DictBuilder();
  75. // Build dictionary trie from the file fn_raw. File fn_validhzs provides
  76. // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be
  77. // included.
  78. bool build_dict(const char* fn_raw, const char* fn_validhzs,
  79. DictTrie *dict_trie);
  80. private:
  81. // Fill in the buffer with id. The caller guarantees that the parameters are
  82. // valid.
  83. void id_to_charbuf(unsigned char *buf, LemmaIdType id);
  84. // Update the offset of sons for a node.
  85. void set_son_offset(LmaNodeGE1 *node, unsigned offset);
  86. // Update the offset of homophones' ids for a node.
  87. void set_homo_id_buf_offset(LmaNodeGE1 *node, unsigned offset);
  88. // Format a spelling string.
  89. void format_spelling_str(char *spl_str);
  90. // Sort the lemma_arr by the hanzi string, and give each unique item
  91. // an id. The reason we sort the lemma list according to their Hanzi string
  92. // is to find items starting with a given prefix string to do prediction.
  93. // Actually, the single char items are in another order, for example,
  94. // in spelling id order, etc.
  95. // Return value is the next un-allocated idx available.
  96. LemmaIdType sort_lemmas_by_hz();
  97. // Build the SingleCharItem list, and fill the hanzi_scis_ids in the
  98. // lemma buffer lemma_arr_.
  99. // This function should be called after the lemma array is ready.
  100. // Return the number of unique SingleCharItem elements.
  101. unsigned build_scis();
  102. // Construct a subtree using a subset of the spelling array (from
  103. // item_start to item_end).
  104. // parent is the parent node to update with the necessary information;
  105. // parent can be a member of LmaNodeLE0 or LmaNodeGE1.
  106. bool construct_subset(void* parent, LemmaEntry* lemma_arr,
  107. unsigned item_start, unsigned item_end, unsigned level);
  108. // Read valid Chinese Hanzis from the given file.
  109. // num is used to return the number of chars.
  110. // The returned buffer is sorted and the caller needs to free it.
  111. char16* read_valid_hanzis(const char *fn_validhzs, unsigned *num);
  112. // Read a raw dictionary. max_item is the maximum number of items. If there
  113. // are more items in the dictionary, only the first max_item will be read.
  114. // Returned value is the number of items successfully read from the file.
  115. unsigned read_raw_dict(const char* fn_raw, const char *fn_validhzs,
  116. unsigned max_item);
  117. // Try to find if a character is in hzs buffer.
  118. bool hz_in_hanzis_list(const char16 *hzs, unsigned hzs_len, char16 hz);
  119. // Try to find if all characters in str are in hzs buffer.
  120. bool str_in_hanzis_list(const char16 *hzs, unsigned hzs_len,
  121. const char16 *str, unsigned str_len);
  122. // Get the lemmas with the highest scores.
  123. void get_top_lemmas();
  124. // Allocate resource to build dictionary.
  125. // lma_num is the number of items to be loaded
  126. bool alloc_resource(unsigned lma_num);
  127. // Free resource.
  128. void free_resource();
  129. };
  130. #endif // ___BUILD_MODEL___
  131. }
  132. #endif // PINYINIME_INCLUDE_DICTBUILDER_H__