/src/im/gpinyin/include/ngram.h

http://ftk.googlecode.com/ · C++ Header · 96 lines · 40 code · 22 blank · 34 comment · 0 complexity · 30f1232430b507163bcf7db1f9673999 MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef PINYINIME_INCLUDE_NGRAM_H__
  17. #define PINYINIME_INCLUDE_NGRAM_H__
  18. #include <stdio.h>
  19. #include <stdlib.h>
  20. #include "./dictdef.h"
  21. namespace ime_pinyin {
  // Element type used by NGram::lma_freq_idx_ (one unsigned byte per entry).
  22. typedef unsigned char CODEBOOK_TYPE;
  // Number of code book entries. NOTE(review): 256 presumably matches the
  // number of distinct values a CODEBOOK_TYPE can hold -- confirm against the
  // implementation file.
  23. static const unsigned kCodeBookSize = 256;
  // Declares the n-gram (unigram) frequency/score store of the Pinyin IME.
  // A shared object is reachable through get_instance(); the data can be
  // written to and read from a FILE stream with save_ngram()/load_ngram().
  // NOTE(review): only the declaration is visible here; the behavioral notes
  // below that go beyond the signatures are hedged and should be confirmed
  // against the implementation file.
  24. class NGram {
  25. public:
  26. // The maximum score of a lemma item.
  27. static const LmaScoreType kMaxScore = 0x3fff;
  28. // In order to reduce the storage size, the original log value is amplified
  29. // by kLogValueAmplifier, and the result is stored as a LmaScoreType.
  30. // After this process, an item with a lower score has a higher frequency.
  31. static const int kLogValueAmplifier = -800;
  32. // Total frequency of system words. It is not the real total frequency;
  33. // it is only used to adjust system lemmas' scores when the user
  34. // dictionary's total frequency changes.
  35. // In this version, frequencies of system lemmas are fixed. We are considering
  36. // making them changeable in the next version.
  37. static const unsigned kSysDictTotalFreq = 100000000;
  38. private:
  // Pointer to the shared object; presumably what get_instance() returns a
  // reference to -- confirm in the implementation.
  39. static NGram* instance_;
  // Presumably set once the n-gram data has been loaded or built -- confirm.
  40. bool initialized_;
  // Presumably the number of entries in lma_freq_idx_ -- confirm.
  41. unsigned idx_num_;
  // Total frequency of all non-system dictionaries; see
  // set_total_freq_none_sys().
  42. unsigned total_freq_none_sys_;
  43. // Score compensation for system dictionary lemmas.
  44. // After the user adds lemmas, the total frequency changes, and this value
  45. // is used to normalize the scores.
  46. float sys_score_compensation_;
  // Only present in model-building builds (___BUILD_MODEL___ defined).
  // NOTE(review): looks like the double-precision counterpart of freq_codes_
  // used while building the model -- confirm.
  47. #ifdef ___BUILD_MODEL___
  48. double *freq_codes_df_;
  49. #endif
  // NOTE(review): presumably a code book of kCodeBookSize scores, indexed by
  // the values stored in lma_freq_idx_ -- confirm. Ownership is not visible
  // in this header.
  50. LmaScoreType *freq_codes_;
  51. CODEBOOK_TYPE *lma_freq_idx_;
  52. public:
  53. NGram();
  54. ~NGram();
  // Returns a reference to the shared NGram object.
  55. static NGram& get_instance();
  // Save the n-gram data to / load it from the given stream.
  // NOTE(review): the bool result presumably reports success -- confirm.
  56. bool save_ngram(FILE *fp);
  57. bool load_ngram(FILE *fp);
  58. // Set the total frequency of all non-system dictionaries.
  59. void set_total_freq_none_sys(unsigned freq_none_sys);
  // Returns a float value for the lemma identified by lma_id.
  // NOTE(review): the name suggests a unigram probability ("psb"), but
  // whether the result is a probability or a score is not visible here --
  // confirm.
  60. float get_uni_psb(LemmaIdType lma_id);
  61. // Convert a probability to a score. The stored score is limited to
  62. // kMaxScore, but at runtime a float representation is also needed to get
  63. // an accurate value of the score.
  64. // After the conversion, a lower score indicates a higher probability of the
  65. // item.
  66. static float convert_psb_to_score(double psb);
  67. #ifdef ___BUILD_MODEL___
  68. // Builds the unigram model; only declared when ___BUILD_MODEL___ is defined.
  69. bool build_unigram(LemmaEntry *lemma_arr, unsigned num,
  70. LemmaIdType next_idx_unused);
  71. #endif
  72. };
  73. }
  74. #endif // PINYINIME_INCLUDE_NGRAM_H__