PageRenderTime 19ms CodeModel.GetById 12ms app.highlight 5ms RepoModel.GetById 1ms app.codeStats 0ms

/src/im/gpinyin/include/ngram.h

http://ftk.googlecode.com/
C++ Header | 96 lines | 40 code | 22 blank | 34 comment | 0 complexity | 30f1232430b507163bcf7db1f9673999 MD5 | raw file
 1/*
 2 * Copyright (C) 2009 The Android Open Source Project
 3 *
 4 * Licensed under the Apache License, Version 2.0 (the "License");
 5 * you may not use this file except in compliance with the License.
 6 * You may obtain a copy of the License at
 7 *
 8 *      http://www.apache.org/licenses/LICENSE-2.0
 9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef PINYINIME_INCLUDE_NGRAM_H__
18#define PINYINIME_INCLUDE_NGRAM_H__
19
20#include <stdio.h>
21#include <stdlib.h>
22#include "./dictdef.h"
23
24namespace ime_pinyin {
25
// Element type of NGram::lma_freq_idx_ (declared below in this header).
// NOTE(review): presumably an index into a code book of scores -- confirm
// against the implementation.
26typedef unsigned char CODEBOOK_TYPE;
27
// Number of entries in the code book.
// NOTE(review): 256 matches the number of distinct values of an 8-bit
// CODEBOOK_TYPE; presumably the two must be kept in sync -- confirm.
28static const unsigned kCodeBookSize = 256;
29
// Declares the n-gram (unigram) scoring component: score-related constants,
// saving/loading of the data through a FILE*, and conversion between
// probabilities and scores.
// NOTE(review): only the declaration is visible here; every remark marked
// "presumably" below is inferred from names and must be confirmed against the
// implementation file.
30class NGram {
31 public:
32  // The maximum score of a lemma item.
33  static const LmaScoreType kMaxScore = 0x3fff;
34
35  // In order to reduce the storage size, the original log value is amplified by
36  // kLogValueAmplifier, and we use LmaScoreType to store the result.
37  // After this process, an item with a lower score has a higher frequency.
38  static const int kLogValueAmplifier = -800;
39
40  // System words' total frequency. It is not the real total frequency; instead,
41  // it is only used to adjust system lemmas' scores when the user dictionary's
42  // total frequency changes.
43  // In this version, frequencies of system lemmas are fixed. We are considering
44  // making them changeable in the next version.
45  static const unsigned kSysDictTotalFreq = 100000000;
46
47 private:
48
  // Presumably the singleton object handed out by get_instance() -- confirm.
49  static NGram* instance_;
50
  // Presumably true once n-gram data has been loaded or built -- confirm.
51  bool initialized_;
  // NOTE(review): presumably the number of entries in lma_freq_idx_ -- confirm.
52  unsigned idx_num_;
53
  // Total frequency of all non-system dictionaries; presumably updated by
  // set_total_freq_none_sys() -- confirm.
54  unsigned total_freq_none_sys_;
55
56  // Score compensation for system dictionary lemmas.
57  // After the user adds some user lemmas, the total frequency changes, and
58  // we use this value to normalize the score.
59  float sys_score_compensation_;
60
  // freq_codes_df_ exists only when ___BUILD_MODEL___ is defined, so the
  // object layout differs between model-building and regular builds.
61#ifdef ___BUILD_MODEL___
62  double *freq_codes_df_;
63#endif
  // NOTE(review): presumably the code book of scores (kCodeBookSize entries)
  // and, per lemma, the index of its score in that code book -- confirm.
  // Ownership of both buffers is not visible from this header.
64  LmaScoreType *freq_codes_;
65  CODEBOOK_TYPE *lma_freq_idx_;
66
67 public:
68  NGram();
69  ~NGram();
70
  // Returns a reference to an NGram object; presumably the shared instance
  // tracked by instance_ -- confirm.
71  static NGram& get_instance();
72
  // Save / load the n-gram data using the given stream; the bool result
  // presumably reports success -- confirm. Whether fp may be NULL and the
  // on-disk format are not visible from this header.
73  bool save_ngram(FILE *fp);
74  bool load_ngram(FILE *fp);
75
76  // Set the total frequency of all non-system dictionaries.
77  void set_total_freq_none_sys(unsigned freq_none_sys);
78
  // NOTE(review): presumably returns the unigram value (score or probability)
  // for the lemma identified by lma_id -- confirm which one the float holds.
79  float get_uni_psb(LemmaIdType lma_id);
80
81  // Convert a probability to a score. Actually, the score will be limited to
82  // kMaxScore, but at runtime we also need a float expression to get an
83  // accurate value of the score.
84  // After the conversion, a lower score indicates a higher probability of the
85  // item.
86  static float convert_psb_to_score(double psb);
87
88#ifdef ___BUILD_MODEL___
89  // For constructing the unigram model. Only available in model-building builds.
90  bool build_unigram(LemmaEntry *lemma_arr, unsigned num,
91                     LemmaIdType next_idx_unused);
92#endif
93};
94}
95
96#endif  // PINYINIME_INCLUDE_NGRAM_H__