PageRenderTime 41ms CodeModel.GetById 14ms app.highlight 22ms RepoModel.GetById 1ms app.codeStats 1ms

/src/im/gpinyin/share/spellingtable.cpp

http://ftk.googlecode.com/
C++ | 313 lines | 222 code | 64 blank | 27 comment | 78 complexity | aff9c87910adb43c68149b69bed2051f MD5 | raw file
  1/*
  2 * Copyright (C) 2009 The Android Open Source Project
  3 *
  4 * Licensed under the Apache License, Version 2.0 (the "License");
  5 * you may not use this file except in compliance with the License.
  6 * You may obtain a copy of the License at
  7 *
  8 *      http://www.apache.org/licenses/LICENSE-2.0
  9 *
 10 * Unless required by applicable law or agreed to in writing, software
 11 * distributed under the License is distributed on an "AS IS" BASIS,
 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 * See the License for the specific language governing permissions and
 14 * limitations under the License.
 15 */
 16
 17#include <assert.h>
 18#include <stdlib.h>
 19#include <stdio.h>
 20#include <string.h>
 21#include <math.h>
 22#include "../include/spellingtable.h"
 23
 24namespace ime_pinyin {
 25
 26#ifdef ___BUILD_MODEL___
 27
 28const char SpellingTable::
 29    kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"};
 30
 31// "" is the biggest, so that all empty strings will be moved to the end
 32// _eb mean empty is biggest
 33int compare_raw_spl_eb(const void* p1, const void* p2) {
 34  if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0])
 35    return 1;
 36
 37  if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0])
 38    return -1;
 39
 40  return strcmp((static_cast<const RawSpelling*>(p1))->str,
 41                (static_cast<const RawSpelling*>(p2))->str);
 42}
 43
 44unsigned get_odd_next(unsigned value) {
 45  unsigned v_next = value;
 46  while (true) {
 47    unsigned v_next_sqrt = (unsigned)sqrt(v_next);
 48
 49    bool is_odd = true;
 50    for (unsigned v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) {
 51      if (v_next % v_dv == 0) {
 52        is_odd = false;
 53        break;
 54      }
 55    }
 56
 57    if (is_odd)
 58      return v_next;
 59
 60    v_next++;
 61  }
 62
 63  // never reach here
 64  return 0;
 65}
 66
 67SpellingTable::SpellingTable() {
 68  need_score_ = false;
 69  raw_spellings_ = NULL;
 70  spelling_buf_ = NULL;
 71  spelling_num_ = 0;
 72  total_freq_ = 0;
 73  frozen_ = true;
 74}
 75
 76SpellingTable::~SpellingTable() {
 77  free_resource();
 78}
 79
 80unsigned SpellingTable::get_hash_pos(const char* spelling_str) {
 81  unsigned hash_pos = 0;
 82  for (unsigned pos = 0; pos < spelling_size_; pos++) {
 83    if ('\0' == spelling_str[pos])
 84      break;
 85    hash_pos += (unsigned)spelling_str[pos];
 86  }
 87
 88  hash_pos = hash_pos % spelling_max_num_;
 89  return hash_pos;
 90}
 91
 92unsigned SpellingTable::hash_pos_next(unsigned hash_pos) {
 93  hash_pos += 123;
 94  hash_pos = hash_pos % spelling_max_num_;
 95  return hash_pos;
 96}
 97
 98void SpellingTable::free_resource() {
 99  if (NULL != raw_spellings_)
100    delete [] raw_spellings_;
101  raw_spellings_ = NULL;
102
103  if (NULL != spelling_buf_)
104    delete [] spelling_buf_;
105  spelling_buf_ = NULL;
106}
107
108bool SpellingTable::init_table(unsigned pure_spl_size, unsigned spl_max_num,
109                               bool need_score) {
110  if (pure_spl_size == 0 || spl_max_num ==0)
111    return false;
112
113  need_score_ = need_score;
114
115  free_resource();
116
117  spelling_size_ = pure_spl_size + 1;
118  if (need_score)
119    spelling_size_ += 1;
120  spelling_max_num_ = get_odd_next(spl_max_num);
121  spelling_num_ = 0;
122
123  raw_spellings_ = new RawSpelling[spelling_max_num_];
124  spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)];
125  if (NULL == raw_spellings_ || NULL == spelling_buf_) {
126    free_resource();
127    return false;
128  }
129
130  memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling));
131  memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_));
132  frozen_ = false;
133  total_freq_ = 0;
134  return true;
135}
136
137bool SpellingTable::put_spelling(const char* spelling_str, double freq) {
138  if (frozen_ || NULL == spelling_str)
139    return false;
140
141  for (unsigned pos = 0; pos < kNotSupportNum; pos++) {
142    if (strcmp(spelling_str, kNotSupportList[pos]) == 0) {
143      return false;
144    }
145  }
146
147  total_freq_ += freq;
148
149  unsigned hash_pos = get_hash_pos(spelling_str);
150
151  raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
152
153  if (strncmp(raw_spellings_[hash_pos].str, spelling_str,
154              spelling_size_ - 1) == 0) {
155    raw_spellings_[hash_pos].freq += freq;
156    return true;
157  }
158
159  unsigned hash_pos_ori = hash_pos;
160
161  while (true) {
162    if (strncmp(raw_spellings_[hash_pos].str,
163                spelling_str, spelling_size_ - 1) == 0) {
164      raw_spellings_[hash_pos].freq += freq;
165      return true;
166    }
167
168    if ('\0' == raw_spellings_[hash_pos].str[0]) {
169      raw_spellings_[hash_pos].freq += freq;
170      strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1);
171      raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
172      spelling_num_++;
173      return true;
174    }
175
176    hash_pos = hash_pos_next(hash_pos);
177    if (hash_pos_ori == hash_pos)
178      return false;
179  }
180
181  // never reach here
182  return false;
183}
184
185bool SpellingTable::contain(const char* spelling_str) {
186  if (NULL == spelling_str || NULL == spelling_buf_ || frozen_)
187    return false;
188
189  unsigned hash_pos = get_hash_pos(spelling_str);
190
191  if ('\0' == raw_spellings_[hash_pos].str[0])
192    return false;
193
194  if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
195      == 0)
196    return true;
197
198  unsigned hash_pos_ori = hash_pos;
199
200  while (true) {
201    hash_pos = hash_pos_next(hash_pos);
202    if (hash_pos_ori == hash_pos)
203      return false;
204
205    if ('\0' == raw_spellings_[hash_pos].str[0])
206      return false;
207
208    if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
209        == 0)
210      return true;
211  }
212
213  // never reach here
214  return false;
215}
216
217const char* SpellingTable::arrange(unsigned *item_size, unsigned *spl_num) {
218  if (NULL == raw_spellings_ || NULL == spelling_buf_ ||
219      NULL == item_size || NULL == spl_num)
220    return NULL;
221
222  qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling),
223        compare_raw_spl_eb);
224
225  // After sorting, only the first spelling_num_ items are valid.
226  // Copy them to the destination buffer.
227  for (unsigned pos = 0; pos < spelling_num_; pos++) {
228    strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str,
229            spelling_size_);
230  }
231
232  if (need_score_) {
233    if (kPrintDebug0)
234      printf("------------Spelling Possiblities--------------\n");
235
236    double max_score = 0;
237    double min_score = 0;
238
239    // After sorting, only the first spelling_num_ items are valid.
240    for (unsigned pos = 0; pos < spelling_num_; pos++) {
241      raw_spellings_[pos].freq /= total_freq_;
242      if (need_score_) {
243        if (0 == pos) {
244          max_score = raw_spellings_[0].freq;
245          min_score = max_score;
246        } else {
247          if (raw_spellings_[pos].freq > max_score)
248            max_score = raw_spellings_[pos].freq;
249          if (raw_spellings_[pos].freq < min_score)
250            min_score = raw_spellings_[pos].freq;
251        }
252      }
253    }
254
255    if (kPrintDebug0)
256      printf("-----max psb: %f, min psb: %f\n", max_score, min_score);
257
258    max_score = log(max_score);
259    min_score = log(min_score);
260
261    if (kPrintDebug0)
262      printf("-----max log value: %f, min log value: %f\n",
263             max_score, min_score);
264
265    // The absolute value of min_score is bigger than that of max_score because
266    // both of them are negative after log function.
267    score_amplifier_ = 1.0 * 255 / min_score;
268
269    double average_score = 0;
270    for (unsigned pos = 0; pos < spelling_num_; pos++) {
271      double score = log(raw_spellings_[pos].freq) * score_amplifier_;
272      assert(score >= 0);
273
274      average_score += score;
275
276      // Because of calculation precision issue, score might be a little bigger
277      // than 255 after being amplified.
278      if (score > 255)
279        score = 255;
280      char *this_spl_buf = spelling_buf_ + pos * spelling_size_;
281      this_spl_buf[spelling_size_ - 1] =
282          static_cast<char>((unsigned char)score);
283
284      if (kPrintDebug0) {
285        printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf,
286               (unsigned char)this_spl_buf[spelling_size_ -1]);
287      }
288    }
289    average_score /= spelling_num_;
290    assert(average_score <= 255);
291    average_score_ = static_cast<uint8>(average_score);
292
293    if (kPrintDebug0)
294      printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_,
295             average_score_);
296  }
297
298  *item_size = spelling_size_;
299  *spl_num = spelling_num_;
300  frozen_ = true;
301  return spelling_buf_;
302}
303
304float SpellingTable::get_score_amplifier() {
305  return static_cast<float>(score_amplifier_);
306}
307
308unsigned char SpellingTable::get_average_score() {
309  return average_score_;
310}
311
312#endif  // ___BUILD_MODEL___
313}  // namespace ime_pinyin