/src/im/gpinyin/share/spellingtable.cpp

http://ftk.googlecode.com/ · C++ · 313 lines · 222 code · 64 blank · 27 comment · 78 complexity · aff9c87910adb43c68149b69bed2051f MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <new>
#include "../include/spellingtable.h"
  22. namespace ime_pinyin {
  23. #ifdef ___BUILD_MODEL___
  24. const char SpellingTable::
  25. kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"};
  26. // "" is the biggest, so that all empty strings will be moved to the end
  27. // _eb mean empty is biggest
  28. int compare_raw_spl_eb(const void* p1, const void* p2) {
  29. if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0])
  30. return 1;
  31. if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0])
  32. return -1;
  33. return strcmp((static_cast<const RawSpelling*>(p1))->str,
  34. (static_cast<const RawSpelling*>(p2))->str);
  35. }
  36. unsigned get_odd_next(unsigned value) {
  37. unsigned v_next = value;
  38. while (true) {
  39. unsigned v_next_sqrt = (unsigned)sqrt(v_next);
  40. bool is_odd = true;
  41. for (unsigned v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) {
  42. if (v_next % v_dv == 0) {
  43. is_odd = false;
  44. break;
  45. }
  46. }
  47. if (is_odd)
  48. return v_next;
  49. v_next++;
  50. }
  51. // never reach here
  52. return 0;
  53. }
  54. SpellingTable::SpellingTable() {
  55. need_score_ = false;
  56. raw_spellings_ = NULL;
  57. spelling_buf_ = NULL;
  58. spelling_num_ = 0;
  59. total_freq_ = 0;
  60. frozen_ = true;
  61. }
  62. SpellingTable::~SpellingTable() {
  63. free_resource();
  64. }
  65. unsigned SpellingTable::get_hash_pos(const char* spelling_str) {
  66. unsigned hash_pos = 0;
  67. for (unsigned pos = 0; pos < spelling_size_; pos++) {
  68. if ('\0' == spelling_str[pos])
  69. break;
  70. hash_pos += (unsigned)spelling_str[pos];
  71. }
  72. hash_pos = hash_pos % spelling_max_num_;
  73. return hash_pos;
  74. }
  75. unsigned SpellingTable::hash_pos_next(unsigned hash_pos) {
  76. hash_pos += 123;
  77. hash_pos = hash_pos % spelling_max_num_;
  78. return hash_pos;
  79. }
  80. void SpellingTable::free_resource() {
  81. if (NULL != raw_spellings_)
  82. delete [] raw_spellings_;
  83. raw_spellings_ = NULL;
  84. if (NULL != spelling_buf_)
  85. delete [] spelling_buf_;
  86. spelling_buf_ = NULL;
  87. }
  88. bool SpellingTable::init_table(unsigned pure_spl_size, unsigned spl_max_num,
  89. bool need_score) {
  90. if (pure_spl_size == 0 || spl_max_num ==0)
  91. return false;
  92. need_score_ = need_score;
  93. free_resource();
  94. spelling_size_ = pure_spl_size + 1;
  95. if (need_score)
  96. spelling_size_ += 1;
  97. spelling_max_num_ = get_odd_next(spl_max_num);
  98. spelling_num_ = 0;
  99. raw_spellings_ = new RawSpelling[spelling_max_num_];
  100. spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)];
  101. if (NULL == raw_spellings_ || NULL == spelling_buf_) {
  102. free_resource();
  103. return false;
  104. }
  105. memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling));
  106. memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_));
  107. frozen_ = false;
  108. total_freq_ = 0;
  109. return true;
  110. }
  111. bool SpellingTable::put_spelling(const char* spelling_str, double freq) {
  112. if (frozen_ || NULL == spelling_str)
  113. return false;
  114. for (unsigned pos = 0; pos < kNotSupportNum; pos++) {
  115. if (strcmp(spelling_str, kNotSupportList[pos]) == 0) {
  116. return false;
  117. }
  118. }
  119. total_freq_ += freq;
  120. unsigned hash_pos = get_hash_pos(spelling_str);
  121. raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
  122. if (strncmp(raw_spellings_[hash_pos].str, spelling_str,
  123. spelling_size_ - 1) == 0) {
  124. raw_spellings_[hash_pos].freq += freq;
  125. return true;
  126. }
  127. unsigned hash_pos_ori = hash_pos;
  128. while (true) {
  129. if (strncmp(raw_spellings_[hash_pos].str,
  130. spelling_str, spelling_size_ - 1) == 0) {
  131. raw_spellings_[hash_pos].freq += freq;
  132. return true;
  133. }
  134. if ('\0' == raw_spellings_[hash_pos].str[0]) {
  135. raw_spellings_[hash_pos].freq += freq;
  136. strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1);
  137. raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
  138. spelling_num_++;
  139. return true;
  140. }
  141. hash_pos = hash_pos_next(hash_pos);
  142. if (hash_pos_ori == hash_pos)
  143. return false;
  144. }
  145. // never reach here
  146. return false;
  147. }
  148. bool SpellingTable::contain(const char* spelling_str) {
  149. if (NULL == spelling_str || NULL == spelling_buf_ || frozen_)
  150. return false;
  151. unsigned hash_pos = get_hash_pos(spelling_str);
  152. if ('\0' == raw_spellings_[hash_pos].str[0])
  153. return false;
  154. if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
  155. == 0)
  156. return true;
  157. unsigned hash_pos_ori = hash_pos;
  158. while (true) {
  159. hash_pos = hash_pos_next(hash_pos);
  160. if (hash_pos_ori == hash_pos)
  161. return false;
  162. if ('\0' == raw_spellings_[hash_pos].str[0])
  163. return false;
  164. if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
  165. == 0)
  166. return true;
  167. }
  168. // never reach here
  169. return false;
  170. }
  171. const char* SpellingTable::arrange(unsigned *item_size, unsigned *spl_num) {
  172. if (NULL == raw_spellings_ || NULL == spelling_buf_ ||
  173. NULL == item_size || NULL == spl_num)
  174. return NULL;
  175. qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling),
  176. compare_raw_spl_eb);
  177. // After sorting, only the first spelling_num_ items are valid.
  178. // Copy them to the destination buffer.
  179. for (unsigned pos = 0; pos < spelling_num_; pos++) {
  180. strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str,
  181. spelling_size_);
  182. }
  183. if (need_score_) {
  184. if (kPrintDebug0)
  185. printf("------------Spelling Possiblities--------------\n");
  186. double max_score = 0;
  187. double min_score = 0;
  188. // After sorting, only the first spelling_num_ items are valid.
  189. for (unsigned pos = 0; pos < spelling_num_; pos++) {
  190. raw_spellings_[pos].freq /= total_freq_;
  191. if (need_score_) {
  192. if (0 == pos) {
  193. max_score = raw_spellings_[0].freq;
  194. min_score = max_score;
  195. } else {
  196. if (raw_spellings_[pos].freq > max_score)
  197. max_score = raw_spellings_[pos].freq;
  198. if (raw_spellings_[pos].freq < min_score)
  199. min_score = raw_spellings_[pos].freq;
  200. }
  201. }
  202. }
  203. if (kPrintDebug0)
  204. printf("-----max psb: %f, min psb: %f\n", max_score, min_score);
  205. max_score = log(max_score);
  206. min_score = log(min_score);
  207. if (kPrintDebug0)
  208. printf("-----max log value: %f, min log value: %f\n",
  209. max_score, min_score);
  210. // The absolute value of min_score is bigger than that of max_score because
  211. // both of them are negative after log function.
  212. score_amplifier_ = 1.0 * 255 / min_score;
  213. double average_score = 0;
  214. for (unsigned pos = 0; pos < spelling_num_; pos++) {
  215. double score = log(raw_spellings_[pos].freq) * score_amplifier_;
  216. assert(score >= 0);
  217. average_score += score;
  218. // Because of calculation precision issue, score might be a little bigger
  219. // than 255 after being amplified.
  220. if (score > 255)
  221. score = 255;
  222. char *this_spl_buf = spelling_buf_ + pos * spelling_size_;
  223. this_spl_buf[spelling_size_ - 1] =
  224. static_cast<char>((unsigned char)score);
  225. if (kPrintDebug0) {
  226. printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf,
  227. (unsigned char)this_spl_buf[spelling_size_ -1]);
  228. }
  229. }
  230. average_score /= spelling_num_;
  231. assert(average_score <= 255);
  232. average_score_ = static_cast<uint8>(average_score);
  233. if (kPrintDebug0)
  234. printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_,
  235. average_score_);
  236. }
  237. *item_size = spelling_size_;
  238. *spl_num = spelling_num_;
  239. frozen_ = true;
  240. return spelling_buf_;
  241. }
  242. float SpellingTable::get_score_amplifier() {
  243. return static_cast<float>(score_amplifier_);
  244. }
  245. unsigned char SpellingTable::get_average_score() {
  246. return average_score_;
  247. }
  248. #endif // ___BUILD_MODEL___
  249. } // namespace ime_pinyin