/src/im/gpinyin/share/dictlist.cpp

http://ftk.googlecode.com/ · C++ · 446 lines · 328 code · 93 blank · 25 comment · 137 complexity · e922a6a084f777800181d18e5ea9482b MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <assert.h>
  17. #include <stdlib.h>
  18. #include <string.h>
  19. #include "../include/dictlist.h"
  20. #include "../include/mystdlib.h"
  21. #include "../include/ngram.h"
  22. #include "../include/searchutility.h"
  23. namespace ime_pinyin {
  24. DictList::DictList() {
  25. initialized_ = false;
  26. scis_num_ = 0;
  27. scis_hz_ = NULL;
  28. scis_splid_ = NULL;
  29. buf_ = NULL;
  30. spl_trie_ = SpellingTrie::get_cpinstance();
  31. assert(kMaxLemmaSize == 8);
  32. cmp_func_[0] = cmp_hanzis_1;
  33. cmp_func_[1] = cmp_hanzis_2;
  34. cmp_func_[2] = cmp_hanzis_3;
  35. cmp_func_[3] = cmp_hanzis_4;
  36. cmp_func_[4] = cmp_hanzis_5;
  37. cmp_func_[5] = cmp_hanzis_6;
  38. cmp_func_[6] = cmp_hanzis_7;
  39. cmp_func_[7] = cmp_hanzis_8;
  40. }
  41. DictList::~DictList() {
  42. free_resource();
  43. }
  44. bool DictList::alloc_resource(unsigned buf_size, unsigned scis_num) {
  45. // Allocate memory
  46. buf_ = static_cast<char16*>(malloc(buf_size * sizeof(char16)));
  47. if (NULL == buf_)
  48. return false;
  49. scis_num_ = scis_num;
  50. scis_hz_ = static_cast<char16*>(malloc(scis_num_ * sizeof(char16)));
  51. if (NULL == scis_hz_)
  52. return false;
  53. scis_splid_ = static_cast<SpellingId*>
  54. (malloc(scis_num_ * sizeof(SpellingId)));
  55. if (NULL == scis_splid_)
  56. return false;
  57. return true;
  58. }
  59. void DictList::free_resource() {
  60. if (NULL != buf_)
  61. free(buf_);
  62. buf_ = NULL;
  63. if (NULL != scis_hz_)
  64. free(scis_hz_);
  65. scis_hz_ = NULL;
  66. if (NULL != scis_splid_)
  67. free(scis_splid_);
  68. scis_splid_ = NULL;
  69. }
  70. #ifdef ___BUILD_MODEL___
  71. bool DictList::init_list(const SingleCharItem *scis, unsigned scis_num,
  72. const LemmaEntry *lemma_arr, unsigned lemma_num) {
  73. if (NULL == scis || 0 == scis_num || NULL == lemma_arr || 0 == lemma_num)
  74. return false;
  75. initialized_ = false;
  76. if (NULL != buf_)
  77. free(buf_);
  78. // calculate the size
  79. unsigned buf_size = calculate_size(lemma_arr, lemma_num);
  80. if (0 == buf_size)
  81. return false;
  82. if (!alloc_resource(buf_size, scis_num))
  83. return false;
  84. fill_scis(scis, scis_num);
  85. // Copy the related content from the array to inner buffer
  86. fill_list(lemma_arr, lemma_num);
  87. initialized_ = true;
  88. return true;
  89. }
  90. unsigned DictList::calculate_size(const LemmaEntry* lemma_arr, unsigned lemma_num) {
  91. unsigned last_hz_len = 0;
  92. unsigned list_size = 0;
  93. unsigned id_num = 0;
  94. for (unsigned i = 0; i < lemma_num; i++) {
  95. if (0 == i) {
  96. last_hz_len = lemma_arr[i].hz_str_len;
  97. assert(last_hz_len > 0);
  98. assert(lemma_arr[0].idx_by_hz == 1);
  99. id_num++;
  100. start_pos_[0] = 0;
  101. start_id_[0] = id_num;
  102. last_hz_len = 1;
  103. list_size += last_hz_len;
  104. } else {
  105. unsigned current_hz_len = lemma_arr[i].hz_str_len;
  106. assert(current_hz_len >= last_hz_len);
  107. if (current_hz_len == last_hz_len) {
  108. list_size += current_hz_len;
  109. id_num++;
  110. } else {
  111. for (unsigned len = last_hz_len; len < current_hz_len - 1; len++) {
  112. start_pos_[len] = start_pos_[len - 1];
  113. start_id_[len] = start_id_[len - 1];
  114. }
  115. start_pos_[current_hz_len - 1] = list_size;
  116. id_num++;
  117. start_id_[current_hz_len - 1] = id_num;
  118. last_hz_len = current_hz_len;
  119. list_size += current_hz_len;
  120. }
  121. }
  122. }
  123. for (unsigned i = last_hz_len; i <= kMaxLemmaSize; i++) {
  124. if (0 == i) {
  125. start_pos_[0] = 0;
  126. start_id_[0] = 1;
  127. } else {
  128. start_pos_[i] = list_size;
  129. start_id_[i] = id_num;
  130. }
  131. }
  132. return start_pos_[kMaxLemmaSize];
  133. }
  134. void DictList::fill_scis(const SingleCharItem *scis, unsigned scis_num) {
  135. assert(scis_num_ == scis_num);
  136. for (unsigned pos = 0; pos < scis_num_; pos++) {
  137. scis_hz_[pos] = scis[pos].hz;
  138. scis_splid_[pos] = scis[pos].splid;
  139. }
  140. }
  141. void DictList::fill_list(const LemmaEntry* lemma_arr, unsigned lemma_num) {
  142. unsigned current_pos = 0;
  143. utf16_strncpy(buf_, lemma_arr[0].hanzi_str,
  144. lemma_arr[0].hz_str_len);
  145. current_pos = lemma_arr[0].hz_str_len;
  146. unsigned id_num = 1;
  147. for (unsigned i = 1; i < lemma_num; i++) {
  148. utf16_strncpy(buf_ + current_pos, lemma_arr[i].hanzi_str,
  149. lemma_arr[i].hz_str_len);
  150. id_num++;
  151. current_pos += lemma_arr[i].hz_str_len;
  152. }
  153. assert(current_pos == start_pos_[kMaxLemmaSize]);
  154. assert(id_num == start_id_[kMaxLemmaSize]);
  155. }
  156. char16* DictList::find_pos2_startedbyhz(char16 hz_char) {
  157. char16 *found_2w = static_cast<char16*>
  158. (mybsearch(&hz_char, buf_ + start_pos_[1],
  159. (start_pos_[2] - start_pos_[1]) / 2,
  160. sizeof(char16) * 2, cmp_hanzis_1));
  161. if (NULL == found_2w)
  162. return NULL;
  163. while (found_2w > buf_ + start_pos_[1] && *found_2w == *(found_2w - 1))
  164. found_2w -= 2;
  165. return found_2w;
  166. }
  167. #endif // ___BUILD_MODEL___
  168. char16* DictList::find_pos_startedbyhzs(const char16 last_hzs[],
  169. unsigned word_len, int (*cmp_func)(const void *, const void *)) {
  170. char16 *found_w = static_cast<char16*>
  171. (mybsearch(last_hzs, buf_ + start_pos_[word_len - 1],
  172. (start_pos_[word_len] - start_pos_[word_len - 1])
  173. / word_len,
  174. sizeof(char16) * word_len, cmp_func));
  175. if (NULL == found_w)
  176. return NULL;
  177. while (found_w > buf_ + start_pos_[word_len -1] &&
  178. cmp_func(found_w, found_w - word_len) == 0)
  179. found_w -= word_len;
  180. return found_w;
  181. }
  182. unsigned DictList::predict(const char16 last_hzs[], uint16 hzs_len,
  183. NPredictItem *npre_items, unsigned npre_max,
  184. unsigned b4_used) {
  185. assert(hzs_len <= kMaxPredictSize && hzs_len > 0);
  186. // 1. Prepare work
  187. int (*cmp_func)(const void *, const void *) = cmp_func_[hzs_len - 1];
  188. NGram& ngram = NGram::get_instance();
  189. unsigned item_num = 0;
  190. // 2. Do prediction
  191. for (uint16 pre_len = 1; pre_len <= kMaxPredictSize + 1 - hzs_len;
  192. pre_len++) {
  193. uint16 word_len = hzs_len + pre_len;
  194. char16 *w_buf = find_pos_startedbyhzs(last_hzs, word_len, cmp_func);
  195. if (NULL == w_buf)
  196. continue;
  197. while (w_buf < buf_ + start_pos_[word_len] &&
  198. cmp_func(w_buf, last_hzs) == 0 &&
  199. item_num < npre_max) {
  200. memset(npre_items + item_num, 0, sizeof(NPredictItem));
  201. utf16_strncpy(npre_items[item_num].pre_hzs, w_buf + hzs_len, pre_len);
  202. npre_items[item_num].psb =
  203. ngram.get_uni_psb((unsigned)(w_buf - buf_ - start_pos_[word_len - 1])
  204. / word_len + start_id_[word_len - 1]);
  205. npre_items[item_num].his_len = hzs_len;
  206. item_num++;
  207. w_buf += word_len;
  208. }
  209. }
  210. unsigned new_num = 0;
  211. for (unsigned i = 0; i < item_num; i++) {
  212. // Try to find it in the existing items
  213. unsigned e_pos;
  214. for (e_pos = 1; e_pos <= b4_used; e_pos++) {
  215. if (utf16_strncmp((*(npre_items - e_pos)).pre_hzs, npre_items[i].pre_hzs,
  216. kMaxPredictSize) == 0)
  217. break;
  218. }
  219. if (e_pos <= b4_used)
  220. continue;
  221. // If not found, append it to the buffer
  222. npre_items[new_num] = npre_items[i];
  223. new_num++;
  224. }
  225. return new_num;
  226. }
  227. uint16 DictList::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
  228. uint16 str_max) {
  229. if (!initialized_ || id_lemma >= start_id_[kMaxLemmaSize] || NULL == str_buf
  230. || str_max <= 1)
  231. return 0;
  232. // Find the range
  233. for (uint16 i = 0; i < kMaxLemmaSize; i++) {
  234. if (i + 1 > str_max - 1)
  235. return 0;
  236. if (start_id_[i] <= id_lemma && start_id_[i + 1] > id_lemma) {
  237. unsigned id_span = id_lemma - start_id_[i];
  238. uint16 *buf = buf_ + start_pos_[i] + id_span * (i + 1);
  239. for (uint16 len = 0; len <= i; len++) {
  240. str_buf[len] = buf[len];
  241. }
  242. str_buf[i+1] = (char16)'\0';
  243. return i + 1;
  244. }
  245. }
  246. return 0;
  247. }
  248. uint16 DictList::get_splids_for_hanzi(char16 hanzi, uint16 half_splid,
  249. uint16 *splids, uint16 max_splids) {
  250. char16 *hz_found = static_cast<char16*>
  251. (mybsearch(&hanzi, scis_hz_, scis_num_, sizeof(char16), cmp_hanzis_1));
  252. assert(NULL != hz_found && hanzi == *hz_found);
  253. // Move to the first one.
  254. while (hz_found > scis_hz_ && hanzi == *(hz_found - 1))
  255. hz_found--;
  256. // First try to found if strict comparison result is not zero.
  257. char16 *hz_f = hz_found;
  258. bool strict = false;
  259. while (hz_f < scis_hz_ + scis_num_ && hanzi == *hz_f) {
  260. uint16 pos = hz_f - scis_hz_;
  261. if (0 == half_splid || scis_splid_[pos].half_splid == half_splid) {
  262. strict = true;
  263. }
  264. hz_f++;
  265. }
  266. uint16 found_num = 0;
  267. while (hz_found < scis_hz_ + scis_num_ && hanzi == *hz_found) {
  268. uint16 pos = hz_found - scis_hz_;
  269. if (0 == half_splid ||
  270. (strict && scis_splid_[pos].half_splid == half_splid) ||
  271. (!strict && spl_trie_->half_full_compatible(half_splid,
  272. scis_splid_[pos].full_splid))) {
  273. assert(found_num + 1 < max_splids);
  274. splids[found_num] = scis_splid_[pos].full_splid;
  275. found_num++;
  276. }
  277. hz_found++;
  278. }
  279. return found_num;
  280. }
  281. LemmaIdType DictList::get_lemma_id(const char16 *str, uint16 str_len) {
  282. if (NULL == str || str_len > kMaxLemmaSize)
  283. return 0;
  284. char16 *found = find_pos_startedbyhzs(str, str_len, cmp_func_[str_len - 1]);
  285. if (NULL == found)
  286. return 0;
  287. assert(found > buf_);
  288. assert(static_cast<unsigned>(found - buf_) >= start_pos_[str_len - 1]);
  289. return static_cast<LemmaIdType>
  290. (start_id_[str_len - 1] +
  291. (found - buf_ - start_pos_[str_len - 1]) / str_len);
  292. }
  293. void DictList::convert_to_hanzis(char16 *str, uint16 str_len) {
  294. assert(NULL != str);
  295. for (uint16 str_pos = 0; str_pos < str_len; str_pos++) {
  296. str[str_pos] = scis_hz_[str[str_pos]];
  297. }
  298. }
  299. void DictList::convert_to_scis_ids(char16 *str, uint16 str_len) {
  300. assert(NULL != str);
  301. for (uint16 str_pos = 0; str_pos < str_len; str_pos++) {
  302. str[str_pos] = 0x100;
  303. }
  304. }
  305. bool DictList::save_list(FILE *fp) {
  306. if (!initialized_ || NULL == fp)
  307. return false;
  308. if (NULL == buf_ || 0 == start_pos_[kMaxLemmaSize] ||
  309. NULL == scis_hz_ || NULL == scis_splid_ || 0 == scis_num_)
  310. return false;
  311. if (fwrite(&scis_num_, sizeof(unsigned), 1, fp) != 1)
  312. return false;
  313. if (fwrite(start_pos_, sizeof(unsigned), kMaxLemmaSize + 1, fp) !=
  314. kMaxLemmaSize + 1)
  315. return false;
  316. if (fwrite(start_id_, sizeof(unsigned), kMaxLemmaSize + 1, fp) !=
  317. kMaxLemmaSize + 1)
  318. return false;
  319. if (fwrite(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_)
  320. return false;
  321. if (fwrite(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_)
  322. return false;
  323. if (fwrite(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) !=
  324. start_pos_[kMaxLemmaSize])
  325. return false;
  326. return true;
  327. }
  328. bool DictList::load_list(FILE *fp) {
  329. if (NULL == fp)
  330. return false;
  331. initialized_ = false;
  332. if (fread(&scis_num_, sizeof(unsigned), 1, fp) != 1)
  333. return false;
  334. if (fread(start_pos_, sizeof(unsigned), kMaxLemmaSize + 1, fp) !=
  335. kMaxLemmaSize + 1)
  336. return false;
  337. if (fread(start_id_, sizeof(unsigned), kMaxLemmaSize + 1, fp) !=
  338. kMaxLemmaSize + 1)
  339. return false;
  340. free_resource();
  341. if (!alloc_resource(start_pos_[kMaxLemmaSize], scis_num_))
  342. return false;
  343. if (fread(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_)
  344. return false;
  345. if (fread(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_)
  346. return false;
  347. if (fread(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) !=
  348. start_pos_[kMaxLemmaSize])
  349. return false;
  350. initialized_ = true;
  351. return true;
  352. }
  353. } // namespace ime_pinyin