PageRenderTime 33ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 1ms

/src/im/gpinyin/share/matrixsearch.cpp

http://ftk.googlecode.com/
C++ | 1958 lines | 1468 code | 307 blank | 183 comment | 541 complexity | 78b349d2a93098d3ec614f9c46e4972e MD5 | raw file
Possible License(s): LGPL-3.0
  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <assert.h>
  17. #include <math.h>
  18. #include <stdio.h>
  19. #include <string.h>
  20. #include "../include/lpicache.h"
  21. #include "../include/matrixsearch.h"
  22. #include "../include/mystdlib.h"
  23. #include "../include/ngram.h"
  24. #include "../include/userdict.h"
  25. namespace ime_pinyin {
  // Score constant. NOTE(review): the name looks like a misspelling of
  // "PRUNING_SCORE", and the macro is not referenced in the visible part of
  // this file -- confirm where it is used before renaming or removing it.
  26. #define PRUMING_SCORE 8000.0
  27. MatrixSearch::MatrixSearch() {
  28. inited_ = false;
  29. spl_trie_ = SpellingTrie::get_cpinstance();
  30. reset_pointers_to_null();
  31. pys_decoded_len_ = 0;
  32. mtrx_nd_pool_used_ = 0;
  33. dmi_pool_used_ = 0;
  34. xi_an_enabled_ = false;
  35. dmi_c_phrase_ = false;
  36. assert(kMaxSearchSteps > 0);
  37. max_sps_len_ = kMaxSearchSteps - 1;
  38. max_hzs_len_ = kMaxSearchSteps;
  39. }
  40. MatrixSearch::~MatrixSearch() {
  41. free_resource();
  42. }
  43. void MatrixSearch::reset_pointers_to_null() {
  44. dict_trie_ = NULL;
  45. user_dict_ = NULL;
  46. spl_parser_ = NULL;
  47. share_buf_ = NULL;
  48. // The following four buffers are used for decoding, and they are based on
  49. // share_buf_, no need to delete them.
  50. mtrx_nd_pool_ = NULL;
  51. dmi_pool_ = NULL;
  52. matrix_ = NULL;
  53. dep_ = NULL;
  54. // Based on share_buf_, no need to delete them.
  55. npre_items_ = NULL;
  56. }
  57. bool MatrixSearch::alloc_resource() {
  58. free_resource();
  59. dict_trie_ = new DictTrie();
  60. user_dict_ = static_cast<AtomDictBase*>(new UserDict());
  61. spl_parser_ = new SpellingParser();
  62. unsigned mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize;
  63. mtrx_nd_size = align_to_unsigned(mtrx_nd_size) / sizeof(unsigned);
  64. unsigned dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize;
  65. dmi_size = align_to_unsigned(dmi_size) / sizeof(unsigned);
  66. unsigned matrix_size = sizeof(MatrixRow) * kMaxRowNum;
  67. matrix_size = align_to_unsigned(matrix_size) / sizeof(unsigned);
  68. unsigned dep_size = sizeof(DictExtPara);
  69. dep_size = align_to_unsigned(dep_size) / sizeof(unsigned);
  70. // share_buf's size is determined by the buffers for search.
  71. share_buf_ = new unsigned[mtrx_nd_size + dmi_size + matrix_size + dep_size];
  72. if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ ||
  73. NULL == share_buf_)
  74. return false;
  75. // The buffers for search are based on the share buffer
  76. mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_);
  77. dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size);
  78. matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size);
  79. dep_ = reinterpret_cast<DictExtPara*>
  80. (share_buf_ + mtrx_nd_size + dmi_size + matrix_size);
  81. // The prediction buffer is also based on the share buffer.
  82. npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_);
  83. npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) *
  84. sizeof(unsigned) / sizeof(NPredictItem);
  85. return true;
  86. }
  87. void MatrixSearch::free_resource() {
  88. if (NULL != dict_trie_)
  89. delete dict_trie_;
  90. if (NULL != user_dict_)
  91. delete user_dict_;
  92. if (NULL != spl_parser_)
  93. delete spl_parser_;
  94. if (NULL != share_buf_)
  95. delete [] share_buf_;
  96. reset_pointers_to_null();
  97. }
  98. bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) {
  99. if (NULL == fn_sys_dict || NULL == fn_usr_dict)
  100. return false;
  101. if (!alloc_resource())
  102. return false;
  103. if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd))
  104. return false;
  105. // If engine fails to load the user dictionary, reset the user dictionary
  106. // to NULL.
  107. if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
  108. delete user_dict_;
  109. user_dict_ = NULL;
  110. } else{
  111. user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
  112. }
  113. reset_search0();
  114. inited_ = true;
  115. return true;
  116. }
  117. bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length,
  118. const char *fn_usr_dict) {
  119. if (NULL == fn_usr_dict)
  120. return false;
  121. if (!alloc_resource())
  122. return false;
  123. if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd))
  124. return false;
  125. if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
  126. delete user_dict_;
  127. user_dict_ = NULL;
  128. } else {
  129. user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
  130. }
  131. reset_search0();
  132. inited_ = true;
  133. return true;
  134. }
  135. void MatrixSearch::set_max_lens(unsigned max_sps_len, unsigned max_hzs_len) {
  136. if (0 != max_sps_len)
  137. max_sps_len_ = max_sps_len;
  138. if (0 != max_hzs_len)
  139. max_hzs_len_ = max_hzs_len;
  140. }
  141. void MatrixSearch::close() {
  142. flush_cache();
  143. free_resource();
  144. inited_ = false;
  145. }
  146. void MatrixSearch::flush_cache() {
  147. if (NULL != user_dict_)
  148. user_dict_->flush_cache();
  149. }
  150. void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) {
  151. xi_an_enabled_ = xi_an_enabled;
  152. }
  153. bool MatrixSearch::get_xi_an_switch() {
  154. return xi_an_enabled_;
  155. }
  156. bool MatrixSearch::reset_search() {
  157. if (!inited_)
  158. return false;
  159. return reset_search0();
  160. }
  161. bool MatrixSearch::reset_search0() {
  162. if (!inited_)
  163. return false;
  164. pys_decoded_len_ = 0;
  165. mtrx_nd_pool_used_ = 0;
  166. dmi_pool_used_ = 0;
  167. // Get a MatrixNode from the pool
  168. matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_;
  169. matrix_[0].mtrx_nd_num = 1;
  170. mtrx_nd_pool_used_ += 1;
  171. // Update the node, and make it to be a starting node
  172. MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos;
  173. node->id = 0;
  174. node->score = 0;
  175. node->from = NULL;
  176. node->step = 0;
  177. node->dmi_fr = (PoolPosType)-1;
  178. matrix_[0].dmi_pos = 0;
  179. matrix_[0].dmi_num = 0;
  180. matrix_[0].dmi_has_full_id = 1;
  181. matrix_[0].mtrx_nd_fixed = node;
  182. lma_start_[0] = 0;
  183. fixed_lmas_ = 0;
  184. spl_start_[0] = 0;
  185. fixed_hzs_ = 0;
  186. dict_trie_->reset_milestones(0, 0);
  187. if (NULL != user_dict_)
  188. user_dict_->reset_milestones(0, 0);
  189. return true;
  190. }
  // Rolls the search state back to position ch_pos of the decoded Pinyin
  // string.
  //
  // ch_pos                - position to roll back to; 0 performs a full reset
  //                         through reset_search0().
  // clear_fixed_this_step - when the fixed part is recomputed, start looking
  //                         for the last fixed node one row before ch_pos.
  // clear_dmi_this_step   - also drop the DMI nodes of row ch_pos itself.
  // clear_mtrx_this_step  - also drop the matrix nodes of row ch_pos itself.
  //
  // Returns false when the engine is not initialized, or when ch_pos is beyond
  // the decoded length or not below kMaxRowNum; true otherwise.
  191. bool MatrixSearch::reset_search(unsigned ch_pos, bool clear_fixed_this_step,
  192. bool clear_dmi_this_step,
  193. bool clear_mtrx_this_step) {
  194. if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum)
  195. return false;
  196. if (0 == ch_pos) {
  197. reset_search0();
  198. } else {
  199. // Prepare mile stones of this step to clear.
  200. MileStoneHandle *dict_handles_to_clear = NULL;
  201. if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) {
  202. dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles;
  203. }
  204. // If there are more steps, and this step is not allowed to clear, find
  205. // milestones of next step.
  206. if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) {
  207. dict_handles_to_clear = NULL;
  208. if (matrix_[ch_pos + 1].dmi_num > 0) {
  209. dict_handles_to_clear =
  210. dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles;
  211. }
  212. }
  // Handle [0] goes to the system dictionary, handle [1] to the user
  // dictionary.
  213. if (NULL != dict_handles_to_clear) {
  214. dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]);
  215. if (NULL != user_dict_)
  216. user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]);
  217. }
  218. pys_decoded_len_ = ch_pos;
  // Shrink the DMI pool: either up to the end of the previous row (dropping
  // this row's nodes) or up to the end of this row.
  219. if (clear_dmi_this_step) {
  220. dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos
  221. + matrix_[ch_pos - 1].dmi_num;
  222. matrix_[ch_pos].dmi_num = 0;
  223. } else {
  224. dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num;
  225. }
  // Same for the matrix node pool.
  226. if (clear_mtrx_this_step) {
  227. mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos
  228. + matrix_[ch_pos - 1].mtrx_nd_num;
  229. matrix_[ch_pos].mtrx_nd_num = 0;
  230. } else {
  231. mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos
  232. + matrix_[ch_pos].mtrx_nd_num;
  233. }
  234. // Modify fixed_hzs_
  // This branch is taken when something is fixed and either the first lemma
  // is not a composing phrase, or the composing phrase starts its last
  // spelling boundary at or before ch_pos.
  235. if (fixed_hzs_ > 0 &&
  236. ((kLemmaIdComposing != lma_id_[0]) ||
  237. (kLemmaIdComposing == lma_id_[0] &&
  238. spl_start_[c_phrase_.length] <= ch_pos))) {
  239. unsigned fixed_ch_pos = ch_pos;
  240. if (clear_fixed_this_step)
  241. fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0;
  // Walk back to the nearest row that carries a fixed node.
  242. while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0)
  243. fixed_ch_pos--;
  // Recompute how many Hanzi and lemmas are fixed up to that row.
  244. fixed_lmas_ = 0;
  245. fixed_hzs_ = 0;
  246. if (fixed_ch_pos > 0) {
  247. while (spl_start_[fixed_hzs_] < fixed_ch_pos)
  248. fixed_hzs_++;
  249. assert(spl_start_[fixed_hzs_] == fixed_ch_pos);
  250. while (lma_start_[fixed_lmas_] < fixed_hzs_)
  251. fixed_lmas_++;
  252. assert(lma_start_[fixed_lmas_] == fixed_hzs_);
  253. }
  254. // Re-search the Pinyin string for the unlocked lemma
  255. // which was previously fixed.
  256. //
  257. // Prepare mile stones of this step to clear.
  258. MileStoneHandle *dict_handles_to_clear = NULL;
  259. if (clear_dmi_this_step && ch_pos == fixed_ch_pos &&
  260. matrix_[fixed_ch_pos].dmi_num > 0) {
  261. dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles;
  262. }
  263. // If there are more steps, and this step is not allowed to clear, find
  264. // milestones of next step.
  265. if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) {
  266. dict_handles_to_clear = NULL;
  267. if (matrix_[fixed_ch_pos + 1].dmi_num > 0) {
  268. dict_handles_to_clear =
  269. dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles;
  270. }
  271. }
  272. if (NULL != dict_handles_to_clear) {
  273. dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]);
  274. if (NULL != user_dict_)
  275. user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]);
  276. }
  277. pys_decoded_len_ = fixed_ch_pos;
  278. if (clear_dmi_this_step && ch_pos == fixed_ch_pos) {
  279. dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos
  280. + matrix_[fixed_ch_pos - 1].dmi_num;
  281. matrix_[fixed_ch_pos].dmi_num = 0;
  282. } else {
  283. dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos +
  284. matrix_[fixed_ch_pos].dmi_num;
  285. }
  286. if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) {
  287. mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos
  288. + matrix_[fixed_ch_pos - 1].mtrx_nd_num;
  289. matrix_[fixed_ch_pos].mtrx_nd_num = 0;
  290. } else {
  291. mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos
  292. + matrix_[fixed_ch_pos].mtrx_nd_num;
  293. }
  // Decode again the characters between the new fixed position and ch_pos.
  294. for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) {
  295. add_char(pys_[re_pos]);
  296. }
  // Otherwise the first lemma is a composing phrase that extends beyond
  // ch_pos: cut the phrase at the spelling that contains ch_pos.
  297. } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) {
  298. for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) {
  299. uint16 splpos_begin = c_phrase_.sublma_start[subpos];
  300. uint16 splpos_end = c_phrase_.sublma_start[subpos + 1];
  301. for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) {
  302. // If ch_pos is in this spelling
  303. uint16 spl_start = c_phrase_.spl_start[splpos];
  304. uint16 spl_end = c_phrase_.spl_start[splpos + 1];
  305. if (ch_pos >= spl_start && ch_pos < spl_end) {
  306. // Clear everything after this position
  307. c_phrase_.chn_str[splpos] = static_cast<char16>('\0');
  308. c_phrase_.sublma_start[subpos + 1] = splpos;
  309. c_phrase_.sublma_num = subpos + 1;
  310. c_phrase_.length = splpos;
  // The cut falls on the first spelling of this sub-lemma, so the
  // sub-lemma itself is dropped as well.
  311. if (splpos == splpos_begin) {
  312. c_phrase_.sublma_num = subpos;
  313. }
  314. }
  315. }
  316. }
  317. // Extend the composing phrase.
  318. reset_search0();
  319. dmi_c_phrase_ = true;
  320. uint16 c_py_pos = 0;
  321. while (c_py_pos < spl_start_[c_phrase_.length]) {
  322. bool b_ac_tmp = add_char(pys_[c_py_pos]);
  323. assert(b_ac_tmp);
  324. c_py_pos++;
  325. }
  326. dmi_c_phrase_ = false;
  // Record the (shortened) composing phrase as the single fixed lemma.
  327. lma_id_num_ = 1;
  328. fixed_lmas_ = 1;
  329. fixed_lmas_no1_[0] = 0; // A composing string is always modified.
  330. fixed_hzs_ = c_phrase_.length;
  331. lma_start_[1] = fixed_hzs_;
  332. lma_id_[0] = kLemmaIdComposing;
  333. matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
  334. matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
  335. }
  336. }
  337. return true;
  338. }
  339. void MatrixSearch::del_in_pys(unsigned start, unsigned len) {
  340. while (start < kMaxRowNum - len && '\0' != pys_[start]) {
  341. pys_[start] = pys_[start + len];
  342. start++;
  343. }
  344. }
  345. unsigned MatrixSearch::search(const char *py, unsigned py_len) {
  346. if (!inited_ || NULL == py)
  347. return 0;
  348. // If the search Pinyin string is too long, it will be truncated.
  349. if (py_len > kMaxRowNum - 1)
  350. py_len = kMaxRowNum - 1;
  351. // Compare the new string with the previous one. Find their prefix to
  352. // increase search efficiency.
  353. unsigned ch_pos = 0;
  354. for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) {
  355. if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos])
  356. break;
  357. }
  358. bool clear_fix = true;
  359. if (ch_pos == pys_decoded_len_)
  360. clear_fix = false;
  361. reset_search(ch_pos, clear_fix, false, false);
  362. memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos);
  363. pys_[py_len] = '\0';
  364. while ('\0' != pys_[ch_pos]) {
  365. if (!add_char(py[ch_pos])) {
  366. pys_decoded_len_ = ch_pos;
  367. break;
  368. }
  369. ch_pos++;
  370. }
  371. // Get spelling ids and starting positions.
  372. get_spl_start_id();
  373. // If there are too many spellings, remove the last letter until the spelling
  374. // number is acceptable.
  375. while (spl_id_num_ > 9) {
  376. py_len--;
  377. reset_search(py_len, false, false, false);
  378. pys_[py_len] = '\0';
  379. get_spl_start_id();
  380. }
  381. prepare_candidates();
  382. if (kPrintDebug0) {
  383. printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_);
  384. printf("--DMI Pool Used: %d\n", dmi_pool_used_);
  385. if (kPrintDebug1) {
  386. for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) {
  387. debug_print_dmi(pos, 1);
  388. }
  389. }
  390. }
  391. return ch_pos;
  392. }
  // Deletes input at position pos and decodes the remaining Pinyin string
  // again.
  //
  // pos                   - a character index into pys_ when is_pos_in_splid
  //                         is false, otherwise the index of a spelling.
  // is_pos_in_splid       - selects between the two meanings of pos.
  // clear_fixed_this_step - forwarded to reset_search(); in Pinyin mode it
  //                         also allows unlocking the last sub-lemma of a
  //                         composing phrase.
  //
  // Returns 0 when the engine is not initialized, otherwise pys_decoded_len_
  // after the operation.
  393. unsigned MatrixSearch::delsearch(unsigned pos, bool is_pos_in_splid,
  394. bool clear_fixed_this_step) {
  395. if (!inited_)
  396. return 0;
  397. unsigned reset_pos = pos;
  398. // Out of range for both Pinyin mode and Spelling id mode.
  399. if (pys_decoded_len_ <= pos) {
  400. del_in_pys(pos, 1);
  401. reset_pos = pys_decoded_len_;
  402. // Decode the string after the un-decoded position
  403. while ('\0' != pys_[reset_pos]) {
  404. if (!add_char(pys_[reset_pos])) {
  405. pys_decoded_len_ = reset_pos;
  406. break;
  407. }
  408. reset_pos++;
  409. }
  410. get_spl_start_id();
  411. prepare_candidates();
  412. return pys_decoded_len_;
  413. }
  414. // Spelling id mode, but out of range.
  415. if (is_pos_in_splid && spl_id_num_ <= pos)
  416. return pys_decoded_len_;
  417. // Begin to handle two modes respectively.
  418. // Pinyin mode by default
  419. unsigned c_py_len = 0; // The length of composing phrase's Pinyin
  420. unsigned del_py_len = 1;
  421. if (!is_pos_in_splid) {
  422. // Pinyin mode is only allowed to delete beyond the fixed lemmas.
  423. if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]])
  424. return pys_decoded_len_;
  425. del_in_pys(pos, 1);
  426. // If the deleted character is just the one after the last fixed lemma
  427. if (pos == spl_start_[lma_start_[fixed_lmas_]]) {
  428. // If all fixed lemmas have been merged, and the caller of the function
  429. // request to unlock the last fixed lemma.
  430. if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) {
  431. // Unlock the last sub lemma in the composing phrase. Because it is not
  432. // easy to unlock it directly. Instead, we re-decode the modified
  433. // composing phrase.
  434. c_phrase_.sublma_num--;
  435. c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num];
  436. reset_pos = spl_start_[c_phrase_.length];
  437. c_py_len = reset_pos;
  438. }
  439. }
  440. } else {
  // Spelling id mode: remove every character of spelling number pos.
  441. del_py_len = spl_start_[pos + 1] - spl_start_[pos];
  442. del_in_pys(spl_start_[pos], del_py_len);
  // A spelling at or after the fixed lemmas needs no composing phrase; one
  // inside the fixed part makes the fixed lemmas get merged via
  // merge_fixed_lmas().
  443. if (pos >= lma_start_[fixed_lmas_]) {
  444. c_py_len = 0;
  445. reset_pos = spl_start_[pos + 1] - del_py_len;
  446. } else {
  447. c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len;
  448. reset_pos = c_py_len;
  449. if (c_py_len > 0)
  450. merge_fixed_lmas(pos);
  451. }
  452. }
  453. if (c_py_len > 0) {
  454. assert(c_phrase_.length > 0 && c_py_len ==
  455. c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]);
  456. // The composing phrase is valid, reset all search space,
  457. // and begin a new search which will only extend the composing
  458. // phrase.
  459. reset_search0();
  460. dmi_c_phrase_ = true;
  461. // Extend the composing phrase.
  462. uint16 c_py_pos = 0;
  463. while (c_py_pos < c_py_len) {
  464. bool b_ac_tmp = add_char(pys_[c_py_pos]);
  465. assert(b_ac_tmp);
  466. c_py_pos++;
  467. }
  468. dmi_c_phrase_ = false;
  469. // Fix the composing phrase as the first choice.
  470. lma_id_num_ = 1;
  471. fixed_lmas_ = 1;
  472. fixed_lmas_no1_[0] = 0; // A composing string is always modified.
  473. fixed_hzs_ = c_phrase_.length;
  474. lma_start_[1] = fixed_hzs_;
  475. lma_id_[0] = kLemmaIdComposing;
  476. matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
  477. matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
  478. } else {
  479. // Resetting the search only clears pys_decoded_len_, but the string is kept.
  480. reset_search(reset_pos, clear_fixed_this_step, false, false);
  481. }
  482. // Decode the string after the delete position.
  483. while ('\0' != pys_[reset_pos]) {
  484. if (!add_char(pys_[reset_pos])) {
  485. pys_decoded_len_ = reset_pos;
  486. break;
  487. }
  488. reset_pos++;
  489. }
  490. get_spl_start_id();
  491. prepare_candidates();
  492. return pys_decoded_len_;
  493. }
  494. unsigned MatrixSearch::get_candidate_num() {
  495. if (!inited_ || 0 == pys_decoded_len_ ||
  496. 0 == matrix_[pys_decoded_len_].mtrx_nd_num)
  497. return 0;
  498. return 1 + lpi_total_;
  499. }
  500. char16* MatrixSearch::get_candidate(unsigned cand_id, char16 *cand_str,
  501. unsigned max_len) {
  502. if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str)
  503. return NULL;
  504. if (0 == cand_id) {
  505. return get_candidate0(cand_str, max_len, NULL, false);
  506. } else {
  507. cand_id--;
  508. }
  509. // For this case: the current sentence is a word only, and the user fixed it,
  510. // so the result will be fixed to the sentence space, and
  511. // lpi_total_ will be set to 0.
  512. if (0 == lpi_total_) {
  513. return get_candidate0(cand_str, max_len, NULL, false);
  514. }
  515. LemmaIdType id = lpi_items_[cand_id].id;
  516. char16 s[kMaxLemmaSize + 1];
  517. uint16 s_len = lpi_items_[cand_id].lma_len;
  518. if (s_len > 1) {
  519. s_len = get_lemma_str(id, s, kMaxLemmaSize + 1);
  520. } else {
  521. // For a single character, Hanzi is ready.
  522. s[0] = lpi_items_[cand_id].hanzi;
  523. s[1] = static_cast<char16>(0);
  524. }
  525. if (s_len > 0 && max_len > s_len) {
  526. utf16_strncpy(cand_str, s, s_len);
  527. cand_str[s_len] = (char16)'\0';
  528. return cand_str;
  529. }
  530. return NULL;
  531. }
  532. void MatrixSearch::update_dict_freq() {
  533. if (NULL != user_dict_) {
  534. // Update the total frequency of all lemmas, including system lemmas and
  535. // user dictionary lemmas.
  536. unsigned total_freq = user_dict_->get_total_lemma_count();
  537. dict_trie_->set_total_lemma_count_of_others(total_freq);
  538. }
  539. }
  540. bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to,
  541. float score) {
  542. if (lma_to - lma_fr <= 1 || NULL == user_dict_)
  543. return false;
  544. char16 word_str[kMaxLemmaSize + 1];
  545. uint16 spl_ids[kMaxLemmaSize];
  546. uint16 spl_id_fr = 0;
  547. for (uint16 pos = lma_fr; pos < lma_to; pos++) {
  548. LemmaIdType lma_id = lma_id_[pos];
  549. if (is_user_lemma(lma_id)) {
  550. user_dict_->update_lemma(lma_id, 1, true);
  551. }
  552. uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos];
  553. utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len);
  554. uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr,
  555. kMaxLemmaSize + 1 - spl_id_fr);
  556. assert(tmp == lma_len);
  557. tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true);
  558. if (tmp != lma_len) {
  559. return false;
  560. }
  561. spl_id_fr += lma_len;
  562. }
  563. assert(spl_id_fr <= kMaxLemmaSize);
  564. return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids,
  565. spl_id_fr, 1);
  566. }
  567. void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) {
  568. if (dmi_pos >= dmi_pool_used_) return;
  569. DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
  570. if (1 == nest_level) {
  571. printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos);
  572. }
  573. if (dmi->dict_level > 1) {
  574. debug_print_dmi(dmi->dmi_fr, nest_level + 1);
  575. }
  576. printf("---%d\n", dmi->dict_level);
  577. printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]);
  578. printf(" Spelling : %s, %d\n", SpellingTrie::get_instance().
  579. get_spelling_str(dmi->spl_id), dmi->spl_id);
  580. printf(" Total Pinyin Len: %d\n", dmi->splstr_len);
  581. if (1 == nest_level) {
  582. printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos);
  583. }
  584. }
  585. bool MatrixSearch::try_add_cand0_to_userdict() {
  586. unsigned new_cand_num = get_candidate_num();
  587. if (fixed_hzs_ > 0 && 1 == new_cand_num) {
  588. float score_from = 0;
  589. uint16 lma_id_from = 0;
  590. uint16 pos = 0;
  591. bool modified = false;
  592. while (pos < fixed_lmas_) {
  593. if (lma_start_[pos + 1] - lma_start_[lma_id_from] >
  594. static_cast<uint16>(kMaxLemmaSize)) {
  595. float score_to_add =
  596. mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
  597. .mtrx_nd_pos].score - score_from;
  598. if (modified) {
  599. score_to_add += 1.0;
  600. if (score_to_add > NGram::kMaxScore) {
  601. score_to_add = NGram::kMaxScore;
  602. }
  603. add_lma_to_userdict(lma_id_from, pos, score_to_add);
  604. }
  605. lma_id_from = pos;
  606. score_from += score_to_add;
  607. // Clear the flag for next user lemma.
  608. modified = false;
  609. }
  610. if (0 == fixed_lmas_no1_[pos]) {
  611. modified = true;
  612. }
  613. pos++;
  614. }
  615. // Single-char word is not allowed to add to userdict.
  616. if (lma_start_[pos] - lma_start_[lma_id_from] > 1) {
  617. float score_to_add =
  618. mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
  619. .mtrx_nd_pos].score - score_from;
  620. if (modified) {
  621. score_to_add += 1.0;
  622. if (score_to_add > NGram::kMaxScore) {
  623. score_to_add = NGram::kMaxScore;
  624. }
  625. add_lma_to_userdict(lma_id_from, pos, score_to_add);
  626. }
  627. }
  628. }
  629. return true;
  630. }
  631. // Choose a candidate, and give new candidates for next step.
  632. // If user finishes selection, we will try to communicate with user dictionary
  633. // to add new items or update score of some existing items.
  634. //
  635. // Basic rule:
  636. // 1. If user selects the first choice:
  637. // 1.1. If the first choice is not a sentence, instead, it is a lemma:
  638. // 1.1.1. If the first choice is a user lemma, notify the user
  639. // dictionary that a user lemma is hit, and add occuring count
  640. // by 1.
  641. // 1.1.2. If the first choice is a system lemma, do nothing.
  642. // 1.2. If the first choice is a sentence containing more than one lemma:
  643. // 1.2.1. The whole sentence will be added as a user lemma. If the
  644. // sentence contains user lemmas, -> hit, and add occuring count
  645. // by 1.
  646. unsigned MatrixSearch::choose(unsigned cand_id) {
  647. if (!inited_ || 0 == pys_decoded_len_)
  648. return 0;
  649. if (0 == cand_id) {
  650. fixed_hzs_ = spl_id_num_;
  651. matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
  652. matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
  653. for (unsigned pos = fixed_lmas_; pos < lma_id_num_; pos++) {
  654. fixed_lmas_no1_[pos] = 1;
  655. }
  656. fixed_lmas_ = lma_id_num_;
  657. lpi_total_ = 0; // Clean all other candidates.
  658. // 1. It is the first choice
  659. if (1 == lma_id_num_) {
  660. // 1.1. The first choice is not a sentence but a lemma
  661. if (is_user_lemma(lma_id_[0])) {
  662. // 1.1.1. The first choice is a user lemma, notify the user dictionary
  663. // that it is hit.
  664. if (NULL != user_dict_)
  665. user_dict_->update_lemma(lma_id_[0], 1, true);
  666. } else {
  667. // 1.1.2. do thing for a system lemma.
  668. }
  669. } else {
  670. // 1.2. The first choice is a sentence.
  671. // 1.2.1 Try to add the whole sentence to user dictionary, the whole
  672. // sentence may be splitted into many items.
  673. if (NULL != user_dict_) {
  674. try_add_cand0_to_userdict();
  675. }
  676. }
  677. update_dict_freq();
  678. return 1;
  679. } else {
  680. cand_id--;
  681. }
  682. // 2. It is not the full sentence candidate.
  683. // Find the length of the candidate.
  684. LemmaIdType id_chosen = lpi_items_[cand_id].id;
  685. LmaScoreType score_chosen = lpi_items_[cand_id].psb;
  686. unsigned cand_len = lpi_items_[cand_id].lma_len;
  687. assert(cand_len > 0);
  688. // Notify the atom dictionary that this item is hit.
  689. if (is_user_lemma(id_chosen)) {
  690. if (NULL != user_dict_) {
  691. user_dict_->update_lemma(id_chosen, 1, true);
  692. }
  693. update_dict_freq();
  694. }
  695. // 3. Fixed the chosen item.
  696. // 3.1 Get the steps number.
  697. unsigned step_fr = spl_start_[fixed_hzs_];
  698. unsigned step_to = spl_start_[fixed_hzs_ + cand_len];
  699. // 3.2 Save the length of the original string.
  700. unsigned pys_decoded_len = pys_decoded_len_;
  701. // 3.2 Reset the space of the fixed part.
  702. reset_search(step_to, false, false, true);
  703. // 3.3 For the last character of the fixed part, the previous DMI
  704. // information will be kept, while the MTRX information will be re-extended,
  705. // and only one node will be extended.
  706. matrix_[step_to].mtrx_nd_num = 0;
  707. LmaPsbItem lpi_item;
  708. lpi_item.psb = score_chosen;
  709. lpi_item.id = id_chosen;
  710. PoolPosType step_to_dmi_fr = match_dmi(step_to,
  711. spl_id_ + fixed_hzs_, cand_len);
  712. assert(step_to_dmi_fr != static_cast<PoolPosType>(-1));
  713. extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1,
  714. step_to_dmi_fr, step_to);
  715. matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos;
  716. mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos +
  717. matrix_[step_to].mtrx_nd_num;
  718. if (id_chosen == lma_id_[fixed_lmas_])
  719. fixed_lmas_no1_[fixed_lmas_] = 1;
  720. else
  721. fixed_lmas_no1_[fixed_lmas_] = 0;
  722. lma_id_[fixed_lmas_] = id_chosen;
  723. lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len;
  724. fixed_lmas_++;
  725. fixed_hzs_ = fixed_hzs_ + cand_len;
  726. while (step_to != pys_decoded_len) {
  727. bool b = add_char(pys_[step_to]);
  728. assert(b);
  729. step_to++;
  730. }
  731. if (fixed_hzs_ < spl_id_num_) {
  732. prepare_candidates();
  733. } else {
  734. lpi_total_ = 0;
  735. if (NULL != user_dict_) {
  736. try_add_cand0_to_userdict();
  737. }
  738. }
  739. return get_candidate_num();
  740. }
  741. unsigned MatrixSearch::cancel_last_choice() {
  742. if (!inited_ || 0 == pys_decoded_len_)
  743. return 0;
  744. unsigned step_start = 0;
  745. if (fixed_hzs_ > 0) {
  746. unsigned step_end = spl_start_[fixed_hzs_];
  747. MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed;
  748. assert(NULL != end_node);
  749. step_start = end_node->from->step;
  750. if (step_start > 0) {
  751. DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr;
  752. fixed_hzs_ -= dmi->dict_level;
  753. } else {
  754. fixed_hzs_ = 0;
  755. }
  756. reset_search(step_start, false, false, false);
  757. while (pys_[step_start] != '\0') {
  758. bool b = add_char(pys_[step_start]);
  759. assert(b);
  760. step_start++;
  761. }
  762. prepare_candidates();
  763. }
  764. return get_candidate_num();
  765. }
  766. unsigned MatrixSearch::get_fixedlen() {
  767. if (!inited_ || 0 == pys_decoded_len_)
  768. return 0;
  769. return fixed_hzs_;
  770. }
  771. bool MatrixSearch::prepare_add_char(char ch) {
  772. if (pys_decoded_len_ >= kMaxRowNum - 1 ||
  773. (!spl_parser_->is_valid_to_parse(ch) && ch != '\''))
  774. return false;
  775. if (dmi_pool_used_ >= kDmiPoolSize) return false;
  776. pys_[pys_decoded_len_] = ch;
  777. pys_decoded_len_++;
  778. MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_;
  779. mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_;
  780. mtrx_this_row->mtrx_nd_num = 0;
  781. mtrx_this_row->dmi_pos = dmi_pool_used_;
  782. mtrx_this_row->dmi_num = 0;
  783. mtrx_this_row->dmi_has_full_id = 0;
  784. return true;
  785. }
  786. bool MatrixSearch::is_split_at(uint16 pos) {
  787. return !spl_parser_->is_valid_to_parse(pys_[pos - 1]);
  788. }
  789. void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
  790. PoolPosType dmi_fr, uint16 spl_id,
  791. uint16 node_num, unsigned char dict_level,
  792. bool splid_end_split, unsigned char splstr_len,
  793. unsigned char all_full_id) {
  794. dmi->dict_handles[0] = handles[0];
  795. dmi->dict_handles[1] = handles[1];
  796. dmi->dmi_fr = dmi_fr;
  797. dmi->spl_id = spl_id;
  798. dmi->dict_level = dict_level;
  799. dmi->splid_end_split = splid_end_split ? 1 : 0;
  800. dmi->splstr_len = splstr_len;
  801. dmi->all_full_id = all_full_id;
  802. dmi->c_phrase = 0;
  803. }
  804. bool MatrixSearch::add_char(char ch) {
  805. if (!prepare_add_char(ch))
  806. return false;
  807. return add_char_qwerty();
  808. }
  809. bool MatrixSearch::add_char_qwerty() {
  810. matrix_[pys_decoded_len_].mtrx_nd_num = 0;
  811. bool spl_matched = false;
  812. uint16 longest_ext = 0;
  813. // Extend the search matrix, from the oldest unfixed row. ext_len means
  814. // extending length.
  815. for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) {
  816. if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_])
  817. continue;
  818. // Refer to the declaration of the variable dmi_has_full_id for the
  819. // explanation of this piece of code. In one word, it is used to prevent
  820. // from the unwise extending of "shoud ou" but allow the reasonable
  821. // extending of "heng ao", "lang a", etc.
  822. if (ext_len > 1 && 0 != longest_ext &&
  823. 0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) {
  824. if (xi_an_enabled_)
  825. continue;
  826. else
  827. break;
  828. }
  829. uint16 oldrow = pys_decoded_len_ - ext_len;
  830. // 0. If that row is before the last fixed step, ignore.
  831. if (spl_start_[fixed_hzs_] > oldrow)
  832. continue;
  833. // 1. Check if that old row has valid MatrixNode. If no, means that row is
  834. // not a boundary, either a word boundary or a spelling boundary.
  835. // If it is for extending composing phrase, it's OK to ignore the 0.
  836. if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_)
  837. continue;
  838. // 2. Get spelling id(s) for the last ext_len chars.
  839. uint16 spl_idx;
  840. bool is_pre = false;
  841. spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow,
  842. ext_len, &is_pre);
  843. if (is_pre)
  844. spl_matched = true;
  845. if (0 == spl_idx)
  846. continue;
  847. bool splid_end_split = is_split_at(oldrow + ext_len);
  848. // 3. Extend the DMI nodes of that old row
  849. // + 1 is to extend an extra node from the root
  850. for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos;
  851. dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1;
  852. dmi_pos++) {
  853. DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
  854. if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) {
  855. dmi = NULL; // The last one, NULL means extending from the root.
  856. } else {
  857. // If the dmi is covered by the fixed arrange, ignore it.
  858. if (fixed_hzs_ > 0 &&
  859. pys_decoded_len_ - ext_len - dmi->splstr_len <
  860. spl_start_[fixed_hzs_]) {
  861. continue;
  862. }
  863. // If it is not in mode for composing phrase, and the source DMI node
  864. // is marked for composing phrase, ignore this node.
  865. if (dmi->c_phrase != 0 && !dmi_c_phrase_) {
  866. continue;
  867. }
  868. }
  869. // For example, if "gao" is extended, "g ao" is not allowed.
  870. // or "zh" has been passed, "z h" is not allowed.
  871. // Both word and word-connection will be prevented.
  872. if (longest_ext > ext_len) {
  873. if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) {
  874. continue;
  875. }
  876. // "z h" is not allowed.
  877. if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) {
  878. continue;
  879. }
  880. }
  881. dep_->splids_extended = 0;
  882. if (NULL != dmi) {
  883. uint16 prev_ids_num = dmi->dict_level;
  884. if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) ||
  885. (dmi_c_phrase_ && prev_ids_num >= kMaxRowNum)) {
  886. continue;
  887. }
  888. DictMatchInfo *d = dmi;
  889. while (d) {
  890. dep_->splids[--prev_ids_num] = d->spl_id;
  891. if ((PoolPosType)-1 == d->dmi_fr)
  892. break;
  893. d = dmi_pool_ + d->dmi_fr;
  894. }
  895. assert(0 == prev_ids_num);
  896. dep_->splids_extended = dmi->dict_level;
  897. }
  898. dep_->splids[dep_->splids_extended] = spl_idx;
  899. dep_->ext_len = ext_len;
  900. dep_->splid_end_split = splid_end_split;
  901. dep_->id_num = 1;
  902. dep_->id_start = spl_idx;
  903. if (spl_trie_->is_half_id(spl_idx)) {
  904. // Get the full id list
  905. dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start));
  906. assert(dep_->id_num > 0);
  907. }
  908. uint16 new_dmi_num;
  909. new_dmi_num = extend_dmi(dep_, dmi);
  910. if (new_dmi_num > 0) {
  911. if (dmi_c_phrase_) {
  912. dmi_pool_[dmi_pool_used_].c_phrase = 1;
  913. }
  914. matrix_[pys_decoded_len_].dmi_num += new_dmi_num;
  915. dmi_pool_used_ += new_dmi_num;
  916. if (!spl_trie_->is_half_id(spl_idx))
  917. matrix_[pys_decoded_len_].dmi_has_full_id = 1;
  918. }
  919. // If get candiate lemmas, try to extend the path
  920. if (lpi_total_ > 0) {
  921. uint16 fr_row;
  922. if (NULL == dmi) {
  923. fr_row = oldrow;
  924. } else {
  925. assert(oldrow >= dmi->splstr_len);
  926. fr_row = oldrow - dmi->splstr_len;
  927. }
  928. for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos;
  929. mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos +
  930. matrix_[fr_row].mtrx_nd_num;
  931. mtrx_nd_pos++) {
  932. MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos;
  933. extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_,
  934. dmi_pool_used_ - new_dmi_num, pys_decoded_len_);
  935. if (longest_ext == 0)
  936. longest_ext = ext_len;
  937. }
  938. }
  939. } // for dmi_pos
  940. } // for ext_len
  941. mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num;
  942. if (dmi_c_phrase_)
  943. return true;
  944. return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched);
  945. }
  946. void MatrixSearch::prepare_candidates() {
  947. // Get candiates from the first un-fixed step.
  948. uint16 lma_size_max = kMaxLemmaSize;
  949. if (lma_size_max > spl_id_num_ - fixed_hzs_)
  950. lma_size_max = spl_id_num_ - fixed_hzs_;
  951. uint16 lma_size = lma_size_max;
  952. // If the full sentense candidate's unfixed part may be the same with a normal
  953. // lemma. Remove the lemma candidate in this case.
  954. char16 fullsent[kMaxLemmaSize + 1];
  955. char16 *pfullsent = NULL;
  956. uint16 sent_len;
  957. pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true);
  958. // If the unfixed part contains more than one ids, it is not necessary to
  959. // check whether a lemma's string is the same to the unfixed part of the full
  960. // sentence candidate, so, set it to NULL;
  961. if (sent_len > kMaxLemmaSize)
  962. pfullsent = NULL;
  963. lpi_total_ = 0;
  964. unsigned lpi_num_full_match = 0; // Number of items which are fully-matched.
  965. while (lma_size > 0) {
  966. unsigned lma_num;
  967. lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size,
  968. lpi_items_ + lpi_total_,
  969. unsigned(kMaxLmaPsbItems - lpi_total_),
  970. pfullsent, lma_size == lma_size_max);
  971. if (lma_num > 0) {
  972. lpi_total_ += lma_num;
  973. // For next lemma candidates which are not the longest, it is not
  974. // necessary to compare with the full sentence candiate.
  975. pfullsent = NULL;
  976. }
  977. if (lma_size == lma_size_max) {
  978. lpi_num_full_match = lpi_total_;
  979. }
  980. lma_size--;
  981. }
  982. // Sort those partially-matched items by their unified scores.
  983. myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match,
  984. sizeof(LmaPsbItem), cmp_lpi_with_unified_psb);
  985. if (kPrintDebug0) {
  986. printf("-----Prepare candidates, score:\n");
  987. for (unsigned a = 0; a < lpi_total_; a++) {
  988. printf("[%03d]%d ", a, lpi_items_[a].psb);
  989. if ((a + 1) % 6 == 0) printf("\n");
  990. }
  991. printf("\n");
  992. }
  993. if (kPrintDebug0) {
  994. printf("--- lpi_total_ = %d\n", lpi_total_);
  995. }
  996. }
  997. const char* MatrixSearch::get_pystr(unsigned *decoded_len) {
  998. if (!inited_ || NULL == decoded_len)
  999. return NULL;
  1000. *decoded_len = pys_decoded_len_;
  1001. return pys_;
  1002. }
  1003. void MatrixSearch::merge_fixed_lmas(unsigned del_spl_pos) {
  1004. if (fixed_lmas_ == 0)
  1005. return;
  1006. // Update spelling segmentation information first.
  1007. spl_id_num_ -= 1;
  1008. uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos];
  1009. for (unsigned pos = del_spl_pos; pos <= spl_id_num_; pos++) {
  1010. spl_start_[pos] = spl_start_[pos + 1] - del_py_len;
  1011. if (pos == spl_id_num_)
  1012. break;
  1013. spl_id_[pos] = spl_id_[pos + 1];
  1014. }
  1015. // Begin to merge.
  1016. uint16 phrase_len = 0;
  1017. // Update the spelling ids to the composing phrase.
  1018. // We need to convert these ids into full id in the future.
  1019. memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16));
  1020. memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16));
  1021. // If composing phrase has not been created, first merge all fixed
  1022. // lemmas into a composing phrase without deletion.
  1023. if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) {
  1024. uint16 bp = 1; // Begin position of real fixed lemmas.
  1025. // There is no existing composing phrase.
  1026. if (kLemmaIdComposing != lma_id_[0]) {
  1027. c_phrase_.sublma_num = 0;
  1028. bp = 0;
  1029. }
  1030. uint16 sub_num = c_phrase_.sublma_num;
  1031. for (uint16 pos = bp; pos <= fixed_lmas_; pos++) {
  1032. c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos];
  1033. if (lma_start_[pos] > del_spl_pos) {
  1034. c_phrase_.sublma_start[sub_num + pos - bp] -= 1;
  1035. }
  1036. if (pos == fixed_lmas_)
  1037. break;
  1038. uint16 lma_len;
  1039. char16 *lma_str = c_phrase_.chn_str +
  1040. c_phrase_.sublma_start[sub_num] + phrase_len;
  1041. lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len);
  1042. assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]);
  1043. phrase_len += lma_len;
  1044. }
  1045. assert(phrase_len == lma_start_[fixed_lmas_]);
  1046. c_phrase_.length = phrase_len; // will be deleted by 1
  1047. c_phrase_.sublma_num += fixed_lmas_ - bp;
  1048. } else {
  1049. for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) {
  1050. if (c_phrase_.sublma_start[pos] > del_spl_pos) {
  1051. c_phrase_.sublma_start[pos] -= 1;
  1052. }
  1053. }
  1054. phrase_len = c_phrase_.length;
  1055. }
  1056. assert(phrase_len > 0);
  1057. if (1 == phrase_len) {
  1058. // After the only one is deleted, nothing will be left.
  1059. fixed_lmas_ = 0;
  1060. return;
  1061. }
  1062. // Delete the Chinese character in the merged phrase.
  1063. // The corresponding elements in spl_ids and spl_start of the
  1064. // phrase have been deleted.
  1065. char16 *chn_str = c_phrase_.chn_str + del_spl_pos;
  1066. for (uint16 pos = 0;
  1067. pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos;
  1068. pos++) {
  1069. chn_str[pos] = chn_str[pos + 1];
  1070. }
  1071. c_phrase_.length -= 1;
  1072. // If the deleted spelling id is in a sub lemma which contains more than
  1073. // one id, del_a_sub will be false; but if the deleted id is in a sub lemma
  1074. // which only contains 1 id, the whole sub lemma needs to be deleted, so
  1075. // del_a_sub will be true.
  1076. bool del_a_sub = false;
  1077. for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) {
  1078. if (c_phrase_.sublma_start[pos - 1] ==
  1079. c_phrase_.sublma_start[pos]) {
  1080. del_a_sub = true;
  1081. }
  1082. if (del_a_sub) {
  1083. c_phrase_.sublma_start[pos - 1] =
  1084. c_phrase_.sublma_start[pos];
  1085. }
  1086. }
  1087. if (del_a_sub)
  1088. c_phrase_.sublma_num -= 1;
  1089. return;
  1090. }
  1091. void MatrixSearch::get_spl_start_id() {
  1092. lma_id_num_ = 0;
  1093. lma_start_[0] = 0;
  1094. spl_id_num_ = 0;
  1095. spl_start_[0] = 0;
  1096. if (!inited_ || 0 == pys_decoded_len_ ||
  1097. 0 == matrix_[pys_decoded_len_].mtrx_nd_num)
  1098. return;
  1099. // Calculate number of lemmas and spellings
  1100. // Only scan those part which is not fixed.
  1101. lma_id_num_ = fixed_lmas_;
  1102. spl_id_num_ = fixed_hzs_;
  1103. MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
  1104. while (mtrx_nd != mtrx_nd_pool_) {
  1105. if (fixed_hzs_ > 0) {
  1106. if (mtrx_nd->step <= spl_start_[fixed_hzs_])
  1107. break;
  1108. }
  1109. // Update the spelling segamentation information
  1110. unsigned char word_splstr_len = 0;
  1111. PoolPosType dmi_fr = mtrx_nd->dmi_fr;
  1112. if ((PoolPosType)-1 != dmi_fr)
  1113. word_splstr_len = dmi_pool_[dmi_fr].splstr_len;
  1114. while ((PoolPosType)-1 != dmi_fr) {
  1115. spl_start_[spl_id_num_ + 1] = mtrx_nd->step -
  1116. (word_splstr_len - dmi_pool_[dmi_fr].splstr_len);
  1117. spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id;
  1118. spl_id_num_++;
  1119. dmi_fr = dmi_pool_[dmi_fr].dmi_fr;
  1120. }
  1121. // Update the lemma segmentation information
  1122. lma_start_[lma_id_num_ + 1] = spl_id_num_;
  1123. lma_id_[lma_id_num_] = mtrx_nd->id;
  1124. lma_id_num_++;
  1125. mtrx_nd = mtrx_nd->from;
  1126. }
  1127. // Reverse the result of spelling info
  1128. for (unsigned pos = fixed_hzs_;
  1129. pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) {
  1130. if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) {
  1131. spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
  1132. spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1];
  1133. spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
  1134. spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1];
  1135. spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos];
  1136. spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1];
  1137. }
  1138. }
  1139. // Reverse the result of lemma info
  1140. for (unsigned pos = fixed_lmas_;
  1141. pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) {
  1142. assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos);
  1143. if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) {
  1144. lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
  1145. lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1];
  1146. lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
  1147. lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
  1148. lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos];
  1149. lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
  1150. }
  1151. }
  1152. for (unsigned pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) {
  1153. if (pos < lma_id_num_)
  1154. lma_start_[pos] = lma_start_[pos - 1] +
  1155. (lma_start_[pos] - lma_start_[pos + 1]);
  1156. else
  1157. lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] -
  1158. lma_start_[fixed_lmas_];
  1159. }
  1160. // Find the last fixed position
  1161. fixed_hzs_ = 0;
  1162. for (unsigned pos = spl_id_num_; pos > 0; pos--) {
  1163. if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) {
  1164. fixed_hzs_ = pos;
  1165. break;
  1166. }
  1167. }
  1168. return;
  1169. }
  1170. unsigned MatrixSearch::get_spl_start(const uint16 *&spl_start) {
  1171. get_spl_start_id();
  1172. spl_start = spl_start_;
  1173. return spl_id_num_;
  1174. }
  1175. unsigned MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) {
  1176. if (dmi_pool_used_ >= kDmiPoolSize) return 0;
  1177. if (dmi_c_phrase_)
  1178. return extend_dmi_c(dep, dmi_s);
  1179. LpiCache& lpi_cache = LpiCache::get_instance();
  1180. uint16 splid = dep->splids[dep->splids_extended];
  1181. bool cached = false;
  1182. if (0 == dep->splids_extended)
  1183. cached = lpi_cache.is_cached(splid);
  1184. // 1. If this is a half Id, get its corresponding full starting Id and
  1185. // number of full Id.
  1186. unsigned ret_val = 0;
  1187. PoolPosType mtrx_dmi_fr = (PoolPosType)-1; // From which dmi node
  1188. lpi_total_ = 0;
  1189. MileStoneHandle from_h[3];
  1190. from_h[0] = 0;
  1191. from_h[1] = 0;
  1192. if (0 != dep->splids_extended) {
  1193. from_h[0] = dmi_s->dict_handles[0];
  1194. from_h[1] = dmi_s->dict_handles[1];
  1195. }
  1196. // 2. Begin exgtending in the system dictionary
  1197. unsigned lpi_num = 0;
  1198. MileStoneHandle handles[2];
  1199. handles[0] = handles[1] = 0;
  1200. if (from_h[0] > 0 || NULL == dmi_s) {
  1201. handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_,
  1202. kMaxLmaPsbItems, &lpi_num);
  1203. }
  1204. if (handles[0] > 0)
  1205. lpi_total_ = lpi_num;
  1206. if (NULL == dmi_s) { // from root
  1207. assert(0 != handles[0]);
  1208. mtrx_dmi_fr = dmi_pool_used_;
  1209. }
  1210. // 3. Begin extending in the user dictionary
  1211. if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) {
  1212. handles[1] = user_dict_->extend_dict(from_h[1], dep,
  1213. lpi_items_ + lpi_total_,
  1214. kMaxLmaPsbItems - lpi_total_,
  1215. &lpi_num);
  1216. if (handles[1] > 0) {
  1217. if (kPrintDebug0) {
  1218. for (unsigned t = 0; t < lpi_num; t++) {
  1219. printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id,
  1220. lpi_items_[lpi_total_ + t].psb);
  1221. }
  1222. }
  1223. lpi_total_ += lpi_num;
  1224. }
  1225. }
  1226. if (0 != handles[0] || 0 != handles[1]) {
  1227. if (dmi_pool_used_ >= kDmiPoolSize) return 0;
  1228. DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
  1229. if (NULL == dmi_s) {
  1230. fill_dmi(dmi_add, handles,
  1231. (PoolPosType)-1, splid,
  1232. 1, 1, dep->splid_end_split, dep->ext_len,
  1233. spl_trie_->is_half_id(splid) ? 0 : 1);
  1234. } else {
  1235. fill_dmi(dmi_add, handles,
  1236. dmi_s - dmi_pool_, splid, 1,
  1237. dmi_s->dict_level + 1, dep->splid_end_split,
  1238. dmi_s->splstr_len + dep->ext_len,
  1239. spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
  1240. }
  1241. ret_val = 1;
  1242. }
  1243. if (!cached) {
  1244. if (0 == lpi_total_)
  1245. return ret_val;
  1246. if (kPrintDebug0) {
  1247. printf("--- lpi_total_ = %d\n", lpi_total_);
  1248. }
  1249. myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb);
  1250. if (NULL == dmi_s && spl_trie_->is_half_id(splid))
  1251. lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_);
  1252. } else {
  1253. assert(spl_trie_->is_half_id(splid));
  1254. lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems);
  1255. }
  1256. return ret_val;
  1257. }
  1258. unsigned MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) {
  1259. lpi_total_ = 0;
  1260. uint16 pos = dep->splids_extended;
  1261. assert(dmi_c_phrase_);
  1262. if (pos >= c_phrase_.length)
  1263. return 0;
  1264. uint16 splid = dep->splids[pos];
  1265. if (splid == c_phrase_.spl_ids[pos]) {
  1266. DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
  1267. MileStoneHandle handles[2]; // Actually never used.
  1268. if (NULL == dmi_s)
  1269. fill_dmi(dmi_add, handles,
  1270. (PoolPosType)-1, splid,
  1271. 1, 1, dep->splid_end_split, dep->ext_len,
  1272. spl_trie_->is_half_id(splid) ? 0 : 1);
  1273. else
  1274. fill_dmi(dmi_add, handles,
  1275. dmi_s - dmi_pool_, splid, 1,
  1276. dmi_s->dict_level + 1, dep->splid_end_split,
  1277. dmi_s->splstr_len + dep->ext_len,
  1278. spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
  1279. if (pos == c_phrase_.length - 1) {
  1280. lpi_items_[0].id = kLemmaIdComposing;
  1281. lpi_items_[0].psb = 0; // 0 is bigger than normal lemma score.
  1282. lpi_total_ = 1;
  1283. }
  1284. return 1;
  1285. }
  1286. return 0;
  1287. }
  1288. unsigned MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
  1289. unsigned lpi_num, PoolPosType dmi_fr,
  1290. unsigned res_row) {
  1291. assert(NULL != mtrx_nd);
  1292. matrix_[res_row].mtrx_nd_fixed = NULL;
  1293. if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow)
  1294. return 0;
  1295. if (0 == mtrx_nd->step) {
  1296. // Because the list is sorted, if the source step is 0, it is only
  1297. // necessary to pick up the first kMaxNodeARow items.
  1298. if (lpi_num > kMaxNodeARow)
  1299. lpi_num = kMaxNodeARow;
  1300. }
  1301. MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos;
  1302. for (unsigned pos = 0; pos < lpi_num; pos++) {
  1303. float score = mtrx_nd->score + lpi_items[pos].psb;
  1304. if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score)
  1305. break;
  1306. // Try to add a new node
  1307. unsigned mtrx_nd_num = matrix_[res_row].mtrx_nd_num;
  1308. MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num;
  1309. bool replace = false;
  1310. // Find its position
  1311. while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) {
  1312. if (static_cast<unsigned>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow)
  1313. *mtrx_nd_res = *(mtrx_nd_res - 1);
  1314. mtrx_nd_res--;
  1315. replace = true;
  1316. }
  1317. if (replace || (mtrx_nd_num < kMaxNodeARow &&
  1318. matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) {
  1319. mtrx_nd_res->id = lpi_items[pos].id;
  1320. mtrx_nd_res->score = score;
  1321. mtrx_nd_res->from = mtrx_nd;
  1322. mtrx_nd_res->dmi_fr = dmi_fr;
  1323. mtrx_nd_res->step = res_row;
  1324. if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow)
  1325. matrix_[res_row].mtrx_nd_num++;
  1326. }
  1327. }
  1328. return matrix_[res_row].mtrx_nd_num;
  1329. }
  1330. PoolPosType MatrixSearch::match_dmi(unsigned step_to, uint16 spl_ids[],
  1331. uint16 spl_id_num) {
  1332. if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) {
  1333. return static_cast<PoolPosType>(-1);
  1334. }
  1335. for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) {
  1336. DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos;
  1337. if (dmi->dict_level != spl_id_num)
  1338. continue;
  1339. bool matched = true;
  1340. for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) {
  1341. if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) {
  1342. matched = false;
  1343. break;
  1344. }
  1345. dmi = dmi_pool_ + dmi->dmi_fr;
  1346. }
  1347. if (matched) {
  1348. return matrix_[step_to].dmi_pos + dmi_pos;
  1349. }
  1350. }
  1351. return static_cast<PoolPosType>(-1);
  1352. }
  1353. char16* MatrixSearch::get_candidate0(char16 *cand_str, unsigned max_len,
  1354. uint16 *retstr_len,
  1355. bool only_unfixed) {
  1356. if (pys_decoded_len_ == 0 ||
  1357. matrix_[pys_decoded_len_].mtrx_nd_num == 0)
  1358. return NULL;
  1359. LemmaIdType idxs[kMaxRowNum];
  1360. unsigned id_num = 0;
  1361. MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
  1362. if (kPrintDebug0) {
  1363. printf("--- sentence score: %f\n", mtrx_nd->score);
  1364. }
  1365. if (kPrintDebug1) {
  1366. printf("==============Sentence DMI (reverse order) begin===========>>\n");
  1367. }
  1368. while (mtrx_nd != NULL) {
  1369. idxs[id_num] = mtrx_nd->id;
  1370. id_num++;
  1371. if (kPrintDebug1) {
  1372. printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n",
  1373. mtrx_nd->step, mtrx_nd->id, mtrx_nd->score);
  1374. debug_print_dmi(mtrx_nd->dmi_fr, 1);
  1375. }
  1376. mtrx_nd = mtrx_nd->from;
  1377. }
  1378. if (kPrintDebug1) {
  1379. printf("<<==============Sentence DMI (reverse order) end=============\n");
  1380. }
  1381. unsigned ret_pos = 0;
  1382. do {
  1383. id_num--;
  1384. if (0 == idxs[id_num])
  1385. continue;
  1386. char16 str[kMaxLemmaSize + 1];
  1387. uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1);
  1388. if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) ||
  1389. (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) {
  1390. if (!only_unfixed)
  1391. utf16_strncpy(cand_str + ret_pos, str, str_len);
  1392. else if (ret_pos >= fixed_hzs_)
  1393. utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len);
  1394. ret_pos += str_len;
  1395. } else {
  1396. return NULL;
  1397. }
  1398. } while (id_num != 0);
  1399. if (!only_unfixed) {
  1400. if (NULL != retstr_len)
  1401. *retstr_len = ret_pos;
  1402. cand_str[ret_pos] = (char16)'\0';
  1403. } else {
  1404. if (NULL != retstr_len)
  1405. *retstr_len = ret_pos - fixed_hzs_;
  1406. cand_str[ret_pos - fixed_hzs_] = (char16)'\0';
  1407. }
  1408. return cand_str;
  1409. }
  1410. unsigned MatrixSearch::get_lpis(const uint16* splid_str, unsigned splid_str_len,
  1411. LmaPsbItem* lma_buf, unsigned max_lma_buf,
  1412. const char16 *pfullsent, bool sort_by_psb) {
  1413. if (splid_str_len > kMaxLemmaSize)
  1414. return 0;
  1415. unsigned num1 = dict_trie_->get_lpis(splid_str, splid_str_len,
  1416. lma_buf, max_lma_buf);
  1417. unsigned num2 = 0;
  1418. if (NULL != user_dict_) {
  1419. num2 = user_dict_->get_lpis(splid_str, splid_str_len,
  1420. lma_buf + num1, max_lma_buf - num1);
  1421. }
  1422. unsigned num = num1 + num2;
  1423. if (0 == num)
  1424. return 0;
  1425. // Remove repeated items.
  1426. if (splid_str_len > 1) {
  1427. LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num);
  1428. unsigned lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) /
  1429. sizeof(LmaPsbStrItem);
  1430. assert(lpsi_num > num);
  1431. if (num > lpsi_num) num = lpsi_num;
  1432. lpsi_num = num;
  1433. for (unsigned pos = 0; pos < lpsi_num; pos++) {
  1434. lpsis[pos].lpi = lma_buf[pos];
  1435. get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1);
  1436. }
  1437. myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str);
  1438. unsigned remain_num = 0;
  1439. for (unsigned pos = 0; pos < lpsi_num; pos++) {
  1440. if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) {
  1441. if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) {
  1442. assert(remain_num > 0);
  1443. lma_buf[remain_num - 1] = lpsis[pos].lpi;
  1444. }
  1445. continue;
  1446. }
  1447. if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0)
  1448. continue;
  1449. lma_buf[remain_num] = lpsis[pos].lpi;
  1450. remain_num++;
  1451. }
  1452. // Update the result number
  1453. num = remain_num;
  1454. } else {
  1455. // For single character, some characters have more than one spelling, for
  1456. // example, "de" and "di" are all valid for a Chinese character, so when
  1457. // the user input "d", repeated items are generated.
  1458. // For single character lemmas, Hanzis will be gotten
  1459. for (unsigned pos = 0; pos < num; pos++) {
  1460. char16 hanzis[2];
  1461. get_lemma_str(lma_buf[pos].id, hanzis, 2);
  1462. lma_buf[pos].hanzi = hanzis[0];
  1463. }
  1464. myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi);
  1465. unsigned remain_num = 0;
  1466. for (unsigned pos = 0; pos < num; pos++) {
  1467. if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) {
  1468. if (NULL != pfullsent &&
  1469. static_cast<char16>(0) == pfullsent[1] &&
  1470. lma_buf[pos].hanzi == pfullsent[0])
  1471. continue;
  1472. if (lma_buf[pos].psb < lma_buf[pos - 1].psb) {
  1473. assert(remain_num > 0);
  1474. assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi);
  1475. lma_buf[remain_num - 1] = lma_buf[pos];
  1476. }
  1477. continue;
  1478. }
  1479. if (NULL != pfullsent &&
  1480. static_cast<char16>(0) == pfullsent[1] &&
  1481. lma_buf[pos].hanzi == pfullsent[0])
  1482. continue;
  1483. lma_buf[remain_num] = lma_buf[pos];
  1484. remain_num++;
  1485. }
  1486. num = remain_num;
  1487. }
  1488. if (sort_by_psb) {
  1489. myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb);
  1490. }
  1491. return num;
  1492. }
  1493. uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
  1494. uint16 str_max) {
  1495. uint16 str_len = 0;
  1496. if (is_system_lemma(id_lemma)) {
  1497. str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max);
  1498. } else if (is_user_lemma(id_lemma)) {
  1499. if (NULL != user_dict_) {
  1500. str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max);
  1501. } else {
  1502. str_len = 0;
  1503. str_buf[0] = static_cast<char16>('\0');
  1504. }
  1505. } else if (is_composing_lemma(id_lemma)) {
  1506. if (str_max <= 1)
  1507. return 0;
  1508. str_len = c_phrase_.sublma_start[c_phrase_.sublma_num];
  1509. if (str_len > str_max - 1)
  1510. str_len = str_max - 1;
  1511. utf16_strncpy(str_buf, c_phrase_.chn_str, str_len);
  1512. str_buf[str_len] = (char16)'\0';
  1513. return str_len;
  1514. }
  1515. return str_len;
  1516. }
  1517. uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
  1518. uint16 splids_max, bool arg_valid) {
  1519. uint16 splid_num = 0;
  1520. if (arg_valid) {
  1521. for (splid_num = 0; splid_num < splids_max; splid_num++) {
  1522. if (spl_trie_->is_half_id(splids[splid_num]))
  1523. break;
  1524. }
  1525. if (splid_num == splids_max)
  1526. return splid_num;
  1527. }
  1528. if (is_system_lemma(id_lemma)) {
  1529. splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max,
  1530. arg_valid);
  1531. } else if (is_user_lemma(id_lemma)) {
  1532. if (NULL != user_dict_) {
  1533. splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max,
  1534. arg_valid);
  1535. } else {
  1536. splid_num = 0;
  1537. }
  1538. } else if (is_composing_lemma(id_lemma)) {
  1539. if (c_phrase_.length > splids_max) {
  1540. return 0;
  1541. }
  1542. for (uint16 pos = 0; pos < c_phrase_.length; pos++) {
  1543. splids[pos] = c_phrase_.spl_ids[pos];
  1544. if (spl_trie_->is_half_id(splids[pos])) {
  1545. return 0;
  1546. }
  1547. }
  1548. }
  1549. return splid_num;
  1550. }
  1551. unsigned MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len,
  1552. char16 predict_buf[][kMaxPredictSize + 1],
  1553. unsigned buf_len) {
  1554. unsigned res_total = 0;
  1555. memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_);
  1556. // In order to shorten the comments, j-character candidates predicted by
  1557. // i-character prefix are called P(i,j). All candiates predicted by
  1558. // i-character prefix are called P(i,*)
  1559. // Step 1. Get P(kMaxPredictSize, *) and sort them, here
  1560. // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1)
  1561. for (unsigned len = fixed_len; len >0; len--) {
  1562. // How many blank items are available
  1563. unsigned this_max = npre_items_len_ - res_total;
  1564. unsigned res_this;
  1565. // If the history is longer than 1, and we can not get prediction from
  1566. // lemmas longer than 2, in this case, we will add lemmas with
  1567. // highest scores as the prediction result.
  1568. if (fixed_len > 1 && 1 == len && 0 == res_total) {
  1569. // Try to find if recent n (n>1) characters can be a valid lemma in system
  1570. // dictionary.
  1571. bool nearest_n_word = false;
  1572. for (unsigned nlen = 2; nlen <= fixed_len; nlen++) {
  1573. if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) {
  1574. nearest_n_word = true;
  1575. break;
  1576. }
  1577. }
  1578. res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0,
  1579. npre_items_ + res_total,
  1580. this_max, res_total);
  1581. res_total += res_this;
  1582. }
  1583. // How many blank items are available
  1584. this_max = npre_items_len_ - res_total;
  1585. res_this = 0;
  1586. if (!kOnlyUserDictPredict) {
  1587. res_this =
  1588. dict_trie_->predict(fixed_buf + fixed_len - len, len,
  1589. npre_items_ + res_total, this_max,
  1590. res_total);
  1591. }
  1592. if (NULL != user_dict_) {
  1593. res_this = res_this +
  1594. user_dict_->predict(fixed_buf + fixed_len - len, len,
  1595. npre_items_ + res_total + res_this,
  1596. this_max - res_this, res_total + res_this);
  1597. }
  1598. if (kPredictLimitGt1) {
  1599. myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem),
  1600. cmp_npre_by_score);
  1601. if (len > 3) {
  1602. if (res_this > kMaxPredictNumByGt3)
  1603. res_this = kMaxPredictNumByGt3;
  1604. } else if (3 == len) {
  1605. if (res_this > kMaxPredictNumBy3)
  1606. res_this = kMaxPredictNumBy3;
  1607. } else if (2 == len) {
  1608. if (res_this > kMaxPredictNumBy2)
  1609. res_this = kMaxPredictNumBy2;
  1610. }
  1611. }
  1612. res_total += res_this;
  1613. }
  1614. res_total = remove_duplicate_npre(npre_items_, res_total);
  1615. if (kPreferLongHistoryPredict) {
  1616. myqsort(npre_items_, res_total, sizeof(NPredictItem),
  1617. cmp_npre_by_hislen_score);
  1618. } else {
  1619. myqsort(npre_items_, res_total, sizeof(NPredictItem),
  1620. cmp_npre_by_score);
  1621. }
  1622. if (buf_len < res_total) {
  1623. res_total = buf_len;
  1624. }
  1625. if (kPrintDebug2) {
  1626. printf("/////////////////Predicted Items Begin////////////////////>>\n");
  1627. for (unsigned i = 0; i < res_total; i++) {
  1628. printf("---");
  1629. for (unsigned j = 0; j < kMaxPredictSize; j++) {
  1630. printf("%d ", npre_items_[i].pre_hzs[j]);
  1631. }
  1632. printf("\n");
  1633. }
  1634. printf("<<///////////////Predicted Items End////////////////////////\n");
  1635. }
  1636. for (unsigned i = 0; i < res_total; i++) {
  1637. utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs,
  1638. kMaxPredictSize);
  1639. predict_buf[i][kMaxPredictSize] = '\0';
  1640. }
  1641. return res_total;
  1642. }
  1643. unsigned MatrixSearch::get_predicts(const char16 fixed_buf[],
  1644. char16 predict_buf[][kMaxPredictSize + 1],
  1645. unsigned buf_len) {
  1646. unsigned fixed_len = utf16_strlen(fixed_buf);
  1647. if (0 ==fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len)
  1648. return 0;
  1649. return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len);
  1650. }
  1651. } // namespace ime_pinyin