/src/im/gpinyin/share/dicttrie.cpp

http://ftk.googlecode.com/ · C++ · 942 lines · 744 code · 144 blank · 54 comment · 257 complexity · 87d9022ba04ddb85965a0056082bee1d MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <assert.h>
  17. #include <stdio.h>
  18. #include <string.h>
  19. #include "../include/dicttrie.h"
  20. #include "../include/dictbuilder.h"
  21. #include "../include/lpicache.h"
  22. #include "../include/mystdlib.h"
  23. #include "../include/ngram.h"
  24. namespace ime_pinyin {
  25. DictTrie::DictTrie() {
  26. spl_trie_ = SpellingTrie::get_cpinstance();
  27. root_ = NULL;
  28. splid_le0_index_ = NULL;
  29. lma_node_num_le0_ = 0;
  30. nodes_ge1_ = NULL;
  31. lma_node_num_ge1_ = 0;
  32. lma_idx_buf_ = NULL;
  33. lma_idx_buf_len_ = 0;
  34. total_lma_num_ = 0;
  35. top_lmas_num_ = 0;
  36. dict_list_ = NULL;
  37. parsing_marks_ = NULL;
  38. mile_stones_ = NULL;
  39. reset_milestones(0, kFirstValidMileStoneHandle);
  40. }
  41. DictTrie::~DictTrie() {
  42. free_resource(true);
  43. }
  44. void DictTrie::free_resource(bool free_dict_list) {
  45. if (NULL != root_)
  46. free(root_);
  47. root_ = NULL;
  48. if (NULL != splid_le0_index_)
  49. free(splid_le0_index_);
  50. splid_le0_index_ = NULL;
  51. if (NULL != nodes_ge1_)
  52. free(nodes_ge1_);
  53. nodes_ge1_ = NULL;
  54. if (NULL != nodes_ge1_)
  55. free(nodes_ge1_);
  56. nodes_ge1_ = NULL;
  57. if (free_dict_list) {
  58. if (NULL != dict_list_) {
  59. delete dict_list_;
  60. }
  61. dict_list_ = NULL;
  62. }
  63. if (parsing_marks_)
  64. delete [] parsing_marks_;
  65. parsing_marks_ = NULL;
  66. if (mile_stones_)
  67. delete [] mile_stones_;
  68. mile_stones_ = NULL;
  69. reset_milestones(0, kFirstValidMileStoneHandle);
  70. }
  71. inline unsigned DictTrie::get_son_offset(const LmaNodeGE1 *node) {
  72. return ((unsigned)node->son_1st_off_l + ((unsigned)node->son_1st_off_h << 16));
  73. }
  74. inline unsigned DictTrie::get_homo_idx_buf_offset(const LmaNodeGE1 *node) {
  75. return ((unsigned)node->homo_idx_buf_off_l +
  76. ((unsigned)node->homo_idx_buf_off_h << 16));
  77. }
  78. inline LemmaIdType DictTrie::get_lemma_id(unsigned id_offset) {
  79. LemmaIdType id = 0;
  80. for (uint16 pos = kLemmaIdSize - 1; pos > 0; pos--)
  81. id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize + pos];
  82. id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize];
  83. return id;
  84. }
  85. #ifdef ___BUILD_MODEL___
  86. bool DictTrie::build_dict(const char* fn_raw, const char* fn_validhzs) {
  87. DictBuilder* dict_builder = new DictBuilder();
  88. free_resource(true);
  89. return dict_builder->build_dict(fn_raw, fn_validhzs, this);
  90. }
  91. bool DictTrie::save_dict(FILE *fp) {
  92. if (NULL == fp)
  93. return false;
  94. if (fwrite(&lma_node_num_le0_, sizeof(unsigned), 1, fp) != 1)
  95. return false;
  96. if (fwrite(&lma_node_num_ge1_, sizeof(unsigned), 1, fp) != 1)
  97. return false;
  98. if (fwrite(&lma_idx_buf_len_, sizeof(unsigned), 1, fp) != 1)
  99. return false;
  100. if (fwrite(&top_lmas_num_, sizeof(unsigned), 1, fp) != 1)
  101. return false;
  102. if (fwrite(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp)
  103. != lma_node_num_le0_)
  104. return false;
  105. if (fwrite(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp)
  106. != lma_node_num_ge1_)
  107. return false;
  108. if (fwrite(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) !=
  109. lma_idx_buf_len_)
  110. return false;
  111. return true;
  112. }
  113. bool DictTrie::save_dict(const char *filename) {
  114. if (NULL == filename)
  115. return false;
  116. if (NULL == root_ || NULL == dict_list_)
  117. return false;
  118. SpellingTrie &spl_trie = SpellingTrie::get_instance();
  119. NGram &ngram = NGram::get_instance();
  120. FILE *fp = fopen(filename, "wb");
  121. if (NULL == fp)
  122. return false;
  123. if (!spl_trie.save_spl_trie(fp) || !dict_list_->save_list(fp) ||
  124. !save_dict(fp) || !ngram.save_ngram(fp)) {
  125. fclose(fp);
  126. return false;
  127. }
  128. fclose(fp);
  129. return true;
  130. }
  131. #endif // ___BUILD_MODEL___
  132. bool DictTrie::load_dict(FILE *fp) {
  133. if (NULL == fp)
  134. return false;
  135. if (fread(&lma_node_num_le0_, sizeof(unsigned), 1, fp) != 1)
  136. return false;
  137. if (fread(&lma_node_num_ge1_, sizeof(unsigned), 1, fp) != 1)
  138. return false;
  139. if (fread(&lma_idx_buf_len_, sizeof(unsigned), 1, fp) != 1)
  140. return false;
  141. if (fread(&top_lmas_num_, sizeof(unsigned), 1, fp) != 1 ||
  142. top_lmas_num_ >= lma_idx_buf_len_)
  143. return false;
  144. free_resource(false);
  145. root_ = static_cast<LmaNodeLE0*>
  146. (malloc(lma_node_num_le0_ * sizeof(LmaNodeLE0)));
  147. nodes_ge1_ = static_cast<LmaNodeGE1*>
  148. (malloc(lma_node_num_ge1_ * sizeof(LmaNodeGE1)));
  149. lma_idx_buf_ = (unsigned char*)malloc(lma_idx_buf_len_);
  150. total_lma_num_ = lma_idx_buf_len_ / kLemmaIdSize;
  151. unsigned buf_size = SpellingTrie::get_instance().get_spelling_num() + 1;
  152. assert(lma_node_num_le0_ <= buf_size);
  153. splid_le0_index_ = static_cast<uint16*>(malloc(buf_size * sizeof(uint16)));
  154. // Init the space for parsing.
  155. parsing_marks_ = new ParsingMark[kMaxParsingMark];
  156. mile_stones_ = new MileStone[kMaxMileStone];
  157. reset_milestones(0, kFirstValidMileStoneHandle);
  158. if (NULL == root_ || NULL == nodes_ge1_ || NULL == lma_idx_buf_ ||
  159. NULL == splid_le0_index_ || NULL == parsing_marks_ ||
  160. NULL == mile_stones_) {
  161. free_resource(false);
  162. return false;
  163. }
  164. if (fread(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp)
  165. != lma_node_num_le0_)
  166. return false;
  167. if (fread(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp)
  168. != lma_node_num_ge1_)
  169. return false;
  170. if (fread(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) !=
  171. lma_idx_buf_len_)
  172. return false;
  173. // The quick index for the first level sons
  174. uint16 last_splid = kFullSplIdStart;
  175. unsigned last_pos = 0;
  176. for (unsigned i = 1; i < lma_node_num_le0_; i++) {
  177. for (uint16 splid = last_splid; splid < root_[i].spl_idx; splid++)
  178. splid_le0_index_[splid - kFullSplIdStart] = last_pos;
  179. splid_le0_index_[root_[i].spl_idx - kFullSplIdStart] =
  180. static_cast<uint16>(i);
  181. last_splid = root_[i].spl_idx;
  182. last_pos = i;
  183. }
  184. for (uint16 splid = last_splid + 1;
  185. splid < buf_size + kFullSplIdStart; splid++) {
  186. assert(static_cast<unsigned>(splid - kFullSplIdStart) < buf_size);
  187. splid_le0_index_[splid - kFullSplIdStart] = last_pos + 1;
  188. }
  189. return true;
  190. }
  191. bool DictTrie::load_dict(const char *filename, LemmaIdType start_id,
  192. LemmaIdType end_id) {
  193. if (NULL == filename || end_id <= start_id)
  194. return false;
  195. FILE *fp = fopen(filename, "rb");
  196. if (NULL == fp)
  197. return false;
  198. free_resource(true);
  199. dict_list_ = new DictList();
  200. if (NULL == dict_list_) {
  201. fclose(fp);
  202. return false;
  203. }
  204. SpellingTrie &spl_trie = SpellingTrie::get_instance();
  205. NGram &ngram = NGram::get_instance();
  206. if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) ||
  207. !load_dict(fp) || !ngram.load_ngram(fp) ||
  208. total_lma_num_ > end_id - start_id + 1) {
  209. free_resource(true);
  210. fclose(fp);
  211. return false;
  212. }
  213. fclose(fp);
  214. return true;
  215. }
  216. bool DictTrie::load_dict_fd(int sys_fd, long start_offset,
  217. long length, LemmaIdType start_id,
  218. LemmaIdType end_id) {
  219. if (start_offset < 0 || length <= 0 || end_id <= start_id)
  220. return false;
  221. FILE *fp = fdopen(sys_fd, "rb");
  222. if (NULL == fp)
  223. return false;
  224. if (-1 == fseek(fp, start_offset, SEEK_SET)) {
  225. fclose(fp);
  226. return false;
  227. }
  228. free_resource(true);
  229. dict_list_ = new DictList();
  230. if (NULL == dict_list_) {
  231. fclose(fp);
  232. return false;
  233. }
  234. SpellingTrie &spl_trie = SpellingTrie::get_instance();
  235. NGram &ngram = NGram::get_instance();
  236. if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) ||
  237. !load_dict(fp) || !ngram.load_ngram(fp) ||
  238. ftell(fp) < start_offset + length ||
  239. total_lma_num_ > end_id - start_id + 1) {
  240. free_resource(true);
  241. fclose(fp);
  242. return false;
  243. }
  244. fclose(fp);
  245. return true;
  246. }
  247. unsigned DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], unsigned lpi_max,
  248. LmaNodeLE0 *node) {
  249. unsigned lpi_num = 0;
  250. NGram& ngram = NGram::get_instance();
  251. for (unsigned homo = 0; homo < (unsigned)node->num_of_homo; homo++) {
  252. lpi_items[lpi_num].id = get_lemma_id(node->homo_idx_buf_off +
  253. homo);
  254. lpi_items[lpi_num].lma_len = 1;
  255. lpi_items[lpi_num].psb =
  256. static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id));
  257. lpi_num++;
  258. if (lpi_num >= lpi_max)
  259. break;
  260. }
  261. return lpi_num;
  262. }
  263. unsigned DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], unsigned lpi_max,
  264. unsigned homo_buf_off, LmaNodeGE1 *node,
  265. uint16 lma_len) {
  266. unsigned lpi_num = 0;
  267. NGram& ngram = NGram::get_instance();
  268. for (unsigned homo = 0; homo < (unsigned)node->num_of_homo; homo++) {
  269. lpi_items[lpi_num].id = get_lemma_id(homo_buf_off + homo);
  270. lpi_items[lpi_num].lma_len = lma_len;
  271. lpi_items[lpi_num].psb =
  272. static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id));
  273. lpi_num++;
  274. if (lpi_num >= lpi_max)
  275. break;
  276. }
  277. return lpi_num;
  278. }
  279. void DictTrie::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
  280. if (0 == from_step) {
  281. parsing_marks_pos_ = 0;
  282. mile_stones_pos_ = kFirstValidMileStoneHandle;
  283. } else {
  284. if (from_handle > 0 && from_handle < mile_stones_pos_) {
  285. mile_stones_pos_ = from_handle;
  286. MileStone *mile_stone = mile_stones_ + from_handle;
  287. parsing_marks_pos_ = mile_stone->mark_start;
  288. }
  289. }
  290. }
  291. MileStoneHandle DictTrie::extend_dict(MileStoneHandle from_handle,
  292. const DictExtPara *dep,
  293. LmaPsbItem *lpi_items, unsigned lpi_max,
  294. unsigned *lpi_num) {
  295. if (NULL == dep)
  296. return 0;
  297. // from LmaNodeLE0 (root) to LmaNodeLE0
  298. if (0 == from_handle) {
  299. assert(0 == dep->splids_extended);
  300. return extend_dict0(from_handle, dep, lpi_items, lpi_max, lpi_num);
  301. }
  302. // from LmaNodeLE0 to LmaNodeGE1
  303. if (1 == dep->splids_extended)
  304. return extend_dict1(from_handle, dep, lpi_items, lpi_max, lpi_num);
  305. // From LmaNodeGE1 to LmaNodeGE1
  306. return extend_dict2(from_handle, dep, lpi_items, lpi_max, lpi_num);
  307. }
  308. MileStoneHandle DictTrie::extend_dict0(MileStoneHandle from_handle,
  309. const DictExtPara *dep,
  310. LmaPsbItem *lpi_items,
  311. unsigned lpi_max, unsigned *lpi_num) {
  312. assert(NULL != dep && 0 == from_handle);
  313. *lpi_num = 0;
  314. MileStoneHandle ret_handle = 0;
  315. uint16 splid = dep->splids[dep->splids_extended];
  316. uint16 id_start = dep->id_start;
  317. uint16 id_num = dep->id_num;
  318. LpiCache& lpi_cache = LpiCache::get_instance();
  319. bool cached = lpi_cache.is_cached(splid);
  320. // 2. Begin exgtending
  321. // 2.1 Get the LmaPsbItem list
  322. LmaNodeLE0 *node = root_;
  323. unsigned son_start = splid_le0_index_[id_start - kFullSplIdStart];
  324. unsigned son_end = splid_le0_index_[id_start + id_num - kFullSplIdStart];
  325. for (unsigned son_pos = son_start; son_pos < son_end; son_pos++) {
  326. assert(1 == node->son_1st_off);
  327. LmaNodeLE0 *son = root_ + son_pos;
  328. assert(son->spl_idx >= id_start && son->spl_idx < id_start + id_num);
  329. if (!cached && *lpi_num < lpi_max) {
  330. bool need_lpi = true;
  331. if (spl_trie_->is_half_id_yunmu(splid) && son_pos != son_start)
  332. need_lpi = false;
  333. if (need_lpi)
  334. *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
  335. lpi_max - *lpi_num, son);
  336. }
  337. // If necessary, fill in a new mile stone.
  338. if (son->spl_idx == id_start) {
  339. if (mile_stones_pos_ < kMaxMileStone &&
  340. parsing_marks_pos_ < kMaxParsingMark) {
  341. parsing_marks_[parsing_marks_pos_].node_offset = son_pos;
  342. parsing_marks_[parsing_marks_pos_].node_num = id_num;
  343. mile_stones_[mile_stones_pos_].mark_start = parsing_marks_pos_;
  344. mile_stones_[mile_stones_pos_].mark_num = 1;
  345. ret_handle = mile_stones_pos_;
  346. parsing_marks_pos_++;
  347. mile_stones_pos_++;
  348. }
  349. }
  350. if (son->spl_idx >= id_start + id_num -1)
  351. break;
  352. }
  353. // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
  354. // mile_stones_pos_);
  355. return ret_handle;
  356. }
  357. MileStoneHandle DictTrie::extend_dict1(MileStoneHandle from_handle,
  358. const DictExtPara *dep,
  359. LmaPsbItem *lpi_items,
  360. unsigned lpi_max, unsigned *lpi_num) {
  361. assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_);
  362. MileStoneHandle ret_handle = 0;
  363. // 1. If this is a half Id, get its corresponding full starting Id and
  364. // number of full Id.
  365. unsigned ret_val = 0;
  366. uint16 id_start = dep->id_start;
  367. uint16 id_num = dep->id_num;
  368. // 2. Begin extending.
  369. MileStone *mile_stone = mile_stones_ + from_handle;
  370. for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) {
  371. ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos];
  372. uint16 ext_num = p_mark.node_num;
  373. for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) {
  374. LmaNodeLE0 *node = root_ + p_mark.node_offset + ext_pos;
  375. unsigned found_start = 0;
  376. unsigned found_num = 0;
  377. for (unsigned son_pos = 0; son_pos < (unsigned)node->num_of_son; son_pos++) {
  378. assert(node->son_1st_off <= lma_node_num_ge1_);
  379. LmaNodeGE1 *son = nodes_ge1_ + node->son_1st_off + son_pos;
  380. if (son->spl_idx >= id_start
  381. && son->spl_idx < id_start + id_num) {
  382. if (*lpi_num < lpi_max) {
  383. unsigned homo_buf_off = get_homo_idx_buf_offset(son);
  384. *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
  385. lpi_max - *lpi_num, homo_buf_off, son,
  386. 2);
  387. }
  388. // If necessary, fill in the new DTMI
  389. if (0 == found_num) {
  390. found_start = son_pos;
  391. }
  392. found_num++;
  393. }
  394. if (son->spl_idx >= id_start + id_num - 1 || son_pos ==
  395. (unsigned)node->num_of_son - 1) {
  396. if (found_num > 0) {
  397. if (mile_stones_pos_ < kMaxMileStone &&
  398. parsing_marks_pos_ < kMaxParsingMark) {
  399. parsing_marks_[parsing_marks_pos_].node_offset =
  400. node->son_1st_off + found_start;
  401. parsing_marks_[parsing_marks_pos_].node_num = found_num;
  402. if (0 == ret_val)
  403. mile_stones_[mile_stones_pos_].mark_start =
  404. parsing_marks_pos_;
  405. parsing_marks_pos_++;
  406. }
  407. ret_val++;
  408. }
  409. break;
  410. } // for son_pos
  411. } // for ext_pos
  412. } // for h_pos
  413. }
  414. if (ret_val > 0) {
  415. mile_stones_[mile_stones_pos_].mark_num = ret_val;
  416. ret_handle = mile_stones_pos_;
  417. mile_stones_pos_++;
  418. ret_val = 1;
  419. }
  420. // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
  421. // mile_stones_pos_);
  422. return ret_handle;
  423. }
  424. MileStoneHandle DictTrie::extend_dict2(MileStoneHandle from_handle,
  425. const DictExtPara *dep,
  426. LmaPsbItem *lpi_items,
  427. unsigned lpi_max, unsigned *lpi_num) {
  428. assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_);
  429. MileStoneHandle ret_handle = 0;
  430. // 1. If this is a half Id, get its corresponding full starting Id and
  431. // number of full Id.
  432. unsigned ret_val = 0;
  433. uint16 id_start = dep->id_start;
  434. uint16 id_num = dep->id_num;
  435. // 2. Begin extending.
  436. MileStone *mile_stone = mile_stones_ + from_handle;
  437. for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) {
  438. ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos];
  439. uint16 ext_num = p_mark.node_num;
  440. for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) {
  441. LmaNodeGE1 *node = nodes_ge1_ + p_mark.node_offset + ext_pos;
  442. unsigned found_start = 0;
  443. unsigned found_num = 0;
  444. for (unsigned son_pos = 0; son_pos < (unsigned)node->num_of_son; son_pos++) {
  445. assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0);
  446. LmaNodeGE1 *son = nodes_ge1_ + get_son_offset(node) + son_pos;
  447. if (son->spl_idx >= id_start
  448. && son->spl_idx < id_start + id_num) {
  449. if (*lpi_num < lpi_max) {
  450. unsigned homo_buf_off = get_homo_idx_buf_offset(son);
  451. *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
  452. lpi_max - *lpi_num, homo_buf_off, son,
  453. dep->splids_extended + 1);
  454. }
  455. // If necessary, fill in the new DTMI
  456. if (0 == found_num) {
  457. found_start = son_pos;
  458. }
  459. found_num++;
  460. }
  461. if (son->spl_idx >= id_start + id_num - 1 || son_pos ==
  462. (unsigned)node->num_of_son - 1) {
  463. if (found_num > 0) {
  464. if (mile_stones_pos_ < kMaxMileStone &&
  465. parsing_marks_pos_ < kMaxParsingMark) {
  466. parsing_marks_[parsing_marks_pos_].node_offset =
  467. get_son_offset(node) + found_start;
  468. parsing_marks_[parsing_marks_pos_].node_num = found_num;
  469. if (0 == ret_val)
  470. mile_stones_[mile_stones_pos_].mark_start =
  471. parsing_marks_pos_;
  472. parsing_marks_pos_++;
  473. }
  474. ret_val++;
  475. }
  476. break;
  477. }
  478. } // for son_pos
  479. } // for ext_pos
  480. } // for h_pos
  481. if (ret_val > 0) {
  482. mile_stones_[mile_stones_pos_].mark_num = ret_val;
  483. ret_handle = mile_stones_pos_;
  484. mile_stones_pos_++;
  485. }
  486. // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
  487. // mile_stones_pos_);
  488. return ret_handle;
  489. }
  490. bool DictTrie::try_extend(const uint16 *splids, uint16 splid_num,
  491. LemmaIdType id_lemma) {
  492. if (0 == splid_num || NULL == splids)
  493. return false;
  494. void *node = root_ + splid_le0_index_[splids[0] - kFullSplIdStart];
  495. for (uint16 pos = 1; pos < splid_num; pos++) {
  496. if (1 == pos) {
  497. LmaNodeLE0 *node_le0 = reinterpret_cast<LmaNodeLE0*>(node);
  498. LmaNodeGE1 *node_son;
  499. uint16 son_pos;
  500. for (son_pos = 0; son_pos < static_cast<uint16>(node_le0->num_of_son);
  501. son_pos++) {
  502. assert(node_le0->son_1st_off <= lma_node_num_ge1_);
  503. node_son = nodes_ge1_ + node_le0->son_1st_off
  504. + son_pos;
  505. if (node_son->spl_idx == splids[pos])
  506. break;
  507. }
  508. if (son_pos < node_le0->num_of_son)
  509. node = reinterpret_cast<void*>(node_son);
  510. else
  511. return false;
  512. } else {
  513. LmaNodeGE1 *node_ge1 = reinterpret_cast<LmaNodeGE1*>(node);
  514. LmaNodeGE1 *node_son;
  515. uint16 son_pos;
  516. for (son_pos = 0; son_pos < static_cast<uint16>(node_ge1->num_of_son);
  517. son_pos++) {
  518. assert(node_ge1->son_1st_off_l > 0 || node_ge1->son_1st_off_h > 0);
  519. node_son = nodes_ge1_ + get_son_offset(node_ge1) + son_pos;
  520. if (node_son->spl_idx == splids[pos])
  521. break;
  522. }
  523. if (son_pos < node_ge1->num_of_son)
  524. node = reinterpret_cast<void*>(node_son);
  525. else
  526. return false;
  527. }
  528. }
  529. if (1 == splid_num) {
  530. LmaNodeLE0* node_le0 = reinterpret_cast<LmaNodeLE0*>(node);
  531. unsigned num_of_homo = (unsigned)node_le0->num_of_homo;
  532. for (unsigned homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
  533. LemmaIdType id_this = get_lemma_id(node_le0->homo_idx_buf_off + homo_pos);
  534. char16 str[2];
  535. get_lemma_str(id_this, str, 2);
  536. if (id_this == id_lemma)
  537. return true;
  538. }
  539. } else {
  540. LmaNodeGE1* node_ge1 = reinterpret_cast<LmaNodeGE1*>(node);
  541. unsigned num_of_homo = (unsigned)node_ge1->num_of_homo;
  542. for (unsigned homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
  543. unsigned node_homo_off = get_homo_idx_buf_offset(node_ge1);
  544. if (get_lemma_id(node_homo_off + homo_pos) == id_lemma)
  545. return true;
  546. }
  547. }
  548. return false;
  549. }
  550. unsigned DictTrie::get_lpis(const uint16* splid_str, uint16 splid_str_len,
  551. LmaPsbItem* lma_buf, unsigned max_lma_buf) {
  552. if (splid_str_len > kMaxLemmaSize)
  553. return 0;
  554. #define MAX_EXTENDBUF_LEN 200
  555. unsigned* node_buf1[MAX_EXTENDBUF_LEN]; // use unsigned for data alignment
  556. unsigned* node_buf2[MAX_EXTENDBUF_LEN];
  557. LmaNodeLE0** node_fr_le0 =
  558. reinterpret_cast<LmaNodeLE0**>(node_buf1); // Nodes from.
  559. LmaNodeLE0** node_to_le0 =
  560. reinterpret_cast<LmaNodeLE0**>(node_buf2); // Nodes to.
  561. LmaNodeGE1** node_fr_ge1 = NULL;
  562. LmaNodeGE1** node_to_ge1 = NULL;
  563. unsigned node_fr_num = 1;
  564. unsigned node_to_num = 0;
  565. node_fr_le0[0] = root_;
  566. if (NULL == node_fr_le0[0])
  567. return 0;
  568. unsigned spl_pos = 0;
  569. while (spl_pos < splid_str_len) {
  570. uint16 id_num = 1;
  571. uint16 id_start = splid_str[spl_pos];
  572. // If it is a half id
  573. if (spl_trie_->is_half_id(splid_str[spl_pos])) {
  574. id_num = spl_trie_->half_to_full(splid_str[spl_pos], &id_start);
  575. assert(id_num > 0);
  576. }
  577. // Extend the nodes
  578. if (0 == spl_pos) { // From LmaNodeLE0 (root) to LmaNodeLE0 nodes
  579. for (unsigned node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
  580. LmaNodeLE0 *node = node_fr_le0[node_fr_pos];
  581. assert(node == root_ && 1 == node_fr_num);
  582. unsigned son_start = splid_le0_index_[id_start - kFullSplIdStart];
  583. unsigned son_end =
  584. splid_le0_index_[id_start + id_num - kFullSplIdStart];
  585. for (unsigned son_pos = son_start; son_pos < son_end; son_pos++) {
  586. assert(1 == node->son_1st_off);
  587. LmaNodeLE0 *node_son = root_ + son_pos;
  588. assert(node_son->spl_idx >= id_start
  589. && node_son->spl_idx < id_start + id_num);
  590. if (node_to_num < MAX_EXTENDBUF_LEN) {
  591. node_to_le0[node_to_num] = node_son;
  592. node_to_num++;
  593. }
  594. // id_start + id_num - 1 is the last one, which has just been
  595. // recorded.
  596. if (node_son->spl_idx >= id_start + id_num - 1)
  597. break;
  598. }
  599. }
  600. spl_pos++;
  601. if (spl_pos >= splid_str_len || node_to_num == 0)
  602. break;
  603. // Prepare the nodes for next extending
  604. // next time, from LmaNodeLE0 to LmaNodeGE1
  605. LmaNodeLE0** node_tmp = node_fr_le0;
  606. node_fr_le0 = node_to_le0;
  607. node_to_le0 = NULL;
  608. node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_tmp);
  609. } else if (1 == spl_pos) { // From LmaNodeLE0 to LmaNodeGE1 nodes
  610. for (unsigned node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
  611. LmaNodeLE0 *node = node_fr_le0[node_fr_pos];
  612. for (unsigned son_pos = 0; son_pos < (unsigned)node->num_of_son;
  613. son_pos++) {
  614. assert(node->son_1st_off <= lma_node_num_ge1_);
  615. LmaNodeGE1 *node_son = nodes_ge1_ + node->son_1st_off
  616. + son_pos;
  617. if (node_son->spl_idx >= id_start
  618. && node_son->spl_idx < id_start + id_num) {
  619. if (node_to_num < MAX_EXTENDBUF_LEN) {
  620. node_to_ge1[node_to_num] = node_son;
  621. node_to_num++;
  622. }
  623. }
  624. // id_start + id_num - 1 is the last one, which has just been
  625. // recorded.
  626. if (node_son->spl_idx >= id_start + id_num - 1)
  627. break;
  628. }
  629. }
  630. spl_pos++;
  631. if (spl_pos >= splid_str_len || node_to_num == 0)
  632. break;
  633. // Prepare the nodes for next extending
  634. // next time, from LmaNodeGE1 to LmaNodeGE1
  635. node_fr_ge1 = node_to_ge1;
  636. node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_fr_le0);
  637. node_fr_le0 = NULL;
  638. node_to_le0 = NULL;
  639. } else { // From LmaNodeGE1 to LmaNodeGE1 nodes
  640. for (unsigned node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
  641. LmaNodeGE1 *node = node_fr_ge1[node_fr_pos];
  642. for (unsigned son_pos = 0; son_pos < (unsigned)node->num_of_son;
  643. son_pos++) {
  644. assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0);
  645. LmaNodeGE1 *node_son = nodes_ge1_
  646. + get_son_offset(node) + son_pos;
  647. if (node_son->spl_idx >= id_start
  648. && node_son->spl_idx < id_start + id_num) {
  649. if (node_to_num < MAX_EXTENDBUF_LEN) {
  650. node_to_ge1[node_to_num] = node_son;
  651. node_to_num++;
  652. }
  653. }
  654. // id_start + id_num - 1 is the last one, which has just been
  655. // recorded.
  656. if (node_son->spl_idx >= id_start + id_num - 1)
  657. break;
  658. }
  659. }
  660. spl_pos++;
  661. if (spl_pos >= splid_str_len || node_to_num == 0)
  662. break;
  663. // Prepare the nodes for next extending
  664. // next time, from LmaNodeGE1 to LmaNodeGE1
  665. LmaNodeGE1 **node_tmp = node_fr_ge1;
  666. node_fr_ge1 = node_to_ge1;
  667. node_to_ge1 = node_tmp;
  668. }
  669. // The number of node for next extending
  670. node_fr_num = node_to_num;
  671. node_to_num = 0;
  672. } // while
  673. if (0 == node_to_num)
  674. return 0;
  675. NGram &ngram = NGram::get_instance();
  676. unsigned lma_num = 0;
  677. // If the length is 1, and the splid is a one-char Yunmu like 'a', 'o', 'e',
  678. // only those candidates for the full matched one-char id will be returned.
  679. if (1 == splid_str_len && spl_trie_->is_half_id_yunmu(splid_str[0]))
  680. node_to_num = node_to_num > 0 ? 1 : 0;
  681. for (unsigned node_pos = 0; node_pos < node_to_num; node_pos++) {
  682. unsigned num_of_homo = 0;
  683. if (spl_pos <= 1) { // Get from LmaNodeLE0 nodes
  684. LmaNodeLE0* node_le0 = node_to_le0[node_pos];
  685. num_of_homo = (unsigned)node_le0->num_of_homo;
  686. for (unsigned homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
  687. unsigned ch_pos = lma_num + homo_pos;
  688. lma_buf[ch_pos].id =
  689. get_lemma_id(node_le0->homo_idx_buf_off + homo_pos);
  690. lma_buf[ch_pos].lma_len = 1;
  691. lma_buf[ch_pos].psb =
  692. static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id));
  693. if (lma_num + homo_pos >= max_lma_buf - 1)
  694. break;
  695. }
  696. } else { // Get from LmaNodeGE1 nodes
  697. LmaNodeGE1* node_ge1 = node_to_ge1[node_pos];
  698. num_of_homo = (unsigned)node_ge1->num_of_homo;
  699. for (unsigned homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
  700. unsigned ch_pos = lma_num + homo_pos;
  701. unsigned node_homo_off = get_homo_idx_buf_offset(node_ge1);
  702. lma_buf[ch_pos].id = get_lemma_id(node_homo_off + homo_pos);
  703. lma_buf[ch_pos].lma_len = splid_str_len;
  704. lma_buf[ch_pos].psb =
  705. static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id));
  706. if (lma_num + homo_pos >= max_lma_buf - 1)
  707. break;
  708. }
  709. }
  710. lma_num += num_of_homo;
  711. if (lma_num >= max_lma_buf) {
  712. lma_num = max_lma_buf;
  713. break;
  714. }
  715. }
  716. return lma_num;
  717. }
  718. uint16 DictTrie::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
  719. uint16 str_max) {
  720. return dict_list_->get_lemma_str(id_lemma, str_buf, str_max);
  721. }
  722. uint16 DictTrie::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
  723. uint16 splids_max, bool arg_valid) {
  724. char16 lma_str[kMaxLemmaSize + 1];
  725. uint16 lma_len = get_lemma_str(id_lemma, lma_str, kMaxLemmaSize + 1);
  726. assert((!arg_valid && splids_max >= lma_len) || lma_len == splids_max);
  727. uint16 spl_mtrx[kMaxLemmaSize * 5];
  728. uint16 spl_start[kMaxLemmaSize + 1];
  729. spl_start[0] = 0;
  730. uint16 try_num = 1;
  731. for (uint16 pos = 0; pos < lma_len; pos++) {
  732. uint16 cand_splids_this = 0;
  733. if (arg_valid && spl_trie_->is_full_id(splids[pos])) {
  734. spl_mtrx[spl_start[pos]] = splids[pos];
  735. cand_splids_this = 1;
  736. } else {
  737. cand_splids_this = dict_list_->get_splids_for_hanzi(lma_str[pos],
  738. arg_valid ? splids[pos] : 0, spl_mtrx + spl_start[pos],
  739. kMaxLemmaSize * 5 - spl_start[pos]);
  740. assert(cand_splids_this > 0);
  741. }
  742. spl_start[pos + 1] = spl_start[pos] + cand_splids_this;
  743. try_num *= cand_splids_this;
  744. }
  745. for (uint16 try_pos = 0; try_pos < try_num; try_pos++) {
  746. uint16 mod = 1;
  747. for (uint16 pos = 0; pos < lma_len; pos++) {
  748. uint16 radix = spl_start[pos + 1] - spl_start[pos];
  749. splids[pos] = spl_mtrx[ spl_start[pos] + try_pos / mod % radix];
  750. mod *= radix;
  751. }
  752. if (try_extend(splids, lma_len, id_lemma))
  753. return lma_len;
  754. }
  755. return 0;
  756. }
  757. void DictTrie::set_total_lemma_count_of_others(unsigned count) {
  758. NGram& ngram = NGram::get_instance();
  759. ngram.set_total_freq_none_sys(count);
  760. }
  761. void DictTrie::convert_to_hanzis(char16 *str, uint16 str_len) {
  762. return dict_list_->convert_to_hanzis(str, str_len);
  763. }
  764. void DictTrie::convert_to_scis_ids(char16 *str, uint16 str_len) {
  765. return dict_list_->convert_to_scis_ids(str, str_len);
  766. }
  767. LemmaIdType DictTrie::get_lemma_id(const char16 lemma_str[], uint16 lemma_len) {
  768. if (NULL == lemma_str || lemma_len > kMaxLemmaSize)
  769. return 0;
  770. return dict_list_->get_lemma_id(lemma_str, lemma_len);
  771. }
  772. unsigned DictTrie::predict_top_lmas(unsigned his_len, NPredictItem *npre_items,
  773. unsigned npre_max, unsigned b4_used) {
  774. NGram &ngram = NGram::get_instance();
  775. unsigned item_num = 0;
  776. unsigned top_lmas_id_offset = lma_idx_buf_len_ / kLemmaIdSize - top_lmas_num_;
  777. unsigned top_lmas_pos = 0;
  778. while (item_num < npre_max && top_lmas_pos < top_lmas_num_) {
  779. memset(npre_items + item_num, 0, sizeof(NPredictItem));
  780. LemmaIdType top_lma_id = get_lemma_id(top_lmas_id_offset + top_lmas_pos);
  781. top_lmas_pos += 1;
  782. if (dict_list_->get_lemma_str(top_lma_id,
  783. npre_items[item_num].pre_hzs,
  784. kMaxLemmaSize - 1) == 0) {
  785. continue;
  786. }
  787. npre_items[item_num].psb = ngram.get_uni_psb(top_lma_id);
  788. npre_items[item_num].his_len = his_len;
  789. item_num++;
  790. }
  791. return item_num;
  792. }
  793. unsigned DictTrie::predict(const char16 *last_hzs, uint16 hzs_len,
  794. NPredictItem *npre_items, unsigned npre_max,
  795. unsigned b4_used) {
  796. return dict_list_->predict(last_hzs, hzs_len, npre_items, npre_max, b4_used);
  797. }
  798. } // namespace ime_pinyin