/src/im/gpinyin/share/dictbuilder.cpp

http://ftk.googlecode.com/ · C++ · 1070 lines · 812 code · 192 blank · 66 comment · 241 complexity · 9bf27adae65f9e175c5d08e6faf00ab7 MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <assert.h>
  17. #include <stdio.h>
  18. #include <stdlib.h>
  19. #include <string.h>
  20. #include "../include/dictbuilder.h"
  21. #include "../include/dicttrie.h"
  22. #include "../include/mystdlib.h"
  23. #include "../include/ngram.h"
  24. #include "../include/searchutility.h"
  25. #include "../include/spellingtable.h"
  26. #include "../include/spellingtrie.h"
  27. #include "../include/splparser.h"
  28. #include "../include/utf16reader.h"
  29. namespace ime_pinyin {
  30. #ifdef ___BUILD_MODEL___
  31. static const unsigned kReadBufLen = 512;
  32. static const unsigned kSplTableHashLen = 2000;
  33. // Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by
  34. // frequencies.
  35. int cmp_scis_hz_splid_freq(const void* p1, const void* p2) {
  36. const SingleCharItem *s1, *s2;
  37. s1 = static_cast<const SingleCharItem*>(p1);
  38. s2 = static_cast<const SingleCharItem*>(p2);
  39. if (s1->hz < s2->hz)
  40. return -1;
  41. if (s1->hz > s2->hz)
  42. return 1;
  43. if (s1->splid.half_splid < s2->splid.half_splid)
  44. return -1;
  45. if (s1->splid.half_splid > s2->splid.half_splid)
  46. return 1;
  47. if (s1->splid.full_splid < s2->splid.full_splid)
  48. return -1;
  49. if (s1->splid.full_splid > s2->splid.full_splid)
  50. return 1;
  51. if (s1->freq > s2->freq)
  52. return -1;
  53. if (s1->freq < s2->freq)
  54. return 1;
  55. return 0;
  56. }
  57. int cmp_scis_hz_splid(const void* p1, const void* p2) {
  58. const SingleCharItem *s1, *s2;
  59. s1 = static_cast<const SingleCharItem*>(p1);
  60. s2 = static_cast<const SingleCharItem*>(p2);
  61. if (s1->hz < s2->hz)
  62. return -1;
  63. if (s1->hz > s2->hz)
  64. return 1;
  65. if (s1->splid.half_splid < s2->splid.half_splid)
  66. return -1;
  67. if (s1->splid.half_splid > s2->splid.half_splid)
  68. return 1;
  69. if (s1->splid.full_splid < s2->splid.full_splid)
  70. return -1;
  71. if (s1->splid.full_splid > s2->splid.full_splid)
  72. return 1;
  73. return 0;
  74. }
  75. int cmp_lemma_entry_hzs(const void* p1, const void* p2) {
  76. unsigned size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
  77. unsigned size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
  78. if (size1 < size2)
  79. return -1;
  80. else if (size1 > size2)
  81. return 1;
  82. return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
  83. ((const LemmaEntry*)p2)->hanzi_str);
  84. }
  85. int compare_char16(const void* p1, const void* p2) {
  86. if (*((const char16*)p1) < *((const char16*)p2))
  87. return -1;
  88. if (*((const char16*)p1) > *((const char16*)p2))
  89. return 1;
  90. return 0;
  91. }
  92. int compare_py(const void* p1, const void* p2) {
  93. int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
  94. ((const LemmaEntry*)p2)->spl_idx_arr);
  95. if (0 != ret)
  96. return ret;
  97. return static_cast<int>(((const LemmaEntry*)p2)->freq) -
  98. static_cast<int>(((const LemmaEntry*)p1)->freq);
  99. }
  100. // First hanzi, if the same, then Pinyin
  101. int cmp_lemma_entry_hzspys(const void* p1, const void* p2) {
  102. unsigned size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
  103. unsigned size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
  104. if (size1 < size2)
  105. return -1;
  106. else if (size1 > size2)
  107. return 1;
  108. int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
  109. ((const LemmaEntry*)p2)->hanzi_str);
  110. if (0 != ret)
  111. return ret;
  112. ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
  113. ((const LemmaEntry*)p2)->spl_idx_arr);
  114. return ret;
  115. }
  116. int compare_splid2(const void* p1, const void* p2) {
  117. int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
  118. ((const LemmaEntry*)p2)->spl_idx_arr);
  119. return ret;
  120. }
  121. DictBuilder::DictBuilder() {
  122. lemma_arr_ = NULL;
  123. lemma_num_ = 0;
  124. scis_ = NULL;
  125. scis_num_ = 0;
  126. lma_nodes_le0_ = NULL;
  127. lma_nodes_ge1_ = NULL;
  128. lma_nds_used_num_le0_ = 0;
  129. lma_nds_used_num_ge1_ = 0;
  130. homo_idx_buf_ = NULL;
  131. homo_idx_num_eq1_ = 0;
  132. homo_idx_num_gt1_ = 0;
  133. top_lmas_ = NULL;
  134. top_lmas_num_ = 0;
  135. spl_table_ = NULL;
  136. spl_parser_ = NULL;
  137. }
  138. DictBuilder::~DictBuilder() {
  139. free_resource();
  140. }
  141. bool DictBuilder::alloc_resource(unsigned lma_num) {
  142. if (0 == lma_num)
  143. return false;
  144. free_resource();
  145. lemma_num_ = lma_num;
  146. lemma_arr_ = new LemmaEntry[lemma_num_];
  147. top_lmas_num_ = 0;
  148. top_lmas_ = new LemmaEntry[kTopScoreLemmaNum];
  149. // New the scis_ buffer to the possible maximum size.
  150. scis_num_ = lemma_num_ * kMaxLemmaSize;
  151. scis_ = new SingleCharItem[scis_num_];
  152. // The root and first level nodes is less than kMaxSpellingNum + 1
  153. lma_nds_used_num_le0_ = 0;
  154. lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1];
  155. // Other nodes is less than lemma_num
  156. lma_nds_used_num_ge1_ = 0;
  157. lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_];
  158. homo_idx_buf_ = new LemmaIdType[lemma_num_];
  159. spl_table_ = new SpellingTable();
  160. spl_parser_ = new SpellingParser();
  161. if (NULL == lemma_arr_ || NULL == top_lmas_ ||
  162. NULL == scis_ || NULL == spl_table_ ||
  163. NULL == spl_parser_ || NULL == lma_nodes_le0_ ||
  164. NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) {
  165. free_resource();
  166. return false;
  167. }
  168. memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_);
  169. memset(scis_, 0, sizeof(SingleCharItem) * scis_num_);
  170. memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1));
  171. memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_);
  172. memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_);
  173. spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true);
  174. return true;
  175. }
  176. char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, unsigned *num) {
  177. if (NULL == fn_validhzs || NULL == num)
  178. return NULL;
  179. *num = 0;
  180. FILE *fp = fopen(fn_validhzs, "rb");
  181. if (NULL == fp)
  182. return NULL;
  183. char16 utf16header;
  184. if (fread(&utf16header, sizeof(char16), 1, fp) != 1 ||
  185. 0xfeff != utf16header) {
  186. fclose(fp);
  187. return NULL;
  188. }
  189. fseek(fp, 0, SEEK_END);
  190. *num = ftell(fp) / sizeof(char16);
  191. assert(*num >= 1);
  192. *num -= 1;
  193. char16 *hzs = new char16[*num];
  194. if (NULL == hzs) {
  195. fclose(fp);
  196. return NULL;
  197. }
  198. fseek(fp, 2, SEEK_SET);
  199. if (fread(hzs, sizeof(char16), *num, fp) != *num) {
  200. fclose(fp);
  201. delete [] hzs;
  202. return NULL;
  203. }
  204. fclose(fp);
  205. myqsort(hzs, *num, sizeof(char16), compare_char16);
  206. return hzs;
  207. }
  208. bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, unsigned hzs_len,
  209. char16 hz) {
  210. if (NULL == hzs)
  211. return false;
  212. char16 *found;
  213. found = static_cast<char16*>(
  214. mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16));
  215. if (NULL == found)
  216. return false;
  217. assert(*found == hz);
  218. return true;
  219. }
  220. // The caller makes sure that the parameters are valid.
  221. bool DictBuilder::str_in_hanzis_list(const char16 *hzs, unsigned hzs_len,
  222. const char16 *str, unsigned str_len) {
  223. if (NULL == hzs || NULL == str)
  224. return false;
  225. for (unsigned pos = 0; pos < str_len; pos++) {
  226. if (!hz_in_hanzis_list(hzs, hzs_len, str[pos]))
  227. return false;
  228. }
  229. return true;
  230. }
  231. void DictBuilder::get_top_lemmas() {
  232. top_lmas_num_ = 0;
  233. if (NULL == lemma_arr_)
  234. return;
  235. for (unsigned pos = 0; pos < lemma_num_; pos++) {
  236. if (0 == top_lmas_num_) {
  237. top_lmas_[0] = lemma_arr_[pos];
  238. top_lmas_num_ = 1;
  239. continue;
  240. }
  241. if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) {
  242. if (kTopScoreLemmaNum > top_lmas_num_)
  243. top_lmas_num_ += 1;
  244. unsigned move_pos;
  245. for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) {
  246. top_lmas_[move_pos] = top_lmas_[move_pos - 1];
  247. if (0 == move_pos - 1 ||
  248. (move_pos - 1 > 0 &&
  249. top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) {
  250. break;
  251. }
  252. }
  253. assert(move_pos > 0);
  254. top_lmas_[move_pos - 1] = lemma_arr_[pos];
  255. } else if (kTopScoreLemmaNum > top_lmas_num_) {
  256. top_lmas_[top_lmas_num_] = lemma_arr_[pos];
  257. top_lmas_num_ += 1;
  258. }
  259. }
  260. if (kPrintDebug0) {
  261. printf("\n------Top Lemmas------------------\n");
  262. for (unsigned pos = 0; pos < top_lmas_num_; pos++) {
  263. printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz,
  264. top_lmas_[pos].freq);
  265. }
  266. }
  267. }
  268. void DictBuilder::free_resource() {
  269. if (NULL != lemma_arr_)
  270. delete [] lemma_arr_;
  271. if (NULL != scis_)
  272. delete [] scis_;
  273. if (NULL != lma_nodes_le0_)
  274. delete [] lma_nodes_le0_;
  275. if (NULL != lma_nodes_ge1_)
  276. delete [] lma_nodes_ge1_;
  277. if (NULL != homo_idx_buf_)
  278. delete [] homo_idx_buf_;
  279. if (NULL != spl_table_)
  280. delete spl_table_;
  281. if (NULL != spl_parser_)
  282. delete spl_parser_;
  283. lemma_arr_ = NULL;
  284. scis_ = NULL;
  285. lma_nodes_le0_ = NULL;
  286. lma_nodes_ge1_ = NULL;
  287. homo_idx_buf_ = NULL;
  288. spl_table_ = NULL;
  289. spl_parser_ = NULL;
  290. lemma_num_ = 0;
  291. lma_nds_used_num_le0_ = 0;
  292. lma_nds_used_num_ge1_ = 0;
  293. homo_idx_num_eq1_ = 0;
  294. homo_idx_num_gt1_ = 0;
  295. }
  296. unsigned DictBuilder::read_raw_dict(const char* fn_raw,
  297. const char *fn_validhzs,
  298. unsigned max_item) {
  299. if (NULL == fn_raw) return 0;
  300. Utf16Reader utf16_reader;
  301. if (!utf16_reader.open(fn_raw, kReadBufLen * 10))
  302. return false;
  303. char16 read_buf[kReadBufLen];
  304. // Read the number of lemmas in the file
  305. unsigned lemma_num = 240000;
  306. // allocate resource required
  307. if (!alloc_resource(lemma_num)) {
  308. utf16_reader.close();
  309. }
  310. // Read the valid Hanzi list.
  311. char16 *valid_hzs = NULL;
  312. unsigned valid_hzs_num = 0;
  313. valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num);
  314. // Begin reading the lemma entries
  315. for (unsigned i = 0; i < max_item; i++) {
  316. // read next entry
  317. if (!utf16_reader.readline(read_buf, kReadBufLen)) {
  318. lemma_num = i;
  319. break;
  320. }
  321. unsigned token_size;
  322. char16 *token;
  323. char16 *to_tokenize = read_buf;
  324. // Get the Hanzi string
  325. token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
  326. if (NULL == token) {
  327. free_resource();
  328. utf16_reader.close();
  329. return false;
  330. }
  331. unsigned lemma_size = utf16_strlen(token);
  332. if (lemma_size > kMaxLemmaSize) {
  333. i--;
  334. continue;
  335. }
  336. if (lemma_size > 4) {
  337. i--;
  338. continue;
  339. }
  340. // Copy to the lemma entry
  341. utf16_strcpy(lemma_arr_[i].hanzi_str, token);
  342. lemma_arr_[i].hz_str_len = token_size;
  343. // Get the freq string
  344. token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
  345. if (NULL == token) {
  346. free_resource();
  347. utf16_reader.close();
  348. return false;
  349. }
  350. lemma_arr_[i].freq = utf16_atof(token);
  351. if (lemma_size > 1 && lemma_arr_[i].freq < 60) {
  352. i--;
  353. continue;
  354. }
  355. // Get GBK mark, if no valid Hanzi list available, all items which contains
  356. // GBK characters will be discarded. Otherwise, all items which contains
  357. // characters outside of the valid Hanzi list will be discarded.
  358. token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
  359. assert(NULL != token);
  360. int gbk_flag = utf16_atoi(token);
  361. if (NULL == valid_hzs || 0 == valid_hzs_num) {
  362. if (0 != gbk_flag) {
  363. i--;
  364. continue;
  365. }
  366. } else {
  367. if (!str_in_hanzis_list(valid_hzs, valid_hzs_num,
  368. lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) {
  369. i--;
  370. continue;
  371. }
  372. }
  373. // Get spelling String
  374. bool spelling_not_support = false;
  375. for (unsigned hz_pos = 0; hz_pos < (unsigned)lemma_arr_[i].hz_str_len;
  376. hz_pos++) {
  377. // Get a Pinyin
  378. token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
  379. if (NULL == token) {
  380. free_resource();
  381. utf16_reader.close();
  382. return false;
  383. }
  384. assert(utf16_strlen(token) <= kMaxPinyinSize);
  385. utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token);
  386. format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]);
  387. // Put the pinyin to the spelling table
  388. if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos],
  389. lemma_arr_[i].freq)) {
  390. spelling_not_support = true;
  391. break;
  392. }
  393. }
  394. // The whole line must have been parsed fully, otherwise discard this one.
  395. token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
  396. if (spelling_not_support || NULL != token) {
  397. i--;
  398. continue;
  399. }
  400. }
  401. delete [] valid_hzs;
  402. utf16_reader.close();
  403. printf("read succesfully, lemma num: %d\n", lemma_num);
  404. return lemma_num;
  405. }
  406. bool DictBuilder::build_dict(const char *fn_raw,
  407. const char *fn_validhzs,
  408. DictTrie *dict_trie) {
  409. if (NULL == fn_raw || NULL == dict_trie)
  410. return false;
  411. lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000);
  412. if (0 == lemma_num_)
  413. return false;
  414. // Arrange the spelling table, and build a spelling tree
  415. // The size of an spelling. '\0' is included. If the spelling table is
  416. // initialized to calculate the spelling scores, the last char in the
  417. // spelling string will be score, and it is also included in spl_item_size.
  418. unsigned spl_item_size;
  419. unsigned spl_num;
  420. const char* spl_buf;
  421. spl_buf = spl_table_->arrange(&spl_item_size, &spl_num);
  422. if (NULL == spl_buf) {
  423. free_resource();
  424. return false;
  425. }
  426. SpellingTrie &spl_trie = SpellingTrie::get_instance();
  427. if (!spl_trie.construct(spl_buf, spl_item_size, spl_num,
  428. spl_table_->get_score_amplifier(),
  429. spl_table_->get_average_score())) {
  430. free_resource();
  431. return false;
  432. }
  433. printf("spelling tree construct successfully.\n");
  434. // Convert the spelling string to idxs
  435. for (unsigned i = 0; i < lemma_num_; i++) {
  436. for (unsigned hz_pos = 0; hz_pos < (unsigned)lemma_arr_[i].hz_str_len;
  437. hz_pos++) {
  438. uint16 spl_idxs[2];
  439. uint16 spl_start_pos[3];
  440. bool is_pre = true;
  441. int spl_idx_num =
  442. spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos],
  443. strlen(lemma_arr_[i].pinyin_str[hz_pos]),
  444. spl_idxs, spl_start_pos, 2, is_pre);
  445. assert(1 == spl_idx_num);
  446. if (spl_trie.is_half_id(spl_idxs[0])) {
  447. uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs);
  448. assert(0 != num);
  449. }
  450. lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0];
  451. }
  452. }
  453. // Sort the lemma items according to the hanzi, and give each unique item a
  454. // id
  455. sort_lemmas_by_hz();
  456. scis_num_ = build_scis();
  457. // Construct the dict list
  458. dict_trie->dict_list_ = new DictList();
  459. bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_,
  460. lemma_arr_, lemma_num_);
  461. assert(dl_success);
  462. // Construct the NGram information
  463. NGram& ngram = NGram::get_instance();
  464. ngram.build_unigram(lemma_arr_, lemma_num_,
  465. lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);
  466. // sort the lemma items according to the spelling idx string
  467. myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);
  468. get_top_lemmas();
  469. #ifdef ___DO_STATISTICS___
  470. stat_init();
  471. #endif
  472. lma_nds_used_num_le0_ = 1; // The root node
  473. bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_),
  474. lemma_arr_, 0, lemma_num_, 0);
  475. if (!dt_success) {
  476. free_resource();
  477. return false;
  478. }
  479. #ifdef ___DO_STATISTICS___
  480. stat_print();
  481. #endif
  482. // Move the node data and homo data to the DictTrie
  483. dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_];
  484. dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_];
  485. unsigned lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_;
  486. dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize];
  487. assert(NULL != dict_trie->root_);
  488. assert(NULL != dict_trie->lma_idx_buf_);
  489. dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_;
  490. dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_;
  491. dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize;
  492. dict_trie->top_lmas_num_ = top_lmas_num_;
  493. memcpy(dict_trie->root_, lma_nodes_le0_,
  494. sizeof(LmaNodeLE0) * lma_nds_used_num_le0_);
  495. memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_,
  496. sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_);
  497. for (unsigned pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) {
  498. id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize,
  499. homo_idx_buf_[pos]);
  500. }
  501. for (unsigned pos = homo_idx_num_eq1_ + homo_idx_num_gt1_;
  502. pos < lma_idx_num; pos++) {
  503. LemmaIdType idx =
  504. top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz;
  505. id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx);
  506. }
  507. if (kPrintDebug0) {
  508. printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_);
  509. printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_);
  510. printf("top_lmas_num_: %d\n", top_lmas_num_);
  511. }
  512. free_resource();
  513. if (kPrintDebug0) {
  514. printf("Building dict succeds\n");
  515. }
  516. return dt_success;
  517. }
  518. void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) {
  519. if (NULL == buf) return;
  520. for (unsigned pos = 0; pos < kLemmaIdSize; pos++) {
  521. (buf)[pos] = (unsigned char)(id >> (pos * 8));
  522. }
  523. }
  524. void DictBuilder::set_son_offset(LmaNodeGE1 *node, unsigned offset) {
  525. node->son_1st_off_l = static_cast<uint16>(offset);
  526. node->son_1st_off_h = static_cast<unsigned char>(offset >> 16);
  527. }
  528. void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, unsigned offset) {
  529. node->homo_idx_buf_off_l = static_cast<uint16>(offset);
  530. node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16);
  531. }
  532. // All spelling strings will be converted to upper case, except that
  533. // spellings started with "ZH"/"CH"/"SH" will be converted to
  534. // "Zh"/"Ch"/"Sh"
  535. void DictBuilder::format_spelling_str(char *spl_str) {
  536. if (NULL == spl_str)
  537. return;
  538. uint16 pos = 0;
  539. while ('\0' != spl_str[pos]) {
  540. if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z')
  541. spl_str[pos] = spl_str[pos] - 'a' + 'A';
  542. if (1 == pos && 'H' == spl_str[pos]) {
  543. if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) {
  544. spl_str[pos] = 'h';
  545. }
  546. }
  547. pos++;
  548. }
  549. }
  550. LemmaIdType DictBuilder::sort_lemmas_by_hz() {
  551. if (NULL == lemma_arr_ || 0 == lemma_num_)
  552. return 0;
  553. myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs);
  554. lemma_arr_[0].idx_by_hz = 1;
  555. LemmaIdType idx_max = 1;
  556. for (unsigned i = 1; i < lemma_num_; i++) {
  557. if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) {
  558. idx_max++;
  559. lemma_arr_[i].idx_by_hz = idx_max;
  560. } else {
  561. idx_max++;
  562. lemma_arr_[i].idx_by_hz = idx_max;
  563. }
  564. }
  565. return idx_max + 1;
  566. }
  567. unsigned DictBuilder::build_scis() {
  568. if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_)
  569. return 0;
  570. SpellingTrie &spl_trie = SpellingTrie::get_instance();
  571. // This first one is blank, because id 0 is invalid.
  572. scis_[0].freq = 0;
  573. scis_[0].hz = 0;
  574. scis_[0].splid.full_splid = 0;
  575. scis_[0].splid.half_splid = 0;
  576. scis_num_ = 1;
  577. // Copy the hanzis to the buffer
  578. for (unsigned pos = 0; pos < lemma_num_; pos++) {
  579. unsigned hz_num = lemma_arr_[pos].hz_str_len;
  580. for (unsigned hzpos = 0; hzpos < hz_num; hzpos++) {
  581. scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos];
  582. scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
  583. scis_[scis_num_].splid.half_splid =
  584. spl_trie.full_to_half(scis_[scis_num_].splid.full_splid);
  585. if (1 == hz_num)
  586. scis_[scis_num_].freq = lemma_arr_[pos].freq;
  587. else
  588. scis_[scis_num_].freq = 0.000001;
  589. scis_num_++;
  590. }
  591. }
  592. myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq);
  593. // Remove repeated items
  594. unsigned unique_scis_num = 1;
  595. for (unsigned pos = 1; pos < scis_num_; pos++) {
  596. if (scis_[pos].hz == scis_[pos - 1].hz &&
  597. scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid)
  598. continue;
  599. scis_[unique_scis_num] = scis_[pos];
  600. scis_[unique_scis_num].splid.half_splid =
  601. spl_trie.full_to_half(scis_[pos].splid.full_splid);
  602. unique_scis_num++;
  603. }
  604. scis_num_ = unique_scis_num;
  605. // Update the lemma list.
  606. for (unsigned pos = 0; pos < lemma_num_; pos++) {
  607. unsigned hz_num = lemma_arr_[pos].hz_str_len;
  608. for (unsigned hzpos = 0; hzpos < hz_num; hzpos++) {
  609. SingleCharItem key;
  610. key.hz = lemma_arr_[pos].hanzi_str[hzpos];
  611. key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
  612. key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid);
  613. SingleCharItem *found;
  614. found = static_cast<SingleCharItem*>(mybsearch(&key, scis_,
  615. unique_scis_num,
  616. sizeof(SingleCharItem),
  617. cmp_scis_hz_splid));
  618. assert(found);
  619. lemma_arr_[pos].hanzi_scis_ids[hzpos] =
  620. static_cast<uint16>(found - scis_);
  621. lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid;
  622. }
  623. }
  624. return scis_num_;
  625. }
  626. bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr,
  627. unsigned item_start, unsigned item_end,
  628. unsigned level) {
  629. if (level >= kMaxLemmaSize || item_end <= item_start)
  630. return false;
  631. // 1. Scan for how many sons
  632. unsigned parent_son_num = 0;
  633. // LemmaNode *son_1st = NULL;
  634. // parent.num_of_son = 0;
  635. LemmaEntry *lma_last_start = lemma_arr_ + item_start;
  636. uint16 spl_idx_node = lma_last_start->spl_idx_arr[level];
  637. // Scan for how many sons to be allocaed
  638. for (unsigned i = item_start + 1; i< item_end; i++) {
  639. LemmaEntry *lma_current = lemma_arr + i;
  640. uint16 spl_idx_current = lma_current->spl_idx_arr[level];
  641. if (spl_idx_current != spl_idx_node) {
  642. parent_son_num++;
  643. spl_idx_node = spl_idx_current;
  644. }
  645. }
  646. parent_son_num++;
  647. #ifdef ___DO_STATISTICS___
  648. // Use to indicate whether all nodes of this layer have no son.
  649. bool allson_noson = true;
  650. assert(level < kMaxLemmaSize);
  651. if (parent_son_num > max_sonbuf_len_[level])
  652. max_sonbuf_len_[level] = parent_son_num;
  653. total_son_num_[level] += parent_son_num;
  654. total_sonbuf_num_[level] += 1;
  655. if (parent_son_num == 1)
  656. sonbufs_num1_++;
  657. else
  658. sonbufs_numgt1_++;
  659. total_lma_node_num_ += parent_son_num;
  660. #endif
  661. // 2. Update the parent's information
  662. // Update the parent's son list;
  663. LmaNodeLE0 *son_1st_le0 = NULL; // only one of le0 or ge1 is used
  664. LmaNodeGE1 *son_1st_ge1 = NULL; // only one of le0 or ge1 is used.
  665. if (0 == level) { // the parent is root
  666. (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
  667. lma_nds_used_num_le0_;
  668. son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_;
  669. lma_nds_used_num_le0_ += parent_son_num;
  670. assert(parent_son_num <= 65535);
  671. (static_cast<LmaNodeLE0*>(parent))->num_of_son =
  672. static_cast<uint16>(parent_son_num);
  673. } else if (1 == level) { // the parent is a son of root
  674. (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
  675. lma_nds_used_num_ge1_;
  676. son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
  677. lma_nds_used_num_ge1_ += parent_son_num;
  678. assert(parent_son_num <= 65535);
  679. (static_cast<LmaNodeLE0*>(parent))->num_of_son =
  680. static_cast<uint16>(parent_son_num);
  681. } else {
  682. set_son_offset((static_cast<LmaNodeGE1*>(parent)),
  683. lma_nds_used_num_ge1_);
  684. son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
  685. lma_nds_used_num_ge1_ += parent_son_num;
  686. assert(parent_son_num <= 255);
  687. (static_cast<LmaNodeGE1*>(parent))->num_of_son =
  688. (unsigned char)parent_son_num;
  689. }
  690. // 3. Now begin to construct the son one by one
  691. unsigned son_pos = 0;
  692. lma_last_start = lemma_arr_ + item_start;
  693. spl_idx_node = lma_last_start->spl_idx_arr[level];
  694. unsigned homo_num = 0;
  695. if (lma_last_start->spl_idx_arr[level + 1] == 0)
  696. homo_num = 1;
  697. unsigned item_start_next = item_start;
  698. for (unsigned i = item_start + 1; i < item_end; i++) {
  699. LemmaEntry* lma_current = lemma_arr_ + i;
  700. uint16 spl_idx_current = lma_current->spl_idx_arr[level];
  701. if (spl_idx_current == spl_idx_node) {
  702. if (lma_current->spl_idx_arr[level + 1] == 0)
  703. homo_num++;
  704. } else {
  705. // Construct a node
  706. LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid
  707. LmaNodeGE1 *node_cur_ge1 = NULL;
  708. if (0 == level) {
  709. node_cur_le0 = son_1st_le0 + son_pos;
  710. node_cur_le0->spl_idx = spl_idx_node;
  711. node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
  712. node_cur_le0->son_1st_off = 0;
  713. homo_idx_num_eq1_ += homo_num;
  714. } else {
  715. node_cur_ge1 = son_1st_ge1 + son_pos;
  716. node_cur_ge1->spl_idx = spl_idx_node;
  717. set_homo_id_buf_offset(node_cur_ge1,
  718. (homo_idx_num_eq1_ + homo_idx_num_gt1_));
  719. set_son_offset(node_cur_ge1, 0);
  720. homo_idx_num_gt1_ += homo_num;
  721. }
  722. if (homo_num > 0) {
  723. LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
  724. homo_idx_num_gt1_ - homo_num;
  725. if (0 == level) {
  726. assert(homo_num <= 65535);
  727. node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
  728. } else {
  729. assert(homo_num <= 255);
  730. node_cur_ge1->num_of_homo = (unsigned char)homo_num;
  731. }
  732. for (unsigned homo_pos = 0; homo_pos < homo_num; homo_pos++) {
  733. idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz;
  734. }
  735. #ifdef ___DO_STATISTICS___
  736. if (homo_num > max_homobuf_len_[level])
  737. max_homobuf_len_[level] = homo_num;
  738. total_homo_num_[level] += homo_num;
  739. #endif
  740. }
  741. if (i - item_start_next > homo_num) {
  742. void *next_parent;
  743. if (0 == level)
  744. next_parent = static_cast<void*>(node_cur_le0);
  745. else
  746. next_parent = static_cast<void*>(node_cur_ge1);
  747. construct_subset(next_parent, lemma_arr,
  748. item_start_next + homo_num, i, level + 1);
  749. #ifdef ___DO_STATISTICS___
  750. total_node_hasson_[level] += 1;
  751. allson_noson = false;
  752. #endif
  753. }
  754. // for the next son
  755. lma_last_start = lma_current;
  756. spl_idx_node = spl_idx_current;
  757. item_start_next = i;
  758. homo_num = 0;
  759. if (lma_current->spl_idx_arr[level + 1] == 0)
  760. homo_num = 1;
  761. son_pos++;
  762. }
  763. }
  764. // 4. The last one to construct
  765. LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid
  766. LmaNodeGE1 *node_cur_ge1 = NULL;
  767. if (0 == level) {
  768. node_cur_le0 = son_1st_le0 + son_pos;
  769. node_cur_le0->spl_idx = spl_idx_node;
  770. node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
  771. node_cur_le0->son_1st_off = 0;
  772. homo_idx_num_eq1_ += homo_num;
  773. } else {
  774. node_cur_ge1 = son_1st_ge1 + son_pos;
  775. node_cur_ge1->spl_idx = spl_idx_node;
  776. set_homo_id_buf_offset(node_cur_ge1,
  777. (homo_idx_num_eq1_ + homo_idx_num_gt1_));
  778. set_son_offset(node_cur_ge1, 0);
  779. homo_idx_num_gt1_ += homo_num;
  780. }
  781. if (homo_num > 0) {
  782. LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
  783. homo_idx_num_gt1_ - homo_num;
  784. if (0 == level) {
  785. assert(homo_num <= 65535);
  786. node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
  787. } else {
  788. assert(homo_num <= 255);
  789. node_cur_ge1->num_of_homo = (unsigned char)homo_num;
  790. }
  791. for (unsigned homo_pos = 0; homo_pos < homo_num; homo_pos++) {
  792. idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz;
  793. }
  794. #ifdef ___DO_STATISTICS___
  795. if (homo_num > max_homobuf_len_[level])
  796. max_homobuf_len_[level] = homo_num;
  797. total_homo_num_[level] += homo_num;
  798. #endif
  799. }
  800. if (item_end - item_start_next > homo_num) {
  801. void *next_parent;
  802. if (0 == level)
  803. next_parent = static_cast<void*>(node_cur_le0);
  804. else
  805. next_parent = static_cast<void*>(node_cur_ge1);
  806. construct_subset(next_parent, lemma_arr,
  807. item_start_next + homo_num, item_end, level + 1);
  808. #ifdef ___DO_STATISTICS___
  809. total_node_hasson_[level] += 1;
  810. allson_noson = false;
  811. #endif
  812. }
  813. #ifdef ___DO_STATISTICS___
  814. if (allson_noson) {
  815. total_sonbuf_allnoson_[level] += 1;
  816. total_node_in_sonbuf_allnoson_[level] += parent_son_num;
  817. }
  818. #endif
  819. assert(son_pos + 1 == parent_son_num);
  820. return true;
  821. }
  822. #ifdef ___DO_STATISTICS___
  823. void DictBuilder::stat_init() {
  824. memset(max_sonbuf_len_, 0, sizeof(unsigned) * kMaxLemmaSize);
  825. memset(max_homobuf_len_, 0, sizeof(unsigned) * kMaxLemmaSize);
  826. memset(total_son_num_, 0, sizeof(unsigned) * kMaxLemmaSize);
  827. memset(total_node_hasson_, 0, sizeof(unsigned) * kMaxLemmaSize);
  828. memset(total_sonbuf_num_, 0, sizeof(unsigned) * kMaxLemmaSize);
  829. memset(total_sonbuf_allnoson_, 0, sizeof(unsigned) * kMaxLemmaSize);
  830. memset(total_node_in_sonbuf_allnoson_, 0, sizeof(unsigned) * kMaxLemmaSize);
  831. memset(total_homo_num_, 0, sizeof(unsigned) * kMaxLemmaSize);
  832. sonbufs_num1_ = 0;
  833. sonbufs_numgt1_ = 0;
  834. total_lma_node_num_ = 0;
  835. }
  836. void DictBuilder::stat_print() {
  837. printf("\n------------STAT INFO-------------\n");
  838. printf("[root is layer -1]\n");
  839. printf(".. max_sonbuf_len per layer(from layer 0):\n ");
  840. for (unsigned i = 0; i < kMaxLemmaSize; i++)
  841. printf("%d, ", max_sonbuf_len_[i]);
  842. printf("-, \n");
  843. printf(".. max_homobuf_len per layer:\n -, ");
  844. for (unsigned i = 0; i < kMaxLemmaSize; i++)
  845. printf("%d, ", max_homobuf_len_[i]);
  846. printf("\n");
  847. printf(".. total_son_num per layer:\n ");
  848. for (unsigned i = 0; i < kMaxLemmaSize; i++)
  849. printf("%d, ", total_son_num_[i]);
  850. printf("-, \n");
  851. printf(".. total_node_hasson per layer:\n 1, ");
  852. for (unsigned i = 0; i < kMaxLemmaSize; i++)
  853. printf("%d, ", total_node_hasson_[i]);
  854. printf("\n");
  855. printf(".. total_sonbuf_num per layer:\n ");
  856. for (unsigned i = 0; i < kMaxLemmaSize; i++)
  857. printf("%d, ", total_sonbuf_num_[i]);
  858. printf("-, \n");
  859. printf(".. total_sonbuf_allnoson per layer:\n ");
  860. for (unsigned i = 0; i < kMaxLemmaSize; i++)
  861. printf("%d, ", total_sonbuf_allnoson_[i]);
  862. printf("-, \n");
  863. printf(".. total_node_in_sonbuf_allnoson per layer:\n ");
  864. for (unsigned i = 0; i < kMaxLemmaSize; i++)
  865. printf("%d, ", total_node_in_sonbuf_allnoson_[i]);
  866. printf("-, \n");
  867. printf(".. total_homo_num per layer:\n 0, ");
  868. for (unsigned i = 0; i < kMaxLemmaSize; i++)
  869. printf("%d, ", total_homo_num_[i]);
  870. printf("\n");
  871. printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_);
  872. printf(".. son buf allocation number with more than 1 son: %d\n",
  873. sonbufs_numgt1_);
  874. printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1);
  875. }
  876. #endif // ___DO_STATISTICS___
  877. #endif // ___BUILD_MODEL___
  878. } // namespace ime_pinyin