PageRenderTime 564ms CodeModel.GetById 3ms app.highlight 514ms RepoModel.GetById 2ms app.codeStats 2ms

/src/im/gpinyin/share/dictbuilder.cpp

http://ftk.googlecode.com/
C++ | 1070 lines | 812 code | 192 blank | 66 comment | 241 complexity | 9bf27adae65f9e175c5d08e6faf00ab7 MD5 | raw file
   1/*
   2 * Copyright (C) 2009 The Android Open Source Project
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *      http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 * See the License for the specific language governing permissions and
  14 * limitations under the License.
  15 */
  16
  17#include <assert.h>
  18#include <stdio.h>
  19#include <stdlib.h>
  20#include <string.h>
  21
  22#include "../include/dictbuilder.h"
  23#include "../include/dicttrie.h"
  24#include "../include/mystdlib.h"
  25#include "../include/ngram.h"
  26#include "../include/searchutility.h"
  27#include "../include/spellingtable.h"
  28#include "../include/spellingtrie.h"
  29#include "../include/splparser.h"
  30#include "../include/utf16reader.h"
  31
  32namespace ime_pinyin {
  33
  34#ifdef ___BUILD_MODEL___
  35
  36static const unsigned kReadBufLen = 512;
  37static const unsigned kSplTableHashLen = 2000;
  38
  39// Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by
  40// frequencies.
  41int cmp_scis_hz_splid_freq(const void* p1, const void* p2) {
  42  const SingleCharItem *s1, *s2;
  43  s1 = static_cast<const SingleCharItem*>(p1);
  44  s2 = static_cast<const SingleCharItem*>(p2);
  45
  46  if (s1->hz < s2->hz)
  47    return -1;
  48  if (s1->hz > s2->hz)
  49    return 1;
  50
  51  if (s1->splid.half_splid < s2->splid.half_splid)
  52    return -1;
  53  if (s1->splid.half_splid > s2->splid.half_splid)
  54    return 1;
  55
  56  if (s1->splid.full_splid < s2->splid.full_splid)
  57    return -1;
  58  if (s1->splid.full_splid > s2->splid.full_splid)
  59    return 1;
  60
  61  if (s1->freq > s2->freq)
  62    return -1;
  63  if (s1->freq < s2->freq)
  64    return 1;
  65  return 0;
  66}
  67
  68int cmp_scis_hz_splid(const void* p1, const void* p2) {
  69  const SingleCharItem *s1, *s2;
  70  s1 = static_cast<const SingleCharItem*>(p1);
  71  s2 = static_cast<const SingleCharItem*>(p2);
  72
  73  if (s1->hz < s2->hz)
  74    return -1;
  75  if (s1->hz > s2->hz)
  76    return 1;
  77
  78  if (s1->splid.half_splid < s2->splid.half_splid)
  79    return -1;
  80  if (s1->splid.half_splid > s2->splid.half_splid)
  81    return 1;
  82
  83  if (s1->splid.full_splid < s2->splid.full_splid)
  84    return -1;
  85  if (s1->splid.full_splid > s2->splid.full_splid)
  86    return 1;
  87
  88  return 0;
  89}
  90
  91int cmp_lemma_entry_hzs(const void* p1, const void* p2) {
  92  unsigned size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
  93  unsigned size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
  94  if (size1 < size2)
  95    return -1;
  96  else if (size1 > size2)
  97    return 1;
  98
  99  return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
 100                      ((const LemmaEntry*)p2)->hanzi_str);
 101}
 102
 103int compare_char16(const void* p1, const void* p2) {
 104  if (*((const char16*)p1) < *((const char16*)p2))
 105    return -1;
 106  if (*((const char16*)p1) > *((const char16*)p2))
 107    return 1;
 108  return 0;
 109}
 110
 111int compare_py(const void* p1, const void* p2) {
 112  int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
 113                         ((const LemmaEntry*)p2)->spl_idx_arr);
 114
 115  if (0 != ret)
 116    return ret;
 117
 118  return static_cast<int>(((const LemmaEntry*)p2)->freq) -
 119         static_cast<int>(((const LemmaEntry*)p1)->freq);
 120}
 121
 122// First hanzi, if the same, then Pinyin
 123int cmp_lemma_entry_hzspys(const void* p1, const void* p2) {
 124  unsigned size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
 125  unsigned size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
 126  if (size1 < size2)
 127    return -1;
 128  else if (size1 > size2)
 129    return 1;
 130  int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
 131                         ((const LemmaEntry*)p2)->hanzi_str);
 132
 133  if (0 != ret)
 134    return ret;
 135
 136  ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
 137                     ((const LemmaEntry*)p2)->spl_idx_arr);
 138  return ret;
 139}
 140
 141int compare_splid2(const void* p1, const void* p2) {
 142  int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
 143                         ((const LemmaEntry*)p2)->spl_idx_arr);
 144  return ret;
 145}
 146
 147DictBuilder::DictBuilder() {
 148  lemma_arr_ = NULL;
 149  lemma_num_ = 0;
 150
 151  scis_ = NULL;
 152  scis_num_ = 0;
 153
 154  lma_nodes_le0_ = NULL;
 155  lma_nodes_ge1_ = NULL;
 156
 157  lma_nds_used_num_le0_ = 0;
 158  lma_nds_used_num_ge1_ = 0;
 159
 160  homo_idx_buf_ = NULL;
 161  homo_idx_num_eq1_ = 0;
 162  homo_idx_num_gt1_ = 0;
 163
 164  top_lmas_ = NULL;
 165  top_lmas_num_ = 0;
 166
 167  spl_table_ = NULL;
 168  spl_parser_ = NULL;
 169}
 170
 171DictBuilder::~DictBuilder() {
 172  free_resource();
 173}
 174
 175bool DictBuilder::alloc_resource(unsigned lma_num) {
 176  if (0 == lma_num)
 177    return false;
 178
 179  free_resource();
 180
 181  lemma_num_ = lma_num;
 182  lemma_arr_ = new LemmaEntry[lemma_num_];
 183
 184  top_lmas_num_ = 0;
 185  top_lmas_ = new LemmaEntry[kTopScoreLemmaNum];
 186
 187  // New the scis_ buffer to the possible maximum size.
 188  scis_num_ = lemma_num_ * kMaxLemmaSize;
 189  scis_ = new SingleCharItem[scis_num_];
 190
 191  // The root and first level nodes is less than kMaxSpellingNum + 1
 192  lma_nds_used_num_le0_ = 0;
 193  lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1];
 194
 195  // Other nodes is less than lemma_num
 196  lma_nds_used_num_ge1_ = 0;
 197  lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_];
 198
 199  homo_idx_buf_ = new LemmaIdType[lemma_num_];
 200  spl_table_ = new SpellingTable();
 201  spl_parser_ = new SpellingParser();
 202
 203  if (NULL == lemma_arr_ || NULL == top_lmas_ ||
 204      NULL == scis_ || NULL == spl_table_ ||
 205      NULL == spl_parser_ || NULL == lma_nodes_le0_ ||
 206      NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) {
 207    free_resource();
 208    return false;
 209  }
 210
 211  memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_);
 212  memset(scis_, 0, sizeof(SingleCharItem) * scis_num_);
 213  memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1));
 214  memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_);
 215  memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_);
 216  spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true);
 217
 218  return true;
 219}
 220
 221char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, unsigned *num) {
 222  if (NULL == fn_validhzs || NULL == num)
 223    return NULL;
 224
 225  *num = 0;
 226  FILE *fp = fopen(fn_validhzs, "rb");
 227  if (NULL == fp)
 228    return NULL;
 229
 230  char16 utf16header;
 231  if (fread(&utf16header, sizeof(char16), 1, fp) != 1 ||
 232      0xfeff != utf16header) {
 233    fclose(fp);
 234    return NULL;
 235  }
 236
 237  fseek(fp, 0, SEEK_END);
 238  *num = ftell(fp) / sizeof(char16);
 239  assert(*num >= 1);
 240  *num -= 1;
 241
 242  char16 *hzs = new char16[*num];
 243  if (NULL == hzs) {
 244    fclose(fp);
 245    return NULL;
 246  }
 247
 248  fseek(fp, 2, SEEK_SET);
 249
 250  if (fread(hzs, sizeof(char16), *num, fp) != *num) {
 251    fclose(fp);
 252    delete [] hzs;
 253    return NULL;
 254  }
 255  fclose(fp);
 256
 257  myqsort(hzs, *num, sizeof(char16), compare_char16);
 258  return hzs;
 259}
 260
 261bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, unsigned hzs_len,
 262                                    char16 hz) {
 263  if (NULL == hzs)
 264    return false;
 265
 266  char16 *found;
 267  found = static_cast<char16*>(
 268      mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16));
 269  if (NULL == found)
 270    return false;
 271
 272  assert(*found == hz);
 273  return true;
 274}
 275
 276// The caller makes sure that the parameters are valid.
 277bool DictBuilder::str_in_hanzis_list(const char16 *hzs, unsigned hzs_len,
 278                                     const char16 *str, unsigned str_len) {
 279  if (NULL == hzs || NULL == str)
 280    return false;
 281
 282  for (unsigned pos = 0; pos < str_len; pos++) {
 283    if (!hz_in_hanzis_list(hzs, hzs_len, str[pos]))
 284      return false;
 285  }
 286  return true;
 287}
 288
 289void DictBuilder::get_top_lemmas() {
 290  top_lmas_num_ = 0;
 291  if (NULL == lemma_arr_)
 292    return;
 293
 294  for (unsigned pos = 0; pos < lemma_num_; pos++) {
 295    if (0 == top_lmas_num_) {
 296      top_lmas_[0] = lemma_arr_[pos];
 297      top_lmas_num_ = 1;
 298      continue;
 299    }
 300
 301    if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) {
 302      if (kTopScoreLemmaNum > top_lmas_num_)
 303        top_lmas_num_ += 1;
 304
 305      unsigned move_pos;
 306      for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) {
 307        top_lmas_[move_pos] = top_lmas_[move_pos - 1];
 308        if (0 == move_pos - 1 ||
 309            (move_pos - 1 > 0 &&
 310             top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) {
 311          break;
 312        }
 313      }
 314      assert(move_pos > 0);
 315      top_lmas_[move_pos - 1] = lemma_arr_[pos];
 316    } else if (kTopScoreLemmaNum > top_lmas_num_) {
 317      top_lmas_[top_lmas_num_] = lemma_arr_[pos];
 318      top_lmas_num_ += 1;
 319    }
 320  }
 321
 322  if (kPrintDebug0) {
 323    printf("\n------Top Lemmas------------------\n");
 324    for (unsigned pos = 0; pos < top_lmas_num_; pos++) {
 325      printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz,
 326             top_lmas_[pos].freq);
 327    }
 328  }
 329}
 330
 331void DictBuilder::free_resource() {
 332  if (NULL != lemma_arr_)
 333    delete [] lemma_arr_;
 334
 335  if (NULL != scis_)
 336    delete [] scis_;
 337
 338  if (NULL != lma_nodes_le0_)
 339    delete [] lma_nodes_le0_;
 340
 341  if (NULL != lma_nodes_ge1_)
 342    delete [] lma_nodes_ge1_;
 343
 344  if (NULL != homo_idx_buf_)
 345    delete [] homo_idx_buf_;
 346
 347  if (NULL != spl_table_)
 348    delete spl_table_;
 349
 350  if (NULL != spl_parser_)
 351    delete spl_parser_;
 352
 353  lemma_arr_ = NULL;
 354  scis_ = NULL;
 355  lma_nodes_le0_ = NULL;
 356  lma_nodes_ge1_ = NULL;
 357  homo_idx_buf_ = NULL;
 358  spl_table_ = NULL;
 359  spl_parser_ = NULL;
 360
 361  lemma_num_ = 0;
 362  lma_nds_used_num_le0_ = 0;
 363  lma_nds_used_num_ge1_ = 0;
 364  homo_idx_num_eq1_ = 0;
 365  homo_idx_num_gt1_ = 0;
 366}
 367
 368unsigned DictBuilder::read_raw_dict(const char* fn_raw,
 369                                  const char *fn_validhzs,
 370                                  unsigned max_item) {
 371  if (NULL == fn_raw) return 0;
 372
 373  Utf16Reader utf16_reader;
 374  if (!utf16_reader.open(fn_raw, kReadBufLen * 10))
 375    return false;
 376
 377  char16 read_buf[kReadBufLen];
 378
 379  // Read the number of lemmas in the file
 380  unsigned lemma_num = 240000;
 381
 382  // allocate resource required
 383  if (!alloc_resource(lemma_num)) {
 384    utf16_reader.close();
 385  }
 386
 387  // Read the valid Hanzi list.
 388  char16 *valid_hzs = NULL;
 389  unsigned valid_hzs_num = 0;
 390  valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num);
 391
 392  // Begin reading the lemma entries
 393  for (unsigned i = 0; i < max_item; i++) {
 394    // read next entry
 395    if (!utf16_reader.readline(read_buf, kReadBufLen)) {
 396      lemma_num = i;
 397      break;
 398    }
 399
 400    unsigned token_size;
 401    char16 *token;
 402    char16 *to_tokenize = read_buf;
 403
 404    // Get the Hanzi string
 405    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
 406    if (NULL == token) {
 407      free_resource();
 408      utf16_reader.close();
 409      return false;
 410    }
 411
 412    unsigned lemma_size = utf16_strlen(token);
 413
 414    if (lemma_size > kMaxLemmaSize) {
 415      i--;
 416      continue;
 417    }
 418
 419    if (lemma_size > 4) {
 420      i--;
 421      continue;
 422    }
 423
 424    // Copy to the lemma entry
 425    utf16_strcpy(lemma_arr_[i].hanzi_str, token);
 426
 427    lemma_arr_[i].hz_str_len = token_size;
 428
 429    // Get the freq string
 430    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
 431    if (NULL == token) {
 432      free_resource();
 433      utf16_reader.close();
 434      return false;
 435    }
 436    lemma_arr_[i].freq = utf16_atof(token);
 437
 438    if (lemma_size > 1 && lemma_arr_[i].freq < 60) {
 439      i--;
 440      continue;
 441    }
 442
 443    // Get GBK mark, if no valid Hanzi list available, all items which contains
 444    // GBK characters will be discarded. Otherwise, all items which contains
 445    // characters outside of the valid Hanzi list will be discarded.
 446    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
 447    assert(NULL != token);
 448    int gbk_flag = utf16_atoi(token);
 449    if (NULL == valid_hzs || 0 == valid_hzs_num) {
 450      if (0 != gbk_flag) {
 451        i--;
 452        continue;
 453      }
 454    } else {
 455      if (!str_in_hanzis_list(valid_hzs, valid_hzs_num,
 456          lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) {
 457        i--;
 458        continue;
 459      }
 460    }
 461
 462    // Get spelling String
 463    bool spelling_not_support = false;
 464    for (unsigned hz_pos = 0; hz_pos < (unsigned)lemma_arr_[i].hz_str_len;
 465         hz_pos++) {
 466      // Get a Pinyin
 467      token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
 468      if (NULL == token) {
 469        free_resource();
 470        utf16_reader.close();
 471        return false;
 472      }
 473
 474      assert(utf16_strlen(token) <= kMaxPinyinSize);
 475
 476      utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token);
 477
 478      format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]);
 479
 480      // Put the pinyin to the spelling table
 481      if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos],
 482                                    lemma_arr_[i].freq)) {
 483        spelling_not_support = true;
 484        break;
 485      }
 486    }
 487
 488    // The whole line must have been parsed fully, otherwise discard this one.
 489    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
 490    if (spelling_not_support || NULL != token) {
 491      i--;
 492      continue;
 493    }
 494  }
 495
 496  delete [] valid_hzs;
 497  utf16_reader.close();
 498
 499  printf("read succesfully, lemma num: %d\n", lemma_num);
 500
 501  return lemma_num;
 502}
 503
 504bool DictBuilder::build_dict(const char *fn_raw,
 505                             const char *fn_validhzs,
 506                             DictTrie *dict_trie) {
 507  if (NULL == fn_raw || NULL == dict_trie)
 508    return false;
 509
 510  lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000);
 511  if (0 == lemma_num_)
 512    return false;
 513
 514  // Arrange the spelling table, and build a spelling tree
 515  // The size of an spelling. '\0' is included. If the spelling table is
 516  // initialized to calculate the spelling scores, the last char in the
 517  // spelling string will be score, and it is also included in spl_item_size.
 518  unsigned spl_item_size;
 519  unsigned spl_num;
 520  const char* spl_buf;
 521  spl_buf = spl_table_->arrange(&spl_item_size, &spl_num);
 522  if (NULL == spl_buf) {
 523    free_resource();
 524    return false;
 525  }
 526
 527  SpellingTrie &spl_trie = SpellingTrie::get_instance();
 528
 529  if (!spl_trie.construct(spl_buf, spl_item_size, spl_num,
 530                          spl_table_->get_score_amplifier(),
 531                          spl_table_->get_average_score())) {
 532    free_resource();
 533    return false;
 534  }
 535
 536  printf("spelling tree construct successfully.\n");
 537
 538  // Convert the spelling string to idxs
 539  for (unsigned i = 0; i < lemma_num_; i++) {
 540    for (unsigned hz_pos = 0; hz_pos < (unsigned)lemma_arr_[i].hz_str_len;
 541         hz_pos++) {
 542      uint16 spl_idxs[2];
 543      uint16 spl_start_pos[3];
 544      bool is_pre = true;
 545      int spl_idx_num =
 546        spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos],
 547                                    strlen(lemma_arr_[i].pinyin_str[hz_pos]),
 548                                    spl_idxs, spl_start_pos, 2, is_pre);
 549      assert(1 == spl_idx_num);
 550
 551      if (spl_trie.is_half_id(spl_idxs[0])) {
 552        uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs);
 553        assert(0 != num);
 554      }
 555      lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0];
 556    }
 557  }
 558
 559  // Sort the lemma items according to the hanzi, and give each unique item a
 560  // id
 561  sort_lemmas_by_hz();
 562
 563  scis_num_ = build_scis();
 564
 565  // Construct the dict list
 566  dict_trie->dict_list_ = new DictList();
 567  bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_,
 568                                                     lemma_arr_, lemma_num_);
 569  assert(dl_success);
 570
 571  // Construct the NGram information
 572  NGram& ngram = NGram::get_instance();
 573  ngram.build_unigram(lemma_arr_, lemma_num_,
 574                      lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);
 575
 576  // sort the lemma items according to the spelling idx string
 577  myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);
 578
 579  get_top_lemmas();
 580
 581#ifdef ___DO_STATISTICS___
 582  stat_init();
 583#endif
 584
 585  lma_nds_used_num_le0_ = 1;  // The root node
 586  bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_),
 587                                     lemma_arr_, 0, lemma_num_, 0);
 588  if (!dt_success) {
 589    free_resource();
 590    return false;
 591  }
 592
 593#ifdef ___DO_STATISTICS___
 594  stat_print();
 595#endif
 596
 597  // Move the node data and homo data to the DictTrie
 598  dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_];
 599  dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_];
 600  unsigned lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_;
 601  dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize];
 602  assert(NULL != dict_trie->root_);
 603  assert(NULL != dict_trie->lma_idx_buf_);
 604  dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_;
 605  dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_;
 606  dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize;
 607  dict_trie->top_lmas_num_ = top_lmas_num_;
 608
 609  memcpy(dict_trie->root_, lma_nodes_le0_,
 610         sizeof(LmaNodeLE0) * lma_nds_used_num_le0_);
 611  memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_,
 612         sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_);
 613
 614  for (unsigned pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) {
 615    id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize,
 616                  homo_idx_buf_[pos]);
 617  }
 618
 619  for (unsigned pos = homo_idx_num_eq1_ + homo_idx_num_gt1_;
 620       pos < lma_idx_num; pos++) {
 621    LemmaIdType idx =
 622        top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz;
 623    id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx);
 624  }
 625
 626  if (kPrintDebug0) {
 627    printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_);
 628    printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_);
 629    printf("top_lmas_num_: %d\n", top_lmas_num_);
 630  }
 631
 632  free_resource();
 633
 634  if (kPrintDebug0) {
 635    printf("Building dict succeds\n");
 636  }
 637  return dt_success;
 638}
 639
 640void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) {
 641  if (NULL == buf) return;
 642  for (unsigned pos = 0; pos < kLemmaIdSize; pos++) {
 643    (buf)[pos] = (unsigned char)(id >> (pos * 8));
 644  }
 645}
 646
 647void DictBuilder::set_son_offset(LmaNodeGE1 *node, unsigned offset) {
 648  node->son_1st_off_l = static_cast<uint16>(offset);
 649  node->son_1st_off_h = static_cast<unsigned char>(offset >> 16);
 650}
 651
 652void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, unsigned offset) {
 653  node->homo_idx_buf_off_l = static_cast<uint16>(offset);
 654  node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16);
 655
 656}
 657
 658// All spelling strings will be converted to upper case, except that
 659// spellings started with "ZH"/"CH"/"SH" will be converted to
 660// "Zh"/"Ch"/"Sh"
 661void DictBuilder::format_spelling_str(char *spl_str) {
 662  if (NULL == spl_str)
 663    return;
 664
 665  uint16 pos = 0;
 666  while ('\0' != spl_str[pos]) {
 667    if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z')
 668      spl_str[pos] = spl_str[pos] - 'a' + 'A';
 669
 670    if (1 == pos && 'H' == spl_str[pos]) {
 671      if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) {
 672        spl_str[pos] = 'h';
 673      }
 674    }
 675    pos++;
 676  }
 677}
 678
 679LemmaIdType DictBuilder::sort_lemmas_by_hz() {
 680  if (NULL == lemma_arr_ || 0 == lemma_num_)
 681    return 0;
 682
 683  myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs);
 684
 685  lemma_arr_[0].idx_by_hz = 1;
 686  LemmaIdType idx_max = 1;
 687  for (unsigned i = 1; i < lemma_num_; i++) {
 688    if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) {
 689      idx_max++;
 690      lemma_arr_[i].idx_by_hz = idx_max;
 691    } else {
 692      idx_max++;
 693      lemma_arr_[i].idx_by_hz = idx_max;
 694    }
 695  }
 696  return idx_max + 1;
 697}
 698
 699unsigned DictBuilder::build_scis() {
 700  if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_)
 701    return 0;
 702
 703  SpellingTrie &spl_trie = SpellingTrie::get_instance();
 704
 705  // This first one is blank, because id 0 is invalid.
 706  scis_[0].freq = 0;
 707  scis_[0].hz = 0;
 708  scis_[0].splid.full_splid = 0;
 709  scis_[0].splid.half_splid = 0;
 710  scis_num_ = 1;
 711
 712  // Copy the hanzis to the buffer
 713  for (unsigned pos = 0; pos < lemma_num_; pos++) {
 714    unsigned hz_num = lemma_arr_[pos].hz_str_len;
 715    for (unsigned hzpos = 0; hzpos < hz_num; hzpos++) {
 716      scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos];
 717      scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
 718      scis_[scis_num_].splid.half_splid =
 719          spl_trie.full_to_half(scis_[scis_num_].splid.full_splid);
 720      if (1 == hz_num)
 721        scis_[scis_num_].freq = lemma_arr_[pos].freq;
 722      else
 723        scis_[scis_num_].freq = 0.000001;
 724      scis_num_++;
 725    }
 726  }
 727
 728  myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq);
 729
 730  // Remove repeated items
 731  unsigned unique_scis_num = 1;
 732  for (unsigned pos = 1; pos < scis_num_; pos++) {
 733    if (scis_[pos].hz == scis_[pos - 1].hz &&
 734        scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid)
 735      continue;
 736    scis_[unique_scis_num] = scis_[pos];
 737    scis_[unique_scis_num].splid.half_splid =
 738        spl_trie.full_to_half(scis_[pos].splid.full_splid);
 739    unique_scis_num++;
 740  }
 741
 742  scis_num_ = unique_scis_num;
 743
 744  // Update the lemma list.
 745  for (unsigned pos = 0; pos < lemma_num_; pos++) {
 746    unsigned hz_num = lemma_arr_[pos].hz_str_len;
 747    for (unsigned hzpos = 0; hzpos < hz_num; hzpos++) {
 748      SingleCharItem key;
 749      key.hz = lemma_arr_[pos].hanzi_str[hzpos];
 750      key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
 751      key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid);
 752
 753      SingleCharItem *found;
 754      found = static_cast<SingleCharItem*>(mybsearch(&key, scis_,
 755                                                     unique_scis_num,
 756                                                     sizeof(SingleCharItem),
 757                                                     cmp_scis_hz_splid));
 758
 759      assert(found);
 760
 761      lemma_arr_[pos].hanzi_scis_ids[hzpos] =
 762          static_cast<uint16>(found - scis_);
 763      lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid;
 764    }
 765  }
 766
 767  return scis_num_;
 768}
 769
 770bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr,
 771                                   unsigned item_start, unsigned item_end,
 772                                   unsigned level) {
 773  if (level >= kMaxLemmaSize || item_end <= item_start)
 774    return false;
 775
 776  // 1. Scan for how many sons
 777  unsigned parent_son_num = 0;
 778  // LemmaNode *son_1st = NULL;
 779  // parent.num_of_son = 0;
 780
 781  LemmaEntry *lma_last_start = lemma_arr_ + item_start;
 782  uint16 spl_idx_node = lma_last_start->spl_idx_arr[level];
 783
 784  // Scan for how many sons to be allocaed
 785  for (unsigned i = item_start + 1; i< item_end; i++) {
 786    LemmaEntry *lma_current = lemma_arr + i;
 787    uint16 spl_idx_current = lma_current->spl_idx_arr[level];
 788    if (spl_idx_current != spl_idx_node) {
 789      parent_son_num++;
 790      spl_idx_node = spl_idx_current;
 791    }
 792  }
 793  parent_son_num++;
 794
 795#ifdef ___DO_STATISTICS___
 796  // Use to indicate whether all nodes of this layer have no son.
 797  bool allson_noson = true;
 798
 799  assert(level < kMaxLemmaSize);
 800  if (parent_son_num > max_sonbuf_len_[level])
 801    max_sonbuf_len_[level] = parent_son_num;
 802
 803  total_son_num_[level] += parent_son_num;
 804  total_sonbuf_num_[level] += 1;
 805
 806  if (parent_son_num == 1)
 807    sonbufs_num1_++;
 808  else
 809    sonbufs_numgt1_++;
 810  total_lma_node_num_ += parent_son_num;
 811#endif
 812
 813  // 2. Update the parent's information
 814  //    Update the parent's son list;
 815  LmaNodeLE0 *son_1st_le0 = NULL;  // only one of le0 or ge1 is used
 816  LmaNodeGE1 *son_1st_ge1 = NULL;  // only one of le0 or ge1 is used.
 817  if (0 == level) {                 // the parent is root
 818    (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
 819      lma_nds_used_num_le0_;
 820    son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_;
 821    lma_nds_used_num_le0_ += parent_son_num;
 822
 823    assert(parent_son_num <= 65535);
 824    (static_cast<LmaNodeLE0*>(parent))->num_of_son =
 825      static_cast<uint16>(parent_son_num);
 826  } else if (1 == level) {  // the parent is a son of root
 827    (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
 828      lma_nds_used_num_ge1_;
 829    son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
 830    lma_nds_used_num_ge1_ += parent_son_num;
 831
 832    assert(parent_son_num <= 65535);
 833    (static_cast<LmaNodeLE0*>(parent))->num_of_son =
 834      static_cast<uint16>(parent_son_num);
 835  } else {
 836    set_son_offset((static_cast<LmaNodeGE1*>(parent)),
 837                   lma_nds_used_num_ge1_);
 838    son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
 839    lma_nds_used_num_ge1_ += parent_son_num;
 840
 841    assert(parent_son_num <= 255);
 842    (static_cast<LmaNodeGE1*>(parent))->num_of_son =
 843      (unsigned char)parent_son_num;
 844  }
 845
 846  // 3. Now begin to construct the son one by one
 847  unsigned son_pos = 0;
 848
 849  lma_last_start = lemma_arr_ + item_start;
 850  spl_idx_node = lma_last_start->spl_idx_arr[level];
 851
 852  unsigned homo_num = 0;
 853  if (lma_last_start->spl_idx_arr[level + 1] == 0)
 854    homo_num = 1;
 855
 856  unsigned item_start_next = item_start;
 857
 858  for (unsigned i = item_start + 1; i < item_end; i++) {
 859    LemmaEntry* lma_current = lemma_arr_ + i;
 860    uint16 spl_idx_current = lma_current->spl_idx_arr[level];
 861
 862    if (spl_idx_current == spl_idx_node) {
 863      if (lma_current->spl_idx_arr[level + 1] == 0)
 864        homo_num++;
 865    } else {
 866      // Construct a node
 867      LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
 868      LmaNodeGE1 *node_cur_ge1 = NULL;
 869      if (0 == level) {
 870        node_cur_le0 = son_1st_le0 + son_pos;
 871        node_cur_le0->spl_idx = spl_idx_node;
 872        node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
 873        node_cur_le0->son_1st_off = 0;
 874        homo_idx_num_eq1_ += homo_num;
 875      } else {
 876        node_cur_ge1 = son_1st_ge1 + son_pos;
 877        node_cur_ge1->spl_idx = spl_idx_node;
 878
 879        set_homo_id_buf_offset(node_cur_ge1,
 880                               (homo_idx_num_eq1_ + homo_idx_num_gt1_));
 881        set_son_offset(node_cur_ge1, 0);
 882        homo_idx_num_gt1_ += homo_num;
 883      }
 884
 885      if (homo_num > 0) {
 886        LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
 887              homo_idx_num_gt1_ - homo_num;
 888        if (0 == level) {
 889          assert(homo_num <= 65535);
 890          node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
 891        } else {
 892          assert(homo_num <= 255);
 893          node_cur_ge1->num_of_homo = (unsigned char)homo_num;
 894        }
 895
 896        for (unsigned homo_pos = 0; homo_pos < homo_num; homo_pos++) {
 897          idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz;
 898        }
 899
 900#ifdef ___DO_STATISTICS___
 901        if (homo_num > max_homobuf_len_[level])
 902          max_homobuf_len_[level] = homo_num;
 903
 904        total_homo_num_[level] += homo_num;
 905#endif
 906      }
 907
 908      if (i - item_start_next > homo_num) {
 909        void *next_parent;
 910        if (0 == level)
 911          next_parent = static_cast<void*>(node_cur_le0);
 912        else
 913          next_parent = static_cast<void*>(node_cur_ge1);
 914        construct_subset(next_parent, lemma_arr,
 915                         item_start_next + homo_num, i, level + 1);
 916#ifdef ___DO_STATISTICS___
 917
 918        total_node_hasson_[level] += 1;
 919        allson_noson = false;
 920#endif
 921      }
 922
 923      // for the next son
 924      lma_last_start = lma_current;
 925      spl_idx_node = spl_idx_current;
 926      item_start_next = i;
 927      homo_num = 0;
 928      if (lma_current->spl_idx_arr[level + 1] == 0)
 929        homo_num = 1;
 930
 931      son_pos++;
 932    }
 933  }
 934
 935  // 4. The last one to construct
 936  LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
 937  LmaNodeGE1 *node_cur_ge1 = NULL;
 938  if (0 == level) {
 939    node_cur_le0 = son_1st_le0 + son_pos;
 940    node_cur_le0->spl_idx = spl_idx_node;
 941    node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
 942    node_cur_le0->son_1st_off = 0;
 943    homo_idx_num_eq1_ += homo_num;
 944  } else {
 945    node_cur_ge1 = son_1st_ge1 + son_pos;
 946    node_cur_ge1->spl_idx = spl_idx_node;
 947
 948    set_homo_id_buf_offset(node_cur_ge1,
 949                           (homo_idx_num_eq1_ + homo_idx_num_gt1_));
 950    set_son_offset(node_cur_ge1, 0);
 951    homo_idx_num_gt1_ += homo_num;
 952  }
 953
 954  if (homo_num > 0) {
 955    LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
 956          homo_idx_num_gt1_ - homo_num;
 957    if (0 == level) {
 958      assert(homo_num <= 65535);
 959      node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
 960    } else {
 961      assert(homo_num <= 255);
 962      node_cur_ge1->num_of_homo = (unsigned char)homo_num;
 963    }
 964
 965    for (unsigned homo_pos = 0; homo_pos < homo_num; homo_pos++) {
 966      idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz;
 967    }
 968
 969#ifdef ___DO_STATISTICS___
 970    if (homo_num > max_homobuf_len_[level])
 971      max_homobuf_len_[level] = homo_num;
 972
 973    total_homo_num_[level] += homo_num;
 974#endif
 975  }
 976
 977  if (item_end - item_start_next > homo_num) {
 978    void *next_parent;
 979    if (0 == level)
 980      next_parent = static_cast<void*>(node_cur_le0);
 981    else
 982      next_parent = static_cast<void*>(node_cur_ge1);
 983    construct_subset(next_parent, lemma_arr,
 984                     item_start_next + homo_num, item_end, level + 1);
 985#ifdef ___DO_STATISTICS___
 986
 987    total_node_hasson_[level] += 1;
 988    allson_noson = false;
 989#endif
 990  }
 991
 992#ifdef ___DO_STATISTICS___
 993  if (allson_noson) {
 994    total_sonbuf_allnoson_[level] += 1;
 995    total_node_in_sonbuf_allnoson_[level] += parent_son_num;
 996  }
 997#endif
 998
 999  assert(son_pos + 1 == parent_son_num);
1000  return true;
1001}
1002
1003#ifdef ___DO_STATISTICS___
1004void DictBuilder::stat_init() {
1005  memset(max_sonbuf_len_, 0, sizeof(unsigned) * kMaxLemmaSize);
1006  memset(max_homobuf_len_, 0, sizeof(unsigned) * kMaxLemmaSize);
1007  memset(total_son_num_, 0, sizeof(unsigned) * kMaxLemmaSize);
1008  memset(total_node_hasson_, 0, sizeof(unsigned) * kMaxLemmaSize);
1009  memset(total_sonbuf_num_, 0, sizeof(unsigned) * kMaxLemmaSize);
1010  memset(total_sonbuf_allnoson_, 0, sizeof(unsigned) * kMaxLemmaSize);
1011  memset(total_node_in_sonbuf_allnoson_, 0, sizeof(unsigned) * kMaxLemmaSize);
1012  memset(total_homo_num_, 0, sizeof(unsigned) * kMaxLemmaSize);
1013
1014  sonbufs_num1_ = 0;
1015  sonbufs_numgt1_ = 0;
1016  total_lma_node_num_ = 0;
1017}
1018
1019void DictBuilder::stat_print() {
1020  printf("\n------------STAT INFO-------------\n");
1021  printf("[root is layer -1]\n");
1022  printf(".. max_sonbuf_len per layer(from layer 0):\n   ");
1023  for (unsigned i = 0; i < kMaxLemmaSize; i++)
1024    printf("%d, ", max_sonbuf_len_[i]);
1025  printf("-, \n");
1026
1027  printf(".. max_homobuf_len per layer:\n   -, ");
1028  for (unsigned i = 0; i < kMaxLemmaSize; i++)
1029    printf("%d, ", max_homobuf_len_[i]);
1030  printf("\n");
1031
1032  printf(".. total_son_num per layer:\n   ");
1033  for (unsigned i = 0; i < kMaxLemmaSize; i++)
1034    printf("%d, ", total_son_num_[i]);
1035  printf("-, \n");
1036
1037  printf(".. total_node_hasson per layer:\n   1, ");
1038  for (unsigned i = 0; i < kMaxLemmaSize; i++)
1039    printf("%d, ", total_node_hasson_[i]);
1040  printf("\n");
1041
1042  printf(".. total_sonbuf_num per layer:\n   ");
1043  for (unsigned i = 0; i < kMaxLemmaSize; i++)
1044    printf("%d, ", total_sonbuf_num_[i]);
1045  printf("-, \n");
1046
1047  printf(".. total_sonbuf_allnoson per layer:\n   ");
1048  for (unsigned i = 0; i < kMaxLemmaSize; i++)
1049    printf("%d, ", total_sonbuf_allnoson_[i]);
1050  printf("-, \n");
1051
1052  printf(".. total_node_in_sonbuf_allnoson per layer:\n   ");
1053  for (unsigned i = 0; i < kMaxLemmaSize; i++)
1054    printf("%d, ", total_node_in_sonbuf_allnoson_[i]);
1055  printf("-, \n");
1056
1057  printf(".. total_homo_num per layer:\n   0, ");
1058  for (unsigned i = 0; i < kMaxLemmaSize; i++)
1059    printf("%d, ", total_homo_num_[i]);
1060  printf("\n");
1061
1062  printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_);
1063  printf(".. son buf allocation number with more than 1 son: %d\n",
1064         sonbufs_numgt1_);
1065  printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1);
1066}
1067#endif  // ___DO_STATISTICS___
1068
1069#endif  // ___BUILD_MODEL___
1070}  // namespace ime_pinyin