dictbuilder.cpp - The code is part of a dictionary builder,…

/src/im/gpinyin/share/dictbuilder.cpp

http://ftk.googlecode.com/ · C++ · 1070 lines · 812 code · 192 blank · 66 comment · 241 complexity · 9bf27adae65f9e175c5d08e6faf00ab7 MD5 · raw file

/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "../include/dictbuilder.h"
#include "../include/dicttrie.h"
#include "../include/mystdlib.h"
#include "../include/ngram.h"
#include "../include/searchutility.h"
#include "../include/spellingtable.h"
#include "../include/spellingtrie.h"
#include "../include/splparser.h"
#include "../include/utf16reader.h"

namespace ime_pinyin {

#ifdef ___BUILD_MODEL___

static const unsigned kReadBufLen = 512;
static const unsigned kSplTableHashLen = 2000;

// Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by
// frequencies.
int cmp_scis_hz_splid_freq(const void* p1, const void* p2) {
  const SingleCharItem *s1, *s2;
  s1 = static_cast<const SingleCharItem*>(p1);
  s2 = static_cast<const SingleCharItem*>(p2);

  if (s1->hz < s2->hz)
    return -1;
  if (s1->hz > s2->hz)
    return 1;

  if (s1->splid.half_splid < s2->splid.half_splid)
    return -1;
  if (s1->splid.half_splid > s2->splid.half_splid)
    return 1;

  if (s1->splid.full_splid < s2->splid.full_splid)
    return -1;
  if (s1->splid.full_splid > s2->splid.full_splid)
    return 1;

  if (s1->freq > s2->freq)
    return -1;
  if (s1->freq < s2->freq)
    return 1;
  return 0;
}

int cmp_scis_hz_splid(const void* p1, const void* p2) {
  const SingleCharItem *s1, *s2;
  s1 = static_cast<const SingleCharItem*>(p1);
  s2 = static_cast<const SingleCharItem*>(p2);

  if (s1->hz < s2->hz)
    return -1;
  if (s1->hz > s2->hz)
    return 1;

  if (s1->splid.half_splid < s2->splid.half_splid)
    return -1;
  if (s1->splid.half_splid > s2->splid.half_splid)
    return 1;

  if (s1->splid.full_splid < s2->splid.full_splid)
    return -1;
  if (s1->splid.full_splid > s2->splid.full_splid)
    return 1;

  return 0;
}

int cmp_lemma_entry_hzs(const void* p1, const void* p2) {
  unsigned size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
  unsigned size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
  if (size1 < size2)
    return -1;
  else if (size1 > size2)
    return 1;

  return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
                      ((const LemmaEntry*)p2)->hanzi_str);
}

int compare_char16(const void* p1, const void* p2) {
  if (*((const char16*)p1) < *((const char16*)p2))
    return -1;
  if (*((const char16*)p1) > *((const char16*)p2))
    return 1;
  return 0;
}

int compare_py(const void* p1, const void* p2) {
  int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
                         ((const LemmaEntry*)p2)->spl_idx_arr);

  if (0 != ret)
    return ret;

  return static_cast<int>(((const LemmaEntry*)p2)->freq) -
         static_cast<int>(((const LemmaEntry*)p1)->freq);
}

// First hanzi, if the same, then Pinyin
int cmp_lemma_entry_hzspys(const void* p1, const void* p2) {
  unsigned size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
  unsigned size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
  if (size1 < size2)
    return -1;
  else if (size1 > size2)
    return 1;
  int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
                         ((const LemmaEntry*)p2)->hanzi_str);

  if (0 != ret)
    return ret;

  ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
                     ((const LemmaEntry*)p2)->spl_idx_arr);
  return ret;
}

int compare_splid2(const void* p1, const void* p2) {
  int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
                         ((const LemmaEntry*)p2)->spl_idx_arr);
  return ret;
}

DictBuilder::DictBuilder() {
  lemma_arr_ = NULL;
  lemma_num_ = 0;

  scis_ = NULL;
  scis_num_ = 0;

  lma_nodes_le0_ = NULL;
  lma_nodes_ge1_ = NULL;

  lma_nds_used_num_le0_ = 0;
  lma_nds_used_num_ge1_ = 0;

  homo_idx_buf_ = NULL;
  homo_idx_num_eq1_ = 0;
  homo_idx_num_gt1_ = 0;

  top_lmas_ = NULL;
  top_lmas_num_ = 0;

  spl_table_ = NULL;
  spl_parser_ = NULL;
}

DictBuilder::~DictBuilder() {
  free_resource();
}

bool DictBuilder::alloc_resource(unsigned lma_num) {
  if (0 == lma_num)
    return false;

  free_resource();

  lemma_num_ = lma_num;
  lemma_arr_ = new LemmaEntry[lemma_num_];

  top_lmas_num_ = 0;
  top_lmas_ = new LemmaEntry[kTopScoreLemmaNum];

  // New the scis_ buffer to the possible maximum size.
  scis_num_ = lemma_num_ * kMaxLemmaSize;
  scis_ = new SingleCharItem[scis_num_];

  // The root and first level nodes is less than kMaxSpellingNum + 1
  lma_nds_used_num_le0_ = 0;
  lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1];

  // Other nodes is less than lemma_num
  lma_nds_used_num_ge1_ = 0;
  lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_];

  homo_idx_buf_ = new LemmaIdType[lemma_num_];
  spl_table_ = new SpellingTable();
  spl_parser_ = new SpellingParser();

  if (NULL == lemma_arr_ || NULL == top_lmas_ ||
      NULL == scis_ || NULL == spl_table_ ||
      NULL == spl_parser_ || NULL == lma_nodes_le0_ ||
      NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) {
    free_resource();
    return false;
  }

  memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_);
  memset(scis_, 0, sizeof(SingleCharItem) * scis_num_);
  memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1));
  memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_);
  memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_);
  spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true);

  return true;
}

char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, unsigned *num) {
  if (NULL == fn_validhzs || NULL == num)
    return NULL;

  *num = 0;
  FILE *fp = fopen(fn_validhzs, "rb");
  if (NULL == fp)
    return NULL;

  char16 utf16header;
  if (fread(&utf16header, sizeof(char16), 1, fp) != 1 ||
      0xfeff != utf16header) {
    fclose(fp);
    return NULL;
  }

  fseek(fp, 0, SEEK_END);
  *num = ftell(fp) / sizeof(char16);
  assert(*num >= 1);
  *num -= 1;

  char16 *hzs = new char16[*num];
  if (NULL == hzs) {
    fclose(fp);
    return NULL;
  }

  fseek(fp, 2, SEEK_SET);

  if (fread(hzs, sizeof(char16), *num, fp) != *num) {
    fclose(fp);
    delete [] hzs;
    return NULL;
  }
  fclose(fp);

  myqsort(hzs, *num, sizeof(char16), compare_char16);
  return hzs;
}

bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, unsigned hzs_len,
                                    char16 hz) {
  if (NULL == hzs)
    return false;

  char16 *found;
  found = static_cast<char16*>(
      mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16));
  if (NULL == found)
    return false;

  assert(*found == hz);
  return true;
}

// The caller makes sure that the parameters are valid.
bool DictBuilder::str_in_hanzis_list(const char16 *hzs, unsigned hzs_len,
                                     const char16 *str, unsigned str_len) {
  if (NULL == hzs || NULL == str)
    return false;

  for (unsigned pos = 0; pos < str_len; pos++) {
    if (!hz_in_hanzis_list(hzs, hzs_len, str[pos]))
      return false;
  }
  return true;
}

void DictBuilder::get_top_lemmas() {
  top_lmas_num_ = 0;
  if (NULL == lemma_arr_)
    return;

  for (unsigned pos = 0; pos < lemma_num_; pos++) {
    if (0 == top_lmas_num_) {
      top_lmas_[0] = lemma_arr_[pos];
      top_lmas_num_ = 1;
      continue;
    }

    if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) {
      if (kTopScoreLemmaNum > top_lmas_num_)
        top_lmas_num_ += 1;

      unsigned move_pos;
      for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) {
        top_lmas_[move_pos] = top_lmas_[move_pos - 1];
        if (0 == move_pos - 1 ||
            (move_pos - 1 > 0 &&
             top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) {
          break;
        }
      }
      assert(move_pos > 0);
      top_lmas_[move_pos - 1] = lemma_arr_[pos];
    } else if (kTopScoreLemmaNum > top_lmas_num_) {
      top_lmas_[top_lmas_num_] = lemma_arr_[pos];
      top_lmas_num_ += 1;
    }
  }

  if (kPrintDebug0) {
    printf("\n------Top Lemmas------------------\n");
    for (unsigned pos = 0; pos < top_lmas_num_; pos++) {
      printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz,
             top_lmas_[pos].freq);
    }
  }
}

void DictBuilder::free_resource() {
  if (NULL != lemma_arr_)
    delete [] lemma_arr_;

  if (NULL != scis_)
    delete [] scis_;

  if (NULL != lma_nodes_le0_)
    delete [] lma_nodes_le0_;

  if (NULL != lma_nodes_ge1_)
    delete [] lma_nodes_ge1_;

  if (NULL != homo_idx_buf_)
    delete [] homo_idx_buf_;

  if (NULL != spl_table_)
    delete spl_table_;

  if (NULL != spl_parser_)
    delete spl_parser_;

  lemma_arr_ = NULL;
  scis_ = NULL;
  lma_nodes_le0_ = NULL;
  lma_nodes_ge1_ = NULL;
  homo_idx_buf_ = NULL;
  spl_table_ = NULL;
  spl_parser_ = NULL;

  lemma_num_ = 0;
  lma_nds_used_num_le0_ = 0;
  lma_nds_used_num_ge1_ = 0;
  homo_idx_num_eq1_ = 0;
  homo_idx_num_gt1_ = 0;
}

unsigned DictBuilder::read_raw_dict(const char* fn_raw,
                                  const char *fn_validhzs,
                                  unsigned max_item) {
  if (NULL == fn_raw) return 0;

  Utf16Reader utf16_reader;
  if (!utf16_reader.open(fn_raw, kReadBufLen * 10))
    return false;

  char16 read_buf[kReadBufLen];

  // Read the number of lemmas in the file
  unsigned lemma_num = 240000;

  // allocate resource required
  if (!alloc_resource(lemma_num)) {
    utf16_reader.close();
  }

  // Read the valid Hanzi list.
  char16 *valid_hzs = NULL;
  unsigned valid_hzs_num = 0;
  valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num);

  // Begin reading the lemma entries
  for (unsigned i = 0; i < max_item; i++) {
    // read next entry
    if (!utf16_reader.readline(read_buf, kReadBufLen)) {
      lemma_num = i;
      break;
    }

    unsigned token_size;
    char16 *token;
    char16 *to_tokenize = read_buf;

    // Get the Hanzi string
    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
    if (NULL == token) {
      free_resource();
      utf16_reader.close();
      return false;
    }

    unsigned lemma_size = utf16_strlen(token);

    if (lemma_size > kMaxLemmaSize) {
      i--;
      continue;
    }

    if (lemma_size > 4) {
      i--;
      continue;
    }

    // Copy to the lemma entry
    utf16_strcpy(lemma_arr_[i].hanzi_str, token);

    lemma_arr_[i].hz_str_len = token_size;

    // Get the freq string
    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
    if (NULL == token) {
      free_resource();
      utf16_reader.close();
      return false;
    }
    lemma_arr_[i].freq = utf16_atof(token);

    if (lemma_size > 1 && lemma_arr_[i].freq < 60) {
      i--;
      continue;
    }

    // Get GBK mark, if no valid Hanzi list available, all items which contains
    // GBK characters will be discarded. Otherwise, all items which contains
    // characters outside of the valid Hanzi list will be discarded.
    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
    assert(NULL != token);
    int gbk_flag = utf16_atoi(token);
    if (NULL == valid_hzs || 0 == valid_hzs_num) {
      if (0 != gbk_flag) {
        i--;
        continue;
      }
    } else {
      if (!str_in_hanzis_list(valid_hzs, valid_hzs_num,
          lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) {
        i--;
        continue;
      }
    }

    // Get spelling String
    bool spelling_not_support = false;
    for (unsigned hz_pos = 0; hz_pos < (unsigned)lemma_arr_[i].hz_str_len;
         hz_pos++) {
      // Get a Pinyin
      token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
      if (NULL == token) {
        free_resource();
        utf16_reader.close();
        return false;
      }

      assert(utf16_strlen(token) <= kMaxPinyinSize);

      utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token);

      format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]);

      // Put the pinyin to the spelling table
      if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos],
                                    lemma_arr_[i].freq)) {
        spelling_not_support = true;
        break;
      }
    }

    // The whole line must have been parsed fully, otherwise discard this one.
    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
    if (spelling_not_support || NULL != token) {
      i--;
      continue;
    }
  }

  delete [] valid_hzs;
  utf16_reader.close();

  printf("read succesfully, lemma num: %d\n", lemma_num);

  return lemma_num;
}

bool DictBuilder::build_dict(const char *fn_raw,
                             const char *fn_validhzs,
                             DictTrie *dict_trie) {
  if (NULL == fn_raw || NULL == dict_trie)
    return false;

  lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000);
  if (0 == lemma_num_)
    return false;

  // Arrange the spelling table, and build a spelling tree
  // The size of an spelling. '\0' is included. If the spelling table is
  // initialized to calculate the spelling scores, the last char in the
  // spelling string will be score, and it is also included in spl_item_size.
  unsigned spl_item_size;
  unsigned spl_num;
  const char* spl_buf;
  spl_buf = spl_table_->arrange(&spl_item_size, &spl_num);
  if (NULL == spl_buf) {
    free_resource();
    return false;
  }

  SpellingTrie &spl_trie = SpellingTrie::get_instance();

  if (!spl_trie.construct(spl_buf, spl_item_size, spl_num,
                          spl_table_->get_score_amplifier(),
                          spl_table_->get_average_score())) {
    free_resource();
    return false;
  }

  printf("spelling tree construct successfully.\n");

  // Convert the spelling string to idxs
  for (unsigned i = 0; i < lemma_num_; i++) {
    for (unsigned hz_pos = 0; hz_pos < (unsigned)lemma_arr_[i].hz_str_len;
         hz_pos++) {
      uint16 spl_idxs[2];
      uint16 spl_start_pos[3];
      bool is_pre = true;
      int spl_idx_num =
        spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos],
                                    strlen(lemma_arr_[i].pinyin_str[hz_pos]),
                                    spl_idxs, spl_start_pos, 2, is_pre);
      assert(1 == spl_idx_num);

      if (spl_trie.is_half_id(spl_idxs[0])) {
        uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs);
        assert(0 != num);
      }
      lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0];
    }
  }

  // Sort the lemma items according to the hanzi, and give each unique item a
  // id
  sort_lemmas_by_hz();

  scis_num_ = build_scis();

  // Construct the dict list
  dict_trie->dict_list_ = new DictList();
  bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_,
                                                     lemma_arr_, lemma_num_);
  assert(dl_success);

  // Construct the NGram information
  NGram& ngram = NGram::get_instance();
  ngram.build_unigram(lemma_arr_, lemma_num_,
                      lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);

  // sort the lemma items according to the spelling idx string
  myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);

  get_top_lemmas();

#ifdef ___DO_STATISTICS___
  stat_init();
#endif

  lma_nds_used_num_le0_ = 1;  // The root node
  bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_),
                                     lemma_arr_, 0, lemma_num_, 0);
  if (!dt_success) {
    free_resource();
    return false;
  }

#ifdef ___DO_STATISTICS___
  stat_print();
#endif

  // Move the node data and homo data to the DictTrie
  dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_];
  dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_];
  unsigned lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_;
  dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize];
  assert(NULL != dict_trie->root_);
  assert(NULL != dict_trie->lma_idx_buf_);
  dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_;
  dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_;
  dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize;
  dict_trie->top_lmas_num_ = top_lmas_num_;

  memcpy(dict_trie->root_, lma_nodes_le0_,
         sizeof(LmaNodeLE0) * lma_nds_used_num_le0_);
  memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_,
         sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_);

  for (unsigned pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) {
    id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize,
                  homo_idx_buf_[pos]);
  }

  for (unsigned pos = homo_idx_num_eq1_ + homo_idx_num_gt1_;
       pos < lma_idx_num; pos++) {
    LemmaIdType idx =
        top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz;
    id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx);
  }

  if (kPrintDebug0) {
    printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_);
    printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_);
    printf("top_lmas_num_: %d\n", top_lmas_num_);
  }

  free_resource();

  if (kPrintDebug0) {
    printf("Building dict succeds\n");
  }
  return dt_success;
}

void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) {
  if (NULL == buf) return;
  for (unsigned pos = 0; pos < kLemmaIdSize; pos++) {
    (buf)[pos] = (unsigned char)(id >> (pos * 8));
  }
}

void DictBuilder::set_son_offset(LmaNodeGE1 *node, unsigned offset) {
  node->son_1st_off_l = static_cast<uint16>(offset);
  node->son_1st_off_h = static_cast<unsigned char>(offset >> 16);
}

void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, unsigned offset) {
  node->homo_idx_buf_off_l = static_cast<uint16>(offset);
  node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16);

}

// All spelling strings will be converted to upper case, except that
// spellings started with "ZH"/"CH"/"SH" will be converted to
// "Zh"/"Ch"/"Sh"
void DictBuilder::format_spelling_str(char *spl_str) {
  if (NULL == spl_str)
    return;

  uint16 pos = 0;
  while ('\0' != spl_str[pos]) {
    if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z')
      spl_str[pos] = spl_str[pos] - 'a' + 'A';

    if (1 == pos && 'H' == spl_str[pos]) {
      if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) {
        spl_str[pos] = 'h';
      }
    }
    pos++;
  }
}

LemmaIdType DictBuilder::sort_lemmas_by_hz() {
  if (NULL == lemma_arr_ || 0 == lemma_num_)
    return 0;

  myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs);

  lemma_arr_[0].idx_by_hz = 1;
  LemmaIdType idx_max = 1;
  for (unsigned i = 1; i < lemma_num_; i++) {
    if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) {
      idx_max++;
      lemma_arr_[i].idx_by_hz = idx_max;
    } else {
      idx_max++;
      lemma_arr_[i].idx_by_hz = idx_max;
    }
  }
  return idx_max + 1;
}

unsigned DictBuilder::build_scis() {
  if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_)
    return 0;

  SpellingTrie &spl_trie = SpellingTrie::get_instance();

  // This first one is blank, because id 0 is invalid.
  scis_[0].freq = 0;
  scis_[0].hz = 0;
  scis_[0].splid.full_splid = 0;
  scis_[0].splid.half_splid = 0;
  scis_num_ = 1;

  // Copy the hanzis to the buffer
  for (unsigned pos = 0; pos < lemma_num_; pos++) {
    unsigned hz_num = lemma_arr_[pos].hz_str_len;
    for (unsigned hzpos = 0; hzpos < hz_num; hzpos++) {
      scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos];
      scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
      scis_[scis_num_].splid.half_splid =
          spl_trie.full_to_half(scis_[scis_num_].splid.full_splid);
      if (1 == hz_num)
        scis_[scis_num_].freq = lemma_arr_[pos].freq;
      else
        scis_[scis_num_].freq = 0.000001;
      scis_num_++;
    }
  }

  myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq);

  // Remove repeated items
  unsigned unique_scis_num = 1;
  for (unsigned pos = 1; pos < scis_num_; pos++) {
    if (scis_[pos].hz == scis_[pos - 1].hz &&
        scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid)
      continue;
    scis_[unique_scis_num] = scis_[pos];
    scis_[unique_scis_num].splid.half_splid =
        spl_trie.full_to_half(scis_[pos].splid.full_splid);
    unique_scis_num++;
  }

  scis_num_ = unique_scis_num;

  // Update the lemma list.
  for (unsigned pos = 0; pos < lemma_num_; pos++) {
    unsigned hz_num = lemma_arr_[pos].hz_str_len;
    for (unsigned hzpos = 0; hzpos < hz_num; hzpos++) {
      SingleCharItem key;
      key.hz = lemma_arr_[pos].hanzi_str[hzpos];
      key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
      key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid);

      SingleCharItem *found;
      found = static_cast<SingleCharItem*>(mybsearch(&key, scis_,
                                                     unique_scis_num,
                                                     sizeof(SingleCharItem),
                                                     cmp_scis_hz_splid));

      assert(found);

      lemma_arr_[pos].hanzi_scis_ids[hzpos] =
          static_cast<uint16>(found - scis_);
      lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid;
    }
  }

  return scis_num_;
}

bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr,
                                   unsigned item_start, unsigned item_end,
                                   unsigned level) {
  if (level >= kMaxLemmaSize || item_end <= item_start)
    return false;

  // 1. Scan for how many sons
  unsigned parent_son_num = 0;
  // LemmaNode *son_1st = NULL;
  // parent.num_of_son = 0;

  LemmaEntry *lma_last_start = lemma_arr_ + item_start;
  uint16 spl_idx_node = lma_last_start->spl_idx_arr[level];

  // Scan for how many sons to be allocaed
  for (unsigned i = item_start + 1; i< item_end; i++) {
    LemmaEntry *lma_current = lemma_arr + i;
    uint16 spl_idx_current = lma_current->spl_idx_arr[level];
    if (spl_idx_current != spl_idx_node) {
      parent_son_num++;
      spl_idx_node = spl_idx_current;
    }
  }
  parent_son_num++;

#ifdef ___DO_STATISTICS___
  // Use to indicate whether all nodes of this layer have no son.
  bool allson_noson = true;

  assert(level < kMaxLemmaSize);
  if (parent_son_num > max_sonbuf_len_[level])
    max_sonbuf_len_[level] = parent_son_num;

  total_son_num_[level] += parent_son_num;
  total_sonbuf_num_[level] += 1;

  if (parent_son_num == 1)
    sonbufs_num1_++;
  else
    sonbufs_numgt1_++;
  total_lma_node_num_ += parent_son_num;
#endif

  // 2. Update the parent's information
  //    Update the parent's son list;
  LmaNodeLE0 *son_1st_le0 = NULL;  // only one of le0 or ge1 is used
  LmaNodeGE1 *son_1st_ge1 = NULL;  // only one of le0 or ge1 is used.
  if (0 == level) {                 // the parent is root
    (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
      lma_nds_used_num_le0_;
    son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_;
    lma_nds_used_num_le0_ += parent_son_num;

    assert(parent_son_num <= 65535);
    (static_cast<LmaNodeLE0*>(parent))->num_of_son =
      static_cast<uint16>(parent_son_num);
  } else if (1 == level) {  // the parent is a son of root
    (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
      lma_nds_used_num_ge1_;
    son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
    lma_nds_used_num_ge1_ += parent_son_num;

    assert(parent_son_num <= 65535);
    (static_cast<LmaNodeLE0*>(parent))->num_of_son =
      static_cast<uint16>(parent_son_num);
  } else {
    set_son_offset((static_cast<LmaNodeGE1*>(parent)),
                   lma_nds_used_num_ge1_);
    son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
    lma_nds_used_num_ge1_ += parent_son_num;

    assert(parent_son_num <= 255);
    (static_cast<LmaNodeGE1*>(parent))->num_of_son =
      (unsigned char)parent_son_num;
  }

  // 3. Now begin to construct the son one by one
  unsigned son_pos = 0;

  lma_last_start = lemma_arr_ + item_start;
  spl_idx_node = lma_last_start->spl_idx_arr[level];

  unsigned homo_num = 0;
  if (lma_last_start->spl_idx_arr[level + 1] == 0)
    homo_num = 1;

  unsigned item_start_next = item_start;

  for (unsigned i = item_start + 1; i < item_end; i++) {
    LemmaEntry* lma_current = lemma_arr_ + i;
    uint16 spl_idx_current = lma_current->spl_idx_arr[level];

    if (spl_idx_current == spl_idx_node) {
      if (lma_current->spl_idx_arr[level + 1] == 0)
        homo_num++;
    } else {
      // Construct a node
      LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
      LmaNodeGE1 *node_cur_ge1 = NULL;
      if (0 == level) {
        node_cur_le0 = son_1st_le0 + son_pos;
        node_cur_le0->spl_idx = spl_idx_node;
        node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
        node_cur_le0->son_1st_off = 0;
        homo_idx_num_eq1_ += homo_num;
      } else {
        node_cur_ge1 = son_1st_ge1 + son_pos;
        node_cur_ge1->spl_idx = spl_idx_node;

        set_homo_id_buf_offset(node_cur_ge1,
                               (homo_idx_num_eq1_ + homo_idx_num_gt1_));
        set_son_offset(node_cur_ge1, 0);
        homo_idx_num_gt1_ += homo_num;
      }

      if (homo_num > 0) {
        LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
              homo_idx_num_gt1_ - homo_num;
        if (0 == level) {
          assert(homo_num <= 65535);
          node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
        } else {
          assert(homo_num <= 255);
          node_cur_ge1->num_of_homo = (unsigned char)homo_num;
        }

        for (unsigned homo_pos = 0; homo_pos < homo_num; homo_pos++) {
          idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz;
        }

#ifdef ___DO_STATISTICS___
        if (homo_num > max_homobuf_len_[level])
          max_homobuf_len_[level] = homo_num;

        total_homo_num_[level] += homo_num;
#endif
      }

      if (i - item_start_next > homo_num) {
        void *next_parent;
        if (0 == level)
          next_parent = static_cast<void*>(node_cur_le0);
        else
          next_parent = static_cast<void*>(node_cur_ge1);
        construct_subset(next_parent, lemma_arr,
                         item_start_next + homo_num, i, level + 1);
#ifdef ___DO_STATISTICS___

        total_node_hasson_[level] += 1;
        allson_noson = false;
#endif
      }

      // for the next son
      lma_last_start = lma_current;
      spl_idx_node = spl_idx_current;
      item_start_next = i;
      homo_num = 0;
      if (lma_current->spl_idx_arr[level + 1] == 0)
        homo_num = 1;

      son_pos++;
    }
  }

  // 4. The last one to construct
  LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
  LmaNodeGE1 *node_cur_ge1 = NULL;
  if (0 == level) {
    node_cur_le0 = son_1st_le0 + son_pos;
    node_cur_le0->spl_idx = spl_idx_node;
    node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
    node_cur_le0->son_1st_off = 0;
    homo_idx_num_eq1_ += homo_num;
  } else {
    node_cur_ge1 = son_1st_ge1 + son_pos;
    node_cur_ge1->spl_idx = spl_idx_node;

    set_homo_id_buf_offset(node_cur_ge1,
                           (homo_idx_num_eq1_ + homo_idx_num_gt1_));
    set_son_offset(node_cur_ge1, 0);
    homo_idx_num_gt1_ += homo_num;
  }

  if (homo_num > 0) {
    LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
          homo_idx_num_gt1_ - homo_num;
    if (0 == level) {
      assert(homo_num <= 65535);
      node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
    } else {
      assert(homo_num <= 255);
      node_cur_ge1->num_of_homo = (unsigned char)homo_num;
    }

    for (unsigned homo_pos = 0; homo_pos < homo_num; homo_pos++) {
      idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz;
    }

#ifdef ___DO_STATISTICS___
    if (homo_num > max_homobuf_len_[level])
      max_homobuf_len_[level] = homo_num;

    total_homo_num_[level] += homo_num;
#endif
  }

  if (item_end - item_start_next > homo_num) {
    void *next_parent;
    if (0 == level)
      next_parent = static_cast<void*>(node_cur_le0);
    else
      next_parent = static_cast<void*>(node_cur_ge1);
    construct_subset(next_parent, lemma_arr,
                     item_start_next + homo_num, item_end, level + 1);
#ifdef ___DO_STATISTICS___

    total_node_hasson_[level] += 1;
    allson_noson = false;
#endif
  }

#ifdef ___DO_STATISTICS___
  if (allson_noson) {
    total_sonbuf_allnoson_[level] += 1;
    total_node_in_sonbuf_allnoson_[level] += parent_son_num;
  }
#endif

  assert(son_pos + 1 == parent_son_num);
  return true;
}

#ifdef ___DO_STATISTICS___
void DictBuilder::stat_init() {
  memset(max_sonbuf_len_, 0, sizeof(unsigned) * kMaxLemmaSize);
  memset(max_homobuf_len_, 0, sizeof(unsigned) * kMaxLemmaSize);
  memset(total_son_num_, 0, sizeof(unsigned) * kMaxLemmaSize);
  memset(total_node_hasson_, 0, sizeof(unsigned) * kMaxLemmaSize);
  memset(total_sonbuf_num_, 0, sizeof(unsigned) * kMaxLemmaSize);
  memset(total_sonbuf_allnoson_, 0, sizeof(unsigned) * kMaxLemmaSize);
  memset(total_node_in_sonbuf_allnoson_, 0, sizeof(unsigned) * kMaxLemmaSize);
  memset(total_homo_num_, 0, sizeof(unsigned) * kMaxLemmaSize);

  sonbufs_num1_ = 0;
  sonbufs_numgt1_ = 0;
  total_lma_node_num_ = 0;
}

void DictBuilder::stat_print() {
  printf("\n------------STAT INFO-------------\n");
  printf("[root is layer -1]\n");
  printf(".. max_sonbuf_len per layer(from layer 0):\n   ");
  for (unsigned i = 0; i < kMaxLemmaSize; i++)
    printf("%d, ", max_sonbuf_len_[i]);
  printf("-, \n");

  printf(".. max_homobuf_len per layer:\n   -, ");
  for (unsigned i = 0; i < kMaxLemmaSize; i++)
    printf("%d, ", max_homobuf_len_[i]);
  printf("\n");

  printf(".. total_son_num per layer:\n   ");
  for (unsigned i = 0; i < kMaxLemmaSize; i++)
    printf("%d, ", total_son_num_[i]);
  printf("-, \n");

  printf(".. total_node_hasson per layer:\n   1, ");
  for (unsigned i = 0; i < kMaxLemmaSize; i++)
    printf("%d, ", total_node_hasson_[i]);
  printf("\n");

  printf(".. total_sonbuf_num per layer:\n   ");
  for (unsigned i = 0; i < kMaxLemmaSize; i++)
    printf("%d, ", total_sonbuf_num_[i]);
  printf("-, \n");

  printf(".. total_sonbuf_allnoson per layer:\n   ");
  for (unsigned i = 0; i < kMaxLemmaSize; i++)
    printf("%d, ", total_sonbuf_allnoson_[i]);
  printf("-, \n");

  printf(".. total_node_in_sonbuf_allnoson per layer:\n   ");
  for (unsigned i = 0; i < kMaxLemmaSize; i++)
    printf("%d, ", total_node_in_sonbuf_allnoson_[i]);
  printf("-, \n");

  printf(".. total_homo_num per layer:\n   0, ");
  for (unsigned i = 0; i < kMaxLemmaSize; i++)
    printf("%d, ", total_homo_num_[i]);
  printf("\n");

  printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_);
  printf(".. son buf allocation number with more than 1 son: %d\n",
         sonbufs_numgt1_);
  printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1);
}
#endif  // ___DO_STATISTICS___

#endif  // ___BUILD_MODEL___
}  // namespace ime_pinyin
Summary ✨

The code is part of a dictionary builder, responsible for constructing a data structure from a set of lemmas (words with their meanings). It recursively traverses the lemmas, allocating memory for sonbufs and homobufs as needed, and keeps track of statistics such as maximum buffer sizes and total node numbers. The statistics are printed at the end to provide an overview of the construction process.
Alerts (4)

Complexity hotspot; lines 203 to 206 (total complexity: 13)
203 204 205 206