PageRenderTime 2530ms CodeModel.GetById 142ms app.highlight 2230ms RepoModel.GetById 139ms app.codeStats 1ms

/src/im/gpinyin/share/matrixsearch.cpp

http://ftk.googlecode.com/
C++ | 1958 lines | 1468 code | 307 blank | 183 comment | 541 complexity | 78b349d2a93098d3ec614f9c46e4972e MD5 | raw file

Large files files are truncated, but you can click here to view the full file

   1/*
   2 * Copyright (C) 2009 The Android Open Source Project
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *      http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 * See the License for the specific language governing permissions and
  14 * limitations under the License.
  15 */
  16
  17#include <assert.h>
  18#include <math.h>
  19#include <stdio.h>
  20#include <string.h>
  21#include "../include/lpicache.h"
  22#include "../include/matrixsearch.h"
  23#include "../include/mystdlib.h"
  24#include "../include/ngram.h"
  25#include "../include/userdict.h"
  26
  27namespace ime_pinyin {
  28
  29#define PRUMING_SCORE 8000.0
  30
  31MatrixSearch::MatrixSearch() {
  32  inited_ = false;
  33  spl_trie_ = SpellingTrie::get_cpinstance();
  34
  35  reset_pointers_to_null();
  36
  37  pys_decoded_len_ = 0;
  38  mtrx_nd_pool_used_ = 0;
  39  dmi_pool_used_ = 0;
  40  xi_an_enabled_ = false;
  41  dmi_c_phrase_ = false;
  42
  43  assert(kMaxSearchSteps > 0);
  44  max_sps_len_ = kMaxSearchSteps - 1;
  45  max_hzs_len_ = kMaxSearchSteps;
  46}
  47
  48MatrixSearch::~MatrixSearch() {
  49  free_resource();
  50}
  51
  52void MatrixSearch::reset_pointers_to_null() {
  53  dict_trie_ = NULL;
  54  user_dict_ = NULL;
  55  spl_parser_ = NULL;
  56
  57  share_buf_ = NULL;
  58
  59  // The following four buffers are used for decoding, and they are based on
  60  // share_buf_, no need to delete them.
  61  mtrx_nd_pool_ = NULL;
  62  dmi_pool_ = NULL;
  63  matrix_ = NULL;
  64  dep_ = NULL;
  65
  66  // Based on share_buf_, no need to delete them.
  67  npre_items_ = NULL;
  68}
  69
  70bool MatrixSearch::alloc_resource() {
  71  free_resource();
  72
  73  dict_trie_ = new DictTrie();
  74  user_dict_ = static_cast<AtomDictBase*>(new UserDict());
  75  spl_parser_ = new SpellingParser();
  76
  77  unsigned mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize;
  78  mtrx_nd_size = align_to_unsigned(mtrx_nd_size) / sizeof(unsigned);
  79  unsigned dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize;
  80  dmi_size = align_to_unsigned(dmi_size) / sizeof(unsigned);
  81  unsigned matrix_size = sizeof(MatrixRow) * kMaxRowNum;
  82  matrix_size = align_to_unsigned(matrix_size) / sizeof(unsigned);
  83  unsigned dep_size = sizeof(DictExtPara);
  84  dep_size = align_to_unsigned(dep_size) / sizeof(unsigned);
  85
  86  // share_buf's size is determined by the buffers for search.
  87  share_buf_ = new unsigned[mtrx_nd_size + dmi_size + matrix_size + dep_size];
  88
  89  if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ ||
  90      NULL == share_buf_)
  91    return false;
  92
  93  // The buffers for search are based on the share buffer
  94  mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_);
  95  dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size);
  96  matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size);
  97  dep_ = reinterpret_cast<DictExtPara*>
  98      (share_buf_ + mtrx_nd_size + dmi_size + matrix_size);
  99
 100  // The prediction buffer is also based on the share buffer.
 101  npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_);
 102  npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) *
 103      sizeof(unsigned) / sizeof(NPredictItem);
 104  return true;
 105}
 106
 107void MatrixSearch::free_resource() {
 108  if (NULL != dict_trie_)
 109    delete dict_trie_;
 110
 111  if (NULL != user_dict_)
 112    delete user_dict_;
 113
 114  if (NULL != spl_parser_)
 115    delete spl_parser_;
 116
 117  if (NULL != share_buf_)
 118    delete [] share_buf_;
 119
 120  reset_pointers_to_null();
 121}
 122
 123bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) {
 124  if (NULL == fn_sys_dict || NULL == fn_usr_dict)
 125    return false;
 126
 127  if (!alloc_resource())
 128    return false;
 129
 130  if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd))
 131    return false;
 132
 133  // If engine fails to load the user dictionary, reset the user dictionary
 134  // to NULL.
 135  if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
 136    delete user_dict_;
 137    user_dict_ = NULL;
 138  } else{
 139    user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
 140  }
 141
 142  reset_search0();
 143
 144  inited_ = true;
 145  return true;
 146}
 147
 148bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length,
 149                           const char *fn_usr_dict) {
 150  if (NULL == fn_usr_dict)
 151    return false;
 152
 153  if (!alloc_resource())
 154    return false;
 155
 156  if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd))
 157    return false;
 158
 159  if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
 160    delete user_dict_;
 161    user_dict_ = NULL;
 162  } else {
 163    user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
 164  }
 165
 166  reset_search0();
 167
 168  inited_ = true;
 169  return true;
 170}
 171
 172void MatrixSearch::set_max_lens(unsigned max_sps_len, unsigned max_hzs_len) {
 173  if (0 != max_sps_len)
 174    max_sps_len_ = max_sps_len;
 175  if (0 != max_hzs_len)
 176    max_hzs_len_ = max_hzs_len;
 177}
 178
 179void MatrixSearch::close() {
 180  flush_cache();
 181  free_resource();
 182  inited_ = false;
 183}
 184
 185void MatrixSearch::flush_cache() {
 186  if (NULL != user_dict_)
 187    user_dict_->flush_cache();
 188}
 189
 190void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) {
 191  xi_an_enabled_ = xi_an_enabled;
 192}
 193
 194bool MatrixSearch::get_xi_an_switch() {
 195  return xi_an_enabled_;
 196}
 197
 198bool MatrixSearch::reset_search() {
 199  if (!inited_)
 200    return false;
 201  return reset_search0();
 202}
 203
 204bool MatrixSearch::reset_search0() {
 205    if (!inited_)
 206        return false;
 207
 208    pys_decoded_len_ = 0;
 209    mtrx_nd_pool_used_ = 0;
 210    dmi_pool_used_ = 0;
 211
 212    // Get a MatrixNode from the pool
 213    matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_;
 214    matrix_[0].mtrx_nd_num = 1;
 215    mtrx_nd_pool_used_ += 1;
 216
 217    // Update the node, and make it to be a starting node
 218    MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos;
 219    node->id = 0;
 220    node->score = 0;
 221    node->from = NULL;
 222    node->step = 0;
 223    node->dmi_fr = (PoolPosType)-1;
 224
 225    matrix_[0].dmi_pos = 0;
 226    matrix_[0].dmi_num = 0;
 227    matrix_[0].dmi_has_full_id = 1;
 228    matrix_[0].mtrx_nd_fixed = node;
 229
 230    lma_start_[0] = 0;
 231    fixed_lmas_ = 0;
 232    spl_start_[0] = 0;
 233    fixed_hzs_ = 0;
 234
 235    dict_trie_->reset_milestones(0, 0);
 236    if (NULL != user_dict_)
 237      user_dict_->reset_milestones(0, 0);
 238
 239    return true;
 240}
 241
 242bool MatrixSearch::reset_search(unsigned ch_pos, bool clear_fixed_this_step,
 243                                bool clear_dmi_this_step,
 244                                bool clear_mtrx_this_step) {
 245  if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum)
 246    return false;
 247
 248  if (0 == ch_pos) {
 249    reset_search0();
 250  } else {
 251    // Prepare mile stones of this step to clear.
 252    MileStoneHandle *dict_handles_to_clear = NULL;
 253    if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) {
 254      dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles;
 255    }
 256
 257    // If there are more steps, and this step is not allowed to clear, find
 258    // milestones of next step.
 259    if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) {
 260      dict_handles_to_clear = NULL;
 261      if (matrix_[ch_pos + 1].dmi_num > 0) {
 262        dict_handles_to_clear =
 263            dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles;
 264      }
 265    }
 266
 267    if (NULL != dict_handles_to_clear) {
 268      dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]);
 269      if (NULL != user_dict_)
 270        user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]);
 271    }
 272
 273    pys_decoded_len_ = ch_pos;
 274
 275    if (clear_dmi_this_step) {
 276      dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos
 277                       + matrix_[ch_pos - 1].dmi_num;
 278      matrix_[ch_pos].dmi_num = 0;
 279    } else {
 280      dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num;
 281    }
 282
 283    if (clear_mtrx_this_step) {
 284      mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos
 285                           + matrix_[ch_pos - 1].mtrx_nd_num;
 286      matrix_[ch_pos].mtrx_nd_num = 0;
 287    } else {
 288      mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos
 289                           + matrix_[ch_pos].mtrx_nd_num;
 290    }
 291
 292    // Modify fixed_hzs_
 293    if (fixed_hzs_ > 0 &&
 294        ((kLemmaIdComposing != lma_id_[0]) ||
 295         (kLemmaIdComposing == lma_id_[0] &&
 296          spl_start_[c_phrase_.length] <= ch_pos))) {
 297      unsigned fixed_ch_pos = ch_pos;
 298      if (clear_fixed_this_step)
 299        fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0;
 300      while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0)
 301        fixed_ch_pos--;
 302
 303      fixed_lmas_ = 0;
 304      fixed_hzs_ = 0;
 305      if (fixed_ch_pos > 0) {
 306        while (spl_start_[fixed_hzs_] < fixed_ch_pos)
 307          fixed_hzs_++;
 308        assert(spl_start_[fixed_hzs_] == fixed_ch_pos);
 309
 310        while (lma_start_[fixed_lmas_] < fixed_hzs_)
 311          fixed_lmas_++;
 312        assert(lma_start_[fixed_lmas_] == fixed_hzs_);
 313      }
 314
 315      // Re-search the Pinyin string for the unlocked lemma
 316      // which was previously fixed.
 317      //
 318      // Prepare mile stones of this step to clear.
 319      MileStoneHandle *dict_handles_to_clear = NULL;
 320      if (clear_dmi_this_step && ch_pos == fixed_ch_pos &&
 321          matrix_[fixed_ch_pos].dmi_num > 0) {
 322        dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles;
 323      }
 324
 325      // If there are more steps, and this step is not allowed to clear, find
 326      // milestones of next step.
 327      if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) {
 328        dict_handles_to_clear = NULL;
 329        if (matrix_[fixed_ch_pos + 1].dmi_num > 0) {
 330          dict_handles_to_clear =
 331              dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles;
 332        }
 333      }
 334
 335      if (NULL != dict_handles_to_clear) {
 336        dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]);
 337        if (NULL != user_dict_)
 338          user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]);
 339      }
 340
 341
 342      pys_decoded_len_ = fixed_ch_pos;
 343
 344      if (clear_dmi_this_step && ch_pos == fixed_ch_pos) {
 345        dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos
 346                         + matrix_[fixed_ch_pos - 1].dmi_num;
 347        matrix_[fixed_ch_pos].dmi_num = 0;
 348      } else {
 349        dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos +
 350            matrix_[fixed_ch_pos].dmi_num;
 351      }
 352
 353      if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) {
 354        mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos
 355                             + matrix_[fixed_ch_pos - 1].mtrx_nd_num;
 356        matrix_[fixed_ch_pos].mtrx_nd_num = 0;
 357      } else {
 358        mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos
 359                             + matrix_[fixed_ch_pos].mtrx_nd_num;
 360      }
 361
 362      for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) {
 363        add_char(pys_[re_pos]);
 364      }
 365    } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) {
 366      for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) {
 367        uint16 splpos_begin = c_phrase_.sublma_start[subpos];
 368        uint16 splpos_end = c_phrase_.sublma_start[subpos + 1];
 369        for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) {
 370          // If ch_pos is in this spelling
 371          uint16 spl_start = c_phrase_.spl_start[splpos];
 372          uint16 spl_end = c_phrase_.spl_start[splpos + 1];
 373          if (ch_pos >= spl_start && ch_pos < spl_end) {
 374            // Clear everything after this position
 375            c_phrase_.chn_str[splpos] = static_cast<char16>('\0');
 376            c_phrase_.sublma_start[subpos + 1] = splpos;
 377            c_phrase_.sublma_num = subpos + 1;
 378            c_phrase_.length = splpos;
 379
 380            if (splpos == splpos_begin) {
 381              c_phrase_.sublma_num = subpos;
 382            }
 383          }
 384        }
 385      }
 386
 387      // Extend the composing phrase.
 388      reset_search0();
 389      dmi_c_phrase_ = true;
 390      uint16 c_py_pos = 0;
 391      while (c_py_pos < spl_start_[c_phrase_.length]) {
 392        bool b_ac_tmp = add_char(pys_[c_py_pos]);
 393        assert(b_ac_tmp);
 394        c_py_pos++;
 395      }
 396      dmi_c_phrase_ = false;
 397
 398      lma_id_num_ = 1;
 399      fixed_lmas_ = 1;
 400      fixed_lmas_no1_[0] = 0;  // A composing string is always modified.
 401      fixed_hzs_ = c_phrase_.length;
 402      lma_start_[1] = fixed_hzs_;
 403      lma_id_[0] = kLemmaIdComposing;
 404      matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
 405          matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
 406    }
 407  }
 408
 409  return true;
 410}
 411
 412void MatrixSearch::del_in_pys(unsigned start, unsigned len) {
 413  while (start < kMaxRowNum - len && '\0' != pys_[start]) {
 414    pys_[start] = pys_[start + len];
 415    start++;
 416  }
 417}
 418
 419unsigned MatrixSearch::search(const char *py, unsigned py_len) {
 420  if (!inited_ || NULL == py)
 421    return 0;
 422
 423  // If the search Pinyin string is too long, it will be truncated.
 424  if (py_len > kMaxRowNum - 1)
 425    py_len = kMaxRowNum - 1;
 426
 427  // Compare the new string with the previous one. Find their prefix to
 428  // increase search efficiency.
 429  unsigned ch_pos = 0;
 430  for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) {
 431    if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos])
 432      break;
 433  }
 434
 435  bool clear_fix = true;
 436  if (ch_pos == pys_decoded_len_)
 437    clear_fix = false;
 438
 439  reset_search(ch_pos, clear_fix, false, false);
 440
 441  memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos);
 442  pys_[py_len] = '\0';
 443
 444  while ('\0' != pys_[ch_pos]) {
 445    if (!add_char(py[ch_pos])) {
 446      pys_decoded_len_ = ch_pos;
 447      break;
 448    }
 449    ch_pos++;
 450  }
 451
 452  // Get spelling ids and starting positions.
 453  get_spl_start_id();
 454
 455  // If there are too many spellings, remove the last letter until the spelling
 456  // number is acceptable.
 457  while (spl_id_num_ > 9) {
 458    py_len--;
 459    reset_search(py_len, false, false, false);
 460    pys_[py_len] = '\0';
 461    get_spl_start_id();
 462  }
 463
 464  prepare_candidates();
 465
 466  if (kPrintDebug0) {
 467    printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_);
 468    printf("--DMI Pool Used: %d\n", dmi_pool_used_);
 469
 470    if (kPrintDebug1) {
 471      for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) {
 472        debug_print_dmi(pos, 1);
 473      }
 474    }
 475  }
 476
 477  return ch_pos;
 478}
 479
 480unsigned MatrixSearch::delsearch(unsigned pos, bool is_pos_in_splid,
 481                               bool clear_fixed_this_step) {
 482  if (!inited_)
 483    return 0;
 484
 485  unsigned reset_pos = pos;
 486
 487  // Out of range for both Pinyin mode and Spelling id mode.
 488  if (pys_decoded_len_ <= pos) {
 489    del_in_pys(pos, 1);
 490
 491    reset_pos = pys_decoded_len_;
 492    // Decode the string after the un-decoded position
 493    while ('\0' != pys_[reset_pos]) {
 494      if (!add_char(pys_[reset_pos])) {
 495        pys_decoded_len_ = reset_pos;
 496        break;
 497      }
 498      reset_pos++;
 499    }
 500    get_spl_start_id();
 501    prepare_candidates();
 502    return pys_decoded_len_;
 503  }
 504
 505  // Spelling id mode, but out of range.
 506  if (is_pos_in_splid && spl_id_num_ <= pos)
 507    return pys_decoded_len_;
 508
 509  // Begin to handle two modes respectively.
 510  // Pinyin mode by default
 511  unsigned c_py_len = 0;  // The length of composing phrase's Pinyin
 512  unsigned del_py_len = 1;
 513  if (!is_pos_in_splid) {
 514    // Pinyin mode is only allowed to delete beyond the fixed lemmas.
 515    if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]])
 516      return pys_decoded_len_;
 517
 518    del_in_pys(pos, 1);
 519
 520    // If the deleted character is just the one after the last fixed lemma
 521    if (pos == spl_start_[lma_start_[fixed_lmas_]]) {
 522      // If all fixed lemmas have been merged, and the caller of the function
 523      // request to unlock the last fixed lemma.
 524      if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) {
 525        // Unlock the last sub lemma in the composing phrase. Because it is not
 526        // easy to unlock it directly. Instead, we re-decode the modified
 527        // composing phrase.
 528        c_phrase_.sublma_num--;
 529        c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num];
 530        reset_pos = spl_start_[c_phrase_.length];
 531        c_py_len = reset_pos;
 532      }
 533    }
 534  } else {
 535    del_py_len = spl_start_[pos + 1] - spl_start_[pos];
 536
 537    del_in_pys(spl_start_[pos], del_py_len);
 538
 539    if (pos >= lma_start_[fixed_lmas_]) {
 540      c_py_len = 0;
 541      reset_pos = spl_start_[pos + 1] - del_py_len;
 542    } else {
 543      c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len;
 544      reset_pos = c_py_len;
 545      if (c_py_len > 0)
 546        merge_fixed_lmas(pos);
 547    }
 548  }
 549
 550  if (c_py_len > 0) {
 551    assert(c_phrase_.length > 0 && c_py_len ==
 552        c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]);
 553    // The composing phrase is valid, reset all search space,
 554    // and begin a new search which will only extend the composing
 555    // phrase.
 556    reset_search0();
 557
 558    dmi_c_phrase_ = true;
 559    // Extend the composing phrase.
 560    uint16 c_py_pos = 0;
 561    while (c_py_pos < c_py_len) {
 562      bool b_ac_tmp = add_char(pys_[c_py_pos]);
 563      assert(b_ac_tmp);
 564      c_py_pos++;
 565    }
 566    dmi_c_phrase_ = false;
 567
 568    // Fixd the composing phrase as the first choice.
 569    lma_id_num_ = 1;
 570    fixed_lmas_ = 1;
 571    fixed_lmas_no1_[0] = 0;  // A composing string is always modified.
 572    fixed_hzs_ = c_phrase_.length;
 573    lma_start_[1] = fixed_hzs_;
 574    lma_id_[0] = kLemmaIdComposing;
 575    matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
 576        matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
 577  } else {
 578    // Reseting search only clear pys_decoded_len_, but the string is kept.
 579    reset_search(reset_pos, clear_fixed_this_step, false, false);
 580  }
 581
 582  // Decode the string after the delete position.
 583  while ('\0' != pys_[reset_pos]) {
 584    if (!add_char(pys_[reset_pos])) {
 585      pys_decoded_len_ = reset_pos;
 586      break;
 587    }
 588    reset_pos++;
 589  }
 590
 591  get_spl_start_id();
 592  prepare_candidates();
 593  return pys_decoded_len_;
 594}
 595
 596unsigned MatrixSearch::get_candidate_num() {
 597  if (!inited_ || 0 == pys_decoded_len_ ||
 598      0 == matrix_[pys_decoded_len_].mtrx_nd_num)
 599    return 0;
 600
 601  return 1 + lpi_total_;
 602}
 603
 604char16* MatrixSearch::get_candidate(unsigned cand_id, char16 *cand_str,
 605                                    unsigned max_len) {
 606  if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str)
 607    return NULL;
 608
 609  if (0 == cand_id) {
 610    return get_candidate0(cand_str, max_len, NULL, false);
 611  } else {
 612    cand_id--;
 613  }
 614
 615  // For this case: the current sentence is a word only, and the user fixed it,
 616  // so the result will be fixed to the sentence space, and
 617  // lpi_total_ will be set to 0.
 618  if (0 == lpi_total_) {
 619    return get_candidate0(cand_str, max_len, NULL, false);
 620  }
 621
 622  LemmaIdType id = lpi_items_[cand_id].id;
 623  char16 s[kMaxLemmaSize + 1];
 624
 625  uint16 s_len = lpi_items_[cand_id].lma_len;
 626  if (s_len > 1) {
 627    s_len = get_lemma_str(id, s, kMaxLemmaSize + 1);
 628  } else {
 629    // For a single character, Hanzi is ready.
 630    s[0] = lpi_items_[cand_id].hanzi;
 631    s[1] = static_cast<char16>(0);
 632  }
 633
 634  if (s_len > 0 &&  max_len > s_len) {
 635    utf16_strncpy(cand_str, s, s_len);
 636    cand_str[s_len] = (char16)'\0';
 637    return cand_str;
 638  }
 639
 640  return NULL;
 641}
 642
 643void MatrixSearch::update_dict_freq() {
 644  if (NULL != user_dict_) {
 645    // Update the total frequency of all lemmas, including system lemmas and
 646    // user dictionary lemmas.
 647    unsigned total_freq = user_dict_->get_total_lemma_count();
 648    dict_trie_->set_total_lemma_count_of_others(total_freq);
 649  }
 650}
 651
 652bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to,
 653                                       float score) {
 654  if (lma_to - lma_fr <= 1 || NULL == user_dict_)
 655    return false;
 656
 657  char16 word_str[kMaxLemmaSize + 1];
 658  uint16 spl_ids[kMaxLemmaSize];
 659
 660  uint16 spl_id_fr = 0;
 661
 662  for (uint16 pos = lma_fr; pos < lma_to; pos++) {
 663    LemmaIdType lma_id = lma_id_[pos];
 664    if (is_user_lemma(lma_id)) {
 665      user_dict_->update_lemma(lma_id, 1, true);
 666    }
 667    uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos];
 668    utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len);
 669
 670    uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr,
 671                               kMaxLemmaSize + 1 - spl_id_fr);
 672    assert(tmp == lma_len);
 673
 674    tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true);
 675    if (tmp != lma_len) {
 676      return false;
 677    }
 678
 679    spl_id_fr += lma_len;
 680  }
 681
 682  assert(spl_id_fr <= kMaxLemmaSize);
 683
 684  return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids,
 685                                 spl_id_fr, 1);
 686}
 687
 688void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) {
 689  if (dmi_pos >= dmi_pool_used_) return;
 690
 691  DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
 692
 693  if (1 == nest_level) {
 694    printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos);
 695  }
 696  if (dmi->dict_level > 1) {
 697    debug_print_dmi(dmi->dmi_fr, nest_level + 1);
 698  }
 699  printf("---%d\n", dmi->dict_level);
 700  printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]);
 701  printf(" Spelling : %s, %d\n", SpellingTrie::get_instance().
 702         get_spelling_str(dmi->spl_id), dmi->spl_id);
 703  printf(" Total Pinyin Len: %d\n", dmi->splstr_len);
 704  if (1 == nest_level) {
 705    printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos);
 706  }
 707}
 708
 709bool MatrixSearch::try_add_cand0_to_userdict() {
 710  unsigned new_cand_num = get_candidate_num();
 711  if (fixed_hzs_ > 0 && 1 == new_cand_num) {
 712    float score_from = 0;
 713    uint16 lma_id_from = 0;
 714    uint16 pos = 0;
 715    bool modified = false;
 716    while (pos < fixed_lmas_) {
 717      if (lma_start_[pos + 1] - lma_start_[lma_id_from] >
 718          static_cast<uint16>(kMaxLemmaSize)) {
 719        float score_to_add =
 720            mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
 721            .mtrx_nd_pos].score - score_from;
 722        if (modified) {
 723          score_to_add += 1.0;
 724          if (score_to_add > NGram::kMaxScore) {
 725            score_to_add = NGram::kMaxScore;
 726          }
 727          add_lma_to_userdict(lma_id_from, pos, score_to_add);
 728        }
 729        lma_id_from = pos;
 730        score_from += score_to_add;
 731
 732        // Clear the flag for next user lemma.
 733        modified = false;
 734      }
 735
 736      if (0 == fixed_lmas_no1_[pos]) {
 737        modified = true;
 738      }
 739      pos++;
 740    }
 741
 742    // Single-char word is not allowed to add to userdict.
 743    if (lma_start_[pos] - lma_start_[lma_id_from] > 1) {
 744      float score_to_add =
 745          mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
 746          .mtrx_nd_pos].score - score_from;
 747      if (modified) {
 748        score_to_add += 1.0;
 749        if (score_to_add > NGram::kMaxScore) {
 750          score_to_add = NGram::kMaxScore;
 751        }
 752        add_lma_to_userdict(lma_id_from, pos, score_to_add);
 753      }
 754    }
 755  }
 756  return true;
 757}
 758
 759// Choose a candidate, and give new candidates for next step.
 760// If user finishes selection, we will try to communicate with user dictionary
 761// to add new items or update score of some existing items.
 762//
 763// Basic rule:
 764// 1. If user selects the first choice:
 765//    1.1. If the first choice is not a sentence, instead, it is a lemma:
 766//         1.1.1. If the first choice is a user lemma, notify the user
 767//                dictionary that a user lemma is hit, and add occuring count
 768//                by 1.
 769//         1.1.2. If the first choice is a system lemma, do nothing.
 770//    1.2. If the first choice is a sentence containing more than one lemma:
 771//         1.2.1. The whole sentence will be added as a user lemma. If the
 772//                sentence contains user lemmas, -> hit, and add occuring count
 773//                by 1.
 774unsigned MatrixSearch::choose(unsigned cand_id) {
 775  if (!inited_ || 0 == pys_decoded_len_)
 776    return 0;
 777
 778  if (0 == cand_id) {
 779    fixed_hzs_ = spl_id_num_;
 780    matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
 781        matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
 782    for (unsigned pos = fixed_lmas_; pos < lma_id_num_; pos++) {
 783      fixed_lmas_no1_[pos] = 1;
 784    }
 785    fixed_lmas_ = lma_id_num_;
 786    lpi_total_ = 0;  // Clean all other candidates.
 787
 788    // 1. It is the first choice
 789    if (1 == lma_id_num_) {
 790      // 1.1. The first choice is not a sentence but a lemma
 791      if (is_user_lemma(lma_id_[0])) {
 792        // 1.1.1. The first choice is a user lemma, notify the user dictionary
 793        // that it is hit.
 794        if (NULL != user_dict_)
 795          user_dict_->update_lemma(lma_id_[0], 1, true);
 796      } else {
 797        // 1.1.2. do thing for a system lemma.
 798      }
 799    } else {
 800      // 1.2. The first choice is a sentence.
 801      // 1.2.1 Try to add the whole sentence to user dictionary, the whole
 802      // sentence may be splitted into many items.
 803      if (NULL != user_dict_) {
 804        try_add_cand0_to_userdict();
 805      }
 806    }
 807    update_dict_freq();
 808    return 1;
 809  } else {
 810    cand_id--;
 811  }
 812
 813  // 2. It is not the full sentence candidate.
 814  // Find the length of the candidate.
 815  LemmaIdType id_chosen = lpi_items_[cand_id].id;
 816  LmaScoreType score_chosen = lpi_items_[cand_id].psb;
 817  unsigned cand_len = lpi_items_[cand_id].lma_len;
 818
 819  assert(cand_len > 0);
 820
 821  // Notify the atom dictionary that this item is hit.
 822  if (is_user_lemma(id_chosen)) {
 823    if (NULL != user_dict_) {
 824      user_dict_->update_lemma(id_chosen, 1, true);
 825    }
 826    update_dict_freq();
 827  }
 828
 829  // 3. Fixed the chosen item.
 830  // 3.1 Get the steps number.
 831  unsigned step_fr = spl_start_[fixed_hzs_];
 832  unsigned step_to = spl_start_[fixed_hzs_ + cand_len];
 833
 834  // 3.2 Save the length of the original string.
 835  unsigned pys_decoded_len = pys_decoded_len_;
 836
 837  // 3.2 Reset the space of the fixed part.
 838  reset_search(step_to, false, false, true);
 839
 840  // 3.3 For the last character of the fixed part, the previous DMI
 841  // information will be kept, while the MTRX information will be re-extended,
 842  // and only one node will be extended.
 843  matrix_[step_to].mtrx_nd_num = 0;
 844
 845  LmaPsbItem lpi_item;
 846  lpi_item.psb = score_chosen;
 847  lpi_item.id = id_chosen;
 848
 849  PoolPosType step_to_dmi_fr = match_dmi(step_to,
 850                                         spl_id_ + fixed_hzs_, cand_len);
 851  assert(step_to_dmi_fr != static_cast<PoolPosType>(-1));
 852
 853  extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1,
 854                 step_to_dmi_fr, step_to);
 855
 856  matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos;
 857  mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos +
 858                       matrix_[step_to].mtrx_nd_num;
 859
 860  if (id_chosen == lma_id_[fixed_lmas_])
 861    fixed_lmas_no1_[fixed_lmas_] = 1;
 862  else
 863    fixed_lmas_no1_[fixed_lmas_] = 0;
 864  lma_id_[fixed_lmas_] = id_chosen;
 865  lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len;
 866  fixed_lmas_++;
 867  fixed_hzs_ = fixed_hzs_ + cand_len;
 868
 869  while (step_to != pys_decoded_len) {
 870    bool b = add_char(pys_[step_to]);
 871    assert(b);
 872    step_to++;
 873  }
 874
 875  if (fixed_hzs_ < spl_id_num_) {
 876    prepare_candidates();
 877  } else {
 878    lpi_total_ = 0;
 879    if (NULL != user_dict_) {
 880      try_add_cand0_to_userdict();
 881    }
 882  }
 883
 884  return get_candidate_num();
 885}
 886
 887unsigned MatrixSearch::cancel_last_choice() {
 888  if (!inited_ || 0 == pys_decoded_len_)
 889    return 0;
 890
 891  unsigned step_start = 0;
 892  if (fixed_hzs_ > 0) {
 893    unsigned step_end = spl_start_[fixed_hzs_];
 894    MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed;
 895    assert(NULL != end_node);
 896
 897    step_start = end_node->from->step;
 898
 899    if (step_start > 0) {
 900      DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr;
 901      fixed_hzs_ -= dmi->dict_level;
 902    } else {
 903      fixed_hzs_ = 0;
 904    }
 905
 906    reset_search(step_start, false, false, false);
 907
 908    while (pys_[step_start] != '\0') {
 909      bool b = add_char(pys_[step_start]);
 910      assert(b);
 911      step_start++;
 912    }
 913
 914    prepare_candidates();
 915  }
 916  return get_candidate_num();
 917}
 918
 919unsigned MatrixSearch::get_fixedlen() {
 920  if (!inited_ || 0 == pys_decoded_len_)
 921    return 0;
 922  return fixed_hzs_;
 923}
 924
 925bool MatrixSearch::prepare_add_char(char ch) {
 926  if (pys_decoded_len_ >= kMaxRowNum - 1 ||
 927      (!spl_parser_->is_valid_to_parse(ch) && ch != '\''))
 928    return false;
 929
 930  if (dmi_pool_used_ >= kDmiPoolSize) return false;
 931
 932  pys_[pys_decoded_len_] = ch;
 933  pys_decoded_len_++;
 934
 935  MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_;
 936  mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_;
 937  mtrx_this_row->mtrx_nd_num = 0;
 938  mtrx_this_row->dmi_pos = dmi_pool_used_;
 939  mtrx_this_row->dmi_num = 0;
 940  mtrx_this_row->dmi_has_full_id = 0;
 941
 942  return true;
 943}
 944
 945bool MatrixSearch::is_split_at(uint16 pos) {
 946  return !spl_parser_->is_valid_to_parse(pys_[pos - 1]);
 947}
 948
 949void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
 950                            PoolPosType dmi_fr, uint16 spl_id,
 951                            uint16 node_num, unsigned char dict_level,
 952                            bool splid_end_split, unsigned char splstr_len,
 953                            unsigned char all_full_id) {
 954  dmi->dict_handles[0] = handles[0];
 955  dmi->dict_handles[1] = handles[1];
 956  dmi->dmi_fr = dmi_fr;
 957  dmi->spl_id = spl_id;
 958  dmi->dict_level = dict_level;
 959  dmi->splid_end_split = splid_end_split ? 1 : 0;
 960  dmi->splstr_len = splstr_len;
 961  dmi->all_full_id = all_full_id;
 962  dmi->c_phrase = 0;
 963}
 964
 965bool MatrixSearch::add_char(char ch) {
 966  if (!prepare_add_char(ch))
 967    return false;
 968  return add_char_qwerty();
 969}
 970
 971bool MatrixSearch::add_char_qwerty() {
 972  matrix_[pys_decoded_len_].mtrx_nd_num = 0;
 973
 974  bool spl_matched = false;
 975  uint16 longest_ext = 0;
 976  // Extend the search matrix, from the oldest unfixed row. ext_len means
 977  // extending length.
 978  for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) {
 979    if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_])
 980      continue;
 981
 982    // Refer to the declaration of the variable dmi_has_full_id for the
 983    // explanation of this piece of code. In one word, it is used to prevent
 984    // from the unwise extending of "shoud ou" but allow the reasonable
 985    // extending of "heng ao", "lang a", etc.
 986    if (ext_len > 1 && 0 != longest_ext &&
 987        0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) {
 988      if (xi_an_enabled_)
 989        continue;
 990      else
 991        break;
 992    }
 993
 994    uint16 oldrow = pys_decoded_len_ - ext_len;
 995
 996    // 0. If that row is before the last fixed step, ignore.
 997    if (spl_start_[fixed_hzs_] > oldrow)
 998      continue;
 999
1000    // 1. Check if that old row has valid MatrixNode. If no, means that row is
1001    // not a boundary, either a word boundary or a spelling boundary.
1002    // If it is for extending composing phrase, it's OK to ignore the 0.
1003    if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_)
1004      continue;
1005
1006    // 2. Get spelling id(s) for the last ext_len chars.
1007    uint16 spl_idx;
1008    bool is_pre = false;
1009    spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow,
1010                                            ext_len, &is_pre);
1011    if (is_pre)
1012      spl_matched = true;
1013
1014    if (0 == spl_idx)
1015      continue;
1016
1017    bool splid_end_split = is_split_at(oldrow + ext_len);
1018
1019    // 3. Extend the DMI nodes of that old row
1020    // + 1 is to extend an extra node from the root
1021    for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos;
1022         dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1;
1023         dmi_pos++) {
1024      DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
1025      if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) {
1026        dmi = NULL;  // The last one, NULL means extending from the root.
1027      } else {
1028        // If the dmi is covered by the fixed arrange, ignore it.
1029        if (fixed_hzs_ > 0 &&
1030            pys_decoded_len_ - ext_len - dmi->splstr_len <
1031            spl_start_[fixed_hzs_]) {
1032          continue;
1033        }
1034        // If it is not in mode for composing phrase, and the source DMI node
1035        // is marked for composing phrase, ignore this node.
1036        if (dmi->c_phrase != 0 && !dmi_c_phrase_) {
1037          continue;
1038        }
1039      }
1040
1041      // For example, if "gao" is extended, "g ao" is not allowed.
1042      // or "zh" has been passed, "z h" is not allowed.
1043      // Both word and word-connection will be prevented.
1044      if (longest_ext > ext_len) {
1045        if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) {
1046          continue;
1047        }
1048
1049        // "z h" is not allowed.
1050        if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) {
1051          continue;
1052        }
1053      }
1054
1055      dep_->splids_extended = 0;
1056      if (NULL != dmi) {
1057        uint16 prev_ids_num = dmi->dict_level;
1058        if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) ||
1059            (dmi_c_phrase_ && prev_ids_num >=  kMaxRowNum)) {
1060          continue;
1061        }
1062
1063        DictMatchInfo *d = dmi;
1064        while (d) {
1065          dep_->splids[--prev_ids_num] = d->spl_id;
1066          if ((PoolPosType)-1 == d->dmi_fr)
1067            break;
1068          d = dmi_pool_ + d->dmi_fr;
1069        }
1070        assert(0 == prev_ids_num);
1071        dep_->splids_extended = dmi->dict_level;
1072      }
1073      dep_->splids[dep_->splids_extended] = spl_idx;
1074      dep_->ext_len = ext_len;
1075      dep_->splid_end_split = splid_end_split;
1076
1077      dep_->id_num = 1;
1078      dep_->id_start = spl_idx;
1079      if (spl_trie_->is_half_id(spl_idx)) {
1080        // Get the full id list
1081        dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start));
1082        assert(dep_->id_num > 0);
1083      }
1084
1085      uint16 new_dmi_num;
1086
1087      new_dmi_num = extend_dmi(dep_, dmi);
1088
1089      if (new_dmi_num > 0) {
1090        if (dmi_c_phrase_) {
1091          dmi_pool_[dmi_pool_used_].c_phrase = 1;
1092        }
1093        matrix_[pys_decoded_len_].dmi_num += new_dmi_num;
1094        dmi_pool_used_ += new_dmi_num;
1095
1096        if (!spl_trie_->is_half_id(spl_idx))
1097          matrix_[pys_decoded_len_].dmi_has_full_id = 1;
1098      }
1099
1100      // If get candiate lemmas, try to extend the path
1101      if (lpi_total_ > 0) {
1102        uint16 fr_row;
1103        if (NULL == dmi) {
1104          fr_row = oldrow;
1105        } else {
1106          assert(oldrow >= dmi->splstr_len);
1107          fr_row = oldrow - dmi->splstr_len;
1108        }
1109        for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos;
1110             mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos +
1111             matrix_[fr_row].mtrx_nd_num;
1112             mtrx_nd_pos++) {
1113          MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos;
1114
1115          extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_,
1116                         dmi_pool_used_ - new_dmi_num, pys_decoded_len_);
1117          if (longest_ext == 0)
1118            longest_ext = ext_len;
1119        }
1120      }
1121    }  // for dmi_pos
1122  }  // for ext_len
1123  mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num;
1124
1125  if (dmi_c_phrase_)
1126    return true;
1127
1128  return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched);
1129}
1130
1131void MatrixSearch::prepare_candidates() {
1132  // Get candiates from the first un-fixed step.
1133  uint16 lma_size_max = kMaxLemmaSize;
1134  if (lma_size_max > spl_id_num_ - fixed_hzs_)
1135    lma_size_max = spl_id_num_ - fixed_hzs_;
1136
1137  uint16 lma_size = lma_size_max;
1138
1139  // If the full sentense candidate's unfixed part may be the same with a normal
1140  // lemma. Remove the lemma candidate in this case.
1141  char16 fullsent[kMaxLemmaSize + 1];
1142  char16 *pfullsent = NULL;
1143  uint16 sent_len;
1144  pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true);
1145
1146  // If the unfixed part contains more than one ids, it is not necessary to
1147  // check whether a lemma's string is the same to the unfixed part of the full
1148  // sentence candidate, so, set it to NULL;
1149  if (sent_len > kMaxLemmaSize)
1150    pfullsent = NULL;
1151
1152  lpi_total_ = 0;
1153  unsigned lpi_num_full_match = 0;  // Number of items which are fully-matched.
1154  while (lma_size > 0) {
1155    unsigned lma_num;
1156    lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size,
1157                       lpi_items_ + lpi_total_,
1158                       unsigned(kMaxLmaPsbItems - lpi_total_),
1159                       pfullsent, lma_size == lma_size_max);
1160
1161    if (lma_num > 0) {
1162      lpi_total_ += lma_num;
1163      // For next lemma candidates which are not the longest, it is not
1164      // necessary to compare with the full sentence candiate.
1165      pfullsent = NULL;
1166    }
1167    if (lma_size == lma_size_max) {
1168      lpi_num_full_match = lpi_total_;
1169    }
1170    lma_size--;
1171  }
1172
1173  // Sort those partially-matched items by their unified scores.
1174  myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match,
1175          sizeof(LmaPsbItem), cmp_lpi_with_unified_psb);
1176
1177  if (kPrintDebug0) {
1178    printf("-----Prepare candidates, score:\n");
1179    for (unsigned a = 0; a < lpi_total_; a++) {
1180      printf("[%03d]%d    ", a, lpi_items_[a].psb);
1181      if ((a + 1) % 6 == 0) printf("\n");
1182    }
1183    printf("\n");
1184  }
1185
1186  if (kPrintDebug0) {
1187    printf("--- lpi_total_ = %d\n", lpi_total_);
1188  }
1189}
1190
1191const char* MatrixSearch::get_pystr(unsigned *decoded_len) {
1192  if (!inited_ || NULL == decoded_len)
1193    return NULL;
1194
1195  *decoded_len = pys_decoded_len_;
1196  return pys_;
1197}
1198
1199void MatrixSearch::merge_fixed_lmas(unsigned del_spl_pos) {
1200  if (fixed_lmas_ == 0)
1201    return;
1202  // Update spelling segmentation information first.
1203  spl_id_num_ -= 1;
1204  uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos];
1205  for (unsigned pos = del_spl_pos; pos <= spl_id_num_; pos++) {
1206    spl_start_[pos] = spl_start_[pos + 1] - del_py_len;
1207    if (pos == spl_id_num_)
1208      break;
1209    spl_id_[pos] = spl_id_[pos + 1];
1210  }
1211
1212  // Begin to merge.
1213  uint16 phrase_len = 0;
1214
1215  // Update the spelling ids to the composing phrase.
1216  // We need to convert these ids into full id in the future.
1217  memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16));
1218  memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16));
1219
1220  // If composing phrase has not been created, first merge all fixed
1221  //  lemmas into a composing phrase without deletion.
1222  if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) {
1223    uint16 bp = 1;  // Begin position of real fixed lemmas.
1224    // There is no existing composing phrase.
1225    if (kLemmaIdComposing != lma_id_[0]) {
1226      c_phrase_.sublma_num = 0;
1227      bp = 0;
1228    }
1229
1230    uint16 sub_num = c_phrase_.sublma_num;
1231    for (uint16 pos = bp; pos <= fixed_lmas_; pos++) {
1232      c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos];
1233      if (lma_start_[pos] > del_spl_pos) {
1234        c_phrase_.sublma_start[sub_num + pos - bp] -= 1;
1235      }
1236
1237      if (pos == fixed_lmas_)
1238        break;
1239
1240      uint16 lma_len;
1241      char16 *lma_str = c_phrase_.chn_str +
1242          c_phrase_.sublma_start[sub_num] + phrase_len;
1243
1244      lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len);
1245      assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]);
1246      phrase_len += lma_len;
1247    }
1248    assert(phrase_len == lma_start_[fixed_lmas_]);
1249    c_phrase_.length = phrase_len;  // will be deleted by 1
1250    c_phrase_.sublma_num += fixed_lmas_ - bp;
1251  } else {
1252    for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) {
1253      if (c_phrase_.sublma_start[pos] > del_spl_pos) {
1254        c_phrase_.sublma_start[pos] -= 1;
1255      }
1256    }
1257    phrase_len = c_phrase_.length;
1258  }
1259
1260  assert(phrase_len > 0);
1261  if (1 == phrase_len) {
1262    // After the only one is deleted, nothing will be left.
1263    fixed_lmas_ = 0;
1264    return;
1265  }
1266
1267  // Delete the Chinese character in the merged phrase.
1268  // The corresponding elements in spl_ids and spl_start of the
1269  // phrase have been deleted.
1270  char16 *chn_str = c_phrase_.chn_str + del_spl_pos;
1271  for (uint16 pos = 0;
1272      pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos;
1273      pos++) {
1274    chn_str[pos] = chn_str[pos + 1];
1275  }
1276  c_phrase_.length -= 1;
1277
1278  // If the deleted spelling id is in a sub lemma which contains more than
1279  // one id, del_a_sub will be false; but if the deleted id is in a sub lemma
1280  // which only contains 1 id, the whole sub lemma needs to be deleted, so
1281  // del_a_sub will be true.
1282  bool del_a_sub = false;
1283  for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) {
1284    if (c_phrase_.sublma_start[pos - 1] ==
1285        c_phrase_.sublma_start[pos]) {
1286      del_a_sub = true;
1287    }
1288    if (del_a_sub) {
1289      c_phrase_.sublma_start[pos - 1] =
1290          c_phrase_.sublma_start[pos];
1291    }
1292  }
1293  if (del_a_sub)
1294    c_phrase_.sublma_num -= 1;
1295
1296  return;
1297}
1298
1299void MatrixSearch::get_spl_start_id() {
1300  lma_id_num_ = 0;
1301  lma_start_[0] = 0;
1302
1303  spl_id_num_ = 0;
1304  spl_start_[0] = 0;
1305  if (!inited_ || 0 == pys_decoded_len_ ||
1306      0 == matrix_[pys_decoded_len_].mtrx_nd_num)
1307    return;
1308
1309  // Calculate number of lemmas and spellings
1310  // Only scan those part which is not fixed.
1311  lma_id_num_ = fixed_lmas_;
1312  spl_id_num_ = fixed_hzs_;
1313
1314  MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
1315  while (mtrx_nd != mtrx_nd_pool_) {
1316    if (fixed_hzs_ > 0) {
1317      if (mtrx_nd->step <= spl_start_[fixed_hzs_])
1318        break;
1319    }
1320
1321    // Update the spelling segamentation information
1322    unsigned char word_splstr_len = 0;
1323    PoolPosType dmi_fr = mtrx_nd->dmi_fr;
1324    if ((PoolPosType)-1 != dmi_fr)
1325      word_splstr_len = dmi_pool_[dmi_fr].splstr_len;
1326
1327    while ((PoolPosType)-1 != dmi_fr) {
1328      spl_start_[spl_id_num_ + 1] = mtrx_nd->step -
1329          (word_splstr_len - dmi_pool_[dmi_fr].splstr_len);
1330      spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id;
1331      spl_id_num_++;
1332      dmi_fr = dmi_pool_[dmi_fr].dmi_fr;
1333    }
1334
1335    // Update the lemma segmentation information
1336    lma_start_[lma_id_num_ + 1] = spl_id_num_;
1337    lma_id_[lma_id_num_] = mtrx_nd->id;
1338    lma_id_num_++;
1339
1340    mtrx_nd = mtrx_nd->from;
1341  }
1342
1343  // Reverse the result of spelling info
1344  for (unsigned pos = fixed_hzs_;
1345       pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) {
1346    if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) {
1347      spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
1348      spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1];
1349      spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
1350
1351      spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1];
1352      spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos];
1353      spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1];
1354    }
1355  }
1356
1357  // Reverse the result of lemma info
1358  for (unsigned pos = fixed_lmas_;
1359       pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) {
1360    assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos);
1361
1362    if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) {
1363      lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
1364      lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1];
1365      lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
1366
1367      lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
1368      lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos];
1369      lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
1370    }
1371  }
1372
1373  for (unsigned pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) {
1374    if (pos < lma_id_num_)
1375      lma_start_[pos] = lma_start_[pos - 1] +
1376          (lma_start_[pos] - lma_start_[pos + 1]);
1377    else
1378      lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] -
1379          lma_start_[fixed_lmas_];
1380  }
1381
1382  // Find the last fixed position
1383  fixed_hzs_ = 0;
1384  for (unsigned pos = spl_id_num_; pos > 0; pos--) {
1385    if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) {
1386      fixed_hzs_ = pos;
1387      break;
1388    }
1389  }
1390
1391  return;
1392}
1393
1394unsigned MatrixSearch::get_spl_start(const uint16 *&spl_start) {
1395  get_spl_start_id();
1396  spl_start = spl_start_;
1397  return spl_id_num_;
1398}
1399
1400unsigned MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) {
1401  if (dmi_pool_used_ >= kDmiPoolSize) return 0;
1402
1403  if (dmi_c_phrase_)
1404    return extend_dmi_c(dep, dmi_s);
1405
1406  LpiCache& lpi_cache = LpiCache::get_instance();
1407  uint16 splid = dep->splids[dep->splids_extended];
1408
1409  bool cached = false;
1410  if (0 == dep->splids_extended)
1411    cached = lpi_cache.is_cached(splid);
1412
1413  // 1. If this is a half Id, get its corresponding full starting Id and
1414  // number of full Id.
1415  unsigned ret_val = 0;
1416  PoolPosType mtrx_dmi_fr = (PoolPosType)-1;  // From which dmi node
1417
1418  lpi_total_ = 0;
1419
1420  MileStoneHandle from_h[3];
1421  from_h[0] = 0;
1422  from_h[1] = 0;
1423
1424  if (0 != dep->splids_extended) {
1425    from_h[0] = dmi_s->dict_handles[0];
1426    from_h[1] = dmi_s->dict_handles[1];
1427  }
1428
1429  // 2. Begin exgtending in the system dictionary
1430  unsigned lpi_num = 0;
1431  MileStoneHandle handles[2];
1432  handles[0] = handles[1] = 0;
1433  if (from_h[0] > 0 || NULL == dmi_s) {
1434    handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_,
1435                                         kMaxLmaPsbItems, &lpi_num);
1436  }
1437  if (handles[0] > 0)
1438    lpi_total_ = lpi_num;
1439
1440  if (NULL == dmi_s) {  // from root
1441    assert(0 != handles[0]);
1442    mtrx_dmi_fr = dmi_pool_used_;
1443  }
1444
1445  // 3. Begin extending in the user dictionary
1446  if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) {
1447    handles[1] = user_dict_->extend_dict(from_h[1], dep,
1448                                         lpi_items_ + lpi_total_,
1449                                         kMaxLmaPsbItems - lpi_total_,
1450                                         &lpi_num);
1451    if (handles[1] > 0) {
1452      if (kPrintDebug0) {
1453        for (unsigned t = 0; t < lpi_num; t++) {
1454          printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id,
1455                 lpi_items_[lpi_total_ + t].psb);
1456        }
1457      }
1458      lpi_total_ += lpi_num;
1459    }
1460  }
1461
1462  if (0 != handles[0] || 0 != handles[1]) {
1463    if (dmi_pool_used_ >= kDmiPoolSize) return 0;
1464
1465    DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
1466    if (NULL == dmi_s) {
1467      fill_dmi(dmi_add, handles,
1468               (PoolPosType)-1, splid,
1469               1, 1, dep->splid_end_split, dep->ext_len,
1470               spl_trie_->is_half_id(splid) ? 0 : 1);
1471    } else {
1472      fill_dmi(dmi_add, handles,
1473               dmi_s - dmi_pool_, splid, 1,
1474               dmi_s->dict_level + 1, dep->splid_end_split,
1475               dmi_s->splstr_len + dep->ext_len,
1476               spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
1477    }
1478
1479    ret_val = 1;
1480  }
1481
1482  if (!cached) {
1483    if (0 == lpi_total_)
1484      return ret_val;
1485
1486    if (kPrintDebug0) {
1487      printf("--- lpi_total_ = %d\n", lpi_total_);
1488    }
1489
1490    myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb);
1491    if (NULL == dmi_s && spl_trie_->is_half_id(splid))
1492      lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_);
1493  } else {
1494    assert(spl_trie_->is_half_id(splid));
1495    lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems);
1496  }
1497
1498  return ret_val;
1499}
1500
1501unsigned MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) {
1502  lpi_total_ = 0;
1503
1504  uint16 pos = dep->splids_extended;
1505  assert(dmi_c_phrase_);
1506  if (pos >= c_phrase_.length)
1507    return 0;
1508
1509  uint16 splid = dep->splids[pos];
1510  if (splid == c_phrase_.spl_ids[pos]) {
1511    DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
1512    MileStoneHandle handles[2];  // Actually never used.
1513    if (NULL == dmi_s)
1514      fill_dmi(dmi_add, handles,
1515               (PoolPosType)-1, splid,
1516               1, 1, dep->splid_end_split, dep->ext_len,
1517               spl_trie_->is_half_id(splid) ? 0 : 1);
1518    else
1519      fill_dmi(dmi_add, handles,
1520               dmi_s - dmi_pool_, splid, 1,
1521               dmi_s->dict_level + 1, dep->splid_end_split,
1522               dmi_s->splstr_len + dep->ext_len,
1523               spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
1524
1525    if (pos == c_phrase_.length - 1) {
1526      lpi_items_[0].id = kLemmaIdComposing;
1527      lpi_items_[0].psb = 0;  // 0 is bigger than normal lemma score.
1528      lpi_total_ = 1;
1529    }
1530    return 1;
1531  }
1532  return 0;
1533}
1534
1535unsigned MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
1536                                    unsigned lpi_num, PoolPosType dmi_fr,
1537                                    unsigned res_row) {
1538  assert(NULL != mtrx_nd);
1539  matrix_[res_row].mtrx_nd_fixed = NULL;
1540
1541  if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow)
1542    return 0;
1543
1544  if (0 == mtrx_nd->step) {
1545    // Because the list is sorted, if the source step is 0, it is only
1546    // necessary to pick up the first kMaxNodeARow items.
1547    if (lpi_num > kMaxNodeARow)
1548      lpi_num = kMaxNodeARow;
1549  }
1550
1551  MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos;
1552  for (unsigned pos = 0; pos < lpi_num; pos++) {
1553    float score = mtrx_nd->score + lpi_items[pos].psb;
1554    if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score)
1555      break;
1556
1557    // Try to add a new node
1558    unsigned mtrx_nd_num = matrix_[res_row].mtrx_nd_num;
1559    MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num;
1560    bool replace = false;
1561    // Find its position
1562    while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) {
1563      if (static_cast<unsigned>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow)
1564        *mtrx_nd_res = *(mtrx_nd_res - 1);
1565      mtrx_nd_res--;
1566      replace = true;
1567    }
1568    if (replace || (mtrx_nd_num < kMaxNodeARow &&
1569        matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) {
1570      mtrx_nd_res->id = lpi_items[pos].id;
1571      mtrx_nd_res->score = score;
1572      mtrx_nd_res->from = mtrx_nd;
1573      mtrx_nd_res->dmi_fr = dmi_fr;
1574      mtrx_nd_res->step = res_row;
1575      if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow)
1576        matrix_[res_row].mtrx_nd_num++;
1577    }
1578  }
1579  return matrix_[res_row].mtrx_nd_num;
1580}
1581
1582PoolPosType MatrixSearch::match_dmi(unsigned step_to, uint16 spl_ids[],
1583                                    uint16 spl_id_num) {
1584  if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) {
1585    return static_cast<PoolPosType>(-1);
1586  }
1587
1588  for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) {
1589    DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos;
1590
1591    if (dmi->dict_level != spl_id_num)
1592      continue;
1593
1594    bool matched = true;
1595    for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) {
1596      if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) {
1597        matched = false;
1598        break;
1599      }
1600
1601      dmi = dmi_pool_ + dmi->dmi_fr;
1602    }
1603    if (matched) {
1604      return matrix_[step_to].dmi_pos + dmi_pos;
1605  

Large files files are truncated, but you can click here to view the full file