PageRenderTime 74ms CodeModel.GetById 20ms app.highlight 34ms RepoModel.GetById 17ms app.codeStats 0ms

/src/im/gpinyin/share/splparser.cpp

http://ftk.googlecode.com/
C++ | 341 lines | 256 code | 54 blank | 31 comment | 100 complexity | 7ca8a72bb9bd8cb9cfa5b7d4f8cf4d13 MD5 | raw file
  1/*
  2 * Copyright (C) 2009 The Android Open Source Project
  3 *
  4 * Licensed under the Apache License, Version 2.0 (the "License");
  5 * you may not use this file except in compliance with the License.
  6 * You may obtain a copy of the License at
  7 *
  8 *      http://www.apache.org/licenses/LICENSE-2.0
  9 *
 10 * Unless required by applicable law or agreed to in writing, software
 11 * distributed under the License is distributed on an "AS IS" BASIS,
 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 * See the License for the specific language governing permissions and
 14 * limitations under the License.
 15 */
 16
 17#include <assert.h>
 18#include "../include/splparser.h"
 19
 20namespace ime_pinyin {
 21
 22SpellingParser::SpellingParser() {
 23  spl_trie_ = SpellingTrie::get_cpinstance();
 24}
 25
 26bool SpellingParser::is_valid_to_parse(char ch) {
 27  return SpellingTrie::is_valid_spl_char(ch);
 28}
 29
 30uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len,
 31                                      uint16 spl_idx[], uint16 start_pos[],
 32                                      uint16 max_size, bool &last_is_pre) {
 33  if (NULL == splstr || 0 == max_size || 0 == str_len)
 34    return 0;
 35
 36  if (!SpellingTrie::is_valid_spl_char(splstr[0]))
 37    return 0;
 38
 39  last_is_pre = false;
 40
 41  const SpellingNode *node_this = spl_trie_->root_;
 42
 43  uint16 str_pos = 0;
 44  uint16 idx_num = 0;
 45  if (NULL != start_pos)
 46    start_pos[0] = 0;
 47  bool last_is_splitter = false;
 48
 49  while (str_pos < str_len) {
 50    char char_this = splstr[str_pos];
 51    // all characters outside of [a, z] are considered as splitters
 52    if (!SpellingTrie::is_valid_spl_char(char_this)) {
 53      // test if the current node is endable
 54      uint16 id_this = node_this->spelling_idx;
 55      if (spl_trie_->if_valid_id_update(&id_this)) {
 56        spl_idx[idx_num] = id_this;
 57
 58        idx_num++;
 59        str_pos++;
 60        if (NULL != start_pos)
 61          start_pos[idx_num] = str_pos;
 62        if (idx_num >= max_size)
 63          return idx_num;
 64
 65        node_this = spl_trie_->root_;
 66        last_is_splitter = true;
 67        continue;
 68      } else {
 69        if (last_is_splitter) {
 70          str_pos++;
 71          if (NULL != start_pos)
 72            start_pos[idx_num] = str_pos;
 73          continue;
 74        } else {
 75          return idx_num;
 76        }
 77      }
 78    }
 79
 80    last_is_splitter = false;
 81
 82    SpellingNode *found_son = NULL;
 83
 84    if (0 == str_pos) {
 85      if (char_this >= 'a')
 86        found_son = spl_trie_->level1_sons_[char_this - 'a'];
 87      else
 88        found_son = spl_trie_->level1_sons_[char_this - 'A'];
 89    } else {
 90      SpellingNode *first_son = node_this->first_son;
 91      // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
 92      // frequently used, so we scan from the end.
 93      for (int i = 0; i < node_this->num_of_son; i++) {
 94        SpellingNode *this_son = first_son + i;
 95        if (SpellingTrie::is_same_spl_char(
 96            this_son->char_this_node, char_this)) {
 97          found_son = this_son;
 98          break;
 99        }
100      }
101    }
102
103    // found, just move the current node pointer to the the son
104    if (NULL != found_son) {
105      node_this = found_son;
106    } else {
107      // not found, test if it is endable
108      uint16 id_this = node_this->spelling_idx;
109      if (spl_trie_->if_valid_id_update(&id_this)) {
110        // endable, remember the index
111        spl_idx[idx_num] = id_this;
112
113        idx_num++;
114        if (NULL != start_pos)
115          start_pos[idx_num] = str_pos;
116        if (idx_num >= max_size)
117          return idx_num;
118        node_this = spl_trie_->root_;
119        continue;
120      } else {
121        return idx_num;
122      }
123    }
124
125    str_pos++;
126  }
127
128  uint16 id_this = node_this->spelling_idx;
129  if (spl_trie_->if_valid_id_update(&id_this)) {
130    // endable, remember the index
131    spl_idx[idx_num] = id_this;
132
133    idx_num++;
134    if (NULL != start_pos)
135      start_pos[idx_num] = str_pos;
136  }
137
138  last_is_pre = !last_is_splitter;
139
140  return idx_num;
141}
142
143uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len,
144                                        uint16 spl_idx[], uint16 start_pos[],
145                                        uint16 max_size, bool &last_is_pre) {
146  uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos,
147                                  max_size, last_is_pre);
148  for (uint16 pos = 0; pos < idx_num; pos++) {
149    if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
150      spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
151      if (pos == idx_num - 1) {
152        last_is_pre = false;
153      }
154    }
155  }
156  return idx_num;
157}
158
159uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len,
160                                        uint16 spl_idx[], uint16 start_pos[],
161                                        uint16 max_size, bool &last_is_pre) {
162  if (NULL == splstr || 0 == max_size || 0 == str_len)
163    return 0;
164
165  if (!SpellingTrie::is_valid_spl_char(splstr[0]))
166    return 0;
167
168  last_is_pre = false;
169
170  const SpellingNode *node_this = spl_trie_->root_;
171
172  uint16 str_pos = 0;
173  uint16 idx_num = 0;
174  if (NULL != start_pos)
175    start_pos[0] = 0;
176  bool last_is_splitter = false;
177
178  while (str_pos < str_len) {
179    char16 char_this = splstr[str_pos];
180    // all characters outside of [a, z] are considered as splitters
181    if (!SpellingTrie::is_valid_spl_char(char_this)) {
182      // test if the current node is endable
183      uint16 id_this = node_this->spelling_idx;
184      if (spl_trie_->if_valid_id_update(&id_this)) {
185        spl_idx[idx_num] = id_this;
186
187        idx_num++;
188        str_pos++;
189        if (NULL != start_pos)
190          start_pos[idx_num] = str_pos;
191        if (idx_num >= max_size)
192          return idx_num;
193
194        node_this = spl_trie_->root_;
195        last_is_splitter = true;
196        continue;
197      } else {
198        if (last_is_splitter) {
199          str_pos++;
200          if (NULL != start_pos)
201            start_pos[idx_num] = str_pos;
202          continue;
203        } else {
204          return idx_num;
205        }
206      }
207    }
208
209    last_is_splitter = false;
210
211    SpellingNode *found_son = NULL;
212
213    if (0 == str_pos) {
214      if (char_this >= 'a')
215        found_son = spl_trie_->level1_sons_[char_this - 'a'];
216      else
217        found_son = spl_trie_->level1_sons_[char_this - 'A'];
218    } else {
219      SpellingNode *first_son = node_this->first_son;
220      // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
221      // frequently used, so we scan from the end.
222      for (int i = 0; i < node_this->num_of_son; i++) {
223        SpellingNode *this_son = first_son + i;
224        if (SpellingTrie::is_same_spl_char(
225            this_son->char_this_node, char_this)) {
226          found_son = this_son;
227          break;
228        }
229      }
230    }
231
232    // found, just move the current node pointer to the the son
233    if (NULL != found_son) {
234      node_this = found_son;
235    } else {
236      // not found, test if it is endable
237      uint16 id_this = node_this->spelling_idx;
238      if (spl_trie_->if_valid_id_update(&id_this)) {
239        // endable, remember the index
240        spl_idx[idx_num] = id_this;
241
242        idx_num++;
243        if (NULL != start_pos)
244          start_pos[idx_num] = str_pos;
245        if (idx_num >= max_size)
246          return idx_num;
247        node_this = spl_trie_->root_;
248        continue;
249      } else {
250        return idx_num;
251      }
252    }
253
254    str_pos++;
255  }
256
257  uint16 id_this = node_this->spelling_idx;
258  if (spl_trie_->if_valid_id_update(&id_this)) {
259    // endable, remember the index
260    spl_idx[idx_num] = id_this;
261
262    idx_num++;
263    if (NULL != start_pos)
264      start_pos[idx_num] = str_pos;
265  }
266
267  last_is_pre = !last_is_splitter;
268
269  return idx_num;
270}
271
272uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len,
273                                          uint16 spl_idx[], uint16 start_pos[],
274                                          uint16 max_size, bool &last_is_pre) {
275  uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos,
276                                    max_size, last_is_pre);
277  for (uint16 pos = 0; pos < idx_num; pos++) {
278    if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
279      spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
280      if (pos == idx_num - 1) {
281        last_is_pre = false;
282      }
283    }
284  }
285  return idx_num;
286}
287
288uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len,
289                                        bool *is_pre) {
290  if (NULL == is_pre)
291    return 0;
292
293  uint16 spl_idx[2];
294  uint16 start_pos[3];
295
296  if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
297    return 0;
298
299  if (start_pos[1] != str_len)
300    return 0;
301  return spl_idx[0];
302}
303
304uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len,
305                                          bool *is_pre) {
306  if (NULL == is_pre)
307    return 0;
308
309  uint16 spl_idx[2];
310  uint16 start_pos[3];
311
312  if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
313    return 0;
314
315  if (start_pos[1] != str_len)
316    return 0;
317  if (spl_trie_->is_half_id_yunmu(spl_idx[0])) {
318    spl_trie_->half_to_full(spl_idx[0], spl_idx);
319    *is_pre = false;
320  }
321
322  return spl_idx[0];
323}
324
325uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len,
326    uint16 splidx[], uint16 max_size,
327    uint16 &full_id_num, bool &is_pre) {
328  if (max_size <= 0 || !is_valid_to_parse(splstr[0]))
329    return 0;
330
331  splidx[0] = get_splid_by_str(splstr, str_len, &is_pre);
332  full_id_num = 0;
333  if (0 != splidx[0]) {
334    if (splidx[0] >= kFullSplIdStart)
335      full_id_num = 1;
336    return 1;
337  }
338  return 0;
339}
340
341}  // namespace ime_pinyin