/src/im/gpinyin/share/splparser.cpp

http://ftk.googlecode.com/ · C++ · 341 lines · 256 code · 54 blank · 31 comment · 100 complexity · 7ca8a72bb9bd8cb9cfa5b7d4f8cf4d13 MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <assert.h>
  17. #include "../include/splparser.h"
  18. namespace ime_pinyin {
  19. SpellingParser::SpellingParser() {
  20. spl_trie_ = SpellingTrie::get_cpinstance();
  21. }
  22. bool SpellingParser::is_valid_to_parse(char ch) {
  23. return SpellingTrie::is_valid_spl_char(ch);
  24. }
  25. uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len,
  26. uint16 spl_idx[], uint16 start_pos[],
  27. uint16 max_size, bool &last_is_pre) {
  28. if (NULL == splstr || 0 == max_size || 0 == str_len)
  29. return 0;
  30. if (!SpellingTrie::is_valid_spl_char(splstr[0]))
  31. return 0;
  32. last_is_pre = false;
  33. const SpellingNode *node_this = spl_trie_->root_;
  34. uint16 str_pos = 0;
  35. uint16 idx_num = 0;
  36. if (NULL != start_pos)
  37. start_pos[0] = 0;
  38. bool last_is_splitter = false;
  39. while (str_pos < str_len) {
  40. char char_this = splstr[str_pos];
  41. // all characters outside of [a, z] are considered as splitters
  42. if (!SpellingTrie::is_valid_spl_char(char_this)) {
  43. // test if the current node is endable
  44. uint16 id_this = node_this->spelling_idx;
  45. if (spl_trie_->if_valid_id_update(&id_this)) {
  46. spl_idx[idx_num] = id_this;
  47. idx_num++;
  48. str_pos++;
  49. if (NULL != start_pos)
  50. start_pos[idx_num] = str_pos;
  51. if (idx_num >= max_size)
  52. return idx_num;
  53. node_this = spl_trie_->root_;
  54. last_is_splitter = true;
  55. continue;
  56. } else {
  57. if (last_is_splitter) {
  58. str_pos++;
  59. if (NULL != start_pos)
  60. start_pos[idx_num] = str_pos;
  61. continue;
  62. } else {
  63. return idx_num;
  64. }
  65. }
  66. }
  67. last_is_splitter = false;
  68. SpellingNode *found_son = NULL;
  69. if (0 == str_pos) {
  70. if (char_this >= 'a')
  71. found_son = spl_trie_->level1_sons_[char_this - 'a'];
  72. else
  73. found_son = spl_trie_->level1_sons_[char_this - 'A'];
  74. } else {
  75. SpellingNode *first_son = node_this->first_son;
  76. // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
  77. // frequently used, so we scan from the end.
  78. for (int i = 0; i < node_this->num_of_son; i++) {
  79. SpellingNode *this_son = first_son + i;
  80. if (SpellingTrie::is_same_spl_char(
  81. this_son->char_this_node, char_this)) {
  82. found_son = this_son;
  83. break;
  84. }
  85. }
  86. }
  87. // found, just move the current node pointer to the the son
  88. if (NULL != found_son) {
  89. node_this = found_son;
  90. } else {
  91. // not found, test if it is endable
  92. uint16 id_this = node_this->spelling_idx;
  93. if (spl_trie_->if_valid_id_update(&id_this)) {
  94. // endable, remember the index
  95. spl_idx[idx_num] = id_this;
  96. idx_num++;
  97. if (NULL != start_pos)
  98. start_pos[idx_num] = str_pos;
  99. if (idx_num >= max_size)
  100. return idx_num;
  101. node_this = spl_trie_->root_;
  102. continue;
  103. } else {
  104. return idx_num;
  105. }
  106. }
  107. str_pos++;
  108. }
  109. uint16 id_this = node_this->spelling_idx;
  110. if (spl_trie_->if_valid_id_update(&id_this)) {
  111. // endable, remember the index
  112. spl_idx[idx_num] = id_this;
  113. idx_num++;
  114. if (NULL != start_pos)
  115. start_pos[idx_num] = str_pos;
  116. }
  117. last_is_pre = !last_is_splitter;
  118. return idx_num;
  119. }
  120. uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len,
  121. uint16 spl_idx[], uint16 start_pos[],
  122. uint16 max_size, bool &last_is_pre) {
  123. uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos,
  124. max_size, last_is_pre);
  125. for (uint16 pos = 0; pos < idx_num; pos++) {
  126. if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
  127. spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
  128. if (pos == idx_num - 1) {
  129. last_is_pre = false;
  130. }
  131. }
  132. }
  133. return idx_num;
  134. }
  135. uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len,
  136. uint16 spl_idx[], uint16 start_pos[],
  137. uint16 max_size, bool &last_is_pre) {
  138. if (NULL == splstr || 0 == max_size || 0 == str_len)
  139. return 0;
  140. if (!SpellingTrie::is_valid_spl_char(splstr[0]))
  141. return 0;
  142. last_is_pre = false;
  143. const SpellingNode *node_this = spl_trie_->root_;
  144. uint16 str_pos = 0;
  145. uint16 idx_num = 0;
  146. if (NULL != start_pos)
  147. start_pos[0] = 0;
  148. bool last_is_splitter = false;
  149. while (str_pos < str_len) {
  150. char16 char_this = splstr[str_pos];
  151. // all characters outside of [a, z] are considered as splitters
  152. if (!SpellingTrie::is_valid_spl_char(char_this)) {
  153. // test if the current node is endable
  154. uint16 id_this = node_this->spelling_idx;
  155. if (spl_trie_->if_valid_id_update(&id_this)) {
  156. spl_idx[idx_num] = id_this;
  157. idx_num++;
  158. str_pos++;
  159. if (NULL != start_pos)
  160. start_pos[idx_num] = str_pos;
  161. if (idx_num >= max_size)
  162. return idx_num;
  163. node_this = spl_trie_->root_;
  164. last_is_splitter = true;
  165. continue;
  166. } else {
  167. if (last_is_splitter) {
  168. str_pos++;
  169. if (NULL != start_pos)
  170. start_pos[idx_num] = str_pos;
  171. continue;
  172. } else {
  173. return idx_num;
  174. }
  175. }
  176. }
  177. last_is_splitter = false;
  178. SpellingNode *found_son = NULL;
  179. if (0 == str_pos) {
  180. if (char_this >= 'a')
  181. found_son = spl_trie_->level1_sons_[char_this - 'a'];
  182. else
  183. found_son = spl_trie_->level1_sons_[char_this - 'A'];
  184. } else {
  185. SpellingNode *first_son = node_this->first_son;
  186. // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
  187. // frequently used, so we scan from the end.
  188. for (int i = 0; i < node_this->num_of_son; i++) {
  189. SpellingNode *this_son = first_son + i;
  190. if (SpellingTrie::is_same_spl_char(
  191. this_son->char_this_node, char_this)) {
  192. found_son = this_son;
  193. break;
  194. }
  195. }
  196. }
  197. // found, just move the current node pointer to the the son
  198. if (NULL != found_son) {
  199. node_this = found_son;
  200. } else {
  201. // not found, test if it is endable
  202. uint16 id_this = node_this->spelling_idx;
  203. if (spl_trie_->if_valid_id_update(&id_this)) {
  204. // endable, remember the index
  205. spl_idx[idx_num] = id_this;
  206. idx_num++;
  207. if (NULL != start_pos)
  208. start_pos[idx_num] = str_pos;
  209. if (idx_num >= max_size)
  210. return idx_num;
  211. node_this = spl_trie_->root_;
  212. continue;
  213. } else {
  214. return idx_num;
  215. }
  216. }
  217. str_pos++;
  218. }
  219. uint16 id_this = node_this->spelling_idx;
  220. if (spl_trie_->if_valid_id_update(&id_this)) {
  221. // endable, remember the index
  222. spl_idx[idx_num] = id_this;
  223. idx_num++;
  224. if (NULL != start_pos)
  225. start_pos[idx_num] = str_pos;
  226. }
  227. last_is_pre = !last_is_splitter;
  228. return idx_num;
  229. }
  230. uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len,
  231. uint16 spl_idx[], uint16 start_pos[],
  232. uint16 max_size, bool &last_is_pre) {
  233. uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos,
  234. max_size, last_is_pre);
  235. for (uint16 pos = 0; pos < idx_num; pos++) {
  236. if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
  237. spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
  238. if (pos == idx_num - 1) {
  239. last_is_pre = false;
  240. }
  241. }
  242. }
  243. return idx_num;
  244. }
  245. uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len,
  246. bool *is_pre) {
  247. if (NULL == is_pre)
  248. return 0;
  249. uint16 spl_idx[2];
  250. uint16 start_pos[3];
  251. if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
  252. return 0;
  253. if (start_pos[1] != str_len)
  254. return 0;
  255. return spl_idx[0];
  256. }
  257. uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len,
  258. bool *is_pre) {
  259. if (NULL == is_pre)
  260. return 0;
  261. uint16 spl_idx[2];
  262. uint16 start_pos[3];
  263. if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
  264. return 0;
  265. if (start_pos[1] != str_len)
  266. return 0;
  267. if (spl_trie_->is_half_id_yunmu(spl_idx[0])) {
  268. spl_trie_->half_to_full(spl_idx[0], spl_idx);
  269. *is_pre = false;
  270. }
  271. return spl_idx[0];
  272. }
  273. uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len,
  274. uint16 splidx[], uint16 max_size,
  275. uint16 &full_id_num, bool &is_pre) {
  276. if (max_size <= 0 || !is_valid_to_parse(splstr[0]))
  277. return 0;
  278. splidx[0] = get_splid_by_str(splstr, str_len, &is_pre);
  279. full_id_num = 0;
  280. if (0 != splidx[0]) {
  281. if (splidx[0] >= kFullSplIdStart)
  282. full_id_num = 1;
  283. return 1;
  284. }
  285. return 0;
  286. }
  287. } // namespace ime_pinyin