/src/im/gpinyin/include/matrixsearch.h

http://ftk.googlecode.com/ · C++ Header · 456 lines · 152 code · 85 blank · 219 comment · 0 complexity · c0d25ecd3e7b3b40d36f859f8d8dc6e1 MD5 · raw file

  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
  17. #define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
  18. #include <stdlib.h>
  19. #include "./atomdictbase.h"
  20. #include "./dicttrie.h"
  21. #include "./searchutility.h"
  22. #include "./spellingtrie.h"
  23. #include "./splparser.h"
  24. namespace ime_pinyin {
  25. static const unsigned kMaxRowNum = kMaxSearchSteps;  // Capacity of the fixed-size per-step arrays declared in this header.
  26. typedef struct {  // One dictionary-match ("DMI") record; MatrixSearch keeps these in dmi_pool_.
  27. // MileStoneHandle objects for the system and user dictionaries.
  28. MileStoneHandle dict_handles[2];
  29. // From which DMI node. -1 means it's from root.
  30. PoolPosType dmi_fr;
  31. // The spelling id for the Pinyin string from the previous DMI to this node.
  32. // If it is a half id like Shengmu, the node pointed by dict_node is the first
  33. // node with this Shengmu. NOTE(review): no dict_node field is declared in this struct - confirm what it refers to.
  34. uint16 spl_id;
  35. // What's the level of the dict node. Level of root is 0, but root is never
  36. // recorded by dict_node.
  37. unsigned char dict_level:7;
  38. // If this node is for composing phrase, this bit is 1.
  39. unsigned char c_phrase:1;
  40. // Whether the spl_id is parsed with a split character at the end.
  41. unsigned char splid_end_split:1;
  42. // What's the length of the spelling string for this match, for the whole
  43. // word.
  44. unsigned char splstr_len:7;
  45. // Used to indicate whether all spelling ids from the root are full spelling
  46. // ids. This information is useful for keymapping mode (not finished). Because
  47. // in this mode, there are no clear boundaries, we prefer those results which
  48. // have full spelling ids.
  49. unsigned char all_full_id:1;
  50. } DictMatchInfo, *PDictMatchInfo;  // NOTE(review): the bit-field widths pair up as 7+1 and 1+7 bits, presumably to share bytes; actual packing is implementation-defined.
  51. typedef struct MatrixNode {  // One node of the search matrix; MatrixSearch keeps these in mtrx_nd_pool_.
  52. LemmaIdType id;  // Lemma id carried by this node.
  53. float score;  // NOTE(review): presumably the score of the path ending at this node - confirm in the implementation.
  54. MatrixNode *from;  // NOTE(review): presumably the predecessor node this one was extended from - confirm.
  55. // From which DMI node. Used to trace the spelling segmentation.
  56. PoolPosType dmi_fr;
  57. uint16 step;  // NOTE(review): presumably the step (row) this node belongs to - confirm.
  58. } MatrixNode, *PMatrixNode;
  59. typedef struct {  // One row (step) of the search matrix; MatrixSearch::matrix_ points to these.
  60. // The MatrixNode position in the matrix pool
  61. PoolPosType mtrx_nd_pos;
  62. // The DictMatchInfo position in the DictMatchInfo pool.
  63. PoolPosType dmi_pos;
  64. uint16 mtrx_nd_num;  // NOTE(review): presumably the number of MatrixNodes of this row, starting at mtrx_nd_pos - confirm.
  65. uint16 dmi_num:15;  // NOTE(review): presumably the number of DMI nodes of this row, starting at dmi_pos - confirm.
  66. // Used to indicate whether there are dmi nodes in this step with full
  67. // spelling id. This information is used to decide whether a substring of a
  68. // valid Pinyin should be extended.
  69. //
  70. // Example1: shoudao
  71. // When the last char 'o' is added, the parser will find "dao" is a valid
  72. // Pinyin, and because all dmi nodes at location 'd' (including those for
  73. // "shoud", and those for "d") have Shengmu id only, it is not necessary
  74. // to extend "ao", otherwise the result may be "shoud ao", which is not
  75. // reasonable.
  76. //
  77. // Example2: hengao
  78. // When the last 'o' is added, the parser finds "gao" is a valid Pinyin.
  79. // Because some dmi nodes at 'g' have Shengmu ids (hen'g and g), but some dmi
  80. // nodes at 'g' have full ids ('heng'), it is necessary to extend "ao", thus
  81. // "heng ao" can also be the result.
  82. //
  83. // Similarly, "ganga" is expanded to "gang a".
  84. //
  85. // For Pinyin string "xian", because "xian" is a valid Pinyin, and because all dmi
  86. // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it
  87. // is not valid either). If the parser uses break in the loop, the result
  88. // will always be "xian"; but if the parser uses continue in the loop, "xi an" will
  89. // also be tried. This behaviour can be set via the function
  90. // set_xi_an_switch().
  91. uint16 dmi_has_full_id:1;
  92. // Points to a MatrixNode of the current step to indicate which choice the
  93. // user selects.
  94. MatrixNode *mtrx_nd_fixed;
  95. } MatrixRow, *PMatrixRow;
  96. // When user inputs and selects candidates, the fixed lemma ids are stored in
  97. // lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many
  98. // lemmas from the beginning are fixed. If user deletes Pinyin characters one
  99. // by one from the end, these fixed lemmas can be unlocked one by one when
  100. // necessary. Whenever user deletes a Chinese character and its spelling string
  101. // in these fixed lemmas, all fixed lemmas will be merged together into a unit
  102. // named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing
  103. // phrase will be the first lemma in the sentence. Because it contains some
  104. // modified lemmas (by deleting a character), these merged lemmas are called
  105. // sub lemmas (sublma), and each of them is represented individually, so that
  106. // when user deletes Pinyin characters from the end, these sub lemmas can also
  107. // be unlocked one by one.
  108. typedef struct {
  109. uint16 spl_ids[kMaxRowNum];  // Spelling ids of the phrase.
  110. uint16 spl_start[kMaxRowNum];  // NOTE(review): presumably the start position of each spelling in the Pinyin string - confirm.
  111. char16 chn_str[kMaxRowNum]; // Chinese string.
  112. uint16 sublma_start[kMaxRowNum]; // Counted in Chinese characters.
  113. unsigned sublma_num;  // NOTE(review): presumably the number of sub lemmas, i.e. used entries of sublma_start - confirm.
  114. uint16 length; // Counted in Chinese characters.
  115. } ComposingPhrase, *TComposingPhrase;  // NOTE: this pointer typedef is T-prefixed, unlike the P-prefixed ones of the sibling structs.
  116. class MatrixSearch {  // The Pinyin decoding engine; it works on a system dictionary (dict_trie_) and a user dictionary (user_dict_).
  117. private:
  118. // If it is true, prediction list by string whose length is greater than 1
  119. // will be limited to a reasonable number.
  120. static const bool kPredictLimitGt1 = false;
  121. // If it is true, the engine will prefer long history based prediction,
  122. // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are
  123. // based on the two-character history.
  124. static const bool kPreferLongHistoryPredict = true;
  125. // If it is true, prediction will only be based on user dictionary. This flag
  126. // is for debug purpose.
  127. static const bool kOnlyUserDictPredict = false;
  128. // The maximum buffer to store LmaPsbItems.
  129. static const unsigned kMaxLmaPsbItems = 1450;
  130. // How many rows for each step. NOTE(review): the name suggests the maximum number of nodes per row - confirm.
  131. static const unsigned kMaxNodeARow = 5;
  132. // The maximum length of the sentence candidates counted in Chinese
  133. // characters
  134. static const unsigned kMaxSentenceLength = 16;
  135. // The size of the matrix node pool.
  136. static const unsigned kMtrxNdPoolSize = 200;
  137. // The size of the DMI node pool.
  138. static const unsigned kDmiPoolSize = 800;
  139. // Used to indicate whether this object has been initialized.
  140. bool inited_;
  141. // Spelling trie.
  142. const SpellingTrie *spl_trie_;
  143. // Used to indicate the switch status: when "xian" is parsed, should
  144. // "xi an" also be extended. Default is false.
  145. // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string
  146. // should be valid for a FULL spelling, or a combination of two spellings,
  147. // first of which is a FULL id too. So even if it is true, "da" will never be
  148. // split into "d a", because "d" is not a full spelling id.
  149. bool xi_an_enabled_;
  150. // System dictionary.
  151. DictTrie* dict_trie_;
  152. // User dictionary.
  153. AtomDictBase* user_dict_;
  154. // Spelling parser.
  155. SpellingParser* spl_parser_;
  156. // The maximum allowed length of spelling string (such as a Pinyin string).
  157. unsigned max_sps_len_;
  158. // The maximum allowed length of a result Chinese string.
  159. unsigned max_hzs_len_;
  160. // Pinyin string. Max length: kMaxRowNum - 1
  161. char pys_[kMaxRowNum];
  162. // The length of the string that has been decoded successfully.
  163. unsigned pys_decoded_len_;
  164. // Shared buffer for multiple purposes.
  165. unsigned *share_buf_;
  166. MatrixNode *mtrx_nd_pool_;  // Pool of MatrixNode objects.
  167. PoolPosType mtrx_nd_pool_used_; // How many nodes used in the pool
  168. DictMatchInfo *dmi_pool_;  // Pool of DictMatchInfo (DMI) objects.
  169. PoolPosType dmi_pool_used_; // How many items used in the pool
  170. MatrixRow *matrix_; // The first row is for starting
  171. DictExtPara *dep_; // Parameter used to extend DMI nodes.
  172. NPredictItem *npre_items_; // Used to do prediction
  173. unsigned npre_items_len_;
  174. // The starting positions and lemma ids for the full sentence candidate.
  175. unsigned lma_id_num_;
  176. uint16 lma_start_[kMaxRowNum]; // Counted in spelling ids.
  177. LemmaIdType lma_id_[kMaxRowNum];
  178. unsigned fixed_lmas_;
  179. // If fixed_lmas_ is bigger than i, Element i is used to indicate whether
  180. // the i'th lemma id in lma_id_ is the first candidate for that step.
  181. // If all candidates are the first one for that step, the whole string can be
  182. // decoded by the engine automatically, so no need to add it to user
  183. // dictionary. (We are considering to add it to user dictionary in the
  184. // future).
  185. uint8 fixed_lmas_no1_[kMaxRowNum];
  186. // Composing phrase
  187. ComposingPhrase c_phrase_;
  188. // If dmi_c_phrase_ is true, the decoder will try to match the
  189. // composing phrase (And definitely it will match successfully). If it
  190. // is false, the decoder will try to match lemmas items in dictionaries.
  191. bool dmi_c_phrase_;
  192. // The starting positions and spelling ids for the first full sentence
  193. // candidate.
  194. unsigned spl_id_num_; // Number of spelling ids
  195. uint16 spl_start_[kMaxRowNum]; // Starting positions
  196. uint16 spl_id_[kMaxRowNum]; // Spelling ids
  197. // Used to remember the last fixed position, counted in Hanzi.
  198. unsigned fixed_hzs_;
  199. // Lemma Items with possibility score, two purposes:
  200. // 1. In Viterbi decoding, this buffer is used to get all possible candidates
  201. // for current step;
  202. // 2. When the search is done, this buffer is used to get candidates from the
  203. // first un-fixed step and show them to the user.
  204. LmaPsbItem lpi_items_[kMaxLmaPsbItems];
  205. unsigned lpi_total_;
  206. // Assign the pointers with NULL. The caller makes sure that all pointers are
  207. // not valid before calling it. This function will only be called in the
  208. // constructor and free_resource().
  209. void reset_pointers_to_null();
  210. bool alloc_resource();
  211. void free_resource();
  212. // Reset the search space totally.
  213. bool reset_search0();
  214. // Reset the search space from ch_pos step. For example, if the original
  215. // input Pinyin is "an", reset_search(1) will reset the search space to the
  216. // result of "a". If the given position is out of range, return false.
  217. // If clear_fixed_this_step is true, and the ch_pos step is a fixed step,
  218. // clear its fixed status. If clear_dmi_this_step is true, clear the DMI nodes.
  219. // If clear_mtrx_this_step is true, clear the mtrx nodes of this step.
  220. // The DMI nodes will be kept.
  221. //
  222. // Note: this function should not destroy content of pys_.
  223. bool reset_search(unsigned ch_pos, bool clear_fixed_this_step,
  224. bool clear_dmi_this_step, bool clear_mtrx_this_step);
  225. // Delete a part of the content in pys_.
  226. void del_in_pys(unsigned start, unsigned len);
  227. // Delete a spelling id and its corresponding Chinese character, and merge
  228. // the fixed lemmas into the composing phrase.
  229. // del_spl_pos indicates which spelling id needs to be deleted.
  230. // This function will update the lemma and spelling segmentation information.
  231. // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within
  232. // the fixed lemmas.
  233. void merge_fixed_lmas(unsigned del_spl_pos);
  234. // Get spelling start positions and ids. The result will be stored in
  235. // spl_id_num_, spl_start_[], spl_id_[].
  236. // fixed_hzs_ will be also assigned.
  237. void get_spl_start_id();
  238. // Get all lemma ids which match the given spelling id stream (shorter than the
  239. // maximum length of a word).
  240. // If pfullsent is not NULL, it means the full sentence candidate may be the
  241. // same as the coming lemma string; if so, remove that lemma.
  242. // The result is sorted in descending order by the frequency score. NOTE(review): presumably only when sort_by_psb is true - confirm.
  243. unsigned get_lpis(const uint16* splid_str, unsigned splid_str_len,
  244. LmaPsbItem* lma_buf, unsigned max_lma_buf,
  245. const char16 *pfullsent, bool sort_by_psb);
  246. uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
  247. uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
  248. uint16 splids_max, bool arg_valid);
  249. // Extend a DMI node with a spelling id. ext_len is the length of the rows
  250. // to extend, actually, it is the size of the spelling string of splid.
  251. // return value can be 1 or 0.
  252. // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in
  253. // the pool).
  254. // 0 means either the dmi node can not be extended with splid, or the splid
  255. // is a Shengmu id, which is only used to get lpi_items, or the result node
  256. // in DictTrie has no son, it is not necessary to keep the new DMI.
  257. //
  258. // This function modifies the content of lpi_items_ and lpi_total_.
  259. // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size.
  260. // The function's returned value has no relation with the value of lpi_num.
  261. //
  262. // If dmi_s == NULL, this function will extend the root node of DictTrie
  263. //
  264. // This function will not change dmi_pool_used_. Please change it after
  265. // calling this function if necessary.
  266. //
  267. // The caller should guarantee that NULL != dep.
  268. unsigned extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s);
  269. // Extend dmi for the composing phrase.
  270. unsigned extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s);
  271. // Extend a MatrixNode with the given LmaPsbItem list.
  272. // res_row is the destination row number.
  273. // This function does not change mtrx_nd_pool_used_. Please change it after
  274. // calling this function if necessary.
  275. // Always returns 0.
  276. unsigned extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
  277. unsigned lpi_num, PoolPosType dmi_fr, unsigned res_row);
  278. // Try to find a dmi node at step_to position, and the found dmi node should
  279. // match the given spelling id strings.
  280. PoolPosType match_dmi(unsigned step_to, uint16 spl_ids[], uint16 spl_id_num);
  281. bool add_char(char ch);
  282. bool prepare_add_char(char ch);
  283. // Called after prepare_add_char, so the input char has been saved.
  284. bool add_char_qwerty();
  285. // Prepare candidates from the last fixed hanzi position.
  286. void prepare_candidates();
  287. // Is the character in step pos a splitter character?
  288. // The caller guarantees that the position is valid.
  289. bool is_split_at(uint16 pos);
  290. void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
  291. PoolPosType dmi_fr,
  292. uint16 spl_id, uint16 node_num, unsigned char dict_level,
  293. bool splid_end_split, unsigned char splstr_len,
  294. unsigned char all_full_id);  // NOTE(review): undocumented; presumably stores the arguments into *dmi - confirm, including how node_num is used.
  295. unsigned inner_predict(const char16 fixed_scis_ids[], uint16 scis_num,
  296. char16 predict_buf[][kMaxPredictSize + 1],
  297. unsigned buf_len);
  298. // Add the first candidate to the user dictionary.
  299. bool try_add_cand0_to_userdict();
  300. // Add a user lemma to the user dictionary. This lemma is a subset of
  301. // candidate 0. lma_from is from which lemma in lma_id_, lma_num is the
  302. // number of lemmas to be combined together as a new lemma. The caller
  303. // guarantees that the combined new lemma's length is less or equal to
  304. // kMaxLemmaSize.
  305. bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score);
  306. // Update dictionary frequencies.
  307. void update_dict_freq();
  308. void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level);
  309. public:
  310. MatrixSearch();
  311. ~MatrixSearch();  // NOTE(review): the class holds raw pointers and declares a destructor but no copy constructor/assignment; presumably instances must not be copied - confirm.
  312. bool init(const char *fn_sys_dict, const char *fn_usr_dict);  // NOTE(review): presumably the file names of the system and user dictionaries - confirm.
  313. bool init_fd(int sys_fd, long start_offset, long length,
  314. const char *fn_usr_dict);  // NOTE(review): presumably reads the system dictionary from sys_fd at start_offset for length bytes - confirm.
  315. void set_max_lens(unsigned max_sps_len, unsigned max_hzs_len);  // NOTE(review): presumably sets max_sps_len_ and max_hzs_len_ - confirm.
  316. void close();
  317. void flush_cache();
  318. void set_xi_an_switch(bool xi_an_enabled);
  319. bool get_xi_an_switch();
  320. // Reset the search space. Equivalent to reset_search(0).
  321. // If inited, always return true.
  322. bool reset_search();
  323. // Search a Pinyin string.
  324. // Return value is the position successfully parsed.
  325. unsigned search(const char *py, unsigned py_len);
  326. // Used to delete something in the Pinyin string kept by the engine, and do
  327. // a re-search.
  328. // Return value is the new length of Pinyin string kept by the engine which
  329. // is parsed successfully.
  330. // If is_pos_in_splid is false, pos is used to indicate that the pos-th Pinyin
  331. // character needs to be deleted. If is_pos_in_splid is true, all Pinyin
  332. // characters for the pos-th spelling id need to be deleted.
  333. // If the deleted character(s) is just after a fixed lemma or sub lemma in
  334. // composing phrase, clear_fixed_this_step indicates whether we need to
  335. // unlock the last fixed lemma or sub lemma.
  336. // If is_pos_in_splid is false, and the pos-th character is in the range for the
  337. // fixed lemmas or composing string, this function will do nothing and just
  338. // return the result of the previous search.
  339. unsigned delsearch(unsigned pos, bool is_pos_in_splid,
  340. bool clear_fixed_this_step);
  341. // Get the number of candidates, called after search().
  342. unsigned get_candidate_num();
  343. // Get the Pinyin string stored by the engine.
  344. // *decoded_len returns the length of the successfully decoded string.
  345. const char* get_pystr(unsigned *decoded_len);
  346. // Get the spelling boundaries for the first sentence candidate.
  347. // Number of spellings will be returned. The number of valid elements in
  348. // spl_start is one more than the return value because the last one is used
  349. // to indicate the beginning of the next un-input spelling.
  350. // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] .
  351. unsigned get_spl_start(const uint16 *&spl_start);
  352. // Get one candidate string. If full sentence candidate is available, it will
  353. // be the first one.
  354. char16* get_candidate(unsigned cand_id, char16 *cand_str, unsigned max_len);
  355. // Get the first candidate, which is a "full sentence".
  356. // If retstr_len is not NULL, it will be used to return the string length.
  357. // If only_unfixed is true, only unfixed part will be fetched.
  358. char16* get_candidate0(char16* cand_str, unsigned max_len,
  359. uint16 *retstr_len, bool only_unfixed);
  360. // Choose a candidate. The decoder will do a search after the fixed position.
  361. unsigned choose(unsigned cand_id);
  362. // Cancel the last choosing operation, and return the new number of choices.
  363. unsigned cancel_last_choice();
  364. // Get the length of fixed Hanzis.
  365. unsigned get_fixedlen();
  366. unsigned get_predicts(const char16 fixed_buf[],
  367. char16 predict_buf[][kMaxPredictSize + 1],
  368. unsigned buf_len);  // NOTE(review): undocumented; presumably fills predict_buf with predictions for the history in fixed_buf and returns their count - confirm.
  369. };
  370. }
  371. #endif // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__