/*
 * Scraped from a code-viewer page; viewer metadata preserved below.
 *
 * Viewer stats: PageRenderTime 24ms, CodeModel.GetById 23ms,
 *               RepoModel.GetById 0ms, app.codeStats 0ms
 * File:         /gcin-1.6.1/tsin-parse.cpp
 * Summary:      C++ | 336 lines | 266 code | 65 blank | 5 comment | 61 complexity
 * MD5:          21d7493b980b2c0b80c5316379ee694e (raw file)
 * Possible License(s): LGPL-2.1
 */
  1. #include <string.h>
  2. #include "gcin.h"
  3. #include "pho.h"
  4. #include "tsin.h"
  5. #include "gcin-conf.h"
  6. #include <math.h>
  7. #include "tsin-parse.h"
  8. #include "gtab-buf.h"
  9. #include "gst.h"
  10. #define DBG (0)
  11. extern gboolean tsin_is_gtab;
  12. extern int ph_key_sz;
  13. void add_cache(int start, int usecount, TSIN_PARSE *out, short match_phr_N, short no_match_ch_N, int tc_len);
  14. void extract_gtab_key(int start, int len, void *out);
  15. gboolean check_gtab_fixed_mismatch(int idx, char *mtch, int plen);
  16. void mask_tone(phokey_t *pho, int plen, char *tone_mask);
  17. static int tsin_parse_len;
  18. void set_tsin_parse_len(int len)
  19. {
  20. tsin_parse_len = len;
  21. }
  22. static char *c_pinyin_set;
  23. int tsin_parse_recur(int start, TSIN_PARSE *out,
  24. short *r_match_phr_N, short *r_no_match_ch_N)
  25. {
  26. int plen;
  27. double bestscore = -1;
  28. int bestusecount = 0;
  29. *r_match_phr_N = 0;
  30. *r_no_match_ch_N = tsin_parse_len - start;
  31. for(plen=1; start + plen <= tsin_parse_len && plen <= MAX_PHRASE_LEN; plen++) {
  32. #if DBG
  33. dbg("---- aa st:%d hh plen:%d ", start, plen);utf8_putchar(tss.chpho[start].ch); dbg("\n");
  34. #endif
  35. if (plen > 1) {
  36. if (tsin_is_gtab) {
  37. if (gbuf[start+plen-1].flag & FLAG_CHPHO_PHRASE_USER_HEAD)
  38. break;
  39. } else
  40. if (tss.chpho[start+plen-1].flag & FLAG_CHPHO_PHRASE_USER_HEAD)
  41. break;
  42. }
  43. phokey_t pp[MAX_PHRASE_LEN + 1];
  44. u_int pp32[MAX_PHRASE_LEN + 1];
  45. u_int64_t pp64[MAX_PHRASE_LEN + 1];
  46. int sti, edi;
  47. TSIN_PARSE pbest[MAX_PH_BF_EXT+1];
  48. #define MAXV 1000
  49. int maxusecount = 5-MAXV;
  50. int remlen;
  51. short match_phr_N=0, no_match_ch_N = plen;
  52. void *ppp;
  53. if (ph_key_sz==2)
  54. ppp=pp;
  55. else if (ph_key_sz==4)
  56. ppp=pp32;
  57. else
  58. ppp=pp64;
  59. bzero(pbest, sizeof(TSIN_PARSE) * tsin_parse_len);
  60. pbest[0].len = plen;
  61. pbest[0].start = start;
  62. int i, ofs;
  63. if (tsin_is_gtab)
  64. for(ofs=i=0; i < plen; i++)
  65. ofs += utf8cpy((char *)pbest[0].str + ofs, gbuf[start + i].ch);
  66. else
  67. for(ofs=i=0; i < plen; i++)
  68. ofs += utf8cpy((char *)pbest[0].str + ofs, tss.chpho[start + i].ch);
  69. #if DBG
  70. dbg("st:%d hh plen:%d ", start, plen);utf8_putchar(tss.chpho[start].ch); dbg("\n");
  71. #endif
  72. if (tsin_is_gtab)
  73. extract_gtab_key(start, plen, ppp);
  74. else {
  75. extract_pho(start, plen, (phokey_t *)ppp);
  76. if (c_pinyin_set)
  77. mask_tone(pp, plen, c_pinyin_set + start);
  78. }
  79. #if DBG
  80. for(i=0; i < plen; i++) {
  81. prph(pp[i]); dbg("%d", c_pinyin_set[i+start]);
  82. }
  83. dbg("\n");
  84. #endif
  85. char *pinyin_set = c_pinyin_set ? c_pinyin_set+start:NULL;
  86. if (!tsin_seek(ppp, plen, &sti, &edi, pinyin_set)) {
  87. // dbg("tsin_seek not found...\n");
  88. if (plen > 1)
  89. break;
  90. goto next;
  91. }
  92. phokey_t mtk[MAX_PHRASE_LEN];
  93. u_int mtk32[MAX_PHRASE_LEN];
  94. u_int64_t mtk64[MAX_PHRASE_LEN];
  95. void *pho;
  96. if (ph_key_sz==2)
  97. pho=mtk;
  98. else if (ph_key_sz==4)
  99. pho=mtk32;
  100. else
  101. pho=mtk64;
  102. for (;sti < edi; sti++) {
  103. char mtch[MAX_PHRASE_LEN*CH_SZ+1];
  104. char match_len;
  105. usecount_t usecount;
  106. load_tsin_entry(sti, &match_len, &usecount, pho, (u_char *)mtch);
  107. if (match_len < plen)
  108. continue;
  109. if (tsin_is_gtab) {
  110. if (check_gtab_fixed_mismatch(start, mtch, plen))
  111. continue;
  112. } else
  113. if (check_fixed_mismatch(start, mtch, plen))
  114. continue;
  115. if (usecount < 0)
  116. usecount = 0;
  117. int i;
  118. if (ph_key_sz==2) {
  119. if (c_pinyin_set) {
  120. // mask_tone(pp, plen, c_pinyin_set + start);
  121. mask_tone(mtk, plen, c_pinyin_set + start);
  122. }
  123. for(i=0;i < plen;i++)
  124. if (mtk[i]!=pp[i])
  125. break;
  126. } else if (ph_key_sz==4) {
  127. for(i=0;i < plen;i++)
  128. if (mtk32[i]!=pp32[i])
  129. break;
  130. } else {
  131. for(i=0;i < plen;i++)
  132. if (mtk64[i]!=pp64[i])
  133. break;
  134. }
  135. if (i < plen)
  136. continue;
  137. if (match_len > plen) {
  138. continue;
  139. }
  140. if (usecount <= maxusecount)
  141. continue;
  142. pbest[0].len = plen;
  143. maxusecount = usecount;
  144. utf8cpyN((char *)pbest[0].str, mtch, plen);
  145. pbest[0].flag |= FLAG_TSIN_PARSE_PHRASE;
  146. match_phr_N = 1;
  147. no_match_ch_N = 0;
  148. #if DBG
  149. utf8_putcharn(mtch, plen);
  150. dbg(" plen %d usecount:%d ", plen, usecount);
  151. utf8_putcharn(mtch, plen);
  152. dbg("\n");
  153. #endif
  154. }
  155. next:
  156. #if 0
  157. if (!match_phr_N) {
  158. if (tsin_is_gtab) {
  159. if (!(gbuf[start].ch[0] & 0x80))
  160. no_match_ch_N = 0;
  161. } else
  162. if (!(tss.chpho[start].ch[0] & 0x80))
  163. no_match_ch_N = 0;
  164. }
  165. #else
  166. // dbg("no_match_ch_N %d\n", no_match_ch_N);
  167. #endif
  168. remlen = tsin_parse_len - (start + plen);
  169. if (remlen) {
  170. int next = start + plen;
  171. CACHE *pca;
  172. short smatch_phr_N, sno_match_ch_N;
  173. int uc;
  174. if (pca = cache_lookup(next)) {
  175. uc = pca->usecount;
  176. smatch_phr_N = pca->match_phr_N;
  177. sno_match_ch_N = pca->no_match_ch_N;
  178. memcpy(&pbest[1], pca->best, (tsin_parse_len - next) * sizeof(TSIN_PARSE));
  179. } else {
  180. uc = tsin_parse_recur(next, &pbest[1], &smatch_phr_N, &sno_match_ch_N);
  181. // dbg(" gg %d\n", smatch_phr_N);
  182. add_cache(next, uc, &pbest[1], smatch_phr_N, sno_match_ch_N, tsin_parse_len);
  183. }
  184. match_phr_N += smatch_phr_N;
  185. no_match_ch_N += sno_match_ch_N;
  186. maxusecount += uc;
  187. }
  188. double score = log((double)maxusecount + MAXV) /
  189. (pow((double)match_phr_N, 10)+ 1.0E-6) / (pow((double)no_match_ch_N, 20) + 1.0E-6);
  190. #if DBG
  191. dbg("st:%d plen:%d zz muse:%d ma:%d noma:%d score:%.4e %.4e\n", start, plen,
  192. maxusecount, match_phr_N, no_match_ch_N, score, bestscore);
  193. #endif
  194. if (score > bestscore) {
  195. #if DBG
  196. dbg("is best org %.4e\n", bestscore);
  197. #endif
  198. bestscore = score;
  199. memcpy(out, pbest, sizeof(TSIN_PARSE) * (tsin_parse_len - start));
  200. #if DBG
  201. dbg(" str:%d ", start);
  202. int i;
  203. for(i=0; i < tsin_parse_len - start; i++) {
  204. utf8_putcharn((char *)out[i].str, out[i].len);
  205. }
  206. dbg("\n");
  207. #endif
  208. bestusecount = maxusecount;
  209. *r_match_phr_N = match_phr_N;
  210. *r_no_match_ch_N = no_match_ch_N;
  211. }
  212. }
  213. if (bestusecount < 0)
  214. bestusecount = 0;
  215. return bestusecount;
  216. }
  217. void disp_ph_sta_idx(int idx);
  218. void free_cache(), load_tsin_db();
  219. void tsin_parse()
  220. {
  221. TSIN_PARSE out[MAX_PH_BF_EXT+1];
  222. bzero(out, sizeof(out));
  223. int i, ofsi;
  224. if (tss.c_len <= 1)
  225. return;
  226. load_tsin_db();
  227. set_tsin_parse_len(tss.c_len);
  228. init_cache(tss.c_len);
  229. char pinyin_set[MAX_PH_BF_EXT];
  230. c_pinyin_set = pin_juyin?pinyin_set:NULL;
  231. get_chpho_pinyin_set(pinyin_set);
  232. short smatch_phr_N, sno_match_ch_N;
  233. tsin_parse_recur(0, out, &smatch_phr_N, &sno_match_ch_N);
  234. #if 0
  235. puts("vvvvvvvvvvvvvvvv");
  236. for(i=0; i < tss.c_len; i++) {
  237. printf("%d:", out[i].len);
  238. utf8_putcharn(out[i].str, out[i].len);
  239. }
  240. dbg("\n");
  241. #endif
  242. for(i=0; i < tss.c_len; i++)
  243. tss.chpho[i].flag &= ~(FLAG_CHPHO_PHRASE_HEAD|FLAG_CHPHO_PHRASE_BODY);
  244. for(ofsi=i=0; out[i].len; i++) {
  245. int j, ofsj;
  246. int psta = ofsi;
  247. if (out[i].flag & FLAG_TSIN_PARSE_PHRASE)
  248. tss.chpho[ofsi].flag |= FLAG_CHPHO_PHRASE_HEAD;
  249. for(ofsj=j=0; j < out[i].len; j++) {
  250. ofsj += utf8cpy(tss.chpho[ofsi].cha, (char *)&out[i].str[ofsj]);
  251. // tss.chpho[ofsi].ch = tss.chpho[ofsi].cha;
  252. tss.chpho[ofsi].flag |= FLAG_CHPHO_PHRASE_BODY;
  253. if (out[i].flag & FLAG_TSIN_PARSE_PHRASE)
  254. tss.chpho[ofsi].psta = psta;
  255. ofsi++;
  256. }
  257. }
  258. int ph_sta_idx = tss.ph_sta;
  259. if (tss.chpho[tss.c_len-1].psta>=0 && tss.c_len - tss.chpho[tss.c_len-1].psta > 1) {
  260. ph_sta_idx = tss.chpho[tss.c_len-1].psta;
  261. }
  262. #if 1
  263. disp_ph_sta_idx(ph_sta_idx);
  264. #endif
  265. #if 0
  266. for(i=0;i<tss.c_len;i++)
  267. utf8_putchar(tss.chpho[i].ch);
  268. puts("");
  269. #endif
  270. free_cache();
  271. }