/fcitx-4.2.4/src/im/pinyin/pyParser.c

# · C · 562 lines · 381 code · 115 blank · 66 comment · 146 complexity · 0bde6b721e2123efd7ae18ae0bae6dd2 MD5 · raw file

  1. /***************************************************************************
  2. * Copyright (C) 2002~2005 by Yuking *
  3. * yuking_net@sohu.com *
  4. * *
  5. * This program is free software; you can redistribute it and/or modify *
  6. * it under the terms of the GNU General Public License as published by *
  7. * the Free Software Foundation; either version 2 of the License, or *
  8. * (at your option) any later version. *
  9. * *
  10. * This program is distributed in the hope that it will be useful, *
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  13. * GNU General Public License for more details. *
  14. * *
  15. * You should have received a copy of the GNU General Public License *
  16. * along with this program; if not, write to the *
  17. * Free Software Foundation, Inc., *
  18. * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
  19. ***************************************************************************/
  20. #include <stdio.h>
  21. #include <string.h>
  22. #include "fcitx/fcitx.h"
  23. #include "fcitx/ime.h"
  24. #include "fcitx-utils/log.h"
  25. #include "pyMapTable.h"
  26. #include "PYFA.h"
  27. #include "sp.h"
  28. #include "pyParser.h"
  29. #include "pyconfig.h"
  30. #include "py.h"
  31. extern const ConsonantMap consonantMapTable[];
  32. extern const SyllabaryMap syllabaryMapTable[];
  33. static double LookupPYFreq(FcitxPinyinConfig* pyconfig, int index1, int index2);
  34. int IsSyllabary(const char *strPY, boolean bMode)
  35. {
  36. register int i;
  37. for (i = 0; syllabaryMapTable[i].cMap; i++) {
  38. if (bMode) {
  39. if (!strncmp(strPY, syllabaryMapTable[i].strPY, strlen(syllabaryMapTable[i].strPY)))
  40. return i;
  41. } else {
  42. if (!strcmp(strPY, syllabaryMapTable[i].strPY))
  43. return i;
  44. }
  45. }
  46. return -1;
  47. }
  48. int IsConsonant(const char *strPY, boolean bMode)
  49. {
  50. register int i;
  51. for (i = 0; consonantMapTable[i].cMap; i++) {
  52. if (bMode) {
  53. if (!strncmp(strPY, consonantMapTable[i].strPY, strlen(consonantMapTable[i].strPY)))
  54. return i;
  55. } else {
  56. if (!strcmp(strPY, consonantMapTable[i].strPY))
  57. return i;
  58. }
  59. }
  60. return -1;
  61. }
  62. int FindPYFAIndex(FcitxPinyinConfig *pyconfig, const char *strPY, boolean bMode)
  63. {
  64. int i;
  65. for (i = 0; pyconfig->PYTable[i].strPY[0] != '\0'; i++) {
  66. int cmp;
  67. if (bMode)
  68. cmp = strncmp(strPY, pyconfig->PYTable[i].strPY, strlen(pyconfig->PYTable[i].strPY));
  69. else
  70. cmp = strcmp(strPY, pyconfig->PYTable[i].strPY);
  71. if (!cmp) {
  72. if (!pyconfig->PYTable[i].pMH)
  73. return i;
  74. else if (*(pyconfig->PYTable[i].pMH)) {
  75. /* trick: not the kind of misstype */
  76. if (pyconfig->PYTable[i].pMH != &pyconfig->bMisstype)
  77. return i;
  78. else
  79. /* fixed pinyin is valid? */
  80. if (!pyconfig->PYTable[i + 1].pMH || *(pyconfig->PYTable[i + 1].pMH))
  81. return i;
  82. }
  83. }
  84. }
  85. return -1;
  86. }
  87. void ParsePY(FcitxPinyinConfig *pyconfig, const char *strPY, ParsePYStruct * parsePY, PYPARSEINPUTMODE mode, boolean bSP)
  88. {
  89. const char *strP;
  90. int iIndex;
  91. int iTemp;
  92. char str_Map[3];
  93. char strTemp[7];
  94. parsePY->iMode = PARSE_SINGLEHZ;
  95. strP = strPY;
  96. parsePY->iHZCount = 0;
  97. if (bSP) {
  98. char strQP[7];
  99. char strJP[3];
  100. strJP[2] = '\0';
  101. while (*strP) {
  102. strJP[0] = *strP++;
  103. strJP[1] = *strP;
  104. SP2QP(pyconfig, strJP, strQP);
  105. MapPY(pyconfig, strQP, str_Map, mode);
  106. if (!*strP) {
  107. strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);
  108. strcpy(parsePY->strPYParsed[parsePY->iHZCount++], strJP);
  109. break;
  110. }
  111. iIndex = FindPYFAIndex(pyconfig, strQP, 0);
  112. if (iIndex != -1) {
  113. strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);
  114. strcpy(parsePY->strPYParsed[parsePY->iHZCount++], strJP);
  115. strP++;
  116. } else {
  117. strJP[1] = '\0';
  118. SP2QP(pyconfig, strJP, strQP);
  119. if (!MapPY(pyconfig, strQP, str_Map, mode))
  120. strcpy(parsePY->strMap[parsePY->iHZCount], strJP);
  121. else
  122. strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);
  123. strcpy(parsePY->strPYParsed[parsePY->iHZCount++], strJP);
  124. }
  125. if (*strP == PY_SEPARATOR) {
  126. strcat(parsePY->strPYParsed[parsePY->iHZCount - 1], PY_SEPARATOR_S);
  127. while (*strP == PY_SEPARATOR)
  128. strP++;
  129. }
  130. }
  131. } else {
  132. boolean bSeperator = false;
  133. do {
  134. iIndex = FindPYFAIndex(pyconfig, strP, 1);
  135. if (iIndex != -1) {
  136. size_t lIndex = strlen(pyconfig->PYTable[iIndex].strPY);
  137. strTemp[0] = pyconfig->PYTable[iIndex].strPY[lIndex - 1];
  138. iTemp = -1;
  139. /*
  140. * if the end of pinyin is 'g', 'n', 'e'
  141. * there might be another possbility, for example "wanan" can be "wa nan" and "wan an"
  142. * try resolve these problem here
  143. */
  144. if (strTemp[0] == 'g' || strTemp[0] == 'n' || strTemp[0] == 'e' || strTemp[0] == 'a') {
  145. strncpy(strTemp, strP, lIndex - 1);
  146. strTemp[lIndex - 1] = '\0';
  147. /* for example we have "wan", so we try to check "wa" is valid or not, with exact match */
  148. iTemp = FindPYFAIndex(pyconfig, strTemp, 0);
  149. /* if "wa" is valid */
  150. if (iTemp != -1) {
  151. /* also check "nan" is valid or not */
  152. int firstIndex;
  153. firstIndex = iTemp;
  154. iTemp = FindPYFAIndex(pyconfig, strP + strlen(pyconfig->PYTable[iTemp].strPY), 1);
  155. /* if still is valid */
  156. if (iTemp != -1) {
  157. /*
  158. * length 1 split is what we must avoid,
  159. * for example, "nin" can be "ni n", but no separator can for "nin" if we split here
  160. *
  161. * and "ying" can be also "yi ng", for just the same case"
  162. */
  163. if (strlen(pyconfig->PYTable[iTemp].strPY) == 1 || !strcmp("ng", pyconfig->PYTable[iTemp].strPY))
  164. iTemp = -1;
  165. }
  166. if (iTemp != -1) {
  167. /* check the general frequency that this shoud split or not */
  168. int index2 = FindPYFAIndex(pyconfig, strP + strlen(pyconfig->PYTable[iIndex].strPY), 1);
  169. boolean resplit = false;
  170. do {
  171. /* prefer longer */
  172. if (index2 == -1) {
  173. resplit = true;
  174. break;
  175. }
  176. size_t length1 = strlen(pyconfig->PYTable[iIndex].strPY) + strlen(pyconfig->PYTable[index2].strPY);
  177. size_t length2 = strlen(pyconfig->PYTable[firstIndex].strPY) + strlen(pyconfig->PYTable[iTemp].strPY);
  178. if (length1 != length2) {
  179. resplit = (length1 < length2);
  180. break;
  181. }
  182. double freq1 = LookupPYFreq(pyconfig, iIndex, index2);
  183. double freq2 = LookupPYFreq(pyconfig, firstIndex, iTemp);
  184. resplit = (freq1 <= freq2);
  185. } while(0);
  186. if (resplit) {
  187. strncpy(strTemp, strP, lIndex - 1);
  188. strTemp[lIndex - 1] = '\0';
  189. }
  190. else
  191. iTemp = -1;
  192. }
  193. }
  194. }
  195. if (iTemp == -1)
  196. strcpy(strTemp, pyconfig->PYTable[iIndex].strPY);
  197. MapPY(pyconfig, strTemp, str_Map, mode);
  198. strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);
  199. strP += strlen(strTemp);
  200. if (bSeperator) {
  201. bSeperator = false;
  202. parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
  203. parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
  204. } else
  205. parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
  206. strcat(parsePY->strPYParsed[parsePY->iHZCount++], strTemp);
  207. } else {
  208. if (pyconfig->bFullPY && *strP != PY_SEPARATOR)
  209. parsePY->iMode = PARSE_ERROR;
  210. iIndex = IsConsonant(strP, 1);
  211. if (-1 != iIndex) {
  212. parsePY->iMode = PARSE_ERROR;
  213. if (bSeperator) {
  214. bSeperator = false;
  215. parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
  216. parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
  217. } else
  218. parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
  219. strcat(parsePY->strPYParsed[parsePY->iHZCount], consonantMapTable[iIndex].strPY);
  220. MapPY(pyconfig, consonantMapTable[iIndex].strPY, str_Map, mode);
  221. strcpy(parsePY->strMap[parsePY->iHZCount++], str_Map);
  222. strP += strlen(consonantMapTable[iIndex].strPY);
  223. } else {
  224. iIndex = IsSyllabary(strP, 1);
  225. if (-1 != iIndex) {
  226. if (bSeperator) {
  227. bSeperator = false;
  228. parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
  229. parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
  230. } else
  231. parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
  232. strcat(parsePY->strPYParsed[parsePY->iHZCount], syllabaryMapTable[iIndex].strPY);
  233. MapPY(pyconfig, syllabaryMapTable[iIndex].strPY, str_Map, mode);
  234. strcpy(parsePY->strMap[parsePY->iHZCount++], str_Map);
  235. strP += strlen(syllabaryMapTable[iIndex].strPY);
  236. if (parsePY->iMode != PARSE_ERROR)
  237. parsePY->iMode = PARSE_ABBR;
  238. } else {
  239. //??????
  240. strP++;
  241. bSeperator = true;
  242. parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
  243. parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
  244. parsePY->strMap[parsePY->iHZCount][0] = '0';
  245. parsePY->strMap[parsePY->iHZCount][1] = '0';
  246. parsePY->strMap[parsePY->iHZCount][2] = '\0';
  247. }
  248. }
  249. }
  250. } while (*strP);
  251. }
  252. if (strPY[strlen(strPY) - 1] == PY_SEPARATOR && !bSP)
  253. parsePY->iHZCount++;
  254. if (parsePY->iMode != PARSE_ERROR) {
  255. parsePY->iMode = parsePY->iMode & PARSE_ABBR;
  256. if (parsePY->iHZCount > 1)
  257. parsePY->iMode = parsePY->iMode | PARSE_PHRASE;
  258. else
  259. parsePY->iMode = parsePY->iMode | PARSE_SINGLEHZ;
  260. }
  261. }
  262. /*
  263. * ?????(?????????)???????
  264. * ??true?????????false(?????strPY?????????)
  265. */
  266. boolean MapPY(FcitxPinyinConfig* pyconfig, const char* strPYorigin, char strMap[3], PYPARSEINPUTMODE mode)
  267. {
  268. char str[5];
  269. char strPY[7];
  270. int iIndex;
  271. strcpy(strPY, strPYorigin);
  272. size_t len = strlen(strPY);
  273. if (pyconfig->bMisstype && strPY[len - 1] == 'n' && strPY[len - 2] == 'g') {
  274. strPY[len - 2] = 'n';
  275. strPY[len - 1] = 'g';
  276. }
  277. //????eng
  278. if (!strcmp(strPY, "eng") && pyconfig->MHPY_C[1].bMode) {
  279. strcpy(strMap, "X0");
  280. return true;
  281. }
  282. strMap[2] = '\0';
  283. iIndex = IsSyllabary(strPY, 0);
  284. if (-1 != iIndex) {
  285. strMap[0] = syllabaryMapTable[iIndex].cMap;
  286. strMap[1] = mode;
  287. return true;
  288. }
  289. iIndex = IsConsonant(strPY, 0);
  290. if (-1 != iIndex) {
  291. strMap[0] = mode;
  292. strMap[1] = consonantMapTable[iIndex].cMap;
  293. return true;
  294. }
  295. str[0] = strPY[0];
  296. str[1] = '\0';
  297. if (strPY[1] == 'h' || strPY[1] == 'g') {
  298. str[0] = strPY[0];
  299. str[1] = strPY[1];
  300. str[2] = '\0';
  301. iIndex = IsSyllabary(str, 0);
  302. strMap[0] = consonantMapTable[iIndex].cMap;
  303. iIndex = IsConsonant(strPY + 2, 0);
  304. strMap[1] = consonantMapTable[iIndex].cMap;
  305. } else {
  306. str[0] = strPY[0];
  307. str[1] = '\0';
  308. iIndex = IsSyllabary(str, 0);
  309. if (iIndex == -1)
  310. return false;
  311. strMap[0] = consonantMapTable[iIndex].cMap;
  312. iIndex = IsConsonant(strPY + 1, 0);
  313. if (iIndex == -1)
  314. return false;
  315. strMap[1] = consonantMapTable[iIndex].cMap;
  316. }
  317. return true;
  318. }
  319. /*
  320. * ??????????????????false?????
  321. * ????????????
  322. */
  323. boolean MapToPY(char strMap[3], char *strPY)
  324. {
  325. int i;
  326. strPY[0] = '\0';
  327. if (strMap[0] != ' ') {
  328. i = 0;
  329. while (syllabaryMapTable[i].cMap) {
  330. if (syllabaryMapTable[i].cMap == strMap[0]) {
  331. strcpy(strPY, syllabaryMapTable[i].strPY);
  332. break;
  333. }
  334. i++;
  335. }
  336. if (!strlen(strPY))
  337. return false;
  338. }
  339. if (strMap[1] != ' ') {
  340. i = 0;
  341. while (consonantMapTable[i].cMap) {
  342. if (consonantMapTable[i].cMap == strMap[1]) {
  343. strcat(strPY, consonantMapTable[i].strPY);
  344. return true;
  345. }
  346. i++;
  347. }
  348. } else
  349. return true;
  350. return false;
  351. }
  352. /*
  353. * ????????
  354. * 0????
  355. * b??????????true????
  356. */
  357. int Cmp1Map(FcitxPinyinConfig* pyconfig, char map1, char map2, boolean b, boolean bUseMH, boolean bSP)
  358. {
  359. int iVal1, iVal2;
  360. if (map2 == '0' || map1 == '0') {
  361. if (map1 == ' ' || map2 == ' ' || !pyconfig->bFullPY || bSP)
  362. return 0;
  363. } else {
  364. if (b) {
  365. iVal1 = GetMHIndex_S(pyconfig->MHPY_S, map1, bUseMH);
  366. iVal2 = GetMHIndex_S(pyconfig->MHPY_S, map2, bUseMH);
  367. } else {
  368. iVal1 = GetMHIndex_C(pyconfig->MHPY_C, map1);
  369. iVal2 = GetMHIndex_C(pyconfig->MHPY_C, map2);
  370. }
  371. if (iVal1 == iVal2)
  372. if (iVal1 >= 0)
  373. return 0;
  374. }
  375. return (map1 - map2);
  376. }
  377. /*
  378. * ?????????????????
  379. * 0????
  380. * >0?????
  381. * <0?????
  382. */
  383. int Cmp2Map(FcitxPinyinConfig* pyconfig, char map1[3], char map2[3], boolean bSP)
  384. {
  385. int i;
  386. if (IsZ_C_S(map2[0]) && map2[1] == '0')
  387. i = Cmp1Map(pyconfig, map1[0], map2[0], true, true, bSP);
  388. else
  389. i = Cmp1Map(pyconfig, map1[0], map2[0], true, false, bSP);
  390. if (i)
  391. return i;
  392. return Cmp1Map(pyconfig, map1[1], map2[1], false, false, bSP);
  393. }
  394. /*
  395. * ??strMap2???strMap1???
  396. * ? ????0
  397. * ? ?????0
  398. * *iMatchedLength ????????????
  399. */
  400. int CmpMap(FcitxPinyinConfig* pyconfig, char *strMap1, char *strMap2, int *iMatchedLength, boolean bSP)
  401. {
  402. int val;
  403. *iMatchedLength = 0;
  404. for (;;) {
  405. if (!strMap2[*iMatchedLength])
  406. return (strMap1[*iMatchedLength] - strMap2[*iMatchedLength]);
  407. if (((*iMatchedLength + 1) % 2) && (IsZ_C_S(strMap2[*iMatchedLength]) && (strMap2[*iMatchedLength + 1] == '0' || !strMap2[*iMatchedLength + 1])))
  408. val = Cmp1Map(pyconfig, strMap1[*iMatchedLength], strMap2[*iMatchedLength], (*iMatchedLength + 1) % 2, true, bSP);
  409. else
  410. val = Cmp1Map(pyconfig, strMap1[*iMatchedLength], strMap2[*iMatchedLength], (*iMatchedLength + 1) % 2, false, bSP);
  411. if (val)
  412. return val;
  413. (*iMatchedLength)++;
  414. }
  415. return 0;
  416. }
  417. #include "pysplitdata.h"
  418. struct _PYMappedSplitData {
  419. char py[MAX_PY_LENGTH * 2 + 2];
  420. float freq;
  421. UT_hash_handle hh;
  422. };
  423. const UT_icd splitData_icd = { sizeof(PYMappedSplitData), 0, 0, 0 };
  424. void InitPYSplitData(FcitxPinyinConfig* pyconfig)
  425. {
  426. size_t size = sizeof(pySplitData) / sizeof(pySplitData[0]);
  427. int i = 0;
  428. for (i = 0; i < size; i ++) {
  429. PYMappedSplitData* data = fcitx_utils_malloc0(sizeof(PYMappedSplitData));
  430. sprintf(data->py, "%s %s", pySplitData[i].py1, pySplitData[i].py2);
  431. data->freq = pySplitData[i].freq;
  432. HASH_ADD_STR(pyconfig->splitData, py, data);
  433. }
  434. }
  435. double LookupPYFreq(FcitxPinyinConfig* pyconfig, int index1, int index2)
  436. {
  437. char py[MAX_PY_LENGTH * 2 + 2];
  438. if (index1 < 0 || index2 < 0)
  439. return 0;
  440. sprintf(py, "%s %s", pyconfig->PYTable[index1].strPY, pyconfig->PYTable[index2].strPY);
  441. PYMappedSplitData* s = NULL;
  442. HASH_FIND_STR(pyconfig->splitData, py, s);
  443. if (s == NULL)
  444. return 0;
  445. return s->freq;
  446. }
  447. // kate: indent-mode cstyle; space-indent on; indent-width 0;