/Modules/cjkcodecs/_codecs_kr.c

http://unladen-swallow.googlecode.com/ · C · 452 lines · 358 code · 70 blank · 24 comment · 88 complexity · 16b1325ad28a1ed9d31558f383caa1f2 MD5 · raw file

/*
 * _codecs_kr.c: Codecs collection for Korean encodings
 *
 * Written by Hye-Shik Chang <perky@FreeBSD.org>
 */
  6. #include "cjkcodecs.h"
  7. #include "mappings_kr.h"
  8. /*
  9. * EUC-KR codec
  10. */
  11. #define EUCKR_JAMO_FIRSTBYTE 0xA4
  12. #define EUCKR_JAMO_FILLER 0xD4
  13. static const unsigned char u2cgk_choseong[19] = {
  14. 0xa1, 0xa2, 0xa4, 0xa7, 0xa8, 0xa9, 0xb1, 0xb2,
  15. 0xb3, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
  16. 0xbc, 0xbd, 0xbe
  17. };
  18. static const unsigned char u2cgk_jungseong[21] = {
  19. 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6,
  20. 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce,
  21. 0xcf, 0xd0, 0xd1, 0xd2, 0xd3
  22. };
  23. static const unsigned char u2cgk_jongseong[28] = {
  24. 0xd4, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
  25. 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
  26. 0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xba,
  27. 0xbb, 0xbc, 0xbd, 0xbe
  28. };
  29. ENCODER(euc_kr)
  30. {
  31. while (inleft > 0) {
  32. Py_UNICODE c = IN1;
  33. DBCHAR code;
  34. if (c < 0x80) {
  35. WRITE1((unsigned char)c)
  36. NEXT(1, 1)
  37. continue;
  38. }
  39. UCS4INVALID(c)
  40. REQUIRE_OUTBUF(2)
  41. TRYMAP_ENC(cp949, code, c);
  42. else return 1;
  43. if ((code & 0x8000) == 0) {
  44. /* KS X 1001 coded character */
  45. OUT1((code >> 8) | 0x80)
  46. OUT2((code & 0xFF) | 0x80)
  47. NEXT(1, 2)
  48. }
  49. else { /* Mapping is found in CP949 extension,
  50. * but we encode it in KS X 1001:1998 Annex 3,
  51. * make-up sequence for EUC-KR. */
  52. REQUIRE_OUTBUF(8)
  53. /* syllable composition precedence */
  54. OUT1(EUCKR_JAMO_FIRSTBYTE)
  55. OUT2(EUCKR_JAMO_FILLER)
  56. /* All codepoints in CP949 extension are in unicode
  57. * Hangul Syllable area. */
  58. assert(0xac00 <= c && c <= 0xd7a3);
  59. c -= 0xac00;
  60. OUT3(EUCKR_JAMO_FIRSTBYTE)
  61. OUT4(u2cgk_choseong[c / 588])
  62. NEXT_OUT(4)
  63. OUT1(EUCKR_JAMO_FIRSTBYTE)
  64. OUT2(u2cgk_jungseong[(c / 28) % 21])
  65. OUT3(EUCKR_JAMO_FIRSTBYTE)
  66. OUT4(u2cgk_jongseong[c % 28])
  67. NEXT(1, 4)
  68. }
  69. }
  70. return 0;
  71. }
  72. #define NONE 127
  73. static const unsigned char cgk2u_choseong[] = { /* [A1, BE] */
  74. 0, 1, NONE, 2, NONE, NONE, 3, 4,
  75. 5, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
  76. 6, 7, 8, NONE, 9, 10, 11, 12,
  77. 13, 14, 15, 16, 17, 18
  78. };
  79. static const unsigned char cgk2u_jongseong[] = { /* [A1, BE] */
  80. 1, 2, 3, 4, 5, 6, 7, NONE,
  81. 8, 9, 10, 11, 12, 13, 14, 15,
  82. 16, 17, NONE, 18, 19, 20, 21, 22,
  83. NONE, 23, 24, 25, 26, 27
  84. };
  85. DECODER(euc_kr)
  86. {
  87. while (inleft > 0) {
  88. unsigned char c = IN1;
  89. REQUIRE_OUTBUF(1)
  90. if (c < 0x80) {
  91. OUT1(c)
  92. NEXT(1, 1)
  93. continue;
  94. }
  95. REQUIRE_INBUF(2)
  96. if (c == EUCKR_JAMO_FIRSTBYTE &&
  97. IN2 == EUCKR_JAMO_FILLER) {
  98. /* KS X 1001:1998 Annex 3 make-up sequence */
  99. DBCHAR cho, jung, jong;
  100. REQUIRE_INBUF(8)
  101. if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
  102. (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
  103. (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
  104. return 8;
  105. c = (*inbuf)[3];
  106. if (0xa1 <= c && c <= 0xbe)
  107. cho = cgk2u_choseong[c - 0xa1];
  108. else
  109. cho = NONE;
  110. c = (*inbuf)[5];
  111. jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE;
  112. c = (*inbuf)[7];
  113. if (c == EUCKR_JAMO_FILLER)
  114. jong = 0;
  115. else if (0xa1 <= c && c <= 0xbe)
  116. jong = cgk2u_jongseong[c - 0xa1];
  117. else
  118. jong = NONE;
  119. if (cho == NONE || jung == NONE || jong == NONE)
  120. return 8;
  121. OUT1(0xac00 + cho*588 + jung*28 + jong);
  122. NEXT(8, 1)
  123. }
  124. else TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
  125. NEXT(2, 1)
  126. }
  127. else
  128. return 2;
  129. }
  130. return 0;
  131. }
  132. #undef NONE
  133. /*
  134. * CP949 codec
  135. */
  136. ENCODER(cp949)
  137. {
  138. while (inleft > 0) {
  139. Py_UNICODE c = IN1;
  140. DBCHAR code;
  141. if (c < 0x80) {
  142. WRITE1((unsigned char)c)
  143. NEXT(1, 1)
  144. continue;
  145. }
  146. UCS4INVALID(c)
  147. REQUIRE_OUTBUF(2)
  148. TRYMAP_ENC(cp949, code, c);
  149. else return 1;
  150. OUT1((code >> 8) | 0x80)
  151. if (code & 0x8000)
  152. OUT2(code & 0xFF) /* MSB set: CP949 */
  153. else
  154. OUT2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */
  155. NEXT(1, 2)
  156. }
  157. return 0;
  158. }
  159. DECODER(cp949)
  160. {
  161. while (inleft > 0) {
  162. unsigned char c = IN1;
  163. REQUIRE_OUTBUF(1)
  164. if (c < 0x80) {
  165. OUT1(c)
  166. NEXT(1, 1)
  167. continue;
  168. }
  169. REQUIRE_INBUF(2)
  170. TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
  171. else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
  172. else return 2;
  173. NEXT(2, 1)
  174. }
  175. return 0;
  176. }
  177. /*
  178. * JOHAB codec
  179. */
  180. static const unsigned char u2johabidx_choseong[32] = {
  181. 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  182. 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  183. 0x10, 0x11, 0x12, 0x13, 0x14,
  184. };
  185. static const unsigned char u2johabidx_jungseong[32] = {
  186. 0x03, 0x04, 0x05, 0x06, 0x07,
  187. 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  188. 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  189. 0x1a, 0x1b, 0x1c, 0x1d,
  190. };
  191. static const unsigned char u2johabidx_jongseong[32] = {
  192. 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  193. 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  194. 0x10, 0x11, 0x13, 0x14, 0x15, 0x16, 0x17,
  195. 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
  196. };
  197. static const DBCHAR u2johabjamo[] = {
  198. 0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
  199. 0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
  200. 0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
  201. 0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
  202. 0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
  203. 0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
  204. 0x8741, 0x8761, 0x8781, 0x87a1,
  205. };
  206. ENCODER(johab)
  207. {
  208. while (inleft > 0) {
  209. Py_UNICODE c = IN1;
  210. DBCHAR code;
  211. if (c < 0x80) {
  212. WRITE1((unsigned char)c)
  213. NEXT(1, 1)
  214. continue;
  215. }
  216. UCS4INVALID(c)
  217. REQUIRE_OUTBUF(2)
  218. if (c >= 0xac00 && c <= 0xd7a3) {
  219. c -= 0xac00;
  220. code = 0x8000 |
  221. (u2johabidx_choseong[c / 588] << 10) |
  222. (u2johabidx_jungseong[(c / 28) % 21] << 5) |
  223. u2johabidx_jongseong[c % 28];
  224. }
  225. else if (c >= 0x3131 && c <= 0x3163)
  226. code = u2johabjamo[c - 0x3131];
  227. else TRYMAP_ENC(cp949, code, c) {
  228. unsigned char c1, c2, t2;
  229. unsigned short t1;
  230. assert((code & 0x8000) == 0);
  231. c1 = code >> 8;
  232. c2 = code & 0xff;
  233. if (((c1 >= 0x21 && c1 <= 0x2c) ||
  234. (c1 >= 0x4a && c1 <= 0x7d)) &&
  235. (c2 >= 0x21 && c2 <= 0x7e)) {
  236. t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) :
  237. (c1 - 0x21 + 0x197));
  238. t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21);
  239. OUT1(t1 >> 1)
  240. OUT2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43)
  241. NEXT(1, 2)
  242. continue;
  243. }
  244. else
  245. return 1;
  246. }
  247. else
  248. return 1;
  249. OUT1(code >> 8)
  250. OUT2(code & 0xff)
  251. NEXT(1, 2)
  252. }
  253. return 0;
  254. }
  255. #define FILL 0xfd
  256. #define NONE 0xff
  257. static const unsigned char johabidx_choseong[32] = {
  258. NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
  259. 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
  260. 0x0e, 0x0f, 0x10, 0x11, 0x12, NONE, NONE, NONE,
  261. NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
  262. };
  263. static const unsigned char johabidx_jungseong[32] = {
  264. NONE, NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04,
  265. NONE, NONE, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
  266. NONE, NONE, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
  267. NONE, NONE, 0x11, 0x12, 0x13, 0x14, NONE, NONE,
  268. };
  269. static const unsigned char johabidx_jongseong[32] = {
  270. NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  271. 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  272. 0x0f, 0x10, NONE, 0x11, 0x12, 0x13, 0x14, 0x15,
  273. 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, NONE, NONE,
  274. };
  275. static const unsigned char johabjamo_choseong[32] = {
  276. NONE, FILL, 0x31, 0x32, 0x34, 0x37, 0x38, 0x39,
  277. 0x41, 0x42, 0x43, 0x45, 0x46, 0x47, 0x48, 0x49,
  278. 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE, NONE,
  279. NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
  280. };
  281. static const unsigned char johabjamo_jungseong[32] = {
  282. NONE, NONE, FILL, 0x4f, 0x50, 0x51, 0x52, 0x53,
  283. NONE, NONE, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
  284. NONE, NONE, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
  285. NONE, NONE, 0x60, 0x61, 0x62, 0x63, NONE, NONE,
  286. };
  287. static const unsigned char johabjamo_jongseong[32] = {
  288. NONE, FILL, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
  289. 0x37, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
  290. 0x40, 0x41, NONE, 0x42, 0x44, 0x45, 0x46, 0x47,
  291. 0x48, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE,
  292. };
  293. DECODER(johab)
  294. {
  295. while (inleft > 0) {
  296. unsigned char c = IN1, c2;
  297. REQUIRE_OUTBUF(1)
  298. if (c < 0x80) {
  299. OUT1(c)
  300. NEXT(1, 1)
  301. continue;
  302. }
  303. REQUIRE_INBUF(2)
  304. c2 = IN2;
  305. if (c < 0xd8) {
  306. /* johab hangul */
  307. unsigned char c_cho, c_jung, c_jong;
  308. unsigned char i_cho, i_jung, i_jong;
  309. c_cho = (c >> 2) & 0x1f;
  310. c_jung = ((c << 3) | c2 >> 5) & 0x1f;
  311. c_jong = c2 & 0x1f;
  312. i_cho = johabidx_choseong[c_cho];
  313. i_jung = johabidx_jungseong[c_jung];
  314. i_jong = johabidx_jongseong[c_jong];
  315. if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
  316. return 2;
  317. /* we don't use U+1100 hangul jamo yet. */
  318. if (i_cho == FILL) {
  319. if (i_jung == FILL) {
  320. if (i_jong == FILL)
  321. OUT1(0x3000)
  322. else
  323. OUT1(0x3100 |
  324. johabjamo_jongseong[c_jong])
  325. }
  326. else {
  327. if (i_jong == FILL)
  328. OUT1(0x3100 |
  329. johabjamo_jungseong[c_jung])
  330. else
  331. return 2;
  332. }
  333. } else {
  334. if (i_jung == FILL) {
  335. if (i_jong == FILL)
  336. OUT1(0x3100 |
  337. johabjamo_choseong[c_cho])
  338. else
  339. return 2;
  340. }
  341. else
  342. OUT1(0xac00 +
  343. i_cho * 588 +
  344. i_jung * 28 +
  345. (i_jong == FILL ? 0 : i_jong))
  346. }
  347. NEXT(2, 1)
  348. } else {
  349. /* KS X 1001 except hangul jamos and syllables */
  350. if (c == 0xdf || c > 0xf9 ||
  351. c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
  352. (c2 & 0x7f) == 0x7f ||
  353. (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
  354. return 2;
  355. else {
  356. unsigned char t1, t2;
  357. t1 = (c < 0xe0 ? 2 * (c - 0xd9) :
  358. 2 * c - 0x197);
  359. t2 = (c2 < 0x91 ? c2 - 0x31 : c2 - 0x43);
  360. t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21;
  361. t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
  362. TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
  363. else return 2;
  364. NEXT(2, 1)
  365. }
  366. }
  367. }
  368. return 0;
  369. }
  370. #undef NONE
  371. #undef FILL
  372. BEGIN_MAPPINGS_LIST
  373. MAPPING_DECONLY(ksx1001)
  374. MAPPING_ENCONLY(cp949)
  375. MAPPING_DECONLY(cp949ext)
  376. END_MAPPINGS_LIST
  377. BEGIN_CODECS_LIST
  378. CODEC_STATELESS(euc_kr)
  379. CODEC_STATELESS(cp949)
  380. CODEC_STATELESS(johab)
  381. END_CODECS_LIST
  382. I_AM_A_MODULE_FOR(kr)