/*
 * Listing metadata carried over from the code-hosting page:
 *
 *   PageRenderTime 327ms CodeModel.GetById 110ms app.highlight 67ms RepoModel.GetById 145ms app.codeStats 0ms
 *   /Modules/cjkcodecs/_codecs_kr.c
 *   http://unladen-swallow.googlecode.com/
 *   C | 452 lines | 358 code | 70 blank | 24 comment | 88 complexity | 16b1325ad28a1ed9d31558f383caa1f2 MD5 | raw file
 */
/*
 * _codecs_kr.c: Codecs collection for Korean encodings
 *
 * Written by Hye-Shik Chang <perky@FreeBSD.org>
 */
  6
#include "cjkcodecs.h"
#include "mappings_kr.h"
  9
 10/*
 11 * EUC-KR codec
 12 */
 13
 14#define EUCKR_JAMO_FIRSTBYTE	0xA4
 15#define EUCKR_JAMO_FILLER	0xD4
 16
 17static const unsigned char u2cgk_choseong[19] = {
 18	0xa1, 0xa2, 0xa4, 0xa7, 0xa8, 0xa9, 0xb1, 0xb2,
 19	0xb3, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
 20	0xbc, 0xbd, 0xbe
 21};
 22static const unsigned char u2cgk_jungseong[21] = {
 23	0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6,
 24	0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce,
 25	0xcf, 0xd0, 0xd1, 0xd2, 0xd3
 26};
 27static const unsigned char u2cgk_jongseong[28] = {
 28	0xd4, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
 29	0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
 30	0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xba,
 31	0xbb, 0xbc, 0xbd, 0xbe
 32};
 33
 34ENCODER(euc_kr)
 35{
 36	while (inleft > 0) {
 37		Py_UNICODE c = IN1;
 38		DBCHAR code;
 39
 40		if (c < 0x80) {
 41			WRITE1((unsigned char)c)
 42			NEXT(1, 1)
 43			continue;
 44		}
 45		UCS4INVALID(c)
 46
 47		REQUIRE_OUTBUF(2)
 48		TRYMAP_ENC(cp949, code, c);
 49		else return 1;
 50
 51		if ((code & 0x8000) == 0) {
 52			/* KS X 1001 coded character */
 53			OUT1((code >> 8) | 0x80)
 54			OUT2((code & 0xFF) | 0x80)
 55			NEXT(1, 2)
 56		}
 57		else {	/* Mapping is found in CP949 extension,
 58			 * but we encode it in KS X 1001:1998 Annex 3,
 59			 * make-up sequence for EUC-KR. */
 60
 61			REQUIRE_OUTBUF(8)
 62
 63			/* syllable composition precedence */
 64			OUT1(EUCKR_JAMO_FIRSTBYTE)
 65			OUT2(EUCKR_JAMO_FILLER)
 66
 67			/* All codepoints in CP949 extension are in unicode
 68			 * Hangul Syllable area. */
 69			assert(0xac00 <= c && c <= 0xd7a3);
 70			c -= 0xac00;
 71
 72			OUT3(EUCKR_JAMO_FIRSTBYTE)
 73			OUT4(u2cgk_choseong[c / 588])
 74			NEXT_OUT(4)
 75
 76			OUT1(EUCKR_JAMO_FIRSTBYTE)
 77			OUT2(u2cgk_jungseong[(c / 28) % 21])
 78			OUT3(EUCKR_JAMO_FIRSTBYTE)
 79			OUT4(u2cgk_jongseong[c % 28])
 80			NEXT(1, 4)
 81		}
 82	}
 83
 84	return 0;
 85}
 86
 87#define NONE	127
 88
 89static const unsigned char cgk2u_choseong[] = { /* [A1, BE] */
 90	   0,    1, NONE,    2, NONE, NONE,    3,    4,
 91	   5, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
 92	   6,    7,    8, NONE,    9,   10,   11,   12,
 93	  13,   14,   15,   16,   17,   18
 94};
 95static const unsigned char cgk2u_jongseong[] = { /* [A1, BE] */
 96	   1,    2,    3,    4,    5,    6,    7, NONE,
 97	   8,    9,   10,   11,   12,   13,   14,   15,
 98	  16,   17, NONE,   18,   19,   20,   21,   22,
 99	NONE,   23,   24,   25,   26,   27
100};
101
102DECODER(euc_kr)
103{
104	while (inleft > 0) {
105		unsigned char c = IN1;
106
107		REQUIRE_OUTBUF(1)
108
109		if (c < 0x80) {
110			OUT1(c)
111			NEXT(1, 1)
112			continue;
113		}
114
115		REQUIRE_INBUF(2)
116
117		if (c == EUCKR_JAMO_FIRSTBYTE &&
118		    IN2 == EUCKR_JAMO_FILLER) {
119			/* KS X 1001:1998 Annex 3 make-up sequence */
120			DBCHAR cho, jung, jong;
121
122			REQUIRE_INBUF(8)
123			if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
124			    (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
125			    (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
126				return 8;
127
128			c = (*inbuf)[3];
129			if (0xa1 <= c && c <= 0xbe)
130				cho = cgk2u_choseong[c - 0xa1];
131			else
132				cho = NONE;
133
134			c = (*inbuf)[5];
135			jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE;
136
137			c = (*inbuf)[7];
138			if (c == EUCKR_JAMO_FILLER)
139				jong = 0;
140			else if (0xa1 <= c && c <= 0xbe)
141				jong = cgk2u_jongseong[c - 0xa1];
142			else
143				jong = NONE;
144
145			if (cho == NONE || jung == NONE || jong == NONE)
146				return 8;
147
148			OUT1(0xac00 + cho*588 + jung*28 + jong);
149			NEXT(8, 1)
150		}
151		else TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
152			NEXT(2, 1)
153		}
154		else
155			return 2;
156	}
157
158	return 0;
159}
160#undef NONE
161
162
163/*
164 * CP949 codec
165 */
166
167ENCODER(cp949)
168{
169	while (inleft > 0) {
170		Py_UNICODE c = IN1;
171		DBCHAR code;
172
173		if (c < 0x80) {
174			WRITE1((unsigned char)c)
175			NEXT(1, 1)
176			continue;
177		}
178		UCS4INVALID(c)
179
180		REQUIRE_OUTBUF(2)
181		TRYMAP_ENC(cp949, code, c);
182		else return 1;
183
184		OUT1((code >> 8) | 0x80)
185		if (code & 0x8000)
186			OUT2(code & 0xFF) /* MSB set: CP949 */
187		else
188			OUT2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */
189		NEXT(1, 2)
190	}
191
192	return 0;
193}
194
195DECODER(cp949)
196{
197	while (inleft > 0) {
198		unsigned char c = IN1;
199
200		REQUIRE_OUTBUF(1)
201
202		if (c < 0x80) {
203			OUT1(c)
204			NEXT(1, 1)
205			continue;
206		}
207
208		REQUIRE_INBUF(2)
209		TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
210		else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
211		else return 2;
212
213		NEXT(2, 1)
214	}
215
216	return 0;
217}
218
219
/*
 * JOHAB codec
 */

/* Unicode jamo index -> 5-bit JOHAB field value, one table per syllable
 * position.  The arrays are declared with 32 slots but only the leading
 * 19 / 21 / 28 entries are initialised (the rest are zero); the encoder
 * only indexes them with values in those ranges. */

/* Initial consonant: index (syllable - 0xAC00) / 588, 0..18. */
static const unsigned char u2johabidx_choseong[32] = {
                0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14,
};
/* Medial vowel: index ((syllable - 0xAC00) / 28) % 21, 0..20.
 * The field values skip 0x08-0x09, 0x10-0x11 and 0x18-0x19. */
static const unsigned char u2johabidx_jungseong[32] = {
                      0x03, 0x04, 0x05, 0x06, 0x07,
                0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
                0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
                0x1a, 0x1b, 0x1c, 0x1d,
};
/* Final consonant: index (syllable - 0xAC00) % 28, 0..27.
 * Index 0 (no final) maps to field value 0x01; 0x12 is skipped. */
static const unsigned char u2johabidx_jongseong[32] = {
          0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11,       0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
};
241static const DBCHAR u2johabjamo[] = {
242            0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
243    0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
244    0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
245    0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
246    0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
247    0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
248    0x8741, 0x8761, 0x8781, 0x87a1,
249};
250
251ENCODER(johab)
252{
253	while (inleft > 0) {
254		Py_UNICODE c = IN1;
255		DBCHAR code;
256
257		if (c < 0x80) {
258			WRITE1((unsigned char)c)
259			NEXT(1, 1)
260			continue;
261		}
262		UCS4INVALID(c)
263
264		REQUIRE_OUTBUF(2)
265
266		if (c >= 0xac00 && c <= 0xd7a3) {
267			c -= 0xac00;
268			code = 0x8000 |
269				(u2johabidx_choseong[c / 588] << 10) |
270				(u2johabidx_jungseong[(c / 28) % 21] << 5) |
271				u2johabidx_jongseong[c % 28];
272		}
273		else if (c >= 0x3131 && c <= 0x3163)
274			code = u2johabjamo[c - 0x3131];
275		else TRYMAP_ENC(cp949, code, c) {
276			unsigned char c1, c2, t2;
277			unsigned short t1;
278
279			assert((code & 0x8000) == 0);
280			c1 = code >> 8;
281			c2 = code & 0xff;
282			if (((c1 >= 0x21 && c1 <= 0x2c) ||
283			    (c1 >= 0x4a && c1 <= 0x7d)) &&
284			    (c2 >= 0x21 && c2 <= 0x7e)) {
285				t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) :
286						  (c1 - 0x21 + 0x197));
287				t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21);
288				OUT1(t1 >> 1)
289				OUT2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43)
290				NEXT(1, 2)
291				continue;
292			}
293			else
294				return 1;
295		}
296		else
297			return 1;
298
299		OUT1(code >> 8)
300		OUT2(code & 0xff)
301		NEXT(1, 2)
302	}
303
304	return 0;
305}
306
/* Markers used in the JOHAB decode tables below:
 * FILL = this syllable position is empty, NONE = invalid field value. */
#define FILL 0xfd
#define NONE 0xff

/* 5-bit JOHAB field value -> Unicode jamo index for that position,
 * as used in 0xAC00 + cho * 588 + jung * 28 + jong. */
static const unsigned char johabidx_choseong[32] = {
    NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    0x0e, 0x0f, 0x10, 0x11, 0x12, NONE, NONE, NONE,
    NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};
static const unsigned char johabidx_jungseong[32] = {
    NONE, NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04,
    NONE, NONE, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
    NONE, NONE, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
    NONE, NONE, 0x11, 0x12, 0x13, 0x14, NONE, NONE,
};
/* Final-consonant indices start at 1; the decoder turns FILL into
 * index 0 ("no final") when composing a syllable. */
static const unsigned char johabidx_jongseong[32] = {
    NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, NONE, 0x11, 0x12, 0x13, 0x14, 0x15,
    0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, NONE, NONE,
};

/* 5-bit JOHAB field value -> low byte of the compatibility jamo; the
 * decoder ORs it with 0x3100 when only one position is filled. */
static const unsigned char johabjamo_choseong[32] = {
    NONE, FILL, 0x31, 0x32, 0x34, 0x37, 0x38, 0x39,
    0x41, 0x42, 0x43, 0x45, 0x46, 0x47, 0x48, 0x49,
    0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE, NONE,
    NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};
static const unsigned char johabjamo_jungseong[32] = {
    NONE, NONE, FILL, 0x4f, 0x50, 0x51, 0x52, 0x53,
    NONE, NONE, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
    NONE, NONE, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    NONE, NONE, 0x60, 0x61, 0x62, 0x63, NONE, NONE,
};
static const unsigned char johabjamo_jongseong[32] = {
    NONE, FILL, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
    0x37, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40, 0x41, NONE, 0x42, 0x44, 0x45, 0x46, 0x47,
    0x48, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE,
};
347
348DECODER(johab)
349{
350	while (inleft > 0) {
351		unsigned char    c = IN1, c2;
352
353		REQUIRE_OUTBUF(1)
354
355		if (c < 0x80) {
356			OUT1(c)
357			NEXT(1, 1)
358			continue;
359		}
360
361		REQUIRE_INBUF(2)
362		c2 = IN2;
363
364		if (c < 0xd8) {
365			/* johab hangul */
366			unsigned char c_cho, c_jung, c_jong;
367			unsigned char i_cho, i_jung, i_jong;
368
369			c_cho = (c >> 2) & 0x1f;
370			c_jung = ((c << 3) | c2 >> 5) & 0x1f;
371			c_jong = c2 & 0x1f;
372
373			i_cho = johabidx_choseong[c_cho];
374			i_jung = johabidx_jungseong[c_jung];
375			i_jong = johabidx_jongseong[c_jong];
376
377			if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
378				return 2;
379
380			/* we don't use U+1100 hangul jamo yet. */
381			if (i_cho == FILL) {
382				if (i_jung == FILL) {
383					if (i_jong == FILL)
384						OUT1(0x3000)
385					else
386						OUT1(0x3100 |
387						  johabjamo_jongseong[c_jong])
388				}
389				else {
390					if (i_jong == FILL)
391						OUT1(0x3100 |
392						  johabjamo_jungseong[c_jung])
393					else
394						return 2;
395				}
396			} else {
397				if (i_jung == FILL) {
398					if (i_jong == FILL)
399						OUT1(0x3100 |
400						  johabjamo_choseong[c_cho])
401					else
402						return 2;
403				}
404				else
405					OUT1(0xac00 +
406					     i_cho * 588 +
407					     i_jung * 28 +
408					     (i_jong == FILL ? 0 : i_jong))
409			}
410			NEXT(2, 1)
411		} else {
412			/* KS X 1001 except hangul jamos and syllables */
413			if (c == 0xdf || c > 0xf9 ||
414			    c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
415			    (c2 & 0x7f) == 0x7f ||
416			    (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
417				return 2;
418			else {
419				unsigned char t1, t2;
420
421				t1 = (c < 0xe0 ? 2 * (c - 0xd9) :
422						 2 * c - 0x197);
423				t2 = (c2 < 0x91 ? c2 - 0x31 : c2 - 0x43);
424				t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21;
425				t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
426
427				TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
428				else return 2;
429				NEXT(2, 1)
430			}
431		}
432	}
433
434	return 0;
435}
436#undef NONE
437#undef FILL
438
439
440BEGIN_MAPPINGS_LIST
441  MAPPING_DECONLY(ksx1001)
442  MAPPING_ENCONLY(cp949)
443  MAPPING_DECONLY(cp949ext)
444END_MAPPINGS_LIST
445
446BEGIN_CODECS_LIST
447  CODEC_STATELESS(euc_kr)
448  CODEC_STATELESS(cp949)
449  CODEC_STATELESS(johab)
450END_CODECS_LIST
451
452I_AM_A_MODULE_FOR(kr)