PageRenderTime 423ms CodeModel.GetById 126ms app.highlight 109ms RepoModel.GetById 149ms app.codeStats 0ms

/Modules/cjkcodecs/_codecs_cn.c

http://unladen-swallow.googlecode.com/
C | 444 lines | 332 code | 84 blank | 28 comment | 107 complexity | 61b64ea46b65615ff2bc5368f433dcdc MD5 | raw file
  1/*
  2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
  3 *
  4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
  5 */
  6
  7#include "cjkcodecs.h"
  8#include "mappings_cn.h"
  9
/*
 * AIX predefines "hz" as the constant 100.  Undefine it there so that it
 * does not clash with the identifiers of the hz codec defined below.
 */
 14#ifdef _AIX
 15#undef hz
 16#endif
 17
/* GBK and GB2312 map a few code points differently, as listed below:
 *
 *		gb2312				gbk
 * A1A4		U+30FB KATAKANA MIDDLE DOT	U+00B7 MIDDLE DOT
 * A1AA		U+2015 HORIZONTAL BAR		U+2014 EM DASH
 * A844		undefined			U+2015 HORIZONTAL BAR
 */
 25
 26#define GBK_DECODE(dc1, dc2, assi) \
 27	if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
 28	else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
 29	else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
 30	else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \
 31	else TRYMAP_DEC(gbkext, assi, dc1, dc2);
 32
 33#define GBK_ENCODE(code, assi) \
 34	if ((code) == 0x2014) (assi) = 0xa1aa; \
 35	else if ((code) == 0x2015) (assi) = 0xa844; \
 36	else if ((code) == 0x00b7) (assi) = 0xa1a4; \
 37	else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code));
 38
 39/*
 40 * GB2312 codec
 41 */
 42
 43ENCODER(gb2312)
 44{
 45	while (inleft > 0) {
 46		Py_UNICODE c = IN1;
 47		DBCHAR code;
 48
 49		if (c < 0x80) {
 50			WRITE1((unsigned char)c)
 51			NEXT(1, 1)
 52			continue;
 53		}
 54		UCS4INVALID(c)
 55
 56		REQUIRE_OUTBUF(2)
 57		TRYMAP_ENC(gbcommon, code, c);
 58		else return 1;
 59
 60		if (code & 0x8000) /* MSB set: GBK */
 61			return 1;
 62
 63		OUT1((code >> 8) | 0x80)
 64		OUT2((code & 0xFF) | 0x80)
 65		NEXT(1, 2)
 66	}
 67
 68	return 0;
 69}
 70
 71DECODER(gb2312)
 72{
 73	while (inleft > 0) {
 74		unsigned char c = **inbuf;
 75
 76		REQUIRE_OUTBUF(1)
 77
 78		if (c < 0x80) {
 79			OUT1(c)
 80			NEXT(1, 1)
 81			continue;
 82		}
 83
 84		REQUIRE_INBUF(2)
 85		TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
 86			NEXT(2, 1)
 87		}
 88		else return 2;
 89	}
 90
 91	return 0;
 92}
 93
 94
 95/*
 96 * GBK codec
 97 */
 98
 99ENCODER(gbk)
100{
101	while (inleft > 0) {
102		Py_UNICODE c = IN1;
103		DBCHAR code;
104
105		if (c < 0x80) {
106			WRITE1((unsigned char)c)
107			NEXT(1, 1)
108			continue;
109		}
110		UCS4INVALID(c)
111
112		REQUIRE_OUTBUF(2)
113
114		GBK_ENCODE(c, code)
115		else return 1;
116
117		OUT1((code >> 8) | 0x80)
118		if (code & 0x8000)
119			OUT2((code & 0xFF)) /* MSB set: GBK */
120		else
121			OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
122		NEXT(1, 2)
123	}
124
125	return 0;
126}
127
128DECODER(gbk)
129{
130	while (inleft > 0) {
131		unsigned char c = IN1;
132
133		REQUIRE_OUTBUF(1)
134
135		if (c < 0x80) {
136			OUT1(c)
137			NEXT(1, 1)
138			continue;
139		}
140
141		REQUIRE_INBUF(2)
142
143		GBK_DECODE(c, IN2, **outbuf)
144		else return 2;
145
146		NEXT(2, 1)
147	}
148
149	return 0;
150}
151
152
153/*
154 * GB18030 codec
155 */
156
157ENCODER(gb18030)
158{
159	while (inleft > 0) {
160		ucs4_t c = IN1;
161		DBCHAR code;
162
163		if (c < 0x80) {
164			WRITE1(c)
165			NEXT(1, 1)
166			continue;
167		}
168
169		DECODE_SURROGATE(c)
170		if (c > 0x10FFFF)
171#if Py_UNICODE_SIZE == 2
172			return 2; /* surrogates pair */
173#else
174			return 1;
175#endif
176		else if (c >= 0x10000) {
177			ucs4_t tc = c - 0x10000;
178
179			REQUIRE_OUTBUF(4)
180
181			OUT4((unsigned char)(tc % 10) + 0x30)
182			tc /= 10;
183			OUT3((unsigned char)(tc % 126) + 0x81)
184			tc /= 126;
185			OUT2((unsigned char)(tc % 10) + 0x30)
186			tc /= 10;
187			OUT1((unsigned char)(tc + 0x90))
188
189#if Py_UNICODE_SIZE == 2
190			NEXT(2, 4) /* surrogates pair */
191#else
192			NEXT(1, 4)
193#endif
194			continue;
195		}
196
197		REQUIRE_OUTBUF(2)
198
199		GBK_ENCODE(c, code)
200		else TRYMAP_ENC(gb18030ext, code, c);
201		else {
202			const struct _gb18030_to_unibmp_ranges *utrrange;
203
204			REQUIRE_OUTBUF(4)
205
206			for (utrrange = gb18030_to_unibmp_ranges;
207			     utrrange->first != 0;
208			     utrrange++)
209				if (utrrange->first <= c &&
210				    c <= utrrange->last) {
211					Py_UNICODE tc;
212
213					tc = c - utrrange->first +
214					     utrrange->base;
215
216					OUT4((unsigned char)(tc % 10) + 0x30)
217					tc /= 10;
218					OUT3((unsigned char)(tc % 126) + 0x81)
219					tc /= 126;
220					OUT2((unsigned char)(tc % 10) + 0x30)
221					tc /= 10;
222					OUT1((unsigned char)tc + 0x81)
223
224					NEXT(1, 4)
225					break;
226				}
227
228			if (utrrange->first == 0)
229				return 1;
230			continue;
231		}
232
233		OUT1((code >> 8) | 0x80)
234		if (code & 0x8000)
235			OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
236		else
237			OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
238
239		NEXT(1, 2)
240	}
241
242	return 0;
243}
244
245DECODER(gb18030)
246{
247	while (inleft > 0) {
248		unsigned char c = IN1, c2;
249
250		REQUIRE_OUTBUF(1)
251
252		if (c < 0x80) {
253			OUT1(c)
254			NEXT(1, 1)
255			continue;
256		}
257
258		REQUIRE_INBUF(2)
259
260		c2 = IN2;
261		if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
262			const struct _gb18030_to_unibmp_ranges *utr;
263			unsigned char c3, c4;
264			ucs4_t lseq;
265
266			REQUIRE_INBUF(4)
267			c3 = IN3;
268			c4 = IN4;
269			if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
270				return 4;
271			c -= 0x81;  c2 -= 0x30;
272			c3 -= 0x81; c4 -= 0x30;
273
274			if (c < 4) { /* U+0080 - U+FFFF */
275				lseq = ((ucs4_t)c * 10 + c2) * 1260 +
276					(ucs4_t)c3 * 10 + c4;
277				if (lseq < 39420) {
278					for (utr = gb18030_to_unibmp_ranges;
279					     lseq >= (utr + 1)->base;
280					     utr++) ;
281					OUT1(utr->first - utr->base + lseq)
282					NEXT(4, 1)
283					continue;
284				}
285			}
286			else if (c >= 15) { /* U+10000 - U+10FFFF */
287				lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
288					* 1260 + (ucs4_t)c3 * 10 + c4;
289				if (lseq <= 0x10FFFF) {
290					WRITEUCS4(lseq);
291					NEXT_IN(4)
292					continue;
293				}
294			}
295			return 4;
296		}
297
298		GBK_DECODE(c, c2, **outbuf)
299		else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
300		else return 2;
301
302		NEXT(2, 1)
303	}
304
305	return 0;
306}
307
308
309/*
310 * HZ codec
311 */
312
313ENCODER_INIT(hz)
314{
315	state->i = 0;
316	return 0;
317}
318
319ENCODER_RESET(hz)
320{
321	if (state->i != 0) {
322		WRITE2('~', '}')
323		state->i = 0;
324		NEXT_OUT(2)
325	}
326	return 0;
327}
328
329ENCODER(hz)
330{
331	while (inleft > 0) {
332		Py_UNICODE c = IN1;
333		DBCHAR code;
334
335		if (c < 0x80) {
336			if (state->i == 0) {
337				WRITE1((unsigned char)c)
338				NEXT(1, 1)
339			}
340			else {
341				WRITE3('~', '}', (unsigned char)c)
342				NEXT(1, 3)
343				state->i = 0;
344			}
345			continue;
346		}
347
348		UCS4INVALID(c)
349
350		TRYMAP_ENC(gbcommon, code, c);
351		else return 1;
352
353		if (code & 0x8000) /* MSB set: GBK */
354			return 1;
355
356		if (state->i == 0) {
357			WRITE4('~', '{', code >> 8, code & 0xff)
358			NEXT(1, 4)
359			state->i = 1;
360		}
361		else {
362			WRITE2(code >> 8, code & 0xff)
363			NEXT(1, 2)
364		}
365	}
366
367	return 0;
368}
369
370DECODER_INIT(hz)
371{
372	state->i = 0;
373	return 0;
374}
375
376DECODER_RESET(hz)
377{
378	state->i = 0;
379	return 0;
380}
381
382DECODER(hz)
383{
384	while (inleft > 0) {
385		unsigned char c = IN1;
386
387		if (c == '~') {
388			unsigned char c2 = IN2;
389
390			REQUIRE_INBUF(2)
391			if (c2 == '~') {
392				WRITE1('~')
393				NEXT(2, 1)
394				continue;
395			}
396			else if (c2 == '{' && state->i == 0)
397				state->i = 1; /* set GB */
398			else if (c2 == '}' && state->i == 1)
399				state->i = 0; /* set ASCII */
400			else if (c2 == '\n')
401				; /* line-continuation */
402			else
403				return 2;
404			NEXT(2, 0);
405			continue;
406		}
407
408		if (c & 0x80)
409			return 1;
410
411		if (state->i == 0) { /* ASCII mode */
412			WRITE1(c)
413			NEXT(1, 1)
414		}
415		else { /* GB mode */
416			REQUIRE_INBUF(2)
417			REQUIRE_OUTBUF(1)
418			TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
419				NEXT(2, 1)
420			}
421			else
422				return 2;
423		}
424	}
425
426	return 0;
427}
428
429
/*
 * Mapping tables exported by this module.  The DECONLY / ENCONLY / ENCDEC
 * variants mirror how the codecs above use each table: gb2312 and gbkext
 * are only consulted through TRYMAP_DEC, gbcommon only through
 * TRYMAP_ENC / TRYMAP_ENC_COND, and gb18030ext in both directions.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(gb2312)
  MAPPING_DECONLY(gbkext)
  MAPPING_ENCONLY(gbcommon)
  MAPPING_ENCDEC(gb18030ext)
END_MAPPINGS_LIST
436
/*
 * Codecs exported by this module.  gb2312, gbk and gb18030 define only
 * an encoder and a decoder above and are registered as stateless; hz
 * also defines init/reset hooks that track its ASCII/GB mode in state->i
 * and is registered as stateful.
 */
BEGIN_CODECS_LIST
  CODEC_STATELESS(gb2312)
  CODEC_STATELESS(gbk)
  CODEC_STATELESS(gb18030)
  CODEC_STATEFUL(hz)
END_CODECS_LIST
443
/* Module boilerplate for the "cn" codec collection; the macro is defined
 * in cjkcodecs.h (presumably it emits the module init function -- confirm
 * there). */
I_AM_A_MODULE_FOR(cn)