PageRenderTime 440ms CodeModel.GetById 181ms app.highlight 143ms RepoModel.GetById 109ms app.codeStats 0ms

/Modules/cjkcodecs/_codecs_jp.c

http://unladen-swallow.googlecode.com/
C | 731 lines | 595 code | 95 blank | 41 comment | 229 complexity | c9dfdf08e178f8207d9b5fcac37582d4 MD5 | raw file
  1/*
  2 * _codecs_jp.c: Codecs collection for Japanese encodings
  3 *
  4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
  5 */
  6
  7#define USING_BINARY_PAIR_SEARCH
  8#define EMPBASE 0x20000
  9
 10#include "cjkcodecs.h"
 11#include "mappings_jp.h"
 12#include "mappings_jisx0213_pair.h"
 13#include "alg_jisx0201.h"
 14#include "emu_jisx0213_2000.h"
 15
 16/*
 17 * CP932 codec
 18 */
 19
 20ENCODER(cp932)
 21{
	/*
	 * CP932 encoder.
	 *
	 * Each loop iteration looks at one input character and emits one or
	 * two output bytes:
	 *   - U+0000..U+0080 are written as the identical single byte;
	 *   - U+FF61..U+FF9F are written as the single byte c - 0xfec0
	 *     (0xA1..0xDF);
	 *   - U+F8F0..U+F8F3 are written as 0xA0, 0xFD, 0xFE, 0xFF
	 *     (Windows compatibility);
	 *   - anything else is tried against cp932ext, then jisxcommon,
	 *     then the user-defined area U+E000..U+E757, giving two bytes.
	 *
	 * Returns 0 when the whole input has been consumed and 1 when the
	 * current character has no CP932 representation.
	 * NOTE(review): WRITE1/REQUIRE_OUTBUF/NEXT/TRYMAP_ENC come from
	 * cjkcodecs.h; they presumably bail out on a full output buffer and
	 * advance the in/out pointers -- confirm against that header.
	 */
 22	while (inleft > 0) {
 23		Py_UNICODE c = IN1;
 24		DBCHAR code;
 25		unsigned char c1, c2;
 26
 27		if (c <= 0x80) {
 28			WRITE1((unsigned char)c)
 29			NEXT(1, 1)
 30			continue;
 31		}
 32		else if (c >= 0xff61 && c <= 0xff9f) {
 33			WRITE1(c - 0xfec0)
 34			NEXT(1, 1)
 35			continue;
 36		}
 37		else if (c >= 0xf8f0 && c <= 0xf8f3) {
 38			/* Windows compatibility */
 39			REQUIRE_OUTBUF(1)
 40			if (c == 0xf8f0)
 41				OUT1(0xa0)
 42			else
				/*
				 * Exact inverse of the decoder's
				 * "0xf8f1 - 0xfd + c": U+F8F1..U+F8F3 map
				 * to 0xFD..0xFF.  The previous constant
				 * (0xfef1) was off by 0x600 and only gave
				 * the right byte after 8-bit truncation.
				 */
 43				OUT1(c - 0xf8f1 + 0xfd)
 44			NEXT(1, 1)
 45			continue;
 46		}
 47
		/* NOTE(review): presumably rejects non-BMP code points. */
 48		UCS4INVALID(c)
 49		REQUIRE_OUTBUF(2)
 50
 51		TRYMAP_ENC(cp932ext, code, c) {
 52			OUT1(code >> 8)
 53			OUT2(code & 0xff)
 54		}
 55		else TRYMAP_ENC(jisxcommon, code, c) {
 56			if (code & 0x8000) /* MSB set: JIS X 0212 */
 57				return 1;
 58
 59			/* JIS X 0208 */
 60			c1 = code >> 8;
 61			c2 = code & 0xff;
			/*
			 * Two consecutive rows share one lead byte: rows
			 * at an odd offset from 0x21 use the upper part of
			 * the trail range (+0x5e).
			 */
 62			c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
 63			c1 = (c1 - 0x21) >> 1;
			/* Lead bytes 0x81.. then 0xe0..; trail bytes skip 0x7f. */
 64			OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
 65			OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
 66		}
 67		else if (c >= 0xe000 && c < 0xe758) {
 68			/* User-defined area */
			/* 188 characters per lead byte, lead bytes from 0xf0. */
 69			c1 = (Py_UNICODE)(c - 0xe000) / 188;
 70			c2 = (Py_UNICODE)(c - 0xe000) % 188;
 71			OUT1(c1 + 0xf0)
 72			OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
 73		}
 74		else
 75			return 1;
 76
 77		NEXT(1, 2)
 78	}
 79
 80	return 0;
 81}
 82
 83DECODER(cp932)
 84{
	/*
	 * CP932 decoder.
	 *
	 * Single-byte input:
	 *   - 0x00..0x80 decode to the same code point;
	 *   - 0xA0 decodes to U+F8F0 and 0xFD..0xFF to U+F8F1..U+F8F3
	 *     (Windows compatibility);
	 *   - 0xA1..0xDF decode to 0xfec0 + c (half-width katakana).
	 * Everything else is treated as a two-byte sequence: cp932ext is
	 * tried first, then JIS X 0208 (lead 0x81..0x9f / 0xe0..0xea), then
	 * the user-defined area (lead 0xf0..0xf9).
	 *
	 * Returns 0 when all input has been consumed and 2 for a two-byte
	 * sequence that cannot be decoded.
	 * NOTE(review): REQUIRE_OUTBUF/REQUIRE_INBUF/NEXT/TRYMAP_DEC are
	 * defined in cjkcodecs.h; they presumably return early when a
	 * buffer is exhausted -- confirm there.
	 */
 85	while (inleft > 0) {
 86		unsigned char c = IN1, c2;
 87
 88		REQUIRE_OUTBUF(1)
 89		if (c <= 0x80) {
 90			OUT1(c)
 91			NEXT(1, 1)
 92			continue;
 93		}
 94		else if (c >= 0xa0 && c <= 0xdf) {
 95			if (c == 0xa0)
 96				OUT1(0xf8f0) /* Windows compatibility */
 97			else
 98				OUT1(0xfec0 + c) /* half-width katakana */
 99			NEXT(1, 1)
100			continue;
101		}
102		else if (c >= 0xfd/* && c <= 0xff*/) {
103			/* Windows compatibility */
104			OUT1(0xf8f1 - 0xfd + c)
105			NEXT(1, 1)
106			continue;
107		}
108
109		REQUIRE_INBUF(2)
110		c2 = IN2;
111
		/*
		 * The ';' is the empty body of the conditional that
		 * TRYMAP_DEC presumably expands to: on a hit the result is
		 * already stored in **outbuf and the else-chain is skipped.
		 */
112		TRYMAP_DEC(cp932ext, **outbuf, c, c2);
113		else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
			/* Valid trail bytes are 0x40..0x7e and 0x80..0xfc. */
114			if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
115				return 2;
116
			/*
			 * Rewrite c/c2 in place from Shift_JIS lead/trail
			 * bytes to a JIS X 0208 row/cell pair (both based
			 * at 0x21); each lead byte covers two rows.
			 */
117			c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
118			c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
119			c = (2 * c + (c2 < 0x5e ? 0 : 1) + 0x21);
120			c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
121
122			TRYMAP_DEC(jisx0208, **outbuf, c, c2);
123			else return 2;
124		}
125		else if (c >= 0xf0 && c <= 0xf9) {
			/*
			 * User-defined area: 188 code points per lead
			 * byte, starting at U+E000 (inverse of the
			 * encoder's /188 and %188 split).
			 */
126			if ((c2 >= 0x40 && c2 <= 0x7e) ||
127			    (c2 >= 0x80 && c2 <= 0xfc))
128				OUT1(0xe000 + 188 * (c - 0xf0) +
129				     (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
130			else
131				return 2;
132		}
133		else
134			return 2;
135
136		NEXT(2, 1)
137	}
138
139	return 0;
140}
141
142
143/*
144 * EUC-JIS-2004 codec
145 */
146
147ENCODER(euc_jis_2004)
148{
	/*
	 * EUC-JIS-2004 encoder.
	 *
	 * c < 0x80 is copied as one byte.  Other characters up to U+FFFF
	 * are tried, in order, against the EMULATE_JISX0213_2000_ENCODE_BMP
	 * hook, jisx0213_bmp, jisxcommon, the half-width katakana range and
	 * two full-width fallbacks.  Characters whose upper bits match
	 * EMPBASE are tried against jisx0213_emp.
	 *
	 * The resulting 16-bit code is written as two bytes with the high
	 * bit set (codeset 1) or, when bit 15 of the code is set, as 0x8f
	 * plus two bytes (codeset 2).
	 *
	 * Returns 0 when all input has been consumed, 1 or insize when the
	 * character has no mapping, and MBERR_TOOFEW when a character that
	 * may start a two-character sequence is the last one available and
	 * MBENC_FLUSH is not set.
	 * NOTE(review): the helper macros live in cjkcodecs.h and
	 * emu_jisx0213_2000.h; the return-value convention is presumably
	 * "length of the offending input" -- confirm there.
	 */
149	while (inleft > 0) {
150		ucs4_t c = IN1;
151		DBCHAR code;
152		Py_ssize_t insize;
153
154		if (c < 0x80) {
155			WRITE1(c)
156			NEXT(1, 1)
157			continue;
158		}
159
		/*
		 * NOTE(review): presumably merges a surrogate pair into c;
		 * insize is then the number of input units c occupies.
		 */
160		DECODE_SURROGATE(c)
161		insize = GET_INSIZE(c);
162
163		if (c <= 0xFFFF) {
164			EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
165			else TRYMAP_ENC(jisx0213_bmp, code, c) {
				/*
				 * MULTIC: c may combine with the following
				 * character.  Look up the pair first and
				 * fall back to c alone (second key 0).
				 */
166				if (code == MULTIC) {
167					if (inleft < 2) {
						/* No following character available. */
168						if (flags & MBENC_FLUSH) {
169							code = find_pairencmap(
170							    (ucs2_t)c, 0,
171							  jisx0213_pair_encmap,
172							    JISX0213_ENCPAIRS);
173							if (code == DBCINV)
174								return 1;
175						}
176						else
177							return MBERR_TOOFEW;
178					}
179					else {
180						code = find_pairencmap(
181							(ucs2_t)c, (*inbuf)[1],
182							jisx0213_pair_encmap,
183							JISX0213_ENCPAIRS);
184						if (code == DBCINV) {
185							code = find_pairencmap(
186							    (ucs2_t)c, 0,
187							  jisx0213_pair_encmap,
188							    JISX0213_ENCPAIRS);
189							if (code == DBCINV)
190								return 1;
191						} else
							/* The pair matched: consume both characters. */
192							insize = 2;
193					}
194				}
195			}
196			else TRYMAP_ENC(jisxcommon, code, c);
197			else if (c >= 0xff61 && c <= 0xff9f) {
198				/* JIS X 0201 half-width katakana */
199				WRITE2(0x8e, c - 0xfec0)
200				NEXT(1, 2)
201				continue;
202			}
203			else if (c == 0xff3c)
204				/* F/W REVERSE SOLIDUS (see NOTES) */
205				code = 0x2140;
206			else if (c == 0xff5e)
207				/* F/W TILDE (see NOTES) */
208				code = 0x2232;
209			else
210				return 1;
211		}
		/* Same upper 16 bits as EMPBASE (0x20000). */
212		else if (c >> 16 == EMPBASE >> 16) {
213			EMULATE_JISX0213_2000_ENCODE_EMP(code, c)
214			else TRYMAP_ENC(jisx0213_emp, code, c & 0xffff);
215			else return insize;
216		}
217		else
218			return insize;
219
220		if (code & 0x8000) {
221			/* Codeset 2 */
222			WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
223			NEXT(insize, 3)
224		} else {
225			/* Codeset 1 */
226			WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
227			NEXT(insize, 2)
228		}
229	}
230
231	return 0;
232}
233
234DECODER(euc_jis_2004)
235{
	/*
	 * EUC-JIS-2004 decoder.
	 *
	 * Dispatches on the first byte:
	 *   - < 0x80: copied as is;
	 *   - 0x8e:   two-byte half-width katakana (second byte
	 *             0xa1..0xdf decodes to 0xfec0 + c2);
	 *   - 0x8f:   three-byte sequence, JIS X 0213 plane 2 or
	 *             JIS X 0212;
	 *   - other:  two-byte sequence, JIS X 0213 plane 1.
	 *
	 * Returns 0 when all input has been consumed, otherwise 2 or 3 for
	 * a two- or three-byte sequence that cannot be decoded.
	 * NOTE(review): buffer and map macros come from cjkcodecs.h and
	 * emu_jisx0213_2000.h and presumably return early on buffer
	 * exhaustion -- confirm there.
	 */
236	while (inleft > 0) {
237		unsigned char c = IN1;
238		ucs4_t code;
239
240		REQUIRE_OUTBUF(1)
241
242		if (c < 0x80) {
243			OUT1(c)
244			NEXT(1, 1)
245			continue;
246		}
247
248		if (c == 0x8e) {
249			/* JIS X 0201 half-width katakana */
250			unsigned char c2;
251
252			REQUIRE_INBUF(2)
253			c2 = IN2;
254			if (c2 >= 0xa1 && c2 <= 0xdf) {
255				OUT1(0xfec0 + c2)
256				NEXT(2, 1)
257			}
258			else
259				return 2;
260		}
261		else if (c == 0x8f) {
262			unsigned char c2, c3;
263
264			REQUIRE_INBUF(3)
			/* Flip the high bit of both trail bytes before lookup. */
265			c2 = IN2 ^ 0x80;
266			c3 = IN3 ^ 0x80;
267
268			/* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */
269			EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, c2, c3)
270			else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, c2, c3) ;
271			else TRYMAP_DEC(jisx0213_2_emp, code, c2, c3) {
				/*
				 * NOTE(review): WRITEUCS4 presumably
				 * advances the output itself, which is why
				 * only the input is advanced here.
				 */
272				WRITEUCS4(EMPBASE | code)
273				NEXT_IN(3)
274				continue;
275			}
276			else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
277			else return 3;
278			NEXT(3, 1)
279		}
280		else {
281			unsigned char c2;
282
283			REQUIRE_INBUF(2)
			/* Flip the high bit of both bytes before lookup. */
284			c ^= 0x80;
285			c2 = IN2 ^ 0x80;
286
287			/* JIS X 0213 Plane 1 */
288			EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, c, c2)
			/*
			 * 0x2140 and 0x2232 decode to the full-width forms,
			 * matching the encoder's U+FF3C / U+FF5E fallbacks.
			 */
289			else if (c == 0x21 && c2 == 0x40) **outbuf = 0xff3c;
290			else if (c == 0x22 && c2 == 0x32) **outbuf = 0xff5e;
291			else TRYMAP_DEC(jisx0208, **outbuf, c, c2);
292			else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, c, c2);
293			else TRYMAP_DEC(jisx0213_1_emp, code, c, c2) {
294				WRITEUCS4(EMPBASE | code)
295				NEXT_IN(2)
296				continue;
297			}
298			else TRYMAP_DEC(jisx0213_pair, code, c, c2) {
				/*
				 * A pair entry packs two output characters:
				 * upper 16 bits first, lower 16 bits second.
				 */
299				WRITE2(code >> 16, code & 0xffff)
300				NEXT(2, 2)
301				continue;
302			}
303			else return 2;
304			NEXT(2, 1)
305		}
306	}
307
308	return 0;
309}
310
311
312/*
313 * EUC-JP codec
314 */
315
316ENCODER(euc_jp)
317{
	/*
	 * EUC-JP encoder.
	 *
	 * c < 0x80 is copied as one byte.  Other characters are looked up
	 * in jisxcommon; U+FF61..U+FF9F are written as 0x8e followed by
	 * c - 0xfec0.  Unless STRICT_BUILD is defined, U+FF3C is encoded as
	 * 0x2140, U+00A5 as the single byte 0x5c and U+203E as the single
	 * byte 0x7e.
	 *
	 * A code with bit 15 set is written as 0x8f plus two bytes
	 * (JIS X 0212); any other code as two bytes with the high bit set
	 * (JIS X 0208).
	 *
	 * Returns 0 when all input has been consumed and 1 when the
	 * character has no mapping.
	 * NOTE(review): WRITEn/NEXT/TRYMAP_ENC/UCS4INVALID are defined in
	 * cjkcodecs.h -- confirm their early-return behaviour there.
	 */
318	while (inleft > 0) {
319		Py_UNICODE c = IN1;
320		DBCHAR code;
321
322		if (c < 0x80) {
323			WRITE1((unsigned char)c)
324			NEXT(1, 1)
325			continue;
326		}
327
		/* NOTE(review): presumably rejects non-BMP code points. */
328		UCS4INVALID(c)
329
		/* On a hit, code is set and the else-chain below is skipped. */
330		TRYMAP_ENC(jisxcommon, code, c);
331		else if (c >= 0xff61 && c <= 0xff9f) {
332			/* JIS X 0201 half-width katakana */
333			WRITE2(0x8e, c - 0xfec0)
334			NEXT(1, 2)
335			continue;
336		}
337#ifndef STRICT_BUILD
338		else if (c == 0xff3c) /* FULL-WIDTH REVERSE SOLIDUS */
339			code = 0x2140;
340		else if (c == 0xa5) { /* YEN SIGN */
341			WRITE1(0x5c);
342			NEXT(1, 1)
343			continue;
344		} else if (c == 0x203e) { /* OVERLINE */
345			WRITE1(0x7e);
346			NEXT(1, 1)
347			continue;
348		}
349#endif
350		else
351			return 1;
352
353		if (code & 0x8000) {
354			/* JIS X 0212 */
355			WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
356			NEXT(1, 3)
357		} else {
358			/* JIS X 0208 */
359			WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
360			NEXT(1, 2)
361		}
362	}
363
364	return 0;
365}
366
367DECODER(euc_jp)
368{
	/*
	 * EUC-JP decoder.
	 *
	 * Dispatches on the first byte:
	 *   - < 0x80: copied as is;
	 *   - 0x8e:   two-byte half-width katakana (second byte
	 *             0xa1..0xdf decodes to 0xfec0 + c2);
	 *   - 0x8f:   three-byte JIS X 0212 sequence;
	 *   - other:  two-byte JIS X 0208 sequence.
	 * Table lookups use the bytes with their high bit flipped (^ 0x80).
	 *
	 * Returns 0 when all input has been consumed, otherwise 2 or 3 for
	 * a two- or three-byte sequence that cannot be decoded.
	 * NOTE(review): REQUIRE_*/NEXT/TRYMAP_DEC are defined in
	 * cjkcodecs.h -- confirm their early-return behaviour there.
	 */
369	while (inleft > 0) {
370		unsigned char c = IN1;
371
372		REQUIRE_OUTBUF(1)
373
374			if (c < 0x80) {
375				OUT1(c)
376				NEXT(1, 1)
377				continue;
378			}
379
380		if (c == 0x8e) {
381			/* JIS X 0201 half-width katakana */
382			unsigned char c2;
383
384			REQUIRE_INBUF(2)
385			c2 = IN2;
386			if (c2 >= 0xa1 && c2 <= 0xdf) {
387				OUT1(0xfec0 + c2)
388				NEXT(2, 1)
389			}
390			else
391				return 2;
392		}
393		else if (c == 0x8f) {
394			unsigned char c2, c3;
395
396			REQUIRE_INBUF(3)
397			c2 = IN2;
398			c3 = IN3;
399			/* JIS X 0212 */
400			TRYMAP_DEC(jisx0212, **outbuf, c2 ^ 0x80, c3 ^ 0x80) {
401				NEXT(3, 1)
402			}
403			else
404				return 3;
405		}
406		else {
407			unsigned char c2;
408
409			REQUIRE_INBUF(2)
410			c2 = IN2;
411			/* JIS X 0208 */
412#ifndef STRICT_BUILD
			/*
			 * 0xa1 0xc0 is 0x2140 with the high bits set; it
			 * decodes to U+FF3C, mirroring the encoder's
			 * non-strict fallback.
			 */
413			if (c == 0xa1 && c2 == 0xc0)
414				/* FULL-WIDTH REVERSE SOLIDUS */
415				**outbuf = 0xff3c;
416			else
417#endif
418				TRYMAP_DEC(jisx0208, **outbuf,
419					   c ^ 0x80, c2 ^ 0x80) ;
420			else return 2;
421			NEXT(2, 1)
422		}
423	}
424
425	return 0;
426}
427
428
429/*
430 * SHIFT_JIS codec
431 */
432
433ENCODER(shift_jis)
434{
	/*
	 * Shift_JIS encoder.
	 *
	 * First tries to obtain a single-byte code: in a non-strict build
	 * c < 0x80 maps to itself, U+00A5 to 0x5c and U+203E to 0x7e; a
	 * STRICT_BUILD uses JISX0201_R_ENCODE instead.  JISX0201_K_ENCODE
	 * is then tried, and anything left is marked NOCHAR.
	 *
	 * Codes below 0x80 or in 0xa1..0xdf are written as one byte.
	 * Otherwise the character is looked up in jisxcommon (with a
	 * non-strict fallback of U+FF3C to 0x2140) and the JIS X 0208 code
	 * is folded into two Shift_JIS bytes.
	 *
	 * Returns 0 when all input has been consumed and 1 when the
	 * character has no mapping (including JIS X 0212-only characters).
	 * NOTE(review): the JISX0201_* macros come from alg_jisx0201.h and
	 * the rest from cjkcodecs.h; several presumably expand to if/else
	 * fragments that the "else" tokens below chain onto -- confirm.
	 */
435	while (inleft > 0) {
436		Py_UNICODE c = IN1;
437		DBCHAR code;
438		unsigned char c1, c2;
439
440#ifdef STRICT_BUILD
441		JISX0201_R_ENCODE(c, code)
442#else
443		if (c < 0x80) code = c;
444		else if (c == 0x00a5) code = 0x5c; /* YEN SIGN */
445		else if (c == 0x203e) code = 0x7e; /* OVERLINE */
446#endif
447		else JISX0201_K_ENCODE(c, code)
448		else UCS4INVALID(c)
449		else code = NOCHAR;
450
451		if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
452			REQUIRE_OUTBUF(1)
453
454			OUT1((unsigned char)code)
455			NEXT(1, 1)
456			continue;
457		}
458
459		REQUIRE_OUTBUF(2)
460
461		if (code == NOCHAR) {
462			TRYMAP_ENC(jisxcommon, code, c);
463#ifndef STRICT_BUILD
464			else if (c == 0xff3c)
465				code = 0x2140; /* FULL-WIDTH REVERSE SOLIDUS */
466#endif
467			else
468				return 1;
469
470			if (code & 0x8000) /* MSB set: JIS X 0212 */
471				return 1;
472		}
473
		/*
		 * Fold the JIS X 0208 row/cell into lead/trail bytes: two
		 * rows share one lead byte, rows at an odd offset from 0x21
		 * use the upper part of the trail range (+0x5e), and trail
		 * bytes skip 0x7f.
		 */
474		c1 = code >> 8;
475		c2 = code & 0xff;
476		c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
477		c1 = (c1 - 0x21) >> 1;
478		OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
479		OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
480		NEXT(1, 2)
481	}
482
483	return 0;
484}
485
486DECODER(shift_jis)
487{
	/*
	 * Shift_JIS decoder.
	 *
	 * Single bytes are handled first: in a non-strict build c < 0x80
	 * decodes to itself (a STRICT_BUILD uses JISX0201_R_DECODE), then
	 * JISX0201_K_DECODE is tried.  Lead bytes 0x81..0x9f and
	 * 0xe0..0xea start a two-byte sequence that is converted to a
	 * JIS X 0208 row/cell pair and looked up in jisx0208.
	 *
	 * Returns 0 when all input has been consumed and 2 for an invalid
	 * lead byte or an undecodable two-byte sequence.
	 * NOTE(review): the JISX0201_* macros come from alg_jisx0201.h and
	 * presumably expand to if/else fragments that the "else" tokens
	 * below chain onto -- confirm.
	 */
488	while (inleft > 0) {
489		unsigned char c = IN1;
490
491		REQUIRE_OUTBUF(1)
492
493#ifdef STRICT_BUILD
494		JISX0201_R_DECODE(c, **outbuf)
495#else
496		if (c < 0x80) **outbuf = c;
497#endif
498		else JISX0201_K_DECODE(c, **outbuf)
499		else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
500			unsigned char c1, c2;
501
502			REQUIRE_INBUF(2)
503			c2 = IN2;
			/* Valid trail bytes are 0x40..0x7e and 0x80..0xfc. */
504			if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
505				return 2;
506
			/*
			 * Convert lead/trail bytes to a JIS X 0208 row/cell
			 * pair (both based at 0x21); each lead byte covers
			 * two rows.
			 */
507			c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
508			c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
509			c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1) + 0x21);
510			c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
511
512#ifndef STRICT_BUILD
513			if (c1 == 0x21 && c2 == 0x40) {
514				/* FULL-WIDTH REVERSE SOLIDUS */
515				OUT1(0xff3c)
516				NEXT(2, 1)
517				continue;
518			}
519#endif
520			TRYMAP_DEC(jisx0208, **outbuf, c1, c2) {
521				NEXT(2, 1)
522				continue;
523			}
524			else
525				return 2;
526		}
527		else
528			return 2;
529
		/* Only the single-byte branches fall through to here. */
530		NEXT(1, 1) /* JIS X 0201 */
531	}
532
533	return 0;
534}
535
536
537/*
538 * SHIFT_JIS-2004 codec
539 */
540
541ENCODER(shift_jis_2004)
542{
	/*
	 * Shift_JIS-2004 encoder.
	 *
	 * JISX0201_ENCODE is tried first; a resulting code below 0x80 or in
	 * 0xa1..0xdf is written as one byte.  Otherwise characters up to
	 * U+FFFF are tried against the EMULATE_JISX0213_2000_ENCODE_BMP
	 * hook, jisx0213_bmp and jisxcommon (JIS X 0212 codes are
	 * rejected), and characters whose upper bits match EMPBASE against
	 * jisx0213_emp.  The code's row is then renumbered and folded into
	 * two Shift_JIS bytes.
	 *
	 * Returns 0 when all input has been consumed, 1 or insize when the
	 * character has no mapping, and MBERR_TOOFEW when a character that
	 * may start a two-character sequence is the last one available and
	 * MBENC_FLUSH is not set.
	 * NOTE(review): helper macros come from cjkcodecs.h,
	 * alg_jisx0201.h and emu_jisx0213_2000.h; several presumably
	 * expand to if/else fragments -- confirm there.
	 */
543	while (inleft > 0) {
544		ucs4_t c = IN1;
545		DBCHAR code = NOCHAR;
546		int c1, c2;
547		Py_ssize_t insize;
548
549		JISX0201_ENCODE(c, code)
550		else DECODE_SURROGATE(c)
551
552		if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
553			WRITE1((unsigned char)code)
554			NEXT(1, 1)
555			continue;
556		}
557
558		REQUIRE_OUTBUF(2)
559		insize = GET_INSIZE(c);
560
561		if (code == NOCHAR) {
562			if (c <= 0xffff) {
563				EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
564				else TRYMAP_ENC(jisx0213_bmp, code, c) {
					/*
					 * MULTIC: c may combine with the
					 * following character.  Look up the
					 * pair first and fall back to c
					 * alone (second key 0).
					 */
565					if (code == MULTIC) {
566						if (inleft < 2) {
567						    if (flags & MBENC_FLUSH) {
568							code = find_pairencmap
569							    ((ucs2_t)c, 0,
570							  jisx0213_pair_encmap,
571							    JISX0213_ENCPAIRS);
572							if (code == DBCINV)
573							    return 1;
574						    }
575						    else
576							    return MBERR_TOOFEW;
577						}
578						else {
579						    code = find_pairencmap(
580							    (ucs2_t)c, IN2,
581							  jisx0213_pair_encmap,
582							    JISX0213_ENCPAIRS);
583						    if (code == DBCINV) {
584							code = find_pairencmap(
585							    (ucs2_t)c, 0,
586							  jisx0213_pair_encmap,
587							    JISX0213_ENCPAIRS);
588							if (code == DBCINV)
589							    return 1;
590							}
591							else
							    /* The pair matched: consume both characters. */
592							    insize = 2;
593						}
594					}
595				}
596				else TRYMAP_ENC(jisxcommon, code, c) {
597					/* abandon JIS X 0212 codes */
598					if (code & 0x8000)
599						return 1;
600				}
601				else return 1;
602			}
603			else if (c >> 16 == EMPBASE >> 16) {
604				EMULATE_JISX0213_2000_ENCODE_EMP(code, c)
605				else TRYMAP_ENC(jisx0213_emp, code, c&0xffff);
606				else return insize;
607			}
608			else
609				return insize;
610		}
611
612		c1 = code >> 8;
613		c2 = (code & 0xff) - 0x21;
614
		/*
		 * Renumber the row to a zero-based index.  Plane 2 rows
		 * (high bit set) are shifted by one of three offsets; the
		 * decoder for this codec applies the inverse adjustments.
		 */
615		if (c1 & 0x80) { /* Plane 2 */
616			if (c1 >= 0xee) c1 -= 0x87;
617			else if (c1 >= 0xac || c1 == 0xa8) c1 -= 0x49;
618			else c1 -= 0x43;
619		}
620		else /* Plane 1 */
621			c1 -= 0x21;
622
		/*
		 * Two rows share one lead byte; odd rows use the upper part
		 * of the trail range (+0x5e) and trail bytes skip 0x7f.
		 */
623		if (c1 & 1) c2 += 0x5e;
624		c1 >>= 1;
625		OUT1(c1 + (c1 < 0x1f ? 0x81 : 0xc1))
626		OUT2(c2 + (c2 < 0x3f ? 0x40 : 0x41))
627
628		NEXT(insize, 2)
629	}
630
631	return 0;
632}
633
634DECODER(shift_jis_2004)
635{
	/*
	 * Shift_JIS-2004 decoder.
	 *
	 * JISX0201_DECODE handles single bytes.  Lead bytes 0x81..0x9f and
	 * 0xe0..0xfc start a two-byte sequence whose zero-based row index
	 * selects plane 1 (index < 0x5e) or plane 2; the pair is then
	 * looked up in the matching JIS X 0208 / JIS X 0213 tables.
	 *
	 * Returns 0 when all input has been consumed and 2 for an invalid
	 * lead byte or an undecodable two-byte sequence.
	 * NOTE(review): helper macros come from cjkcodecs.h,
	 * alg_jisx0201.h and emu_jisx0213_2000.h; several presumably
	 * expand to if/else fragments -- confirm there.
	 */
636	while (inleft > 0) {
637		unsigned char c = IN1;
638
639		REQUIRE_OUTBUF(1)
640		JISX0201_DECODE(c, **outbuf)
641		else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){
642			unsigned char c1, c2;
643			ucs4_t code;
644
645			REQUIRE_INBUF(2)
646			c2 = IN2;
			/* Valid trail bytes are 0x40..0x7e and 0x80..0xfc. */
647			if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
648				return 2;
649
			/*
			 * c1 becomes a zero-based row index (two rows per
			 * lead byte); c2 becomes the cell, based at 0x21.
			 */
650			c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
651			c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
652			c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1));
653			c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
654
655			if (c1 < 0x5e) { /* Plane 1 */
656				c1 += 0x21;
				/*
				 * Each hit advances the output by its own
				 * amount (WRITEUCS4 presumably does so
				 * internally -- confirm); the shared
				 * NEXT_IN(2) after the chain advances the
				 * input.
				 */
657				EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf,
658						c1, c2)
659				else TRYMAP_DEC(jisx0208, **outbuf, c1, c2) {
660					NEXT_OUT(1)
661				}
662				else TRYMAP_DEC(jisx0213_1_bmp, **outbuf,
663						c1, c2) {
664					NEXT_OUT(1)
665				}
666				else TRYMAP_DEC(jisx0213_1_emp, code, c1, c2) {
667					WRITEUCS4(EMPBASE | code)
668				}
669				else TRYMAP_DEC(jisx0213_pair, code, c1, c2) {
					/*
					 * A pair entry packs two output
					 * characters: upper 16 bits first,
					 * lower 16 bits second.
					 */
670					WRITE2(code >> 16, code & 0xffff)
671					NEXT_OUT(2)
672				}
673				else
674					return 2;
675				NEXT_IN(2)
676			}
677			else { /* Plane 2 */
				/*
				 * Map the row index back to a plane 2 row
				 * number; inverse of the three offsets used
				 * by the encoder for this codec.
				 */
678				if (c1 >= 0x67) c1 += 0x07;
679				else if (c1 >= 0x63 || c1 == 0x5f) c1 -= 0x37;
680				else c1 -= 0x3d;
681
682				EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf,
683						c1, c2)
684				else TRYMAP_DEC(jisx0213_2_bmp, **outbuf,
685						c1, c2) ;
686				else TRYMAP_DEC(jisx0213_2_emp, code, c1, c2) {
687					WRITEUCS4(EMPBASE | code)
688					NEXT_IN(2)
689					continue;
690				}
691				else
692					return 2;
693				NEXT(2, 1)
694			}
695			continue;
696		}
697		else
698			return 2;
699
		/* Only the single-byte branch falls through to here. */
700		NEXT(1, 1) /* JIS X 0201 */
701	}
702
703	return 0;
704}
705
706
707BEGIN_MAPPINGS_LIST
/*
 * Mapping tables exported by this module.  The DECONLY / ENCONLY / ENCDEC
 * choice matches how each table is used in this file: jisx0208, jisx0212
 * and the jisx0213_{1,2}_{bmp,emp} tables appear only in TRYMAP_DEC calls,
 * jisxcommon, jisx0213_bmp and jisx0213_emp only in TRYMAP_ENC calls, and
 * jisx0213_pair and cp932ext in both directions.
 * NOTE(review): the list macros are defined in cjkcodecs.h -- confirm what
 * they expand to there.
 */
708  MAPPING_DECONLY(jisx0208)
709  MAPPING_DECONLY(jisx0212)
710  MAPPING_ENCONLY(jisxcommon)
711  MAPPING_DECONLY(jisx0213_1_bmp)
712  MAPPING_DECONLY(jisx0213_2_bmp)
713  MAPPING_ENCONLY(jisx0213_bmp)
714  MAPPING_DECONLY(jisx0213_1_emp)
715  MAPPING_DECONLY(jisx0213_2_emp)
716  MAPPING_ENCONLY(jisx0213_emp)
717  MAPPING_ENCDEC(jisx0213_pair)
718  MAPPING_ENCDEC(cp932ext)
719END_MAPPINGS_LIST
720
721BEGIN_CODECS_LIST
/*
 * Codecs exported by this module; each name has an ENCODER()/DECODER()
 * pair defined above.
 */
722  CODEC_STATELESS(shift_jis)
723  CODEC_STATELESS(cp932)
724  CODEC_STATELESS(euc_jp)
725  CODEC_STATELESS(shift_jis_2004)
726  CODEC_STATELESS(euc_jis_2004)
/*
 * The two "*_jisx0213" names reuse the *_2004 encoder/decoder functions
 * with (void *)2000 as the second field.
 * NOTE(review): presumably this value is the per-codec config that the
 * EMULATE_JISX0213_2000_* macros test to select JIS X 0213:2000
 * behaviour -- confirm in emu_jisx0213_2000.h.
 */
727  { "euc_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(euc_jis_2004) },
728  { "shift_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(shift_jis_2004) },
729END_CODECS_LIST
730
731I_AM_A_MODULE_FOR(jp) /* NOTE(review): presumably expands to the module init boilerplate for the "jp" codec set; macro is defined in cjkcodecs.h -- confirm */