/Modules/cjkcodecs/_codecs_cn.c

http://unladen-swallow.googlecode.com/ · C · 444 lines · 332 code · 84 blank · 28 comment · 107 complexity · 61b64ea46b65615ff2bc5368f433dcdc MD5 · raw file

  1. /*
  2. * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
  3. *
  4. * Written by Hye-Shik Chang <perky@FreeBSD.org>
  5. */
  6. #include "cjkcodecs.h"
  7. #include "mappings_cn.h"
  /*
   * AIX predefines `hz` as 100.  Drop that definition so it cannot clash
   * with the identifiers that belong to the hz codec below.
   */
  #if defined(_AIX)
  #undef hz
  #endif
  /*
   * GBK and GB2312 map a few code points differently:
   *
   *   bytes   gb2312                          gbk
   *   A1A4    U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
   *   A1AA    U+2015 HORIZONTAL BAR           U+2014 EM DASH
   *   A844    undefined                       U+2015 HORIZONTAL BAR
   *
   * GBK_DECODE(dc1, dc2, assi)
   *   dc1, dc2  lead and trail byte of a two-byte sequence
   *   assi      lvalue that receives the decoded code point
   *
   * The three GBK-specific byte pairs above are handled first; everything
   * else is looked up in the gb2312 table (with the high bit of both bytes
   * cleared) and then in the gbkext table (with the bytes as given).
   *
   * The expansion is an if / else-if chain WITHOUT a final else, so the
   * invocation is meant to be followed directly by `else <statement>` to
   * handle the "no mapping" case.  Arguments are evaluated more than once,
   * so they must be free of side effects.  Every use of an argument inside
   * an expression is parenthesized so that callers may pass arbitrary
   * expressions (e.g. `a | b`) without precedence surprises.
   */
  #define GBK_DECODE(dc1, dc2, assi) \
      if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
      else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
      else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
      else TRYMAP_DEC(gb2312, assi, (dc1) ^ 0x80, (dc2) ^ 0x80); \
      else TRYMAP_DEC(gbkext, assi, (dc1), (dc2));
  /*
   * GBK_ENCODE(code, assi)
   *   code  Unicode code point to encode
   *   assi  lvalue that receives the two-byte code value
   *
   * Counterpart of the GBK-specific decoding rules: U+2014, U+2015 and
   * U+00B7 are assigned fixed values, U+30FB is deliberately never matched,
   * and any other code point is looked up in the gbcommon table.
   *
   * Like the decode macro, this expands to an if / else-if chain without a
   * final else; follow the invocation with `else <statement>` to handle a
   * code point that has no mapping.  `code` is evaluated more than once.
   */
  #define GBK_ENCODE(code, assi)                                          \
      if ((code) == 0x2014) (assi) = 0xa1aa;                              \
      else if ((code) == 0x2015) (assi) = 0xa844;                         \
      else if ((code) == 0x00b7) (assi) = 0xa1a4;                         \
      else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code));
  33. /*
  34. * GB2312 codec
  35. */
  36. ENCODER(gb2312)
  37. {
  38. while (inleft > 0) {
  39. Py_UNICODE c = IN1;
  40. DBCHAR code;
  41. if (c < 0x80) {
  42. WRITE1((unsigned char)c)
  43. NEXT(1, 1)
  44. continue;
  45. }
  46. UCS4INVALID(c)
  47. REQUIRE_OUTBUF(2)
  48. TRYMAP_ENC(gbcommon, code, c);
  49. else return 1;
  50. if (code & 0x8000) /* MSB set: GBK */
  51. return 1;
  52. OUT1((code >> 8) | 0x80)
  53. OUT2((code & 0xFF) | 0x80)
  54. NEXT(1, 2)
  55. }
  56. return 0;
  57. }
  58. DECODER(gb2312)
  59. {
  60. while (inleft > 0) {
  61. unsigned char c = **inbuf;
  62. REQUIRE_OUTBUF(1)
  63. if (c < 0x80) {
  64. OUT1(c)
  65. NEXT(1, 1)
  66. continue;
  67. }
  68. REQUIRE_INBUF(2)
  69. TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
  70. NEXT(2, 1)
  71. }
  72. else return 2;
  73. }
  74. return 0;
  75. }
  76. /*
  77. * GBK codec
  78. */
  79. ENCODER(gbk)
  80. {
  81. while (inleft > 0) {
  82. Py_UNICODE c = IN1;
  83. DBCHAR code;
  84. if (c < 0x80) {
  85. WRITE1((unsigned char)c)
  86. NEXT(1, 1)
  87. continue;
  88. }
  89. UCS4INVALID(c)
  90. REQUIRE_OUTBUF(2)
  91. GBK_ENCODE(c, code)
  92. else return 1;
  93. OUT1((code >> 8) | 0x80)
  94. if (code & 0x8000)
  95. OUT2((code & 0xFF)) /* MSB set: GBK */
  96. else
  97. OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
  98. NEXT(1, 2)
  99. }
  100. return 0;
  101. }
  102. DECODER(gbk)
  103. {
  104. while (inleft > 0) {
  105. unsigned char c = IN1;
  106. REQUIRE_OUTBUF(1)
  107. if (c < 0x80) {
  108. OUT1(c)
  109. NEXT(1, 1)
  110. continue;
  111. }
  112. REQUIRE_INBUF(2)
  113. GBK_DECODE(c, IN2, **outbuf)
  114. else return 2;
  115. NEXT(2, 1)
  116. }
  117. return 0;
  118. }
  119. /*
  120. * GB18030 codec
  121. */
  122. ENCODER(gb18030)
  123. {
  124. while (inleft > 0) {
  125. ucs4_t c = IN1;
  126. DBCHAR code;
  127. if (c < 0x80) {
  128. WRITE1(c)
  129. NEXT(1, 1)
  130. continue;
  131. }
  132. DECODE_SURROGATE(c)
  133. if (c > 0x10FFFF)
  134. #if Py_UNICODE_SIZE == 2
  135. return 2; /* surrogates pair */
  136. #else
  137. return 1;
  138. #endif
  139. else if (c >= 0x10000) {
  140. ucs4_t tc = c - 0x10000;
  141. REQUIRE_OUTBUF(4)
  142. OUT4((unsigned char)(tc % 10) + 0x30)
  143. tc /= 10;
  144. OUT3((unsigned char)(tc % 126) + 0x81)
  145. tc /= 126;
  146. OUT2((unsigned char)(tc % 10) + 0x30)
  147. tc /= 10;
  148. OUT1((unsigned char)(tc + 0x90))
  149. #if Py_UNICODE_SIZE == 2
  150. NEXT(2, 4) /* surrogates pair */
  151. #else
  152. NEXT(1, 4)
  153. #endif
  154. continue;
  155. }
  156. REQUIRE_OUTBUF(2)
  157. GBK_ENCODE(c, code)
  158. else TRYMAP_ENC(gb18030ext, code, c);
  159. else {
  160. const struct _gb18030_to_unibmp_ranges *utrrange;
  161. REQUIRE_OUTBUF(4)
  162. for (utrrange = gb18030_to_unibmp_ranges;
  163. utrrange->first != 0;
  164. utrrange++)
  165. if (utrrange->first <= c &&
  166. c <= utrrange->last) {
  167. Py_UNICODE tc;
  168. tc = c - utrrange->first +
  169. utrrange->base;
  170. OUT4((unsigned char)(tc % 10) + 0x30)
  171. tc /= 10;
  172. OUT3((unsigned char)(tc % 126) + 0x81)
  173. tc /= 126;
  174. OUT2((unsigned char)(tc % 10) + 0x30)
  175. tc /= 10;
  176. OUT1((unsigned char)tc + 0x81)
  177. NEXT(1, 4)
  178. break;
  179. }
  180. if (utrrange->first == 0)
  181. return 1;
  182. continue;
  183. }
  184. OUT1((code >> 8) | 0x80)
  185. if (code & 0x8000)
  186. OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
  187. else
  188. OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
  189. NEXT(1, 2)
  190. }
  191. return 0;
  192. }
  193. DECODER(gb18030)
  194. {
  195. while (inleft > 0) {
  196. unsigned char c = IN1, c2;
  197. REQUIRE_OUTBUF(1)
  198. if (c < 0x80) {
  199. OUT1(c)
  200. NEXT(1, 1)
  201. continue;
  202. }
  203. REQUIRE_INBUF(2)
  204. c2 = IN2;
  205. if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
  206. const struct _gb18030_to_unibmp_ranges *utr;
  207. unsigned char c3, c4;
  208. ucs4_t lseq;
  209. REQUIRE_INBUF(4)
  210. c3 = IN3;
  211. c4 = IN4;
  212. if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
  213. return 4;
  214. c -= 0x81; c2 -= 0x30;
  215. c3 -= 0x81; c4 -= 0x30;
  216. if (c < 4) { /* U+0080 - U+FFFF */
  217. lseq = ((ucs4_t)c * 10 + c2) * 1260 +
  218. (ucs4_t)c3 * 10 + c4;
  219. if (lseq < 39420) {
  220. for (utr = gb18030_to_unibmp_ranges;
  221. lseq >= (utr + 1)->base;
  222. utr++) ;
  223. OUT1(utr->first - utr->base + lseq)
  224. NEXT(4, 1)
  225. continue;
  226. }
  227. }
  228. else if (c >= 15) { /* U+10000 - U+10FFFF */
  229. lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
  230. * 1260 + (ucs4_t)c3 * 10 + c4;
  231. if (lseq <= 0x10FFFF) {
  232. WRITEUCS4(lseq);
  233. NEXT_IN(4)
  234. continue;
  235. }
  236. }
  237. return 4;
  238. }
  239. GBK_DECODE(c, c2, **outbuf)
  240. else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
  241. else return 2;
  242. NEXT(2, 1)
  243. }
  244. return 0;
  245. }
  246. /*
  247. * HZ codec
  248. */
  249. ENCODER_INIT(hz)
  250. {
  251. state->i = 0;
  252. return 0;
  253. }
  254. ENCODER_RESET(hz)
  255. {
  256. if (state->i != 0) {
  257. WRITE2('~', '}')
  258. state->i = 0;
  259. NEXT_OUT(2)
  260. }
  261. return 0;
  262. }
  263. ENCODER(hz)
  264. {
  265. while (inleft > 0) {
  266. Py_UNICODE c = IN1;
  267. DBCHAR code;
  268. if (c < 0x80) {
  269. if (state->i == 0) {
  270. WRITE1((unsigned char)c)
  271. NEXT(1, 1)
  272. }
  273. else {
  274. WRITE3('~', '}', (unsigned char)c)
  275. NEXT(1, 3)
  276. state->i = 0;
  277. }
  278. continue;
  279. }
  280. UCS4INVALID(c)
  281. TRYMAP_ENC(gbcommon, code, c);
  282. else return 1;
  283. if (code & 0x8000) /* MSB set: GBK */
  284. return 1;
  285. if (state->i == 0) {
  286. WRITE4('~', '{', code >> 8, code & 0xff)
  287. NEXT(1, 4)
  288. state->i = 1;
  289. }
  290. else {
  291. WRITE2(code >> 8, code & 0xff)
  292. NEXT(1, 2)
  293. }
  294. }
  295. return 0;
  296. }
  297. DECODER_INIT(hz)
  298. {
  299. state->i = 0;
  300. return 0;
  301. }
  302. DECODER_RESET(hz)
  303. {
  304. state->i = 0;
  305. return 0;
  306. }
  307. DECODER(hz)
  308. {
  309. while (inleft > 0) {
  310. unsigned char c = IN1;
  311. if (c == '~') {
  312. unsigned char c2 = IN2;
  313. REQUIRE_INBUF(2)
  314. if (c2 == '~') {
  315. WRITE1('~')
  316. NEXT(2, 1)
  317. continue;
  318. }
  319. else if (c2 == '{' && state->i == 0)
  320. state->i = 1; /* set GB */
  321. else if (c2 == '}' && state->i == 1)
  322. state->i = 0; /* set ASCII */
  323. else if (c2 == '\n')
  324. ; /* line-continuation */
  325. else
  326. return 2;
  327. NEXT(2, 0);
  328. continue;
  329. }
  330. if (c & 0x80)
  331. return 1;
  332. if (state->i == 0) { /* ASCII mode */
  333. WRITE1(c)
  334. NEXT(1, 1)
  335. }
  336. else { /* GB mode */
  337. REQUIRE_INBUF(2)
  338. REQUIRE_OUTBUF(1)
  339. TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
  340. NEXT(2, 1)
  341. }
  342. else
  343. return 2;
  344. }
  345. }
  346. return 0;
  347. }
  348. BEGIN_MAPPINGS_LIST
  349. MAPPING_DECONLY(gb2312)
  350. MAPPING_DECONLY(gbkext)
  351. MAPPING_ENCONLY(gbcommon)
  352. MAPPING_ENCDEC(gb18030ext)
  353. END_MAPPINGS_LIST
  354. BEGIN_CODECS_LIST
  355. CODEC_STATELESS(gb2312)
  356. CODEC_STATELESS(gbk)
  357. CODEC_STATELESS(gb18030)
  358. CODEC_STATEFUL(hz)
  359. END_CODECS_LIST
  360. I_AM_A_MODULE_FOR(cn)