PageRenderTime 65ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/src/encoding.cc

https://bitbucket.org/mumurik/xyzzy
C++ | 2739 lines | 2464 code | 271 blank | 4 comment | 644 complexity | 1c9d35cd81e98f80ee2c90b77eccf817 MD5 | raw file

Large files are truncated, but you can click here to view the full file

  1. #include "ed.h"
  2. #include "encoding.h"
  3. #include "ibmext.h"
  4. #include "utf2sjis.h"
  5. u_char escseq_euckr[] = {ccs_usascii, ccs_ksc5601, ccs_invalid, ccs_invalid};
  6. u_char escseq_eucgb[] = {ccs_usascii, ccs_gb2312, ccs_invalid, ccs_invalid};
  7. u_int designatable_any[] = {u_int (-1), u_int (-1), u_int (-1), u_int (-1)};
  8. const Char *
  9. cjk_translate_table (int lang)
  10. {
  11. switch (lang)
  12. {
  13. case ENCODING_LANG_JP:
  14. case ENCODING_LANG_JP2:
  15. return wc2cp932_table;
  16. case ENCODING_LANG_KR:
  17. init_wc2ksc5601_table ();
  18. return wc2ksc5601_table;
  19. case ENCODING_LANG_CN_GB:
  20. init_wc2gb2312_table ();
  21. return wc2gb2312_table;
  22. case ENCODING_LANG_CN_BIG5:
  23. init_wc2big5_table ();
  24. return wc2big5_table;
  25. case ENCODING_LANG_CN:
  26. init_wc2gb2312_table ();
  27. init_wc2big5_table ();
  28. return 0;
  29. default:
  30. return 0;
  31. }
  32. }
  33. int
  34. xbuffered_read_stream::refill ()
  35. {
  36. do
  37. {
  38. begin ();
  39. refill_internal ();
  40. }
  41. while (head () == tail () && head () != base ());
  42. return setbuf (head (), tail ());
  43. }
  44. void
  45. sjis_to_internal_stream::refill_internal ()
  46. {
  47. while (room () > 0)
  48. {
  49. int c1 = s_in.get ();
  50. if (c1 == eof)
  51. break;
  52. if (SJISP (c1))
  53. {
  54. int c2 = s_in.get ();
  55. if (c2 != eof)
  56. c1 = (c1 << 8) + c2;
  57. }
  58. put (c1);
  59. }
  60. }
  61. void
  62. fast_sjis_to_internal_stream::refill_internal ()
  63. {
  64. const u_char *rs, *rse;
  65. Char *rd, *rde;
  66. s_in.begin_direct_input (rs, rse);
  67. begin_direct_output (rd, rde);
  68. const u_char *s = rs, *const se = rse;
  69. Char *d = rd, *const de = rde;
  70. for (; d < de && s < se; d++)
  71. {
  72. int c1 = *s++;
  73. if (SJISP (c1) && s < se)
  74. c1 = (c1 << 8) + *s++;
  75. *d = c1;
  76. }
  77. s_in.end_direct_input (s);
  78. end_direct_output (d);
  79. }
  80. Char
  81. jisx0212_to_internal (int c1, int c2, int vender)
  82. {
  83. if (vender == ENCODING_ISO_VENDER_OSFJVC && c1 >= 0x75)
  84. {
  85. c1 += 20;
  86. return (j2sh (c1, c2) << 8) | j2sl (c1, c2);
  87. }
  88. if (vender != ENCODING_ISO_VENDER_OSFJVC
  89. && vender != ENCODING_ISO_VENDER_IBMEXT)
  90. return jisx0212_to_int (c1, c2);
  91. if (c1 == 0x74)
  92. return ibmext_eucjp2sjis_table[c2 - (0x21 - (0x7f - 0x73))];
  93. if (c1 == 0x73 && c2 >= 0x73)
  94. return ibmext_eucjp2sjis_table[c2 - 0x73];
  95. Char cc = jisx0212_to_int (c1, c2);
  96. Char t = w2i (i2w (cc));
  97. if (t != Char (-1))
  98. return t;
  99. return cc;
  100. }
  101. iso2022_noesc_to_internal_stream::iso2022_noesc_to_internal_stream (xinput_stream <u_char> &in,
  102. const u_char *g,
  103. int flags)
  104. : xbuffered_read_stream (in),
  105. s_vender (vender_depend_code (flags & ENCODING_ISO_VENDER_MASK))
  106. {
  107. memcpy (s_g, g, 4);
  108. init_cns11643_table ();
  109. }
  110. void
  111. iso2022_noesc_to_internal_stream::to_internal (u_char ccs, int c1, int oc1)
  112. {
  113. if (ccs_1byte_charset_p (ccs))
  114. put ((ccs_1byte_94_charset_p (ccs)
  115. ? c1 <= ' ' || c1 >= 0x7f : c1 < ' ')
  116. ? oc1 : (ccs << 7) | c1);
  117. else
  118. {
  119. if (c1 <= 0x20 || c1 >= 0x7f)
  120. put (oc1);
  121. else
  122. {
  123. int oc2 = s_in.get ();
  124. if (oc2 == eof)
  125. {
  126. put (oc1);
  127. return;
  128. }
  129. int c2 = oc2 & 127;
  130. if (c2 <= 0x20 || c2 >= 0x7f)
  131. {
  132. put (oc1);
  133. s_in.putback (oc2);
  134. }
  135. else
  136. switch (ccs)
  137. {
  138. case ccs_jisx0208:
  139. if (s_vender == ENCODING_ISO_VENDER_OSFJVC && c1 >= 0x75)
  140. c1 += 10;
  141. put ((j2sh (c1, c2) << 8) | j2sl (c1, c2));
  142. break;
  143. case ccs_jisx0212:
  144. put (jisx0212_to_internal (c1, c2, s_vender));
  145. break;
  146. case ccs_gb2312:
  147. put (gb2312_to_int (c1, c2));
  148. break;
  149. case ccs_big5_1:
  150. case ccs_big5_2:
  151. mule_g2b (ccs, c1, c2);
  152. put (big5_to_int (c1, c2));
  153. break;
  154. case ccs_cns11643_1:
  155. {
  156. Char cc = cns11643_1_to_internal[c1 * 94 + c2 - (0x21 * 94 + 0x21)];
  157. if (cc != Char (-1))
  158. put (cc);
  159. else
  160. {
  161. put (oc1);
  162. put (oc2);
  163. }
  164. break;
  165. }
  166. case ccs_cns11643_2:
  167. {
  168. Char cc = cns11643_2_to_internal[c1 * 94 + c2 - (0x21 * 94 + 0x21)];
  169. if (cc != Char (-1))
  170. put (cc);
  171. else
  172. {
  173. put (oc1);
  174. put (oc2);
  175. }
  176. break;
  177. }
  178. case ccs_ksc5601:
  179. put (ksc5601_to_int (c1, c2));
  180. break;
  181. default:
  182. assert (0);
  183. put (oc1);
  184. put (oc2);
  185. break;
  186. }
  187. }
  188. }
  189. }
  190. void
  191. iso2022_noesc_to_internal_stream::refill_internal ()
  192. {
  193. while (room () > 0)
  194. {
  195. int c = s_in.get ();
  196. if (c == eof)
  197. break;
  198. if (c < 128)
  199. to_internal (s_g[0], c, c);
  200. else if (c == CC_SS2)
  201. {
  202. if (s_g[2] == ccs_invalid)
  203. put (c);
  204. else
  205. {
  206. c = s_in.get ();
  207. if (c == eof)
  208. break;
  209. to_internal (s_g[2], c & 127, c);
  210. }
  211. }
  212. else if (c == CC_SS3)
  213. {
  214. if (s_g[3] == ccs_invalid)
  215. put (c);
  216. else
  217. {
  218. c = s_in.get ();
  219. if (c == eof)
  220. break;
  221. to_internal (s_g[3], c & 127, c);
  222. }
  223. }
  224. else
  225. to_internal (s_g[1], c & 127, c);
  226. }
  227. }
  228. int
  229. iso2022_to_internal_stream::designate94 (u_char &g, int *cp)
  230. {
  231. int c = s_in.get ();
  232. switch (c)
  233. {
  234. case 'B':
  235. case 'J':
  236. g = ccs_usascii;
  237. return 1;
  238. case 'I':
  239. g = ccs_jisx0201_kana;
  240. return 1;
  241. default:
  242. put (cp[0]);
  243. put (cp[1]);
  244. cp[0] = c;
  245. return 0;
  246. }
  247. }
  248. int
  249. iso2022_to_internal_stream::designate96 (u_char &g, int *cp)
  250. {
  251. int c = s_in.get ();
  252. switch (c)
  253. {
  254. case 'A':
  255. g = ccs_iso8859_1;
  256. return 1;
  257. case 'B':
  258. g = ccs_iso8859_2;
  259. return 1;
  260. case 'C':
  261. g = ccs_iso8859_3;
  262. return 1;
  263. case 'D':
  264. g = ccs_iso8859_4;
  265. return 1;
  266. case 'F':
  267. g = ccs_iso8859_7;
  268. return 1;
  269. case 'L':
  270. g = ccs_iso8859_5;
  271. return 1;
  272. case 'M':
  273. g = ccs_iso8859_9;
  274. return 1;
  275. case 'V':
  276. g = ccs_iso8859_10;
  277. return 1;
  278. case 'Y':
  279. g = ccs_iso8859_13;
  280. return 1;
  281. default:
  282. put (cp[0]);
  283. put (cp[1]);
  284. cp[0] = c;
  285. return 0;
  286. }
  287. }
  288. int
  289. iso2022_to_internal_stream::designate94n (u_char &g, int *cp)
  290. {
  291. int c = s_in.get ();
  292. switch (c)
  293. {
  294. case '@':
  295. case 'B':
  296. g = ccs_jisx0208;
  297. return 1;
  298. case 'D':
  299. g = ccs_jisx0212;
  300. return 1;
  301. case 'A':
  302. g = ccs_gb2312;
  303. return 1;
  304. case 'C':
  305. g = ccs_ksc5601;
  306. return 1;
  307. case 'G':
  308. g = ccs_cns11643_1;
  309. return 1;
  310. case 'H':
  311. g = ccs_cns11643_2;
  312. return 1;
  313. case '0':
  314. g = ccs_big5_1;
  315. return 1;
  316. case '1':
  317. g = ccs_big5_2;
  318. return 1;
  319. default:
  320. put (cp[0]);
  321. put (cp[1]);
  322. put (cp[2]);
  323. cp[0] = c;
  324. return 0;
  325. }
  326. }
  // Decodes input while output room remains.  Escape sequences update the
  // designation slots s_g[0..3], the pending single shift s_ss and the
  // invoked sets s_gl / s_gr; every other byte is handed to to_internal ()
  // together with the charset selected by that state.
  327. void
  328. iso2022_to_internal_stream::refill_internal ()
  329. {
  330. while (room () > 0)
  331. {
  // c[] collects the bytes of a candidate escape sequence so they can be
  // emitted literally when the sequence turns out not to be recognized.
  332. int c[4];
  // Inner loop: consume consecutive escape sequences.  `continue' inside the
  // switch below restarts this loop, i.e. reads the next input byte.
  333. while (1)
  334. {
  335. c[0] = s_in.get ();
  // Re-entered via `goto again' when a rejected sequence left its last byte
  // in c[0] and that byte still has to be examined.
  336. again:
  337. if (c[0] != CC_ESC)
  338. break;
  // Any ESC drops a pending single shift.
  339. s_ss = 0;
  340. c[1] = s_in.get ();
  341. switch (c[1])
  342. {
  // ESC $ ...: multi-byte charset designations.
  343. case '$':
  344. c[2] = s_in.get ();
  345. switch (c[2])
  346. {
  // Three-byte forms without an intermediate byte always target s_g[0].
  347. case '@':
  348. case 'B':
  349. s_g[0] = ccs_jisx0208;
  350. continue;
  351. case 'A':
  352. s_g[0] = ccs_gb2312;
  353. continue;
  // ESC $ ( / ) / * / + F: designate into s_g[0] / [1] / [2] / [3].
  // designate94n () returns 0 after emitting the raw bytes and storing the
  // offending byte in c[0]; the `break' then falls to the code after the
  // outer switch.
  354. case '(':
  355. if (designate94n (s_g[0], c))
  356. continue;
  357. break;
  358. case ')':
  359. if (designate94n (s_g[1], c))
  360. continue;
  361. break;
  362. case '*':
  363. if (designate94n (s_g[2], c))
  364. continue;
  365. break;
  366. case '+':
  367. if (designate94n (s_g[3], c))
  368. continue;
  369. break;
  // Unknown byte after ESC $: emit ESC and '$' literally and keep the
  // third byte in c[0] for re-examination.
  370. default:
  371. put (c[0]);
  372. put (c[1]);
  373. c[0] = c[2];
  374. break;
  375. }
  376. break;
  // ESC ( / ) / * / + F: single-byte 94-charset designations.
  377. case '(':
  378. if (designate94 (s_g[0], c))
  379. continue;
  380. break;
  381. case ')':
  382. if (designate94 (s_g[1], c))
  383. continue;
  384. break;
  385. case '*':
  386. if (designate94 (s_g[2], c))
  387. continue;
  388. break;
  389. case '+':
  390. if (designate94 (s_g[3], c))
  391. continue;
  392. break;
  // ESC , / - / . / / F: 96-charset designations.
  393. case ',': // Mule compatible
  394. if (designate96 (s_g[0], c))
  395. continue;
  396. break;
  397. case '-':
  398. if (designate96 (s_g[1], c))
  399. continue;
  400. break;
  401. case '.':
  402. if (designate96 (s_g[2], c))
  403. continue;
  404. break;
  405. case '/':
  406. if (designate96 (s_g[3], c))
  407. continue;
  408. break;
  // Shift functions.  Each one is rejected (goto wrong) when the slot it
  // refers to holds ccs_invalid.
  409. case 'N': // SS2
  410. if (s_g[2] == ccs_invalid)
  411. goto wrong;
  412. s_ss = &s_g[2];
  413. continue;
  414. case 'O': // SS3
  415. if (s_g[3] == ccs_invalid)
  416. goto wrong;
  417. s_ss = &s_g[3];
  418. continue;
  419. case 'n': // LS2:
  420. if (s_g[2] == ccs_invalid)
  421. goto wrong;
  422. s_gl = &s_g[2];
  423. continue;
  424. case 'o': // LS3:
  425. if (s_g[3] == ccs_invalid)
  426. goto wrong;
  427. s_gl = &s_g[3];
  428. continue;
  429. case '~': // LS1R
  430. if (s_g[1] == ccs_invalid)
  431. goto wrong;
  432. s_gr = &s_g[1];
  433. continue;
  434. case '}': // LS2R
  435. if (s_g[2] == ccs_invalid)
  436. goto wrong;
  437. s_gr = &s_g[2];
  438. continue;
  439. case '|': // LS3R
  440. if (s_g[3] == ccs_invalid)
  441. goto wrong;
  442. s_gr = &s_g[3];
  443. continue;
  // Rejected shift: emit the ESC literally and process the byte that
  // followed it as an ordinary character.
  444. wrong:
  445. put (c[0]);
  446. c[0] = c[1];
  447. goto normal_char;
  // Not an escape sequence handled here: push the second byte back and
  // process the ESC itself as an ordinary character.
  448. default:
  449. s_in.putback (c[1]);
  450. goto normal_char;
  451. }
  // Reached only by `break' out of the switch, i.e. after an unrecognized
  // sequence was emitted literally; c[0] holds the byte read last.  Examine
  // it right away if there is room, otherwise push it back for the next
  // refill.
  452. if (c[0] == eof)
  453. return;
  454. if (room () > 0)
  455. goto again;
  456. s_in.putback (c[0]);
  457. return;
  458. }
  459. normal_char:
  460. int cc = c[0];
  461. if (cc == eof)
  462. break;
  // Control characters that change the shift state.  `continue' here
  // restarts the outer loop; `break' (slot unused) treats the control
  // character as ordinary data.
  463. switch (cc)
  464. {
  465. case CC_SS2:
  466. if (s_g[2] == ccs_invalid)
  467. break;
  468. s_ss = &s_g[2];
  469. continue;
  470. case CC_SS3:
  471. if (s_g[3] == ccs_invalid)
  472. break;
  473. s_ss = &s_g[3];
  474. continue;
  // SI / SO switch s_gl between s_g[0] and s_g[1]; both are honoured only
  // when s_g[1] is in use.
  475. case CC_SI:
  476. if (s_g[1] == ccs_invalid)
  477. break;
  478. s_gl = &s_g[0];
  479. continue;
  480. case CC_SO:
  481. if (s_g[1] == ccs_invalid)
  482. break;
  483. s_gl = &s_g[1];
  484. continue;
  485. }
  // Pick the charset: a pending single shift wins, otherwise s_gl for bytes
  // below 128 and s_gr for the rest.  A high byte whose set is ccs_invalid
  // is passed on unmasked with ccs_usascii.
  486. int ccs;
  487. if (cc < 128)
  488. ccs = s_ss ? *s_ss : *s_gl;
  489. else
  490. {
  491. ccs = s_ss ? *s_ss : *s_gr;
  492. if (ccs != ccs_invalid)
  493. cc &= 127;
  494. else
  495. ccs = ccs_usascii;
  496. }
  // The single shift applies to this character only.
  497. s_ss = 0;
  498. to_internal (ccs, cc, c[0]);
  499. }
  500. }
  501. void
  502. big5_to_internal_stream::refill_internal ()
  503. {
  504. while (room () > 0)
  505. {
  506. int c1 = s_in.get ();
  507. if (c1 == eof)
  508. break;
  509. if (c1 >= 0xa1 && c1 <= 0xf8 && c1 != 0xc8)
  510. {
  511. int c2 = s_in.get ();
  512. if (c2 >= 0x40 && c2 <= 0x7e || c2 >= 0xa1 && c2 <= 0xfe)
  513. c1 = big5_to_int (c1, c2);
  514. else
  515. s_in.putback (c2);
  516. }
  517. put (c1);
  518. }
  519. }
  520. void
  521. binary_to_internal_stream::refill_internal ()
  522. {
  523. while (room () > 0)
  524. {
  525. int c = s_in.get ();
  526. if (c == eof)
  527. break;
  528. put (c);
  529. }
  530. }
  531. utf_to_internal_stream::putw_t
  532. utf_to_internal_stream::per_lang_putw (int lang)
  533. {
  534. switch (lang)
  535. {
  536. default:
  537. case ENCODING_LANG_JP:
  538. case ENCODING_LANG_JP2:
  539. return &utf_to_internal_stream::putw_jp;
  540. case ENCODING_LANG_KR:
  541. case ENCODING_LANG_CN_GB:
  542. case ENCODING_LANG_CN_BIG5:
  543. return &utf_to_internal_stream::putw_gen;
  544. case ENCODING_LANG_CN:
  545. return &utf_to_internal_stream::putw_cn;
  546. }
  547. }
  548. void
  549. utf_to_internal_stream::putw_jp (ucs2_t wc)
  550. {
  551. if (s_has_bom < 0)
  552. {
  553. s_has_bom = wc == UNICODE_BOM;
  554. if (s_has_bom)
  555. return;
  556. }
  557. if (!(s_flags & ENCODING_UTF_WINDOWS))
  558. {
  559. int n = wc % numberof (utf_shiftjis2internal_hash);
  560. if (utf_shiftjis2internal_hash[n].wc == wc)
  561. {
  562. put (utf_shiftjis2internal_hash[n].cc);
  563. return;
  564. }
  565. }
  566. Char cc;
  567. if (s_to_full_width
  568. && (cc = wc2cp932 (wc)) != Char (-1)
  569. && !ccs_1byte_94_charset_p (code_charset (cc)))
  570. put (cc);
  571. else
  572. {
  573. cc = w2i (wc);
  574. if (cc != Char (-1))
  575. put (cc);
  576. else
  577. {
  578. put (utf16_ucs2_to_undef_pair_high (wc));
  579. put (utf16_ucs2_to_undef_pair_low (wc));
  580. }
  581. }
  582. }
  583. void
  584. utf_to_internal_stream::putw_gen (ucs2_t wc)
  585. {
  586. if (s_has_bom < 0)
  587. {
  588. s_has_bom = wc == UNICODE_BOM;
  589. if (s_has_bom)
  590. return;
  591. }
  592. Char cc = w2i (wc);
  593. if (cc != Char (-1))
  594. {
  595. if (!ccs_1byte_94_charset_p (code_charset (cc)))
  596. {
  597. Char t = s_cjk_translate[wc];
  598. if (t != Char (-1))
  599. cc = t;
  600. }
  601. put (cc);
  602. }
  603. else
  604. {
  605. put (utf16_ucs2_to_undef_pair_high (wc));
  606. put (utf16_ucs2_to_undef_pair_low (wc));
  607. }
  608. }
  609. void
  610. utf_to_internal_stream::putw_cn (ucs2_t wc)
  611. {
  612. if (s_has_bom < 0)
  613. {
  614. s_has_bom = wc == UNICODE_BOM;
  615. if (s_has_bom)
  616. return;
  617. }
  618. Char cc = w2i (wc);
  619. if (cc != Char (-1))
  620. {
  621. if (!ccs_1byte_94_charset_p (code_charset (cc)))
  622. {
  623. Char t = wc2gb2312_table[wc];
  624. if (t != Char (-1) || (t = wc2big5_table[wc]) != Char (-1))
  625. cc = t;
  626. }
  627. put (cc);
  628. }
  629. else
  630. {
  631. put (utf16_ucs2_to_undef_pair_high (wc));
  632. put (utf16_ucs2_to_undef_pair_low (wc));
  633. }
  634. }
  635. inline void
  636. utf_to_internal_stream::putl (ucs4_t lc)
  637. {
  638. if (lc < 0x10000)
  639. putw (ucs2_t (lc));
  640. else
  641. {
  642. putw (utf16_ucs4_to_pair_high (lc));
  643. putw (utf16_ucs4_to_pair_low (lc));
  644. }
  645. }
  646. void
  647. utf16_to_internal_stream::refill_internal_le ()
  648. {
  649. while (room () > 0)
  650. {
  651. int c1 = s_in.get ();
  652. if (c1 == eof)
  653. break;
  654. int c2 = s_in.get ();
  655. if (c2 == eof)
  656. break;
  657. putw ((c2 << 8) | c1);
  658. }
  659. }
  660. void
  661. utf16_to_internal_stream::refill_internal_be ()
  662. {
  663. while (room () > 0)
  664. {
  665. int c1 = s_in.get ();
  666. if (c1 == eof)
  667. break;
  668. int c2 = s_in.get ();
  669. if (c2 == eof)
  670. break;
  671. putw ((c1 << 8) | c2);
  672. }
  673. }
  674. void
  675. utf16unknown_to_internal_stream::refill_internal ()
  676. {
  677. if (!s_byte_order)
  678. {
  679. int c1 = s_in.get ();
  680. if (c1 == eof)
  681. return;
  682. int c2 = s_in.get ();
  683. if (c2 == eof)
  684. return;
  685. ucs2_t wc = (c1 << 8) | c2;
  686. if (wc == UNICODE_BOM)
  687. s_byte_order = ENCODING_UTF_BE;
  688. else if (wc == UNICODE_REVBOM)
  689. s_byte_order = ENCODING_UTF_LE;
  690. else
  691. {
  692. putw (wc);
  693. if (xsymbol_value (Vdefault_utf16_byte_order) == Kbig_endian)
  694. s_byte_order = ENCODING_UTF_BE;
  695. else
  696. s_byte_order = ENCODING_UTF_LE;
  697. }
  698. }
  699. if (s_byte_order == ENCODING_UTF_BE)
  700. refill_internal_be ();
  701. else
  702. refill_internal_le ();
  703. }
  // Classification of every possible byte value, indexed by the byte.  The
  // entry is also the index into utf8_chmask.
  u_char utf8_chtab[] =
  {
    /* 0x00 */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0x10 */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0x20 */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0x30 */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0x40 */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0x50 */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0x60 */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0x70 */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0x80 */ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
    /* 0x90 */ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
    /* 0xa0 */ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
    /* 0xb0 */ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
    /* 0xc0 */ 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
    /* 0xd0 */ 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
    /* 0xe0 */ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
    /* 0xf0 */ 3,3,3,3,3,3,3,3,2,2,2,2,1,1,0,0,
  };

  // utf8_chmask[n] has the low n bits set.
  u_char utf8_chmask[] =
  {
    0x00, 0x01, 0x03, 0x07,
    0x0f, 0x1f, 0x3f, 0x7f,
  };
  724. void
  725. utf8_to_internal_stream::refill_internal ()
  726. {
  727. while (room () > 0)
  728. {
  729. int c = s_in.get ();
  730. if (c == eof)
  731. break;
  732. u_char nbits = utf8_chtab[c];
  733. c &= utf8_chmask[nbits];
  734. switch (nbits)
  735. {
  736. case 7:
  737. putw (c);
  738. break;
  739. case 0:
  740. case 6:
  741. /* invalid code */
  742. break;
  743. default:
  744. {
  745. ucs4_t code = c;
  746. do
  747. {
  748. c = s_in.get ();
  749. if (c == eof)
  750. return;
  751. code = (code << 6) | (c & 0x3f);
  752. }
  753. while (++nbits < 6);
  754. putl (code);
  755. break;
  756. }
  757. }
  758. }
  759. }
  760. utf7_to_internal_stream::utf7_to_internal_stream (xinput_stream <u_char> &in,
  761. int flags, int lang)
  762. : utf_to_internal_stream (in, flags | ENCODING_UTF_SIGNATURE, lang),
  763. s_direct_encoding (1), s_cc (eof),
  764. s_imap4p (flags & UTF7_IMAP4_MAILBOX_NAME),
  765. s_shift_char (s_imap4p ? '&' : '+')
  766. {
  767. }
  768. int
  769. utf7_to_internal_stream::unicode_shifted_encoding ()
  770. {
  771. u_char buf[8];
  772. int nchars;
  773. for (nchars = 0; nchars < sizeof buf; nchars++)
  774. {
  775. s_cc = s_in.get ();
  776. if (s_cc == eof)
  777. break;
  778. int b = s_imap4p ? imap4_base64_decode (s_cc) : base64_decode (s_cc);
  779. if (b >= 64)
  780. break;
  781. buf[nchars] = b;
  782. }
  783. int t = 0;
  784. int n = nchars & ~3;
  785. int i;
  786. for (i = 0; i < n; i += 4)
  787. {
  788. buf[t++] = (buf[i] << 2) | (buf[i + 1] >> 4);
  789. buf[t++] = (buf[i + 1] << 4) | (buf[i + 2] >> 2);
  790. buf[t++] = (buf[i + 2] << 6) | buf[i + 3];
  791. }
  792. switch (nchars & 3)
  793. {
  794. case 2:
  795. buf[t++] = (buf[i] << 2) | (buf[i + 1] >> 4);
  796. break;
  797. case 3:
  798. buf[t++] = (buf[i] << 2) | (buf[i + 1] >> 4);
  799. buf[t++] = (buf[i + 1] << 4) | (buf[i + 2] >> 2);
  800. break;
  801. }
  802. t &= ~1;
  803. for (i = 0; i < t; i += 2)
  804. putw ((buf[i] << 8) + buf[i + 1]);
  805. s_nbytes += t;
  806. s_direct_encoding = nchars < sizeof buf;
  807. if (s_cc == '-')
  808. {
  809. if (!s_nbytes)
  810. putw (s_shift_char);
  811. s_cc = s_in.get ();
  812. }
  813. return !s_direct_encoding;
  814. }
  815. void
  816. utf7_to_internal_stream::refill_internal ()
  817. {
  818. if (!s_direct_encoding)
  819. goto unicode_shifted_encoding;
  820. if (s_cc == eof)
  821. s_cc = s_in.get ();
  822. while (room () > 0)
  823. {
  824. if (s_cc == eof)
  825. break;
  826. if (s_cc != s_shift_char)
  827. {
  828. putw (s_cc);
  829. s_cc = s_in.get ();
  830. }
  831. else
  832. {
  833. s_nbytes = 0;
  834. s_direct_encoding = 0;
  835. unicode_shifted_encoding:
  836. while (room () > 0 && unicode_shifted_encoding ())
  837. ;
  838. }
  839. }
  840. }
  841. void
  842. utf5_to_internal_stream::refill_internal ()
  843. {
  844. while (1)
  845. {
  846. int c = s_in.get ();
  847. if (c == eof)
  848. return;
  849. nextchar:
  850. ucs4_t code = digit_char (c) - 16;
  851. if (code >= 16)
  852. continue;
  853. while (1)
  854. {
  855. c = s_in.get ();
  856. if (c == eof)
  857. {
  858. putl (code);
  859. return;
  860. }
  861. int n = digit_char (c);
  862. if (n >= 16)
  863. {
  864. putl (code);
  865. if (room () <= 0)
  866. {
  867. s_in.putback (c);
  868. return;
  869. }
  870. goto nextchar;
  871. }
  872. code = (code << 4) | n;
  873. }
  874. }
  875. }
  876. void
  877. iso8859_to_internal_stream::refill_internal ()
  878. {
  879. while (room () > 0)
  880. {
  881. int c = s_in.get ();
  882. if (c == eof)
  883. break;
  884. if (c >= 0xa0)
  885. c = s_charset | (c & 127);
  886. put (c);
  887. }
  888. }
  889. void
  890. windows_codepage_to_internal_stream::refill_internal ()
  891. {
  892. while (room () > 0)
  893. {
  894. int c = s_in.get ();
  895. if (c == eof)
  896. break;
  897. if (c >= 0x80 && s_translate[c - 0x80] != Char (-1))
  898. c = s_translate[c - 0x80];
  899. put (c);
  900. }
  901. }
  902. void
  903. xwrite_stream::puteol ()
  904. {
  905. if (s_eol == eol_crlf)
  906. {
  907. put ('\r');
  908. put ('\n');
  909. }
  910. else if (s_eol == eol_lf)
  911. put ('\n');
  912. else
  913. put ('\r');
  914. s_nlines++;
  915. }
  916. int
  917. internal_to_sjis_stream::refill ()
  918. {
  919. begin ();
  920. while (room () > 0)
  921. {
  922. int c = s_in.get ();
  923. if (c == eof)
  924. break;
  925. Char cc = c;
  926. if (cc >= 0x80)
  927. {
  928. if (code_charset_bit (cc) & ccsf_possible_cp932)
  929. {
  930. cc = wc2cp932 (i2w (cc));
  931. if (cc == Char (-1))
  932. cc = DEFCHAR;
  933. }
  934. else if (code_charset_bit (cc) & ccsf_not_cp932)
  935. cc = DEFCHAR;
  936. if (DBCP (cc))
  937. {
  938. put (u_char (cc >> 8));
  939. put (u_char (cc));
  940. continue;
  941. }
  942. }
  943. if (cc == '\n')
  944. puteol ();
  945. else
  946. put (u_char (cc));
  947. }
  948. return finish ();
  949. }
  950. int
  951. internal_to_big5_stream::refill ()
  952. {
  953. begin ();
  954. while (room () > 0)
  955. {
  956. int c = s_in.get ();
  957. if (c == eof)
  958. break;
  959. Char cc = wc2big5_table[i2w (c)];
  960. if (cc == Char (-1))
  961. cc = DEFCHAR;
  962. if (cc >= 0x100)
  963. {
  964. int c1, c2;
  965. int_to_big5 (cc, c1, c2);
  966. put (c1);
  967. put (c2);
  968. }
  969. else if (cc == '\n')
  970. puteol ();
  971. else
  972. put (u_char (cc));
  973. }
  974. return finish ();
  975. }
  976. int
  977. internal_to_binary_stream::refill ()
  978. {
  979. begin ();
  980. while (room () > 0)
  981. {
  982. int c = s_in.get ();
  983. if (c == eof)
  984. break;
  985. if (c >= 0x100)
  986. {
  987. put (u_char (c >> 8));
  988. put (u_char (c));
  989. }
  990. else if (c == '\n')
  991. puteol ();
  992. else
  993. put (u_char (c));
  994. }
  995. return finish ();
  996. }
  997. Char
  998. convert_ibmext (Char cc)
  999. {
  1000. if (cc <= 0xeffc)
  1001. return cc;
  1002. if (cc < 0xfa40 || cc > 0xfc4b)
  1003. return DEFCHAR;
  1004. int c2 = cc & 0xff;
  1005. if (c2 < 0x40 || c2 == 0x7f || c2 > 0xfc)
  1006. return DEFCHAR;
  1007. if (c2 >= 0x80)
  1008. c2--;
  1009. return ibmext2internal_table[((cc >> 8) - 0xfa) * 188 + c2 - 0x40];
  1010. }
  1011. Char
  1012. convert_ibmext2necext (Char cc)
  1013. {
  1014. if (cc <= 0xeffc)
  1015. return cc;
  1016. if (cc < 0xfa40 || cc > 0xfc4b)
  1017. return DEFCHAR;
  1018. int c2 = cc & 0xff;
  1019. if (c2 < 0x40 || c2 == 0x7f || c2 > 0xfc)
  1020. return DEFCHAR;
  1021. if (c2 >= 0x80)
  1022. c2--;
  1023. return ibmext2necext_table[((cc >> 8) - 0xfa) * 188 + c2 - 0x40];
  1024. }
  1025. Char
  1026. convert_osfjvc (Char cc)
  1027. {
  1028. if (cc < 0xecfc)
  1029. return cc;
  1030. if (cc >= 0xed40 && cc <= 0xeefc)
  1031. {
  1032. cc = w2i (i2w (cc));
  1033. if (cc == Char (-1))
  1034. return DEFCHAR;
  1035. }
  1036. if (cc < 0xfa40 || cc > 0xfc4b)
  1037. return cc;
  1038. int c2 = cc & 0xff;
  1039. if (c2 < 0x40 || c2 == 0x7f || c2 > 0xfc)
  1040. return DEFCHAR;
  1041. if (c2 >= 0x80)
  1042. c2--;
  1043. return ibmext2internal_table[((cc >> 8) - 0xfa) * 188 + c2 - 0x40];
  1044. }
  1045. int
  1046. vender_depend_code (int vender)
  1047. {
  1048. if (vender == ENCODING_ISO_VENDER_NIL)
  1049. vender = to_vender_code (xsymbol_value (Vvender_depend_code_mapping));
  1050. if (vender != ENCODING_ISO_VENDER_NIL)
  1051. return vender;
  1052. return ENCODING_ISO_VENDER_NECEXT;
  1053. }
  1054. vender_code_mapper_fn
  1055. select_vender_code_mapper (int vender)
  1056. {
  1057. switch (vender_depend_code (vender))
  1058. {
  1059. default:
  1060. case ENCODING_ISO_VENDER_IBMEXT:
  1061. return convert_ibmext;
  1062. case ENCODING_ISO_VENDER_NECEXT:
  1063. return convert_ibmext2necext;
  1064. case ENCODING_ISO_VENDER_OSFJVC:
  1065. return convert_osfjvc;
  1066. }
  1067. }
  1068. const internal_to_iso2022_stream::ccs_data internal_to_iso2022_stream::s_ccs_data[32] =
  1069. {
  1070. {'B', ctype94}, // ccs_usascii
  1071. {'I', ctype94}, // ccs_jisx0201_kana
  1072. {'A', ctype96}, // ccs_iso8859_1
  1073. {'B', ctype96}, // ccs_iso8859_2
  1074. {'C', ctype96}, // ccs_iso8859_3
  1075. {'D', ctype96}, // ccs_iso8859_4
  1076. {'L', ctype96}, // ccs_iso8859_5
  1077. {'F', ctype96}, // ccs_iso8859_7
  1078. {'M', ctype96}, // ccs_iso8859_9
  1079. {'V', ctype96}, // ccs_iso8859_10
  1080. {'Y', ctype96}, // ccs_iso8859_13
  1081. {'B', ctype94n, 1}, // ccs_jisx0208
  1082. {'D', ctype94n}, // ccs_jisx0212
  1083. {'A', ctype94n, 1}, // ccs_gb2312
  1084. {'C', ctype94n}, // ccs_ksc5601
  1085. {'0', ctype94n}, // ccs_big5_1
  1086. {'1', ctype94n}, // ccs_big5_2
  1087. {0}, // ccs_utf16_undef_char_high
  1088. {0}, // ccs_utf16_undef_char_low
  1089. {0}, // ccs_utf16_surrogate_high
  1090. {0}, // ccs_utf16_surrogate_low
  1091. {'G', ctype94n}, // ccs_cns11643_1
  1092. {'H', ctype94n}, // ccs_cns11643_2
  1093. };
  1094. const char internal_to_iso2022_stream::s_inter94[] = {'(', ')', '*', '+'};
  1095. const char internal_to_iso2022_stream::s_inter96[] = {',', '-', '.', '/'};
  1096. int
  1097. internal_to_iso2022_stream::select_designation (int ccs) const
  1098. {
  1099. if (ccs == ccs_usascii)
  1100. return 0;
  1101. int i;
  1102. for (i = 0; i < 4; i++)
  1103. if (s_initial[i] == ccs)
  1104. return i;
  1105. for (i = 0; i < 4; i++)
  1106. if (s_designatable[i] != u_int (-1)
  1107. && s_designatable[i] & (1 << ccs))
  1108. return i;
  1109. for (i = 0; i < 4; i++)
  1110. if (s_designatable[i] == u_int (-1))
  1111. return i;
  1112. if (s_flags & ENCODING_ISO_LOCKING_SHIFT)
  1113. return 1;
  1114. if (s_ccs_data[ccs].ctype == ctype96)
  1115. return 2;
  1116. return 0;
  1117. }
  1118. internal_to_iso2022_stream::internal_to_iso2022_stream (xinput_stream <Char> &in,
  1119. eol_code eol,
  1120. int flags,
  1121. const u_char *initial,
  1122. const u_int *designatable,
  1123. int cjk)
  1124. : xwrite_stream (in, eol), s_flags (flags), s_initial (initial),
  1125. s_gl (&s_g[0]), s_gr (flags & ENCODING_ISO_7BITS ? 0 : &s_g[1]),
  1126. s_designatable (designatable), s_cjk_translate (cjk_translate_table (cjk)),
  1127. s_lang_cn (cjk == ENCODING_LANG_CN),
  1128. s_vender_code_mapper (select_vender_code_mapper
  1129. (flags & ENCODING_ISO_VENDER_MASK))
  1130. {
  1131. memcpy (s_g, s_initial, 4);
  1132. for (int i = ccs_usascii; i < ccs_max; i++)
  1133. s_designation[i] = select_designation (i);
  1134. if (s_flags & ENCODING_ISO_USE_CNS11643)
  1135. init_big5cns_table ();
  1136. }
  1137. void
  1138. internal_to_iso2022_stream::designate (int n, u_char ccs)
  1139. {
  1140. if (s_g[n] != ccs)
  1141. {
  1142. put ('\033');
  1143. switch (s_ccs_data[ccs].ctype)
  1144. {
  1145. case ctype94:
  1146. put (s_inter94[n]);
  1147. break;
  1148. case ctype96:
  1149. put (s_inter96[n]);
  1150. break;
  1151. case ctype94n:
  1152. put ('$');
  1153. if (n || !(s_flags & ENCODING_ISO_SHORT_FORM) || !s_ccs_data[ccs].fshort)
  1154. put (s_inter94[n]);
  1155. break;
  1156. }
  1157. put (s_ccs_data[ccs].final);
  1158. s_g[n] = ccs;
  1159. }
  1160. }
  1161. int
  1162. internal_to_iso2022_stream::designate (u_char ccs)
  1163. {
  1164. int n = s_designation[ccs];
  1165. if (s_g[n] != ccs)
  1166. {
  1167. if (s_gl != &s_g[0] && s_flags & ENCODING_ISO_ASCII_CTRL)
  1168. {
  1169. put (CC_SI);
  1170. s_gl = &s_g[0];
  1171. }
  1172. designate (n, ccs);
  1173. }
  1174. switch (n)
  1175. {
  1176. default:
  1177. if (s_gl != &s_g[0])
  1178. {
  1179. put (CC_SI);
  1180. s_gl = &s_g[0];
  1181. }
  1182. return 0;
  1183. case 1:
  1184. if (s_gr == &s_g[1])
  1185. return 0x80;
  1186. if (s_gl != &s_g[1])
  1187. {
  1188. put (CC_SO);
  1189. s_gl = &s_g[1];
  1190. }
  1191. return 0;
  1192. case 2:
  1193. if (s_gr)
  1194. {
  1195. put (CC_SS2);
  1196. return 0x80;
  1197. }
  1198. else
  1199. {
  1200. put (CC_ESC);
  1201. put ('N');
  1202. return 0;
  1203. }
  1204. case 3:
  1205. if (s_gr)
  1206. {
  1207. put (CC_SS3);
  1208. return 0x80;
  1209. }
  1210. else
  1211. {
  1212. put (CC_ESC);
  1213. put ('O');
  1214. return 0;
  1215. }
  1216. }
  1217. }
  // Encodes internal characters as ISO 2022 while output room remains.
  // Each character is first translated (language table, vendor mapper),
  // then written with whatever designation / shift designate () emits for
  // its charset.  Returns the result of finish ().
  1218. int
  1219. internal_to_iso2022_stream::refill ()
  1220. {
  1221. begin ();
  1222. while (room () > 0)
  1223. {
  1224. int c = s_in.get ();
  1225. if (c == eof)
  1226. {
  // At end of input, switch back to ccs_usascii before stopping.
  1227. designate (ccs_usascii);
  1228. break;
  1229. }
  1230. Char cc = c;
  1231. u_int ccsf = code_charset_bit (cc);
  // Surrogate and undefined-character codes are replaced by DEFCHAR.
  1232. if (ccsf & (ccsf_utf16_surrogate | ccsf_utf16_undef_char))
  1233. cc = DEFCHAR;
  1234. else
  1235. {
  1236. if (s_lang_cn)
  1237. {
  // Chinese: characters not already GB2312 or Big5 are looked up by their
  // i2w () value, GB2312 table first, then Big5.
  1238. if (!(ccsf & (ccsf_gb2312 | ccsf_big5)))
  1239. {
  1240. wchar_t wc = i2w (cc);
  1241. Char t = wc2gb2312_table[wc];
  1242. if (t != Char (-1) || (t = wc2big5_table[wc]) != Char (-1))
  1243. cc = t;
  1244. }
  1245. }
  1246. else
  1247. {
  // Other languages: use the per-language table when there is one.
  1248. if (s_cjk_translate)
  1249. {
  1250. Char t = s_cjk_translate[i2w (cc)];
  1251. if (t != Char (-1))
  1252. cc = t;
  1253. }
  1254. }
  // Apply the vendor-specific mapping chosen at construction.
  1255. cc = (*s_vender_code_mapper)(cc);
  1256. }
  1257. int ccs = code_charset (cc);
  1258. switch (ccs)
  1259. {
  // Scratch variables shared by the cases: the two output bytes and the
  // value returned by designate ().
  1260. int c1, c2, f;
  1261. case ccs_usascii:
  1262. if (cc == '\n')
  1263. {
  // With ENCODING_ISO_ASCII_EOL, shift back in and restore every slot to
  // its initial charset before the line ends.  A slot whose initial value
  // is ccs_invalid is reset without writing an escape sequence.
  1264. if (s_flags & ENCODING_ISO_ASCII_EOL)
  1265. {
  1266. if (s_gl != &s_g[0])
  1267. {
  1268. put (CC_SI);
  1269. s_gl = &s_g[0];
  1270. }
  1271. for (int i = 0; i < 4; i++)
  1272. if (s_g[i] != s_initial[i])
  1273. {
  1274. if (s_initial[i] != ccs_invalid)
  1275. designate (i, s_initial[i]);
  1276. else
  1277. s_g[i] = s_initial[i];
  1278. }
  1279. }
  1280. puteol ();
  1281. break;
  1282. }
  1283. /* fall thru... */
  // Single-byte output path, also jumped to from the cases below.
  // ccs_usascii is designated only with ENCODING_ISO_ASCII_CTRL set or for
  // characters strictly between ' ' and CC_DEL.
  1284. usascii:
  1285. if (s_flags & ENCODING_ISO_ASCII_CTRL || (cc > ' ' && cc < CC_DEL))
  1286. designate (ccs_usascii);
  1287. put (u_char (cc));
  1288. break;
  1289. case ccs_jisx0201_kana:
  // Codes up to 0xa0 and 0xff are written through the single-byte path.
  1290. if (cc <= 0xa0 || cc == 0xff)
  1291. goto usascii;
  1292. f = designate (ccs_jisx0201_kana);
  1293. put (u_char (cc & 127 | f));
  1294. break;
  1295. case ccs_iso8859_1:
  1296. case ccs_iso8859_2:
  1297. case ccs_iso8859_3:
  1298. case ccs_iso8859_4:
  1299. case ccs_iso8859_5:
  1300. case ccs_iso8859_7:
  1301. case ccs_iso8859_9:
  1302. case ccs_iso8859_10:
  1303. case ccs_iso8859_13:
  // Keep the low seven bits.  Values below ' ' get the high bit set and go
  // through the single-byte path instead.
  1304. cc &= 127;
  1305. if (cc < ' ')
  1306. {
  1307. cc |= 0x80;
  1308. goto usascii;
  1309. }
  1310. f = designate (ccs);
  1311. put (u_char (cc | f));
  1312. break;
  // Two-byte charsets: range-check, split into c1 / c2, then share the
  // output code at put94n.
  1313. case ccs_jisx0212:
  1314. if (cc > CCS_JISX0212_MAX)
  1315. goto badchar;
  1316. int_to_jisx0212 (cc, c1, c2);
  1317. goto put94n;
  1318. case ccs_gb2312:
  1319. if (cc > CCS_GB2312_MAX)
  1320. goto badchar;
  1321. int_to_gb2312 (cc, c1, c2);
  1322. goto put94n;
  1323. case ccs_ksc5601:
  1324. if (cc > CCS_KSC5601_MAX)
  1325. goto badchar;
  1326. int_to_ksc5601 (cc, c1, c2);
  1327. goto put94n;
  1328. case ccs_big5:
  1329. if (cc > CCS_BIG5_MAX)
  1330. goto badchar;
  // With ENCODING_ISO_USE_CNS11643, try big5cns_table first.  The bits
  // selected by 0x8080 in the table entry choose the output charset and the
  // low seven bits of each byte are the code.  A missing entry falls
  // through to the Big5 output below.
  1331. if (s_flags & ENCODING_ISO_USE_CNS11643)
  1332. {
  1333. cc = big5cns_table[cc - CCS_BIG5_MIN];
  1334. if (cc != Char (-1))
  1335. {
  1336. switch (cc & 0x8080)
  1337. {
  1338. default:
  1339. ccs = ccs_gb2312;
  1340. break;
  1341. case BIG5CNS_CNS11643_1:
  1342. ccs = ccs_cns11643_1;
  1343. break;
  1344. case BIG5CNS_CNS11643_2:
  1345. ccs = ccs_cns11643_2;
  1346. break;
  1347. }
  1348. c1 = (cc >> 8) & 127;
  1349. c2 = cc & 127;
  1350. goto put94n;
  1351. }
  1352. }
  // NOTE(review): when the table lookup above yields Char (-1), cc has
  // already been overwritten with that value before int_to_big5 () is
  // called -- confirm this is intended.
  1353. int_to_big5 (cc, c1, c2);
  1354. mule_b2g (ccs, c1, c2);
  1355. goto put94n;
  // Everything else is treated as a two-byte code checked with SJISP /
  // SJIS2P and converted by s2j ().
  1356. default:
  1357. c1 = cc >> 8;
  1358. c2 = cc & 255;
  1359. if (!SJISP (c1) || !SJIS2P (c2))
  1360. goto badchar;
  1361. s2j (c1, c2);
  // First bytes from 95 + 32 upward are moved down: by 10 below 105 + 32,
  // by 20 (and written as ccs_jisx0212) below 115 + 32.  Higher values are
  // rejected.
  1362. if (c1 >= 95 + 32)
  1363. {
  1364. if (c1 < 105 + 32)
  1365. c1 -= 10;
  1366. else if (c1 < 115 + 32)
  1367. {
  1368. c1 -= 20;
  1369. ccs = ccs_jisx0212;
  1370. }
  1371. else
  1372. goto badchar;
  1373. }
  // Common two-byte output: the value returned by designate () is OR'ed
  // into both bytes.
  1374. put94n:
  1375. f = designate (ccs);
  1376. put (c1 | f);
  1377. put (c2 | f);
  1378. break;
  // Unrepresentable character: write '?' in ccs_usascii.
  1379. badchar:
  1380. designate (ccs_usascii);
  1381. put ('?');
  1382. break;
  1383. }
  1384. }
  1385. return finish ();
  1386. }
  1387. int
  1388. internal_to_utf_stream::getw () const
  1389. {
  1390. int c = s_in.get ();
  1391. if (c == eof)
  1392. return eof;
  1393. Char cc = Char (c);
  1394. if (!(s_flags & ENCODING_UTF_WINDOWS) && cc != Char (-1))
  1395. {
  1396. int n = cc % numberof (utf_internal2shiftjis_hash);
  1397. if (utf_internal2shiftjis_hash[n].cc == cc)
  1398. return utf_internal2shiftjis_hash[n].wc;
  1399. }
  1400. ucs2_t wc = i2w (cc);
  1401. if (wc != ucs2_t (-1))
  1402. return wc;
  1403. if (utf16_undef_char_high_p (ucs2_t (cc)))
  1404. {
  1405. int c2 = s_in.get ();
  1406. if (c2 != eof)
  1407. {
  1408. if (utf16_undef_char_low_p (ucs2_t (c2)))
  1409. return utf16_undef_pair_to_ucs2 (ucs2_t (cc), ucs2_t (c2));
  1410. s_in.putback (c2);
  1411. }
  1412. }
  1413. return DEFCHAR;
  1414. }
  1415. int
  1416. internal_to_utf16le_stream::refill ()
  1417. {
  1418. begin ();
  1419. if (s_bom)
  1420. {
  1421. s_bom = 0;
  1422. if (!s_in.eofp ())
  1423. {
  1424. put (u_char (UNICODE_BOM));
  1425. put (u_char (UNICODE_BOM >> 8));
  1426. }
  1427. }
  1428. while (room () > 0)
  1429. {
  1430. int c = getw ();
  1431. if (c == eof)
  1432. break;
  1433. ucs2_t wc = ucs2_t (c);
  1434. if (wc == '\n')
  1435. {
  1436. if (s_eol == eol_crlf)
  1437. {
  1438. put ('\r');
  1439. put (0);
  1440. put ('\n');
  1441. put (0);
  1442. }
  1443. else if (s_eol == eol_lf)
  1444. {
  1445. put ('\n');
  1446. put (0);
  1447. }
  1448. else
  1449. {
  1450. put ('\r');
  1451. put (0);
  1452. }
  1453. s_nlines++;
  1454. }
  1455. else
  1456. {
  1457. put (u_char (wc));
  1458. put (u_char (wc >> 8));
  1459. }
  1460. }
  1461. return finish ();
  1462. }
  1463. int
  1464. internal_to_utf16be_stream::refill ()
  1465. {
  1466. begin ();
  1467. if (s_bom)
  1468. {
  1469. s_bom = 0;
  1470. if (!s_in.eofp ())
  1471. {
  1472. put (u_char (UNICODE_BOM >> 8));
  1473. put (u_char (UNICODE_BOM));
  1474. }
  1475. }
  1476. while (room () > 0)
  1477. {
  1478. int c = getw ();
  1479. if (c == eof)
  1480. break;
  1481. ucs2_t wc = ucs2_t (c);
  1482. if (wc == '\n')
  1483. {
  1484. if (s_eol == eol_crlf)
  1485. {
  1486. put (0);
  1487. put ('\r');
  1488. put (0);
  1489. put ('\n');
  1490. }
  1491. else if (s_eol == eol_lf)
  1492. {
  1493. put (0);
  1494. put ('\n');
  1495. }
  1496. else
  1497. {
  1498. put (0);
  1499. put ('\r');
  1500. }
  1501. s_nlines++;
  1502. }
  1503. else
  1504. {
  1505. put (u_char (wc >> 8));
  1506. put (u_char (wc));
  1507. }
  1508. }
  1509. return finish ();
  1510. }
  1511. int
  1512. internal_to_utf8_stream::refill ()
  1513. {
  1514. begin ();
  1515. if (s_bom)
  1516. {
  1517. s_bom = 0;
  1518. if (!s_in.eofp ())
  1519. {
  1520. put (0xef);
  1521. put (0xbb);
  1522. put (0xbf);
  1523. }
  1524. }
  1525. while (room () > 0)
  1526. {
  1527. int c = getw ();
  1528. if (c == eof)
  1529. break;
  1530. ucs2_t wc = ucs2_t (c);
  1531. ucs4_t lc = wc;
  1532. if (utf16_surrogate_high_p (wc))
  1533. {
  1534. c = s_in.get ();
  1535. if (utf16_surrogate_low_p (ucs2_t (c)))
  1536. lc = utf16_pair_to_ucs4 (wc, ucs2_t (c));
  1537. else
  1538. s_in.putback (c);
  1539. }
  1540. if (lc < 0x80)
  1541. {
  1542. if (lc == '\n')
  1543. puteol ();
  1544. else
  1545. put (u_char (lc));
  1546. }
  1547. else if (lc < 0x800)
  1548. {
  1549. put (u_char (0xc0 | ((lc >> 6) & 0x1f)));
  1550. put (u_char (0x80 | (lc & 0x3f)));
  1551. }
  1552. else if (lc < 0x10000)
  1553. {
  1554. put (u_char (0xe0 | ((lc >> 12) & 0xf)));
  1555. put (u_char (0x80 | ((lc >> 6) & 0x3f)));
  1556. put (u_char (0x80 | (lc & 0x3f)));
  1557. }
  1558. else /* lc < 0x200000(0x110000) */
  1559. {
  1560. put (u_char (0xf0 | ((lc >> 18) & 7)));
  1561. put (u_char (0x80 | ((lc >> 12) & 0x3f)));
  1562. put (u_char (0x80 | ((lc >> 6) & 0x3f)));
  1563. put (u_char (0x80 | (lc & 0x3f)));
  1564. }
  1565. }
  1566. return finish ();
  1567. }
  /* Standard base64 alphabet: value N is encoded as b64chars[N].  */
  static const char b64chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "+/";

  /* Same alphabet with ',' in place of '/', selected by the UTF-7
     encoder when UTF7_IMAP4_MAILBOX_NAME is requested.  */
  static const char imap4_b64chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "+,";
  1572. internal_to_utf7_stream::internal_to_utf7_stream (xinput_stream <Char> &in,
  1573. eol_code eol, int flags)
  1574. : internal_to_utf_stream (in, eol, flags & ~ENCODING_UTF_SIGNATURE),
  1575. s_nb (0),
  1576. s_nshift (0),
  1577. s_accept (flags),
  1578. s_imap4p (flags & UTF7_IMAP4_MAILBOX_NAME),
  1579. s_shift_char (s_imap4p ? '&' : '+'),
  1580. s_b64 (s_imap4p ? imap4_b64chars : b64chars)
  1581. {
  1582. /* In IMAP4 modified UTF-7, "&" is always represented by "&-". */
  1583. if (s_imap4p)
  1584. s_accept |= UTF7_IMAP4_SHIFT_CHAR;
  1585. }
  1586. void
  1587. internal_to_utf7_stream::encode_b64 ()
  1588. {
  1589. int n = s_nb - s_nb % 3;
  1590. const u_char *b, *be;
  1591. for (b = s_b, be = s_b + n; b < be; b += 3)
  1592. {
  1593. put (s_b64[(b[0] >> 2) & 63]);
  1594. put (s_b64[((b[0] << 4) | (b[1] >> 4)) & 63]);
  1595. put (s_b64[((b[1] << 2) | (b[2] >> 6)) & 63]);
  1596. put (s_b64[b[2] & 63]);
  1597. }
  1598. if (n != s_nb)
  1599. switch (s_nb % 3)
  1600. {
  1601. case 1:
  1602. put (s_b64[(b[0] >> 2) & 63]);
  1603. put (s_b64[(b[0] << 4) & 63]);
  1604. break;
  1605. case 2:
  1606. put (s_b64[(b[0] >> 2) & 63]);
  1607. put (s_b64[((b[0] << 4) | (b[1] >> 4)) & 63]);
  1608. put (s_b64[(b[1] << 2) & 63]);
  1609. break;
  1610. }
  1611. }
  1612. int
  1613. internal_to_utf7_stream::refill ()
  1614. {
  1615. begin ();
  1616. while (room () > 0)
  1617. {
  1618. int c = getw ();
  1619. if (c == eof)
  1620. {
  1621. if (s_nshift)
  1622. {
  1623. if (s_nshift != 1 || s_nb != 2 || s_b[1] != s_shift_char)
  1624. encode_b64 ();
  1625. put ('-');
  1626. s_nshift = 0;
  1627. }
  1628. break;
  1629. }
  1630. ucs2_t wc = ucs2_t (c);
  1631. if (wc < 0x80 && utf7_set (wc) & s_accept)
  1632. {
  1633. if (s_nb)
  1634. {
  1635. if (s_nshift == 1 && s_nb == 2 && s_b[1] == s_shift_char)
  1636. put ('-');
  1637. else
  1638. {
  1639. encode_b64 ();
  1640. if (s_imap4p || wc == '-' || utf7_set (wc) & UTF7_SET_B)
  1641. put ('-');
  1642. }
  1643. s_nb = 0;
  1644. s_nshift = 0;
  1645. }
  1646. if (wc == '\n')
  1647. puteol ();
  1648. else
  1649. put (u_char (wc));
  1650. if (wc == s_shift_char)
  1651. put ('-');
  1652. }
  1653. else
  1654. {
  1655. if (!s_nshift)
  1656. put (u_char (s_shift_char));
  1657. if (s_nb == sizeof s_b)
  1658. {
  1659. encode_b64 ();
  1660. s_nb = 0;
  1661. }
  1662. s_b[s_nb++] = wc >> 8;
  1663. s_b[s_nb++] = u_char (wc);
  1664. s_nshift++;
  1665. }
  1666. }
  1667. return finish ();
  1668. }
  1669. int
  1670. internal_to_utf5_stream::refill ()
  1671. {
  1672. begin ();
  1673. while (room () > 0)
  1674. {
  1675. int c = getw ();
  1676. if (c == eof)
  1677. break;
  1678. ucs2_t wc = ucs2_t (c);
  1679. ucs4_t lc = wc;
  1680. if (utf16_surrogate_high_p (wc))
  1681. {
  1682. c = s_in.get ();
  1683. if (utf16_surrogate_low_p (ucs2_t (c)))
  1684. lc = utf16_pair_to_ucs4 (wc, ucs2_t (c));
  1685. else
  1686. s_in.putback (c);
  1687. }
  1688. if (!lc)
  1689. put ('G');
  1690. else if (lc < 0x10000)
  1691. {
  1692. for (int i = 0;; i++, lc <<= 4)
  1693. if (lc & 0xf000)
  1694. {
  1695. put ((upcase_digit_char + 16)[lc >> 12]);
  1696. for (; i < 3; i++, lc <<= 4)
  1697. put (upcase_digit_char[(lc >> 8) & 15]);
  1698. break;
  1699. }
  1700. }
  1701. else
  1702. {
  1703. for (int i = 0;; i++, lc <<= 4)
  1704. if (lc & 0xf0000000)
  1705. {
  1706. put ((upcase_digit_char + 16)[lc >> 28]);
  1707. for (; i < 7; i++, lc <<= 4)
  1708. put (upcase_digit_char[(lc >> 24) & 15]);
  1709. break;
  1710. }
  1711. }
  1712. }
  1713. return finish ();
  1714. }
  1715. const wc2int_hash &
  1716. internal_to_iso8859_stream::charset_hash (int ccs)
  1717. {
  1718. switch (ccs)
  1719. {
  1720. default:
  1721. assert (0);
  1722. case ccs_iso8859_1:
  1723. return wc2int_iso8859_1_hash;
  1724. case ccs_iso8859_2:
  1725. return wc2int_iso8859_2_hash;
  1726. case ccs_iso8859_3:
  1727. return wc2int_iso8859_3_hash;
  1728. case ccs_iso8859_4:
  1729. return wc2int_iso8859_4_hash;
  1730. case ccs_iso8859_5:
  1731. return wc2int_iso8859_5_hash;
  1732. case ccs_iso8859_7:
  1733. return wc2int_iso8859_7_hash;
  1734. case ccs_iso8859_9:
  1735. return wc2int_iso8859_9_hash;
  1736. case ccs_iso8859_10:
  1737. return wc2int_iso8859_10_hash;
  1738. case ccs_iso8859_13:
  1739. return wc2int_iso8859_13_hash;
  1740. }
  1741. }
  1742. int
  1743. internal_to_iso8859_stream::refill ()
  1744. {
  1745. begin ();
  1746. while (room () > 0)
  1747. {
  1748. int c = s_in.get ();
  1749. if (c == eof)
  1750. break;
  1751. Char cc = c;
  1752. if (cc >= 0xa0)
  1753. {
  1754. if (code_charset (cc) == s_charset)
  1755. cc = int_to_iso8859 (cc);
  1756. else
  1757. {
  1758. cc = lookup_wc2int_hash (s_hash, i2w (cc));
  1759. cc = cc != Char (-1) ? int_to_iso8859 (cc) : DEFCHAR;
  1760. }
  1761. if (cc >= 0x80 && cc < 0xa0)
  1762. cc = DEFCHAR;
  1763. }
  1764. if (cc == '\n')
  1765. puteol ();
  1766. else
  1767. put (u_char (cc));
  1768. }
  1769. return finish ();
  1770. }
  1771. int
  1772. internal_to_windows_codepage_stream::refill ()
  1773. {
  1774. begin ();
  1775. while (room () > 0)
  1776. {
  1777. int c = s_in.get ();
  1778. if (c == eof)
  1779. break;
  1780. Char cc = c;
  1781. if (cc >= 128)
  1782. {
  1783. cc = lookup_wc2int_hash (s_hash, i2w (cc));
  1784. if (cc == Char (-1))
  1785. cc = DEFCHAR;
  1786. }
  1787. if (cc == '\n')
  1788. puteol ();
  1789. else
  1790. put (u_char (cc));
  1791. }
  1792. return finish ();
  1793. }
  1794. int
  1795. xdecode_stream::decode (int nchars, const u_char *i)
  1796. {
  1797. if (!nchars)
  1798. return eof;
  1799. begin ();
  1800. for (; nchars >= 3; i += 4, nchars -= 3)
  1801. {
  1802. put ((i[0] << 2) | (i[1] >> 4));
  1803. put ((i[1] << 4) | (i[2] >> 2));
  1804. put ((i[2] << 6) | i[3]);
  1805. }
  1806. if (nchars > 0)
  1807. {
  1808. put ((i[0] << 2) | (i[1] >> 4));
  1809. if (nchars > 1)
  1810. put ((i[1] << 4) | (i[2] >> 2));
  1811. }
  1812. return finish ();
  1813. }
  1814. int
  1815. xdecode_b64_stream::refill ()
  1816. {
  1817. u_char buf[XDECODE_STREAM_BUFSIZE / 3 * 4];
  1818. int nchars;
  1819. for (nchars = 0; nchars < sizeof buf;)
  1820. {
  1821. int c = s_in.get ();
  1822. if (c == eof)
  1823. break;
  1824. c = base64_decode (c);
  1825. if (c < 64)
  1826. buf[nchars++] = c;
  1827. else if (c == 64 && nchars)
  1828. break;
  1829. }
  1830. return decode (nchars * 3 / 4, buf);
  1831. }
  1832. int
  1833. xdecode_uu_stream::refill ()
  1834. {
  1835. int c;
  1836. do
  1837. {
  1838. c = s_in.get ();
  1839. if (c == eof)
  1840. return eof;
  1841. }
  1842. while (c == '\r' || c == '\n');
  1843. int nchars = uudecode (c);
  1844. u_char buf[63 / 3 * 4];
  1845. int i;
  1846. for (i = 0; i < sizeof buf; i++)
  1847. {
  1848. c = s_in.get ();
  1849. if (c == eof || c == '\n')
  1850. break;
  1851. buf[i] = uudecode (c);
  1852. }
  1853. if (i == sizeof buf)
  1854. do
  1855. c = s_in.get ();
  1856. while (c != eof && c != '\n');
  1857. return decode (nchars, buf);
  1858. }
  1859. int
  1860. xdecode_qp_stream::refill ()
  1861. {
  1862. int c1, c2, c3;
  1863. begin ();
  1864. c1 = s_in.get ();
  1865. while (room () > 0)
  1866. {
  1867. if (c1 == eof)
  1868. break;
  1869. if (c1 != '=')
  1870. put (s_underscore_to_space && c1 == '_' ? ' ' : c1);
  1871. else
  1872. {
  1873. c2 = s_in.get ();
  1874. if (c2 == '\r')
  1875. {
  1876. c3 = s_in.get ();
  1877. if (c3 != '\n')
  1878. {
  1879. put (c1);
  1880. put (c2);
  1881. c1 = c3;
  1882. continue;
  1883. }
  1884. }
  1885. else if (c2 == '\n')
  1886. ;
  1887. else
  1888. {
  1889. if (c2 == eof || digit_char (c2) >= 16)
  1890. {
  1891. put (c1);
  1892. c1 = c2;
  1893. continue;
  1894. }
  1895. c3 = s_in.get ();
  1896. if (c3 == eof || digit_char (c3) >= 16)
  1897. {
  1898. put (c1);
  1899. put (c2);
  1900. c1 = c3;
  1901. continue;
  1902. }
  1903. put ((digit_char (c2) << 4) | digit_char (c3));
  1904. }
  1905. }
  1906. c1 = s_in.get ();
  1907. }
  1908. if (c1 != eof)
  1909. s_in.putback (c1);
  1910. return finish ();
  1911. }
  1912. int
  1913. xencode_b64_stream::refill ()
  1914. {
  1915. u_char *b = s_buf, *const be = b + s_width;
  1916. while (b < be)
  1917. {
  1918. int c1 = s_in.get ();
  1919. if (c1 == eof)
  1920. {
  1921. if (b == s_buf)
  1922. return eof;
  1923. break;
  1924. }
  1925. int c2 = s_in.get ();
  1926. if (c2 == eof)
  1927. {
  1928. *b++ = b64chars[(c1 >> 2) & 63];
  1929. *b++ = b64chars[(c1 << 4) & 63];
  1930. *b++ = '=';
  1931. *b++ = '=';
  1932. break;
  1933. }
  1934. int c3 = s_in.get ();
  1935. if (c3 == eof)
  1936. {
  1937. *b++ = b64chars[(c1 >> 2) & 63];
  1938. *b++ = b64chars[((c1 << 4) | (c2 >> 4)) & 63];
  1939. *b++ = b64chars[(c2 << 2) & 63];
  1940. *b++ = '=';
  1941. break;
  1942. }
  1943. *b++ = b64chars[(c1 >> 2) & 63];
  1944. *b++ = b64chars[((c1 << 4) | (c2 >> 4)) & 63];
  1945. *b++ = b64chars[((c2 << 2) | (c3 >> 6)) & 63];
  1946. *b++ = b64chars[c3 & 63];
  1947. }
  1948. if (s_fold_p)
  1949. *b++ = '\n';
  1950. return setbuf (s_buf, b);
  1951. }
  1952. int
  1953. xencode_uu_stream::refill ()
  1954. {
  1955. if (s_eofp)
  1956. return eof;
  1957. u_char buf[BUFSIZE];
  1958. int nchars;
  1959. for (nchars = 0; nchars < sizeof buf; nchars++)
  1960. {
  1961. int c = s_in.get ();
  1962. if (c == eof)
  1963. {
  1964. s_eofp = !nchars;
  1965. break;
  1966. }
  1967. buf[nchars] = c;
  1968. }
  1969. u_char *b = s_buf;
  1970. *b++ = uuencode (nchars);
  1971. for (int i = 0; i < nchars; i += 3)
  1972. {
  1973. *b++ = uuencode ((buf[i] >> 2) & 63);
  1974. *b++ = uuencode (((buf[i] << 4) | (buf[i + 1] >> 4)) & 63);
  1975. *b++ = uuencode (((buf[i + 1] << 2) | (buf[i + 2] >> 6)) & 63);
  1976. *b++ = uuencode (buf[i + 2] & 63);
  1977. }
  1978. *b++ = '\n';
  1979. return setbuf (s_buf, b);
  1980. }
  1981. inline u_char *
  1982. xencode_qp_stream::encode (u_char *b, int c)
  1983. {
  1984. *b++ = '=';
  1985. *b++ = upcase_digit_char[c >> 4];
  1986. *b++ = upcase_digit_char[c & 15];
  1987. return b;
  1988. }
  1989. int
  1990. xencode_qp_stream::refill ()
  1991. {
  1992. int c, c2;
  1993. u_char *b = s_buf, *const be = b + LINESIZE;
  1994. while (b < be)
  1995. {
  1996. c = s_in.get ();
  1997. switch (c)
  1998. {
  1999. case eof:

Large files are truncated, but you can click here to view the full file