PageRenderTime 56ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/src/encoding.cc

https://github.com/snmsts/xyzzy
C++ | 2739 lines | 2464 code | 271 blank | 4 comment | 644 complexity | 3a444a7213d26e1c3c54ee91657adb6c MD5 | raw file
Possible License(s): BSD-3-Clause

Large files are truncated, but you can click here to view the full file

  1. #include "stdafx.h"
  2. #include "ed.h"
  3. #include "encoding.h"
  4. #include "ibmext.h"
  5. #include "utf2sjis.h"
  // Four-entry charset lists: US-ASCII, one double-byte set, and two
  // ccs_invalid slots.  NOTE(review): presumably the G0..G3 designations for
  // EUC-KR and EUC-GB style streams -- confirm against the callers.
  6. u_char escseq_euckr[] = {ccs_usascii, ccs_ksc5601, ccs_invalid, ccs_invalid};
  7. u_char escseq_eucgb[] = {ccs_usascii, ccs_gb2312, ccs_invalid, ccs_invalid};
  // Four all-ones masks.  NOTE(review): presumably meant as the `designatable'
  // argument of internal_to_iso2022_stream, whose select_designation() treats
  // a slot equal to u_int (-1) as accepting any charset -- confirm.
  8. u_int designatable_any[] = {u_int (-1), u_int (-1), u_int (-1), u_int (-1)};
  9. const Char *
  10. cjk_translate_table (int lang)
  11. {
  12. switch (lang)
  13. {
  14. case ENCODING_LANG_JP:
  15. case ENCODING_LANG_JP2:
  16. return wc2cp932_table;
  17. case ENCODING_LANG_KR:
  18. init_wc2ksc5601_table ();
  19. return wc2ksc5601_table;
  20. case ENCODING_LANG_CN_GB:
  21. init_wc2gb2312_table ();
  22. return wc2gb2312_table;
  23. case ENCODING_LANG_CN_BIG5:
  24. init_wc2big5_table ();
  25. return wc2big5_table;
  26. case ENCODING_LANG_CN:
  27. init_wc2gb2312_table ();
  28. init_wc2big5_table ();
  29. return 0;
  30. default:
  31. return 0;
  32. }
  33. }
  34. int
  35. xbuffered_read_stream::refill ()
  36. {
  37. do
  38. {
  39. begin ();
  40. refill_internal ();
  41. }
  42. while (head () == tail () && head () != base ());
  43. return setbuf (head (), tail ());
  44. }
  45. void
  46. sjis_to_internal_stream::refill_internal ()
  47. {
  48. while (room () > 0)
  49. {
  50. int c1 = s_in.get ();
  51. if (c1 == eof)
  52. break;
  53. if (SJISP (c1))
  54. {
  55. int c2 = s_in.get ();
  56. if (c2 != eof)
  57. c1 = (c1 << 8) + c2;
  58. }
  59. put (c1);
  60. }
  61. }
  62. void
  63. fast_sjis_to_internal_stream::refill_internal ()
  64. {
  65. const u_char *rs, *rse;
  66. Char *rd, *rde;
  67. s_in.begin_direct_input (rs, rse);
  68. begin_direct_output (rd, rde);
  69. const u_char *s = rs, *const se = rse;
  70. Char *d = rd, *const de = rde;
  71. for (; d < de && s < se; d++)
  72. {
  73. int c1 = *s++;
  74. if (SJISP (c1) && s < se)
  75. c1 = (c1 << 8) + *s++;
  76. *d = c1;
  77. }
  78. s_in.end_direct_input (s);
  79. end_direct_output (d);
  80. }
  81. Char
  82. jisx0212_to_internal (int c1, int c2, int vender)
  83. {
  84. if (vender == ENCODING_ISO_VENDER_OSFJVC && c1 >= 0x75)
  85. {
  86. c1 += 20;
  87. return (j2sh (c1, c2) << 8) | j2sl (c1, c2);
  88. }
  89. if (vender != ENCODING_ISO_VENDER_OSFJVC
  90. && vender != ENCODING_ISO_VENDER_IBMEXT)
  91. return jisx0212_to_int (c1, c2);
  92. if (c1 == 0x74)
  93. return ibmext_eucjp2sjis_table[c2 - (0x21 - (0x7f - 0x73))];
  94. if (c1 == 0x73 && c2 >= 0x73)
  95. return ibmext_eucjp2sjis_table[c2 - 0x73];
  96. Char cc = jisx0212_to_int (c1, c2);
  97. Char t = w2i (i2w (cc));
  98. if (t != Char (-1))
  99. return t;
  100. return cc;
  101. }
  102. iso2022_noesc_to_internal_stream::iso2022_noesc_to_internal_stream (xinput_stream <u_char> &in,
  103. const u_char *g,
  104. int flags)
  105. : xbuffered_read_stream (in),
  106. s_vender (vender_depend_code (flags & ENCODING_ISO_VENDER_MASK))
  107. {
  108. memcpy (s_g, g, 4);
  109. init_cns11643_table ();
  110. }
  111. void
  112. iso2022_noesc_to_internal_stream::to_internal (u_char ccs, int c1, int oc1)
  113. {
  114. if (ccs_1byte_charset_p (ccs))
  115. put ((ccs_1byte_94_charset_p (ccs)
  116. ? c1 <= ' ' || c1 >= 0x7f : c1 < ' ')
  117. ? oc1 : (ccs << 7) | c1);
  118. else
  119. {
  120. if (c1 <= 0x20 || c1 >= 0x7f)
  121. put (oc1);
  122. else
  123. {
  124. int oc2 = s_in.get ();
  125. if (oc2 == eof)
  126. {
  127. put (oc1);
  128. return;
  129. }
  130. int c2 = oc2 & 127;
  131. if (c2 <= 0x20 || c2 >= 0x7f)
  132. {
  133. put (oc1);
  134. s_in.putback (oc2);
  135. }
  136. else
  137. switch (ccs)
  138. {
  139. case ccs_jisx0208:
  140. if (s_vender == ENCODING_ISO_VENDER_OSFJVC && c1 >= 0x75)
  141. c1 += 10;
  142. put ((j2sh (c1, c2) << 8) | j2sl (c1, c2));
  143. break;
  144. case ccs_jisx0212:
  145. put (jisx0212_to_internal (c1, c2, s_vender));
  146. break;
  147. case ccs_gb2312:
  148. put (gb2312_to_int (c1, c2));
  149. break;
  150. case ccs_big5_1:
  151. case ccs_big5_2:
  152. mule_g2b (ccs, c1, c2);
  153. put (big5_to_int (c1, c2));
  154. break;
  155. case ccs_cns11643_1:
  156. {
  157. Char cc = cns11643_1_to_internal[c1 * 94 + c2 - (0x21 * 94 + 0x21)];
  158. if (cc != Char (-1))
  159. put (cc);
  160. else
  161. {
  162. put (oc1);
  163. put (oc2);
  164. }
  165. break;
  166. }
  167. case ccs_cns11643_2:
  168. {
  169. Char cc = cns11643_2_to_internal[c1 * 94 + c2 - (0x21 * 94 + 0x21)];
  170. if (cc != Char (-1))
  171. put (cc);
  172. else
  173. {
  174. put (oc1);
  175. put (oc2);
  176. }
  177. break;
  178. }
  179. case ccs_ksc5601:
  180. put (ksc5601_to_int (c1, c2));
  181. break;
  182. default:
  183. assert (0);
  184. put (oc1);
  185. put (oc2);
  186. break;
  187. }
  188. }
  189. }
  190. }
  191. void
  192. iso2022_noesc_to_internal_stream::refill_internal ()
  193. {
  194. while (room () > 0)
  195. {
  196. int c = s_in.get ();
  197. if (c == eof)
  198. break;
  199. if (c < 128)
  200. to_internal (s_g[0], c, c);
  201. else if (c == CC_SS2)
  202. {
  203. if (s_g[2] == ccs_invalid)
  204. put (c);
  205. else
  206. {
  207. c = s_in.get ();
  208. if (c == eof)
  209. break;
  210. to_internal (s_g[2], c & 127, c);
  211. }
  212. }
  213. else if (c == CC_SS3)
  214. {
  215. if (s_g[3] == ccs_invalid)
  216. put (c);
  217. else
  218. {
  219. c = s_in.get ();
  220. if (c == eof)
  221. break;
  222. to_internal (s_g[3], c & 127, c);
  223. }
  224. }
  225. else
  226. to_internal (s_g[1], c & 127, c);
  227. }
  228. }
  229. int
  230. iso2022_to_internal_stream::designate94 (u_char &g, int *cp)
  231. {
  232. int c = s_in.get ();
  233. switch (c)
  234. {
  235. case 'B':
  236. case 'J':
  237. g = ccs_usascii;
  238. return 1;
  239. case 'I':
  240. g = ccs_jisx0201_kana;
  241. return 1;
  242. default:
  243. put (cp[0]);
  244. put (cp[1]);
  245. cp[0] = c;
  246. return 0;
  247. }
  248. }
  249. int
  250. iso2022_to_internal_stream::designate96 (u_char &g, int *cp)
  251. {
  252. int c = s_in.get ();
  253. switch (c)
  254. {
  255. case 'A':
  256. g = ccs_iso8859_1;
  257. return 1;
  258. case 'B':
  259. g = ccs_iso8859_2;
  260. return 1;
  261. case 'C':
  262. g = ccs_iso8859_3;
  263. return 1;
  264. case 'D':
  265. g = ccs_iso8859_4;
  266. return 1;
  267. case 'F':
  268. g = ccs_iso8859_7;
  269. return 1;
  270. case 'L':
  271. g = ccs_iso8859_5;
  272. return 1;
  273. case 'M':
  274. g = ccs_iso8859_9;
  275. return 1;
  276. case 'V':
  277. g = ccs_iso8859_10;
  278. return 1;
  279. case 'Y':
  280. g = ccs_iso8859_13;
  281. return 1;
  282. default:
  283. put (cp[0]);
  284. put (cp[1]);
  285. cp[0] = c;
  286. return 0;
  287. }
  288. }
  289. int
  290. iso2022_to_internal_stream::designate94n (u_char &g, int *cp)
  291. {
  292. int c = s_in.get ();
  293. switch (c)
  294. {
  295. case '@':
  296. case 'B':
  297. g = ccs_jisx0208;
  298. return 1;
  299. case 'D':
  300. g = ccs_jisx0212;
  301. return 1;
  302. case 'A':
  303. g = ccs_gb2312;
  304. return 1;
  305. case 'C':
  306. g = ccs_ksc5601;
  307. return 1;
  308. case 'G':
  309. g = ccs_cns11643_1;
  310. return 1;
  311. case 'H':
  312. g = ccs_cns11643_2;
  313. return 1;
  314. case '0':
  315. g = ccs_big5_1;
  316. return 1;
  317. case '1':
  318. g = ccs_big5_2;
  319. return 1;
  320. default:
  321. put (cp[0]);
  322. put (cp[1]);
  323. put (cp[2]);
  324. cp[0] = c;
  325. return 0;
  326. }
  327. }
  // Decode input with escape-sequence processing.  ESC sequences update the
  // designated sets s_g[0..3] and the shift state (s_gl, s_gr, s_ss); every
  // other byte is converted through to_internal () using the set that is
  // currently selected for it.
  328. void
  329. iso2022_to_internal_stream::refill_internal ()
  330. {
  331. while (room () > 0)
  332. {
  // c[0..2] hold the bytes of the escape sequence being parsed; after the
  // inner loop c[0] is the byte to process as an ordinary character.
  333. int c[4];
  // Consume consecutive escape sequences.  `continue' means the sequence was
  // accepted and the next byte should be fetched.
  334. while (1)
  335. {
  336. c[0] = s_in.get ();
  // Re-entry point used after a failed designation left a new byte in c[0].
  337. again:
  338. if (c[0] != CC_ESC)
  339. break;
  // Any ESC cancels a pending single shift.
  340. s_ss = 0;
  341. c[1] = s_in.get ();
  342. switch (c[1])
  343. {
  // ESC $ ...: designation of a multibyte set.
  344. case '$':
  345. c[2] = s_in.get ();
  346. switch (c[2])
  347. {
  // Short forms without an intermediate byte always designate to s_g[0].
  348. case '@':
  349. case 'B':
  350. s_g[0] = ccs_jisx0208;
  351. continue;
  352. case 'A':
  353. s_g[0] = ccs_gb2312;
  354. continue;
  // The intermediate byte ( ) * + selects s_g[0], s_g[1], s_g[2], s_g[3].
  355. case '(':
  356. if (designate94n (s_g[0], c))
  357. continue;
  358. break;
  359. case ')':
  360. if (designate94n (s_g[1], c))
  361. continue;
  362. break;
  363. case '*':
  364. if (designate94n (s_g[2], c))
  365. continue;
  366. break;
  367. case '+':
  368. if (designate94n (s_g[3], c))
  369. continue;
  370. break;
  // Unknown byte after ESC $: store ESC and '$' as ordinary output and carry
  // the unknown byte on in c[0].
  371. default:
  372. put (c[0]);
  373. put (c[1]);
  374. c[0] = c[2];
  375. break;
  376. }
  377. break;
  // ESC ( ) * +: designation of a single-byte 94-character set.
  378. case '(':
  379. if (designate94 (s_g[0], c))
  380. continue;
  381. break;
  382. case ')':
  383. if (designate94 (s_g[1], c))
  384. continue;
  385. break;
  386. case '*':
  387. if (designate94 (s_g[2], c))
  388. continue;
  389. break;
  390. case '+':
  391. if (designate94 (s_g[3], c))
  392. continue;
  393. break;
  // ESC , - . /: designation of a 96-character set.
  394. case ',': // Mule compatible
  395. if (designate96 (s_g[0], c))
  396. continue;
  397. break;
  398. case '-':
  399. if (designate96 (s_g[1], c))
  400. continue;
  401. break;
  402. case '.':
  403. if (designate96 (s_g[2], c))
  404. continue;
  405. break;
  406. case '/':
  407. if (designate96 (s_g[3], c))
  408. continue;
  409. break;
  // Shift functions.  Each is rejected (goto wrong) when the set it refers to
  // is ccs_invalid.
  410. case 'N': // SS2
  411. if (s_g[2] == ccs_invalid)
  412. goto wrong;
  413. s_ss = &s_g[2];
  414. continue;
  415. case 'O': // SS3
  416. if (s_g[3] == ccs_invalid)
  417. goto wrong;
  418. s_ss = &s_g[3];
  419. continue;
  420. case 'n': // LS2:
  421. if (s_g[2] == ccs_invalid)
  422. goto wrong;
  423. s_gl = &s_g[2];
  424. continue;
  425. case 'o': // LS3:
  426. if (s_g[3] == ccs_invalid)
  427. goto wrong;
  428. s_gl = &s_g[3];
  429. continue;
  430. case '~': // LS1R
  431. if (s_g[1] == ccs_invalid)
  432. goto wrong;
  433. s_gr = &s_g[1];
  434. continue;
  435. case '}': // LS2R
  436. if (s_g[2] == ccs_invalid)
  437. goto wrong;
  438. s_gr = &s_g[2];
  439. continue;
  440. case '|': // LS3R
  441. if (s_g[3] == ccs_invalid)
  442. goto wrong;
  443. s_gr = &s_g[3];
  444. continue;
  // Rejected shift: store the ESC as ordinary output and process the byte
  // that followed it as a normal character.
  445. wrong:
  446. put (c[0]);
  447. c[0] = c[1];
  448. goto normal_char;
  // Unknown byte after ESC: put it back and process the ESC itself (still in
  // c[0]) as a normal character.
  449. default:
  450. s_in.putback (c[1]);
  451. goto normal_char;
  452. }
  // Reached only after a failed designation: the consumed bytes were already
  // stored and c[0] holds the byte that ended the sequence.
  453. if (c[0] == eof)
  454. return;
  455. if (room () > 0)
  456. goto again;
  // No room left for output: return the pending byte to the input and stop.
  457. s_in.putback (c[0]);
  458. return;
  459. }
  460. normal_char:
  461. int cc = c[0];
  462. if (cc == eof)
  463. break;
  // Single-byte shift controls.  `break' out of this switch means the control
  // is not honoured and falls through to ordinary conversion below.
  464. switch (cc)
  465. {
  466. case CC_SS2:
  467. if (s_g[2] == ccs_invalid)
  468. break;
  469. s_ss = &s_g[2];
  470. continue;
  471. case CC_SS3:
  472. if (s_g[3] == ccs_invalid)
  473. break;
  474. s_ss = &s_g[3];
  475. continue;
  // SI and SO are both honoured only when s_g[1] is designated.
  476. case CC_SI:
  477. if (s_g[1] == ccs_invalid)
  478. break;
  479. s_gl = &s_g[0];
  480. continue;
  481. case CC_SO:
  482. if (s_g[1] == ccs_invalid)
  483. break;
  484. s_gl = &s_g[1];
  485. continue;
  486. }
  // Pick the charset: a pending single shift wins, otherwise *s_gl for bytes
  // below 128 and *s_gr for the rest.
  487. int ccs;
  488. if (cc < 128)
  489. ccs = s_ss ? *s_ss : *s_gl;
  490. else
  491. {
  492. ccs = s_ss ? *s_ss : *s_gr;
  // With no valid set for the high half, the byte keeps its 8th bit and is
  // handed over as US-ASCII.
  493. if (ccs != ccs_invalid)
  494. cc &= 127;
  495. else
  496. ccs = ccs_usascii;
  497. }
  // A single shift applies to one character only.
  498. s_ss = 0;
  499. to_internal (ccs, cc, c[0]);
  500. }
  501. }
  502. void
  503. big5_to_internal_stream::refill_internal ()
  504. {
  505. while (room () > 0)
  506. {
  507. int c1 = s_in.get ();
  508. if (c1 == eof)
  509. break;
  510. if (c1 >= 0xa1 && c1 <= 0xf8 && c1 != 0xc8)
  511. {
  512. int c2 = s_in.get ();
  513. if (c2 >= 0x40 && c2 <= 0x7e || c2 >= 0xa1 && c2 <= 0xfe)
  514. c1 = big5_to_int (c1, c2);
  515. else
  516. s_in.putback (c2);
  517. }
  518. put (c1);
  519. }
  520. }
  521. void
  522. binary_to_internal_stream::refill_internal ()
  523. {
  524. while (room () > 0)
  525. {
  526. int c = s_in.get ();
  527. if (c == eof)
  528. break;
  529. put (c);
  530. }
  531. }
  532. utf_to_internal_stream::putw_t
  533. utf_to_internal_stream::per_lang_putw (int lang)
  534. {
  535. switch (lang)
  536. {
  537. default:
  538. case ENCODING_LANG_JP:
  539. case ENCODING_LANG_JP2:
  540. return &putw_jp;
  541. case ENCODING_LANG_KR:
  542. case ENCODING_LANG_CN_GB:
  543. case ENCODING_LANG_CN_BIG5:
  544. return &putw_gen;
  545. case ENCODING_LANG_CN:
  546. return &putw_cn;
  547. }
  548. }
  549. void
  550. utf_to_internal_stream::putw_jp (ucs2_t wc)
  551. {
  552. if (s_has_bom < 0)
  553. {
  554. s_has_bom = wc == UNICODE_BOM;
  555. if (s_has_bom)
  556. return;
  557. }
  558. if (!(s_flags & ENCODING_UTF_WINDOWS))
  559. {
  560. int n = wc % numberof (utf_shiftjis2internal_hash);
  561. if (utf_shiftjis2internal_hash[n].wc == wc)
  562. {
  563. put (utf_shiftjis2internal_hash[n].cc);
  564. return;
  565. }
  566. }
  567. Char cc;
  568. if (s_to_full_width
  569. && (cc = wc2cp932 (wc)) != Char (-1)
  570. && !ccs_1byte_94_charset_p (code_charset (cc)))
  571. put (cc);
  572. else
  573. {
  574. cc = w2i (wc);
  575. if (cc != Char (-1))
  576. put (cc);
  577. else
  578. {
  579. put (utf16_ucs2_to_undef_pair_high (wc));
  580. put (utf16_ucs2_to_undef_pair_low (wc));
  581. }
  582. }
  583. }
  584. void
  585. utf_to_internal_stream::putw_gen (ucs2_t wc)
  586. {
  587. if (s_has_bom < 0)
  588. {
  589. s_has_bom = wc == UNICODE_BOM;
  590. if (s_has_bom)
  591. return;
  592. }
  593. Char cc = w2i (wc);
  594. if (cc != Char (-1))
  595. {
  596. if (!ccs_1byte_94_charset_p (code_charset (cc)))
  597. {
  598. Char t = s_cjk_translate[wc];
  599. if (t != Char (-1))
  600. cc = t;
  601. }
  602. put (cc);
  603. }
  604. else
  605. {
  606. put (utf16_ucs2_to_undef_pair_high (wc));
  607. put (utf16_ucs2_to_undef_pair_low (wc));
  608. }
  609. }
  610. void
  611. utf_to_internal_stream::putw_cn (ucs2_t wc)
  612. {
  613. if (s_has_bom < 0)
  614. {
  615. s_has_bom = wc == UNICODE_BOM;
  616. if (s_has_bom)
  617. return;
  618. }
  619. Char cc = w2i (wc);
  620. if (cc != Char (-1))
  621. {
  622. if (!ccs_1byte_94_charset_p (code_charset (cc)))
  623. {
  624. Char t = wc2gb2312_table[wc];
  625. if (t != Char (-1) || (t = wc2big5_table[wc]) != Char (-1))
  626. cc = t;
  627. }
  628. put (cc);
  629. }
  630. else
  631. {
  632. put (utf16_ucs2_to_undef_pair_high (wc));
  633. put (utf16_ucs2_to_undef_pair_low (wc));
  634. }
  635. }
  636. inline void
  637. utf_to_internal_stream::putl (ucs4_t lc)
  638. {
  639. if (lc < 0x10000)
  640. putw (ucs2_t (lc));
  641. else
  642. {
  643. putw (utf16_ucs4_to_pair_high (lc));
  644. putw (utf16_ucs4_to_pair_low (lc));
  645. }
  646. }
  647. void
  648. utf16_to_internal_stream::refill_internal_le ()
  649. {
  650. while (room () > 0)
  651. {
  652. int c1 = s_in.get ();
  653. if (c1 == eof)
  654. break;
  655. int c2 = s_in.get ();
  656. if (c2 == eof)
  657. break;
  658. putw ((c2 << 8) | c1);
  659. }
  660. }
  661. void
  662. utf16_to_internal_stream::refill_internal_be ()
  663. {
  664. while (room () > 0)
  665. {
  666. int c1 = s_in.get ();
  667. if (c1 == eof)
  668. break;
  669. int c2 = s_in.get ();
  670. if (c2 == eof)
  671. break;
  672. putw ((c1 << 8) | c2);
  673. }
  674. }
  675. void
  676. utf16unknown_to_internal_stream::refill_internal ()
  677. {
  678. if (!s_byte_order)
  679. {
  680. int c1 = s_in.get ();
  681. if (c1 == eof)
  682. return;
  683. int c2 = s_in.get ();
  684. if (c2 == eof)
  685. return;
  686. ucs2_t wc = (c1 << 8) | c2;
  687. if (wc == UNICODE_BOM)
  688. s_byte_order = ENCODING_UTF_BE;
  689. else if (wc == UNICODE_REVBOM)
  690. s_byte_order = ENCODING_UTF_LE;
  691. else
  692. {
  693. putw (wc);
  694. if (xsymbol_value (Vdefault_utf16_byte_order) == Kbig_endian)
  695. s_byte_order = ENCODING_UTF_BE;
  696. else
  697. s_byte_order = ENCODING_UTF_LE;
  698. }
  699. }
  700. if (s_byte_order == ENCODING_UTF_BE)
  701. refill_internal_be ();
  702. else
  703. refill_internal_le ();
  704. }
  // Classification of a UTF-8 byte, indexed by the byte value (16 entries per
  // row, 256 in total).  As used by utf8_to_internal_stream::refill_internal:
  //   7      0x00-0x7f  single-byte character
  //   6      0x80-0xbf  not valid as a first byte
  //   5..1   0xc0-0xfd  lead byte followed by (6 - value) further bytes
  //   0      0xfe-0xff  invalid
  // The value is also the index into utf8_chmask.
  705. u_char utf8_chtab[] =
  706. {
  707. 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  708. 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  709. 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  710. 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  711. 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  712. 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  713. 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  714. 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  715. 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  716. 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  717. 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  718. 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  719. 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  720. 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  721. 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
  722. 3,3,3,3,3,3,3,3,2,2,2,2,1,1,0,0,
  723. };
  // Mask keeping the low N bits of a byte, indexed by N = utf8_chtab[byte].
  724. u_char utf8_chmask[] = {0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f};
  725. void
  726. utf8_to_internal_stream::refill_internal ()
  727. {
  728. while (room () > 0)
  729. {
  730. int c = s_in.get ();
  731. if (c == eof)
  732. break;
  733. u_char nbits = utf8_chtab[c];
  734. c &= utf8_chmask[nbits];
  735. switch (nbits)
  736. {
  737. case 7:
  738. putw (c);
  739. break;
  740. case 0:
  741. case 6:
  742. /* invalid code */
  743. break;
  744. default:
  745. {
  746. ucs4_t code = c;
  747. do
  748. {
  749. c = s_in.get ();
  750. if (c == eof)
  751. return;
  752. code = (code << 6) | (c & 0x3f);
  753. }
  754. while (++nbits < 6);
  755. putl (code);
  756. break;
  757. }
  758. }
  759. }
  760. }
  761. utf7_to_internal_stream::utf7_to_internal_stream (xinput_stream <u_char> &in,
  762. int flags, int lang)
  763. : utf_to_internal_stream (in, flags | ENCODING_UTF_SIGNATURE, lang),
  764. s_direct_encoding (1), s_cc (eof),
  765. s_imap4p (flags & UTF7_IMAP4_MAILBOX_NAME),
  766. s_shift_char (s_imap4p ? '&' : '+')
  767. {
  768. }
  769. int
  770. utf7_to_internal_stream::unicode_shifted_encoding ()
  771. {
  772. u_char buf[8];
  773. int nchars;
  774. for (nchars = 0; nchars < sizeof buf; nchars++)
  775. {
  776. s_cc = s_in.get ();
  777. if (s_cc == eof)
  778. break;
  779. int b = s_imap4p ? imap4_base64_decode (s_cc) : base64_decode (s_cc);
  780. if (b >= 64)
  781. break;
  782. buf[nchars] = b;
  783. }
  784. int t = 0;
  785. int n = nchars & ~3;
  786. int i;
  787. for (i = 0; i < n; i += 4)
  788. {
  789. buf[t++] = (buf[i] << 2) | (buf[i + 1] >> 4);
  790. buf[t++] = (buf[i + 1] << 4) | (buf[i + 2] >> 2);
  791. buf[t++] = (buf[i + 2] << 6) | buf[i + 3];
  792. }
  793. switch (nchars & 3)
  794. {
  795. case 2:
  796. buf[t++] = (buf[i] << 2) | (buf[i + 1] >> 4);
  797. break;
  798. case 3:
  799. buf[t++] = (buf[i] << 2) | (buf[i + 1] >> 4);
  800. buf[t++] = (buf[i + 1] << 4) | (buf[i + 2] >> 2);
  801. break;
  802. }
  803. t &= ~1;
  804. for (i = 0; i < t; i += 2)
  805. putw ((buf[i] << 8) + buf[i + 1]);
  806. s_nbytes += t;
  807. s_direct_encoding = nchars < sizeof buf;
  808. if (s_cc == '-')
  809. {
  810. if (!s_nbytes)
  811. putw (s_shift_char);
  812. s_cc = s_in.get ();
  813. }
  814. return !s_direct_encoding;
  815. }
  // Decode UTF-7 input while there is room.  s_cc is the byte currently being
  // examined; bytes other than s_shift_char are stored directly, and
  // s_shift_char starts a run handled by unicode_shifted_encoding ().
  816. void
  817. utf7_to_internal_stream::refill_internal ()
  818. {
  // s_direct_encoding is zero when an earlier call stopped inside a shifted
  // run: resume that run by jumping into the loop body below.
  819. if (!s_direct_encoding)
  820. goto unicode_shifted_encoding;
  // s_cc is eof after construction (and after end of input): fetch a byte.
  821. if (s_cc == eof)
  822. s_cc = s_in.get ();
  823. while (room () > 0)
  824. {
  825. if (s_cc == eof)
  826. break;
  827. if (s_cc != s_shift_char)
  828. {
  829. putw (s_cc);
  830. s_cc = s_in.get ();
  831. }
  832. else
  833. {
  // Start of a shifted run: s_nbytes counts the bytes decoded within it.
  834. s_nbytes = 0;
  835. s_direct_encoding = 0;
  836. unicode_shifted_encoding:
  // unicode_shifted_encoding () returns nonzero while the run continues; it
  // leaves the byte that follows the run in s_cc.
  837. while (room () > 0 && unicode_shifted_encoding ())
  838. ;
  839. }
  840. }
  841. }
  // Decode UTF-5 input.  A character starts with a byte whose digit_char ()
  // value is 16 or more (contributing value - 16) and continues with bytes
  // whose digit_char () value is below 16, each appending four bits.
  842. void
  843. utf5_to_internal_stream::refill_internal ()
  844. {
  845. while (1)
  846. {
  847. int c = s_in.get ();
  848. if (c == eof)
  849. return;
  // Entered again from below with c holding the first byte of the next
  // character.
  850. nextchar:
  851. ucs4_t code = digit_char (c) - 16;
  // Skip bytes that cannot start a character.  NOTE(review): this relies on
  // ucs4_t being unsigned so that digit values below 16 wrap to a large
  // number -- confirm.
  852. if (code >= 16)
  853. continue;
  854. while (1)
  855. {
  856. c = s_in.get ();
  857. if (c == eof)
  858. {
  // End of input terminates the character being assembled.
  859. putl (code);
  860. return;
  861. }
  862. int n = digit_char (c);
  863. if (n >= 16)
  864. {
  // c does not continue this character: store what was assembled, then
  // either stop (no room; c is put back) or restart with c.
  865. putl (code);
  866. if (room () <= 0)
  867. {
  868. s_in.putback (c);
  869. return;
  870. }
  871. goto nextchar;
  872. }
  873. code = (code << 4) | n;
  874. }
  875. }
  876. }
  877. void
  878. iso8859_to_internal_stream::refill_internal ()
  879. {
  880. while (room () > 0)
  881. {
  882. int c = s_in.get ();
  883. if (c == eof)
  884. break;
  885. if (c >= 0xa0)
  886. c = s_charset | (c & 127);
  887. put (c);
  888. }
  889. }
  890. void
  891. windows_codepage_to_internal_stream::refill_internal ()
  892. {
  893. while (room () > 0)
  894. {
  895. int c = s_in.get ();
  896. if (c == eof)
  897. break;
  898. if (c >= 0x80 && s_translate[c - 0x80] != Char (-1))
  899. c = s_translate[c - 0x80];
  900. put (c);
  901. }
  902. }
  903. void
  904. xwrite_stream::puteol ()
  905. {
  906. if (s_eol == eol_crlf)
  907. {
  908. put ('\r');
  909. put ('\n');
  910. }
  911. else if (s_eol == eol_lf)
  912. put ('\n');
  913. else
  914. put ('\r');
  915. s_nlines++;
  916. }
  917. int
  918. internal_to_sjis_stream::refill ()
  919. {
  920. begin ();
  921. while (room () > 0)
  922. {
  923. int c = s_in.get ();
  924. if (c == eof)
  925. break;
  926. Char cc = c;
  927. if (cc >= 0x80)
  928. {
  929. if (code_charset_bit (cc) & ccsf_possible_cp932)
  930. {
  931. cc = wc2cp932 (i2w (cc));
  932. if (cc == Char (-1))
  933. cc = DEFCHAR;
  934. }
  935. else if (code_charset_bit (cc) & ccsf_not_cp932)
  936. cc = DEFCHAR;
  937. if (DBCP (cc))
  938. {
  939. put (u_char (cc >> 8));
  940. put (u_char (cc));
  941. continue;
  942. }
  943. }
  944. if (cc == '\n')
  945. puteol ();
  946. else
  947. put (u_char (cc));
  948. }
  949. return finish ();
  950. }
  951. int
  952. internal_to_big5_stream::refill ()
  953. {
  954. begin ();
  955. while (room () > 0)
  956. {
  957. int c = s_in.get ();
  958. if (c == eof)
  959. break;
  960. Char cc = wc2big5_table[i2w (c)];
  961. if (cc == Char (-1))
  962. cc = DEFCHAR;
  963. if (cc >= 0x100)
  964. {
  965. int c1, c2;
  966. int_to_big5 (cc, c1, c2);
  967. put (c1);
  968. put (c2);
  969. }
  970. else if (cc == '\n')
  971. puteol ();
  972. else
  973. put (u_char (cc));
  974. }
  975. return finish ();
  976. }
  977. int
  978. internal_to_binary_stream::refill ()
  979. {
  980. begin ();
  981. while (room () > 0)
  982. {
  983. int c = s_in.get ();
  984. if (c == eof)
  985. break;
  986. if (c >= 0x100)
  987. {
  988. put (u_char (c >> 8));
  989. put (u_char (c));
  990. }
  991. else if (c == '\n')
  992. puteol ();
  993. else
  994. put (u_char (c));
  995. }
  996. return finish ();
  997. }
  998. Char
  999. convert_ibmext (Char cc)
  1000. {
  1001. if (cc <= 0xeffc)
  1002. return cc;
  1003. if (cc < 0xfa40 || cc > 0xfc4b)
  1004. return DEFCHAR;
  1005. int c2 = cc & 0xff;
  1006. if (c2 < 0x40 || c2 == 0x7f || c2 > 0xfc)
  1007. return DEFCHAR;
  1008. if (c2 >= 0x80)
  1009. c2--;
  1010. return ibmext2internal_table[((cc >> 8) - 0xfa) * 188 + c2 - 0x40];
  1011. }
  1012. Char
  1013. convert_ibmext2necext (Char cc)
  1014. {
  1015. if (cc <= 0xeffc)
  1016. return cc;
  1017. if (cc < 0xfa40 || cc > 0xfc4b)
  1018. return DEFCHAR;
  1019. int c2 = cc & 0xff;
  1020. if (c2 < 0x40 || c2 == 0x7f || c2 > 0xfc)
  1021. return DEFCHAR;
  1022. if (c2 >= 0x80)
  1023. c2--;
  1024. return ibmext2necext_table[((cc >> 8) - 0xfa) * 188 + c2 - 0x40];
  1025. }
  1026. Char
  1027. convert_osfjvc (Char cc)
  1028. {
  1029. if (cc < 0xecfc)
  1030. return cc;
  1031. if (cc >= 0xed40 && cc <= 0xeefc)
  1032. {
  1033. cc = w2i (i2w (cc));
  1034. if (cc == Char (-1))
  1035. return DEFCHAR;
  1036. }
  1037. if (cc < 0xfa40 || cc > 0xfc4b)
  1038. return cc;
  1039. int c2 = cc & 0xff;
  1040. if (c2 < 0x40 || c2 == 0x7f || c2 > 0xfc)
  1041. return DEFCHAR;
  1042. if (c2 >= 0x80)
  1043. c2--;
  1044. return ibmext2internal_table[((cc >> 8) - 0xfa) * 188 + c2 - 0x40];
  1045. }
  1046. int
  1047. vender_depend_code (int vender)
  1048. {
  1049. if (vender == ENCODING_ISO_VENDER_NIL)
  1050. vender = to_vender_code (xsymbol_value (Vvender_depend_code_mapping));
  1051. if (vender != ENCODING_ISO_VENDER_NIL)
  1052. return vender;
  1053. return ENCODING_ISO_VENDER_NECEXT;
  1054. }
  1055. vender_code_mapper_fn
  1056. select_vender_code_mapper (int vender)
  1057. {
  1058. switch (vender_depend_code (vender))
  1059. {
  1060. default:
  1061. case ENCODING_ISO_VENDER_IBMEXT:
  1062. return convert_ibmext;
  1063. case ENCODING_ISO_VENDER_NECEXT:
  1064. return convert_ibmext2necext;
  1065. case ENCODING_ISO_VENDER_OSFJVC:
  1066. return convert_osfjvc;
  1067. }
  1068. }
  // Per-charset designation data, indexed by charset id (the trailing comment
  // on each row names the id).  designate () reads the `final', `ctype' and
  // `fshort' members.  Rows initialised with {0} and the rows past the last
  // initialiser are zero.
  // NOTE(review): the third initialiser (1 on the jisx0208 and gb2312 rows)
  // is presumably `fshort', allowing the short designation form -- confirm
  // against the ccs_data declaration.
  1069. const internal_to_iso2022_stream::ccs_data internal_to_iso2022_stream::s_ccs_data[32] =
  1070. {
  1071. {'B', ctype94}, // ccs_usascii
  1072. {'I', ctype94}, // ccs_jisx0201_kana
  1073. {'A', ctype96}, // ccs_iso8859_1
  1074. {'B', ctype96}, // ccs_iso8859_2
  1075. {'C', ctype96}, // ccs_iso8859_3
  1076. {'D', ctype96}, // ccs_iso8859_4
  1077. {'L', ctype96}, // ccs_iso8859_5
  1078. {'F', ctype96}, // ccs_iso8859_7
  1079. {'M', ctype96}, // ccs_iso8859_9
  1080. {'V', ctype96}, // ccs_iso8859_10
  1081. {'Y', ctype96}, // ccs_iso8859_13
  1082. {'B', ctype94n, 1}, // ccs_jisx0208
  1083. {'D', ctype94n}, // ccs_jisx0212
  1084. {'A', ctype94n, 1}, // ccs_gb2312
  1085. {'C', ctype94n}, // ccs_ksc5601
  1086. {'0', ctype94n}, // ccs_big5_1
  1087. {'1', ctype94n}, // ccs_big5_2
  1088. {0}, // ccs_utf16_undef_char_high
  1089. {0}, // ccs_utf16_undef_char_low
  1090. {0}, // ccs_utf16_surrogate_high
  1091. {0}, // ccs_utf16_surrogate_low
  1092. {'G', ctype94n}, // ccs_cns11643_1
  1093. {'H', ctype94n}, // ccs_cns11643_2
  1094. };
  // Intermediate byte written by designate () for graphic set n (0..3): one
  // table for 94-character sets (also used after '$' for multibyte sets) and
  // one for 96-character sets.
  1095. const char internal_to_iso2022_stream::s_inter94[] = {'(', ')', '*', '+'};
  1096. const char internal_to_iso2022_stream::s_inter96[] = {',', '-', '.', '/'};
  1097. int
  1098. internal_to_iso2022_stream::select_designation (int ccs) const
  1099. {
  1100. if (ccs == ccs_usascii)
  1101. return 0;
  1102. for (int i = 0; i < 4; i++)
  1103. if (s_initial[i] == ccs)
  1104. return i;
  1105. for (int i = 0; i < 4; i++)
  1106. if (s_designatable[i] != u_int (-1)
  1107. && s_designatable[i] & (1 << ccs))
  1108. return i;
  1109. for (int i = 0; i < 4; i++)
  1110. if (s_designatable[i] == u_int (-1))
  1111. return i;
  1112. if (s_flags & ENCODING_ISO_LOCKING_SHIFT)
  1113. return 1;
  1114. if (s_ccs_data[ccs].ctype == ctype96)
  1115. return 2;
  1116. return 0;
  1117. }
  1118. internal_to_iso2022_stream::internal_to_iso2022_stream (xinput_stream <Char> &in,
  1119. eol_code eol,
  1120. int flags,
  1121. const u_char *initial,
  1122. const u_int *designatable,
  1123. int cjk)
  1124. : xwrite_stream (in, eol), s_flags (flags), s_initial (initial),
  1125. s_gl (&s_g[0]), s_gr (flags & ENCODING_ISO_7BITS ? 0 : &s_g[1]),
  1126. s_designatable (designatable), s_cjk_translate (cjk_translate_table (cjk)),
  1127. s_lang_cn (cjk == ENCODING_LANG_CN),
  1128. s_vender_code_mapper (select_vender_code_mapper
  1129. (flags & ENCODING_ISO_VENDER_MASK))
  1130. {
  1131. memcpy (s_g, s_initial, 4);
  1132. for (int i = ccs_usascii; i < ccs_max; i++)
  1133. s_designation[i] = select_designation (i);
  1134. if (s_flags & ENCODING_ISO_USE_CNS11643)
  1135. init_big5cns_table ();
  1136. }
  1137. void
  1138. internal_to_iso2022_stream::designate (int n, u_char ccs)
  1139. {
  1140. if (s_g[n] != ccs)
  1141. {
  1142. put ('\033');
  1143. switch (s_ccs_data[ccs].ctype)
  1144. {
  1145. case ctype94:
  1146. put (s_inter94[n]);
  1147. break;
  1148. case ctype96:
  1149. put (s_inter96[n]);
  1150. break;
  1151. case ctype94n:
  1152. put ('$');
  1153. if (n || !(s_flags & ENCODING_ISO_SHORT_FORM) || !s_ccs_data[ccs].fshort)
  1154. put (s_inter94[n]);
  1155. break;
  1156. }
  1157. put (s_ccs_data[ccs].final);
  1158. s_g[n] = ccs;
  1159. }
  1160. }
  1161. int
  1162. internal_to_iso2022_stream::designate (u_char ccs)
  1163. {
  1164. int n = s_designation[ccs];
  1165. if (s_g[n] != ccs)
  1166. {
  1167. if (s_gl != &s_g[0] && s_flags & ENCODING_ISO_ASCII_CTRL)
  1168. {
  1169. put (CC_SI);
  1170. s_gl = &s_g[0];
  1171. }
  1172. designate (n, ccs);
  1173. }
  1174. switch (n)
  1175. {
  1176. default:
  1177. if (s_gl != &s_g[0])
  1178. {
  1179. put (CC_SI);
  1180. s_gl = &s_g[0];
  1181. }
  1182. return 0;
  1183. case 1:
  1184. if (s_gr == &s_g[1])
  1185. return 0x80;
  1186. if (s_gl != &s_g[1])
  1187. {
  1188. put (CC_SO);
  1189. s_gl = &s_g[1];
  1190. }
  1191. return 0;
  1192. case 2:
  1193. if (s_gr)
  1194. {
  1195. put (CC_SS2);
  1196. return 0x80;
  1197. }
  1198. else
  1199. {
  1200. put (CC_ESC);
  1201. put ('N');
  1202. return 0;
  1203. }
  1204. case 3:
  1205. if (s_gr)
  1206. {
  1207. put (CC_SS3);
  1208. return 0x80;
  1209. }
  1210. else
  1211. {
  1212. put (CC_ESC);
  1213. put ('O');
  1214. return 0;
  1215. }
  1216. }
  1217. }
  // Encode internal characters as an ISO 2022 byte stream.  Each character is
  // optionally translated to the preferred CJK charset, passed through the
  // vendor mapper, and written after designate () has made its charset
  // current.  Returns the result of finish ().
  1218. int
  1219. internal_to_iso2022_stream::refill ()
  1220. {
  1221. begin ();
  1222. while (room () > 0)
  1223. {
  1224. int c = s_in.get ();
  1225. if (c == eof)
  1226. {
  // End of input: leave the stream with US-ASCII current.
  1227. designate (ccs_usascii);
  1228. break;
  1229. }
  1230. Char cc = c;
  1231. u_int ccsf = code_charset_bit (cc);
  // UTF-16 surrogate and undefined-character codes are replaced by DEFCHAR.
  1232. if (ccsf & (ccsf_utf16_surrogate | ccsf_utf16_undef_char))
  1233. cc = DEFCHAR;
  1234. else
  1235. {
  1236. if (s_lang_cn)
  1237. {
  // Chinese mode: a character not already in GB2312 or Big5 is replaced by
  // its wc2gb2312_table entry or, failing that, its wc2big5_table entry.
  1238. if (!(ccsf & (ccsf_gb2312 | ccsf_big5)))
  1239. {
  1240. wchar_t wc = i2w (cc);
  1241. Char t = wc2gb2312_table[wc];
  1242. if (t != Char (-1) || (t = wc2big5_table[wc]) != Char (-1))
  1243. cc = t;
  1244. }
  1245. }
  1246. else
  1247. {
  // Other languages: use the single translation table when there is one.
  1248. if (s_cjk_translate)
  1249. {
  1250. Char t = s_cjk_translate[i2w (cc)];
  1251. if (t != Char (-1))
  1252. cc = t;
  1253. }
  1254. }
  1255. cc = (*s_vender_code_mapper)(cc);
  1256. }
  1257. int ccs = code_charset (cc);
  1258. switch (ccs)
  1259. {
  // c1/c2 are the two 7-bit bytes of a double-byte character; f is the value
  // returned by designate () and ORed into each byte written.
  1260. int c1, c2, f;
  1261. case ccs_usascii:
  1262. if (cc == '\n')
  1263. {
  1264. if (s_flags & ENCODING_ISO_ASCII_EOL)
  1265. {
  // Before a line end, shift set 0 back in and restore every graphic set to
  // its initial designation.  A set whose initial value is ccs_invalid is
  // reset without writing an escape sequence.
  1266. if (s_gl != &s_g[0])
  1267. {
  1268. put (CC_SI);
  1269. s_gl = &s_g[0];
  1270. }
  1271. for (int i = 0; i < 4; i++)
  1272. if (s_g[i] != s_initial[i])
  1273. {
  1274. if (s_initial[i] != ccs_invalid)
  1275. designate (i, s_initial[i]);
  1276. else
  1277. s_g[i] = s_initial[i];
  1278. }
  1279. }
  1280. puteol ();
  1281. break;
  1282. }
  1283. /* fall thru... */
  // Write cc as a single byte.  US-ASCII is made current only for characters
  // strictly between ' ' and CC_DEL, or for every character when
  // ENCODING_ISO_ASCII_CTRL is set.
  1284. usascii:
  1285. if (s_flags & ENCODING_ISO_ASCII_CTRL || (cc > ' ' && cc < CC_DEL))
  1286. designate (ccs_usascii);
  1287. put (u_char (cc));
  1288. break;
  1289. case ccs_jisx0201_kana:
  // Codes up to 0xa0 and 0xff are written as plain bytes.
  1290. if (cc <= 0xa0 || cc == 0xff)
  1291. goto usascii;
  1292. f = designate (ccs_jisx0201_kana);
  1293. put (u_char (cc & 127 | f));
  1294. break;
  1295. case ccs_iso8859_1:
  1296. case ccs_iso8859_2:
  1297. case ccs_iso8859_3:
  1298. case ccs_iso8859_4:
  1299. case ccs_iso8859_5:
  1300. case ccs_iso8859_7:
  1301. case ccs_iso8859_9:
  1302. case ccs_iso8859_10:
  1303. case ccs_iso8859_13:
  1304. cc &= 127;
  // Low values (below ' ') are written as the raw byte with bit 7 set instead
  // of as a character of the 8859 set.
  1305. if (cc < ' ')
  1306. {
  1307. cc |= 0x80;
  1308. goto usascii;
  1309. }
  1310. f = designate (ccs);
  1311. put (u_char (cc | f));
  1312. break;
  1313. case ccs_jisx0212:
  1314. if (cc > CCS_JISX0212_MAX)
  1315. goto badchar;
  1316. int_to_jisx0212 (cc, c1, c2);
  1317. goto put94n;
  1318. case ccs_gb2312:
  1319. if (cc > CCS_GB2312_MAX)
  1320. goto badchar;
  1321. int_to_gb2312 (cc, c1, c2);
  1322. goto put94n;
  1323. case ccs_ksc5601:
  1324. if (cc > CCS_KSC5601_MAX)
  1325. goto badchar;
  1326. int_to_ksc5601 (cc, c1, c2);
  1327. goto put94n;
  1328. case ccs_big5:
  1329. if (cc > CCS_BIG5_MAX)
  1330. goto badchar;
  1331. if (s_flags & ENCODING_ISO_USE_CNS11643)
  1332. {
  // Prefer the big5cns_table entry when there is one; the 0x8080 bits of the
  // entry select the target charset and the low seven bits of each byte are
  // the code.  NOTE(review): the default branch selects ccs_gb2312 --
  // confirm this is intended for entries carrying neither CNS marker.
  1333. cc = big5cns_table[cc - CCS_BIG5_MIN];
  1334. if (cc != Char (-1))
  1335. {
  1336. switch (cc & 0x8080)
  1337. {
  1338. default:
  1339. ccs = ccs_gb2312;
  1340. break;
  1341. case BIG5CNS_CNS11643_1:
  1342. ccs = ccs_cns11643_1;
  1343. break;
  1344. case BIG5CNS_CNS11643_2:
  1345. ccs = ccs_cns11643_2;
  1346. break;
  1347. }
  1348. c1 = (cc >> 8) & 127;
  1349. c2 = cc & 127;
  1350. goto put94n;
  1351. }
  1352. }
  // NOTE(review): when the table lookup above yields Char (-1), cc has been
  // overwritten with Char (-1) before reaching int_to_big5 () -- confirm
  // whether the original code should be used here instead.
  1353. int_to_big5 (cc, c1, c2);
  1354. mule_b2g (ccs, c1, c2);
  1355. goto put94n;
  // Everything else is treated as a Shift_JIS double-byte code.
  1356. default:
  1357. c1 = cc >> 8;
  1358. c2 = cc & 255;
  1359. if (!SJISP (c1) || !SJIS2P (c2))
  1360. goto badchar;
  // NOTE(review): s2j () presumably rewrites c1/c2 in place to 7-bit JIS
  // values -- confirm.
  1361. s2j (c1, c2);
  // First bytes of 95 + 32 and above are remapped: the first ten by -10
  // (keeping the charset), the next ten by -20 into ccs_jisx0212; anything
  // beyond that cannot be written.
  1362. if (c1 >= 95 + 32)
  1363. {
  1364. if (c1 < 105 + 32)
  1365. c1 -= 10;
  1366. else if (c1 < 115 + 32)
  1367. {
  1368. c1 -= 20;
  1369. ccs = ccs_jisx0212;
  1370. }
  1371. else
  1372. goto badchar;
  1373. }
  // Common tail for double-byte characters: make ccs current and write both
  // bytes.
  1374. put94n:
  1375. f = designate (ccs);
  1376. put (c1 | f);
  1377. put (c2 | f);
  1378. break;
  // Characters that cannot be written are replaced by '?' in US-ASCII.
  1379. badchar:
  1380. designate (ccs_usascii);
  1381. put ('?');
  1382. break;
  1383. }
  1384. }
  1385. return finish ();
  1386. }
  1387. int
  1388. internal_to_utf_stream::getw () const
  1389. {
  1390. int c = s_in.get ();
  1391. if (c == eof)
  1392. return eof;
  1393. Char cc = Char (c);
  1394. if (!(s_flags & ENCODING_UTF_WINDOWS) && cc != Char (-1))
  1395. {
  1396. int n = cc % numberof (utf_internal2shiftjis_hash);
  1397. if (utf_internal2shiftjis_hash[n].cc == cc)
  1398. return utf_internal2shiftjis_hash[n].wc;
  1399. }
  1400. ucs2_t wc = i2w (cc);
  1401. if (wc != ucs2_t (-1))
  1402. return wc;
  1403. if (utf16_undef_char_high_p (ucs2_t (cc)))
  1404. {
  1405. int c2 = s_in.get ();
  1406. if (c2 != eof)
  1407. {
  1408. if (utf16_undef_char_low_p (ucs2_t (c2)))
  1409. return utf16_undef_pair_to_ucs2 (ucs2_t (cc), ucs2_t (c2));
  1410. s_in.putback (c2);
  1411. }
  1412. }
  1413. return DEFCHAR;
  1414. }
  1415. int
  1416. internal_to_utf16le_stream::refill ()
  1417. {
  1418. begin ();
  1419. if (s_bom)
  1420. {
  1421. s_bom = 0;
  1422. if (!s_in.eofp ())
  1423. {
  1424. put (u_char (UNICODE_BOM));
  1425. put (u_char (UNICODE_BOM >> 8));
  1426. }
  1427. }
  1428. while (room () > 0)
  1429. {
  1430. int c = getw ();
  1431. if (c == eof)
  1432. break;
  1433. ucs2_t wc = ucs2_t (c);
  1434. if (wc == '\n')
  1435. {
  1436. if (s_eol == eol_crlf)
  1437. {
  1438. put ('\r');
  1439. put (0);
  1440. put ('\n');
  1441. put (0);
  1442. }
  1443. else if (s_eol == eol_lf)
  1444. {
  1445. put ('\n');
  1446. put (0);
  1447. }
  1448. else
  1449. {
  1450. put ('\r');
  1451. put (0);
  1452. }
  1453. s_nlines++;
  1454. }
  1455. else
  1456. {
  1457. put (u_char (wc));
  1458. put (u_char (wc >> 8));
  1459. }
  1460. }
  1461. return finish ();
  1462. }
  1463. int
  1464. internal_to_utf16be_stream::refill ()
  1465. {
  1466. begin ();
  1467. if (s_bom)
  1468. {
  1469. s_bom = 0;
  1470. if (!s_in.eofp ())
  1471. {
  1472. put (u_char (UNICODE_BOM >> 8));
  1473. put (u_char (UNICODE_BOM));
  1474. }
  1475. }
  1476. while (room () > 0)
  1477. {
  1478. int c = getw ();
  1479. if (c == eof)
  1480. break;
  1481. ucs2_t wc = ucs2_t (c);
  1482. if (wc == '\n')
  1483. {
  1484. if (s_eol == eol_crlf)
  1485. {
  1486. put (0);
  1487. put ('\r');
  1488. put (0);
  1489. put ('\n');
  1490. }
  1491. else if (s_eol == eol_lf)
  1492. {
  1493. put (0);
  1494. put ('\n');
  1495. }
  1496. else
  1497. {
  1498. put (0);
  1499. put ('\r');
  1500. }
  1501. s_nlines++;
  1502. }
  1503. else
  1504. {
  1505. put (u_char (wc >> 8));
  1506. put (u_char (wc));
  1507. }
  1508. }
  1509. return finish ();
  1510. }
  1511. int
  1512. internal_to_utf8_stream::refill ()
  1513. {
  1514. begin ();
  1515. if (s_bom)
  1516. {
  1517. s_bom = 0;
  1518. if (!s_in.eofp ())
  1519. {
  1520. put (0xef);
  1521. put (0xbb);
  1522. put (0xbf);
  1523. }
  1524. }
  1525. while (room () > 0)
  1526. {
  1527. int c = getw ();
  1528. if (c == eof)
  1529. break;
  1530. ucs2_t wc = ucs2_t (c);
  1531. ucs4_t lc = wc;
  1532. if (utf16_surrogate_high_p (wc))
  1533. {
  1534. c = s_in.get ();
  1535. if (utf16_surrogate_low_p (ucs2_t (c)))
  1536. lc = utf16_pair_to_ucs4 (wc, ucs2_t (c));
  1537. else
  1538. s_in.putback (c);
  1539. }
  1540. if (lc < 0x80)
  1541. {
  1542. if (lc == '\n')
  1543. puteol ();
  1544. else
  1545. put (u_char (lc));
  1546. }
  1547. else if (lc < 0x800)
  1548. {
  1549. put (u_char (0xc0 | ((lc >> 6) & 0x1f)));
  1550. put (u_char (0x80 | (lc & 0x3f)));
  1551. }
  1552. else if (lc < 0x10000)
  1553. {
  1554. put (u_char (0xe0 | ((lc >> 12) & 0xf)));
  1555. put (u_char (0x80 | ((lc >> 6) & 0x3f)));
  1556. put (u_char (0x80 | (lc & 0x3f)));
  1557. }
  1558. else /* lc < 0x200000(0x110000) */
  1559. {
  1560. put (u_char (0xf0 | ((lc >> 18) & 7)));
  1561. put (u_char (0x80 | ((lc >> 12) & 0x3f)));
  1562. put (u_char (0x80 | ((lc >> 6) & 0x3f)));
  1563. put (u_char (0x80 | (lc & 0x3f)));
  1564. }
  1565. }
  1566. return finish ();
  1567. }
  /* Base64 alphabet: index 0..63 -> output character.  */
  static const char b64chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "+/";

  /* Variant alphabet selected for IMAP4 mailbox names; it differs from
     b64chars only in the last character (',' instead of '/').  */
  static const char imap4_b64chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "+,";
  1572. internal_to_utf7_stream::internal_to_utf7_stream (xinput_stream <Char> &in,
  1573. eol_code eol, int flags)
  1574. : internal_to_utf_stream (in, eol, flags & ~ENCODING_UTF_SIGNATURE),
  1575. s_nb (0),
  1576. s_nshift (0),
  1577. s_accept (flags),
  1578. s_imap4p (flags & UTF7_IMAP4_MAILBOX_NAME),
  1579. s_shift_char (s_imap4p ? '&' : '+'),
  1580. s_b64 (s_imap4p ? imap4_b64chars : b64chars)
  1581. {
  1582. /* In IMAP4 modified UTF-7, "&" is always represented by "&-". */
  1583. if (s_imap4p)
  1584. s_accept |= UTF7_IMAP4_SHIFT_CHAR;
  1585. }
  1586. void
  1587. internal_to_utf7_stream::encode_b64 ()
  1588. {
  1589. int n = s_nb - s_nb % 3;
  1590. const u_char *b, *const be = s_b + n;
  1591. for (b = s_b; b < be; b += 3)
  1592. {
  1593. put (s_b64[(b[0] >> 2) & 63]);
  1594. put (s_b64[((b[0] << 4) | (b[1] >> 4)) & 63]);
  1595. put (s_b64[((b[1] << 2) | (b[2] >> 6)) & 63]);
  1596. put (s_b64[b[2] & 63]);
  1597. }
  1598. if (n != s_nb)
  1599. switch (s_nb % 3)
  1600. {
  1601. case 1:
  1602. put (s_b64[(b[0] >> 2) & 63]);
  1603. put (s_b64[(b[0] << 4) & 63]);
  1604. break;
  1605. case 2:
  1606. put (s_b64[(b[0] >> 2) & 63]);
  1607. put (s_b64[((b[0] << 4) | (b[1] >> 4)) & 63]);
  1608. put (s_b64[(b[1] << 2) & 63]);
  1609. break;
  1610. }
  1611. }
  1612. int
  1613. internal_to_utf7_stream::refill ()
  1614. {
  1615. begin ();
  1616. while (room () > 0)
  1617. {
  1618. int c = getw ();
  1619. if (c == eof)
  1620. {
  1621. if (s_nshift)
  1622. {
  1623. if (s_nshift != 1 || s_nb != 2 || s_b[1] != s_shift_char)
  1624. encode_b64 ();
  1625. put ('-');
  1626. s_nshift = 0;
  1627. }
  1628. break;
  1629. }
  1630. ucs2_t wc = ucs2_t (c);
  1631. if (wc < 0x80 && utf7_set (wc) & s_accept)
  1632. {
  1633. if (s_nb)
  1634. {
  1635. if (s_nshift == 1 && s_nb == 2 && s_b[1] == s_shift_char)
  1636. put ('-');
  1637. else
  1638. {
  1639. encode_b64 ();
  1640. if (s_imap4p || wc == '-' || utf7_set (wc) & UTF7_SET_B)
  1641. put ('-');
  1642. }
  1643. s_nb = 0;
  1644. s_nshift = 0;
  1645. }
  1646. if (wc == '\n')
  1647. puteol ();
  1648. else
  1649. put (u_char (wc));
  1650. if (wc == s_shift_char)
  1651. put ('-');
  1652. }
  1653. else
  1654. {
  1655. if (!s_nshift)
  1656. put (u_char (s_shift_char));
  1657. if (s_nb == sizeof s_b)
  1658. {
  1659. encode_b64 ();
  1660. s_nb = 0;
  1661. }
  1662. s_b[s_nb++] = wc >> 8;
  1663. s_b[s_nb++] = u_char (wc);
  1664. s_nshift++;
  1665. }
  1666. }
  1667. return finish ();
  1668. }
  1669. int
  1670. internal_to_utf5_stream::refill ()
  1671. {
  1672. begin ();
  1673. while (room () > 0)
  1674. {
  1675. int c = getw ();
  1676. if (c == eof)
  1677. break;
  1678. ucs2_t wc = ucs2_t (c);
  1679. ucs4_t lc = wc;
  1680. if (utf16_surrogate_high_p (wc))
  1681. {
  1682. c = s_in.get ();
  1683. if (utf16_surrogate_low_p (ucs2_t (c)))
  1684. lc = utf16_pair_to_ucs4 (wc, ucs2_t (c));
  1685. else
  1686. s_in.putback (c);
  1687. }
  1688. if (!lc)
  1689. put ('G');
  1690. else if (lc < 0x10000)
  1691. {
  1692. for (int i = 0;; i++, lc <<= 4)
  1693. if (lc & 0xf000)
  1694. {
  1695. put ((upcase_digit_char + 16)[lc >> 12]);
  1696. for (; i < 3; i++, lc <<= 4)
  1697. put (upcase_digit_char[(lc >> 8) & 15]);
  1698. break;
  1699. }
  1700. }
  1701. else
  1702. {
  1703. for (int i = 0;; i++, lc <<= 4)
  1704. if (lc & 0xf0000000)
  1705. {
  1706. put ((upcase_digit_char + 16)[lc >> 28]);
  1707. for (; i < 7; i++, lc <<= 4)
  1708. put (upcase_digit_char[(lc >> 24) & 15]);
  1709. break;
  1710. }
  1711. }
  1712. }
  1713. return finish ();
  1714. }
  1715. const wc2int_hash &
  1716. internal_to_iso8859_stream::charset_hash (int ccs)
  1717. {
  1718. switch (ccs)
  1719. {
  1720. default:
  1721. assert (0);
  1722. case ccs_iso8859_1:
  1723. return wc2int_iso8859_1_hash;
  1724. case ccs_iso8859_2:
  1725. return wc2int_iso8859_2_hash;
  1726. case ccs_iso8859_3:
  1727. return wc2int_iso8859_3_hash;
  1728. case ccs_iso8859_4:
  1729. return wc2int_iso8859_4_hash;
  1730. case ccs_iso8859_5:
  1731. return wc2int_iso8859_5_hash;
  1732. case ccs_iso8859_7:
  1733. return wc2int_iso8859_7_hash;
  1734. case ccs_iso8859_9:
  1735. return wc2int_iso8859_9_hash;
  1736. case ccs_iso8859_10:
  1737. return wc2int_iso8859_10_hash;
  1738. case ccs_iso8859_13:
  1739. return wc2int_iso8859_13_hash;
  1740. }
  1741. }
  1742. int
  1743. internal_to_iso8859_stream::refill ()
  1744. {
  1745. begin ();
  1746. while (room () > 0)
  1747. {
  1748. int c = s_in.get ();
  1749. if (c == eof)
  1750. break;
  1751. Char cc = c;
  1752. if (cc >= 0xa0)
  1753. {
  1754. if (code_charset (cc) == s_charset)
  1755. cc = int_to_iso8859 (cc);
  1756. else
  1757. {
  1758. cc = lookup_wc2int_hash (s_hash, i2w (cc));
  1759. cc = cc != Char (-1) ? int_to_iso8859 (cc) : DEFCHAR;
  1760. }
  1761. if (cc >= 0x80 && cc < 0xa0)
  1762. cc = DEFCHAR;
  1763. }
  1764. if (cc == '\n')
  1765. puteol ();
  1766. else
  1767. put (u_char (cc));
  1768. }
  1769. return finish ();
  1770. }
  1771. int
  1772. internal_to_windows_codepage_stream::refill ()
  1773. {
  1774. begin ();
  1775. while (room () > 0)
  1776. {
  1777. int c = s_in.get ();
  1778. if (c == eof)
  1779. break;
  1780. Char cc = c;
  1781. if (cc >= 128)
  1782. {
  1783. cc = lookup_wc2int_hash (s_hash, i2w (cc));
  1784. if (cc == Char (-1))
  1785. cc = DEFCHAR;
  1786. }
  1787. if (cc == '\n')
  1788. puteol ();
  1789. else
  1790. put (u_char (cc));
  1791. }
  1792. return finish ();
  1793. }
  1794. int
  1795. xdecode_stream::decode (int nchars, const u_char *i)
  1796. {
  1797. if (!nchars)
  1798. return eof;
  1799. begin ();
  1800. for (; nchars >= 3; i += 4, nchars -= 3)
  1801. {
  1802. put ((i[0] << 2) | (i[1] >> 4));
  1803. put ((i[1] << 4) | (i[2] >> 2));
  1804. put ((i[2] << 6) | i[3]);
  1805. }
  1806. if (nchars > 0)
  1807. {
  1808. put ((i[0] << 2) | (i[1] >> 4));
  1809. if (nchars > 1)
  1810. put ((i[1] << 4) | (i[2] >> 2));
  1811. }
  1812. return finish ();
  1813. }
  1814. int
  1815. xdecode_b64_stream::refill ()
  1816. {
  1817. u_char buf[XDECODE_STREAM_BUFSIZE / 3 * 4];
  1818. int nchars;
  1819. for (nchars = 0; nchars < sizeof buf;)
  1820. {
  1821. int c = s_in.get ();
  1822. if (c == eof)
  1823. break;
  1824. c = base64_decode (c);
  1825. if (c < 64)
  1826. buf[nchars++] = c;
  1827. else if (c == 64 && nchars)
  1828. break;
  1829. }
  1830. return decode (nchars * 3 / 4, buf);
  1831. }
  1832. int
  1833. xdecode_uu_stream::refill ()
  1834. {
  1835. int c;
  1836. do
  1837. {
  1838. c = s_in.get ();
  1839. if (c == eof)
  1840. return eof;
  1841. }
  1842. while (c == '\r' || c == '\n');
  1843. int nchars = uudecode (c);
  1844. u_char buf[63 / 3 * 4];
  1845. int i;
  1846. for (i = 0; i < sizeof buf; i++)
  1847. {
  1848. c = s_in.get ();
  1849. if (c == eof || c == '\n')
  1850. break;
  1851. buf[i] = uudecode (c);
  1852. }
  1853. if (i == sizeof buf)
  1854. do
  1855. c = s_in.get ();
  1856. while (c != eof && c != '\n');
  1857. return decode (nchars, buf);
  1858. }
  1859. int
  1860. xdecode_qp_stream::refill ()
  1861. {
  1862. int c1, c2, c3;
  1863. begin ();
  1864. c1 = s_in.get ();
  1865. while (room () > 0)
  1866. {
  1867. if (c1 == eof)
  1868. break;
  1869. if (c1 != '=')
  1870. put (s_underscore_to_space && c1 == '_' ? ' ' : c1);
  1871. else
  1872. {
  1873. c2 = s_in.get ();
  1874. if (c2 == '\r')
  1875. {
  1876. c3 = s_in.get ();
  1877. if (c3 != '\n')
  1878. {
  1879. put (c1);
  1880. put (c2);
  1881. c1 = c3;
  1882. continue;
  1883. }
  1884. }
  1885. else if (c2 == '\n')
  1886. ;
  1887. else
  1888. {
  1889. if (c2 == eof || digit_char (c2) >= 16)
  1890. {
  1891. put (c1);
  1892. c1 = c2;
  1893. continue;
  1894. }
  1895. c3 = s_in.get ();
  1896. if (c3 == eof || digit_char (c3) >= 16)
  1897. {
  1898. put (c1);
  1899. put (c2);
  1900. c1 = c3;
  1901. continue;
  1902. }
  1903. put ((digit_char (c2) << 4) | digit_char (c3));
  1904. }
  1905. }
  1906. c1 = s_in.get ();
  1907. }
  1908. if (c1 != eof)
  1909. s_in.putback (c1);
  1910. return finish ();
  1911. }
  1912. int
  1913. xencode_b64_stream::refill ()
  1914. {
  1915. u_char *b = s_buf, *const be = b + s_width;
  1916. while (b < be)
  1917. {
  1918. int c1 = s_in.get ();
  1919. if (c1 == eof)
  1920. {
  1921. if (b == s_buf)
  1922. return eof;
  1923. break;
  1924. }
  1925. int c2 = s_in.get ();
  1926. if (c2 == eof)
  1927. {
  1928. *b++ = b64chars[(c1 >> 2) & 63];
  1929. *b++ = b64chars[(c1 << 4) & 63];
  1930. *b++ = '=';
  1931. *b++ = '=';
  1932. break;
  1933. }
  1934. int c3 = s_in.get ();
  1935. if (c3 == eof)
  1936. {
  1937. *b++ = b64chars[(c1 >> 2) & 63];
  1938. *b++ = b64chars[((c1 << 4) | (c2 >> 4)) & 63];
  1939. *b++ = b64chars[(c2 << 2) & 63];
  1940. *b++ = '=';
  1941. break;
  1942. }
  1943. *b++ = b64chars[(c1 >> 2) & 63];
  1944. *b++ = b64chars[((c1 << 4) | (c2 >> 4)) & 63];
  1945. *b++ = b64chars[((c2 << 2) | (c3 >> 6)) & 63];
  1946. *b++ = b64chars[c3 & 63];
  1947. }
  1948. if (s_fold_p)
  1949. *b++ = '\n';
  1950. return setbuf (s_buf, b);
  1951. }
  1952. int
  1953. xencode_uu_stream::refill ()
  1954. {
  1955. if (s_eofp)
  1956. return eof;
  1957. u_char buf[BUFSIZE];
  1958. int nchars;
  1959. for (nchars = 0; nchars < sizeof buf; nchars++)
  1960. {
  1961. int c = s_in.get ();
  1962. if (c == eof)
  1963. {
  1964. s_eofp = !nchars;
  1965. break;
  1966. }
  1967. buf[nchars] = c;
  1968. }
  1969. u_char *b = s_buf;
  1970. *b++ = uuencode (nchars);
  1971. for (int i = 0; i < nchars; i += 3)
  1972. {
  1973. *b++ = uuencode ((buf[i] >> 2) & 63);
  1974. *b++ = uuencode (((buf[i] << 4) | (buf[i + 1] >> 4)) & 63);
  1975. *b++ = uuencode (((buf[i + 1] << 2) | (buf[i + 2] >> 6)) & 63);
  1976. *b++ = uuencode (buf[i + 2] & 63);
  1977. }
  1978. *b++ = '\n';
  1979. return setbuf (s_buf, b);
  1980. }
  1981. inline u_char *
  1982. xencode_qp_stream::encode (u_char *b, int c)
  1983. {
  1984. *b++ = '=';
  1985. *b++ = upcase_digit_char[c >> 4];
  1986. *b++ = upcase_digit_char[c & 15];
  1987. return b;
  1988. }
  1989. int
  1990. xencode_qp_stream::refill ()
  1991. {
  1992. int c, c2;
  1993. u_char *b = s_buf, *const be = b + LINESIZE;
  1994. while (b < be)
  1995. {
  1996. c = s_in.get ();
  1997. switch (c)
  1998. {
  1999. case eof:
  2000. return setbuf (s_buf, b);
  2001. case '\n':
  2002. *b++ = c;
  2003. return setbuf (s_buf, b);
  2004. case ' ':
  2005. if (s_space_to_underscore)
  2006. {
  2007. *b++ = '_';
  2008. break;
  2009. }
  2010. goto white_space;
  2011. case '\t':
  2012. if (s_space_to_underscore)
  2013. {
  2014. b = encode (b, c);
  2015. break;
  2016. }
  2017. /* fall thru... */
  2018. white_space:
  2019. c2 = s_in.get ();
  2020. if (c2 == '\n')
  2021. {
  2022. b = encode (b, c);
  2023. *b++ = c2;
  2024. return setbuf (s_buf, b);
  2025. }
  2026. s_in.putback (c2);
  2027. *b++ = c;
  2028. break;
  2029. case '_':
  2030. case '?':
  2031. if (s_space_to_underscore)
  2032. b = encode (b, c);
  2033. else
  2034. *b++ = c;
  2035. break;
  2036. case '=':
  2037. b = encode (b, c);
  2038. break;
  2039. default:
  2040. if (c > ' ' && c < 0x7f)
  2041. *b++ = c;
  2042. else
  2043. b = encode (b, c);
  2044. break;
  2045. }
  2046. }
  2047. *b++ = '=';
  2048. *b++ = '\n';
  2049. return setbuf (s_buf, b);
  2050. }
  2051. int
  2052. xdecode_url_stream::refill ()
  2053. {
  2054. int c1, c2, c3;
  2055. begin ();
  2056. c1 = s_in.get ();
  2057. while (room () > 0)
  2058. {
  2059. if (c1 == eof)
  2060. break;
  2061. if (c1 != '%')
  2062. put (c1);
  2063. else
  2064. {
  2065. c2 = s_in.get ();
  2066. if (c2 == eof || digit_char (c2) >= 16)
  2067. {
  2068. put (c1);
  2069. c1 = c2;
  2070. continue;
  2071. }
  2072. c3 = s_in.get ();
  2073. if (c3 == eof || digit_char (c3) >= 16)
  2074. {
  2075. put (c1);
  2076. put (c2);
  2077. c1 = c3;
  2078. continue;
  2079. }
  2080. put ((digit_char (c2) << 4) | digit_char (c3));
  2081. }
  2082. c1 = s_in.get ();
  2083. }
  2084. if (c1 != eof)
  2085. s_in.putback (c1);
  2086. return finish ();
  2087. }
  2088. int
  2089. xencode_url_stream::refill ()
  2090. {
  2091. begin ();
  2092. while (room () > 0)
  2093. {
  2094. int c = s_in.get ();
  2095. if (c == eof)
  2096. break;
  2097. if (s_literal[c])
  2098. put (c);
  2099. else
  2100. {
  2101. put ('%');
  2102. put (upcase_digit_char[c >> 4]);
  2103. put (upcase_digit_char[c & 15]);
  2104. }
  2105. }
  2106. return finish ();
  2107. }
  2108. int
  2109. xdecode_hqx_s

Large files are truncated, but you can click here to view the full file