PageRenderTime 36ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 1ms

/transcode.c

https://github.com/fizx/ruby
C | 4258 lines | 3064 code | 491 blank | 703 comment | 605 complexity | 250fdc509e495afc2048c1b1725ad0c1 MD5 | raw file
Possible License(s): LGPL-2.1, AGPL-3.0, GPL-2.0, BSD-3-Clause
  1. /**********************************************************************
  2. transcode.c -
  3. $Author$
  4. created at: Tue Oct 30 16:10:22 JST 2007
  5. Copyright (C) 2007 Martin Duerst
  6. **********************************************************************/
  7. #include "ruby/ruby.h"
  8. #include "ruby/encoding.h"
  9. #include "transcode_data.h"
  10. #include <ctype.h>
  11. /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
  /* Exception-class and converter-class handles.  They are not static, so
   * they have external linkage and can be referenced from other files. */
  12. VALUE rb_eUndefinedConversionError;
  13. VALUE rb_eInvalidByteSequenceError;
  14. VALUE rb_eConverterNotFoundError;
  15. VALUE rb_cEncodingConverter;
  /* File-local VALUEs.  Presumably interned option/result symbols that are
   * assigned during initialisation outside this excerpt -- TODO confirm. */
  16. static VALUE sym_invalid, sym_undef, sym_replace;
  17. static VALUE sym_xml, sym_text, sym_attr;
  18. static VALUE sym_universal_newline;
  19. static VALUE sym_crlf_newline;
  20. static VALUE sym_cr_newline;
  21. static VALUE sym_partial_input;
  22. static VALUE sym_invalid_byte_sequence;
  23. static VALUE sym_undefined_conversion;
  24. static VALUE sym_destination_buffer_full;
  25. static VALUE sym_source_buffer_empty;
  26. static VALUE sym_finished;
  27. static VALUE sym_after_output;
  28. static VALUE sym_incomplete_input;
  /*
   * Forward declaration only; the definition is not visible in this part of
   * the file.  Judging from the parameter names it converts `str` (length
   * `len`) from encoding `sname` to `dname`, optionally using a
   * caller-supplied buffer, and reports the result length through
   * `dst_len_ptr` -- NOTE(review): confirm against the definition.
   */
  29. static unsigned char *
  30. allocate_converted_string(const char *sname, const char *dname,
  31. const unsigned char *str, size_t len,
  32. unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
  33. size_t *dst_len_ptr);
  34. /* dynamic structure, one per conversion (similar to iconv_t) */
  35. /* may carry conversion state (e.g. for iso-2022-jp) */
  36. typedef struct rb_transcoding {
  /* static description of the conversion this instance executes */
  37. const rb_transcoder *transcoder;
  38. int flags;
  /* number of the resume label stored by SUSPEND in transcode_restartable0;
   * 0 means "start from the beginning" */
  39. int resume_position;
  /* lookup state that transcode_restartable0 keeps across suspensions */
  40. unsigned int next_table;
  41. VALUE next_info;
  42. unsigned char next_byte;
  /* progress counter used while emitting the bytes of a STR1 action */
  43. unsigned int output_index;
  44. ssize_t recognized_len; /* already interpreted */
  45. ssize_t readagain_len; /* not yet interpreted */
  /* input bytes saved across suspensions; which member is active is decided
   * by TRANSCODING_READBUF() from transcoder->max_input */
  46. union {
  47. unsigned char ary[8]; /* max_input <= sizeof(ary) */
  48. unsigned char *ptr; /* length: max_input */
  49. } readbuf; /* recognized_len + readagain_len used */
  /* read offset and valid length of the output staged in writebuf */
  50. ssize_t writebuf_off;
  51. ssize_t writebuf_len;
  /* staged output; which member is active is decided by
   * TRANSCODING_WRITEBUF() from transcoder->max_output */
  52. union {
  53. unsigned char ary[8]; /* max_output <= sizeof(ary) */
  54. unsigned char *ptr; /* length: max_output */
  55. } writebuf;
  56. union rb_transcoding_state_t { /* opaque data for stateful encoding */
  57. void *ptr;
  58. char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
  59. double dummy_for_alignment;
  60. } state;
  61. } rb_transcoding;
  /* Address of the read buffer of `tc`: the embedded array when the
   * transcoder's max_input fits into it, otherwise the separately
   * allocated pointer. */
  62. #define TRANSCODING_READBUF(tc) \
  63. ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
  64. (tc)->readbuf.ary : \
  65. (tc)->readbuf.ptr)
  /* Address of the write buffer of `tc`, selected the same way from
   * max_output. */
  66. #define TRANSCODING_WRITEBUF(tc) \
  67. ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
  68. (tc)->writebuf.ary : \
  69. (tc)->writebuf.ptr)
  /* Capacity in bytes of the buffer returned by TRANSCODING_WRITEBUF(). */
  70. #define TRANSCODING_WRITEBUF_SIZE(tc) \
  71. ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
  72. sizeof((tc)->writebuf.ary) : \
  73. (size_t)(tc)->transcoder->max_output)
  /* Largest state_size that is stored inside the union instead of on the heap. */
  74. #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
  /* Address of the per-conversion state: embedded bytes when state_size
   * fits into the union, otherwise the heap pointer. */
  75. #define TRANSCODING_STATE(tc) \
  76. ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
  77. (tc)->state.ary : \
  78. (tc)->state.ptr)
  /*
   * One stage of a converter: a transcoding instance plus its output buffer.
   * rb_econv_add_transcoder_at() sets out_buf_start/out_buf_end to the bounds
   * of the allocated buffer and out_data_start/out_data_end to its start
   * (i.e. no pending data).
   */
  79. typedef struct {
  80. struct rb_transcoding *tc;
  81. unsigned char *out_buf_start;
  82. unsigned char *out_data_start;
  83. unsigned char *out_data_end;
  84. unsigned char *out_buf_end;
  /* initialised to econv_source_buffer_empty by rb_econv_add_transcoder_at() */
  85. rb_econv_result_t last_result;
  86. } rb_econv_elem_t;
  /*
   * A converter: an array of conversion stages (`elems`) together with
   * replacement-string settings, an input buffer and the last error seen.
   * rb_econv_alloc() establishes the initial values of every field.
   */
  87. struct rb_econv_t {
  88. int flags;
  89. const char *source_encoding_name;
  90. const char *destination_encoding_name;
  91. int started;
  92. const unsigned char *replacement_str;
  93. size_t replacement_len;
  94. const char *replacement_enc;
  95. int replacement_allocated;
  96. unsigned char *in_buf_start;
  97. unsigned char *in_data_start;
  98. unsigned char *in_data_end;
  99. unsigned char *in_buf_end;
  /* stage array: num_allocated slots, the first num_trans of them in use */
  100. rb_econv_elem_t *elems;
  101. int num_allocated;
  102. int num_trans;
  103. int num_finished;
  /* updated by rb_econv_add_transcoder_at() to a stage whose transcoder is
   * not a decorator */
  104. struct rb_transcoding *last_tc;
  105. /* last error */
  106. struct {
  107. rb_econv_result_t result;
  108. struct rb_transcoding *error_tc;
  109. const char *source_encoding;
  110. const char *destination_encoding;
  111. const unsigned char *error_bytes_start;
  112. size_t error_bytes_len;
  113. size_t readagain_len;
  114. } last_error;
  115. /* The following fields are only for Encoding::Converter.
  116. * rb_econv_open sets them to NULL. */
  117. rb_encoding *source_encoding;
  118. rb_encoding *destination_encoding;
  119. };
  120. /*
  121. * Dispatch data and logic
  122. */
  /* True when the source encoding name is the empty string, which is how a
   * decorator transcoder is marked.  `dname` is accepted but not examined. */
  123. #define DECORATOR_P(sname, dname) (*(sname) == '\0')
  /* Registry record for one (source, destination) encoding pair. */
  124. typedef struct {
  125. const char *sname;
  126. const char *dname;
  127. const char *lib; /* NULL means there is no need to load a library */
  128. const rb_transcoder *transcoder;
  129. } transcoder_entry_t;
  /* Two-level registry: source name -> (st_table: destination name ->
   * transcoder_entry_t *).  See make_transcoder_entry(). */
  130. static st_table *transcoder_table;
  131. static transcoder_entry_t *
  132. make_transcoder_entry(const char *sname, const char *dname)
  133. {
  134. st_data_t val;
  135. st_table *table2;
  136. if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
  137. val = (st_data_t)st_init_strcasetable();
  138. st_add_direct(transcoder_table, (st_data_t)sname, val);
  139. }
  140. table2 = (st_table *)val;
  141. if (!st_lookup(table2, (st_data_t)dname, &val)) {
  142. transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
  143. entry->sname = sname;
  144. entry->dname = dname;
  145. entry->lib = NULL;
  146. entry->transcoder = NULL;
  147. val = (st_data_t)entry;
  148. st_add_direct(table2, (st_data_t)dname, val);
  149. }
  150. return (transcoder_entry_t *)val;
  151. }
  152. static transcoder_entry_t *
  153. get_transcoder_entry(const char *sname, const char *dname)
  154. {
  155. st_data_t val;
  156. st_table *table2;
  157. if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
  158. return NULL;
  159. }
  160. table2 = (st_table *)val;
  161. if (!st_lookup(table2, (st_data_t)dname, &val)) {
  162. return NULL;
  163. }
  164. return (transcoder_entry_t *)val;
  165. }
  166. void
  167. rb_register_transcoder(const rb_transcoder *tr)
  168. {
  169. const char *const sname = tr->src_encoding;
  170. const char *const dname = tr->dst_encoding;
  171. transcoder_entry_t *entry;
  172. entry = make_transcoder_entry(sname, dname);
  173. if (entry->transcoder) {
  174. rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
  175. sname, dname);
  176. }
  177. entry->transcoder = tr;
  178. }
  179. static void
  180. declare_transcoder(const char *sname, const char *dname, const char *lib)
  181. {
  182. transcoder_entry_t *entry;
  183. entry = make_transcoder_entry(sname, dname);
  184. entry->lib = lib;
  185. }
  186. #define MAX_TRANSCODER_LIBNAME_LEN 64
  187. static const char transcoder_lib_prefix[] = "enc/trans/";
  188. void
  189. rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
  190. {
  191. if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) {
  192. rb_raise(rb_eArgError, "invalid library name - %s",
  193. lib ? lib : "(null)");
  194. }
  195. declare_transcoder(enc1, enc2, lib);
  196. }
  /* Case-insensitive comparison of two encoding names. */
  197. #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)
  /* Singly linked FIFO node holding one encoding name still to be expanded. */
  198. typedef struct search_path_queue_tag {
  199. struct search_path_queue_tag *next;
  200. const char *enc;
  201. } search_path_queue_t;
  /*
   * Working state of the breadth-first search in transcode_search_path():
   *   visited        - encoding name -> name of the encoding it was reached from
   *   queue          - head of the pending-node list
   *   queue_last_ptr - address of the link where the next node is appended
   *   base_enc       - encoding currently being expanded
   */
  202. typedef struct {
  203. st_table *visited;
  204. search_path_queue_t *queue;
  205. search_path_queue_t **queue_last_ptr;
  206. const char *base_enc;
  207. } search_path_bfs_t;
  208. static int
  209. transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
  210. {
  211. const char *dname = (const char *)key;
  212. search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
  213. search_path_queue_t *q;
  214. if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
  215. return ST_CONTINUE;
  216. }
  217. q = ALLOC(search_path_queue_t);
  218. q->enc = dname;
  219. q->next = NULL;
  220. *bfs->queue_last_ptr = q;
  221. bfs->queue_last_ptr = &q->next;
  222. st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
  223. return ST_CONTINUE;
  224. }
  225. static int
  226. transcode_search_path(const char *sname, const char *dname,
  227. void (*callback)(const char *sname, const char *dname, int depth, void *arg),
  228. void *arg)
  229. {
  230. search_path_bfs_t bfs;
  231. search_path_queue_t *q;
  232. st_data_t val;
  233. st_table *table2;
  234. int found;
  235. int pathlen = -1;
  236. if (encoding_equal(sname, dname))
  237. return -1;
  238. q = ALLOC(search_path_queue_t);
  239. q->enc = sname;
  240. q->next = NULL;
  241. bfs.queue_last_ptr = &q->next;
  242. bfs.queue = q;
  243. bfs.visited = st_init_strcasetable();
  244. st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
  245. while (bfs.queue) {
  246. q = bfs.queue;
  247. bfs.queue = q->next;
  248. if (!bfs.queue)
  249. bfs.queue_last_ptr = &bfs.queue;
  250. if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
  251. xfree(q);
  252. continue;
  253. }
  254. table2 = (st_table *)val;
  255. if (st_lookup(table2, (st_data_t)dname, &val)) {
  256. st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
  257. xfree(q);
  258. found = 1;
  259. goto cleanup;
  260. }
  261. bfs.base_enc = q->enc;
  262. st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
  263. bfs.base_enc = NULL;
  264. xfree(q);
  265. }
  266. found = 0;
  267. cleanup:
  268. while (bfs.queue) {
  269. q = bfs.queue;
  270. bfs.queue = q->next;
  271. xfree(q);
  272. }
  273. if (found) {
  274. const char *enc = dname;
  275. int depth;
  276. pathlen = 0;
  277. while (1) {
  278. st_lookup(bfs.visited, (st_data_t)enc, &val);
  279. if (!val)
  280. break;
  281. pathlen++;
  282. enc = (const char *)val;
  283. }
  284. depth = pathlen;
  285. enc = dname;
  286. while (1) {
  287. st_lookup(bfs.visited, (st_data_t)enc, &val);
  288. if (!val)
  289. break;
  290. callback((const char *)val, enc, --depth, arg);
  291. enc = (const char *)val;
  292. }
  293. }
  294. st_free_table(bfs.visited);
  295. return pathlen; /* is -1 if not found */
  296. }
  297. static const rb_transcoder *
  298. load_transcoder_entry(transcoder_entry_t *entry)
  299. {
  300. if (entry->transcoder)
  301. return entry->transcoder;
  302. if (entry->lib) {
  303. const char *lib = entry->lib;
  304. size_t len = strlen(lib);
  305. char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN];
  306. entry->lib = NULL;
  307. if (len > MAX_TRANSCODER_LIBNAME_LEN)
  308. return NULL;
  309. memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
  310. memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
  311. if (!rb_require(path))
  312. return NULL;
  313. }
  314. if (entry->transcoder)
  315. return entry->transcoder;
  316. return NULL;
  317. }
  318. static const char*
  319. get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
  320. {
  321. if (encoding_equal(encname, "UTF-8")) {
  322. *len_ret = 3;
  323. *repl_encname_ptr = "UTF-8";
  324. return "\xEF\xBF\xBD";
  325. }
  326. else {
  327. *len_ret = 1;
  328. *repl_encname_ptr = "US-ASCII";
  329. return "?";
  330. }
  331. }
  332. /*
  333. * Transcoding engine logic
  334. */
  335. static const unsigned char *
  336. transcode_char_start(rb_transcoding *tc,
  337. const unsigned char *in_start,
  338. const unsigned char *inchar_start,
  339. const unsigned char *in_p,
  340. size_t *char_len_ptr)
  341. {
  342. const unsigned char *ptr;
  343. if (inchar_start - in_start < tc->recognized_len) {
  344. MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
  345. inchar_start, unsigned char, in_p - inchar_start);
  346. ptr = TRANSCODING_READBUF(tc);
  347. }
  348. else {
  349. ptr = inchar_start - tc->recognized_len;
  350. }
  351. *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
  352. return ptr;
  353. }
  /*
   * Core conversion loop, written as a resumable state machine.
   *
   * Reads bytes from *in_pos up to in_stop, walks the transcoder's lookup
   * tables and writes output from *out_pos up to out_stop.  Whenever it has
   * to give control back (output buffer full, input exhausted, invalid or
   * undefined input, ...) it goes through SUSPEND, which stores a resume
   * label number in tc->resume_position, writes the current positions back
   * to *in_pos / *out_pos and returns the given result code.  The next call
   * jumps straight back to that label via the switch below.
   *
   * `opt` is tested for ECONV_PARTIAL_INPUT and ECONV_AFTER_OUTPUT.
   */
  354. static rb_econv_result_t
  355. transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
  356. const unsigned char *in_stop, unsigned char *out_stop,
  357. rb_transcoding *tc,
  358. const int opt)
  359. {
  360. const rb_transcoder *tr = tc->transcoder;
  361. int unitlen = tr->input_unit_length;
  362. ssize_t readagain_len = 0;
  363. const unsigned char *inchar_start;
  364. const unsigned char *in_p;
  365. unsigned char *out_p;
  366. in_p = inchar_start = *in_pos;
  367. out_p = *out_pos;
  /* SUSPEND(ret, num): save the bytes consumed for the current character
   * into the read buffer, publish the positions, return `ret`, and define
   * the label at which execution continues on the next call. */
  368. #define SUSPEND(ret, num) \
  369. do { \
  370. tc->resume_position = (num); \
  371. if (0 < in_p - inchar_start) \
  372. MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
  373. inchar_start, unsigned char, in_p - inchar_start); \
  374. *in_pos = in_p; \
  375. *out_pos = out_p; \
  376. tc->recognized_len += in_p - inchar_start; \
  377. if (readagain_len) { \
  378. tc->recognized_len -= readagain_len; \
  379. tc->readagain_len = readagain_len; \
  380. } \
  381. return ret; \
  382. resume_label ## num:; \
  383. } while (0)
  /* SUSPEND_OBUF(num): suspend for as long as there is no room for one
   * more output byte. */
  384. #define SUSPEND_OBUF(num) \
  385. do { \
  386. while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
  387. } while (0)
  /* SUSPEND_AFTER_OUTPUT(num): when ECONV_AFTER_OUTPUT is requested and
   * output has been produced during this call, suspend once. */
  388. #define SUSPEND_AFTER_OUTPUT(num) \
  389. if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
  390. SUSPEND(econv_after_output, num); \
  391. }
  /* State that must survive a suspension lives in `tc`; these aliases let
   * the code below use it like local variables. */
  392. #define next_table (tc->next_table)
  393. #define next_info (tc->next_info)
  394. #define next_byte (tc->next_byte)
  395. #define writebuf_len (tc->writebuf_len)
  396. #define writebuf_off (tc->writebuf_off)
  /* Resume dispatch: jump to the label recorded by the previous SUSPEND. */
  397. switch (tc->resume_position) {
  398. case 0: break;
  399. case 1: goto resume_label1;
  400. case 2: goto resume_label2;
  401. case 3: goto resume_label3;
  402. case 4: goto resume_label4;
  403. case 5: goto resume_label5;
  404. case 6: goto resume_label6;
  405. case 7: goto resume_label7;
  406. case 8: goto resume_label8;
  407. case 9: goto resume_label9;
  408. case 10: goto resume_label10;
  409. case 11: goto resume_label11;
  410. case 12: goto resume_label12;
  411. case 13: goto resume_label13;
  412. case 14: goto resume_label14;
  413. case 15: goto resume_label15;
  414. case 16: goto resume_label16;
  415. case 17: goto resume_label17;
  416. case 18: goto resume_label18;
  417. case 19: goto resume_label19;
  418. case 20: goto resume_label20;
  419. case 21: goto resume_label21;
  420. case 22: goto resume_label22;
  421. case 23: goto resume_label23;
  422. case 24: goto resume_label24;
  423. case 25: goto resume_label25;
  424. case 26: goto resume_label26;
  425. case 27: goto resume_label27;
  426. case 28: goto resume_label28;
  427. case 29: goto resume_label29;
  428. case 30: goto resume_label30;
  429. case 31: goto resume_label31;
  430. case 32: goto resume_label32;
  431. case 33: goto resume_label33;
  432. case 34: goto resume_label34;
  433. }
  /* Main loop: one iteration per input character. */
  434. while (1) {
  435. inchar_start = in_p;
  436. tc->recognized_len = 0;
  437. next_table = tr->conv_tree_start;
  438. SUSPEND_AFTER_OUTPUT(24);
  /* Input exhausted: leave the loop for the cleanup step, or wait for
   * more input when the caller announced partial input. */
  439. if (in_stop <= in_p) {
  440. if (!(opt & ECONV_PARTIAL_INPUT))
  441. break;
  442. SUSPEND(econv_source_buffer_empty, 7);
  443. continue;
  444. }
  /* Accessors for the byte-lookup table selected by next_table. */
  445. #define BYTE_ADDR(index) (tr->byte_array + (index))
  446. #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
  447. #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
  448. #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
  449. #define BL_MIN_BYTE (BL_BASE[0])
  450. #define BL_MAX_BYTE (BL_BASE[1])
  451. #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
  452. #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
  453. next_byte = (unsigned char)*in_p++;
  454. follow_byte:
  /* A byte outside the table's [min, max] range is invalid. */
  455. if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
  456. next_info = INVALID;
  457. else {
  458. next_info = (VALUE)BL_ACTION(next_byte);
  459. }
  460. follow_info:
  /* The low five bits of next_info select the action. */
  461. switch (next_info & 0x1F) {
  462. case NOMAP:
  463. {
  /* copy the input character to the output unchanged, staged through
   * the write buffer so that a suspension can happen mid-character */
  464. const unsigned char *p = inchar_start;
  465. writebuf_off = 0;
  466. while (p < in_p) {
  467. TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
  468. }
  469. writebuf_len = writebuf_off;
  470. writebuf_off = 0;
  471. while (writebuf_off < writebuf_len) {
  472. SUSPEND_OBUF(3);
  473. *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  474. }
  475. }
  476. continue;
  /* next_info refers to another lookup table: fetch the following input
   * byte and descend; running out of input here is "incomplete" unless
   * partial input was announced */
  477. case 0x00: case 0x04: case 0x08: case 0x0C:
  478. case 0x10: case 0x14: case 0x18: case 0x1C:
  479. SUSPEND_AFTER_OUTPUT(25);
  480. while (in_p >= in_stop) {
  481. if (!(opt & ECONV_PARTIAL_INPUT))
  482. goto incomplete;
  483. SUSPEND(econv_source_buffer_empty, 5);
  484. }
  485. next_byte = (unsigned char)*in_p++;
  486. next_table = (unsigned int)next_info;
  487. goto follow_byte;
  488. case ZERObt: /* drop input */
  489. continue;
  /* fixed-length outputs whose bytes are packed into next_info */
  490. case ONEbt:
  491. SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
  492. continue;
  493. case TWObt:
  494. SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
  495. SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
  496. continue;
  497. case THREEbt:
  498. SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
  499. SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
  500. SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
  501. continue;
  502. case FOURbt:
  503. SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
  504. SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
  505. SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
  506. SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
  507. continue;
  508. case GB4bt:
  509. SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
  510. SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
  511. SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
  512. SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
  513. continue;
  /* output string stored in the byte array; tc->output_index tracks
   * progress so the copy can be resumed */
  514. case STR1:
  515. tc->output_index = 0;
  516. while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
  517. SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
  518. tc->output_index++;
  519. }
  520. continue;
  /* callbacks that compute a new next_info (from the info word or from
   * the input bytes) which is then dispatched again */
  521. case FUNii:
  522. next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
  523. goto follow_info;
  524. case FUNsi:
  525. {
  526. const unsigned char *char_start;
  527. size_t char_len;
  528. char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  529. next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
  530. goto follow_info;
  531. }
  /* callbacks that produce output: written directly when at least
   * max_output bytes are free, otherwise staged in the write buffer and
   * drained byte by byte */
  532. case FUNio:
  533. SUSPEND_OBUF(13);
  534. if (tr->max_output <= out_stop - out_p)
  535. out_p += tr->func_io(TRANSCODING_STATE(tc),
  536. next_info, out_p, out_stop - out_p);
  537. else {
  538. writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
  539. next_info,
  540. TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  541. writebuf_off = 0;
  542. while (writebuf_off < writebuf_len) {
  543. SUSPEND_OBUF(20);
  544. *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  545. }
  546. }
  547. break;
  548. case FUNso:
  549. {
  550. const unsigned char *char_start;
  551. size_t char_len;
  552. SUSPEND_OBUF(14);
  553. if (tr->max_output <= out_stop - out_p) {
  554. char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  555. out_p += tr->func_so(TRANSCODING_STATE(tc),
  556. char_start, (size_t)char_len,
  557. out_p, out_stop - out_p);
  558. }
  559. else {
  560. char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  561. writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
  562. char_start, (size_t)char_len,
  563. TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  564. writebuf_off = 0;
  565. while (writebuf_off < writebuf_len) {
  566. SUSPEND_OBUF(22);
  567. *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  568. }
  569. }
  570. break;
  571. }
  572. case FUNsio:
  573. {
  574. const unsigned char *char_start;
  575. size_t char_len;
  576. SUSPEND_OBUF(33);
  577. if (tr->max_output <= out_stop - out_p) {
  578. char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  579. out_p += tr->func_sio(TRANSCODING_STATE(tc),
  580. char_start, (size_t)char_len, next_info,
  581. out_p, out_stop - out_p);
  582. }
  583. else {
  584. char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  585. writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
  586. char_start, (size_t)char_len, next_info,
  587. TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  588. writebuf_off = 0;
  589. while (writebuf_off < writebuf_len) {
  590. SUSPEND_OBUF(34);
  591. *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  592. }
  593. }
  594. break;
  595. }
  /* Invalid input.  With at most one input unit consumed, the reported
   * sequence is extended/trimmed to one unit (waiting for more input
   * first when partial input is allowed).  Otherwise the bytes after
   * the last whole unit are scheduled to be read again. */
  596. case INVALID:
  597. if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
  598. if (tc->recognized_len + (in_p - inchar_start) < unitlen)
  599. SUSPEND_AFTER_OUTPUT(26);
  600. while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
  601. in_p = in_stop;
  602. SUSPEND(econv_source_buffer_empty, 8);
  603. }
  604. if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
  605. in_p = in_stop;
  606. }
  607. else {
  608. in_p = inchar_start + (unitlen - tc->recognized_len);
  609. }
  610. }
  611. else {
  612. ssize_t invalid_len; /* including the last byte which causes invalid */
  613. ssize_t discard_len;
  614. invalid_len = tc->recognized_len + (in_p - inchar_start);
  615. discard_len = ((invalid_len - 1) / unitlen) * unitlen;
  616. readagain_len = invalid_len - discard_len;
  617. }
  618. goto invalid;
  619. case UNDEF:
  620. goto undef;
  621. default:
  622. rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
  623. }
  624. continue;
  /* Error exits: report the condition, then start the next character when
   * the caller resumes. */
  625. invalid:
  626. SUSPEND(econv_invalid_byte_sequence, 1);
  627. continue;
  628. incomplete:
  629. SUSPEND(econv_incomplete_input, 27);
  630. continue;
  631. undef:
  632. SUSPEND(econv_undefined_conversion, 2);
  633. continue;
  634. }
  635. /* cleanup */
  /* Reached once the input is exhausted and no more is expected: let the
   * transcoder emit its trailing output, if it has a finish function. */
  636. if (tr->finish_func) {
  637. SUSPEND_OBUF(4);
  638. if (tr->max_output <= out_stop - out_p) {
  639. out_p += tr->finish_func(TRANSCODING_STATE(tc),
  640. out_p, out_stop - out_p);
  641. }
  642. else {
  643. writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
  644. TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  645. writebuf_off = 0;
  646. while (writebuf_off < writebuf_len) {
  647. SUSPEND_OBUF(23);
  648. *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  649. }
  650. }
  651. }
  /* From here on every call reports econv_finished. */
  652. while (1)
  653. SUSPEND(econv_finished, 6);
  654. #undef SUSPEND
  655. #undef next_table
  656. #undef next_info
  657. #undef next_byte
  658. #undef writebuf_len
  659. #undef writebuf_off
  660. }
  661. static rb_econv_result_t
  662. transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
  663. const unsigned char *in_stop, unsigned char *out_stop,
  664. rb_transcoding *tc,
  665. const int opt)
  666. {
  667. if (tc->readagain_len) {
  668. unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
  669. const unsigned char *readagain_pos = readagain_buf;
  670. const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
  671. rb_econv_result_t res;
  672. MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
  673. unsigned char, tc->readagain_len);
  674. tc->readagain_len = 0;
  675. res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
  676. if (res != econv_source_buffer_empty) {
  677. MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
  678. readagain_pos, unsigned char, readagain_stop - readagain_pos);
  679. tc->readagain_len += readagain_stop - readagain_pos;
  680. return res;
  681. }
  682. }
  683. return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
  684. }
  685. static rb_transcoding *
  686. rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
  687. {
  688. rb_transcoding *tc;
  689. tc = ALLOC(rb_transcoding);
  690. tc->transcoder = tr;
  691. tc->flags = flags;
  692. if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
  693. tc->state.ptr = xmalloc(tr->state_size);
  694. if (tr->state_init_func) {
  695. (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
  696. }
  697. tc->resume_position = 0;
  698. tc->recognized_len = 0;
  699. tc->readagain_len = 0;
  700. tc->writebuf_len = 0;
  701. tc->writebuf_off = 0;
  702. if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
  703. tc->readbuf.ptr = xmalloc(tr->max_input);
  704. }
  705. if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
  706. tc->writebuf.ptr = xmalloc(tr->max_output);
  707. }
  708. return tc;
  709. }
  710. static rb_econv_result_t
  711. rb_transcoding_convert(rb_transcoding *tc,
  712. const unsigned char **input_ptr, const unsigned char *input_stop,
  713. unsigned char **output_ptr, unsigned char *output_stop,
  714. int flags)
  715. {
  716. return transcode_restartable(
  717. input_ptr, output_ptr,
  718. input_stop, output_stop,
  719. tc, flags);
  720. }
  721. static void
  722. rb_transcoding_close(rb_transcoding *tc)
  723. {
  724. const rb_transcoder *tr = tc->transcoder;
  725. if (tr->state_fini_func) {
  726. (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
  727. }
  728. if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
  729. xfree(tc->state.ptr);
  730. if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
  731. xfree(tc->readbuf.ptr);
  732. if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
  733. xfree(tc->writebuf.ptr);
  734. xfree(tc);
  735. }
  736. static size_t
  737. rb_transcoding_memsize(rb_transcoding *tc)
  738. {
  739. size_t size = sizeof(rb_transcoding);
  740. const rb_transcoder *tr = tc->transcoder;
  741. if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
  742. size += tr->state_size;
  743. }
  744. if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
  745. size += tr->max_input;
  746. }
  747. if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
  748. size += tr->max_output;
  749. }
  750. return size;
  751. }
  752. static rb_econv_t *
  753. rb_econv_alloc(int n_hint)
  754. {
  755. rb_econv_t *ec;
  756. if (n_hint <= 0)
  757. n_hint = 1;
  758. ec = ALLOC(rb_econv_t);
  759. ec->flags = 0;
  760. ec->source_encoding_name = NULL;
  761. ec->destination_encoding_name = NULL;
  762. ec->started = 0;
  763. ec->replacement_str = NULL;
  764. ec->replacement_len = 0;
  765. ec->replacement_enc = NULL;
  766. ec->replacement_allocated = 0;
  767. ec->in_buf_start = NULL;
  768. ec->in_data_start = NULL;
  769. ec->in_data_end = NULL;
  770. ec->in_buf_end = NULL;
  771. ec->num_allocated = n_hint;
  772. ec->num_trans = 0;
  773. ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
  774. ec->num_finished = 0;
  775. ec->last_tc = NULL;
  776. ec->last_error.result = econv_source_buffer_empty;
  777. ec->last_error.error_tc = NULL;
  778. ec->last_error.source_encoding = NULL;
  779. ec->last_error.destination_encoding = NULL;
  780. ec->last_error.error_bytes_start = NULL;
  781. ec->last_error.error_bytes_len = 0;
  782. ec->last_error.readagain_len = 0;
  783. ec->source_encoding = NULL;
  784. ec->destination_encoding = NULL;
  785. return ec;
  786. }
  787. static int
  788. rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
  789. {
  790. int n, j;
  791. int bufsize = 4096;
  792. unsigned char *p;
  793. if (ec->num_trans == ec->num_allocated) {
  794. n = ec->num_allocated * 2;
  795. REALLOC_N(ec->elems, rb_econv_elem_t, n);
  796. ec->num_allocated = n;
  797. }
  798. p = xmalloc(bufsize);
  799. MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
  800. ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
  801. ec->elems[i].out_buf_start = p;
  802. ec->elems[i].out_buf_end = p + bufsize;
  803. ec->elems[i].out_data_start = p;
  804. ec->elems[i].out_data_end = p;
  805. ec->elems[i].last_result = econv_source_buffer_empty;
  806. ec->num_trans++;
  807. if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
  808. for (j = ec->num_trans-1; i <= j; j--) {
  809. rb_transcoding *tc = ec->elems[j].tc;
  810. const rb_transcoder *tr2 = tc->transcoder;
  811. if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
  812. ec->last_tc = tc;
  813. break;
  814. }
  815. }
  816. return 0;
  817. }
  818. static rb_econv_t *
  819. rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
  820. {
  821. rb_econv_t *ec;
  822. int i, ret;
  823. for (i = 0; i < n; i++) {
  824. const rb_transcoder *tr;
  825. tr = load_transcoder_entry(entries[i]);
  826. if (!tr)
  827. return NULL;
  828. }
  829. ec = rb_econv_alloc(n);
  830. for (i = 0; i < n; i++) {
  831. const rb_transcoder *tr = load_transcoder_entry(entries[i]);
  832. ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
  833. if (ret == -1) {
  834. rb_econv_close(ec);
  835. return NULL;
  836. }
  837. }
  838. return ec;
  839. }
  /*
   * Argument block for trans_open_i():
   *   entries        - array of path entries, allocated by the first callback
   *   num_additional - extra slots to reserve beyond the path length
   */
  840. struct trans_open_t {
  841. transcoder_entry_t **entries;
  842. int num_additional;
  843. };
  844. static void
  845. trans_open_i(const char *sname, const char *dname, int depth, void *arg)
  846. {
  847. struct trans_open_t *toarg = arg;
  848. if (!toarg->entries) {
  849. toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
  850. }
  851. toarg->entries[depth] = get_transcoder_entry(sname, dname);
  852. }
  853. static rb_econv_t *
  854. rb_econv_open0(const char *sname, const char *dname, int ecflags)
  855. {
  856. transcoder_entry_t **entries = NULL;
  857. int num_trans;
  858. rb_econv_t *ec;
  859. rb_encoding *senc, *denc;
  860. int sidx, didx;
  861. senc = NULL;
  862. if (*sname) {
  863. sidx = rb_enc_find_index(sname);
  864. if (0 <= sidx) {
  865. senc = rb_enc_from_index(sidx);
  866. }
  867. }
  868. denc = NULL;
  869. if (*dname) {
  870. didx = rb_enc_find_index(dname);
  871. if (0 <= didx) {
  872. denc = rb_enc_from_index(didx);
  873. }
  874. }
  875. if (*sname == '\0' && *dname == '\0') {
  876. num_trans = 0;
  877. entries = NULL;
  878. }
  879. else {
  880. struct trans_open_t toarg;
  881. toarg.entries = NULL;
  882. toarg.num_additional = 0;
  883. num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
  884. entries = toarg.entries;
  885. if (num_trans < 0) {
  886. xfree(entries);
  887. return NULL;
  888. }
  889. }
  890. ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
  891. xfree(entries);
  892. if (!ec)
  893. return NULL;
  894. ec->flags = ecflags;
  895. ec->source_encoding_name = sname;
  896. ec->destination_encoding_name = dname;
  897. return ec;
  898. }
  899. #define MAX_ECFLAGS_DECORATORS 32
  900. static int
  901. decorator_names(int ecflags, const char **decorators_ret)
  902. {
  903. int num_decorators;
  904. if ((ecflags & ECONV_CRLF_NEWLINE_DECORATOR) &&
  905. (ecflags & ECONV_CR_NEWLINE_DECORATOR))
  906. return -1;
  907. if ((ecflags & (ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR)) &&
  908. (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR))
  909. return -1;
  910. if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
  911. (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
  912. return -1;
  913. num_decorators = 0;
  914. if (ecflags & ECONV_XML_TEXT_DECORATOR)
  915. decorators_ret[num_decorators++] = "xml_text_escape";
  916. if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
  917. decorators_ret[num_decorators++] = "xml_attr_content_escape";
  918. if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
  919. decorators_ret[num_decorators++] = "xml_attr_quote";
  920. if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
  921. decorators_ret[num_decorators++] = "crlf_newline";
  922. if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
  923. decorators_ret[num_decorators++] = "cr_newline";
  924. if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
  925. decorators_ret[num_decorators++] = "universal_newline";
  926. return num_decorators;
  927. }
  928. rb_econv_t *
  929. rb_econv_open(const char *sname, const char *dname, int ecflags)
  930. {
  931. rb_econv_t *ec;
  932. int num_decorators;
  933. const char *decorators[MAX_ECFLAGS_DECORATORS];
  934. int i;
  935. num_decorators = decorator_names(ecflags, decorators);
  936. if (num_decorators == -1)
  937. return NULL;
  938. ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
  939. if (!ec)
  940. return NULL;
  941. for (i = 0; i < num_decorators; i++)
  942. if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
  943. rb_econv_close(ec);
  944. return NULL;
  945. }
  946. ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
  947. return ec;
  948. }
  /*
   * Run the transcoders ec->elems[start .. num_trans-1] in order, repeating
   * the whole pass until one complete pass moves neither an input nor an
   * output pointer.
   *
   * Element 0 reads the caller's input (*input_ptr .. input_stop); every
   * other element reads what the previous element left in its out buffer.
   * The last element writes to the caller's output (*output_ptr ..
   * output_stop); every other element writes into its own out buffer.
   *
   * Each element's result is stored in its last_result.
   *
   * Returns the index of the element that reported
   * econv_invalid_byte_sequence, econv_incomplete_input,
   * econv_undefined_conversion or econv_after_output, or -1 when the sweep
   * ended because a pass made no progress.
   */
  949. static int
  950. trans_sweep(rb_econv_t *ec,
  951. const unsigned char **input_ptr, const unsigned char *input_stop,
  952. unsigned char **output_ptr, unsigned char *output_stop,
  953. int flags,
  954. int start)
  955. {
  956. int try;
  957. int i, f;
  958. const unsigned char **ipp, *is, *iold;
  959. unsigned char **opp, *os, *oold;
  960. rb_econv_result_t res;
  /* "try" is set whenever some element moved a pointer during the pass */
  961. try = 1;
  962. while (try) {
  963. try = 0;
  964. for (i = start; i < ec->num_trans; i++) {
  965. rb_econv_elem_t *te = &ec->elems[i];
  /* choose the input side: caller input for element 0, else the previous
   * element's pending output */
  966. if (i == 0) {
  967. ipp = input_ptr;
  968. is = input_stop;
  969. }
  970. else {
  971. rb_econv_elem_t *prev_te = &ec->elems[i-1];
  972. ipp = (const unsigned char **)&prev_te->out_data_start;
  973. is = prev_te->out_data_end;
  974. }
  /* choose the output side: caller output for the last element, else this
   * element's own out buffer */
  975. if (i == ec->num_trans-1) {
  976. opp = output_ptr;
  977. os = output_stop;
  978. }
  979. else {
  /* slide the unread data to the start of the buffer so that all free
   * space sits after out_data_end */
  980. if (te->out_buf_start != te->out_data_start) {
  981. ssize_t len = te->out_data_end - te->out_data_start;
  982. ssize_t off = te->out_data_start - te->out_buf_start;
  983. MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
  984. te->out_data_start = te->out_buf_start;
  985. te->out_data_end -= off;
  986. }
  987. opp = &te->out_data_end;
  988. os = te->out_buf_end;
  989. }
  /* f holds the flags for this element, derived from the sweep-wide flags */
  990. f = flags;
  /* num_finished becomes i+1 when element i reports econv_finished (see the
   * switch below); unless it equals i, this element is run with
   * ECONV_PARTIAL_INPUT */
  991. if (ec->num_finished != i)
  992. f |= ECONV_PARTIAL_INPUT;
  /* ECONV_AFTER_OUTPUT reaches element 0 once: f keeps the bit for this call,
   * then it is dropped from flags and later passes begin at element 1 */
  993. if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
  994. start = 1;
  995. flags &= ~ECONV_AFTER_OUTPUT;
  996. }
  997. if (i != 0)
  998. f &= ~ECONV_AFTER_OUTPUT;
  999. iold = *ipp;
  1000. oold = *opp;
  1001. te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
  /* any pointer movement means another pass is needed */
  1002. if (iold != *ipp || oold != *opp)
  1003. try = 1;
  1004. switch (res) {
  /* results that must be reported to the caller: stop at this element */
  1005. case econv_invalid_byte_sequence:
  1006. case econv_incomplete_input:
  1007. case econv_undefined_conversion:
  1008. case econv_after_output:
  1009. return i;
  1010. case econv_destination_buffer_full:
  1011. case econv_source_buffer_empty:
  1012. break;
  1013. case econv_finished:
  1014. ec->num_finished = i+1;
  1015. break;
  1016. }
  1017. }
  1018. }
  1019. return -1;
  1020. }
  /*
   * Drive the transcoder chain of ec over the given input/output ranges and
   * return the result to report to the caller.
   *
   * input_ptr and/or output_ptr may be NULL; a zero-length local buffer is
   * substituted for whichever is missing.
   *
   * When result_position_ptr is not NULL it receives the index of the
   * element whose result is returned, or -1 when every element ended with
   * econv_source_buffer_empty.  An error or after_output result that gets
   * reported is reset to econv_source_buffer_empty in the element.
   */
  1021. static rb_econv_result_t
  1022. rb_trans_conv(rb_econv_t *ec,
  1023. const unsigned char **input_ptr, const unsigned char *input_stop,
  1024. unsigned char **output_ptr, unsigned char *output_stop,
  1025. int flags,
  1026. int *result_position_ptr)
  1027. {
  1028. int i;
  1029. int needreport_index;
  1030. int sweep_start;
  1031. unsigned char empty_buf;
  1032. unsigned char *empty_ptr = &empty_buf;
  /* stand-in empty ranges for a missing input or output */
  1033. if (!input_ptr) {
  1034. input_ptr = (const unsigned char **)&empty_ptr;
  1035. input_stop = empty_ptr;
  1036. }
  1037. if (!output_ptr) {
  1038. output_ptr = &empty_ptr;
  1039. output_stop = empty_ptr;
  1040. }
  /* an after_output left on element 0 by an earlier call is cleared here */
  1041. if (ec->elems[0].last_result == econv_after_output)
  1042. ec->elems[0].last_result = econv_source_buffer_empty;
  /* scan from the last element towards the first for a result other than
   * destination_buffer_full / source_buffer_empty; the sweep then starts
   * with the element right after it */
  1043. needreport_index = -1;
  1044. for (i = ec->num_trans-1; 0 <= i; i--) {
  1045. switch (ec->elems[i].last_result) {
  1046. case econv_invalid_byte_sequence:
  1047. case econv_incomplete_input:
  1048. case econv_undefined_conversion:
  1049. case econv_after_output:
  1050. case econv_finished:
  1051. sweep_start = i+1;
  1052. needreport_index = i;
  1053. goto found_needreport;
  1054. case econv_destination_buffer_full:
  1055. case econv_source_buffer_empty:
  1056. break;
  1057. default:
  1058. rb_bug("unexpected transcode last result");
  1059. }
  1060. }
  /* Reaching this point means every element's last_result is either
   * source_buffer_empty or destination_buffer_full, which is what the
   * original note below abbreviates as "s" and "d". */
  1061. /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
  /* The last element is blocked on output and the caller asked for
   * ECONV_AFTER_OUTPUT: recurse with no input, AFTER_OUTPUT cleared and
   * PARTIAL_INPUT set, and report a resulting source_buffer_empty as
   * after_output. */
  1062. if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
  1063. (flags & ECONV_AFTER_OUTPUT)) {
  1064. rb_econv_result_t res;
  1065. res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
  1066. (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
  1067. result_position_ptr);
  1068. if (res == econv_source_buffer_empty)
  1069. return econv_after_output;
  1070. return res;
  1071. }
  1072. sweep_start = 0;
  1073. found_needreport:
  /* keep sweeping, each time starting after the element that stopped the
   * previous sweep, until a sweep returns -1 or stops at the last element */
  1074. do {
  1075. needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
  1076. sweep_start = needreport_index + 1;
  1077. } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
  /* report the result of the element closest to the output whose
   * last_result is not source_buffer_empty */
  1078. for (i = ec->num_trans-1; 0 <= i; i--) {
  1079. if (ec->elems[i].last_result != econv_source_buffer_empty) {
  1080. rb_econv_result_t res = ec->elems[i].last_result;
  /* errors and after_output are reported once, then reset; finished and
   * destination_buffer_full stay recorded in the element */
  1081. if (res == econv_invalid_byte_sequence ||
  1082. res == econv_incomplete_input ||
  1083. res == econv_undefined_conversion ||
  1084. res == econv_after_output) {
  1085. ec->elems[i].last_result = econv_source_buffer_empty;
  1086. }
  1087. if (result_position_ptr)
  1088. *result_position_ptr = i;
  1089. return res;
  1090. }
  1091. }
  1092. if (result_position_ptr)
  1093. *result_position_ptr = -1;
  1094. return econv_source_buffer_empty;
  1095. }
  /*
   * Perform one conversion step on ec and record the outcome in
   * ec->last_error (which is zeroed first).
   *
   * With no transcoders (num_trans == 0) bytes are copied straight from
   * input to output, after first flushing anything pending in ec's in
   * buffer.
   *
   * Otherwise the function
   *   1. flushes data pending in the last element's out buffer,
   *   2. converts data pending in ec's in buffer, and
   *   3. converts the caller's input through rb_trans_conv().
   *
   * For econv_invalid_byte_sequence, econv_incomplete_input and
   * econv_undefined_conversion, ec->last_error is filled in from the
   * transcoding at the position reported by rb_trans_conv().
   */
  1096. static rb_econv_result_t
  1097. rb_econv_convert0(rb_econv_t *ec,
  1098. const unsigned char **input_ptr, const unsigned char *input_stop,
  1099. unsigned char **output_ptr, unsigned char *output_stop,
  1100. int flags)
  1101. {
  1102. rb_econv_result_t res;
  1103. int result_position;
  1104. int has_output = 0;
  1105. memset(&ec->last_error, 0, sizeof(ec->last_error));
  /* ---- no transcoders: plain byte copy ---- */
  1106. if (ec->num_trans == 0) {
  1107. size_t len;
  /* first flush bytes pending in ec's in buffer */
  1108. if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
  /* output too small for all pending bytes: copy what fits and stop */
  1109. if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
  1110. len = output_stop - *output_ptr;
  1111. memcpy(*output_ptr, ec->in_data_start, len);
  1112. *output_ptr = output_stop;
  1113. ec->in_data_start += len;
  1114. res = econv_destination_buffer_full;
  1115. goto gotresult;
  1116. }
  1117. len = ec->in_data_end - ec->in_data_start;
  1118. memcpy(*output_ptr, ec->in_data_start, len);
  1119. *output_ptr += len;
  1120. ec->in_data_start = ec->in_data_end = ec->in_buf_start;
  1121. if (flags & ECONV_AFTER_OUTPUT) {
  1122. res = econv_after_output;
  1123. goto gotresult;
  1124. }
  1125. }
  /* len = min(room left in output, bytes left in input) */
  1126. if (output_stop - *output_ptr < input_stop - *input_ptr) {
  1127. len = output_stop - *output_ptr;
  1128. }
  1129. else {
  1130. len = input_stop - *input_ptr;
  1131. }
  /* with ECONV_AFTER_OUTPUT exactly one byte is copied per call */
  1132. if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
  1133. *(*output_ptr)++ = *(*input_ptr)++;
  1134. res = econv_after_output;
  1135. goto gotresult;
  1136. }
  1137. memcpy(*output_ptr, *input_ptr, len);
  1138. *output_ptr += len;
  1139. *input_ptr += len;
  1140. if (*input_ptr != input_stop)
  1141. res = econv_destination_buffer_full;
  1142. else if (flags & ECONV_PARTIAL_INPUT)
  1143. res = econv_source_buffer_empty;
  1144. else
  1145. res = econv_finished;
  1146. goto gotresult;
  1147. }
  /* ---- step 1: flush data pending in the last element's out buffer ---- */
  1148. if (ec->elems[ec->num_trans-1].out_data_start) {
  1149. unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
  1150. unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
  1151. if (data_start != data_end) {
  1152. size_t len;
  1153. if (output_stop - *output_ptr < data_end - data_start) {
  1154. len = output_stop - *output_ptr;
  1155. memcpy(*output_ptr, data_start, len);
  1156. *output_ptr = output_stop;
  1157. ec->elems[ec->num_trans-1].out_data_start += len;
  1158. res = econv_destination_buffer_full;
  1159. goto gotresult;
  1160. }
  1161. len = data_end - data_start;
  1162. memcpy(*output_ptr, data_start, len);
  1163. *output_ptr += len;
  1164. ec->elems[ec->num_trans-1].out_data_start =
  1165. ec->elems[ec->num_trans-1].out_data_end =
  1166. ec->elems[ec->num_trans-1].out_buf_start;
  /* remember that this call already produced output */
  1167. has_output = 1;
  1168. }
  1169. }
  /* ---- step 2: convert data pending in ec's in buffer; it is converted
   * with ECONV_PARTIAL_INPUT set and ECONV_AFTER_OUTPUT cleared ---- */
  1170. if (ec->in_buf_start &&
  1171. ec->in_data_start != ec->in_data_end) {
  1172. res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
  1173. (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
  1174. if (res != econv_source_buffer_empty)
  1175. goto gotresult;
  1176. }
  /* ---- step 3: convert the caller's input ---- */
  /* output was already flushed above and the caller wants to stop after
   * output: run with a zero-length input range and report
   * source_buffer_empty as after_output */
  1177. if (has_output &&
  1178. (flags & ECONV_AFTER_OUTPUT) &&
  1179. *input_ptr != input_stop) {
  1180. input_stop = *input_ptr;
  1181. res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
  1182. if (res == econv_source_buffer_empty)
  1183. res = econv_after_output;
  1184. }
  1185. else if ((flags & ECONV_AFTER_OUTPUT) ||
  1186. ec->num_trans == 1) {
  1187. res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
  1188. }
  1189. else {
  /* several transcoders and no AFTER_OUTPUT requested: request it
   * internally and repeat for as long as after_output comes back */
  1190. flags |= ECONV_AFTER_OUTPUT;
  1191. do {
  1192. res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
  1193. } while (res == econv_after_output);
  1194. }
  1195. gotresult:
  1196. ec->last_error.result = res;
  /* the three error results are only produced by rb_trans_conv() calls above,
   * each of which receives &result_position */
  1197. if (res == econv_invalid_byte_sequence ||
  1198. res == econv_incomplete_input ||
  1199. res == econv_undefined_conversion) {
  1200. rb_transcoding *error_tc = ec->elems[result_position].tc;
  1201. ec->last_error.error_tc = error_tc;
  1202. ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
  1203. ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
  1204. ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
  1205. ec->last_error.error_bytes_len = error_tc->recognized_len;
  1206. ec->last_error.readagain_len = error_tc->readagain_len;
  1207. }
  1208. return res;
  1209. }
  1210. static int output_replacement_character(rb_econv_t *ec);
  1211. static int
  1212. output_hex_charref(rb_econv_t *ec)
  1213. {
  1214. int ret;
  1215. unsigned char utfbuf[1024];
  1216. const unsigned char *utf;
  1217. size_t utf_len;
  1218. int utf_allocated = 0;
  1219. char charef_buf[16];
  1220. const unsigned char *p;
  1221. if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
  1222. utf = ec->last_error.error_bytes_start;
  1223. utf_len = ec->last_error.error_bytes_len;
  1224. }
  1225. else {
  1226. utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
  1227. ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
  1228. utfbuf, sizeof(utfbuf),
  1229. &utf_len);
  1230. if (!utf)
  1231. return -1;
  1232. if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
  1233. utf_allocated = 1;
  1234. }
  1235. if (utf_len % 4 != 0)
  1236. goto fail;
  1237. p = utf;
  1238. while (4 <= utf_len) {
  1239. unsigned int u = 0;
  1240. u += p[0] << 24;
  1241. u += p[1] << 16;
  1242. u += p[2] << 8;
  1243. u += p[3];
  1244. snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
  1245. ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
  1246. if (ret == -1)
  1247. goto fail;
  1248. p += 4;
  1249. utf_len -= 4;
  1250. }
  1251. if (utf_allocated)
  1252. xfree((void *)utf);
  1253. return 0;
  1254. fail:
  1255. if (utf_allocated)
  1256. xfree((void *)utf);
  1257. return -1;
  1258. }
  1259. rb_econv_result_t
  1260. rb_econv_convert(rb_econv_t *ec,
  1261. const unsigned char **input_ptr, const unsigned char *input_stop,
  1262. unsigned char **output_ptr, unsigned char *output_stop,
  1263. int flags)
  1264. {
  1265. rb_econv_result_t ret;
  1266. unsigned char empty_buf;
  1267. unsigned char *empty_ptr = &empty_buf;
  1268. ec->started = 1;
  1269. if (!input_ptr) {
  1270. input_ptr = (const unsigned char **)&empty_ptr;
  1271. input_stop = empty_ptr;
  1272. }
  1273. if (!output_ptr) {
  1274. output_ptr = &empty_ptr;
  1275. output_stop = empty_ptr;
  1276. }
  1277. resume:
  1278. ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
  1279. if (ret == econv_invalid_byte_sequence ||
  1280. ret == econv_incomplete_input) {
  1281. /* deal with invalid byte sequence */
  1282. /* todo: add more alternative behaviors */
  1283. switch (ec->flags & ECONV_INVALID_MASK) {
  1284. case ECONV_INVALID_REPLACE:
  1285. if (output_replacement_character(ec) == 0)
  1286. goto resume;
  1287. }
  1288. }
  1289. if (ret == econv_undefined_conversion) {
  1290. /* valid character in source encoding
  1291. * but no related character(s) in destination encoding */
  1292. /* todo: add more alternative behaviors */
  1293. switch (ec->flags & ECONV_UNDEF_MASK) {
  1294. case ECONV_UNDEF_REPLACE:
  1295. if (output_replacement_character(ec) == 0)
  1296. goto resume;
  1297. break;
  1298. case ECONV_UNDEF_HEX_CHARREF:
  1299. if (output_hex_charref(ec) == 0)
  1300. goto resume;
  1301. break;
  1302. }
  1303. }
  1304. return ret;
  1305. }
  1306. const char *
  1307. rb_econv_encoding_to_insert_output(rb_econv_t *ec)
  1308. {
  1309. rb_transcoding *tc = ec->last_tc;
  1310. const rb_transcoder *tr;
  1311. if (tc == NULL)
  1312. return "";
  1313. tr = tc->transcoder;
  1314. if (tr->asciicompat_type == asciicompat_encoder)
  1315. return tr->src_encoding;
  1316. return tr->dst_encoding;
  1317. }
  1318. static unsigned char *
  1319. allocate_converted_string(const char *sname, const char *dname,
  1320. const unsigned char *str, size_t len,
  1321. unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
  1322. size_t *dst_len_ptr)
  1323. {
  1324. unsigned char *dst_str;
  1325. size_t dst_len;
  1326. size_t dst_bufsize;
  1327. rb_econv_t *ec;
  1328. rb_econv_result_t res;
  1329. const unsigned char *sp;
  1330. unsigned char *dp;
  1331. if (caller_dst_buf)
  1332. dst_bufsize = caller_dst_bufsize;
  1333. else if (len == 0)
  1334. dst_bufsize = 1;
  1335. else
  1336. dst_bufsize = len;
  1337. ec = rb_econv_open(sname, dname, 0);
  1338. if (ec == NULL)
  1339. return NULL;
  1340. if (caller_dst_buf)
  1341. dst_str = caller_dst_buf;
  1342. else
  1343. dst_str = xmalloc(dst_bufsize);
  1344. dst_len = 0;
  1345. sp = str;
  1346. dp = dst_str+dst_len;
  1347. res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
  1348. dst_len = dp - dst_str;
  1349. while (res == econv_destination_buffer_full) {
  1350. if (SIZE_MAX/2 < dst_bufsize) {
  1351. goto fail;
  1352. }
  1353. dst_bufsize *= 2;
  1354. if (dst_str == caller_dst_buf) {
  1355. unsigned char *tmp;
  1356. tmp = xmalloc(dst_bufsize);
  1357. memcpy(tmp, dst_str, dst_bufsize/2);
  1358. dst_str = tmp;
  1359. }
  1360. else {
  1361. dst_str = xrealloc(dst_str, dst_bufsize);
  1362. }
  1363. dp = dst_str+dst_len;
  1364. res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
  1365. dst_len = dp - dst_str;
  1366. }
  1367. if (res != econv_finished) {
  1368. goto fail;
  1369. }
  1370. rb_econv_close(ec);
  1371. *dst_len_ptr = dst_len;
  1372. return dst_str;
  1373. fail:
  1374. if (dst_str != caller_dst_buf)
  1375. xfree(dst_str);
  1376. rb_econv_close(ec);
  1377. return NULL;
  1378. }
  1379. /* result: 0:success -1:failure */
  1380. int
  1381. rb_econv_insert_output(rb_econv_t *ec,
  1382. const unsigned char *str, size_t len, const char *str_encoding)
  1383. {
  1384. const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
  1385. unsigned char insert_buf[4096];
  1386. const unsigned char *insert_str = NULL;
  1387. size_t insert_len;
  1388. int last_trans_index;
  1389. rb_transcoding *tc;
  1390. unsigned char **buf_start_p;
  1391. unsigned char **data_start_p;
  1392. unsigned char **data_end_p;
  1393. unsigned char **buf_end_p;
  1394. size_t need;
  1395. ec->started = 1;
  1396. if (len == 0)
  1397. return 0;
  1398. if (encoding_equal(insert_encoding, str_encoding)) {
  1399. insert_str = str;
  1400. insert_len = len;
  1401. }
  1402. else {
  1403. insert_str = allocate_converted_string(str_encoding, insert_encoding,
  1404. str, len, insert_buf, sizeof(insert_buf), &insert_len);
  1405. if (insert_str == NULL)
  1406. return -1;
  1407. }
  1408. need = insert_len;
  1409. last_trans_index = ec->num_trans-1;
  1410. if (ec->num_trans == 0) {
  1411. tc = NULL;
  1412. buf_start_p = &ec->in_buf_start;
  1413. data_start_p = &ec->in_data_start;
  1414. data_end_p = &ec->in_data_end;
  1415. buf_end_p = &ec->in_buf_end;
  1416. }
  1417. else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
  1418. tc = ec->elems[last_trans_index].tc;
  1419. need += tc->readagain_len;
  1420. if (need < insert_len)
  1421. goto fail;
  1422. if (last_trans_index == 0) {
  1423. buf_start_p = &ec->in_buf_start;
  1424. data_start_p = &ec->in_data_start;
  1425. data_end_p = &ec->in_data_end;
  1426. buf_end_p = &ec->in_buf_end;
  1427. }
  1428. else {
  1429. rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
  1430. buf_start_p = &ee->out_buf_start;
  1431. data_start_p = &ee->out_data_start;
  1432. data_end_p = &ee->out_data_end;
  1433. buf_end_p = &ee->out_buf_end;
  1434. }
  1435. }
  1436. else {
  1437. rb_econv_elem_t *ee = &ec->elems[last_trans_index];
  1438. buf_start_p = &ee->out_buf_start;
  1439. data_start_p = &ee->out_data_start;
  1440. data_end_p = &ee->out_data_end;
  1441. buf_end_p = &ee->out_buf_end;
  1442. tc = ec->elems[last_trans_index].tc;
  1443. }
  1444. if (*buf_start_p == NULL) {
  1445. unsigned char *buf = xmalloc(need);
  1446. *buf_start_p = buf;
  1447. *data_start_p = buf;
  1448. *data_end_p = buf;
  1449. *buf_end_p = buf+need;
  1450. }
  1451. else if ((size_t)(*buf_end_p - *data_end_p) < need) {
  1452. MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
  1453. *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
  1454. *data_start_p = *buf_start_p;
  1455. if ((size_t)(*buf_end_p - *data_end_p) < need) {
  1456. unsigned char *buf;
  1457. size_t s = (*data_end_p - *buf_start_p) + need;
  1458. if (s < need)
  1459. goto fail;
  1460. buf = xrealloc(*buf_start_p, s);
  1461. *data_start_p = buf;
  1462. *data_end_p = buf + (*data_end_p - *buf_start_p);
  1463. *buf_start_p = buf;
  1464. *buf_end_p = buf + s;
  1465. }
  1466. }
  1467. memcpy(*data_end_p, insert_str, insert_len);
  1468. *data_end_p += insert_len;
  1469. if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
  1470. memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
  1471. *data_end_p += tc->readagain_len;
  1472. tc->readagain_len = 0;
  1473. }
  1474. if (insert_str != str && insert_str != insert_buf)
  1475. xfree((void*)insert_str);
  1476. return 0;
  1477. fail:
  1478. if (insert_str != str && insert_str != insert_buf)
  1479. xfree((void*)insert_str);
  1480. return -1;
  1481. }
  1482. void
  1483. rb_econv_close(rb_econv_t *ec)
  1484. {
  1485. int i;
  1486. if (ec->replacement_allocated) {
  1487. xfree((void *)ec->replacement_str);
  1488. }
  1489. for (i = 0; i < ec->num_trans; i++) {
  1490. rb_transcoding_close(ec->elems[i].tc);
  1491. if (ec->elems[i].out_buf_start)
  1492. xfree(ec->elems[i].out_buf_start);
  1493. }
  1494. xfree(ec->in_buf_start);
  1495. xfree(ec->elems);
  1496. xfree(ec);
  1497. }
  1498. size_t
  1499. rb_econv_memsize(rb_econv_t *ec)
  1500. {
  1501. size_t size = sizeof(rb_econv_t);
  1502. int i;
  1503. if (ec->replacement_allocated) {
  1504. size += ec->replacement_len;
  1505. }
  1506. for (i = 0; i < ec->num_trans; i++) {
  1507. size += rb_transcoding_memsize(ec->elems[i].tc);
  1508. if (ec->elems[i].out_buf_start) {
  1509. size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
  1510. }
  1511. }
  1512. size += ec->in_buf_end - ec->in_buf_start;
  1513. size += sizeof(rb_econv_elem_t) * ec->num_allocated;
  1514. return size;
  1515. }
  1516. int
  1517. rb_econv_putbackable(rb_econv_t *ec)
  1518. {
  1519. if (ec->num_trans == 0)
  1520. return 0;
  1521. #if SIZEOF_SIZE_T > SIZEOF_INT
  1522. if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
  1523. #endif
  1524. return (int)ec->elems[0].tc->readagain_len;
  1525. }
  1526. void
  1527. rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
  1528. {
  1529. rb_transcoding *tc;
  1530. if (ec->num_trans == 0 || n == 0)
  1531. return;
  1532. tc = ec->elems[0].tc;
  1533. memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
  1534. tc->readagain_len -= n;
  1535. }
  /*
   * Argument block shared between rb_econv_asciicompat_encoding() and its
   * st_foreach callback asciicompat_encoding_i().
   */
  struct asciicompat_encoding_t {
      /* result: set by the callback to the matching transcoder's
       * destination encoding; NULL until a match is found */
      const char *ascii_compat_name;
      /* query: the encoding name being looked up */
      const char *ascii_incompat_name;
  };
  1540. static int
  1541. asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
  1542. {
  1543. struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
  1544. transcoder_entry_t *entry = (transcoder_entry_t *)val;
  1545. const rb_transcoder *tr;
  1546. if (DECORATOR_P(entry->sname, entry->dname))
  1547. return ST_CONTINUE;
  1548. tr = load_transcoder_entry(entry);
  1549. if (tr && tr->asciicompat_type == asciicompat_decoder) {
  1550. data->ascii_compat_name = tr->dst_encoding;
  1551. return ST_STOP;
  1552. }
  1553. return ST_CONTINUE;
  1554. }
  1555. const char *
  1556. rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
  1557. {
  1558. st_data_t v;
  1559. st_table *table2;
  1560. struct asciicompat_encoding_t data;
  1561. if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
  1562. return NULL;
  1563. table2 = (st_table *)v;
  1564. /*
  1565. * Assumption:
  1566. * There is at most one transcoder for
  1567. * converting from ASCII incompatible encoding.
  1568. *
  1569. * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
  1570. */
  1571. if (table2->num_entries != 1)
  1572. return NULL;
  1573. data.ascii_incompat_name = ascii_incompat_name;
  1574. data.ascii_compat_name = NULL;
  1575. st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
  1576. return data.ascii_compat_name;
  1577. }
  1578. VALUE
  1579. rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
  1580. {
  1581. unsigned const char *ss, *sp, *se;
  1582. unsigned char *ds, *dp, *de;
  1583. rb_econv_result_t res;
  1584. int max_output;
  1585. if (NIL_P(dst)) {
  1586. dst = rb_str_buf_new(len);
  1587. if (ec->destination_encoding)
  1588. rb_enc_associate(dst, ec->destination_encoding);
  1589. }
  1590. if (ec->last_tc)
  1591. max_output = ec->last_tc->transcoder->max_output;
  1592. else
  1593. max_output = 1;
  1594. res = econv_destination_buffer_full;
  1595. while (res == econv_destination_buffer_full) {
  1596. long dlen = RSTRING_LEN(dst);
  1597. if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
  1598. unsigned long new_capa = (unsigned long)dlen + len + max_output;
  1599. if (LONG_MAX < new_capa)
  1600. rb_raise(rb_eArgError, "too long string");
  1601. rb_str_resize(dst, new_capa);
  1602. rb_str_set_len(dst, dlen);
  1603. }
  1604. ss = sp = (const unsigned char *)RSTRING_PTR(src) + off;
  1605. se = ss + len;
  1606. ds = (unsigned char *)RSTRING_PTR(dst);
  1607. de = ds + rb_str_capacity(dst);
  1608. dp = ds += dlen;
  1609. res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
  1610. off += sp - ss;
  1611. len -= sp - ss;
  1612. rb_str_set_len(dst, dlen + (dp - ds));
  1613. rb_econv_check_error(ec);
  1614. }
  1615. return dst;
  1616. }
  1617. VALUE
  1618. rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
  1619. {
  1620. return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
  1621. }
  1622. VALUE
  1623. rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
  1624. {
  1625. return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
  1626. }
  1627. VALUE
  1628. rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
  1629. {
  1630. return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
  1631. }
  1632. static int
  1633. rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
  1634. {
  1635. transcoder_entry_t *entry;
  1636. const rb_transcoder *tr;
  1637. if (ec->started != 0)
  1638. return -1;
  1639. entry = get_transcoder_entry(sname, dname);
  1640. if (!entry)
  1641. return -1;
  1642. tr = load_transcoder_entry(entry);
  1643. return rb_econv_add_transcoder_at(ec, tr, n);
  1644. }
  1645. static int
  1646. rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
  1647. {
  1648. return rb_econv_add_converter(ec, "", decorator_name, n);
  1649. }
  1650. int
  1651. rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
  1652. {
  1653. const rb_transcoder *tr;
  1654. if (ec->num_trans == 0)
  1655. return rb_econv_decorate_at(ec, decorator_name, 0);
  1656. tr = ec->elems[0].tc->transcoder;
  1657. if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
  1658. tr->asciicompat_type == asciicompat_decoder)
  1659. return rb_econv_decorate_at(ec, decorator_name, 1);
  1660. return rb_econv_decorate_at(ec, decorator_name, 0);
  1661. }
  1662. int
  1663. rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
  1664. {
  1665. const rb_transcoder *tr;
  1666. if (ec->num_trans == 0)
  1667. return rb_econv_decorate_at(ec, decorator_name, 0);
  1668. tr = ec->elems[ec->num_trans-1].tc->transcoder;
  1669. if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
  1670. tr->asciicompat_type == asciicompat_encoder)
  1671. return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
  1672. return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
  1673. }
  1674. void
  1675. rb_econv_binmode(rb_econv_t *ec)
  1676. {
  1677. const rb_transcoder *trs[3];
  1678. int n, i, j;
  1679. transcoder_entry_t *entry;
  1680. int num_trans;
  1681. n = 0;
  1682. if (ec->flags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
  1683. entry = get_transcoder_entry("", "universal_newline");
  1684. if (entry->transcoder)
  1685. trs[n++] = entry->transcoder;
  1686. }
  1687. if (ec->flags & ECONV_CRLF_NEWLINE_DECORATOR) {
  1688. entry = get_transcoder_entry("", "crlf_newline");
  1689. if (entry->transcoder)
  1690. trs[n++] = entry->transcoder;
  1691. }
  1692. if (ec->flags & ECONV_CR_NEWLINE_DECORATOR) {
  1693. entry = get_transcoder_entry("", "cr_newline");
  1694. if (entry->transcoder)
  1695. trs[n++] = entry->transcoder;
  1696. }
  1697. num_trans = ec->num_trans;
  1698. j = 0;
  1699. for (i = 0; i < num_trans; i++) {
  1700. int k;
  1701. for (k = 0; k < n; k++)
  1702. if (trs[k] == ec->elems[i].tc->transcoder)
  1703. break;
  1704. if (k == n) {
  1705. ec->elems[j] = ec->elems[i];
  1706. j++;
  1707. }
  1708. else {
  1709. rb_transcoding_close(ec->elems[i].tc);
  1710. xfree(ec->elems[i].out_buf_start);
  1711. ec->num_trans--;
  1712. }
  1713. }
  1714. ec->flags &= ~(ECONV_UNIVERSAL_NEWLINE_DECORATOR|ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR);
  1715. }
  1716. static VALUE
  1717. econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
  1718. {
  1719. int has_description = 0;
  1720. if (NIL_P(mesg))
  1721. mesg = rb_str_new(NULL, 0);
  1722. if (*sname != '\0' || *dname != '\0') {
  1723. if (*sname == '\0')
  1724. rb_str_cat2(mesg, dname);
  1725. else if (*dname == '\0')
  1726. rb_str_cat2(mesg, sname);
  1727. else
  1728. rb_str_catf(mesg, "%s to %s", sname, dname);
  1729. has_description = 1;
  1730. }
  1731. if (ecflags & (ECONV_UNIVERSAL_NEWLINE_DECORATOR|
  1732. ECONV_CRLF_NEWLINE_DECORATOR|
  1733. ECONV_CR_NEWLINE_DECORATOR|
  1734. ECONV_XML_TEXT_DECORATOR|
  1735. ECONV_XML_ATTR_CONTENT_DECORATOR|
  1736. ECONV_XML_ATTR_QUOTE_DECORATOR)) {
  1737. const char *pre = "";
  1738. if (has_description)
  1739. rb_str_cat2(mesg, " with ");
  1740. if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
  1741. rb_str_cat2(mesg, pre); pre = ",";
  1742. rb_str_cat2(mesg, "universal_newline");
  1743. }
  1744. if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
  1745. rb_str_cat2(mesg, pre); pre = ",";
  1746. rb_str_cat2(mesg, "crlf_newline");
  1747. }
  1748. if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
  1749. rb_str_cat2(mesg, pre); pre = ",";
  1750. rb_str_cat2(mesg, "cr_newline");
  1751. }
  1752. if (ecflags & ECONV_XML_TEXT_DECORATOR) {
  1753. rb_str_cat2(mesg, pre); pre = ",";
  1754. rb_str_cat2(mesg, "xml_text");
  1755. }
  1756. if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
  1757. rb_str_cat2(mesg, pre); pre = ",";
  1758. rb_str_cat2(mesg, "xml_attr_content");
  1759. }
  1760. if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
  1761. rb_str_cat2(mesg, pre); pre = ",";
  1762. rb_str_cat2(mesg, "xml_attr_quote");
  1763. }
  1764. has_description = 1;
  1765. }
  1766. if (!has_description) {
  1767. rb_str_cat2(mesg, "no-conversion");
  1768. }
  1769. return mesg;
  1770. }
  1771. VALUE
  1772. rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
  1773. {
  1774. VALUE mesg, exc;
  1775. mesg = rb_str_new_cstr("code converter not found (");
  1776. econv_description(sname, dname, ecflags, mesg);
  1777. rb_str_cat2(mesg, ")");
  1778. exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
  1779. return exc;
  1780. }
  1781. static VALUE
  1782. make_econv_exception(rb_econv_t *ec)
  1783. {
  1784. VALUE mesg, exc;
  1785. if (ec->last_error.result == econv_invalid_byte_sequence ||
  1786. ec->last_error.result == econv_incomplete_input) {
  1787. const char *err = (const char *)ec->last_error.error_bytes_start;
  1788. size_t error_len = ec->last_error.error_bytes_len;
  1789. VALUE bytes = rb_str_new(err, error_len);
  1790. VALUE dumped = rb_str_dump(bytes);
  1791. size_t readagain_len = ec->last_error.readagain_len;
  1792. VALUE bytes2 = Qnil;
  1793. VALUE dumped2;
  1794. int idx;
  1795. if (ec->last_error.result == econv_incomplete_input) {
  1796. mesg = rb_sprintf("incomplete %s on %s",
  1797. StringValueCStr(dumped),
  1798. ec->last_error.source_encoding);
  1799. }
  1800. else if (readagain_len) {
  1801. bytes2 = rb_str_new(err+error_len, readagain_len);
  1802. dumped2 = rb_str_dump(bytes2);
  1803. mesg = rb_sprintf("%s followed by %s on %s",
  1804. StringValueCStr(dumped),
  1805. StringValueCStr(dumped2),
  1806. ec->last_error.source_encoding);
  1807. }
  1808. else {
  1809. mesg = rb_sprintf("%s on %s",
  1810. StringValueCStr(dumped),
  1811. ec->last_error.source_encoding);
  1812. }
  1813. exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
  1814. rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
  1815. rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
  1816. rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
  1817. set_encs:
  1818. rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
  1819. rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
  1820. idx = rb_enc_find_index(ec->last_error.source_encoding);
  1821. if (0 <= idx)
  1822. rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
  1823. idx = rb_enc_find_index(ec->last_error.destination_encoding);
  1824. if (0 <= idx)
  1825. rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
  1826. return exc;
  1827. }
  1828. if (ec->last_error.result == econv_undefined_conversion) {
  1829. VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
  1830. ec->last_error.error_bytes_len);
  1831. VALUE dumped = Qnil;
  1832. int idx;
  1833. if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
  1834. rb_encoding *utf8 = rb_utf8_encoding();
  1835. const char *start, *end;
  1836. int n;
  1837. start = (const char *)ec->last_error.error_bytes_start;
  1838. end = start + ec->last_error.error_bytes_len;
  1839. n = rb_enc_precise_mbclen(start, end, utf8);
  1840. if (MBCLEN_CHARFOUND_P(n) &&
  1841. (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
  1842. unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
  1843. dumped = rb_sprintf("U+%04X", cc);
  1844. }
  1845. }
  1846. if (dumped == Qnil)
  1847. dumped = rb_str_dump(bytes);
  1848. if (strcmp(ec->last_error.source_encoding,
  1849. ec->source_encoding_name) == 0 &&
  1850. strcmp(ec->last_error.destination_encoding,
  1851. ec->destination_encoding_name) == 0) {
  1852. mesg = rb_sprintf("%s from %s to %s",
  1853. StringValueCStr(dumped),
  1854. ec->last_error.source_encoding,
  1855. ec->last_error.destination_encoding);
  1856. }
  1857. else {
  1858. int i;
  1859. mesg = rb_sprintf("%s to %s in conversion from %s",
  1860. StringValueCStr(dumped),
  1861. ec->last_error.destination_encoding,
  1862. ec->source_encoding_name);
  1863. for (i = 0; i < ec->num_trans; i++) {
  1864. const rb_transcoder *tr = ec->elems[i].tc->transcoder;
  1865. if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
  1866. rb_str_catf(mesg, " to %s",
  1867. ec->elems[i].tc->transcoder->dst_encoding);
  1868. }
  1869. }
  1870. exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
  1871. idx = rb_enc_find_index(ec->last_error.source_encoding);
  1872. if (0 <= idx)
  1873. rb_enc_associate_index(bytes, idx);
  1874. rb_ivar_set(exc, rb_intern("error_char"), bytes);
  1875. goto set_encs;
  1876. }
  1877. return Qnil;
  1878. }
  1879. static void
  1880. more_output_buffer(
  1881. VALUE destination,
  1882. unsigned char *(*resize_destination)(VALUE, size_t, size_t),
  1883. int max_output,
  1884. unsigned char **out_start_ptr,
  1885. unsigned char **out_pos,
  1886. unsigned char **out_stop_ptr)
  1887. {
  1888. size_t len = (*out_pos - *out_start_ptr);
  1889. size_t new_len = (len + max_output) * 2;
  1890. *out_start_ptr = resize_destination(destination, len, new_len);
  1891. *out_pos = *out_start_ptr + len;
  1892. *out_stop_ptr = *out_start_ptr + new_len;
  1893. }
  1894. static int
  1895. make_replacement(rb_econv_t *ec)
  1896. {
  1897. rb_transcoding *tc;
  1898. const rb_transcoder *tr;
  1899. rb_encoding *enc;
  1900. const unsigned char *replacement;
  1901. const char *repl_enc;
  1902. const char *ins_enc;
  1903. size_t len;
  1904. if (ec->replacement_str)
  1905. return 0;
  1906. ins_enc = rb_econv_encoding_to_insert_output(ec);
  1907. tc = ec->last_tc;
  1908. if (*ins_enc) {
  1909. tr = tc->transcoder;
  1910. enc = rb_enc_find(tr->dst_encoding);
  1911. replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
  1912. }
  1913. else {
  1914. replacement = (unsigned char *)"?";
  1915. len = 1;
  1916. repl_enc = "";
  1917. }
  1918. ec->replacement_str = replacement;
  1919. ec->replacement_len = len;
  1920. ec->replacement_enc = repl_enc;
  1921. ec->replacement_allocated = 0;
  1922. return 0;
  1923. }
  1924. int
  1925. rb_econv_set_replacement(rb_econv_t *ec,
  1926. const unsigned char *str, size_t len, const char *encname)
  1927. {
  1928. unsigned char *str2;
  1929. size_t len2;
  1930. const char *encname2;
  1931. encname2 = rb_econv_encoding_to_insert_output(ec);
  1932. if (encoding_equal(encname, encname2)) {
  1933. str2 = xmalloc(len);
  1934. MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
  1935. len2 = len;
  1936. encname2 = encname;
  1937. }
  1938. else {
  1939. str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
  1940. if (!str2)
  1941. return -1;
  1942. }
  1943. if (ec->replacement_allocated) {
  1944. xfree((void *)ec->replacement_str);
  1945. }
  1946. ec->replacement_allocated = 1;
  1947. ec->replacement_str = str2;
  1948. ec->replacement_len = len2;
  1949. ec->replacement_enc = encname2;
  1950. return 0;
  1951. }
  1952. static int
  1953. output_replacement_character(rb_econv_t *ec)
  1954. {
  1955. int ret;
  1956. if (make_replacement(ec) == -1)
  1957. return -1;
  1958. ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
  1959. if (ret == -1)
  1960. return -1;
  1961. return 0;
  1962. }
  1963. #if 1
  1964. static void
  1965. transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
  1966. const unsigned char *in_stop, unsigned char *out_stop,
  1967. VALUE destination,
  1968. unsigned char *(*resize_destination)(VALUE, size_t, size_t),
  1969. const char *src_encoding,
  1970. const char *dst_encoding,
  1971. int ecflags,
  1972. VALUE ecopts)
  1973. {
  1974. rb_econv_t *ec;
  1975. rb_transcoding *last_tc;
  1976. rb_econv_result_t ret;
  1977. unsigned char *out_start = *out_pos;
  1978. int max_output;
  1979. VALUE exc;
  1980. ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
  1981. if (!ec)
  1982. rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
  1983. last_tc = ec->last_tc;
  1984. max_output = last_tc ? last_tc->transcoder->max_output : 1;
  1985. resume:
  1986. ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
  1987. if (ret == econv_invalid_byte_sequence ||
  1988. ret == econv_incomplete_input ||
  1989. ret == econv_undefined_conversion) {
  1990. exc = make_econv_exception(ec);
  1991. rb_econv_close(ec);
  1992. rb_exc_raise(exc);
  1993. }
  1994. if (ret == econv_destination_buffer_full) {
  1995. more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
  1996. goto resume;
  1997. }
  1998. rb_econv_close(ec);
  1999. return;
  2000. }
  2001. #else
  2002. /* sample transcode_loop implementation in byte-by-byte stream style */
  2003. static void
  2004. transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
  2005. const unsigned char *in_stop, unsigned char *out_stop,
  2006. VALUE destination,
  2007. unsigned char *(*resize_destination)(VALUE, size_t, size_t),
  2008. const char *src_encoding,
  2009. const char *dst_encoding,
  2010. int ecflags,
  2011. VALUE ecopts)
  2012. {
  2013. rb_econv_t *ec;
  2014. rb_transcoding *last_tc;
  2015. rb_econv_result_t ret;
  2016. unsigned char *out_start = *out_pos;
  2017. const unsigned char *ptr;
  2018. int max_output;
  2019. VALUE exc;
  2020. ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
  2021. if (!ec)
  2022. rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
  2023. last_tc = ec->last_tc;
  2024. max_output = last_tc ? last_tc->transcoder->max_output : 1;
  2025. ret = econv_source_buffer_empty;
  2026. ptr = *in_pos;
  2027. while (ret != econv_finished) {
  2028. unsigned char input_byte;
  2029. const unsigned char *p = &input_byte;
  2030. if (ret == econv_source_buffer_empty) {
  2031. if (ptr < in_stop) {
  2032. input_byte = *ptr;
  2033. ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
  2034. }
  2035. else {
  2036. ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
  2037. }
  2038. }
  2039. else {
  2040. ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
  2041. }
  2042. if (&input_byte != p)
  2043. ptr += p - &input_byte;
  2044. switch (ret) {
  2045. case econv_invalid_byte_sequence:
  2046. case econv_incomplete_input:
  2047. case econv_undefined_conversion:
  2048. exc = make_econv_exception(ec);
  2049. rb_econv_close(ec);
  2050. rb_exc_raise(exc);
  2051. break;
  2052. case econv_destination_buffer_full:
  2053. more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
  2054. break;
  2055. case econv_source_buffer_empty:
  2056. break;
  2057. case econv_finished:
  2058. break;
  2059. }
  2060. }
  2061. rb_econv_close(ec);
  2062. *in_pos = in_stop;
  2063. return;
  2064. }
  2065. #endif
  2066. /*
  2067. * String-specific code
  2068. */
  2069. static unsigned char *
  2070. str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
  2071. {
  2072. rb_str_resize(destination, new_len);
  2073. return (unsigned char *)RSTRING_PTR(destination);
  2074. }
  2075. static int
  2076. econv_opts(VALUE opt)
  2077. {
  2078. VALUE v;
  2079. int ecflags = 0;
  2080. v = rb_hash_aref(opt, sym_invalid);
  2081. if (NIL_P(v)) {
  2082. }
  2083. else if (v==sym_replace) {
  2084. ecflags |= ECONV_INVALID_REPLACE;
  2085. }
  2086. else {
  2087. rb_raise(rb_eArgError, "unknown value for invalid character option");
  2088. }
  2089. v = rb_hash_aref(opt, sym_undef);
  2090. if (NIL_P(v)) {
  2091. }
  2092. else if (v==sym_replace) {
  2093. ecflags |= ECONV_UNDEF_REPLACE;
  2094. }
  2095. else {
  2096. rb_raise(rb_eArgError, "unknown value for undefined character option");
  2097. }
  2098. v = rb_hash_aref(opt, sym_xml);
  2099. if (!NIL_P(v)) {
  2100. if (v==sym_text) {
  2101. ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
  2102. }
  2103. else if (v==sym_attr) {
  2104. ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
  2105. }
  2106. else if (TYPE(v) == T_SYMBOL) {
  2107. rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
  2108. }
  2109. else {
  2110. rb_raise(rb_eArgError, "unexpected value for xml option");
  2111. }
  2112. }
  2113. v = rb_hash_aref(opt, sym_universal_newline);
  2114. if (RTEST(v))
  2115. ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
  2116. v = rb_hash_aref(opt, sym_crlf_newline);
  2117. if (RTEST(v))
  2118. ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
  2119. v = rb_hash_aref(opt, sym_cr_newline);
  2120. if (RTEST(v))
  2121. ecflags |= ECONV_CR_NEWLINE_DECORATOR;
  2122. return ecflags;
  2123. }
  2124. int
  2125. rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
  2126. {
  2127. int ecflags;
  2128. VALUE newhash = Qnil;
  2129. VALUE v;
  2130. if (NIL_P(opthash)) {
  2131. *opts = Qnil;
  2132. return 0;
  2133. }
  2134. ecflags = econv_opts(opthash);
  2135. v = rb_hash_aref(opthash, sym_replace);
  2136. if (!NIL_P(v)) {
  2137. StringValue(v);
  2138. if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
  2139. VALUE dumped = rb_str_dump(v);
  2140. rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
  2141. StringValueCStr(dumped),
  2142. rb_enc_name(rb_enc_get(v)));
  2143. }
  2144. v = rb_str_new_frozen(v);
  2145. newhash = rb_hash_new();
  2146. rb_hash_aset(newhash, sym_replace, v);
  2147. }
  2148. if (!NIL_P(newhash))
  2149. rb_hash_freeze(newhash);
  2150. *opts = newhash;
  2151. return ecflags;
  2152. }
  2153. rb_econv_t *
  2154. rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
  2155. {
  2156. rb_econv_t *ec;
  2157. VALUE replacement;
  2158. if (NIL_P(opthash)) {
  2159. replacement = Qnil;
  2160. }
  2161. else {
  2162. if (TYPE(opthash) != T_HASH || !OBJ_FROZEN(opthash))
  2163. rb_bug("rb_econv_open_opts called with invalid opthash");
  2164. replacement = rb_hash_aref(opthash, sym_replace);
  2165. }
  2166. ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
  2167. if (!ec)
  2168. return ec;
  2169. if (!NIL_P(replacement)) {
  2170. int ret;
  2171. rb_encoding *enc = rb_enc_get(replacement);
  2172. ret = rb_econv_set_replacement(ec,
  2173. (const unsigned char *)RSTRING_PTR(replacement),
  2174. RSTRING_LEN(replacement),
  2175. rb_enc_name(enc));
  2176. if (ret == -1) {
  2177. rb_econv_close(ec);
  2178. return NULL;
  2179. }
  2180. }
  2181. return ec;
  2182. }
  2183. static int
  2184. enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
  2185. {
  2186. rb_encoding *enc;
  2187. const char *n;
  2188. int encidx;
  2189. VALUE encval;
  2190. if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
  2191. !(enc = rb_enc_from_index(encidx))) {
  2192. enc = NULL;
  2193. encidx = 0;
  2194. n = StringValueCStr(*arg);
  2195. }
  2196. else {
  2197. n = rb_enc_name(enc);
  2198. }
  2199. *name_p = n;
  2200. *enc_p = enc;
  2201. return encidx;
  2202. }
  2203. static int
  2204. str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
  2205. const char **sname_p, rb_encoding **senc_p,
  2206. const char **dname_p, rb_encoding **denc_p)
  2207. {
  2208. rb_encoding *senc, *denc;
  2209. const char *sname, *dname;
  2210. int sencidx, dencidx;
  2211. dencidx = enc_arg(arg1, &dname, &denc);
  2212. if (NIL_P(*arg2)) {
  2213. sencidx = rb_enc_get_index(str);
  2214. senc = rb_enc_from_index(sencidx);
  2215. sname = rb_enc_name(senc);
  2216. }
  2217. else {
  2218. sencidx = enc_arg(arg2, &sname, &senc);
  2219. }
  2220. *sname_p = sname;
  2221. *senc_p = senc;
  2222. *dname_p = dname;
  2223. *denc_p = denc;
  2224. return dencidx;
  2225. }
  2226. static int
  2227. str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
  2228. {
  2229. VALUE dest;
  2230. VALUE str = *self;
  2231. volatile VALUE arg1, arg2;
  2232. long blen, slen;
  2233. unsigned char *buf, *bp, *sp;
  2234. const unsigned char *fromp;
  2235. rb_encoding *senc, *denc;
  2236. const char *sname, *dname;
  2237. int dencidx;
  2238. if (argc <0 || argc > 2) {
  2239. rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
  2240. }
  2241. if (argc == 0) {
  2242. arg1 = rb_enc_default_internal();
  2243. if (NIL_P(arg1)) {
  2244. if (!ecflags) return -1;
  2245. arg1 = rb_obj_encoding(str);
  2246. }
  2247. ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE;
  2248. }
  2249. else {
  2250. arg1 = argv[0];
  2251. }
  2252. arg2 = argc<=1 ? Qnil : argv[1];
  2253. dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
  2254. if ((ecflags & (ECONV_UNIVERSAL_NEWLINE_DECORATOR|
  2255. ECONV_CRLF_NEWLINE_DECORATOR|
  2256. ECONV_CR_NEWLINE_DECORATOR|
  2257. ECONV_XML_TEXT_DECORATOR|
  2258. ECONV_XML_ATTR_CONTENT_DECORATOR|
  2259. ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
  2260. if (senc && senc == denc) {
  2261. return NIL_P(arg2) ? -1 : dencidx;
  2262. }
  2263. if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
  2264. if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
  2265. return dencidx;
  2266. }
  2267. }
  2268. if (encoding_equal(sname, dname)) {
  2269. return NIL_P(arg2) ? -1 : dencidx;
  2270. }
  2271. }
  2272. else {
  2273. if (encoding_equal(sname, dname)) {
  2274. sname = "";
  2275. dname = "";
  2276. }
  2277. }
  2278. fromp = sp = (unsigned char *)RSTRING_PTR(str);
  2279. slen = RSTRING_LEN(str);
  2280. blen = slen + 30; /* len + margin */
  2281. dest = rb_str_tmp_new(blen);
  2282. bp = (unsigned char *)RSTRING_PTR(dest);
  2283. transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
  2284. if (fromp != sp+slen) {
  2285. rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
  2286. }
  2287. buf = (unsigned char *)RSTRING_PTR(dest);
  2288. *bp = '\0';
  2289. rb_str_set_len(dest, bp - buf);
  2290. /* set encoding */
  2291. if (!denc) {
  2292. dencidx = rb_define_dummy_encoding(dname);
  2293. }
  2294. *self = dest;
  2295. return dencidx;
  2296. }
  2297. static int
  2298. str_transcode(int argc, VALUE *argv, VALUE *self)
  2299. {
  2300. VALUE opt;
  2301. int ecflags = 0;
  2302. VALUE ecopts = Qnil;
  2303. if (0 < argc) {
  2304. opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
  2305. if (!NIL_P(opt)) {
  2306. argc--;
  2307. ecflags = rb_econv_prepare_opts(opt, &ecopts);
  2308. }
  2309. }
  2310. return str_transcode0(argc, argv, self, ecflags, ecopts);
  2311. }
  2312. static inline VALUE
  2313. str_encode_associate(VALUE str, int encidx)
  2314. {
  2315. int cr = 0;
  2316. rb_enc_associate_index(str, encidx);
  2317. /* transcoded string never be broken. */
  2318. if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
  2319. rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
  2320. }
  2321. else {
  2322. cr = ENC_CODERANGE_VALID;
  2323. }
  2324. ENC_CODERANGE_SET(str, cr);
  2325. return str;
  2326. }
  2327. /*
  2328. * call-seq:
  2329. * str.encode!(encoding [, options] ) => str
  2330. * str.encode!(dst_encoding, src_encoding [, options] ) => str
  2331. *
  2332. * The first form transcodes the contents of <i>str</i> from
  2333. * str.encoding to +encoding+.
  2334. * The second form transcodes the contents of <i>str</i> from
  2335. * src_encoding to dst_encoding.
  2336. * The options Hash gives details for conversion. See String#encode
  2337. * for details.
  2338. * Returns the string even if no changes were made.
  2339. */
  2340. static VALUE
  2341. str_encode_bang(int argc, VALUE *argv, VALUE str)
  2342. {
  2343. VALUE newstr;
  2344. int encidx;
  2345. if (OBJ_FROZEN(str)) { /* in future, may use str_frozen_check from string.c, but that's currently static */
  2346. rb_raise(rb_eRuntimeError, "string frozen");
  2347. }
  2348. newstr = str;
  2349. encidx = str_transcode(argc, argv, &newstr);
  2350. if (encidx < 0) return str;
  2351. rb_str_shared_replace(str, newstr);
  2352. return str_encode_associate(str, encidx);
  2353. }
  2354. /*
  2355. * call-seq:
  2356. * str.encode(encoding [, options] ) => str
  2357. * str.encode(dst_encoding, src_encoding [, options] ) => str
  2358. * str.encode([options]) => str
  2359. *
  2360. * The first form returns a copy of <i>str</i> transcoded
  2361. * to encoding +encoding+.
  2362. * The second form returns a copy of <i>str</i> transcoded
  2363. * from src_encoding to dst_encoding.
  2364. * The last form returns a copy of <i>str</i> transcoded to
  2365. * <code>Encoding.default_internal</code>.
  2366. * By default, the first and second form raise
  2367. * Encoding::UndefinedConversionError for characters that are
  2368. * undefined in the destination encoding, and
  2369. * Encoding::InvalidByteSequenceError for invalid byte sequences
  2370. * in the source encoding. The last form by default does not raise
  2371. * exceptions but uses replacement strings.
  2372. * The <code>options</code> Hash gives details for conversion.
  2373. *
  2374. * === options
  2375. * The hash <code>options</code> can have the following keys:
  2376. * :invalid ::
  2377. * If the value is <code>:replace</code>, <code>#encode</code> replaces
  2378. * invalid byte sequences in <code>str</code> with the replacement character.
  2379. * The default is to raise the exception
  2380. * :undef ::
  2381. * If the value is <code>:replace</code>, <code>#encode</code> replaces
  2382. * characters which are undefined in the destination encoding with
  2383. * the replacement character.
  2384. * :replace ::
  2385. * Sets the replacement string to the value. The default replacement
  2386. * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
  2387. * :xml ::
  2388. * The value must be <code>:text</code> or <code>:attr</code>.
  2389. * If the value is <code>:text</code> <code>#encode</code> replaces
  2390. * undefined characters with their (upper-case hexadecimal) numeric
  2391. * character references. '&', '<', and '>' are converted to "&amp;",
  2392. * "&lt;", and "&gt;", respectively.
  2393. * If the value is <code>:attr</code>, <code>#encode</code> also quotes
  2394. * the replacement result (using '"'), and replaces '"' with "&quot;".
  2395. * :cr_newline ::
  2396. * Replaces LF ("\n") with CR ("\r") if value is true.
  2397. * :crlf_newline ::
  2398. * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
  2399. * :universal_newline ::
  2400. * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
  2401. */
  2402. static VALUE
  2403. str_encode(int argc, VALUE *argv, VALUE str)
  2404. {
  2405. VALUE newstr = str;
  2406. int encidx = str_transcode(argc, argv, &newstr);
  2407. if (encidx < 0) return rb_str_dup(str);
  2408. if (newstr == str) {
  2409. newstr = rb_str_dup(str);
  2410. }
  2411. else {
  2412. RBASIC(newstr)->klass = rb_obj_class(str);
  2413. }
  2414. return str_encode_associate(newstr, encidx);
  2415. }
  2416. VALUE
  2417. rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
  2418. {
  2419. int argc = 1;
  2420. VALUE *argv = &to;
  2421. VALUE newstr = str;
  2422. int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
  2423. if (encidx < 0) return rb_str_dup(str);
  2424. RBASIC(newstr)->klass = rb_obj_class(str);
  2425. return str_encode_associate(newstr, encidx);
  2426. }
  2427. static void
  2428. econv_free(void *ptr)
  2429. {
  2430. rb_econv_t *ec = ptr;
  2431. rb_econv_close(ec);
  2432. }
  2433. static size_t
  2434. econv_memsize(const void *ptr)
  2435. {
  2436. return ptr ? sizeof(rb_econv_t) : 0;
  2437. }
  2438. static const rb_data_type_t econv_data_type = {
  2439. "econv",
  2440. NULL, econv_free, econv_memsize,
  2441. };
  2442. static VALUE
  2443. econv_s_allocate(VALUE klass)
  2444. {
  2445. return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
  2446. }
  2447. static rb_encoding *
  2448. make_dummy_encoding(const char *name)
  2449. {
  2450. rb_encoding *enc;
  2451. int idx;
  2452. idx = rb_define_dummy_encoding(name);
  2453. enc = rb_enc_from_index(idx);
  2454. return enc;
  2455. }
  2456. static rb_encoding *
  2457. make_encoding(const char *name)
  2458. {
  2459. rb_encoding *enc;
  2460. enc = rb_enc_find(name);
  2461. if (!enc)
  2462. enc = make_dummy_encoding(name);
  2463. return enc;
  2464. }
  2465. static VALUE
  2466. make_encobj(const char *name)
  2467. {
  2468. return rb_enc_from_encoding(make_encoding(name));
  2469. }
  2470. /*
  2471. * call-seq:
  2472. * Encoding::Converter.asciicompat_encoding(string) => encoding or nil
  2473. * Encoding::Converter.asciicompat_encoding(encoding) => encoding or nil
  2474. *
  2475. * Returns the corresponding ASCII compatible encoding.
  2476. *
  2477. * Returns nil if the argument is an ASCII compatible encoding.
  2478. *
 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
 * can represent exactly the same characters as the given ASCII incompatible encoding.
 * So, no undefined conversion error occurs when converting between the two encodings.
  2482. *
  2483. * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
  2484. * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
  2485. * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
  2486. *
  2487. */
  2488. static VALUE
  2489. econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
  2490. {
  2491. const char *arg_name, *result_name;
  2492. rb_encoding *arg_enc, *result_enc;
  2493. enc_arg(&arg, &arg_name, &arg_enc);
  2494. result_name = rb_econv_asciicompat_encoding(arg_name);
  2495. if (result_name == NULL)
  2496. return Qnil;
  2497. result_enc = make_encoding(result_name);
  2498. return rb_enc_from_encoding(result_enc);
  2499. }
  2500. static void
  2501. econv_args(int argc, VALUE *argv,
  2502. volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
  2503. const char **sname_p, const char **dname_p,
  2504. rb_encoding **senc_p, rb_encoding **denc_p,
  2505. int *ecflags_p,
  2506. VALUE *ecopts_p)
  2507. {
  2508. VALUE opt, opthash, flags_v, ecopts;
  2509. int sidx, didx;
  2510. const char *sname, *dname;
  2511. rb_encoding *senc, *denc;
  2512. int ecflags;
  2513. rb_scan_args(argc, argv, "21", snamev_p, dnamev_p, &opt);
  2514. if (NIL_P(opt)) {
  2515. ecflags = 0;
  2516. ecopts = Qnil;
  2517. }
  2518. else if (!NIL_P(flags_v = rb_check_to_integer(opt, "to_int"))) {
  2519. ecflags = NUM2INT(flags_v);
  2520. ecopts = Qnil;
  2521. }
  2522. else {
  2523. opthash = rb_convert_type(opt, T_HASH, "Hash", "to_hash");
  2524. ecflags = rb_econv_prepare_opts(opthash, &ecopts);
  2525. }
  2526. senc = NULL;
  2527. sidx = rb_to_encoding_index(*snamev_p);
  2528. if (0 <= sidx) {
  2529. senc = rb_enc_from_index(sidx);
  2530. }
  2531. else {
  2532. StringValue(*snamev_p);
  2533. }
  2534. denc = NULL;
  2535. didx = rb_to_encoding_index(*dnamev_p);
  2536. if (0 <= didx) {
  2537. denc = rb_enc_from_index(didx);
  2538. }
  2539. else {
  2540. StringValue(*dnamev_p);
  2541. }
  2542. sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
  2543. dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
  2544. *sname_p = sname;
  2545. *dname_p = dname;
  2546. *senc_p = senc;
  2547. *denc_p = denc;
  2548. *ecflags_p = ecflags;
  2549. *ecopts_p = ecopts;
  2550. }
  2551. static int
  2552. decorate_convpath(VALUE convpath, int ecflags)
  2553. {
  2554. int num_decorators;
  2555. const char *decorators[MAX_ECFLAGS_DECORATORS];
  2556. int i;
  2557. int n, len;
  2558. num_decorators = decorator_names(ecflags, decorators);
  2559. if (num_decorators == -1)
  2560. return -1;
  2561. len = n = RARRAY_LENINT(convpath);
  2562. if (n != 0) {
  2563. VALUE pair = RARRAY_PTR(convpath)[n-1];
  2564. if (TYPE(pair) == T_ARRAY) {
  2565. const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0]));
  2566. const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1]));
  2567. transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
  2568. const rb_transcoder *tr = load_transcoder_entry(entry);
  2569. if (!tr)
  2570. return -1;
  2571. if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
  2572. tr->asciicompat_type == asciicompat_encoder) {
  2573. n--;
  2574. rb_ary_store(convpath, len + num_decorators - 1, pair);
  2575. }
  2576. }
  2577. else {
  2578. rb_ary_store(convpath, len + num_decorators - 1, pair);
  2579. }
  2580. }
  2581. for (i = 0; i < num_decorators; i++)
  2582. rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
  2583. return 0;
  2584. }
  2585. static void
  2586. search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
  2587. {
  2588. VALUE *ary_p = arg;
  2589. VALUE v;
  2590. if (*ary_p == Qnil) {
  2591. *ary_p = rb_ary_new();
  2592. }
  2593. if (DECORATOR_P(sname, dname)) {
  2594. v = rb_str_new_cstr(dname);
  2595. }
  2596. else {
  2597. v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
  2598. }
  2599. rb_ary_store(*ary_p, depth, v);
  2600. }
  2601. /*
  2602. * call-seq:
  2603. * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
  2604. * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
  2605. *
  2606. * Returns a conversion path.
  2607. *
  2608. * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
  2609. * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
  2610. * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
  2611. *
  2612. * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
  2613. * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
  2614. * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
  2615. * # "universal_newline"]
  2616. *
  2617. * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
  2618. * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
  2619. * # "universal_newline",
  2620. * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
  2621. */
  2622. static VALUE
  2623. econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
  2624. {
  2625. volatile VALUE snamev, dnamev;
  2626. const char *sname, *dname;
  2627. rb_encoding *senc, *denc;
  2628. int ecflags;
  2629. VALUE ecopts;
  2630. VALUE convpath;
  2631. econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
  2632. convpath = Qnil;
  2633. transcode_search_path(sname, dname, search_convpath_i, &convpath);
  2634. if (NIL_P(convpath))
  2635. rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
  2636. if (decorate_convpath(convpath, ecflags) == -1)
  2637. rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
  2638. return convpath;
  2639. }
  2640. /*
  2641. * Check the existence of a conversion path.
  2642. * Returns the number of converters in the conversion path.
  2643. * result: >=0:success -1:failure
  2644. */
  2645. int
  2646. rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
  2647. {
  2648. VALUE convpath = Qnil;
  2649. transcode_search_path(from_encoding, to_encoding, search_convpath_i,
  2650. &convpath);
  2651. return RTEST(convpath);
  2652. }
  2653. struct rb_econv_init_by_convpath_t {
  2654. rb_econv_t *ec;
  2655. int index;
  2656. int ret;
  2657. };
  2658. static void
  2659. rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
  2660. {
  2661. struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
  2662. int ret;
  2663. if (a->ret == -1)
  2664. return;
  2665. ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
  2666. a->ret = ret;
  2667. return;
  2668. }
  2669. static rb_econv_t *
  2670. rb_econv_init_by_convpath(VALUE self, VALUE convpath,
  2671. const char **sname_p, const char **dname_p,
  2672. rb_encoding **senc_p, rb_encoding**denc_p)
  2673. {
  2674. rb_econv_t *ec;
  2675. long i;
  2676. int ret, first=1;
  2677. VALUE elt;
  2678. rb_encoding *senc = 0, *denc = 0;
  2679. const char *sname, *dname;
  2680. ec = rb_econv_alloc(RARRAY_LENINT(convpath));
  2681. DATA_PTR(self) = ec;
  2682. for (i = 0; i < RARRAY_LEN(convpath); i++) {
  2683. volatile VALUE snamev, dnamev;
  2684. VALUE pair;
  2685. elt = rb_ary_entry(convpath, i);
  2686. if (!NIL_P(pair = rb_check_array_type(elt))) {
  2687. if (RARRAY_LEN(pair) != 2)
  2688. rb_raise(rb_eArgError, "not a 2-element array in convpath");
  2689. snamev = rb_ary_entry(pair, 0);
  2690. enc_arg(&snamev, &sname, &senc);
  2691. dnamev = rb_ary_entry(pair, 1);
  2692. enc_arg(&dnamev, &dname, &denc);
  2693. }
  2694. else {
  2695. sname = "";
  2696. dname = StringValueCStr(elt);
  2697. }
  2698. if (DECORATOR_P(sname, dname)) {
  2699. ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
  2700. if (ret == -1)
  2701. rb_raise(rb_eArgError, "decoration failed: %s", dname);
  2702. }
  2703. else {
  2704. int j = ec->num_trans;
  2705. struct rb_econv_init_by_convpath_t arg;
  2706. arg.ec = ec;
  2707. arg.index = ec->num_trans;
  2708. arg.ret = 0;
  2709. ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
  2710. if (ret == -1 || arg.ret == -1)
  2711. rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
  2712. if (first) {
  2713. first = 0;
  2714. *senc_p = senc;
  2715. *sname_p = ec->elems[j].tc->transcoder->src_encoding;
  2716. }
  2717. *denc_p = denc;
  2718. *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
  2719. }
  2720. }
  2721. if (first) {
  2722. *senc_p = NULL;
  2723. *denc_p = NULL;
  2724. *sname_p = "";
  2725. *dname_p = "";
  2726. }
  2727. ec->source_encoding_name = *sname_p;
  2728. ec->destination_encoding_name = *dname_p;
  2729. return ec;
  2730. }
  2731. /*
  2732. * call-seq:
  2733. * Encoding::Converter.new(source_encoding, destination_encoding)
  2734. * Encoding::Converter.new(source_encoding, destination_encoding, opt)
  2735. * Encoding::Converter.new(convpath)
  2736. *
  2737. * possible options elements:
  2738. * hash form:
  2739. * :invalid => nil # raise error on invalid byte sequence (default)
  2740. * :invalid => :replace # replace invalid byte sequence
  2741. * :undef => nil # raise error on undefined conversion (default)
  2742. * :undef => :replace # replace undefined conversion
  2743. * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
  2744. * :universal_newline => true # decorator for converting CRLF and CR to LF
  2745. * :crlf_newline => true # decorator for converting LF to CRLF
  2746. * :cr_newline => true # decorator for converting LF to CR
  2747. * :xml => :text # escape as XML CharData.
  2748. * :xml => :attr # escape as XML AttValue
  2749. * integer form:
  2750. * Encoding::Converter::INVALID_REPLACE
  2751. * Encoding::Converter::UNDEF_REPLACE
  2752. * Encoding::Converter::UNDEF_HEX_CHARREF
  2753. * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
  2754. * Encoding::Converter::CRLF_NEWLINE_DECORATOR
  2755. * Encoding::Converter::CR_NEWLINE_DECORATOR
  2756. * Encoding::Converter::XML_TEXT_DECORATOR
  2757. * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
  2758. * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
  2759. *
  2760. * Encoding::Converter.new creates an instance of Encoding::Converter.
  2761. *
  2762. * Source_encoding and destination_encoding should be a string or
  2763. * Encoding object.
  2764. *
  2765. * opt should be nil, a hash or an integer.
  2766. *
  2767. * convpath should be an array.
  2768. * convpath may contain
  2769. * - two-element arrays which contain encodings or encoding names, or
  2770. * - strings representing decorator names.
  2771. *
  2772. * Encoding::Converter.new optionally takes an option.
  2773. * The option should be a hash or an integer.
  2774. * The option hash can contain :invalid => nil, etc.
  2775. * The option integer should be logical-or of constants such as
  2776. * Encoding::Converter::INVALID_REPLACE, etc.
  2777. *
  2778. * [:invalid => nil]
  2779. * Raise an error on an invalid byte sequence. This is the default behavior.
  2780. * [:invalid => :replace]
  2781. * Replace invalid byte sequence by replacement string.
  2782. * [:undef => nil]
  2783. * Raise an error if a character in source_encoding is not defined in destination_encoding.
  2784. * This is the default behavior.
  2785. * [:undef => :replace]
  2786. * Replace undefined character in destination_encoding with replacement string.
  2787. * [:replace => string]
  2788. * Specify the replacement string.
  2789. * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
  2790. * [:universal_newline => true]
  2791. * Convert CRLF and CR to LF.
  2792. * [:crlf_newline => true]
  2793. * Convert LF to CRLF.
  2794. * [:cr_newline => true]
  2795. * Convert LF to CR.
  2796. * [:xml => :text]
  2797. * Escape as XML CharData.
  2798. * This form can be used as a HTML 4.0 #PCDATA.
  2799. * - '&' -> '&amp;'
  2800. * - '<' -> '&lt;'
  2801. * - '>' -> '&gt;'
  2802. * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
  2803. * [:xml => :attr]
  2804. * Escape as XML AttValue.
  2805. * The converted result is quoted as "...".
  2806. * This form can be used as a HTML 4.0 attribute value.
  2807. * - '&' -> '&amp;'
  2808. * - '<' -> '&lt;'
  2809. * - '>' -> '&gt;'
  2810. * - '"' -> '&quot;'
  2811. * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
  2812. *
  2813. * Examples:
  2814. * # UTF-16BE to UTF-8
  2815. * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
  2816. *
  2817. * # Usually, decorators such as newline conversion are inserted last.
  2818. * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
  2819. * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
  2820. * # "universal_newline"]
  2821. *
  2822. * # But, if the last encoding is ASCII incompatible,
  2823. * # decorators are inserted before the last conversion.
  2824. * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
  2825. * p ec.convpath #=> ["crlf_newline",
  2826. * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
  2827. *
  2828. * # Conversion path can be specified directly.
  2829. * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
  2830. * p ec.convpath #=> ["universal_newline",
  2831. * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
  2832. * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
  2833. */
  2834. static VALUE
  2835. econv_init(int argc, VALUE *argv, VALUE self)
  2836. {
  2837. VALUE ecopts;
  2838. volatile VALUE snamev, dnamev;
  2839. const char *sname, *dname;
  2840. rb_encoding *senc, *denc;
  2841. rb_econv_t *ec;
  2842. int ecflags;
  2843. VALUE convpath;
  2844. if (rb_check_typeddata(self, &econv_data_type)) {
  2845. rb_raise(rb_eTypeError, "already initialized");
  2846. }
  2847. if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
  2848. ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
  2849. ecflags = 0;
  2850. ecopts = Qnil;
  2851. }
  2852. else {
  2853. econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
  2854. ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
  2855. }
  2856. if (!ec) {
  2857. rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
  2858. }
  2859. if (!DECORATOR_P(sname, dname)) {
  2860. if (!senc)
  2861. senc = make_dummy_encoding(sname);
  2862. if (!denc)
  2863. denc = make_dummy_encoding(dname);
  2864. }
  2865. ec->source_encoding = senc;
  2866. ec->destination_encoding = denc;
  2867. DATA_PTR(self) = ec;
  2868. return self;
  2869. }
  2870. /*
  2871. * call-seq:
  2872. * ec.inspect -> string
  2873. *
  2874. * Returns a printable version of <i>ec</i>
  2875. *
  2876. * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
  2877. * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
  2878. *
  2879. */
  2880. static VALUE
  2881. econv_inspect(VALUE self)
  2882. {
  2883. const char *cname = rb_obj_classname(self);
  2884. rb_econv_t *ec;
  2885. TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
  2886. if (!ec)
  2887. return rb_sprintf("#<%s: uninitialized>", cname);
  2888. else {
  2889. const char *sname = ec->source_encoding_name;
  2890. const char *dname = ec->destination_encoding_name;
  2891. VALUE str;
  2892. str = rb_sprintf("#<%s: ", cname);
  2893. econv_description(sname, dname, ec->flags, str);
  2894. rb_str_cat2(str, ">");
  2895. return str;
  2896. }
  2897. }
  2898. static rb_econv_t *
  2899. check_econv(VALUE self)
  2900. {
  2901. rb_econv_t *ec;
  2902. TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
  2903. if (!ec) {
  2904. rb_raise(rb_eTypeError, "uninitialized encoding converter");
  2905. }
  2906. return ec;
  2907. }
  2908. /*
  2909. * call-seq:
  2910. * ec.source_encoding -> encoding
  2911. *
  2912. * Returns the source encoding as an Encoding object.
  2913. */
  2914. static VALUE
  2915. econv_source_encoding(VALUE self)
  2916. {
  2917. rb_econv_t *ec = check_econv(self);
  2918. if (!ec->source_encoding)
  2919. return Qnil;
  2920. return rb_enc_from_encoding(ec->source_encoding);
  2921. }
  2922. /*
  2923. * call-seq:
  2924. * ec.destination_encoding -> encoding
  2925. *
  2926. * Returns the destination encoding as an Encoding object.
  2927. */
  2928. static VALUE
  2929. econv_destination_encoding(VALUE self)
  2930. {
  2931. rb_econv_t *ec = check_econv(self);
  2932. if (!ec->destination_encoding)
  2933. return Qnil;
  2934. return rb_enc_from_encoding(ec->destination_encoding);
  2935. }
  2936. /*
  2937. * call-seq:
  2938. * ec.convpath -> ary
  2939. *
  2940. * Returns the conversion path of ec.
  2941. *
  2942. * The result is an array of conversions.
  2943. *
  2944. * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
  2945. * p ec.convpath
  2946. * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
  2947. * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
  2948. * # "crlf_newline"]
  2949. *
  2950. * Each element of the array is a pair of encodings or a string.
  2951. * A pair means an encoding conversion.
  2952. * A string means a decorator.
  2953. *
  2954. * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
  2955. * a converter from ISO-8859-1 to UTF-8.
  2956. * "crlf_newline" means newline converter from LF to CRLF.
  2957. */
  2958. static VALUE
  2959. econv_convpath(VALUE self)
  2960. {
  2961. rb_econv_t *ec = check_econv(self);
  2962. VALUE result;
  2963. int i;
  2964. result = rb_ary_new();
  2965. for (i = 0; i < ec->num_trans; i++) {
  2966. const rb_transcoder *tr = ec->elems[i].tc->transcoder;
  2967. VALUE v;
  2968. if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
  2969. v = rb_str_new_cstr(tr->dst_encoding);
  2970. else
  2971. v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
  2972. rb_ary_push(result, v);
  2973. }
  2974. return result;
  2975. }
  2976. static VALUE
  2977. econv_result_to_symbol(rb_econv_result_t res)
  2978. {
  2979. switch (res) {
  2980. case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
  2981. case econv_incomplete_input: return sym_incomplete_input;
  2982. case econv_undefined_conversion: return sym_undefined_conversion;
  2983. case econv_destination_buffer_full: return sym_destination_buffer_full;
  2984. case econv_source_buffer_empty: return sym_source_buffer_empty;
  2985. case econv_finished: return sym_finished;
  2986. case econv_after_output: return sym_after_output;
  2987. default: return INT2NUM(res); /* should not be reached */
  2988. }
  2989. }
  2990. /*
  2991. * call-seq:
  2992. * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
  2993. * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
  2994. * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
  2995. * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
  2996. *
  2997. * possible opt elements:
  2998. * hash form:
  2999. * :partial_input => true # source buffer may be part of larger source
  3000. * :after_output => true # stop conversion after output before input
  3001. * integer form:
  3002. * Encoding::Converter::PARTIAL_INPUT
  3003. * Encoding::Converter::AFTER_OUTPUT
  3004. *
  3005. * possible results:
  3006. * :invalid_byte_sequence
  3007. * :incomplete_input
  3008. * :undefined_conversion
  3009. * :after_output
  3010. * :destination_buffer_full
  3011. * :source_buffer_empty
  3012. * :finished
  3013. *
  3014. * primitive_convert converts source_buffer into destination_buffer.
  3015. *
  3016. * source_buffer should be a string or nil.
  3017. * nil means an empty string.
  3018. *
  3019. * destination_buffer should be a string.
  3020. *
  3021. * destination_byteoffset should be an integer or nil.
  3022. * nil means the end of destination_buffer.
  3023. * If it is omitted, nil is assumed.
  3024. *
  3025. * destination_bytesize should be an integer or nil.
  3026. * nil means unlimited.
  3027. * If it is omitted, nil is assumed.
  3028. *
  3029. * opt should be nil, a hash or an integer.
  3030. * nil means no flags.
  3031. * If it is omitted, nil is assumed.
  3032. *
  3033. * primitive_convert converts the content of source_buffer from the beginning
  3034. * and stores the result into destination_buffer.
  3035. *
  3036. * destination_byteoffset and destination_bytesize specify the region in which
  3037. * the converted result is stored.
  3038. * destination_byteoffset specifies the start position in destination_buffer in bytes.
  3039. * If destination_byteoffset is nil,
  3040. * destination_buffer.bytesize is used for appending the result.
  3041. * destination_bytesize specifies maximum number of bytes.
  3042. * If destination_bytesize is nil,
  3043. * destination size is unlimited.
  3044. * After conversion, destination_buffer is resized to
  3045. * destination_byteoffset + actually produced number of bytes.
  3046. * Also destination_buffer's encoding is set to destination_encoding.
  3047. *
  3048. * primitive_convert drops the converted part of source_buffer.
  3049. * The dropped part is either converted into destination_buffer or
  3050. * buffered in the Encoding::Converter object.
  3051. *
  3052. * primitive_convert stops conversion when one of the following conditions is met.
  3053. * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
  3054. * - unexpected end of source buffer (:incomplete_input)
  3055. * this occurs only when :partial_input is not specified.
  3056. * - character not representable in output encoding (:undefined_conversion)
  3057. * - after some output is generated, before input is done (:after_output)
  3058. * this occurs only when :after_output is specified.
  3059. * - destination buffer is full (:destination_buffer_full)
  3060. * this occurs only when destination_bytesize is non-nil.
  3061. * - source buffer is empty (:source_buffer_empty)
  3062. * this occurs only when :partial_input is specified.
  3063. * - conversion is finished (:finished)
  3064. *
  3065. * example:
  3066. * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
  3067. * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
  3068. * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
  3069. *
  3070. * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
  3071. * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
  3072. * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
  3073. * ret = ec.primitive_convert(src, dst="", nil, 1)
  3074. * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
  3075. * ret = ec.primitive_convert(src, dst="", nil, 1)
  3076. * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
  3077. * ret = ec.primitive_convert(src, dst="", nil, 1)
  3078. * p [ret, src, dst] #=> [:finished, "", "i"]
  3079. *
  3080. */
  3081. static VALUE
  3082. econv_primitive_convert(int argc, VALUE *argv, VALUE self)
  3083. {
  3084. VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
  3085. rb_econv_t *ec = check_econv(self);
  3086. rb_econv_result_t res;
  3087. const unsigned char *ip, *is;
  3088. unsigned char *op, *os;
  3089. long output_byteoffset, output_bytesize;
  3090. unsigned long output_byteend;
  3091. int flags;
  3092. rb_scan_args(argc, argv, "23", &input, &output, &output_byteoffset_v, &output_bytesize_v, &opt);
  3093. if (NIL_P(output_byteoffset_v))
  3094. output_byteoffset = 0; /* dummy */
  3095. else
  3096. output_byteoffset = NUM2LONG(output_byteoffset_v);
  3097. if (NIL_P(output_bytesize_v))
  3098. output_bytesize = 0; /* dummy */
  3099. else
  3100. output_bytesize = NUM2LONG(output_bytesize_v);
  3101. if (NIL_P(opt)) {
  3102. flags = 0;
  3103. }
  3104. else if (!NIL_P(flags_v = rb_check_to_integer(opt, "to_int"))) {
  3105. flags = NUM2INT(flags_v);
  3106. }
  3107. else {
  3108. VALUE v;
  3109. opt = rb_convert_type(opt, T_HASH, "Hash", "to_hash");
  3110. flags = 0;
  3111. v = rb_hash_aref(opt, sym_partial_input);
  3112. if (RTEST(v))
  3113. flags |= ECONV_PARTIAL_INPUT;
  3114. v = rb_hash_aref(opt, sym_after_output);
  3115. if (RTEST(v))
  3116. flags |= ECONV_AFTER_OUTPUT;
  3117. }
  3118. StringValue(output);
  3119. if (!NIL_P(input))
  3120. StringValue(input);
  3121. rb_str_modify(output);
  3122. if (NIL_P(output_bytesize_v)) {
  3123. output_bytesize = RSTRING_EMBED_LEN_MAX;
  3124. if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
  3125. output_bytesize = RSTRING_LEN(input);
  3126. }
  3127. retry:
  3128. if (NIL_P(output_byteoffset_v))
  3129. output_byteoffset = RSTRING_LEN(output);
  3130. if (output_byteoffset < 0)
  3131. rb_raise(rb_eArgError, "negative output_byteoffset");
  3132. if (RSTRING_LEN(output) < output_byteoffset)
  3133. rb_raise(rb_eArgError, "output_byteoffset too big");
  3134. if (output_bytesize < 0)
  3135. rb_raise(rb_eArgError, "negative output_bytesize");
  3136. output_byteend = (unsigned long)output_byteoffset +
  3137. (unsigned long)output_bytesize;
  3138. if (output_byteend < (unsigned long)output_byteoffset ||
  3139. LONG_MAX < output_byteend)
  3140. rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
  3141. if (rb_str_capacity(output) < output_byteend)
  3142. rb_str_resize(output, output_byteend);
  3143. if (NIL_P(input)) {
  3144. ip = is = NULL;
  3145. }
  3146. else {
  3147. ip = (const unsigned char *)RSTRING_PTR(input);
  3148. is = ip + RSTRING_LEN(input);
  3149. }
  3150. op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
  3151. os = op + output_bytesize;
  3152. res = rb_econv_convert(ec, &ip, is, &op, os, flags);
  3153. rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
  3154. if (!NIL_P(input))
  3155. rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
  3156. if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
  3157. if (LONG_MAX / 2 < output_bytesize)
  3158. rb_raise(rb_eArgError, "too long conversion result");
  3159. output_bytesize *= 2;
  3160. output_byteoffset_v = Qnil;
  3161. goto retry;
  3162. }
  3163. if (ec->destination_encoding) {
  3164. rb_enc_associate(output, ec->destination_encoding);
  3165. }
  3166. return econv_result_to_symbol(res);
  3167. }
  3168. /*
  3169. * call-seq:
  3170. * ec.convert(source_string) -> destination_string
  3171. *
  3172. * Convert source_string and return destination_string.
  3173. *
  3174. * source_string is assumed as a part of source.
  3175. * i.e. :partial_input=>true is specified internally.
  3176. * finish method should be used last.
  3177. *
  3178. * ec = Encoding::Converter.new("utf-8", "euc-jp")
  3179. * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
  3180. * puts ec.finish.dump #=> ""
  3181. *
  3182. * ec = Encoding::Converter.new("euc-jp", "utf-8")
  3183. * puts ec.convert("\xA4").dump #=> ""
  3184. * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
  3185. * puts ec.finish.dump #=> ""
  3186. *
  3187. * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
  3188. * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
  3189. * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
  3190. * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
  3191. * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
  3192. *
  3193. * If a conversion error occurs,
  3194. * Encoding::UndefinedConversionError or
  3195. * Encoding::InvalidByteSequenceError is raised.
  3196. * Encoding::Converter#convert doesn't supply methods to recover or restart
  3197. * from these exceptions.
  3198. * When you want to handle these conversion errors,
  3199. * use Encoding::Converter#primitive_convert.
  3200. *
  3201. */
  3202. static VALUE
  3203. econv_convert(VALUE self, VALUE source_string)
  3204. {
  3205. VALUE ret, dst;
  3206. VALUE av[5];
  3207. int ac;
  3208. rb_econv_t *ec = check_econv(self);
  3209. StringValue(source_string);
  3210. dst = rb_str_new(NULL, 0);
  3211. av[0] = rb_str_dup(source_string);
  3212. av[1] = dst;
  3213. av[2] = Qnil;
  3214. av[3] = Qnil;
  3215. av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
  3216. ac = 5;
  3217. ret = econv_primitive_convert(ac, av, self);
  3218. if (ret == sym_invalid_byte_sequence ||
  3219. ret == sym_undefined_conversion ||
  3220. ret == sym_incomplete_input) {
  3221. VALUE exc = make_econv_exception(ec);
  3222. rb_exc_raise(exc);
  3223. }
  3224. if (ret == sym_finished) {
  3225. rb_raise(rb_eArgError, "converter already finished");
  3226. }
  3227. if (ret != sym_source_buffer_empty) {
  3228. rb_bug("unexpected result of econv_primitive_convert");
  3229. }
  3230. return dst;
  3231. }
  3232. /*
  3233. * call-seq:
  3234. * ec.finish -> string
  3235. *
  3236. * Finishes the converter.
  3237. * It returns the last part of the converted string.
  3238. *
  3239. * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
  3240. * p ec.convert("\u3042") #=> "\e$B$\""
  3241. * p ec.finish #=> "\e(B"
  3242. */
  3243. static VALUE
  3244. econv_finish(VALUE self)
  3245. {
  3246. VALUE ret, dst;
  3247. VALUE av[5];
  3248. int ac;
  3249. rb_econv_t *ec = check_econv(self);
  3250. dst = rb_str_new(NULL, 0);
  3251. av[0] = Qnil;
  3252. av[1] = dst;
  3253. av[2] = Qnil;
  3254. av[3] = Qnil;
  3255. av[4] = INT2NUM(0);
  3256. ac = 5;
  3257. ret = econv_primitive_convert(ac, av, self);
  3258. if (ret == sym_invalid_byte_sequence ||
  3259. ret == sym_undefined_conversion ||
  3260. ret == sym_incomplete_input) {
  3261. VALUE exc = make_econv_exception(ec);
  3262. rb_exc_raise(exc);
  3263. }
  3264. if (ret != sym_finished) {
  3265. rb_bug("unexpected result of econv_primitive_convert");
  3266. }
  3267. return dst;
  3268. }
  3269. /*
  3270. * call-seq:
  3271. * ec.primitive_errinfo -> array
  3272. *
  3273. * primitive_errinfo returns important information regarding the last error
  3274. * as a 5-element array:
  3275. *
  3276. * [result, enc1, enc2, error_bytes, readagain_bytes]
  3277. *
  3278. * result is the last result of primitive_convert.
  3279. *
  3280. * Other elements are only meaningful when result is
  3281. * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
  3282. *
  3283. * enc1 and enc2 indicate a conversion step as a pair of strings.
  3284. * For example, a converter from EUC-JP to ISO-8859-1 converts
  3285. * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
  3286. * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
  3287. *
  3288. * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
  3289. * error_bytes is discarded portion.
  3290. * readagain_bytes is buffered portion which is read again on next conversion.
  3291. *
  3292. * Example:
  3293. *
  3294. * # \xff is invalid as EUC-JP.
  3295. * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
  3296. * ec.primitive_convert(src="\xff", dst="", nil, 10)
  3297. * p ec.primitive_errinfo
  3298. * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
  3299. *
  3300. * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
  3301. * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
  3302. * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
  3303. * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
  3304. * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
  3305. * p ec.primitive_errinfo
  3306. * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
  3307. *
  3308. * # partial character is invalid
  3309. * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
  3310. * ec.primitive_convert(src="\xa4", dst="", nil, 10)
  3311. * p ec.primitive_errinfo
  3312. * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
  3313. *
  3314. * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
  3315. * # partial characters.
  3316. * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
  3317. * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
  3318. * p ec.primitive_errinfo
  3319. * #=> [:source_buffer_empty, nil, nil, nil, nil]
  3320. *
  3321. * # \xd8\x00\x00@ is invalid as UTF-16BE because
  3322. * # no low surrogate after high surrogate (\xd8\x00).
  3323. * # It is detected by the 3rd byte (\x00) which is part of the next character.
  3324. * # So the high surrogate (\xd8\x00) is discarded and
  3325. * # the 3rd byte is read again later.
  3326. * # Since the byte is buffered in ec, it is dropped from src.
  3327. * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
  3328. * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
  3329. * p ec.primitive_errinfo
  3330. * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
  3331. * p src
  3332. * #=> "@"
  3333. *
  3334. * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
  3335. * # The problem is detected by 4th byte.
  3336. * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
  3337. * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
  3338. * p ec.primitive_errinfo
  3339. * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
  3340. * p src
  3341. * #=> ""
  3342. *
  3343. */
  3344. static VALUE
  3345. econv_primitive_errinfo(VALUE self)
  3346. {
  3347. rb_econv_t *ec = check_econv(self);
  3348. VALUE ary;
  3349. ary = rb_ary_new2(5);
  3350. rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
  3351. rb_ary_store(ary, 4, Qnil);
  3352. if (ec->last_error.source_encoding)
  3353. rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
  3354. if (ec->last_error.destination_encoding)
  3355. rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
  3356. if (ec->last_error.error_bytes_start) {
  3357. rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
  3358. rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
  3359. }
  3360. return ary;
  3361. }
  3362. /*
  3363. * call-seq:
  3364. * ec.insert_output(string) -> nil
  3365. *
  3366. * Inserts string into the encoding converter.
  3367. * The string will be converted to the destination encoding and
  3368. * output on later conversions.
  3369. *
  3370. * If the destination encoding is stateful,
  3371. * string is converted according to the state and the state is updated.
  3372. *
  3373. * This method should be used only when a conversion error occurs.
  3374. *
  3375. * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
  3376. * src = "HIRAGANA LETTER A is \u{3042}."
  3377. * dst = ""
  3378. * p ec.primitive_convert(src, dst) #=> :undefined_conversion
  3379. * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
  3380. * ec.insert_output("<err>")
  3381. * p ec.primitive_convert(src, dst) #=> :finished
  3382. * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
  3383. *
  3384. * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
  3385. * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
  3386. * dst = ""
  3387. * p ec.primitive_convert(src, dst) #=> :undefined_conversion
  3388. * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
  3389. * ec.insert_output "?" # state change required to output "?".
  3390. * p ec.primitive_convert(src, dst) #=> :finished
  3391. * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
  3392. *
  3393. */
  3394. static VALUE
  3395. econv_insert_output(VALUE self, VALUE string)
  3396. {
  3397. const char *insert_enc;
  3398. int ret;
  3399. rb_econv_t *ec = check_econv(self);
  3400. StringValue(string);
  3401. insert_enc = rb_econv_encoding_to_insert_output(ec);
  3402. string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
  3403. ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
  3404. if (ret == -1) {
  3405. rb_raise(rb_eArgError, "too big string");
  3406. }
  3407. return Qnil;
  3408. }
  3409. /*
  3410. * call-seq:
  3411. * ec.putback -> string
  3412. * ec.putback(max_numbytes) -> string
  3413. *
  3414. * Put back the bytes which will be converted.
  3415. *
  3416. * The bytes come from an invalid_byte_sequence error.
  3417. * When an invalid_byte_sequence error occurs, some bytes are discarded and
  3418. * some bytes are buffered to be converted later.
  3419. * The latter bytes can be put back.
  3420. * It can be observed by
  3421. * Encoding::InvalidByteSequenceError#readagain_bytes and
  3422. * Encoding::Converter#primitive_errinfo.
  3423. *
  3424. * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
  3425. * src = "\x00\xd8\x61\x00"
  3426. * dst = ""
  3427. * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
  3428. * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
  3429. * p ec.putback #=> "a\x00"
  3430. * p ec.putback #=> "" # no more bytes to put back
  3431. *
  3432. */
  3433. static VALUE
  3434. econv_putback(int argc, VALUE *argv, VALUE self)
  3435. {
  3436. rb_econv_t *ec = check_econv(self);
  3437. int n;
  3438. int putbackable;
  3439. VALUE str, max;
  3440. rb_scan_args(argc, argv, "01", &max);
  3441. if (NIL_P(max))
  3442. n = rb_econv_putbackable(ec);
  3443. else {
  3444. n = NUM2INT(max);
  3445. putbackable = rb_econv_putbackable(ec);
  3446. if (putbackable < n)
  3447. n = putbackable;
  3448. }
  3449. str = rb_str_new(NULL, n);
  3450. rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
  3451. if (ec->source_encoding) {
  3452. rb_enc_associate(str, ec->source_encoding);
  3453. }
  3454. return str;
  3455. }
  3456. /*
  3457. * call-seq:
  3458. * ec.last_error -> exception or nil
  3459. *
  3460. * Returns an exception object for the last conversion.
  3461. * Returns nil if the last conversion did not produce an error.
  3462. *
  3463. * Here "error" means
  3464. * Encoding::InvalidByteSequenceError or Encoding::UndefinedConversionError for
  3465. * Encoding::Converter#convert, and
  3466. * :invalid_byte_sequence, :incomplete_input or :undefined_conversion for
  3467. * Encoding::Converter#primitive_convert.
  3468. *
  3469. * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
  3470. * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
  3471. * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
  3472. * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
  3473. * p ec.last_error #=> nil
  3474. *
  3475. */
  3476. static VALUE
  3477. econv_last_error(VALUE self)
  3478. {
  3479. rb_econv_t *ec = check_econv(self);
  3480. VALUE exc;
  3481. exc = make_econv_exception(ec);
  3482. if (NIL_P(exc))
  3483. return Qnil;
  3484. return exc;
  3485. }
  3486. /*
  3487. * call-seq:
  3488. * ec.replacement -> string
  3489. *
  3490. * Returns the replacement string.
  3491. *
  3492. * ec = Encoding::Converter.new("euc-jp", "us-ascii")
  3493. * p ec.replacement #=> "?"
  3494. *
  3495. * ec = Encoding::Converter.new("euc-jp", "utf-8")
  3496. * p ec.replacement #=> "\uFFFD"
  3497. */
  3498. static VALUE
  3499. econv_get_replacement(VALUE self)
  3500. {
  3501. rb_econv_t *ec = check_econv(self);
  3502. int ret;
  3503. rb_encoding *enc;
  3504. ret = make_replacement(ec);
  3505. if (ret == -1) {
  3506. rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
  3507. }
  3508. enc = rb_enc_find(ec->replacement_enc);
  3509. return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
  3510. }
  3511. /*
  3512. * call-seq:
  3513. * ec.replacement = string
  3514. *
  3515. * Sets the replacement string.
  3516. *
  3517. * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
  3518. * ec.replacement = "<undef>"
  3519. * p ec.convert("a \u3042 b") #=> "a <undef> b"
  3520. */
  3521. static VALUE
  3522. econv_set_replacement(VALUE self, VALUE arg)
  3523. {
  3524. rb_econv_t *ec = check_econv(self);
  3525. VALUE string = arg;
  3526. int ret;
  3527. rb_encoding *enc;
  3528. StringValue(string);
  3529. enc = rb_enc_get(string);
  3530. ret = rb_econv_set_replacement(ec,
  3531. (const unsigned char *)RSTRING_PTR(string),
  3532. RSTRING_LEN(string),
  3533. rb_enc_name(enc));
  3534. if (ret == -1) {
  3535. /* xxx: rb_eInvalidByteSequenceError? */
  3536. rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
  3537. }
  3538. return arg;
  3539. }
  3540. VALUE
  3541. rb_econv_make_exception(rb_econv_t *ec)
  3542. {
  3543. return make_econv_exception(ec);
  3544. }
  3545. void
  3546. rb_econv_check_error(rb_econv_t *ec)
  3547. {
  3548. VALUE exc;
  3549. exc = make_econv_exception(ec);
  3550. if (NIL_P(exc))
  3551. return;
  3552. rb_exc_raise(exc);
  3553. }
  3554. /*
  3555. * call-seq:
  3556. * ecerr.source_encoding_name -> string
  3557. *
  3558. * Returns the source encoding name as a string.
  3559. */
  3560. static VALUE
  3561. ecerr_source_encoding_name(VALUE self)
  3562. {
  3563. return rb_attr_get(self, rb_intern("source_encoding_name"));
  3564. }
  3565. /*
  3566. * call-seq:
  3567. * ecerr.source_encoding -> encoding
  3568. *
  3569. * Returns the source encoding as an encoding object.
  3570. *
  3571. * Note that the result may not be equal to the source encoding of
  3572. * the encoding converter if the conversion has multiple steps.
  3573. *
  3574. * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
  3575. * begin
  3576. * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
  3577. * rescue Encoding::UndefinedConversionError
  3578. * p $!.source_encoding #=> #<Encoding:UTF-8>
  3579. * p $!.destination_encoding #=> #<Encoding:EUC-JP>
  3580. * p $!.source_encoding_name #=> "UTF-8"
  3581. * p $!.destination_encoding_name #=> "EUC-JP"
  3582. * end
  3583. *
  3584. */
  3585. static VALUE
  3586. ecerr_source_encoding(VALUE self)
  3587. {
  3588. return rb_attr_get(self, rb_intern("source_encoding"));
  3589. }
  3590. /*
  3591. * call-seq:
  3592. * ecerr.destination_encoding_name -> string
  3593. *
  3594. * Returns the destination encoding name as a string.
  3595. */
  3596. static VALUE
  3597. ecerr_destination_encoding_name(VALUE self)
  3598. {
  3599. return rb_attr_get(self, rb_intern("destination_encoding_name"));
  3600. }
  3601. /*
  3602. * call-seq:
  3603. * ecerr.destination_encoding -> encoding
  3604. *
  3605. * Returns the destination encoding as an encoding object.
  3606. */
  3607. static VALUE
  3608. ecerr_destination_encoding(VALUE self)
  3609. {
  3610. return rb_attr_get(self, rb_intern("destination_encoding"));
  3611. }
  3612. /*
  3613. * call-seq:
  3614. * ecerr.error_char -> string
  3615. *
  3616. * Returns the one-character string that caused Encoding::UndefinedConversionError.
  3617. *
  3618. * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
  3619. * begin
  3620. * ec.convert("\xa0")
  3621. * rescue Encoding::UndefinedConversionError
  3622. * puts $!.error_char.dump #=> "\xC2\xA0"
  3623. * p $!.error_char.encoding #=> #<Encoding:UTF-8>
  3624. * end
  3625. *
  3626. */
  3627. static VALUE
  3628. ecerr_error_char(VALUE self)
  3629. {
  3630. return rb_attr_get(self, rb_intern("error_char"));
  3631. }
  3632. /*
  3633. * call-seq:
  3634. * ecerr.error_bytes -> string
  3635. *
  3636. * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
  3637. *
  3638. * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
  3639. * begin
  3640. * ec.convert("abc\xA1\xFFdef")
  3641. * rescue Encoding::InvalidByteSequenceError
  3642. * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
  3643. * puts $!.error_bytes.dump #=> "\xA1"
  3644. * puts $!.readagain_bytes.dump #=> "\xFF"
  3645. * end
  3646. */
  3647. static VALUE
  3648. ecerr_error_bytes(VALUE self)
  3649. {
  3650. return rb_attr_get(self, rb_intern("error_bytes"));
  3651. }
  3652. /*
  3653. * call-seq:
  3654. * ecerr.readagain_bytes -> string
  3655. *
  3656. * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
  3657. */
  3658. static VALUE
  3659. ecerr_readagain_bytes(VALUE self)
  3660. {
  3661. return rb_attr_get(self, rb_intern("readagain_bytes"));
  3662. }
  3663. /*
  3664. * call-seq:
  3665. * ecerr.incomplete_input? -> true or false
  3666. *
  3667. * Returns true if the invalid byte sequence error is caused by
  3668. * premature end of string.
  3669. *
  3670. * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
  3671. *
  3672. * begin
  3673. * ec.convert("abc\xA1z")
  3674. * rescue Encoding::InvalidByteSequenceError
  3675. * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
  3676. * p $!.incomplete_input? #=> false
  3677. * end
  3678. *
  3679. * begin
  3680. * ec.convert("abc\xA1")
  3681. * ec.finish
  3682. * rescue Encoding::InvalidByteSequenceError
  3683. * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
  3684. * p $!.incomplete_input? #=> true
  3685. * end
  3686. */
  3687. static VALUE
  3688. ecerr_incomplete_input(VALUE self)
  3689. {
  3690. return rb_attr_get(self, rb_intern("incomplete_input"));
  3691. }
  3692. extern void Init_newline(void);
  3693. void
  3694. Init_transcode(void)
  3695. {
  3696. rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
  3697. rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
  3698. rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
  3699. transcoder_table = st_init_strcasetable();
  3700. sym_invalid = ID2SYM(rb_intern("invalid"));
  3701. sym_undef = ID2SYM(rb_intern("undef"));
  3702. sym_replace = ID2SYM(rb_intern("replace"));
  3703. sym_xml = ID2SYM(rb_intern("xml"));
  3704. sym_text = ID2SYM(rb_intern("text"));
  3705. sym_attr = ID2SYM(rb_intern("attr"));
  3706. sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
  3707. sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
  3708. sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
  3709. sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
  3710. sym_finished = ID2SYM(rb_intern("finished"));
  3711. sym_after_output = ID2SYM(rb_intern("after_output"));
  3712. sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
  3713. sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
  3714. sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
  3715. sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
  3716. sym_partial_input = ID2SYM(rb_intern("partial_input"));
  3717. rb_define_method(rb_cString, "encode", str_encode, -1);
  3718. rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
  3719. rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
  3720. rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
  3721. rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
  3722. rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
  3723. rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
  3724. rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
  3725. rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
  3726. rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
  3727. rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
  3728. rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
  3729. rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
  3730. rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
  3731. rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
  3732. rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
  3733. rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
  3734. rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
  3735. rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
  3736. rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
  3737. rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
  3738. rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
  3739. rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
  3740. rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
  3741. rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
  3742. rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
  3743. rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
  3744. rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
  3745. rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
  3746. rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
  3747. rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
  3748. rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
  3749. rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
  3750. rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
  3751. rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
  3752. rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
  3753. rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
  3754. rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
  3755. rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
  3756. rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
  3757. rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
  3758. rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
  3759. rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
  3760. rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
  3761. rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
  3762. Init_newline();
  3763. }