PageRenderTime 58ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 1ms

/transcode.c

https://github.com/fizx/ruby
C | 4258 lines | 3064 code | 491 blank | 703 comment | 605 complexity | 250fdc509e495afc2048c1b1725ad0c1 MD5 | raw file
Possible License(s): LGPL-2.1, AGPL-3.0, GPL-2.0, BSD-3-Clause

Large files are truncated, but you can click here to view the full file

  1. /**********************************************************************
  2. transcode.c -
  3. $Author$
  4. created at: Tue Oct 30 16:10:22 JST 2007
  5. Copyright (C) 2007 Martin Duerst
  6. **********************************************************************/
  7. #include "ruby/ruby.h"
  8. #include "ruby/encoding.h"
  9. #include "transcode_data.h"
  10. #include <ctype.h>
  11. /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
  12. VALUE rb_eUndefinedConversionError;
  13. VALUE rb_eInvalidByteSequenceError;
  14. VALUE rb_eConverterNotFoundError;
  15. VALUE rb_cEncodingConverter;
  16. static VALUE sym_invalid, sym_undef, sym_replace;
  17. static VALUE sym_xml, sym_text, sym_attr;
  18. static VALUE sym_universal_newline;
  19. static VALUE sym_crlf_newline;
  20. static VALUE sym_cr_newline;
  21. static VALUE sym_partial_input;
  22. static VALUE sym_invalid_byte_sequence;
  23. static VALUE sym_undefined_conversion;
  24. static VALUE sym_destination_buffer_full;
  25. static VALUE sym_source_buffer_empty;
  26. static VALUE sym_finished;
  27. static VALUE sym_after_output;
  28. static VALUE sym_incomplete_input;
  29. static unsigned char *
  30. allocate_converted_string(const char *sname, const char *dname,
  31. const unsigned char *str, size_t len,
  32. unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
  33. size_t *dst_len_ptr);
  34. /* dynamic structure, one per conversion (similar to iconv_t) */
  35. /* may carry conversion state (e.g. for iso-2022-jp) */
  36. typedef struct rb_transcoding {
  37. const rb_transcoder *transcoder;
  38. int flags;
  39. int resume_position;
  40. unsigned int next_table;
  41. VALUE next_info;
  42. unsigned char next_byte;
  43. unsigned int output_index;
  44. ssize_t recognized_len; /* already interpreted */
  45. ssize_t readagain_len; /* not yet interpreted */
  46. union {
  47. unsigned char ary[8]; /* max_input <= sizeof(ary) */
  48. unsigned char *ptr; /* length: max_input */
  49. } readbuf; /* recognized_len + readagain_len used */
  50. ssize_t writebuf_off;
  51. ssize_t writebuf_len;
  52. union {
  53. unsigned char ary[8]; /* max_output <= sizeof(ary) */
  54. unsigned char *ptr; /* length: max_output */
  55. } writebuf;
  56. union rb_transcoding_state_t { /* opaque data for stateful encoding */
  57. void *ptr;
  58. char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
  59. double dummy_for_alignment;
  60. } state;
  61. } rb_transcoding;
  /*
   * Accessors for the read buffer, write buffer and state area of a
   * transcoding.  Each one picks the embedded array when the size the
   * transcoder asks for fits into it and the separately stored pointer
   * otherwise.
   */
  #define TRANSCODING_READBUF(tc) \
      ((int)sizeof((tc)->readbuf.ary) < (tc)->transcoder->max_input ? \
       (tc)->readbuf.ptr : \
       (tc)->readbuf.ary)

  #define TRANSCODING_WRITEBUF(tc) \
      ((int)sizeof((tc)->writebuf.ary) < (tc)->transcoder->max_output ? \
       (tc)->writebuf.ptr : \
       (tc)->writebuf.ary)

  /* number of bytes available behind TRANSCODING_WRITEBUF(tc) */
  #define TRANSCODING_WRITEBUF_SIZE(tc) \
      ((int)sizeof((tc)->writebuf.ary) < (tc)->transcoder->max_output ? \
       (size_t)(tc)->transcoder->max_output : \
       sizeof((tc)->writebuf.ary))

  /* largest state_size that still fits into the embedded state area */
  #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))

  #define TRANSCODING_STATE(tc) \
      ((int)sizeof((tc)->state) < (tc)->transcoder->state_size ? \
       (tc)->state.ptr : \
       (tc)->state.ary)
  79. typedef struct {
  80. struct rb_transcoding *tc;
  81. unsigned char *out_buf_start;
  82. unsigned char *out_data_start;
  83. unsigned char *out_data_end;
  84. unsigned char *out_buf_end;
  85. rb_econv_result_t last_result;
  86. } rb_econv_elem_t;
  87. struct rb_econv_t {
  88. int flags;
  89. const char *source_encoding_name;
  90. const char *destination_encoding_name;
  91. int started;
  92. const unsigned char *replacement_str;
  93. size_t replacement_len;
  94. const char *replacement_enc;
  95. int replacement_allocated;
  96. unsigned char *in_buf_start;
  97. unsigned char *in_data_start;
  98. unsigned char *in_data_end;
  99. unsigned char *in_buf_end;
  100. rb_econv_elem_t *elems;
  101. int num_allocated;
  102. int num_trans;
  103. int num_finished;
  104. struct rb_transcoding *last_tc;
  105. /* last error */
  106. struct {
  107. rb_econv_result_t result;
  108. struct rb_transcoding *error_tc;
  109. const char *source_encoding;
  110. const char *destination_encoding;
  111. const unsigned char *error_bytes_start;
  112. size_t error_bytes_len;
  113. size_t readagain_len;
  114. } last_error;
  115. /* The following fields are only for Encoding::Converter.
  116. * rb_econv_open set them NULL. */
  117. rb_encoding *source_encoding;
  118. rb_encoding *destination_encoding;
  119. };
  120. /*
  121. * Dispatch data and logic
  122. */
  123. #define DECORATOR_P(sname, dname) (*(sname) == '\0')
  124. typedef struct {
  125. const char *sname;
  126. const char *dname;
  127. const char *lib; /* null means means no need to load a library */
  128. const rb_transcoder *transcoder;
  129. } transcoder_entry_t;
  130. static st_table *transcoder_table;
  131. static transcoder_entry_t *
  132. make_transcoder_entry(const char *sname, const char *dname)
  133. {
  134. st_data_t val;
  135. st_table *table2;
  136. if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
  137. val = (st_data_t)st_init_strcasetable();
  138. st_add_direct(transcoder_table, (st_data_t)sname, val);
  139. }
  140. table2 = (st_table *)val;
  141. if (!st_lookup(table2, (st_data_t)dname, &val)) {
  142. transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
  143. entry->sname = sname;
  144. entry->dname = dname;
  145. entry->lib = NULL;
  146. entry->transcoder = NULL;
  147. val = (st_data_t)entry;
  148. st_add_direct(table2, (st_data_t)dname, val);
  149. }
  150. return (transcoder_entry_t *)val;
  151. }
  152. static transcoder_entry_t *
  153. get_transcoder_entry(const char *sname, const char *dname)
  154. {
  155. st_data_t val;
  156. st_table *table2;
  157. if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
  158. return NULL;
  159. }
  160. table2 = (st_table *)val;
  161. if (!st_lookup(table2, (st_data_t)dname, &val)) {
  162. return NULL;
  163. }
  164. return (transcoder_entry_t *)val;
  165. }
  166. void
  167. rb_register_transcoder(const rb_transcoder *tr)
  168. {
  169. const char *const sname = tr->src_encoding;
  170. const char *const dname = tr->dst_encoding;
  171. transcoder_entry_t *entry;
  172. entry = make_transcoder_entry(sname, dname);
  173. if (entry->transcoder) {
  174. rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
  175. sname, dname);
  176. }
  177. entry->transcoder = tr;
  178. }
  179. static void
  180. declare_transcoder(const char *sname, const char *dname, const char *lib)
  181. {
  182. transcoder_entry_t *entry;
  183. entry = make_transcoder_entry(sname, dname);
  184. entry->lib = lib;
  185. }
  186. #define MAX_TRANSCODER_LIBNAME_LEN 64
  187. static const char transcoder_lib_prefix[] = "enc/trans/";
  188. void
  189. rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
  190. {
  191. if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) {
  192. rb_raise(rb_eArgError, "invalid library name - %s",
  193. lib ? lib : "(null)");
  194. }
  195. declare_transcoder(enc1, enc2, lib);
  196. }
  197. #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)
  198. typedef struct search_path_queue_tag {
  199. struct search_path_queue_tag *next;
  200. const char *enc;
  201. } search_path_queue_t;
  202. typedef struct {
  203. st_table *visited;
  204. search_path_queue_t *queue;
  205. search_path_queue_t **queue_last_ptr;
  206. const char *base_enc;
  207. } search_path_bfs_t;
  208. static int
  209. transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
  210. {
  211. const char *dname = (const char *)key;
  212. search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
  213. search_path_queue_t *q;
  214. if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
  215. return ST_CONTINUE;
  216. }
  217. q = ALLOC(search_path_queue_t);
  218. q->enc = dname;
  219. q->next = NULL;
  220. *bfs->queue_last_ptr = q;
  221. bfs->queue_last_ptr = &q->next;
  222. st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
  223. return ST_CONTINUE;
  224. }
  225. static int
  226. transcode_search_path(const char *sname, const char *dname,
  227. void (*callback)(const char *sname, const char *dname, int depth, void *arg),
  228. void *arg)
  229. {
  230. search_path_bfs_t bfs;
  231. search_path_queue_t *q;
  232. st_data_t val;
  233. st_table *table2;
  234. int found;
  235. int pathlen = -1;
  236. if (encoding_equal(sname, dname))
  237. return -1;
  238. q = ALLOC(search_path_queue_t);
  239. q->enc = sname;
  240. q->next = NULL;
  241. bfs.queue_last_ptr = &q->next;
  242. bfs.queue = q;
  243. bfs.visited = st_init_strcasetable();
  244. st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
  245. while (bfs.queue) {
  246. q = bfs.queue;
  247. bfs.queue = q->next;
  248. if (!bfs.queue)
  249. bfs.queue_last_ptr = &bfs.queue;
  250. if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
  251. xfree(q);
  252. continue;
  253. }
  254. table2 = (st_table *)val;
  255. if (st_lookup(table2, (st_data_t)dname, &val)) {
  256. st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
  257. xfree(q);
  258. found = 1;
  259. goto cleanup;
  260. }
  261. bfs.base_enc = q->enc;
  262. st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
  263. bfs.base_enc = NULL;
  264. xfree(q);
  265. }
  266. found = 0;
  267. cleanup:
  268. while (bfs.queue) {
  269. q = bfs.queue;
  270. bfs.queue = q->next;
  271. xfree(q);
  272. }
  273. if (found) {
  274. const char *enc = dname;
  275. int depth;
  276. pathlen = 0;
  277. while (1) {
  278. st_lookup(bfs.visited, (st_data_t)enc, &val);
  279. if (!val)
  280. break;
  281. pathlen++;
  282. enc = (const char *)val;
  283. }
  284. depth = pathlen;
  285. enc = dname;
  286. while (1) {
  287. st_lookup(bfs.visited, (st_data_t)enc, &val);
  288. if (!val)
  289. break;
  290. callback((const char *)val, enc, --depth, arg);
  291. enc = (const char *)val;
  292. }
  293. }
  294. st_free_table(bfs.visited);
  295. return pathlen; /* is -1 if not found */
  296. }
  297. static const rb_transcoder *
  298. load_transcoder_entry(transcoder_entry_t *entry)
  299. {
  300. if (entry->transcoder)
  301. return entry->transcoder;
  302. if (entry->lib) {
  303. const char *lib = entry->lib;
  304. size_t len = strlen(lib);
  305. char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN];
  306. entry->lib = NULL;
  307. if (len > MAX_TRANSCODER_LIBNAME_LEN)
  308. return NULL;
  309. memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
  310. memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
  311. if (!rb_require(path))
  312. return NULL;
  313. }
  314. if (entry->transcoder)
  315. return entry->transcoder;
  316. return NULL;
  317. }
  /*
   * Return the default replacement string for the encoding encname.
   * For UTF-8 this is the three bytes EF BF BD, encoded in UTF-8;
   * for every other encoding it is "?" in US-ASCII.  The length is
   * stored in *len_ret and the encoding name of the returned string in
   * *repl_encname_ptr.
   */
  static const char*
  get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
  {
      if (!encoding_equal(encname, "UTF-8")) {
          *len_ret = 1;
          *repl_encname_ptr = "US-ASCII";
          return "?";
      }

      *len_ret = 3;
      *repl_encname_ptr = "UTF-8";
      return "\xEF\xBF\xBD";
  }
  332. /*
  333. * Transcoding engine logic
  334. */
  335. static const unsigned char *
  336. transcode_char_start(rb_transcoding *tc,
  337. const unsigned char *in_start,
  338. const unsigned char *inchar_start,
  339. const unsigned char *in_p,
  340. size_t *char_len_ptr)
  341. {
  342. const unsigned char *ptr;
  343. if (inchar_start - in_start < tc->recognized_len) {
  344. MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
  345. inchar_start, unsigned char, in_p - inchar_start);
  346. ptr = TRANSCODING_READBUF(tc);
  347. }
  348. else {
  349. ptr = inchar_start - tc->recognized_len;
  350. }
  351. *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
  352. return ptr;
  353. }
  /*
   * Core conversion routine, written as a resumable state machine.
   *
   * Reads input bytes from *in_pos up to in_stop and writes output bytes
   * from *out_pos up to out_stop, driven by the lookup tables and
   * callback functions of tc->transcoder.  Whenever the routine has to
   * stop, the SUSPEND macro stores a resume number in
   * tc->resume_position, copies the bytes of the current character that
   * were consumed in this call into the read buffer, updates *in_pos and
   * *out_pos and returns a result code.  The next call jumps straight
   * back to the matching resume_label through the switch on
   * tc->resume_position.
   *
   * opt is tested for ECONV_PARTIAL_INPUT (suspend with
   * econv_source_buffer_empty instead of finishing when input runs out)
   * and ECONV_AFTER_OUTPUT (suspend with econv_after_output once this
   * call has produced output).
   */
  354. static rb_econv_result_t
  355. transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
  356. const unsigned char *in_stop, unsigned char *out_stop,
  357. rb_transcoding *tc,
  358. const int opt)
  359. {
  360. const rb_transcoder *tr = tc->transcoder;
  361. int unitlen = tr->input_unit_length;
  362. ssize_t readagain_len = 0;
  363. const unsigned char *inchar_start;
  364. const unsigned char *in_p;
  365. unsigned char *out_p;
  366. in_p = inchar_start = *in_pos;
  367. out_p = *out_pos;
  /* SUSPEND(ret, num): save the state described above and return ret;
   * the following call continues right after this point at
   * resume_label<num>. */
  368. #define SUSPEND(ret, num) \
  369. do { \
  370. tc->resume_position = (num); \
  371. if (0 < in_p - inchar_start) \
  372. MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
  373. inchar_start, unsigned char, in_p - inchar_start); \
  374. *in_pos = in_p; \
  375. *out_pos = out_p; \
  376. tc->recognized_len += in_p - inchar_start; \
  377. if (readagain_len) { \
  378. tc->recognized_len -= readagain_len; \
  379. tc->readagain_len = readagain_len; \
  380. } \
  381. return ret; \
  382. resume_label ## num:; \
  383. } while (0)
  /* SUSPEND_OBUF(num): keep suspending with
   * econv_destination_buffer_full until at least one output byte is free. */
  384. #define SUSPEND_OBUF(num) \
  385. do { \
  386. while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
  387. } while (0)
  /* SUSPEND_AFTER_OUTPUT(num): when ECONV_AFTER_OUTPUT is set and this
   * call has already written output, suspend with econv_after_output. */
  388. #define SUSPEND_AFTER_OUTPUT(num) \
  389. if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
  390. SUSPEND(econv_after_output, num); \
  391. }
  /* These names alias fields of tc, so their values survive a suspension. */
  392. #define next_table (tc->next_table)
  393. #define next_info (tc->next_info)
  394. #define next_byte (tc->next_byte)
  395. #define writebuf_len (tc->writebuf_len)
  396. #define writebuf_off (tc->writebuf_off)
  /* Re-enter at the point where the previous call suspended;
   * 0 means a fresh start. */
  397. switch (tc->resume_position) {
  398. case 0: break;
  399. case 1: goto resume_label1;
  400. case 2: goto resume_label2;
  401. case 3: goto resume_label3;
  402. case 4: goto resume_label4;
  403. case 5: goto resume_label5;
  404. case 6: goto resume_label6;
  405. case 7: goto resume_label7;
  406. case 8: goto resume_label8;
  407. case 9: goto resume_label9;
  408. case 10: goto resume_label10;
  409. case 11: goto resume_label11;
  410. case 12: goto resume_label12;
  411. case 13: goto resume_label13;
  412. case 14: goto resume_label14;
  413. case 15: goto resume_label15;
  414. case 16: goto resume_label16;
  415. case 17: goto resume_label17;
  416. case 18: goto resume_label18;
  417. case 19: goto resume_label19;
  418. case 20: goto resume_label20;
  419. case 21: goto resume_label21;
  420. case 22: goto resume_label22;
  421. case 23: goto resume_label23;
  422. case 24: goto resume_label24;
  423. case 25: goto resume_label25;
  424. case 26: goto resume_label26;
  425. case 27: goto resume_label27;
  426. case 28: goto resume_label28;
  427. case 29: goto resume_label29;
  428. case 30: goto resume_label30;
  429. case 31: goto resume_label31;
  430. case 32: goto resume_label32;
  431. case 33: goto resume_label33;
  432. case 34: goto resume_label34;
  433. }
  /* Main loop: every iteration starts a new input character at the
   * transcoder's root table (conv_tree_start). */
  434. while (1) {
  435. inchar_start = in_p;
  436. tc->recognized_len = 0;
  437. next_table = tr->conv_tree_start;
  438. SUSPEND_AFTER_OUTPUT(24);
  439. if (in_stop <= in_p) {
  440. if (!(opt & ECONV_PARTIAL_INPUT))
  441. break;
  442. SUSPEND(econv_source_buffer_empty, 7);
  443. continue;
  444. }
  /* Accessors for the byte-lookup table currently selected by next_table. */
  445. #define BYTE_ADDR(index) (tr->byte_array + (index))
  446. #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
  447. #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
  448. #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
  449. #define BL_MIN_BYTE (BL_BASE[0])
  450. #define BL_MAX_BYTE (BL_BASE[1])
  451. #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
  452. #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
  453. next_byte = (unsigned char)*in_p++;
  454. follow_byte:
  /* a byte outside [BL_MIN_BYTE, BL_MAX_BYTE] has no table entry */
  455. if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
  456. next_info = INVALID;
  457. else {
  458. next_info = (VALUE)BL_ACTION(next_byte);
  459. }
  460. follow_info:
  /* the low five bits of next_info select the action */
  461. switch (next_info & 0x1F) {
  462. case NOMAP:
  /* copy the input character to the output unchanged, via the write buffer */
  463. {
  464. const unsigned char *p = inchar_start;
  465. writebuf_off = 0;
  466. while (p < in_p) {
  467. TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
  468. }
  469. writebuf_len = writebuf_off;
  470. writebuf_off = 0;
  471. while (writebuf_off < writebuf_len) {
  472. SUSPEND_OBUF(3);
  473. *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  474. }
  475. }
  476. continue;
  /* next_info names a further lookup table: fetch the next input byte
   * and continue the lookup there */
  477. case 0x00: case 0x04: case 0x08: case 0x0C:
  478. case 0x10: case 0x14: case 0x18: case 0x1C:
  479. SUSPEND_AFTER_OUTPUT(25);
  480. while (in_p >= in_stop) {
  481. if (!(opt & ECONV_PARTIAL_INPUT))
  482. goto incomplete;
  483. SUSPEND(econv_source_buffer_empty, 5);
  484. }
  485. next_byte = (unsigned char)*in_p++;
  486. next_table = (unsigned int)next_info;
  487. goto follow_byte;
  488. case ZERObt: /* drop input */
  489. continue;
  490. case ONEbt:
  491. SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
  492. continue;
  493. case TWObt:
  494. SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
  495. SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
  496. continue;
  497. case THREEbt:
  498. SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
  499. SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
  500. SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
  501. continue;
  502. case FOURbt:
  503. SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
  504. SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
  505. SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
  506. SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
  507. continue;
  508. case GB4bt:
  509. SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
  510. SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
  511. SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
  512. SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
  513. continue;
  514. case STR1:
  /* output a string stored in byte_array; tc->output_index keeps the
   * position across suspensions */
  515. tc->output_index = 0;
  516. while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
  517. SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
  518. tc->output_index++;
  519. }
  520. continue;
  /* FUNii / FUNsi: a transcoder callback computes a new next_info, which
   * is then dispatched again */
  521. case FUNii:
  522. next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
  523. goto follow_info;
  524. case FUNsi:
  525. {
  526. const unsigned char *char_start;
  527. size_t char_len;
  528. char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  529. next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
  530. goto follow_info;
  531. }
  /* FUNio / FUNso / FUNsio: a transcoder callback produces the output
   * bytes.  It writes straight into the output when at least max_output
   * bytes are free, otherwise into the write buffer, which is then
   * drained byte by byte. */
  532. case FUNio:
  533. SUSPEND_OBUF(13);
  534. if (tr->max_output <= out_stop - out_p)
  535. out_p += tr->func_io(TRANSCODING_STATE(tc),
  536. next_info, out_p, out_stop - out_p);
  537. else {
  538. writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
  539. next_info,
  540. TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  541. writebuf_off = 0;
  542. while (writebuf_off < writebuf_len) {
  543. SUSPEND_OBUF(20);
  544. *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  545. }
  546. }
  547. break;
  548. case FUNso:
  549. {
  550. const unsigned char *char_start;
  551. size_t char_len;
  552. SUSPEND_OBUF(14);
  553. if (tr->max_output <= out_stop - out_p) {
  554. char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  555. out_p += tr->func_so(TRANSCODING_STATE(tc),
  556. char_start, (size_t)char_len,
  557. out_p, out_stop - out_p);
  558. }
  559. else {
  560. char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  561. writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
  562. char_start, (size_t)char_len,
  563. TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  564. writebuf_off = 0;
  565. while (writebuf_off < writebuf_len) {
  566. SUSPEND_OBUF(22);
  567. *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  568. }
  569. }
  570. break;
  571. }
  572. case FUNsio:
  573. {
  574. const unsigned char *char_start;
  575. size_t char_len;
  576. SUSPEND_OBUF(33);
  577. if (tr->max_output <= out_stop - out_p) {
  578. char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  579. out_p += tr->func_sio(TRANSCODING_STATE(tc),
  580. char_start, (size_t)char_len, next_info,
  581. out_p, out_stop - out_p);
  582. }
  583. else {
  584. char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  585. writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
  586. char_start, (size_t)char_len, next_info,
  587. TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  588. writebuf_off = 0;
  589. while (writebuf_off < writebuf_len) {
  590. SUSPEND_OBUF(34);
  591. *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  592. }
  593. }
  594. break;
  595. }
  /* INVALID: when no more than one input unit (unitlen bytes) has been
   * seen, the reported sequence is extended up to one unit if that much
   * input is available.  Otherwise the trailing bytes beyond a whole
   * number of units are scheduled to be read again (readagain_len). */
  596. case INVALID:
  597. if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
  598. if (tc->recognized_len + (in_p - inchar_start) < unitlen)
  599. SUSPEND_AFTER_OUTPUT(26);
  600. while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
  601. in_p = in_stop;
  602. SUSPEND(econv_source_buffer_empty, 8);
  603. }
  604. if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
  605. in_p = in_stop;
  606. }
  607. else {
  608. in_p = inchar_start + (unitlen - tc->recognized_len);
  609. }
  610. }
  611. else {
  612. ssize_t invalid_len; /* including the last byte which causes invalid */
  613. ssize_t discard_len;
  614. invalid_len = tc->recognized_len + (in_p - inchar_start);
  615. discard_len = ((invalid_len - 1) / unitlen) * unitlen;
  616. readagain_len = invalid_len - discard_len;
  617. }
  618. goto invalid;
  619. case UNDEF:
  620. goto undef;
  621. default:
  622. rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
  623. }
  624. continue;
  /* error exits: report the condition, then start the next character
   * when called again */
  625. invalid:
  626. SUSPEND(econv_invalid_byte_sequence, 1);
  627. continue;
  628. incomplete:
  629. SUSPEND(econv_incomplete_input, 27);
  630. continue;
  631. undef:
  632. SUSPEND(econv_undefined_conversion, 2);
  633. continue;
  634. }
  /* Reached once the input is exhausted and ECONV_PARTIAL_INPUT is not
   * set: let the transcoder's finish_func, if any, emit its final output. */
  635. /* cleanup */
  636. if (tr->finish_func) {
  637. SUSPEND_OBUF(4);
  638. if (tr->max_output <= out_stop - out_p) {
  639. out_p += tr->finish_func(TRANSCODING_STATE(tc),
  640. out_p, out_stop - out_p);
  641. }
  642. else {
  643. writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
  644. TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  645. writebuf_off = 0;
  646. while (writebuf_off < writebuf_len) {
  647. SUSPEND_OBUF(23);
  648. *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  649. }
  650. }
  651. }
  /* every further call resumes inside this loop and reports
   * econv_finished again */
  652. while (1)
  653. SUSPEND(econv_finished, 6);
  654. #undef SUSPEND
  655. #undef next_table
  656. #undef next_info
  657. #undef next_byte
  658. #undef writebuf_len
  659. #undef writebuf_off
  660. }
  661. static rb_econv_result_t
  662. transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
  663. const unsigned char *in_stop, unsigned char *out_stop,
  664. rb_transcoding *tc,
  665. const int opt)
  666. {
  667. if (tc->readagain_len) {
  668. unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
  669. const unsigned char *readagain_pos = readagain_buf;
  670. const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
  671. rb_econv_result_t res;
  672. MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
  673. unsigned char, tc->readagain_len);
  674. tc->readagain_len = 0;
  675. res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
  676. if (res != econv_source_buffer_empty) {
  677. MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
  678. readagain_pos, unsigned char, readagain_stop - readagain_pos);
  679. tc->readagain_len += readagain_stop - readagain_pos;
  680. return res;
  681. }
  682. }
  683. return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
  684. }
  685. static rb_transcoding *
  686. rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
  687. {
  688. rb_transcoding *tc;
  689. tc = ALLOC(rb_transcoding);
  690. tc->transcoder = tr;
  691. tc->flags = flags;
  692. if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
  693. tc->state.ptr = xmalloc(tr->state_size);
  694. if (tr->state_init_func) {
  695. (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
  696. }
  697. tc->resume_position = 0;
  698. tc->recognized_len = 0;
  699. tc->readagain_len = 0;
  700. tc->writebuf_len = 0;
  701. tc->writebuf_off = 0;
  702. if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
  703. tc->readbuf.ptr = xmalloc(tr->max_input);
  704. }
  705. if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
  706. tc->writebuf.ptr = xmalloc(tr->max_output);
  707. }
  708. return tc;
  709. }
  710. static rb_econv_result_t
  711. rb_transcoding_convert(rb_transcoding *tc,
  712. const unsigned char **input_ptr, const unsigned char *input_stop,
  713. unsigned char **output_ptr, unsigned char *output_stop,
  714. int flags)
  715. {
  716. return transcode_restartable(
  717. input_ptr, output_ptr,
  718. input_stop, output_stop,
  719. tc, flags);
  720. }
  721. static void
  722. rb_transcoding_close(rb_transcoding *tc)
  723. {
  724. const rb_transcoder *tr = tc->transcoder;
  725. if (tr->state_fini_func) {
  726. (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
  727. }
  728. if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
  729. xfree(tc->state.ptr);
  730. if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
  731. xfree(tc->readbuf.ptr);
  732. if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
  733. xfree(tc->writebuf.ptr);
  734. xfree(tc);
  735. }
  736. static size_t
  737. rb_transcoding_memsize(rb_transcoding *tc)
  738. {
  739. size_t size = sizeof(rb_transcoding);
  740. const rb_transcoder *tr = tc->transcoder;
  741. if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
  742. size += tr->state_size;
  743. }
  744. if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
  745. size += tr->max_input;
  746. }
  747. if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
  748. size += tr->max_output;
  749. }
  750. return size;
  751. }
  752. static rb_econv_t *
  753. rb_econv_alloc(int n_hint)
  754. {
  755. rb_econv_t *ec;
  756. if (n_hint <= 0)
  757. n_hint = 1;
  758. ec = ALLOC(rb_econv_t);
  759. ec->flags = 0;
  760. ec->source_encoding_name = NULL;
  761. ec->destination_encoding_name = NULL;
  762. ec->started = 0;
  763. ec->replacement_str = NULL;
  764. ec->replacement_len = 0;
  765. ec->replacement_enc = NULL;
  766. ec->replacement_allocated = 0;
  767. ec->in_buf_start = NULL;
  768. ec->in_data_start = NULL;
  769. ec->in_data_end = NULL;
  770. ec->in_buf_end = NULL;
  771. ec->num_allocated = n_hint;
  772. ec->num_trans = 0;
  773. ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
  774. ec->num_finished = 0;
  775. ec->last_tc = NULL;
  776. ec->last_error.result = econv_source_buffer_empty;
  777. ec->last_error.error_tc = NULL;
  778. ec->last_error.source_encoding = NULL;
  779. ec->last_error.destination_encoding = NULL;
  780. ec->last_error.error_bytes_start = NULL;
  781. ec->last_error.error_bytes_len = 0;
  782. ec->last_error.readagain_len = 0;
  783. ec->source_encoding = NULL;
  784. ec->destination_encoding = NULL;
  785. return ec;
  786. }
  787. static int
  788. rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
  789. {
  790. int n, j;
  791. int bufsize = 4096;
  792. unsigned char *p;
  793. if (ec->num_trans == ec->num_allocated) {
  794. n = ec->num_allocated * 2;
  795. REALLOC_N(ec->elems, rb_econv_elem_t, n);
  796. ec->num_allocated = n;
  797. }
  798. p = xmalloc(bufsize);
  799. MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
  800. ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
  801. ec->elems[i].out_buf_start = p;
  802. ec->elems[i].out_buf_end = p + bufsize;
  803. ec->elems[i].out_data_start = p;
  804. ec->elems[i].out_data_end = p;
  805. ec->elems[i].last_result = econv_source_buffer_empty;
  806. ec->num_trans++;
  807. if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
  808. for (j = ec->num_trans-1; i <= j; j--) {
  809. rb_transcoding *tc = ec->elems[j].tc;
  810. const rb_transcoder *tr2 = tc->transcoder;
  811. if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
  812. ec->last_tc = tc;
  813. break;
  814. }
  815. }
  816. return 0;
  817. }
  818. static rb_econv_t *
  819. rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
  820. {
  821. rb_econv_t *ec;
  822. int i, ret;
  823. for (i = 0; i < n; i++) {
  824. const rb_transcoder *tr;
  825. tr = load_transcoder_entry(entries[i]);
  826. if (!tr)
  827. return NULL;
  828. }
  829. ec = rb_econv_alloc(n);
  830. for (i = 0; i < n; i++) {
  831. const rb_transcoder *tr = load_transcoder_entry(entries[i]);
  832. ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
  833. if (ret == -1) {
  834. rb_econv_close(ec);
  835. return NULL;
  836. }
  837. }
  838. return ec;
  839. }
  840. struct trans_open_t {
  841. transcoder_entry_t **entries;
  842. int num_additional;
  843. };
  844. static void
  845. trans_open_i(const char *sname, const char *dname, int depth, void *arg)
  846. {
  847. struct trans_open_t *toarg = arg;
  848. if (!toarg->entries) {
  849. toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
  850. }
  851. toarg->entries[depth] = get_transcoder_entry(sname, dname);
  852. }
  853. static rb_econv_t *
  854. rb_econv_open0(const char *sname, const char *dname, int ecflags)
  855. {
  856. transcoder_entry_t **entries = NULL;
  857. int num_trans;
  858. rb_econv_t *ec;
  859. rb_encoding *senc, *denc;
  860. int sidx, didx;
  861. senc = NULL;
  862. if (*sname) {
  863. sidx = rb_enc_find_index(sname);
  864. if (0 <= sidx) {
  865. senc = rb_enc_from_index(sidx);
  866. }
  867. }
  868. denc = NULL;
  869. if (*dname) {
  870. didx = rb_enc_find_index(dname);
  871. if (0 <= didx) {
  872. denc = rb_enc_from_index(didx);
  873. }
  874. }
  875. if (*sname == '\0' && *dname == '\0') {
  876. num_trans = 0;
  877. entries = NULL;
  878. }
  879. else {
  880. struct trans_open_t toarg;
  881. toarg.entries = NULL;
  882. toarg.num_additional = 0;
  883. num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
  884. entries = toarg.entries;
  885. if (num_trans < 0) {
  886. xfree(entries);
  887. return NULL;
  888. }
  889. }
  890. ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
  891. xfree(entries);
  892. if (!ec)
  893. return NULL;
  894. ec->flags = ecflags;
  895. ec->source_encoding_name = sname;
  896. ec->destination_encoding_name = dname;
  897. return ec;
  898. }
  899. #define MAX_ECFLAGS_DECORATORS 32
  900. static int
  901. decorator_names(int ecflags, const char **decorators_ret)
  902. {
  903. int num_decorators;
  904. if ((ecflags & ECONV_CRLF_NEWLINE_DECORATOR) &&
  905. (ecflags & ECONV_CR_NEWLINE_DECORATOR))
  906. return -1;
  907. if ((ecflags & (ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR)) &&
  908. (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR))
  909. return -1;
  910. if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
  911. (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
  912. return -1;
  913. num_decorators = 0;
  914. if (ecflags & ECONV_XML_TEXT_DECORATOR)
  915. decorators_ret[num_decorators++] = "xml_text_escape";
  916. if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
  917. decorators_ret[num_decorators++] = "xml_attr_content_escape";
  918. if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
  919. decorators_ret[num_decorators++] = "xml_attr_quote";
  920. if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
  921. decorators_ret[num_decorators++] = "crlf_newline";
  922. if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
  923. decorators_ret[num_decorators++] = "cr_newline";
  924. if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
  925. decorators_ret[num_decorators++] = "universal_newline";
  926. return num_decorators;
  927. }
  928. rb_econv_t *
  929. rb_econv_open(const char *sname, const char *dname, int ecflags)
  930. {
  931. rb_econv_t *ec;
  932. int num_decorators;
  933. const char *decorators[MAX_ECFLAGS_DECORATORS];
  934. int i;
  935. num_decorators = decorator_names(ecflags, decorators);
  936. if (num_decorators == -1)
  937. return NULL;
  938. ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
  939. if (!ec)
  940. return NULL;
  941. for (i = 0; i < num_decorators; i++)
  942. if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
  943. rb_econv_close(ec);
  944. return NULL;
  945. }
  946. ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
  947. return ec;
  948. }
  949. static int
  950. trans_sweep(rb_econv_t *ec,
  951. const unsigned char **input_ptr, const unsigned char *input_stop,
  952. unsigned char **output_ptr, unsigned char *output_stop,
  953. int flags,
  954. int start)
  955. {
  956. int try;
  957. int i, f;
  958. const unsigned char **ipp, *is, *iold;
  959. unsigned char **opp, *os, *oold;
  960. rb_econv_result_t res;
  961. try = 1;
  962. while (try) {
  963. try = 0;
  964. for (i = start; i < ec->num_trans; i++) {
  965. rb_econv_elem_t *te = &ec->elems[i];
  966. if (i == 0) {
  967. ipp = input_ptr;
  968. is = input_stop;
  969. }
  970. else {
  971. rb_econv_elem_t *prev_te = &ec->elems[i-1];
  972. ipp = (const unsigned char **)&prev_te->out_data_start;
  973. is = prev_te->out_data_end;
  974. }
  975. if (i == ec->num_trans-1) {
  976. opp = output_ptr;
  977. os = output_stop;
  978. }
  979. else {
  980. if (te->out_buf_start != te->out_data_start) {
  981. ssize_t len = te->out_data_end - te->out_data_start;
  982. ssize_t off = te->out_data_start - te->out_buf_start;
  983. MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
  984. te->out_data_start = te->out_buf_start;
  985. te->out_data_end -= off;
  986. }
  987. opp = &te->out_data_end;
  988. os = te->out_buf_end;
  989. }
  990. f = flags;
  991. if (ec->num_finished != i)
  992. f |= ECONV_PARTIAL_INPUT;
  993. if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
  994. start = 1;
  995. flags &= ~ECONV_AFTER_OUTPUT;
  996. }
  997. if (i != 0)
  998. f &= ~ECONV_AFTER_OUTPUT;
  999. iold = *ipp;
  1000. oold = *opp;
  1001. te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
  1002. if (iold != *ipp || oold != *opp)
  1003. try = 1;
  1004. switch (res) {
  1005. case econv_invalid_byte_sequence:
  1006. case econv_incomplete_input:
  1007. case econv_undefined_conversion:
  1008. case econv_after_output:
  1009. return i;
  1010. case econv_destination_buffer_full:
  1011. case econv_source_buffer_empty:
  1012. break;
  1013. case econv_finished:
  1014. ec->num_finished = i+1;
  1015. break;
  1016. }
  1017. }
  1018. }
  1019. return -1;
  1020. }
  1021. static rb_econv_result_t
  1022. rb_trans_conv(rb_econv_t *ec,
  1023. const unsigned char **input_ptr, const unsigned char *input_stop,
  1024. unsigned char **output_ptr, unsigned char *output_stop,
  1025. int flags,
  1026. int *result_position_ptr)
  1027. {
  1028. int i;
  1029. int needreport_index;
  1030. int sweep_start;
  1031. unsigned char empty_buf;
  1032. unsigned char *empty_ptr = &empty_buf;
  1033. if (!input_ptr) {
  1034. input_ptr = (const unsigned char **)&empty_ptr;
  1035. input_stop = empty_ptr;
  1036. }
  1037. if (!output_ptr) {
  1038. output_ptr = &empty_ptr;
  1039. output_stop = empty_ptr;
  1040. }
  1041. if (ec->elems[0].last_result == econv_after_output)
  1042. ec->elems[0].last_result = econv_source_buffer_empty;
  1043. needreport_index = -1;
  1044. for (i = ec->num_trans-1; 0 <= i; i--) {
  1045. switch (ec->elems[i].last_result) {
  1046. case econv_invalid_byte_sequence:
  1047. case econv_incomplete_input:
  1048. case econv_undefined_conversion:
  1049. case econv_after_output:
  1050. case econv_finished:
  1051. sweep_start = i+1;
  1052. needreport_index = i;
  1053. goto found_needreport;
  1054. case econv_destination_buffer_full:
  1055. case econv_source_buffer_empty:
  1056. break;
  1057. default:
  1058. rb_bug("unexpected transcode last result");
  1059. }
  1060. }
  1061. /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
  1062. if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
  1063. (flags & ECONV_AFTER_OUTPUT)) {
  1064. rb_econv_result_t res;
  1065. res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
  1066. (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
  1067. result_position_ptr);
  1068. if (res == econv_source_buffer_empty)
  1069. return econv_after_output;
  1070. return res;
  1071. }
  1072. sweep_start = 0;
  1073. found_needreport:
  1074. do {
  1075. needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
  1076. sweep_start = needreport_index + 1;
  1077. } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
  1078. for (i = ec->num_trans-1; 0 <= i; i--) {
  1079. if (ec->elems[i].last_result != econv_source_buffer_empty) {
  1080. rb_econv_result_t res = ec->elems[i].last_result;
  1081. if (res == econv_invalid_byte_sequence ||
  1082. res == econv_incomplete_input ||
  1083. res == econv_undefined_conversion ||
  1084. res == econv_after_output) {
  1085. ec->elems[i].last_result = econv_source_buffer_empty;
  1086. }
  1087. if (result_position_ptr)
  1088. *result_position_ptr = i;
  1089. return res;
  1090. }
  1091. }
  1092. if (result_position_ptr)
  1093. *result_position_ptr = -1;
  1094. return econv_source_buffer_empty;
  1095. }
  1096. static rb_econv_result_t
  1097. rb_econv_convert0(rb_econv_t *ec,
  1098. const unsigned char **input_ptr, const unsigned char *input_stop,
  1099. unsigned char **output_ptr, unsigned char *output_stop,
  1100. int flags)
  1101. {
  1102. rb_econv_result_t res;
  1103. int result_position;
  1104. int has_output = 0;
  1105. memset(&ec->last_error, 0, sizeof(ec->last_error));
  1106. if (ec->num_trans == 0) {
  1107. size_t len;
  1108. if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
  1109. if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
  1110. len = output_stop - *output_ptr;
  1111. memcpy(*output_ptr, ec->in_data_start, len);
  1112. *output_ptr = output_stop;
  1113. ec->in_data_start += len;
  1114. res = econv_destination_buffer_full;
  1115. goto gotresult;
  1116. }
  1117. len = ec->in_data_end - ec->in_data_start;
  1118. memcpy(*output_ptr, ec->in_data_start, len);
  1119. *output_ptr += len;
  1120. ec->in_data_start = ec->in_data_end = ec->in_buf_start;
  1121. if (flags & ECONV_AFTER_OUTPUT) {
  1122. res = econv_after_output;
  1123. goto gotresult;
  1124. }
  1125. }
  1126. if (output_stop - *output_ptr < input_stop - *input_ptr) {
  1127. len = output_stop - *output_ptr;
  1128. }
  1129. else {
  1130. len = input_stop - *input_ptr;
  1131. }
  1132. if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
  1133. *(*output_ptr)++ = *(*input_ptr)++;
  1134. res = econv_after_output;
  1135. goto gotresult;
  1136. }
  1137. memcpy(*output_ptr, *input_ptr, len);
  1138. *output_ptr += len;
  1139. *input_ptr += len;
  1140. if (*input_ptr != input_stop)
  1141. res = econv_destination_buffer_full;
  1142. else if (flags & ECONV_PARTIAL_INPUT)
  1143. res = econv_source_buffer_empty;
  1144. else
  1145. res = econv_finished;
  1146. goto gotresult;
  1147. }
  1148. if (ec->elems[ec->num_trans-1].out_data_start) {
  1149. unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
  1150. unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
  1151. if (data_start != data_end) {
  1152. size_t len;
  1153. if (output_stop - *output_ptr < data_end - data_start) {
  1154. len = output_stop - *output_ptr;
  1155. memcpy(*output_ptr, data_start, len);
  1156. *output_ptr = output_stop;
  1157. ec->elems[ec->num_trans-1].out_data_start += len;
  1158. res = econv_destination_buffer_full;
  1159. goto gotresult;
  1160. }
  1161. len = data_end - data_start;
  1162. memcpy(*output_ptr, data_start, len);
  1163. *output_ptr += len;
  1164. ec->elems[ec->num_trans-1].out_data_start =
  1165. ec->elems[ec->num_trans-1].out_data_end =
  1166. ec->elems[ec->num_trans-1].out_buf_start;
  1167. has_output = 1;
  1168. }
  1169. }
  1170. if (ec->in_buf_start &&
  1171. ec->in_data_start != ec->in_data_end) {
  1172. res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
  1173. (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
  1174. if (res != econv_source_buffer_empty)
  1175. goto gotresult;
  1176. }
  1177. if (has_output &&
  1178. (flags & ECONV_AFTER_OUTPUT) &&
  1179. *input_ptr != input_stop) {
  1180. input_stop = *input_ptr;
  1181. res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
  1182. if (res == econv_source_buffer_empty)
  1183. res = econv_after_output;
  1184. }
  1185. else if ((flags & ECONV_AFTER_OUTPUT) ||
  1186. ec->num_trans == 1) {
  1187. res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
  1188. }
  1189. else {
  1190. flags |= ECONV_AFTER_OUTPUT;
  1191. do {
  1192. res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
  1193. } while (res == econv_after_output);
  1194. }
  1195. gotresult:
  1196. ec->last_error.result = res;
  1197. if (res == econv_invalid_byte_sequence ||
  1198. res == econv_incomplete_input ||
  1199. res == econv_undefined_conversion) {
  1200. rb_transcoding *error_tc = ec->elems[result_position].tc;
  1201. ec->last_error.error_tc = error_tc;
  1202. ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
  1203. ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
  1204. ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
  1205. ec->last_error.error_bytes_len = error_tc->recognized_len;
  1206. ec->last_error.readagain_len = error_tc->readagain_len;
  1207. }
  1208. return res;
  1209. }
  1210. static int output_replacement_character(rb_econv_t *ec);
  1211. static int
  1212. output_hex_charref(rb_econv_t *ec)
  1213. {
  1214. int ret;
  1215. unsigned char utfbuf[1024];
  1216. const unsigned char *utf;
  1217. size_t utf_len;
  1218. int utf_allocated = 0;
  1219. char charef_buf[16];
  1220. const unsigned char *p;
  1221. if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
  1222. utf = ec->last_error.error_bytes_start;
  1223. utf_len = ec->last_error.error_bytes_len;
  1224. }
  1225. else {
  1226. utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
  1227. ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
  1228. utfbuf, sizeof(utfbuf),
  1229. &utf_len);
  1230. if (!utf)
  1231. return -1;
  1232. if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
  1233. utf_allocated = 1;
  1234. }
  1235. if (utf_len % 4 != 0)
  1236. goto fail;
  1237. p = utf;
  1238. while (4 <= utf_len) {
  1239. unsigned int u = 0;
  1240. u += p[0] << 24;
  1241. u += p[1] << 16;
  1242. u += p[2] << 8;
  1243. u += p[3];
  1244. snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
  1245. ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
  1246. if (ret == -1)
  1247. goto fail;
  1248. p += 4;
  1249. utf_len -= 4;
  1250. }
  1251. if (utf_allocated)
  1252. xfree((void *)utf);
  1253. return 0;
  1254. fail:
  1255. if (utf_allocated)
  1256. xfree((void *)utf);
  1257. return -1;
  1258. }
  1259. rb_econv_result_t
  1260. rb_econv_convert(rb_econv_t *ec,
  1261. const unsigned char **input_ptr, const unsigned char *input_stop,
  1262. unsigned char **output_ptr, unsigned char *output_stop,
  1263. int flags)
  1264. {
  1265. rb_econv_result_t ret;
  1266. unsigned char empty_buf;
  1267. unsigned char *empty_ptr = &empty_buf;
  1268. ec->started = 1;
  1269. if (!input_ptr) {
  1270. input_ptr = (const unsigned char **)&empty_ptr;
  1271. input_stop = empty_ptr;
  1272. }
  1273. if (!output_ptr) {
  1274. output_ptr = &empty_ptr;
  1275. output_stop = empty_ptr;
  1276. }
  1277. resume:
  1278. ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
  1279. if (ret == econv_invalid_byte_sequence ||
  1280. ret == econv_incomplete_input) {
  1281. /* deal with invalid byte sequence */
  1282. /* todo: add more alternative behaviors */
  1283. switch (ec->flags & ECONV_INVALID_MASK) {
  1284. case ECONV_INVALID_REPLACE:
  1285. if (output_replacement_character(ec) == 0)
  1286. goto resume;
  1287. }
  1288. }
  1289. if (ret == econv_undefined_conversion) {
  1290. /* valid character in source encoding
  1291. * but no related character(s) in destination encoding */
  1292. /* todo: add more alternative behaviors */
  1293. switch (ec->flags & ECONV_UNDEF_MASK) {
  1294. case ECONV_UNDEF_REPLACE:
  1295. if (output_replacement_character(ec) == 0)
  1296. goto resume;
  1297. break;
  1298. case ECONV_UNDEF_HEX_CHARREF:
  1299. if (output_hex_charref(ec) == 0)
  1300. goto resume;
  1301. break;
  1302. }
  1303. }
  1304. return ret;
  1305. }
  1306. const char *
  1307. rb_econv_encoding_to_insert_output(rb_econv_t *ec)
  1308. {
  1309. rb_transcoding *tc = ec->last_tc;
  1310. const rb_transcoder *tr;
  1311. if (tc == NULL)
  1312. return "";
  1313. tr = tc->transcoder;
  1314. if (tr->asciicompat_type == asciicompat_encoder)
  1315. return tr->src_encoding;
  1316. return tr->dst_encoding;
  1317. }
  1318. static unsigned char *
  1319. allocate_converted_string(const char *sname, const char *dname,
  1320. const unsigned char *str, size_t len,
  1321. unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
  1322. size_t *dst_len_ptr)
  1323. {
  1324. unsigned char *dst_str;
  1325. size_t dst_len;
  1326. size_t dst_bufsize;
  1327. rb_econv_t *ec;
  1328. rb_econv_result_t res;
  1329. const unsigned char *sp;
  1330. unsigned char *dp;
  1331. if (caller_dst_buf)
  1332. dst_bufsize = caller_dst_bufsize;
  1333. else if (len == 0)
  1334. dst_bufsize = 1;
  1335. else
  1336. dst_bufsize = len;
  1337. ec = rb_econv_open(sname, dname, 0);
  1338. if (ec == NULL)
  1339. return NULL;
  1340. if (caller_dst_buf)
  1341. dst_str = caller_dst_buf;
  1342. else
  1343. dst_str = xmalloc(dst_bufsize);
  1344. dst_len = 0;
  1345. sp = str;
  1346. dp = dst_str+dst_len;
  1347. res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
  1348. dst_len = dp - dst_str;
  1349. while (res == econv_destination_buffer_full) {
  1350. if (SIZE_MAX/2 < dst_bufsize) {
  1351. goto fail;
  1352. }
  1353. dst_bufsize *= 2;
  1354. if (dst_str == caller_dst_buf) {
  1355. unsigned char *tmp;
  1356. tmp = xmalloc(dst_bufsize);
  1357. memcpy(tmp, dst_str, dst_bufsize/2);
  1358. dst_str = tmp;
  1359. }
  1360. else {
  1361. dst_str = xrealloc(dst_str, dst_bufsize);
  1362. }
  1363. dp = dst_str+dst_len;
  1364. res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
  1365. dst_len = dp - dst_str;
  1366. }
  1367. if (res != econv_finished) {
  1368. goto fail;
  1369. }
  1370. rb_econv_close(ec);
  1371. *dst_len_ptr = dst_len;
  1372. return dst_str;
  1373. fail:
  1374. if (dst_str != caller_dst_buf)
  1375. xfree(dst_str);
  1376. rb_econv_close(ec);
  1377. return NULL;
  1378. }
  1379. /* result: 0:success -1:failure */
  1380. int
  1381. rb_econv_insert_output(rb_econv_t *ec,
  1382. const unsigned char *str, size_t len, const char *str_encoding)
  1383. {
  1384. const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
  1385. unsigned char insert_buf[4096];
  1386. const unsigned char *insert_str = NULL;
  1387. size_t insert_len;
  1388. int last_trans_index;
  1389. rb_transcoding *tc;
  1390. unsigned char **buf_start_p;
  1391. unsigned char **data_start_p;
  1392. unsigned char **data_end_p;
  1393. unsigned char **buf_end_p;
  1394. size_t need;
  1395. ec->started = 1;
  1396. if (len == 0)
  1397. return 0;
  1398. if (encoding_equal(insert_encoding, str_encoding)) {
  1399. insert_str = str;
  1400. insert_len = len;
  1401. }
  1402. else {
  1403. insert_str = allocate_converted_string(str_encoding, insert_encoding,
  1404. str, len, insert_buf, sizeof(insert_buf), &insert_len);
  1405. if (insert_str == NULL)
  1406. return -1;
  1407. }
  1408. need = insert_len;
  1409. last_trans_index = ec->num_trans-1;
  1410. if (ec->num_trans == 0) {
  1411. tc = NULL;
  1412. buf_start_p = &ec->in_buf_start;
  1413. data_start_p = &ec->in_data_start;
  1414. data_end_p = &ec->in_data_end;
  1415. buf_end_p = &ec->in_buf_end;
  1416. }
  1417. else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
  1418. tc = ec->elems[last_trans_index].tc;
  1419. need += tc->readagain_len;
  1420. if (need < insert_len)
  1421. goto fail;
  1422. if (last_trans_index == 0) {
  1423. buf_start_p = &ec->in_buf_start;
  1424. data_start_p = &ec->in_data_start;
  1425. data_end_p = &ec->in_data_end;
  1426. buf_end_p = &ec->in_buf_end;
  1427. }
  1428. else {
  1429. rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
  1430. buf_start_p = &ee->out_buf_start;
  1431. data_start_p = &ee->out_data_start;
  1432. data_end_p = &ee->out_data_end;
  1433. buf_end_p = &ee->out_buf_end;
  1434. }
  1435. }
  1436. else {
  1437. rb_econv_elem_t *ee = &ec->elems[last_trans_index];
  1438. buf_start_p = &ee->out_buf_start;
  1439. data_start_p = &ee->out_data_start;
  1440. data_end_p = &ee->out_data_end;
  1441. buf_end_p = &ee->out_

Large files files are truncated, but you can click here to view the full file