PageRenderTime 70ms CodeModel.GetById 31ms RepoModel.GetById 1ms app.codeStats 0ms

/ext/iconv/iconv.c

https://github.com/fizx/ruby
C | 1216 lines | 830 code | 109 blank | 277 comment | 147 complexity | 3742c27950b742d35af74e3778f132f6 MD5 | raw file
Possible License(s): LGPL-2.1, AGPL-3.0, GPL-2.0, BSD-3-Clause
  1. /* -*- mode:c; c-file-style:"ruby" -*- */
  2. /**********************************************************************
  3. iconv.c -
  4. $Author$
  5. created at: Wed Dec 1 20:28:09 JST 1999
  6. All the files in this distribution are covered under the Ruby's
  7. license (see the file COPYING).
  8. Documentation by Yukihiro Matsumoto and Gavin Sinclair.
  9. **********************************************************************/
  10. #include "ruby/ruby.h"
  11. #include <errno.h>
  12. #include <iconv.h>
  13. #include <assert.h>
  14. #include "ruby/st.h"
  15. #include "ruby/encoding.h"
  16. /*
  17. * Document-class: Iconv
  18. *
  19. * == Summary
  20. *
  21. * Ruby extension for charset conversion.
  22. *
  23. * == Abstract
  24. *
  25. * Iconv is a wrapper class for the UNIX 95 <tt>iconv()</tt> function family,
  26. * which translates string between various encoding systems.
  27. *
  28. * See Open Group's on-line documents for more details.
  29. * * <tt>iconv.h</tt>: http://www.opengroup.org/onlinepubs/007908799/xsh/iconv.h.html
  30. * * <tt>iconv_open()</tt>: http://www.opengroup.org/onlinepubs/007908799/xsh/iconv_open.html
  31. * * <tt>iconv()</tt>: http://www.opengroup.org/onlinepubs/007908799/xsh/iconv.html
  32. * * <tt>iconv_close()</tt>: http://www.opengroup.org/onlinepubs/007908799/xsh/iconv_close.html
  33. *
  34. * Which coding systems are available is platform-dependent.
  35. *
  36. * == Examples
  37. *
  38. * 1. Simple conversion between two charsets.
  39. *
  40. * converted_text = Iconv.conv('iso-8859-15', 'utf-8', text)
  41. *
  42. * 2. Instantiate a new Iconv and use method Iconv#iconv.
  43. *
  44. * cd = Iconv.new(to, from)
  45. * begin
  46. * input.each { |s| output << cd.iconv(s) }
  47. * output << cd.iconv(nil) # Don't forget this!
  48. * ensure
  49. * cd.close
  50. * end
  51. *
  52. * 3. Invoke Iconv.open with a block.
  53. *
  54. * Iconv.open(to, from) do |cd|
  55. * input.each { |s| output << cd.iconv(s) }
  56. * output << cd.iconv(nil)
  57. * end
  58. *
  59. * 4. Shorthand for (3).
  60. *
  61. * Iconv.iconv(to, from, *input.to_a)
  62. *
  63. * == Attentions
  64. *
  65. * Even if some implementation-dependent extensions are useful,
  66. * DON'T USE those extensions in libraries and scripts that are widely distributed.
  67. * If you want to use those features, use String#encode.
  68. */
  /*
   * Conversion descriptors are kept in VALUE-sized slots with every bit
   * inverted.  The invalid descriptor (iconv_t)-1 therefore maps to 0, which
   * is what an empty slot holds on the VALUE side.  This relies on VALUE
   * being wide enough to hold an iconv_t.
   */
  #define VALUE2ICONV(v) ((iconv_t)(~(VALUE)(v)))
  #define ICONV2VALUE(c) (~(VALUE)(c))
  73. struct iconv_env_t
  74. {
  75. iconv_t cd;
  76. int argc;
  77. VALUE *argv;
  78. VALUE ret;
  79. int toidx;
  80. VALUE (*append)_((VALUE, VALUE));
  81. };
  82. struct rb_iconv_opt_t
  83. {
  84. VALUE transliterate;
  85. VALUE discard_ilseq;
  86. };
  87. static ID id_transliterate, id_discard_ilseq;
  88. static VALUE rb_eIconvInvalidEncoding;
  89. static VALUE rb_eIconvFailure;
  90. static VALUE rb_eIconvIllegalSeq;
  91. static VALUE rb_eIconvInvalidChar;
  92. static VALUE rb_eIconvOutOfRange;
  93. static VALUE rb_eIconvBrokenLibrary;
  94. static ID rb_success, rb_failed;
  95. static VALUE iconv_fail _((VALUE error, VALUE success, VALUE failed, struct iconv_env_t* env, const char *mesg));
  96. static VALUE iconv_fail_retry _((VALUE error, VALUE success, VALUE failed, struct iconv_env_t* env, const char *mesg));
  97. static VALUE iconv_failure_initialize _((VALUE error, VALUE mesg, VALUE success, VALUE failed));
  98. static VALUE iconv_failure_success _((VALUE self));
  99. static VALUE iconv_failure_failed _((VALUE self));
  100. static iconv_t iconv_create _((VALUE to, VALUE from, struct rb_iconv_opt_t *opt, int *idx));
  101. static void iconv_dfree _((void *cd));
  102. static VALUE iconv_free _((VALUE cd));
  103. static VALUE iconv_try _((iconv_t cd, const char **inptr, size_t *inlen, char **outptr, size_t *outlen));
  104. static VALUE rb_str_derive _((VALUE str, const char* ptr, long len));
  105. static VALUE iconv_convert _((iconv_t cd, VALUE str, long start, long length, int toidx,
  106. struct iconv_env_t* env));
  107. static VALUE iconv_s_allocate _((VALUE klass));
  108. static VALUE iconv_initialize _((int argc, VALUE *argv, VALUE self));
  109. static VALUE iconv_s_open _((int argc, VALUE *argv, VALUE self));
  110. static VALUE iconv_s_convert _((struct iconv_env_t* env));
  111. static VALUE iconv_s_iconv _((int argc, VALUE *argv, VALUE self));
  112. static VALUE iconv_init_state _((VALUE cd));
  113. static VALUE iconv_finish _((VALUE self));
  114. static VALUE iconv_iconv _((int argc, VALUE *argv, VALUE self));
  115. static VALUE iconv_conv _((int argc, VALUE *argv, VALUE self));
  116. static VALUE charset_map;
  117. /*
  118. * Document-method: charset_map
  119. * call-seq: Iconv.charset_map
  120. *
  121. * Returns the map from canonical name to system dependent name.
  122. */
  123. static VALUE
  124. charset_map_get(void)
  125. {
  126. return charset_map;
  127. }
  128. static VALUE
  129. strip_glibc_option(VALUE *code)
  130. {
  131. VALUE val = StringValue(*code);
  132. const char *ptr = RSTRING_PTR(val), *pend = RSTRING_END(val);
  133. const char *slash = memchr(ptr, '/', pend - ptr);
  134. if (slash && slash < pend - 1 && slash[1] == '/') {
  135. VALUE opt = rb_str_subseq(val, slash - ptr, pend - slash);
  136. val = rb_str_subseq(val, 0, slash - ptr);
  137. *code = val;
  138. return opt;
  139. }
  140. return 0;
  141. }
  142. static char *
  143. map_charset(VALUE *code)
  144. {
  145. VALUE val = StringValue(*code);
  146. if (RHASH_SIZE(charset_map)) {
  147. VALUE key = rb_funcall2(val, rb_intern("downcase"), 0, 0);
  148. StringValuePtr(key);
  149. if (st_lookup(RHASH_TBL(charset_map), key, &val)) {
  150. *code = val;
  151. }
  152. }
  153. return StringValuePtr(*code);
  154. }
  155. static iconv_t
  156. iconv_create(VALUE to, VALUE from, struct rb_iconv_opt_t *opt, int *idx)
  157. {
  158. VALUE toopt = strip_glibc_option(&to);
  159. VALUE fromopt = strip_glibc_option(&from);
  160. VALUE toenc = 0, fromenc = 0;
  161. const char* tocode = map_charset(&to);
  162. const char* fromcode = map_charset(&from);
  163. iconv_t cd;
  164. int retry = 0;
  165. *idx = rb_enc_find_index(tocode);
  166. if (toopt) {
  167. toenc = rb_str_plus(to, toopt);
  168. tocode = RSTRING_PTR(toenc);
  169. }
  170. if (fromopt) {
  171. fromenc = rb_str_plus(from, fromopt);
  172. fromcode = RSTRING_PTR(fromenc);
  173. }
  174. while ((cd = iconv_open(tocode, fromcode)) == (iconv_t)-1) {
  175. int inval = 0;
  176. switch (errno) {
  177. case EMFILE:
  178. case ENFILE:
  179. case ENOMEM:
  180. if (!retry++) {
  181. rb_gc();
  182. continue;
  183. }
  184. break;
  185. case EINVAL:
  186. retry = 0;
  187. inval = 1;
  188. if (toenc) {
  189. tocode = RSTRING_PTR(to);
  190. rb_str_resize(toenc, 0);
  191. toenc = 0;
  192. continue;
  193. }
  194. if (fromenc) {
  195. fromcode = RSTRING_PTR(from);
  196. rb_str_resize(fromenc, 0);
  197. fromenc = 0;
  198. continue;
  199. }
  200. break;
  201. }
  202. {
  203. const char *s = inval ? "invalid encoding " : "iconv";
  204. volatile VALUE msg = rb_str_new(0, strlen(s) + RSTRING_LEN(to) +
  205. RSTRING_LEN(from) + 8);
  206. sprintf(RSTRING_PTR(msg), "%s(\"%s\", \"%s\")",
  207. s, RSTRING_PTR(to), RSTRING_PTR(from));
  208. s = RSTRING_PTR(msg);
  209. rb_str_set_len(msg, strlen(s));
  210. if (!inval) rb_sys_fail(s);
  211. rb_exc_raise(iconv_fail(rb_eIconvInvalidEncoding, Qnil,
  212. rb_ary_new3(2, to, from), NULL, s));
  213. }
  214. }
  215. if (toopt || fromopt) {
  216. if (toopt && fromopt && RTEST(rb_str_equal(toopt, fromopt))) {
  217. fromopt = 0;
  218. }
  219. if (toopt && fromopt) {
  220. rb_warning("encoding option isn't portable: %s, %s",
  221. RSTRING_PTR(toopt) + 2, RSTRING_PTR(fromopt) + 2);
  222. }
  223. else {
  224. rb_warning("encoding option isn't portable: %s",
  225. (toopt ? RSTRING_PTR(toopt) : RSTRING_PTR(fromopt)) + 2);
  226. }
  227. }
  228. if (opt) {
  229. #ifdef ICONV_SET_TRANSLITERATE
  230. if (opt->transliterate != Qundef) {
  231. int flag = RTEST(opt->transliterate);
  232. rb_warning("encoding option isn't portable: transliterate");
  233. if (iconvctl(cd, ICONV_SET_TRANSLITERATE, (void *)&flag))
  234. rb_sys_fail("ICONV_SET_TRANSLITERATE");
  235. }
  236. #endif
  237. #ifdef ICONV_SET_DISCARD_ILSEQ
  238. if (opt->discard_ilseq != Qundef) {
  239. int flag = RTEST(opt->discard_ilseq);
  240. rb_warning("encoding option isn't portable: discard_ilseq");
  241. if (iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, (void *)&flag))
  242. rb_sys_fail("ICONV_SET_DISCARD_ILSEQ");
  243. }
  244. #endif
  245. }
  246. return cd;
  247. }
  /*
   * Free function registered on wrapped Iconv objects: decodes the stored
   * slot back into a descriptor and closes it, ignoring the result.
   */
  static void
  iconv_dfree(void *cd)
  {
      iconv_t handle = VALUE2ICONV(cd);

      iconv_close(handle);
  }
  253. #define ICONV_FREE iconv_dfree
  254. static VALUE
  255. iconv_free(VALUE cd)
  256. {
  257. if (cd && iconv_close(VALUE2ICONV(cd)) == -1)
  258. rb_sys_fail("iconv_close");
  259. return Qnil;
  260. }
  261. static VALUE
  262. check_iconv(VALUE obj)
  263. {
  264. Check_Type(obj, T_DATA);
  265. if (RDATA(obj)->dfree != ICONV_FREE) {
  266. rb_raise(rb_eArgError, "Iconv expected (%s)", rb_class2name(CLASS_OF(obj)));
  267. }
  268. return (VALUE)DATA_PTR(obj);
  269. }
  270. static VALUE
  271. iconv_try(iconv_t cd, const char **inptr, size_t *inlen, char **outptr, size_t *outlen)
  272. {
  273. #ifdef ICONV_INPTR_CONST
  274. #define ICONV_INPTR_CAST
  275. #else
  276. #define ICONV_INPTR_CAST (char **)
  277. #endif
  278. size_t ret;
  279. errno = 0;
  280. ret = iconv(cd, ICONV_INPTR_CAST inptr, inlen, outptr, outlen);
  281. if (ret == (size_t)-1) {
  282. if (!*inlen)
  283. return Qfalse;
  284. switch (errno) {
  285. case E2BIG:
  286. /* try the left in next loop */
  287. break;
  288. case EILSEQ:
  289. return rb_eIconvIllegalSeq;
  290. case EINVAL:
  291. return rb_eIconvInvalidChar;
  292. case 0:
  293. return rb_eIconvBrokenLibrary;
  294. default:
  295. rb_sys_fail("iconv");
  296. }
  297. }
  298. else if (*inlen > 0) {
  299. /* something goes wrong */
  300. return rb_eIconvIllegalSeq;
  301. }
  302. else if (ret) {
  303. return Qnil; /* conversion */
  304. }
  305. return Qfalse;
  306. }
  307. #define FAILED_MAXLEN 16
  308. static VALUE
  309. iconv_failure_initialize(VALUE error, VALUE mesg, VALUE success, VALUE failed)
  310. {
  311. rb_call_super(1, &mesg);
  312. rb_ivar_set(error, rb_success, success);
  313. rb_ivar_set(error, rb_failed, failed);
  314. return error;
  315. }
  316. static VALUE
  317. iconv_fail(VALUE error, VALUE success, VALUE failed, struct iconv_env_t* env, const char *mesg)
  318. {
  319. VALUE args[3];
  320. if (mesg && *mesg) {
  321. args[0] = rb_str_new2(mesg);
  322. }
  323. else if (TYPE(failed) != T_STRING || RSTRING_LEN(failed) < FAILED_MAXLEN) {
  324. args[0] = rb_inspect(failed);
  325. }
  326. else {
  327. args[0] = rb_inspect(rb_str_substr(failed, 0, FAILED_MAXLEN));
  328. rb_str_cat2(args[0], "...");
  329. }
  330. args[1] = success;
  331. args[2] = failed;
  332. if (env) {
  333. args[1] = env->append(rb_obj_dup(env->ret), success);
  334. if (env->argc > 0) {
  335. *(env->argv) = failed;
  336. args[2] = rb_ary_new4(env->argc, env->argv);
  337. }
  338. }
  339. return rb_class_new_instance(3, args, error);
  340. }
  341. static VALUE
  342. iconv_fail_retry(VALUE error, VALUE success, VALUE failed, struct iconv_env_t* env, const char *mesg)
  343. {
  344. error = iconv_fail(error, success, failed, env, mesg);
  345. if (!rb_block_given_p()) rb_exc_raise(error);
  346. rb_set_errinfo(error);
  347. return rb_yield(failed);
  348. }
  349. static VALUE
  350. rb_str_derive(VALUE str, const char* ptr, long len)
  351. {
  352. VALUE ret;
  353. if (NIL_P(str))
  354. return rb_str_new(ptr, len);
  355. if (RSTRING_PTR(str) + RSTRING_LEN(str) == ptr + len)
  356. ret = rb_str_subseq(str, ptr - RSTRING_PTR(str), len);
  357. else
  358. ret = rb_str_new(ptr, len);
  359. OBJ_INFECT(ret, str);
  360. return ret;
  361. }
  362. static VALUE
  363. iconv_convert(iconv_t cd, VALUE str, long start, long length, int toidx, struct iconv_env_t* env)
  364. {
  365. VALUE ret = Qfalse;
  366. VALUE error = Qfalse;
  367. VALUE rescue;
  368. const char *inptr, *instart;
  369. size_t inlen;
  370. /* I believe ONE CHARACTER never exceed this. */
  371. char buffer[BUFSIZ];
  372. char *outptr;
  373. size_t outlen;
  374. if (cd == (iconv_t)-1)
  375. rb_raise(rb_eArgError, "closed iconv");
  376. if (NIL_P(str)) {
  377. /* Reset output pointer or something. */
  378. inptr = "";
  379. inlen = 0;
  380. outptr = buffer;
  381. outlen = sizeof(buffer);
  382. error = iconv_try(cd, &inptr, &inlen, &outptr, &outlen);
  383. if (RTEST(error)) {
  384. unsigned int i;
  385. rescue = iconv_fail_retry(error, Qnil, Qnil, env, 0);
  386. if (TYPE(rescue) == T_ARRAY) {
  387. str = RARRAY_LEN(rescue) > 0 ? RARRAY_PTR(rescue)[0] : Qnil;
  388. }
  389. if (FIXNUM_P(str) && (i = FIX2INT(str)) <= 0xff) {
  390. char c = i;
  391. str = rb_str_new(&c, 1);
  392. }
  393. else if (!NIL_P(str)) {
  394. StringValue(str);
  395. }
  396. }
  397. inptr = NULL;
  398. length = 0;
  399. }
  400. else {
  401. long slen;
  402. StringValue(str);
  403. slen = RSTRING_LEN(str);
  404. inptr = RSTRING_PTR(str);
  405. inptr += start;
  406. if (length < 0 || length > start + slen)
  407. length = slen - start;
  408. }
  409. instart = inptr;
  410. inlen = length;
  411. do {
  412. char errmsg[50];
  413. const char *tmpstart = inptr;
  414. outptr = buffer;
  415. outlen = sizeof(buffer);
  416. errmsg[0] = 0;
  417. error = iconv_try(cd, &inptr, &inlen, &outptr, &outlen);
  418. if (
  419. #if SIGNEDNESS_OF_SIZE_T < 0
  420. 0 <= outlen &&
  421. #endif
  422. outlen <= sizeof(buffer)) {
  423. outlen = sizeof(buffer) - outlen;
  424. if (NIL_P(error) || /* something converted */
  425. outlen > (size_t)(inptr - tmpstart) || /* input can't contain output */
  426. (outlen < (size_t)(inptr - tmpstart) && inlen > 0) || /* something skipped */
  427. memcmp(buffer, tmpstart, outlen)) /* something differs */
  428. {
  429. if (NIL_P(str)) {
  430. ret = rb_str_new(buffer, outlen);
  431. if (toidx >= 0) rb_enc_associate_index(ret, toidx);
  432. }
  433. else {
  434. if (ret) {
  435. ret = rb_str_buf_cat(ret, instart, tmpstart - instart);
  436. }
  437. else {
  438. ret = rb_str_new(instart, tmpstart - instart);
  439. if (toidx >= 0) rb_enc_associate_index(ret, toidx);
  440. OBJ_INFECT(ret, str);
  441. }
  442. ret = rb_str_buf_cat(ret, buffer, outlen);
  443. instart = inptr;
  444. }
  445. }
  446. else if (!inlen) {
  447. inptr = tmpstart + outlen;
  448. }
  449. }
  450. else {
  451. /* Some iconv() have a bug, return *outlen out of range */
  452. sprintf(errmsg, "bug?(output length = %ld)", (long)(sizeof(buffer) - outlen));
  453. error = rb_eIconvOutOfRange;
  454. }
  455. if (RTEST(error)) {
  456. long len = 0;
  457. if (!ret) {
  458. ret = rb_str_derive(str, instart, inptr - instart);
  459. if (toidx >= 0) rb_enc_associate_index(ret, toidx);
  460. }
  461. else if (inptr > instart) {
  462. rb_str_cat(ret, instart, inptr - instart);
  463. }
  464. str = rb_str_derive(str, inptr, inlen);
  465. rescue = iconv_fail_retry(error, ret, str, env, errmsg);
  466. if (TYPE(rescue) == T_ARRAY) {
  467. if ((len = RARRAY_LEN(rescue)) > 0)
  468. rb_str_concat(ret, RARRAY_PTR(rescue)[0]);
  469. if (len > 1 && !NIL_P(str = RARRAY_PTR(rescue)[1])) {
  470. StringValue(str);
  471. inlen = length = RSTRING_LEN(str);
  472. instart = inptr = RSTRING_PTR(str);
  473. continue;
  474. }
  475. }
  476. else if (!NIL_P(rescue)) {
  477. rb_str_concat(ret, rescue);
  478. }
  479. break;
  480. }
  481. } while (inlen > 0);
  482. if (!ret) {
  483. ret = rb_str_derive(str, instart, inptr - instart);
  484. if (toidx >= 0) rb_enc_associate_index(ret, toidx);
  485. }
  486. else if (inptr > instart) {
  487. rb_str_cat(ret, instart, inptr - instart);
  488. }
  489. return ret;
  490. }
  491. static VALUE
  492. iconv_s_allocate(VALUE klass)
  493. {
  494. return Data_Wrap_Struct(klass, 0, ICONV_FREE, 0);
  495. }
  496. static VALUE
  497. get_iconv_opt_i(VALUE i, VALUE arg)
  498. {
  499. struct rb_iconv_opt_t *opt = (struct rb_iconv_opt_t *)arg;
  500. VALUE name, val;
  501. (void)opt;
  502. i = rb_Array(i);
  503. name = rb_ary_entry(i, 0);
  504. val = rb_ary_entry(i, 1);
  505. do {
  506. if (SYMBOL_P(name)) {
  507. ID id = SYM2ID(name);
  508. if (id == id_transliterate) {
  509. #ifdef ICONV_SET_TRANSLITERATE
  510. opt->transliterate = val;
  511. #else
  512. rb_notimplement();
  513. #endif
  514. break;
  515. }
  516. if (id == id_discard_ilseq) {
  517. #ifdef ICONV_SET_DISCARD_ILSEQ
  518. opt->discard_ilseq = val;
  519. #else
  520. rb_notimplement();
  521. #endif
  522. break;
  523. }
  524. }
  525. else {
  526. const char *s = StringValueCStr(name);
  527. if (strcmp(s, "transliterate") == 0) {
  528. #ifdef ICONV_SET_TRANSLITERATE
  529. opt->transliterate = val;
  530. #else
  531. rb_notimplement();
  532. #endif
  533. break;
  534. }
  535. if (strcmp(s, "discard_ilseq") == 0) {
  536. #ifdef ICONV_SET_DISCARD_ILSEQ
  537. opt->discard_ilseq = val;
  538. #else
  539. rb_notimplement();
  540. #endif
  541. break;
  542. }
  543. }
  544. name = rb_inspect(name);
  545. rb_raise(rb_eArgError, "unknown option - %s", StringValueCStr(name));
  546. } while (0);
  547. return Qnil;
  548. }
  549. static void
  550. get_iconv_opt(struct rb_iconv_opt_t *opt, VALUE options)
  551. {
  552. opt->transliterate = Qundef;
  553. opt->discard_ilseq = Qundef;
  554. if (!NIL_P(options)) {
  555. rb_block_call(options, rb_intern("each"), 0, 0, get_iconv_opt_i, (VALUE)opt);
  556. }
  557. }
  /*
   * Issues the iconvctl() request +func+ on the descriptor wrapped by +self+,
   * passing the address of +val+ (so +val+ must be an lvalue).  A nonzero
   * result raises SystemCallError named after the request.
   */
  #define iconv_ctl(self, func, val) (\
      iconvctl(VALUE2ICONV(check_iconv(self)), func, (void *)&(val)) ? \
      rb_sys_fail(#func) : (void)0)
  561. /*
  562. * Document-method: new
  563. * call-seq: Iconv.new(to, from, [options])
  564. *
  565. * Creates new code converter from a coding-system designated with +from+
  566. * to another one designated with +to+.
  567. *
  568. * === Parameters
  569. *
  570. * +to+:: encoding name for destination
  571. * +from+:: encoding name for source
  572. * +options+:: options for converter
  573. *
  574. * === Exceptions
  575. *
  576. * TypeError:: if +to+ or +from+ aren't String
  577. * InvalidEncoding:: if the designated converter could not be found
  578. * SystemCallError:: if <tt>iconv_open(3)</tt> fails
  579. */
  580. static VALUE
  581. iconv_initialize(int argc, VALUE *argv, VALUE self)
  582. {
  583. VALUE to, from, options;
  584. struct rb_iconv_opt_t opt;
  585. int idx;
  586. rb_scan_args(argc, argv, "21", &to, &from, &options);
  587. get_iconv_opt(&opt, options);
  588. iconv_free(check_iconv(self));
  589. DATA_PTR(self) = NULL;
  590. DATA_PTR(self) = (void *)ICONV2VALUE(iconv_create(to, from, &opt, &idx));
  591. if (idx >= 0) ENCODING_SET(self, idx);
  592. return self;
  593. }
  594. /*
  595. * Document-method: open
  596. * call-seq: Iconv.open(to, from) { |iconv| ... }
  597. *
  598. * Equivalent to Iconv.new except that when it is called with a block, it
  599. * yields with the new instance and closes it, and returns the result which
  600. * returned from the block.
  601. */
  602. static VALUE
  603. iconv_s_open(int argc, VALUE *argv, VALUE self)
  604. {
  605. VALUE to, from, options, cd;
  606. struct rb_iconv_opt_t opt;
  607. int idx;
  608. rb_scan_args(argc, argv, "21", &to, &from, &options);
  609. get_iconv_opt(&opt, options);
  610. cd = ICONV2VALUE(iconv_create(to, from, &opt, &idx));
  611. self = Data_Wrap_Struct(self, NULL, ICONV_FREE, (void *)cd);
  612. if (idx >= 0) ENCODING_SET(self, idx);
  613. if (rb_block_given_p()) {
  614. return rb_ensure(rb_yield, self, (VALUE(*)())iconv_finish, self);
  615. }
  616. else {
  617. return self;
  618. }
  619. }
  620. static VALUE
  621. iconv_s_convert(struct iconv_env_t* env)
  622. {
  623. VALUE last = 0;
  624. for (; env->argc > 0; --env->argc, ++env->argv) {
  625. VALUE s = iconv_convert(env->cd, last = *(env->argv),
  626. 0, -1, env->toidx, env);
  627. env->append(env->ret, s);
  628. }
  629. if (!NIL_P(last)) {
  630. VALUE s = iconv_convert(env->cd, Qnil, 0, 0, env->toidx, env);
  631. if (RSTRING_LEN(s))
  632. env->append(env->ret, s);
  633. }
  634. return env->ret;
  635. }
  636. /*
  637. * Document-method: Iconv::iconv
  638. * call-seq: Iconv.iconv(to, from, *strs)
  639. *
  640. * Shorthand for
  641. * Iconv.open(to, from) { |cd|
  642. * (strs + [nil]).collect { |s| cd.iconv(s) }
  643. * }
  644. *
  645. * === Parameters
  646. *
  647. * <tt>to, from</tt>:: see Iconv.new
  648. * <tt>strs</tt>:: strings to be converted
  649. *
  650. * === Exceptions
  651. *
  652. * Exceptions thrown by Iconv.new, Iconv.open and Iconv#iconv.
  653. */
  654. static VALUE
  655. iconv_s_iconv(int argc, VALUE *argv, VALUE self)
  656. {
  657. struct iconv_env_t arg;
  658. if (argc < 2) /* needs `to' and `from' arguments at least */
  659. rb_raise(rb_eArgError, "wrong number of arguments (%d for %d)", argc, 2);
  660. arg.argc = argc -= 2;
  661. arg.argv = argv + 2;
  662. arg.append = rb_ary_push;
  663. arg.ret = rb_ary_new2(argc);
  664. arg.cd = iconv_create(argv[0], argv[1], NULL, &arg.toidx);
  665. return rb_ensure(iconv_s_convert, (VALUE)&arg, iconv_free, ICONV2VALUE(arg.cd));
  666. }
  667. /*
  668. * Document-method: Iconv::conv
  669. * call-seq: Iconv.conv(to, from, str)
  670. *
  671. * Shorthand for
  672. * Iconv.iconv(to, from, str).join
  673. * See Iconv.iconv.
  674. */
  675. static VALUE
  676. iconv_s_conv(VALUE self, VALUE to, VALUE from, VALUE str)
  677. {
  678. struct iconv_env_t arg;
  679. arg.argc = 1;
  680. arg.argv = &str;
  681. arg.append = rb_str_append;
  682. arg.ret = rb_str_new(0, 0);
  683. arg.cd = iconv_create(to, from, NULL, &arg.toidx);
  684. return rb_ensure(iconv_s_convert, (VALUE)&arg, iconv_free, ICONV2VALUE(arg.cd));
  685. }
  686. /*
  687. * Document-method: list
  688. * call-seq: Iconv.list {|*aliases| ... }
  689. *
  690. * Iterates over each set of aliases.
  691. */
  692. #ifdef HAVE_ICONVLIST
  693. struct iconv_name_list
  694. {
  695. unsigned int namescount;
  696. const char *const *names;
  697. VALUE array;
  698. };
  699. static VALUE
  700. list_iconv_i(VALUE ptr)
  701. {
  702. struct iconv_name_list *p = (struct iconv_name_list *)ptr;
  703. unsigned int i, namescount = p->namescount;
  704. const char *const *names = p->names;
  705. VALUE ary = rb_ary_new2(namescount);
  706. for (i = 0; i < namescount; i++) {
  707. rb_ary_push(ary, rb_str_new2(names[i]));
  708. }
  709. if (p->array) {
  710. return rb_ary_push(p->array, ary);
  711. }
  712. return rb_yield(ary);
  713. }
  714. static int
  715. list_iconv(unsigned int namescount, const char *const *names, void *data)
  716. {
  717. int *state = data;
  718. struct iconv_name_list list;
  719. list.namescount = namescount;
  720. list.names = names;
  721. list.array = ((VALUE *)data)[1];
  722. rb_protect(list_iconv_i, (VALUE)&list, state);
  723. return *state;
  724. }
  725. #endif
  #if defined(HAVE_ICONVLIST) || defined(HAVE___ICONV_FREE_LIST)
  /*
   * Implementation of Iconv.list.
   *
   * With iconvlist(): each alias set is yielded to the block, or collected
   * into an array that is returned when no block is given.  A non-local exit
   * captured by the callback is re-raised after iconvlist() returns.
   *
   * With __iconv_get_list(): the names are gathered into an array, which is
   * returned without a block or yielded element by element with one.
   *
   * Returns nil when iterating with a block.
   */
  static VALUE
  iconv_s_list(void)
  {
  #ifdef HAVE_ICONVLIST
      int state;
      VALUE args[2];

      /*
       * args[0] doubles as the rb_protect state written by list_iconv().
       * Clear it first: if iconvlist() never invokes the callback the slot
       * would otherwise be read uninitialized below.
       */
      args[0] = 0;
      args[1] = rb_block_given_p() ? 0 : rb_ary_new();
      iconvlist(list_iconv, args);
      state = *(int *)args;
      if (state) rb_jump_tag(state);
      if (args[1]) return args[1];
  #elif defined(HAVE___ICONV_FREE_LIST)
      char **list;
      size_t sz, i;
      VALUE ary;

      if (__iconv_get_list(&list, &sz)) return Qnil;

      ary = rb_ary_new2(sz);
      for (i = 0; i < sz; i++) {
          rb_ary_push(ary, rb_str_new2(list[i]));
      }
      __iconv_free_list(list, sz);

      if (!rb_block_given_p())
          return ary;
      for (i = 0; i < (size_t)RARRAY_LEN(ary); i++) {
          rb_yield(RARRAY_PTR(ary)[i]);
      }
  #endif
      return Qnil;
  }
  #else
  #define iconv_s_list rb_f_notimplement
  #endif
  759. /*
  760. * Document-method: close
  761. *
  762. * Finishes conversion.
  763. *
  764. * After calling this, calling Iconv#iconv will cause an exception, but
  765. * multiple calls of #close are guaranteed to end successfully.
  766. *
  767. * Returns a string containing the byte sequence to change the output buffer to
  768. * its initial shift state.
  769. */
  770. static VALUE
  771. iconv_init_state(VALUE self)
  772. {
  773. iconv_t cd = VALUE2ICONV((VALUE)DATA_PTR(self));
  774. DATA_PTR(self) = NULL;
  775. return iconv_convert(cd, Qnil, 0, 0, ENCODING_GET(self), NULL);
  776. }
  777. static VALUE
  778. iconv_finish(VALUE self)
  779. {
  780. VALUE cd = check_iconv(self);
  781. if (!cd) return Qnil;
  782. return rb_ensure(iconv_init_state, self, iconv_free, cd);
  783. }
  784. /*
  785. * Document-method: Iconv#iconv
  786. * call-seq: iconv(str, start=0, length=-1)
  787. *
  788. * Converts string and returns the result.
  789. * * If +str+ is a String, converts <tt>str[start, length]</tt> and returns the converted string.
  790. * * If +str+ is +nil+, places converter itself into initial shift state and
  791. * just returns a string containing the byte sequence to change the output
  792. * buffer to its initial shift state.
  793. * * Otherwise, raises an exception.
  794. *
  795. * === Parameters
  796. *
  797. * str:: string to be converted, or nil
  798. * start:: starting offset
  799. * length:: conversion length; nil or -1 means whole the string from start
  800. *
  801. * === Exceptions
  802. *
  803. * * IconvIllegalSequence
  804. * * IconvInvalidCharacter
  805. * * IconvOutOfRange
  806. *
  807. * === Examples
  808. *
  809. * See the Iconv documentation.
  810. */
  811. static VALUE
  812. iconv_iconv(int argc, VALUE *argv, VALUE self)
  813. {
  814. VALUE str, n1, n2;
  815. VALUE cd = check_iconv(self);
  816. long start = 0, length = 0, slen = 0;
  817. rb_scan_args(argc, argv, "12", &str, &n1, &n2);
  818. if (!NIL_P(str)) {
  819. VALUE n = rb_str_length(StringValue(str));
  820. slen = NUM2LONG(n);
  821. }
  822. if (argc != 2 || !RTEST(rb_range_beg_len(n1, &start, &length, slen, 0))) {
  823. if (NIL_P(n1) || ((start = NUM2LONG(n1)) < 0 ? (start += slen) >= 0 : start < slen)) {
  824. length = NIL_P(n2) ? -1 : NUM2LONG(n2);
  825. }
  826. }
  827. if (start > 0 || length > 0) {
  828. rb_encoding *enc = rb_enc_get(str);
  829. const char *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
  830. const char *ps = s;
  831. if (start > 0) {
  832. start = (ps = rb_enc_nth(s, e, start, enc)) - s;
  833. }
  834. if (length > 0) {
  835. length = rb_enc_nth(ps, e, length, enc) - ps;
  836. }
  837. }
  838. return iconv_convert(VALUE2ICONV(cd), str, start, length, ENCODING_GET(self), NULL);
  839. }
  840. /*
  841. * Document-method: conv
  842. * call-seq: conv(str...)
  843. *
  844. * Equivalent to
  845. *
  846. * iconv(nil, str..., nil).join
  847. */
  848. static VALUE
  849. iconv_conv(int argc, VALUE *argv, VALUE self)
  850. {
  851. iconv_t cd = VALUE2ICONV(check_iconv(self));
  852. VALUE str, s;
  853. int toidx = ENCODING_GET(self);
  854. str = iconv_convert(cd, Qnil, 0, 0, toidx, NULL);
  855. if (argc > 0) {
  856. do {
  857. s = iconv_convert(cd, *argv++, 0, -1, toidx, NULL);
  858. if (RSTRING_LEN(s))
  859. rb_str_buf_append(str, s);
  860. } while (--argc);
  861. s = iconv_convert(cd, Qnil, 0, 0, toidx, NULL);
  862. if (RSTRING_LEN(s))
  863. rb_str_buf_append(str, s);
  864. }
  865. return str;
  866. }
  #ifdef ICONV_TRIVIALP
  /*
   * Document-method: trivial?
   * call-seq: trivial?
   *
   * Returns the trivial flag.
   */
  static VALUE
  iconv_trivialp(VALUE self)
  {
      int flag = 0;

      /* iconvctl() stores the answer into flag; failure raises. */
      iconv_ctl(self, ICONV_TRIVIALP, flag);
      return flag ? Qtrue : Qfalse;
  }
  #else
  #define iconv_trivialp rb_f_notimplement
  #endif
  #ifdef ICONV_GET_TRANSLITERATE
  /*
   * Document-method: transliterate?
   * call-seq: transliterate?
   *
   * Returns the transliterate flag.
   */
  static VALUE
  iconv_get_transliterate(VALUE self)
  {
      int flag = 0;

      /* iconvctl() stores the answer into flag; failure raises. */
      iconv_ctl(self, ICONV_GET_TRANSLITERATE, flag);
      return flag ? Qtrue : Qfalse;
  }
  #else
  #define iconv_get_transliterate rb_f_notimplement
  #endif
  #ifdef ICONV_SET_TRANSLITERATE
  /*
   * Document-method: transliterate=
   * call-seq: cd.transliterate = flag
   *
   * Sets the transliterate flag.
   */
  static VALUE
  iconv_set_transliterate(VALUE self, VALUE transliterate)
  {
      /* iconvctl() takes an int: reduce the Ruby value to its truthiness. */
      int flag = RTEST(transliterate);

      iconv_ctl(self, ICONV_SET_TRANSLITERATE, flag);
      return self;
  }
  #else
  #define iconv_set_transliterate rb_f_notimplement
  #endif
  #ifdef ICONV_GET_DISCARD_ILSEQ
  /*
   * Document-method: discard_ilseq?
   * call-seq: discard_ilseq?
   *
   * Returns the discard_ilseq flag.
   */
  static VALUE
  iconv_get_discard_ilseq(VALUE self)
  {
      int flag = 0;

      /* iconvctl() stores the answer into flag; failure raises. */
      iconv_ctl(self, ICONV_GET_DISCARD_ILSEQ, flag);
      return flag ? Qtrue : Qfalse;
  }
  #else
  #define iconv_get_discard_ilseq rb_f_notimplement
  #endif
  #ifdef ICONV_SET_DISCARD_ILSEQ
  /*
   * Document-method: discard_ilseq=
   * call-seq: cd.discard_ilseq = flag
   *
   * Sets the discard_ilseq flag.
   */
  static VALUE
  iconv_set_discard_ilseq(VALUE self, VALUE discard_ilseq)
  {
      /* iconvctl() takes an int: reduce the Ruby value to its truthiness. */
      int flag = RTEST(discard_ilseq);

      iconv_ctl(self, ICONV_SET_DISCARD_ILSEQ, flag);
      return self;
  }
  #else
  #define iconv_set_discard_ilseq rb_f_notimplement
  #endif
  955. /*
  956. * Document-method: ctlmethods
  957. * call-seq: Iconv.ctlmethods => array
  958. *
  959. * Returns available iconvctl() method list.
  960. */
  961. static VALUE
  962. iconv_s_ctlmethods(VALUE klass)
  963. {
  964. VALUE ary = rb_ary_new();
  965. #ifdef ICONV_TRIVIALP
  966. rb_ary_push(ary, ID2SYM(rb_intern("trivial?")));
  967. #endif
  968. #ifdef ICONV_GET_TRANSLITERATE
  969. rb_ary_push(ary, ID2SYM(rb_intern("transliterate?")));
  970. #endif
  971. #ifdef ICONV_SET_TRANSLITERATE
  972. rb_ary_push(ary, ID2SYM(rb_intern("transliterate=")));
  973. #endif
  974. #ifdef ICONV_GET_DISCARD_ILSEQ
  975. rb_ary_push(ary, ID2SYM(rb_intern("discard_ilseq?")));
  976. #endif
  977. #ifdef ICONV_SET_DISCARD_ILSEQ
  978. rb_ary_push(ary, ID2SYM(rb_intern("discard_ilseq=")));
  979. #endif
  980. return ary;
  981. }
  982. /*
  983. * Document-class: Iconv::Failure
  984. *
  985. * Base attributes for Iconv exceptions.
  986. */
  987. /*
  988. * Document-method: success
  989. * call-seq: success
  990. *
  991. * Returns string(s) translated successfully until the exception occurred.
  992. * * In the case of failure occurred within Iconv.iconv, returned
  993. * value is an array of strings translated successfully preceding
  994. * failure and the last element is string on the way.
  995. */
  996. static VALUE
  997. iconv_failure_success(VALUE self)
  998. {
  999. return rb_attr_get(self, rb_success);
  1000. }
  1001. /*
  1002. * Document-method: failed
  1003. * call-seq: failed
  1004. *
  1005. * Returns substring of the original string passed to Iconv that starts at the
  1006. * character caused the exception.
  1007. */
  1008. static VALUE
  1009. iconv_failure_failed(VALUE self)
  1010. {
  1011. return rb_attr_get(self, rb_failed);
  1012. }
  1013. /*
  1014. * Document-method: inspect
  1015. * call-seq: inspect
  1016. *
  1017. * Returns inspected string like as: #<_class_: _success_, _failed_>
  1018. */
  1019. static VALUE
  1020. iconv_failure_inspect(VALUE self)
  1021. {
  1022. const char *cname = rb_class2name(CLASS_OF(self));
  1023. VALUE success = rb_attr_get(self, rb_success);
  1024. VALUE failed = rb_attr_get(self, rb_failed);
  1025. VALUE str = rb_str_buf_cat2(rb_str_new2("#<"), cname);
  1026. str = rb_str_buf_cat(str, ": ", 2);
  1027. str = rb_str_buf_append(str, rb_inspect(success));
  1028. str = rb_str_buf_cat(str, ", ", 2);
  1029. str = rb_str_buf_append(str, rb_inspect(failed));
  1030. return rb_str_buf_cat(str, ">", 1);
  1031. }
  1032. /*
  1033. * Document-class: Iconv::InvalidEncoding
  1034. *
  1035. * Requested coding-system is not available on this system.
  1036. */
  1037. /*
  1038. * Document-class: Iconv::IllegalSequence
  1039. *
  1040. * Input conversion stopped due to an input byte that does not belong to
  1041. * the input codeset, or the output codeset does not contain the
  1042. * character.
  1043. */
  1044. /*
  1045. * Document-class: Iconv::InvalidCharacter
  1046. *
  1047. * Input conversion stopped due to an incomplete character or shift
  1048. * sequence at the end of the input buffer.
  1049. */
  1050. /*
  1051. * Document-class: Iconv::OutOfRange
  1052. *
  1053. * Iconv library internal error. Must not occur.
  1054. */
  1055. /*
  1056. * Document-class: Iconv::BrokenLibrary
  1057. *
  1058. * Detected a bug in the underlying iconv(3) library.
  1059. * * returns an error without setting errno properly
  1060. */
  1061. void
  1062. Init_iconv(void)
  1063. {
  1064. VALUE rb_cIconv = rb_define_class("Iconv", rb_cData);
  1065. rb_define_alloc_func(rb_cIconv, iconv_s_allocate);
  1066. rb_define_singleton_method(rb_cIconv, "open", iconv_s_open, -1);
  1067. rb_define_singleton_method(rb_cIconv, "iconv", iconv_s_iconv, -1);
  1068. rb_define_singleton_method(rb_cIconv, "conv", iconv_s_conv, 3);
  1069. rb_define_singleton_method(rb_cIconv, "list", iconv_s_list, 0);
  1070. rb_define_singleton_method(rb_cIconv, "ctlmethods", iconv_s_ctlmethods, 0);
  1071. rb_define_method(rb_cIconv, "initialize", iconv_initialize, -1);
  1072. rb_define_method(rb_cIconv, "close", iconv_finish, 0);
  1073. rb_define_method(rb_cIconv, "iconv", iconv_iconv, -1);
  1074. rb_define_method(rb_cIconv, "conv", iconv_conv, -1);
  1075. rb_define_method(rb_cIconv, "trivial?", iconv_trivialp, 0);
  1076. rb_define_method(rb_cIconv, "transliterate?", iconv_get_transliterate, 0);
  1077. rb_define_method(rb_cIconv, "transliterate=", iconv_set_transliterate, 1);
  1078. rb_define_method(rb_cIconv, "discard_ilseq?", iconv_get_discard_ilseq, 0);
  1079. rb_define_method(rb_cIconv, "discard_ilseq=", iconv_set_discard_ilseq, 1);
  1080. rb_eIconvFailure = rb_define_module_under(rb_cIconv, "Failure");
  1081. rb_define_method(rb_eIconvFailure, "initialize", iconv_failure_initialize, 3);
  1082. rb_define_method(rb_eIconvFailure, "success", iconv_failure_success, 0);
  1083. rb_define_method(rb_eIconvFailure, "failed", iconv_failure_failed, 0);
  1084. rb_define_method(rb_eIconvFailure, "inspect", iconv_failure_inspect, 0);
  1085. rb_eIconvInvalidEncoding = rb_define_class_under(rb_cIconv, "InvalidEncoding", rb_eArgError);
  1086. rb_eIconvIllegalSeq = rb_define_class_under(rb_cIconv, "IllegalSequence", rb_eArgError);
  1087. rb_eIconvInvalidChar = rb_define_class_under(rb_cIconv, "InvalidCharacter", rb_eArgError);
  1088. rb_eIconvOutOfRange = rb_define_class_under(rb_cIconv, "OutOfRange", rb_eRuntimeError);
  1089. rb_eIconvBrokenLibrary = rb_define_class_under(rb_cIconv, "BrokenLibrary", rb_eRuntimeError);
  1090. rb_include_module(rb_eIconvInvalidEncoding, rb_eIconvFailure);
  1091. rb_include_module(rb_eIconvIllegalSeq, rb_eIconvFailure);
  1092. rb_include_module(rb_eIconvInvalidChar, rb_eIconvFailure);
  1093. rb_include_module(rb_eIconvOutOfRange, rb_eIconvFailure);
  1094. rb_include_module(rb_eIconvBrokenLibrary, rb_eIconvFailure);
  1095. rb_success = rb_intern("success");
  1096. rb_failed = rb_intern("failed");
  1097. id_transliterate = rb_intern("transliterate");
  1098. id_discard_ilseq = rb_intern("discard_ilseq");
  1099. rb_gc_register_address(&charset_map);
  1100. charset_map = rb_hash_new();
  1101. rb_define_singleton_method(rb_cIconv, "charset_map", charset_map_get, 0);
  1102. }