PageRenderTime 57ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 1ms

/string.c

https://github.com/vuxuandung/ruby
C | 8315 lines | 5858 code | 822 blank | 1635 comment | 1556 complexity | 8b89c4a4b18f27bf63623a27c3a99f6b MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, AGPL-3.0, 0BSD

Large files are truncated, but you can click here to view the full file

  1. /**********************************************************************
  2. string.c -
  3. $Author$
  4. created at: Mon Aug 9 17:12:58 JST 1993
  5. Copyright (C) 1993-2007 Yukihiro Matsumoto
  6. Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
  7. Copyright (C) 2000 Information-technology Promotion Agency, Japan
  8. **********************************************************************/
  9. #include "ruby/ruby.h"
  10. #include "ruby/re.h"
  11. #include "ruby/encoding.h"
  12. #include "vm_core.h"
  13. #include "internal.h"
  14. #include "probes.h"
  15. #include <assert.h>
  16. #define BEG(no) (regs->beg[(no)])
  17. #define END(no) (regs->end[(no)])
  18. #include <math.h>
  19. #include <ctype.h>
  20. #ifdef HAVE_UNISTD_H
  21. #include <unistd.h>
  22. #endif
  23. #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
  24. #undef rb_str_new_cstr
  25. #undef rb_tainted_str_new_cstr
  26. #undef rb_usascii_str_new_cstr
  27. #undef rb_external_str_new_cstr
  28. #undef rb_locale_str_new_cstr
  29. #undef rb_str_new2
  30. #undef rb_str_new3
  31. #undef rb_str_new4
  32. #undef rb_str_new5
  33. #undef rb_tainted_str_new2
  34. #undef rb_usascii_str_new2
  35. #undef rb_str_dup_frozen
  36. #undef rb_str_buf_new_cstr
  37. #undef rb_str_buf_new2
  38. #undef rb_str_buf_cat2
  39. #undef rb_str_cat2
  40. static VALUE rb_str_clear(VALUE str);
  41. VALUE rb_cString;
  42. VALUE rb_cSymbol;
  43. #define RUBY_MAX_CHAR_LEN 16
  44. #define STR_TMPLOCK FL_USER7
  45. #define STR_NOEMBED FL_USER1
  46. #define STR_SHARED FL_USER2 /* = ELTS_SHARED */
  47. #define STR_ASSOC FL_USER3
  48. #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
  49. #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC)
  50. #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
  51. #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
  52. #define STR_UNSET_NOCAPA(s) do {\
  53. if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
  54. } while (0)
  55. #define STR_SET_NOEMBED(str) do {\
  56. FL_SET((str), STR_NOEMBED);\
  57. STR_SET_EMBED_LEN((str), 0);\
  58. } while (0)
  59. #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
  60. #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
  61. #define STR_SET_EMBED_LEN(str, n) do { \
  62. long tmp_n = (n);\
  63. RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
  64. RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
  65. } while (0)
  66. #define STR_SET_LEN(str, n) do { \
  67. if (STR_EMBED_P(str)) {\
  68. STR_SET_EMBED_LEN((str), (n));\
  69. }\
  70. else {\
  71. RSTRING(str)->as.heap.len = (n);\
  72. }\
  73. } while (0)
  74. #define STR_DEC_LEN(str) do {\
  75. if (STR_EMBED_P(str)) {\
  76. long n = RSTRING_LEN(str);\
  77. n--;\
  78. STR_SET_EMBED_LEN((str), n);\
  79. }\
  80. else {\
  81. RSTRING(str)->as.heap.len--;\
  82. }\
  83. } while (0)
  84. #define RESIZE_CAPA(str,capacity) do {\
  85. if (STR_EMBED_P(str)) {\
  86. if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
  87. char *tmp = ALLOC_N(char, (capacity)+1);\
  88. memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
  89. RSTRING(str)->as.heap.ptr = tmp;\
  90. RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
  91. STR_SET_NOEMBED(str);\
  92. RSTRING(str)->as.heap.aux.capa = (capacity);\
  93. }\
  94. }\
  95. else {\
  96. REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
  97. if (!STR_NOCAPA_P(str))\
  98. RSTRING(str)->as.heap.aux.capa = (capacity);\
  99. }\
  100. } while (0)
  101. #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
  102. #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
  103. #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
  104. static inline int
  105. single_byte_optimizable(VALUE str)
  106. {
  107. rb_encoding *enc;
  108. /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
  109. if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
  110. return 1;
  111. enc = STR_ENC_GET(str);
  112. if (rb_enc_mbmaxlen(enc) == 1)
  113. return 1;
  114. /* Conservative. Possibly single byte.
  115. * "\xa1" in Shift_JIS for example. */
  116. return 0;
  117. }
  118. VALUE rb_fs;
  119. static inline const char *
  120. search_nonascii(const char *p, const char *e)
  121. {
  122. #if SIZEOF_VALUE == 8
  123. # define NONASCII_MASK 0x8080808080808080ULL
  124. #elif SIZEOF_VALUE == 4
  125. # define NONASCII_MASK 0x80808080UL
  126. #endif
  127. #ifdef NONASCII_MASK
  128. if ((int)sizeof(VALUE) * 2 < e - p) {
  129. const VALUE *s, *t;
  130. const VALUE lowbits = sizeof(VALUE) - 1;
  131. s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
  132. while (p < (const char *)s) {
  133. if (!ISASCII(*p))
  134. return p;
  135. p++;
  136. }
  137. t = (const VALUE*)(~lowbits & (VALUE)e);
  138. while (s < t) {
  139. if (*s & NONASCII_MASK) {
  140. t = s;
  141. break;
  142. }
  143. s++;
  144. }
  145. p = (const char *)t;
  146. }
  147. #endif
  148. while (p < e) {
  149. if (!ISASCII(*p))
  150. return p;
  151. p++;
  152. }
  153. return NULL;
  154. }
  155. static int
  156. coderange_scan(const char *p, long len, rb_encoding *enc)
  157. {
  158. const char *e = p + len;
  159. if (rb_enc_to_index(enc) == 0) {
  160. /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
  161. p = search_nonascii(p, e);
  162. return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
  163. }
  164. if (rb_enc_asciicompat(enc)) {
  165. p = search_nonascii(p, e);
  166. if (!p) {
  167. return ENC_CODERANGE_7BIT;
  168. }
  169. while (p < e) {
  170. int ret = rb_enc_precise_mbclen(p, e, enc);
  171. if (!MBCLEN_CHARFOUND_P(ret)) {
  172. return ENC_CODERANGE_BROKEN;
  173. }
  174. p += MBCLEN_CHARFOUND_LEN(ret);
  175. if (p < e) {
  176. p = search_nonascii(p, e);
  177. if (!p) {
  178. return ENC_CODERANGE_VALID;
  179. }
  180. }
  181. }
  182. if (e < p) {
  183. return ENC_CODERANGE_BROKEN;
  184. }
  185. return ENC_CODERANGE_VALID;
  186. }
  187. while (p < e) {
  188. int ret = rb_enc_precise_mbclen(p, e, enc);
  189. if (!MBCLEN_CHARFOUND_P(ret)) {
  190. return ENC_CODERANGE_BROKEN;
  191. }
  192. p += MBCLEN_CHARFOUND_LEN(ret);
  193. }
  194. if (e < p) {
  195. return ENC_CODERANGE_BROKEN;
  196. }
  197. return ENC_CODERANGE_VALID;
  198. }
  199. long
  200. rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
  201. {
  202. const char *p = s;
  203. if (*cr == ENC_CODERANGE_BROKEN)
  204. return e - s;
  205. if (rb_enc_to_index(enc) == 0) {
  206. /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
  207. p = search_nonascii(p, e);
  208. *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
  209. return e - s;
  210. }
  211. else if (rb_enc_asciicompat(enc)) {
  212. p = search_nonascii(p, e);
  213. if (!p) {
  214. if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
  215. return e - s;
  216. }
  217. while (p < e) {
  218. int ret = rb_enc_precise_mbclen(p, e, enc);
  219. if (!MBCLEN_CHARFOUND_P(ret)) {
  220. *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
  221. return p - s;
  222. }
  223. p += MBCLEN_CHARFOUND_LEN(ret);
  224. if (p < e) {
  225. p = search_nonascii(p, e);
  226. if (!p) {
  227. *cr = ENC_CODERANGE_VALID;
  228. return e - s;
  229. }
  230. }
  231. }
  232. *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
  233. return p - s;
  234. }
  235. else {
  236. while (p < e) {
  237. int ret = rb_enc_precise_mbclen(p, e, enc);
  238. if (!MBCLEN_CHARFOUND_P(ret)) {
  239. *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
  240. return p - s;
  241. }
  242. p += MBCLEN_CHARFOUND_LEN(ret);
  243. }
  244. *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
  245. return p - s;
  246. }
  247. }
  248. static inline void
  249. str_enc_copy(VALUE str1, VALUE str2)
  250. {
  251. rb_enc_set_index(str1, ENCODING_GET(str2));
  252. }
  253. static void
  254. rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
  255. {
  256. /* this function is designed for copying encoding and coderange
  257. * from src to new string "dest" which is made from the part of src.
  258. */
  259. str_enc_copy(dest, src);
  260. if (RSTRING_LEN(dest) == 0) {
  261. if (!rb_enc_asciicompat(STR_ENC_GET(src)))
  262. ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
  263. else
  264. ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
  265. return;
  266. }
  267. switch (ENC_CODERANGE(src)) {
  268. case ENC_CODERANGE_7BIT:
  269. ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
  270. break;
  271. case ENC_CODERANGE_VALID:
  272. if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
  273. search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
  274. ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
  275. else
  276. ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
  277. break;
  278. default:
  279. break;
  280. }
  281. }
  282. static void
  283. rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
  284. {
  285. str_enc_copy(dest, src);
  286. ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
  287. }
  288. int
  289. rb_enc_str_coderange(VALUE str)
  290. {
  291. int cr = ENC_CODERANGE(str);
  292. if (cr == ENC_CODERANGE_UNKNOWN) {
  293. rb_encoding *enc = STR_ENC_GET(str);
  294. cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
  295. ENC_CODERANGE_SET(str, cr);
  296. }
  297. return cr;
  298. }
  299. int
  300. rb_enc_str_asciionly_p(VALUE str)
  301. {
  302. rb_encoding *enc = STR_ENC_GET(str);
  303. if (!rb_enc_asciicompat(enc))
  304. return FALSE;
  305. else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
  306. return TRUE;
  307. return FALSE;
  308. }
  309. static inline void
  310. str_mod_check(VALUE s, const char *p, long len)
  311. {
  312. if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
  313. rb_raise(rb_eRuntimeError, "string modified");
  314. }
  315. }
  316. size_t
  317. rb_str_capacity(VALUE str)
  318. {
  319. if (STR_EMBED_P(str)) {
  320. return RSTRING_EMBED_LEN_MAX;
  321. }
  322. else if (STR_NOCAPA_P(str)) {
  323. return RSTRING(str)->as.heap.len;
  324. }
  325. else {
  326. return RSTRING(str)->as.heap.aux.capa;
  327. }
  328. }
  329. static inline VALUE
  330. str_alloc(VALUE klass)
  331. {
  332. NEWOBJ_OF(str, struct RString, klass, T_STRING);
  333. str->as.heap.ptr = 0;
  334. str->as.heap.len = 0;
  335. str->as.heap.aux.capa = 0;
  336. return (VALUE)str;
  337. }
  338. static inline VALUE
  339. empty_str_alloc(VALUE klass)
  340. {
  341. if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
  342. RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
  343. }
  344. return str_alloc(klass);
  345. }
  346. static VALUE
  347. str_new(VALUE klass, const char *ptr, long len)
  348. {
  349. VALUE str;
  350. if (len < 0) {
  351. rb_raise(rb_eArgError, "negative string size (or size too big)");
  352. }
  353. if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
  354. RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
  355. }
  356. str = str_alloc(klass);
  357. if (len > RSTRING_EMBED_LEN_MAX) {
  358. RSTRING(str)->as.heap.aux.capa = len;
  359. RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
  360. STR_SET_NOEMBED(str);
  361. }
  362. else if (len == 0) {
  363. ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
  364. }
  365. if (ptr) {
  366. memcpy(RSTRING_PTR(str), ptr, len);
  367. }
  368. STR_SET_LEN(str, len);
  369. RSTRING_PTR(str)[len] = '\0';
  370. return str;
  371. }
  372. VALUE
  373. rb_str_new(const char *ptr, long len)
  374. {
  375. return str_new(rb_cString, ptr, len);
  376. }
  377. VALUE
  378. rb_usascii_str_new(const char *ptr, long len)
  379. {
  380. VALUE str = rb_str_new(ptr, len);
  381. ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
  382. return str;
  383. }
  384. VALUE
  385. rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
  386. {
  387. VALUE str = rb_str_new(ptr, len);
  388. rb_enc_associate(str, enc);
  389. return str;
  390. }
  391. VALUE
  392. rb_str_new_cstr(const char *ptr)
  393. {
  394. if (!ptr) {
  395. rb_raise(rb_eArgError, "NULL pointer given");
  396. }
  397. return rb_str_new(ptr, strlen(ptr));
  398. }
  399. RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
  400. #define rb_str_new2 rb_str_new_cstr
  401. VALUE
  402. rb_usascii_str_new_cstr(const char *ptr)
  403. {
  404. VALUE str = rb_str_new2(ptr);
  405. ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
  406. return str;
  407. }
  408. RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
  409. #define rb_usascii_str_new2 rb_usascii_str_new_cstr
  410. VALUE
  411. rb_tainted_str_new(const char *ptr, long len)
  412. {
  413. VALUE str = rb_str_new(ptr, len);
  414. OBJ_TAINT(str);
  415. return str;
  416. }
  417. VALUE
  418. rb_tainted_str_new_cstr(const char *ptr)
  419. {
  420. VALUE str = rb_str_new2(ptr);
  421. OBJ_TAINT(str);
  422. return str;
  423. }
  424. RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
  425. #define rb_tainted_str_new2 rb_tainted_str_new_cstr
  426. VALUE
  427. rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
  428. {
  429. rb_econv_t *ec;
  430. rb_econv_result_t ret;
  431. long len;
  432. VALUE newstr;
  433. const unsigned char *sp;
  434. unsigned char *dp;
  435. if (!to) return str;
  436. if (!from) from = rb_enc_get(str);
  437. if (from == to) return str;
  438. if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
  439. to == rb_ascii8bit_encoding()) {
  440. if (STR_ENC_GET(str) != to) {
  441. str = rb_str_dup(str);
  442. rb_enc_associate(str, to);
  443. }
  444. return str;
  445. }
  446. len = RSTRING_LEN(str);
  447. newstr = rb_str_new(0, len);
  448. retry:
  449. ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
  450. if (!ec) return str;
  451. sp = (unsigned char*)RSTRING_PTR(str);
  452. dp = (unsigned char*)RSTRING_PTR(newstr);
  453. ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
  454. &dp, (unsigned char*)RSTRING_END(newstr), 0);
  455. rb_econv_close(ec);
  456. switch (ret) {
  457. case econv_destination_buffer_full:
  458. /* destination buffer short */
  459. len = len < 2 ? 2 : len * 2;
  460. rb_str_resize(newstr, len);
  461. goto retry;
  462. case econv_finished:
  463. len = dp - (unsigned char*)RSTRING_PTR(newstr);
  464. rb_str_set_len(newstr, len);
  465. rb_enc_associate(newstr, to);
  466. return newstr;
  467. default:
  468. /* some error, return original */
  469. return str;
  470. }
  471. }
  472. VALUE
  473. rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
  474. {
  475. return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
  476. }
  477. VALUE
  478. rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
  479. {
  480. VALUE str;
  481. str = rb_tainted_str_new(ptr, len);
  482. if (eenc == rb_usascii_encoding() &&
  483. rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
  484. rb_enc_associate(str, rb_ascii8bit_encoding());
  485. return str;
  486. }
  487. rb_enc_associate(str, eenc);
  488. return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
  489. }
  490. VALUE
  491. rb_external_str_new(const char *ptr, long len)
  492. {
  493. return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
  494. }
  495. VALUE
  496. rb_external_str_new_cstr(const char *ptr)
  497. {
  498. return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
  499. }
  500. VALUE
  501. rb_locale_str_new(const char *ptr, long len)
  502. {
  503. return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
  504. }
  505. VALUE
  506. rb_locale_str_new_cstr(const char *ptr)
  507. {
  508. return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
  509. }
  510. VALUE
  511. rb_filesystem_str_new(const char *ptr, long len)
  512. {
  513. return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
  514. }
  515. VALUE
  516. rb_filesystem_str_new_cstr(const char *ptr)
  517. {
  518. return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
  519. }
  520. VALUE
  521. rb_str_export(VALUE str)
  522. {
  523. return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
  524. }
  525. VALUE
  526. rb_str_export_locale(VALUE str)
  527. {
  528. return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
  529. }
  530. VALUE
  531. rb_str_export_to_enc(VALUE str, rb_encoding *enc)
  532. {
  533. return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
  534. }
  535. static VALUE
  536. str_replace_shared_without_enc(VALUE str2, VALUE str)
  537. {
  538. if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
  539. STR_SET_EMBED(str2);
  540. memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
  541. STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
  542. }
  543. else {
  544. str = rb_str_new_frozen(str);
  545. FL_SET(str2, STR_NOEMBED);
  546. RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
  547. RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
  548. RSTRING(str2)->as.heap.aux.shared = str;
  549. FL_SET(str2, ELTS_SHARED);
  550. }
  551. return str2;
  552. }
  553. static VALUE
  554. str_replace_shared(VALUE str2, VALUE str)
  555. {
  556. str_replace_shared_without_enc(str2, str);
  557. rb_enc_cr_str_exact_copy(str2, str);
  558. return str2;
  559. }
  560. static VALUE
  561. str_new_shared(VALUE klass, VALUE str)
  562. {
  563. return str_replace_shared(str_alloc(klass), str);
  564. }
  565. static VALUE
  566. str_new3(VALUE klass, VALUE str)
  567. {
  568. return str_new_shared(klass, str);
  569. }
  570. VALUE
  571. rb_str_new_shared(VALUE str)
  572. {
  573. VALUE str2 = str_new3(rb_obj_class(str), str);
  574. OBJ_INFECT(str2, str);
  575. return str2;
  576. }
  577. RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
  578. #define rb_str_new3 rb_str_new_shared
  579. static VALUE
  580. str_new4(VALUE klass, VALUE str)
  581. {
  582. VALUE str2;
  583. str2 = str_alloc(klass);
  584. STR_SET_NOEMBED(str2);
  585. RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
  586. RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
  587. if (STR_SHARED_P(str)) {
  588. VALUE shared = RSTRING(str)->as.heap.aux.shared;
  589. assert(OBJ_FROZEN(shared));
  590. FL_SET(str2, ELTS_SHARED);
  591. RSTRING(str2)->as.heap.aux.shared = shared;
  592. }
  593. else {
  594. FL_SET(str, ELTS_SHARED);
  595. RSTRING(str)->as.heap.aux.shared = str2;
  596. }
  597. rb_enc_cr_str_exact_copy(str2, str);
  598. OBJ_INFECT(str2, str);
  599. return str2;
  600. }
  601. VALUE
  602. rb_str_new_frozen(VALUE orig)
  603. {
  604. VALUE klass, str;
  605. if (OBJ_FROZEN(orig)) return orig;
  606. klass = rb_obj_class(orig);
  607. if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
  608. long ofs;
  609. assert(OBJ_FROZEN(str));
  610. ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
  611. if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
  612. ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) ||
  613. ENCODING_GET(str) != ENCODING_GET(orig)) {
  614. str = str_new3(klass, str);
  615. RSTRING(str)->as.heap.ptr += ofs;
  616. RSTRING(str)->as.heap.len -= ofs;
  617. rb_enc_cr_str_exact_copy(str, orig);
  618. OBJ_INFECT(str, orig);
  619. }
  620. }
  621. else if (STR_EMBED_P(orig)) {
  622. str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
  623. rb_enc_cr_str_exact_copy(str, orig);
  624. OBJ_INFECT(str, orig);
  625. }
  626. else if (STR_ASSOC_P(orig)) {
  627. VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
  628. FL_UNSET(orig, STR_ASSOC);
  629. str = str_new4(klass, orig);
  630. FL_SET(str, STR_ASSOC);
  631. RSTRING(str)->as.heap.aux.shared = assoc;
  632. }
  633. else {
  634. str = str_new4(klass, orig);
  635. }
  636. OBJ_FREEZE(str);
  637. return str;
  638. }
  639. RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
  640. #define rb_str_new4 rb_str_new_frozen
  641. VALUE
  642. rb_str_new_with_class(VALUE obj, const char *ptr, long len)
  643. {
  644. return str_new(rb_obj_class(obj), ptr, len);
  645. }
  646. RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
  647. rb_str_new_with_class, (obj, ptr, len))
  648. #define rb_str_new5 rb_str_new_with_class
  649. static VALUE
  650. str_new_empty(VALUE str)
  651. {
  652. VALUE v = rb_str_new5(str, 0, 0);
  653. rb_enc_copy(v, str);
  654. OBJ_INFECT(v, str);
  655. return v;
  656. }
  657. #define STR_BUF_MIN_SIZE 128
  658. VALUE
  659. rb_str_buf_new(long capa)
  660. {
  661. VALUE str = str_alloc(rb_cString);
  662. if (capa < STR_BUF_MIN_SIZE) {
  663. capa = STR_BUF_MIN_SIZE;
  664. }
  665. FL_SET(str, STR_NOEMBED);
  666. RSTRING(str)->as.heap.aux.capa = capa;
  667. RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
  668. RSTRING(str)->as.heap.ptr[0] = '\0';
  669. return str;
  670. }
  671. VALUE
  672. rb_str_buf_new_cstr(const char *ptr)
  673. {
  674. VALUE str;
  675. long len = strlen(ptr);
  676. str = rb_str_buf_new(len);
  677. rb_str_buf_cat(str, ptr, len);
  678. return str;
  679. }
  680. RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
  681. #define rb_str_buf_new2 rb_str_buf_new_cstr
  682. VALUE
  683. rb_str_tmp_new(long len)
  684. {
  685. return str_new(0, 0, len);
  686. }
  687. void *
  688. rb_alloc_tmp_buffer(volatile VALUE *store, long len)
  689. {
  690. VALUE s = rb_str_tmp_new(len);
  691. *store = s;
  692. return RSTRING_PTR(s);
  693. }
  694. void
  695. rb_free_tmp_buffer(volatile VALUE *store)
  696. {
  697. VALUE s = *store;
  698. *store = 0;
  699. if (s) rb_str_clear(s);
  700. }
  701. void
  702. rb_str_free(VALUE str)
  703. {
  704. if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
  705. xfree(RSTRING(str)->as.heap.ptr);
  706. }
  707. }
  708. RUBY_FUNC_EXPORTED size_t
  709. rb_str_memsize(VALUE str)
  710. {
  711. if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
  712. return RSTRING(str)->as.heap.aux.capa;
  713. }
  714. else {
  715. return 0;
  716. }
  717. }
  718. VALUE
  719. rb_str_to_str(VALUE str)
  720. {
  721. return rb_convert_type(str, T_STRING, "String", "to_str");
  722. }
  723. static inline void str_discard(VALUE str);
  724. void
  725. rb_str_shared_replace(VALUE str, VALUE str2)
  726. {
  727. rb_encoding *enc;
  728. int cr;
  729. if (str == str2) return;
  730. enc = STR_ENC_GET(str2);
  731. cr = ENC_CODERANGE(str2);
  732. str_discard(str);
  733. OBJ_INFECT(str, str2);
  734. if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
  735. STR_SET_EMBED(str);
  736. memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
  737. STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
  738. rb_enc_associate(str, enc);
  739. ENC_CODERANGE_SET(str, cr);
  740. return;
  741. }
  742. STR_SET_NOEMBED(str);
  743. STR_UNSET_NOCAPA(str);
  744. RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
  745. RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
  746. if (STR_NOCAPA_P(str2)) {
  747. FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
  748. RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
  749. }
  750. else {
  751. RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
  752. }
  753. STR_SET_EMBED(str2); /* abandon str2 */
  754. RSTRING_PTR(str2)[0] = 0;
  755. STR_SET_EMBED_LEN(str2, 0);
  756. rb_enc_associate(str, enc);
  757. ENC_CODERANGE_SET(str, cr);
  758. }
  759. static ID id_to_s;
  760. VALUE
  761. rb_obj_as_string(VALUE obj)
  762. {
  763. VALUE str;
  764. if (RB_TYPE_P(obj, T_STRING)) {
  765. return obj;
  766. }
  767. str = rb_funcall(obj, id_to_s, 0);
  768. if (!RB_TYPE_P(str, T_STRING))
  769. return rb_any_to_s(obj);
  770. if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
  771. return str;
  772. }
  773. static VALUE
  774. str_replace(VALUE str, VALUE str2)
  775. {
  776. long len;
  777. len = RSTRING_LEN(str2);
  778. if (STR_ASSOC_P(str2)) {
  779. str2 = rb_str_new4(str2);
  780. }
  781. if (STR_SHARED_P(str2)) {
  782. VALUE shared = RSTRING(str2)->as.heap.aux.shared;
  783. assert(OBJ_FROZEN(shared));
  784. STR_SET_NOEMBED(str);
  785. RSTRING(str)->as.heap.len = len;
  786. RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
  787. FL_SET(str, ELTS_SHARED);
  788. FL_UNSET(str, STR_ASSOC);
  789. RSTRING(str)->as.heap.aux.shared = shared;
  790. }
  791. else {
  792. str_replace_shared(str, str2);
  793. }
  794. OBJ_INFECT(str, str2);
  795. rb_enc_cr_str_exact_copy(str, str2);
  796. return str;
  797. }
  798. static VALUE
  799. str_duplicate(VALUE klass, VALUE str)
  800. {
  801. VALUE dup = str_alloc(klass);
  802. str_replace(dup, str);
  803. return dup;
  804. }
  805. VALUE
  806. rb_str_dup(VALUE str)
  807. {
  808. return str_duplicate(rb_obj_class(str), str);
  809. }
  810. VALUE
  811. rb_str_resurrect(VALUE str)
  812. {
  813. if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
  814. RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
  815. rb_sourcefile(), rb_sourceline());
  816. }
  817. return str_replace(str_alloc(rb_cString), str);
  818. }
  819. /*
  820. * call-seq:
  821. * String.new(str="") -> new_str
  822. *
  823. * Returns a new string object containing a copy of <i>str</i>.
  824. */
  825. static VALUE
  826. rb_str_init(int argc, VALUE *argv, VALUE str)
  827. {
  828. VALUE orig;
  829. if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
  830. rb_str_replace(str, orig);
  831. return str;
  832. }
  833. static inline long
  834. enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
  835. {
  836. long c;
  837. const char *q;
  838. if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
  839. return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
  840. }
  841. else if (rb_enc_asciicompat(enc)) {
  842. c = 0;
  843. if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
  844. while (p < e) {
  845. if (ISASCII(*p)) {
  846. q = search_nonascii(p, e);
  847. if (!q)
  848. return c + (e - p);
  849. c += q - p;
  850. p = q;
  851. }
  852. p += rb_enc_fast_mbclen(p, e, enc);
  853. c++;
  854. }
  855. }
  856. else {
  857. while (p < e) {
  858. if (ISASCII(*p)) {
  859. q = search_nonascii(p, e);
  860. if (!q)
  861. return c + (e - p);
  862. c += q - p;
  863. p = q;
  864. }
  865. p += rb_enc_mbclen(p, e, enc);
  866. c++;
  867. }
  868. }
  869. return c;
  870. }
  871. for (c=0; p<e; c++) {
  872. p += rb_enc_mbclen(p, e, enc);
  873. }
  874. return c;
  875. }
  876. long
  877. rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
  878. {
  879. return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
  880. }
  881. long
  882. rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
  883. {
  884. long c;
  885. const char *q;
  886. int ret;
  887. *cr = 0;
  888. if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
  889. return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
  890. }
  891. else if (rb_enc_asciicompat(enc)) {
  892. c = 0;
  893. while (p < e) {
  894. if (ISASCII(*p)) {
  895. q = search_nonascii(p, e);
  896. if (!q) {
  897. if (!*cr) *cr = ENC_CODERANGE_7BIT;
  898. return c + (e - p);
  899. }
  900. c += q - p;
  901. p = q;
  902. }
  903. ret = rb_enc_precise_mbclen(p, e, enc);
  904. if (MBCLEN_CHARFOUND_P(ret)) {
  905. *cr |= ENC_CODERANGE_VALID;
  906. p += MBCLEN_CHARFOUND_LEN(ret);
  907. }
  908. else {
  909. *cr = ENC_CODERANGE_BROKEN;
  910. p++;
  911. }
  912. c++;
  913. }
  914. if (!*cr) *cr = ENC_CODERANGE_7BIT;
  915. return c;
  916. }
  917. for (c=0; p<e; c++) {
  918. ret = rb_enc_precise_mbclen(p, e, enc);
  919. if (MBCLEN_CHARFOUND_P(ret)) {
  920. *cr |= ENC_CODERANGE_VALID;
  921. p += MBCLEN_CHARFOUND_LEN(ret);
  922. }
  923. else {
  924. *cr = ENC_CODERANGE_BROKEN;
  925. if (p + rb_enc_mbminlen(enc) <= e)
  926. p += rb_enc_mbminlen(enc);
  927. else
  928. p = e;
  929. }
  930. }
  931. if (!*cr) *cr = ENC_CODERANGE_7BIT;
  932. return c;
  933. }
  934. #ifdef NONASCII_MASK
  935. #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
  936. /*
  937. * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
  938. * bit represention. (see http://en.wikipedia.org/wiki/UTF-8)
  939. * Therefore, following pseudo code can detect UTF-8 leading byte.
  940. *
  941. * if (!(byte & 0x80))
  942. * byte |= 0x40; // turn on bit6
  943. * return ((byte>>6) & 1); // bit6 represent it's leading byte or not.
  944. *
  945. * This function calculate every bytes in the argument word `s'
  946. * using the above logic concurrently. and gather every bytes result.
  947. */
  948. static inline VALUE
  949. count_utf8_lead_bytes_with_word(const VALUE *s)
  950. {
  951. VALUE d = *s;
  952. /* Transform into bit0 represent UTF-8 leading or not. */
  953. d |= ~(d>>1);
  954. d >>= 6;
  955. d &= NONASCII_MASK >> 7;
  956. /* Gather every bytes. */
  957. d += (d>>8);
  958. d += (d>>16);
  959. #if SIZEOF_VALUE == 8
  960. d += (d>>32);
  961. #endif
  962. return (d&0xF);
  963. }
  964. #endif
  965. static long
  966. str_strlen(VALUE str, rb_encoding *enc)
  967. {
  968. const char *p, *e;
  969. long n;
  970. int cr;
  971. if (single_byte_optimizable(str)) return RSTRING_LEN(str);
  972. if (!enc) enc = STR_ENC_GET(str);
  973. p = RSTRING_PTR(str);
  974. e = RSTRING_END(str);
  975. cr = ENC_CODERANGE(str);
  976. #ifdef NONASCII_MASK
  977. if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
  978. enc == rb_utf8_encoding()) {
  979. VALUE len = 0;
  980. if ((int)sizeof(VALUE) * 2 < e - p) {
  981. const VALUE *s, *t;
  982. const VALUE lowbits = sizeof(VALUE) - 1;
  983. s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
  984. t = (const VALUE*)(~lowbits & (VALUE)e);
  985. while (p < (const char *)s) {
  986. if (is_utf8_lead_byte(*p)) len++;
  987. p++;
  988. }
  989. while (s < t) {
  990. len += count_utf8_lead_bytes_with_word(s);
  991. s++;
  992. }
  993. p = (const char *)s;
  994. }
  995. while (p < e) {
  996. if (is_utf8_lead_byte(*p)) len++;
  997. p++;
  998. }
  999. return (long)len;
  1000. }
  1001. #endif
  1002. n = rb_enc_strlen_cr(p, e, enc, &cr);
  1003. if (cr) {
  1004. ENC_CODERANGE_SET(str, cr);
  1005. }
  1006. return n;
  1007. }
  1008. long
  1009. rb_str_strlen(VALUE str)
  1010. {
  1011. return str_strlen(str, STR_ENC_GET(str));
  1012. }
  1013. /*
  1014. * call-seq:
  1015. * str.length -> integer
  1016. * str.size -> integer
  1017. *
  1018. * Returns the character length of <i>str</i>.
  1019. */
  1020. VALUE
  1021. rb_str_length(VALUE str)
  1022. {
  1023. long len;
  1024. len = str_strlen(str, STR_ENC_GET(str));
  1025. return LONG2NUM(len);
  1026. }
  1027. /*
  1028. * call-seq:
  1029. * str.bytesize -> integer
  1030. *
  1031. * Returns the length of +str+ in bytes.
  1032. *
  1033. * "\x80\u3042".bytesize #=> 4
  1034. * "hello".bytesize #=> 5
  1035. */
  1036. static VALUE
  1037. rb_str_bytesize(VALUE str)
  1038. {
  1039. return LONG2NUM(RSTRING_LEN(str));
  1040. }
  1041. /*
  1042. * call-seq:
  1043. * str.empty? -> true or false
  1044. *
  1045. * Returns <code>true</code> if <i>str</i> has a length of zero.
  1046. *
  1047. * "hello".empty? #=> false
  1048. * " ".empty? #=> false
  1049. * "".empty? #=> true
  1050. */
  1051. static VALUE
  1052. rb_str_empty(VALUE str)
  1053. {
  1054. if (RSTRING_LEN(str) == 0)
  1055. return Qtrue;
  1056. return Qfalse;
  1057. }
  1058. /*
  1059. * call-seq:
  1060. * str + other_str -> new_str
  1061. *
  1062. * Concatenation---Returns a new <code>String</code> containing
  1063. * <i>other_str</i> concatenated to <i>str</i>.
  1064. *
  1065. * "Hello from " + self.to_s #=> "Hello from main"
  1066. */
  1067. VALUE
  1068. rb_str_plus(VALUE str1, VALUE str2)
  1069. {
  1070. VALUE str3;
  1071. rb_encoding *enc;
  1072. StringValue(str2);
  1073. enc = rb_enc_check(str1, str2);
  1074. str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
  1075. memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
  1076. memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
  1077. RSTRING_PTR(str2), RSTRING_LEN(str2));
  1078. RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
  1079. if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
  1080. OBJ_TAINT(str3);
  1081. ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
  1082. ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
  1083. return str3;
  1084. }
  1085. /*
  1086. * call-seq:
  1087. * str * integer -> new_str
  1088. *
  1089. * Copy --- Returns a new String containing +integer+ copies of the receiver.
  1090. * +integer+ must be greater than or equal to 0.
  1091. *
  1092. * "Ho! " * 3 #=> "Ho! Ho! Ho! "
  1093. * "Ho! " * 0 #=> ""
  1094. */
  1095. VALUE
  1096. rb_str_times(VALUE str, VALUE times)
  1097. {
  1098. VALUE str2;
  1099. long n, len;
  1100. char *ptr2;
  1101. len = NUM2LONG(times);
  1102. if (len < 0) {
  1103. rb_raise(rb_eArgError, "negative argument");
  1104. }
  1105. if (len && LONG_MAX/len < RSTRING_LEN(str)) {
  1106. rb_raise(rb_eArgError, "argument too big");
  1107. }
  1108. str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
  1109. ptr2 = RSTRING_PTR(str2);
  1110. if (len) {
  1111. n = RSTRING_LEN(str);
  1112. memcpy(ptr2, RSTRING_PTR(str), n);
  1113. while (n <= len/2) {
  1114. memcpy(ptr2 + n, ptr2, n);
  1115. n *= 2;
  1116. }
  1117. memcpy(ptr2 + n, ptr2, len-n);
  1118. }
  1119. ptr2[RSTRING_LEN(str2)] = '\0';
  1120. OBJ_INFECT(str2, str);
  1121. rb_enc_cr_str_copy_for_substr(str2, str);
  1122. return str2;
  1123. }
  1124. /*
  1125. * call-seq:
  1126. * str % arg -> new_str
  1127. *
  1128. * Format---Uses <i>str</i> as a format specification, and returns the result
  1129. * of applying it to <i>arg</i>. If the format specification contains more than
  1130. * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
  1131. * containing the values to be substituted. See <code>Kernel::sprintf</code> for
  1132. * details of the format string.
  1133. *
  1134. * "%05d" % 123 #=> "00123"
  1135. * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6"
  1136. * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
  1137. */
  1138. static VALUE
  1139. rb_str_format_m(VALUE str, VALUE arg)
  1140. {
  1141. volatile VALUE tmp = rb_check_array_type(arg);
  1142. if (!NIL_P(tmp)) {
  1143. return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
  1144. }
  1145. return rb_str_format(1, &arg, str);
  1146. }
  1147. static inline void
  1148. str_modifiable(VALUE str)
  1149. {
  1150. if (FL_TEST(str, STR_TMPLOCK)) {
  1151. rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
  1152. }
  1153. rb_check_frozen(str);
  1154. if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
  1155. rb_raise(rb_eSecurityError, "Insecure: can't modify string");
  1156. }
  1157. static inline int
  1158. str_independent(VALUE str)
  1159. {
  1160. str_modifiable(str);
  1161. if (!STR_SHARED_P(str)) return 1;
  1162. if (STR_EMBED_P(str)) return 1;
  1163. return 0;
  1164. }
  1165. static void
  1166. str_make_independent_expand(VALUE str, long expand)
  1167. {
  1168. char *ptr;
  1169. long len = RSTRING_LEN(str);
  1170. long capa = len + expand;
  1171. if (len > capa) len = capa;
  1172. ptr = ALLOC_N(char, capa + 1);
  1173. if (RSTRING_PTR(str)) {
  1174. memcpy(ptr, RSTRING_PTR(str), len);
  1175. }
  1176. STR_SET_NOEMBED(str);
  1177. STR_UNSET_NOCAPA(str);
  1178. ptr[len] = 0;
  1179. RSTRING(str)->as.heap.ptr = ptr;
  1180. RSTRING(str)->as.heap.len = len;
  1181. RSTRING(str)->as.heap.aux.capa = capa;
  1182. }
  1183. #define str_make_independent(str) str_make_independent_expand((str), 0L)
  1184. void
  1185. rb_str_modify(VALUE str)
  1186. {
  1187. if (!str_independent(str))
  1188. str_make_independent(str);
  1189. ENC_CODERANGE_CLEAR(str);
  1190. }
  1191. void
  1192. rb_str_modify_expand(VALUE str, long expand)
  1193. {
  1194. if (expand < 0) {
  1195. rb_raise(rb_eArgError, "negative expanding string size");
  1196. }
  1197. if (!str_independent(str)) {
  1198. str_make_independent_expand(str, expand);
  1199. }
  1200. else if (expand > 0) {
  1201. long len = RSTRING_LEN(str);
  1202. long capa = len + expand;
  1203. if (!STR_EMBED_P(str)) {
  1204. REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
  1205. RSTRING(str)->as.heap.aux.capa = capa;
  1206. }
  1207. else if (capa > RSTRING_EMBED_LEN_MAX) {
  1208. str_make_independent_expand(str, expand);
  1209. }
  1210. }
  1211. ENC_CODERANGE_CLEAR(str);
  1212. }
  1213. /* As rb_str_modify(), but don't clear coderange */
  1214. static void
  1215. str_modify_keep_cr(VALUE str)
  1216. {
  1217. if (!str_independent(str))
  1218. str_make_independent(str);
  1219. if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
  1220. /* Force re-scan later */
  1221. ENC_CODERANGE_CLEAR(str);
  1222. }
  1223. static inline void
  1224. str_discard(VALUE str)
  1225. {
  1226. str_modifiable(str);
  1227. if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
  1228. xfree(RSTRING_PTR(str));
  1229. RSTRING(str)->as.heap.ptr = 0;
  1230. RSTRING(str)->as.heap.len = 0;
  1231. }
  1232. }
  1233. void
  1234. rb_str_associate(VALUE str, VALUE add)
  1235. {
  1236. /* sanity check */
  1237. rb_check_frozen(str);
  1238. if (STR_ASSOC_P(str)) {
  1239. /* already associated */
  1240. rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
  1241. }
  1242. else {
  1243. if (STR_SHARED_P(str)) {
  1244. VALUE assoc = RSTRING(str)->as.heap.aux.shared;
  1245. str_make_independent(str);
  1246. if (STR_ASSOC_P(assoc)) {
  1247. assoc = RSTRING(assoc)->as.heap.aux.shared;
  1248. rb_ary_concat(assoc, add);
  1249. add = assoc;
  1250. }
  1251. }
  1252. else if (STR_EMBED_P(str)) {
  1253. str_make_independent(str);
  1254. }
  1255. else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
  1256. RESIZE_CAPA(str, RSTRING_LEN(str));
  1257. }
  1258. FL_SET(str, STR_ASSOC);
  1259. RBASIC(add)->klass = 0;
  1260. RSTRING(str)->as.heap.aux.shared = add;
  1261. }
  1262. }
  1263. VALUE
  1264. rb_str_associated(VALUE str)
  1265. {
  1266. if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
  1267. if (STR_ASSOC_P(str)) {
  1268. return RSTRING(str)->as.heap.aux.shared;
  1269. }
  1270. return Qfalse;
  1271. }
  1272. void
  1273. rb_must_asciicompat(VALUE str)
  1274. {
  1275. rb_encoding *enc = rb_enc_get(str);
  1276. if (!rb_enc_asciicompat(enc)) {
  1277. rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
  1278. }
  1279. }
  1280. VALUE
  1281. rb_string_value(volatile VALUE *ptr)
  1282. {
  1283. VALUE s = *ptr;
  1284. if (!RB_TYPE_P(s, T_STRING)) {
  1285. s = rb_str_to_str(s);
  1286. *ptr = s;
  1287. }
  1288. return s;
  1289. }
  1290. char *
  1291. rb_string_value_ptr(volatile VALUE *ptr)
  1292. {
  1293. VALUE str = rb_string_value(ptr);
  1294. return RSTRING_PTR(str);
  1295. }
  1296. char *
  1297. rb_string_value_cstr(volatile VALUE *ptr)
  1298. {
  1299. VALUE str = rb_string_value(ptr);
  1300. char *s = RSTRING_PTR(str);
  1301. long len = RSTRING_LEN(str);
  1302. if (!s || memchr(s, 0, len)) {
  1303. rb_raise(rb_eArgError, "string contains null byte");
  1304. }
  1305. if (s[len]) {
  1306. rb_str_modify(str);
  1307. s = RSTRING_PTR(str);
  1308. s[RSTRING_LEN(str)] = 0;
  1309. }
  1310. return s;
  1311. }
  1312. VALUE
  1313. rb_check_string_type(VALUE str)
  1314. {
  1315. str = rb_check_convert_type(str, T_STRING, "String", "to_str");
  1316. return str;
  1317. }
  1318. /*
  1319. * call-seq:
  1320. * String.try_convert(obj) -> string or nil
  1321. *
  1322. * Try to convert <i>obj</i> into a String, using to_str method.
  1323. * Returns converted string or nil if <i>obj</i> cannot be converted
  1324. * for any reason.
  1325. *
  1326. * String.try_convert("str") #=> "str"
  1327. * String.try_convert(/re/) #=> nil
  1328. */
  1329. static VALUE
  1330. rb_str_s_try_convert(VALUE dummy, VALUE str)
  1331. {
  1332. return rb_check_string_type(str);
  1333. }
  /*
   * Move forward from p by *nthp characters of encoding enc, never
   * returning a position beyond e.  The returned pointer is the resulting
   * position; *nthp receives the value of the local counter nth on exit.
   * How nth is updated differs per branch (see the notes below).
   */
  1334. static char*
  1335. str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
  1336. {
  1337. long nth = *nthp;
  /* at most one byte per character: the index is the byte offset; nth is left unchanged */
  1338. if (rb_enc_mbmaxlen(enc) == 1) {
  1339. p += nth;
  1340. }
  /* min and max character widths agree: scale the index by that width; nth is left unchanged */
  1341. else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
  1342. p += nth * rb_enc_mbmaxlen(enc);
  1343. }
  /* ASCII-compatible variable width: skip ASCII runs in bulk, step over other characters one by one */
  1344. else if (rb_enc_asciicompat(enc)) {
  1345. const char *p2, *e2;
  1346. int n;
  1347. while (p < e && 0 < nth) {
  /* e2 is where we would land if every remaining character were a single byte */
  1348. e2 = p + nth;
  /* fewer than nth bytes remain, so nth characters cannot fit: stop at e with nth as it stands */
  1349. if (e < e2) {
  1350. *nthp = nth;
  1351. return (char *)e;
  1352. }
  1353. if (ISASCII(*p)) {
  /* presumably search_nonascii() yields the first non-ASCII byte in [p, e2), or NULL -- confirm */
  1354. p2 = search_nonascii(p, e2);
  1355. if (!p2) {
  /* nothing but ASCII up to e2: all remaining characters are consumed */
  1356. nth -= e2 - p;
  1357. *nthp = nth;
  1358. return (char *)e2;
  1359. }
  1360. nth -= p2 - p;
  1361. p = p2;
  1362. }
  /* step over one character using the length reported by rb_enc_mbclen() */
  1363. n = rb_enc_mbclen(p, e, enc);
  1364. p += n;
  1365. nth--;
  1366. }
  1367. *nthp = nth;
  /* ran out of input before nth reached zero */
  1368. if (nth != 0) {
  1369. return (char *)e;
  1370. }
  1371. return (char *)p;
  1372. }
  1373. else {
  /* generic path; note the post-decrement leaves nth at -1 when the count runs out before e */
  1374. while (p < e && nth--) {
  1375. p += rb_enc_mbclen(p, e, enc);
  1376. }
  1377. }
  /* the non-returning branches may have moved p past e: clamp it */
  1378. if (p > e) p = e;
  1379. *nthp = nth;
  1380. return (char*)p;
  1381. }
  1382. char*
  1383. rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
  1384. {
  1385. return str_nth_len(p, e, &nth, enc);
  1386. }
  1387. static char*
  1388. str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
  1389. {
  1390. if (singlebyte)
  1391. p += nth;
  1392. else {
  1393. p = str_nth_len(p, e, &nth, enc);
  1394. }
  1395. if (!p) return 0;
  1396. if (p > e) p = e;
  1397. return (char *)p;
  1398. }
  1399. /* char offset to byte offset */
  1400. static long
  1401. str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
  1402. {
  1403. const char *pp = str_nth(p, e, nth, enc, singlebyte);
  1404. if (!pp) return e - p;
  1405. return pp - p;
  1406. }
  1407. long
  1408. rb_str_offset(VALUE str, long pos)
  1409. {
  1410. return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
  1411. STR_ENC_GET(str), single_byte_optimizable(str));
  1412. }
  1413. #ifdef NONASCII_MASK
  /*
   * UTF-8 variant of the nth-character search.  Walks from p towards e,
   * decrementing the count for each byte that is_utf8_lead_byte() accepts,
   * and stops on the lead byte seen when the count is already zero (or at
   * e).  The leftover count is stored through nthp.
   */
  1414. static char *
  1415. str_utf8_nth(const char *p, const char *e, long *nthp)
  1416. {
  1417. long nth = *nthp;
  /* word-at-a-time fast path, taken only when both the byte span and the count exceed two VALUE words */
  1418. if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
  1419. const VALUE *s, *t;
  1420. const VALUE lowbits = sizeof(VALUE) - 1;
  /* s: p rounded up to a VALUE boundary; t: e rounded down to one */
  1421. s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
  1422. t = (const VALUE*)(~lowbits & (VALUE)e);
  /* handle the unaligned leading bytes one at a time */
  1423. while (p < (const char *)s) {
  1424. if (is_utf8_lead_byte(*p)) nth--;
  1425. p++;
  1426. }
  /* presumably count_utf8_lead_bytes_with_word() returns the number of lead bytes in *s -- confirm */
  /* keep going only while a whole word cannot contain more lead bytes than nth */
  1427. do {
  1428. nth -= count_utf8_lead_bytes_with_word(s);
  1429. s++;
  1430. } while (s < t && (int)sizeof(VALUE) <= nth);
  1431. p = (char *)s;
  1432. }
  /* finish byte by byte */
  1433. while (p < e) {
  1434. if (is_utf8_lead_byte(*p)) {
  1435. if (nth == 0) break;
  1436. nth--;
  1437. }
  1438. p++;
  1439. }
  1440. *nthp = nth;
  1441. return (char *)p;
  1442. }
  /* Byte distance from p to the position str_utf8_nth() finds for character +nth+. */
  static long
  str_utf8_offset(const char *p, const char *e, long nth)
  {
      long remaining = nth;
      const char *pos = str_utf8_nth(p, e, &remaining);

      return pos - p;
  }
  1449. #endif
  1450. /* byte offset to char offset */
  1451. long
  1452. rb_str_sublen(VALUE str, long pos)
  1453. {
  1454. if (single_byte_optimizable(str) || pos < 0)
  1455. return pos;
  1456. else {
  1457. char *p = RSTRING_PTR(str);
  1458. return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
  1459. }
  1460. }
  1461. VALUE
  1462. rb_str_subseq(VALUE str, long beg, long len)
  1463. {
  1464. VALUE str2;
  1465. if (RSTRING_LEN(str) == beg + len &&
  1466. RSTRING_EMBED_LEN_MAX < len) {
  1467. str2 = rb_str_new_shared(rb_str_new_frozen(str));
  1468. rb_str_drop_bytes(str2, beg);
  1469. }
  1470. else {
  1471. str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
  1472. RB_GC_GUARD(str);
  1473. }
  1474. rb_enc_cr_str_copy_for_substr(str2, str);
  1475. OBJ_INFECT(str2, str);
  1476. return str2;
  1477. }
  /*
   * Resolve a character-based range of str into a position in its buffer.
   * beg is a character offset (negative counts from the end) and *lenp a
   * character count.  Returns a pointer into the string's bytes and stores
   * the resolved length through lenp, or returns 0 when the range is
   * rejected.
   */
  1478. static char *
  1479. rb_str_subpos(VALUE str, long beg, long *lenp)
  1480. {
  1481. long len = *lenp;
  1482. long slen = -1L;
  1483. long blen = RSTRING_LEN(str);
  1484. rb_encoding *enc = STR_ENC_GET(str);
  1485. char *p, *s = RSTRING_PTR(str), *e = s + blen;
  /* a negative length is never valid */
  1486. if (len < 0) return 0;
  /* nothing can be taken from an empty string */
  1487. if (!blen) {
  1488. len = 0;
  1489. }
  /* single-byte strings: character offsets are byte offsets, plain arithmetic suffices */
  1490. if (single_byte_optimizable(str)) {
  1491. if (beg > blen) return 0;
  1492. if (beg < 0) {
  1493. beg += blen;
  1494. if (beg < 0) return 0;
  1495. }
  /* clip the length to what is left after beg */
  1496. if (beg + len > blen)
  1497. len = blen - beg;
  1498. if (len < 0) return 0;
  1499. p = s + beg;
  1500. goto end;
  1501. }
  1502. if (beg < 0) {
  /* cannot take more characters than lie between beg and the end */
  1503. if (len > -beg) len = -beg;
  /* when the start is close to the end relative to the byte size, walk backwards instead of counting everything */
  1504. if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
  1505. beg = -beg;
  /* first move e back to the end of the requested range ... */
  1506. while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
  1507. p = e;
  1508. if (!p) return 0;
  /* ... then move p back len more characters to its start */
  1509. while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
  1510. if (!p) return 0;
  1511. len = e - p;
  1512. goto end;
  1513. }
  1514. else {
  /* otherwise turn beg into a non-negative character offset using the full character length */
  1515. slen = str_strlen(str, enc);
  1516. beg += slen;
  1517. if (beg < 0) return 0;
  /* NOTE(review): beg is a character offset here but is added as bytes; p is recomputed below unless len == 0 -- confirm intent */
  1518. p = s + beg;
  1519. if (len == 0) goto end;
  1520. }
  1521. }
  /* quick rejection: a character offset larger than the byte length cannot exist */
  1522. else if (beg > 0 && beg > RSTRING_LEN(str)) {
  1523. return 0;
  1524. }
  1525. if (len == 0) {
  1526. if (beg > str_strlen(str, enc)) return 0;
  /* NOTE(review): as above, a character offset is used as a byte offset for the empty result -- confirm */
  1527. p = s + beg;
  1528. }
  1529. #ifdef NONASCII_MASK
  /* valid UTF-8: use the lead-byte counting helpers */
  1530. else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
  1531. enc == rb_utf8_encoding()) {
  1532. p = str_utf8_nth(s, e, &beg);
  /* characters were left over: beg lies past the end */
  1533. if (beg > 0) return 0;
  1534. len = str_utf8_offset(p, e, len);
  1535. }
  1536. #endif
  /* fixed-width encoding: scale offsets by the character size and clip to the buffer */
  1537. else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
  1538. int char_sz = rb_enc_mbmaxlen(enc);
  1539. p = s + beg * char_sz;
  1540. if (p > e) {
  1541. return 0;
  1542. }
  1543. else if (len * char_sz > e - p)
  1544. len = e - p;
  1545. else
  1546. len *= char_sz;
  1547. }
  /* general case: locate beg first; landing on e yields an empty result unless characters were left over */
  1548. else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
  1549. if (beg > 0) return 0;
  1550. len = 0;
  1551. }
  1552. else {
  /* convert the character count into a byte length starting at p */
  1553. len = str_offset(p, e, len, enc, 0);
  1554. }
  1555. end:
  1556. *lenp = len;
  1557. RB_GC_GUARD(str);
  1558. return p;
  1559. }
  1560. VALUE
  1561. rb_str_substr(VALUE str, long beg, long len)
  1562. {
  1563. VALUE str2;
  1564. char *p = rb_str_subpos(str, beg, &len);
  1565. if (!p) return Qnil;
  1566. if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
  1567. str2 = rb_str_new4(str);
  1568. str2 = str_new3(rb_obj_class(str2), str2);
  1569. RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
  1570. RSTRING(str2)->as.heap.len = len;
  1571. }
  1572. else {
  1573. str2 = rb_str_new5(str, p, len);
  1574. rb_enc_cr_str_copy_for_substr(str2, str);
  1575. OBJ_INFECT(str2, str);
  1576. RB_GC_GUARD(str);
  1577. }
  1578. return str2;
  1579. }
  1580. VALUE
  1581. rb_str_freeze(VALUE str)
  1582. {
  1583. if (STR_ASSOC_P(str)) {
  1584. VALUE ary = RSTRING(str)->as.heap.aux.shared;
  1585. OBJ_FREEZE(ary);
  1586. }
  1587. return rb_obj_freeze(str);
  1588. }
  /*
   * NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this file;
   * presumably it provides rb_str_dup_frozen() as an alias that forwards
   * to rb_str_new_frozen() -- confirm.  The #define that follows makes
   * later uses of the old name in this file expand to rb_str_new_frozen.
   */
  1589. RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
  1590. #define rb_str_dup_frozen rb_str_new_frozen
  1591. VALUE
  1592. rb_str_locktmp(VALUE str)
  1593. {
  1594. if (FL_TEST(str, STR_TMPLOCK)) {
  1595. rb_raise(rb_eRuntimeError, "temporal locking already locked string");
  1596. }
  1597. FL_SET(str, STR_TMPLOCK);
  1598. return str;
  1599. }
  1600. VALUE
  1601. rb_str_unlocktmp(VALUE str)
  1602. {
  1603. if (!FL_TEST(str, STR_TMPLOCK)) {
  1604. rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
  1605. }
  1606. FL_UNSET(str, STR_TMPLOCK);
  1607. return str;
  1608. }
  1609. void
  1610. rb_str_set_len(VALUE str, long len)
  1611. {
  1612. long capa;
  1613. str_modifiable(str);
  1614. if (STR_SHARED_P(str)) {
  1615. rb_raise(rb_eRuntimeError, "can't set length of shared string");
  1616. }
  1617. if (len > (capa = (long)rb_str_capacity(str))) {
  1618. rb_bug("probable buffer overflow: %ld for %ld", len, capa);
  1619. }
  1620. STR_SET_LEN(str, len);
  1621. RSTRING_PTR(str)[len] = '\0';
  1622. }
  /*
   * Change the byte length of str to len, switching between embedded and
   * heap storage as required, and return str.  Raises ArgumentError for a
   * negative len.  The coderange is cleared even when the length stays
   * the same.
   */
  1623. VALUE
  1624. rb_str_resize(VALUE str, long len)
  1625. {
  1626. long slen;
  1627. int independent;
  1628. if (len < 0) {
  1629. rb_raise(rb_eArgError, "negative string size (or size too big)");
  1630. }
  /* also performs the modifiability checks, so it may raise */
  1631. independent = str_independent(str);
  1632. ENC_CODERANGE_CLEAR(str);
  1633. slen = RSTRING_LEN(str);
  1634. if (len != slen) {
  1635. if (STR_EMBED_P(str)) {
  /* embedded and still fits: just adjust the embedded length */
  1636. if (len <= RSTRING_EMBED_LEN_MAX) {
  1637. STR_SET_EMBED_LEN(str, len);
  1638. RSTRING(str)->as.ary[len] = '\0';
  1639. return str;
  1640. }
  /* embedded but too long now: move to a heap buffer of the new size */
  1641. str_make_independent_expand(str, len - slen);
  1642. STR_SET_NOEMBED(str);
  1643. }
  /* heap string that now fits in the embedded buffer: copy it in */
  1644. else if (len <= RSTRING_EMBED_LEN_MAX) {
  1645. char *ptr = RSTRING(str)->as.heap.ptr;
  1646. STR_SET_EMBED(str);
  1647. if (slen > len) slen = len;
  1648. if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
  1649. RSTRING(str)->as.ary[len] = '\0';
  1650. STR_SET_EMBED_LEN(str, len);
  /* the old buffer is only ours to free when it was not shared */
  1651. if (independent) xfree(ptr);
  1652. return str;
  1653. }
  /* shared heap string: take a private copy with the new size */
  1654. else if (!independent) {
  1655. str_make_independent_expand(str, len - slen);
  1656. }
  /* private heap string: reallocate when growing, or when shrinking by more than 1024 bytes */
  1657. else if (slen < len || slen - len > 1024) {
  1658. REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
  1659. }
  1660. if (!STR_NOCAPA_P(str)) {
  1661. RSTRING(str)->as.heap.aux.capa = len;
  1662. }
  1663. RSTRING(str)->as.heap.len = len;
  1664. RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */
  1665. }
  1666. return str;
  1667. }
  1668. static VALUE
  1669. str_buf_cat(VALUE str, const char *ptr, long len)
  1670. {
  1671. long capa, total, off = -1;
  1672. if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
  1673. off = ptr - RSTRING_PTR(str);
  1674. }
  1675. rb_str_modify(str);
  1676. if (len == 0) return 0;
  1677. if (STR_ASSOC_P(str)) {
  1678. FL_UNSET(str, STR_ASSOC);
  1679. capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
  1680. }
  1681. else if (STR_EMBED_P(str)) {
  1682. capa = RSTRING_EMBED_LEN_MAX;
  1683. }
  1684. else {
  1685. capa = RSTRING(str)->as.heap.aux.capa;
  1686. }
  1687. if (RSTRING_LEN(str) >= LONG_MAX - len) {
  1688. rb_raise(rb_eArgError, "string sizes too big");
  1689. }
  1690. total = RSTRING_LEN(str)+len;
  1691. if (capa <= total) {
  1692. while (total > capa) {
  1693. if (capa + 1 >= LONG_MAX / 2) {
  1694. capa = (total + 4095) / 4096;
  1695. break;
  1696. }
  1697. capa = (capa + 1) * 2;
  1698. }
  1699. RESIZE_CAPA(str, capa);
  1700. }
  1701. if (off != -1) {
  1702. ptr = RSTRING_PTR(str) + off;
  1703. }
  1704. memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
  1705. STR_SET_LEN(str, total);
  1706. RSTRING_PTR(str)[total] = '\0'; /* sentinel */
  1707. return str;
  1708. }
  1709. #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
  1710. VALUE
  1711. rb_str_buf_cat(VALUE str, const char *ptr, long len)
  1712. {
  1713. if (len == 0) return str;
  1714. if (len < 0) {
  1715. rb_raise(rb_eArgError, "negative string size (or size too big)");
  1716. }
  1717. return str_buf_cat(str, ptr, len);
  1718. }
  1719. VALUE
  1720. rb_str_buf_cat2(VALUE str, const char *ptr)
  1721. {
  1722. return rb_str_buf_cat(str, ptr, strlen(ptr));
  1723. }
  1724. VALUE
  1725. rb_str_cat(VALUE str, const char *ptr, long len)
  1726. {
  1727. if (len < 0) {
  1728. rb_raise(rb_eArgError, "negative string size (or size too big)");
  1729. }
  1730. if (STR_ASSOC_P(str)) {
  1731. char *p;
  1732. rb_str_modify_expand(str, len);
  1733. p = RSTRING(str)->as.heap.ptr;
  1734. memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
  1735. len = RSTRING(str)->as.heap.len += len;
  1736. p[len] = '\0'; /* sentinel */
  1737. return str;
  1738. }
  1739. return rb_str_buf_cat(str, ptr, len);
  1740. }
  1741. VALUE
  1742. rb_str_cat2(VALUE str, const char *ptr)
  1743. {
  1744. return rb_str_cat(str, ptr, strlen(ptr));
  1745. }
  /*
   * Append len bytes at ptr, tagged with encoding index ptr_encindex and
   * coderange ptr_cr, to str.  The encoding index and coderange of the
   * result are derived from those of both operands.  Raises
   * Encoding::CompatibilityError when the two cannot be combined and
   * ArgumentError for a negative len.  If ptr_cr_ret is non-NULL it
   * receives the (possibly freshly scanned) coderange of the appended
   * bytes.  Returns str.
   */
  1746. static VALUE
  1747. rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
  1748. int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
  1749. {
  1750. int str_encindex = ENCODING_GET(str);
  1751. int res_encindex;
  1752. int str_cr, res_cr;
  1753. str_cr = ENC_CODERANGE(str);
  1754. if (str_encindex == ptr_encindex) {
  /* same encoding: the appended bytes are scanned only when str's own coderange is known */
  1755. if (str_cr == ENC_CODERANGE_UNKNOWN)
  1756. ptr_cr = ENC_CODERANGE_UNKNOWN;
  1757. else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
  1758. ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
  1759. }
  1760. }
  1761. else {
  1762. rb_encoding *str_enc = rb_enc_from_index(str_encindex);
  1763. rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
  /* an ASCII-incompatible side only mixes when one of the two operands is empty */
  1764. if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
  1765. if (len == 0)
  1766. return str;
  1767. if (RSTRING_LEN(str) == 0) {
  /* empty receiver: it simply takes over the encoding of the appended bytes */
  1768. rb_str_buf_cat(str, ptr, len);
  1769. ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
  1770. return str;
  1771. }
  1772. goto incompatible;
  1773. }
  1774. if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
  1775. ptr_cr = coderange_scan(ptr, len, ptr_enc);
  1776. }
  /* str's coderange is needed only if str is ASCII-8BIT or the appended bytes are not 7bit */
  1777. if (str_cr == ENC_CODERANGE_UNKNOWN) {
  1778. if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
  1779. str_cr = rb_enc_str_coderange(str);
  1780. }
  1781. }
  1782. }
  1783. if (ptr_cr_ret)
  1784. *ptr_cr_ret = ptr_cr;
  /* different encodings are acceptable only when at least one side is pure 7bit */
  1785. if (str_encindex != ptr_encindex &&
  1786. str_cr != ENC_CODERANGE_7BIT &&
  1787. ptr_cr != ENC_CODERANGE_7BIT) {
  /* also the target of the goto above */
  1788. incompatible:
  1789. rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
  1790. rb_enc_name(rb_enc_from_index(str_encindex)),
  1791. rb_enc_name(rb_enc_from_index(ptr_encindex)));
  1792. }
  /* choose the encoding index and coderange of the result */
  1793. if (str_cr == ENC_CODERANGE_UNKNOWN) {
  1794. res_encindex = str_encindex;
  1795. res_cr = ENC_CODERANGE_UNKNOWN;
  1796. }
  1797. else if (str_cr == ENC_CODERANGE_7BIT) {
  1798. if (ptr_cr == ENC_CODERANGE_7BIT) {
  1799. res_encindex = str_encindex;
  1800. res_cr = ENC_CODERANGE_7BIT;
  1801. }
  1802. else {
  /* 7bit receiver, non-7bit addition: the result adopts the addition's encoding */
  1803. res_encindex = ptr_encindex;
  1804. res_cr = ptr_cr;
  1805. }
  1806. }
  1807. else if (str_cr == ENC_CODERANGE_VALID) {
  1808. res_encindex = str_encindex;
  1809. if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
  1810. res_cr = str_cr;
  1811. else
  1812. res_cr = ptr_cr;
  1813. }
  1814. else { /* str_cr == ENC_CODERANGE_BROKEN */
  1815. res_encindex = str_encindex;
  1816. res_cr = str_cr;
  /* appending anything to a broken string forces a later re-scan */
  1817. if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
  1818. }
  /* NOTE(review): len has already been passed to coderange_scan()/rb_str_buf_cat() above before this check -- confirm that is intended */
  1819. if (len < 0) {
  1820. rb_raise(rb_eArgError, "negative string size (or size too big)");
  1821. }
  1822. str_buf_cat(str, ptr, len);
  1823. ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
  1824. return str;
  1825. }
  1826. VALUE
  1827. rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
  1828. {
  1829. return rb_enc_cr_str_buf_cat(str, ptr, len,
  1830. rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
  1831. }
  1832. VALUE
  1833. rb_str_buf_cat_ascii(VALUE str, const char *ptr)
  1834. {
  1835. /* ptr must reference NUL terminated ASCII string. */
  1836. int encindex = ENCODING_GET(str);
  1837. rb_encoding *enc = rb_enc_from_index(encindex);
  1838. if (rb_enc_asciicompat(enc)) {
  1839. return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
  1840. encindex, ENC_CODERANGE_7BIT, 0);
  1841. }
  1842. else {
  1843. char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
  1844. while (*ptr) {
  1845. unsigned int c = (unsigned char)*ptr;
  1846. int len = rb_enc_codelen(c, enc);
  1847. rb_enc_mbcput(c, buf, enc);
  1848. rb_enc_cr_str_buf_cat(str, buf, len,
  1849. encindex, ENC_CODERANGE_VALID, 0);
  1850. ptr++;
  1851. }
  1852. return str;
  1853. }
  1854. }
  1855. VALUE
  1856. rb_str_buf_append(VALUE str, VALUE str2)
  1857. {
  1858. int str2_cr;
  1859. str2_cr = ENC_CODERANGE(str2);
  1860. rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
  1861. ENCODING_GET(str2), str2_cr, &str2_cr);
  1862. OBJ_INFECT(str, str2);
  1863. ENC_CODERANGE_SET(str2, str2_cr);
  1864. return str;
  1865. }
  1866. VALUE
  1867. rb_str_append(VALUE str, VALUE str2)
  1868. {
  1869. rb_encoding *enc;
  1870. int cr, cr2;
  1871. long len2;
  1872. StringValue(str2);
  1873. if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
  1874. long len = RSTRING_LEN(str) + len2;
  1875. enc = rb_enc_check(str, str2);
  1876. cr = ENC_CODERANGE(str);
  1877. if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
  1878. rb_str_modify_expand(str, len2);
  1879. memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
  1880. RSTRING_PTR(str2), len2+1);
  1881. RSTRING(str)->as.heap.len = len;
  1882. rb_enc_associate(str, enc);
  1883. ENC_CODERANGE_SET(str, cr);
  1884. OBJ_INFECT(str, str2);
  1885. return str;
  1886. }
  1887. return rb_str_buf_append(str, str2);
  1888. }
  1889. /*
  1890. * call-seq:
  1891. * str << integer -> str
  1892. * str.concat(integer) -> str
  1893. * str << obj -> str
  1894. * str.concat(obj) -> str
  1895. *
  1896. * Append---Concatenates the given object to <i>str</i>. If the object is a
  1897. * <code>Integer</code>, it is considered as a codepoint,…

Large files are truncated, but you can click here to view the full file