PageRenderTime 49ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 1ms

/string.c

https://github.com/vuxuandung/ruby
C | 8315 lines | 5858 code | 822 blank | 1635 comment | 1556 complexity | 8b89c4a4b18f27bf63623a27c3a99f6b MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, AGPL-3.0, 0BSD
  1. /**********************************************************************
  2. string.c -
  3. $Author$
  4. created at: Mon Aug 9 17:12:58 JST 1993
  5. Copyright (C) 1993-2007 Yukihiro Matsumoto
  6. Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
  7. Copyright (C) 2000 Information-technology Promotion Agency, Japan
  8. **********************************************************************/
  9. #include "ruby/ruby.h"
  10. #include "ruby/re.h"
  11. #include "ruby/encoding.h"
  12. #include "vm_core.h"
  13. #include "internal.h"
  14. #include "probes.h"
  15. #include <assert.h>
  16. #define BEG(no) (regs->beg[(no)])
  17. #define END(no) (regs->end[(no)])
  18. #include <math.h>
  19. #include <ctype.h>
  20. #ifdef HAVE_UNISTD_H
  21. #include <unistd.h>
  22. #endif
  23. #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
  24. #undef rb_str_new_cstr
  25. #undef rb_tainted_str_new_cstr
  26. #undef rb_usascii_str_new_cstr
  27. #undef rb_external_str_new_cstr
  28. #undef rb_locale_str_new_cstr
  29. #undef rb_str_new2
  30. #undef rb_str_new3
  31. #undef rb_str_new4
  32. #undef rb_str_new5
  33. #undef rb_tainted_str_new2
  34. #undef rb_usascii_str_new2
  35. #undef rb_str_dup_frozen
  36. #undef rb_str_buf_new_cstr
  37. #undef rb_str_buf_new2
  38. #undef rb_str_buf_cat2
  39. #undef rb_str_cat2
  40. static VALUE rb_str_clear(VALUE str);
  41. VALUE rb_cString;
  42. VALUE rb_cSymbol;
  /* Constant 16; judging by the name, an upper bound on the byte length of a
   * single character (it is not used within this part of the file). */
  #define RUBY_MAX_CHAR_LEN 16
  /* String-specific flag bits.  They are tested and changed through the FL_*
   * macros; the embedded length is stored directly in RBASIC(str)->flags. */
  #define STR_TMPLOCK FL_USER7
  #define STR_NOEMBED FL_USER1
  #define STR_SHARED FL_USER2 /* = ELTS_SHARED */
  #define STR_ASSOC FL_USER3
  /* True when s has both STR_NOEMBED and ELTS_SHARED set. */
  #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
  /* True when s has both STR_NOEMBED and STR_ASSOC set. */
  #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC)
  /* Union of the flag bits examined by the "no capa" tests below. */
  #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
  /* True for a non-embedded string that is shared or has an association.
   * RESIZE_CAPA does not store aux.capa for such strings. */
  #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
  /* Clears ELTS_SHARED and STR_ASSOC, but only on a non-embedded string. */
  #define STR_UNSET_NOCAPA(s) do {\
      if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
  } while (0)
  /* Marks str as non-embedded and resets the embedded-length bits to 0. */
  #define STR_SET_NOEMBED(str) do {\
      FL_SET((str), STR_NOEMBED);\
      STR_SET_EMBED_LEN((str), 0);\
  } while (0)
  /* Marks str as embedded by clearing STR_NOEMBED (the length is not touched). */
  #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
  /* True when STR_NOEMBED is not set. */
  #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
  /* Stores n into the embedded-length bit field of the flags word.
   * n is evaluated once (copied into tmp_n); str is evaluated twice. */
  #define STR_SET_EMBED_LEN(str, n) do { \
      long tmp_n = (n);\
      RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
      RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
  } while (0)
  /* Sets the byte length of str in whichever representation it currently
   * uses: the flag bit field when embedded, as.heap.len otherwise. */
  #define STR_SET_LEN(str, n) do { \
      if (STR_EMBED_P(str)) {\
          STR_SET_EMBED_LEN((str), (n));\
      }\
      else {\
          RSTRING(str)->as.heap.len = (n);\
      }\
  } while (0)
  /* Decrements the byte length of str by one, in either representation. */
  #define STR_DEC_LEN(str) do {\
      if (STR_EMBED_P(str)) {\
          long n = RSTRING_LEN(str);\
          n--;\
          STR_SET_EMBED_LEN((str), n);\
      }\
      else {\
          RSTRING(str)->as.heap.len--;\
      }\
  } while (0)
  /* Changes the buffer capacity of str to `capacity' bytes (plus one extra
   * byte that is always allocated).
   *  - Embedded string: nothing happens unless capacity exceeds
   *    RSTRING_EMBED_LEN_MAX; in that case the bytes are copied into a fresh
   *    heap buffer and the string is switched to the heap representation.
   *    heap.len is assigned from RSTRING_LEN(str) before STR_SET_NOEMBED,
   *    i.e. while the string still reports its embedded length.
   *  - Heap string: the buffer is reallocated; aux.capa is updated only when
   *    STR_NOCAPA_P(str) is false.
   * `capacity' is evaluated more than once. */
  #define RESIZE_CAPA(str,capacity) do {\
      if (STR_EMBED_P(str)) {\
          if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
              char *tmp = ALLOC_N(char, (capacity)+1);\
              memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
              RSTRING(str)->as.heap.ptr = tmp;\
              RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
              STR_SET_NOEMBED(str);\
              RSTRING(str)->as.heap.aux.capa = (capacity);\
          }\
      }\
      else {\
          REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
          if (!STR_NOCAPA_P(str))\
              RSTRING(str)->as.heap.aux.capa = (capacity);\
      }\
  } while (0)
  /* Code-range predicates; both may trigger a scan via rb_enc_str_coderange. */
  #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
  #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
  /* Encoding object for the encoding index stored on str. */
  #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
  104. static inline int
  105. single_byte_optimizable(VALUE str)
  106. {
  107. rb_encoding *enc;
  108. /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
  109. if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
  110. return 1;
  111. enc = STR_ENC_GET(str);
  112. if (rb_enc_mbmaxlen(enc) == 1)
  113. return 1;
  114. /* Conservative. Possibly single byte.
  115. * "\xa1" in Shift_JIS for example. */
  116. return 0;
  117. }
  118. VALUE rb_fs;
  /*
   * Finds the first byte in [p, e) for which ISASCII() is false.
   *
   * Returns a pointer to that byte, or NULL when every byte is ASCII.
   * When VALUE is 4 or 8 bytes wide and the range is longer than two words,
   * the aligned middle part is tested one word at a time against
   * NONASCII_MASK; the word containing a hit is then re-scanned bytewise by
   * the final loop.  NONASCII_MASK stays defined for code later in the file.
   */
  static inline const char *
  search_nonascii(const char *p, const char *e)
  {
  #if SIZEOF_VALUE == 8
  # define NONASCII_MASK 0x8080808080808080ULL
  #elif SIZEOF_VALUE == 4
  # define NONASCII_MASK 0x80808080UL
  #endif
  #ifdef NONASCII_MASK
      if ((int)sizeof(VALUE) * 2 < e - p) {
          const VALUE align_mask = sizeof(VALUE) - 1;
          /* first word boundary at or after p, last word boundary at or before e */
          const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
          const VALUE *word_end = (const VALUE*)(~align_mask & (VALUE)e);

          /* unaligned head, byte by byte */
          for (; p < (const char *)word; p++) {
              if (!ISASCII(*p))
                  return p;
          }
          /* skip whole words that have no high bit set */
          while (word < word_end && !(*word & NONASCII_MASK)) {
              word++;
          }
          p = (const char *)word;
      }
  #endif
      /* tail, or the word in which a high bit was seen */
      for (; p < e; p++) {
          if (!ISASCII(*p))
              return p;
      }
      return NULL;
  }
  155. static int
  156. coderange_scan(const char *p, long len, rb_encoding *enc)
  157. {
  158. const char *e = p + len;
  159. if (rb_enc_to_index(enc) == 0) {
  160. /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
  161. p = search_nonascii(p, e);
  162. return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
  163. }
  164. if (rb_enc_asciicompat(enc)) {
  165. p = search_nonascii(p, e);
  166. if (!p) {
  167. return ENC_CODERANGE_7BIT;
  168. }
  169. while (p < e) {
  170. int ret = rb_enc_precise_mbclen(p, e, enc);
  171. if (!MBCLEN_CHARFOUND_P(ret)) {
  172. return ENC_CODERANGE_BROKEN;
  173. }
  174. p += MBCLEN_CHARFOUND_LEN(ret);
  175. if (p < e) {
  176. p = search_nonascii(p, e);
  177. if (!p) {
  178. return ENC_CODERANGE_VALID;
  179. }
  180. }
  181. }
  182. if (e < p) {
  183. return ENC_CODERANGE_BROKEN;
  184. }
  185. return ENC_CODERANGE_VALID;
  186. }
  187. while (p < e) {
  188. int ret = rb_enc_precise_mbclen(p, e, enc);
  189. if (!MBCLEN_CHARFOUND_P(ret)) {
  190. return ENC_CODERANGE_BROKEN;
  191. }
  192. p += MBCLEN_CHARFOUND_LEN(ret);
  193. }
  194. if (e < p) {
  195. return ENC_CODERANGE_BROKEN;
  196. }
  197. return ENC_CODERANGE_VALID;
  198. }
  199. long
  200. rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
  201. {
  202. const char *p = s;
  203. if (*cr == ENC_CODERANGE_BROKEN)
  204. return e - s;
  205. if (rb_enc_to_index(enc) == 0) {
  206. /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
  207. p = search_nonascii(p, e);
  208. *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
  209. return e - s;
  210. }
  211. else if (rb_enc_asciicompat(enc)) {
  212. p = search_nonascii(p, e);
  213. if (!p) {
  214. if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
  215. return e - s;
  216. }
  217. while (p < e) {
  218. int ret = rb_enc_precise_mbclen(p, e, enc);
  219. if (!MBCLEN_CHARFOUND_P(ret)) {
  220. *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
  221. return p - s;
  222. }
  223. p += MBCLEN_CHARFOUND_LEN(ret);
  224. if (p < e) {
  225. p = search_nonascii(p, e);
  226. if (!p) {
  227. *cr = ENC_CODERANGE_VALID;
  228. return e - s;
  229. }
  230. }
  231. }
  232. *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
  233. return p - s;
  234. }
  235. else {
  236. while (p < e) {
  237. int ret = rb_enc_precise_mbclen(p, e, enc);
  238. if (!MBCLEN_CHARFOUND_P(ret)) {
  239. *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
  240. return p - s;
  241. }
  242. p += MBCLEN_CHARFOUND_LEN(ret);
  243. }
  244. *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
  245. return p - s;
  246. }
  247. }
  248. static inline void
  249. str_enc_copy(VALUE str1, VALUE str2)
  250. {
  251. rb_enc_set_index(str1, ENCODING_GET(str2));
  252. }
  253. static void
  254. rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
  255. {
  256. /* this function is designed for copying encoding and coderange
  257. * from src to new string "dest" which is made from the part of src.
  258. */
  259. str_enc_copy(dest, src);
  260. if (RSTRING_LEN(dest) == 0) {
  261. if (!rb_enc_asciicompat(STR_ENC_GET(src)))
  262. ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
  263. else
  264. ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
  265. return;
  266. }
  267. switch (ENC_CODERANGE(src)) {
  268. case ENC_CODERANGE_7BIT:
  269. ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
  270. break;
  271. case ENC_CODERANGE_VALID:
  272. if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
  273. search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
  274. ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
  275. else
  276. ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
  277. break;
  278. default:
  279. break;
  280. }
  281. }
  282. static void
  283. rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
  284. {
  285. str_enc_copy(dest, src);
  286. ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
  287. }
  288. int
  289. rb_enc_str_coderange(VALUE str)
  290. {
  291. int cr = ENC_CODERANGE(str);
  292. if (cr == ENC_CODERANGE_UNKNOWN) {
  293. rb_encoding *enc = STR_ENC_GET(str);
  294. cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
  295. ENC_CODERANGE_SET(str, cr);
  296. }
  297. return cr;
  298. }
  299. int
  300. rb_enc_str_asciionly_p(VALUE str)
  301. {
  302. rb_encoding *enc = STR_ENC_GET(str);
  303. if (!rb_enc_asciicompat(enc))
  304. return FALSE;
  305. else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
  306. return TRUE;
  307. return FALSE;
  308. }
  309. static inline void
  310. str_mod_check(VALUE s, const char *p, long len)
  311. {
  312. if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
  313. rb_raise(rb_eRuntimeError, "string modified");
  314. }
  315. }
  316. size_t
  317. rb_str_capacity(VALUE str)
  318. {
  319. if (STR_EMBED_P(str)) {
  320. return RSTRING_EMBED_LEN_MAX;
  321. }
  322. else if (STR_NOCAPA_P(str)) {
  323. return RSTRING(str)->as.heap.len;
  324. }
  325. else {
  326. return RSTRING(str)->as.heap.aux.capa;
  327. }
  328. }
  329. static inline VALUE
  330. str_alloc(VALUE klass)
  331. {
  332. NEWOBJ_OF(str, struct RString, klass, T_STRING);
  333. str->as.heap.ptr = 0;
  334. str->as.heap.len = 0;
  335. str->as.heap.aux.capa = 0;
  336. return (VALUE)str;
  337. }
  338. static inline VALUE
  339. empty_str_alloc(VALUE klass)
  340. {
  341. if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
  342. RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
  343. }
  344. return str_alloc(klass);
  345. }
  346. static VALUE
  347. str_new(VALUE klass, const char *ptr, long len)
  348. {
  349. VALUE str;
  350. if (len < 0) {
  351. rb_raise(rb_eArgError, "negative string size (or size too big)");
  352. }
  353. if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
  354. RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
  355. }
  356. str = str_alloc(klass);
  357. if (len > RSTRING_EMBED_LEN_MAX) {
  358. RSTRING(str)->as.heap.aux.capa = len;
  359. RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
  360. STR_SET_NOEMBED(str);
  361. }
  362. else if (len == 0) {
  363. ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
  364. }
  365. if (ptr) {
  366. memcpy(RSTRING_PTR(str), ptr, len);
  367. }
  368. STR_SET_LEN(str, len);
  369. RSTRING_PTR(str)[len] = '\0';
  370. return str;
  371. }
  372. VALUE
  373. rb_str_new(const char *ptr, long len)
  374. {
  375. return str_new(rb_cString, ptr, len);
  376. }
  377. VALUE
  378. rb_usascii_str_new(const char *ptr, long len)
  379. {
  380. VALUE str = rb_str_new(ptr, len);
  381. ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
  382. return str;
  383. }
  384. VALUE
  385. rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
  386. {
  387. VALUE str = rb_str_new(ptr, len);
  388. rb_enc_associate(str, enc);
  389. return str;
  390. }
  391. VALUE
  392. rb_str_new_cstr(const char *ptr)
  393. {
  394. if (!ptr) {
  395. rb_raise(rb_eArgError, "NULL pointer given");
  396. }
  397. return rb_str_new(ptr, strlen(ptr));
  398. }
  399. RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
  400. #define rb_str_new2 rb_str_new_cstr
  401. VALUE
  402. rb_usascii_str_new_cstr(const char *ptr)
  403. {
  404. VALUE str = rb_str_new2(ptr);
  405. ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
  406. return str;
  407. }
  408. RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
  409. #define rb_usascii_str_new2 rb_usascii_str_new_cstr
  410. VALUE
  411. rb_tainted_str_new(const char *ptr, long len)
  412. {
  413. VALUE str = rb_str_new(ptr, len);
  414. OBJ_TAINT(str);
  415. return str;
  416. }
  417. VALUE
  418. rb_tainted_str_new_cstr(const char *ptr)
  419. {
  420. VALUE str = rb_str_new2(ptr);
  421. OBJ_TAINT(str);
  422. return str;
  423. }
  424. RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
  425. #define rb_tainted_str_new2 rb_tainted_str_new_cstr
  426. VALUE
  427. rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
  428. {
  429. rb_econv_t *ec;
  430. rb_econv_result_t ret;
  431. long len;
  432. VALUE newstr;
  433. const unsigned char *sp;
  434. unsigned char *dp;
  435. if (!to) return str;
  436. if (!from) from = rb_enc_get(str);
  437. if (from == to) return str;
  438. if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
  439. to == rb_ascii8bit_encoding()) {
  440. if (STR_ENC_GET(str) != to) {
  441. str = rb_str_dup(str);
  442. rb_enc_associate(str, to);
  443. }
  444. return str;
  445. }
  446. len = RSTRING_LEN(str);
  447. newstr = rb_str_new(0, len);
  448. retry:
  449. ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
  450. if (!ec) return str;
  451. sp = (unsigned char*)RSTRING_PTR(str);
  452. dp = (unsigned char*)RSTRING_PTR(newstr);
  453. ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
  454. &dp, (unsigned char*)RSTRING_END(newstr), 0);
  455. rb_econv_close(ec);
  456. switch (ret) {
  457. case econv_destination_buffer_full:
  458. /* destination buffer short */
  459. len = len < 2 ? 2 : len * 2;
  460. rb_str_resize(newstr, len);
  461. goto retry;
  462. case econv_finished:
  463. len = dp - (unsigned char*)RSTRING_PTR(newstr);
  464. rb_str_set_len(newstr, len);
  465. rb_enc_associate(newstr, to);
  466. return newstr;
  467. default:
  468. /* some error, return original */
  469. return str;
  470. }
  471. }
  472. VALUE
  473. rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
  474. {
  475. return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
  476. }
  477. VALUE
  478. rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
  479. {
  480. VALUE str;
  481. str = rb_tainted_str_new(ptr, len);
  482. if (eenc == rb_usascii_encoding() &&
  483. rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
  484. rb_enc_associate(str, rb_ascii8bit_encoding());
  485. return str;
  486. }
  487. rb_enc_associate(str, eenc);
  488. return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
  489. }
  490. VALUE
  491. rb_external_str_new(const char *ptr, long len)
  492. {
  493. return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
  494. }
  495. VALUE
  496. rb_external_str_new_cstr(const char *ptr)
  497. {
  498. return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
  499. }
  500. VALUE
  501. rb_locale_str_new(const char *ptr, long len)
  502. {
  503. return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
  504. }
  505. VALUE
  506. rb_locale_str_new_cstr(const char *ptr)
  507. {
  508. return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
  509. }
  510. VALUE
  511. rb_filesystem_str_new(const char *ptr, long len)
  512. {
  513. return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
  514. }
  515. VALUE
  516. rb_filesystem_str_new_cstr(const char *ptr)
  517. {
  518. return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
  519. }
  520. VALUE
  521. rb_str_export(VALUE str)
  522. {
  523. return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
  524. }
  525. VALUE
  526. rb_str_export_locale(VALUE str)
  527. {
  528. return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
  529. }
  530. VALUE
  531. rb_str_export_to_enc(VALUE str, rb_encoding *enc)
  532. {
  533. return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
  534. }
  535. static VALUE
  536. str_replace_shared_without_enc(VALUE str2, VALUE str)
  537. {
  538. if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
  539. STR_SET_EMBED(str2);
  540. memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
  541. STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
  542. }
  543. else {
  544. str = rb_str_new_frozen(str);
  545. FL_SET(str2, STR_NOEMBED);
  546. RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
  547. RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
  548. RSTRING(str2)->as.heap.aux.shared = str;
  549. FL_SET(str2, ELTS_SHARED);
  550. }
  551. return str2;
  552. }
  553. static VALUE
  554. str_replace_shared(VALUE str2, VALUE str)
  555. {
  556. str_replace_shared_without_enc(str2, str);
  557. rb_enc_cr_str_exact_copy(str2, str);
  558. return str2;
  559. }
  560. static VALUE
  561. str_new_shared(VALUE klass, VALUE str)
  562. {
  563. return str_replace_shared(str_alloc(klass), str);
  564. }
  565. static VALUE
  566. str_new3(VALUE klass, VALUE str)
  567. {
  568. return str_new_shared(klass, str);
  569. }
  570. VALUE
  571. rb_str_new_shared(VALUE str)
  572. {
  573. VALUE str2 = str_new3(rb_obj_class(str), str);
  574. OBJ_INFECT(str2, str);
  575. return str2;
  576. }
  577. RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
  578. #define rb_str_new3 rb_str_new_shared
  579. static VALUE
  580. str_new4(VALUE klass, VALUE str)
  581. {
  582. VALUE str2;
  583. str2 = str_alloc(klass);
  584. STR_SET_NOEMBED(str2);
  585. RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
  586. RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
  587. if (STR_SHARED_P(str)) {
  588. VALUE shared = RSTRING(str)->as.heap.aux.shared;
  589. assert(OBJ_FROZEN(shared));
  590. FL_SET(str2, ELTS_SHARED);
  591. RSTRING(str2)->as.heap.aux.shared = shared;
  592. }
  593. else {
  594. FL_SET(str, ELTS_SHARED);
  595. RSTRING(str)->as.heap.aux.shared = str2;
  596. }
  597. rb_enc_cr_str_exact_copy(str2, str);
  598. OBJ_INFECT(str2, str);
  599. return str2;
  600. }
  601. VALUE
  602. rb_str_new_frozen(VALUE orig)
  603. {
  604. VALUE klass, str;
  605. if (OBJ_FROZEN(orig)) return orig;
  606. klass = rb_obj_class(orig);
  607. if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
  608. long ofs;
  609. assert(OBJ_FROZEN(str));
  610. ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
  611. if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
  612. ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) ||
  613. ENCODING_GET(str) != ENCODING_GET(orig)) {
  614. str = str_new3(klass, str);
  615. RSTRING(str)->as.heap.ptr += ofs;
  616. RSTRING(str)->as.heap.len -= ofs;
  617. rb_enc_cr_str_exact_copy(str, orig);
  618. OBJ_INFECT(str, orig);
  619. }
  620. }
  621. else if (STR_EMBED_P(orig)) {
  622. str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
  623. rb_enc_cr_str_exact_copy(str, orig);
  624. OBJ_INFECT(str, orig);
  625. }
  626. else if (STR_ASSOC_P(orig)) {
  627. VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
  628. FL_UNSET(orig, STR_ASSOC);
  629. str = str_new4(klass, orig);
  630. FL_SET(str, STR_ASSOC);
  631. RSTRING(str)->as.heap.aux.shared = assoc;
  632. }
  633. else {
  634. str = str_new4(klass, orig);
  635. }
  636. OBJ_FREEZE(str);
  637. return str;
  638. }
  639. RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
  640. #define rb_str_new4 rb_str_new_frozen
  641. VALUE
  642. rb_str_new_with_class(VALUE obj, const char *ptr, long len)
  643. {
  644. return str_new(rb_obj_class(obj), ptr, len);
  645. }
  646. RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
  647. rb_str_new_with_class, (obj, ptr, len))
  648. #define rb_str_new5 rb_str_new_with_class
  649. static VALUE
  650. str_new_empty(VALUE str)
  651. {
  652. VALUE v = rb_str_new5(str, 0, 0);
  653. rb_enc_copy(v, str);
  654. OBJ_INFECT(v, str);
  655. return v;
  656. }
  657. #define STR_BUF_MIN_SIZE 128
  658. VALUE
  659. rb_str_buf_new(long capa)
  660. {
  661. VALUE str = str_alloc(rb_cString);
  662. if (capa < STR_BUF_MIN_SIZE) {
  663. capa = STR_BUF_MIN_SIZE;
  664. }
  665. FL_SET(str, STR_NOEMBED);
  666. RSTRING(str)->as.heap.aux.capa = capa;
  667. RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
  668. RSTRING(str)->as.heap.ptr[0] = '\0';
  669. return str;
  670. }
  671. VALUE
  672. rb_str_buf_new_cstr(const char *ptr)
  673. {
  674. VALUE str;
  675. long len = strlen(ptr);
  676. str = rb_str_buf_new(len);
  677. rb_str_buf_cat(str, ptr, len);
  678. return str;
  679. }
  680. RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
  681. #define rb_str_buf_new2 rb_str_buf_new_cstr
  682. VALUE
  683. rb_str_tmp_new(long len)
  684. {
  685. return str_new(0, 0, len);
  686. }
  687. void *
  688. rb_alloc_tmp_buffer(volatile VALUE *store, long len)
  689. {
  690. VALUE s = rb_str_tmp_new(len);
  691. *store = s;
  692. return RSTRING_PTR(s);
  693. }
  694. void
  695. rb_free_tmp_buffer(volatile VALUE *store)
  696. {
  697. VALUE s = *store;
  698. *store = 0;
  699. if (s) rb_str_clear(s);
  700. }
  701. void
  702. rb_str_free(VALUE str)
  703. {
  704. if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
  705. xfree(RSTRING(str)->as.heap.ptr);
  706. }
  707. }
  708. RUBY_FUNC_EXPORTED size_t
  709. rb_str_memsize(VALUE str)
  710. {
  711. if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
  712. return RSTRING(str)->as.heap.aux.capa;
  713. }
  714. else {
  715. return 0;
  716. }
  717. }
  718. VALUE
  719. rb_str_to_str(VALUE str)
  720. {
  721. return rb_convert_type(str, T_STRING, "String", "to_str");
  722. }
  723. static inline void str_discard(VALUE str);
  724. void
  725. rb_str_shared_replace(VALUE str, VALUE str2)
  726. {
  727. rb_encoding *enc;
  728. int cr;
  729. if (str == str2) return;
  730. enc = STR_ENC_GET(str2);
  731. cr = ENC_CODERANGE(str2);
  732. str_discard(str);
  733. OBJ_INFECT(str, str2);
  734. if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
  735. STR_SET_EMBED(str);
  736. memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
  737. STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
  738. rb_enc_associate(str, enc);
  739. ENC_CODERANGE_SET(str, cr);
  740. return;
  741. }
  742. STR_SET_NOEMBED(str);
  743. STR_UNSET_NOCAPA(str);
  744. RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
  745. RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
  746. if (STR_NOCAPA_P(str2)) {
  747. FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
  748. RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
  749. }
  750. else {
  751. RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
  752. }
  753. STR_SET_EMBED(str2); /* abandon str2 */
  754. RSTRING_PTR(str2)[0] = 0;
  755. STR_SET_EMBED_LEN(str2, 0);
  756. rb_enc_associate(str, enc);
  757. ENC_CODERANGE_SET(str, cr);
  758. }
  759. static ID id_to_s;
  760. VALUE
  761. rb_obj_as_string(VALUE obj)
  762. {
  763. VALUE str;
  764. if (RB_TYPE_P(obj, T_STRING)) {
  765. return obj;
  766. }
  767. str = rb_funcall(obj, id_to_s, 0);
  768. if (!RB_TYPE_P(str, T_STRING))
  769. return rb_any_to_s(obj);
  770. if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
  771. return str;
  772. }
  773. static VALUE
  774. str_replace(VALUE str, VALUE str2)
  775. {
  776. long len;
  777. len = RSTRING_LEN(str2);
  778. if (STR_ASSOC_P(str2)) {
  779. str2 = rb_str_new4(str2);
  780. }
  781. if (STR_SHARED_P(str2)) {
  782. VALUE shared = RSTRING(str2)->as.heap.aux.shared;
  783. assert(OBJ_FROZEN(shared));
  784. STR_SET_NOEMBED(str);
  785. RSTRING(str)->as.heap.len = len;
  786. RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
  787. FL_SET(str, ELTS_SHARED);
  788. FL_UNSET(str, STR_ASSOC);
  789. RSTRING(str)->as.heap.aux.shared = shared;
  790. }
  791. else {
  792. str_replace_shared(str, str2);
  793. }
  794. OBJ_INFECT(str, str2);
  795. rb_enc_cr_str_exact_copy(str, str2);
  796. return str;
  797. }
  798. static VALUE
  799. str_duplicate(VALUE klass, VALUE str)
  800. {
  801. VALUE dup = str_alloc(klass);
  802. str_replace(dup, str);
  803. return dup;
  804. }
  805. VALUE
  806. rb_str_dup(VALUE str)
  807. {
  808. return str_duplicate(rb_obj_class(str), str);
  809. }
  810. VALUE
  811. rb_str_resurrect(VALUE str)
  812. {
  813. if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
  814. RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
  815. rb_sourcefile(), rb_sourceline());
  816. }
  817. return str_replace(str_alloc(rb_cString), str);
  818. }
  819. /*
  820. * call-seq:
  821. * String.new(str="") -> new_str
  822. *
  823. * Returns a new string object containing a copy of <i>str</i>.
  824. */
  825. static VALUE
  826. rb_str_init(int argc, VALUE *argv, VALUE str)
  827. {
  828. VALUE orig;
  829. if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
  830. rb_str_replace(str, orig);
  831. return str;
  832. }
  833. static inline long
  834. enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
  835. {
  836. long c;
  837. const char *q;
  838. if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
  839. return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
  840. }
  841. else if (rb_enc_asciicompat(enc)) {
  842. c = 0;
  843. if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
  844. while (p < e) {
  845. if (ISASCII(*p)) {
  846. q = search_nonascii(p, e);
  847. if (!q)
  848. return c + (e - p);
  849. c += q - p;
  850. p = q;
  851. }
  852. p += rb_enc_fast_mbclen(p, e, enc);
  853. c++;
  854. }
  855. }
  856. else {
  857. while (p < e) {
  858. if (ISASCII(*p)) {
  859. q = search_nonascii(p, e);
  860. if (!q)
  861. return c + (e - p);
  862. c += q - p;
  863. p = q;
  864. }
  865. p += rb_enc_mbclen(p, e, enc);
  866. c++;
  867. }
  868. }
  869. return c;
  870. }
  871. for (c=0; p<e; c++) {
  872. p += rb_enc_mbclen(p, e, enc);
  873. }
  874. return c;
  875. }
  876. long
  877. rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
  878. {
  879. return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
  880. }
  881. long
  882. rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
  883. {
  884. long c;
  885. const char *q;
  886. int ret;
  887. *cr = 0;
  888. if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
  889. return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
  890. }
  891. else if (rb_enc_asciicompat(enc)) {
  892. c = 0;
  893. while (p < e) {
  894. if (ISASCII(*p)) {
  895. q = search_nonascii(p, e);
  896. if (!q) {
  897. if (!*cr) *cr = ENC_CODERANGE_7BIT;
  898. return c + (e - p);
  899. }
  900. c += q - p;
  901. p = q;
  902. }
  903. ret = rb_enc_precise_mbclen(p, e, enc);
  904. if (MBCLEN_CHARFOUND_P(ret)) {
  905. *cr |= ENC_CODERANGE_VALID;
  906. p += MBCLEN_CHARFOUND_LEN(ret);
  907. }
  908. else {
  909. *cr = ENC_CODERANGE_BROKEN;
  910. p++;
  911. }
  912. c++;
  913. }
  914. if (!*cr) *cr = ENC_CODERANGE_7BIT;
  915. return c;
  916. }
  917. for (c=0; p<e; c++) {
  918. ret = rb_enc_precise_mbclen(p, e, enc);
  919. if (MBCLEN_CHARFOUND_P(ret)) {
  920. *cr |= ENC_CODERANGE_VALID;
  921. p += MBCLEN_CHARFOUND_LEN(ret);
  922. }
  923. else {
  924. *cr = ENC_CODERANGE_BROKEN;
  925. if (p + rb_enc_mbminlen(enc) <= e)
  926. p += rb_enc_mbminlen(enc);
  927. else
  928. p = e;
  929. }
  930. }
  931. if (!*cr) *cr = ENC_CODERANGE_7BIT;
  932. return c;
  933. }
  934. #ifdef NONASCII_MASK
  935. #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
  936. /*
  937. * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
  938. * bit represention. (see http://en.wikipedia.org/wiki/UTF-8)
  939. * Therefore, following pseudo code can detect UTF-8 leading byte.
  940. *
  941. * if (!(byte & 0x80))
  942. * byte |= 0x40; // turn on bit6
  943. * return ((byte>>6) & 1); // bit6 represent it's leading byte or not.
  944. *
  945. * This function calculate every bytes in the argument word `s'
  946. * using the above logic concurrently. and gather every bytes result.
  947. */
  948. static inline VALUE
  949. count_utf8_lead_bytes_with_word(const VALUE *s)
  950. {
  951. VALUE d = *s;
  952. /* Transform into bit0 represent UTF-8 leading or not. */
  953. d |= ~(d>>1);
  954. d >>= 6;
  955. d &= NONASCII_MASK >> 7;
  956. /* Gather every bytes. */
  957. d += (d>>8);
  958. d += (d>>16);
  959. #if SIZEOF_VALUE == 8
  960. d += (d>>32);
  961. #endif
  962. return (d&0xF);
  963. }
  964. #endif
  965. static long
  966. str_strlen(VALUE str, rb_encoding *enc)
  967. {
  968. const char *p, *e;
  969. long n;
  970. int cr;
  971. if (single_byte_optimizable(str)) return RSTRING_LEN(str);
  972. if (!enc) enc = STR_ENC_GET(str);
  973. p = RSTRING_PTR(str);
  974. e = RSTRING_END(str);
  975. cr = ENC_CODERANGE(str);
  976. #ifdef NONASCII_MASK
  977. if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
  978. enc == rb_utf8_encoding()) {
  979. VALUE len = 0;
  980. if ((int)sizeof(VALUE) * 2 < e - p) {
  981. const VALUE *s, *t;
  982. const VALUE lowbits = sizeof(VALUE) - 1;
  983. s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
  984. t = (const VALUE*)(~lowbits & (VALUE)e);
  985. while (p < (const char *)s) {
  986. if (is_utf8_lead_byte(*p)) len++;
  987. p++;
  988. }
  989. while (s < t) {
  990. len += count_utf8_lead_bytes_with_word(s);
  991. s++;
  992. }
  993. p = (const char *)s;
  994. }
  995. while (p < e) {
  996. if (is_utf8_lead_byte(*p)) len++;
  997. p++;
  998. }
  999. return (long)len;
  1000. }
  1001. #endif
  1002. n = rb_enc_strlen_cr(p, e, enc, &cr);
  1003. if (cr) {
  1004. ENC_CODERANGE_SET(str, cr);
  1005. }
  1006. return n;
  1007. }
  1008. long
  1009. rb_str_strlen(VALUE str)
  1010. {
  1011. return str_strlen(str, STR_ENC_GET(str));
  1012. }
  1013. /*
  1014. * call-seq:
  1015. * str.length -> integer
  1016. * str.size -> integer
  1017. *
  1018. * Returns the character length of <i>str</i>.
  1019. */
  1020. VALUE
  1021. rb_str_length(VALUE str)
  1022. {
  1023. long len;
  1024. len = str_strlen(str, STR_ENC_GET(str));
  1025. return LONG2NUM(len);
  1026. }
  1027. /*
  1028. * call-seq:
  1029. * str.bytesize -> integer
  1030. *
  1031. * Returns the length of +str+ in bytes.
  1032. *
  1033. * "\x80\u3042".bytesize #=> 4
  1034. * "hello".bytesize #=> 5
  1035. */
  1036. static VALUE
  1037. rb_str_bytesize(VALUE str)
  1038. {
  1039. return LONG2NUM(RSTRING_LEN(str));
  1040. }
  1041. /*
  1042. * call-seq:
  1043. * str.empty? -> true or false
  1044. *
  1045. * Returns <code>true</code> if <i>str</i> has a length of zero.
  1046. *
  1047. * "hello".empty? #=> false
  1048. * " ".empty? #=> false
  1049. * "".empty? #=> true
  1050. */
  1051. static VALUE
  1052. rb_str_empty(VALUE str)
  1053. {
  1054. if (RSTRING_LEN(str) == 0)
  1055. return Qtrue;
  1056. return Qfalse;
  1057. }
  1058. /*
  1059. * call-seq:
  1060. * str + other_str -> new_str
  1061. *
  1062. * Concatenation---Returns a new <code>String</code> containing
  1063. * <i>other_str</i> concatenated to <i>str</i>.
  1064. *
  1065. * "Hello from " + self.to_s #=> "Hello from main"
  1066. */
  1067. VALUE
  1068. rb_str_plus(VALUE str1, VALUE str2)
  1069. {
  1070. VALUE str3;
  1071. rb_encoding *enc;
  1072. StringValue(str2);
  1073. enc = rb_enc_check(str1, str2);
  1074. str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
  1075. memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
  1076. memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
  1077. RSTRING_PTR(str2), RSTRING_LEN(str2));
  1078. RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
  1079. if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
  1080. OBJ_TAINT(str3);
  1081. ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
  1082. ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
  1083. return str3;
  1084. }
  1085. /*
  1086. * call-seq:
  1087. * str * integer -> new_str
  1088. *
  1089. * Copy --- Returns a new String containing +integer+ copies of the receiver.
  1090. * +integer+ must be greater than or equal to 0.
  1091. *
  1092. * "Ho! " * 3 #=> "Ho! Ho! Ho! "
  1093. * "Ho! " * 0 #=> ""
  1094. */
  1095. VALUE
  1096. rb_str_times(VALUE str, VALUE times)
  1097. {
  1098. VALUE str2;
  1099. long n, len;
  1100. char *ptr2;
  1101. len = NUM2LONG(times);
  1102. if (len < 0) {
  1103. rb_raise(rb_eArgError, "negative argument");
  1104. }
  1105. if (len && LONG_MAX/len < RSTRING_LEN(str)) {
  1106. rb_raise(rb_eArgError, "argument too big");
  1107. }
  1108. str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
  1109. ptr2 = RSTRING_PTR(str2);
  1110. if (len) {
  1111. n = RSTRING_LEN(str);
  1112. memcpy(ptr2, RSTRING_PTR(str), n);
  1113. while (n <= len/2) {
  1114. memcpy(ptr2 + n, ptr2, n);
  1115. n *= 2;
  1116. }
  1117. memcpy(ptr2 + n, ptr2, len-n);
  1118. }
  1119. ptr2[RSTRING_LEN(str2)] = '\0';
  1120. OBJ_INFECT(str2, str);
  1121. rb_enc_cr_str_copy_for_substr(str2, str);
  1122. return str2;
  1123. }
  1124. /*
  1125. * call-seq:
  1126. * str % arg -> new_str
  1127. *
  1128. * Format---Uses <i>str</i> as a format specification, and returns the result
  1129. * of applying it to <i>arg</i>. If the format specification contains more than
  1130. * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
  1131. * containing the values to be substituted. See <code>Kernel::sprintf</code> for
  1132. * details of the format string.
  1133. *
  1134. * "%05d" % 123 #=> "00123"
  1135. * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6"
  1136. * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
  1137. */
  1138. static VALUE
  1139. rb_str_format_m(VALUE str, VALUE arg)
  1140. {
  1141. volatile VALUE tmp = rb_check_array_type(arg);
  1142. if (!NIL_P(tmp)) {
  1143. return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
  1144. }
  1145. return rb_str_format(1, &arg, str);
  1146. }
  1147. static inline void
  1148. str_modifiable(VALUE str)
  1149. {
  1150. if (FL_TEST(str, STR_TMPLOCK)) {
  1151. rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
  1152. }
  1153. rb_check_frozen(str);
  1154. if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
  1155. rb_raise(rb_eSecurityError, "Insecure: can't modify string");
  1156. }
  1157. static inline int
  1158. str_independent(VALUE str)
  1159. {
  1160. str_modifiable(str);
  1161. if (!STR_SHARED_P(str)) return 1;
  1162. if (STR_EMBED_P(str)) return 1;
  1163. return 0;
  1164. }
  1165. static void
  1166. str_make_independent_expand(VALUE str, long expand)
  1167. {
  1168. char *ptr;
  1169. long len = RSTRING_LEN(str);
  1170. long capa = len + expand;
  1171. if (len > capa) len = capa;
  1172. ptr = ALLOC_N(char, capa + 1);
  1173. if (RSTRING_PTR(str)) {
  1174. memcpy(ptr, RSTRING_PTR(str), len);
  1175. }
  1176. STR_SET_NOEMBED(str);
  1177. STR_UNSET_NOCAPA(str);
  1178. ptr[len] = 0;
  1179. RSTRING(str)->as.heap.ptr = ptr;
  1180. RSTRING(str)->as.heap.len = len;
  1181. RSTRING(str)->as.heap.aux.capa = capa;
  1182. }
  1183. #define str_make_independent(str) str_make_independent_expand((str), 0L)
  1184. void
  1185. rb_str_modify(VALUE str)
  1186. {
  1187. if (!str_independent(str))
  1188. str_make_independent(str);
  1189. ENC_CODERANGE_CLEAR(str);
  1190. }
  1191. void
  1192. rb_str_modify_expand(VALUE str, long expand)
  1193. {
  1194. if (expand < 0) {
  1195. rb_raise(rb_eArgError, "negative expanding string size");
  1196. }
  1197. if (!str_independent(str)) {
  1198. str_make_independent_expand(str, expand);
  1199. }
  1200. else if (expand > 0) {
  1201. long len = RSTRING_LEN(str);
  1202. long capa = len + expand;
  1203. if (!STR_EMBED_P(str)) {
  1204. REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
  1205. RSTRING(str)->as.heap.aux.capa = capa;
  1206. }
  1207. else if (capa > RSTRING_EMBED_LEN_MAX) {
  1208. str_make_independent_expand(str, expand);
  1209. }
  1210. }
  1211. ENC_CODERANGE_CLEAR(str);
  1212. }
  1213. /* As rb_str_modify(), but don't clear coderange */
  1214. static void
  1215. str_modify_keep_cr(VALUE str)
  1216. {
  1217. if (!str_independent(str))
  1218. str_make_independent(str);
  1219. if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
  1220. /* Force re-scan later */
  1221. ENC_CODERANGE_CLEAR(str);
  1222. }
  1223. static inline void
  1224. str_discard(VALUE str)
  1225. {
  1226. str_modifiable(str);
  1227. if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
  1228. xfree(RSTRING_PTR(str));
  1229. RSTRING(str)->as.heap.ptr = 0;
  1230. RSTRING(str)->as.heap.len = 0;
  1231. }
  1232. }
/*
 * Attaches `add` to str as its associated object list.
 *
 * rb_check_frozen() is applied to str first.  When str already carries
 * the STR_ASSOC flag, `add` is concatenated (rb_ary_concat) onto the
 * array already stored in as.heap.aux.shared.  Otherwise str is first
 * brought into a heap, non-shared form, then flagged STR_ASSOC and
 * `add` is stored in as.heap.aux.shared.
 */
void
rb_str_associate(VALUE str, VALUE add)
{
    /* sanity check */
    rb_check_frozen(str);
    if (STR_ASSOC_P(str)) {
        /* already associated */
        rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
    }
    else {
        if (STR_SHARED_P(str)) {
            /* remember the string we share with before detaching from it */
            VALUE assoc = RSTRING(str)->as.heap.aux.shared;
            str_make_independent(str);
            if (STR_ASSOC_P(assoc)) {
                /* it had its own association: extend that array and adopt it */
                assoc = RSTRING(assoc)->as.heap.aux.shared;
                rb_ary_concat(assoc, add);
                add = assoc;
            }
        }
        else if (STR_EMBED_P(str)) {
            str_make_independent(str);
        }
        else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
            /*
             * NOTE(review): presumably capacity is trimmed to the length
             * because aux.capa and aux.shared occupy the same slot, so the
             * capacity cannot be stored once `add` is - confirm.
             */
            RESIZE_CAPA(str, RSTRING_LEN(str));
        }
        FL_SET(str, STR_ASSOC);
        /* NOTE(review): clearing klass presumably hides the array from Ruby code - confirm */
        RBASIC(add)->klass = 0;
        RSTRING(str)->as.heap.aux.shared = add;
    }
}
  1263. VALUE
  1264. rb_str_associated(VALUE str)
  1265. {
  1266. if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
  1267. if (STR_ASSOC_P(str)) {
  1268. return RSTRING(str)->as.heap.aux.shared;
  1269. }
  1270. return Qfalse;
  1271. }
  1272. void
  1273. rb_must_asciicompat(VALUE str)
  1274. {
  1275. rb_encoding *enc = rb_enc_get(str);
  1276. if (!rb_enc_asciicompat(enc)) {
  1277. rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
  1278. }
  1279. }
  1280. VALUE
  1281. rb_string_value(volatile VALUE *ptr)
  1282. {
  1283. VALUE s = *ptr;
  1284. if (!RB_TYPE_P(s, T_STRING)) {
  1285. s = rb_str_to_str(s);
  1286. *ptr = s;
  1287. }
  1288. return s;
  1289. }
  1290. char *
  1291. rb_string_value_ptr(volatile VALUE *ptr)
  1292. {
  1293. VALUE str = rb_string_value(ptr);
  1294. return RSTRING_PTR(str);
  1295. }
  1296. char *
  1297. rb_string_value_cstr(volatile VALUE *ptr)
  1298. {
  1299. VALUE str = rb_string_value(ptr);
  1300. char *s = RSTRING_PTR(str);
  1301. long len = RSTRING_LEN(str);
  1302. if (!s || memchr(s, 0, len)) {
  1303. rb_raise(rb_eArgError, "string contains null byte");
  1304. }
  1305. if (s[len]) {
  1306. rb_str_modify(str);
  1307. s = RSTRING_PTR(str);
  1308. s[RSTRING_LEN(str)] = 0;
  1309. }
  1310. return s;
  1311. }
  1312. VALUE
  1313. rb_check_string_type(VALUE str)
  1314. {
  1315. str = rb_check_convert_type(str, T_STRING, "String", "to_str");
  1316. return str;
  1317. }
  1318. /*
  1319. * call-seq:
  1320. * String.try_convert(obj) -> string or nil
  1321. *
  1322. * Try to convert <i>obj</i> into a String, using to_str method.
  1323. * Returns converted string or nil if <i>obj</i> cannot be converted
  1324. * for any reason.
  1325. *
  1326. * String.try_convert("str") #=> "str"
  1327. * String.try_convert(/re/) #=> nil
  1328. */
  1329. static VALUE
  1330. rb_str_s_try_convert(VALUE dummy, VALUE str)
  1331. {
  1332. return rb_check_string_type(str);
  1333. }
/*
 * Steps forward from p by up to *nthp characters of encoding enc inside
 * the byte range [p, e) and returns the position reached.  The local
 * character counter is written back through nthp before returning.
 *
 * The strategy is picked from the encoding's properties:
 *   - mbmaxlen == 1: plain pointer addition;
 *   - mbmaxlen == mbminlen: pointer addition scaled by that width;
 *   - ASCII-compatible: ASCII runs are skipped with search_nonascii(),
 *     other characters are measured one by one with rb_enc_mbclen();
 *   - otherwise: every character is measured with rb_enc_mbclen().
 */
static char*
str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
{
    long nth = *nthp;
    if (rb_enc_mbmaxlen(enc) == 1) {
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;
        while (p < e && 0 < nth) {
            /* e2: where we would land if each remaining character took one byte */
            e2 = p + nth;
            if (e < e2) {
                /* fewer than nth bytes are left: report e with the count still outstanding */
                *nthp = nth;
                return (char *)e;
            }
            if (ISASCII(*p)) {
                p2 = search_nonascii(p, e2);
                if (!p2) {
                    /* null result: the whole span [p, e2) is consumed in one step */
                    nth -= e2 - p;
                    *nthp = nth;
                    return (char *)e2;
                }
                /* skip the run in front of the position search_nonascii() returned */
                nth -= p2 - p;
                p = p2;
            }
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        *nthp = nth;
        if (nth != 0) {
            return (char *)e;
        }
        return (char *)p;
    }
    else {
        /* nth-- is a post-decrement: nth is -1 afterwards when the count, not the buffer end, stops the loop */
        while (p < e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    /* reached by every branch except the ASCII-compatible scan, which returns on its own */
    if (p > e) p = e;
    *nthp = nth;
    return (char*)p;
}
  1382. char*
  1383. rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
  1384. {
  1385. return str_nth_len(p, e, &nth, enc);
  1386. }
  1387. static char*
  1388. str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
  1389. {
  1390. if (singlebyte)
  1391. p += nth;
  1392. else {
  1393. p = str_nth_len(p, e, &nth, enc);
  1394. }
  1395. if (!p) return 0;
  1396. if (p > e) p = e;
  1397. return (char *)p;
  1398. }
  1399. /* char offset to byte offset */
  1400. static long
  1401. str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
  1402. {
  1403. const char *pp = str_nth(p, e, nth, enc, singlebyte);
  1404. if (!pp) return e - p;
  1405. return pp - p;
  1406. }
  1407. long
  1408. rb_str_offset(VALUE str, long pos)
  1409. {
  1410. return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
  1411. STR_ENC_GET(str), single_byte_optimizable(str));
  1412. }
  1413. #ifdef NONASCII_MASK
/*
 * Walks [p, e) until *nthp UTF-8 lead bytes have been passed and returns
 * the position reached; the count still outstanding is stored back in
 * *nthp.  The caller visible in this file (rb_str_subpos) only uses it
 * for UTF-8 strings whose coderange is ENC_CODERANGE_VALID.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    /* word-at-a-time path: taken only when both the buffer and the count exceed two VALUE widths */
    if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
        const VALUE *s, *t;
        const VALUE lowbits = sizeof(VALUE) - 1;
        /* s: p rounded up to a VALUE boundary; t: e rounded down to one */
        s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
        t = (const VALUE*)(~lowbits & (VALUE)e);
        /* count byte by byte up to the first aligned word */
        while (p < (const char *)s) {
            if (is_utf8_lead_byte(*p)) nth--;
            p++;
        }
        /*
         * Subtract one word's lead bytes per step.  The loop stops once
         * nth drops below sizeof(VALUE); presumably that keeps a single
         * word from overshooting the target - confirm against
         * count_utf8_lead_bytes_with_word().
         */
        do {
            nth -= count_utf8_lead_bytes_with_word(s);
            s++;
        } while (s < t && (int)sizeof(VALUE) <= nth);
        p = (char *)s;
    }
    /* finish byte by byte, stopping on the lead byte seen when the count is exhausted */
    while (p < e) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
        p++;
    }
    *nthp = nth;
    return (char *)p;
}
  1443. static long
  1444. str_utf8_offset(const char *p, const char *e, long nth)
  1445. {
  1446. const char *pp = str_utf8_nth(p, e, &nth);
  1447. return pp - p;
  1448. }
  1449. #endif
  1450. /* byte offset to char offset */
  1451. long
  1452. rb_str_sublen(VALUE str, long pos)
  1453. {
  1454. if (single_byte_optimizable(str) || pos < 0)
  1455. return pos;
  1456. else {
  1457. char *p = RSTRING_PTR(str);
  1458. return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
  1459. }
  1460. }
  1461. VALUE
  1462. rb_str_subseq(VALUE str, long beg, long len)
  1463. {
  1464. VALUE str2;
  1465. if (RSTRING_LEN(str) == beg + len &&
  1466. RSTRING_EMBED_LEN_MAX < len) {
  1467. str2 = rb_str_new_shared(rb_str_new_frozen(str));
  1468. rb_str_drop_bytes(str2, beg);
  1469. }
  1470. else {
  1471. str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
  1472. RB_GC_GUARD(str);
  1473. }
  1474. rb_enc_cr_str_copy_for_substr(str2, str);
  1475. OBJ_INFECT(str2, str);
  1476. return str2;
  1477. }
/*
 * Resolves a character-based substring request (start beg, length *lenp)
 * against str.  On success returns a pointer to the first byte of the
 * substring and stores its length in bytes in *lenp; returns 0 when the
 * request is out of range (including a negative length).  A negative beg
 * counts back from the end of the string.
 */
static char *
rb_str_subpos(VALUE str, long beg, long *lenp)
{
    long len = *lenp;
    long slen = -1L;
    long blen = RSTRING_LEN(str);
    rb_encoding *enc = STR_ENC_GET(str);
    char *p, *s = RSTRING_PTR(str), *e = s + blen;
    if (len < 0) return 0;
    if (!blen) {
        len = 0;
    }
    /* one byte per character: byte and character offsets coincide */
    if (single_byte_optimizable(str)) {
        if (beg > blen) return 0;
        if (beg < 0) {
            beg += blen;
            if (beg < 0) return 0;
        }
        if (beg + len > blen)
            len = blen - beg;
        if (len < 0) return 0;
        p = s + beg;
        goto end;
    }
    if (beg < 0) {
        /* at most -beg characters lie between the start position and the end */
        if (len > -beg) len = -beg;
        /* start close to the end (worst-case bytes under 1/8 of the string): walk backwards from e */
        if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
            beg = -beg;
            /* move e back to the end of the requested substring */
            while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
            p = e;
            if (!p) return 0;
            /* then move p back len characters to its start */
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
            if (!p) return 0;
            len = e - p;
            goto end;
        }
        else {
            /* otherwise turn beg into a non-negative character index using the full length */
            slen = str_strlen(str, enc);
            beg += slen;
            if (beg < 0) return 0;
            p = s + beg;
            if (len == 0) goto end;
        }
    }
    else if (beg > 0 && beg > RSTRING_LEN(str)) {
        /* a character index beyond the byte length is rejected without scanning */
        return 0;
    }
    if (len == 0) {
        if (beg > str_strlen(str, enc)) return 0;
        p = s + beg;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
             enc == rb_utf8_encoding()) {
        /* valid UTF-8: locate the start, then the byte length, by counting lead bytes */
        p = str_utf8_nth(s, e, &beg);
        if (beg > 0) return 0;
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* fixed-width encoding: scale the character counts by the character size */
        int char_sz = rb_enc_mbmaxlen(enc);
        p = s + beg * char_sz;
        if (p > e) {
            return 0;
        }
        else if (len * char_sz > e - p)
            len = e - p;
        else
            len *= char_sz;
    }
    else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
        /* the walk ended at e: out of range if characters were still outstanding */
        if (beg > 0) return 0;
        len = 0;
    }
    else {
        len = str_offset(p, e, len, enc, 0);
    }
  end:
    *lenp = len;
    RB_GC_GUARD(str);
    return p;
}
  1560. VALUE
  1561. rb_str_substr(VALUE str, long beg, long len)
  1562. {
  1563. VALUE str2;
  1564. char *p = rb_str_subpos(str, beg, &len);
  1565. if (!p) return Qnil;
  1566. if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
  1567. str2 = rb_str_new4(str);
  1568. str2 = str_new3(rb_obj_class(str2), str2);
  1569. RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
  1570. RSTRING(str2)->as.heap.len = len;
  1571. }
  1572. else {
  1573. str2 = rb_str_new5(str, p, len);
  1574. rb_enc_cr_str_copy_for_substr(str2, str);
  1575. OBJ_INFECT(str2, str);
  1576. RB_GC_GUARD(str);
  1577. }
  1578. return str2;
  1579. }
  1580. VALUE
  1581. rb_str_freeze(VALUE str)
  1582. {
  1583. if (STR_ASSOC_P(str)) {
  1584. VALUE ary = RSTRING(str)->as.heap.aux.shared;
  1585. OBJ_FREEZE(ary);
  1586. }
  1587. return rb_obj_freeze(str);
  1588. }
/*
 * rb_str_dup_frozen is kept as another name for rb_str_new_frozen.
 * NOTE(review): RUBY_ALIAS_FUNCTION presumably emits the exported
 * rb_str_dup_frozen symbol - confirm against its definition.  The
 * #define makes later uses of the name in this file resolve to
 * rb_str_new_frozen directly.
 */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
  1591. VALUE
  1592. rb_str_locktmp(VALUE str)
  1593. {
  1594. if (FL_TEST(str, STR_TMPLOCK)) {
  1595. rb_raise(rb_eRuntimeError, "temporal locking already locked string");
  1596. }
  1597. FL_SET(str, STR_TMPLOCK);
  1598. return str;
  1599. }
  1600. VALUE
  1601. rb_str_unlocktmp(VALUE str)
  1602. {
  1603. if (!FL_TEST(str, STR_TMPLOCK)) {
  1604. rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
  1605. }
  1606. FL_UNSET(str, STR_TMPLOCK);
  1607. return str;
  1608. }
  1609. void
  1610. rb_str_set_len(VALUE str, long len)
  1611. {
  1612. long capa;
  1613. str_modifiable(str);
  1614. if (STR_SHARED_P(str)) {
  1615. rb_raise(rb_eRuntimeError, "can't set length of shared string");
  1616. }
  1617. if (len > (capa = (long)rb_str_capacity(str))) {
  1618. rb_bug("probable buffer overflow: %ld for %ld", len, capa);
  1619. }
  1620. STR_SET_LEN(str, len);
  1621. RSTRING_PTR(str)[len] = '\0';
  1622. }
/*
 * Changes the byte length of str to len and returns str.
 *
 * Raises rb_eArgError for a negative len; str_independent() may raise
 * for strings that must not be modified.  The cached coderange is
 * always cleared.  Nothing else happens when len equals the current
 * length.  Otherwise the string is switched between embedded and heap
 * storage as the new length requires, and a NUL is stored at the new
 * end.
 */
VALUE
rb_str_resize(VALUE str, long len)
{
    long slen;
    int independent;
    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    independent = str_independent(str);
    ENC_CODERANGE_CLEAR(str);
    slen = RSTRING_LEN(str);
    if (len != slen) {
        if (STR_EMBED_P(str)) {
            if (len <= RSTRING_EMBED_LEN_MAX) {
                /* embedded and still fits: only the length and terminator change */
                STR_SET_EMBED_LEN(str, len);
                RSTRING(str)->as.ary[len] = '\0';
                return str;
            }
            /* embedded but too long now: move to a heap buffer of the new size */
            str_make_independent_expand(str, len - slen);
            STR_SET_NOEMBED(str);
        }
        else if (len <= RSTRING_EMBED_LEN_MAX) {
            /* heap string that now fits in the embedded array: copy the kept bytes in */
            char *ptr = RSTRING(str)->as.heap.ptr;
            STR_SET_EMBED(str);
            if (slen > len) slen = len;
            if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
            RSTRING(str)->as.ary[len] = '\0';
            STR_SET_EMBED_LEN(str, len);
            /* the old buffer is released only when str_independent() reported 1 */
            if (independent) xfree(ptr);
            return str;
        }
        else if (!independent) {
            str_make_independent_expand(str, len - slen);
        }
        else if (slen < len || slen - len > 1024) {
            /* reallocate when growing, or when shrinking by more than 1024 bytes */
            REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
        }
        if (!STR_NOCAPA_P(str)) {
            RSTRING(str)->as.heap.aux.capa = len;
        }
        RSTRING(str)->as.heap.len = len;
        RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */
    }
    return str;
}
  1668. static VALUE
  1669. str_buf_cat(VALUE str, const char *ptr, long len)
  1670. {
  1671. long capa, total, off = -1;
  1672. if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
  1673. off = ptr - RSTRING_PTR(str);
  1674. }
  1675. rb_str_modify(str);
  1676. if (len == 0) return 0;
  1677. if (STR_ASSOC_P(str)) {
  1678. FL_UNSET(str, STR_ASSOC);
  1679. capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
  1680. }
  1681. else if (STR_EMBED_P(str)) {
  1682. capa = RSTRING_EMBED_LEN_MAX;
  1683. }
  1684. else {
  1685. capa = RSTRING(str)->as.heap.aux.capa;
  1686. }
  1687. if (RSTRING_LEN(str) >= LONG_MAX - len) {
  1688. rb_raise(rb_eArgError, "string sizes too big");
  1689. }
  1690. total = RSTRING_LEN(str)+len;
  1691. if (capa <= total) {
  1692. while (total > capa) {
  1693. if (capa + 1 >= LONG_MAX / 2) {
  1694. capa = (total + 4095) / 4096;
  1695. break;
  1696. }
  1697. capa = (capa + 1) * 2;
  1698. }
  1699. RESIZE_CAPA(str, capa);
  1700. }
  1701. if (off != -1) {
  1702. ptr = RSTRING_PTR(str) + off;
  1703. }
  1704. memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
  1705. STR_SET_LEN(str, total);
  1706. RSTRING_PTR(str)[total] = '\0'; /* sentinel */
  1707. return str;
  1708. }
  1709. #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
  1710. VALUE
  1711. rb_str_buf_cat(VALUE str, const char *ptr, long len)
  1712. {
  1713. if (len == 0) return str;
  1714. if (len < 0) {
  1715. rb_raise(rb_eArgError, "negative string size (or size too big)");
  1716. }
  1717. return str_buf_cat(str, ptr, len);
  1718. }
  1719. VALUE
  1720. rb_str_buf_cat2(VALUE str, const char *ptr)
  1721. {
  1722. return rb_str_buf_cat(str, ptr, strlen(ptr));
  1723. }
  1724. VALUE
  1725. rb_str_cat(VALUE str, const char *ptr, long len)
  1726. {
  1727. if (len < 0) {
  1728. rb_raise(rb_eArgError, "negative string size (or size too big)");
  1729. }
  1730. if (STR_ASSOC_P(str)) {
  1731. char *p;
  1732. rb_str_modify_expand(str, len);
  1733. p = RSTRING(str)->as.heap.ptr;
  1734. memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
  1735. len = RSTRING(str)->as.heap.len += len;
  1736. p[len] = '\0'; /* sentinel */
  1737. return str;
  1738. }
  1739. return rb_str_buf_cat(str, ptr, len);
  1740. }
  1741. VALUE
  1742. rb_str_cat2(VALUE str, const char *ptr)
  1743. {
  1744. return rb_str_cat(str, ptr, strlen(ptr));
  1745. }
/*
 * Appends len bytes at ptr, tagged with encoding index ptr_encindex and
 * coderange ptr_cr, to str; then sets the encoding index and coderange
 * of the result.  Returns str.
 *
 * When ptr_cr_ret is non-null, the coderange established for ptr is
 * stored through it (not on the early-return paths for
 * ASCII-incompatible encodings).  Raises rb_eEncCompatError when the two
 * encodings cannot be combined and rb_eArgError for a negative len.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    str_cr = ENC_CODERANGE(str);
    if (str_encindex == ptr_encindex) {
        /* same encoding: scan ptr only when str's own coderange is known */
        if (str_cr == ENC_CODERANGE_UNKNOWN)
            ptr_cr = ENC_CODERANGE_UNKNOWN;
        else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        rb_encoding *str_enc = rb_enc_from_index(str_encindex);
        rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* an ASCII-incompatible side: only an empty operand can be combined */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                /* empty receiver simply takes over ptr's bytes, encoding and coderange */
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                return str;
            }
            goto incompatible;
        }
        if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, ptr_enc);
        }
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            /* str is scanned only when str is ASCII-8BIT or ptr is not 7-bit */
            if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;
    /* different encodings are accepted only when at least one side is 7-bit */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
      incompatible:
        rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
            rb_enc_name(rb_enc_from_index(str_encindex)),
            rb_enc_name(rb_enc_from_index(ptr_encindex)));
    }
    /* choose the encoding index and coderange of the result */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = str_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            /* 7-bit receiver adopts the encoding and coderange of the appended bytes */
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
        if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
            res_cr = str_cr;
        else
            res_cr = ptr_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* appending anything to a broken string leaves the result to be re-scanned */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }
    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    str_buf_cat(str, ptr, len);
    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;
}
  1826. VALUE
  1827. rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
  1828. {
  1829. return rb_enc_cr_str_buf_cat(str, ptr, len,
  1830. rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
  1831. }
  1832. VALUE
  1833. rb_str_buf_cat_ascii(VALUE str, const char *ptr)
  1834. {
  1835. /* ptr must reference NUL terminated ASCII string. */
  1836. int encindex = ENCODING_GET(str);
  1837. rb_encoding *enc = rb_enc_from_index(encindex);
  1838. if (rb_enc_asciicompat(enc)) {
  1839. return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
  1840. encindex, ENC_CODERANGE_7BIT, 0);
  1841. }
  1842. else {
  1843. char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
  1844. while (*ptr) {
  1845. unsigned int c = (unsigned char)*ptr;
  1846. int len = rb_enc_codelen(c, enc);
  1847. rb_enc_mbcput(c, buf, enc);
  1848. rb_enc_cr_str_buf_cat(str, buf, len,
  1849. encindex, ENC_CODERANGE_VALID, 0);
  1850. ptr++;
  1851. }
  1852. return str;
  1853. }
  1854. }
  1855. VALUE
  1856. rb_str_buf_append(VALUE str, VALUE str2)
  1857. {
  1858. int str2_cr;
  1859. str2_cr = ENC_CODERANGE(str2);
  1860. rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
  1861. ENCODING_GET(str2), str2_cr, &str2_cr);
  1862. OBJ_INFECT(str, str2);
  1863. ENC_CODERANGE_SET(str2, str2_cr);
  1864. return str;
  1865. }
  1866. VALUE
  1867. rb_str_append(VALUE str, VALUE str2)
  1868. {
  1869. rb_encoding *enc;
  1870. int cr, cr2;
  1871. long len2;
  1872. StringValue(str2);
  1873. if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
  1874. long len = RSTRING_LEN(str) + len2;
  1875. enc = rb_enc_check(str, str2);
  1876. cr = ENC_CODERANGE(str);
  1877. if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
  1878. rb_str_modify_expand(str, len2);
  1879. memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
  1880. RSTRING_PTR(str2), len2+1);
  1881. RSTRING(str)->as.heap.len = len;
  1882. rb_enc_associate(str, enc);
  1883. ENC_CODERANGE_SET(str, cr);
  1884. OBJ_INFECT(str, str2);
  1885. return str;
  1886. }
  1887. return rb_str_buf_append(str, str2);
  1888. }
  1889. /*
  1890. * call-seq:
  1891. * str << integer -> str
  1892. * str.concat(integer) -> str
  1893. * str << obj -> str
  1894. * str.concat(obj) -> str
  1895. *
  1896. * Append---Concatenates the given object to <i>str</i>. If the object is a
  1897. * <code>Integer</code>, it is considered as a codepoint, and is converted
  1898. * to a character before concatenation.
  1899. *
  1900. * a = "hello "
  1901. * a << "world" #=> "hello world"
  1902. * a.concat(33) #=> "hello world!"
  1903. */
VALUE
rb_str_concat(VALUE str1, VALUE str2)
{
    unsigned int code;
    rb_encoding *enc = STR_ENC_GET(str1);
    /* Fixnum/Bignum arguments are codepoints; anything else is appended as a string */
    if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) {
        if (rb_num_to_uint(str2, &code) == 0) {
            /* conversion succeeded: code now holds the codepoint */
        }
        else if (FIXNUM_P(str2)) {
            rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
        }
        else {
            rb_raise(rb_eRangeError, "bignum out of char range");
        }
    }
    else {
        return rb_str_append(str1, str2);
    }
    if (enc == rb_usascii_encoding()) {
        /* US-ASCII automatically extended to ASCII-8BIT */
        char buf[1];
        buf[0] = (char)code;
        if (code > 0xFF) {
            rb_raise(rb_eRangeError, "%u out of char range", code);
        }
        rb_str_cat(str1, buf, 1);
        if (code > 127) {
            /* a byte above 127 switches the receiver to ASCII-8BIT */
            rb_enc_associate(str1, rb_ascii8bit_encoding());
            ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
        }
    }
    else {
        long pos = RSTRING_LEN(str1);
        int cr = ENC_CODERANGE(str1);
        int len;
        char *buf;
        /* rb_enc_codelen() yields the encoded length; the listed values are rejected as errors */
        switch (len = rb_enc_codelen(code, enc)) {
          case ONIGERR_INVALID_CODE_POINT_VALUE:
            rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
            break;
          case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
          case 0:
            rb_raise(rb_eRangeError, "%u out of char range", code);
            break;
        }
        buf = ALLOCA_N(char, len + 1);
        rb_enc_mbcput(code, buf, enc);
        /* re-measure the encoded bytes; a length mismatch marks the codepoint as invalid */
        if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
            rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
        }
        rb_str_resize(str1, pos+len);
        memcpy(RSTRING_PTR(str1) + pos, buf, len);
        /* cr was read before rb_str_resize(); it is promoted from 7BIT when a non-ASCII code was added */
        if (cr == ENC_CODERANGE_7BIT && code > 127)
            cr = ENC_CODERANGE_VALID;
        ENC_CODERANGE_SET(str1, cr);
    }
    return str1;
}
  1962. /*
  1963. * call-seq:
  1964. * str.prepend(other_str) -> str
  1965. *
  1966. * Prepend---Prepend the given string to <i>str</i>.
  1967. *
  1968. * a = "world"
  1969. * a.prepend("hello ") #=> "hello world"
  1970. * a #=> "hello world"
  1971. */
  1972. static VALUE
  1973. rb_str_prepend(VALUE str, VALUE str2)
  1974. {
  1975. StringValue(str2);
  1976. StringValue(str);
  1977. rb_str_update(str, 0L, 0L, str2);
  1978. return str;
  1979. }
  1980. st_index_t
  1981. rb_str_hash(VALUE str)
  1982. {
  1983. int e = ENCODING_GET(str);
  1984. if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
  1985. e = 0;
  1986. }
  1987. return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
  1988. }
  1989. int
  1990. rb_str_hash_cmp(VALUE str1, VALUE str2)
  1991. {
  1992. long len;
  1993. if (!rb_str_comparable(str1, str2)) return 1;
  1994. if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
  1995. memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
  1996. return 0;
  1997. }
  1998. return 1;
  1999. }
  2000. /*
  2001. * call-seq:
  2002. * str.hash -> fixnum
  2003. *
  2004. * Return a hash based on the string's length and content.
  2005. */
  2006. static VALUE
  2007. rb_str_hash_m(VALUE str)
  2008. {
  2009. st_index_t hval = rb_str_hash(str);
  2010. return INT2FIX(hval);
  2011. }
/* Smaller of a and b.  Each argument can be evaluated twice, so pass only side-effect-free expressions. */
#define lesser(a,b) (((a)>(b))?(b):(a))
  2013. int
  2014. rb_str_comparable(VALUE str1, VALUE str2)
  2015. {
  2016. int idx1, idx2;
  2017. int rc1, rc2;
  2018. if (RSTRING_LEN(str1) == 0) return TRUE;
  2019. if (RSTRING_LEN(str2) == 0) return TRUE;
  2020. idx1 = ENCODING_GET(str1);
  2021. idx2 = ENCODING_GET(str2);
  2022. if (idx1 == idx2) return TRUE;
  2023. rc1 = rb_enc_str_coderange(str1);
  2024. rc2 = rb_enc_str_coderange(str2);
  2025. if (rc1 == ENC_CODERANGE_7BIT) {
  2026. if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
  2027. if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
  2028. return TRUE;
  2029. }
  2030. if (rc2 == ENC_CODERANGE_7BIT) {
  2031. if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
  2032. return TRUE;
  2033. }
  2034. return FALSE;
  2035. }
  2036. int
  2037. rb_str_cmp(VALUE str1, VALUE str2)
  2038. {
  2039. long len1, len2;
  2040. const char *ptr1, *ptr2;
  2041. int retval;
  2042. if (str1 == str2) return 0;
  2043. RSTRING_GETMEM(str1, ptr1, len1);
  2044. RSTRING_GETMEM(str2, ptr2, len2);
  2045. if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
  2046. if (len1 == len2) {
  2047. if (!rb_str_comparable(str1, str2)) {
  2048. if (ENCODING_GET(str1) > ENCODING_GET(str2))
  2049. return 1;
  2050. return -1;
  2051. }
  2052. return 0;
  2053. }
  2054. if (len1 > len2) return 1;
  2055. return -1;
  2056. }
  2057. if (retval > 0) return 1;
  2058. return -1;
  2059. }
  2060. /* expect tail call optimization */
  2061. static VALUE
  2062. str_eql(const VALUE str1, const VALUE str2)
  2063. {
  2064. const long len = RSTRING_LEN(str1);
  2065. const char *ptr1, *ptr2;
  2066. if (len != RSTRING_LEN(str2)) return Qfalse;
  2067. if (!rb_str_comparable(str1, str2)) return Qfalse;
  2068. if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
  2069. return Qtrue;
  2070. if (memcmp(ptr1, ptr2, len) == 0)
  2071. return Qtrue;
  2072. return Qfalse;
  2073. }
  2074. /*
  2075. * call-seq:
  2076. * str == obj -> true or false
  2077. *
  2078. * Equality---If <i>obj</i> is not a <code>String</code>, returns
  2079. * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
  2080. * <code><=></code> <i>obj</i> returns zero.
  2081. */
  2082. VALUE
  2083. rb_str_equal(VALUE str1, VALUE str2)
  2084. {
  2085. if (str1 == str2) return Qtrue;
  2086. if (!RB_TYPE_P(str2, T_STRING)) {
  2087. if (!rb_respond_to(str2, rb_intern("to_str"))) {
  2088. return Qfalse;
  2089. }
  2090. return rb_equal(str2, str1);
  2091. }
  2092. return str_eql(str1, str2);
  2093. }
  2094. /*
  2095. * call-seq:
  2096. * str.eql?(other) -> true or false
  2097. *
  2098. * Two strings are equal if they have the same length and content.
  2099. */
  2100. static VALUE
  2101. rb_str_eql(VALUE str1, VALUE str2)
  2102. {
  2103. if (str1 == str2) return Qtrue;
  2104. if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
  2105. return str_eql(str1, str2);
  2106. }
  2107. /*
  2108. * call-seq:
  2109. * str <=> other_str -> -1, 0, +1 or nil
  2110. *
  2111. * Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
  2112. * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
  2113. * <i>str</i>. If the strings are of different lengths, and the strings are
  2114. * equal when compared up to the shortest length, then the longer string is
  2115. * considered greater than the shorter one. In older versions of Ruby, setting
  2116. * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
  2117. * in favor of using <code>String#casecmp</code>.
  2118. *
  2119. * <code><=></code> is the basis for the methods <code><</code>,
  2120. * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
  2121. * included from module <code>Comparable</code>. The method
  2122. * <code>String#==</code> does not use <code>Comparable#==</code>.
  2123. *
  2124. * "abcdef" <=> "abcde" #=> 1
  2125. * "abcdef" <=> "abcdef" #=> 0
  2126. * "abcdef" <=> "abcdefg" #=> -1
  2127. * "abcdef" <=> "ABCDEF" #=> 1
  2128. */
  2129. static VALUE
  2130. rb_str_cmp_m(VALUE str1, VALUE str2)
  2131. {
  2132. int result;
  2133. if (!RB_TYPE_P(str2, T_STRING)) {
  2134. VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0);
  2135. if (RB_TYPE_P(tmp, T_STRING)) {
  2136. result = rb_str_cmp(str1, tmp);
  2137. }
  2138. else if ((tmp = rb_check_funcall(str2, rb_intern("<=>"), 1, &str1)) ==
  2139. Qundef) {
  2140. return Qnil;
  2141. }
  2142. else {
  2143. if (NIL_P(tmp)) return Qnil;
  2144. result = -rb_cmpint(tmp, str1, str2);
  2145. }
  2146. }
  2147. else {
  2148. result = rb_str_cmp(str1, str2);
  2149. }
  2150. return INT2FIX(result);
  2151. }
  2152. /*
  2153. * call-seq:
  2154. * str.casecmp(other_str) -> -1, 0, +1 or nil
  2155. *
  2156. * Case-insensitive version of <code>String#<=></code>.
  2157. *
  2158. * "abcdef".casecmp("abcde") #=> 1
  2159. * "aBcDeF".casecmp("abcdef") #=> 0
  2160. * "abcdef".casecmp("abcdefg") #=> -1
  2161. * "abcdef".casecmp("ABCDEF") #=> 0
  2162. */
  2163. static VALUE
  2164. rb_str_casecmp(VALUE str1, VALUE str2)
  2165. {
  2166. long len;
  2167. rb_encoding *enc;
  2168. char *p1, *p1end, *p2, *p2end;
  2169. StringValue(str2);
  2170. enc = rb_enc_compatible(str1, str2);
  2171. if (!enc) {
  2172. return Qnil;
  2173. }
  2174. p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
  2175. p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
  2176. if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
  2177. while (p1 < p1end && p2 < p2end) {
  2178. if (*p1 != *p2) {
  2179. unsigned int c1 = TOUPPER(*p1 & 0xff);
  2180. unsigned int c2 = TOUPPER(*p2 & 0xff);
  2181. if (c1 != c2)
  2182. return INT2FIX(c1 < c2 ? -1 : 1);
  2183. }
  2184. p1++;
  2185. p2++;
  2186. }
  2187. }
  2188. else {
  2189. while (p1 < p1end && p2 < p2end) {
  2190. int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
  2191. int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
  2192. if (0 <= c1 && 0 <= c2) {
  2193. c1 = TOUPPER(c1);
  2194. c2 = TOUPPER(c2);
  2195. if (c1 != c2)
  2196. return INT2FIX(c1 < c2 ? -1 : 1);
  2197. }
  2198. else {
  2199. int r;
  2200. l1 = rb_enc_mbclen(p1, p1end, enc);
  2201. l2 = rb_enc_mbclen(p2, p2end, enc);
  2202. len = l1 < l2 ? l1 : l2;
  2203. r = memcmp(p1, p2, len);
  2204. if (r != 0)
  2205. return INT2FIX(r < 0 ? -1 : 1);
  2206. if (l1 != l2)
  2207. return INT2FIX(l1 < l2 ? -1 : 1);
  2208. }
  2209. p1 += l1;
  2210. p2 += l2;
  2211. }
  2212. }
  2213. if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
  2214. if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
  2215. return INT2FIX(-1);
  2216. }
  2217. static long
  2218. rb_str_index(VALUE str, VALUE sub, long offset)
  2219. {
  2220. long pos;
  2221. char *s, *sptr, *e;
  2222. long len, slen;
  2223. rb_encoding *enc;
  2224. enc = rb_enc_check(str, sub);
  2225. if (is_broken_string(sub)) {
  2226. return -1;
  2227. }
  2228. len = str_strlen(str, enc);
  2229. slen = str_strlen(sub, enc);
  2230. if (offset < 0) {
  2231. offset += len;
  2232. if (offset < 0) return -1;
  2233. }
  2234. if (len - offset < slen) return -1;
  2235. s = RSTRING_PTR(str);
  2236. e = s + RSTRING_LEN(str);
  2237. if (offset) {
  2238. offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
  2239. s += offset;
  2240. }
  2241. if (slen == 0) return offset;
  2242. /* need proceed one character at a time */
  2243. sptr = RSTRING_PTR(sub);
  2244. slen = RSTRING_LEN(sub);
  2245. len = RSTRING_LEN(str) - offset;
  2246. for (;;) {
  2247. char *t;
  2248. pos = rb_memsearch(sptr, slen, s, len, enc);
  2249. if (pos < 0) return pos;
  2250. t = rb_enc_right_char_head(s, s+pos, e, enc);
  2251. if (t == s + pos) break;
  2252. if ((len -= t - s) <= 0) return -1;
  2253. offset += t - s;
  2254. s = t;
  2255. }
  2256. return pos + offset;
  2257. }
  2258. /*
  2259. * call-seq:
  2260. * str.index(substring [, offset]) -> fixnum or nil
  2261. * str.index(regexp [, offset]) -> fixnum or nil
  2262. *
  2263. * Returns the index of the first occurrence of the given <i>substring</i> or
  2264. * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
  2265. * found. If the second parameter is present, it specifies the position in the
  2266. * string to begin the search.
  2267. *
  2268. * "hello".index('e') #=> 1
  2269. * "hello".index('lo') #=> 3
  2270. * "hello".index('a') #=> nil
  2271. * "hello".index(?e) #=> 1
  2272. * "hello".index(/[aeiou]/, -3) #=> 4
  2273. */
  2274. static VALUE
  2275. rb_str_index_m(int argc, VALUE *argv, VALUE str)
  2276. {
  2277. VALUE sub;
  2278. VALUE initpos;
  2279. long pos;
  2280. if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
  2281. pos = NUM2LONG(initpos);
  2282. }
  2283. else {
  2284. pos = 0;
  2285. }
  2286. if (pos < 0) {
  2287. pos += str_strlen(str, STR_ENC_GET(str));
  2288. if (pos < 0) {
  2289. if (RB_TYPE_P(sub, T_REGEXP)) {
  2290. rb_backref_set(Qnil);
  2291. }
  2292. return Qnil;
  2293. }
  2294. }
  2295. if (SPECIAL_CONST_P(sub)) goto generic;
  2296. switch (BUILTIN_TYPE(sub)) {
  2297. case T_REGEXP:
  2298. if (pos > str_strlen(str, STR_ENC_GET(str)))
  2299. return Qnil;
  2300. pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
  2301. rb_enc_check(str, sub), single_byte_optimizable(str));
  2302. pos = rb_reg_search(sub, str, pos, 0);
  2303. pos = rb_str_sublen(str, pos);
  2304. break;
  2305. generic:
  2306. default: {
  2307. VALUE tmp;
  2308. tmp = rb_check_string_type(sub);
  2309. if (NIL_P(tmp)) {
  2310. rb_raise(rb_eTypeError, "type mismatch: %s given",
  2311. rb_obj_classname(sub));
  2312. }
  2313. sub = tmp;
  2314. }
  2315. /* fall through */
  2316. case T_STRING:
  2317. pos = rb_str_index(str, sub, pos);
  2318. pos = rb_str_sublen(str, pos);
  2319. break;
  2320. }
  2321. if (pos == -1) return Qnil;
  2322. return LONG2NUM(pos);
  2323. }
  2324. static long
  2325. rb_str_rindex(VALUE str, VALUE sub, long pos)
  2326. {
  2327. long len, slen;
  2328. char *s, *sbeg, *e, *t;
  2329. rb_encoding *enc;
  2330. int singlebyte = single_byte_optimizable(str);
  2331. enc = rb_enc_check(str, sub);
  2332. if (is_broken_string(sub)) {
  2333. return -1;
  2334. }
  2335. len = str_strlen(str, enc);
  2336. slen = str_strlen(sub, enc);
  2337. /* substring longer than string */
  2338. if (len < slen) return -1;
  2339. if (len - pos < slen) {
  2340. pos = len - slen;
  2341. }
  2342. if (len == 0) {
  2343. return pos;
  2344. }
  2345. sbeg = RSTRING_PTR(str);
  2346. e = RSTRING_END(str);
  2347. t = RSTRING_PTR(sub);
  2348. slen = RSTRING_LEN(sub);
  2349. s = str_nth(sbeg, e, pos, enc, singlebyte);
  2350. while (s) {
  2351. if (memcmp(s, t, slen) == 0) {
  2352. return pos;
  2353. }
  2354. if (pos == 0) break;
  2355. pos--;
  2356. s = rb_enc_prev_char(sbeg, s, e, enc);
  2357. }
  2358. return -1;
  2359. }
  2360. /*
  2361. * call-seq:
  2362. * str.rindex(substring [, fixnum]) -> fixnum or nil
  2363. * str.rindex(regexp [, fixnum]) -> fixnum or nil
  2364. *
  2365. * Returns the index of the last occurrence of the given <i>substring</i> or
  2366. * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
  2367. * found. If the second parameter is present, it specifies the position in the
  2368. * string to end the search---characters beyond this point will not be
  2369. * considered.
  2370. *
  2371. * "hello".rindex('e') #=> 1
  2372. * "hello".rindex('l') #=> 3
  2373. * "hello".rindex('a') #=> nil
  2374. * "hello".rindex(?e) #=> 1
  2375. * "hello".rindex(/[aeiou]/, -2) #=> 1
  2376. */
  2377. static VALUE
  2378. rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
  2379. {
  2380. VALUE sub;
  2381. VALUE vpos;
  2382. rb_encoding *enc = STR_ENC_GET(str);
  2383. long pos, len = str_strlen(str, enc);
  2384. if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
  2385. pos = NUM2LONG(vpos);
  2386. if (pos < 0) {
  2387. pos += len;
  2388. if (pos < 0) {
  2389. if (RB_TYPE_P(sub, T_REGEXP)) {
  2390. rb_backref_set(Qnil);
  2391. }
  2392. return Qnil;
  2393. }
  2394. }
  2395. if (pos > len) pos = len;
  2396. }
  2397. else {
  2398. pos = len;
  2399. }
  2400. if (SPECIAL_CONST_P(sub)) goto generic;
  2401. switch (BUILTIN_TYPE(sub)) {
  2402. case T_REGEXP:
  2403. /* enc = rb_get_check(str, sub); */
  2404. pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
  2405. STR_ENC_GET(str), single_byte_optimizable(str));
  2406. if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
  2407. pos = rb_reg_search(sub, str, pos, 1);
  2408. pos = rb_str_sublen(str, pos);
  2409. }
  2410. if (pos >= 0) return LONG2NUM(pos);
  2411. break;
  2412. generic:
  2413. default: {
  2414. VALUE tmp;
  2415. tmp = rb_check_string_type(sub);
  2416. if (NIL_P(tmp)) {
  2417. rb_raise(rb_eTypeError, "type mismatch: %s given",
  2418. rb_obj_classname(sub));
  2419. }
  2420. sub = tmp;
  2421. }
  2422. /* fall through */
  2423. case T_STRING:
  2424. pos = rb_str_rindex(str, sub, pos);
  2425. if (pos >= 0) return LONG2NUM(pos);
  2426. break;
  2427. }
  2428. return Qnil;
  2429. }
  2430. /*
  2431. * call-seq:
  2432. * str =~ obj -> fixnum or nil
  2433. *
  2434. * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
 *  against <i>str</i>, and returns the position where the match starts, or
  2436. * <code>nil</code> if there is no match. Otherwise, invokes
  2437. * <i>obj.=~</i>, passing <i>str</i> as an argument. The default
  2438. * <code>=~</code> in <code>Object</code> returns <code>nil</code>.
  2439. *
  2440. * Note: <code>str =~ regexp</code> is not the same as
  2441. * <code>regexp =~ str</code>. Strings captured from named capture groups
  2442. * are assigned to local variables only in the second case.
  2443. *
  2444. * "cat o' 9 tails" =~ /\d/ #=> 7
  2445. * "cat o' 9 tails" =~ 9 #=> nil
  2446. */
  2447. static VALUE
  2448. rb_str_match(VALUE x, VALUE y)
  2449. {
  2450. if (SPECIAL_CONST_P(y)) goto generic;
  2451. switch (BUILTIN_TYPE(y)) {
  2452. case T_STRING:
  2453. rb_raise(rb_eTypeError, "type mismatch: String given");
  2454. case T_REGEXP:
  2455. return rb_reg_match(y, x);
  2456. generic:
  2457. default:
  2458. return rb_funcall(y, rb_intern("=~"), 1, x);
  2459. }
  2460. }
  2461. static VALUE get_pat(VALUE, int);
  2462. /*
  2463. * call-seq:
  2464. * str.match(pattern) -> matchdata or nil
  2465. * str.match(pattern, pos) -> matchdata or nil
  2466. *
  2467. * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
  2468. * then invokes its <code>match</code> method on <i>str</i>. If the second
  2469. * parameter is present, it specifies the position in the string to begin the
  2470. * search.
  2471. *
  2472. * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
  2473. * 'hello'.match('(.)\1')[0] #=> "ll"
  2474. * 'hello'.match(/(.)\1/)[0] #=> "ll"
  2475. * 'hello'.match('xx') #=> nil
  2476. *
 *  If a block is given, invokes the block with the MatchData if the match
 *  succeeds, so that you can write
  2479. *
  2480. * str.match(pat) {|m| ...}
  2481. *
  2482. * instead of
  2483. *
  2484. * if m = str.match(pat)
  2485. * ...
  2486. * end
  2487. *
  2488. * The return value is a value from block execution in this case.
  2489. */
  2490. static VALUE
  2491. rb_str_match_m(int argc, VALUE *argv, VALUE str)
  2492. {
  2493. VALUE re, result;
  2494. if (argc < 1)
  2495. rb_check_arity(argc, 1, 2);
  2496. re = argv[0];
  2497. argv[0] = str;
  2498. result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
  2499. if (!NIL_P(result) && rb_block_given_p()) {
  2500. return rb_yield(result);
  2501. }
  2502. return result;
  2503. }
/*
 * Outcome of stepping a character to its neighbour (successor or
 * predecessor) in place; returned by enc_succ_char(), enc_pred_char()
 * and enc_succ_alnum_char().
 */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* input is not a steppable letter/digit */
    NEIGHBOR_FOUND,     /* the buffer now holds the neighbouring character */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range and wrapped */
};
/*
 * Steps the +len+ bytes at +p+ forward, in place, until they form a
 * character that rb_enc_precise_mbclen() reports as exactly +len+ bytes
 * long in +enc+.
 *
 * The bytes are incremented like a counter whose last byte is the least
 * significant one.  Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED
 * when every byte was 0xff (the buffer is then all zero bytes).
 */
static enum neighbor_char
enc_succ_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    while (1) {
        /* trailing 0xff bytes roll over to zero and carry to the left */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was recognised: saturate the rest so
                 * the next increment carries into the first l bytes */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* look for the longest prefix that is not reported invalid and
             * saturate the bytes behind it, skipping the sequences in
             * between on the next increment */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
/*
 * Steps the +len+ bytes at +p+ backward, in place, until they form a
 * character that rb_enc_precise_mbclen() reports as exactly +len+ bytes
 * long in +enc+.  Mirror image of enc_succ_char().
 *
 * The bytes are decremented like a counter whose last byte is the least
 * significant one.  Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED
 * when every byte was zero (the buffer is then all 0xff bytes).
 */
static enum neighbor_char
enc_pred_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    while (1) {
        /* trailing zero bytes roll over to 0xff and borrow from the left */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was recognised: zero the rest so the
                 * next decrement borrows from the first l bytes */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* look for the longest prefix that is not reported invalid and
             * zero the bytes behind it, skipping the sequences in between
             * on the next decrement */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
  2575. /*
  2576. overwrite +p+ by succeeding letter in +enc+ and returns
  2577. NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
  2578. When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
  2579. assuming each ranges are successive, and mbclen
  2580. never change in each ranges.
  2581. NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
  2582. character.
  2583. */
  2584. static enum neighbor_char
  2585. enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
  2586. {
  2587. enum neighbor_char ret;
  2588. unsigned int c;
  2589. int ctype;
  2590. int range;
  2591. char save[ONIGENC_CODE_TO_MBC_MAXLEN];
  2592. c = rb_enc_mbc_to_codepoint(p, p+len, enc);
  2593. if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
  2594. ctype = ONIGENC_CTYPE_DIGIT;
  2595. else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
  2596. ctype = ONIGENC_CTYPE_ALPHA;
  2597. else
  2598. return NEIGHBOR_NOT_CHAR;
  2599. MEMCPY(save, p, char, len);
  2600. ret = enc_succ_char(p, len, enc);
  2601. if (ret == NEIGHBOR_FOUND) {
  2602. c = rb_enc_mbc_to_codepoint(p, p+len, enc);
  2603. if (rb_enc_isctype(c, ctype, enc))
  2604. return NEIGHBOR_FOUND;
  2605. }
  2606. MEMCPY(p, save, char, len);
  2607. range = 1;
  2608. while (1) {
  2609. MEMCPY(save, p, char, len);
  2610. ret = enc_pred_char(p, len, enc);
  2611. if (ret == NEIGHBOR_FOUND) {
  2612. c = rb_enc_mbc_to_codepoint(p, p+len, enc);
  2613. if (!rb_enc_isctype(c, ctype, enc)) {
  2614. MEMCPY(p, save, char, len);
  2615. break;
  2616. }
  2617. }
  2618. else {
  2619. MEMCPY(p, save, char, len);
  2620. break;
  2621. }
  2622. range++;
  2623. }
  2624. if (range == 1) {
  2625. return NEIGHBOR_NOT_CHAR;
  2626. }
  2627. if (ctype != ONIGENC_CTYPE_DIGIT) {
  2628. MEMCPY(carry, p, char, len);
  2629. return NEIGHBOR_WRAPPED;
  2630. }
  2631. MEMCPY(carry, p, char, len);
  2632. enc_succ_char(carry, len, enc);
  2633. return NEIGHBOR_WRAPPED;
  2634. }
  2635. /*
  2636. * call-seq:
  2637. * str.succ -> new_str
  2638. * str.next -> new_str
  2639. *
  2640. * Returns the successor to <i>str</i>. The successor is calculated by
  2641. * incrementing characters starting from the rightmost alphanumeric (or
  2642. * the rightmost character if there are no alphanumerics) in the
  2643. * string. Incrementing a digit always results in another digit, and
  2644. * incrementing a letter results in another letter of the same case.
  2645. * Incrementing nonalphanumerics uses the underlying character set's
  2646. * collating sequence.
  2647. *
  2648. * If the increment generates a ``carry,'' the character to the left of
  2649. * it is incremented. This process repeats until there is no carry,
  2650. * adding an additional character if necessary.
  2651. *
  2652. * "abcd".succ #=> "abce"
  2653. * "THX1138".succ #=> "THX1139"
  2654. * "<<koala>>".succ #=> "<<koalb>>"
  2655. * "1999zzz".succ #=> "2000aaa"
  2656. * "ZZZ9999".succ #=> "AAAA0000"
  2657. * "***".succ #=> "**+"
  2658. */
VALUE
rb_str_succ(VALUE orig)
{
    rb_encoding *enc;
    VALUE str;
    char *sbeg, *s, *e, *last_alnum = 0;
    int c = -1;                 /* stays -1 until an alnum character wrapped */
    long l;
    char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
    long carry_pos = 0, carry_len = 1;
    enum neighbor_char neighbor = NEIGHBOR_FOUND;

    /* work on a copy; orig itself is never modified */
    str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
    rb_enc_cr_str_copy_for_substr(str, orig);
    OBJ_INFECT(str, orig);
    if (RSTRING_LEN(str) == 0) return str;

    enc = STR_ENC_GET(orig);
    sbeg = RSTRING_PTR(str);
    s = e = sbeg + RSTRING_LEN(str);

    /* first pass: walk the characters from the right and increment
     * letters/digits, carrying to the left while they wrap */
    while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
        if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
            /* the previous character was not alphanumeric: stop carrying
             * if the class changes (letter vs. digit) across it, and
             * insert the carry at the last wrapped character instead */
            if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
                ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
                s = last_alnum;
                break;
            }
        }
        /* skip anything that is not reported as a character */
        if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
        neighbor = enc_succ_alnum_char(s, l, enc, carry);
        switch (neighbor) {
          case NEIGHBOR_NOT_CHAR:
            continue;
          case NEIGHBOR_FOUND:
            return str;
          case NEIGHBOR_WRAPPED:
            last_alnum = s;
            break;
        }
        /* wrapped: remember where a carry character would be inserted */
        c = 1;
        carry_pos = s - sbeg;
        carry_len = l;
    }
    if (c == -1) { /* str contains no alnum */
        /* second pass: increment arbitrary characters from the right */
        s = e;
        while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
            enum neighbor_char neighbor;
            if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
            neighbor = enc_succ_char(s, l, enc);
            if (neighbor == NEIGHBOR_FOUND)
                return str;
            if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
                /* wrapped to \0...\0. search next valid char. */
                enc_succ_char(s, l, enc);
            }
            if (!rb_enc_asciicompat(enc)) {
                /* ASCII-incompatible encoding: carry a copy of the wrapped
                 * character instead of the default "\1" */
                MEMCPY(carry, s, char, l);
                carry_len = l;
            }
            carry_pos = s - sbeg;
        }
    }
    /* every candidate wrapped: insert the carry at carry_pos, growing the
     * string by carry_len bytes */
    RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
    s = RSTRING_PTR(str) + carry_pos;
    memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
    memmove(s, carry, carry_len);
    STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
    RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    rb_enc_str_coderange(str);
    return str;
}
  2728. /*
  2729. * call-seq:
  2730. * str.succ! -> str
  2731. * str.next! -> str
  2732. *
  2733. * Equivalent to <code>String#succ</code>, but modifies the receiver in
  2734. * place.
  2735. */
static VALUE
rb_str_succ_bang(VALUE str)
{
    /* compute the successor as a new string, then replace the receiver's
     * contents with it via rb_str_shared_replace() */
    rb_str_shared_replace(str, rb_str_succ(str));
    return str;
}
  2742. /*
  2743. * call-seq:
  2744. * str.upto(other_str, exclusive=false) {|s| block } -> str
  2745. * str.upto(other_str, exclusive=false) -> an_enumerator
  2746. *
  2747. * Iterates through successive values, starting at <i>str</i> and
  2748. * ending at <i>other_str</i> inclusive, passing each value in turn to
  2749. * the block. The <code>String#succ</code> method is used to generate
  2750. * each value. If optional second argument exclusive is omitted or is false,
  2751. * the last value will be included; otherwise it will be excluded.
  2752. *
  2753. * If no block is given, an enumerator is returned instead.
  2754. *
  2755. * "a8".upto("b6") {|s| print s, ' ' }
  2756. * for s in "a8".."b6"
  2757. * print s, ' '
  2758. * end
  2759. *
  2760. * <em>produces:</em>
  2761. *
  2762. * a8 a9 b0 b1 b2 b3 b4 b5 b6
  2763. * a8 a9 b0 b1 b2 b3 b4 b5 b6
  2764. *
 *  If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
  2766. * both are recognized as decimal numbers. In addition, the width of
  2767. * string (e.g. leading zeros) is handled appropriately.
  2768. *
  2769. * "9".upto("11").to_a #=> ["9", "10", "11"]
  2770. * "25".upto("5").to_a #=> []
  2771. * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"]
  2772. */
static VALUE
rb_str_upto(int argc, VALUE *argv, VALUE beg)
{
    VALUE end, exclusive;
    VALUE current, after_end;
    ID succ;
    int n, excl, ascii;
    rb_encoding *enc;
    rb_scan_args(argc, argv, "11", &end, &exclusive);
    /* without a block an enumerator is returned instead */
    RETURN_ENUMERATOR(beg, argc, argv);
    excl = RTEST(exclusive);
    CONST_ID(succ, "succ");
    StringValue(end);
    enc = rb_enc_check(beg, end);
    ascii = (is_ascii_string(beg) && is_ascii_string(end));
    /* single character */
    if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
        char c = RSTRING_PTR(beg)[0];
        char e = RSTRING_PTR(end)[0];
        if (c > e || (excl && c == e)) return beg;
        /* yield each one-byte string from c to e; e itself is yielded only
         * when the range is inclusive */
        for (;;) {
            rb_yield(rb_enc_str_new(&c, 1, enc));
            if (!excl && c == e) break;
            c++;
            if (excl && c == e) break;
        }
        return beg;
    }
    /* both edges are all digits */
    if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
        char *s, *send;
        VALUE b, e;
        int width;
        s = RSTRING_PTR(beg); send = RSTRING_END(beg);
        /* the byte length of beg becomes the minimum output width, which
         * keeps leading zeros */
        width = rb_long2int(send - s);
        /* fall back to the generic path as soon as a non-digit shows up
         * in either string */
        while (s < send) {
            if (!ISDIGIT(*s)) goto no_digits;
            s++;
        }
        s = RSTRING_PTR(end); send = RSTRING_END(end);
        while (s < send) {
            if (!ISDIGIT(*s)) goto no_digits;
            s++;
        }
        b = rb_str_to_inum(beg, 10, FALSE);
        e = rb_str_to_inum(end, 10, FALSE);
        if (FIXNUM_P(b) && FIXNUM_P(e)) {
            /* both bounds are Fixnums: count with plain C longs */
            long bi = FIX2LONG(b);
            long ei = FIX2LONG(e);
            rb_encoding *usascii = rb_usascii_encoding();
            while (bi <= ei) {
                if (excl && bi == ei) break;
                rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
                bi++;
            }
        }
        else {
            /* at least one bound is not a Fixnum: compare and increment
             * through method calls on the number objects */
            ID op = excl ? '<' : rb_intern("<=");
            VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
            args[0] = INT2FIX(width);
            while (rb_funcall(b, op, 1, e)) {
                args[1] = b;
                rb_yield(rb_str_format(numberof(args), args, fmt));
                b = rb_funcall(b, succ, 0, 0);
            }
        }
        return beg;
    }
    /* normal case */
  no_digits:
    n = rb_str_cmp(beg, end);
    if (n > 0 || (excl && n == 0)) return beg;
    after_end = rb_funcall(end, succ, 0, 0);
    current = rb_str_dup(beg);
    while (!rb_str_equal(current, after_end)) {
        VALUE next = Qnil;
        /* compute the successor before yielding; it is left nil when the
         * inclusive end has been reached, which ends the loop below */
        if (excl || !rb_str_equal(current, end))
            next = rb_funcall(current, succ, 0, 0);
        rb_yield(current);
        if (NIL_P(next)) break;
        current = next;
        StringValue(current);
        if (excl && rb_str_equal(current, end)) break;
        /* stop once the value has grown longer than end or became empty */
        if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
            break;
    }
    return beg;
}
  2861. static VALUE
  2862. rb_str_subpat(VALUE str, VALUE re, VALUE backref)
  2863. {
  2864. if (rb_reg_search(re, str, 0, 0) >= 0) {
  2865. VALUE match = rb_backref_get();
  2866. int nth = rb_reg_backref_number(match, backref);
  2867. return rb_reg_nth_match(nth, match);
  2868. }
  2869. return Qnil;
  2870. }
  2871. static VALUE
  2872. rb_str_aref(VALUE str, VALUE indx)
  2873. {
  2874. long idx;
  2875. if (FIXNUM_P(indx)) {
  2876. idx = FIX2LONG(indx);
  2877. num_index:
  2878. str = rb_str_substr(str, idx, 1);
  2879. if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
  2880. return str;
  2881. }
  2882. if (SPECIAL_CONST_P(indx)) goto generic;
  2883. switch (BUILTIN_TYPE(indx)) {
  2884. case T_REGEXP:
  2885. return rb_str_subpat(str, indx, INT2FIX(0));
  2886. case T_STRING:
  2887. if (rb_str_index(str, indx, 0) != -1)
  2888. return rb_str_dup(indx);
  2889. return Qnil;
  2890. generic:
  2891. default:
  2892. /* check if indx is Range */
  2893. {
  2894. long beg, len;
  2895. VALUE tmp;
  2896. len = str_strlen(str, STR_ENC_GET(str));
  2897. switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
  2898. case Qfalse:
  2899. break;
  2900. case Qnil:
  2901. return Qnil;
  2902. default:
  2903. tmp = rb_str_substr(str, beg, len);
  2904. return tmp;
  2905. }
  2906. }
  2907. idx = NUM2LONG(indx);
  2908. goto num_index;
  2909. }
  2910. UNREACHABLE;
  2911. }
  2912. /*
  2913. * call-seq:
  2914. * str[index] -> new_str or nil
  2915. * str[start, length] -> new_str or nil
  2916. * str[range] -> new_str or nil
  2917. * str[regexp] -> new_str or nil
  2918. * str[regexp, capture] -> new_str or nil
  2919. * str[match_str] -> new_str or nil
  2920. * str.slice(index) -> new_str or nil
  2921. * str.slice(start, length) -> new_str or nil
  2922. * str.slice(range) -> new_str or nil
  2923. * str.slice(regexp) -> new_str or nil
  2924. * str.slice(regexp, capture) -> new_str or nil
  2925. * str.slice(match_str) -> new_str or nil
  2926. *
  2927. * Element Reference --- If passed a single +index+, returns a substring of
  2928. * one character at that index. If passed a +start+ index and a +length+,
  2929. * returns a substring containing +length+ characters starting at the
  2930. * +index+. If passed a +range+, its beginning and end are interpreted as
  2931. * offsets delimiting the substring to be returned.
  2932. *
  2933. * In these three cases, if an index is negative, it is counted from the end
 *  of the string.  For the +start+ and +range+ cases the starting index
 *  may be just before a character or may equal the string's size.
  2936. * Additionally, an empty string is returned when the starting index for a
  2937. * character range is at the end of the string.
  2938. *
  2939. * Returns +nil+ if the initial index falls outside the string or the length
  2940. * is negative.
  2941. *
  2942. * If a +Regexp+ is supplied, the matching portion of the string is
 *  returned.  If a +capture+ follows the regular expression, which may be a
 *  capture group index or name, that component of the MatchData is returned
 *  instead.
  2946. *
  2947. * If a +match_str+ is given, that string is returned if it occurs in
  2948. * the string.
  2949. *
  2950. * Returns +nil+ if the regular expression does not match or the match string
  2951. * cannot be found.
  2952. *
  2953. * a = "hello there"
  2954. *
  2955. * a[1] #=> "e"
  2956. * a[2, 3] #=> "llo"
  2957. * a[2..3] #=> "ll"
  2958. *
  2959. * a[-3, 2] #=> "er"
  2960. * a[7..-2] #=> "her"
  2961. * a[-4..-2] #=> "her"
  2962. * a[-2..-4] #=> ""
  2963. *
  2964. * a[11, 0] #=> ""
  2965. * a[11] #=> nil
  2966. * a[12, 0] #=> nil
  2967. * a[12..-1] #=> nil
  2968. *
  2969. * a[/[aeiou](.)\1/] #=> "ell"
  2970. * a[/[aeiou](.)\1/, 0] #=> "ell"
  2971. * a[/[aeiou](.)\1/, 1] #=> "l"
  2972. * a[/[aeiou](.)\1/, 2] #=> nil
  2973. *
  2974. * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
  2975. * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"] #=> "e"
  2976. *
  2977. * a["lo"] #=> "lo"
  2978. * a["bye"] #=> nil
  2979. */
  2980. static VALUE
  2981. rb_str_aref_m(int argc, VALUE *argv, VALUE str)
  2982. {
  2983. if (argc == 2) {
  2984. if (RB_TYPE_P(argv[0], T_REGEXP)) {
  2985. return rb_str_subpat(str, argv[0], argv[1]);
  2986. }
  2987. return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
  2988. }
  2989. rb_check_arity(argc, 1, 2);
  2990. return rb_str_aref(str, argv[0]);
  2991. }
/*
 * Removes the first +len+ bytes of +str+ in place and returns +str+.
 * +len+ is clamped to the string's length.  The code range of +str+ is
 * cleared afterwards.
 */
VALUE
rb_str_drop_bytes(VALUE str, long len)
{
    char *ptr = RSTRING_PTR(str);
    long olen = RSTRING_LEN(str), nlen;
    str_modifiable(str);
    if (len > olen) len = olen;
    nlen = olen - len;
    if (nlen <= RSTRING_EMBED_LEN_MAX) {
        /* the remainder fits into the embedded buffer: switch to the
         * embedded representation and copy the tail into it */
        char *oldptr = ptr;
        int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
        STR_SET_EMBED(str);
        STR_SET_EMBED_LEN(str, nlen);
        ptr = RSTRING(str)->as.ary;
        memmove(ptr, oldptr + len, nlen);
        /* free the old buffer only when NOEMBED was set without
         * ELTS_SHARED */
        if (fl == STR_NOEMBED) xfree(oldptr);
    }
    else {
        /* NOTE(review): the result of rb_str_new4() is discarded, so it is
         * presumably called for its effect on str (making it share its
         * buffer) before the pointer is advanced -- confirm */
        if (!STR_SHARED_P(str)) rb_str_new4(str);
        ptr = RSTRING(str)->as.heap.ptr += len;
        RSTRING(str)->as.heap.len = nlen;
    }
    ptr[nlen] = 0;
    ENC_CODERANGE_CLEAR(str);
    return str;
}
/*
 * Replaces the +len+ bytes of +str+ starting at byte offset +beg+ with the
 * bytes of +val+, in place.  +beg+ and +len+ are physical (byte) values;
 * rb_str_splice() converts character indexes before calling this.
 * Taint is propagated from +val+ to +str+.
 */
static void
rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
{
    /* removing a prefix without inserting anything is a plain byte drop */
    if (beg == 0 && RSTRING_LEN(val) == 0) {
        rb_str_drop_bytes(str, len);
        OBJ_INFECT(str, val);
        return;
    }
    rb_str_modify(str);
    if (len < RSTRING_LEN(val)) {
        /* expand string */
        RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
    }
    if (RSTRING_LEN(val) != len) {
        /* move the tail behind the replaced region to its new position */
        memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
                RSTRING_PTR(str) + beg + len,
                RSTRING_LEN(str) - (beg + len));
    }
    if (RSTRING_LEN(val) < beg && len < 0) {
        /* only reached with a negative len: zero-fill -len bytes at the
         * current end of the string */
        MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
    }
    if (RSTRING_LEN(val) > 0) {
        /* memmove rather than memcpy: NOTE(review) presumably because val
         * may share its buffer with str -- confirm */
        memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
    }
    STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
    if (RSTRING_PTR(str)) {
        RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    }
    OBJ_INFECT(str, val);
}
  3048. static void
  3049. rb_str_splice(VALUE str, long beg, long len, VALUE val)
  3050. {
  3051. long slen;
  3052. char *p, *e;
  3053. rb_encoding *enc;
  3054. int singlebyte = single_byte_optimizable(str);
  3055. int cr;
  3056. if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
  3057. StringValue(val);
  3058. enc = rb_enc_check(str, val);
  3059. slen = str_strlen(str, enc);
  3060. if (slen < beg) {
  3061. out_of_range:
  3062. rb_raise(rb_eIndexError, "index %ld out of string", beg);
  3063. }
  3064. if (beg < 0) {
  3065. if (-beg > slen) {
  3066. goto out_of_range;
  3067. }
  3068. beg += slen;
  3069. }
  3070. if (slen < len || slen < beg + len) {
  3071. len = slen - beg;
  3072. }
  3073. str_modify_keep_cr(str);
  3074. p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
  3075. if (!p) p = RSTRING_END(str);
  3076. e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
  3077. if (!e) e = RSTRING_END(str);
  3078. /* error check */
  3079. beg = p - RSTRING_PTR(str); /* physical position */
  3080. len = e - p; /* physical length */
  3081. rb_str_splice_0(str, beg, len, val);
  3082. rb_enc_associate(str, enc);
  3083. cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
  3084. if (cr != ENC_CODERANGE_BROKEN)
  3085. ENC_CODERANGE_SET(str, cr);
  3086. }
  3087. void
  3088. rb_str_update(VALUE str, long beg, long len, VALUE val)
  3089. {
  3090. rb_str_splice(str, beg, len, val);
  3091. }
  3092. static void
  3093. rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
  3094. {
  3095. int nth;
  3096. VALUE match;
  3097. long start, end, len;
  3098. rb_encoding *enc;
  3099. struct re_registers *regs;
  3100. if (rb_reg_search(re, str, 0, 0) < 0) {
  3101. rb_raise(rb_eIndexError, "regexp not matched");
  3102. }
  3103. match = rb_backref_get();
  3104. nth = rb_reg_backref_number(match, backref);
  3105. regs = RMATCH_REGS(match);
  3106. if (nth >= regs->num_regs) {
  3107. out_of_range:
  3108. rb_raise(rb_eIndexError, "index %d out of regexp", nth);
  3109. }
  3110. if (nth < 0) {
  3111. if (-nth >= regs->num_regs) {
  3112. goto out_of_range;
  3113. }
  3114. nth += regs->num_regs;
  3115. }
  3116. start = BEG(nth);
  3117. if (start == -1) {
  3118. rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
  3119. }
  3120. end = END(nth);
  3121. len = end - start;
  3122. StringValue(val);
  3123. enc = rb_enc_check(str, val);
  3124. rb_str_splice_0(str, start, len, val);
  3125. rb_enc_associate(str, enc);
  3126. }
  3127. static VALUE
  3128. rb_str_aset(VALUE str, VALUE indx, VALUE val)
  3129. {
  3130. long idx, beg;
  3131. if (FIXNUM_P(indx)) {
  3132. idx = FIX2LONG(indx);
  3133. num_index:
  3134. rb_str_splice(str, idx, 1, val);
  3135. return val;
  3136. }
  3137. if (SPECIAL_CONST_P(indx)) goto generic;
  3138. switch (TYPE(indx)) {
  3139. case T_REGEXP:
  3140. rb_str_subpat_set(str, indx, INT2FIX(0), val);
  3141. return val;
  3142. case T_STRING:
  3143. beg = rb_str_index(str, indx, 0);
  3144. if (beg < 0) {
  3145. rb_raise(rb_eIndexError, "string not matched");
  3146. }
  3147. beg = rb_str_sublen(str, beg);
  3148. rb_str_splice(str, beg, str_strlen(indx, 0), val);
  3149. return val;
  3150. generic:
  3151. default:
  3152. /* check if indx is Range */
  3153. {
  3154. long beg, len;
  3155. if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
  3156. rb_str_splice(str, beg, len, val);
  3157. return val;
  3158. }
  3159. }
  3160. idx = NUM2LONG(indx);
  3161. goto num_index;
  3162. }
  3163. }
  3164. /*
  3165. * call-seq:
  3166. * str[fixnum] = new_str
  3167. * str[fixnum, fixnum] = new_str
  3168. * str[range] = aString
  3169. * str[regexp] = new_str
  3170. * str[regexp, fixnum] = new_str
  3171. * str[regexp, name] = new_str
  3172. * str[other_str] = new_str
  3173. *
  3174. * Element Assignment---Replaces some or all of the content of <i>str</i>. The
  3175. * portion of the string affected is determined using the same criteria as
  3176. * <code>String#[]</code>. If the replacement string is not the same length as
  3177. * the text it is replacing, the string will be adjusted accordingly. If the
  3178. * regular expression or string is used as the index doesn't match a position
  3179. * in the string, <code>IndexError</code> is raised. If the regular expression
  3180. * form is used, the optional second <code>Fixnum</code> allows you to specify
  3181. * which portion of the match to replace (effectively using the
  3182. * <code>MatchData</code> indexing rules. The forms that take a
  3183. * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
  3184. * out of range; the <code>Range</code> form will raise a
  3185. * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
  3186. * forms will silently ignore the assignment.
  3187. */
  3188. static VALUE
  3189. rb_str_aset_m(int argc, VALUE *argv, VALUE str)
  3190. {
  3191. if (argc == 3) {
  3192. if (RB_TYPE_P(argv[0], T_REGEXP)) {
  3193. rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
  3194. }
  3195. else {
  3196. rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
  3197. }
  3198. return argv[2];
  3199. }
  3200. rb_check_arity(argc, 2, 3);
  3201. return rb_str_aset(str, argv[0], argv[1]);
  3202. }
  3203. /*
  3204. * call-seq:
  3205. * str.insert(index, other_str) -> str
  3206. *
  3207. * Inserts <i>other_str</i> before the character at the given
  3208. * <i>index</i>, modifying <i>str</i>. Negative indices count from the
  3209. * end of the string, and insert <em>after</em> the given character.
  3210. * The intent is insert <i>aString</i> so that it starts at the given
  3211. * <i>index</i>.
  3212. *
  3213. * "abcd".insert(0, 'X') #=> "Xabcd"
  3214. * "abcd".insert(3, 'X') #=> "abcXd"
  3215. * "abcd".insert(4, 'X') #=> "abcdX"
  3216. * "abcd".insert(-3, 'X') #=> "abXcd"
  3217. * "abcd".insert(-1, 'X') #=> "abcdX"
  3218. */
  3219. static VALUE
  3220. rb_str_insert(VALUE str, VALUE idx, VALUE str2)
  3221. {
  3222. long pos = NUM2LONG(idx);
  3223. if (pos == -1) {
  3224. return rb_str_append(str, str2);
  3225. }
  3226. else if (pos < 0) {
  3227. pos++;
  3228. }
  3229. rb_str_splice(str, pos, 0, str2);
  3230. return str;
  3231. }
  3232. /*
  3233. * call-seq:
  3234. * str.slice!(fixnum) -> fixnum or nil
  3235. * str.slice!(fixnum, fixnum) -> new_str or nil
  3236. * str.slice!(range) -> new_str or nil
  3237. * str.slice!(regexp) -> new_str or nil
  3238. * str.slice!(other_str) -> new_str or nil
  3239. *
  3240. * Deletes the specified portion from <i>str</i>, and returns the portion
  3241. * deleted.
  3242. *
  3243. * string = "this is a string"
  3244. * string.slice!(2) #=> "i"
  3245. * string.slice!(3..6) #=> " is "
  3246. * string.slice!(/s.*t/) #=> "sa st"
  3247. * string.slice!("r") #=> "r"
  3248. * string #=> "thing"
  3249. */
  3250. static VALUE
  3251. rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
  3252. {
  3253. VALUE result;
  3254. VALUE buf[3];
  3255. int i;
  3256. rb_check_arity(argc, 1, 2);
  3257. for (i=0; i<argc; i++) {
  3258. buf[i] = argv[i];
  3259. }
  3260. str_modify_keep_cr(str);
  3261. result = rb_str_aref_m(argc, buf, str);
  3262. if (!NIL_P(result)) {
  3263. buf[i] = rb_str_new(0,0);
  3264. rb_str_aset_m(argc+1, buf, str);
  3265. }
  3266. return result;
  3267. }
  3268. static VALUE
  3269. get_pat(VALUE pat, int quote)
  3270. {
  3271. VALUE val;
  3272. switch (TYPE(pat)) {
  3273. case T_REGEXP:
  3274. return pat;
  3275. case T_STRING:
  3276. break;
  3277. default:
  3278. val = rb_check_string_type(pat);
  3279. if (NIL_P(val)) {
  3280. Check_Type(pat, T_REGEXP);
  3281. }
  3282. pat = val;
  3283. }
  3284. if (quote) {
  3285. pat = rb_reg_quote(pat);
  3286. }
  3287. return rb_reg_regcomp(pat);
  3288. }
  3289. /*
  3290. * call-seq:
  3291. * str.sub!(pattern, replacement) -> str or nil
  3292. * str.sub!(pattern) {|match| block } -> str or nil
  3293. *
  3294. * Performs the same substitution as String#sub in-place.
  3295. *
  3296. * Returns +str+ if a substitution was performed or +nil+ if no substitution
  3297. * was performed.
  3298. */
  3299. static VALUE
  3300. rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
  3301. {
  3302. VALUE pat, repl, hash = Qnil;
  3303. int iter = 0;
  3304. int tainted = 0;
  3305. int untrusted = 0;
  3306. long plen;
  3307. int min_arity = rb_block_given_p() ? 1 : 2;
  3308. rb_check_arity(argc, min_arity, 2);
  3309. if (argc == 1) {
  3310. iter = 1;
  3311. }
  3312. else {
  3313. repl = argv[1];
  3314. hash = rb_check_hash_type(argv[1]);
  3315. if (NIL_P(hash)) {
  3316. StringValue(repl);
  3317. }
  3318. if (OBJ_TAINTED(repl)) tainted = 1;
  3319. if (OBJ_UNTRUSTED(repl)) untrusted = 1;
  3320. }
  3321. pat = get_pat(argv[0], 1);
  3322. str_modifiable(str);
  3323. if (rb_reg_search(pat, str, 0, 0) >= 0) {
  3324. rb_encoding *enc;
  3325. int cr = ENC_CODERANGE(str);
  3326. VALUE match = rb_backref_get();
  3327. struct re_registers *regs = RMATCH_REGS(match);
  3328. long beg0 = BEG(0);
  3329. long end0 = END(0);
  3330. char *p, *rp;
  3331. long len, rlen;
  3332. if (iter || !NIL_P(hash)) {
  3333. p = RSTRING_PTR(str); len = RSTRING_LEN(str);
  3334. if (iter) {
  3335. repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
  3336. }
  3337. else {
  3338. repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
  3339. repl = rb_obj_as_string(repl);
  3340. }
  3341. str_mod_check(str, p, len);
  3342. rb_check_frozen(str);
  3343. }
  3344. else {
  3345. repl = rb_reg_regsub(repl, str, regs, pat);
  3346. }
  3347. enc = rb_enc_compatible(str, repl);
  3348. if (!enc) {
  3349. rb_encoding *str_enc = STR_ENC_GET(str);
  3350. p = RSTRING_PTR(str); len = RSTRING_LEN(str);
  3351. if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
  3352. coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
  3353. rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
  3354. rb_enc_name(str_enc),
  3355. rb_enc_name(STR_ENC_GET(repl)));
  3356. }
  3357. enc = STR_ENC_GET(repl);
  3358. }
  3359. rb_str_modify(str);
  3360. rb_enc_associate(str, enc);
  3361. if (OBJ_TAINTED(repl)) tainted = 1;
  3362. if (OBJ_UNTRUSTED(repl)) untrusted = 1;
  3363. if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
  3364. int cr2 = ENC_CODERANGE(repl);
  3365. if (cr2 == ENC_CODERANGE_BROKEN ||
  3366. (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
  3367. cr = ENC_CODERANGE_UNKNOWN;
  3368. else
  3369. cr = cr2;
  3370. }
  3371. plen = end0 - beg0;
  3372. rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
  3373. len = RSTRING_LEN(str);
  3374. if (rlen > plen) {
  3375. RESIZE_CAPA(str, len + rlen - plen);
  3376. }
  3377. p = RSTRING_PTR(str);
  3378. if (rlen != plen) {
  3379. memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
  3380. }
  3381. memcpy(p + beg0, rp, rlen);
  3382. len += rlen - plen;
  3383. STR_SET_LEN(str, len);
  3384. RSTRING_PTR(str)[len] = '\0';
  3385. ENC_CODERANGE_SET(str, cr);
  3386. if (tainted) OBJ_TAINT(str);
  3387. if (untrusted) OBJ_UNTRUST(str);
  3388. return str;
  3389. }
  3390. return Qnil;
  3391. }
  3392. /*
  3393. * call-seq:
  3394. * str.sub(pattern, replacement) -> new_str
  3395. * str.sub(pattern, hash) -> new_str
  3396. * str.sub(pattern) {|match| block } -> new_str
  3397. *
  3398. * Returns a copy of +str+ with the _first_ occurrence of +pattern+
  3399. * replaced by the second argument. The +pattern+ is typically a Regexp; if
  3400. * given as a String, any regular expression metacharacters it contains will
  3401. * be interpreted literally, e.g. <code>'\\\d'</code> will match a backlash
  3402. * followed by 'd', instead of a digit.
  3403. *
  3404. * If +replacement+ is a String it will be substituted for the matched text.
  3405. * It may contain back-references to the pattern's capture groups of the form
  3406. * <code>"\\d"</code>, where <i>d</i> is a group number, or
  3407. * <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
  3408. * double-quoted string, both back-references must be preceded by an
  3409. * additional backslash. However, within +replacement+ the special match
  3410. * variables, such as <code>&$</code>, will not refer to the current match.
  3411. *
  3412. * If the second argument is a Hash, and the matched text is one of its keys,
  3413. * the corresponding value is the replacement string.
  3414. *
  3415. * In the block form, the current match string is passed in as a parameter,
  3416. * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
  3417. * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
  3418. * returned by the block will be substituted for the match on each call.
  3419. *
  3420. * The result inherits any tainting in the original string or any supplied
  3421. * replacement string.
  3422. *
  3423. * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
  3424. * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
  3425. * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
  3426. * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
  3427. * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
  3428. * #=> "Is /bin/bash your preferred shell?"
  3429. */
  3430. static VALUE
  3431. rb_str_sub(int argc, VALUE *argv, VALUE str)
  3432. {
  3433. str = rb_str_dup(str);
  3434. rb_str_sub_bang(argc, argv, str);
  3435. return str;
  3436. }
  3437. static VALUE
  3438. str_gsub(int argc, VALUE *argv, VALUE str, int bang)
  3439. {
  3440. VALUE pat, val, repl, match, dest, hash = Qnil;
  3441. struct re_registers *regs;
  3442. long beg, n;
  3443. long beg0, end0;
  3444. long offset, blen, slen, len, last;
  3445. int iter = 0;
  3446. char *sp, *cp;
  3447. int tainted = 0;
  3448. rb_encoding *str_enc;
  3449. switch (argc) {
  3450. case 1:
  3451. RETURN_ENUMERATOR(str, argc, argv);
  3452. iter = 1;
  3453. break;
  3454. case 2:
  3455. repl = argv[1];
  3456. hash = rb_check_hash_type(argv[1]);
  3457. if (NIL_P(hash)) {
  3458. StringValue(repl);
  3459. }
  3460. if (OBJ_TAINTED(repl)) tainted = 1;
  3461. break;
  3462. default:
  3463. rb_check_arity(argc, 1, 2);
  3464. }
  3465. pat = get_pat(argv[0], 1);
  3466. beg = rb_reg_search(pat, str, 0, 0);
  3467. if (beg < 0) {
  3468. if (bang) return Qnil; /* no match, no substitution */
  3469. return rb_str_dup(str);
  3470. }
  3471. offset = 0;
  3472. n = 0;
  3473. blen = RSTRING_LEN(str) + 30; /* len + margin */
  3474. dest = rb_str_buf_new(blen);
  3475. sp = RSTRING_PTR(str);
  3476. slen = RSTRING_LEN(str);
  3477. cp = sp;
  3478. str_enc = STR_ENC_GET(str);
  3479. rb_enc_associate(dest, str_enc);
  3480. ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
  3481. do {
  3482. n++;
  3483. match = rb_backref_get();
  3484. regs = RMATCH_REGS(match);
  3485. beg0 = BEG(0);
  3486. end0 = END(0);
  3487. if (iter || !NIL_P(hash)) {
  3488. if (iter) {
  3489. val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
  3490. }
  3491. else {
  3492. val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
  3493. val = rb_obj_as_string(val);
  3494. }
  3495. str_mod_check(str, sp, slen);
  3496. if (val == dest) { /* paranoid check [ruby-dev:24827] */
  3497. rb_raise(rb_eRuntimeError, "block should not cheat");
  3498. }
  3499. }
  3500. else {
  3501. val = rb_reg_regsub(repl, str, regs, pat);
  3502. }
  3503. if (OBJ_TAINTED(val)) tainted = 1;
  3504. len = beg - offset; /* copy pre-match substr */
  3505. if (len) {
  3506. rb_enc_str_buf_cat(dest, cp, len, str_enc);
  3507. }
  3508. rb_str_buf_append(dest, val);
  3509. last = offset;
  3510. offset = end0;
  3511. if (beg0 == end0) {
  3512. /*
  3513. * Always consume at least one character of the input string
  3514. * in order to prevent infinite loops.
  3515. */
  3516. if (RSTRING_LEN(str) <= end0) break;
  3517. len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
  3518. rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
  3519. offset = end0 + len;
  3520. }
  3521. cp = RSTRING_PTR(str) + offset;
  3522. if (offset > RSTRING_LEN(str)) break;
  3523. beg = rb_reg_search(pat, str, offset, 0);
  3524. } while (beg >= 0);
  3525. if (RSTRING_LEN(str) > offset) {
  3526. rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
  3527. }
  3528. rb_reg_search(pat, str, last, 0);
  3529. if (bang) {
  3530. rb_str_shared_replace(str, dest);
  3531. }
  3532. else {
  3533. RBASIC(dest)->klass = rb_obj_class(str);
  3534. OBJ_INFECT(dest, str);
  3535. str = dest;
  3536. }
  3537. if (tainted) OBJ_TAINT(str);
  3538. return str;
  3539. }
  3540. /*
  3541. * call-seq:
  3542. * str.gsub!(pattern, replacement) -> str or nil
  3543. * str.gsub!(pattern) {|match| block } -> str or nil
  3544. * str.gsub!(pattern) -> an_enumerator
  3545. *
  3546. * Performs the substitutions of <code>String#gsub</code> in place, returning
  3547. * <i>str</i>, or <code>nil</code> if no substitutions were performed.
  3548. * If no block and no <i>replacement</i> is given, an enumerator is returned instead.
  3549. */
  3550. static VALUE
  3551. rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
  3552. {
  3553. str_modify_keep_cr(str);
  3554. return str_gsub(argc, argv, str, 1);
  3555. }
  3556. /*
  3557. * call-seq:
  3558. * str.gsub(pattern, replacement) -> new_str
  3559. * str.gsub(pattern, hash) -> new_str
  3560. * str.gsub(pattern) {|match| block } -> new_str
  3561. * str.gsub(pattern) -> enumerator
  3562. *
  3563. * Returns a copy of <i>str</i> with the <em>all</em> occurrences of
  3564. * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
  3565. * typically a <code>Regexp</code>; if given as a <code>String</code>, any
  3566. * regular expression metacharacters it contains will be interpreted
  3567. * literally, e.g. <code>'\\\d'</code> will match a backlash followed by 'd',
  3568. * instead of a digit.
  3569. *
  3570. * If <i>replacement</i> is a <code>String</code> it will be substituted for
  3571. * the matched text. It may contain back-references to the pattern's capture
  3572. * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
  3573. * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
  3574. * double-quoted string, both back-references must be preceded by an
  3575. * additional backslash. However, within <i>replacement</i> the special match
  3576. * variables, such as <code>&$</code>, will not refer to the current match.
  3577. *
  3578. * If the second argument is a <code>Hash</code>, and the matched text is one
  3579. * of its keys, the corresponding value is the replacement string.
  3580. *
  3581. * In the block form, the current match string is passed in as a parameter,
  3582. * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
  3583. * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
  3584. * returned by the block will be substituted for the match on each call.
  3585. *
  3586. * The result inherits any tainting in the original string or any supplied
  3587. * replacement string.
  3588. *
  3589. * When neither a block nor a second argument is supplied, an
  3590. * <code>Enumerator</code> is returned.
  3591. *
  3592. * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
  3593. * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
  3594. * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
  3595. * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
  3596. * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
  3597. */
  3598. static VALUE
  3599. rb_str_gsub(int argc, VALUE *argv, VALUE str)
  3600. {
  3601. return str_gsub(argc, argv, str, 0);
  3602. }
  3603. /*
  3604. * call-seq:
  3605. * str.replace(other_str) -> str
  3606. *
  3607. * Replaces the contents and taintedness of <i>str</i> with the corresponding
  3608. * values in <i>other_str</i>.
  3609. *
  3610. * s = "hello" #=> "hello"
  3611. * s.replace "world" #=> "world"
  3612. */
  3613. VALUE
  3614. rb_str_replace(VALUE str, VALUE str2)
  3615. {
  3616. str_modifiable(str);
  3617. if (str == str2) return str;
  3618. StringValue(str2);
  3619. str_discard(str);
  3620. return str_replace(str, str2);
  3621. }
  3622. /*
  3623. * call-seq:
  3624. * string.clear -> string
  3625. *
  3626. * Makes string empty.
  3627. *
  3628. * a = "abcde"
  3629. * a.clear #=> ""
  3630. */
  3631. static VALUE
  3632. rb_str_clear(VALUE str)
  3633. {
  3634. str_discard(str);
  3635. STR_SET_EMBED(str);
  3636. STR_SET_EMBED_LEN(str, 0);
  3637. RSTRING_PTR(str)[0] = 0;
  3638. if (rb_enc_asciicompat(STR_ENC_GET(str)))
  3639. ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
  3640. else
  3641. ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
  3642. return str;
  3643. }
  3644. /*
  3645. * call-seq:
  3646. * string.chr -> string
  3647. *
  3648. * Returns a one-character string at the beginning of the string.
  3649. *
  3650. * a = "abcde"
  3651. * a.chr #=> "a"
  3652. */
  3653. static VALUE
  3654. rb_str_chr(VALUE str)
  3655. {
  3656. return rb_str_substr(str, 0, 1);
  3657. }
  3658. /*
  3659. * call-seq:
  3660. * str.getbyte(index) -> 0 .. 255
  3661. *
  3662. * returns the <i>index</i>th byte as an integer.
  3663. */
  3664. static VALUE
  3665. rb_str_getbyte(VALUE str, VALUE index)
  3666. {
  3667. long pos = NUM2LONG(index);
  3668. if (pos < 0)
  3669. pos += RSTRING_LEN(str);
  3670. if (pos < 0 || RSTRING_LEN(str) <= pos)
  3671. return Qnil;
  3672. return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
  3673. }
  3674. /*
  3675. * call-seq:
  3676. * str.setbyte(index, int) -> int
  3677. *
  3678. * modifies the <i>index</i>th byte as <i>int</i>.
  3679. */
  3680. static VALUE
  3681. rb_str_setbyte(VALUE str, VALUE index, VALUE value)
  3682. {
  3683. long pos = NUM2LONG(index);
  3684. int byte = NUM2INT(value);
  3685. rb_str_modify(str);
  3686. if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
  3687. rb_raise(rb_eIndexError, "index %ld out of string", pos);
  3688. if (pos < 0)
  3689. pos += RSTRING_LEN(str);
  3690. RSTRING_PTR(str)[pos] = byte;
  3691. return value;
  3692. }
  3693. static VALUE
  3694. str_byte_substr(VALUE str, long beg, long len)
  3695. {
  3696. char *p, *s = RSTRING_PTR(str);
  3697. long n = RSTRING_LEN(str);
  3698. VALUE str2;
  3699. if (beg > n || len < 0) return Qnil;
  3700. if (beg < 0) {
  3701. beg += n;
  3702. if (beg < 0) return Qnil;
  3703. }
  3704. if (beg + len > n)
  3705. len = n - beg;
  3706. if (len <= 0) {
  3707. len = 0;
  3708. p = 0;
  3709. }
  3710. else
  3711. p = s + beg;
  3712. if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
  3713. str2 = rb_str_new4(str);
  3714. str2 = str_new3(rb_obj_class(str2), str2);
  3715. RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
  3716. RSTRING(str2)->as.heap.len = len;
  3717. }
  3718. else {
  3719. str2 = rb_str_new5(str, p, len);
  3720. rb_enc_cr_str_copy_for_substr(str2, str);
  3721. OBJ_INFECT(str2, str);
  3722. }
  3723. return str2;
  3724. }
  3725. static VALUE
  3726. str_byte_aref(VALUE str, VALUE indx)
  3727. {
  3728. long idx;
  3729. switch (TYPE(indx)) {
  3730. case T_FIXNUM:
  3731. idx = FIX2LONG(indx);
  3732. num_index:
  3733. str = str_byte_substr(str, idx, 1);
  3734. if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
  3735. return str;
  3736. default:
  3737. /* check if indx is Range */
  3738. {
  3739. long beg, len = RSTRING_LEN(str);
  3740. switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
  3741. case Qfalse:
  3742. break;
  3743. case Qnil:
  3744. return Qnil;
  3745. default:
  3746. return str_byte_substr(str, beg, len);
  3747. }
  3748. }
  3749. idx = NUM2LONG(indx);
  3750. goto num_index;
  3751. }
  3752. UNREACHABLE;
  3753. }
  3754. /*
  3755. * call-seq:
  3756. * str.byteslice(fixnum) -> new_str or nil
  3757. * str.byteslice(fixnum, fixnum) -> new_str or nil
  3758. * str.byteslice(range) -> new_str or nil
  3759. *
  3760. * Byte Reference---If passed a single <code>Fixnum</code>, returns a
  3761. * substring of one byte at that position. If passed two <code>Fixnum</code>
  3762. * objects, returns a substring starting at the offset given by the first, and
  3763. * a length given by the second. If given a <code>Range</code>, a substring containing
  3764. * bytes at offsets given by the range is returned. In all three cases, if
  3765. * an offset is negative, it is counted from the end of <i>str</i>. Returns
  3766. * <code>nil</code> if the initial offset falls outside the string, the length
  3767. * is negative, or the beginning of the range is greater than the end.
  3768. * The encoding of the resulted string keeps original encoding.
  3769. *
  3770. * "hello".byteslice(1) #=> "e"
  3771. * "hello".byteslice(-1) #=> "o"
  3772. * "hello".byteslice(1, 2) #=> "el"
  3773. * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
  3774. * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
  3775. */
  3776. static VALUE
  3777. rb_str_byteslice(int argc, VALUE *argv, VALUE str)
  3778. {
  3779. if (argc == 2) {
  3780. return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
  3781. }
  3782. rb_check_arity(argc, 1, 2);
  3783. return str_byte_aref(str, argv[0]);
  3784. }
  3785. /*
  3786. * call-seq:
  3787. * str.reverse -> new_str
  3788. *
  3789. * Returns a new string with the characters from <i>str</i> in reverse order.
  3790. *
  3791. * "stressed".reverse #=> "desserts"
  3792. */
  3793. static VALUE
  3794. rb_str_reverse(VALUE str)
  3795. {
  3796. rb_encoding *enc;
  3797. VALUE rev;
  3798. char *s, *e, *p;
  3799. int single = 1;
  3800. if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
  3801. enc = STR_ENC_GET(str);
  3802. rev = rb_str_new5(str, 0, RSTRING_LEN(str));
  3803. s = RSTRING_PTR(str); e = RSTRING_END(str);
  3804. p = RSTRING_END(rev);
  3805. if (RSTRING_LEN(str) > 1) {
  3806. if (single_byte_optimizable(str)) {
  3807. while (s < e) {
  3808. *--p = *s++;
  3809. }
  3810. }
  3811. else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
  3812. while (s < e) {
  3813. int clen = rb_enc_fast_mbclen(s, e, enc);
  3814. if (clen > 1 || (*s & 0x80)) single = 0;
  3815. p -= clen;
  3816. memcpy(p, s, clen);
  3817. s += clen;
  3818. }
  3819. }
  3820. else {
  3821. while (s < e) {
  3822. int clen = rb_enc_mbclen(s, e, enc);
  3823. if (clen > 1 || (*s & 0x80)) single = 0;
  3824. p -= clen;
  3825. memcpy(p, s, clen);
  3826. s += clen;
  3827. }
  3828. }
  3829. }
  3830. STR_SET_LEN(rev, RSTRING_LEN(str));
  3831. OBJ_INFECT(rev, str);
  3832. if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
  3833. if (single) {
  3834. ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
  3835. }
  3836. else {
  3837. ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
  3838. }
  3839. }
  3840. rb_enc_cr_str_copy_for_substr(rev, str);
  3841. return rev;
  3842. }
  3843. /*
  3844. * call-seq:
  3845. * str.reverse! -> str
  3846. *
  3847. * Reverses <i>str</i> in place.
  3848. */
  3849. static VALUE
  3850. rb_str_reverse_bang(VALUE str)
  3851. {
  3852. if (RSTRING_LEN(str) > 1) {
  3853. if (single_byte_optimizable(str)) {
  3854. char *s, *e, c;
  3855. str_modify_keep_cr(str);
  3856. s = RSTRING_PTR(str);
  3857. e = RSTRING_END(str) - 1;
  3858. while (s < e) {
  3859. c = *s;
  3860. *s++ = *e;
  3861. *e-- = c;
  3862. }
  3863. }
  3864. else {
  3865. rb_str_shared_replace(str, rb_str_reverse(str));
  3866. }
  3867. }
  3868. else {
  3869. str_modify_keep_cr(str);
  3870. }
  3871. return str;
  3872. }
  3873. /*
  3874. * call-seq:
  3875. * str.include? other_str -> true or false
  3876. *
  3877. * Returns <code>true</code> if <i>str</i> contains the given string or
  3878. * character.
  3879. *
  3880. * "hello".include? "lo" #=> true
  3881. * "hello".include? "ol" #=> false
  3882. * "hello".include? ?h #=> true
  3883. */
  3884. static VALUE
  3885. rb_str_include(VALUE str, VALUE arg)
  3886. {
  3887. long i;
  3888. StringValue(arg);
  3889. i = rb_str_index(str, arg, 0);
  3890. if (i == -1) return Qfalse;
  3891. return Qtrue;
  3892. }
  3893. /*
  3894. * call-seq:
  3895. * str.to_i(base=10) -> integer
  3896. *
  3897. * Returns the result of interpreting leading characters in <i>str</i> as an
  3898. * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
  3899. * end of a valid number are ignored. If there is not a valid number at the
  3900. * start of <i>str</i>, <code>0</code> is returned. This method never raises an
  3901. * exception when <i>base</i> is valid.
  3902. *
  3903. * "12345".to_i #=> 12345
  3904. * "99 red balloons".to_i #=> 99
  3905. * "0a".to_i #=> 0
  3906. * "0a".to_i(16) #=> 10
  3907. * "hello".to_i #=> 0
  3908. * "1100101".to_i(2) #=> 101
  3909. * "1100101".to_i(8) #=> 294977
  3910. * "1100101".to_i(10) #=> 1100101
  3911. * "1100101".to_i(16) #=> 17826049
  3912. */
  3913. static VALUE
  3914. rb_str_to_i(int argc, VALUE *argv, VALUE str)
  3915. {
  3916. int base;
  3917. if (argc == 0) base = 10;
  3918. else {
  3919. VALUE b;
  3920. rb_scan_args(argc, argv, "01", &b);
  3921. base = NUM2INT(b);
  3922. }
  3923. if (base < 0) {
  3924. rb_raise(rb_eArgError, "invalid radix %d", base);
  3925. }
  3926. return rb_str_to_inum(str, base, FALSE);
  3927. }
  3928. /*
  3929. * call-seq:
  3930. * str.to_f -> float
  3931. *
  3932. * Returns the result of interpreting leading characters in <i>str</i> as a
  3933. * floating point number. Extraneous characters past the end of a valid number
  3934. * are ignored. If there is not a valid number at the start of <i>str</i>,
  3935. * <code>0.0</code> is returned. This method never raises an exception.
  3936. *
  3937. * "123.45e1".to_f #=> 1234.5
  3938. * "45.67 degrees".to_f #=> 45.67
  3939. * "thx1138".to_f #=> 0.0
  3940. */
  3941. static VALUE
  3942. rb_str_to_f(VALUE str)
  3943. {
  3944. return DBL2NUM(rb_str_to_dbl(str, FALSE));
  3945. }
  3946. /*
  3947. * call-seq:
  3948. * str.to_s -> str
  3949. * str.to_str -> str
  3950. *
  3951. * Returns the receiver.
  3952. */
  3953. static VALUE
  3954. rb_str_to_s(VALUE str)
  3955. {
  3956. if (rb_obj_class(str) != rb_cString) {
  3957. return str_duplicate(rb_cString, str);
  3958. }
  3959. return str;
  3960. }
  #if 0
  /*
   * Compiled out.  Encodes code point +c+ in encoding +enc+ into a scratch
   * buffer and appends the resulting bytes to +str+.
   */
  static void
  str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
  {
      char encoded[RUBY_MAX_CHAR_LEN];
      int width = rb_enc_codelen(c, enc);

      rb_enc_mbcput(c, encoded, enc);
      rb_enc_str_buf_cat(str, encoded, width, enc);
  }
  #endif
  3971. #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
  3972. int
  3973. rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
  3974. {
  3975. char buf[CHAR_ESC_LEN + 1];
  3976. int l;
  3977. #if SIZEOF_INT > 4
  3978. c &= 0xffffffff;
  3979. #endif
  3980. if (unicode_p) {
  3981. if (c < 0x7F && ISPRINT(c)) {
  3982. snprintf(buf, CHAR_ESC_LEN, "%c", c);
  3983. }
  3984. else if (c < 0x10000) {
  3985. snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
  3986. }
  3987. else {
  3988. snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
  3989. }
  3990. }
  3991. else {
  3992. if (c < 0x100) {
  3993. snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
  3994. }
  3995. else {
  3996. snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
  3997. }
  3998. }
  3999. l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
  4000. rb_str_buf_cat(result, buf, l);
  4001. return l;
  4002. }
  4003. /*
  4004. * call-seq:
  4005. * str.inspect -> string
  4006. *
  4007. * Returns a printable version of _str_, surrounded by quote marks,
  4008. * with special characters escaped.
  4009. *
  4010. * str = "hello"
  4011. * str[3] = "\b"
  4012. * str.inspect #=> "\"hel\\bo\""
  4013. */
  4014. VALUE
  4015. rb_str_inspect(VALUE str)
  4016. {
  4017. rb_encoding *enc = STR_ENC_GET(str);
  4018. const char *p, *pend, *prev;
  4019. char buf[CHAR_ESC_LEN + 1];
  4020. VALUE result = rb_str_buf_new(0);
  4021. rb_encoding *resenc = rb_default_internal_encoding();
  4022. int unicode_p = rb_enc_unicode_p(enc);
  4023. int asciicompat = rb_enc_asciicompat(enc);
  4024. static rb_encoding *utf16, *utf32;
  4025. if (!utf16) utf16 = rb_enc_find("UTF-16");
  4026. if (!utf32) utf32 = rb_enc_find("UTF-32");
  4027. if (resenc == NULL) resenc = rb_default_external_encoding();
  4028. if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
  4029. rb_enc_associate(result, resenc);
  4030. str_buf_cat2(result, "\"");
  4031. p = RSTRING_PTR(str); pend = RSTRING_END(str);
  4032. prev = p;
  4033. if (enc == utf16) {
  4034. const unsigned char *q = (const unsigned char *)p;
  4035. if (q[0] == 0xFE && q[1] == 0xFF)
  4036. enc = rb_enc_find("UTF-16BE");
  4037. else if (q[0] == 0xFF && q[1] == 0xFE)
  4038. enc = rb_enc_find("UTF-16LE");
  4039. else
  4040. unicode_p = 0;
  4041. }
  4042. else if (enc == utf32) {
  4043. const unsigned char *q = (const unsigned char *)p;
  4044. if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
  4045. enc = rb_enc_find("UTF-32BE");
  4046. else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
  4047. enc = rb_enc_find("UTF-32LE");
  4048. else
  4049. unicode_p = 0;
  4050. }
  4051. while (p < pend) {
  4052. unsigned int c, cc;
  4053. int n;
  4054. n = rb_enc_precise_mbclen(p, pend, enc);
  4055. if (!MBCLEN_CHARFOUND_P(n)) {
  4056. if (p > prev) str_buf_cat(result, prev, p - prev);
  4057. n = rb_enc_mbminlen(enc);
  4058. if (pend < p + n)
  4059. n = (int)(pend - p);
  4060. while (n--) {
  4061. snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
  4062. str_buf_cat(result, buf, strlen(buf));
  4063. prev = ++p;
  4064. }
  4065. continue;
  4066. }
  4067. n = MBCLEN_CHARFOUND_LEN(n);
  4068. c = rb_enc_mbc_to_codepoint(p, pend, enc);
  4069. p += n;
  4070. if ((asciicompat || unicode_p) &&
  4071. (c == '"'|| c == '\\' ||
  4072. (c == '#' &&
  4073. p < pend &&
  4074. MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
  4075. (cc = rb_enc_codepoint(p,pend,enc),
  4076. (cc == '$' || cc == '@' || cc == '{'))))) {
  4077. if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
  4078. str_buf_cat2(result, "\\");
  4079. if (asciicompat || enc == resenc) {
  4080. prev = p - n;
  4081. continue;
  4082. }
  4083. }
  4084. switch (c) {
  4085. case '\0': cc = '0'; break;
  4086. case '\n': cc = 'n'; break;
  4087. case '\r': cc = 'r'; break;
  4088. case '\t': cc = 't'; break;
  4089. case '\f': cc = 'f'; break;
  4090. case '\013': cc = 'v'; break;
  4091. case '\010': cc = 'b'; break;
  4092. case '\007': cc = 'a'; break;
  4093. case 033: cc = 'e'; break;
  4094. default: cc = 0; break;
  4095. }
  4096. if (cc) {
  4097. if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
  4098. buf[0] = '\\';
  4099. buf[1] = (char)cc;
  4100. str_buf_cat(result, buf, 2);
  4101. prev = p;
  4102. continue;
  4103. }
  4104. if ((enc == resenc && rb_enc_isprint(c, enc)) ||
  4105. (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
  4106. continue;
  4107. }
  4108. else {
  4109. if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
  4110. rb_str_buf_cat_escaped_char(result, c, unicode_p);
  4111. prev = p;
  4112. continue;
  4113. }
  4114. }
  4115. if (p > prev) str_buf_cat(result, prev, p - prev);
  4116. str_buf_cat2(result, "\"");
  4117. OBJ_INFECT(result, str);
  4118. return result;
  4119. }
  4120. #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
  4121. /*
  4122. * call-seq:
  4123. * str.dump -> new_str
  4124. *
  4125. * Produces a version of +str+ with all non-printing characters replaced by
  4126. * <code>\nnn</code> notation and all special characters escaped.
  4127. *
  4128. * "hello \n ''".dump #=> "\"hello \\n ''\""
  4129. */
  4130. VALUE
  4131. rb_str_dump(VALUE str)
  4132. {
  4133. rb_encoding *enc = rb_enc_get(str);
  4134. long len;
  4135. const char *p, *pend;
  4136. char *q, *qend;
  4137. VALUE result;
  4138. int u8 = (enc == rb_utf8_encoding());
  4139. len = 2; /* "" */
  4140. p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
  4141. while (p < pend) {
  4142. unsigned char c = *p++;
  4143. switch (c) {
  4144. case '"': case '\\':
  4145. case '\n': case '\r':
  4146. case '\t': case '\f':
  4147. case '\013': case '\010': case '\007': case '\033':
  4148. len += 2;
  4149. break;
  4150. case '#':
  4151. len += IS_EVSTR(p, pend) ? 2 : 1;
  4152. break;
  4153. default:
  4154. if (ISPRINT(c)) {
  4155. len++;
  4156. }
  4157. else {
  4158. if (u8) { /* \u{NN} */
  4159. int n = rb_enc_precise_mbclen(p-1, pend, enc);
  4160. if (MBCLEN_CHARFOUND_P(n-1)) {
  4161. unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
  4162. while (cc >>= 4) len++;
  4163. len += 5;
  4164. p += MBCLEN_CHARFOUND_LEN(n)-1;
  4165. break;
  4166. }
  4167. }
  4168. len += 4; /* \xNN */
  4169. }
  4170. break;
  4171. }
  4172. }
  4173. if (!rb_enc_asciicompat(enc)) {
  4174. len += 19; /* ".force_encoding('')" */
  4175. len += strlen(enc->name);
  4176. }
  4177. result = rb_str_new5(str, 0, len);
  4178. p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
  4179. q = RSTRING_PTR(result); qend = q + len + 1;
  4180. *q++ = '"';
  4181. while (p < pend) {
  4182. unsigned char c = *p++;
  4183. if (c == '"' || c == '\\') {
  4184. *q++ = '\\';
  4185. *q++ = c;
  4186. }
  4187. else if (c == '#') {
  4188. if (IS_EVSTR(p, pend)) *q++ = '\\';
  4189. *q++ = '#';
  4190. }
  4191. else if (c == '\n') {
  4192. *q++ = '\\';
  4193. *q++ = 'n';
  4194. }
  4195. else if (c == '\r') {
  4196. *q++ = '\\';
  4197. *q++ = 'r';
  4198. }
  4199. else if (c == '\t') {
  4200. *q++ = '\\';
  4201. *q++ = 't';
  4202. }
  4203. else if (c == '\f') {
  4204. *q++ = '\\';
  4205. *q++ = 'f';
  4206. }
  4207. else if (c == '\013') {
  4208. *q++ = '\\';
  4209. *q++ = 'v';
  4210. }
  4211. else if (c == '\010') {
  4212. *q++ = '\\';
  4213. *q++ = 'b';
  4214. }
  4215. else if (c == '\007') {
  4216. *q++ = '\\';
  4217. *q++ = 'a';
  4218. }
  4219. else if (c == '\033') {
  4220. *q++ = '\\';
  4221. *q++ = 'e';
  4222. }
  4223. else if (ISPRINT(c)) {
  4224. *q++ = c;
  4225. }
  4226. else {
  4227. *q++ = '\\';
  4228. if (u8) {
  4229. int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
  4230. if (MBCLEN_CHARFOUND_P(n)) {
  4231. int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
  4232. p += n;
  4233. snprintf(q, qend-q, "u{%x}", cc);
  4234. q += strlen(q);
  4235. continue;
  4236. }
  4237. }
  4238. snprintf(q, qend-q, "x%02X", c);
  4239. q += 3;
  4240. }
  4241. }
  4242. *q++ = '"';
  4243. *q = '\0';
  4244. if (!rb_enc_asciicompat(enc)) {
  4245. snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
  4246. enc = rb_ascii8bit_encoding();
  4247. }
  4248. OBJ_INFECT(result, str);
  4249. /* result from dump is ASCII */
  4250. rb_enc_associate(result, enc);
  4251. ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
  4252. return result;
  4253. }
  4254. static void
  4255. rb_str_check_dummy_enc(rb_encoding *enc)
  4256. {
  4257. if (rb_enc_dummy_p(enc)) {
  4258. rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
  4259. rb_enc_name(enc));
  4260. }
  4261. }
  4262. /*
  4263. * call-seq:
  4264. * str.upcase! -> str or nil
  4265. *
  4266. * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
  4267. * were made.
  4268. * Note: case replacement is effective only in ASCII region.
  4269. */
  4270. static VALUE
  4271. rb_str_upcase_bang(VALUE str)
  4272. {
  4273. rb_encoding *enc;
  4274. char *s, *send;
  4275. int modify = 0;
  4276. int n;
  4277. str_modify_keep_cr(str);
  4278. enc = STR_ENC_GET(str);
  4279. rb_str_check_dummy_enc(enc);
  4280. s = RSTRING_PTR(str); send = RSTRING_END(str);
  4281. if (single_byte_optimizable(str)) {
  4282. while (s < send) {
  4283. unsigned int c = *(unsigned char*)s;
  4284. if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
  4285. *s = 'A' + (c - 'a');
  4286. modify = 1;
  4287. }
  4288. s++;
  4289. }
  4290. }
  4291. else {
  4292. int ascompat = rb_enc_asciicompat(enc);
  4293. while (s < send) {
  4294. unsigned int c;
  4295. if (ascompat && (c = *(unsigned char*)s) < 0x80) {
  4296. if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
  4297. *s = 'A' + (c - 'a');
  4298. modify = 1;
  4299. }
  4300. s++;
  4301. }
  4302. else {
  4303. c = rb_enc_codepoint_len(s, send, &n, enc);
  4304. if (rb_enc_islower(c, enc)) {
  4305. /* assuming toupper returns codepoint with same size */
  4306. rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
  4307. modify = 1;
  4308. }
  4309. s += n;
  4310. }
  4311. }
  4312. }
  4313. if (modify) return str;
  4314. return Qnil;
  4315. }
  4316. /*
  4317. * call-seq:
  4318. * str.upcase -> new_str
  4319. *
  4320. * Returns a copy of <i>str</i> with all lowercase letters replaced with their
  4321. * uppercase counterparts. The operation is locale insensitive---only
  4322. * characters ``a'' to ``z'' are affected.
  4323. * Note: case replacement is effective only in ASCII region.
  4324. *
  4325. * "hEllO".upcase #=> "HELLO"
  4326. */
  4327. static VALUE
  4328. rb_str_upcase(VALUE str)
  4329. {
  4330. str = rb_str_dup(str);
  4331. rb_str_upcase_bang(str);
  4332. return str;
  4333. }
  4334. /*
  4335. * call-seq:
  4336. * str.downcase! -> str or nil
  4337. *
  4338. * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
  4339. * changes were made.
  4340. * Note: case replacement is effective only in ASCII region.
  4341. */
  4342. static VALUE
  4343. rb_str_downcase_bang(VALUE str)
  4344. {
  4345. rb_encoding *enc;
  4346. char *s, *send;
  4347. int modify = 0;
  4348. str_modify_keep_cr(str);
  4349. enc = STR_ENC_GET(str);
  4350. rb_str_check_dummy_enc(enc);
  4351. s = RSTRING_PTR(str); send = RSTRING_END(str);
  4352. if (single_byte_optimizable(str)) {
  4353. while (s < send) {
  4354. unsigned int c = *(unsigned char*)s;
  4355. if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
  4356. *s = 'a' + (c - 'A');
  4357. modify = 1;
  4358. }
  4359. s++;
  4360. }
  4361. }
  4362. else {
  4363. int ascompat = rb_enc_asciicompat(enc);
  4364. while (s < send) {
  4365. unsigned int c;
  4366. int n;
  4367. if (ascompat && (c = *(unsigned char*)s) < 0x80) {
  4368. if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
  4369. *s = 'a' + (c - 'A');
  4370. modify = 1;
  4371. }
  4372. s++;
  4373. }
  4374. else {
  4375. c = rb_enc_codepoint_len(s, send, &n, enc);
  4376. if (rb_enc_isupper(c, enc)) {
  4377. /* assuming toupper returns codepoint with same size */
  4378. rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
  4379. modify = 1;
  4380. }
  4381. s += n;
  4382. }
  4383. }
  4384. }
  4385. if (modify) return str;
  4386. return Qnil;
  4387. }
  4388. /*
  4389. * call-seq:
  4390. * str.downcase -> new_str
  4391. *
  4392. * Returns a copy of <i>str</i> with all uppercase letters replaced with their
  4393. * lowercase counterparts. The operation is locale insensitive---only
  4394. * characters ``A'' to ``Z'' are affected.
  4395. * Note: case replacement is effective only in ASCII region.
  4396. *
  4397. * "hEllO".downcase #=> "hello"
  4398. */
  4399. static VALUE
  4400. rb_str_downcase(VALUE str)
  4401. {
  4402. str = rb_str_dup(str);
  4403. rb_str_downcase_bang(str);
  4404. return str;
  4405. }
  4406. /*
  4407. * call-seq:
  4408. * str.capitalize! -> str or nil
  4409. *
  4410. * Modifies <i>str</i> by converting the first character to uppercase and the
  4411. * remainder to lowercase. Returns <code>nil</code> if no changes are made.
  4412. * Note: case conversion is effective only in ASCII region.
  4413. *
  4414. * a = "hello"
  4415. * a.capitalize! #=> "Hello"
  4416. * a #=> "Hello"
  4417. * a.capitalize! #=> nil
  4418. */
  4419. static VALUE
  4420. rb_str_capitalize_bang(VALUE str)
  4421. {
  4422. rb_encoding *enc;
  4423. char *s, *send;
  4424. int modify = 0;
  4425. unsigned int c;
  4426. int n;
  4427. str_modify_keep_cr(str);
  4428. enc = STR_ENC_GET(str);
  4429. rb_str_check_dummy_enc(enc);
  4430. if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
  4431. s = RSTRING_PTR(str); send = RSTRING_END(str);
  4432. c = rb_enc_codepoint_len(s, send, &n, enc);
  4433. if (rb_enc_islower(c, enc)) {
  4434. rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
  4435. modify = 1;
  4436. }
  4437. s += n;
  4438. while (s < send) {
  4439. c = rb_enc_codepoint_len(s, send, &n, enc);
  4440. if (rb_enc_isupper(c, enc)) {
  4441. rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
  4442. modify = 1;
  4443. }
  4444. s += n;
  4445. }
  4446. if (modify) return str;
  4447. return Qnil;
  4448. }
  4449. /*
  4450. * call-seq:
  4451. * str.capitalize -> new_str
  4452. *
  4453. * Returns a copy of <i>str</i> with the first character converted to uppercase
  4454. * and the remainder to lowercase.
  4455. * Note: case conversion is effective only in ASCII region.
  4456. *
  4457. * "hello".capitalize #=> "Hello"
  4458. * "HELLO".capitalize #=> "Hello"
  4459. * "123ABC".capitalize #=> "123abc"
  4460. */
  4461. static VALUE
  4462. rb_str_capitalize(VALUE str)
  4463. {
  4464. str = rb_str_dup(str);
  4465. rb_str_capitalize_bang(str);
  4466. return str;
  4467. }
  4468. /*
  4469. * call-seq:
  4470. * str.swapcase! -> str or nil
  4471. *
  4472. * Equivalent to <code>String#swapcase</code>, but modifies the receiver in
  4473. * place, returning <i>str</i>, or <code>nil</code> if no changes were made.
  4474. * Note: case conversion is effective only in ASCII region.
  4475. */
  4476. static VALUE
  4477. rb_str_swapcase_bang(VALUE str)
  4478. {
  4479. rb_encoding *enc;
  4480. char *s, *send;
  4481. int modify = 0;
  4482. int n;
  4483. str_modify_keep_cr(str);
  4484. enc = STR_ENC_GET(str);
  4485. rb_str_check_dummy_enc(enc);
  4486. s = RSTRING_PTR(str); send = RSTRING_END(str);
  4487. while (s < send) {
  4488. unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
  4489. if (rb_enc_isupper(c, enc)) {
  4490. /* assuming toupper returns codepoint with same size */
  4491. rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
  4492. modify = 1;
  4493. }
  4494. else if (rb_enc_islower(c, enc)) {
  4495. /* assuming tolower returns codepoint with same size */
  4496. rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
  4497. modify = 1;
  4498. }
  4499. s += n;
  4500. }
  4501. if (modify) return str;
  4502. return Qnil;
  4503. }
  4504. /*
  4505. * call-seq:
  4506. * str.swapcase -> new_str
  4507. *
  4508. * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
  4509. * to lowercase and lowercase characters converted to uppercase.
  4510. * Note: case conversion is effective only in ASCII region.
  4511. *
  4512. * "Hello".swapcase #=> "hELLO"
  4513. * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
  4514. */
  4515. static VALUE
  4516. rb_str_swapcase(VALUE str)
  4517. {
  4518. str = rb_str_dup(str);
  4519. rb_str_swapcase_bang(str);
  4520. return str;
  4521. }
  4522. typedef unsigned char *USTR;
  4523. struct tr {
  4524. int gen;
  4525. unsigned int now, max;
  4526. char *p, *pend;
  4527. };
  4528. static unsigned int
  4529. trnext(struct tr *t, rb_encoding *enc)
  4530. {
  4531. int n;
  4532. for (;;) {
  4533. if (!t->gen) {
  4534. nextpart:
  4535. if (t->p == t->pend) return -1;
  4536. if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
  4537. t->p += n;
  4538. }
  4539. t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
  4540. t->p += n;
  4541. if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
  4542. t->p += n;
  4543. if (t->p < t->pend) {
  4544. unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
  4545. t->p += n;
  4546. if (t->now > c) {
  4547. if (t->now < 0x80 && c < 0x80) {
  4548. rb_raise(rb_eArgError,
  4549. "invalid range \"%c-%c\" in string transliteration",
  4550. t->now, c);
  4551. }
  4552. else {
  4553. rb_raise(rb_eArgError, "invalid range in string transliteration");
  4554. }
  4555. continue; /* not reached */
  4556. }
  4557. t->gen = 1;
  4558. t->max = c;
  4559. }
  4560. }
  4561. return t->now;
  4562. }
  4563. else {
  4564. while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
  4565. if (t->now == t->max) {
  4566. t->gen = 0;
  4567. goto nextpart;
  4568. }
  4569. }
  4570. if (t->now < t->max) {
  4571. return t->now;
  4572. }
  4573. else {
  4574. t->gen = 0;
  4575. return t->max;
  4576. }
  4577. }
  4578. }
  4579. }
  4580. static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
  4581. static VALUE
  4582. tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
  4583. {
  4584. const unsigned int errc = -1;
  4585. unsigned int trans[256];
  4586. rb_encoding *enc, *e1, *e2;
  4587. struct tr trsrc, trrepl;
  4588. int cflag = 0;
  4589. unsigned int c, c0, last = 0;
  4590. int modify = 0, i, l;
  4591. char *s, *send;
  4592. VALUE hash = 0;
  4593. int singlebyte = single_byte_optimizable(str);
  4594. int cr;
  4595. #define CHECK_IF_ASCII(c) \
  4596. (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
  4597. (cr = ENC_CODERANGE_VALID) : 0)
  4598. StringValue(src);
  4599. StringValue(repl);
  4600. if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
  4601. if (RSTRING_LEN(repl) == 0) {
  4602. return rb_str_delete_bang(1, &src, str);
  4603. }
  4604. cr = ENC_CODERANGE(str);
  4605. e1 = rb_enc_check(str, src);
  4606. e2 = rb_enc_check(str, repl);
  4607. if (e1 == e2) {
  4608. enc = e1;
  4609. }
  4610. else {
  4611. enc = rb_enc_check(src, repl);
  4612. }
  4613. trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
  4614. if (RSTRING_LEN(src) > 1 &&
  4615. rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
  4616. trsrc.p + l < trsrc.pend) {
  4617. cflag = 1;
  4618. trsrc.p += l;
  4619. }
  4620. trrepl.p = RSTRING_PTR(repl);
  4621. trrepl.pend = trrepl.p + RSTRING_LEN(repl);
  4622. trsrc.gen = trrepl.gen = 0;
  4623. trsrc.now = trrepl.now = 0;
  4624. trsrc.max = trrepl.max = 0;
  4625. if (cflag) {
  4626. for (i=0; i<256; i++) {
  4627. trans[i] = 1;
  4628. }
  4629. while ((c = trnext(&trsrc, enc)) != errc) {
  4630. if (c < 256) {
  4631. trans[c] = errc;
  4632. }
  4633. else {
  4634. if (!hash) hash = rb_hash_new();
  4635. rb_hash_aset(hash, UINT2NUM(c), Qtrue);
  4636. }
  4637. }
  4638. while ((c = trnext(&trrepl, enc)) != errc)
  4639. /* retrieve last replacer */;
  4640. last = trrepl.now;
  4641. for (i=0; i<256; i++) {
  4642. if (trans[i] != errc) {
  4643. trans[i] = last;
  4644. }
  4645. }
  4646. }
  4647. else {
  4648. unsigned int r;
  4649. for (i=0; i<256; i++) {
  4650. trans[i] = errc;
  4651. }
  4652. while ((c = trnext(&trsrc, enc)) != errc) {
  4653. r = trnext(&trrepl, enc);
  4654. if (r == errc) r = trrepl.now;
  4655. if (c < 256) {
  4656. trans[c] = r;
  4657. if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
  4658. }
  4659. else {
  4660. if (!hash) hash = rb_hash_new();
  4661. rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
  4662. }
  4663. }
  4664. }
  4665. if (cr == ENC_CODERANGE_VALID)
  4666. cr = ENC_CODERANGE_7BIT;
  4667. str_modify_keep_cr(str);
  4668. s = RSTRING_PTR(str); send = RSTRING_END(str);
  4669. if (sflag) {
  4670. int clen, tlen;
  4671. long offset, max = RSTRING_LEN(str);
  4672. unsigned int save = -1;
  4673. char *buf = ALLOC_N(char, max), *t = buf;
  4674. while (s < send) {
  4675. int may_modify = 0;
  4676. c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
  4677. tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
  4678. s += clen;
  4679. if (c < 256) {
  4680. c = trans[c];
  4681. }
  4682. else if (hash) {
  4683. VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
  4684. if (NIL_P(tmp)) {
  4685. if (cflag) c = last;
  4686. else c = errc;
  4687. }
  4688. else if (cflag) c = errc;
  4689. else c = NUM2INT(tmp);
  4690. }
  4691. else {
  4692. c = errc;
  4693. }
  4694. if (c != (unsigned int)-1) {
  4695. if (save == c) {
  4696. CHECK_IF_ASCII(c);
  4697. continue;
  4698. }
  4699. save = c;
  4700. tlen = rb_enc_codelen(c, enc);
  4701. modify = 1;
  4702. }
  4703. else {
  4704. save = -1;
  4705. c = c0;
  4706. if (enc != e1) may_modify = 1;
  4707. }
  4708. while (t - buf + tlen >= max) {
  4709. offset = t - buf;
  4710. max *= 2;
  4711. REALLOC_N(buf, char, max);
  4712. t = buf + offset;
  4713. }
  4714. rb_enc_mbcput(c, t, enc);
  4715. if (may_modify && memcmp(s, t, tlen) != 0) {
  4716. modify = 1;
  4717. }
  4718. CHECK_IF_ASCII(c);
  4719. t += tlen;
  4720. }
  4721. if (!STR_EMBED_P(str)) {
  4722. xfree(RSTRING(str)->as.heap.ptr);
  4723. }
  4724. *t = '\0';
  4725. RSTRING(str)->as.heap.ptr = buf;
  4726. RSTRING(str)->as.heap.len = t - buf;
  4727. STR_SET_NOEMBED(str);
  4728. RSTRING(str)->as.heap.aux.capa = max;
  4729. }
  4730. else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
  4731. while (s < send) {
  4732. c = (unsigned char)*s;
  4733. if (trans[c] != errc) {
  4734. if (!cflag) {
  4735. c = trans[c];
  4736. *s = c;
  4737. modify = 1;
  4738. }
  4739. else {
  4740. *s = last;
  4741. modify = 1;
  4742. }
  4743. }
  4744. CHECK_IF_ASCII(c);
  4745. s++;
  4746. }
  4747. }
  4748. else {
  4749. int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
  4750. long offset;
  4751. char *buf = ALLOC_N(char, max), *t = buf;
  4752. while (s < send) {
  4753. int may_modify = 0;
  4754. c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
  4755. tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
  4756. if (c < 256) {
  4757. c = trans[c];
  4758. }
  4759. else if (hash) {
  4760. VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
  4761. if (NIL_P(tmp)) {
  4762. if (cflag) c = last;
  4763. else c = errc;
  4764. }
  4765. else if (cflag) c = errc;
  4766. else c = NUM2INT(tmp);
  4767. }
  4768. else {
  4769. c = cflag ? last : errc;
  4770. }
  4771. if (c != errc) {
  4772. tlen = rb_enc_codelen(c, enc);
  4773. modify = 1;
  4774. }
  4775. else {
  4776. c = c0;
  4777. if (enc != e1) may_modify = 1;
  4778. }
  4779. while (t - buf + tlen >= max) {
  4780. offset = t - buf;
  4781. max *= 2;
  4782. REALLOC_N(buf, char, max);
  4783. t = buf + offset;
  4784. }
  4785. if (s != t) {
  4786. rb_enc_mbcput(c, t, enc);
  4787. if (may_modify && memcmp(s, t, tlen) != 0) {
  4788. modify = 1;
  4789. }
  4790. }
  4791. CHECK_IF_ASCII(c);
  4792. s += clen;
  4793. t += tlen;
  4794. }
  4795. if (!STR_EMBED_P(str)) {
  4796. xfree(RSTRING(str)->as.heap.ptr);
  4797. }
  4798. *t = '\0';
  4799. RSTRING(str)->as.heap.ptr = buf;
  4800. RSTRING(str)->as.heap.len = t - buf;
  4801. STR_SET_NOEMBED(str);
  4802. RSTRING(str)->as.heap.aux.capa = max;
  4803. }
  4804. if (modify) {
  4805. if (cr != ENC_CODERANGE_BROKEN)
  4806. ENC_CODERANGE_SET(str, cr);
  4807. rb_enc_associate(str, enc);
  4808. return str;
  4809. }
  4810. return Qnil;
  4811. }
  4812. /*
  4813. * call-seq:
  4814. * str.tr!(from_str, to_str) -> str or nil
  4815. *
  4816. * Translates <i>str</i> in place, using the same rules as
  4817. * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
  4818. * changes were made.
  4819. */
  4820. static VALUE
  4821. rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
  4822. {
  4823. return tr_trans(str, src, repl, 0);
  4824. }
  4825. /*
  4826. * call-seq:
  4827. * str.tr(from_str, to_str) -> new_str
  4828. *
  4829. * Returns a copy of +str+ with the characters in +from_str+ replaced by the
  4830. * corresponding characters in +to_str+. If +to_str+ is shorter than
  4831. * +from_str+, it is padded with its last character in order to maintain the
  4832. * correspondence.
  4833. *
  4834. * "hello".tr('el', 'ip') #=> "hippo"
  4835. * "hello".tr('aeiou', '*') #=> "h*ll*"
  4836. * "hello".tr('aeiou', 'AA*') #=> "hAll*"
  4837. *
  4838. * Both strings may use the <code>c1-c2</code> notation to denote ranges of
  4839. * characters, and +from_str+ may start with a <code>^</code>, which denotes
  4840. * all characters except those listed.
  4841. *
  4842. * "hello".tr('a-y', 'b-z') #=> "ifmmp"
  4843. * "hello".tr('^aeiou', '*') #=> "*e**o"
  4844. *
  4845. * The backslash character <code>\</code> can be used to escape
  4846. * <code>^</code> or <code>-</code> and is otherwise ignored unless it
  4847. * appears at the end of a range or the end of the +from_str+ or +to_str+:
  4848. *
  4849. * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
  4850. * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
  4851. *
  4852. * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
  4853. * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
  4854. * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
  4855. *
  4856. * "X['\\b']".tr("X\\", "") #=> "['b']"
  4857. * "X['\\b']".tr("X-\\]", "") #=> "'b'"
  4858. */
  4859. static VALUE
  4860. rb_str_tr(VALUE str, VALUE src, VALUE repl)
  4861. {
  4862. str = rb_str_dup(str);
  4863. tr_trans(str, src, repl, 0);
  4864. return str;
  4865. }
  4866. #define TR_TABLE_SIZE 257
  4867. static void
  4868. tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
  4869. VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
  4870. {
  4871. const unsigned int errc = -1;
  4872. char buf[256];
  4873. struct tr tr;
  4874. unsigned int c;
  4875. VALUE table = 0, ptable = 0;
  4876. int i, l, cflag = 0;
  4877. tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
  4878. tr.gen = tr.now = tr.max = 0;
  4879. if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
  4880. cflag = 1;
  4881. tr.p += l;
  4882. }
  4883. if (first) {
  4884. for (i=0; i<256; i++) {
  4885. stable[i] = 1;
  4886. }
  4887. stable[256] = cflag;
  4888. }
  4889. else if (stable[256] && !cflag) {
  4890. stable[256] = 0;
  4891. }
  4892. for (i=0; i<256; i++) {
  4893. buf[i] = cflag;
  4894. }
  4895. while ((c = trnext(&tr, enc)) != errc) {
  4896. if (c < 256) {
  4897. buf[c & 0xff] = !cflag;
  4898. }
  4899. else {
  4900. VALUE key = UINT2NUM(c);
  4901. if (!table && (first || *tablep || stable[256])) {
  4902. if (cflag) {
  4903. ptable = *ctablep;
  4904. table = ptable ? ptable : rb_hash_new();
  4905. *ctablep = table;
  4906. }
  4907. else {
  4908. table = rb_hash_new();
  4909. ptable = *tablep;
  4910. *tablep = table;
  4911. }
  4912. }
  4913. if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
  4914. rb_hash_aset(table, key, Qtrue);
  4915. }
  4916. }
  4917. }
  4918. for (i=0; i<256; i++) {
  4919. stable[i] = stable[i] && buf[i];
  4920. }
  4921. if (!table && !cflag) {
  4922. *tablep = 0;
  4923. }
  4924. }
  4925. static int
  4926. tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
  4927. {
  4928. if (c < 256) {
  4929. return table[c] != 0;
  4930. }
  4931. else {
  4932. VALUE v = UINT2NUM(c);
  4933. if (del) {
  4934. if (!NIL_P(rb_hash_lookup(del, v)) &&
  4935. (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
  4936. return TRUE;
  4937. }
  4938. }
  4939. else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
  4940. return FALSE;
  4941. }
  4942. return table[256] ? TRUE : FALSE;
  4943. }
  4944. }
  4945. /*
  4946. * call-seq:
  4947. * str.delete!([other_str]+) -> str or nil
  4948. *
  4949. * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
  4950. * <code>nil</code> if <i>str</i> was not modified.
  4951. */
  4952. static VALUE
  4953. rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
  4954. {
  4955. char squeez[TR_TABLE_SIZE];
  4956. rb_encoding *enc = 0;
  4957. char *s, *send, *t;
  4958. VALUE del = 0, nodel = 0;
  4959. int modify = 0;
  4960. int i, ascompat, cr;
  4961. if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
  4962. rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
  4963. for (i=0; i<argc; i++) {
  4964. VALUE s = argv[i];
  4965. StringValue(s);
  4966. enc = rb_enc_check(str, s);
  4967. tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
  4968. }
  4969. str_modify_keep_cr(str);
  4970. ascompat = rb_enc_asciicompat(enc);
  4971. s = t = RSTRING_PTR(str);
  4972. send = RSTRING_END(str);
  4973. cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
  4974. while (s < send) {
  4975. unsigned int c;
  4976. int clen;
  4977. if (ascompat && (c = *(unsigned char*)s) < 0x80) {
  4978. if (squeez[c]) {
  4979. modify = 1;
  4980. }
  4981. else {
  4982. if (t != s) *t = c;
  4983. t++;
  4984. }
  4985. s++;
  4986. }
  4987. else {
  4988. c = rb_enc_codepoint_len(s, send, &clen, enc);
  4989. if (tr_find(c, squeez, del, nodel)) {
  4990. modify = 1;
  4991. }
  4992. else {
  4993. if (t != s) rb_enc_mbcput(c, t, enc);
  4994. t += clen;
  4995. if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
  4996. }
  4997. s += clen;
  4998. }
  4999. }
  5000. *t = '\0';
  5001. STR_SET_LEN(str, t - RSTRING_PTR(str));
  5002. ENC_CODERANGE_SET(str, cr);
  5003. if (modify) return str;
  5004. return Qnil;
  5005. }
  5006. /*
  5007. * call-seq:
  5008. * str.delete([other_str]+) -> new_str
  5009. *
  5010. * Returns a copy of <i>str</i> with all characters in the intersection of its
  5011. * arguments deleted. Uses the same rules for building the set of characters as
  5012. * <code>String#count</code>.
  5013. *
  5014. * "hello".delete "l","lo" #=> "heo"
  5015. * "hello".delete "lo" #=> "he"
  5016. * "hello".delete "aeiou", "^e" #=> "hell"
  5017. * "hello".delete "ej-m" #=> "ho"
  5018. */
  5019. static VALUE
  5020. rb_str_delete(int argc, VALUE *argv, VALUE str)
  5021. {
  5022. str = rb_str_dup(str);
  5023. rb_str_delete_bang(argc, argv, str);
  5024. return str;
  5025. }
  5026. /*
  5027. * call-seq:
  5028. * str.squeeze!([other_str]*) -> str or nil
  5029. *
  5030. * Squeezes <i>str</i> in place, returning either <i>str</i>, or
  5031. * <code>nil</code> if no changes were made.
  5032. */
  5033. static VALUE
  5034. rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
  5035. {
  5036. char squeez[TR_TABLE_SIZE];
  5037. rb_encoding *enc = 0;
  5038. VALUE del = 0, nodel = 0;
  5039. char *s, *send, *t;
  5040. int i, modify = 0;
  5041. int ascompat, singlebyte = single_byte_optimizable(str);
  5042. unsigned int save;
  5043. if (argc == 0) {
  5044. enc = STR_ENC_GET(str);
  5045. }
  5046. else {
  5047. for (i=0; i<argc; i++) {
  5048. VALUE s = argv[i];
  5049. StringValue(s);
  5050. enc = rb_enc_check(str, s);
  5051. if (singlebyte && !single_byte_optimizable(s))
  5052. singlebyte = 0;
  5053. tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
  5054. }
  5055. }
  5056. str_modify_keep_cr(str);
  5057. s = t = RSTRING_PTR(str);
  5058. if (!s || RSTRING_LEN(str) == 0) return Qnil;
  5059. send = RSTRING_END(str);
  5060. save = -1;
  5061. ascompat = rb_enc_asciicompat(enc);
  5062. if (singlebyte) {
  5063. while (s < send) {
  5064. unsigned int c = *(unsigned char*)s++;
  5065. if (c != save || (argc > 0 && !squeez[c])) {
  5066. *t++ = save = c;
  5067. }
  5068. }
  5069. } else {
  5070. while (s < send) {
  5071. unsigned int c;
  5072. int clen;
  5073. if (ascompat && (c = *(unsigned char*)s) < 0x80) {
  5074. if (c != save || (argc > 0 && !squeez[c])) {
  5075. *t++ = save = c;
  5076. }
  5077. s++;
  5078. }
  5079. else {
  5080. c = rb_enc_codepoint_len(s, send, &clen, enc);
  5081. if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
  5082. if (t != s) rb_enc_mbcput(c, t, enc);
  5083. save = c;
  5084. t += clen;
  5085. }
  5086. s += clen;
  5087. }
  5088. }
  5089. }
  5090. *t = '\0';
  5091. if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
  5092. STR_SET_LEN(str, t - RSTRING_PTR(str));
  5093. modify = 1;
  5094. }
  5095. if (modify) return str;
  5096. return Qnil;
  5097. }
  5098. /*
  5099. * call-seq:
  5100. * str.squeeze([other_str]*) -> new_str
  5101. *
  5102. * Builds a set of characters from the <i>other_str</i> parameter(s) using the
  5103. * procedure described for <code>String#count</code>. Returns a new string
  5104. * where runs of the same character that occur in this set are replaced by a
  5105. * single character. If no arguments are given, all runs of identical
  5106. * characters are replaced by a single character.
  5107. *
  5108. * "yellow moon".squeeze #=> "yelow mon"
  5109. * "  now   is  the".squeeze(" ") #=> " now is the"
  5110. * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
  5111. */
  5112. static VALUE
  5113. rb_str_squeeze(int argc, VALUE *argv, VALUE str)
  5114. {
  5115. str = rb_str_dup(str);
  5116. rb_str_squeeze_bang(argc, argv, str);
  5117. return str;
  5118. }
  5119. /*
  5120. * call-seq:
  5121. * str.tr_s!(from_str, to_str) -> str or nil
  5122. *
  5123. * Performs <code>String#tr_s</code> processing on <i>str</i> in place,
  5124. * returning <i>str</i>, or <code>nil</code> if no changes were made.
  5125. */
  5126. static VALUE
  5127. rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
  5128. {
  5129. return tr_trans(str, src, repl, 1);
  5130. }
  5131. /*
  5132. * call-seq:
  5133. * str.tr_s(from_str, to_str) -> new_str
  5134. *
  5135. * Processes a copy of <i>str</i> as described under <code>String#tr</code>,
  5136. * then removes duplicate characters in regions that were affected by the
  5137. * translation.
  5138. *
  5139. * "hello".tr_s('l', 'r') #=> "hero"
  5140. * "hello".tr_s('el', '*') #=> "h*o"
  5141. * "hello".tr_s('el', 'hx') #=> "hhxo"
  5142. */
  5143. static VALUE
  5144. rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
  5145. {
  5146. str = rb_str_dup(str);
  5147. tr_trans(str, src, repl, 1);
  5148. return str;
  5149. }
  5150. /*
  5151. * call-seq:
  5152. * str.count([other_str]+) -> fixnum
  5153. *
  5154. * Each +other_str+ parameter defines a set of characters to count. The
  5155. * intersection of these sets defines the characters to count in +str+. Any
  5156. * +other_str+ that starts with a caret <code>^</code> is negated. The
  5157. * sequence <code>c1-c2</code> means all characters between c1 and c2. The
  5158. * backslash character <code>\</code> can be used to escape <code>^</code> or
  5159. * <code>-</code> and is otherwise ignored unless it appears at the end of a
  5160. * sequence or the end of a +other_str+.
  5161. *
  5162. * a = "hello world"
  5163. * a.count "lo" #=> 5
  5164. * a.count "lo", "o" #=> 2
  5165. * a.count "hello", "^l" #=> 4
  5166. * a.count "ej-m" #=> 4
  5167. *
  5168. * "hello^world".count "\\^aeiou" #=> 4
  5169. * "hello-world".count "a\\-eo" #=> 4
  5170. *
  5171. * c = "hello world\\r\\n"
  5172. * c.count "\\" #=> 2
  5173. * c.count "\\A" #=> 0
  5174. * c.count "X-\\w" #=> 3
  5175. */
  5176. static VALUE
  5177. rb_str_count(int argc, VALUE *argv, VALUE str)
  5178. {
  5179. char table[TR_TABLE_SIZE];
  5180. rb_encoding *enc = 0;
  5181. VALUE del = 0, nodel = 0;
  5182. char *s, *send;
  5183. int i;
  5184. int ascompat;
  5185. rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
  5186. for (i=0; i<argc; i++) {
  5187. VALUE tstr = argv[i];
  5188. unsigned char c;
  5189. StringValue(tstr);
  5190. enc = rb_enc_check(str, tstr);
  5191. if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
  5192. (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
  5193. int n = 0;
  5194. s = RSTRING_PTR(str);
  5195. if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
  5196. send = RSTRING_END(str);
  5197. while (s < send) {
  5198. if (*(unsigned char*)s++ == c) n++;
  5199. }
  5200. return INT2NUM(n);
  5201. }
  5202. tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
  5203. }
  5204. s = RSTRING_PTR(str);
  5205. if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
  5206. send = RSTRING_END(str);
  5207. ascompat = rb_enc_asciicompat(enc);
  5208. i = 0;
  5209. while (s < send) {
  5210. unsigned int c;
  5211. if (ascompat && (c = *(unsigned char*)s) < 0x80) {
  5212. if (table[c]) {
  5213. i++;
  5214. }
  5215. s++;
  5216. }
  5217. else {
  5218. int clen;
  5219. c = rb_enc_codepoint_len(s, send, &clen, enc);
  5220. if (tr_find(c, table, del, nodel)) {
  5221. i++;
  5222. }
  5223. s += clen;
  5224. }
  5225. }
  5226. return INT2NUM(i);
  5227. }
  5228. static const char isspacetable[256] = {
  5229. 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
  5230. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5231. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5232. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5233. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5234. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5235. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5236. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5237. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5238. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5239. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5240. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5241. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5242. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5243. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  5244. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  5245. };
  5246. #define ascii_isspace(c) isspacetable[(unsigned char)(c)]
  5247. /*
  5248. * call-seq:
  5249. * str.split(pattern=$;, [limit]) -> anArray
  5250. *
  5251. * Divides <i>str</i> into substrings based on a delimiter, returning an array
  5252. * of these substrings.
  5253. *
  5254. * If <i>pattern</i> is a <code>String</code>, then its contents are used as
  5255. * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
  5256. * space, <i>str</i> is split on whitespace, with leading whitespace and runs
  5257. * of contiguous whitespace characters ignored.
  5258. *
  5259. * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
  5260. * pattern matches. Whenever the pattern matches a zero-length string,
  5261. * <i>str</i> is split into individual characters. If <i>pattern</i> contains
  5262. * groups, the respective matches will be returned in the array as well.
  5263. *
  5264. * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If
  5265. * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
  5266. * split on whitespace as if ` ' were specified.
  5267. *
  5268. * If the <i>limit</i> parameter is omitted, trailing null fields are
  5269. * suppressed. If <i>limit</i> is a positive number, at most that number of
  5270. * fields will be returned (if <i>limit</i> is <code>1</code>, the entire
  5271. * string is returned as the only entry in an array). If negative, there is no
  5272. * limit to the number of fields returned, and trailing null fields are not
  5273. * suppressed.
  5274. *
  5275. * When the input +str+ is empty an empty Array is returned as the string is
  5276. * considered to have no fields to split.
  5277. *
  5278. * " now's  the time".split #=> ["now's", "the", "time"]
  5279. * " now's  the time".split(' ') #=> ["now's", "the", "time"]
  5280. * " now's  the time".split(/ /) #=> ["", "now's", "", "the", "time"]
  5281. * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
  5282. * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
  5283. * "hello".split(//, 3) #=> ["h", "e", "llo"]
  5284. * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
  5285. *
  5286. * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
  5287. * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
  5288. * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
  5289. * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
  5290. *
  5291. * "".split(',', -1) #=> []
  5292. */
  5293. static VALUE
  5294. rb_str_split_m(int argc, VALUE *argv, VALUE str)
  5295. {
  5296. rb_encoding *enc;
  5297. VALUE spat;
  5298. VALUE limit;
  5299. enum {awk, string, regexp} split_type;
  5300. long beg, end, i = 0;
  5301. int lim = 0;
  5302. VALUE result, tmp;
  5303. if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
  5304. lim = NUM2INT(limit);
  5305. if (lim <= 0) limit = Qnil;
  5306. else if (lim == 1) {
  5307. if (RSTRING_LEN(str) == 0)
  5308. return rb_ary_new2(0);
  5309. return rb_ary_new3(1, str);
  5310. }
  5311. i = 1;
  5312. }
  5313. enc = STR_ENC_GET(str);
  5314. if (NIL_P(spat)) {
  5315. if (!NIL_P(rb_fs)) {
  5316. spat = rb_fs;
  5317. goto fs_set;
  5318. }
  5319. split_type = awk;
  5320. }
  5321. else {
  5322. fs_set:
  5323. if (RB_TYPE_P(spat, T_STRING)) {
  5324. rb_encoding *enc2 = STR_ENC_GET(spat);
  5325. split_type = string;
  5326. if (RSTRING_LEN(spat) == 0) {
  5327. /* Special case - split into chars */
  5328. spat = rb_reg_regcomp(spat);
  5329. split_type = regexp;
  5330. }
  5331. else if (rb_enc_asciicompat(enc2) == 1) {
  5332. if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
  5333. split_type = awk;
  5334. }
  5335. }
  5336. else {
  5337. int l;
  5338. if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
  5339. RSTRING_LEN(spat) == l) {
  5340. split_type = awk;
  5341. }
  5342. }
  5343. }
  5344. else {
  5345. spat = get_pat(spat, 1);
  5346. split_type = regexp;
  5347. }
  5348. }
  5349. result = rb_ary_new();
  5350. beg = 0;
  5351. if (split_type == awk) {
  5352. char *ptr = RSTRING_PTR(str);
  5353. char *eptr = RSTRING_END(str);
  5354. char *bptr = ptr;
  5355. int skip = 1;
  5356. unsigned int c;
  5357. end = beg;
  5358. if (is_ascii_string(str)) {
  5359. while (ptr < eptr) {
  5360. c = (unsigned char)*ptr++;
  5361. if (skip) {
  5362. if (ascii_isspace(c)) {
  5363. beg = ptr - bptr;
  5364. }
  5365. else {
  5366. end = ptr - bptr;
  5367. skip = 0;
  5368. if (!NIL_P(limit) && lim <= i) break;
  5369. }
  5370. }
  5371. else if (ascii_isspace(c)) {
  5372. rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
  5373. skip = 1;
  5374. beg = ptr - bptr;
  5375. if (!NIL_P(limit)) ++i;
  5376. }
  5377. else {
  5378. end = ptr - bptr;
  5379. }
  5380. }
  5381. }
  5382. else {
  5383. while (ptr < eptr) {
  5384. int n;
  5385. c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
  5386. ptr += n;
  5387. if (skip) {
  5388. if (rb_isspace(c)) {
  5389. beg = ptr - bptr;
  5390. }
  5391. else {
  5392. end = ptr - bptr;
  5393. skip = 0;
  5394. if (!NIL_P(limit) && lim <= i) break;
  5395. }
  5396. }
  5397. else if (rb_isspace(c)) {
  5398. rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
  5399. skip = 1;
  5400. beg = ptr - bptr;
  5401. if (!NIL_P(limit)) ++i;
  5402. }
  5403. else {
  5404. end = ptr - bptr;
  5405. }
  5406. }
  5407. }
  5408. }
  5409. else if (split_type == string) {
  5410. char *ptr = RSTRING_PTR(str);
  5411. char *temp = ptr;
  5412. char *eptr = RSTRING_END(str);
  5413. char *sptr = RSTRING_PTR(spat);
  5414. long slen = RSTRING_LEN(spat);
  5415. if (is_broken_string(str)) {
  5416. rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
  5417. }
  5418. if (is_broken_string(spat)) {
  5419. rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
  5420. }
  5421. enc = rb_enc_check(str, spat);
  5422. while (ptr < eptr &&
  5423. (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
  5424. /* Check we are at the start of a char */
  5425. char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
  5426. if (t != ptr + end) {
  5427. ptr = t;
  5428. continue;
  5429. }
  5430. rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
  5431. ptr += end + slen;
  5432. if (!NIL_P(limit) && lim <= ++i) break;
  5433. }
  5434. beg = ptr - temp;
  5435. }
  5436. else {
  5437. char *ptr = RSTRING_PTR(str);
  5438. long len = RSTRING_LEN(str);
  5439. long start = beg;
  5440. long idx;
  5441. int last_null = 0;
  5442. struct re_registers *regs;
  5443. while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
  5444. regs = RMATCH_REGS(rb_backref_get());
  5445. if (start == end && BEG(0) == END(0)) {
  5446. if (!ptr) {
  5447. rb_ary_push(result, str_new_empty(str));
  5448. break;
  5449. }
  5450. else if (last_null == 1) {
  5451. rb_ary_push(result, rb_str_subseq(str, beg,
  5452. rb_enc_fast_mbclen(ptr+beg,
  5453. ptr+len,
  5454. enc)));
  5455. beg = start;
  5456. }
  5457. else {
  5458. if (ptr+start == ptr+len)
  5459. start++;
  5460. else
  5461. start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
  5462. last_null = 1;
  5463. continue;
  5464. }
  5465. }
  5466. else {
  5467. rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
  5468. beg = start = END(0);
  5469. }
  5470. last_null = 0;
  5471. for (idx=1; idx < regs->num_regs; idx++) {
  5472. if (BEG(idx) == -1) continue;
  5473. if (BEG(idx) == END(idx))
  5474. tmp = str_new_empty(str);
  5475. else
  5476. tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
  5477. rb_ary_push(result, tmp);
  5478. }
  5479. if (!NIL_P(limit) && lim <= ++i) break;
  5480. }
  5481. }
  5482. if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
  5483. if (RSTRING_LEN(str) == beg)
  5484. tmp = str_new_empty(str);
  5485. else
  5486. tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
  5487. rb_ary_push(result, tmp);
  5488. }
  5489. if (NIL_P(limit) && lim == 0) {
  5490. long len;
  5491. while ((len = RARRAY_LEN(result)) > 0 &&
  5492. (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
  5493. rb_ary_pop(result);
  5494. }
  5495. return result;
  5496. }
  5497. VALUE
  5498. rb_str_split(VALUE str, const char *sep0)
  5499. {
  5500. VALUE sep;
  5501. StringValue(str);
  5502. sep = rb_str_new2(sep0);
  5503. return rb_str_split_m(1, &sep, str);
  5504. }
  5505. static VALUE
  5506. rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
  5507. {
  5508. rb_encoding *enc;
  5509. VALUE rs;
  5510. unsigned int newline;
  5511. const char *p, *pend, *s, *ptr;
  5512. long len, rslen;
  5513. VALUE line;
  5514. int n;
  5515. VALUE orig = str;
  5516. VALUE UNINITIALIZED_VAR(ary);
  5517. if (argc == 0) {
  5518. rs = rb_rs;
  5519. }
  5520. else {
  5521. rb_scan_args(argc, argv, "01", &rs);
  5522. }
  5523. if (rb_block_given_p()) {
  5524. if (wantarray) {
  5525. #if 0 /* next major */
  5526. rb_warn("given block not used");
  5527. ary = rb_ary_new();
  5528. #else
  5529. rb_warning("passing a block to String#lines is deprecated");
  5530. wantarray = 0;
  5531. #endif
  5532. }
  5533. }
  5534. else {
  5535. if (wantarray)
  5536. ary = rb_ary_new();
  5537. else
  5538. RETURN_ENUMERATOR(str, argc, argv);
  5539. }
  5540. if (NIL_P(rs)) {
  5541. if (wantarray) {
  5542. rb_ary_push(ary, str);
  5543. return ary;
  5544. }
  5545. else {
  5546. rb_yield(str);
  5547. return orig;
  5548. }
  5549. }
  5550. str = rb_str_new4(str);
  5551. ptr = p = s = RSTRING_PTR(str);
  5552. pend = p + RSTRING_LEN(str);
  5553. len = RSTRING_LEN(str);
  5554. StringValue(rs);
  5555. if (rs == rb_default_rs) {
  5556. enc = rb_enc_get(str);
  5557. while (p < pend) {
  5558. char *p0;
  5559. p = memchr(p, '\n', pend - p);
  5560. if (!p) break;
  5561. p0 = rb_enc_left_char_head(s, p, pend, enc);
  5562. if (!rb_enc_is_newline(p0, pend, enc)) {
  5563. p++;
  5564. continue;
  5565. }
  5566. p = p0 + rb_enc_mbclen(p0, pend, enc);
  5567. line = rb_str_subseq(str, s - ptr, p - s);
  5568. if (wantarray)
  5569. rb_ary_push(ary, line);
  5570. else
  5571. rb_yield(line);
  5572. str_mod_check(str, ptr, len);
  5573. s = p;
  5574. }
  5575. goto finish;
  5576. }
  5577. enc = rb_enc_check(str, rs);
  5578. rslen = RSTRING_LEN(rs);
  5579. if (rslen == 0) {
  5580. newline = '\n';
  5581. }
  5582. else {
  5583. newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
  5584. }
  5585. while (p < pend) {
  5586. unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
  5587. again:
  5588. if (rslen == 0 && c == newline) {
  5589. p += n;
  5590. if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
  5591. goto again;
  5592. }
  5593. while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
  5594. p += n;
  5595. }
  5596. p -= n;
  5597. }
  5598. if (c == newline &&
  5599. (rslen <= 1 ||
  5600. (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
  5601. const char *pp = p + (rslen ? rslen : n);
  5602. line = rb_str_subseq(str, s - ptr, pp - s);
  5603. if (wantarray)
  5604. rb_ary_push(ary, line);
  5605. else
  5606. rb_yield(line);
  5607. str_mod_check(str, ptr, len);
  5608. s = pp;
  5609. }
  5610. p += n;
  5611. }
  5612. finish:
  5613. if (s != pend) {
  5614. line = rb_str_subseq(str, s - ptr, pend - s);
  5615. if (wantarray)
  5616. rb_ary_push(ary, line);
  5617. else
  5618. rb_yield(line);
  5619. RB_GC_GUARD(str);
  5620. }
  5621. if (wantarray)
  5622. return ary;
  5623. else
  5624. return orig;
  5625. }
  5626. /*
  5627. * call-seq:
  5628. * str.each_line(separator=$/) {|substr| block } -> str
  5629. * str.each_line(separator=$/) -> an_enumerator
  5630. *
  5631. * Splits <i>str</i> using the supplied parameter as the record
  5632. * separator (<code>$/</code> by default), passing each substring in
  5633. * turn to the supplied block. If a zero-length record separator is
  5634. * supplied, the string is split into paragraphs delimited by
  5635. * multiple successive newlines.
  5636. *
  5637. * If no block is given, an enumerator is returned instead.
  5638. *
  5639. * print "Example one\n"
  5640. * "hello\nworld".each_line {|s| p s}
  5641. * print "Example two\n"
  5642. * "hello\nworld".each_line('l') {|s| p s}
  5643. * print "Example three\n"
  5644. * "hello\n\n\nworld".each_line('') {|s| p s}
  5645. *
  5646. * <em>produces:</em>
  5647. *
  5648. * Example one
  5649. * "hello\n"
  5650. * "world"
  5651. * Example two
  5652. * "hel"
  5653. * "l"
  5654. * "o\nworl"
  5655. * "d"
  5656. * Example three
  5657. * "hello\n\n\n"
  5658. * "world"
  5659. */
  5660. static VALUE
  5661. rb_str_each_line(int argc, VALUE *argv, VALUE str)
  5662. {
  5663. return rb_str_enumerate_lines(argc, argv, str, 0);
  5664. }
  5665. /*
  5666. * call-seq:
  5667. * str.lines(separator=$/) -> an_array
  5668. *
  5669. * Returns an array of lines in <i>str</i> split using the supplied
  5670. * record separator (<code>$/</code> by default). This is a
  5671. * shorthand for <code>str.each_line(separator).to_a</code>.
  5672. *
  5673. * If a block is given, which is a deprecated form, works the same as
  5674. * <code>each_line</code>.
  5675. */
  5676. static VALUE
  5677. rb_str_lines(int argc, VALUE *argv, VALUE str)
  5678. {
  5679. return rb_str_enumerate_lines(argc, argv, str, 1);
  5680. }
  5681. static VALUE
  5682. rb_str_each_byte_size(VALUE str, VALUE args)
  5683. {
  5684. return LONG2FIX(RSTRING_LEN(str));
  5685. }
  5686. static VALUE
  5687. rb_str_enumerate_bytes(VALUE str, int wantarray)
  5688. {
  5689. long i;
  5690. VALUE UNINITIALIZED_VAR(ary);
  5691. if (rb_block_given_p()) {
  5692. if (wantarray) {
  5693. #if 0 /* next major */
  5694. rb_warn("given block not used");
  5695. ary = rb_ary_new();
  5696. #else
  5697. rb_warning("passing a block to String#bytes is deprecated");
  5698. wantarray = 0;
  5699. #endif
  5700. }
  5701. }
  5702. else {
  5703. if (wantarray)
  5704. ary = rb_ary_new2(RSTRING_LEN(str));
  5705. else
  5706. RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
  5707. }
  5708. for (i=0; i<RSTRING_LEN(str); i++) {
  5709. if (wantarray)
  5710. rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
  5711. else
  5712. rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
  5713. }
  5714. if (wantarray)
  5715. return ary;
  5716. else
  5717. return str;
  5718. }
  5719. /*
  5720. * call-seq:
  5721. * str.each_byte {|fixnum| block } -> str
  5722. * str.each_byte -> an_enumerator
  5723. *
  5724. * Passes each byte in <i>str</i> to the given block, or returns an
  5725. * enumerator if no block is given.
  5726. *
  5727. * "hello".each_byte {|c| print c, ' ' }
  5728. *
  5729. * <em>produces:</em>
  5730. *
  5731. * 104 101 108 108 111
  5732. */
  5733. static VALUE
  5734. rb_str_each_byte(VALUE str)
  5735. {
  5736. return rb_str_enumerate_bytes(str, 0);
  5737. }
  5738. /*
  5739. * call-seq:
  5740. * str.bytes -> an_array
  5741. *
  5742. * Returns an array of bytes in <i>str</i>. This is a shorthand for
  5743. * <code>str.each_byte.to_a</code>.
  5744. *
  5745. * If a block is given, which is a deprecated form, works the same as
  5746. * <code>each_byte</code>.
  5747. */
  5748. static VALUE
  5749. rb_str_bytes(VALUE str)
  5750. {
  5751. return rb_str_enumerate_bytes(str, 1);
  5752. }
  5753. static VALUE
  5754. rb_str_each_char_size(VALUE str)
  5755. {
  5756. long len = RSTRING_LEN(str);
  5757. if (!single_byte_optimizable(str)) {
  5758. const char *ptr = RSTRING_PTR(str);
  5759. rb_encoding *enc = rb_enc_get(str);
  5760. const char *end_ptr = ptr + len;
  5761. for (len = 0; ptr < end_ptr; ++len) {
  5762. ptr += rb_enc_mbclen(ptr, end_ptr, enc);
  5763. }
  5764. }
  5765. return LONG2FIX(len);
  5766. }
  5767. static VALUE
  5768. rb_str_enumerate_chars(VALUE str, int wantarray)
  5769. {
  5770. VALUE orig = str;
  5771. VALUE substr;
  5772. long i, len, n;
  5773. const char *ptr;
  5774. rb_encoding *enc;
  5775. VALUE UNINITIALIZED_VAR(ary);
  5776. if (rb_block_given_p()) {
  5777. if (wantarray) {
  5778. #if 0 /* next major */
  5779. rb_warn("given block not used");
  5780. ary = rb_ary_new();
  5781. #else
  5782. rb_warning("passing a block to String#chars is deprecated");
  5783. wantarray = 0;
  5784. #endif
  5785. }
  5786. }
  5787. else {
  5788. if (wantarray)
  5789. ary = rb_ary_new();
  5790. else
  5791. RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
  5792. }
  5793. str = rb_str_new4(str);
  5794. ptr = RSTRING_PTR(str);
  5795. len = RSTRING_LEN(str);
  5796. enc = rb_enc_get(str);
  5797. switch (ENC_CODERANGE(str)) {
  5798. case ENC_CODERANGE_VALID:
  5799. case ENC_CODERANGE_7BIT:
  5800. for (i = 0; i < len; i += n) {
  5801. n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
  5802. substr = rb_str_subseq(str, i, n);
  5803. if (wantarray)
  5804. rb_ary_push(ary, substr);
  5805. else
  5806. rb_yield(substr);
  5807. }
  5808. break;
  5809. default:
  5810. for (i = 0; i < len; i += n) {
  5811. n = rb_enc_mbclen(ptr + i, ptr + len, enc);
  5812. substr = rb_str_subseq(str, i, n);
  5813. if (wantarray)
  5814. rb_ary_push(ary, substr);
  5815. else
  5816. rb_yield(substr);
  5817. }
  5818. }
  5819. RB_GC_GUARD(str);
  5820. if (wantarray)
  5821. return ary;
  5822. else
  5823. return orig;
  5824. }
  5825. /*
  5826. * call-seq:
  5827. * str.each_char {|cstr| block } -> str
  5828. * str.each_char -> an_enumerator
  5829. *
  5830. * Passes each character in <i>str</i> to the given block, or returns
  5831. * an enumerator if no block is given.
  5832. *
  5833. * "hello".each_char {|c| print c, ' ' }
  5834. *
  5835. * <em>produces:</em>
  5836. *
  5837. * h e l l o
  5838. */
  5839. static VALUE
  5840. rb_str_each_char(VALUE str)
  5841. {
  5842. return rb_str_enumerate_chars(str, 0);
  5843. }
  5844. /*
  5845. * call-seq:
  5846. * str.chars -> an_array
  5847. *
  5848. * Returns an array of characters in <i>str</i>. This is a shorthand
  5849. * for <code>str.each_char.to_a</code>.
  5850. *
  5851. * If a block is given, which is a deprecated form, works the same as
  5852. * <code>each_char</code>.
  5853. */
  5854. static VALUE
  5855. rb_str_chars(VALUE str)
  5856. {
  5857. return rb_str_enumerate_chars(str, 1);
  5858. }
  5859. static VALUE
  5860. rb_str_enumerate_codepoints(VALUE str, int wantarray)
  5861. {
  5862. VALUE orig = str;
  5863. int n;
  5864. unsigned int c;
  5865. const char *ptr, *end;
  5866. rb_encoding *enc;
  5867. VALUE UNINITIALIZED_VAR(ary);
  5868. if (single_byte_optimizable(str))
  5869. return rb_str_enumerate_bytes(str, wantarray);
  5870. if (rb_block_given_p()) {
  5871. if (wantarray) {
  5872. #if 0 /* next major */
  5873. rb_warn("given block not used");
  5874. ary = rb_ary_new();
  5875. #else
  5876. rb_warning("passing a block to String#codepoints is deprecated");
  5877. wantarray = 0;
  5878. #endif
  5879. }
  5880. }
  5881. else {
  5882. if (wantarray)
  5883. ary = rb_ary_new();
  5884. else
  5885. RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
  5886. }
  5887. str = rb_str_new4(str);
  5888. ptr = RSTRING_PTR(str);
  5889. end = RSTRING_END(str);
  5890. enc = STR_ENC_GET(str);
  5891. while (ptr < end) {
  5892. c = rb_enc_codepoint_len(ptr, end, &n, enc);
  5893. if (wantarray)
  5894. rb_ary_push(ary, UINT2NUM(c));
  5895. else
  5896. rb_yield(UINT2NUM(c));
  5897. ptr += n;
  5898. }
  5899. RB_GC_GUARD(str);
  5900. if (wantarray)
  5901. return ary;
  5902. else
  5903. return orig;
  5904. }
  5905. /*
  5906. * call-seq:
  5907. * str.each_codepoint {|integer| block } -> str
  5908. * str.each_codepoint -> an_enumerator
  5909. *
  5910. * Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
  5911. * also known as a <i>codepoint</i> when applied to Unicode strings to the
  5912. * given block.
  5913. *
  5914. * If no block is given, an enumerator is returned instead.
  5915. *
  5916. * "hello\u0639".each_codepoint {|c| print c, ' ' }
  5917. *
  5918. * <em>produces:</em>
  5919. *
  5920. * 104 101 108 108 111 1593
  5921. */
  5922. static VALUE
  5923. rb_str_each_codepoint(VALUE str)
  5924. {
  5925. return rb_str_enumerate_codepoints(str, 0);
  5926. }
  5927. /*
  5928. * call-seq:
  5929. * str.codepoints -> an_array
  5930. *
  5931. * Returns an array of the <code>Integer</code> ordinals of the
  5932. * characters in <i>str</i>. This is a shorthand for
  5933. * <code>str.each_codepoint.to_a</code>.
  5934. *
  5935. * If a block is given, which is a deprecated form, works the same as
  5936. * <code>each_codepoint</code>.
  5937. */
  5938. static VALUE
  5939. rb_str_codepoints(VALUE str)
  5940. {
  5941. return rb_str_enumerate_codepoints(str, 1);
  5942. }
  5943. static long
  5944. chopped_length(VALUE str)
  5945. {
  5946. rb_encoding *enc = STR_ENC_GET(str);
  5947. const char *p, *p2, *beg, *end;
  5948. beg = RSTRING_PTR(str);
  5949. end = beg + RSTRING_LEN(str);
  5950. if (beg > end) return 0;
  5951. p = rb_enc_prev_char(beg, end, end, enc);
  5952. if (!p) return 0;
  5953. if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
  5954. p2 = rb_enc_prev_char(beg, p, end, enc);
  5955. if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
  5956. }
  5957. return p - beg;
  5958. }
  5959. /*
  5960. * call-seq:
  5961. * str.chop! -> str or nil
  5962. *
  5963. * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
  5964. * or <code>nil</code> if <i>str</i> is the empty string. See also
  5965. * <code>String#chomp!</code>.
  5966. */
  5967. static VALUE
  5968. rb_str_chop_bang(VALUE str)
  5969. {
  5970. str_modify_keep_cr(str);
  5971. if (RSTRING_LEN(str) > 0) {
  5972. long len;
  5973. len = chopped_length(str);
  5974. STR_SET_LEN(str, len);
  5975. RSTRING_PTR(str)[len] = '\0';
  5976. if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
  5977. ENC_CODERANGE_CLEAR(str);
  5978. }
  5979. return str;
  5980. }
  5981. return Qnil;
  5982. }
  5983. /*
  5984. * call-seq:
  5985. * str.chop -> new_str
  5986. *
  5987. * Returns a new <code>String</code> with the last character removed. If the
  5988. * string ends with <code>\r\n</code>, both characters are removed. Applying
  5989. * <code>chop</code> to an empty string returns an empty
  5990. * string. <code>String#chomp</code> is often a safer alternative, as it leaves
  5991. * the string unchanged if it doesn't end in a record separator.
  5992. *
  5993. * "string\r\n".chop #=> "string"
  5994. * "string\n\r".chop #=> "string\n"
  5995. * "string\n".chop #=> "string"
  5996. * "string".chop #=> "strin"
  5997. * "x".chop.chop #=> ""
  5998. */
  5999. static VALUE
  6000. rb_str_chop(VALUE str)
  6001. {
  6002. return rb_str_subseq(str, 0, chopped_length(str));
  6003. }
  6004. /*
  6005. * call-seq:
  6006. * str.chomp!(separator=$/) -> str or nil
  6007. *
  6008. * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
  6009. * returning <i>str</i>, or <code>nil</code> if no modifications were made.
  6010. */
  6011. static VALUE
  6012. rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
  6013. {
  6014. rb_encoding *enc;
  6015. VALUE rs;
  6016. int newline;
  6017. char *p, *pp, *e;
  6018. long len, rslen;
  6019. str_modify_keep_cr(str);
  6020. len = RSTRING_LEN(str);
  6021. if (len == 0) return Qnil;
  6022. p = RSTRING_PTR(str);
  6023. e = p + len;
  6024. if (argc == 0) {
  6025. rs = rb_rs;
  6026. if (rs == rb_default_rs) {
  6027. smart_chomp:
  6028. enc = rb_enc_get(str);
  6029. if (rb_enc_mbminlen(enc) > 1) {
  6030. pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
  6031. if (rb_enc_is_newline(pp, e, enc)) {
  6032. e = pp;
  6033. }
  6034. pp = e - rb_enc_mbminlen(enc);
  6035. if (pp >= p) {
  6036. pp = rb_enc_left_char_head(p, pp, e, enc);
  6037. if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
  6038. e = pp;
  6039. }
  6040. }
  6041. if (e == RSTRING_END(str)) {
  6042. return Qnil;
  6043. }
  6044. len = e - RSTRING_PTR(str);
  6045. STR_SET_LEN(str, len);
  6046. }
  6047. else {
  6048. if (RSTRING_PTR(str)[len-1] == '\n') {
  6049. STR_DEC_LEN(str);
  6050. if (RSTRING_LEN(str) > 0 &&
  6051. RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
  6052. STR_DEC_LEN(str);
  6053. }
  6054. }
  6055. else if (RSTRING_PTR(str)[len-1] == '\r') {
  6056. STR_DEC_LEN(str);
  6057. }
  6058. else {
  6059. return Qnil;
  6060. }
  6061. }
  6062. RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
  6063. return str;
  6064. }
  6065. }
  6066. else {
  6067. rb_scan_args(argc, argv, "01", &rs);
  6068. }
  6069. if (NIL_P(rs)) return Qnil;
  6070. StringValue(rs);
  6071. rslen = RSTRING_LEN(rs);
  6072. if (rslen == 0) {
  6073. while (len>0 && p[len-1] == '\n') {
  6074. len--;
  6075. if (len>0 && p[len-1] == '\r')
  6076. len--;
  6077. }
  6078. if (len < RSTRING_LEN(str)) {
  6079. STR_SET_LEN(str, len);
  6080. RSTRING_PTR(str)[len] = '\0';
  6081. return str;
  6082. }
  6083. return Qnil;
  6084. }
  6085. if (rslen > len) return Qnil;
  6086. newline = RSTRING_PTR(rs)[rslen-1];
  6087. if (rslen == 1 && newline == '\n')
  6088. goto smart_chomp;
  6089. enc = rb_enc_check(str, rs);
  6090. if (is_broken_string(rs)) {
  6091. return Qnil;
  6092. }
  6093. pp = e - rslen;
  6094. if (p[len-1] == newline &&
  6095. (rslen <= 1 ||
  6096. memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
  6097. if (rb_enc_left_char_head(p, pp, e, enc) != pp)
  6098. return Qnil;
  6099. if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
  6100. ENC_CODERANGE_CLEAR(str);
  6101. }
  6102. STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
  6103. RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
  6104. return str;
  6105. }
  6106. return Qnil;
  6107. }
  6108. /*
  6109. * call-seq:
  6110. * str.chomp(separator=$/) -> new_str
  6111. *
  6112. * Returns a new <code>String</code> with the given record separator removed
  6113. * from the end of <i>str</i> (if present). If <code>$/</code> has not been
  6114. * changed from the default Ruby record separator, then <code>chomp</code> also
  6115. * removes carriage return characters (that is it will remove <code>\n</code>,
  6116. * <code>\r</code>, and <code>\r\n</code>).
  6117. *
  6118. * "hello".chomp #=> "hello"
  6119. * "hello\n".chomp #=> "hello"
  6120. * "hello\r\n".chomp #=> "hello"
  6121. * "hello\n\r".chomp #=> "hello\n"
  6122. * "hello\r".chomp #=> "hello"
  6123. * "hello \n there".chomp #=> "hello \n there"
  6124. * "hello".chomp("llo") #=> "he"
  6125. */
  6126. static VALUE
  6127. rb_str_chomp(int argc, VALUE *argv, VALUE str)
  6128. {
  6129. str = rb_str_dup(str);
  6130. rb_str_chomp_bang(argc, argv, str);
  6131. return str;
  6132. }
  6133. /*
  6134. * call-seq:
  6135. * str.lstrip! -> self or nil
  6136. *
  6137. * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
  6138. * change was made. See also <code>String#rstrip!</code> and
  6139. * <code>String#strip!</code>.
  6140. *
  6141. * " hello ".lstrip! #=> "hello "
  6142. * "hello".lstrip! #=> nil
  6143. */
  6144. static VALUE
  6145. rb_str_lstrip_bang(VALUE str)
  6146. {
  6147. rb_encoding *enc;
  6148. char *s, *t, *e;
  6149. str_modify_keep_cr(str);
  6150. enc = STR_ENC_GET(str);
  6151. s = RSTRING_PTR(str);
  6152. if (!s || RSTRING_LEN(str) == 0) return Qnil;
  6153. e = t = RSTRING_END(str);
  6154. /* remove spaces at head */
  6155. while (s < e) {
  6156. int n;
  6157. unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
  6158. if (!rb_isspace(cc)) break;
  6159. s += n;
  6160. }
  6161. if (s > RSTRING_PTR(str)) {
  6162. STR_SET_LEN(str, t-s);
  6163. memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
  6164. RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
  6165. return str;
  6166. }
  6167. return Qnil;
  6168. }
  6169. /*
  6170. * call-seq:
  6171. * str.lstrip -> new_str
  6172. *
  6173. * Returns a copy of <i>str</i> with leading whitespace removed. See also
  6174. * <code>String#rstrip</code> and <code>String#strip</code>.
  6175. *
  6176. * " hello ".lstrip #=> "hello "
  6177. * "hello".lstrip #=> "hello"
  6178. */
  6179. static VALUE
  6180. rb_str_lstrip(VALUE str)
  6181. {
  6182. str = rb_str_dup(str);
  6183. rb_str_lstrip_bang(str);
  6184. return str;
  6185. }
  6186. /*
  6187. * call-seq:
  6188. * str.rstrip! -> self or nil
  6189. *
  6190. * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
  6191. * no change was made. See also <code>String#lstrip!</code> and
  6192. * <code>String#strip!</code>.
  6193. *
  6194. * " hello ".rstrip! #=> " hello"
  6195. * "hello".rstrip! #=> nil
  6196. */
  6197. static VALUE
  6198. rb_str_rstrip_bang(VALUE str)
  6199. {
  6200. rb_encoding *enc;
  6201. char *s, *t, *e;
  6202. str_modify_keep_cr(str);
  6203. enc = STR_ENC_GET(str);
  6204. rb_str_check_dummy_enc(enc);
  6205. s = RSTRING_PTR(str);
  6206. if (!s || RSTRING_LEN(str) == 0) return Qnil;
  6207. t = e = RSTRING_END(str);
  6208. /* remove trailing spaces or '\0's */
  6209. if (single_byte_optimizable(str)) {
  6210. unsigned char c;
  6211. while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
  6212. }
  6213. else {
  6214. char *tp;
  6215. while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
  6216. unsigned int c = rb_enc_codepoint(tp, e, enc);
  6217. if (c && !rb_isspace(c)) break;
  6218. t = tp;
  6219. }
  6220. }
  6221. if (t < e) {
  6222. long len = t-RSTRING_PTR(str);
  6223. STR_SET_LEN(str, len);
  6224. RSTRING_PTR(str)[len] = '\0';
  6225. return str;
  6226. }
  6227. return Qnil;
  6228. }
  6229. /*
  6230. * call-seq:
  6231. * str.rstrip -> new_str
  6232. *
  6233. * Returns a copy of <i>str</i> with trailing whitespace removed. See also
  6234. * <code>String#lstrip</code> and <code>String#strip</code>.
  6235. *
  6236. * " hello ".rstrip #=> " hello"
  6237. * "hello".rstrip #=> "hello"
  6238. */
  6239. static VALUE
  6240. rb_str_rstrip(VALUE str)
  6241. {
  6242. str = rb_str_dup(str);
  6243. rb_str_rstrip_bang(str);
  6244. return str;
  6245. }
  6246. /*
  6247. * call-seq:
  6248. * str.strip! -> str or nil
  6249. *
  6250. * Removes leading and trailing whitespace from <i>str</i>. Returns
  6251. * <code>nil</code> if <i>str</i> was not altered.
  6252. */
  6253. static VALUE
  6254. rb_str_strip_bang(VALUE str)
  6255. {
  6256. VALUE l = rb_str_lstrip_bang(str);
  6257. VALUE r = rb_str_rstrip_bang(str);
  6258. if (NIL_P(l) && NIL_P(r)) return Qnil;
  6259. return str;
  6260. }
  6261. /*
  6262. * call-seq:
  6263. * str.strip -> new_str
  6264. *
  6265. * Returns a copy of <i>str</i> with leading and trailing whitespace removed.
  6266. *
  6267. * " hello ".strip #=> "hello"
  6268. * "\tgoodbye\r\n".strip #=> "goodbye"
  6269. */
  6270. static VALUE
  6271. rb_str_strip(VALUE str)
  6272. {
  6273. str = rb_str_dup(str);
  6274. rb_str_strip_bang(str);
  6275. return str;
  6276. }
  6277. static VALUE
  6278. scan_once(VALUE str, VALUE pat, long *start)
  6279. {
  6280. VALUE result, match;
  6281. struct re_registers *regs;
  6282. int i;
  6283. if (rb_reg_search(pat, str, *start, 0) >= 0) {
  6284. match = rb_backref_get();
  6285. regs = RMATCH_REGS(match);
  6286. if (BEG(0) == END(0)) {
  6287. rb_encoding *enc = STR_ENC_GET(str);
  6288. /*
  6289. * Always consume at least one character of the input string
  6290. */
  6291. if (RSTRING_LEN(str) > END(0))
  6292. *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
  6293. RSTRING_END(str), enc);
  6294. else
  6295. *start = END(0)+1;
  6296. }
  6297. else {
  6298. *start = END(0);
  6299. }
  6300. if (regs->num_regs == 1) {
  6301. return rb_reg_nth_match(0, match);
  6302. }
  6303. result = rb_ary_new2(regs->num_regs);
  6304. for (i=1; i < regs->num_regs; i++) {
  6305. rb_ary_push(result, rb_reg_nth_match(i, match));
  6306. }
  6307. return result;
  6308. }
  6309. return Qnil;
  6310. }
  6311. /*
  6312. * call-seq:
  6313. * str.scan(pattern) -> array
  6314. * str.scan(pattern) {|match, ...| block } -> str
  6315. *
  6316. * Both forms iterate through <i>str</i>, matching the pattern (which may be a
  6317. * <code>Regexp</code> or a <code>String</code>). For each match, a result is
  6318. * generated and either added to the result array or passed to the block. If
  6319. * the pattern contains no groups, each individual result consists of the
  6320. * matched string, <code>$&</code>. If the pattern contains groups, each
  6321. * individual result is itself an array containing one entry per group.
  6322. *
  6323. * a = "cruel world"
  6324. * a.scan(/\w+/) #=> ["cruel", "world"]
  6325. * a.scan(/.../) #=> ["cru", "el ", "wor"]
  6326. * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
  6327. * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
  6328. *
  6329. * And the block form:
  6330. *
  6331. * a.scan(/\w+/) {|w| print "<<#{w}>> " }
  6332. * print "\n"
  6333. * a.scan(/(.)(.)/) {|x,y| print y, x }
  6334. * print "\n"
  6335. *
  6336. * <em>produces:</em>
  6337. *
  6338. * <<cruel>> <<world>>
  6339. * rceu lowlr
  6340. */
  6341. static VALUE
  6342. rb_str_scan(VALUE str, VALUE pat)
  6343. {
  6344. VALUE result;
  6345. long start = 0;
  6346. long last = -1, prev = 0;
  6347. char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
  6348. pat = get_pat(pat, 1);
  6349. if (!rb_block_given_p()) {
  6350. VALUE ary = rb_ary_new();
  6351. while (!NIL_P(result = scan_once(str, pat, &start))) {
  6352. last = prev;
  6353. prev = start;
  6354. rb_ary_push(ary, result);
  6355. }
  6356. if (last >= 0) rb_reg_search(pat, str, last, 0);
  6357. return ary;
  6358. }
  6359. while (!NIL_P(result = scan_once(str, pat, &start))) {
  6360. last = prev;
  6361. prev = start;
  6362. rb_yield(result);
  6363. str_mod_check(str, p, len);
  6364. }
  6365. if (last >= 0) rb_reg_search(pat, str, last, 0);
  6366. return str;
  6367. }
  6368. /*
  6369. * call-seq:
  6370. * str.hex -> integer
  6371. *
  6372. * Treats leading characters from <i>str</i> as a string of hexadecimal digits
  6373. * (with an optional sign and an optional <code>0x</code>) and returns the
  6374. * corresponding number. Zero is returned on error.
  6375. *
  6376. * "0x0a".hex #=> 10
  6377. * "-1234".hex #=> -4660
  6378. * "0".hex #=> 0
  6379. * "wombat".hex #=> 0
  6380. */
  6381. static VALUE
  6382. rb_str_hex(VALUE str)
  6383. {
  6384. return rb_str_to_inum(str, 16, FALSE);
  6385. }
  6386. /*
  6387. * call-seq:
  6388. * str.oct -> integer
  6389. *
  6390. * Treats leading characters of <i>str</i> as a string of octal digits (with an
  6391. * optional sign) and returns the corresponding number. Returns 0 if the
  6392. * conversion fails.
  6393. *
  6394. * "123".oct #=> 83
  6395. * "-377".oct #=> -255
  6396. * "bad".oct #=> 0
  6397. * "0377bad".oct #=> 255
  6398. */
  6399. static VALUE
  6400. rb_str_oct(VALUE str)
  6401. {
  6402. return rb_str_to_inum(str, -8, FALSE);
  6403. }
  6404. /*
  6405. * call-seq:
  6406. * str.crypt(salt_str) -> new_str
  6407. *
  6408. * Applies a one-way cryptographic hash to <i>str</i> by invoking the
  6409. * standard library function <code>crypt(3)</code> with the given
  6410. * salt string. While the format and the result are system and
  6411. * implementation dependent, using a salt matching the regular
  6412. * expression <code>\A[a-zA-Z0-9./]{2}</code> should be valid and
  6413. * safe on any platform, in which only the first two characters are
  6414. * significant.
  6415. *
  6416. * This method is for use in system specific scripts, so if you want
  6417. * a cross-platform hash function consider using Digest or OpenSSL
  6418. * instead.
  6419. */
  6420. static VALUE
  6421. rb_str_crypt(VALUE str, VALUE salt)
  6422. {
  6423. extern char *crypt(const char *, const char *);
  6424. VALUE result;
  6425. const char *s, *saltp;
  6426. char *res;
  6427. #ifdef BROKEN_CRYPT
  6428. char salt_8bit_clean[3];
  6429. #endif
  6430. StringValue(salt);
  6431. if (RSTRING_LEN(salt) < 2)
  6432. rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
  6433. s = RSTRING_PTR(str);
  6434. if (!s) s = "";
  6435. saltp = RSTRING_PTR(salt);
  6436. #ifdef BROKEN_CRYPT
  6437. if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
  6438. salt_8bit_clean[0] = saltp[0] & 0x7f;
  6439. salt_8bit_clean[1] = saltp[1] & 0x7f;
  6440. salt_8bit_clean[2] = '\0';
  6441. saltp = salt_8bit_clean;
  6442. }
  6443. #endif
  6444. res = crypt(s, saltp);
  6445. if (!res) {
  6446. rb_sys_fail("crypt");
  6447. }
  6448. result = rb_str_new2(res);
  6449. OBJ_INFECT(result, str);
  6450. OBJ_INFECT(result, salt);
  6451. return result;
  6452. }
  6453. /*
  6454. * call-seq:
  6455. * str.intern -> symbol
  6456. * str.to_sym -> symbol
  6457. *
  6458. * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
  6459. * symbol if it did not previously exist. See <code>Symbol#id2name</code>.
  6460. *
  6461. * "Koala".intern #=> :Koala
  6462. * s = 'cat'.to_sym #=> :cat
  6463. * s == :cat #=> true
  6464. * s = '@cat'.to_sym #=> :@cat
  6465. * s == :@cat #=> true
  6466. *
  6467. * This can also be used to create symbols that cannot be represented using the
  6468. * <code>:xxx</code> notation.
  6469. *
  6470. * 'cat and dog'.to_sym #=> :"cat and dog"
  6471. */
  6472. VALUE
  6473. rb_str_intern(VALUE s)
  6474. {
  6475. VALUE str = RB_GC_GUARD(s);
  6476. ID id;
  6477. id = rb_intern_str(str);
  6478. return ID2SYM(id);
  6479. }
  6480. /*
  6481. * call-seq:
  6482. * str.ord -> integer
  6483. *
  6484. * Return the <code>Integer</code> ordinal of a one-character string.
  6485. *
  6486. * "a".ord #=> 97
  6487. */
  6488. VALUE
  6489. rb_str_ord(VALUE s)
  6490. {
  6491. unsigned int c;
  6492. c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
  6493. return UINT2NUM(c);
  6494. }
  6495. /*
  6496. * call-seq:
  6497. * str.sum(n=16) -> integer
  6498. *
  6499. * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
  6500. * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
  6501. * to 16. The result is simply the sum of the binary value of each byte in
  6502. * <i>str</i> modulo <code>2**n</code> (that is, masked with <code>2**n - 1</code>). This is not a particularly good
  6503. * checksum.
  6504. */
  6505. static VALUE
  6506. rb_str_sum(int argc, VALUE *argv, VALUE str)
  6507. {
  6508. VALUE vbits;
  6509. int bits;
  6510. char *ptr, *p, *pend;
  6511. long len;
  6512. VALUE sum = INT2FIX(0);
  6513. unsigned long sum0 = 0;
  6514. if (argc == 0) {
  6515. bits = 16;
  6516. }
  6517. else {
  6518. rb_scan_args(argc, argv, "01", &vbits);
  6519. bits = NUM2INT(vbits);
  6520. }
  6521. ptr = p = RSTRING_PTR(str);
  6522. len = RSTRING_LEN(str);
  6523. pend = p + len;
  6524. while (p < pend) {
  6525. if (FIXNUM_MAX - UCHAR_MAX < sum0) {
  6526. sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
  6527. str_mod_check(str, ptr, len);
  6528. sum0 = 0;
  6529. }
  6530. sum0 += (unsigned char)*p;
  6531. p++;
  6532. }
  6533. if (bits == 0) {
  6534. if (sum0) {
  6535. sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
  6536. }
  6537. }
  6538. else {
  6539. if (sum == INT2FIX(0)) {
  6540. if (bits < (int)sizeof(long)*CHAR_BIT) {
  6541. sum0 &= (((unsigned long)1)<<bits)-1;
  6542. }
  6543. sum = LONG2FIX(sum0);
  6544. }
  6545. else {
  6546. VALUE mod;
  6547. if (sum0) {
  6548. sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
  6549. }
  6550. mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
  6551. mod = rb_funcall(mod, '-', 1, INT2FIX(1));
  6552. sum = rb_funcall(sum, '&', 1, mod);
  6553. }
  6554. }
  6555. return sum;
  6556. }
  6557. static VALUE
  6558. rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
  6559. {
  6560. rb_encoding *enc;
  6561. VALUE w;
  6562. long width, len, flen = 1, fclen = 1;
  6563. VALUE res;
  6564. char *p;
  6565. const char *f = " ";
  6566. long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
  6567. volatile VALUE pad;
  6568. int singlebyte = 1, cr;
  6569. rb_scan_args(argc, argv, "11", &w, &pad);
  6570. enc = STR_ENC_GET(str);
  6571. width = NUM2LONG(w);
  6572. if (argc == 2) {
  6573. StringValue(pad);
  6574. enc = rb_enc_check(str, pad);
  6575. f = RSTRING_PTR(pad);
  6576. flen = RSTRING_LEN(pad);
  6577. fclen = str_strlen(pad, enc);
  6578. singlebyte = single_byte_optimizable(pad);
  6579. if (flen == 0 || fclen == 0) {
  6580. rb_raise(rb_eArgError, "zero width padding");
  6581. }
  6582. }
  6583. len = str_strlen(str, enc);
  6584. if (width < 0 || len >= width) return rb_str_dup(str);
  6585. n = width - len;
  6586. llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
  6587. rlen = n - llen;
  6588. cr = ENC_CODERANGE(str);
  6589. if (flen > 1) {
  6590. llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
  6591. rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
  6592. }
  6593. size = RSTRING_LEN(str);
  6594. if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
  6595. (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
  6596. (len += llen2 + rlen2) >= LONG_MAX - size) {
  6597. rb_raise(rb_eArgError, "argument too big");
  6598. }
  6599. len += size;
  6600. res = rb_str_new5(str, 0, len);
  6601. p = RSTRING_PTR(res);
  6602. if (flen <= 1) {
  6603. memset(p, *f, llen);
  6604. p += llen;
  6605. }
  6606. else {
  6607. while (llen >= fclen) {
  6608. memcpy(p,f,flen);
  6609. p += flen;
  6610. llen -= fclen;
  6611. }
  6612. if (llen > 0) {
  6613. memcpy(p, f, llen2);
  6614. p += llen2;
  6615. }
  6616. }
  6617. memcpy(p, RSTRING_PTR(str), size);
  6618. p += size;
  6619. if (flen <= 1) {
  6620. memset(p, *f, rlen);
  6621. p += rlen;
  6622. }
  6623. else {
  6624. while (rlen >= fclen) {
  6625. memcpy(p,f,flen);
  6626. p += flen;
  6627. rlen -= fclen;
  6628. }
  6629. if (rlen > 0) {
  6630. memcpy(p, f, rlen2);
  6631. p += rlen2;
  6632. }
  6633. }
  6634. *p = '\0';
  6635. STR_SET_LEN(res, p-RSTRING_PTR(res));
  6636. OBJ_INFECT(res, str);
  6637. if (!NIL_P(pad)) OBJ_INFECT(res, pad);
  6638. rb_enc_associate(res, enc);
  6639. if (argc == 2)
  6640. cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
  6641. if (cr != ENC_CODERANGE_BROKEN)
  6642. ENC_CODERANGE_SET(res, cr);
  6643. return res;
  6644. }
  6645. /*
  6646. * call-seq:
  6647. * str.ljust(integer, padstr=' ') -> new_str
  6648. *
  6649. * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
  6650. * <code>String</code> of length <i>integer</i> with <i>str</i> left justified
  6651. * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
  6652. *
  6653. * "hello".ljust(4) #=> "hello"
  6654. * "hello".ljust(20) #=> "hello "
  6655. * "hello".ljust(20, '1234') #=> "hello123412341234123"
  6656. */
  6657. static VALUE
  6658. rb_str_ljust(int argc, VALUE *argv, VALUE str)
  6659. {
  6660. return rb_str_justify(argc, argv, str, 'l');
  6661. }
  6662. /*
  6663. * call-seq:
  6664. * str.rjust(integer, padstr=' ') -> new_str
  6665. *
  6666. * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
  6667. * <code>String</code> of length <i>integer</i> with <i>str</i> right justified
  6668. * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
  6669. *
  6670. * "hello".rjust(4) #=> "hello"
  6671. * "hello".rjust(20) #=> " hello"
  6672. * "hello".rjust(20, '1234') #=> "123412341234123hello"
  6673. */
  6674. static VALUE
  6675. rb_str_rjust(int argc, VALUE *argv, VALUE str)
  6676. {
  6677. return rb_str_justify(argc, argv, str, 'r');
  6678. }
  6679. /*
  6680. * call-seq:
  6681. * str.center(width, padstr=' ') -> new_str
  6682. *
  6683. * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
  6684. * returns a new String of length +width+ with +str+ centered and padded with
  6685. * +padstr+; otherwise, returns +str+.
  6686. *
  6687. * "hello".center(4) #=> "hello"
  6688. * "hello".center(20) #=> " hello "
  6689. * "hello".center(20, '123') #=> "1231231hello12312312"
  6690. */
  6691. static VALUE
  6692. rb_str_center(int argc, VALUE *argv, VALUE str)
  6693. {
  6694. return rb_str_justify(argc, argv, str, 'c');
  6695. }
  6696. /*
  6697. * call-seq:
  6698. * str.partition(sep) -> [head, sep, tail]
  6699. * str.partition(regexp) -> [head, match, tail]
  6700. *
  6701. * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
  6702. * and returns the part before it, the match, and the part
  6703. * after it.
  6704. * If it is not found, returns <i>str</i> and two empty strings.
  6705. *
  6706. * "hello".partition("l") #=> ["he", "l", "lo"]
  6707. * "hello".partition("x") #=> ["hello", "", ""]
  6708. * "hello".partition(/.l/) #=> ["h", "el", "lo"]
  6709. */
  6710. static VALUE
  6711. rb_str_partition(VALUE str, VALUE sep)
  6712. {
  6713. long pos;
  6714. int regex = FALSE;
  6715. if (RB_TYPE_P(sep, T_REGEXP)) {
  6716. pos = rb_reg_search(sep, str, 0, 0);
  6717. regex = TRUE;
  6718. }
  6719. else {
  6720. VALUE tmp;
  6721. tmp = rb_check_string_type(sep);
  6722. if (NIL_P(tmp)) {
  6723. rb_raise(rb_eTypeError, "type mismatch: %s given",
  6724. rb_obj_classname(sep));
  6725. }
  6726. sep = tmp;
  6727. pos = rb_str_index(str, sep, 0);
  6728. }
  6729. if (pos < 0) {
  6730. failed:
  6731. return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
  6732. }
  6733. if (regex) {
  6734. sep = rb_str_subpat(str, sep, INT2FIX(0));
  6735. if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
  6736. }
  6737. return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
  6738. sep,
  6739. rb_str_subseq(str, pos+RSTRING_LEN(sep),
  6740. RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
  6741. }
  6742. /*
  6743. * call-seq:
  6744. * str.rpartition(sep) -> [head, sep, tail]
  6745. * str.rpartition(regexp) -> [head, match, tail]
  6746. *
  6747. * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
  6748. * of the string, and returns the part before it, the match, and the part
  6749. * after it.
  6750. * If it is not found, returns two empty strings and <i>str</i>.
  6751. *
  6752. * "hello".rpartition("l") #=> ["hel", "l", "o"]
  6753. * "hello".rpartition("x") #=> ["", "", "hello"]
  6754. * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
  6755. */
  6756. static VALUE
  6757. rb_str_rpartition(VALUE str, VALUE sep)
  6758. {
  6759. long pos = RSTRING_LEN(str);
  6760. int regex = FALSE;
  6761. if (RB_TYPE_P(sep, T_REGEXP)) {
  6762. pos = rb_reg_search(sep, str, pos, 1);
  6763. regex = TRUE;
  6764. }
  6765. else {
  6766. VALUE tmp;
  6767. tmp = rb_check_string_type(sep);
  6768. if (NIL_P(tmp)) {
  6769. rb_raise(rb_eTypeError, "type mismatch: %s given",
  6770. rb_obj_classname(sep));
  6771. }
  6772. sep = tmp;
  6773. pos = rb_str_sublen(str, pos);
  6774. pos = rb_str_rindex(str, sep, pos);
  6775. }
  6776. if (pos < 0) {
  6777. return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
  6778. }
  6779. if (regex) {
  6780. sep = rb_reg_nth_match(0, rb_backref_get());
  6781. }
  6782. return rb_ary_new3(3, rb_str_substr(str, 0, pos),
  6783. sep,
  6784. rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
  6785. }
  6786. /*
  6787. * call-seq:
  6788. * str.start_with?([prefixes]+) -> true or false
  6789. *
  6790. * Returns true if +str+ starts with one of the +prefixes+ given.
  6791. *
  6792. * "hello".start_with?("hell") #=> true
  6793. *
  6794. * # returns true if one of the prefixes matches.
  6795. * "hello".start_with?("heaven", "hell") #=> true
  6796. * "hello".start_with?("heaven", "paradise") #=> false
  6797. */
  6798. static VALUE
  6799. rb_str_start_with(int argc, VALUE *argv, VALUE str)
  6800. {
  6801. int i;
  6802. for (i=0; i<argc; i++) {
  6803. VALUE tmp = argv[i];
  6804. StringValue(tmp);
  6805. rb_enc_check(str, tmp);
  6806. if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
  6807. if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
  6808. return Qtrue;
  6809. }
  6810. return Qfalse;
  6811. }
  6812. /*
  6813. * call-seq:
  6814. * str.end_with?([suffixes]+) -> true or false
  6815. *
  6816. * Returns true if +str+ ends with one of the +suffixes+ given.
  6817. */
  6818. static VALUE
  6819. rb_str_end_with(int argc, VALUE *argv, VALUE str)
  6820. {
  6821. int i;
  6822. char *p, *s, *e;
  6823. rb_encoding *enc;
  6824. for (i=0; i<argc; i++) {
  6825. VALUE tmp = argv[i];
  6826. StringValue(tmp);
  6827. enc = rb_enc_check(str, tmp);
  6828. if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
  6829. p = RSTRING_PTR(str);
  6830. e = p + RSTRING_LEN(str);
  6831. s = e - RSTRING_LEN(tmp);
  6832. if (rb_enc_left_char_head(p, s, e, enc) != s)
  6833. continue;
  6834. if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
  6835. return Qtrue;
  6836. }
  6837. return Qfalse;
  6838. }
  6839. void
  6840. rb_str_setter(VALUE val, ID id, VALUE *var)
  6841. {
  6842. if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
  6843. rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
  6844. }
  6845. *var = val;
  6846. }
  6847. /*
  6848. * call-seq:
  6849. * str.force_encoding(encoding) -> str
  6850. *
  6851. * Changes the encoding to +encoding+ and returns self.
  6852. */
  6853. static VALUE
  6854. rb_str_force_encoding(VALUE str, VALUE enc)
  6855. {
  6856. str_modifiable(str);
  6857. rb_enc_associate(str, rb_to_encoding(enc));
  6858. ENC_CODERANGE_CLEAR(str);
  6859. return str;
  6860. }
  6861. /*
  6862. * call-seq:
  6863. * str.b -> str
  6864. *
  6865. * Returns a copied string whose encoding is ASCII-8BIT.
  6866. */
  6867. static VALUE
  6868. rb_str_b(VALUE str)
  6869. {
  6870. VALUE str2 = str_alloc(rb_cString);
  6871. str_replace_shared_without_enc(str2, str);
  6872. OBJ_INFECT(str2, str);
  6873. ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
  6874. return str2;
  6875. }
  6876. /*
  6877. * call-seq:
  6878. * str.valid_encoding? -> true or false
  6879. *
  6880. * Returns true for a string which encoded correctly.
  6881. *
  6882. * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
  6883. * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
  6884. * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
  6885. */
  6886. static VALUE
  6887. rb_str_valid_encoding_p(VALUE str)
  6888. {
  6889. int cr = rb_enc_str_coderange(str);
  6890. return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
  6891. }
  6892. /*
  6893. * call-seq:
  6894. * str.ascii_only? -> true or false
  6895. *
  6896. * Returns true for a string which has only ASCII characters.
  6897. *
  6898. * "abc".force_encoding("UTF-8").ascii_only? #=> true
  6899. * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
  6900. */
  6901. static VALUE
  6902. rb_str_is_ascii_only_p(VALUE str)
  6903. {
  6904. int cr = rb_enc_str_coderange(str);
  6905. return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
  6906. }
  6907. /**
  6908. * Shortens _str_ and adds three dots, an ellipsis, if it is longer
  6909. * than _len_ characters.
  6910. *
  6911. * \param str the string to ellipsize.
  6912. * \param len the maximum string length.
  6913. * \return the ellipsized string.
  6914. * \pre _len_ must not be negative.
  6915. * \post the length of the returned string in characters is less than or equal to _len_.
  6916. * \post If the length of _str_ is less than or equal to _len_, returns _str_ itself.
  6917. * \post the encoding of the returned string is equal to the encoding of _str_.
  6918. * \post the class of returned string is equal to the class of _str_.
  6919. * \note the length is counted in characters.
  6920. */
  6921. VALUE
  6922. rb_str_ellipsize(VALUE str, long len)
  6923. {
  6924. static const char ellipsis[] = "...";
  6925. const long ellipsislen = sizeof(ellipsis) - 1;
  6926. rb_encoding *const enc = rb_enc_get(str);
  6927. const long blen = RSTRING_LEN(str);
  6928. const char *const p = RSTRING_PTR(str), *e = p + blen;
  6929. VALUE estr, ret = 0;
  6930. if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
  6931. if (len * rb_enc_mbminlen(enc) >= blen ||
  6932. (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
  6933. ret = str;
  6934. }
  6935. else if (len <= ellipsislen ||
  6936. !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
  6937. if (rb_enc_asciicompat(enc)) {
  6938. ret = rb_str_new_with_class(str, ellipsis, len);
  6939. rb_enc_associate(ret, enc);
  6940. }
  6941. else {
  6942. estr = rb_usascii_str_new(ellipsis, len);
  6943. ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
  6944. }
  6945. }
  6946. else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
  6947. rb_str_cat(ret, ellipsis, ellipsislen);
  6948. }
  6949. else {
  6950. estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
  6951. rb_enc_from_encoding(enc), 0, Qnil);
  6952. rb_str_append(ret, estr);
  6953. }
  6954. return ret;
  6955. }
  6956. /**********************************************************************
  6957. * Document-class: Symbol
  6958. *
  6959. * <code>Symbol</code> objects represent names and some strings
  6960. * inside the Ruby
  6961. * interpreter. They are generated using the <code>:name</code> and
  6962. * <code>:"string"</code> literals
  6963. * syntax, and by the various <code>to_sym</code> methods. The same
  6964. * <code>Symbol</code> object will be created for a given name or string
  6965. * for the duration of a program's execution, regardless of the context
  6966. * or meaning of that name. Thus if <code>Fred</code> is a constant in
  6967. * one context, a method in another, and a class in a third, the
  6968. * <code>Symbol</code> <code>:Fred</code> will be the same object in
  6969. * all three contexts.
  6970. *
  6971. * module One
  6972. * class Fred
  6973. * end
  6974. * $f1 = :Fred
  6975. * end
  6976. * module Two
  6977. * Fred = 1
  6978. * $f2 = :Fred
  6979. * end
  6980. * def Fred()
  6981. * end
  6982. * $f3 = :Fred
  6983. * $f1.object_id #=> 2514190
  6984. * $f2.object_id #=> 2514190
  6985. * $f3.object_id #=> 2514190
  6986. *
  6987. */
  6988. /*
  6989. * call-seq:
  6990. * sym == obj -> true or false
  6991. *
  6992. * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
  6993. * symbol, returns <code>true</code>.
  6994. */
  6995. static VALUE
  6996. sym_equal(VALUE sym1, VALUE sym2)
  6997. {
  6998. if (sym1 == sym2) return Qtrue;
  6999. return Qfalse;
  7000. }
  7001. static int
  7002. sym_printable(const char *s, const char *send, rb_encoding *enc)
  7003. {
  7004. while (s < send) {
  7005. int n;
  7006. int c = rb_enc_codepoint_len(s, send, &n, enc);
  7007. if (!rb_enc_isprint(c, enc)) return FALSE;
  7008. s += n;
  7009. }
  7010. return TRUE;
  7011. }
  7012. int
  7013. rb_str_symname_p(VALUE sym)
  7014. {
  7015. rb_encoding *enc;
  7016. const char *ptr;
  7017. long len;
  7018. rb_encoding *resenc = rb_default_internal_encoding();
  7019. if (resenc == NULL) resenc = rb_default_external_encoding();
  7020. enc = STR_ENC_GET(sym);
  7021. ptr = RSTRING_PTR(sym);
  7022. len = RSTRING_LEN(sym);
  7023. if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
  7024. !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
  7025. return FALSE;
  7026. }
  7027. return TRUE;
  7028. }
  7029. VALUE
  7030. rb_str_quote_unprintable(VALUE str)
  7031. {
  7032. rb_encoding *enc;
  7033. const char *ptr;
  7034. long len;
  7035. rb_encoding *resenc = rb_default_internal_encoding();
  7036. if (resenc == NULL) resenc = rb_default_external_encoding();
  7037. enc = STR_ENC_GET(str);
  7038. ptr = RSTRING_PTR(str);
  7039. len = RSTRING_LEN(str);
  7040. if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
  7041. !sym_printable(ptr, ptr + len, enc)) {
  7042. return rb_str_inspect(str);
  7043. }
  7044. return str;
  7045. }
  7046. VALUE
  7047. rb_id_quote_unprintable(ID id)
  7048. {
  7049. return rb_str_quote_unprintable(rb_id2str(id));
  7050. }
  7051. /*
  7052. * call-seq:
  7053. * sym.inspect -> string
  7054. *
  7055. * Returns the representation of <i>sym</i> as a symbol literal.
  7056. *
  7057. * :fred.inspect #=> ":fred"
  7058. */
  7059. static VALUE
  7060. sym_inspect(VALUE sym)
  7061. {
  7062. VALUE str;
  7063. const char *ptr;
  7064. long len;
  7065. ID id = SYM2ID(sym);
  7066. char *dest;
  7067. sym = rb_id2str(id);
  7068. if (!rb_str_symname_p(sym)) {
  7069. str = rb_str_inspect(sym);
  7070. len = RSTRING_LEN(str);
  7071. rb_str_resize(str, len + 1);
  7072. dest = RSTRING_PTR(str);
  7073. memmove(dest + 1, dest, len);
  7074. dest[0] = ':';
  7075. }
  7076. else {
  7077. rb_encoding *enc = STR_ENC_GET(sym);
  7078. ptr = RSTRING_PTR(sym);
  7079. len = RSTRING_LEN(sym);
  7080. str = rb_enc_str_new(0, len + 1, enc);
  7081. dest = RSTRING_PTR(str);
  7082. dest[0] = ':';
  7083. memcpy(dest + 1, ptr, len);
  7084. }
  7085. return str;
  7086. }
  7087. /*
  7088. * call-seq:
  7089. * sym.id2name -> string
  7090. * sym.to_s -> string
  7091. *
  7092. * Returns the name or string corresponding to <i>sym</i>.
  7093. *
  7094. * :fred.id2name #=> "fred"
  7095. */
  7096. VALUE
  7097. rb_sym_to_s(VALUE sym)
  7098. {
  7099. ID id = SYM2ID(sym);
  7100. return str_new3(rb_cString, rb_id2str(id));
  7101. }
  7102. /*
  7103. * call-seq:
  7104. * sym.to_sym -> sym
  7105. * sym.intern -> sym
  7106. *
  7107. * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
  7108. * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
  7109. * in this case.
  7110. */
  7111. static VALUE
  7112. sym_to_sym(VALUE sym)
  7113. {
  7114. return sym;
  7115. }
  7116. static VALUE
  7117. sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
  7118. {
  7119. VALUE obj;
  7120. if (argc < 1) {
  7121. rb_raise(rb_eArgError, "no receiver given");
  7122. }
  7123. obj = argv[0];
  7124. return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1);
  7125. }
  7126. /*
  7127. * call-seq:
  7128. * sym.to_proc
  7129. *
  7130. * Returns a _Proc_ object which respond to the given method by _sym_.
  7131. *
  7132. * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
  7133. */
  7134. static VALUE
  7135. sym_to_proc(VALUE sym)
  7136. {
  7137. static VALUE sym_proc_cache = Qfalse;
  7138. enum {SYM_PROC_CACHE_SIZE = 67};
  7139. VALUE proc;
  7140. long id, index;
  7141. VALUE *aryp;
  7142. if (!sym_proc_cache) {
  7143. sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
  7144. rb_gc_register_mark_object(sym_proc_cache);
  7145. rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
  7146. }
  7147. id = SYM2ID(sym);
  7148. index = (id % SYM_PROC_CACHE_SIZE) << 1;
  7149. aryp = RARRAY_PTR(sym_proc_cache);
  7150. if (aryp[index] == sym) {
  7151. return aryp[index + 1];
  7152. }
  7153. else {
  7154. proc = rb_proc_new(sym_call, (VALUE)id);
  7155. aryp[index] = sym;
  7156. aryp[index + 1] = proc;
  7157. return proc;
  7158. }
  7159. }
  7160. /*
  7161. * call-seq:
  7162. *
  7163. * sym.succ
  7164. *
  7165. * Same as <code>sym.to_s.succ.intern</code>.
  7166. */
  7167. static VALUE
  7168. sym_succ(VALUE sym)
  7169. {
  7170. return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
  7171. }
  7172. /*
  7173. * call-seq:
  7174. *
  7175. * str <=> other -> -1, 0, +1 or nil
  7176. *
  7177. * Compares _sym_ with _other_ in string form.
  7178. */
  7179. static VALUE
  7180. sym_cmp(VALUE sym, VALUE other)
  7181. {
  7182. if (!SYMBOL_P(other)) {
  7183. return Qnil;
  7184. }
  7185. return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
  7186. }
  7187. /*
  7188. * call-seq:
  7189. *
  7190. * sym.casecmp(other) -> -1, 0, +1 or nil
  7191. *
  7192. * Case-insensitive version of <code>Symbol#<=></code>.
  7193. */
  7194. static VALUE
  7195. sym_casecmp(VALUE sym, VALUE other)
  7196. {
  7197. if (!SYMBOL_P(other)) {
  7198. return Qnil;
  7199. }
  7200. return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
  7201. }
  7202. /*
  7203. * call-seq:
  7204. * sym =~ obj -> fixnum or nil
  7205. *
  7206. * Returns <code>sym.to_s =~ obj</code>.
  7207. */
  7208. static VALUE
  7209. sym_match(VALUE sym, VALUE other)
  7210. {
  7211. return rb_str_match(rb_sym_to_s(sym), other);
  7212. }
  7213. /*
  7214. * call-seq:
  7215. * sym[idx] -> char
  7216. * sym[b, n] -> char
  7217. *
  7218. * Returns <code>sym.to_s[]</code>.
  7219. */
  7220. static VALUE
  7221. sym_aref(int argc, VALUE *argv, VALUE sym)
  7222. {
  7223. return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
  7224. }
  7225. /*
  7226. * call-seq:
  7227. * sym.length -> integer
  7228. *
  7229. * Same as <code>sym.to_s.length</code>.
  7230. */
  7231. static VALUE
  7232. sym_length(VALUE sym)
  7233. {
  7234. return rb_str_length(rb_id2str(SYM2ID(sym)));
  7235. }
  7236. /*
  7237. * call-seq:
  7238. * sym.empty? -> true or false
  7239. *
  7240. * Returns that _sym_ is :"" or not.
  7241. */
  7242. static VALUE
  7243. sym_empty(VALUE sym)
  7244. {
  7245. return rb_str_empty(rb_id2str(SYM2ID(sym)));
  7246. }
  7247. /*
  7248. * call-seq:
  7249. * sym.upcase -> symbol
  7250. *
  7251. * Same as <code>sym.to_s.upcase.intern</code>.
  7252. */
  7253. static VALUE
  7254. sym_upcase(VALUE sym)
  7255. {
  7256. return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
  7257. }
  7258. /*
  7259. * call-seq:
  7260. * sym.downcase -> symbol
  7261. *
  7262. * Same as <code>sym.to_s.downcase.intern</code>.
  7263. */
  7264. static VALUE
  7265. sym_downcase(VALUE sym)
  7266. {
  7267. return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
  7268. }
  7269. /*
  7270. * call-seq:
  7271. * sym.capitalize -> symbol
  7272. *
  7273. * Same as <code>sym.to_s.capitalize.intern</code>.
  7274. */
  7275. static VALUE
  7276. sym_capitalize(VALUE sym)
  7277. {
  7278. return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
  7279. }
  7280. /*
  7281. * call-seq:
  7282. * sym.swapcase -> symbol
  7283. *
  7284. * Same as <code>sym.to_s.swapcase.intern</code>.
  7285. */
  7286. static VALUE
  7287. sym_swapcase(VALUE sym)
  7288. {
  7289. return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
  7290. }
  7291. /*
  7292. * call-seq:
  7293. * sym.encoding -> encoding
  7294. *
  7295. * Returns the Encoding object that represents the encoding of _sym_.
  7296. */
  7297. static VALUE
  7298. sym_encoding(VALUE sym)
  7299. {
  7300. return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
  7301. }
  7302. ID
  7303. rb_to_id(VALUE name)
  7304. {
  7305. VALUE tmp;
  7306. switch (TYPE(name)) {
  7307. default:
  7308. tmp = rb_check_string_type(name);
  7309. if (NIL_P(tmp)) {
  7310. tmp = rb_inspect(name);
  7311. rb_raise(rb_eTypeError, "%s is not a symbol",
  7312. RSTRING_PTR(tmp));
  7313. }
  7314. name = tmp;
  7315. /* fall through */
  7316. case T_STRING:
  7317. name = rb_str_intern(name);
  7318. /* fall through */
  7319. case T_SYMBOL:
  7320. return SYM2ID(name);
  7321. }
  7322. UNREACHABLE;
  7323. }
  /*
   *  A <code>String</code> object holds and manipulates an arbitrary sequence of
   *  bytes, typically representing characters. String objects may be created
   *  using <code>String::new</code> or as literals.
   *
   *  Because of aliasing issues, users of strings should be aware of the methods
   *  that modify the contents of a <code>String</code> object.  Typically,
   *  methods with names ending in ``!'' modify their receiver, while those
   *  without a ``!'' return a new <code>String</code>.  However, there are
   *  exceptions, such as <code>String#[]=</code>.
   *
   */
  void
  Init_String(void)
  {
      /* from here on rb_intern() expands to rb_intern_const() */
  #undef rb_intern
  #define rb_intern(str) rb_intern_const(str)

      /* ---- class String: Comparable, with its own allocator ---- */
      rb_cString  = rb_define_class("String", rb_cObject);
      rb_include_module(rb_cString, rb_mComparable);
      rb_define_alloc_func(rb_cString, empty_str_alloc);
      rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);

      /* construction and copying; initialize_copy shares rb_str_replace with "replace" */
      rb_define_method(rb_cString, "initialize", rb_str_init, -1);
      rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);

      /* comparison and hashing; "==" and "===" are the same function */
      rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
      rb_define_method(rb_cString, "==", rb_str_equal, 1);
      rb_define_method(rb_cString, "===", rb_str_equal, 1);
      rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
      rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
      rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);

      /* operators, element access, size queries ("size" aliases "length") */
      rb_define_method(rb_cString, "+", rb_str_plus, 1);
      rb_define_method(rb_cString, "*", rb_str_times, 1);
      rb_define_method(rb_cString, "%", rb_str_format_m, 1);
      rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
      rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
      rb_define_method(rb_cString, "insert", rb_str_insert, 2);
      rb_define_method(rb_cString, "length", rb_str_length, 0);
      rb_define_method(rb_cString, "size", rb_str_length, 0);
      rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
      rb_define_method(rb_cString, "empty?", rb_str_empty, 0);

      /* matching, successor ("next"/"next!" alias "succ"/"succ!") and searching */
      rb_define_method(rb_cString, "=~", rb_str_match, 1);
      rb_define_method(rb_cString, "match", rb_str_match_m, -1);
      rb_define_method(rb_cString, "succ", rb_str_succ, 0);
      rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
      rb_define_method(rb_cString, "next", rb_str_succ, 0);
      rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
      rb_define_method(rb_cString, "upto", rb_str_upto, -1);
      rb_define_method(rb_cString, "index", rb_str_index_m, -1);
      rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
      rb_define_method(rb_cString, "replace", rb_str_replace, 1);
      rb_define_method(rb_cString, "clear", rb_str_clear, 0);
      rb_define_method(rb_cString, "chr", rb_str_chr, 0);

      /* byte-level access */
      rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
      rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
      rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);

      /* conversions and printable forms ("to_str" aliases "to_s") */
      rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
      rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
      rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
      rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
      rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
      rb_define_method(rb_cString, "dump", rb_str_dump, 0);

      /* case conversion: plain forms first, then the "!" variants */
      rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
      rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
      rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
      rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);

      rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
      rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
      rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
      rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);

      /* numeric parsing, splitting into pieces, reversing, appending */
      rb_define_method(rb_cString, "hex", rb_str_hex, 0);
      rb_define_method(rb_cString, "oct", rb_str_oct, 0);
      rb_define_method(rb_cString, "split", rb_str_split_m, -1);
      rb_define_method(rb_cString, "lines", rb_str_lines, -1);
      rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
      rb_define_method(rb_cString, "chars", rb_str_chars, 0);
      rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
      rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
      rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
      rb_define_method(rb_cString, "concat", rb_str_concat, 1);
      rb_define_method(rb_cString, "<<", rb_str_concat, 1);
      rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
      rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
      /* "to_sym" aliases "intern" */
      rb_define_method(rb_cString, "intern", rb_str_intern, 0);
      rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
      rb_define_method(rb_cString, "ord", rb_str_ord, 0);

      /* containment tests */
      rb_define_method(rb_cString, "include?", rb_str_include, 1);
      rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
      rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

      rb_define_method(rb_cString, "scan", rb_str_scan, 1);

      /* padding */
      rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
      rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
      rb_define_method(rb_cString, "center", rb_str_center, -1);

      /* substitution and trimming: plain forms first, then the "!" variants */
      rb_define_method(rb_cString, "sub", rb_str_sub, -1);
      rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
      rb_define_method(rb_cString, "chop", rb_str_chop, 0);
      rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
      rb_define_method(rb_cString, "strip", rb_str_strip, 0);
      rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
      rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);

      rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
      rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
      rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
      rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
      rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
      rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
      rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);

      /* character-set operations: plain forms first, then the "!" variants */
      rb_define_method(rb_cString, "tr", rb_str_tr, 2);
      rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
      rb_define_method(rb_cString, "delete", rb_str_delete, -1);
      rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
      rb_define_method(rb_cString, "count", rb_str_count, -1);

      rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
      rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
      rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
      rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

      /* iteration */
      rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
      rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
      rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
      rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);

      rb_define_method(rb_cString, "sum", rb_str_sum, -1);

      /* slicing and partitioning; "slice" shares rb_str_aref_m with "[]" */
      rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
      rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

      rb_define_method(rb_cString, "partition", rb_str_partition, 1);
      rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

      /* encoding-related methods */
      rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
      rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
      rb_define_method(rb_cString, "b", rb_str_b, 0);
      rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
      rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

      /* cached ID for "to_s" */
      id_to_s = rb_intern("to_s");

      /* rb_fs starts as nil and is exposed under two global names, $; and $-F */
      rb_fs = Qnil;
      rb_define_variable("$;", &rb_fs);
      rb_define_variable("$-F", &rb_fs);

      /* ---- class Symbol: Comparable, no allocator and no "new" ---- */
      rb_cSymbol = rb_define_class("Symbol", rb_cObject);
      rb_include_module(rb_cSymbol, rb_mComparable);
      rb_undef_alloc_func(rb_cSymbol);
      rb_undef_method(CLASS_OF(rb_cSymbol), "new");
      rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */

      /* "==" and "===" are the same function */
      rb_define_method(rb_cSymbol, "==", sym_equal, 1);
      rb_define_method(rb_cSymbol, "===", sym_equal, 1);
      rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
      /* "id2name" aliases "to_s"; "to_sym" aliases "intern" */
      rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
      rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
      rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
      rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
      rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
      /* "next" aliases "succ" */
      rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
      rb_define_method(rb_cSymbol, "next", sym_succ, 0);

      rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
      rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
      rb_define_method(rb_cSymbol, "=~", sym_match, 1);

      /* "slice" aliases "[]"; "size" aliases "length" */
      rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
      rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
      rb_define_method(rb_cSymbol, "length", sym_length, 0);
      rb_define_method(rb_cSymbol, "size", sym_length, 0);
      rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
      /* Symbol#match is registered to the same function as Symbol#=~ */
      rb_define_method(rb_cSymbol, "match", sym_match, 1);

      rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
      rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
      rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
      rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);

      rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
  }