/string.c
C | 8315 lines | 5858 code | 822 blank | 1635 comment | 1556 complexity | 8b89c4a4b18f27bf63623a27c3a99f6b MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, AGPL-3.0, 0BSD
Large files are truncated, but you can click here to view the full file
- /**********************************************************************
- string.c -
- $Author$
- created at: Mon Aug 9 17:12:58 JST 1993
- Copyright (C) 1993-2007 Yukihiro Matsumoto
- Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
- Copyright (C) 2000 Information-technology Promotion Agency, Japan
- **********************************************************************/
- #include "ruby/ruby.h"
- #include "ruby/re.h"
- #include "ruby/encoding.h"
- #include "vm_core.h"
- #include "internal.h"
- #include "probes.h"
- #include <assert.h>
- #define BEG(no) (regs->beg[(no)])
- #define END(no) (regs->end[(no)])
- #include <math.h>
- #include <ctype.h>
- #ifdef HAVE_UNISTD_H
- #include <unistd.h>
- #endif
- #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
- #undef rb_str_new_cstr
- #undef rb_tainted_str_new_cstr
- #undef rb_usascii_str_new_cstr
- #undef rb_external_str_new_cstr
- #undef rb_locale_str_new_cstr
- #undef rb_str_new2
- #undef rb_str_new3
- #undef rb_str_new4
- #undef rb_str_new5
- #undef rb_tainted_str_new2
- #undef rb_usascii_str_new2
- #undef rb_str_dup_frozen
- #undef rb_str_buf_new_cstr
- #undef rb_str_buf_new2
- #undef rb_str_buf_cat2
- #undef rb_str_cat2
- static VALUE rb_str_clear(VALUE str);
- VALUE rb_cString;
- VALUE rb_cSymbol;
/* NOTE(review): presumably the longest byte length of a single character;
 * it is not referenced in this part of the file -- confirm before relying
 * on it. */
#define RUBY_MAX_CHAR_LEN 16

/* String-specific flag bits kept in RBASIC(str)->flags. */
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
#define STR_ASSOC   FL_USER3

/* Predicates: both require the string to be a heap (non-embedded) string. */
#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
#define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)

/*
 * "No capacity" strings: heap strings whose aux slot is used for a shared
 * string or an associated object rather than for aux.capa.
 */
#define STR_NOCAPA      (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
} while (0)

/* Switch a string to heap representation; the embedded-length bits are reset. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)
/* Switch a string to embedded representation (length bits are not touched). */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
#define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))

/* Store n in the embedded-length bit field of the flags word. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)

/* Set the byte length, wherever the current representation keeps it. */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/* Decrease the byte length by one. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)

/*
 * Make room for `capacity' bytes plus a terminator.
 *  - embedded string: moved to a freshly allocated heap buffer only when
 *    the requested capacity exceeds RSTRING_EMBED_LEN_MAX;
 *  - heap string: the buffer is reallocated, and aux.capa is updated only
 *    when aux is not in use for sharing/association.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)

/* Coderange helpers; both may scan the string to compute the coderange. */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* Encoding object of a string, looked up from its encoding index. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
- static inline int
- single_byte_optimizable(VALUE str)
- {
- rb_encoding *enc;
- /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
- if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
- return 1;
- enc = STR_ENC_GET(str);
- if (rb_enc_mbmaxlen(enc) == 1)
- return 1;
- /* Conservative. Possibly single byte.
- * "\xa1" in Shift_JIS for example. */
- return 0;
- }
- VALUE rb_fs;
/*
 * Returns a pointer to the first non-ASCII byte in [p, e), or NULL when
 * there is none.  When VALUE is 4 or 8 bytes wide and the range is longer
 * than two words, the aligned middle part is tested one word at a time
 * against NONASCII_MASK (the high bit of every byte).
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end;

        /* unaligned head, byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p)) {
                return p;
            }
        }
        /* last word boundary at or before e */
        word_end = (const VALUE*)(~align_mask & (VALUE)e);
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK) {
                break;
            }
        }
        /* resume byte-wise either at the offending word or at the tail */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p)) {
            return p;
        }
    }
    return NULL;
}
- static int
- coderange_scan(const char *p, long len, rb_encoding *enc)
- {
- const char *e = p + len;
- if (rb_enc_to_index(enc) == 0) {
- /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
- p = search_nonascii(p, e);
- return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
- }
- if (rb_enc_asciicompat(enc)) {
- p = search_nonascii(p, e);
- if (!p) {
- return ENC_CODERANGE_7BIT;
- }
- while (p < e) {
- int ret = rb_enc_precise_mbclen(p, e, enc);
- if (!MBCLEN_CHARFOUND_P(ret)) {
- return ENC_CODERANGE_BROKEN;
- }
- p += MBCLEN_CHARFOUND_LEN(ret);
- if (p < e) {
- p = search_nonascii(p, e);
- if (!p) {
- return ENC_CODERANGE_VALID;
- }
- }
- }
- if (e < p) {
- return ENC_CODERANGE_BROKEN;
- }
- return ENC_CODERANGE_VALID;
- }
- while (p < e) {
- int ret = rb_enc_precise_mbclen(p, e, enc);
- if (!MBCLEN_CHARFOUND_P(ret)) {
- return ENC_CODERANGE_BROKEN;
- }
- p += MBCLEN_CHARFOUND_LEN(ret);
- }
- if (e < p) {
- return ENC_CODERANGE_BROKEN;
- }
- return ENC_CODERANGE_VALID;
- }
- long
- rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
- {
- const char *p = s;
- if (*cr == ENC_CODERANGE_BROKEN)
- return e - s;
- if (rb_enc_to_index(enc) == 0) {
- /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
- p = search_nonascii(p, e);
- *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
- return e - s;
- }
- else if (rb_enc_asciicompat(enc)) {
- p = search_nonascii(p, e);
- if (!p) {
- if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
- return e - s;
- }
- while (p < e) {
- int ret = rb_enc_precise_mbclen(p, e, enc);
- if (!MBCLEN_CHARFOUND_P(ret)) {
- *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
- return p - s;
- }
- p += MBCLEN_CHARFOUND_LEN(ret);
- if (p < e) {
- p = search_nonascii(p, e);
- if (!p) {
- *cr = ENC_CODERANGE_VALID;
- return e - s;
- }
- }
- }
- *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
- return p - s;
- }
- else {
- while (p < e) {
- int ret = rb_enc_precise_mbclen(p, e, enc);
- if (!MBCLEN_CHARFOUND_P(ret)) {
- *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
- return p - s;
- }
- p += MBCLEN_CHARFOUND_LEN(ret);
- }
- *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
- return p - s;
- }
- }
- static inline void
- str_enc_copy(VALUE str1, VALUE str2)
- {
- rb_enc_set_index(str1, ENCODING_GET(str2));
- }
- static void
- rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
- {
- /* this function is designed for copying encoding and coderange
- * from src to new string "dest" which is made from the part of src.
- */
- str_enc_copy(dest, src);
- if (RSTRING_LEN(dest) == 0) {
- if (!rb_enc_asciicompat(STR_ENC_GET(src)))
- ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
- else
- ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
- return;
- }
- switch (ENC_CODERANGE(src)) {
- case ENC_CODERANGE_7BIT:
- ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
- break;
- case ENC_CODERANGE_VALID:
- if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
- search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
- ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
- else
- ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
- break;
- default:
- break;
- }
- }
- static void
- rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
- {
- str_enc_copy(dest, src);
- ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
- }
- int
- rb_enc_str_coderange(VALUE str)
- {
- int cr = ENC_CODERANGE(str);
- if (cr == ENC_CODERANGE_UNKNOWN) {
- rb_encoding *enc = STR_ENC_GET(str);
- cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
- ENC_CODERANGE_SET(str, cr);
- }
- return cr;
- }
- int
- rb_enc_str_asciionly_p(VALUE str)
- {
- rb_encoding *enc = STR_ENC_GET(str);
- if (!rb_enc_asciicompat(enc))
- return FALSE;
- else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
- return TRUE;
- return FALSE;
- }
- static inline void
- str_mod_check(VALUE s, const char *p, long len)
- {
- if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
- rb_raise(rb_eRuntimeError, "string modified");
- }
- }
- size_t
- rb_str_capacity(VALUE str)
- {
- if (STR_EMBED_P(str)) {
- return RSTRING_EMBED_LEN_MAX;
- }
- else if (STR_NOCAPA_P(str)) {
- return RSTRING(str)->as.heap.len;
- }
- else {
- return RSTRING(str)->as.heap.aux.capa;
- }
- }
- static inline VALUE
- str_alloc(VALUE klass)
- {
- NEWOBJ_OF(str, struct RString, klass, T_STRING);
- str->as.heap.ptr = 0;
- str->as.heap.len = 0;
- str->as.heap.aux.capa = 0;
- return (VALUE)str;
- }
- static inline VALUE
- empty_str_alloc(VALUE klass)
- {
- if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
- RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
- }
- return str_alloc(klass);
- }
- static VALUE
- str_new(VALUE klass, const char *ptr, long len)
- {
- VALUE str;
- if (len < 0) {
- rb_raise(rb_eArgError, "negative string size (or size too big)");
- }
- if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
- RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
- }
- str = str_alloc(klass);
- if (len > RSTRING_EMBED_LEN_MAX) {
- RSTRING(str)->as.heap.aux.capa = len;
- RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
- STR_SET_NOEMBED(str);
- }
- else if (len == 0) {
- ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
- }
- if (ptr) {
- memcpy(RSTRING_PTR(str), ptr, len);
- }
- STR_SET_LEN(str, len);
- RSTRING_PTR(str)[len] = '\0';
- return str;
- }
- VALUE
- rb_str_new(const char *ptr, long len)
- {
- return str_new(rb_cString, ptr, len);
- }
- VALUE
- rb_usascii_str_new(const char *ptr, long len)
- {
- VALUE str = rb_str_new(ptr, len);
- ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
- return str;
- }
- VALUE
- rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
- {
- VALUE str = rb_str_new(ptr, len);
- rb_enc_associate(str, enc);
- return str;
- }
- VALUE
- rb_str_new_cstr(const char *ptr)
- {
- if (!ptr) {
- rb_raise(rb_eArgError, "NULL pointer given");
- }
- return rb_str_new(ptr, strlen(ptr));
- }
- RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
- #define rb_str_new2 rb_str_new_cstr
- VALUE
- rb_usascii_str_new_cstr(const char *ptr)
- {
- VALUE str = rb_str_new2(ptr);
- ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
- return str;
- }
- RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
- #define rb_usascii_str_new2 rb_usascii_str_new_cstr
- VALUE
- rb_tainted_str_new(const char *ptr, long len)
- {
- VALUE str = rb_str_new(ptr, len);
- OBJ_TAINT(str);
- return str;
- }
- VALUE
- rb_tainted_str_new_cstr(const char *ptr)
- {
- VALUE str = rb_str_new2(ptr);
- OBJ_TAINT(str);
- return str;
- }
- RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
- #define rb_tainted_str_new2 rb_tainted_str_new_cstr
- VALUE
- rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
- {
- rb_econv_t *ec;
- rb_econv_result_t ret;
- long len;
- VALUE newstr;
- const unsigned char *sp;
- unsigned char *dp;
- if (!to) return str;
- if (!from) from = rb_enc_get(str);
- if (from == to) return str;
- if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
- to == rb_ascii8bit_encoding()) {
- if (STR_ENC_GET(str) != to) {
- str = rb_str_dup(str);
- rb_enc_associate(str, to);
- }
- return str;
- }
- len = RSTRING_LEN(str);
- newstr = rb_str_new(0, len);
- retry:
- ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
- if (!ec) return str;
- sp = (unsigned char*)RSTRING_PTR(str);
- dp = (unsigned char*)RSTRING_PTR(newstr);
- ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
- &dp, (unsigned char*)RSTRING_END(newstr), 0);
- rb_econv_close(ec);
- switch (ret) {
- case econv_destination_buffer_full:
- /* destination buffer short */
- len = len < 2 ? 2 : len * 2;
- rb_str_resize(newstr, len);
- goto retry;
- case econv_finished:
- len = dp - (unsigned char*)RSTRING_PTR(newstr);
- rb_str_set_len(newstr, len);
- rb_enc_associate(newstr, to);
- return newstr;
- default:
- /* some error, return original */
- return str;
- }
- }
- VALUE
- rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
- {
- return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
- }
- VALUE
- rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
- {
- VALUE str;
- str = rb_tainted_str_new(ptr, len);
- if (eenc == rb_usascii_encoding() &&
- rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
- rb_enc_associate(str, rb_ascii8bit_encoding());
- return str;
- }
- rb_enc_associate(str, eenc);
- return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
- }
- VALUE
- rb_external_str_new(const char *ptr, long len)
- {
- return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
- }
- VALUE
- rb_external_str_new_cstr(const char *ptr)
- {
- return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
- }
- VALUE
- rb_locale_str_new(const char *ptr, long len)
- {
- return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
- }
- VALUE
- rb_locale_str_new_cstr(const char *ptr)
- {
- return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
- }
- VALUE
- rb_filesystem_str_new(const char *ptr, long len)
- {
- return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
- }
- VALUE
- rb_filesystem_str_new_cstr(const char *ptr)
- {
- return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
- }
- VALUE
- rb_str_export(VALUE str)
- {
- return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
- }
- VALUE
- rb_str_export_locale(VALUE str)
- {
- return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
- }
- VALUE
- rb_str_export_to_enc(VALUE str, rb_encoding *enc)
- {
- return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
- }
- static VALUE
- str_replace_shared_without_enc(VALUE str2, VALUE str)
- {
- if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
- STR_SET_EMBED(str2);
- memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
- STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
- }
- else {
- str = rb_str_new_frozen(str);
- FL_SET(str2, STR_NOEMBED);
- RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
- RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
- RSTRING(str2)->as.heap.aux.shared = str;
- FL_SET(str2, ELTS_SHARED);
- }
- return str2;
- }
- static VALUE
- str_replace_shared(VALUE str2, VALUE str)
- {
- str_replace_shared_without_enc(str2, str);
- rb_enc_cr_str_exact_copy(str2, str);
- return str2;
- }
- static VALUE
- str_new_shared(VALUE klass, VALUE str)
- {
- return str_replace_shared(str_alloc(klass), str);
- }
- static VALUE
- str_new3(VALUE klass, VALUE str)
- {
- return str_new_shared(klass, str);
- }
- VALUE
- rb_str_new_shared(VALUE str)
- {
- VALUE str2 = str_new3(rb_obj_class(str), str);
- OBJ_INFECT(str2, str);
- return str2;
- }
- RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
- #define rb_str_new3 rb_str_new_shared
- static VALUE
- str_new4(VALUE klass, VALUE str)
- {
- VALUE str2;
- str2 = str_alloc(klass);
- STR_SET_NOEMBED(str2);
- RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
- RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
- if (STR_SHARED_P(str)) {
- VALUE shared = RSTRING(str)->as.heap.aux.shared;
- assert(OBJ_FROZEN(shared));
- FL_SET(str2, ELTS_SHARED);
- RSTRING(str2)->as.heap.aux.shared = shared;
- }
- else {
- FL_SET(str, ELTS_SHARED);
- RSTRING(str)->as.heap.aux.shared = str2;
- }
- rb_enc_cr_str_exact_copy(str2, str);
- OBJ_INFECT(str2, str);
- return str2;
- }
- VALUE
- rb_str_new_frozen(VALUE orig)
- {
- VALUE klass, str;
- if (OBJ_FROZEN(orig)) return orig;
- klass = rb_obj_class(orig);
- if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
- long ofs;
- assert(OBJ_FROZEN(str));
- ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
- if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
- ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) ||
- ENCODING_GET(str) != ENCODING_GET(orig)) {
- str = str_new3(klass, str);
- RSTRING(str)->as.heap.ptr += ofs;
- RSTRING(str)->as.heap.len -= ofs;
- rb_enc_cr_str_exact_copy(str, orig);
- OBJ_INFECT(str, orig);
- }
- }
- else if (STR_EMBED_P(orig)) {
- str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
- rb_enc_cr_str_exact_copy(str, orig);
- OBJ_INFECT(str, orig);
- }
- else if (STR_ASSOC_P(orig)) {
- VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
- FL_UNSET(orig, STR_ASSOC);
- str = str_new4(klass, orig);
- FL_SET(str, STR_ASSOC);
- RSTRING(str)->as.heap.aux.shared = assoc;
- }
- else {
- str = str_new4(klass, orig);
- }
- OBJ_FREEZE(str);
- return str;
- }
- RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
- #define rb_str_new4 rb_str_new_frozen
- VALUE
- rb_str_new_with_class(VALUE obj, const char *ptr, long len)
- {
- return str_new(rb_obj_class(obj), ptr, len);
- }
- RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
- rb_str_new_with_class, (obj, ptr, len))
- #define rb_str_new5 rb_str_new_with_class
- static VALUE
- str_new_empty(VALUE str)
- {
- VALUE v = rb_str_new5(str, 0, 0);
- rb_enc_copy(v, str);
- OBJ_INFECT(v, str);
- return v;
- }
- #define STR_BUF_MIN_SIZE 128
- VALUE
- rb_str_buf_new(long capa)
- {
- VALUE str = str_alloc(rb_cString);
- if (capa < STR_BUF_MIN_SIZE) {
- capa = STR_BUF_MIN_SIZE;
- }
- FL_SET(str, STR_NOEMBED);
- RSTRING(str)->as.heap.aux.capa = capa;
- RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
- RSTRING(str)->as.heap.ptr[0] = '\0';
- return str;
- }
- VALUE
- rb_str_buf_new_cstr(const char *ptr)
- {
- VALUE str;
- long len = strlen(ptr);
- str = rb_str_buf_new(len);
- rb_str_buf_cat(str, ptr, len);
- return str;
- }
- RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
- #define rb_str_buf_new2 rb_str_buf_new_cstr
- VALUE
- rb_str_tmp_new(long len)
- {
- return str_new(0, 0, len);
- }
- void *
- rb_alloc_tmp_buffer(volatile VALUE *store, long len)
- {
- VALUE s = rb_str_tmp_new(len);
- *store = s;
- return RSTRING_PTR(s);
- }
- void
- rb_free_tmp_buffer(volatile VALUE *store)
- {
- VALUE s = *store;
- *store = 0;
- if (s) rb_str_clear(s);
- }
- void
- rb_str_free(VALUE str)
- {
- if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
- xfree(RSTRING(str)->as.heap.ptr);
- }
- }
- RUBY_FUNC_EXPORTED size_t
- rb_str_memsize(VALUE str)
- {
- if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
- return RSTRING(str)->as.heap.aux.capa;
- }
- else {
- return 0;
- }
- }
- VALUE
- rb_str_to_str(VALUE str)
- {
- return rb_convert_type(str, T_STRING, "String", "to_str");
- }
- static inline void str_discard(VALUE str);
- void
- rb_str_shared_replace(VALUE str, VALUE str2)
- {
- rb_encoding *enc;
- int cr;
- if (str == str2) return;
- enc = STR_ENC_GET(str2);
- cr = ENC_CODERANGE(str2);
- str_discard(str);
- OBJ_INFECT(str, str2);
- if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
- STR_SET_EMBED(str);
- memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
- STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
- rb_enc_associate(str, enc);
- ENC_CODERANGE_SET(str, cr);
- return;
- }
- STR_SET_NOEMBED(str);
- STR_UNSET_NOCAPA(str);
- RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
- RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
- if (STR_NOCAPA_P(str2)) {
- FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
- RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
- }
- else {
- RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
- }
- STR_SET_EMBED(str2); /* abandon str2 */
- RSTRING_PTR(str2)[0] = 0;
- STR_SET_EMBED_LEN(str2, 0);
- rb_enc_associate(str, enc);
- ENC_CODERANGE_SET(str, cr);
- }
- static ID id_to_s;
- VALUE
- rb_obj_as_string(VALUE obj)
- {
- VALUE str;
- if (RB_TYPE_P(obj, T_STRING)) {
- return obj;
- }
- str = rb_funcall(obj, id_to_s, 0);
- if (!RB_TYPE_P(str, T_STRING))
- return rb_any_to_s(obj);
- if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
- return str;
- }
- static VALUE
- str_replace(VALUE str, VALUE str2)
- {
- long len;
- len = RSTRING_LEN(str2);
- if (STR_ASSOC_P(str2)) {
- str2 = rb_str_new4(str2);
- }
- if (STR_SHARED_P(str2)) {
- VALUE shared = RSTRING(str2)->as.heap.aux.shared;
- assert(OBJ_FROZEN(shared));
- STR_SET_NOEMBED(str);
- RSTRING(str)->as.heap.len = len;
- RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
- FL_SET(str, ELTS_SHARED);
- FL_UNSET(str, STR_ASSOC);
- RSTRING(str)->as.heap.aux.shared = shared;
- }
- else {
- str_replace_shared(str, str2);
- }
- OBJ_INFECT(str, str2);
- rb_enc_cr_str_exact_copy(str, str2);
- return str;
- }
- static VALUE
- str_duplicate(VALUE klass, VALUE str)
- {
- VALUE dup = str_alloc(klass);
- str_replace(dup, str);
- return dup;
- }
- VALUE
- rb_str_dup(VALUE str)
- {
- return str_duplicate(rb_obj_class(str), str);
- }
- VALUE
- rb_str_resurrect(VALUE str)
- {
- if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
- RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
- rb_sourcefile(), rb_sourceline());
- }
- return str_replace(str_alloc(rb_cString), str);
- }
- /*
- * call-seq:
- * String.new(str="") -> new_str
- *
- * Returns a new string object containing a copy of <i>str</i>.
- */
- static VALUE
- rb_str_init(int argc, VALUE *argv, VALUE str)
- {
- VALUE orig;
- if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
- rb_str_replace(str, orig);
- return str;
- }
- static inline long
- enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
- {
- long c;
- const char *q;
- if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
- return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
- }
- else if (rb_enc_asciicompat(enc)) {
- c = 0;
- if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
- while (p < e) {
- if (ISASCII(*p)) {
- q = search_nonascii(p, e);
- if (!q)
- return c + (e - p);
- c += q - p;
- p = q;
- }
- p += rb_enc_fast_mbclen(p, e, enc);
- c++;
- }
- }
- else {
- while (p < e) {
- if (ISASCII(*p)) {
- q = search_nonascii(p, e);
- if (!q)
- return c + (e - p);
- c += q - p;
- p = q;
- }
- p += rb_enc_mbclen(p, e, enc);
- c++;
- }
- }
- return c;
- }
- for (c=0; p<e; c++) {
- p += rb_enc_mbclen(p, e, enc);
- }
- return c;
- }
- long
- rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
- {
- return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
- }
- long
- rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
- {
- long c;
- const char *q;
- int ret;
- *cr = 0;
- if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
- return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
- }
- else if (rb_enc_asciicompat(enc)) {
- c = 0;
- while (p < e) {
- if (ISASCII(*p)) {
- q = search_nonascii(p, e);
- if (!q) {
- if (!*cr) *cr = ENC_CODERANGE_7BIT;
- return c + (e - p);
- }
- c += q - p;
- p = q;
- }
- ret = rb_enc_precise_mbclen(p, e, enc);
- if (MBCLEN_CHARFOUND_P(ret)) {
- *cr |= ENC_CODERANGE_VALID;
- p += MBCLEN_CHARFOUND_LEN(ret);
- }
- else {
- *cr = ENC_CODERANGE_BROKEN;
- p++;
- }
- c++;
- }
- if (!*cr) *cr = ENC_CODERANGE_7BIT;
- return c;
- }
- for (c=0; p<e; c++) {
- ret = rb_enc_precise_mbclen(p, e, enc);
- if (MBCLEN_CHARFOUND_P(ret)) {
- *cr |= ENC_CODERANGE_VALID;
- p += MBCLEN_CHARFOUND_LEN(ret);
- }
- else {
- *cr = ENC_CODERANGE_BROKEN;
- if (p + rb_enc_mbminlen(enc) <= e)
- p += rb_enc_mbminlen(enc);
- else
- p = e;
- }
- }
- if (!*cr) *cr = ENC_CODERANGE_7BIT;
- return c;
- }
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either the 0xxxxxxx or the 11xxxxxx
 * bit representation (see http://en.wikipedia.org/wiki/UTF-8).
 * Therefore the following pseudo code detects a UTF-8 leading byte:
 *
 *   if (!(byte & 0x80))
 *     byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);  // bit6 tells whether it is a leading byte
 *
 * This function applies that logic to every byte of the word `*s' at the
 * same time and then sums up the per-byte results.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE bits = *s;

    /* Transform so that bit0 of each byte tells "leading byte or not". */
    bits |= ~(bits>>1);
    bits >>= 6;
    bits &= NONASCII_MASK >> 7;

    /* Add the per-byte flags together. */
    bits += (bits>>8);
    bits += (bits>>16);
#if SIZEOF_VALUE == 8
    bits += (bits>>32);
#endif
    return (bits&0xF);
}
#endif
- static long
- str_strlen(VALUE str, rb_encoding *enc)
- {
- const char *p, *e;
- long n;
- int cr;
- if (single_byte_optimizable(str)) return RSTRING_LEN(str);
- if (!enc) enc = STR_ENC_GET(str);
- p = RSTRING_PTR(str);
- e = RSTRING_END(str);
- cr = ENC_CODERANGE(str);
- #ifdef NONASCII_MASK
- if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
- enc == rb_utf8_encoding()) {
- VALUE len = 0;
- if ((int)sizeof(VALUE) * 2 < e - p) {
- const VALUE *s, *t;
- const VALUE lowbits = sizeof(VALUE) - 1;
- s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
- t = (const VALUE*)(~lowbits & (VALUE)e);
- while (p < (const char *)s) {
- if (is_utf8_lead_byte(*p)) len++;
- p++;
- }
- while (s < t) {
- len += count_utf8_lead_bytes_with_word(s);
- s++;
- }
- p = (const char *)s;
- }
- while (p < e) {
- if (is_utf8_lead_byte(*p)) len++;
- p++;
- }
- return (long)len;
- }
- #endif
- n = rb_enc_strlen_cr(p, e, enc, &cr);
- if (cr) {
- ENC_CODERANGE_SET(str, cr);
- }
- return n;
- }
- long
- rb_str_strlen(VALUE str)
- {
- return str_strlen(str, STR_ENC_GET(str));
- }
- /*
- * call-seq:
- * str.length -> integer
- * str.size -> integer
- *
- * Returns the character length of <i>str</i>.
- */
- VALUE
- rb_str_length(VALUE str)
- {
- long len;
- len = str_strlen(str, STR_ENC_GET(str));
- return LONG2NUM(len);
- }
- /*
- * call-seq:
- * str.bytesize -> integer
- *
- * Returns the length of +str+ in bytes.
- *
- * "\x80\u3042".bytesize #=> 4
- * "hello".bytesize #=> 5
- */
- static VALUE
- rb_str_bytesize(VALUE str)
- {
- return LONG2NUM(RSTRING_LEN(str));
- }
- /*
- * call-seq:
- * str.empty? -> true or false
- *
- * Returns <code>true</code> if <i>str</i> has a length of zero.
- *
- * "hello".empty? #=> false
- * " ".empty? #=> false
- * "".empty? #=> true
- */
- static VALUE
- rb_str_empty(VALUE str)
- {
- if (RSTRING_LEN(str) == 0)
- return Qtrue;
- return Qfalse;
- }
- /*
- * call-seq:
- * str + other_str -> new_str
- *
- * Concatenation---Returns a new <code>String</code> containing
- * <i>other_str</i> concatenated to <i>str</i>.
- *
- * "Hello from " + self.to_s #=> "Hello from main"
- */
- VALUE
- rb_str_plus(VALUE str1, VALUE str2)
- {
- VALUE str3;
- rb_encoding *enc;
- StringValue(str2);
- enc = rb_enc_check(str1, str2);
- str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
- memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
- memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
- RSTRING_PTR(str2), RSTRING_LEN(str2));
- RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
- if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
- OBJ_TAINT(str3);
- ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
- ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
- return str3;
- }
- /*
- * call-seq:
- * str * integer -> new_str
- *
- * Copy --- Returns a new String containing +integer+ copies of the receiver.
- * +integer+ must be greater than or equal to 0.
- *
- * "Ho! " * 3 #=> "Ho! Ho! Ho! "
- * "Ho! " * 0 #=> ""
- */
- VALUE
- rb_str_times(VALUE str, VALUE times)
- {
- VALUE str2;
- long n, len;
- char *ptr2;
- len = NUM2LONG(times);
- if (len < 0) {
- rb_raise(rb_eArgError, "negative argument");
- }
- if (len && LONG_MAX/len < RSTRING_LEN(str)) {
- rb_raise(rb_eArgError, "argument too big");
- }
- str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
- ptr2 = RSTRING_PTR(str2);
- if (len) {
- n = RSTRING_LEN(str);
- memcpy(ptr2, RSTRING_PTR(str), n);
- while (n <= len/2) {
- memcpy(ptr2 + n, ptr2, n);
- n *= 2;
- }
- memcpy(ptr2 + n, ptr2, len-n);
- }
- ptr2[RSTRING_LEN(str2)] = '\0';
- OBJ_INFECT(str2, str);
- rb_enc_cr_str_copy_for_substr(str2, str);
- return str2;
- }
- /*
- * call-seq:
- * str % arg -> new_str
- *
- * Format---Uses <i>str</i> as a format specification, and returns the result
- * of applying it to <i>arg</i>. If the format specification contains more than
- * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
- * containing the values to be substituted. See <code>Kernel::sprintf</code> for
- * details of the format string.
- *
- * "%05d" % 123 #=> "00123"
- * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6"
- * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
- */
- static VALUE
- rb_str_format_m(VALUE str, VALUE arg)
- {
- volatile VALUE tmp = rb_check_array_type(arg);
- if (!NIL_P(tmp)) {
- return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
- }
- return rb_str_format(1, &arg, str);
- }
- static inline void
- str_modifiable(VALUE str)
- {
- if (FL_TEST(str, STR_TMPLOCK)) {
- rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
- }
- rb_check_frozen(str);
- if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
- rb_raise(rb_eSecurityError, "Insecure: can't modify string");
- }
- static inline int
- str_independent(VALUE str)
- {
- str_modifiable(str);
- if (!STR_SHARED_P(str)) return 1;
- if (STR_EMBED_P(str)) return 1;
- return 0;
- }
- static void
- str_make_independent_expand(VALUE str, long expand)
- {
- char *ptr;
- long len = RSTRING_LEN(str);
- long capa = len + expand;
- if (len > capa) len = capa;
- ptr = ALLOC_N(char, capa + 1);
- if (RSTRING_PTR(str)) {
- memcpy(ptr, RSTRING_PTR(str), len);
- }
- STR_SET_NOEMBED(str);
- STR_UNSET_NOCAPA(str);
- ptr[len] = 0;
- RSTRING(str)->as.heap.ptr = ptr;
- RSTRING(str)->as.heap.len = len;
- RSTRING(str)->as.heap.aux.capa = capa;
- }
- #define str_make_independent(str) str_make_independent_expand((str), 0L)
- void
- rb_str_modify(VALUE str)
- {
- if (!str_independent(str))
- str_make_independent(str);
- ENC_CODERANGE_CLEAR(str);
- }
- void
- rb_str_modify_expand(VALUE str, long expand)
- {
- if (expand < 0) {
- rb_raise(rb_eArgError, "negative expanding string size");
- }
- if (!str_independent(str)) {
- str_make_independent_expand(str, expand);
- }
- else if (expand > 0) {
- long len = RSTRING_LEN(str);
- long capa = len + expand;
- if (!STR_EMBED_P(str)) {
- REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
- RSTRING(str)->as.heap.aux.capa = capa;
- }
- else if (capa > RSTRING_EMBED_LEN_MAX) {
- str_make_independent_expand(str, expand);
- }
- }
- ENC_CODERANGE_CLEAR(str);
- }
- /* As rb_str_modify(), but don't clear coderange */
- static void
- str_modify_keep_cr(VALUE str)
- {
- if (!str_independent(str))
- str_make_independent(str);
- if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
- /* Force re-scan later */
- ENC_CODERANGE_CLEAR(str);
- }
- static inline void
- str_discard(VALUE str)
- {
- str_modifiable(str);
- if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
- xfree(RSTRING_PTR(str));
- RSTRING(str)->as.heap.ptr = 0;
- RSTRING(str)->as.heap.len = 0;
- }
- }
- void
- rb_str_associate(VALUE str, VALUE add)
- {
- /* sanity check */
- rb_check_frozen(str);
- if (STR_ASSOC_P(str)) {
- /* already associated */
- rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
- }
- else {
- if (STR_SHARED_P(str)) {
- VALUE assoc = RSTRING(str)->as.heap.aux.shared;
- str_make_independent(str);
- if (STR_ASSOC_P(assoc)) {
- assoc = RSTRING(assoc)->as.heap.aux.shared;
- rb_ary_concat(assoc, add);
- add = assoc;
- }
- }
- else if (STR_EMBED_P(str)) {
- str_make_independent(str);
- }
- else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
- RESIZE_CAPA(str, RSTRING_LEN(str));
- }
- FL_SET(str, STR_ASSOC);
- RBASIC(add)->klass = 0;
- RSTRING(str)->as.heap.aux.shared = add;
- }
- }
- VALUE
- rb_str_associated(VALUE str)
- {
- if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
- if (STR_ASSOC_P(str)) {
- return RSTRING(str)->as.heap.aux.shared;
- }
- return Qfalse;
- }
- void
- rb_must_asciicompat(VALUE str)
- {
- rb_encoding *enc = rb_enc_get(str);
- if (!rb_enc_asciicompat(enc)) {
- rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
- }
- }
- VALUE
- rb_string_value(volatile VALUE *ptr)
- {
- VALUE s = *ptr;
- if (!RB_TYPE_P(s, T_STRING)) {
- s = rb_str_to_str(s);
- *ptr = s;
- }
- return s;
- }
- char *
- rb_string_value_ptr(volatile VALUE *ptr)
- {
- VALUE str = rb_string_value(ptr);
- return RSTRING_PTR(str);
- }
- char *
- rb_string_value_cstr(volatile VALUE *ptr)
- {
- VALUE str = rb_string_value(ptr);
- char *s = RSTRING_PTR(str);
- long len = RSTRING_LEN(str);
- if (!s || memchr(s, 0, len)) {
- rb_raise(rb_eArgError, "string contains null byte");
- }
- if (s[len]) {
- rb_str_modify(str);
- s = RSTRING_PTR(str);
- s[RSTRING_LEN(str)] = 0;
- }
- return s;
- }
- VALUE
- rb_check_string_type(VALUE str)
- {
- str = rb_check_convert_type(str, T_STRING, "String", "to_str");
- return str;
- }
- /*
- * call-seq:
- * String.try_convert(obj) -> string or nil
- *
- * Try to convert <i>obj</i> into a String, using to_str method.
- * Returns converted string or nil if <i>obj</i> cannot be converted
- * for any reason.
- *
- * String.try_convert("str") #=> "str"
- * String.try_convert(/re/) #=> nil
- */
- static VALUE
- rb_str_s_try_convert(VALUE dummy, VALUE str)
- {
- return rb_check_string_type(str);
- }
- static char*
- str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
- {
- long nth = *nthp;
- if (rb_enc_mbmaxlen(enc) == 1) {
- p += nth;
- }
- else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
- p += nth * rb_enc_mbmaxlen(enc);
- }
- else if (rb_enc_asciicompat(enc)) {
- const char *p2, *e2;
- int n;
- while (p < e && 0 < nth) {
- e2 = p + nth;
- if (e < e2) {
- *nthp = nth;
- return (char *)e;
- }
- if (ISASCII(*p)) {
- p2 = search_nonascii(p, e2);
- if (!p2) {
- nth -= e2 - p;
- *nthp = nth;
- return (char *)e2;
- }
- nth -= p2 - p;
- p = p2;
- }
- n = rb_enc_mbclen(p, e, enc);
- p += n;
- nth--;
- }
- *nthp = nth;
- if (nth != 0) {
- return (char *)e;
- }
- return (char *)p;
- }
- else {
- while (p < e && nth--) {
- p += rb_enc_mbclen(p, e, enc);
- }
- }
- if (p > e) p = e;
- *nthp = nth;
- return (char*)p;
- }
- char*
- rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
- {
- return str_nth_len(p, e, &nth, enc);
- }
- static char*
- str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
- {
- if (singlebyte)
- p += nth;
- else {
- p = str_nth_len(p, e, &nth, enc);
- }
- if (!p) return 0;
- if (p > e) p = e;
- return (char *)p;
- }
- /* char offset to byte offset */
- static long
- str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
- {
- const char *pp = str_nth(p, e, nth, enc, singlebyte);
- if (!pp) return e - p;
- return pp - p;
- }
- long
- rb_str_offset(VALUE str, long pos)
- {
- return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
- STR_ENC_GET(str), single_byte_optimizable(str));
- }
- #ifdef NONASCII_MASK
- static char *
- str_utf8_nth(const char *p, const char *e, long *nthp)
- {
- long nth = *nthp;
- if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
- const VALUE *s, *t;
- const VALUE lowbits = sizeof(VALUE) - 1;
- s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
- t = (const VALUE*)(~lowbits & (VALUE)e);
- while (p < (const char *)s) {
- if (is_utf8_lead_byte(*p)) nth--;
- p++;
- }
- do {
- nth -= count_utf8_lead_bytes_with_word(s);
- s++;
- } while (s < t && (int)sizeof(VALUE) <= nth);
- p = (char *)s;
- }
- while (p < e) {
- if (is_utf8_lead_byte(*p)) {
- if (nth == 0) break;
- nth--;
- }
- p++;
- }
- *nthp = nth;
- return (char *)p;
- }
- static long
- str_utf8_offset(const char *p, const char *e, long nth)
- {
- const char *pp = str_utf8_nth(p, e, &nth);
- return pp - p;
- }
- #endif
- /* byte offset to char offset */
- long
- rb_str_sublen(VALUE str, long pos)
- {
- if (single_byte_optimizable(str) || pos < 0)
- return pos;
- else {
- char *p = RSTRING_PTR(str);
- return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
- }
- }
- VALUE
- rb_str_subseq(VALUE str, long beg, long len)
- {
- VALUE str2;
- if (RSTRING_LEN(str) == beg + len &&
- RSTRING_EMBED_LEN_MAX < len) {
- str2 = rb_str_new_shared(rb_str_new_frozen(str));
- rb_str_drop_bytes(str2, beg);
- }
- else {
- str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
- RB_GC_GUARD(str);
- }
- rb_enc_cr_str_copy_for_substr(str2, str);
- OBJ_INFECT(str2, str);
- return str2;
- }
- static char *
- rb_str_subpos(VALUE str, long beg, long *lenp)
- {
- long len = *lenp;
- long slen = -1L;
- long blen = RSTRING_LEN(str);
- rb_encoding *enc = STR_ENC_GET(str);
- char *p, *s = RSTRING_PTR(str), *e = s + blen;
- if (len < 0) return 0;
- if (!blen) {
- len = 0;
- }
- if (single_byte_optimizable(str)) {
- if (beg > blen) return 0;
- if (beg < 0) {
- beg += blen;
- if (beg < 0) return 0;
- }
- if (beg + len > blen)
- len = blen - beg;
- if (len < 0) return 0;
- p = s + beg;
- goto end;
- }
- if (beg < 0) {
- if (len > -beg) len = -beg;
- if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
- beg = -beg;
- while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
- p = e;
- if (!p) return 0;
- while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
- if (!p) return 0;
- len = e - p;
- goto end;
- }
- else {
- slen = str_strlen(str, enc);
- beg += slen;
- if (beg < 0) return 0;
- p = s + beg;
- if (len == 0) goto end;
- }
- }
- else if (beg > 0 && beg > RSTRING_LEN(str)) {
- return 0;
- }
- if (len == 0) {
- if (beg > str_strlen(str, enc)) return 0;
- p = s + beg;
- }
- #ifdef NONASCII_MASK
- else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
- enc == rb_utf8_encoding()) {
- p = str_utf8_nth(s, e, &beg);
- if (beg > 0) return 0;
- len = str_utf8_offset(p, e, len);
- }
- #endif
- else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
- int char_sz = rb_enc_mbmaxlen(enc);
- p = s + beg * char_sz;
- if (p > e) {
- return 0;
- }
- else if (len * char_sz > e - p)
- len = e - p;
- else
- len *= char_sz;
- }
- else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
- if (beg > 0) return 0;
- len = 0;
- }
- else {
- len = str_offset(p, e, len, enc, 0);
- }
- end:
- *lenp = len;
- RB_GC_GUARD(str);
- return p;
- }
- VALUE
- rb_str_substr(VALUE str, long beg, long len)
- {
- VALUE str2;
- char *p = rb_str_subpos(str, beg, &len);
- if (!p) return Qnil;
- if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
- str2 = rb_str_new4(str);
- str2 = str_new3(rb_obj_class(str2), str2);
- RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
- RSTRING(str2)->as.heap.len = len;
- }
- else {
- str2 = rb_str_new5(str, p, len);
- rb_enc_cr_str_copy_for_substr(str2, str);
- OBJ_INFECT(str2, str);
- RB_GC_GUARD(str);
- }
- return str2;
- }
- VALUE
- rb_str_freeze(VALUE str)
- {
- if (STR_ASSOC_P(str)) {
- VALUE ary = RSTRING(str)->as.heap.aux.shared;
- OBJ_FREEZE(ary);
- }
- return rb_obj_freeze(str);
- }
- RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
- #define rb_str_dup_frozen rb_str_new_frozen
- VALUE
- rb_str_locktmp(VALUE str)
- {
- if (FL_TEST(str, STR_TMPLOCK)) {
- rb_raise(rb_eRuntimeError, "temporal locking already locked string");
- }
- FL_SET(str, STR_TMPLOCK);
- return str;
- }
- VALUE
- rb_str_unlocktmp(VALUE str)
- {
- if (!FL_TEST(str, STR_TMPLOCK)) {
- rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
- }
- FL_UNSET(str, STR_TMPLOCK);
- return str;
- }
- void
- rb_str_set_len(VALUE str, long len)
- {
- long capa;
- str_modifiable(str);
- if (STR_SHARED_P(str)) {
- rb_raise(rb_eRuntimeError, "can't set length of shared string");
- }
- if (len > (capa = (long)rb_str_capacity(str))) {
- rb_bug("probable buffer overflow: %ld for %ld", len, capa);
- }
- STR_SET_LEN(str, len);
- RSTRING_PTR(str)[len] = '\0';
- }
- VALUE
- rb_str_resize(VALUE str, long len)
- {
- long slen;
- int independent;
- if (len < 0) {
- rb_raise(rb_eArgError, "negative string size (or size too big)");
- }
- independent = str_independent(str);
- ENC_CODERANGE_CLEAR(str);
- slen = RSTRING_LEN(str);
- if (len != slen) {
- if (STR_EMBED_P(str)) {
- if (len <= RSTRING_EMBED_LEN_MAX) {
- STR_SET_EMBED_LEN(str, len);
- RSTRING(str)->as.ary[len] = '\0';
- return str;
- }
- str_make_independent_expand(str, len - slen);
- STR_SET_NOEMBED(str);
- }
- else if (len <= RSTRING_EMBED_LEN_MAX) {
- char *ptr = RSTRING(str)->as.heap.ptr;
- STR_SET_EMBED(str);
- if (slen > len) slen = len;
- if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
- RSTRING(str)->as.ary[len] = '\0';
- STR_SET_EMBED_LEN(str, len);
- if (independent) xfree(ptr);
- return str;
- }
- else if (!independent) {
- str_make_independent_expand(str, len - slen);
- }
- else if (slen < len || slen - len > 1024) {
- REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
- }
- if (!STR_NOCAPA_P(str)) {
- RSTRING(str)->as.heap.aux.capa = len;
- }
- RSTRING(str)->as.heap.len = len;
- RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */
- }
- return str;
- }
- static VALUE
- str_buf_cat(VALUE str, const char *ptr, long len)
- {
- long capa, total, off = -1;
- if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
- off = ptr - RSTRING_PTR(str);
- }
- rb_str_modify(str);
- if (len == 0) return 0;
- if (STR_ASSOC_P(str)) {
- FL_UNSET(str, STR_ASSOC);
- capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
- }
- else if (STR_EMBED_P(str)) {
- capa = RSTRING_EMBED_LEN_MAX;
- }
- else {
- capa = RSTRING(str)->as.heap.aux.capa;
- }
- if (RSTRING_LEN(str) >= LONG_MAX - len) {
- rb_raise(rb_eArgError, "string sizes too big");
- }
- total = RSTRING_LEN(str)+len;
- if (capa <= total) {
- while (total > capa) {
- if (capa + 1 >= LONG_MAX / 2) {
- capa = (total + 4095) / 4096;
- break;
- }
- capa = (capa + 1) * 2;
- }
- RESIZE_CAPA(str, capa);
- }
- if (off != -1) {
- ptr = RSTRING_PTR(str) + off;
- }
- memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
- STR_SET_LEN(str, total);
- RSTRING_PTR(str)[total] = '\0'; /* sentinel */
- return str;
- }
- #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
- VALUE
- rb_str_buf_cat(VALUE str, const char *ptr, long len)
- {
- if (len == 0) return str;
- if (len < 0) {
- rb_raise(rb_eArgError, "negative string size (or size too big)");
- }
- return str_buf_cat(str, ptr, len);
- }
- VALUE
- rb_str_buf_cat2(VALUE str, const char *ptr)
- {
- return rb_str_buf_cat(str, ptr, strlen(ptr));
- }
- VALUE
- rb_str_cat(VALUE str, const char *ptr, long len)
- {
- if (len < 0) {
- rb_raise(rb_eArgError, "negative string size (or size too big)");
- }
- if (STR_ASSOC_P(str)) {
- char *p;
- rb_str_modify_expand(str, len);
- p = RSTRING(str)->as.heap.ptr;
- memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
- len = RSTRING(str)->as.heap.len += len;
- p[len] = '\0'; /* sentinel */
- return str;
- }
- return rb_str_buf_cat(str, ptr, len);
- }
- VALUE
- rb_str_cat2(VALUE str, const char *ptr)
- {
- return rb_str_cat(str, ptr, strlen(ptr));
- }
- static VALUE
- rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
- int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
- {
- int str_encindex = ENCODING_GET(str);
- int res_encindex;
- int str_cr, res_cr;
- str_cr = ENC_CODERANGE(str);
- if (str_encindex == ptr_encindex) {
- if (str_cr == ENC_CODERANGE_UNKNOWN)
- ptr_cr = ENC_CODERANGE_UNKNOWN;
- else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
- ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
- }
- }
- else {
- rb_encoding *str_enc = rb_enc_from_index(str_encindex);
- rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
- if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
- if (len == 0)
- return str;
- if (RSTRING_LEN(str) == 0) {
- rb_str_buf_cat(str, ptr, len);
- ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
- return str;
- }
- goto incompatible;
- }
- if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
- ptr_cr = coderange_scan(ptr, len, ptr_enc);
- }
- if (str_cr == ENC_CODERANGE_UNKNOWN) {
- if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
- str_cr = rb_enc_str_coderange(str);
- }
- }
- }
- if (ptr_cr_ret)
- *ptr_cr_ret = ptr_cr;
- if (str_encindex != ptr_encindex &&
- str_cr != ENC_CODERANGE_7BIT &&
- ptr_cr != ENC_CODERANGE_7BIT) {
- incompatible:
- rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
- rb_enc_name(rb_enc_from_index(str_encindex)),
- rb_enc_name(rb_enc_from_index(ptr_encindex)));
- }
- if (str_cr == ENC_CODERANGE_UNKNOWN) {
- res_encindex = str_encindex;
- res_cr = ENC_CODERANGE_UNKNOWN;
- }
- else if (str_cr == ENC_CODERANGE_7BIT) {
- if (ptr_cr == ENC_CODERANGE_7BIT) {
- res_encindex = str_encindex;
- res_cr = ENC_CODERANGE_7BIT;
- }
- else {
- res_encindex = ptr_encindex;
- res_cr = ptr_cr;
- }
- }
- else if (str_cr == ENC_CODERANGE_VALID) {
- res_encindex = str_encindex;
- if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
- res_cr = str_cr;
- else
- res_cr = ptr_cr;
- }
- else { /* str_cr == ENC_CODERANGE_BROKEN */
- res_encindex = str_encindex;
- res_cr = str_cr;
- if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
- }
- if (len < 0) {
- rb_raise(rb_eArgError, "negative string size (or size too big)");
- }
- str_buf_cat(str, ptr, len);
- ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
- return str;
- }
- VALUE
- rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
- {
- return rb_enc_cr_str_buf_cat(str, ptr, len,
- rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
- }
- VALUE
- rb_str_buf_cat_ascii(VALUE str, const char *ptr)
- {
- /* ptr must reference NUL terminated ASCII string. */
- int encindex = ENCODING_GET(str);
- rb_encoding *enc = rb_enc_from_index(encindex);
- if (rb_enc_asciicompat(enc)) {
- return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
- encindex, ENC_CODERANGE_7BIT, 0);
- }
- else {
- char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
- while (*ptr) {
- unsigned int c = (unsigned char)*ptr;
- int len = rb_enc_codelen(c, enc);
- rb_enc_mbcput(c, buf, enc);
- rb_enc_cr_str_buf_cat(str, buf, len,
- encindex, ENC_CODERANGE_VALID, 0);
- ptr++;
- }
- return str;
- }
- }
- VALUE
- rb_str_buf_append(VALUE str, VALUE str2)
- {
- int str2_cr;
- str2_cr = ENC_CODERANGE(str2);
- rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
- ENCODING_GET(str2), str2_cr, &str2_cr);
- OBJ_INFECT(str, str2);
- ENC_CODERANGE_SET(str2, str2_cr);
- return str;
- }
- VALUE
- rb_str_append(VALUE str, VALUE str2)
- {
- rb_encoding *enc;
- int cr, cr2;
- long len2;
- StringValue(str2);
- if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
- long len = RSTRING_LEN(str) + len2;
- enc = rb_enc_check(str, str2);
- cr = ENC_CODERANGE(str);
- if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
- rb_str_modify_expand(str, len2);
- memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
- RSTRING_PTR(str2), len2+1);
- RSTRING(str)->as.heap.len = len;
- rb_enc_associate(str, enc);
- ENC_CODERANGE_SET(str, cr);
- OBJ_INFECT(str, str2);
- return str;
- }
- return rb_str_buf_append(str, str2);
- }
- /*
- * call-seq:
- * str << integer -> str
- * str.concat(integer) -> str
- * str << obj -> str
- * str.concat(obj) -> str
- *
- * Append---Concatenates the given object to <i>str</i>. If the object is a
- * <code>Integer</code>, it is considered as a codepoint,…
Large files files are truncated, but you can click here to view the full file