/Objects/stringobject.c
http://unladen-swallow.googlecode.com/ · C · 5242 lines · 4329 code · 538 blank · 375 comment · 1221 complexity · a1c18aba068e8f25c3e3da8e22507f9f MD5 · raw file
Large files are truncated click here to view the full file
- /* String (str/bytes) object implementation */
- #define PY_SSIZE_T_CLEAN
- #include "Python.h"
- #include <ctype.h>
- #ifdef COUNT_ALLOCS
- int null_strings, one_strings;
- #endif
- static PyStringObject *characters[UCHAR_MAX + 1];
- static PyStringObject *nullstring;
- /* This dictionary holds all interned strings. Note that references to
- strings in this dictionary are *not* counted in the string's ob_refcnt.
- When the interned string reaches a refcnt of 0 the string deallocation
- function will delete the reference from this dictionary.
- Another way to look at this is to say that the actual reference
- count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
- */
- static PyObject *interned;
- /*
- For both PyString_FromString() and PyString_FromStringAndSize(), the
- parameter `size' denotes number of characters to allocate, not counting any
- null terminating character.
- For PyString_FromString(), the parameter `str' points to a null-terminated
- string containing exactly `size' bytes.
- For PyString_FromStringAndSize(), the parameter `str' is
- either NULL or else points to a string containing at least `size' bytes.
- For PyString_FromStringAndSize(), the string in the `str' parameter does
- not have to be null-terminated. (Therefore it is safe to construct a
- substring by calling `PyString_FromStringAndSize(origstring, substrlen)'.)
- If `str' is NULL then PyString_FromStringAndSize() will allocate `size+1'
- bytes (setting the last byte to the null terminating character) and you can
- fill in the data yourself. If `str' is non-NULL then the resulting
- PyString object must be treated as immutable and you must not fill in nor
- alter the data yourself, since the strings may be shared.
- The PyObject member `op->ob_size', which denotes the number of "extra
- items" in a variable-size object, will contain the number of bytes
- allocated for string data, not counting the null terminating character. It
- is therefore equal to the `size' parameter (for
- PyString_FromStringAndSize()) or the length of the string in the `str'
- parameter (for PyString_FromString()).
- */
- PyObject *
- PyString_FromStringAndSize(const char *str, Py_ssize_t size)
- {
- register PyStringObject *op;
- if (size < 0) {
- PyErr_SetString(PyExc_SystemError,
- "Negative size passed to PyString_FromStringAndSize");
- return NULL;
- }
- if (size == 0 && (op = nullstring) != NULL) {
- #ifdef COUNT_ALLOCS
- null_strings++;
- #endif
- Py_INCREF(op);
- return (PyObject *)op;
- }
- if (size == 1 && str != NULL &&
- (op = characters[*str & UCHAR_MAX]) != NULL)
- {
- #ifdef COUNT_ALLOCS
- one_strings++;
- #endif
- Py_INCREF(op);
- return (PyObject *)op;
- }
- if (size > PY_SSIZE_T_MAX - sizeof(PyStringObject)) {
- PyErr_SetString(PyExc_OverflowError, "string is too large");
- return NULL;
- }
- /* Inline PyObject_NewVar */
- op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
- if (op == NULL)
- return PyErr_NoMemory();
- PyObject_INIT_VAR(op, &PyString_Type, size);
- op->ob_shash = -1;
- op->ob_sstate = SSTATE_NOT_INTERNED;
- if (str != NULL)
- Py_MEMCPY(op->ob_sval, str, size);
- op->ob_sval[size] = '\0';
- /* share short strings */
- if (size == 0) {
- PyObject *t = (PyObject *)op;
- PyString_InternInPlace(&t);
- op = (PyStringObject *)t;
- nullstring = op;
- Py_INCREF(op);
- } else if (size == 1 && str != NULL) {
- PyObject *t = (PyObject *)op;
- PyString_InternInPlace(&t);
- op = (PyStringObject *)t;
- characters[*str & UCHAR_MAX] = op;
- Py_INCREF(op);
- }
- return (PyObject *) op;
- }
- PyObject *
- PyString_FromString(const char *str)
- {
- register size_t size;
- register PyStringObject *op;
- assert(str != NULL);
- size = strlen(str);
- if (size > PY_SSIZE_T_MAX - sizeof(PyStringObject)) {
- PyErr_SetString(PyExc_OverflowError,
- "string is too long for a Python string");
- return NULL;
- }
- if (size == 0 && (op = nullstring) != NULL) {
- #ifdef COUNT_ALLOCS
- null_strings++;
- #endif
- Py_INCREF(op);
- return (PyObject *)op;
- }
- if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
- #ifdef COUNT_ALLOCS
- one_strings++;
- #endif
- Py_INCREF(op);
- return (PyObject *)op;
- }
- /* Inline PyObject_NewVar */
- op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
- if (op == NULL)
- return PyErr_NoMemory();
- PyObject_INIT_VAR(op, &PyString_Type, size);
- op->ob_shash = -1;
- op->ob_sstate = SSTATE_NOT_INTERNED;
- Py_MEMCPY(op->ob_sval, str, size+1);
- /* share short strings */
- if (size == 0) {
- PyObject *t = (PyObject *)op;
- PyString_InternInPlace(&t);
- op = (PyStringObject *)t;
- nullstring = op;
- Py_INCREF(op);
- } else if (size == 1) {
- PyObject *t = (PyObject *)op;
- PyString_InternInPlace(&t);
- op = (PyStringObject *)t;
- characters[*str & UCHAR_MAX] = op;
- Py_INCREF(op);
- }
- return (PyObject *) op;
- }
- PyObject *
- PyString_FromFormatV(const char *format, va_list vargs)
- {
- va_list count;
- Py_ssize_t n = 0;
- const char* f;
- char *s;
- PyObject* string;
- #ifdef VA_LIST_IS_ARRAY
- Py_MEMCPY(count, vargs, sizeof(va_list));
- #else
- #ifdef __va_copy
- __va_copy(count, vargs);
- #else
- count = vargs;
- #endif
- #endif
- /* step 1: figure out how large a buffer we need */
- for (f = format; *f; f++) {
- if (*f == '%') {
- const char* p = f;
- while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
- ;
- /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
- * they don't affect the amount of space we reserve.
- */
- if ((*f == 'l' || *f == 'z') &&
- (f[1] == 'd' || f[1] == 'u'))
- ++f;
- switch (*f) {
- case 'c':
- (void)va_arg(count, int);
- /* fall through... */
- case '%':
- n++;
- break;
- case 'd': case 'u': case 'i': case 'x':
- (void) va_arg(count, int);
- /* 20 bytes is enough to hold a 64-bit
- integer. Decimal takes the most space.
- This isn't enough for octal. */
- n += 20;
- break;
- case 's':
- s = va_arg(count, char*);
- n += strlen(s);
- break;
- case 'p':
- (void) va_arg(count, int);
- /* maximum 64-bit pointer representation:
- * 0xffffffffffffffff
- * so 19 characters is enough.
- * XXX I count 18 -- what's the extra for?
- */
- n += 19;
- break;
- default:
- /* if we stumble upon an unknown
- formatting code, copy the rest of
- the format string to the output
- string. (we cannot just skip the
- code, since there's no way to know
- what's in the argument list) */
- n += strlen(p);
- goto expand;
- }
- } else
- n++;
- }
- expand:
- /* step 2: fill the buffer */
- /* Since we've analyzed how much space we need for the worst case,
- use sprintf directly instead of the slower PyOS_snprintf. */
- string = PyString_FromStringAndSize(NULL, n);
- if (!string)
- return NULL;
- s = PyString_AsString(string);
- for (f = format; *f; f++) {
- if (*f == '%') {
- const char* p = f++;
- Py_ssize_t i;
- int longflag = 0;
- int size_tflag = 0;
- /* parse the width.precision part (we're only
- interested in the precision value, if any) */
- n = 0;
- while (isdigit(Py_CHARMASK(*f)))
- n = (n*10) + *f++ - '0';
- if (*f == '.') {
- f++;
- n = 0;
- while (isdigit(Py_CHARMASK(*f)))
- n = (n*10) + *f++ - '0';
- }
- while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
- f++;
- /* handle the long flag, but only for %ld and %lu.
- others can be added when necessary. */
- if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
- longflag = 1;
- ++f;
- }
- /* handle the size_t flag. */
- if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
- size_tflag = 1;
- ++f;
- }
- switch (*f) {
- case 'c':
- *s++ = va_arg(vargs, int);
- break;
- case 'd':
- if (longflag)
- sprintf(s, "%ld", va_arg(vargs, long));
- else if (size_tflag)
- sprintf(s, "%" PY_FORMAT_SIZE_T "d",
- va_arg(vargs, Py_ssize_t));
- else
- sprintf(s, "%d", va_arg(vargs, int));
- s += strlen(s);
- break;
- case 'u':
- if (longflag)
- sprintf(s, "%lu",
- va_arg(vargs, unsigned long));
- else if (size_tflag)
- sprintf(s, "%" PY_FORMAT_SIZE_T "u",
- va_arg(vargs, size_t));
- else
- sprintf(s, "%u",
- va_arg(vargs, unsigned int));
- s += strlen(s);
- break;
- case 'i':
- sprintf(s, "%i", va_arg(vargs, int));
- s += strlen(s);
- break;
- case 'x':
- sprintf(s, "%x", va_arg(vargs, int));
- s += strlen(s);
- break;
- case 's':
- p = va_arg(vargs, char*);
- i = strlen(p);
- if (n > 0 && i > n)
- i = n;
- Py_MEMCPY(s, p, i);
- s += i;
- break;
- case 'p':
- sprintf(s, "%p", va_arg(vargs, void*));
- /* %p is ill-defined: ensure leading 0x. */
- if (s[1] == 'X')
- s[1] = 'x';
- else if (s[1] != 'x') {
- memmove(s+2, s, strlen(s)+1);
- s[0] = '0';
- s[1] = 'x';
- }
- s += strlen(s);
- break;
- case '%':
- *s++ = '%';
- break;
- default:
- strcpy(s, p);
- s += strlen(s);
- goto end;
- }
- } else
- *s++ = *f;
- }
- end:
- _PyString_Resize(&string, s - PyString_AS_STRING(string));
- return string;
- }
- PyObject *
- PyString_FromFormat(const char *format, ...)
- {
- PyObject* ret;
- va_list vargs;
- #ifdef HAVE_STDARG_PROTOTYPES
- va_start(vargs, format);
- #else
- va_start(vargs);
- #endif
- ret = PyString_FromFormatV(format, vargs);
- va_end(vargs);
- return ret;
- }
- PyObject *PyString_Decode(const char *s,
- Py_ssize_t size,
- const char *encoding,
- const char *errors)
- {
- PyObject *v, *str;
- str = PyString_FromStringAndSize(s, size);
- if (str == NULL)
- return NULL;
- v = PyString_AsDecodedString(str, encoding, errors);
- Py_DECREF(str);
- return v;
- }
- PyObject *PyString_AsDecodedObject(PyObject *str,
- const char *encoding,
- const char *errors)
- {
- PyObject *v;
- if (!PyString_Check(str)) {
- PyErr_BadArgument();
- goto onError;
- }
- if (encoding == NULL) {
- #ifdef Py_USING_UNICODE
- encoding = PyUnicode_GetDefaultEncoding();
- #else
- PyErr_SetString(PyExc_ValueError, "no encoding specified");
- goto onError;
- #endif
- }
- /* Decode via the codec registry */
- v = PyCodec_Decode(str, encoding, errors);
- if (v == NULL)
- goto onError;
- return v;
- onError:
- return NULL;
- }
- PyObject *PyString_AsDecodedString(PyObject *str,
- const char *encoding,
- const char *errors)
- {
- PyObject *v;
- v = PyString_AsDecodedObject(str, encoding, errors);
- if (v == NULL)
- goto onError;
- #ifdef Py_USING_UNICODE
- /* Convert Unicode to a string using the default encoding */
- if (PyUnicode_Check(v)) {
- PyObject *temp = v;
- v = PyUnicode_AsEncodedString(v, NULL, NULL);
- Py_DECREF(temp);
- if (v == NULL)
- goto onError;
- }
- #endif
- if (!PyString_Check(v)) {
- PyErr_Format(PyExc_TypeError,
- "decoder did not return a string object (type=%.400s)",
- Py_TYPE(v)->tp_name);
- Py_DECREF(v);
- goto onError;
- }
- return v;
- onError:
- return NULL;
- }
- PyObject *PyString_Encode(const char *s,
- Py_ssize_t size,
- const char *encoding,
- const char *errors)
- {
- PyObject *v, *str;
- str = PyString_FromStringAndSize(s, size);
- if (str == NULL)
- return NULL;
- v = PyString_AsEncodedString(str, encoding, errors);
- Py_DECREF(str);
- return v;
- }
- PyObject *PyString_AsEncodedObject(PyObject *str,
- const char *encoding,
- const char *errors)
- {
- PyObject *v;
- if (!PyString_Check(str)) {
- PyErr_BadArgument();
- goto onError;
- }
- if (encoding == NULL) {
- #ifdef Py_USING_UNICODE
- encoding = PyUnicode_GetDefaultEncoding();
- #else
- PyErr_SetString(PyExc_ValueError, "no encoding specified");
- goto onError;
- #endif
- }
- /* Encode via the codec registry */
- v = PyCodec_Encode(str, encoding, errors);
- if (v == NULL)
- goto onError;
- return v;
- onError:
- return NULL;
- }
- PyObject *PyString_AsEncodedString(PyObject *str,
- const char *encoding,
- const char *errors)
- {
- PyObject *v;
- v = PyString_AsEncodedObject(str, encoding, errors);
- if (v == NULL)
- goto onError;
- #ifdef Py_USING_UNICODE
- /* Convert Unicode to a string using the default encoding */
- if (PyUnicode_Check(v)) {
- PyObject *temp = v;
- v = PyUnicode_AsEncodedString(v, NULL, NULL);
- Py_DECREF(temp);
- if (v == NULL)
- goto onError;
- }
- #endif
- if (!PyString_Check(v)) {
- PyErr_Format(PyExc_TypeError,
- "encoder did not return a string object (type=%.400s)",
- Py_TYPE(v)->tp_name);
- Py_DECREF(v);
- goto onError;
- }
- return v;
- onError:
- return NULL;
- }
- static void
- string_dealloc(PyObject *op)
- {
- switch (PyString_CHECK_INTERNED(op)) {
- case SSTATE_NOT_INTERNED:
- break;
- case SSTATE_INTERNED_MORTAL:
- /* revive dead object temporarily for DelItem */
- Py_REFCNT(op) = 3;
- if (PyDict_DelItem(interned, op) != 0)
- Py_FatalError(
- "deletion of interned string failed");
- break;
- case SSTATE_INTERNED_IMMORTAL:
- Py_FatalError("Immortal interned string died.");
- default:
- Py_FatalError("Inconsistent interned string state.");
- }
- Py_TYPE(op)->tp_free(op);
- }
- /* Unescape a backslash-escaped string. If unicode is non-zero,
- the string is a u-literal. If recode_encoding is non-zero,
- the string is UTF-8 encoded and should be re-encoded in the
- specified encoding. */
- PyObject *PyString_DecodeEscape(const char *s,
- Py_ssize_t len,
- const char *errors,
- Py_ssize_t unicode,
- const char *recode_encoding)
- {
- int c;
- char *p, *buf;
- const char *end;
- PyObject *v;
- Py_ssize_t newlen = recode_encoding ? 4*len:len;
- v = PyString_FromStringAndSize((char *)NULL, newlen);
- if (v == NULL)
- return NULL;
- p = buf = PyString_AsString(v);
- end = s + len;
- while (s < end) {
- if (*s != '\\') {
- non_esc:
- #ifdef Py_USING_UNICODE
- if (recode_encoding && (*s & 0x80)) {
- PyObject *u, *w;
- char *r;
- const char* t;
- Py_ssize_t rn;
- t = s;
- /* Decode non-ASCII bytes as UTF-8. */
- while (t < end && (*t & 0x80)) t++;
- u = PyUnicode_DecodeUTF8(s, t - s, errors);
- if(!u) goto failed;
- /* Recode them in target encoding. */
- w = PyUnicode_AsEncodedString(
- u, recode_encoding, errors);
- Py_DECREF(u);
- if (!w) goto failed;
- /* Append bytes to output buffer. */
- assert(PyString_Check(w));
- r = PyString_AS_STRING(w);
- rn = PyString_GET_SIZE(w);
- Py_MEMCPY(p, r, rn);
- p += rn;
- Py_DECREF(w);
- s = t;
- } else {
- *p++ = *s++;
- }
- #else
- *p++ = *s++;
- #endif
- continue;
- }
- s++;
- if (s==end) {
- PyErr_SetString(PyExc_ValueError,
- "Trailing \\ in string");
- goto failed;
- }
- switch (*s++) {
- /* XXX This assumes ASCII! */
- case '\n': break;
- case '\\': *p++ = '\\'; break;
- case '\'': *p++ = '\''; break;
- case '\"': *p++ = '\"'; break;
- case 'b': *p++ = '\b'; break;
- case 'f': *p++ = '\014'; break; /* FF */
- case 't': *p++ = '\t'; break;
- case 'n': *p++ = '\n'; break;
- case 'r': *p++ = '\r'; break;
- case 'v': *p++ = '\013'; break; /* VT */
- case 'a': *p++ = '\007'; break; /* BEL, not classic C */
- case '0': case '1': case '2': case '3':
- case '4': case '5': case '6': case '7':
- c = s[-1] - '0';
- if (s < end && '0' <= *s && *s <= '7') {
- c = (c<<3) + *s++ - '0';
- if (s < end && '0' <= *s && *s <= '7')
- c = (c<<3) + *s++ - '0';
- }
- *p++ = c;
- break;
- case 'x':
- if (s+1 < end &&
- isxdigit(Py_CHARMASK(s[0])) &&
- isxdigit(Py_CHARMASK(s[1])))
- {
- unsigned int x = 0;
- c = Py_CHARMASK(*s);
- s++;
- if (isdigit(c))
- x = c - '0';
- else if (islower(c))
- x = 10 + c - 'a';
- else
- x = 10 + c - 'A';
- x = x << 4;
- c = Py_CHARMASK(*s);
- s++;
- if (isdigit(c))
- x += c - '0';
- else if (islower(c))
- x += 10 + c - 'a';
- else
- x += 10 + c - 'A';
- *p++ = x;
- break;
- }
- if (!errors || strcmp(errors, "strict") == 0) {
- PyErr_SetString(PyExc_ValueError,
- "invalid \\x escape");
- goto failed;
- }
- if (strcmp(errors, "replace") == 0) {
- *p++ = '?';
- } else if (strcmp(errors, "ignore") == 0)
- /* do nothing */;
- else {
- PyErr_Format(PyExc_ValueError,
- "decoding error; "
- "unknown error handling code: %.400s",
- errors);
- goto failed;
- }
- #ifndef Py_USING_UNICODE
- case 'u':
- case 'U':
- case 'N':
- if (unicode) {
- PyErr_SetString(PyExc_ValueError,
- "Unicode escapes not legal "
- "when Unicode disabled");
- goto failed;
- }
- #endif
- default:
- *p++ = '\\';
- s--;
- goto non_esc; /* an arbitry number of unescaped
- UTF-8 bytes may follow. */
- }
- }
- if (p-buf < newlen)
- _PyString_Resize(&v, p - buf);
- return v;
- failed:
- Py_DECREF(v);
- return NULL;
- }
- /* -------------------------------------------------------------------- */
- /* object api */
- static Py_ssize_t
- string_getsize(register PyObject *op)
- {
- char *s;
- Py_ssize_t len;
- if (PyString_AsStringAndSize(op, &s, &len))
- return -1;
- return len;
- }
- static /*const*/ char *
- string_getbuffer(register PyObject *op)
- {
- char *s;
- Py_ssize_t len;
- if (PyString_AsStringAndSize(op, &s, &len))
- return NULL;
- return s;
- }
- Py_ssize_t
- PyString_Size(register PyObject *op)
- {
- if (!PyString_Check(op))
- return string_getsize(op);
- return Py_SIZE(op);
- }
- /*const*/ char *
- PyString_AsString(register PyObject *op)
- {
- if (!PyString_Check(op))
- return string_getbuffer(op);
- return ((PyStringObject *)op) -> ob_sval;
- }
- int
- PyString_AsStringAndSize(register PyObject *obj,
- register char **s,
- register Py_ssize_t *len)
- {
- if (s == NULL) {
- PyErr_BadInternalCall();
- return -1;
- }
- if (!PyString_Check(obj)) {
- #ifdef Py_USING_UNICODE
- if (PyUnicode_Check(obj)) {
- obj = _PyUnicode_AsDefaultEncodedString(obj, NULL);
- if (obj == NULL)
- return -1;
- }
- else
- #endif
- {
- PyErr_Format(PyExc_TypeError,
- "expected string or Unicode object, "
- "%.200s found", Py_TYPE(obj)->tp_name);
- return -1;
- }
- }
- *s = PyString_AS_STRING(obj);
- if (len != NULL)
- *len = PyString_GET_SIZE(obj);
- else if (strlen(*s) != (size_t)PyString_GET_SIZE(obj)) {
- PyErr_SetString(PyExc_TypeError,
- "expected string without null bytes");
- return -1;
- }
- return 0;
- }
- /* -------------------------------------------------------------------- */
- /* Methods */
- #include "stringlib/stringdefs.h"
- #include "stringlib/fastsearch.h"
- #include "stringlib/count.h"
- #include "stringlib/find.h"
- #include "stringlib/partition.h"
- #define _Py_InsertThousandsGrouping _PyString_InsertThousandsGrouping
- #include "stringlib/localeutil.h"
- static int
- string_print(PyStringObject *op, FILE *fp, int flags)
- {
- Py_ssize_t i, str_len;
- char c;
- int quote;
- /* XXX Ought to check for interrupts when writing long strings */
- if (! PyString_CheckExact(op)) {
- int ret;
- /* A str subclass may have its own __str__ method. */
- op = (PyStringObject *) PyObject_Str((PyObject *)op);
- if (op == NULL)
- return -1;
- ret = string_print(op, fp, flags);
- Py_DECREF(op);
- return ret;
- }
- if (flags & Py_PRINT_RAW) {
- char *data = op->ob_sval;
- Py_ssize_t size = Py_SIZE(op);
- Py_BEGIN_ALLOW_THREADS
- while (size > INT_MAX) {
- /* Very long strings cannot be written atomically.
- * But don't write exactly INT_MAX bytes at a time
- * to avoid memory aligment issues.
- */
- const int chunk_size = INT_MAX & ~0x3FFF;
- fwrite(data, 1, chunk_size, fp);
- data += chunk_size;
- size -= chunk_size;
- }
- #ifdef __VMS
- if (size) fwrite(data, (int)size, 1, fp);
- #else
- fwrite(data, 1, (int)size, fp);
- #endif
- Py_END_ALLOW_THREADS
- return 0;
- }
- /* figure out which quote to use; single is preferred */
- quote = '\'';
- if (memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
- !memchr(op->ob_sval, '"', Py_SIZE(op)))
- quote = '"';
- str_len = Py_SIZE(op);
- Py_BEGIN_ALLOW_THREADS
- fputc(quote, fp);
- for (i = 0; i < str_len; i++) {
- /* Since strings are immutable and the caller should have a
- reference, accessing the interal buffer should not be an issue
- with the GIL released. */
- c = op->ob_sval[i];
- if (c == quote || c == '\\')
- fprintf(fp, "\\%c", c);
- else if (c == '\t')
- fprintf(fp, "\\t");
- else if (c == '\n')
- fprintf(fp, "\\n");
- else if (c == '\r')
- fprintf(fp, "\\r");
- else if (c < ' ' || c >= 0x7f)
- fprintf(fp, "\\x%02x", c & 0xff);
- else
- fputc(c, fp);
- }
- fputc(quote, fp);
- Py_END_ALLOW_THREADS
- return 0;
- }
- PyObject *
- PyString_Repr(PyObject *obj, int smartquotes)
- {
- register PyStringObject* op = (PyStringObject*) obj;
- size_t newsize = 2 + 4 * Py_SIZE(op);
- PyObject *v;
- if (newsize > PY_SSIZE_T_MAX || newsize / 4 != Py_SIZE(op)) {
- PyErr_SetString(PyExc_OverflowError,
- "string is too large to make repr");
- return NULL;
- }
- v = PyString_FromStringAndSize((char *)NULL, newsize);
- if (v == NULL) {
- return NULL;
- }
- else {
- register Py_ssize_t i;
- register char c;
- register char *p;
- int quote;
- /* figure out which quote to use; single is preferred */
- quote = '\'';
- if (smartquotes &&
- memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
- !memchr(op->ob_sval, '"', Py_SIZE(op)))
- quote = '"';
- p = PyString_AS_STRING(v);
- *p++ = quote;
- for (i = 0; i < Py_SIZE(op); i++) {
- /* There's at least enough room for a hex escape
- and a closing quote. */
- assert(newsize - (p - PyString_AS_STRING(v)) >= 5);
- c = op->ob_sval[i];
- if (c == quote || c == '\\')
- *p++ = '\\', *p++ = c;
- else if (c == '\t')
- *p++ = '\\', *p++ = 't';
- else if (c == '\n')
- *p++ = '\\', *p++ = 'n';
- else if (c == '\r')
- *p++ = '\\', *p++ = 'r';
- else if (c < ' ' || c >= 0x7f) {
- /* For performance, we don't want to call
- PyOS_snprintf here (extra layers of
- function call). */
- sprintf(p, "\\x%02x", c & 0xff);
- p += 4;
- }
- else
- *p++ = c;
- }
- assert(newsize - (p - PyString_AS_STRING(v)) >= 1);
- *p++ = quote;
- *p = '\0';
- _PyString_Resize(
- &v, (p - PyString_AS_STRING(v)));
- return v;
- }
- }
- static PyObject *
- string_repr(PyObject *op)
- {
- return PyString_Repr(op, 1);
- }
- static PyObject *
- string_str(PyObject *s)
- {
- assert(PyString_Check(s));
- if (PyString_CheckExact(s)) {
- Py_INCREF(s);
- return s;
- }
- else {
- /* Subtype -- return genuine string with the same value. */
- PyStringObject *t = (PyStringObject *) s;
- return PyString_FromStringAndSize(t->ob_sval, Py_SIZE(t));
- }
- }
- static Py_ssize_t
- string_length(PyStringObject *a)
- {
- return Py_SIZE(a);
- }
- static PyObject *
- string_concat(register PyStringObject *a, register PyObject *bb)
- {
- register Py_ssize_t size;
- register PyStringObject *op;
- if (!PyString_Check(bb)) {
- #ifdef Py_USING_UNICODE
- if (PyUnicode_Check(bb))
- return PyUnicode_Concat((PyObject *)a, bb);
- #endif
- if (PyByteArray_Check(bb))
- return PyByteArray_Concat((PyObject *)a, bb);
- PyErr_Format(PyExc_TypeError,
- "cannot concatenate 'str' and '%.200s' objects",
- Py_TYPE(bb)->tp_name);
- return NULL;
- }
- #define b ((PyStringObject *)bb)
- /* Optimize cases with empty left or right operand */
- if ((Py_SIZE(a) == 0 || Py_SIZE(b) == 0) &&
- PyString_CheckExact(a) && PyString_CheckExact(b)) {
- if (Py_SIZE(a) == 0) {
- Py_INCREF(bb);
- return bb;
- }
- Py_INCREF(a);
- return (PyObject *)a;
- }
- size = Py_SIZE(a) + Py_SIZE(b);
- /* Check that string sizes are not negative, to prevent an
- overflow in cases where we are passed incorrectly-created
- strings with negative lengths (due to a bug in other code).
- */
- if (Py_SIZE(a) < 0 || Py_SIZE(b) < 0 ||
- Py_SIZE(a) > PY_SSIZE_T_MAX - Py_SIZE(b)) {
- PyErr_SetString(PyExc_OverflowError,
- "strings are too large to concat");
- return NULL;
- }
-
- /* Inline PyObject_NewVar */
- if (size > PY_SSIZE_T_MAX - sizeof(PyStringObject)) {
- PyErr_SetString(PyExc_OverflowError,
- "strings are too large to concat");
- return NULL;
- }
- op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
- if (op == NULL)
- return PyErr_NoMemory();
- PyObject_INIT_VAR(op, &PyString_Type, size);
- op->ob_shash = -1;
- op->ob_sstate = SSTATE_NOT_INTERNED;
- Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
- Py_MEMCPY(op->ob_sval + Py_SIZE(a), b->ob_sval, Py_SIZE(b));
- op->ob_sval[size] = '\0';
- return (PyObject *) op;
- #undef b
- }
- static PyObject *
- string_repeat(register PyStringObject *a, register Py_ssize_t n)
- {
- register Py_ssize_t i;
- register Py_ssize_t j;
- register Py_ssize_t size;
- register PyStringObject *op;
- size_t nbytes;
- if (n < 0)
- n = 0;
- /* watch out for overflows: the size can overflow int,
- * and the # of bytes needed can overflow size_t
- */
- size = Py_SIZE(a) * n;
- if (n && size / n != Py_SIZE(a)) {
- PyErr_SetString(PyExc_OverflowError,
- "repeated string is too long");
- return NULL;
- }
- if (size == Py_SIZE(a) && PyString_CheckExact(a)) {
- Py_INCREF(a);
- return (PyObject *)a;
- }
- nbytes = (size_t)size;
- if (nbytes + sizeof(PyStringObject) <= nbytes) {
- PyErr_SetString(PyExc_OverflowError,
- "repeated string is too long");
- return NULL;
- }
- op = (PyStringObject *)
- PyObject_MALLOC(sizeof(PyStringObject) + nbytes);
- if (op == NULL)
- return PyErr_NoMemory();
- PyObject_INIT_VAR(op, &PyString_Type, size);
- op->ob_shash = -1;
- op->ob_sstate = SSTATE_NOT_INTERNED;
- op->ob_sval[size] = '\0';
- if (Py_SIZE(a) == 1 && n > 0) {
- memset(op->ob_sval, a->ob_sval[0] , n);
- return (PyObject *) op;
- }
- i = 0;
- if (i < size) {
- Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
- i = Py_SIZE(a);
- }
- while (i < size) {
- j = (i <= size-i) ? i : size-i;
- Py_MEMCPY(op->ob_sval+i, op->ob_sval, j);
- i += j;
- }
- return (PyObject *) op;
- }
- /* String slice a[i:j] consists of characters a[i] ... a[j-1] */
- static PyObject *
- string_slice(register PyStringObject *a, register Py_ssize_t i,
- register Py_ssize_t j)
- /* j -- may be negative! */
- {
- if (i < 0)
- i = 0;
- if (j < 0)
- j = 0; /* Avoid signed/unsigned bug in next line */
- if (j > Py_SIZE(a))
- j = Py_SIZE(a);
- if (i == 0 && j == Py_SIZE(a) && PyString_CheckExact(a)) {
- /* It's the same as a */
- Py_INCREF(a);
- return (PyObject *)a;
- }
- if (j < i)
- j = i;
- return PyString_FromStringAndSize(a->ob_sval + i, j-i);
- }
- static int
- string_contains(PyObject *str_obj, PyObject *sub_obj)
- {
- if (!PyString_CheckExact(sub_obj)) {
- #ifdef Py_USING_UNICODE
- if (PyUnicode_Check(sub_obj))
- return PyUnicode_Contains(str_obj, sub_obj);
- #endif
- if (!PyString_Check(sub_obj)) {
- PyErr_Format(PyExc_TypeError,
- "'in <string>' requires string as left operand, "
- "not %.200s", Py_TYPE(sub_obj)->tp_name);
- return -1;
- }
- }
- return stringlib_contains_obj(str_obj, sub_obj);
- }
- static PyObject *
- string_item(PyStringObject *a, register Py_ssize_t i)
- {
- char pchar;
- PyObject *v;
- if (i < 0 || i >= Py_SIZE(a)) {
- PyErr_SetString(PyExc_IndexError, "string index out of range");
- return NULL;
- }
- pchar = a->ob_sval[i];
- v = (PyObject *)characters[pchar & UCHAR_MAX];
- if (v == NULL)
- v = PyString_FromStringAndSize(&pchar, 1);
- else {
- #ifdef COUNT_ALLOCS
- one_strings++;
- #endif
- Py_INCREF(v);
- }
- return v;
- }
- static PyObject*
- string_richcompare(PyStringObject *a, PyStringObject *b, int op)
- {
- int c;
- Py_ssize_t len_a, len_b;
- Py_ssize_t min_len;
- PyObject *result;
- /* Make sure both arguments are strings. */
- if (!(PyString_Check(a) && PyString_Check(b))) {
- result = Py_NotImplemented;
- goto out;
- }
- if (a == b) {
- switch (op) {
- case Py_EQ:case Py_LE:case Py_GE:
- result = Py_True;
- goto out;
- case Py_NE:case Py_LT:case Py_GT:
- result = Py_False;
- goto out;
- }
- }
- if (op == Py_EQ) {
- /* Supporting Py_NE here as well does not save
- much time, since Py_NE is rarely used. */
- if (Py_SIZE(a) == Py_SIZE(b)
- && (a->ob_sval[0] == b->ob_sval[0]
- && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0)) {
- result = Py_True;
- } else {
- result = Py_False;
- }
- goto out;
- }
- len_a = Py_SIZE(a); len_b = Py_SIZE(b);
- min_len = (len_a < len_b) ? len_a : len_b;
- if (min_len > 0) {
- c = Py_CHARMASK(*a->ob_sval) - Py_CHARMASK(*b->ob_sval);
- if (c==0)
- c = memcmp(a->ob_sval, b->ob_sval, min_len);
- } else
- c = 0;
- if (c == 0)
- c = (len_a < len_b) ? -1 : (len_a > len_b) ? 1 : 0;
- switch (op) {
- case Py_LT: c = c < 0; break;
- case Py_LE: c = c <= 0; break;
- case Py_EQ: assert(0); break; /* unreachable */
- case Py_NE: c = c != 0; break;
- case Py_GT: c = c > 0; break;
- case Py_GE: c = c >= 0; break;
- default:
- result = Py_NotImplemented;
- goto out;
- }
- result = c ? Py_True : Py_False;
- out:
- Py_INCREF(result);
- return result;
- }
- int
- _PyString_Eq(PyObject *o1, PyObject *o2)
- {
- PyStringObject *a = (PyStringObject*) o1;
- PyStringObject *b = (PyStringObject*) o2;
- return Py_SIZE(a) == Py_SIZE(b)
- && *a->ob_sval == *b->ob_sval
- && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0;
- }
- static long
- string_hash(PyStringObject *a)
- {
- register Py_ssize_t len;
- register unsigned char *p;
- register long x;
- if (a->ob_shash != -1)
- return a->ob_shash;
- len = Py_SIZE(a);
- p = (unsigned char *) a->ob_sval;
- x = *p << 7;
- while (--len >= 0)
- x = (1000003*x) ^ *p++;
- x ^= Py_SIZE(a);
- if (x == -1)
- x = -2;
- a->ob_shash = x;
- return x;
- }
- static PyObject*
- string_subscript(PyStringObject* self, PyObject* item)
- {
- if (PyIndex_Check(item)) {
- Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
- if (i == -1 && PyErr_Occurred())
- return NULL;
- if (i < 0)
- i += PyString_GET_SIZE(self);
- return string_item(self, i);
- }
- else if (PySlice_Check(item)) {
- Py_ssize_t start, stop, step, slicelength, cur, i;
- char* source_buf;
- char* result_buf;
- PyObject* result;
- if (PySlice_GetIndicesEx((PySliceObject*)item,
- PyString_GET_SIZE(self),
- &start, &stop, &step, &slicelength) < 0) {
- return NULL;
- }
- if (slicelength <= 0) {
- return PyString_FromStringAndSize("", 0);
- }
- else if (start == 0 && step == 1 &&
- slicelength == PyString_GET_SIZE(self) &&
- PyString_CheckExact(self)) {
- Py_INCREF(self);
- return (PyObject *)self;
- }
- else if (step == 1) {
- return PyString_FromStringAndSize(
- PyString_AS_STRING(self) + start,
- slicelength);
- }
- else {
- source_buf = PyString_AsString((PyObject*)self);
- result_buf = (char *)PyMem_Malloc(slicelength);
- if (result_buf == NULL)
- return PyErr_NoMemory();
- for (cur = start, i = 0; i < slicelength;
- cur += step, i++) {
- result_buf[i] = source_buf[cur];
- }
- result = PyString_FromStringAndSize(result_buf,
- slicelength);
- PyMem_Free(result_buf);
- return result;
- }
- }
- else {
- PyErr_Format(PyExc_TypeError,
- "string indices must be integers, not %.200s",
- Py_TYPE(item)->tp_name);
- return NULL;
- }
- }
- static Py_ssize_t
- string_buffer_getreadbuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
- {
- if ( index != 0 ) {
- PyErr_SetString(PyExc_SystemError,
- "accessing non-existent string segment");
- return -1;
- }
- *ptr = (void *)self->ob_sval;
- return Py_SIZE(self);
- }
- static Py_ssize_t
- string_buffer_getwritebuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
- {
- PyErr_SetString(PyExc_TypeError,
- "Cannot use string as modifiable buffer");
- return -1;
- }
- static Py_ssize_t
- string_buffer_getsegcount(PyStringObject *self, Py_ssize_t *lenp)
- {
- if ( lenp )
- *lenp = Py_SIZE(self);
- return 1;
- }
- static Py_ssize_t
- string_buffer_getcharbuf(PyStringObject *self, Py_ssize_t index, const char **ptr)
- {
- if ( index != 0 ) {
- PyErr_SetString(PyExc_SystemError,
- "accessing non-existent string segment");
- return -1;
- }
- *ptr = self->ob_sval;
- return Py_SIZE(self);
- }
- static int
- string_buffer_getbuffer(PyStringObject *self, Py_buffer *view, int flags)
- {
- return PyBuffer_FillInfo(view, (PyObject*)self,
- (void *)self->ob_sval, Py_SIZE(self),
- 1, flags);
- }
- static PySequenceMethods string_as_sequence = {
- (lenfunc)string_length, /*sq_length*/
- (binaryfunc)string_concat, /*sq_concat*/
- (ssizeargfunc)string_repeat, /*sq_repeat*/
- (ssizeargfunc)string_item, /*sq_item*/
- (ssizessizeargfunc)string_slice, /*sq_slice*/
- 0, /*sq_ass_item*/
- 0, /*sq_ass_slice*/
- (objobjproc)string_contains /*sq_contains*/
- };
- static PyMappingMethods string_as_mapping = {
- (lenfunc)string_length,
- (binaryfunc)string_subscript,
- 0,
- };
- static PyBufferProcs string_as_buffer = {
- (readbufferproc)string_buffer_getreadbuf,
- (writebufferproc)string_buffer_getwritebuf,
- (segcountproc)string_buffer_getsegcount,
- (charbufferproc)string_buffer_getcharbuf,
- (getbufferproc)string_buffer_getbuffer,
- 0, /* XXX */
- };
/* Selectors for the three strip flavours; they double as indexes into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  Each entry has the form "|O:<method name>".
   The table is meant to be read-only, so both the strings and the array
   of pointers are declared const. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip selector i: skips the 3-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
/* True when the `length` bytes of `target` starting at `offset` equal the
   first `length` bytes of `pattern`.  The first and last bytes are compared
   inline so that most mismatches are rejected before memcmp() runs on the
   bytes in between.

   Don't call if length < 2: the memcmp() size is length-2.

   Every parameter is parenthesized so that arbitrary expressions may be
   passed.  Arguments are still evaluated more than once, so they must not
   have side effects. */
#define Py_STRING_MATCH(target, offset, pattern, length)                \
    ((target)[(offset)] == (pattern)[0] &&                              \
     (target)[(offset)+(length)-1] == (pattern)[(length)-1] &&          \
     !memcmp((target)+(offset)+1, (pattern)+1, (length)-2))
/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  E.g., "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things. */
#define MAX_PREALLOC 12

/* Initial list length for a split limited to `maxsplit` splits: one more
   than the number of splits (5 splits gives 6 elements), capped at
   MAX_PREALLOC.  The parameter is parenthesized so that an expression
   argument is not re-associated by the ?: and + operators. */
#define PREALLOC_SIZE(maxsplit) \
    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit) + 1)
/* Append the slice data[left:right] to `list` as a new string object.

   The macro expands to several statements (it is not wrapped in a block)
   and relies on names supplied by the invoking function: a `str` scratch
   variable, the result `list`, and an `onError` label.  Control jumps to
   onError when the string cannot be created or the append fails.  The
   temporary reference held in `str` is dropped on both the success and the
   append-failure path. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyString_FromStringAndSize((data) + (left),                   \
                                     (right) - (left));                 \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
/* Store the slice data[left:right] as the next element of `list`.

   Relies on names supplied by the invoking function: `str` (scratch),
   `list`, the running element counter `count`, and an `onError` label.
   While count is below MAX_PREALLOC the new string is written straight
   into the preallocated slot with PyList_SET_ITEM, which takes over the
   reference.  Beyond that it is appended with PyList_Append and the
   temporary reference is released.  `count` is incremented on success.
   Control jumps to onError if creating or appending the string fails. */
#define SPLIT_ADD(data, left, right) {                                  \
    str = PyString_FromStringAndSize((data) + (left),                   \
                                     (right) - (left));                 \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (count < MAX_PREALLOC) {                                         \
        PyList_SET_ITEM(list, count, str);                              \
    } else {                                                            \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);                                             \
    }                                                                   \
    count++; }
/* Always force the list to the expected size: overwrite the list's size
   field with `count`, the element counter of the invoking function (the
   one SPLIT_ADD increments). */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
/* Cursor-advancing helpers for the whitespace split routines.  `s` is the
   character data, `i` an index lvalue that is modified in place, and `len`
   the data length.  The forward variants stop at i == len, the reverse
   variants at i == -1.  Characters are passed through Py_CHARMASK before
   being handed to isspace().  Parameters are parenthesized so expression
   arguments expand safely. */
#define SKIP_SPACE(s, i, len) \
    { while ((i) < (len) && isspace(Py_CHARMASK((s)[(i)]))) (i)++; }
#define SKIP_NONSPACE(s, i, len) \
    { while ((i) < (len) && !isspace(Py_CHARMASK((s)[(i)]))) (i)++; }
#define RSKIP_SPACE(s, i) \
    { while ((i) >= 0 && isspace(Py_CHARMASK((s)[(i)]))) (i)--; }
#define RSKIP_NONSPACE(s, i) \
    { while ((i) >= 0 && !isspace(Py_CHARMASK((s)[(i)]))) (i)--; }
- Py_LOCAL_INLINE(PyObject *)
- split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
- {
- const char *s = PyString_AS_STRING(self);
- Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL)
- return NULL;
- i = j = 0;
- while (maxsplit-- > 0) {
- SKIP_SPACE(s, i, len);
- if (i==len) break;
- j = i; i++;
- SKIP_NONSPACE(s, i, len);
- if (j == 0 && i == len && PyString_CheckExact(self)) {
- /* No whitespace in self, so just use it as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- break;
- }
- SPLIT_ADD(s, j, i);
- }
- if (i < len) {
- /* Only occurs when maxsplit was reached */
- /* Skip any remaining whitespace and copy to end of string */
- SKIP_SPACE(s, i, len);
- if (i != len)
- SPLIT_ADD(s, i, len);
- }
- FIX_PREALLOC_SIZE(list);
- return list;
- onError:
- Py_DECREF(list);
- return NULL;
- }
- Py_LOCAL_INLINE(PyObject *)
- split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
- {
- const char *s = PyString_AS_STRING(self);
- register Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
- if (list == NULL)
- return NULL;
- i = j = 0;
- while ((j < len) && (maxcount-- > 0)) {
- for(; j<len; j++) {
- /* I found that using memchr makes no difference */
- if (s[j] == ch) {
- SPLIT_ADD(s, i, j);
- i = j = j + 1;
- break;
- }
- }
- }
- if (i == 0 && count == 0 && PyString_CheckExact(self)) {
- /* ch not in self, so just use self as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- }
- else if (i <= len) {
- SPLIT_ADD(s, i, len);
- }
- FIX_PREALLOC_SIZE(list);
- return list;
- onError:
- Py_DECREF(list);
- return NULL;
- }
- PyDoc_STRVAR(split__doc__,
- "S.split([sep [,maxsplit]]) -> list of strings\n\
- \n\
- Return a list of the words in the string S, using sep as the\n\
- delimiter string. If maxsplit is given, at most maxsplit\n\
- splits are done. If sep is not specified or is None, any\n\
- whitespace string is a separator and empty strings are removed\n\
- from the result.");
- static PyObject *
- string_split(PyStringObject *self, PyObject *args)
- {
- Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
- Py_ssize_t maxsplit = -1, count=0;
- const char *s = PyString_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj = Py_None;
- #ifdef USE_FAST
- Py_ssize_t pos;
- #endif
- if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
- return NULL;
- if (maxsplit < 0)
- maxsplit = PY_SSIZE_T_MAX;
- if (subobj == Py_None)
- return split_whitespace(self, len, maxsplit);
- if (PyString_Check(subobj)) {
- sub = PyString_AS_STRING(subobj);
- n = PyString_GET_SIZE(subobj);
- }
- #ifdef Py_USING_UNICODE
- else if (PyUnicode_Check(subobj))
- return PyUnicode_Split((PyObject *)self, subobj, maxsplit);
- #endif
- else if (PyObject_AsCharBuffer(subobj, &sub, &n))
- return NULL;
- if (n == 0) {
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else if (n == 1)
- return split_char(self, len, sub[0], maxsplit);
- list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL)
- return NULL;
- #ifdef USE_FAST
- i = j = 0;
- while (maxsplit-- > 0) {
- pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
- if (pos < 0)
- break;
- j = i+pos;
- SPLIT_ADD(s, i, j);
- i = j + n;
- }
- #else
- i = j = 0;
- while ((j+n <= len) && (maxsplit-- > 0)) {
- for (; j+n <= len; j++) {
- if (Py_STRING_MATCH(s, j, sub, n)) {
- SPLIT_ADD(s, i, j);
- i = j = j + n;
- break;
- }
- }
- }
- #endif
- SPLIT_ADD(s, i, len);
- FIX_PREALLOC_SIZE(list);
- return list;
- onError:
- Py_DECREF(list);
- return NULL;
- }
- PyDoc_STRVAR(partition__doc__,
- "S.partition(sep) -> (head, sep, tail)\n\
- \n\
- Search for the separator sep in S, and return the part before it,\n\
- the separator itself, and the part after it. If the separator is not\n\
- found, return S and two empty strings.");
- static PyObject *
- string_partition(PyStringObject *self, PyObject *sep_obj)
- {
- const char *sep;
- Py_ssize_t sep_len;
- if (PyString_Check(sep_obj)) {
- sep = PyString_AS_STRING(sep_obj);
- sep_len = PyString_GET_SIZE(sep_obj);
- }
- #ifdef Py_USING_UNICODE
- else if (PyUnicode_Check(sep_obj))
- return PyUnicode_Partition((PyObject *) self, sep_obj);
- #endif
- else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
- return NULL;
- return stringlib_partition(
- (PyObject*) self,
- PyString_AS_STRING(self), PyString_GET_SIZE(self),
- sep_obj, sep, sep_len
- );
- }
- PyDoc_STRVAR(rpartition__doc__,
- "S.rpartition(sep) -> (tail, sep, head)\n\
- \n\
- Search for the separator sep in S, starting at the end of S, and return\n\
- the part before it, the separator itself, and the part after it. If the\n\
- separator is not found, return two empty strings and S.");
- static PyObject *
- string_rpartition(PyStringObject *self, PyObject *sep_obj)
- {
- const char *sep;
- Py_ssize_t sep_len;
- if (PyString_Check(sep_obj)) {
- sep = PyString_AS_STRING(sep_obj);
- sep_len = PyString_GET_SIZE(sep_obj);
- }
- #ifdef Py_USING_UNICODE
- else if (PyUnicode_Check(sep_obj))
- return PyUnicode_RPartition((PyObject *) self, sep_obj);
- #endif
- else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
- return NULL;
- return stringlib_rpartition(
- (PyObject*) self,
- PyString_AS_STRING(self), PyString_GET_SIZE(self),
- sep_obj, sep, sep_len
- );
- }
- Py_LOCAL_INLINE(PyObject *)
- rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
- {
- const char *s = PyString_AS_STRING(self);
- Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL)
- return NULL;
- i = j = len-1;
- while (maxsplit-- > 0) {
- RSKIP_SPACE(s, i);
- if (i<0) break;
- j = i; i--;
- RSKIP_NONSPACE(s, i);
- if (j == len-1 && i < 0 && PyString_CheckExact(self)) {
- /* No whitespace in self, so just use it as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- break;
- }
- SPLIT_ADD(s, i + 1, j + 1);
- }
- if (i >= 0) {
- /* Only occurs when maxsplit was reached */
- /* Skip any remaining whitespace and copy to beginning of string */
- RSKIP_SPACE(s, i);
- if (i >= 0)
- SPLIT_ADD(s, 0, i + 1);
- }
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
- onError:
- Py_DECREF(list);
- return NULL;
- }
- Py_LOCAL_INLINE(PyObject *)
- rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
- {
- const char *s = PyString_AS_STRING(self);
- register Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
- if (list == NULL)
- return NULL;
- i = j = len - 1;
- while ((i >= 0) && (maxcount-- > 0)) {
- for (; i >= 0; i--) {
- if (s[i] == ch) {
- SPLIT_ADD(s, i + 1, j + 1);
- j = i = i - 1;
- break;
- }
- }
- }
- if (i < 0 && count == 0 && PyString_CheckExact(self)) {
- /* ch not in self, so just use self as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- }
- else if (j >= -1) {
- SPLIT_ADD(s, 0, j + 1);
- }
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
- onError:
- Py_DECREF(list);
- return NULL;
- }
- PyDoc_STRVAR(rsplit__doc__,
- "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
- \n\
- Return a list of the words in the string S, using sep as the\n\
- delimiter string, starting at the end of the string and working\n\
- to the front. If maxsplit is given, at most maxsplit splits are\n\
- done. If sep is not specified or is None, any whitespace string\n\
- is a separator.");
- static PyObject *
- string_rsplit(PyStringObject *self, PyObject *args)
- {
- Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
- Py_ssize_t maxsplit = -1, count=0;
- const char *s, *sub;
- PyObject *list, *str, *subobj = Py_None;
- if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
- return NULL;
- if (maxsplit < 0)
- maxsplit = PY_SSIZE_T_MAX;
- if (subobj == Py_None)
- return rsplit_whitespace(self, len, maxsplit);
- if (PyString_Check(subobj)) {
- sub = PyString_AS_STRING(subobj);
- n = PyString_GET_SIZE(subobj);
- }
- #ifdef Py_USING_UNICODE
- else if (PyUnicode_Check(subobj))
- return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
- #endif
- else if (PyObject_AsCharBuffer(subobj, &sub, &n))
- return NULL;
- if (n == 0) {
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else if (n == 1)
- return rsplit_char(self, len, sub[0], maxsplit);
- list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL)
- return NULL;
- j = len;
- i = j - n;
- s = PyString_AS_STRING(self);
- while ( (i >= 0) && (maxsplit-- > 0) ) {
- for (; i>=0; i--) {
- if (Py_STRING_MATCH(s, i, sub, n)) {
- SPLIT_ADD(s, i + n, j);
- j = i;
- i -= n;
- break;
- }
- }
- }
- SPLIT_ADD(s, 0, j);
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
- onError:
- Py_DECREF(list);
- return NULL;
- }
- PyDoc_STRVAR(join__doc__,
- "S.join(sequence) -> string\n\
- \n\
- Return a string which is the concatenation of the strings in the\n\
- sequence. The separator between elements is S.");
- static PyObject *
- string_join(PyStringObject *self, PyObject *orig)
- {
- char *sep = PyString_AS_STRING(self);
- const Py_ssize_t seplen = PyString_GET_SIZE(self);
- PyObject *res = NULL;
- char *p;
- Py_ssize_t seqlen = 0;
- size_t sz = 0;
- Py_ssize_t i;
- PyObject *seq, *item;
- seq = PySequence_Fast(orig, "");
- if (seq == NULL) {
- return NULL;
- }
- seqlen = PySequence_Size(seq);
- if (seqlen == 0) {
- Py_DECREF(seq);
- return PyString_FromString("");
- }
- if (seqlen == 1) {
- item = PySequence_Fast_GET_ITEM(seq, 0);
- if (PyString_CheckExact(item) || PyUnicode_CheckExact(item)) {
- Py_INCREF(item);
- Py_DECREF(seq);
- return item;
- }
- }
- /* There are at least two things to join, or else we have a subclass
- * of the builtin types in the sequence.
- * Do a pre-pass to figure out the total amount of space we'll
- * need (sz), see whether any argument is absurd, and defer to
- * the Unicode join if appropriate.
- */
- for (i = 0; i < seqlen; i++) {
- const size_t old_sz = sz;
- item = PySequence_Fast_GET_ITEM(seq, i);
- if (!PyString_Check(item)){
- #ifdef Py_USING_UNICODE
- if (PyUnicode_Check(item)) {
- /* Defer to Unicode join.
- * CAUTION: There's no gurantee that the
- * original sequence can be iterated over
- * again, so we must pass seq here.
- */
- PyObject *result;
- result = PyUnicode_Join((PyObject *)self, seq);
- Py_DECREF(seq);
- return result;
- }
- #endif
- PyErr_Format(PyExc_TypeError,
- "sequence item %zd: expected string,"
- " %.80s found",
- i, Py_TYPE(item)->tp_name);
- Py_DECREF(seq);
- return NULL;
- }
- sz += PyString_GET_SIZE(item);
- if (i != 0)
- sz += seplen;
- if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
- PyErr_SetString(PyExc_OverflowError,
- "join() result is too long for a Python string");
- Py_DECREF(seq);
- return NULL;
- }
- }
- /* Allocate result space. */
- res = PyString_FromStringAndSize((char*)NULL, sz);
- if (res == NULL) {
- Py_DECREF(seq);
- return NULL;
- }
- /* Catenate everything. */
- p = PyString_AS_STRING(res);
- for (i = 0; i < seqlen; ++i) {
- size_t n;
- item = PySequence_Fast_GET_ITEM(seq, i);
- n = PyString_GET_SIZE(item);
- Py_MEMCPY(p, PyString_AS_STRING(item), n);
- p += n;
- if (i < seqlen - 1) {
- Py_MEMCPY(p, sep, seplen);
- p += seplen;
- }
- }
- Py_DECREF(seq);
- return res;
- }
- PyObject *
- _PyString_Join(PyObject *sep, PyObject *x)
- {
- assert(sep != NULL && PyString_Check(sep));
- assert(x != NULL);
- return string_join((PyStringObject *)sep, x);
- }
- Py_LOCAL_INLINE(void)
- string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
- {
- if (*end > len)
- *end = len;
- else if (*end < 0)
- *end += len;
- if (*end < 0)
- *end = 0;
- if (*start < 0)
- *start += len;
- if (*start < 0)
- *start = 0;
- }
- Py_LOCAL_INLINE(Py_ssize_t)
- string_find_internal(PyStringObject *self, PyObject *args, int dir)
- {
- PyObject *subobj;
- const char *sub;
- Py_ssize_t sub_len;
- Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
- PyObject *obj_start=Py_None, *obj_end=Py_None;
- if (!PyArg_ParseTuple(args, "O|OO:find/rfind/index/rindex", &subobj,
- &obj_start, &obj_end))
- return -2;
- /* To support None in "start" and "end" arguments, meaning
- the same as if they were not passed.
- */
- if (obj_start != Py_None)
- if (!_PyEval_SliceIndex(obj_start, &start))
- return -2;
- if (obj_end != Py_None)
- if (!_PyEval_SliceIndex(obj_end, &end))
- return -2;
- if (PyString_Check(subobj)) {
- sub = PyString_AS_STRING(subobj);
- sub_len = PyString_GET_SIZE(subobj);
- }
- #ifdef Py_USING_UNICODE
- else if (PyUnicode_Check(subobj))
- return PyUnicode_Find(
- (PyObject *)self, subobj, start, end, dir);
- #endif
- else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
- /* XXX - the "expected a character buffer object" is pretty
- confusing for a non-expert. remap to something else ? */
- return -2;
- if (dir > 0)
- return stringlib_find_slice(
- PyString_AS_STRING(self), PyString_GET_SIZE(self),
- sub, sub_len, start, end);
- else
- return stringlib_rfind_slice(
- PyString_AS_STRING(self), PyString_GET_SIZE(self),
- sub, sub_len, start, end);
- }
- PyDoc_STRVAR(find__doc__,
- "S.find(sub [,start [,end]]) -> int\n\
- \n\
- Return the lowest index in S where substring sub is found,\n\
- such that sub is contained within s[start:end]. Optional\n\
- arguments start and end are interpreted as in slice notation.\n\
- \n\
- Return -1 on failure.");
- static PyObject *
- string_find(PyStringObject *self, PyObject *args)
- {
- Py_ssize_t result = string_find_internal(self, args, +1);
- if (result == -2)
- return NULL;
- return PyInt_FromSsize_t(result);
- }
- PyDoc_STRVAR(index__doc__,
- "S.index(sub [,start [,end]]) -> int\n\
- \n\
- Like S.find() but raise ValueError when the substring is not found.");
- static PyObject *
- string_index(PyStringObject *self, PyObject *args)
- {
- Py_ssize_t result = string_find_internal(self, args, +1);
- if (result == -2)
- return NULL;
- if (result == -1) {
- PyErr_SetString(PyExc_ValueError,
- "substring not found");
- return NULL;
- }
- return PyInt_FromSsize_t(result);
- }
- PyDoc_STRVAR(rfind__doc__,
- "S.rfind(sub [,start [,end]]) -> int\n\
- \n\
- Return the highest index in S where substring sub is found,\n\
- such that sub is contained within s[start:end]. Optional\n\
- arguments start and end are interpreted as in slice notation.\n\
- \n\
- Return -1 on failure.");
- static PyObject *
- string_rfind(PyStringObject *self, PyObject *args)
- {
- Py_ssize_t result = string_find_internal(self, args, -1);
- if (result == -2)
- return NULL;
- return PyInt_FromSsize_t(result);
- }
- PyDoc_STRVAR(rindex__doc__,
- "S.rindex(sub [,start [,end]]) -> int\n\
- \n\
- Like S.rfind() but raise ValueError when the substring is not found.");
- static PyObject *
- string_rindex(PyStringObject *self, PyObject *args)
- {
- Py_ssize_t result = string_find_internal(self, args, -1);
- if (result == -2)
- return NULL;
- if (result == -1) {
- PyErr_SetString(PyExc_ValueError,
- "substring not found");
- return NULL;
- }
- return PyInt_FromSsize_t(result);
- }
- Py_LOCAL_INLINE(PyObject *)
- do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj)
- {
- char *s = PyString_AS_STRING(self);
- Py_ssize_…