PageRenderTime 202ms CodeModel.GetById 95ms app.highlight 90ms RepoModel.GetById 1ms app.codeStats 1ms

/Objects/stringobject.c

http://unladen-swallow.googlecode.com/
C | 5242 lines | 4329 code | 538 blank | 375 comment | 1221 complexity | a1c18aba068e8f25c3e3da8e22507f9f MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/* String (str/bytes) object implementation */
   2
   3#define PY_SSIZE_T_CLEAN
   4
   5#include "Python.h"
   6#include <ctype.h>
   7
   8#ifdef COUNT_ALLOCS
   9int null_strings, one_strings;
  10#endif
  11
  12static PyStringObject *characters[UCHAR_MAX + 1];
  13static PyStringObject *nullstring;
  14
  15/* This dictionary holds all interned strings.  Note that references to
  16   strings in this dictionary are *not* counted in the string's ob_refcnt.
  17   When the interned string reaches a refcnt of 0 the string deallocation
  18   function will delete the reference from this dictionary.
  19
  20   Another way to look at this is to say that the actual reference
  21   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
  22*/
  23static PyObject *interned;
  24
  25/*
  26   For both PyString_FromString() and PyString_FromStringAndSize(), the
  27   parameter `size' denotes number of characters to allocate, not counting any
  28   null terminating character.
  29
  30   For PyString_FromString(), the parameter `str' points to a null-terminated
  31   string containing exactly `size' bytes.
  32
  33   For PyString_FromStringAndSize(), the parameter `str' is
  34   either NULL or else points to a string containing at least `size' bytes.
  35   For PyString_FromStringAndSize(), the string in the `str' parameter does
  36   not have to be null-terminated.  (Therefore it is safe to construct a
  37   substring by calling `PyString_FromStringAndSize(origstring, substrlen)'.)
  38   If `str' is NULL then PyString_FromStringAndSize() will allocate `size+1'
  39   bytes (setting the last byte to the null terminating character) and you can
  40   fill in the data yourself.  If `str' is non-NULL then the resulting
  41   PyString object must be treated as immutable and you must not fill in nor
  42   alter the data yourself, since the strings may be shared.
  43
  44   The PyObject member `op->ob_size', which denotes the number of "extra
  45   items" in a variable-size object, will contain the number of bytes
  46   allocated for string data, not counting the null terminating character.  It
  47   is therefore equal to the `size' parameter (for
  48   PyString_FromStringAndSize()) or the length of the string in the `str'
  49   parameter (for PyString_FromString()).
  50*/
  51PyObject *
  52PyString_FromStringAndSize(const char *str, Py_ssize_t size)
  53{
  54	register PyStringObject *op;
  55	if (size < 0) {
  56		PyErr_SetString(PyExc_SystemError,
  57		    "Negative size passed to PyString_FromStringAndSize");
  58		return NULL;
  59	}
  60	if (size == 0 && (op = nullstring) != NULL) {
  61#ifdef COUNT_ALLOCS
  62		null_strings++;
  63#endif
  64		Py_INCREF(op);
  65		return (PyObject *)op;
  66	}
  67	if (size == 1 && str != NULL &&
  68	    (op = characters[*str & UCHAR_MAX]) != NULL)
  69	{
  70#ifdef COUNT_ALLOCS
  71		one_strings++;
  72#endif
  73		Py_INCREF(op);
  74		return (PyObject *)op;
  75	}
  76
  77	if (size > PY_SSIZE_T_MAX - sizeof(PyStringObject)) {
  78		PyErr_SetString(PyExc_OverflowError, "string is too large");
  79		return NULL;
  80	}
  81
  82	/* Inline PyObject_NewVar */
  83	op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
  84	if (op == NULL)
  85		return PyErr_NoMemory();
  86	PyObject_INIT_VAR(op, &PyString_Type, size);
  87	op->ob_shash = -1;
  88	op->ob_sstate = SSTATE_NOT_INTERNED;
  89	if (str != NULL)
  90		Py_MEMCPY(op->ob_sval, str, size);
  91	op->ob_sval[size] = '\0';
  92	/* share short strings */
  93	if (size == 0) {
  94		PyObject *t = (PyObject *)op;
  95		PyString_InternInPlace(&t);
  96		op = (PyStringObject *)t;
  97		nullstring = op;
  98		Py_INCREF(op);
  99	} else if (size == 1 && str != NULL) {
 100		PyObject *t = (PyObject *)op;
 101		PyString_InternInPlace(&t);
 102		op = (PyStringObject *)t;
 103		characters[*str & UCHAR_MAX] = op;
 104		Py_INCREF(op);
 105	}
 106	return (PyObject *) op;
 107}
 108
 109PyObject *
 110PyString_FromString(const char *str)
 111{
 112	register size_t size;
 113	register PyStringObject *op;
 114
 115	assert(str != NULL);
 116	size = strlen(str);
 117	if (size > PY_SSIZE_T_MAX - sizeof(PyStringObject)) {
 118		PyErr_SetString(PyExc_OverflowError,
 119			"string is too long for a Python string");
 120		return NULL;
 121	}
 122	if (size == 0 && (op = nullstring) != NULL) {
 123#ifdef COUNT_ALLOCS
 124		null_strings++;
 125#endif
 126		Py_INCREF(op);
 127		return (PyObject *)op;
 128	}
 129	if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
 130#ifdef COUNT_ALLOCS
 131		one_strings++;
 132#endif
 133		Py_INCREF(op);
 134		return (PyObject *)op;
 135	}
 136
 137	/* Inline PyObject_NewVar */
 138	op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
 139	if (op == NULL)
 140		return PyErr_NoMemory();
 141	PyObject_INIT_VAR(op, &PyString_Type, size);
 142	op->ob_shash = -1;
 143	op->ob_sstate = SSTATE_NOT_INTERNED;
 144	Py_MEMCPY(op->ob_sval, str, size+1);
 145	/* share short strings */
 146	if (size == 0) {
 147		PyObject *t = (PyObject *)op;
 148		PyString_InternInPlace(&t);
 149		op = (PyStringObject *)t;
 150		nullstring = op;
 151		Py_INCREF(op);
 152	} else if (size == 1) {
 153		PyObject *t = (PyObject *)op;
 154		PyString_InternInPlace(&t);
 155		op = (PyStringObject *)t;
 156		characters[*str & UCHAR_MAX] = op;
 157		Py_INCREF(op);
 158	}
 159	return (PyObject *) op;
 160}
 161
 162PyObject *
 163PyString_FromFormatV(const char *format, va_list vargs)
 164{
 165	va_list count;
 166	Py_ssize_t n = 0;
 167	const char* f;
 168	char *s;
 169	PyObject* string;
 170
 171#ifdef VA_LIST_IS_ARRAY
 172	Py_MEMCPY(count, vargs, sizeof(va_list));
 173#else
 174#ifdef  __va_copy
 175	__va_copy(count, vargs);
 176#else
 177	count = vargs;
 178#endif
 179#endif
 180	/* step 1: figure out how large a buffer we need */
 181	for (f = format; *f; f++) {
 182		if (*f == '%') {
 183			const char* p = f;
 184			while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
 185				;
 186
 187			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 188			 * they don't affect the amount of space we reserve.
 189			 */
 190			if ((*f == 'l' || *f == 'z') &&
 191					(f[1] == 'd' || f[1] == 'u'))
 192				++f;
 193
 194			switch (*f) {
 195			case 'c':
 196				(void)va_arg(count, int);
 197				/* fall through... */
 198			case '%':
 199				n++;
 200				break;
 201			case 'd': case 'u': case 'i': case 'x':
 202				(void) va_arg(count, int);
 203				/* 20 bytes is enough to hold a 64-bit
 204				   integer.  Decimal takes the most space.
 205				   This isn't enough for octal. */
 206				n += 20;
 207				break;
 208			case 's':
 209				s = va_arg(count, char*);
 210				n += strlen(s);
 211				break;
 212			case 'p':
 213				(void) va_arg(count, int);
 214				/* maximum 64-bit pointer representation:
 215				 * 0xffffffffffffffff
 216				 * so 19 characters is enough.
 217				 * XXX I count 18 -- what's the extra for?
 218				 */
 219				n += 19;
 220				break;
 221			default:
 222				/* if we stumble upon an unknown
 223				   formatting code, copy the rest of
 224				   the format string to the output
 225				   string. (we cannot just skip the
 226				   code, since there's no way to know
 227				   what's in the argument list) */
 228				n += strlen(p);
 229				goto expand;
 230			}
 231		} else
 232			n++;
 233	}
 234 expand:
 235	/* step 2: fill the buffer */
 236	/* Since we've analyzed how much space we need for the worst case,
 237	   use sprintf directly instead of the slower PyOS_snprintf. */
 238	string = PyString_FromStringAndSize(NULL, n);
 239	if (!string)
 240		return NULL;
 241
 242	s = PyString_AsString(string);
 243
 244	for (f = format; *f; f++) {
 245		if (*f == '%') {
 246			const char* p = f++;
 247			Py_ssize_t i;
 248			int longflag = 0;
 249			int size_tflag = 0;
 250			/* parse the width.precision part (we're only
 251			   interested in the precision value, if any) */
 252			n = 0;
 253			while (isdigit(Py_CHARMASK(*f)))
 254				n = (n*10) + *f++ - '0';
 255			if (*f == '.') {
 256				f++;
 257				n = 0;
 258				while (isdigit(Py_CHARMASK(*f)))
 259					n = (n*10) + *f++ - '0';
 260			}
 261			while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
 262				f++;
 263			/* handle the long flag, but only for %ld and %lu.
 264			   others can be added when necessary. */
 265			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 266				longflag = 1;
 267				++f;
 268			}
 269			/* handle the size_t flag. */
 270			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 271				size_tflag = 1;
 272				++f;
 273			}
 274
 275			switch (*f) {
 276			case 'c':
 277				*s++ = va_arg(vargs, int);
 278				break;
 279			case 'd':
 280				if (longflag)
 281					sprintf(s, "%ld", va_arg(vargs, long));
 282				else if (size_tflag)
 283					sprintf(s, "%" PY_FORMAT_SIZE_T "d",
 284					        va_arg(vargs, Py_ssize_t));
 285				else
 286					sprintf(s, "%d", va_arg(vargs, int));
 287				s += strlen(s);
 288				break;
 289			case 'u':
 290				if (longflag)
 291					sprintf(s, "%lu",
 292						va_arg(vargs, unsigned long));
 293				else if (size_tflag)
 294					sprintf(s, "%" PY_FORMAT_SIZE_T "u",
 295					        va_arg(vargs, size_t));
 296				else
 297					sprintf(s, "%u",
 298						va_arg(vargs, unsigned int));
 299				s += strlen(s);
 300				break;
 301			case 'i':
 302				sprintf(s, "%i", va_arg(vargs, int));
 303				s += strlen(s);
 304				break;
 305			case 'x':
 306				sprintf(s, "%x", va_arg(vargs, int));
 307				s += strlen(s);
 308				break;
 309			case 's':
 310				p = va_arg(vargs, char*);
 311				i = strlen(p);
 312				if (n > 0 && i > n)
 313					i = n;
 314				Py_MEMCPY(s, p, i);
 315				s += i;
 316				break;
 317			case 'p':
 318				sprintf(s, "%p", va_arg(vargs, void*));
 319				/* %p is ill-defined:  ensure leading 0x. */
 320				if (s[1] == 'X')
 321					s[1] = 'x';
 322				else if (s[1] != 'x') {
 323					memmove(s+2, s, strlen(s)+1);
 324					s[0] = '0';
 325					s[1] = 'x';
 326				}
 327				s += strlen(s);
 328				break;
 329			case '%':
 330				*s++ = '%';
 331				break;
 332			default:
 333				strcpy(s, p);
 334				s += strlen(s);
 335				goto end;
 336			}
 337		} else
 338			*s++ = *f;
 339	}
 340
 341 end:
 342	_PyString_Resize(&string, s - PyString_AS_STRING(string));
 343	return string;
 344}
 345
 346PyObject *
 347PyString_FromFormat(const char *format, ...)
 348{
 349	PyObject* ret;
 350	va_list vargs;
 351
 352#ifdef HAVE_STDARG_PROTOTYPES
 353	va_start(vargs, format);
 354#else
 355	va_start(vargs);
 356#endif
 357	ret = PyString_FromFormatV(format, vargs);
 358	va_end(vargs);
 359	return ret;
 360}
 361
 362
 363PyObject *PyString_Decode(const char *s,
 364			  Py_ssize_t size,
 365			  const char *encoding,
 366			  const char *errors)
 367{
 368    PyObject *v, *str;
 369
 370    str = PyString_FromStringAndSize(s, size);
 371    if (str == NULL)
 372	return NULL;
 373    v = PyString_AsDecodedString(str, encoding, errors);
 374    Py_DECREF(str);
 375    return v;
 376}
 377
 378PyObject *PyString_AsDecodedObject(PyObject *str,
 379				   const char *encoding,
 380				   const char *errors)
 381{
 382    PyObject *v;
 383
 384    if (!PyString_Check(str)) {
 385        PyErr_BadArgument();
 386        goto onError;
 387    }
 388
 389    if (encoding == NULL) {
 390#ifdef Py_USING_UNICODE
 391	encoding = PyUnicode_GetDefaultEncoding();
 392#else
 393	PyErr_SetString(PyExc_ValueError, "no encoding specified");
 394	goto onError;
 395#endif
 396    }
 397
 398    /* Decode via the codec registry */
 399    v = PyCodec_Decode(str, encoding, errors);
 400    if (v == NULL)
 401        goto onError;
 402
 403    return v;
 404
 405 onError:
 406    return NULL;
 407}
 408
 409PyObject *PyString_AsDecodedString(PyObject *str,
 410				   const char *encoding,
 411				   const char *errors)
 412{
 413    PyObject *v;
 414
 415    v = PyString_AsDecodedObject(str, encoding, errors);
 416    if (v == NULL)
 417        goto onError;
 418
 419#ifdef Py_USING_UNICODE
 420    /* Convert Unicode to a string using the default encoding */
 421    if (PyUnicode_Check(v)) {
 422	PyObject *temp = v;
 423	v = PyUnicode_AsEncodedString(v, NULL, NULL);
 424	Py_DECREF(temp);
 425	if (v == NULL)
 426	    goto onError;
 427    }
 428#endif
 429    if (!PyString_Check(v)) {
 430        PyErr_Format(PyExc_TypeError,
 431                     "decoder did not return a string object (type=%.400s)",
 432                     Py_TYPE(v)->tp_name);
 433        Py_DECREF(v);
 434        goto onError;
 435    }
 436
 437    return v;
 438
 439 onError:
 440    return NULL;
 441}
 442
 443PyObject *PyString_Encode(const char *s,
 444			  Py_ssize_t size,
 445			  const char *encoding,
 446			  const char *errors)
 447{
 448    PyObject *v, *str;
 449
 450    str = PyString_FromStringAndSize(s, size);
 451    if (str == NULL)
 452	return NULL;
 453    v = PyString_AsEncodedString(str, encoding, errors);
 454    Py_DECREF(str);
 455    return v;
 456}
 457
 458PyObject *PyString_AsEncodedObject(PyObject *str,
 459				   const char *encoding,
 460				   const char *errors)
 461{
 462    PyObject *v;
 463
 464    if (!PyString_Check(str)) {
 465        PyErr_BadArgument();
 466        goto onError;
 467    }
 468
 469    if (encoding == NULL) {
 470#ifdef Py_USING_UNICODE
 471	encoding = PyUnicode_GetDefaultEncoding();
 472#else
 473	PyErr_SetString(PyExc_ValueError, "no encoding specified");
 474	goto onError;
 475#endif
 476    }
 477
 478    /* Encode via the codec registry */
 479    v = PyCodec_Encode(str, encoding, errors);
 480    if (v == NULL)
 481        goto onError;
 482
 483    return v;
 484
 485 onError:
 486    return NULL;
 487}
 488
 489PyObject *PyString_AsEncodedString(PyObject *str,
 490				   const char *encoding,
 491				   const char *errors)
 492{
 493    PyObject *v;
 494
 495    v = PyString_AsEncodedObject(str, encoding, errors);
 496    if (v == NULL)
 497        goto onError;
 498
 499#ifdef Py_USING_UNICODE
 500    /* Convert Unicode to a string using the default encoding */
 501    if (PyUnicode_Check(v)) {
 502	PyObject *temp = v;
 503	v = PyUnicode_AsEncodedString(v, NULL, NULL);
 504	Py_DECREF(temp);
 505	if (v == NULL)
 506	    goto onError;
 507    }
 508#endif
 509    if (!PyString_Check(v)) {
 510        PyErr_Format(PyExc_TypeError,
 511                     "encoder did not return a string object (type=%.400s)",
 512                     Py_TYPE(v)->tp_name);
 513        Py_DECREF(v);
 514        goto onError;
 515    }
 516
 517    return v;
 518
 519 onError:
 520    return NULL;
 521}
 522
 523static void
 524string_dealloc(PyObject *op)
 525{
 526	switch (PyString_CHECK_INTERNED(op)) {
 527		case SSTATE_NOT_INTERNED:
 528			break;
 529
 530		case SSTATE_INTERNED_MORTAL:
 531			/* revive dead object temporarily for DelItem */
 532			Py_REFCNT(op) = 3;
 533			if (PyDict_DelItem(interned, op) != 0)
 534				Py_FatalError(
 535					"deletion of interned string failed");
 536			break;
 537
 538		case SSTATE_INTERNED_IMMORTAL:
 539			Py_FatalError("Immortal interned string died.");
 540
 541		default:
 542			Py_FatalError("Inconsistent interned string state.");
 543	}
 544	Py_TYPE(op)->tp_free(op);
 545}
 546
 547/* Unescape a backslash-escaped string. If unicode is non-zero,
 548   the string is a u-literal. If recode_encoding is non-zero,
 549   the string is UTF-8 encoded and should be re-encoded in the
 550   specified encoding.  */
 551
 552PyObject *PyString_DecodeEscape(const char *s,
 553				Py_ssize_t len,
 554				const char *errors,
 555				Py_ssize_t unicode,
 556				const char *recode_encoding)
 557{
 558	int c;
 559	char *p, *buf;
 560	const char *end;
 561	PyObject *v;
 562	Py_ssize_t newlen = recode_encoding ? 4*len:len;
 563	v = PyString_FromStringAndSize((char *)NULL, newlen);
 564	if (v == NULL)
 565		return NULL;
 566	p = buf = PyString_AsString(v);
 567	end = s + len;
 568	while (s < end) {
 569		if (*s != '\\') {
 570		  non_esc:
 571#ifdef Py_USING_UNICODE
 572			if (recode_encoding && (*s & 0x80)) {
 573				PyObject *u, *w;
 574				char *r;
 575				const char* t;
 576				Py_ssize_t rn;
 577				t = s;
 578				/* Decode non-ASCII bytes as UTF-8. */
 579				while (t < end && (*t & 0x80)) t++;
 580				u = PyUnicode_DecodeUTF8(s, t - s, errors);
 581				if(!u) goto failed;
 582
 583				/* Recode them in target encoding. */
 584				w = PyUnicode_AsEncodedString(
 585					u, recode_encoding, errors);
 586				Py_DECREF(u);
 587				if (!w)	goto failed;
 588
 589				/* Append bytes to output buffer. */
 590				assert(PyString_Check(w));
 591				r = PyString_AS_STRING(w);
 592				rn = PyString_GET_SIZE(w);
 593				Py_MEMCPY(p, r, rn);
 594				p += rn;
 595				Py_DECREF(w);
 596				s = t;
 597			} else {
 598				*p++ = *s++;
 599			}
 600#else
 601			*p++ = *s++;
 602#endif
 603			continue;
 604		}
 605		s++;
 606                if (s==end) {
 607			PyErr_SetString(PyExc_ValueError,
 608					"Trailing \\ in string");
 609			goto failed;
 610		}
 611		switch (*s++) {
 612		/* XXX This assumes ASCII! */
 613		case '\n': break;
 614		case '\\': *p++ = '\\'; break;
 615		case '\'': *p++ = '\''; break;
 616		case '\"': *p++ = '\"'; break;
 617		case 'b': *p++ = '\b'; break;
 618		case 'f': *p++ = '\014'; break; /* FF */
 619		case 't': *p++ = '\t'; break;
 620		case 'n': *p++ = '\n'; break;
 621		case 'r': *p++ = '\r'; break;
 622		case 'v': *p++ = '\013'; break; /* VT */
 623		case 'a': *p++ = '\007'; break; /* BEL, not classic C */
 624		case '0': case '1': case '2': case '3':
 625		case '4': case '5': case '6': case '7':
 626			c = s[-1] - '0';
 627			if (s < end && '0' <= *s && *s <= '7') {
 628				c = (c<<3) + *s++ - '0';
 629				if (s < end && '0' <= *s && *s <= '7')
 630					c = (c<<3) + *s++ - '0';
 631			}
 632			*p++ = c;
 633			break;
 634		case 'x':
 635			if (s+1 < end &&
 636                            isxdigit(Py_CHARMASK(s[0])) &&
 637			    isxdigit(Py_CHARMASK(s[1])))
 638                        {
 639				unsigned int x = 0;
 640				c = Py_CHARMASK(*s);
 641				s++;
 642				if (isdigit(c))
 643					x = c - '0';
 644				else if (islower(c))
 645					x = 10 + c - 'a';
 646				else
 647					x = 10 + c - 'A';
 648				x = x << 4;
 649				c = Py_CHARMASK(*s);
 650				s++;
 651				if (isdigit(c))
 652					x += c - '0';
 653				else if (islower(c))
 654					x += 10 + c - 'a';
 655				else
 656					x += 10 + c - 'A';
 657				*p++ = x;
 658				break;
 659			}
 660			if (!errors || strcmp(errors, "strict") == 0) {
 661				PyErr_SetString(PyExc_ValueError,
 662						"invalid \\x escape");
 663				goto failed;
 664			}
 665			if (strcmp(errors, "replace") == 0) {
 666				*p++ = '?';
 667			} else if (strcmp(errors, "ignore") == 0)
 668				/* do nothing */;
 669			else {
 670				PyErr_Format(PyExc_ValueError,
 671					     "decoding error; "
 672					     "unknown error handling code: %.400s",
 673					     errors);
 674				goto failed;
 675			}
 676#ifndef Py_USING_UNICODE
 677		case 'u':
 678		case 'U':
 679		case 'N':
 680			if (unicode) {
 681				PyErr_SetString(PyExc_ValueError,
 682					  "Unicode escapes not legal "
 683					  "when Unicode disabled");
 684				goto failed;
 685			}
 686#endif
 687		default:
 688			*p++ = '\\';
 689			s--;
 690			goto non_esc; /* an arbitry number of unescaped
 691					 UTF-8 bytes may follow. */
 692		}
 693	}
 694	if (p-buf < newlen)
 695		_PyString_Resize(&v, p - buf);
 696	return v;
 697  failed:
 698	Py_DECREF(v);
 699	return NULL;
 700}
 701
 702/* -------------------------------------------------------------------- */
 703/* object api */
 704
 705static Py_ssize_t
 706string_getsize(register PyObject *op)
 707{
 708    	char *s;
 709    	Py_ssize_t len;
 710	if (PyString_AsStringAndSize(op, &s, &len))
 711		return -1;
 712	return len;
 713}
 714
 715static /*const*/ char *
 716string_getbuffer(register PyObject *op)
 717{
 718    	char *s;
 719    	Py_ssize_t len;
 720	if (PyString_AsStringAndSize(op, &s, &len))
 721		return NULL;
 722	return s;
 723}
 724
 725Py_ssize_t
 726PyString_Size(register PyObject *op)
 727{
 728	if (!PyString_Check(op))
 729		return string_getsize(op);
 730	return Py_SIZE(op);
 731}
 732
 733/*const*/ char *
 734PyString_AsString(register PyObject *op)
 735{
 736	if (!PyString_Check(op))
 737		return string_getbuffer(op);
 738	return ((PyStringObject *)op) -> ob_sval;
 739}
 740
 741int
 742PyString_AsStringAndSize(register PyObject *obj,
 743			 register char **s,
 744			 register Py_ssize_t *len)
 745{
 746	if (s == NULL) {
 747		PyErr_BadInternalCall();
 748		return -1;
 749	}
 750
 751	if (!PyString_Check(obj)) {
 752#ifdef Py_USING_UNICODE
 753		if (PyUnicode_Check(obj)) {
 754			obj = _PyUnicode_AsDefaultEncodedString(obj, NULL);
 755			if (obj == NULL)
 756				return -1;
 757		}
 758		else
 759#endif
 760		{
 761			PyErr_Format(PyExc_TypeError,
 762				     "expected string or Unicode object, "
 763				     "%.200s found", Py_TYPE(obj)->tp_name);
 764			return -1;
 765		}
 766	}
 767
 768	*s = PyString_AS_STRING(obj);
 769	if (len != NULL)
 770		*len = PyString_GET_SIZE(obj);
 771	else if (strlen(*s) != (size_t)PyString_GET_SIZE(obj)) {
 772		PyErr_SetString(PyExc_TypeError,
 773				"expected string without null bytes");
 774		return -1;
 775	}
 776	return 0;
 777}
 778
 779/* -------------------------------------------------------------------- */
 780/* Methods */
 781
 782#include "stringlib/stringdefs.h"
 783#include "stringlib/fastsearch.h"
 784
 785#include "stringlib/count.h"
 786#include "stringlib/find.h"
 787#include "stringlib/partition.h"
 788
 789#define _Py_InsertThousandsGrouping _PyString_InsertThousandsGrouping
 790#include "stringlib/localeutil.h"
 791
 792
 793
 794static int
 795string_print(PyStringObject *op, FILE *fp, int flags)
 796{
 797	Py_ssize_t i, str_len;
 798	char c;
 799	int quote;
 800
 801	/* XXX Ought to check for interrupts when writing long strings */
 802	if (! PyString_CheckExact(op)) {
 803		int ret;
 804		/* A str subclass may have its own __str__ method. */
 805		op = (PyStringObject *) PyObject_Str((PyObject *)op);
 806		if (op == NULL)
 807			return -1;
 808		ret = string_print(op, fp, flags);
 809		Py_DECREF(op);
 810		return ret;
 811	}
 812	if (flags & Py_PRINT_RAW) {
 813		char *data = op->ob_sval;
 814		Py_ssize_t size = Py_SIZE(op);
 815		Py_BEGIN_ALLOW_THREADS
 816		while (size > INT_MAX) {
 817			/* Very long strings cannot be written atomically.
 818			 * But don't write exactly INT_MAX bytes at a time
 819			 * to avoid memory aligment issues.
 820			 */
 821			const int chunk_size = INT_MAX & ~0x3FFF;
 822			fwrite(data, 1, chunk_size, fp);
 823			data += chunk_size;
 824			size -= chunk_size;
 825		}
 826#ifdef __VMS
 827                if (size) fwrite(data, (int)size, 1, fp);
 828#else
 829                fwrite(data, 1, (int)size, fp);
 830#endif
 831		Py_END_ALLOW_THREADS
 832		return 0;
 833	}
 834
 835	/* figure out which quote to use; single is preferred */
 836	quote = '\'';
 837	if (memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
 838	    !memchr(op->ob_sval, '"', Py_SIZE(op)))
 839		quote = '"';
 840
 841	str_len = Py_SIZE(op);
 842	Py_BEGIN_ALLOW_THREADS
 843	fputc(quote, fp);
 844	for (i = 0; i < str_len; i++) {
 845		/* Since strings are immutable and the caller should have a
 846		reference, accessing the interal buffer should not be an issue
 847		with the GIL released. */
 848		c = op->ob_sval[i];
 849		if (c == quote || c == '\\')
 850			fprintf(fp, "\\%c", c);
 851                else if (c == '\t')
 852                        fprintf(fp, "\\t");
 853                else if (c == '\n')
 854                        fprintf(fp, "\\n");
 855                else if (c == '\r')
 856                        fprintf(fp, "\\r");
 857		else if (c < ' ' || c >= 0x7f)
 858			fprintf(fp, "\\x%02x", c & 0xff);
 859		else
 860			fputc(c, fp);
 861	}
 862	fputc(quote, fp);
 863	Py_END_ALLOW_THREADS
 864	return 0;
 865}
 866
 867PyObject *
 868PyString_Repr(PyObject *obj, int smartquotes)
 869{
 870	register PyStringObject* op = (PyStringObject*) obj;
 871	size_t newsize = 2 + 4 * Py_SIZE(op);
 872	PyObject *v;
 873	if (newsize > PY_SSIZE_T_MAX || newsize / 4 != Py_SIZE(op)) {
 874		PyErr_SetString(PyExc_OverflowError,
 875			"string is too large to make repr");
 876                return NULL;
 877	}
 878	v = PyString_FromStringAndSize((char *)NULL, newsize);
 879	if (v == NULL) {
 880		return NULL;
 881	}
 882	else {
 883		register Py_ssize_t i;
 884		register char c;
 885		register char *p;
 886		int quote;
 887
 888		/* figure out which quote to use; single is preferred */
 889		quote = '\'';
 890		if (smartquotes &&
 891		    memchr(op->ob_sval, '\'', Py_SIZE(op)) &&
 892		    !memchr(op->ob_sval, '"', Py_SIZE(op)))
 893			quote = '"';
 894
 895		p = PyString_AS_STRING(v);
 896		*p++ = quote;
 897		for (i = 0; i < Py_SIZE(op); i++) {
 898			/* There's at least enough room for a hex escape
 899			   and a closing quote. */
 900			assert(newsize - (p - PyString_AS_STRING(v)) >= 5);
 901			c = op->ob_sval[i];
 902			if (c == quote || c == '\\')
 903				*p++ = '\\', *p++ = c;
 904			else if (c == '\t')
 905				*p++ = '\\', *p++ = 't';
 906			else if (c == '\n')
 907				*p++ = '\\', *p++ = 'n';
 908			else if (c == '\r')
 909				*p++ = '\\', *p++ = 'r';
 910			else if (c < ' ' || c >= 0x7f) {
 911				/* For performance, we don't want to call
 912				   PyOS_snprintf here (extra layers of
 913				   function call). */
 914				sprintf(p, "\\x%02x", c & 0xff);
 915                                p += 4;
 916			}
 917			else
 918				*p++ = c;
 919		}
 920		assert(newsize - (p - PyString_AS_STRING(v)) >= 1);
 921		*p++ = quote;
 922		*p = '\0';
 923		_PyString_Resize(
 924			&v, (p - PyString_AS_STRING(v)));
 925		return v;
 926	}
 927}
 928
 929static PyObject *
 930string_repr(PyObject *op)
 931{
 932	return PyString_Repr(op, 1);
 933}
 934
 935static PyObject *
 936string_str(PyObject *s)
 937{
 938	assert(PyString_Check(s));
 939	if (PyString_CheckExact(s)) {
 940		Py_INCREF(s);
 941		return s;
 942	}
 943	else {
 944		/* Subtype -- return genuine string with the same value. */
 945		PyStringObject *t = (PyStringObject *) s;
 946		return PyString_FromStringAndSize(t->ob_sval, Py_SIZE(t));
 947	}
 948}
 949
 950static Py_ssize_t
 951string_length(PyStringObject *a)
 952{
 953	return Py_SIZE(a);
 954}
 955
 956static PyObject *
 957string_concat(register PyStringObject *a, register PyObject *bb)
 958{
 959	register Py_ssize_t size;
 960	register PyStringObject *op;
 961	if (!PyString_Check(bb)) {
 962#ifdef Py_USING_UNICODE
 963		if (PyUnicode_Check(bb))
 964		    return PyUnicode_Concat((PyObject *)a, bb);
 965#endif
 966		if (PyByteArray_Check(bb))
 967		    return PyByteArray_Concat((PyObject *)a, bb);
 968		PyErr_Format(PyExc_TypeError,
 969			     "cannot concatenate 'str' and '%.200s' objects",
 970			     Py_TYPE(bb)->tp_name);
 971		return NULL;
 972	}
 973#define b ((PyStringObject *)bb)
 974	/* Optimize cases with empty left or right operand */
 975	if ((Py_SIZE(a) == 0 || Py_SIZE(b) == 0) &&
 976	    PyString_CheckExact(a) && PyString_CheckExact(b)) {
 977		if (Py_SIZE(a) == 0) {
 978			Py_INCREF(bb);
 979			return bb;
 980		}
 981		Py_INCREF(a);
 982		return (PyObject *)a;
 983	}
 984	size = Py_SIZE(a) + Py_SIZE(b);
 985	/* Check that string sizes are not negative, to prevent an
 986	   overflow in cases where we are passed incorrectly-created
 987	   strings with negative lengths (due to a bug in other code).
 988        */
 989	if (Py_SIZE(a) < 0 || Py_SIZE(b) < 0 ||
 990	    Py_SIZE(a) > PY_SSIZE_T_MAX - Py_SIZE(b)) {
 991		PyErr_SetString(PyExc_OverflowError,
 992				"strings are too large to concat");
 993		return NULL;
 994	}
 995	  
 996	/* Inline PyObject_NewVar */
 997	if (size > PY_SSIZE_T_MAX - sizeof(PyStringObject)) {
 998		PyErr_SetString(PyExc_OverflowError,
 999				"strings are too large to concat");
1000		return NULL;
1001	}
1002	op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
1003	if (op == NULL)
1004		return PyErr_NoMemory();
1005	PyObject_INIT_VAR(op, &PyString_Type, size);
1006	op->ob_shash = -1;
1007	op->ob_sstate = SSTATE_NOT_INTERNED;
1008	Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1009	Py_MEMCPY(op->ob_sval + Py_SIZE(a), b->ob_sval, Py_SIZE(b));
1010	op->ob_sval[size] = '\0';
1011	return (PyObject *) op;
1012#undef b
1013}
1014
1015static PyObject *
1016string_repeat(register PyStringObject *a, register Py_ssize_t n)
1017{
1018	register Py_ssize_t i;
1019	register Py_ssize_t j;
1020	register Py_ssize_t size;
1021	register PyStringObject *op;
1022	size_t nbytes;
1023	if (n < 0)
1024		n = 0;
1025	/* watch out for overflows:  the size can overflow int,
1026	 * and the # of bytes needed can overflow size_t
1027	 */
1028	size = Py_SIZE(a) * n;
1029	if (n && size / n != Py_SIZE(a)) {
1030		PyErr_SetString(PyExc_OverflowError,
1031			"repeated string is too long");
1032		return NULL;
1033	}
1034	if (size == Py_SIZE(a) && PyString_CheckExact(a)) {
1035		Py_INCREF(a);
1036		return (PyObject *)a;
1037	}
1038	nbytes = (size_t)size;
1039	if (nbytes + sizeof(PyStringObject) <= nbytes) {
1040		PyErr_SetString(PyExc_OverflowError,
1041			"repeated string is too long");
1042		return NULL;
1043	}
1044	op = (PyStringObject *)
1045		PyObject_MALLOC(sizeof(PyStringObject) + nbytes);
1046	if (op == NULL)
1047		return PyErr_NoMemory();
1048	PyObject_INIT_VAR(op, &PyString_Type, size);
1049	op->ob_shash = -1;
1050	op->ob_sstate = SSTATE_NOT_INTERNED;
1051	op->ob_sval[size] = '\0';
1052	if (Py_SIZE(a) == 1 && n > 0) {
1053		memset(op->ob_sval, a->ob_sval[0] , n);
1054		return (PyObject *) op;
1055	}
1056	i = 0;
1057	if (i < size) {
1058		Py_MEMCPY(op->ob_sval, a->ob_sval, Py_SIZE(a));
1059		i = Py_SIZE(a);
1060	}
1061	while (i < size) {
1062		j = (i <= size-i)  ?  i  :  size-i;
1063		Py_MEMCPY(op->ob_sval+i, op->ob_sval, j);
1064		i += j;
1065	}
1066	return (PyObject *) op;
1067}
1068
1069/* String slice a[i:j] consists of characters a[i] ... a[j-1] */
1070
1071static PyObject *
1072string_slice(register PyStringObject *a, register Py_ssize_t i,
1073	     register Py_ssize_t j)
1074     /* j -- may be negative! */
1075{
1076	if (i < 0)
1077		i = 0;
1078	if (j < 0)
1079		j = 0; /* Avoid signed/unsigned bug in next line */
1080	if (j > Py_SIZE(a))
1081		j = Py_SIZE(a);
1082	if (i == 0 && j == Py_SIZE(a) && PyString_CheckExact(a)) {
1083		/* It's the same as a */
1084		Py_INCREF(a);
1085		return (PyObject *)a;
1086	}
1087	if (j < i)
1088		j = i;
1089	return PyString_FromStringAndSize(a->ob_sval + i, j-i);
1090}
1091
1092static int
1093string_contains(PyObject *str_obj, PyObject *sub_obj)
1094{
1095	if (!PyString_CheckExact(sub_obj)) {
1096#ifdef Py_USING_UNICODE
1097		if (PyUnicode_Check(sub_obj))
1098			return PyUnicode_Contains(str_obj, sub_obj);
1099#endif
1100		if (!PyString_Check(sub_obj)) {
1101			PyErr_Format(PyExc_TypeError,
1102			    "'in <string>' requires string as left operand, "
1103			    "not %.200s", Py_TYPE(sub_obj)->tp_name);
1104			return -1;
1105		}
1106	}
1107
1108	return stringlib_contains_obj(str_obj, sub_obj);
1109}
1110
1111static PyObject *
1112string_item(PyStringObject *a, register Py_ssize_t i)
1113{
1114	char pchar;
1115	PyObject *v;
1116	if (i < 0 || i >= Py_SIZE(a)) {
1117		PyErr_SetString(PyExc_IndexError, "string index out of range");
1118		return NULL;
1119	}
1120	pchar = a->ob_sval[i];
1121	v = (PyObject *)characters[pchar & UCHAR_MAX];
1122	if (v == NULL)
1123		v = PyString_FromStringAndSize(&pchar, 1);
1124	else {
1125#ifdef COUNT_ALLOCS
1126		one_strings++;
1127#endif
1128		Py_INCREF(v);
1129	}
1130	return v;
1131}
1132
1133static PyObject*
1134string_richcompare(PyStringObject *a, PyStringObject *b, int op)
1135{
1136	int c;
1137	Py_ssize_t len_a, len_b;
1138	Py_ssize_t min_len;
1139	PyObject *result;
1140
1141	/* Make sure both arguments are strings. */
1142	if (!(PyString_Check(a) && PyString_Check(b))) {
1143		result = Py_NotImplemented;
1144		goto out;
1145	}
1146	if (a == b) {
1147		switch (op) {
1148		case Py_EQ:case Py_LE:case Py_GE:
1149			result = Py_True;
1150			goto out;
1151		case Py_NE:case Py_LT:case Py_GT:
1152			result = Py_False;
1153			goto out;
1154		}
1155	}
1156	if (op == Py_EQ) {
1157		/* Supporting Py_NE here as well does not save
1158		   much time, since Py_NE is rarely used.  */
1159		if (Py_SIZE(a) == Py_SIZE(b)
1160		    && (a->ob_sval[0] == b->ob_sval[0]
1161			&& memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0)) {
1162			result = Py_True;
1163		} else {
1164			result = Py_False;
1165		}
1166		goto out;
1167	}
1168	len_a = Py_SIZE(a); len_b = Py_SIZE(b);
1169	min_len = (len_a < len_b) ? len_a : len_b;
1170	if (min_len > 0) {
1171		c = Py_CHARMASK(*a->ob_sval) - Py_CHARMASK(*b->ob_sval);
1172		if (c==0)
1173			c = memcmp(a->ob_sval, b->ob_sval, min_len);
1174	} else
1175		c = 0;
1176	if (c == 0)
1177		c = (len_a < len_b) ? -1 : (len_a > len_b) ? 1 : 0;
1178	switch (op) {
1179	case Py_LT: c = c <  0; break;
1180	case Py_LE: c = c <= 0; break;
1181	case Py_EQ: assert(0);  break; /* unreachable */
1182	case Py_NE: c = c != 0; break;
1183	case Py_GT: c = c >  0; break;
1184	case Py_GE: c = c >= 0; break;
1185	default:
1186		result = Py_NotImplemented;
1187		goto out;
1188	}
1189	result = c ? Py_True : Py_False;
1190  out:
1191	Py_INCREF(result);
1192	return result;
1193}
1194
1195int
1196_PyString_Eq(PyObject *o1, PyObject *o2)
1197{
1198	PyStringObject *a = (PyStringObject*) o1;
1199	PyStringObject *b = (PyStringObject*) o2;
1200        return Py_SIZE(a) == Py_SIZE(b)
1201          && *a->ob_sval == *b->ob_sval
1202          && memcmp(a->ob_sval, b->ob_sval, Py_SIZE(a)) == 0;
1203}
1204
1205static long
1206string_hash(PyStringObject *a)
1207{
1208	register Py_ssize_t len;
1209	register unsigned char *p;
1210	register long x;
1211
1212	if (a->ob_shash != -1)
1213		return a->ob_shash;
1214	len = Py_SIZE(a);
1215	p = (unsigned char *) a->ob_sval;
1216	x = *p << 7;
1217	while (--len >= 0)
1218		x = (1000003*x) ^ *p++;
1219	x ^= Py_SIZE(a);
1220	if (x == -1)
1221		x = -2;
1222	a->ob_shash = x;
1223	return x;
1224}
1225
1226static PyObject*
1227string_subscript(PyStringObject* self, PyObject* item)
1228{
1229	if (PyIndex_Check(item)) {
1230		Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
1231		if (i == -1 && PyErr_Occurred())
1232			return NULL;
1233		if (i < 0)
1234			i += PyString_GET_SIZE(self);
1235		return string_item(self, i);
1236	}
1237	else if (PySlice_Check(item)) {
1238		Py_ssize_t start, stop, step, slicelength, cur, i;
1239		char* source_buf;
1240		char* result_buf;
1241		PyObject* result;
1242
1243		if (PySlice_GetIndicesEx((PySliceObject*)item,
1244				 PyString_GET_SIZE(self),
1245				 &start, &stop, &step, &slicelength) < 0) {
1246			return NULL;
1247		}
1248
1249		if (slicelength <= 0) {
1250			return PyString_FromStringAndSize("", 0);
1251		}
1252		else if (start == 0 && step == 1 &&
1253			 slicelength == PyString_GET_SIZE(self) &&
1254			 PyString_CheckExact(self)) {
1255			Py_INCREF(self);
1256			return (PyObject *)self;
1257		}
1258		else if (step == 1) {
1259			return PyString_FromStringAndSize(
1260				PyString_AS_STRING(self) + start,
1261				slicelength);
1262		}
1263		else {
1264			source_buf = PyString_AsString((PyObject*)self);
1265			result_buf = (char *)PyMem_Malloc(slicelength);
1266			if (result_buf == NULL)
1267				return PyErr_NoMemory();
1268
1269			for (cur = start, i = 0; i < slicelength;
1270			     cur += step, i++) {
1271				result_buf[i] = source_buf[cur];
1272			}
1273
1274			result = PyString_FromStringAndSize(result_buf,
1275							    slicelength);
1276			PyMem_Free(result_buf);
1277			return result;
1278		}
1279	}
1280	else {
1281		PyErr_Format(PyExc_TypeError,
1282			     "string indices must be integers, not %.200s",
1283			     Py_TYPE(item)->tp_name);
1284		return NULL;
1285	}
1286}
1287
1288static Py_ssize_t
1289string_buffer_getreadbuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1290{
1291	if ( index != 0 ) {
1292		PyErr_SetString(PyExc_SystemError,
1293				"accessing non-existent string segment");
1294		return -1;
1295	}
1296	*ptr = (void *)self->ob_sval;
1297	return Py_SIZE(self);
1298}
1299
1300static Py_ssize_t
1301string_buffer_getwritebuf(PyStringObject *self, Py_ssize_t index, const void **ptr)
1302{
1303	PyErr_SetString(PyExc_TypeError,
1304			"Cannot use string as modifiable buffer");
1305	return -1;
1306}
1307
1308static Py_ssize_t
1309string_buffer_getsegcount(PyStringObject *self, Py_ssize_t *lenp)
1310{
1311	if ( lenp )
1312		*lenp = Py_SIZE(self);
1313	return 1;
1314}
1315
1316static Py_ssize_t
1317string_buffer_getcharbuf(PyStringObject *self, Py_ssize_t index, const char **ptr)
1318{
1319	if ( index != 0 ) {
1320		PyErr_SetString(PyExc_SystemError,
1321				"accessing non-existent string segment");
1322		return -1;
1323	}
1324	*ptr = self->ob_sval;
1325	return Py_SIZE(self);
1326}
1327
1328static int
1329string_buffer_getbuffer(PyStringObject *self, Py_buffer *view, int flags)
1330{
1331	return PyBuffer_FillInfo(view, (PyObject*)self,
1332				 (void *)self->ob_sval, Py_SIZE(self),
1333				 1, flags);
1334}
1335
1336static PySequenceMethods string_as_sequence = {
1337	(lenfunc)string_length, /*sq_length*/
1338	(binaryfunc)string_concat, /*sq_concat*/
1339	(ssizeargfunc)string_repeat, /*sq_repeat*/
1340	(ssizeargfunc)string_item, /*sq_item*/
1341	(ssizessizeargfunc)string_slice, /*sq_slice*/
1342	0,		/*sq_ass_item*/
1343	0,		/*sq_ass_slice*/
1344	(objobjproc)string_contains /*sq_contains*/
1345};
1346
1347static PyMappingMethods string_as_mapping = {
1348	(lenfunc)string_length,
1349	(binaryfunc)string_subscript,
1350	0,
1351};
1352
1353static PyBufferProcs string_as_buffer = {
1354	(readbufferproc)string_buffer_getreadbuf,
1355	(writebufferproc)string_buffer_getwritebuf,
1356	(segcountproc)string_buffer_getsegcount,
1357	(charbufferproc)string_buffer_getcharbuf,
1358	(getbufferproc)string_buffer_getbuffer,
1359	0, /* XXX */
1360};
1361
1362
1363
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name: skips the three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)]+3)


/* True when the `length' bytes of `pattern' occur in `target' at `offset'.
   The first and last bytes are compared before memcmp() checks the middle.
   Don't call if length < 2 (the memcmp length would go negative). */
#define Py_STRING_MATCH(target, offset, pattern, length)		\
  ((target)[(offset)] == (pattern)[0] &&				\
   (target)[(offset)+(length)-1] == (pattern)[(length)-1] &&		\
   !memcmp((target)+(offset)+1, (pattern)+1, (length)-2) )


/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things.*/

#define MAX_PREALLOC 12

/* 5 splits gives 6 elements */
#define PREALLOC_SIZE(maxsplit) \
	((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit)+1)

/* Append data[left:right] to `list' as a new string.  Relies on the caller's
   `str' and `list' locals and an `onError' label.  Wrapped in braces so the
   whole sequence acts as one statement. */
#define SPLIT_APPEND(data, left, right) {			\
	str = PyString_FromStringAndSize((data) + (left),	\
					 (right) - (left));	\
	if (str == NULL)					\
		goto onError;					\
	if (PyList_Append(list, str)) {				\
		Py_DECREF(str);					\
		goto onError;					\
	}							\
	else							\
		Py_DECREF(str);					\
	}

/* Like SPLIT_APPEND, but stores directly into the preallocated slots while
   `count' is below MAX_PREALLOC, and always increments `count'. */
#define SPLIT_ADD(data, left, right) {				\
	str = PyString_FromStringAndSize((data) + (left),	\
					 (right) - (left));	\
	if (str == NULL)					\
		goto onError;					\
	if (count < MAX_PREALLOC) {				\
		PyList_SET_ITEM(list, count, str);		\
	} else {						\
		if (PyList_Append(list, str)) {			\
			Py_DECREF(str);				\
			goto onError;				\
		}						\
		else						\
			Py_DECREF(str);				\
	}							\
	count++; }

/* Always force the list to the expected size. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count

/* Advance (or, for the R variants, retreat) index `i' over a run of
   whitespace / non-whitespace bytes of `s'.  `i' must be an lvalue. */
#define SKIP_SPACE(s, i, len)    { while ((i)<(len) &&  isspace(Py_CHARMASK((s)[(i)]))) (i)++; }
#define SKIP_NONSPACE(s, i, len) { while ((i)<(len) && !isspace(Py_CHARMASK((s)[(i)]))) (i)++; }
#define RSKIP_SPACE(s, i)        { while ((i)>=0  &&  isspace(Py_CHARMASK((s)[(i)]))) (i)--; }
#define RSKIP_NONSPACE(s, i)     { while ((i)>=0  && !isspace(Py_CHARMASK((s)[(i)]))) (i)--; }
1430
1431Py_LOCAL_INLINE(PyObject *)
1432split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1433{
1434	const char *s = PyString_AS_STRING(self);
1435	Py_ssize_t i, j, count=0;
1436	PyObject *str;
1437	PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1438
1439	if (list == NULL)
1440		return NULL;
1441
1442	i = j = 0;
1443
1444	while (maxsplit-- > 0) {
1445		SKIP_SPACE(s, i, len);
1446		if (i==len) break;
1447		j = i; i++;
1448		SKIP_NONSPACE(s, i, len);
1449		if (j == 0 && i == len && PyString_CheckExact(self)) {
1450			/* No whitespace in self, so just use it as list[0] */
1451			Py_INCREF(self);
1452			PyList_SET_ITEM(list, 0, (PyObject *)self);
1453			count++;
1454			break;
1455		}
1456		SPLIT_ADD(s, j, i);
1457	}
1458
1459	if (i < len) {
1460		/* Only occurs when maxsplit was reached */
1461		/* Skip any remaining whitespace and copy to end of string */
1462		SKIP_SPACE(s, i, len);
1463		if (i != len)
1464			SPLIT_ADD(s, i, len);
1465	}
1466	FIX_PREALLOC_SIZE(list);
1467	return list;
1468  onError:
1469	Py_DECREF(list);
1470	return NULL;
1471}
1472
1473Py_LOCAL_INLINE(PyObject *)
1474split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1475{
1476	const char *s = PyString_AS_STRING(self);
1477	register Py_ssize_t i, j, count=0;
1478	PyObject *str;
1479	PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1480
1481	if (list == NULL)
1482		return NULL;
1483
1484	i = j = 0;
1485	while ((j < len) && (maxcount-- > 0)) {
1486		for(; j<len; j++) {
1487			/* I found that using memchr makes no difference */
1488			if (s[j] == ch) {
1489				SPLIT_ADD(s, i, j);
1490				i = j = j + 1;
1491				break;
1492			}
1493		}
1494	}
1495	if (i == 0 && count == 0 && PyString_CheckExact(self)) {
1496		/* ch not in self, so just use self as list[0] */
1497		Py_INCREF(self);
1498		PyList_SET_ITEM(list, 0, (PyObject *)self);
1499		count++;
1500	}
1501	else if (i <= len) {
1502		SPLIT_ADD(s, i, len);
1503	}
1504	FIX_PREALLOC_SIZE(list);
1505	return list;
1506
1507  onError:
1508	Py_DECREF(list);
1509	return NULL;
1510}
1511
1512PyDoc_STRVAR(split__doc__,
1513"S.split([sep [,maxsplit]]) -> list of strings\n\
1514\n\
1515Return a list of the words in the string S, using sep as the\n\
1516delimiter string.  If maxsplit is given, at most maxsplit\n\
1517splits are done. If sep is not specified or is None, any\n\
1518whitespace string is a separator and empty strings are removed\n\
1519from the result.");
1520
1521static PyObject *
1522string_split(PyStringObject *self, PyObject *args)
1523{
1524	Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1525	Py_ssize_t maxsplit = -1, count=0;
1526	const char *s = PyString_AS_STRING(self), *sub;
1527	PyObject *list, *str, *subobj = Py_None;
1528#ifdef USE_FAST
1529	Py_ssize_t pos;
1530#endif
1531
1532	if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
1533		return NULL;
1534	if (maxsplit < 0)
1535		maxsplit = PY_SSIZE_T_MAX;
1536	if (subobj == Py_None)
1537		return split_whitespace(self, len, maxsplit);
1538	if (PyString_Check(subobj)) {
1539		sub = PyString_AS_STRING(subobj);
1540		n = PyString_GET_SIZE(subobj);
1541	}
1542#ifdef Py_USING_UNICODE
1543	else if (PyUnicode_Check(subobj))
1544		return PyUnicode_Split((PyObject *)self, subobj, maxsplit);
1545#endif
1546	else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1547		return NULL;
1548
1549	if (n == 0) {
1550		PyErr_SetString(PyExc_ValueError, "empty separator");
1551		return NULL;
1552	}
1553	else if (n == 1)
1554		return split_char(self, len, sub[0], maxsplit);
1555
1556	list = PyList_New(PREALLOC_SIZE(maxsplit));
1557	if (list == NULL)
1558		return NULL;
1559
1560#ifdef USE_FAST
1561	i = j = 0;
1562	while (maxsplit-- > 0) {
1563		pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
1564		if (pos < 0)
1565			break;
1566		j = i+pos;
1567		SPLIT_ADD(s, i, j);
1568		i = j + n;
1569	}
1570#else
1571	i = j = 0;
1572	while ((j+n <= len) && (maxsplit-- > 0)) {
1573		for (; j+n <= len; j++) {
1574			if (Py_STRING_MATCH(s, j, sub, n)) {
1575				SPLIT_ADD(s, i, j);
1576				i = j = j + n;
1577				break;
1578			}
1579		}
1580	}
1581#endif
1582	SPLIT_ADD(s, i, len);
1583	FIX_PREALLOC_SIZE(list);
1584	return list;
1585
1586 onError:
1587	Py_DECREF(list);
1588	return NULL;
1589}
1590
1591PyDoc_STRVAR(partition__doc__,
1592"S.partition(sep) -> (head, sep, tail)\n\
1593\n\
1594Search for the separator sep in S, and return the part before it,\n\
1595the separator itself, and the part after it.  If the separator is not\n\
1596found, return S and two empty strings.");
1597
1598static PyObject *
1599string_partition(PyStringObject *self, PyObject *sep_obj)
1600{
1601	const char *sep;
1602	Py_ssize_t sep_len;
1603
1604	if (PyString_Check(sep_obj)) {
1605		sep = PyString_AS_STRING(sep_obj);
1606		sep_len = PyString_GET_SIZE(sep_obj);
1607	}
1608#ifdef Py_USING_UNICODE
1609	else if (PyUnicode_Check(sep_obj))
1610		return PyUnicode_Partition((PyObject *) self, sep_obj);
1611#endif
1612	else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1613		return NULL;
1614
1615	return stringlib_partition(
1616		(PyObject*) self,
1617		PyString_AS_STRING(self), PyString_GET_SIZE(self),
1618		sep_obj, sep, sep_len
1619		);
1620}
1621
1622PyDoc_STRVAR(rpartition__doc__,
1623"S.rpartition(sep) -> (tail, sep, head)\n\
1624\n\
1625Search for the separator sep in S, starting at the end of S, and return\n\
1626the part before it, the separator itself, and the part after it.  If the\n\
1627separator is not found, return two empty strings and S.");
1628
1629static PyObject *
1630string_rpartition(PyStringObject *self, PyObject *sep_obj)
1631{
1632	const char *sep;
1633	Py_ssize_t sep_len;
1634
1635	if (PyString_Check(sep_obj)) {
1636		sep = PyString_AS_STRING(sep_obj);
1637		sep_len = PyString_GET_SIZE(sep_obj);
1638	}
1639#ifdef Py_USING_UNICODE
1640	else if (PyUnicode_Check(sep_obj))
1641		return PyUnicode_RPartition((PyObject *) self, sep_obj);
1642#endif
1643	else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
1644		return NULL;
1645
1646	return stringlib_rpartition(
1647		(PyObject*) self,
1648		PyString_AS_STRING(self), PyString_GET_SIZE(self),
1649		sep_obj, sep, sep_len
1650		);
1651}
1652
1653Py_LOCAL_INLINE(PyObject *)
1654rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
1655{
1656	const char *s = PyString_AS_STRING(self);
1657	Py_ssize_t i, j, count=0;
1658	PyObject *str;
1659	PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
1660
1661	if (list == NULL)
1662		return NULL;
1663
1664	i = j = len-1;
1665
1666	while (maxsplit-- > 0) {
1667		RSKIP_SPACE(s, i);
1668		if (i<0) break;
1669		j = i; i--;
1670		RSKIP_NONSPACE(s, i);
1671		if (j == len-1 && i < 0 && PyString_CheckExact(self)) {
1672			/* No whitespace in self, so just use it as list[0] */
1673			Py_INCREF(self);
1674			PyList_SET_ITEM(list, 0, (PyObject *)self);
1675			count++;
1676			break;
1677		}
1678		SPLIT_ADD(s, i + 1, j + 1);
1679	}
1680	if (i >= 0) {
1681		/* Only occurs when maxsplit was reached */
1682		/* Skip any remaining whitespace and copy to beginning of string */
1683		RSKIP_SPACE(s, i);
1684		if (i >= 0)
1685			SPLIT_ADD(s, 0, i + 1);
1686
1687	}
1688	FIX_PREALLOC_SIZE(list);
1689	if (PyList_Reverse(list) < 0)
1690		goto onError;
1691	return list;
1692  onError:
1693	Py_DECREF(list);
1694	return NULL;
1695}
1696
1697Py_LOCAL_INLINE(PyObject *)
1698rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
1699{
1700	const char *s = PyString_AS_STRING(self);
1701	register Py_ssize_t i, j, count=0;
1702	PyObject *str;
1703	PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
1704
1705	if (list == NULL)
1706		return NULL;
1707
1708	i = j = len - 1;
1709	while ((i >= 0) && (maxcount-- > 0)) {
1710		for (; i >= 0; i--) {
1711			if (s[i] == ch) {
1712				SPLIT_ADD(s, i + 1, j + 1);
1713				j = i = i - 1;
1714				break;
1715			}
1716		}
1717	}
1718	if (i < 0 && count == 0 && PyString_CheckExact(self)) {
1719		/* ch not in self, so just use self as list[0] */
1720		Py_INCREF(self);
1721		PyList_SET_ITEM(list, 0, (PyObject *)self);
1722		count++;
1723	}
1724	else if (j >= -1) {
1725		SPLIT_ADD(s, 0, j + 1);
1726	}
1727	FIX_PREALLOC_SIZE(list);
1728	if (PyList_Reverse(list) < 0)
1729		goto onError;
1730	return list;
1731
1732 onError:
1733	Py_DECREF(list);
1734	return NULL;
1735}
1736
1737PyDoc_STRVAR(rsplit__doc__,
1738"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
1739\n\
1740Return a list of the words in the string S, using sep as the\n\
1741delimiter string, starting at the end of the string and working\n\
1742to the front.  If maxsplit is given, at most maxsplit splits are\n\
1743done. If sep is not specified or is None, any whitespace string\n\
1744is a separator.");
1745
1746static PyObject *
1747string_rsplit(PyStringObject *self, PyObject *args)
1748{
1749	Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
1750	Py_ssize_t maxsplit = -1, count=0;
1751	const char *s, *sub;
1752	PyObject *list, *str, *subobj = Py_None;
1753
1754	if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
1755		return NULL;
1756	if (maxsplit < 0)
1757		maxsplit = PY_SSIZE_T_MAX;
1758	if (subobj == Py_None)
1759		return rsplit_whitespace(self, len, maxsplit);
1760	if (PyString_Check(subobj)) {
1761		sub = PyString_AS_STRING(subobj);
1762		n = PyString_GET_SIZE(subobj);
1763	}
1764#ifdef Py_USING_UNICODE
1765	else if (PyUnicode_Check(subobj))
1766		return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
1767#endif
1768	else if (PyObject_AsCharBuffer(subobj, &sub, &n))
1769		return NULL;
1770
1771	if (n == 0) {
1772		PyErr_SetString(PyExc_ValueError, "empty separator");
1773		return NULL;
1774	}
1775	else if (n == 1)
1776		return rsplit_char(self, len, sub[0], maxsplit);
1777
1778	list = PyList_New(PREALLOC_SIZE(maxsplit));
1779	if (list == NULL)
1780		return NULL;
1781
1782	j = len;
1783	i = j - n;
1784
1785	s = PyString_AS_STRING(self);
1786	while ( (i >= 0) && (maxsplit-- > 0) ) {
1787		for (; i>=0; i--) {
1788			if (Py_STRING_MATCH(s, i, sub, n)) {
1789				SPLIT_ADD(s, i + n, j);
1790				j = i;
1791				i -= n;
1792				break;
1793			}
1794		}
1795	}
1796	SPLIT_ADD(s, 0, j);
1797	FIX_PREALLOC_SIZE(list);
1798	if (PyList_Reverse(list) < 0)
1799		goto onError;
1800	return list;
1801
1802onError:
1803	Py_DECREF(list);
1804	return NULL;
1805}
1806
1807
1808PyDoc_STRVAR(join__doc__,
1809"S.join(sequence) -> string\n\
1810\n\
1811Return a string which is the concatenation of the strings in the\n\
1812sequence.  The separator between elements is S.");
1813
1814static PyObject *
1815string_join(PyStringObject *self, PyObject *orig)
1816{
1817	char *sep = PyString_AS_STRING(self);
1818	const Py_ssize_t seplen = PyString_GET_SIZE(self);
1819	PyObject *res = NULL;
1820	char *p;
1821	Py_ssize_t seqlen = 0;
1822	size_t sz = 0;
1823	Py_ssize_t i;
1824	PyObject *seq, *item;
1825
1826	seq = PySequence_Fast(orig, "");
1827	if (seq == NULL) {
1828		return NULL;
1829	}
1830
1831	seqlen = PySequence_Size(seq);
1832	if (seqlen == 0) {
1833		Py_DECREF(seq);
1834		return PyString_FromString("");
1835	}
1836	if (seqlen == 1) {
1837		item = PySequence_Fast_GET_ITEM(seq, 0);
1838		if (PyString_CheckExact(item) || PyUnicode_CheckExact(item)) {
1839			Py_INCREF(item);
1840			Py_DECREF(seq);
1841			return item;
1842		}
1843	}
1844
1845	/* There are at least two things to join, or else we have a subclass
1846	 * of the builtin types in the sequence.
1847	 * Do a pre-pass to figure out the total amount of space we'll
1848	 * need (sz), see whether any argument is absurd, and defer to
1849	 * the Unicode join if appropriate.
1850	 */
1851	for (i = 0; i < seqlen; i++) {
1852		const size_t old_sz = sz;
1853		item = PySequence_Fast_GET_ITEM(seq, i);
1854		if (!PyString_Check(item)){
1855#ifdef Py_USING_UNICODE
1856			if (PyUnicode_Check(item)) {
1857				/* Defer to Unicode join.
1858				 * CAUTION:  There's no gurantee that the
1859				 * original sequence can be iterated over
1860				 * again, so we must pass seq here.
1861				 */
1862				PyObject *result;
1863				result = PyUnicode_Join((PyObject *)self, seq);
1864				Py_DECREF(seq);
1865				return result;
1866			}
1867#endif
1868			PyErr_Format(PyExc_TypeError,
1869				     "sequence item %zd: expected string,"
1870				     " %.80s found",
1871				     i, Py_TYPE(item)->tp_name);
1872			Py_DECREF(seq);
1873			return NULL;
1874		}
1875		sz += PyString_GET_SIZE(item);
1876		if (i != 0)
1877			sz += seplen;
1878		if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
1879			PyErr_SetString(PyExc_OverflowError,
1880				"join() result is too long for a Python string");
1881			Py_DECREF(seq);
1882			return NULL;
1883		}
1884	}
1885
1886	/* Allocate result space. */
1887	res = PyString_FromStringAndSize((char*)NULL, sz);
1888	if (res == NULL) {
1889		Py_DECREF(seq);
1890		return NULL;
1891	}
1892
1893	/* Catenate everything. */
1894	p = PyString_AS_STRING(res);
1895	for (i = 0; i < seqlen; ++i) {
1896		size_t n;
1897		item = PySequence_Fast_GET_ITEM(seq, i);
1898		n = PyString_GET_SIZE(item);
1899		Py_MEMCPY(p, PyString_AS_STRING(item), n);
1900		p += n;
1901		if (i < seqlen - 1) {
1902			Py_MEMCPY(p, sep, seplen);
1903			p += seplen;
1904		}
1905	}
1906
1907	Py_DECREF(seq);
1908	return res;
1909}
1910
1911PyObject *
1912_PyString_Join(PyObject *sep, PyObject *x)
1913{
1914	assert(sep != NULL && PyString_Check(sep));
1915	assert(x != NULL);
1916	return string_join((PyStringObject *)sep, x);
1917}
1918
1919Py_LOCAL_INLINE(void)
1920string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
1921{
1922	if (*end > len)
1923		*end = len;
1924	else if (*end < 0)
1925		*end += len;
1926	if (*end < 0)
1927		*end = 0;
1928	if (*start < 0)
1929		*start += len;
1930	if (*start < 0)
1931		*start = 0;
1932}
1933
1934Py_LOCAL_INLINE(Py_ssize_t)
1935string_find_internal(PyStringObject *self, PyObject *args, int dir)
1936{
1937	PyObject *subobj;
1938	const char *sub;
1939	Py_ssize_t sub_len;
1940	Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
1941	PyObject *obj_start=Py_None, *obj_end=Py_None;
1942
1943	if (!PyArg_ParseTuple(args, "O|OO:find/rfind/index/rindex", &subobj,
1944		&obj_start, &obj_end))
1945		return -2;
1946	/* To support None in "start" and "end" arguments, meaning
1947	   the same as if they were not passed.
1948	*/
1949	if (obj_start != Py_None)
1950		if (!_PyEval_SliceIndex(obj_start, &start))
1951	        return -2;
1952	if (obj_end != Py_None)
1953		if (!_PyEval_SliceIndex(obj_end, &end))
1954	        return -2;
1955
1956	if (PyString_Check(subobj)) {
1957		sub = PyString_AS_STRING(subobj);
1958		sub_len = PyString_GET_SIZE(subobj);
1959	}
1960#ifdef Py_USING_UNICODE
1961	else if (PyUnicode_Check(subobj))
1962		return PyUnicode_Find(
1963			(PyObject *)self, subobj, start, end, dir);
1964#endif
1965	else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
1966		/* XXX - the "expected a character buffer object" is pretty
1967		   confusing for a non-expert.  remap to something else ? */
1968		return -2;
1969
1970	if (dir > 0)
1971		return stringlib_find_slice(
1972			PyString_AS_STRING(self), PyString_GET_SIZE(self),
1973			sub, sub_len, start, end);
1974	else
1975		return stringlib_rfind_slice(
1976			PyString_AS_STRING(self), PyString_GET_SIZE(self),
1977			sub, sub_len, start, end);
1978}
1979
1980
1981PyDoc_STRVAR(find__doc__,
1982"S.find(sub [,start [,end]]) -> int\n\
1983\n\
1984Return the lowest index in S where substring sub is found,\n\
1985such that sub is contained within s[start:end].  Optional\n\
1986arguments start and end are interpreted as in slice notation.\n\
1987\n\
1988Return -1 on failure.");
1989
1990static PyObject *
1991string_find(PyStringObject *self, PyObject *args)
1992{
1993	Py_ssize_t result = string_find_internal(self, args, +1);
1994	if (result == -2)
1995		return NULL;
1996	return PyInt_FromSsize_t(result);
1997}
1998
1999
2000PyDoc_STRVAR(index__doc__,
2001"S.index(sub [,start [,end]]) -> int\n\
2002\n\
2003Like S.find() but raise ValueError when the substring is not found.");
2004
2005static PyObject *
2006string_index(PyStringObject *self, PyObject *args)
2007{
2008	Py_ssize_t result = string_find_internal(self, args, +1);
2009	if (result == -2)
2010		return NULL;
2011	if (result == -1) {
2012		PyErr_SetString(PyExc_ValueError,
2013				"substring not found");
2014		return NULL;
2015	}
2016	return PyInt_FromSsize_t(result);
2017}
2018
2019
2020PyDoc_STRVAR(rfind__doc__,
2021"S.rfind(sub [,start [,end]]) -> int\n\
2022\n\
2023Return the highest index in S where substring sub is found,\n\
2024such that sub is contained within s[start:end].  Optional\n\
2025arguments start and end are interpreted as in slice notation.\n\
2026\n\
2027Return -1 on failure.");
2028
2029static PyObject *
2030string_rfind(PyStringObject *self, PyObject *args)
2031{
2032	Py_ssize_t result = string_find_internal(self, args, -1);
2033	if (result == -2)
2034		return NULL;
2035	return PyInt_FromSsize_t(result);
2036}
2037
2038
2039PyDoc_STRVAR(rindex__doc__,
2040"S.rindex(sub [,start [,end]]) -> int\n\
2041\n\
2042Like S.rfind() but raise ValueError when the substring is not found.");
2043
2044static PyObject *
2045string_rindex(PyStringObject *self, PyObject *args)
2046{
2047	Py_ssize_t result = string_find_internal(self, args, -1);
2048	if (result == -2)
2049		return NULL;
2050	if (result == -1) {
2051		PyErr_SetString(PyExc_ValueError,
2052				"substring not found");
2053		return NULL;
2054	}
2055	return PyInt_FromSsize_t(result);
2056}
2057
2058
2059Py_LOCAL_INLINE(PyObject *)
2060do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj)
2061{
2062	char *s = PyString_AS_STRING(self);
2063	Py_ssize_

Large files are truncated, but you can click here to view the full file