PageRenderTime 75ms CodeModel.GetById 28ms RepoModel.GetById 1ms app.codeStats 2ms

/Objects/unicodeobject.c

https://bitbucket.org/xg/python-embed-patches
C | 9090 lines | 7327 code | 956 blank | 807 comment | 1895 complexity | 006bfd2076beebb5b7ac4c266131626d MD5 | raw file
Possible License(s): BSD-3-Clause, 0BSD

Large files are truncated, but you can click here to view the full file

  1. /*
  2. Unicode implementation based on original code by Fredrik Lundh,
  3. modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
  4. Unicode Integration Proposal (see file Misc/unicode.txt).
  5. Major speed upgrades to the method implementations at the Reykjavik
  6. NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
  7. Copyright (c) Corporation for National Research Initiatives.
  8. --------------------------------------------------------------------
  9. The original string type implementation is:
  10. Copyright (c) 1999 by Secret Labs AB
  11. Copyright (c) 1999 by Fredrik Lundh
  12. By obtaining, using, and/or copying this software and/or its
  13. associated documentation, you agree that you have read, understood,
  14. and will comply with the following terms and conditions:
  15. Permission to use, copy, modify, and distribute this software and its
  16. associated documentation for any purpose and without fee is hereby
  17. granted, provided that the above copyright notice appears in all
  18. copies, and that both that copyright notice and this permission notice
  19. appear in supporting documentation, and that the name of Secret Labs
  20. AB or the author not be used in advertising or publicity pertaining to
  21. distribution of the software without specific, written prior
  22. permission.
  23. SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  24. THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  25. FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  26. ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  27. WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  28. ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  29. OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  30. --------------------------------------------------------------------
  31. */
  32. #define PY_SSIZE_T_CLEAN
  33. #include "Python.h"
  34. #include "unicodeobject.h"
  35. #include "ucnhash.h"
  36. #ifdef MS_WINDOWS
  37. #include <windows.h>
  38. #endif
  /* Upper bound on the number of Unicode objects parked on the free list. */
  #define PyUnicode_MAXFREELIST 1024

  /* Keep-alive threshold for objects on the free list.
   *
   * Objects on the free list whose length is below this limit keep their
   * character buffer allocated, which saves malloc() calls for small
   * Unicode objects.  In the worst case this leaves
   * PyUnicode_MAXFREELIST * (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT
   * + malloc() overhead) bytes of idle memory around.
   *
   * A limit of 0 effectively switches the feature off.
   *
   * Note: this is an experimental feature.  If you get core dumps when
   * using Unicode objects, turn it off.
   */
  #define KEEPALIVE_SIZE_LIMIT 9

  /* Byte-order switch; little endian unless WORDS_BIGENDIAN is defined. */
  #ifdef WORDS_BIGENDIAN
  # define BYTEORDER_IS_BIG_ENDIAN
  #else
  # define BYTEORDER_IS_LITTLE_ENDIAN
  #endif
  59. /* --- Globals ------------------------------------------------------------
  60. The globals are initialized by the _PyUnicode_Init() API and should
  61. not be used before calling that API.
  62. */
  63. #ifdef __cplusplus
  64. extern "C" {
  65. #endif
  66. /* Free list for Unicode objects */
  67. static PyUnicodeObject *free_list;
  68. static int numfree;
  69. /* The empty Unicode object is shared to improve performance. */
  70. static PyUnicodeObject *unicode_empty;
  71. /* Single character Unicode strings in the Latin-1 range are being
  72. shared as well. */
  73. static PyUnicodeObject *unicode_latin1[256];
  74. /* Default encoding to use and assume when NULL is passed as encoding
  75. parameter; it is initialized by _PyUnicode_Init().
  76. Always use the PyUnicode_SetDefaultEncoding() and
  77. PyUnicode_GetDefaultEncoding() APIs to access this global.
  78. */
  79. static char unicode_default_encoding[100];
  /* Fast detection of the most frequent whitespace characters.

     One entry per code point in the range 0x00-0x7F; an entry is 1 for
       0x09 HORIZONTAL TABULATION   0x0A LINE FEED
       0x0B VERTICAL TABULATION     0x0C FORM FEED
       0x0D CARRIAGE RETURN
       0x1C FILE SEPARATOR          0x1D GROUP SEPARATOR
       0x1E RECORD SEPARATOR        0x1F UNIT SEPARATOR
       0x20 SPACE
     and 0 for everything else. */
  const unsigned char _Py_ascii_whitespace[] = {
      /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
      /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
      /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };
  /* Fast detection of line breaks in the ASCII range.

     One entry per code point in the range 0x00-0x7F; an entry is 1 for
       0x0A LINE FEED          0x0D CARRIAGE RETURN
       0x1C FILE SEPARATOR     0x1D GROUP SEPARATOR
       0x1E RECORD SEPARATOR
     and 0 for everything else. */
  static unsigned char ascii_linebreak[] = {
      /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
      /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
      /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };
  133. Py_UNICODE
  134. PyUnicode_GetMax(void)
  135. {
  136. #ifdef Py_UNICODE_WIDE
  137. return 0x10FFFF;
  138. #else
  139. /* This is actually an illegal character, so it should
  140. not be passed to unichr. */
  141. return 0xFFFF;
  142. #endif
  143. }
  /* --- Bloom Filters ----------------------------------------------------- */

  /* Simple "bloom filters" for Unicode characters.  To keep things simple
     a single bitmask is used; the least significant 5 bits of each
     character select the bit index. */

  /* the linebreak mask is set up by Unicode_Init below */

  #define BLOOM_MASK unsigned long

  static BLOOM_MASK bloom_linebreak;

  /* Non-zero when the bit selected by ch is set in mask.
     The shifted constant is an unsigned long: shifting a plain int 1 left
     by 31 overflows a 32-bit signed int, which is undefined behaviour. */
  #define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

  /* Non-zero when ch is a line break.  ASCII characters are looked up in
     ascii_linebreak[]; for everything else the bloom mask is consulted
     before Py_UNICODE_ISLINEBREAK() is called. */
  #define BLOOM_LINEBREAK(ch)                                             \
      ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
       (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
  155. Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
  156. {
  157. /* calculate simple bloom-style bitmask for a given unicode string */
  158. long mask;
  159. Py_ssize_t i;
  160. mask = 0;
  161. for (i = 0; i < len; i++)
  162. mask |= (1 << (ptr[i] & 0x1F));
  163. return mask;
  164. }
  165. Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
  166. {
  167. Py_ssize_t i;
  168. for (i = 0; i < setlen; i++)
  169. if (set[i] == chr)
  170. return 1;
  171. return 0;
  172. }
  /* Non-zero when chr is a member of set: the bloom mask is tested first so
     the linear search in unicode_member() only runs for likely hits.  The
     expansion is wrapped in parentheses so it stays a single operand when
     negated or combined with other operators. */
  #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
      (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
  175. /* --- Unicode Object ----------------------------------------------------- */
  176. static
  177. int unicode_resize(register PyUnicodeObject *unicode,
  178. Py_ssize_t length)
  179. {
  180. void *oldstr;
  181. /* Shortcut if there's nothing much to do. */
  182. if (unicode->length == length)
  183. goto reset;
  184. /* Resizing shared object (unicode_empty or single character
  185. objects) in-place is not allowed. Use PyUnicode_Resize()
  186. instead ! */
  187. if (unicode == unicode_empty ||
  188. (unicode->length == 1 &&
  189. unicode->str[0] < 256U &&
  190. unicode_latin1[unicode->str[0]] == unicode)) {
  191. PyErr_SetString(PyExc_SystemError,
  192. "can't resize shared unicode objects");
  193. return -1;
  194. }
  195. /* We allocate one more byte to make sure the string is Ux0000 terminated.
  196. The overallocation is also used by fastsearch, which assumes that it's
  197. safe to look at str[length] (without making any assumptions about what
  198. it contains). */
  199. oldstr = unicode->str;
  200. unicode->str = PyObject_REALLOC(unicode->str,
  201. sizeof(Py_UNICODE) * (length + 1));
  202. if (!unicode->str) {
  203. unicode->str = (Py_UNICODE *)oldstr;
  204. PyErr_NoMemory();
  205. return -1;
  206. }
  207. unicode->str[length] = 0;
  208. unicode->length = length;
  209. reset:
  210. /* Reset the object caches */
  211. if (unicode->defenc) {
  212. Py_DECREF(unicode->defenc);
  213. unicode->defenc = NULL;
  214. }
  215. unicode->hash = -1;
  216. return 0;
  217. }
  218. /* We allocate one more byte to make sure the string is
  219. Ux0000 terminated -- XXX is this needed ?
  220. XXX This allocator could further be enhanced by assuring that the
  221. free list never reduces its size below 1.
  222. */
  223. static
  224. PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
  225. {
  226. register PyUnicodeObject *unicode;
  227. /* Optimization for empty strings */
  228. if (length == 0 && unicode_empty != NULL) {
  229. Py_INCREF(unicode_empty);
  230. return unicode_empty;
  231. }
  232. /* Ensure we won't overflow the size. */
  233. if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
  234. return (PyUnicodeObject *)PyErr_NoMemory();
  235. }
  236. /* Unicode freelist & memory allocation */
  237. if (free_list) {
  238. unicode = free_list;
  239. free_list = *(PyUnicodeObject **)unicode;
  240. numfree--;
  241. if (unicode->str) {
  242. /* Keep-Alive optimization: we only upsize the buffer,
  243. never downsize it. */
  244. if ((unicode->length < length) &&
  245. unicode_resize(unicode, length) < 0) {
  246. PyObject_DEL(unicode->str);
  247. unicode->str = NULL;
  248. }
  249. }
  250. else {
  251. size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
  252. unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
  253. }
  254. PyObject_INIT(unicode, &PyUnicode_Type);
  255. }
  256. else {
  257. size_t new_size;
  258. unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
  259. if (unicode == NULL)
  260. return NULL;
  261. new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
  262. unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
  263. }
  264. if (!unicode->str) {
  265. PyErr_NoMemory();
  266. goto onError;
  267. }
  268. /* Initialize the first element to guard against cases where
  269. * the caller fails before initializing str -- unicode_resize()
  270. * reads str[0], and the Keep-Alive optimization can keep memory
  271. * allocated for str alive across a call to unicode_dealloc(unicode).
  272. * We don't want unicode_resize to read uninitialized memory in
  273. * that case.
  274. */
  275. unicode->str[0] = 0;
  276. unicode->str[length] = 0;
  277. unicode->length = length;
  278. unicode->hash = -1;
  279. unicode->defenc = NULL;
  280. return unicode;
  281. onError:
  282. /* XXX UNREF/NEWREF interface should be more symmetrical */
  283. _Py_DEC_REFTOTAL;
  284. _Py_ForgetReference((PyObject *)unicode);
  285. PyObject_Del(unicode);
  286. return NULL;
  287. }
  288. static
  289. void unicode_dealloc(register PyUnicodeObject *unicode)
  290. {
  291. if (PyUnicode_CheckExact(unicode) &&
  292. numfree < PyUnicode_MAXFREELIST) {
  293. /* Keep-Alive optimization */
  294. if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
  295. PyObject_DEL(unicode->str);
  296. unicode->str = NULL;
  297. unicode->length = 0;
  298. }
  299. if (unicode->defenc) {
  300. Py_DECREF(unicode->defenc);
  301. unicode->defenc = NULL;
  302. }
  303. /* Add to free list */
  304. *(PyUnicodeObject **)unicode = free_list;
  305. free_list = unicode;
  306. numfree++;
  307. }
  308. else {
  309. PyObject_DEL(unicode->str);
  310. Py_XDECREF(unicode->defenc);
  311. Py_TYPE(unicode)->tp_free((PyObject *)unicode);
  312. }
  313. }
  314. static
  315. int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
  316. {
  317. register PyUnicodeObject *v;
  318. /* Argument checks */
  319. if (unicode == NULL) {
  320. PyErr_BadInternalCall();
  321. return -1;
  322. }
  323. v = *unicode;
  324. if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
  325. PyErr_BadInternalCall();
  326. return -1;
  327. }
  328. /* Resizing unicode_empty and single character objects is not
  329. possible since these are being shared. We simply return a fresh
  330. copy with the same Unicode content. */
  331. if (v->length != length &&
  332. (v == unicode_empty || v->length == 1)) {
  333. PyUnicodeObject *w = _PyUnicode_New(length);
  334. if (w == NULL)
  335. return -1;
  336. Py_UNICODE_COPY(w->str, v->str,
  337. length < v->length ? length : v->length);
  338. Py_DECREF(*unicode);
  339. *unicode = w;
  340. return 0;
  341. }
  342. /* Note that we don't have to modify *unicode for unshared Unicode
  343. objects, since we can modify them in-place. */
  344. return unicode_resize(v, length);
  345. }
  346. int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
  347. {
  348. return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
  349. }
  350. PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
  351. Py_ssize_t size)
  352. {
  353. PyUnicodeObject *unicode;
  354. /* If the Unicode data is known at construction time, we can apply
  355. some optimizations which share commonly used objects. */
  356. if (u != NULL) {
  357. /* Optimization for empty strings */
  358. if (size == 0 && unicode_empty != NULL) {
  359. Py_INCREF(unicode_empty);
  360. return (PyObject *)unicode_empty;
  361. }
  362. /* Single character Unicode objects in the Latin-1 range are
  363. shared when using this constructor */
  364. if (size == 1 && *u < 256) {
  365. unicode = unicode_latin1[*u];
  366. if (!unicode) {
  367. unicode = _PyUnicode_New(1);
  368. if (!unicode)
  369. return NULL;
  370. unicode->str[0] = *u;
  371. unicode_latin1[*u] = unicode;
  372. }
  373. Py_INCREF(unicode);
  374. return (PyObject *)unicode;
  375. }
  376. }
  377. unicode = _PyUnicode_New(size);
  378. if (!unicode)
  379. return NULL;
  380. /* Copy the Unicode data into the new object */
  381. if (u != NULL)
  382. Py_UNICODE_COPY(unicode->str, u, size);
  383. return (PyObject *)unicode;
  384. }
  385. PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
  386. {
  387. PyUnicodeObject *unicode;
  388. if (size < 0) {
  389. PyErr_SetString(PyExc_SystemError,
  390. "Negative size passed to PyUnicode_FromStringAndSize");
  391. return NULL;
  392. }
  393. /* If the Unicode data is known at construction time, we can apply
  394. some optimizations which share commonly used objects.
  395. Also, this means the input must be UTF-8, so fall back to the
  396. UTF-8 decoder at the end. */
  397. if (u != NULL) {
  398. /* Optimization for empty strings */
  399. if (size == 0 && unicode_empty != NULL) {
  400. Py_INCREF(unicode_empty);
  401. return (PyObject *)unicode_empty;
  402. }
  403. /* Single characters are shared when using this constructor.
  404. Restrict to ASCII, since the input must be UTF-8. */
  405. if (size == 1 && Py_CHARMASK(*u) < 128) {
  406. unicode = unicode_latin1[Py_CHARMASK(*u)];
  407. if (!unicode) {
  408. unicode = _PyUnicode_New(1);
  409. if (!unicode)
  410. return NULL;
  411. unicode->str[0] = Py_CHARMASK(*u);
  412. unicode_latin1[Py_CHARMASK(*u)] = unicode;
  413. }
  414. Py_INCREF(unicode);
  415. return (PyObject *)unicode;
  416. }
  417. return PyUnicode_DecodeUTF8(u, size, NULL);
  418. }
  419. unicode = _PyUnicode_New(size);
  420. if (!unicode)
  421. return NULL;
  422. return (PyObject *)unicode;
  423. }
  424. PyObject *PyUnicode_FromString(const char *u)
  425. {
  426. size_t size = strlen(u);
  427. if (size > PY_SSIZE_T_MAX) {
  428. PyErr_SetString(PyExc_OverflowError, "input too long");
  429. return NULL;
  430. }
  431. return PyUnicode_FromStringAndSize(u, size);
  432. }
  433. #ifdef HAVE_WCHAR_H
  434. PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
  435. Py_ssize_t size)
  436. {
  437. PyUnicodeObject *unicode;
  438. if (w == NULL) {
  439. PyErr_BadInternalCall();
  440. return NULL;
  441. }
  442. unicode = _PyUnicode_New(size);
  443. if (!unicode)
  444. return NULL;
  445. /* Copy the wchar_t data into the new object */
  446. #ifdef HAVE_USABLE_WCHAR_T
  447. memcpy(unicode->str, w, size * sizeof(wchar_t));
  448. #else
  449. {
  450. register Py_UNICODE *u;
  451. register Py_ssize_t i;
  452. u = PyUnicode_AS_UNICODE(unicode);
  453. for (i = size; i > 0; i--)
  454. *u++ = *w++;
  455. }
  456. #endif
  457. return (PyObject *)unicode;
  458. }
  459. static void
  460. makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
  461. {
  462. *fmt++ = '%';
  463. if (width) {
  464. if (zeropad)
  465. *fmt++ = '0';
  466. fmt += sprintf(fmt, "%d", width);
  467. }
  468. if (precision)
  469. fmt += sprintf(fmt, ".%d", precision);
  470. if (longflag)
  471. *fmt++ = 'l';
  472. else if (size_tflag) {
  473. char *f = PY_FORMAT_SIZE_T;
  474. while (*f)
  475. *fmt++ = *f++;
  476. }
  477. *fmt++ = c;
  478. *fmt = '\0';
  479. }
  480. #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
  481. PyObject *
  482. PyUnicode_FromFormatV(const char *format, va_list vargs)
  483. {
  484. va_list count;
  485. Py_ssize_t callcount = 0;
  486. PyObject **callresults = NULL;
  487. PyObject **callresult = NULL;
  488. Py_ssize_t n = 0;
  489. int width = 0;
  490. int precision = 0;
  491. int zeropad;
  492. const char* f;
  493. Py_UNICODE *s;
  494. PyObject *string;
  495. /* used by sprintf */
  496. char buffer[21];
  497. /* use abuffer instead of buffer, if we need more space
  498. * (which can happen if there's a format specifier with width). */
  499. char *abuffer = NULL;
  500. char *realbuffer;
  501. Py_ssize_t abuffersize = 0;
  502. char fmt[60]; /* should be enough for %0width.precisionld */
  503. const char *copy;
  504. #ifdef VA_LIST_IS_ARRAY
  505. Py_MEMCPY(count, vargs, sizeof(va_list));
  506. #else
  507. #ifdef __va_copy
  508. __va_copy(count, vargs);
  509. #else
  510. count = vargs;
  511. #endif
  512. #endif
  513. /* step 1: count the number of %S/%R/%s format specifications
  514. * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
  515. * objects once during step 3 and put the result in an array) */
  516. for (f = format; *f; f++) {
  517. if (*f == '%') {
  518. if (*(f+1)=='%')
  519. continue;
  520. if (*(f+1)=='S' || *(f+1)=='R')
  521. ++callcount;
  522. while (isdigit((unsigned)*f))
  523. width = (width*10) + *f++ - '0';
  524. while (*++f && *f != '%' && !isalpha((unsigned)*f))
  525. ;
  526. if (*f == 's')
  527. ++callcount;
  528. }
  529. }
  530. /* step 2: allocate memory for the results of
  531. * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
  532. if (callcount) {
  533. callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
  534. if (!callresults) {
  535. PyErr_NoMemory();
  536. return NULL;
  537. }
  538. callresult = callresults;
  539. }
  540. /* step 3: figure out how large a buffer we need */
  541. for (f = format; *f; f++) {
  542. if (*f == '%') {
  543. const char* p = f;
  544. width = 0;
  545. while (isdigit((unsigned)*f))
  546. width = (width*10) + *f++ - '0';
  547. while (*++f && *f != '%' && !isalpha((unsigned)*f))
  548. ;
  549. /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
  550. * they don't affect the amount of space we reserve.
  551. */
  552. if ((*f == 'l' || *f == 'z') &&
  553. (f[1] == 'd' || f[1] == 'u'))
  554. ++f;
  555. switch (*f) {
  556. case 'c':
  557. (void)va_arg(count, int);
  558. /* fall through... */
  559. case '%':
  560. n++;
  561. break;
  562. case 'd': case 'u': case 'i': case 'x':
  563. (void) va_arg(count, int);
  564. /* 20 bytes is enough to hold a 64-bit
  565. integer. Decimal takes the most space.
  566. This isn't enough for octal.
  567. If a width is specified we need more
  568. (which we allocate later). */
  569. if (width < 20)
  570. width = 20;
  571. n += width;
  572. if (abuffersize < width)
  573. abuffersize = width;
  574. break;
  575. case 's':
  576. {
  577. /* UTF-8 */
  578. unsigned char *s = va_arg(count, unsigned char*);
  579. PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
  580. if (!str)
  581. goto fail;
  582. n += PyUnicode_GET_SIZE(str);
  583. /* Remember the str and switch to the next slot */
  584. *callresult++ = str;
  585. break;
  586. }
  587. case 'U':
  588. {
  589. PyObject *obj = va_arg(count, PyObject *);
  590. assert(obj && PyUnicode_Check(obj));
  591. n += PyUnicode_GET_SIZE(obj);
  592. break;
  593. }
  594. case 'V':
  595. {
  596. PyObject *obj = va_arg(count, PyObject *);
  597. const char *str = va_arg(count, const char *);
  598. assert(obj || str);
  599. assert(!obj || PyUnicode_Check(obj));
  600. if (obj)
  601. n += PyUnicode_GET_SIZE(obj);
  602. else
  603. n += strlen(str);
  604. break;
  605. }
  606. case 'S':
  607. {
  608. PyObject *obj = va_arg(count, PyObject *);
  609. PyObject *str;
  610. assert(obj);
  611. str = PyObject_Str(obj);
  612. if (!str)
  613. goto fail;
  614. n += PyUnicode_GET_SIZE(str);
  615. /* Remember the str and switch to the next slot */
  616. *callresult++ = str;
  617. break;
  618. }
  619. case 'R':
  620. {
  621. PyObject *obj = va_arg(count, PyObject *);
  622. PyObject *repr;
  623. assert(obj);
  624. repr = PyObject_Repr(obj);
  625. if (!repr)
  626. goto fail;
  627. n += PyUnicode_GET_SIZE(repr);
  628. /* Remember the repr and switch to the next slot */
  629. *callresult++ = repr;
  630. break;
  631. }
  632. case 'p':
  633. (void) va_arg(count, int);
  634. /* maximum 64-bit pointer representation:
  635. * 0xffffffffffffffff
  636. * so 19 characters is enough.
  637. * XXX I count 18 -- what's the extra for?
  638. */
  639. n += 19;
  640. break;
  641. default:
  642. /* if we stumble upon an unknown
  643. formatting code, copy the rest of
  644. the format string to the output
  645. string. (we cannot just skip the
  646. code, since there's no way to know
  647. what's in the argument list) */
  648. n += strlen(p);
  649. goto expand;
  650. }
  651. } else
  652. n++;
  653. }
  654. expand:
  655. if (abuffersize > 20) {
  656. abuffer = PyObject_Malloc(abuffersize);
  657. if (!abuffer) {
  658. PyErr_NoMemory();
  659. goto fail;
  660. }
  661. realbuffer = abuffer;
  662. }
  663. else
  664. realbuffer = buffer;
  665. /* step 4: fill the buffer */
  666. /* Since we've analyzed how much space we need for the worst case,
  667. we don't have to resize the string.
  668. There can be no errors beyond this point. */
  669. string = PyUnicode_FromUnicode(NULL, n);
  670. if (!string)
  671. goto fail;
  672. s = PyUnicode_AS_UNICODE(string);
  673. callresult = callresults;
  674. for (f = format; *f; f++) {
  675. if (*f == '%') {
  676. const char* p = f++;
  677. int longflag = 0;
  678. int size_tflag = 0;
  679. zeropad = (*f == '0');
  680. /* parse the width.precision part */
  681. width = 0;
  682. while (isdigit((unsigned)*f))
  683. width = (width*10) + *f++ - '0';
  684. precision = 0;
  685. if (*f == '.') {
  686. f++;
  687. while (isdigit((unsigned)*f))
  688. precision = (precision*10) + *f++ - '0';
  689. }
  690. /* handle the long flag, but only for %ld and %lu.
  691. others can be added when necessary. */
  692. if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
  693. longflag = 1;
  694. ++f;
  695. }
  696. /* handle the size_t flag. */
  697. if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
  698. size_tflag = 1;
  699. ++f;
  700. }
  701. switch (*f) {
  702. case 'c':
  703. *s++ = va_arg(vargs, int);
  704. break;
  705. case 'd':
  706. makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
  707. if (longflag)
  708. sprintf(realbuffer, fmt, va_arg(vargs, long));
  709. else if (size_tflag)
  710. sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
  711. else
  712. sprintf(realbuffer, fmt, va_arg(vargs, int));
  713. appendstring(realbuffer);
  714. break;
  715. case 'u':
  716. makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
  717. if (longflag)
  718. sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
  719. else if (size_tflag)
  720. sprintf(realbuffer, fmt, va_arg(vargs, size_t));
  721. else
  722. sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
  723. appendstring(realbuffer);
  724. break;
  725. case 'i':
  726. makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
  727. sprintf(realbuffer, fmt, va_arg(vargs, int));
  728. appendstring(realbuffer);
  729. break;
  730. case 'x':
  731. makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
  732. sprintf(realbuffer, fmt, va_arg(vargs, int));
  733. appendstring(realbuffer);
  734. break;
  735. case 's':
  736. {
  737. /* unused, since we already have the result */
  738. (void) va_arg(vargs, char *);
  739. Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
  740. PyUnicode_GET_SIZE(*callresult));
  741. s += PyUnicode_GET_SIZE(*callresult);
  742. /* We're done with the unicode()/repr() => forget it */
  743. Py_DECREF(*callresult);
  744. /* switch to next unicode()/repr() result */
  745. ++callresult;
  746. break;
  747. }
  748. case 'U':
  749. {
  750. PyObject *obj = va_arg(vargs, PyObject *);
  751. Py_ssize_t size = PyUnicode_GET_SIZE(obj);
  752. Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
  753. s += size;
  754. break;
  755. }
  756. case 'V':
  757. {
  758. PyObject *obj = va_arg(vargs, PyObject *);
  759. const char *str = va_arg(vargs, const char *);
  760. if (obj) {
  761. Py_ssize_t size = PyUnicode_GET_SIZE(obj);
  762. Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
  763. s += size;
  764. } else {
  765. appendstring(str);
  766. }
  767. break;
  768. }
  769. case 'S':
  770. case 'R':
  771. {
  772. Py_UNICODE *ucopy;
  773. Py_ssize_t usize;
  774. Py_ssize_t upos;
  775. /* unused, since we already have the result */
  776. (void) va_arg(vargs, PyObject *);
  777. ucopy = PyUnicode_AS_UNICODE(*callresult);
  778. usize = PyUnicode_GET_SIZE(*callresult);
  779. for (upos = 0; upos<usize;)
  780. *s++ = ucopy[upos++];
  781. /* We're done with the unicode()/repr() => forget it */
  782. Py_DECREF(*callresult);
  783. /* switch to next unicode()/repr() result */
  784. ++callresult;
  785. break;
  786. }
  787. case 'p':
  788. sprintf(buffer, "%p", va_arg(vargs, void*));
  789. /* %p is ill-defined: ensure leading 0x. */
  790. if (buffer[1] == 'X')
  791. buffer[1] = 'x';
  792. else if (buffer[1] != 'x') {
  793. memmove(buffer+2, buffer, strlen(buffer)+1);
  794. buffer[0] = '0';
  795. buffer[1] = 'x';
  796. }
  797. appendstring(buffer);
  798. break;
  799. case '%':
  800. *s++ = '%';
  801. break;
  802. default:
  803. appendstring(p);
  804. goto end;
  805. }
  806. } else
  807. *s++ = *f;
  808. }
  809. end:
  810. if (callresults)
  811. PyObject_Free(callresults);
  812. if (abuffer)
  813. PyObject_Free(abuffer);
  814. PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
  815. return string;
  816. fail:
  817. if (callresults) {
  818. PyObject **callresult2 = callresults;
  819. while (callresult2 < callresult) {
  820. Py_DECREF(*callresult2);
  821. ++callresult2;
  822. }
  823. PyObject_Free(callresults);
  824. }
  825. if (abuffer)
  826. PyObject_Free(abuffer);
  827. return NULL;
  828. }
  829. #undef appendstring
  830. PyObject *
  831. PyUnicode_FromFormat(const char *format, ...)
  832. {
  833. PyObject* ret;
  834. va_list vargs;
  835. #ifdef HAVE_STDARG_PROTOTYPES
  836. va_start(vargs, format);
  837. #else
  838. va_start(vargs);
  839. #endif
  840. ret = PyUnicode_FromFormatV(format, vargs);
  841. va_end(vargs);
  842. return ret;
  843. }
  844. Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
  845. wchar_t *w,
  846. Py_ssize_t size)
  847. {
  848. if (unicode == NULL) {
  849. PyErr_BadInternalCall();
  850. return -1;
  851. }
  852. /* If possible, try to copy the 0-termination as well */
  853. if (size > PyUnicode_GET_SIZE(unicode))
  854. size = PyUnicode_GET_SIZE(unicode) + 1;
  855. #ifdef HAVE_USABLE_WCHAR_T
  856. memcpy(w, unicode->str, size * sizeof(wchar_t));
  857. #else
  858. {
  859. register Py_UNICODE *u;
  860. register Py_ssize_t i;
  861. u = PyUnicode_AS_UNICODE(unicode);
  862. for (i = size; i > 0; i--)
  863. *w++ = *u++;
  864. }
  865. #endif
  866. if (size > PyUnicode_GET_SIZE(unicode))
  867. return PyUnicode_GET_SIZE(unicode);
  868. else
  869. return size;
  870. }
  871. #endif
  872. PyObject *PyUnicode_FromOrdinal(int ordinal)
  873. {
  874. Py_UNICODE s[1];
  875. #ifdef Py_UNICODE_WIDE
  876. if (ordinal < 0 || ordinal > 0x10ffff) {
  877. PyErr_SetString(PyExc_ValueError,
  878. "unichr() arg not in range(0x110000) "
  879. "(wide Python build)");
  880. return NULL;
  881. }
  882. #else
  883. if (ordinal < 0 || ordinal > 0xffff) {
  884. PyErr_SetString(PyExc_ValueError,
  885. "unichr() arg not in range(0x10000) "
  886. "(narrow Python build)");
  887. return NULL;
  888. }
  889. #endif
  890. s[0] = (Py_UNICODE)ordinal;
  891. return PyUnicode_FromUnicode(s, 1);
  892. }
  893. PyObject *PyUnicode_FromObject(register PyObject *obj)
  894. {
  895. /* XXX Perhaps we should make this API an alias of
  896. PyObject_Unicode() instead ?! */
  897. if (PyUnicode_CheckExact(obj)) {
  898. Py_INCREF(obj);
  899. return obj;
  900. }
  901. if (PyUnicode_Check(obj)) {
  902. /* For a Unicode subtype that's not a Unicode object,
  903. return a true Unicode object with the same data. */
  904. return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
  905. PyUnicode_GET_SIZE(obj));
  906. }
  907. return PyUnicode_FromEncodedObject(obj, NULL, "strict");
  908. }
  909. PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
  910. const char *encoding,
  911. const char *errors)
  912. {
  913. const char *s = NULL;
  914. Py_ssize_t len;
  915. PyObject *v;
  916. if (obj == NULL) {
  917. PyErr_BadInternalCall();
  918. return NULL;
  919. }
  920. #if 0
  921. /* For b/w compatibility we also accept Unicode objects provided
  922. that no encodings is given and then redirect to
  923. PyObject_Unicode() which then applies the additional logic for
  924. Unicode subclasses.
  925. NOTE: This API should really only be used for object which
  926. represent *encoded* Unicode !
  927. */
  928. if (PyUnicode_Check(obj)) {
  929. if (encoding) {
  930. PyErr_SetString(PyExc_TypeError,
  931. "decoding Unicode is not supported");
  932. return NULL;
  933. }
  934. return PyObject_Unicode(obj);
  935. }
  936. #else
  937. if (PyUnicode_Check(obj)) {
  938. PyErr_SetString(PyExc_TypeError,
  939. "decoding Unicode is not supported");
  940. return NULL;
  941. }
  942. #endif
  943. /* Coerce object */
  944. if (PyString_Check(obj)) {
  945. s = PyString_AS_STRING(obj);
  946. len = PyString_GET_SIZE(obj);
  947. }
  948. else if (PyByteArray_Check(obj)) {
  949. /* Python 2.x specific */
  950. PyErr_Format(PyExc_TypeError,
  951. "decoding bytearray is not supported");
  952. return NULL;
  953. }
  954. else if (PyObject_AsCharBuffer(obj, &s, &len)) {
  955. /* Overwrite the error message with something more useful in
  956. case of a TypeError. */
  957. if (PyErr_ExceptionMatches(PyExc_TypeError))
  958. PyErr_Format(PyExc_TypeError,
  959. "coercing to Unicode: need string or buffer, "
  960. "%.80s found",
  961. Py_TYPE(obj)->tp_name);
  962. goto onError;
  963. }
  964. /* Convert to Unicode */
  965. if (len == 0) {
  966. Py_INCREF(unicode_empty);
  967. v = (PyObject *)unicode_empty;
  968. }
  969. else
  970. v = PyUnicode_Decode(s, len, encoding, errors);
  971. return v;
  972. onError:
  973. return NULL;
  974. }
  975. PyObject *PyUnicode_Decode(const char *s,
  976. Py_ssize_t size,
  977. const char *encoding,
  978. const char *errors)
  979. {
  980. PyObject *buffer = NULL, *unicode;
  981. if (encoding == NULL)
  982. encoding = PyUnicode_GetDefaultEncoding();
  983. /* Shortcuts for common default encodings */
  984. if (strcmp(encoding, "utf-8") == 0)
  985. return PyUnicode_DecodeUTF8(s, size, errors);
  986. else if (strcmp(encoding, "latin-1") == 0)
  987. return PyUnicode_DecodeLatin1(s, size, errors);
  988. #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
  989. else if (strcmp(encoding, "mbcs") == 0)
  990. return PyUnicode_DecodeMBCS(s, size, errors);
  991. #endif
  992. else if (strcmp(encoding, "ascii") == 0)
  993. return PyUnicode_DecodeASCII(s, size, errors);
  994. /* Decode via the codec registry */
  995. buffer = PyBuffer_FromMemory((void *)s, size);
  996. if (buffer == NULL)
  997. goto onError;
  998. unicode = PyCodec_Decode(buffer, encoding, errors);
  999. if (unicode == NULL)
  1000. goto onError;
  1001. if (!PyUnicode_Check(unicode)) {
  1002. PyErr_Format(PyExc_TypeError,
  1003. "decoder did not return an unicode object (type=%.400s)",
  1004. Py_TYPE(unicode)->tp_name);
  1005. Py_DECREF(unicode);
  1006. goto onError;
  1007. }
  1008. Py_DECREF(buffer);
  1009. return unicode;
  1010. onError:
  1011. Py_XDECREF(buffer);
  1012. return NULL;
  1013. }
  1014. PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
  1015. const char *encoding,
  1016. const char *errors)
  1017. {
  1018. PyObject *v;
  1019. if (!PyUnicode_Check(unicode)) {
  1020. PyErr_BadArgument();
  1021. goto onError;
  1022. }
  1023. if (encoding == NULL)
  1024. encoding = PyUnicode_GetDefaultEncoding();
  1025. /* Decode via the codec registry */
  1026. v = PyCodec_Decode(unicode, encoding, errors);
  1027. if (v == NULL)
  1028. goto onError;
  1029. return v;
  1030. onError:
  1031. return NULL;
  1032. }
  1033. PyObject *PyUnicode_Encode(const Py_UNICODE *s,
  1034. Py_ssize_t size,
  1035. const char *encoding,
  1036. const char *errors)
  1037. {
  1038. PyObject *v, *unicode;
  1039. unicode = PyUnicode_FromUnicode(s, size);
  1040. if (unicode == NULL)
  1041. return NULL;
  1042. v = PyUnicode_AsEncodedString(unicode, encoding, errors);
  1043. Py_DECREF(unicode);
  1044. return v;
  1045. }
  1046. PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
  1047. const char *encoding,
  1048. const char *errors)
  1049. {
  1050. PyObject *v;
  1051. if (!PyUnicode_Check(unicode)) {
  1052. PyErr_BadArgument();
  1053. goto onError;
  1054. }
  1055. if (encoding == NULL)
  1056. encoding = PyUnicode_GetDefaultEncoding();
  1057. /* Encode via the codec registry */
  1058. v = PyCodec_Encode(unicode, encoding, errors);
  1059. if (v == NULL)
  1060. goto onError;
  1061. return v;
  1062. onError:
  1063. return NULL;
  1064. }
  1065. PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
  1066. const char *encoding,
  1067. const char *errors)
  1068. {
  1069. PyObject *v;
  1070. if (!PyUnicode_Check(unicode)) {
  1071. PyErr_BadArgument();
  1072. goto onError;
  1073. }
  1074. if (encoding == NULL)
  1075. encoding = PyUnicode_GetDefaultEncoding();
  1076. /* Shortcuts for common default encodings */
  1077. if (errors == NULL) {
  1078. if (strcmp(encoding, "utf-8") == 0)
  1079. return PyUnicode_AsUTF8String(unicode);
  1080. else if (strcmp(encoding, "latin-1") == 0)
  1081. return PyUnicode_AsLatin1String(unicode);
  1082. #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
  1083. else if (strcmp(encoding, "mbcs") == 0)
  1084. return PyUnicode_AsMBCSString(unicode);
  1085. #endif
  1086. else if (strcmp(encoding, "ascii") == 0)
  1087. return PyUnicode_AsASCIIString(unicode);
  1088. }
  1089. /* Encode via the codec registry */
  1090. v = PyCodec_Encode(unicode, encoding, errors);
  1091. if (v == NULL)
  1092. goto onError;
  1093. if (!PyString_Check(v)) {
  1094. PyErr_Format(PyExc_TypeError,
  1095. "encoder did not return a string object (type=%.400s)",
  1096. Py_TYPE(v)->tp_name);
  1097. Py_DECREF(v);
  1098. goto onError;
  1099. }
  1100. return v;
  1101. onError:
  1102. return NULL;
  1103. }
  1104. PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
  1105. const char *errors)
  1106. {
  1107. PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
  1108. if (v)
  1109. return v;
  1110. v = PyUnicode_AsEncodedString(unicode, NULL, errors);
  1111. if (v && errors == NULL)
  1112. ((PyUnicodeObject *)unicode)->defenc = v;
  1113. return v;
  1114. }
  1115. Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
  1116. {
  1117. if (!PyUnicode_Check(unicode)) {
  1118. PyErr_BadArgument();
  1119. goto onError;
  1120. }
  1121. return PyUnicode_AS_UNICODE(unicode);
  1122. onError:
  1123. return NULL;
  1124. }
  1125. Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
  1126. {
  1127. if (!PyUnicode_Check(unicode)) {
  1128. PyErr_BadArgument();
  1129. goto onError;
  1130. }
  1131. return PyUnicode_GET_SIZE(unicode);
  1132. onError:
  1133. return -1;
  1134. }
  1135. const char *PyUnicode_GetDefaultEncoding(void)
  1136. {
  1137. return unicode_default_encoding;
  1138. }
  1139. int PyUnicode_SetDefaultEncoding(const char *encoding)
  1140. {
  1141. PyObject *v;
  1142. /* Make sure the encoding is valid. As side effect, this also
  1143. loads the encoding into the codec registry cache. */
  1144. v = _PyCodec_Lookup(encoding);
  1145. if (v == NULL)
  1146. goto onError;
  1147. Py_DECREF(v);
  1148. strncpy(unicode_default_encoding,
  1149. encoding,
  1150. sizeof(unicode_default_encoding));
  1151. return 0;
  1152. onError:
  1153. return -1;
  1154. }
  1155. /* error handling callback helper:
  1156. build arguments, call the callback and check the arguments,
  1157. if no exception occurred, copy the replacement to the output
  1158. and adjust various state variables.
  1159. return 0 on success, -1 on error
  1160. */
  1161. static
  1162. int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
  1163. const char *encoding, const char *reason,
  1164. const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
  1165. Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
  1166. PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
  1167. {
  1168. static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
  1169. PyObject *restuple = NULL;
  1170. PyObject *repunicode = NULL;
  1171. Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
  1172. Py_ssize_t requiredsize;
  1173. Py_ssize_t newpos;
  1174. Py_UNICODE *repptr;
  1175. Py_ssize_t repsize;
  1176. int res = -1;
  1177. if (*errorHandler == NULL) {
  1178. *errorHandler = PyCodec_LookupError(errors);
  1179. if (*errorHandler == NULL)
  1180. goto onError;
  1181. }
  1182. if (*exceptionObject == NULL) {
  1183. *exceptionObject = PyUnicodeDecodeError_Create(
  1184. encoding, input, insize, *startinpos, *endinpos, reason);
  1185. if (*exceptionObject == NULL)
  1186. goto onError;
  1187. }
  1188. else {
  1189. if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
  1190. goto onError;
  1191. if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
  1192. goto onError;
  1193. if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
  1194. goto onError;
  1195. }
  1196. restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
  1197. if (restuple == NULL)
  1198. goto onError;
  1199. if (!PyTuple_Check(restuple)) {
  1200. PyErr_SetString(PyExc_TypeError, &argparse[4]);
  1201. goto onError;
  1202. }
  1203. if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
  1204. goto onError;
  1205. if (newpos<0)
  1206. newpos = insize+newpos;
  1207. if (newpos<0 || newpos>insize) {
  1208. PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
  1209. goto onError;
  1210. }
  1211. /* need more space? (at least enough for what we
  1212. have+the replacement+the rest of the string (starting
  1213. at the new input position), so we won't have to check space
  1214. when there are no errors in the rest of the string) */
  1215. repptr = PyUnicode_AS_UNICODE(repunicode);
  1216. repsize = PyUnicode_GET_SIZE(repunicode);
  1217. requiredsize = *outpos + repsize + insize-newpos;
  1218. if (requiredsize > outsize) {
  1219. if (requiredsize<2*outsize)
  1220. requiredsize = 2*outsize;
  1221. if (_PyUnicode_Resize(output, requiredsize) < 0)
  1222. goto onError;
  1223. *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
  1224. }
  1225. *endinpos = newpos;
  1226. *inptr = input + newpos;
  1227. Py_UNICODE_COPY(*outptr, repptr, repsize);
  1228. *outptr += repsize;
  1229. *outpos += repsize;
  1230. /* we made it! */
  1231. res = 0;
  1232. onError:
  1233. Py_XDECREF(restuple);
  1234. return res;
  1235. }
  /* --- UTF-7 Codec -------------------------------------------------------- */

  /* see RFC2152 for details */

  /* Classification of the 128 ASCII code points for UTF-7, indexed by
     character code.  Each entry tells whether the character is special,
     i.e. cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

     Each row below covers 16 consecutive code points. */
  static
  char utf7_special[128] = {
      /* 0x00 - 0x0F */
      1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
      /* 0x10 - 0x1F */
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x20 - 0x2F */
      2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
      /* 0x30 - 0x3F */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
      /* 0x40 - 0x4F */
      3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x50 - 0x5F */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
      /* 0x60 - 0x6F */
      3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x70 - 0x7F */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
  };
  /* SPECIAL(c, encodeO, encodeWS): true when character c cannot be written
     directly in UTF-7.  Anything outside 1..127 is special; within that
     range the utf7_special table decides, with whitespace (class 2) and
     RFC2152 Set O (class 3) counted as special only when encodeWS /
     encodeO are true.

     Note: The comparison (c) <= 0 is a trick to work-around gcc
     warnings about the comparison always being false; since
     utf7_special[0] is 1, we can safely make that one comparison
     true */
  #define SPECIAL(c, encodeO, encodeWS) \
      ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
       (encodeWS && (utf7_special[(c)] == 2)) || \
       (encodeO && (utf7_special[(c)] == 3)))

  /* B64(n): base64 alphabet character for the low six bits of n. */
  #define B64(n) \
      ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

  /* B64CHAR(c): true when c is one of the 64 base64 alphabet characters.
     The test uses explicit ASCII ranges rather than isalnum(): isalnum()
     is locale dependent (it may accept non-ASCII bytes) and is undefined
     for values that do not fit in an unsigned char, which Py_UNICODE
     arguments can exceed.  The argument is evaluated more than once. */
  #define B64CHAR(c) \
      (((c) >= 'A' && (c) <= 'Z') || \
       ((c) >= 'a' && (c) <= 'z') || \
       ((c) >= '0' && (c) <= '9') || \
       (c) == '+' || (c) == '/')

  /* UB64(c): six-bit value of base64 character c (inverse of B64).  Only
     meaningful when B64CHAR(c) is true; relies on ASCII code values. */
  #define UB64(c) \
      ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
       (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
  /* ENCODE(out, ch, bits): while at least six bits are pending in ch,
     emit the topmost six of them through out as one base64 character and
     reduce bits by six.  out and bits are modified in place. */
  #define ENCODE(out, ch, bits)                                           \
      while (bits >= 6) {                                                 \
          *out++ = B64(ch >> (bits-6));                                   \
          bits -= 6;                                                      \
      }

  /* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
     pending in ch, take the topmost sixteen as one code unit and reduce
     bits by sixteen.  A code unit in 0xDC00..0xDFFF sets surrogate and
     jumps to the error path; the code unit that follows such an error is
     dropped.  Every other code unit is stored through out.

     The expansion assigns to a variable named errmsg and jumps to a label
     named utf7Error, so both must exist in the function using it. */
  #define DECODE(out, ch, bits, surrogate)                                \
      while (bits >= 16) {                                                \
          Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
          bits -= 16;                                                     \
          if (surrogate) {                                                \
              /* An error was already reported for the high surrogate,    \
                 so the low surrogate is not inspected any further */     \
              surrogate = 0;                                              \
          } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
              /* Part of a surrogate pair, which cannot be represented    \
                 in a single 16-bit character */                          \
              surrogate = 1;                                              \
              errmsg = "code pairs are not supported";                    \
              goto utf7Error;                                             \
          } else {                                                        \
              *out++ = outCh;                                             \
          }                                                               \
      }
  1293. PyObject *PyUnicode_DecodeUTF7(const char *s,
  1294. Py_ssize_t size,
  1295. const char *errors)
  1296. {
  1297. return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
  1298. }
  1299. PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
  1300. Py_ssize_t size,
  1301. const char *errors,
  1302. Py_ssize_t *consumed)
  1303. {
  1304. const char *starts = s;
  1305. Py_ssize_t startinpos;
  1306. Py_ssize_t endinpos;
  1307. Py_ssize_t outpos;
  1308. const char *e;
  1309. PyUnicodeObject *unicode;
  1310. Py_UNICODE *p;
  1311. const char *errmsg = "";
  1312. int inShift = 0;
  1313. unsigned int bitsleft = 0;
  1314. unsigned long charsleft = 0;
  1315. int surrogate = 0;
  1316. PyObject *errorHandler = NULL;
  1317. PyObject *exc = NULL;
  1318. unicode = _PyUnicode_New(size);
  1319. if (!unicode)
  1320. return NULL;
  1321. if (size == 0) {
  1322. if (consumed)
  1323. *consumed = 0;
  1324. return (PyObject *)unicode;
  1325. }
  1326. p = unicode->str;
  1327. e = s + size;
  1328. while (s < e) {
  1329. Py_UNICODE ch;
  1330. restart:
  1331. ch = (unsigned char) *s;
  1332. if (inShift) {
  1333. if ((ch == '-') || !B64CHAR(ch)) {
  1334. inShift = 0;
  1335. s++;
  1336. /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
  1337. if (bitsleft >= 6) {
  1338. /* The shift sequence has a partial character in it. If
  1339. bitsleft < 6 then we could just classify it as padding
  1340. but that is not the case here */
  1341. errmsg = "partial character in shift sequence";
  1342. goto utf7Error;
  1343. }
  1344. /* According to RFC2152 the remaining bits should be zero. We
  1345. choose to signal an error/insert a replacement character
  1346. here so indicate the potential of a misencoded character. */
  1347. /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
  1348. if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
  1349. errmsg = "non-zero padding bits in shift sequence";
  1350. goto utf7Error;
  1351. }
  1352. if (ch == '-') {
  1353. if ((s < e) && (*(s) == '-')) {
  1354. *p++ = '-';
  1355. inShift = 1;
  1356. }
  1357. } else if (SPECIAL(ch,0,0)) {
  1358. errmsg = "unexpected special character";
  1359. goto utf7Error;
  1360. } else {
  1361. *p++ = ch;
  1362. }
  1363. } else {
  1364. charsleft = (charsleft << 6) | UB64(ch);
  1365. bitsleft += 6;
  1366. s++;
  1367. /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
  1368. }
  1369. }
  1370. else if ( ch == '+' ) {
  1371. startinpos = s-starts;
  1372. s++;
  1373. if (s < e && *s == '-') {
  1374. s++;
  1375. *p++ = '+';
  1376. } else
  1377. {
  1378. inShift = 1;
  1379. bitsleft = 0;
  1380. }
  1381. }
  1382. else if (SPECIAL(ch,0,0)) {
  1383. startinpos = s-starts;
  1384. errmsg = "unexpected special character";
  1385. s++;
  1386. goto utf7Error;
  1387. }
  1388. else {
  1389. *p++ = ch;
  1390. s++;
  1391. }
  1392. continue;
  1393. utf7Error:
  1394. outpos = p-PyUnicode_AS_UNICODE(unicode);
  1395. endinpos = s-starts;
  1396. if (unicode_decode_call_errorhandler(
  1397. errors, &errorHandler,
  1398. "utf7", errmsg,
  1399. starts, size, &startinpos, &endinpos, &exc, &s,
  1400. &unicode, &outpos, &p))
  1401. goto onError;
  1402. }
  1403. if (inShift && !consumed) {
  1404. outpos = p-PyUnicode_AS_UNICODE(unicode);
  1405. endinpos = size;
  1406. if (unicode_decode_call_errorhandler(
  1407. errors, &errorHandler,
  1408. "utf7", "unterminated shift sequence",
  1409. starts, size, &startinpos, &endinpos, &exc, &s,
  1410. &unicode, &outpos, &p))
  1411. goto onError;
  1412. if (s < e)
  1413. goto restart;
  1414. }
  1415. if (consumed) {
  1416. if(inShift)
  1417. *consumed = startinpos;
  1418. else
  1419. *consumed = s-starts;
  1420. }
  1421. if (_PyUnicode_Resiz

Large files are truncated, but you can click here to view the full file