PageRenderTime 96ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 1ms

/pypy/module/cpyext/test/_sre.c

https://bitbucket.org/kcr/pypy
C | 3908 lines | 2992 code | 556 blank | 360 comment | 613 complexity | 30e62d123b897db7a9341ac71625a5d9 MD5 | raw file
Possible License(s): Apache-2.0
  1. /*
  2. * Secret Labs' Regular Expression Engine
  3. *
  4. * regular expression matching engine
  5. *
  6. * partial history:
  7. * 1999-10-24 fl created (based on existing template matcher code)
  8. * 2000-03-06 fl first alpha, sort of
  9. * 2000-08-01 fl fixes for 1.6b1
  10. * 2000-08-07 fl use PyOS_CheckStack() if available
  11. * 2000-09-20 fl added expand method
  12. * 2001-03-20 fl lots of fixes for 2.1b2
  13. * 2001-04-15 fl export copyright as Python attribute, not global
  14. * 2001-04-28 fl added __copy__ methods (work in progress)
  15. * 2001-05-14 fl fixes for 1.5.2 compatibility
  16. * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
  17. * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
  18. * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
  19. * 2001-10-21 fl added sub/subn primitive
  20. * 2001-10-24 fl added finditer primitive (for 2.2 only)
  21. * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
  22. * 2002-11-09 fl fixed empty sub/subn return type
  23. * 2003-04-18 mvl fully support 4-byte codes
  24. * 2003-10-17 gn implemented non recursive scheme
  25. *
  26. * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
  27. *
  28. * This version of the SRE library can be redistributed under CNRI's
  29. * Python 1.6 license. For any other use, please contact Secret Labs
  30. * AB (info@pythonware.com).
  31. *
  32. * Portions of this engine have been developed in cooperation with
  33. * CNRI. Hewlett-Packard provided funding for 1.6 integration and
  34. * other compatibility work.
  35. */
  36. #ifndef SRE_RECURSIVE
  /* Version and copyright banner for the SRE engine. */
  static char copyright[] = " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
  39. #define PY_SSIZE_T_CLEAN
  40. #include "Python.h"
  41. #include "structmember.h" /* offsetof */
  42. #include "sre.h"
  43. #include <ctype.h>
  /* Module identity: SRE_MODULE is the name of this module, minus the
   * leading underscore.  It may be predefined by the build to override
   * the default. */
  #ifndef SRE_MODULE
  #define SRE_MODULE "sre"
  #endif
  #define SRE_PY_MODULE "re"

  /* defining this one enables tracing */
  #undef VERBOSE

  /* Unicode support (default under 1.6a1 and later): enabled for 1.6+
   * when the interpreter predates 2.2 or advertises Py_USING_UNICODE. */
  #if PY_VERSION_HEX >= 0x01060000 && \
      (PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE))
  #define HAVE_UNICODE
  #endif

  /* -------------------------------------------------------------------- */
  /* optional features */

  #define USE_FAST_SEARCH   /* enables fast searching */
  #undef USE_INLINE         /* aggressive inlining (always on for Visual C) */
  #undef USE_BUILTIN_COPY   /* copy/deepcopy handling (work in progress) */

  /* Interpreters older than 1.6 get PyObject_DEL mapped onto PyMem_DEL. */
  #if PY_VERSION_HEX < 0x01060000
  #define PyObject_DEL(op) PyMem_DEL((op))
  #endif
  68. /* -------------------------------------------------------------------- */
  /* LOCAL(type) is the declaration prefix used for the engine's
   * file-private helper functions. */
  #ifdef _MSC_VER
  #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
  #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
  /* fastest possible local call under MSVC */
  #define LOCAL(type) static __inline type __fastcall
  #else
  #ifdef USE_INLINE
  #define LOCAL(type) static inline type
  #else
  #define LOCAL(type) static type
  #endif
  #endif

  /* error codes (all negative) */
  #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
  #define SRE_ERROR_STATE -2 /* illegal state */
  #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
  #define SRE_ERROR_MEMORY -9 /* out of memory */
  #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */

  /* TRACE((fmt, ...)) forwards its parenthesised argument list to printf
   * when VERBOSE is defined and expands to nothing otherwise. */
  #ifndef VERBOSE
  #define TRACE(v)
  #else
  #define TRACE(v) printf v
  #endif
  90. /* -------------------------------------------------------------------- */
  91. /* search engine state */
  /* default character predicates (run sre_chars.py to regenerate tables) */

  /* Bit flags stored per character in sre_char_info[]. */
  #define SRE_DIGIT_MASK 1
  #define SRE_SPACE_MASK 2
  #define SRE_LINEBREAK_MASK 4
  #define SRE_ALNUM_MASK 8
  #define SRE_WORD_MASK 16

  /* FIXME: this assumes ASCII. create tables in init_sre() instead */

  /* Classification flags for code points 0..127, sixteen per row. */
  static char sre_char_info[128] = {
      /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
      /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x20 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
      /* 0x40 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
      /* 0x60 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
  };

  /* Lowercase mapping for code points 0..127: 'A'..'Z' map to 'a'..'z',
   * every other entry maps to itself. */
  static char sre_char_lower[128] = {
      /* 0x00 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      /* 0x10 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
      /* 0x20 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
      /* 0x30 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
      /* 0x40 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
                 109, 110, 111,
      /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
                 93, 94, 95,
      /* 0x60 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
                 109, 110, 111,
      /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
                 124, 125, 126, 127
  };

  /* Yields the masked flag bits of ch (non-zero when the flag is set) for
   * ch < 128, and 0 for anything larger.  Note that ch is evaluated twice. */
  #define SRE_CHAR_INFO_TEST(ch, mask) \
      ((ch) < 128 ? (sre_char_info[(ch)] & (mask)) : 0)

  #define SRE_IS_DIGIT(ch) SRE_CHAR_INFO_TEST((ch), SRE_DIGIT_MASK)
  #define SRE_IS_SPACE(ch) SRE_CHAR_INFO_TEST((ch), SRE_SPACE_MASK)
  #define SRE_IS_LINEBREAK(ch) SRE_CHAR_INFO_TEST((ch), SRE_LINEBREAK_MASK)
  #define SRE_IS_ALNUM(ch) SRE_CHAR_INFO_TEST((ch), SRE_ALNUM_MASK)
  #define SRE_IS_WORD(ch) SRE_CHAR_INFO_TEST((ch), SRE_WORD_MASK)
  125. static unsigned int sre_lower(unsigned int ch)
  126. {
  127. return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
  128. }
  /* locale-specific character predicates */

  /* True when ch fits in 0..255.  !(c & ~N) == (c < N+1) for any unsigned
   * c; this form avoids warnings when c's type supports only numbers
   * < N+1. */
  #define SRE_LOC_FITS_BYTE(ch) (!((ch) & ~255))

  /* Each predicate defers to the <ctype.h> classifier for values in
   * 0..255 and yields 0 for anything larger. */
  #define SRE_LOC_IS_DIGIT(ch) (SRE_LOC_FITS_BYTE(ch) ? isdigit((ch)) : 0)
  #define SRE_LOC_IS_SPACE(ch) (SRE_LOC_FITS_BYTE(ch) ? isspace((ch)) : 0)
  #define SRE_LOC_IS_ALNUM(ch) (SRE_LOC_FITS_BYTE(ch) ? isalnum((ch)) : 0)
  #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
  #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
  /* Lowercase a code point with the C library's tolower(); values of 256
   * and above are returned unchanged. */
  static unsigned int sre_lower_locale(unsigned int ch)
  {
      if (ch < 256)
          return (unsigned int)tolower((ch));
      return ch;
  }
  /* unicode-specific character predicates */
  #ifdef HAVE_UNICODE

  /* Each predicate narrows ch to Py_UNICODE and defers to the matching
   * Py_UNICODE_* classifier; WORD additionally accepts '_'. */
  #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
  #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
  #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
  #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
  #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

  /* Lowercase a code point through Py_UNICODE_TOLOWER. */
  static unsigned int sre_lower_unicode(unsigned int ch)
  {
      Py_UNICODE code = (Py_UNICODE)(ch);

      return (unsigned int) Py_UNICODE_TOLOWER(code);
  }

  #endif
  153. LOCAL(int)
  154. sre_category(SRE_CODE category, unsigned int ch)
  155. {
  156. switch (category) {
  157. case SRE_CATEGORY_DIGIT:
  158. return SRE_IS_DIGIT(ch);
  159. case SRE_CATEGORY_NOT_DIGIT:
  160. return !SRE_IS_DIGIT(ch);
  161. case SRE_CATEGORY_SPACE:
  162. return SRE_IS_SPACE(ch);
  163. case SRE_CATEGORY_NOT_SPACE:
  164. return !SRE_IS_SPACE(ch);
  165. case SRE_CATEGORY_WORD:
  166. return SRE_IS_WORD(ch);
  167. case SRE_CATEGORY_NOT_WORD:
  168. return !SRE_IS_WORD(ch);
  169. case SRE_CATEGORY_LINEBREAK:
  170. return SRE_IS_LINEBREAK(ch);
  171. case SRE_CATEGORY_NOT_LINEBREAK:
  172. return !SRE_IS_LINEBREAK(ch);
  173. case SRE_CATEGORY_LOC_WORD:
  174. return SRE_LOC_IS_WORD(ch);
  175. case SRE_CATEGORY_LOC_NOT_WORD:
  176. return !SRE_LOC_IS_WORD(ch);
  177. #if defined(HAVE_UNICODE)
  178. case SRE_CATEGORY_UNI_DIGIT:
  179. return SRE_UNI_IS_DIGIT(ch);
  180. case SRE_CATEGORY_UNI_NOT_DIGIT:
  181. return !SRE_UNI_IS_DIGIT(ch);
  182. case SRE_CATEGORY_UNI_SPACE:
  183. return SRE_UNI_IS_SPACE(ch);
  184. case SRE_CATEGORY_UNI_NOT_SPACE:
  185. return !SRE_UNI_IS_SPACE(ch);
  186. case SRE_CATEGORY_UNI_WORD:
  187. return SRE_UNI_IS_WORD(ch);
  188. case SRE_CATEGORY_UNI_NOT_WORD:
  189. return !SRE_UNI_IS_WORD(ch);
  190. case SRE_CATEGORY_UNI_LINEBREAK:
  191. return SRE_UNI_IS_LINEBREAK(ch);
  192. case SRE_CATEGORY_UNI_NOT_LINEBREAK:
  193. return !SRE_UNI_IS_LINEBREAK(ch);
  194. #else
  195. case SRE_CATEGORY_UNI_DIGIT:
  196. return SRE_IS_DIGIT(ch);
  197. case SRE_CATEGORY_UNI_NOT_DIGIT:
  198. return !SRE_IS_DIGIT(ch);
  199. case SRE_CATEGORY_UNI_SPACE:
  200. return SRE_IS_SPACE(ch);
  201. case SRE_CATEGORY_UNI_NOT_SPACE:
  202. return !SRE_IS_SPACE(ch);
  203. case SRE_CATEGORY_UNI_WORD:
  204. return SRE_LOC_IS_WORD(ch);
  205. case SRE_CATEGORY_UNI_NOT_WORD:
  206. return !SRE_LOC_IS_WORD(ch);
  207. case SRE_CATEGORY_UNI_LINEBREAK:
  208. return SRE_IS_LINEBREAK(ch);
  209. case SRE_CATEGORY_UNI_NOT_LINEBREAK:
  210. return !SRE_IS_LINEBREAK(ch);
  211. #endif
  212. }
  213. return 0;
  214. }
  215. /* helpers */
  216. static void
  217. data_stack_dealloc(SRE_STATE* state)
  218. {
  219. if (state->data_stack) {
  220. PyMem_FREE(state->data_stack);
  221. state->data_stack = NULL;
  222. }
  223. state->data_stack_size = state->data_stack_base = 0;
  224. }
  225. static int
  226. data_stack_grow(SRE_STATE* state, Py_ssize_t size)
  227. {
  228. Py_ssize_t minsize, cursize;
  229. minsize = state->data_stack_base+size;
  230. cursize = state->data_stack_size;
  231. if (cursize < minsize) {
  232. void* stack;
  233. cursize = minsize+minsize/4+1024;
  234. TRACE(("allocate/grow stack %d\n", cursize));
  235. stack = PyMem_REALLOC(state->data_stack, cursize);
  236. if (!stack) {
  237. data_stack_dealloc(state);
  238. return SRE_ERROR_MEMORY;
  239. }
  240. state->data_stack = (char *)stack;
  241. state->data_stack_size = cursize;
  242. }
  243. return 0;
  244. }
  /* generate 8-bit version */
  #define SRE_CHAR unsigned char
  #define SRE_AT sre_at
  #define SRE_CHARSET sre_charset
  #define SRE_COUNT sre_count
  #define SRE_INFO sre_info
  #define SRE_LITERAL_TEMPLATE sre_literal_template
  #define SRE_MATCH sre_match
  #define SRE_MATCH_CONTEXT sre_match_context
  #define SRE_SEARCH sre_search

  #ifdef HAVE_UNICODE

  /* Include this file into itself with SRE_RECURSIVE set, so that only the
   * engine section is compiled, using the 8-bit names defined above. */
  #define SRE_RECURSIVE
  #include "_sre.c"
  #undef SRE_RECURSIVE

  /* Drop the 8-bit names ... */
  #undef SRE_CHAR
  #undef SRE_AT
  #undef SRE_CHARSET
  #undef SRE_COUNT
  #undef SRE_INFO
  #undef SRE_LITERAL_TEMPLATE
  #undef SRE_MATCH
  #undef SRE_MATCH_CONTEXT
  #undef SRE_SEARCH

  /* generate 16-bit unicode version */
  #define SRE_CHAR Py_UNICODE
  #define SRE_AT sre_uat
  #define SRE_CHARSET sre_ucharset
  #define SRE_COUNT sre_ucount
  #define SRE_INFO sre_uinfo
  #define SRE_LITERAL_TEMPLATE sre_uliteral_template
  #define SRE_MATCH sre_umatch
  #define SRE_MATCH_CONTEXT sre_umatch_context
  #define SRE_SEARCH sre_usearch

  #endif
  279. #endif /* SRE_RECURSIVE */
  280. /* -------------------------------------------------------------------- */
  281. /* String matching engine */
  282. /* the following section is compiled twice, with different character
  283. settings */
  284. LOCAL(int)
  285. SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
  286. {
  287. /* check if pointer is at given position */
  288. Py_ssize_t thisp, thatp;
  289. switch (at) {
  290. case SRE_AT_BEGINNING:
  291. case SRE_AT_BEGINNING_STRING:
  292. return ((void*) ptr == state->beginning);
  293. case SRE_AT_BEGINNING_LINE:
  294. return ((void*) ptr == state->beginning ||
  295. SRE_IS_LINEBREAK((int) ptr[-1]));
  296. case SRE_AT_END:
  297. return (((void*) (ptr+1) == state->end &&
  298. SRE_IS_LINEBREAK((int) ptr[0])) ||
  299. ((void*) ptr == state->end));
  300. case SRE_AT_END_LINE:
  301. return ((void*) ptr == state->end ||
  302. SRE_IS_LINEBREAK((int) ptr[0]));
  303. case SRE_AT_END_STRING:
  304. return ((void*) ptr == state->end);
  305. case SRE_AT_BOUNDARY:
  306. if (state->beginning == state->end)
  307. return 0;
  308. thatp = ((void*) ptr > state->beginning) ?
  309. SRE_IS_WORD((int) ptr[-1]) : 0;
  310. thisp = ((void*) ptr < state->end) ?
  311. SRE_IS_WORD((int) ptr[0]) : 0;
  312. return thisp != thatp;
  313. case SRE_AT_NON_BOUNDARY:
  314. if (state->beginning == state->end)
  315. return 0;
  316. thatp = ((void*) ptr > state->beginning) ?
  317. SRE_IS_WORD((int) ptr[-1]) : 0;
  318. thisp = ((void*) ptr < state->end) ?
  319. SRE_IS_WORD((int) ptr[0]) : 0;
  320. return thisp == thatp;
  321. case SRE_AT_LOC_BOUNDARY:
  322. if (state->beginning == state->end)
  323. return 0;
  324. thatp = ((void*) ptr > state->beginning) ?
  325. SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
  326. thisp = ((void*) ptr < state->end) ?
  327. SRE_LOC_IS_WORD((int) ptr[0]) : 0;
  328. return thisp != thatp;
  329. case SRE_AT_LOC_NON_BOUNDARY:
  330. if (state->beginning == state->end)
  331. return 0;
  332. thatp = ((void*) ptr > state->beginning) ?
  333. SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
  334. thisp = ((void*) ptr < state->end) ?
  335. SRE_LOC_IS_WORD((int) ptr[0]) : 0;
  336. return thisp == thatp;
  337. #if defined(HAVE_UNICODE)
  338. case SRE_AT_UNI_BOUNDARY:
  339. if (state->beginning == state->end)
  340. return 0;
  341. thatp = ((void*) ptr > state->beginning) ?
  342. SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
  343. thisp = ((void*) ptr < state->end) ?
  344. SRE_UNI_IS_WORD((int) ptr[0]) : 0;
  345. return thisp != thatp;
  346. case SRE_AT_UNI_NON_BOUNDARY:
  347. if (state->beginning == state->end)
  348. return 0;
  349. thatp = ((void*) ptr > state->beginning) ?
  350. SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
  351. thisp = ((void*) ptr < state->end) ?
  352. SRE_UNI_IS_WORD((int) ptr[0]) : 0;
  353. return thisp == thatp;
  354. #endif
  355. }
  356. return 0;
  357. }
  358. LOCAL(int)
  359. SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
  360. {
  361. /* check if character is a member of the given set */
  362. int ok = 1;
  363. for (;;) {
  364. switch (*set++) {
  365. case SRE_OP_FAILURE:
  366. return !ok;
  367. case SRE_OP_LITERAL:
  368. /* <LITERAL> <code> */
  369. if (ch == set[0])
  370. return ok;
  371. set++;
  372. break;
  373. case SRE_OP_CATEGORY:
  374. /* <CATEGORY> <code> */
  375. if (sre_category(set[0], (int) ch))
  376. return ok;
  377. set += 1;
  378. break;
  379. case SRE_OP_CHARSET:
  380. if (sizeof(SRE_CODE) == 2) {
  381. /* <CHARSET> <bitmap> (16 bits per code word) */
  382. if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
  383. return ok;
  384. set += 16;
  385. }
  386. else {
  387. /* <CHARSET> <bitmap> (32 bits per code word) */
  388. if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
  389. return ok;
  390. set += 8;
  391. }
  392. break;
  393. case SRE_OP_RANGE:
  394. /* <RANGE> <lower> <upper> */
  395. if (set[0] <= ch && ch <= set[1])
  396. return ok;
  397. set += 2;
  398. break;
  399. case SRE_OP_NEGATE:
  400. ok = !ok;
  401. break;
  402. case SRE_OP_BIGCHARSET:
  403. /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
  404. {
  405. Py_ssize_t count, block;
  406. count = *(set++);
  407. if (sizeof(SRE_CODE) == 2) {
  408. block = ((unsigned char*)set)[ch >> 8];
  409. set += 128;
  410. if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
  411. return ok;
  412. set += count*16;
  413. }
  414. else {
  415. /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
  416. * warnings when c's type supports only numbers < N+1 */
  417. if (!(ch & ~65535))
  418. block = ((unsigned char*)set)[ch >> 8];
  419. else
  420. block = -1;
  421. set += 64;
  422. if (block >=0 &&
  423. (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
  424. return ok;
  425. set += count*8;
  426. }
  427. break;
  428. }
  429. default:
  430. /* internal error -- there's not much we can do about it
  431. here, so let's just pretend it didn't match... */
  432. return 0;
  433. }
  434. }
  435. }
  436. LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
  437. LOCAL(Py_ssize_t)
  438. SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
  439. {
  440. SRE_CODE chr;
  441. SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
  442. SRE_CHAR* end = (SRE_CHAR *)state->end;
  443. Py_ssize_t i;
  444. /* adjust end */
  445. if (maxcount < end - ptr && maxcount != 65535)
  446. end = ptr + maxcount;
  447. switch (pattern[0]) {
  448. case SRE_OP_IN:
  449. /* repeated set */
  450. TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
  451. while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
  452. ptr++;
  453. break;
  454. case SRE_OP_ANY:
  455. /* repeated dot wildcard. */
  456. TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
  457. while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
  458. ptr++;
  459. break;
  460. case SRE_OP_ANY_ALL:
  461. /* repeated dot wildcard. skip to the end of the target
  462. string, and backtrack from there */
  463. TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
  464. ptr = end;
  465. break;
  466. case SRE_OP_LITERAL:
  467. /* repeated literal */
  468. chr = pattern[1];
  469. TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
  470. while (ptr < end && (SRE_CODE) *ptr == chr)
  471. ptr++;
  472. break;
  473. case SRE_OP_LITERAL_IGNORE:
  474. /* repeated literal */
  475. chr = pattern[1];
  476. TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
  477. while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
  478. ptr++;
  479. break;
  480. case SRE_OP_NOT_LITERAL:
  481. /* repeated non-literal */
  482. chr = pattern[1];
  483. TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
  484. while (ptr < end && (SRE_CODE) *ptr != chr)
  485. ptr++;
  486. break;
  487. case SRE_OP_NOT_LITERAL_IGNORE:
  488. /* repeated non-literal */
  489. chr = pattern[1];
  490. TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
  491. while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
  492. ptr++;
  493. break;
  494. default:
  495. /* repeated single character pattern */
  496. TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
  497. while ((SRE_CHAR*) state->ptr < end) {
  498. i = SRE_MATCH(state, pattern);
  499. if (i < 0)
  500. return i;
  501. if (!i)
  502. break;
  503. }
  504. TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
  505. (SRE_CHAR*) state->ptr - ptr));
  506. return (SRE_CHAR*) state->ptr - ptr;
  507. }
  508. TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
  509. return ptr - (SRE_CHAR*) state->ptr;
  510. }
  #if 0 /* not used in this release */
  /* Check if an SRE_OP_INFO block matches at the current position.
   * Returns the number of SRE_CODE objects to skip if successful, 0 if
   * no match. */
  LOCAL(int)
  SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
  {
      SRE_CHAR* ptr = state->ptr;
      SRE_CHAR* end = state->end;
      Py_ssize_t k;

      /* check minimal length */
      if (pattern[3] && (end - ptr) < pattern[3])
          return 0;

      /* no usable prefix: only the INFO block itself is skipped */
      if (!(pattern[2] & SRE_INFO_PREFIX) || pattern[5] <= 1)
          return pattern[0];

      /* check known prefix */
      /* <length> <skip> <prefix data> <overlap data> */
      for (k = 0; k < pattern[5]; k++) {
          if ((SRE_CODE) ptr[k] != pattern[7 + k])
              return 0;
      }
      return pattern[0] + 2 * pattern[6];
  }
  #endif
  535. /* The macros below should be used to protect recursive SRE_MATCH()
  536. * calls that *failed* and do *not* return immediately (IOW, those
  537. * that will backtrack). Explaining:
  538. *
  539. * - Recursive SRE_MATCH() returned true: that's usually a success
  540. * (besides atypical cases like ASSERT_NOT), therefore there's no
  541. * reason to restore lastmark;
  542. *
  543. * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
  544. * is returning to the caller: If the current SRE_MATCH() is the
  545. * top function of the recursion, returning false will be a matching
  546. * failure, and it doesn't matter where lastmark is pointing to.
  547. * If it's *not* the top function, it will be a recursive SRE_MATCH()
  548. * failure by itself, and the calling SRE_MATCH() will have to deal
  549. * with the failure by the same rules explained here (it will restore
  550. * lastmark by itself if necessary);
  551. *
  552. * - Recursive SRE_MATCH() returned false, and will continue the
  553. * outside 'for' loop: must be protected when breaking, since the next
  554. * OP could potentially depend on lastmark;
  555. *
  556. * - Recursive SRE_MATCH() returned false, and will be called again
  557. * inside a local for/while loop: must be protected between each
  558. * loop iteration, since the recursive SRE_MATCH() could do anything,
  559. * and could potentially depend on lastmark.
  560. *
  561. * For more information, check the discussion at SF patch #712900.
  562. */
  /* Copy the state's lastmark/lastindex pair into the current match
   * context (`ctx`), so it can be put back after a failed sub-match. */
  #define LASTMARK_SAVE() \
      do { \
          ctx->lastindex = state->lastindex; \
          ctx->lastmark = state->lastmark; \
      } while (0)

  /* Put back the lastmark/lastindex pair stored by LASTMARK_SAVE(). */
  #define LASTMARK_RESTORE() \
      do { \
          state->lastindex = ctx->lastindex; \
          state->lastmark = ctx->lastmark; \
      } while (0)
  /* Exit helpers for the matcher.  They rely on the enclosing function
   * having a local `ret` and an `exit:` label.
   *
   *   RETURN_ERROR(i)      return i from the function immediately
   *   RETURN_FAILURE       set ret = 0 and jump to exit
   *   RETURN_SUCCESS       set ret = 1 and jump to exit
   *   RETURN_ON_ERROR(i)   RETURN_ERROR(i) when i is negative
   *   RETURN_ON_SUCCESS(i) as RETURN_ON_ERROR, then RETURN_SUCCESS if i > 0
   *   RETURN_ON_FAILURE(i) as RETURN_ON_ERROR, then RETURN_FAILURE if i == 0
   *
   * The argument is parenthesised wherever it is used, so an expression
   * argument (for example a conditional expression) is compared as a
   * whole.  Note that it may be evaluated more than once. */
  #define RETURN_ERROR(i) do { return (i); } while(0)
  #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
  #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
  #define RETURN_ON_ERROR(i) \
      do { if ((i) < 0) RETURN_ERROR(i); } while (0)
  #define RETURN_ON_SUCCESS(i) \
      do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
  #define RETURN_ON_FAILURE(i) \
      do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
  /* Stringify a macro argument (used for trace output only). */
  #define SFY(x) #x

  /* Point ptr at the object of the given type stored at byte offset pos
   * in the data stack. */
  #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
  do { \
      TRACE(("looking up %s at %d\n", SFY(type), pos)); \
      ptr = (type*)(state->data_stack+pos); \
  } while (0)

  /* Reserve room for one object of the given type on top of the data
   * stack and point ptr at it.  The offset of the new object is left in
   * the caller's `alloc_pos`.
   *
   * The stack is grown first if necessary; on failure the enclosing
   * function returns the error code.  Growing goes through a realloc, so
   * the caller's `ctx` pointer is re-derived from `ctx_pos` afterwards
   * (unless ctx_pos is -1, i.e. no context exists yet). */
  #define DATA_STACK_ALLOC(state, type, ptr) \
  do { \
      alloc_pos = state->data_stack_base; \
      TRACE(("allocating %s in %d (%d)\n", \
             SFY(type), alloc_pos, sizeof(type))); \
      if (state->data_stack_size < alloc_pos+sizeof(type)) { \
          int sre_grow_rc = data_stack_grow(state, sizeof(type)); \
          if (sre_grow_rc < 0) \
              return sre_grow_rc; \
          if (ctx_pos != -1) { \
              DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
          } \
      } \
      ptr = (type*)(state->data_stack+alloc_pos); \
      state->data_stack_base += sizeof(type); \
  } while (0)
  /* Copy `size` bytes from `data` onto the top of the data stack, growing
   * the stack first if needed.  On a failed grow the enclosing function
   * returns the error code.  After a grow the caller's `ctx` pointer is
   * re-derived from `ctx_pos`, because the reallocation goes through
   * realloc. */
  #define DATA_STACK_PUSH(state, data, size) \
  do { \
      TRACE(("copy data in %p to %d (%d)\n", \
             data, state->data_stack_base, size)); \
      if (state->data_stack_size < state->data_stack_base+size) { \
          int sre_grow_rc = data_stack_grow(state, size); \
          if (sre_grow_rc < 0) \
              return sre_grow_rc; \
          if (ctx_pos != -1) { \
              DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
          } \
      } \
      memcpy(state->data_stack+state->data_stack_base, data, size); \
      state->data_stack_base += size; \
  } while (0)

  /* Copy the topmost `size` bytes of the data stack into `data`.  The
   * bytes are removed from the stack only when `discard` is non-zero. */
  #define DATA_STACK_POP(state, data, size, discard) \
  do { \
      TRACE(("copy data to %p from %d (%d)\n", \
             data, state->data_stack_base-size, size)); \
      memcpy(data, state->data_stack+state->data_stack_base-size, size); \
      if (discard) { \
          state->data_stack_base -= size; \
      } \
  } while (0)

  /* Remove the topmost `size` bytes of the data stack without copying
   * them anywhere. */
  #define DATA_STACK_POP_DISCARD(state, size) \
  do { \
      TRACE(("discard data from %d (%d)\n", \
             state->data_stack_base-size, size)); \
      state->data_stack_base -= size; \
  } while(0)
  /* Shorthands that operate on the local `state`.  For PUSH/POP/DISCARD
   * the byte count is taken from the type x points to. */
  #define DATA_PUSH(x) DATA_STACK_PUSH(state, (x), sizeof(*(x)))
  #define DATA_POP(x) DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
  #define DATA_POP_DISCARD(x) DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
  #define DATA_ALLOC(t,p) DATA_STACK_ALLOC(state, t, p)
  #define DATA_LOOKUP_AT(t,p,pos) DATA_STACK_LOOKUP_AT(state,t,p,pos)
  /* Save / restore the first lastmark+1 entries of state->mark on the
   * data stack.  All four macros do nothing unless lastmark > 0.
   *
   *   MARK_PUSH          copy the marks onto the stack
   *   MARK_POP           copy them back and remove them from the stack
   *   MARK_POP_KEEP      copy them back but leave them on the stack
   *   MARK_POP_DISCARD   remove them from the stack without copying
   *
   * MARK_PUSH writes to the caller's local `i`. */
  #define MARK_PUSH(lastmark) \
      do { \
          if (lastmark > 0) { \
              i = lastmark; /* ctx->lastmark may change if reallocated */ \
              DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
          } \
      } while (0)
  #define MARK_POP(lastmark) \
      do { \
          if (lastmark > 0) { \
              DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
          } \
      } while (0)
  #define MARK_POP_KEEP(lastmark) \
      do { \
          if (lastmark > 0) { \
              DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
          } \
      } while (0)
  #define MARK_POP_DISCARD(lastmark) \
      do { \
          if (lastmark > 0) { \
              DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
          } \
      } while (0)
  /* Codes stored in a match context's `jump` field by DO_JUMP().
   * JUMP_NONE is the value given to the outermost context. */
  #define JUMP_NONE            0

  #define JUMP_MAX_UNTIL_1     1
  #define JUMP_MAX_UNTIL_2     2
  #define JUMP_MAX_UNTIL_3     3

  #define JUMP_MIN_UNTIL_1     4
  #define JUMP_MIN_UNTIL_2     5
  #define JUMP_MIN_UNTIL_3     6

  #define JUMP_REPEAT          7
  #define JUMP_REPEAT_ONE_1    8
  #define JUMP_REPEAT_ONE_2    9
  #define JUMP_MIN_REPEAT_ONE  10

  #define JUMP_BRANCH          11
  #define JUMP_ASSERT          12
  #define JUMP_ASSERT_NOT      13
  /* Start matching `nextpattern` in a fresh match context without a C
   * function call.
   *
   * A new SRE_MATCH_CONTEXT is allocated on the data stack.  It records
   * the offset of the current context (last_ctx_pos) and the JUMP_* code
   * `jumpvalue`, becomes the current context, and control jumps to the
   * `entrance` label.  `jumplabel` is placed on the statement that follows
   * the macro use; presumably the matcher's exit path dispatches on the
   * stored jump code to resume there -- that code is not part of this
   * section, so confirm against the rest of SRE_MATCH.
   *
   * The macro deliberately ends in `while (0)` with no semicolon and no
   * line continuation: the caller's `;` completes the statement, and a
   * trailing backslash here would splice the next source line into the
   * macro body. */
  #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
      DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
      nextctx->last_ctx_pos = ctx_pos; \
      nextctx->jump = jumpvalue; \
      nextctx->pattern = nextpattern; \
      ctx_pos = alloc_pos; \
      ctx = nextctx; \
      goto entrance; \
      jumplabel: \
      while (0) /* gcc doesn't like labels at end of scopes */
  680. typedef struct {
  681. Py_ssize_t last_ctx_pos;
  682. Py_ssize_t jump;
  683. SRE_CHAR* ptr;
  684. SRE_CODE* pattern;
  685. Py_ssize_t count;
  686. Py_ssize_t lastmark;
  687. Py_ssize_t lastindex;
  688. union {
  689. SRE_CODE chr;
  690. SRE_REPEAT* rep;
  691. } u;
  692. } SRE_MATCH_CONTEXT;
  693. /* check if string matches the given pattern. returns <0 for
  694. error, 0 for failure, and 1 for success */
  695. LOCAL(Py_ssize_t)
  696. SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
  697. {
  698. SRE_CHAR* end = (SRE_CHAR *)state->end;
  699. Py_ssize_t alloc_pos, ctx_pos = -1;
  700. Py_ssize_t i, ret = 0;
  701. Py_ssize_t jump;
  702. unsigned int sigcount=0;
  703. SRE_MATCH_CONTEXT* ctx;
  704. SRE_MATCH_CONTEXT* nextctx;
  705. TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
  706. DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
  707. ctx->last_ctx_pos = -1;
  708. ctx->jump = JUMP_NONE;
  709. ctx->pattern = pattern;
  710. ctx_pos = alloc_pos;
  711. entrance:
  712. ctx->ptr = (SRE_CHAR *)state->ptr;
  713. if (ctx->pattern[0] == SRE_OP_INFO) {
  714. /* optimization info block */
  715. /* <INFO> <1=skip> <2=flags> <3=min> ... */
  716. if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
  717. TRACE(("reject (got %d chars, need %d)\n",
  718. (end - ctx->ptr), ctx->pattern[3]));
  719. RETURN_FAILURE;
  720. }
  721. ctx->pattern += ctx->pattern[1] + 1;
  722. }
  723. for (;;) {
  724. ++sigcount;
  725. if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
  726. RETURN_ERROR(SRE_ERROR_INTERRUPTED);
  727. switch (*ctx->pattern++) {
  728. case SRE_OP_MARK:
  729. /* set mark */
  730. /* <MARK> <gid> */
  731. TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
  732. ctx->ptr, ctx->pattern[0]));
  733. i = ctx->pattern[0];
  734. if (i & 1)
  735. state->lastindex = i/2 + 1;
  736. if (i > state->lastmark) {
  737. /* state->lastmark is the highest valid index in the
  738. state->mark array. If it is increased by more than 1,
  739. the intervening marks must be set to NULL to signal
  740. that these marks have not been encountered. */
  741. Py_ssize_t j = state->lastmark + 1;
  742. while (j < i)
  743. state->mark[j++] = NULL;
  744. state->lastmark = i;
  745. }
  746. state->mark[i] = ctx->ptr;
  747. ctx->pattern++;
  748. break;
  749. case SRE_OP_LITERAL:
  750. /* match literal string */
  751. /* <LITERAL> <code> */
  752. TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
  753. ctx->ptr, *ctx->pattern));
  754. if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
  755. RETURN_FAILURE;
  756. ctx->pattern++;
  757. ctx->ptr++;
  758. break;
  759. case SRE_OP_NOT_LITERAL:
  760. /* match anything that is not literal character */
  761. /* <NOT_LITERAL> <code> */
  762. TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
  763. ctx->ptr, *ctx->pattern));
  764. if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
  765. RETURN_FAILURE;
  766. ctx->pattern++;
  767. ctx->ptr++;
  768. break;
  769. case SRE_OP_SUCCESS:
  770. /* end of pattern */
  771. TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
  772. state->ptr = ctx->ptr;
  773. RETURN_SUCCESS;
  774. case SRE_OP_AT:
  775. /* match at given position */
  776. /* <AT> <code> */
  777. TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
  778. if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
  779. RETURN_FAILURE;
  780. ctx->pattern++;
  781. break;
  782. case SRE_OP_CATEGORY:
  783. /* match at given category */
  784. /* <CATEGORY> <code> */
  785. TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
  786. ctx->ptr, *ctx->pattern));
  787. if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
  788. RETURN_FAILURE;
  789. ctx->pattern++;
  790. ctx->ptr++;
  791. break;
  792. case SRE_OP_ANY:
  793. /* match anything (except a newline) */
  794. /* <ANY> */
  795. TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
  796. if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
  797. RETURN_FAILURE;
  798. ctx->ptr++;
  799. break;
  800. case SRE_OP_ANY_ALL:
  801. /* match anything */
  802. /* <ANY_ALL> */
  803. TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
  804. if (ctx->ptr >= end)
  805. RETURN_FAILURE;
  806. ctx->ptr++;
  807. break;
  808. case SRE_OP_IN:
  809. /* match set member (or non_member) */
  810. /* <IN> <skip> <set> */
  811. TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
  812. if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
  813. RETURN_FAILURE;
  814. ctx->pattern += ctx->pattern[0];
  815. ctx->ptr++;
  816. break;
  817. case SRE_OP_LITERAL_IGNORE:
  818. TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
  819. ctx->pattern, ctx->ptr, ctx->pattern[0]));
  820. if (ctx->ptr >= end ||
  821. state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
  822. RETURN_FAILURE;
  823. ctx->pattern++;
  824. ctx->ptr++;
  825. break;
  826. case SRE_OP_NOT_LITERAL_IGNORE:
  827. TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
  828. ctx->pattern, ctx->ptr, *ctx->pattern));
  829. if (ctx->ptr >= end ||
  830. state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
  831. RETURN_FAILURE;
  832. ctx->pattern++;
  833. ctx->ptr++;
  834. break;
  835. case SRE_OP_IN_IGNORE:
  836. TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
  837. if (ctx->ptr >= end
  838. || !SRE_CHARSET(ctx->pattern+1,
  839. (SRE_CODE)state->lower(*ctx->ptr)))
  840. RETURN_FAILURE;
  841. ctx->pattern += ctx->pattern[0];
  842. ctx->ptr++;
  843. break;
  844. case SRE_OP_JUMP:
  845. case SRE_OP_INFO:
  846. /* jump forward */
  847. /* <JUMP> <offset> */
  848. TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
  849. ctx->ptr, ctx->pattern[0]));
  850. ctx->pattern += ctx->pattern[0];
  851. break;
  852. case SRE_OP_BRANCH:
  853. /* alternation */
  854. /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
  855. TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
  856. LASTMARK_SAVE();
  857. ctx->u.rep = state->repeat;
  858. if (ctx->u.rep)
  859. MARK_PUSH(ctx->lastmark);
  860. for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
  861. if (ctx->pattern[1] == SRE_OP_LITERAL &&
  862. (ctx->ptr >= end ||
  863. (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
  864. continue;
  865. if (ctx->pattern[1] == SRE_OP_IN &&
  866. (ctx->ptr >= end ||
  867. !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
  868. continue;
  869. state->ptr = ctx->ptr;
  870. DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
  871. if (ret) {
  872. if (ctx->u.rep)
  873. MARK_POP_DISCARD(ctx->lastmark);
  874. RETURN_ON_ERROR(ret);
  875. RETURN_SUCCESS;
  876. }
  877. if (ctx->u.rep)
  878. MARK_POP_KEEP(ctx->lastmark);
  879. LASTMARK_RESTORE();
  880. }
  881. if (ctx->u.rep)
  882. MARK_POP_DISCARD(ctx->lastmark);
  883. RETURN_FAILURE;
  884. case SRE_OP_REPEAT_ONE:
  885. /* match repeated sequence (maximizing regexp) */
  886. /* this operator only works if the repeated item is
  887. exactly one character wide, and we're not already
  888. collecting backtracking points. for other cases,
  889. use the MAX_REPEAT operator */
  890. /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
  891. TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
  892. ctx->pattern[1], ctx->pattern[2]));
  893. if (ctx->ptr + ctx->pattern[1] > end)
  894. RETURN_FAILURE; /* cannot match */
  895. state->ptr = ctx->ptr;
  896. ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
  897. RETURN_ON_ERROR(ret);
  898. DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
  899. ctx->count = ret;
  900. ctx->ptr += ctx->count;
  901. /* when we arrive here, count contains the number of
  902. matches, and ctx->ptr points to the tail of the target
  903. string. check if the rest of the pattern matches,
  904. and backtrack if not. */
  905. if (ctx->count < (Py_ssize_t) ctx->pattern[1])
  906. RETURN_FAILURE;
  907. if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
  908. /* tail is empty. we're finished */
  909. state->ptr = ctx->ptr;
  910. RETURN_SUCCESS;
  911. }
  912. LASTMARK_SAVE();
  913. if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
  914. /* tail starts with a literal. skip positions where
  915. the rest of the pattern cannot possibly match */
  916. ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
  917. for (;;) {
  918. while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
  919. (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
  920. ctx->ptr--;
  921. ctx->count--;
  922. }
  923. if (ctx->count < (Py_ssize_t) ctx->pattern[1])
  924. break;
  925. state->ptr = ctx->ptr;
  926. DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
  927. ctx->pattern+ctx->pattern[0]);
  928. if (ret) {
  929. RETURN_ON_ERROR(ret);
  930. RETURN_SUCCESS;
  931. }
  932. LASTMARK_RESTORE();
  933. ctx->ptr--;
  934. ctx->count--;
  935. }
  936. } else {
  937. /* general case */
  938. while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
  939. state->ptr = ctx->ptr;
  940. DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
  941. ctx->pattern+ctx->pattern[0]);
  942. if (ret) {
  943. RETURN_ON_ERROR(ret);
  944. RETURN_SUCCESS;
  945. }
  946. ctx->ptr--;
  947. ctx->count--;
  948. LASTMARK_RESTORE();
  949. }
  950. }
  951. RETURN_FAILURE;
  952. case SRE_OP_MIN_REPEAT_ONE:
  953. /* match repeated sequence (minimizing regexp) */
  954. /* this operator only works if the repeated item is
  955. exactly one character wide, and we're not already
  956. collecting backtracking points. for other cases,
  957. use the MIN_REPEAT operator */
  958. /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
  959. TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
  960. ctx->pattern[1], ctx->pattern[2]));
  961. if (ctx->ptr + ctx->pattern[1] > end)
  962. RETURN_FAILURE; /* cannot match */
  963. state->ptr = ctx->ptr;
  964. if (ctx->pattern[1] == 0)
  965. ctx->count = 0;
  966. else {
  967. /* count using pattern min as the maximum */
  968. ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
  969. RETURN_ON_ERROR(ret);
  970. DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
  971. if (ret < (Py_ssize_t) ctx->pattern[1])
  972. /* didn't match minimum number of times */
  973. RETURN_FAILURE;
  974. /* advance past minimum matches of repeat */
  975. ctx->count = ret;
  976. ctx->ptr += ctx->count;
  977. }
  978. if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
  979. /* tail is empty. we're finished */
  980. state->ptr = ctx->ptr;
  981. RETURN_SUCCESS;
  982. } else {
  983. /* general case */
  984. LASTMARK_SAVE();
  985. while ((Py_ssize_t)ctx->pattern[2] == 65535
  986. || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
  987. state->ptr = ctx->ptr;
  988. DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
  989. ctx->pattern+ctx->pattern[0]);
  990. if (ret) {
  991. RETURN_ON_ERROR(ret);
  992. RETURN_SUCCESS;
  993. }
  994. state->ptr = ctx->ptr;
  995. ret = SRE_COUNT(state, ctx->pattern+3, 1);
  996. RETURN_ON_ERROR(ret);
  997. DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
  998. if (ret == 0)
  999. break;
  1000. assert(ret == 1);
  1001. ctx->ptr++;
  1002. ctx->count++;
  1003. LASTMARK_RESTORE();
  1004. }
  1005. }
  1006. RETURN_FAILURE;
  1007. case SRE_OP_REPEAT:
  1008. /* create repeat context. all the hard work is done
  1009. by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
  1010. /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
  1011. TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
  1012. ctx->pattern[1], ctx->pattern[2]));
  1013. /* install new repeat context */
  1014. ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
  1015. if (!ctx->u.rep) {
  1016. PyErr_NoMemory();
  1017. RETURN_FAILURE;
  1018. }
  1019. ctx->u.rep->count = -1;
  1020. ctx->u.rep->pattern = ctx->pattern;
  1021. ctx->u.rep->prev = state->repeat;
  1022. ctx->u.rep->last_ptr = NULL;
  1023. state->repeat = ctx->u.rep;
  1024. state->ptr = ctx->ptr;
  1025. DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
  1026. state->repeat = ctx->u.rep->prev;
  1027. PyObject_FREE(ctx->u.rep);
  1028. if (ret) {
  1029. RETURN_ON_ERROR(ret);
  1030. RETURN_SUCCESS;
  1031. }
  1032. RETURN_FAILURE;
  1033. case SRE_OP_MAX_UNTIL:
  1034. /* maximizing repeat */
  1035. /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
  1036. /* FIXME: we probably need to deal with zero-width
  1037. matches in here... */
  1038. ctx->u.rep = state->repeat;
  1039. if (!ctx->u.rep)
  1040. RETURN_ERROR(SRE_ERROR_STATE);
  1041. state->ptr = ctx->ptr;
  1042. ctx->count = ctx->u.rep->count+1;
  1043. TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
  1044. ctx->ptr, ctx->count));
  1045. if (ctx->count < ctx->u.rep->pattern[1]) {
  1046. /* not enough matches */
  1047. ctx->u.rep->count = ctx->count;
  1048. DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
  1049. ctx->u.rep->pattern+3);
  1050. if (ret) {
  1051. RETURN_ON_ERROR(ret);
  1052. RETURN_SUCCESS;
  1053. }
  1054. ctx->u.rep->count = ctx->count-1;
  1055. state->ptr = ctx->ptr;
  1056. RETURN_FAILURE;
  1057. }
  1058. if ((ctx->count < ctx->u.rep->pattern[2] ||
  1059. ctx->u.rep->pattern[2] == 65535) &&
  1060. state->ptr != ctx->u.rep->last_ptr) {
  1061. /* we may have enough matches, but if we can
  1062. match another item, do so */
  1063. ctx->u.rep->count = ctx->count;
  1064. LASTMARK_SAVE();
  1065. MARK_PUSH(ctx->lastmark);
  1066. /* zero-width match protection */
  1067. DATA_PUSH(&ctx->u.rep->last_ptr);
  1068. ctx->u.rep->last_ptr = state->ptr;
  1069. DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
  1070. ctx->u.rep->pattern+3);
  1071. DATA_POP(&ctx->u.rep->last_ptr);
  1072. if (ret) {
  1073. MARK_POP_DISCARD(ctx->lastmark);
  1074. RETURN_ON_ERROR(ret);
  1075. RETURN_SUCCESS;
  1076. }
  1077. MARK_POP(ctx->lastmark);
  1078. LASTMARK_RESTORE();
  1079. ctx->u.rep->count = ctx->count-1;
  1080. state->ptr = ctx->ptr;
  1081. }
  1082. /* cannot match more repeated items here. make sure the
  1083. tail matches */
  1084. state->repeat = ctx->u.rep->prev;
  1085. DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
  1086. RETURN_ON_SUCCESS(ret);
  1087. state->repeat = ctx->u.rep;
  1088. state->ptr = ctx->ptr;
  1089. RETURN_FAILURE;
  1090. case SRE_OP_MIN_UNTIL:
  1091. /* minimizing repeat */
  1092. /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
  1093. ctx->u.rep = state->repeat;
  1094. if (!ctx->u.rep)
  1095. RETURN_ERROR(SRE_ERROR_STATE);
  1096. state->ptr = ctx->ptr;
  1097. ctx->count = ctx->u.rep->count+1;
  1098. TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
  1099. ctx->ptr, ctx->count, ctx->u.rep->pattern));
  1100. if (ctx->count < ctx->u.rep->pattern[1]) {
  1101. /* not enough matches */
  1102. ctx->u.rep->count = ctx->count;
  1103. DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
  1104. ctx->u.rep->pattern+3);
  1105. if (ret) {
  1106. RETURN_ON_ERROR(ret);
  1107. RETURN_SUCCESS;
  1108. }
  1109. ctx->u.rep->count = ctx->count-1;
  1110. state->ptr = ctx->ptr;
  1111. RETURN_FAILURE;
  1112. }
  1113. LASTMARK_SAVE();
  1114. /* see if the tail matches */
  1115. state->repeat = ctx->u.rep->prev;
  1116. DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
  1117. if (ret) {
  1118. RETURN_ON_ERROR(ret);
  1119. RETURN_SUCCESS;
  1120. }
  1121. state->repeat = ctx->u.rep;
  1122. state->ptr = ctx->ptr;
  1123. LASTMARK_RESTORE();
  1124. if (ctx->count >= ctx->u.rep->pattern[2]
  1125. && ctx->u.rep->pattern[2] != 65535)
  1126. RETURN_FAILURE;
  1127. ctx->u.rep->count = ctx->count;
  1128. DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
  1129. ctx->u.rep->pattern+3);
  1130. if (ret) {
  1131. RETURN_ON_ERROR(ret);
  1132. RETURN_SUCCESS;
  1133. }
  1134. ctx->u.rep->count = ctx->count-1;
  1135. state->ptr = ctx->ptr;
  1136. RETURN_FAILURE;
  1137. case SRE_OP_GROUPREF:
  1138. /* match backreference */
  1139. TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
  1140. ctx->ptr, ctx->pattern[0]));
  1141. i = ctx->pattern[0];
  1142. {
  1143. Py_ssize_t groupref = i+i;
  1144. if (groupref >= state->lastmark) {
  1145. RETURN_FAILURE;
  1146. } else {
  1147. SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
  1148. SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
  1149. if (!p || !e || e < p)
  1150. RETURN_FAILURE;
  1151. while (p < e) {
  1152. if (ctx->ptr >= end || *ctx->ptr != *p)
  1153. RETURN_FAILURE;
  1154. p++; ctx->ptr++;
  1155. }
  1156. }
  1157. }
  1158. ctx->pattern++;
  1159. break;
  1160. case SRE_OP_GROUPREF_IGNORE:
  1161. /* match backreference */
  1162. TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
  1163. ctx->ptr, ctx->pattern[0]));
  1164. i = ctx->pattern[0];
  1165. {
  1166. Py_ssize_t groupref = i+i;
  1167. if (groupref >= state->lastmark) {
  1168. RETURN_FAILURE;
  1169. } else {
  1170. SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
  1171. SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
  1172. if (!p || !e || e < p)
  1173. RETURN_FAILURE;
  1174. while (p < e) {
  1175. if (ctx->ptr >= end ||
  1176. state->lower(*ctx->ptr) != state->lower(*p))
  1177. RETURN_FAILURE;
  1178. p++; ctx->ptr++;
  1179. }
  1180. }
  1181. }
  1182. ctx->pattern++;
  1183. break;
  1184. case SRE_OP_GROUPREF_EXISTS:
  1185. TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
  1186. ctx->ptr, ctx->pattern[0]));
  1187. /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
  1188. i = ctx->pattern[0];
  1189. {
  1190. Py_ssize_t groupref = i+i;
  1191. if (groupref >= state->lastmark) {
  1192. ctx->pattern += ctx->pattern[1];
  1193. break;
  1194. } else {
  1195. SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
  1196. SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
  1197. if (!p || !e || e < p) {
  1198. ctx->pattern += ctx->pattern[1];
  1199. break;
  1200. }
  1201. }
  1202. }
  1203. ctx->pattern += 2;
  1204. break;
  1205. case SRE_OP_ASSERT:
  1206. /* assert subpattern */
  1207. /* <ASSERT> <skip> <back> <pattern> */
  1208. TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
  1209. ctx->ptr, ctx->pattern[1]));
  1210. state->ptr = ctx->ptr - ctx->pattern[1];
  1211. if (state->ptr < state->beginning)
  1212. RETURN_FAILURE;
  1213. DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
  1214. RETURN_ON_FAILURE(ret);
  1215. ctx->pattern += ctx->pattern[0];
  1216. break;
  1217. case SRE_OP_ASSERT_NOT:
  1218. /* assert not subpattern */
  1219. /* <ASSERT_NOT> <skip> <back> <pattern> */
  1220. TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
  1221. ctx->ptr, ctx->pattern[1]));
  1222. state->ptr = ctx->ptr - ctx->pattern[1];
  1223. if (state->ptr >= state->beginning) {
  1224. DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
  1225. if (ret) {
  1226. RETURN_ON_ERROR(ret);
  1227. RETURN_FAILURE;
  1228. }
  1229. }
  1230. ctx->pattern += ctx->pattern[0];
  1231. break;
  1232. case SRE_OP_FAILURE:
  1233. /* immediate failure */
  1234. TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
  1235. RETURN_FAILURE;
  1236. default:
  1237. TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
  1238. ctx->pattern[-1]));
  1239. RETURN_ERROR(SRE_ERROR_ILLEGAL);
  1240. }
  1241. }
  1242. exit:
  1243. ctx_pos = ctx->last_ctx_pos;
  1244. jump = ctx->jump;
  1245. DATA_POP_DISCARD(ctx);
  1246. if (ctx_pos == -1)
  1247. return ret;
  1248. DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
  1249. switch (jump) {
  1250. case JUMP_MAX_UNTIL_2:
  1251. TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
  1252. goto jump_max_until_2;
  1253. case JUMP_MAX_UNTIL_3:
  1254. TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
  1255. goto jump_max_until_3;
  1256. case JUMP_MIN_UNTIL_2:
  1257. TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
  1258. goto jump_min_until_2;
  1259. case JUMP_MIN_UNTIL_3:
  1260. TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
  1261. goto jump_min_until_3;
  1262. case JUMP_BRANCH:
  1263. TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
  1264. goto jump_branch;
  1265. case JUMP_MAX_UNTIL_1:
  1266. TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
  1267. goto jump_max_until_1;
  1268. case JUMP_MIN_UNTIL_1:
  1269. TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
  1270. goto jump_min_until_1;
  1271. case JUMP_REPEAT:
  1272. TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
  1273. goto jump_repeat;
  1274. case JUMP_REPEAT_ONE_1:
  1275. TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
  1276. goto jump_repeat_one_1;
  1277. case JUMP_REPEAT_ONE_2:
  1278. TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
  1279. goto jump_repeat_one_2;
  1280. case JUMP_MIN_REPEAT_ONE:
  1281. TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
  1282. goto jump_min_repeat_one;
  1283. case JUMP_ASSERT:
  1284. TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
  1285. goto jump_assert;
  1286. case JUMP_ASSERT_NOT:
  1287. TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
  1288. goto jump_assert_not;
  1289. case JUMP_NONE:
  1290. TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
  1291. break;
  1292. }
  1293. return ret; /* should never get here */
  1294. }
/* Scan the subject for the first position at which `pattern` matches.
 *
 * Candidate start positions run from state->start towards state->end.
 * Before every call to SRE_MATCH both state->start and state->ptr are
 * set to the candidate position (or just past the consumed prefix).
 *
 * Returns the first non-zero status produced by SRE_MATCH, 1 when the
 * INFO block carries SRE_INFO_LITERAL and the literal/prefix was found,
 * or 0 when no candidate position matched.  Callers in this file treat
 * a negative status as an engine error code.
 */
LOCAL(Py_ssize_t)
SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
{
    SRE_CHAR* ptr = (SRE_CHAR *)state->start;
    SRE_CHAR* end = (SRE_CHAR *)state->end;
    Py_ssize_t status = 0;
    Py_ssize_t prefix_len = 0;
    Py_ssize_t prefix_skip = 0;
    SRE_CODE* prefix = NULL;
    SRE_CODE* charset = NULL;
    SRE_CODE* overlap = NULL;
    int flags = 0;

    if (pattern[0] == SRE_OP_INFO) {
        /* optimization info block */
        /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info>  */

        flags = pattern[2];

        if (pattern[3] > 1) {
            /* adjust end point (but make sure we leave at least one
               character in there, so literal search will work) */
            /* NOTE: only the general-case loop at the bottom uses this
               trimmed end; the prefix, literal and charset scans below
               each reset end to state->end first. */
            end -= pattern[3]-1;
            if (end <= ptr)
                end = ptr+1;
        }

        if (flags & SRE_INFO_PREFIX) {
            /* pattern starts with a known prefix */
            /* <length> <skip> <prefix data> <overlap data> */
            prefix_len = pattern[5];
            prefix_skip = pattern[6];
            prefix = pattern + 7;
            /* biased by -1 so that overlap[i] can be indexed with
               i in 1..prefix_len (see the scan loop below) */
            overlap = prefix + prefix_len - 1;
        } else if (flags & SRE_INFO_CHARSET)
            /* pattern starts with a character from a known set */
            /* <charset> */
            charset = pattern + 5;

        /* step over the whole INFO block to the first real opcode */
        pattern += 1 + pattern[1];
    }

    TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
    TRACE(("charset = %p\n", charset));

#if defined(USE_FAST_SEARCH)
    if (prefix_len > 1) {
        /* pattern starts with a known prefix.  use the overlap
           table to skip forward as fast as we possibly can */
        /* i counts how many prefix items are currently matched */
        Py_ssize_t i = 0;
        end = (SRE_CHAR *)state->end;
        while (ptr < end) {
            for (;;) {
                if ((SRE_CODE) ptr[0] != prefix[i]) {
                    if (!i)
                        break;
                    else
                        /* mismatch after a partial match: fall back to
                           the resume index stored in the overlap table
                           and retry the same character */
                        i = overlap[i];
                } else {
                    if (++i == prefix_len) {
                        /* found a potential match */
                        TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
                        state->start = ptr + 1 - prefix_len;
                        state->ptr = ptr + 1 - prefix_len + prefix_skip;
                        if (flags & SRE_INFO_LITERAL)
                            return 1; /* we got all of it */
                        /* resume matching after the first prefix_skip
                           prefix items; the factor 2 presumably reflects
                           two code words per item -- TODO confirm against
                           the pattern compiler */
                        status = SRE_MATCH(state, pattern + 2*prefix_skip);
                        if (status != 0)
                            return status;
                        /* close but no cigar -- try again */
                        i = overlap[i];
                    }
                    break;
                }
            }
            ptr++;
        }
        return 0;
    }
#endif

    if (pattern[0] == SRE_OP_LITERAL) {
        /* pattern starts with a literal character.  this is used
           for short prefixes, and if fast search is disabled */
        SRE_CODE chr = pattern[1];
        end = (SRE_CHAR *)state->end;
        for (;;) {
            /* skip ahead to the next occurrence of the literal */
            while (ptr < end && (SRE_CODE) ptr[0] != chr)
                ptr++;
            if (ptr >= end)
                return 0;
            TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
            state->start = ptr;
            /* the literal is already consumed, so matching continues
               one character later and two code words further on */
            state->ptr = ++ptr;
            if (flags & SRE_INFO_LITERAL)
                return 1; /* we got all of it */
            status = SRE_MATCH(state, pattern + 2);
            if (status != 0)
                break;
        }
    } else if (charset) {
        /* pattern starts with a character from a known set */
        end = (SRE_CHAR *)state->end;
        for (;;) {
            /* skip characters that cannot start a match */
            while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
                ptr++;
            if (ptr >= end)
                return 0;
            TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
            state->start = ptr;
            state->ptr = ptr;
            status = SRE_MATCH(state, pattern);
            if (status != 0)
                break;
            ptr++;
        }
    } else
        /* general case */
        /* note the <=: a match attempt is also made at `end` itself */
        while (ptr <= end) {
            TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
            state->start = state->ptr = ptr++;
            status = SRE_MATCH(state, pattern);
            if (status != 0)
                break;
        }

    return status;
}
  1414. LOCAL(int)
  1415. SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
  1416. {
  1417. /* check if given string is a literal template (i.e. no escapes) */
  1418. while (len-- > 0)
  1419. if (*ptr++ == '\\')
  1420. return 0;
  1421. return 1;
  1422. }
  1423. #if !defined(SRE_RECURSIVE)
  1424. /* -------------------------------------------------------------------- */
  1425. /* factories and destructors */
  1426. /* see sre.h for object declarations */
  1427. static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
  1428. static PyObject*pattern_scanner(PatternObject*, PyObject*);
  1429. static PyObject *
  1430. sre_codesize(PyObject* self, PyObject *unused)
  1431. {
  1432. return Py_BuildValue("l", sizeof(SRE_CODE));
  1433. }
  1434. static PyObject *
  1435. sre_getlower(PyObject* self, PyObject* args)
  1436. {
  1437. int character, flags;
  1438. if (!PyArg_ParseTuple(args, "ii", &character, &flags))
  1439. return NULL;
  1440. if (flags & SRE_FLAG_LOCALE)
  1441. return Py_BuildValue("i", sre_lower_locale(character));
  1442. if (flags & SRE_FLAG_UNICODE)
  1443. #if defined(HAVE_UNICODE)
  1444. return Py_BuildValue("i", sre_lower_unicode(character));
  1445. #else
  1446. return Py_BuildValue("i", sre_lower_locale(character));
  1447. #endif
  1448. return Py_BuildValue("i", sre_lower(character));
  1449. }
  1450. LOCAL(void)
  1451. state_reset(SRE_STATE* state)
  1452. {
  1453. /* FIXME: dynamic! */
  1454. /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
  1455. state->lastmark = -1;
  1456. state->lastindex = -1;
  1457. state->repeat = NULL;
  1458. data_stack_dealloc(state);
  1459. }
  1460. static void*
  1461. getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
  1462. {
  1463. /* given a python object, return a data pointer, a length (in
  1464. characters), and a character size. return NULL if the object
  1465. is not a string (or not compatible) */
  1466. PyBufferProcs *buffer;
  1467. Py_ssize_t size, bytes;
  1468. int charsize;
  1469. void* ptr;
  1470. #if defined(HAVE_UNICODE)
  1471. if (PyUnicode_Check(string)) {
  1472. /* unicode strings doesn't always support the buffer interface */
  1473. ptr = (void*) PyUnicode_AS_DATA(string);
  1474. /* bytes = PyUnicode_GET_DATA_SIZE(string); */
  1475. size = PyUnicode_GET_SIZE(string);
  1476. charsize = sizeof(Py_UNICODE);
  1477. } else {
  1478. #endif
  1479. /* get pointer to string buffer */
  1480. buffer = Py_TYPE(string)->tp_as_buffer;
  1481. if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
  1482. buffer->bf_getsegcount(string, NULL) != 1) {
  1483. PyErr_SetString(PyExc_TypeError, "expected string or buffer");
  1484. return NULL;
  1485. }
  1486. /* determine buffer size */
  1487. bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
  1488. if (bytes < 0) {
  1489. PyErr_SetString(PyExc_TypeError, "buffer has negative size");
  1490. return NULL;
  1491. }
  1492. /* determine character size */
  1493. #if PY_VERSION_HEX >= 0x01060000
  1494. size = PyObject_Size(string);
  1495. #else
  1496. size = PyObject_Length(string);
  1497. #endif
  1498. if (PyString_Check(string) || bytes == size)
  1499. charsize = 1;
  1500. #if defined(HAVE_UNICODE)
  1501. else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
  1502. charsize = sizeof(Py_UNICODE);
  1503. #endif
  1504. else {
  1505. PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
  1506. return NULL;
  1507. }
  1508. #if defined(HAVE_UNICODE)
  1509. }
  1510. #endif
  1511. *p_length = size;
  1512. *p_charsize = charsize;
  1513. return ptr;
  1514. }
  1515. LOCAL(PyObject*)
  1516. state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
  1517. Py_ssize_t start, Py_ssize_t end)
  1518. {
  1519. /* prepare state object */
  1520. Py_ssize_t length;
  1521. int charsize;
  1522. void* ptr;
  1523. memset(state, 0, sizeof(SRE_STATE));
  1524. state->lastmark = -1;
  1525. state->lastindex = -1;
  1526. ptr = getstring(string, &length, &charsize);
  1527. if (!ptr)
  1528. return NULL;
  1529. /* adjust boundaries */
  1530. if (start < 0)
  1531. start = 0;
  1532. else if (start > length)
  1533. start = length;
  1534. if (end < 0)
  1535. end = 0;
  1536. else if (end > length)
  1537. end = length;
  1538. state->charsize = charsize;
  1539. state->beginning = ptr;
  1540. state->start = (void*) ((char*) ptr + start * state->charsize);
  1541. state->end = (void*) ((char*) ptr + end * state->charsize);
  1542. Py_INCREF(string);
  1543. state->string = string;
  1544. state->pos = start;
  1545. state->endpos = end;
  1546. if (pattern->flags & SRE_FLAG_LOCALE)
  1547. state->lower = sre_lower_locale;
  1548. else if (pattern->flags & SRE_FLAG_UNICODE)
  1549. #if defined(HAVE_UNICODE)
  1550. state->lower = sre_lower_unicode;
  1551. #else
  1552. state->lower = sre_lower_locale;
  1553. #endif
  1554. else
  1555. state->lower = sre_lower;
  1556. return string;
  1557. }
  1558. LOCAL(void)
  1559. state_fini(SRE_STATE* state)
  1560. {
  1561. Py_XDECREF(state->string);
  1562. data_stack_dealloc(state);
  1563. }
/* calculate offset from start of string: the byte distance between
   `member` (a pointer into the subject buffer) and state->beginning,
   divided by state->charsize, i.e. an index in characters */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
  1567. LOCAL(PyObject*)
  1568. state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
  1569. {
  1570. Py_ssize_t i, j;
  1571. index = (index - 1) * 2;
  1572. if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
  1573. if (empty)
  1574. /* want empty string */
  1575. i = j = 0;
  1576. else {
  1577. Py_INCREF(Py_None);
  1578. return Py_None;
  1579. }
  1580. } else {
  1581. i = STATE_OFFSET(state, state->mark[index]);
  1582. j = STATE_OFFSET(state, state->mark[index+1]);
  1583. }
  1584. return PySequence_GetSlice(string, i, j);
  1585. }
  1586. static void
  1587. pattern_error(int status)
  1588. {
  1589. switch (status) {
  1590. case SRE_ERROR_RECURSION_LIMIT:
  1591. PyErr_SetString(
  1592. PyExc_RuntimeError,
  1593. "maximum recursion limit exceeded"
  1594. );
  1595. break;
  1596. case SRE_ERROR_MEMORY:
  1597. PyErr_NoMemory();
  1598. break;
  1599. case SRE_ERROR_INTERRUPTED:
  1600. /* An exception has already been raised, so let it fly */
  1601. break;
  1602. default:
  1603. /* other error codes indicate compiler/engine bugs */
  1604. PyErr_SetString(
  1605. PyExc_RuntimeError,
  1606. "internal error in regular expression engine"
  1607. );
  1608. }
  1609. }
  1610. static void
  1611. pattern_dealloc(PatternObject* self)
  1612. {
  1613. if (self->weakreflist != NULL)
  1614. PyObject_ClearWeakRefs((PyObject *) self);
  1615. Py_XDECREF(self->pattern);
  1616. Py_XDECREF(self->groupindex);
  1617. Py_XDECREF(self->indexgroup);
  1618. PyObject_DEL(self);
  1619. }
  1620. static PyObject*
  1621. pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
  1622. {
  1623. SRE_STATE state;
  1624. int status;
  1625. PyObject* string;
  1626. Py_ssize_t start = 0;
  1627. Py_ssize_t end = PY_SSIZE_T_MAX;
  1628. static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
  1629. if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
  1630. &string, &start, &end))
  1631. return NULL;
  1632. string = state_init(&state, self, string, start, end);
  1633. if (!string)
  1634. return NULL;
  1635. state.ptr = state.start;
  1636. TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
  1637. if (state.charsize == 1) {
  1638. status = sre_match(&state, PatternObject_GetCode(self));
  1639. } else {
  1640. #if defined(HAVE_UNICODE)
  1641. status = sre_umatch(&state, PatternObject_GetCode(self));
  1642. #endif
  1643. }
  1644. TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
  1645. if (PyErr_Occurred())
  1646. return NULL;
  1647. state_fini(&state);
  1648. return pattern_new_match(self, &state, status);
  1649. }
  1650. static PyObject*
  1651. pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
  1652. {
  1653. SRE_STATE state;
  1654. int status;
  1655. PyObject* string;
  1656. Py_ssize_t start = 0;
  1657. Py_ssize_t end = PY_SSIZE_T_MAX;
  1658. static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
  1659. if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
  1660. &string, &start, &end))
  1661. return NULL;
  1662. string = state_init(&state, self, string, start, end);
  1663. if (!string)
  1664. return NULL;
  1665. TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
  1666. if (state.charsize == 1) {
  1667. status = sre_search(&state, PatternObject_GetCode(self));
  1668. } else {
  1669. #if defined(HAVE_UNICODE)
  1670. status = sre_usearch(&state, PatternObject_GetCode(self));
  1671. #endif
  1672. }
  1673. TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
  1674. state_fini(&state);
  1675. if (PyErr_Occurred())
  1676. return NULL;
  1677. return pattern_new_match(self, &state, status);
  1678. }
  1679. static PyObject*
  1680. call(char* module, char* function, PyObject* args)
  1681. {
  1682. PyObject* name;
  1683. PyObject* mod;
  1684. PyObject* func;
  1685. PyObject* result;
  1686. if (!args)
  1687. return NULL;
  1688. name = PyString_FromString(module);
  1689. if (!name)
  1690. return NULL;
  1691. mod = PyImport_Import(name);
  1692. Py_DECREF(name);
  1693. if (!mod)
  1694. return NULL;
  1695. func = PyObject_GetAttrString(mod, function);
  1696. Py_DECREF(mod);
  1697. if (!func)
  1698. return NULL;
  1699. result = PyObject_CallObject(func, args);
  1700. Py_DECREF(func);
  1701. Py_DECREF(args);
  1702. return result;
  1703. }
  1704. #ifdef USE_BUILTIN_COPY
  1705. static int
  1706. deepcopy(PyObject** object, PyObject* memo)
  1707. {
  1708. PyObject* copy;
  1709. copy = call(
  1710. "copy", "deepcopy",
  1711. PyTuple_Pack(2, *object, memo)
  1712. );
  1713. if (!copy)
  1714. return 0;
  1715. Py_DECREF(*object);
  1716. *object = copy;
  1717. return 1; /* success */
  1718. }
  1719. #endif
  1720. static PyObject*
  1721. join_list(PyObject* list, PyObject* string)
  1722. {
  1723. /* join list elements */
  1724. PyObject* joiner;
  1725. #if PY_VERSION_HEX >= 0x01060000
  1726. PyObject* function;
  1727. PyObject* args;
  1728. #endif
  1729. PyObject* result;
  1730. joiner = PySequence_GetSlice(string, 0, 0);
  1731. if (!joiner)
  1732. return NULL;
  1733. if (PyList_GET_SIZE(list) == 0) {
  1734. Py_DECREF(list);
  1735. return joiner;
  1736. }
  1737. #if PY_VERSION_HEX >= 0x01060000
  1738. function = PyObject_GetAttrString(joiner, "join");
  1739. if (!function) {
  1740. Py_DECREF(joiner);
  1741. return NULL;
  1742. }
  1743. args = PyTuple_New(1);
  1744. if (!args) {
  1745. Py_DECREF(function);
  1746. Py_DECREF(joiner);
  1747. return NULL;
  1748. }
  1749. PyTuple_SET_ITEM(args, 0, list);
  1750. result = PyObject_CallObject(function, args);
  1751. Py_DECREF(args); /* also removes list */
  1752. Py_DECREF(function);
  1753. #else
  1754. result = call(
  1755. "string", "join",
  1756. PyTuple_Pack(2, list, joiner)
  1757. );
  1758. #endif
  1759. Py_DECREF(joiner);
  1760. return result;
  1761. }
  1762. static PyObject*
  1763. pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
  1764. {
  1765. SRE_STATE state;
  1766. PyObject* list;
  1767. int status;
  1768. Py_ssize_t i, b, e;
  1769. PyObject* string;
  1770. Py_ssize_t start = 0;
  1771. Py_ssize_t end = PY_SSIZE_T_MAX;
  1772. static char* kwlist[] = { "source", "pos", "endpos", NULL };
  1773. if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
  1774. &string, &start, &end))
  1775. return NULL;
  1776. string = state_init(&state, self, string, start, end);
  1777. if (!string)
  1778. return NULL;
  1779. list = PyList_New(0);
  1780. if (!list) {
  1781. state_fini(&state);
  1782. return NULL;
  1783. }
  1784. while (state.start <= state.end) {
  1785. PyObject* item;
  1786. state_reset(&state);
  1787. state.ptr = state.start;
  1788. if (state.charsize == 1) {
  1789. status = sre_search(&state, PatternObject_GetCode(self));
  1790. } else {
  1791. #if defined(HAVE_UNICODE)
  1792. status = sre_usearch(&state, PatternObject_GetCode(self));
  1793. #endif
  1794. }
  1795. if (PyErr_Occurred())
  1796. goto error;
  1797. if (status <= 0) {
  1798. if (status == 0)
  1799. break;
  1800. pattern_error(status);
  1801. goto error;
  1802. }
  1803. /* don't bother to build a match object */
  1804. switch (self->groups) {
  1805. case 0:
  1806. b = STATE_OFFSET(&state, state.start);
  1807. e = STATE_OFFSET(&state, state.ptr);
  1808. item = PySequence_GetSlice(string, b, e);
  1809. if (!item)
  1810. goto error;
  1811. break;
  1812. case 1:
  1813. item = state_getslice(&state, 1, string, 1);
  1814. if (!item)
  1815. goto error;
  1816. break;
  1817. default:
  1818. item = PyTuple_New(self->groups);
  1819. if (!item)
  1820. goto error;
  1821. for (i = 0; i < self->groups; i++) {
  1822. PyObject* o = state_getslice(&state, i+1, string, 1);
  1823. if (!o) {
  1824. Py_DECREF(item);
  1825. goto error;
  1826. }
  1827. PyTuple_SET_ITEM(item, i, o);
  1828. }
  1829. break;
  1830. }
  1831. status = PyList_Append(list, item);
  1832. Py_DECREF(item);
  1833. if (status < 0)
  1834. goto error;
  1835. if (state.ptr == state.start)
  1836. state.start = (void*) ((char*) state.ptr + state.charsize);
  1837. else
  1838. state.start = state.ptr;
  1839. }
  1840. state_fini(&state);
  1841. return list;
  1842. error:
  1843. Py_DECREF(list);
  1844. state_fini(&state);
  1845. return NULL;
  1846. }
  #if PY_VERSION_HEX >= 0x02020000
  /* finditer(): obtain a scanner from pattern_scanner(), fetch its "search"
     attribute and wrap it with PyCallIter_New() using None as the sentinel.
     Returns the iterator, or NULL with an exception set. */
  static PyObject*
  pattern_finditer(PatternObject* pattern, PyObject* args)
  {
      PyObject* scanner;
      PyObject* search;
      PyObject* iterator = NULL;

      scanner = pattern_scanner(pattern, args);
      if (scanner != NULL) {
          search = PyObject_GetAttrString(scanner, "search");
          /* our scanner reference is no longer needed once "search" has
             been looked up */
          Py_DECREF(scanner);
          if (search != NULL) {
              iterator = PyCallIter_New(search, Py_None);
              Py_DECREF(search);
          }
      }

      return iterator;
  }
  #endif
  1866. static PyObject*
  1867. pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
  1868. {
  1869. SRE_STATE state;
  1870. PyObject* list;
  1871. PyObject* item;
  1872. int status;
  1873. Py_ssize_t n;
  1874. Py_ssize_t i;
  1875. void* last;
  1876. PyObject* string;
  1877. Py_ssize_t maxsplit = 0;
  1878. static char* kwlist[] = { "source", "maxsplit", NULL };
  1879. if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
  1880. &string, &maxsplit))
  1881. return NULL;
  1882. string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
  1883. if (!string)
  1884. return NULL;
  1885. list = PyList_New(0);
  1886. if (!list) {
  1887. state_fini(&state);
  1888. return NULL;
  1889. }
  1890. n = 0;
  1891. last = state.start;
  1892. while (!maxsplit || n < maxsplit) {
  1893. state_reset(&state);
  1894. state.ptr = state.start;
  1895. if (state.charsize == 1) {
  1896. status = sre_search(&state, PatternObject_GetCode(self));
  1897. } else {
  1898. #if defined(HAVE_UNICODE)
  1899. status = sre_usearch(&state, PatternObject_GetCode(self));
  1900. #endif
  1901. }
  1902. if (PyErr_Occurred())
  1903. goto error;
  1904. if (status <= 0) {
  1905. if (status == 0)
  1906. break;
  1907. pattern_error(status);
  1908. goto error;
  1909. }
  1910. if (state.start == state.ptr) {
  1911. if (last == state.end)
  1912. break;
  1913. /* skip one character */
  1914. state.start = (void*) ((char*) state.ptr + state.charsize);
  1915. continue;
  1916. }
  1917. /* get segment before this match */
  1918. item = PySequence_GetSlice(
  1919. string, STATE_OFFSET(&state, last),
  1920. STATE_OFFSET(&state, state.start)
  1921. );
  1922. if (!item)
  1923. goto error;
  1924. status = PyList_Append(list, item);
  1925. Py_DECREF(item);
  1926. if (status < 0)
  1927. goto error;
  1928. /* add groups (if any) */
  1929. for (i = 0; i < self->groups; i++) {
  1930. item = state_getslice(&state, i+1, string, 0);
  1931. if (!item)
  1932. goto error;
  1933. status = PyList_Append(list, item);
  1934. Py_DECREF(item);
  1935. if (status < 0)
  1936. goto error;
  1937. }
  1938. n = n + 1;
  1939. last = state.start = state.ptr;
  1940. }
  1941. /* get segment following last match (even if empty) */
  1942. item = PySequence_GetSlice(
  1943. string, STATE_OFFSET(&state, last), state.endpos
  1944. );
  1945. if (!item)
  1946. goto error;
  1947. status = PyList_Append(list, item);
  1948. Py_DECREF(item);
  1949. if (status < 0)
  1950. goto error;
  1951. state_fini(&state);
  1952. return list;
  1953. error:
  1954. Py_DECREF(list);
  1955. state_fini(&state);
  1956. return NULL;
  1957. }
  1958. static PyObject*
  1959. pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
  1960. Py_ssize_t count, Py_ssize_t subn)
  1961. {
  1962. SRE_STATE state;
  1963. PyObject* list;
  1964. PyObject* item;
  1965. PyObject* filter;
  1966. PyObject* args;
  1967. PyObject* match;
  1968. void* ptr;
  1969. int status;
  1970. Py_ssize_t n;
  1971. Py_ssize_t i, b, e;
  1972. int bint;
  1973. int filter_is_callable;
  1974. if (PyCallable_Check(ptemplate)) {
  1975. /* sub/subn takes either a function or a template */
  1976. filter = ptemplate;
  1977. Py_INCREF(filter);
  1978. filter_is_callable = 1;
  1979. } else {
  1980. /* if not callable, check if it's a literal string */
  1981. int literal;
  1982. ptr = getstring(ptemplate, &n, &bint);
  1983. b = bint;
  1984. if (ptr) {
  1985. if (b == 1) {
  1986. literal = sre_literal_template((unsigned char *)ptr, n);
  1987. } else {
  1988. #if defined(HAVE_UNICODE)
  1989. literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
  1990. #endif
  1991. }
  1992. } else {
  1993. PyErr_Clear();
  1994. literal = 0;
  1995. }
  1996. if (literal) {
  1997. filter = ptemplate;
  1998. Py_INCREF(filter);
  1999. filter_is_callable = 0;
  2000. } else {
  2001. /* not a literal; hand it over to the template compiler */
  2002. filter = call(
  2003. SRE_PY_MODULE, "_subx",
  2004. PyTuple_Pack(2, self, ptemplate)
  2005. );
  2006. if (!filter)
  2007. return NULL;
  2008. filter_is_callable = PyCallable_Check(filter);
  2009. }
  2010. }
  2011. string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
  2012. if (!string) {
  2013. Py_DECREF(filter);
  2014. return NULL;
  2015. }
  2016. list = PyList_New(0);
  2017. if (!list) {
  2018. Py_DECREF(filter);
  2019. state_fini(&state);
  2020. return NULL;
  2021. }
  2022. n = i = 0;
  2023. while (!count || n < count) {
  2024. state_reset(&state);
  2025. state.ptr = state.start;
  2026. if (state.charsize == 1) {
  2027. status = sre_search(&state, PatternObject_GetCode(self));
  2028. } else {
  2029. #if defined(HAVE_UNICODE)
  2030. status = sre_usearch(&state, PatternObject_GetCode(self));
  2031. #endif
  2032. }
  2033. if (PyErr_Occurred())
  2034. goto error;
  2035. if (status <= 0) {
  2036. if (status == 0)
  2037. break;
  2038. pattern_error(status);
  2039. goto error;
  2040. }
  2041. b = STATE_OFFSET(&state, state.start);
  2042. e = STATE_OFFSET(&state, state.ptr);
  2043. if (i < b) {
  2044. /* get segment before this match */
  2045. item = PySequence_GetSlice(string, i, b);
  2046. if (!item)
  2047. goto error;
  2048. status = PyList_Append(list, item);
  2049. Py_DECREF(item);
  2050. if (status < 0)
  2051. goto error;
  2052. } else if (i == b && i == e && n > 0)
  2053. /* ignore empty match on latest position */
  2054. goto next;
  2055. if (filter_is_callable) {
  2056. /* pass match object through filter */
  2057. match = pattern_new_match(self, &state, 1);
  2058. if (!match)
  2059. goto error;
  2060. args = PyTuple_Pack(1, match);
  2061. if (!args) {
  2062. Py_DECREF(match);
  2063. goto error;
  2064. }
  2065. item = PyObject_CallObject(filter, args);
  2066. Py_DECREF(args);
  2067. Py_DECREF(match);
  2068. if (!item)
  2069. goto error;
  2070. } else {
  2071. /* filter is literal string */
  2072. item = filter;
  2073. Py_INCREF(item);
  2074. }
  2075. /* add to list */
  2076. if (item != Py_None) {
  2077. status = PyList_Append(list, item);
  2078. Py_DECREF(item);
  2079. if (status < 0)
  2080. goto error;
  2081. }
  2082. i = e;
  2083. n = n + 1;
  2084. next:
  2085. /* move on */
  2086. if (state.ptr == state.start)
  2087. state.start = (void*) ((char*) state.ptr + state.charsize);
  2088. else
  2089. state.start = state.ptr;
  2090. }
  2091. /* get segment following last match */
  2092. if (i < state.endpos) {
  2093. item = PySequence_GetSlice(string, i, state.endpos);
  2094. if (!item)
  2095. goto error;
  2096. status = PyList_Append(list, item);
  2097. Py_DECREF(item);
  2098. if (status < 0)
  2099. goto error;
  2100. }
  2101. state_fini(&state);
  2102. Py_DECREF(filter);
  2103. /* convert list to single string (also removes list) */
  2104. item = join_list(list, string);
  2105. if (!item)
  2106. return NULL;
  2107. if (subn)
  2108. return Py_BuildValue("Ni", item, n);
  2109. return item;
  2110. error:
  2111. Py_DECREF(list);
  2112. state_fini(&state);
  2113. Py_DECREF(filter);
  2114. return NULL;
  2115. }
  2116. static PyObject*
  2117. pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
  2118. {
  2119. PyObject* ptemplate;
  2120. PyObject* string;
  2121. Py_ssize_t count = 0;
  2122. static char* kwlist[] = { "repl", "string", "count", NULL };
  2123. if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
  2124. &ptemplate, &string, &count))
  2125. return NULL;
  2126. return pattern_subx(self, ptemplate, string, count, 0);
  2127. }
  2128. static PyObject*
  2129. pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
  2130. {
  2131. PyObject* ptemplate;
  2132. PyObject* string;
  2133. Py_ssize_t count = 0;
  2134. static char* kwlist[] = { "repl", "string", "count", NULL };
  2135. if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
  2136. &ptemplate, &string, &count))
  2137. return NULL;
  2138. return pattern_subx(self, ptemplate, string, count, 1);
  2139. }
  2140. static PyObject*
  2141. pattern_copy(PatternObject* self, PyObject *unused)
  2142. {
  2143. #ifdef USE_BUILTIN_COPY
  2144. PatternObject* copy;
  2145. int offset;
  2146. copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
  2147. if (!copy)
  2148. return NULL;
  2149. offset = offsetof(PatternObject, groups);
  2150. Py_XINCREF(self->groupindex);
  2151. Py_XINCREF(self->indexgroup);
  2152. Py_XINCREF(self->pattern);
  2153. memcpy((char*) copy + offset, (char*) self + offset,
  2154. sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
  2155. copy->weakreflist = NULL;
  2156. return (PyObject*) copy;
  2157. #else
  2158. PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
  2159. return NULL;
  2160. #endif
  2161. }
  2162. static PyObject*
  2163. pattern_deepcopy(PatternObject* self, PyObject* memo)
  2164. {
  2165. #ifdef USE_BUILTIN_COPY
  2166. PatternObject* copy;
  2167. copy = (PatternObject*) pattern_copy(self);
  2168. if (!copy)
  2169. return NULL;
  2170. if (!deepcopy(&copy->groupindex, memo) ||
  2171. !deepcopy(&copy->indexgroup, memo) ||
  2172. !deepcopy(&copy->pattern, memo)) {
  2173. Py_DECREF(copy);
  2174. return NULL;
  2175. }
  2176. #else
  2177. PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
  2178. return NULL;
  2179. #endif
  2180. }
  /* Docstrings for the pattern object.  The per-method strings are wired up
     in pattern_methods[] and pattern_doc is used as the type's tp_doc.
     NOTE: the split() and findall() signatures below call the first
     argument "string", but the keyword actually accepted by those two
     functions is "source" (see their kwlist arrays). */
  2181. PyDoc_STRVAR(pattern_match_doc,
  2182. "match(string[, pos[, endpos]]) --> match object or None.\n\
  2183. Matches zero or more characters at the beginning of the string");
  2184. PyDoc_STRVAR(pattern_search_doc,
  2185. "search(string[, pos[, endpos]]) --> match object or None.\n\
  2186. Scan through string looking for a match, and return a corresponding\n\
  2187. MatchObject instance. Return None if no position in the string matches.");
  2188. PyDoc_STRVAR(pattern_split_doc,
  2189. "split(string[, maxsplit = 0]) --> list.\n\
  2190. Split string by the occurrences of pattern.");
  2191. PyDoc_STRVAR(pattern_findall_doc,
  2192. "findall(string[, pos[, endpos]]) --> list.\n\
  2193. Return a list of all non-overlapping matches of pattern in string.");
  2194. PyDoc_STRVAR(pattern_finditer_doc,
  2195. "finditer(string[, pos[, endpos]]) --> iterator.\n\
  2196. Return an iterator over all non-overlapping matches for the \n\
  2197. RE pattern in string. For each match, the iterator returns a\n\
  2198. match object.");
  2199. PyDoc_STRVAR(pattern_sub_doc,
  2200. "sub(repl, string[, count = 0]) --> newstring\n\
  2201. Return the string obtained by replacing the leftmost non-overlapping\n\
  2202. occurrences of pattern in string by the replacement repl.");
  2203. PyDoc_STRVAR(pattern_subn_doc,
  2204. "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
  2205. Return the tuple (new_string, number_of_subs_made) found by replacing\n\
  2206. the leftmost non-overlapping occurrences of pattern with the\n\
  2207. replacement repl.");
  2208. PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
  /* Method table for pattern objects.  match/search/sub/subn/split/findall
     accept keyword arguments (METH_VARARGS|METH_KEYWORDS); finditer and
     scanner are positional-only (METH_VARARGS).  The scanner, __copy__ and
     __deepcopy__ entries omit the docstring field. */
  2209. static PyMethodDef pattern_methods[] = {
  2210. {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
  2211. pattern_match_doc},
  2212. {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
  2213. pattern_search_doc},
  2214. {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
  2215. pattern_sub_doc},
  2216. {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
  2217. pattern_subn_doc},
  2218. {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
  2219. pattern_split_doc},
  2220. {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
  2221. pattern_findall_doc},
  /* finditer is only compiled in for Python 2.2 and later */
  2222. #if PY_VERSION_HEX >= 0x02020000
  2223. {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
  2224. pattern_finditer_doc},
  2225. #endif
  2226. {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
  2227. {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
  2228. {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
  2229. {NULL, NULL}
  2230. };
  /* Read-only attributes exposed directly from PatternObject fields;
     PAT_OFF(x) is the byte offset of field x inside PatternObject. */
  2231. #define PAT_OFF(x) offsetof(PatternObject, x)
  2232. static PyMemberDef pattern_members[] = {
  2233. {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
  2234. {"flags", T_INT, PAT_OFF(flags), READONLY},
  2235. {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
  2236. {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
  2237. {NULL} /* Sentinel */
  2238. };
  /* Type object for compiled patterns.  Instances are variable-sized: the
     basic size is sizeof(PatternObject) and each additional item is one
     SRE_CODE word.  Slots after tp_members are left out of the initializer
     and therefore zero. */
  2239. statichere PyTypeObject Pattern_Type = {
  2240. PyObject_HEAD_INIT(NULL)
  2241. 0, "_" SRE_MODULE ".SRE_Pattern",
  2242. sizeof(PatternObject), sizeof(SRE_CODE),
  2243. (destructor)pattern_dealloc, /*tp_dealloc*/
  2244. 0, /* tp_print */
  2245. 0, /* tp_getattr */
  2246. 0, /* tp_setattr */
  2247. 0, /* tp_compare */
  2248. 0, /* tp_repr */
  2249. 0, /* tp_as_number */
  2250. 0, /* tp_as_sequence */
  2251. 0, /* tp_as_mapping */
  2252. 0, /* tp_hash */
  2253. 0, /* tp_call */
  2254. 0, /* tp_str */
  2255. 0, /* tp_getattro */
  2256. 0, /* tp_setattro */
  2257. 0, /* tp_as_buffer */
  2258. Py_TPFLAGS_DEFAULT, /* tp_flags */
  2259. pattern_doc, /* tp_doc */
  2260. 0, /* tp_traverse */
  2261. 0, /* tp_clear */
  2262. 0, /* tp_richcompare */
  2263. offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
  2264. 0, /* tp_iter */
  2265. 0, /* tp_iternext */
  2266. pattern_methods, /* tp_methods */
  2267. pattern_members, /* tp_members */
  2268. };
  2269. static int _validate(PatternObject *self); /* Forward */
  2270. static PyObject *
  2271. _compile(PyObject* self_, PyObject* args)
  2272. {
  2273. /* "compile" pattern descriptor to pattern object */
  2274. PatternObject* self;
  2275. Py_ssize_t i, n;
  2276. PyObject* pattern;
  2277. int flags = 0;
  2278. PyObject* code;
  2279. Py_ssize_t groups = 0;
  2280. PyObject* groupindex = NULL;
  2281. PyObject* indexgroup = NULL;
  2282. if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
  2283. &PyList_Type, &code, &groups,
  2284. &groupindex, &indexgroup))
  2285. return NULL;
  2286. n = PyList_GET_SIZE(code);
  2287. /* coverity[ampersand_in_size] */
  2288. self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
  2289. if (!self)
  2290. return NULL;
  2291. self->weakreflist = NULL;
  2292. self->pattern = NULL;
  2293. self->groupindex = NULL;
  2294. self->indexgroup = NULL;
  2295. self->codesize = n;
  2296. for (i = 0; i < n; i++) {
  2297. PyObject *o = PyList_GET_ITEM(code, i);
  2298. unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
  2299. : PyLong_AsUnsignedLong(o);
  2300. self->code[i] = (SRE_CODE) value;
  2301. if ((unsigned long) self->code[i] != value) {
  2302. PyErr_SetString(PyExc_OverflowError,
  2303. "regular expression code size limit exceeded");
  2304. break;
  2305. }
  2306. }
  2307. if (PyErr_Occurred()) {
  2308. Py_DECREF(self);
  2309. return NULL;
  2310. }
  2311. Py_INCREF(pattern);
  2312. self->pattern = pattern;
  2313. self->flags = flags;
  2314. self->groups = groups;
  2315. Py_XINCREF(groupindex);
  2316. self->groupindex = groupindex;
  2317. Py_XINCREF(indexgroup);
  2318. self->indexgroup = indexgroup;
  2319. self->weakreflist = NULL;
  2320. if (!_validate(self)) {
  2321. Py_DECREF(self);
  2322. return NULL;
  2323. }
  2324. return (PyObject*) self;
  2325. }
  2326. /* -------------------------------------------------------------------- */
  2327. /* Code validation */
  /* To learn more about this code, have a look at the _compile() function in
     Lib/sre_compile.py.  The validation functions below check the code array
     for conformance with the code patterns generated there.

     The nice thing about the generated code is that it is position-independent:
     all jumps are relative jumps forward.  Also, jumps don't cross each other:
     the target of a later jump is always earlier than the target of an earlier
     jump.  IOW, this is okay:

     J---------J-------T--------T
      \         \_____/        /
       \______________________/

     but this is not:

     J---------J-------T--------T
      \_________\_____/        /
                 \____________/

     It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
     bytes wide (the latter if Python is compiled for "wide" unicode support).
  */
  2345. /* Defining this one enables tracing of the validator */
  2346. #undef VVERBOSE
  2347. /* Trace macro for the validator */
  2348. #if defined(VVERBOSE)
  2349. #define VTRACE(v) printf v
  2350. #else
  2351. #define VTRACE(v) do {} while(0) /* do nothing */
  2352. #endif
  2353. /* Report failure */
  /* FAIL returns 0 from the function it is expanded in, so it may only be
     used inside the int-returning validator functions. */
  2354. #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
  2355. /* Extract opcode, argument, or skip count from code array */
  /* The GET_* macros expect the enclosing function to have locals named
     "code" (read cursor) and "end" (one past the last valid word), plus
     "op", "arg" or "skip" respectively as the destination.  Each one FAILs
     if the cursor is already at or past "end". */
  2356. #define GET_OP \
  2357. do { \
  2358. VTRACE(("%p: ", code)); \
  2359. if (code >= end) FAIL; \
  2360. op = *code++; \
  2361. VTRACE(("%lu (op)\n", (unsigned long)op)); \
  2362. } while (0)
  2363. #define GET_ARG \
  2364. do { \
  2365. VTRACE(("%p= ", code)); \
  2366. if (code >= end) FAIL; \
  2367. arg = *code++; \
  2368. VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
  2369. } while (0)
  /* GET_SKIP_ADJ reads the skip word at the cursor, FAILs unless
     code+skip-adj lies within [code, end] (measured from the position of
     the skip word itself), and only then steps the cursor past the word. */
  2370. #define GET_SKIP_ADJ(adj) \
  2371. do { \
  2372. VTRACE(("%p= ", code)); \
  2373. if (code >= end) FAIL; \
  2374. skip = *code; \
  2375. VTRACE(("%lu (skip to %p)\n", \
  2376. (unsigned long)skip, code+skip)); \
  2377. if (code+skip-adj < code || code+skip-adj > end)\
  2378. FAIL; \
  2379. code++; \
  2380. } while (0)
  2381. #define GET_SKIP GET_SKIP_ADJ(0)
  2382. static int
  2383. _validate_charset(SRE_CODE *code, SRE_CODE *end)
  2384. {
  2385. /* Some variables are manipulated by the macros above */
  2386. SRE_CODE op;
  2387. SRE_CODE arg;
  2388. SRE_CODE offset;
  2389. int i;
  2390. while (code < end) {
  2391. GET_OP;
  2392. switch (op) {
  2393. case SRE_OP_NEGATE:
  2394. break;
  2395. case SRE_OP_LITERAL:
  2396. GET_ARG;
  2397. break;
  2398. case SRE_OP_RANGE:
  2399. GET_ARG;
  2400. GET_ARG;
  2401. break;
  2402. case SRE_OP_CHARSET:
  2403. offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
  2404. if (code+offset < code || code+offset > end)
  2405. FAIL;
  2406. code += offset;
  2407. break;
  2408. case SRE_OP_BIGCHARSET:
  2409. GET_ARG; /* Number of blocks */
  2410. offset = 256/sizeof(SRE_CODE); /* 256-byte table */
  2411. if (code+offset < code || code+offset > end)
  2412. FAIL;
  2413. /* Make sure that each byte points to a valid block */
  2414. for (i = 0; i < 256; i++) {
  2415. if (((unsigned char *)code)[i] >= arg)
  2416. FAIL;
  2417. }
  2418. code += offset;
  2419. offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
  2420. if (code+offset < code || code+offset > end)
  2421. FAIL;
  2422. code += offset;
  2423. break;
  2424. case SRE_OP_CATEGORY:
  2425. GET_ARG;
  2426. switch (arg) {
  2427. case SRE_CATEGORY_DIGIT:
  2428. case SRE_CATEGORY_NOT_DIGIT:
  2429. case SRE_CATEGORY_SPACE:
  2430. case SRE_CATEGORY_NOT_SPACE:
  2431. case SRE_CATEGORY_WORD:
  2432. case SRE_CATEGORY_NOT_WORD:
  2433. case SRE_CATEGORY_LINEBREAK:
  2434. case SRE_CATEGORY_NOT_LINEBREAK:
  2435. case SRE_CATEGORY_LOC_WORD:
  2436. case SRE_CATEGORY_LOC_NOT_WORD:
  2437. case SRE_CATEGORY_UNI_DIGIT:
  2438. case SRE_CATEGORY_UNI_NOT_DIGIT:
  2439. case SRE_CATEGORY_UNI_SPACE:
  2440. case SRE_CATEGORY_UNI_NOT_SPACE:
  2441. case SRE_CATEGORY_UNI_WORD:
  2442. case SRE_CATEGORY_UNI_NOT_WORD:
  2443. case SRE_CATEGORY_UNI_LINEBREAK:
  2444. case SRE_CATEGORY_UNI_NOT_LINEBREAK:
  2445. break;
  2446. default:
  2447. FAIL;
  2448. }
  2449. break;
  2450. default:
  2451. FAIL;
  2452. }
  2453. }
  2454. return 1;
  2455. }
  /* Validate the opcode sequence occupying the code words [code, end).

     "groups" bounds the operands that refer to groups: a MARK argument may
     be at most 2*groups+1 and a GROUPREF / GROUPREF_IGNORE /
     GROUPREF_EXISTS argument must be below groups.

     Operators that carry a nested sub-pattern (BRANCH alternatives,
     REPEAT / REPEAT_ONE / MIN_REPEAT_ONE bodies, GROUPREF_EXISTS parts and
     ASSERT / ASSERT_NOT bodies) are checked by calling this function
     recursively on the sub-range; charsets are delegated to
     _validate_charset().

     Returns 1 if every word validates and 0 otherwise; failures are
     reported through FAIL, which returns 0 from this function. */
  2456. static int
  2457. _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
  2458. {
  2459. /* Some variables are manipulated by the macros above */
  2460. SRE_CODE op;
  2461. SRE_CODE arg;
  2462. SRE_CODE skip;
  2463. VTRACE(("code=%p, end=%p\n", code, end));
  /* Callers compute "end" as code+skip-k; a too-small skip would place it
     before "code", which is rejected here. */
  2464. if (code > end)
  2465. FAIL;
  2466. while (code < end) {
  2467. GET_OP;
  2468. switch (op) {
  2469. case SRE_OP_MARK:
  2470. /* We don't check whether marks are properly nested; the
  2471. sre_match() code is robust even if they don't, and the worst
  2472. you can get is nonsensical match results. */
  2473. GET_ARG;
  2474. if (arg > 2*groups+1) {
  2475. VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
  2476. FAIL;
  2477. }
  2478. break;
  2479. case SRE_OP_LITERAL:
  2480. case SRE_OP_NOT_LITERAL:
  2481. case SRE_OP_LITERAL_IGNORE:
  2482. case SRE_OP_NOT_LITERAL_IGNORE:
  2483. GET_ARG;
  2484. /* The arg is just a character, nothing to check */
  2485. break;
  2486. case SRE_OP_SUCCESS:
  2487. case SRE_OP_FAILURE:
  2488. /* Nothing to check; these normally end the matching process */
  2489. break;
  2490. case SRE_OP_AT:
  2491. GET_ARG;
  2492. switch (arg) {
  2493. case SRE_AT_BEGINNING:
  2494. case SRE_AT_BEGINNING_STRING:
  2495. case SRE_AT_BEGINNING_LINE:
  2496. case SRE_AT_END:
  2497. case SRE_AT_END_LINE:
  2498. case SRE_AT_END_STRING:
  2499. case SRE_AT_BOUNDARY:
  2500. case SRE_AT_NON_BOUNDARY:
  2501. case SRE_AT_LOC_BOUNDARY:
  2502. case SRE_AT_LOC_NON_BOUNDARY:
  2503. case SRE_AT_UNI_BOUNDARY:
  2504. case SRE_AT_UNI_NON_BOUNDARY:
  2505. break;
  2506. default:
  2507. FAIL;
  2508. }
  2509. break;
  2510. case SRE_OP_ANY:
  2511. case SRE_OP_ANY_ALL:
  2512. /* These have no operands */
  2513. break;
  2514. case SRE_OP_IN:
  2515. case SRE_OP_IN_IGNORE:
  2516. GET_SKIP;
  2517. /* Stop 1 before the end; we check the FAILURE below */
  2518. if (!_validate_charset(code, code+skip-2))
  2519. FAIL;
  2520. if (code[skip-2] != SRE_OP_FAILURE)
  2521. FAIL;
  2522. code += skip-1;
  2523. break;
  2524. case SRE_OP_INFO:
  2525. {
  2526. /* A minimal info field is
  2527. <INFO> <1=skip> <2=flags> <3=min> <4=max>;
  2528. If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
  2529. more follows. */
  2530. SRE_CODE flags, i;
  2531. SRE_CODE *newcode;
  2532. GET_SKIP;
  /* newcode is the first word after the whole INFO block */
  2533. newcode = code+skip-1;
  2534. GET_ARG; flags = arg;
  2535. GET_ARG; /* min */
  2536. GET_ARG; /* max */
  2537. /* Check that only valid flags are present */
  2538. if ((flags & ~(SRE_INFO_PREFIX |
  2539. SRE_INFO_LITERAL |
  2540. SRE_INFO_CHARSET)) != 0)
  2541. FAIL;
  2542. /* PREFIX and CHARSET are mutually exclusive */
  2543. if ((flags & SRE_INFO_PREFIX) &&
  2544. (flags & SRE_INFO_CHARSET))
  2545. FAIL;
  2546. /* LITERAL implies PREFIX */
  2547. if ((flags & SRE_INFO_LITERAL) &&
  2548. !(flags & SRE_INFO_PREFIX))
  2549. FAIL;
  2550. /* Validate the prefix */
  2551. if (flags & SRE_INFO_PREFIX) {
  2552. SRE_CODE prefix_len;
  2553. GET_ARG; prefix_len = arg;
  2554. GET_ARG; /* prefix skip */
  2555. /* Here comes the prefix string */
  2556. if (code+prefix_len < code || code+prefix_len > newcode)
  2557. FAIL;
  2558. code += prefix_len;
  2559. /* And here comes the overlap table */
  2560. if (code+prefix_len < code || code+prefix_len > newcode)
  2561. FAIL;
  2562. /* Each overlap value should be < prefix_len */
  2563. for (i = 0; i < prefix_len; i++) {
  2564. if (code[i] >= prefix_len)
  2565. FAIL;
  2566. }
  2567. code += prefix_len;
  2568. }
  2569. /* Validate the charset */
  2570. if (flags & SRE_INFO_CHARSET) {
  2571. if (!_validate_charset(code, newcode-1))
  2572. FAIL;
  2573. if (newcode[-1] != SRE_OP_FAILURE)
  2574. FAIL;
  2575. code = newcode;
  2576. }
  /* without a charset the fields read so far must fill the block exactly */
  2577. else if (code != newcode) {
  2578. VTRACE(("code=%p, newcode=%p\n", code, newcode));
  2579. FAIL;
  2580. }
  2581. }
  2582. break;
  2583. case SRE_OP_BRANCH:
  2584. {
  /* target records where the first alternative's JUMP lands; every
     later alternative must land on the same word */
  2585. SRE_CODE *target = NULL;
  2586. for (;;) {
  2587. GET_SKIP;
  /* a zero skip terminates the list of alternatives */
  2588. if (skip == 0)
  2589. break;
  2590. /* Stop 2 before the end; we check the JUMP below */
  2591. if (!_validate_inner(code, code+skip-3, groups))
  2592. FAIL;
  2593. code += skip-3;
  2594. /* Check that it ends with a JUMP, and that each JUMP
  2595. has the same target */
  2596. GET_OP;
  2597. if (op != SRE_OP_JUMP)
  2598. FAIL;
  2599. GET_SKIP;
  2600. if (target == NULL)
  2601. target = code+skip-1;
  2602. else if (code+skip-1 != target)
  2603. FAIL;
  2604. }
  2605. }
  2606. break;
  2607. case SRE_OP_REPEAT_ONE:
  2608. case SRE_OP_MIN_REPEAT_ONE:
  2609. {
  2610. SRE_CODE min, max;
  2611. GET_SKIP;
  2612. GET_ARG; min = arg;
  2613. GET_ARG; max = arg;
  2614. if (min > max)
  2615. FAIL;
  2616. #ifdef Py_UNICODE_WIDE
  2617. if (max > 65535)
  2618. FAIL;
  2619. #endif
  /* the body must be followed by a SUCCESS word, checked below */
  2620. if (!_validate_inner(code, code+skip-4, groups))
  2621. FAIL;
  2622. code += skip-4;
  2623. GET_OP;
  2624. if (op != SRE_OP_SUCCESS)
  2625. FAIL;
  2626. }
  2627. break;
  2628. case SRE_OP_REPEAT:
  2629. {
  2630. SRE_CODE min, max;
  2631. GET_SKIP;
  2632. GET_ARG; min = arg;
  2633. GET_ARG; max = arg;
  2634. if (min > max)
  2635. FAIL;
  2636. #ifdef Py_UNICODE_WIDE
  2637. if (max > 65535)
  2638. FAIL;
  2639. #endif
  /* the body must be followed by MAX_UNTIL or MIN_UNTIL, checked below */
  2640. if (!_validate_inner(code, code+skip-3, groups))
  2641. FAIL;
  2642. code += skip-3;
  2643. GET_OP;
  2644. if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
  2645. FAIL;
  2646. }
  2647. break;
  2648. case SRE_OP_GROUPREF:
  2649. case SRE_OP_GROUPREF_IGNORE:
  2650. GET_ARG;
  2651. if (arg >= groups)
  2652. FAIL;
  2653. break;
  2654. case SRE_OP_GROUPREF_EXISTS:
  2655. /* The regex syntax for this is: '(?(group)then|else)', where
  2656. 'group' is either an integer group number or a group name,
  2657. 'then' and 'else' are sub-regexes, and 'else' is optional. */
  2658. GET_ARG;
  2659. if (arg >= groups)
  2660. FAIL;
  2661. GET_SKIP_ADJ(1);
  2662. code--; /* The skip is relative to the first arg! */
  2663. /* There are two possibilities here: if there is both a 'then'
  2664. part and an 'else' part, the generated code looks like:
  2665. GROUPREF_EXISTS
  2666. <group>
  2667. <skipyes>
  2668. ...then part...
  2669. JUMP
  2670. <skipno>
  2671. (<skipyes> jumps here)
  2672. ...else part...
  2673. (<skipno> jumps here)
  2674. If there is only a 'then' part, it looks like:
  2675. GROUPREF_EXISTS
  2676. <group>
  2677. <skip>
  2678. ...then part...
  2679. (<skip> jumps here)
  2680. There is no direct way to decide which it is, and we don't want
  2681. to allow arbitrary jumps anywhere in the code; so we just look
  2682. for a JUMP opcode preceding our skip target.
  2683. */
  2684. if (skip >= 3 && code+skip-3 >= code &&
  2685. code[skip-3] == SRE_OP_JUMP)
  2686. {
  2687. VTRACE(("both then and else parts present\n"));
  2688. if (!_validate_inner(code+1, code+skip-3, groups))
  2689. FAIL;
  2690. code += skip-2; /* Position after JUMP, at <skipno> */
  2691. GET_SKIP;
  2692. if (!_validate_inner(code, code+skip-1, groups))
  2693. FAIL;
  2694. code += skip-1;
  2695. }
  2696. else {
  2697. VTRACE(("only a then part present\n"));
  2698. if (!_validate_inner(code+1, code+skip-1, groups))
  2699. FAIL;
  2700. code += skip-1;
  2701. }
  2702. break;
  2703. case SRE_OP_ASSERT:
  2704. case SRE_OP_ASSERT_NOT:
  2705. GET_SKIP;
  2706. GET_ARG; /* 0 for lookahead, width for lookbehind */
  2707. code--; /* Back up over arg to simplify math below */
  2708. if (arg & 0x80000000)
  2709. FAIL; /* Width too large */
  2710. /* Stop 1 before the end; we check the SUCCESS below */
  2711. if (!_validate_inner(code+1, code+skip-2, groups))
  2712. FAIL;
  2713. code += skip-2;
  2714. GET_OP;
  2715. if (op != SRE_OP_SUCCESS)
  2716. FAIL;
  2717. break;
  /* any opcode not listed above is rejected */
  2718. default:
  2719. FAIL;
  2720. }
  2721. }
  2722. VTRACE(("okay\n"));
  2723. return 1;
  2724. }
  2725. static int
  2726. _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
  2727. {
  2728. if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
  2729. FAIL;
  2730. if (groups == 0) /* fix for simplejson */
  2731. groups = 100; /* 100 groups should always be safe */
  2732. return _validate_inner(code, end-1, groups);
  2733. }
  2734. static int
  2735. _validate(PatternObject *self)
  2736. {
  2737. if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
  2738. {
  2739. PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
  2740. return 0;
  2741. }
  2742. else
  2743. VTRACE(("Success!\n"));
  2744. return 1;
  2745. }
  2746. /* -------------------------------------------------------------------- */
  2747. /* match methods */
  2748. static void
  2749. match_dealloc(MatchObject* self)
  2750. {
  2751. Py_XDECREF(self->regs);
  2752. Py_XDECREF(self->string);
  2753. Py_DECREF(self->pattern);
  2754. PyObject_DEL(self);
  2755. }
  2756. static PyObject*
  2757. match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
  2758. {
  2759. if (index < 0 || index >= self->groups) {
  2760. /* raise IndexError if we were given a bad group number */
  2761. PyErr_SetString(
  2762. PyExc_IndexError,
  2763. "no such group"
  2764. );
  2765. return NULL;
  2766. }
  2767. index *= 2;
  2768. if (self->string == Py_None || self->mark[index] < 0) {
  2769. /* return default value if the string or group is undefined */
  2770. Py_INCREF(def);
  2771. return def;
  2772. }
  2773. return PySequence_GetSlice(
  2774. self->string, self->mark[index], self->mark[index+1]
  2775. );
  2776. }
  2777. static Py_ssize_t
  2778. match_getindex(MatchObject* self, PyObject* index)
  2779. {
  2780. Py_ssize_t i;
  2781. if (PyInt_Check(index))
  2782. return PyInt_AsSsize_t(index);
  2783. i = -1;
  2784. if (self->pattern->groupindex) {
  2785. index = PyObject_GetItem(self->pattern->groupindex, index);
  2786. if (index) {
  2787. if (PyInt_Check(index) || PyLong_Check(index))
  2788. i = PyInt_AsSsize_t(index);
  2789. Py_DECREF(index);
  2790. } else
  2791. PyErr_Clear();
  2792. }
  2793. return i;
  2794. }
  2795. static PyObject*
  2796. match_getslice(MatchObject* self, PyObject* index, PyObject* def)
  2797. {
  2798. return match_getslice_by_index(self, match_getindex(self, index), def);
  2799. }
  2800. static PyObject*
  2801. match_expand(MatchObject* self, PyObject* ptemplate)
  2802. {
  2803. /* delegate to Python code */
  2804. return call(
  2805. SRE_PY_MODULE, "_expand",
  2806. PyTuple_Pack(3, self->pattern, self, ptemplate)
  2807. );
  2808. }
  2809. static PyObject*
  2810. match_group(MatchObject* self, PyObject* args)
  2811. {
  2812. PyObject* result;
  2813. Py_ssize_t i, size;
  2814. size = PyTuple_GET_SIZE(args);
  2815. switch (size) {
  2816. case 0:
  2817. result = match_getslice(self, Py_False, Py_None);
  2818. break;
  2819. case 1:
  2820. result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
  2821. break;
  2822. default:
  2823. /* fetch multiple items */
  2824. result = PyTuple_New(size);
  2825. if (!result)
  2826. return NULL;
  2827. for (i = 0; i < size; i++) {
  2828. PyObject* item = match_getslice(
  2829. self, PyTuple_GET_ITEM(args, i), Py_None
  2830. );
  2831. if (!item) {
  2832. Py_DECREF(result);
  2833. return NULL;
  2834. }
  2835. PyTuple_SET_ITEM(result, i, item);
  2836. }
  2837. break;
  2838. }
  2839. return result;
  2840. }
  2841. static PyObject*
  2842. match_groups(MatchObject* self, PyObject* args, PyObject* kw)
  2843. {
  2844. PyObject* result;
  2845. Py_ssize_t index;
  2846. PyObject* def = Py_None;
  2847. static char* kwlist[] = { "default", NULL };
  2848. if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
  2849. return NULL;
  2850. result = PyTuple_New(self->groups-1);
  2851. if (!result)
  2852. return NULL;
  2853. for (index = 1; index < self->groups; index++) {
  2854. PyObject* item;
  2855. item = match_getslice_by_index(self, index, def);
  2856. if (!item) {
  2857. Py_DECREF(result);
  2858. return NULL;
  2859. }
  2860. PyTuple_SET_ITEM(result, index-1, item);
  2861. }
  2862. return result;
  2863. }
  2864. static PyObject*
  2865. match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
  2866. {
  2867. PyObject* result;
  2868. PyObject* keys;
  2869. Py_ssize_t index;
  2870. PyObject* def = Py_None;
  2871. static char* kwlist[] = { "default", NULL };
  2872. if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
  2873. return NULL;
  2874. result = PyDict_New();
  2875. if (!result || !self->pattern->groupindex)
  2876. return result;
  2877. keys = PyMapping_Keys(self->pattern->groupindex);
  2878. if (!keys)
  2879. goto failed;
  2880. for (index = 0; index < PyList_GET_SIZE(keys); index++) {
  2881. int status;
  2882. PyObject* key;
  2883. PyObject* value;
  2884. key = PyList_GET_ITEM(keys, index);
  2885. if (!key)
  2886. goto failed;
  2887. value = match_getslice(self, key, def);
  2888. if (!value) {
  2889. Py_DECREF(key);
  2890. goto failed;
  2891. }
  2892. status = PyDict_SetItem(result, key, value);
  2893. Py_DECREF(value);
  2894. if (status < 0)
  2895. goto failed;
  2896. }
  2897. Py_DECREF(keys);
  2898. return result;
  2899. failed:
  2900. Py_XDECREF(keys);
  2901. Py_DECREF(result);
  2902. return NULL;
  2903. }
  2904. static PyObject*
  2905. match_start(MatchObject* self, PyObject* args)
  2906. {
  2907. Py_ssize_t index;
  2908. PyObject* index_ = Py_False; /* zero */
  2909. if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
  2910. return NULL;
  2911. index = match_getindex(self, index_);
  2912. if (index < 0 || index >= self->groups) {
  2913. PyErr_SetString(
  2914. PyExc_IndexError,
  2915. "no such group"
  2916. );
  2917. return NULL;
  2918. }
  2919. /* mark is -1 if group is undefined */
  2920. return Py_BuildValue("i", self->mark[index*2]);
  2921. }
  2922. static PyObject*
  2923. match_end(MatchObject* self, PyObject* args)
  2924. {
  2925. Py_ssize_t index;
  2926. PyObject* index_ = Py_False; /* zero */
  2927. if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
  2928. return NULL;
  2929. index = match_getindex(self, index_);
  2930. if (index < 0 || index >= self->groups) {
  2931. PyErr_SetString(
  2932. PyExc_IndexError,
  2933. "no such group"
  2934. );
  2935. return NULL;
  2936. }
  2937. /* mark is -1 if group is undefined */
  2938. return Py_BuildValue("i", self->mark[index*2+1]);
  2939. }
  2940. LOCAL(PyObject*)
  2941. _pair(Py_ssize_t i1, Py_ssize_t i2)
  2942. {
  2943. PyObject* pair;
  2944. PyObject* item;
  2945. pair = PyTuple_New(2);
  2946. if (!pair)
  2947. return NULL;
  2948. item = PyInt_FromSsize_t(i1);
  2949. if (!item)
  2950. goto error;
  2951. PyTuple_SET_ITEM(pair, 0, item);
  2952. item = PyInt_FromSsize_t(i2);
  2953. if (!item)
  2954. goto error;
  2955. PyTuple_SET_ITEM(pair, 1, item);
  2956. return pair;
  2957. error:
  2958. Py_DECREF(pair);
  2959. return NULL;
  2960. }
  2961. static PyObject*
  2962. match_span(MatchObject* self, PyObject* args)
  2963. {
  2964. Py_ssize_t index;
  2965. PyObject* index_ = Py_False; /* zero */
  2966. if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
  2967. return NULL;
  2968. index = match_getindex(self, index_);
  2969. if (index < 0 || index >= self->groups) {
  2970. PyErr_SetString(
  2971. PyExc_IndexError,
  2972. "no such group"
  2973. );
  2974. return NULL;
  2975. }
  2976. /* marks are -1 if group is undefined */
  2977. return _pair(self->mark[index*2], self->mark[index*2+1]);
  2978. }
  2979. static PyObject*
  2980. match_regs(MatchObject* self)
  2981. {
  2982. PyObject* regs;
  2983. PyObject* item;
  2984. Py_ssize_t index;
  2985. regs = PyTuple_New(self->groups);
  2986. if (!regs)
  2987. return NULL;
  2988. for (index = 0; index < self->groups; index++) {
  2989. item = _pair(self->mark[index*2], self->mark[index*2+1]);
  2990. if (!item) {
  2991. Py_DECREF(regs);
  2992. return NULL;
  2993. }
  2994. PyTuple_SET_ITEM(regs, index, item);
  2995. }
  2996. Py_INCREF(regs);
  2997. self->regs = regs;
  2998. return regs;
  2999. }
  3000. static PyObject*
  3001. match_copy(MatchObject* self, PyObject *unused)
  3002. {
  3003. #ifdef USE_BUILTIN_COPY
  3004. MatchObject* copy;
  3005. Py_ssize_t slots, offset;
  3006. slots = 2 * (self->pattern->groups+1);
  3007. copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
  3008. if (!copy)
  3009. return NULL;
  3010. /* this value a constant, but any compiler should be able to
  3011. figure that out all by itself */
  3012. offset = offsetof(MatchObject, string);
  3013. Py_XINCREF(self->pattern);
  3014. Py_XINCREF(self->string);
  3015. Py_XINCREF(self->regs);
  3016. memcpy((char*) copy + offset, (char*) self + offset,
  3017. sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
  3018. return (PyObject*) copy;
  3019. #else
  3020. PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
  3021. return NULL;
  3022. #endif
  3023. }
  3024. static PyObject*
  3025. match_deepcopy(MatchObject* self, PyObject* memo)
  3026. {
  3027. #ifdef USE_BUILTIN_COPY
  3028. MatchObject* copy;
  3029. copy = (MatchObject*) match_copy(self);
  3030. if (!copy)
  3031. return NULL;
  3032. if (!deepcopy((PyObject**) &copy->pattern, memo) ||
  3033. !deepcopy(&copy->string, memo) ||
  3034. !deepcopy(&copy->regs, memo)) {
  3035. Py_DECREF(copy);
  3036. return NULL;
  3037. }
  3038. #else
  3039. PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
  3040. return NULL;
  3041. #endif
  3042. }
  3043. static struct PyMethodDef match_methods[] = {
  3044. {"group", (PyCFunction) match_group, METH_VARARGS},
  3045. {"start", (PyCFunction) match_start, METH_VARARGS},
  3046. {"end", (PyCFunction) match_end, METH_VARARGS},
  3047. {"span", (PyCFunction) match_span, METH_VARARGS},
  3048. {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
  3049. {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
  3050. {"expand", (PyCFunction) match_expand, METH_O},
  3051. {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
  3052. {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
  3053. {NULL, NULL}
  3054. };
  3055. static PyObject *
  3056. match_lastindex_get(MatchObject *self)
  3057. {
  3058. if (self->lastindex >= 0)
  3059. return Py_BuildValue("i", self->lastindex);
  3060. Py_INCREF(Py_None);
  3061. return Py_None;
  3062. }
  3063. static PyObject *
  3064. match_lastgroup_get(MatchObject *self)
  3065. {
  3066. if (self->pattern->indexgroup && self->lastindex >= 0) {
  3067. PyObject* result = PySequence_GetItem(
  3068. self->pattern->indexgroup, self->lastindex
  3069. );
  3070. if (result)
  3071. return result;
  3072. PyErr_Clear();
  3073. }
  3074. Py_INCREF(Py_None);
  3075. return Py_None;
  3076. }
  3077. static PyObject *
  3078. match_regs_get(MatchObject *self)
  3079. {
  3080. if (self->regs) {
  3081. Py_INCREF(self->regs);
  3082. return self->regs;
  3083. } else
  3084. return match_regs(self);
  3085. }
  3086. static PyGetSetDef match_getset[] = {
  3087. {"lastindex", (getter)match_lastindex_get, (setter)NULL},
  3088. {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
  3089. {"regs", (getter)match_regs_get, (setter)NULL},
  3090. {NULL}
  3091. };
  3092. #define MATCH_OFF(x) offsetof(MatchObject, x)
  3093. static PyMemberDef match_members[] = {
  3094. {"string", T_OBJECT, MATCH_OFF(string), READONLY},
  3095. {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
  3096. {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
  3097. {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
  3098. {NULL}
  3099. };
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
  3102. static PyTypeObject Match_Type = {
  3103. PyVarObject_HEAD_INIT(NULL, 0)
  3104. "_" SRE_MODULE ".SRE_Match",
  3105. sizeof(MatchObject), sizeof(Py_ssize_t),
  3106. (destructor)match_dealloc, /* tp_dealloc */
  3107. 0, /* tp_print */
  3108. 0, /* tp_getattr */
  3109. 0, /* tp_setattr */
  3110. 0, /* tp_compare */
  3111. 0, /* tp_repr */
  3112. 0, /* tp_as_number */
  3113. 0, /* tp_as_sequence */
  3114. 0, /* tp_as_mapping */
  3115. 0, /* tp_hash */
  3116. 0, /* tp_call */
  3117. 0, /* tp_str */
  3118. 0, /* tp_getattro */
  3119. 0, /* tp_setattro */
  3120. 0, /* tp_as_buffer */
  3121. Py_TPFLAGS_DEFAULT,
  3122. 0, /* tp_doc */
  3123. 0, /* tp_traverse */
  3124. 0, /* tp_clear */
  3125. 0, /* tp_richcompare */
  3126. 0, /* tp_weaklistoffset */
  3127. 0, /* tp_iter */
  3128. 0, /* tp_iternext */
  3129. match_methods, /* tp_methods */
  3130. match_members, /* tp_members */
  3131. match_getset, /* tp_getset */
  3132. };
  3133. static PyObject*
  3134. pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
  3135. {
  3136. /* create match object (from state object) */
  3137. MatchObject* match;
  3138. Py_ssize_t i, j;
  3139. char* base;
  3140. int n;
  3141. if (status > 0) {
  3142. /* create match object (with room for extra group marks) */
  3143. /* coverity[ampersand_in_size] */
  3144. match = PyObject_NEW_VAR(MatchObject, &Match_Type,
  3145. 2*(pattern->groups+1));
  3146. if (!match)
  3147. return NULL;
  3148. Py_INCREF(pattern);
  3149. match->pattern = pattern;
  3150. Py_INCREF(state->string);
  3151. match->string = state->string;
  3152. match->regs = NULL;
  3153. match->groups = pattern->groups+1;
  3154. /* fill in group slices */
  3155. base = (char*) state->beginning;
  3156. n = state->charsize;
  3157. match->mark[0] = ((char*) state->start - base) / n;
  3158. match->mark[1] = ((char*) state->ptr - base) / n;
  3159. for (i = j = 0; i < pattern->groups; i++, j+=2)
  3160. if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
  3161. match->mark[j+2] = ((char*) state->mark[j] - base) / n;
  3162. match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
  3163. } else
  3164. match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
  3165. match->pos = state->pos;
  3166. match->endpos = state->endpos;
  3167. match->lastindex = state->lastindex;
  3168. return (PyObject*) match;
  3169. } else if (status == 0) {
  3170. /* no match */
  3171. Py_INCREF(Py_None);
  3172. return Py_None;
  3173. }
  3174. /* internal error */
  3175. pattern_error(status);
  3176. return NULL;
  3177. }
  3178. /* -------------------------------------------------------------------- */
  3179. /* scanner methods (experimental) */
  3180. static void
  3181. scanner_dealloc(ScannerObject* self)
  3182. {
  3183. state_fini(&self->state);
  3184. Py_XDECREF(self->pattern);
  3185. PyObject_DEL(self);
  3186. }
  3187. static PyObject*
  3188. scanner_match(ScannerObject* self, PyObject *unused)
  3189. {
  3190. SRE_STATE* state = &self->state;
  3191. PyObject* match;
  3192. int status;
  3193. state_reset(state);
  3194. state->ptr = state->start;
  3195. if (state->charsize == 1) {
  3196. status = sre_match(state, PatternObject_GetCode(self->pattern));
  3197. } else {
  3198. #if defined(HAVE_UNICODE)
  3199. status = sre_umatch(state, PatternObject_GetCode(self->pattern));
  3200. #endif
  3201. }
  3202. if (PyErr_Occurred())
  3203. return NULL;
  3204. match = pattern_new_match((PatternObject*) self->pattern,
  3205. state, status);
  3206. if (status == 0 || state->ptr == state->start)
  3207. state->start = (void*) ((char*) state->ptr + state->charsize);
  3208. else
  3209. state->start = state->ptr;
  3210. return match;
  3211. }
  3212. static PyObject*
  3213. scanner_search(ScannerObject* self, PyObject *unused)
  3214. {
  3215. SRE_STATE* state = &self->state;
  3216. PyObject* match;
  3217. int status;
  3218. state_reset(state);
  3219. state->ptr = state->start;
  3220. if (state->charsize == 1) {
  3221. status = sre_search(state, PatternObject_GetCode(self->pattern));
  3222. } else {
  3223. #if defined(HAVE_UNICODE)
  3224. status = sre_usearch(state, PatternObject_GetCode(self->pattern));
  3225. #endif
  3226. }
  3227. if (PyErr_Occurred())
  3228. return NULL;
  3229. match = pattern_new_match((PatternObject*) self->pattern,
  3230. state, status);
  3231. if (status == 0 || state->ptr == state->start)
  3232. state->start = (void*) ((char*) state->ptr + state->charsize);
  3233. else
  3234. state->start = state->ptr;
  3235. return match;
  3236. }
  3237. static PyMethodDef scanner_methods[] = {
  3238. {"match", (PyCFunction) scanner_match, METH_NOARGS},
  3239. {"search", (PyCFunction) scanner_search, METH_NOARGS},
  3240. {NULL, NULL}
  3241. };
  3242. #define SCAN_OFF(x) offsetof(ScannerObject, x)
  3243. static PyMemberDef scanner_members[] = {
  3244. {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
  3245. {NULL} /* Sentinel */
  3246. };
  3247. statichere PyTypeObject Scanner_Type = {
  3248. PyObject_HEAD_INIT(NULL)
  3249. 0, "_" SRE_MODULE ".SRE_Scanner",
  3250. sizeof(ScannerObject), 0,
  3251. (destructor)scanner_dealloc, /*tp_dealloc*/
  3252. 0, /* tp_print */
  3253. 0, /* tp_getattr */
  3254. 0, /* tp_setattr */
  3255. 0, /* tp_reserved */
  3256. 0, /* tp_repr */
  3257. 0, /* tp_as_number */
  3258. 0, /* tp_as_sequence */
  3259. 0, /* tp_as_mapping */
  3260. 0, /* tp_hash */
  3261. 0, /* tp_call */
  3262. 0, /* tp_str */
  3263. 0, /* tp_getattro */
  3264. 0, /* tp_setattro */
  3265. 0, /* tp_as_buffer */
  3266. Py_TPFLAGS_DEFAULT, /* tp_flags */
  3267. 0, /* tp_doc */
  3268. 0, /* tp_traverse */
  3269. 0, /* tp_clear */
  3270. 0, /* tp_richcompare */
  3271. 0, /* tp_weaklistoffset */
  3272. 0, /* tp_iter */
  3273. 0, /* tp_iternext */
  3274. scanner_methods, /* tp_methods */
  3275. scanner_members, /* tp_members */
  3276. 0, /* tp_getset */
  3277. };
  3278. static PyObject*
  3279. pattern_scanner(PatternObject* pattern, PyObject* args)
  3280. {
  3281. /* create search state object */
  3282. ScannerObject* self;
  3283. PyObject* string;
  3284. Py_ssize_t start = 0;
  3285. Py_ssize_t end = PY_SSIZE_T_MAX;
  3286. if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
  3287. return NULL;
  3288. /* create scanner object */
  3289. self = PyObject_NEW(ScannerObject, &Scanner_Type);
  3290. if (!self)
  3291. return NULL;
  3292. self->pattern = NULL;
  3293. string = state_init(&self->state, pattern, string, start, end);
  3294. if (!string) {
  3295. Py_DECREF(self);
  3296. return NULL;
  3297. }
  3298. Py_INCREF(pattern);
  3299. self->pattern = (PyObject*) pattern;
  3300. return (PyObject*) self;
  3301. }
  3302. static PyMethodDef _functions[] = {
  3303. {"compile", _compile, METH_VARARGS},
  3304. {"getcodesize", sre_codesize, METH_NOARGS},
  3305. {"getlower", sre_getlower, METH_VARARGS},
  3306. {NULL, NULL}
  3307. };
  3308. #if PY_VERSION_HEX < 0x02030000
  3309. DL_EXPORT(void) init_sre(void)
  3310. #else
  3311. PyMODINIT_FUNC init_sre(void)
  3312. #endif
  3313. {
  3314. PyObject* m;
  3315. PyObject* d;
  3316. PyObject* x;
  3317. /* Patch object types */
  3318. if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
  3319. PyType_Ready(&Scanner_Type))
  3320. return;
  3321. m = Py_InitModule("_" SRE_MODULE, _functions);
  3322. if (m == NULL)
  3323. return;
  3324. d = PyModule_GetDict(m);
  3325. x = PyInt_FromLong(SRE_MAGIC);
  3326. if (x) {
  3327. PyDict_SetItemString(d, "MAGIC", x);
  3328. Py_DECREF(x);
  3329. }
  3330. x = PyInt_FromLong(sizeof(SRE_CODE));
  3331. if (x) {
  3332. PyDict_SetItemString(d, "CODESIZE", x);
  3333. Py_DECREF(x);
  3334. }
  3335. x = PyString_FromString(copyright);
  3336. if (x) {
  3337. PyDict_SetItemString(d, "copyright", x);
  3338. Py_DECREF(x);
  3339. }
  3340. }
  3341. #endif /* !defined(SRE_RECURSIVE) */
  3342. /* vim:ts=4:sw=4:et
  3343. */