PageRenderTime 59ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/pypy/rlib/rsre/rsre_core.py

https://bitbucket.org/dac_io/pypy
Python | 1116 lines | 999 code | 51 blank | 66 comment | 105 complexity | b61366500195088e7b52a0dd7a66c752 MD5 | raw file
  1. import sys
  2. from pypy.rlib.debug import check_nonneg
  3. from pypy.rlib.unroll import unrolling_iterable
  4. from pypy.rlib.rsre import rsre_char
  5. from pypy.tool.sourcetools import func_with_new_name
  6. from pypy.rlib.objectmodel import we_are_translated
  7. from pypy.rlib import jit
  8. from pypy.rlib.rsre.rsre_jit import install_jitdriver, install_jitdriver_spec
# Opcode numbers dispatched on by sre_match().  Entries that are commented
# out are not defined as names here; they are kept so that the gaps in the
# numbering stay visible.
# NOTE(review): presumably these values must match the numbering emitted by
# the regex compiler -- confirm before changing any of them.
OPCODE_FAILURE = 0
OPCODE_SUCCESS = 1
OPCODE_ANY = 2
OPCODE_ANY_ALL = 3
OPCODE_ASSERT = 4
OPCODE_ASSERT_NOT = 5
OPCODE_AT = 6
OPCODE_BRANCH = 7
#OPCODE_CALL = 8
OPCODE_CATEGORY = 9
#OPCODE_CHARSET = 10
#OPCODE_BIGCHARSET = 11
OPCODE_GROUPREF = 12
OPCODE_GROUPREF_EXISTS = 13
OPCODE_GROUPREF_IGNORE = 14
OPCODE_IN = 15
OPCODE_IN_IGNORE = 16
OPCODE_INFO = 17
OPCODE_JUMP = 18
OPCODE_LITERAL = 19
OPCODE_LITERAL_IGNORE = 20
OPCODE_MARK = 21
OPCODE_MAX_UNTIL = 22
OPCODE_MIN_UNTIL = 23
OPCODE_NOT_LITERAL = 24
OPCODE_NOT_LITERAL_IGNORE = 25
#OPCODE_NEGATE = 26
#OPCODE_RANGE = 27
OPCODE_REPEAT = 28
OPCODE_REPEAT_ONE = 29
#OPCODE_SUBPATTERN = 30
OPCODE_MIN_REPEAT_ONE = 31
  41. # ____________________________________________________________
# Registry of the '_spec_*' method names already handed out, so that two
# decorated functions sharing a func_name still get distinct names.
_seen_specname = {}

def specializectx(func):
    """A decorator that specializes 'func(ctx,...)' for each concrete subclass
    of AbstractMatchContext. During annotation, if 'ctx' is known to be a
    specific subclass, calling 'func' is a direct call; if 'ctx' is only known
    to be of class AbstractMatchContext, calling 'func' is an indirect call.
    """
    # The decorated function must take the match context as first argument.
    assert func.func_code.co_varnames[0] == 'ctx'
    specname = '_spec_' + func.func_name
    # Append underscores until the name is unique among decorated functions.
    while specname in _seen_specname:
        specname += '_'
    _seen_specname[specname] = True
    # Install a copy of the function under the name '_spec_funcname' in each
    # concrete subclass
    specialized_methods = []
    for prefix, concreteclass in [('str', StrMatchContext),
                                  ('uni', UnicodeMatchContext)]:
        # Each copy gets its own function name ('str_spec_...' or
        # 'uni_spec_...') but is stored under the shared attribute 'specname'.
        newfunc = func_with_new_name(func, prefix + specname)
        assert not hasattr(concreteclass, specname)
        setattr(concreteclass, specname, newfunc)
        specialized_methods.append(newfunc)
    # Return a dispatcher function, specialized on the exact type of 'ctx'
    def dispatch(ctx, *args):
        return getattr(ctx, specname)(*args)
    dispatch._annspecialcase_ = 'specialize:argtype(0)'
    # Keep the per-class copies reachable from the dispatcher.
    dispatch._specialized_methods_ = specialized_methods
    return func_with_new_name(dispatch, specname)
  69. # ____________________________________________________________
class Error(Exception):
    # Raised by the matcher on malformed pattern code (see the
    # 'raise Error(...)' sites in this module).  The text is kept in 'msg';
    # Exception.__init__ is not called, so it is not stored in 'args'.
    def __init__(self, msg):
        self.msg = msg
class AbstractMatchContext(object):
    """Abstract base class"""
    # 'pattern' (and its items), 'flags' and 'end' are declared as never
    # changing after construction.
    _immutable_fields_ = ['pattern[*]', 'flags', 'end']
    # Class-level defaults for the per-match state.
    match_start = 0
    match_end = 0
    match_marks = None
    match_marks_flat = None

    def __init__(self, pattern, match_start, end, flags):
        # 'match_start' and 'end' must be known to be non-negative
        # and they must not be more than len(string).
        check_nonneg(match_start)
        check_nonneg(end)
        self.pattern = pattern
        self.match_start = match_start
        self.end = end
        self.flags = flags

    def reset(self, start):
        # Prepare for a new match attempt at 'start': drop the recorded
        # marks and the cached flattened marks.
        self.match_start = start
        self.match_marks = None
        self.match_marks_flat = None

    def pat(self, index):
        # Return the pattern word at 'index'.
        check_nonneg(index)
        result = self.pattern[index]
        # Check that we only return non-negative integers from this helper.
        # It is possible that self.pattern contains negative integers
        # (see set_charset() and set_bigcharset() in rsre_char.py)
        # but they should not be fetched via this helper here.
        assert result >= 0
        return result

    def str(self, index):
        """NOT_RPYTHON: Must be overridden in a concrete subclass.
        The tag ^^^ here is used to generate a translation-time crash
        if there is a call to str() that is indirect. All calls must
        be direct for performance reasons; you need to specialize the
        caller with @specializectx."""
        raise NotImplementedError

    def lowstr(self, index):
        """NOT_RPYTHON: Similar to str()."""
        raise NotImplementedError

    def get_mark(self, gid):
        # Position recorded for mark 'gid', or -1 if there is none.
        return find_mark(self.match_marks, gid)

    def flatten_marks(self):
        # for testing
        # Builds (once) and returns the list
        # [match_start, match_end, <mark 0>, <mark 1>, ...], with -1 for
        # marks that were never set.  Also sets 'match_lastindex'.
        if self.match_marks_flat is None:
            self.match_marks_flat = [self.match_start, self.match_end]
            mark = self.match_marks
            # 'match_lastindex' is the gid at the head of the chain, or -1.
            if mark is not None:
                self.match_lastindex = mark.gid
            else:
                self.match_lastindex = -1
            while mark is not None:
                # Slots 0 and 1 hold the overall span, hence the +2 offset.
                index = mark.gid + 2
                while index >= len(self.match_marks_flat):
                    self.match_marks_flat.append(-1)
                # Only the first entry met for a gid is kept; entries
                # further down the 'prev' chain do not overwrite it.
                if self.match_marks_flat[index] == -1:
                    self.match_marks_flat[index] = mark.position
                mark = mark.prev
            self.match_marks = None    # clear
        return self.match_marks_flat

    def span(self, groupnum=0):
        # compatibility
        # Returns (start, end) for group 'groupnum', or (-1, -1) when the
        # flattened marks list has no entry for it.
        fmarks = self.flatten_marks()
        groupnum *= 2
        if groupnum >= len(fmarks):
            return (-1, -1)
        return (fmarks[groupnum], fmarks[groupnum+1])

    def group(self, groupnum=0):
        # Returns the matched slice for 'groupnum', or None if its span is
        # not a valid range.
        # NOTE(review): reads 'self._string', which in the visible code is
        # only assigned by StrMatchContext -- confirm whether this is meant
        # to work on UnicodeMatchContext.
        frm, to = self.span(groupnum)
        if 0 <= frm <= to:
            return self._string[frm:to]
        else:
            return None

    def fresh_copy(self, start):
        # Overridden in the concrete subclasses to build a new context of
        # the same kind starting at 'start'.
        raise NotImplementedError
class StrMatchContext(AbstractMatchContext):
    """Concrete subclass for matching in a plain string."""

    def __init__(self, pattern, string, match_start, end, flags):
        AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
        self._string = string
        # Only when running untranslated: a unicode object passed here
        # switches on the UNICODE flag.
        if not we_are_translated() and isinstance(string, unicode):
            self.flags |= rsre_char.SRE_FLAG_UNICODE   # for rsre_re.py

    def str(self, index):
        # Character code at 'index' in the subject string.
        check_nonneg(index)
        return ord(self._string[index])

    def lowstr(self, index):
        # Like str(), but passed through rsre_char.getlower() with the
        # context flags.
        c = self.str(index)
        return rsre_char.getlower(c, self.flags)

    def fresh_copy(self, start):
        # New context over the same pattern/string/end/flags, starting at
        # 'start'.
        return StrMatchContext(self.pattern, self._string, start,
                               self.end, self.flags)
class UnicodeMatchContext(AbstractMatchContext):
    """Concrete subclass for matching in a unicode string."""

    def __init__(self, pattern, unicodestr, match_start, end, flags):
        AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
        self._unicodestr = unicodestr

    def str(self, index):
        # Character code at 'index' in the subject string.
        check_nonneg(index)
        return ord(self._unicodestr[index])

    def lowstr(self, index):
        # Like str(), but passed through rsre_char.getlower() with the
        # context flags.
        c = self.str(index)
        return rsre_char.getlower(c, self.flags)

    def fresh_copy(self, start):
        # New context over the same pattern/string/end/flags, starting at
        # 'start'.
        return UnicodeMatchContext(self.pattern, self._unicodestr, start,
                                   self.end, self.flags)
  177. # ____________________________________________________________
  178. class Mark(object):
  179. _immutable_ = True
  180. def __init__(self, gid, position, prev):
  181. self.gid = gid
  182. self.position = position
  183. self.prev = prev # chained list
  184. def find_mark(mark, gid):
  185. while mark is not None:
  186. if mark.gid == gid:
  187. return mark.position
  188. mark = mark.prev
  189. return -1
  190. # ____________________________________________________________
class MatchResult(object):
    # Base class of the objects returned by sre_match().  'subresult' is the
    # MatchResult of the rest of the pattern, when a subclass has set one.
    subresult = None

    def move_to_next_result(self, ctx):
        # returns either 'self' or None
        result = self.subresult
        if result is None:
            # Nothing to backtrack into (this is the case for MATCHED_OK).
            return
        # First let the sub-result produce another alternative; only when
        # it is exhausted ask this object for its own next alternative.
        if result.move_to_next_result(ctx):
            return self
        return self.find_next_result(ctx)

    def find_next_result(self, ctx):
        # Implemented by the subclasses that set 'subresult'.
        raise NotImplementedError

# Shared result returned by sre_match() when the end of a (sub)pattern is
# reached; it has no subresult.
MATCHED_OK = MatchResult()
class BranchMatchResult(MatchResult):
    # Backtracking state for OPCODE_BRANCH: remembers which alternative to
    # try next and where in the string the branch started.

    def __init__(self, ppos, ptr, marks):
        self.ppos = ppos
        self.start_ptr = ptr
        self.start_marks = marks

    @jit.unroll_safe
    def find_first_result(self, ctx):
        ppos = jit.hint(self.ppos, promote=True)
        # The word at 'ppos' is the skip to the next alternative; a zero
        # word ends the list of alternatives.
        while ctx.pat(ppos):
            result = sre_match(ctx, ppos + 1, self.start_ptr, self.start_marks)
            # Advance before checking the result, so that the stored 'ppos'
            # already designates the following alternative.
            ppos += ctx.pat(ppos)
            if result is not None:
                self.subresult = result
                self.ppos = ppos
                return self
    # Resuming is the same loop, continuing from the saved 'ppos'.
    find_next_result = find_first_result
class RepeatOneMatchResult(MatchResult):
    # Backtracking state for OPCODE_REPEAT_ONE: tries the tail of the
    # pattern at 'start_ptr', then at successively smaller positions down to
    # 'minptr'.
    install_jitdriver('RepeatOne',
                      greens=['nextppos', 'ctx.pattern'],
                      reds=['ptr', 'self', 'ctx'],
                      debugprint=(1, 0))   # indices in 'greens'

    def __init__(self, nextppos, minptr, ptr, marks):
        self.nextppos = nextppos
        self.minptr = minptr
        self.start_ptr = ptr
        self.start_marks = marks

    def find_first_result(self, ctx):
        ptr = self.start_ptr
        nextppos = self.nextppos
        while ptr >= self.minptr:
            ctx.jitdriver_RepeatOne.jit_merge_point(
                self=self, ptr=ptr, ctx=ctx, nextppos=nextppos)
            result = sre_match(ctx, nextppos, ptr, self.start_marks)
            # Decrement before checking the result, so that the saved
            # 'start_ptr' is already the next position to try on resume.
            ptr -= 1
            if result is not None:
                self.subresult = result
                self.start_ptr = ptr
                return self
    # Resuming is the same loop, continuing from the saved 'start_ptr'.
    find_next_result = find_first_result
class MinRepeatOneMatchResult(MatchResult):
    # Backtracking state for OPCODE_MIN_REPEAT_ONE: tries the tail of the
    # pattern at 'start_ptr', then consumes one more character at a time,
    # up to 'maxptr'.
    install_jitdriver('MinRepeatOne',
                      greens=['nextppos', 'ppos3', 'ctx.pattern'],
                      reds=['ptr', 'self', 'ctx'],
                      debugprint=(2, 0))   # indices in 'greens'

    def __init__(self, nextppos, ppos3, maxptr, ptr, marks):
        self.nextppos = nextppos
        self.ppos3 = ppos3
        self.maxptr = maxptr
        self.start_ptr = ptr
        self.start_marks = marks

    def find_first_result(self, ctx):
        ptr = self.start_ptr
        nextppos = self.nextppos
        ppos3 = self.ppos3
        while ptr <= self.maxptr:
            ctx.jitdriver_MinRepeatOne.jit_merge_point(
                self=self, ptr=ptr, ctx=ctx, nextppos=nextppos, ppos3=ppos3)
            result = sre_match(ctx, nextppos, ptr, self.start_marks)
            if result is not None:
                self.subresult = result
                self.start_ptr = ptr
                return self
            # The tail did not match here: extend by one character if the
            # repeated item accepts it, else give up.
            if not self.next_char_ok(ctx, ptr, ppos3):
                break
            ptr += 1

    def find_next_result(self, ctx):
        # 'start_ptr' is where the previous result was found; the next
        # candidate needs one more character accepted by the item.
        ptr = self.start_ptr
        if not self.next_char_ok(ctx, ptr, self.ppos3):
            return
        self.start_ptr = ptr + 1
        return self.find_first_result(ctx)

    def next_char_ok(self, ctx, ptr, ppos):
        # True if the character at 'ptr' is accepted by the single-character
        # opcode found at 'ppos'.  Raises Error for an opcode that has no
        # entry in unroll_char_checker.
        if ptr == ctx.end:
            return False
        op = ctx.pat(ppos)
        for op1, checkerfn in unroll_char_checker:
            if op1 == op:
                return checkerfn(ctx, ptr, ppos)
        raise Error("next_char_ok[%d]" % op)
class AbstractUntilMatchResult(MatchResult):
    # State shared by MaxUntilMatchResult and MinUntilMatchResult.

    def __init__(self, ppos, tailppos, ptr, marks):
        self.ppos = ppos            # pattern position passed by sre_match()
        self.tailppos = tailppos    # pattern position of the 'tail'
        self.cur_ptr = ptr
        self.cur_marks = marks
        self.pending = None         # chain of Pending objects
        self.num_pending = 0        # length of that chain
class Pending(object):
    # One entry of the 'pending' chain kept by the *UntilMatchResult
    # classes: the 'ptr' and 'marks' from which an 'item' was matched, and
    # the MatchResult ('enum') of that match.
    def __init__(self, ptr, marks, enum, next):
        self.ptr = ptr
        self.marks = marks
        self.enum = enum
        self.next = next     # chained list
class MaxUntilMatchResult(AbstractUntilMatchResult):
    # Backtracking state for REPEAT ... MAX_UNTIL: match as many 'item' as
    # allowed, then the 'tail'; on failure fall back through 'pending'.
    install_jitdriver('MaxUntil',
                      greens=['ppos', 'tailppos', 'match_more', 'ctx.pattern'],
                      reds=['ptr', 'marks', 'self', 'ctx'],
                      debugprint=(3, 0, 2))

    def find_first_result(self, ctx):
        return self.search_next(ctx, match_more=True)

    def find_next_result(self, ctx):
        return self.search_next(ctx, match_more=False)

    def search_next(self, ctx, match_more):
        # 'match_more' True: try to match one more 'item' from the current
        # position.  False: pop the most recent pending 'item' and ask it
        # for its next alternative.  Returns self or None.
        ppos = self.ppos
        tailppos = self.tailppos
        ptr = self.cur_ptr
        marks = self.cur_marks
        while True:
            ctx.jitdriver_MaxUntil.jit_merge_point(
                ppos=ppos, tailppos=tailppos, match_more=match_more,
                ptr=ptr, marks=marks, self=self, ctx=ctx)
            if match_more:
                max = ctx.pat(ppos+2)
                # 65535 in the 'max' word is treated as "no upper bound".
                if max == 65535 or self.num_pending < max:
                    # try to match one more 'item'
                    enum = sre_match(ctx, ppos + 3, ptr, marks)
                else:
                    enum = None    # 'max' reached, no more matches
            else:
                p = self.pending
                if p is None:
                    return
                self.pending = p.next
                self.num_pending -= 1
                ptr = p.ptr
                marks = p.marks
                enum = p.enum.move_to_next_result(ctx)
            #
            # zero-width match protection
            min = ctx.pat(ppos+1)
            if self.num_pending >= min:
                # Skip results of 'item' that did not advance in the string.
                while enum is not None and ptr == ctx.match_end:
                    enum = enum.move_to_next_result(ctx)
                    # matched marks for zero-width assertions
                    marks = ctx.match_marks
            #
            if enum is not None:
                # matched one more 'item'. record it and continue.
                self.pending = Pending(ptr, marks, enum, self.pending)
                self.num_pending += 1
                ptr = ctx.match_end
                marks = ctx.match_marks
                match_more = True
            else:
                # 'item' no longer matches.
                if self.num_pending >= min:
                    # try to match 'tail' if we have enough 'item'
                    result = sre_match(ctx, tailppos, ptr, marks)
                    if result is not None:
                        self.subresult = result
                        self.cur_ptr = ptr
                        self.cur_marks = marks
                        return self
                match_more = False
class MinUntilMatchResult(AbstractUntilMatchResult):
    # Backtracking state for REPEAT ... MIN_UNTIL: try the 'tail' first and
    # match one more 'item' only when the tail fails.

    def find_first_result(self, ctx):
        return self.search_next(ctx, resume=False)

    def find_next_result(self, ctx):
        return self.search_next(ctx, resume=True)

    def search_next(self, ctx, resume):
        # 'resume' True skips the first attempt at the 'tail' (used when
        # looking for the result after the one already returned).
        # Returns self or None.
        # XXX missing jit support here
        ppos = self.ppos
        min = ctx.pat(ppos+1)
        max = ctx.pat(ppos+2)
        ptr = self.cur_ptr
        marks = self.cur_marks
        while True:
            # try to match 'tail' if we have enough 'item'
            if not resume and self.num_pending >= min:
                result = sre_match(ctx, self.tailppos, ptr, marks)
                if result is not None:
                    self.subresult = result
                    self.cur_ptr = ptr
                    self.cur_marks = marks
                    return self
            resume = False

            # 65535 in the 'max' word is treated as "no upper bound".
            if max == 65535 or self.num_pending < max:
                # try to match one more 'item'
                enum = sre_match(ctx, ppos + 3, ptr, marks)
                #
                # zero-width match protection
                if self.num_pending >= min:
                    while enum is not None and ptr == ctx.match_end:
                        enum = enum.move_to_next_result(ctx)
            else:
                enum = None    # 'max' reached, no more matches

            while enum is None:
                # 'item' does not match; try to get further results from
                # the 'pending' list.
                p = self.pending
                if p is None:
                    return
                self.pending = p.next
                self.num_pending -= 1
                ptr = p.ptr
                marks = p.marks
                enum = p.enum.move_to_next_result(ctx)

            # matched one more 'item'. record it and continue
            self.pending = Pending(ptr, marks, enum, self.pending)
            self.num_pending += 1
            ptr = ctx.match_end
            marks = ctx.match_marks
  406. # ____________________________________________________________
@specializectx
@jit.unroll_safe
def sre_match(ctx, ppos, ptr, marks):
    """Returns either None or a MatchResult object. Usually we only need
    the first result, but there is the case of REPEAT...UNTIL where we
    need all results; in that case we use the method move_to_next_result()
    of the MatchResult.

    'ppos' is the position in ctx.pattern to start interpreting from, 'ptr'
    the current position in the string, and 'marks' the chain of Mark
    objects recorded so far (or None). On success, ctx.match_end and
    ctx.match_marks hold the final string position and marks. Raises Error
    on an unknown opcode or on a REPEAT that is not followed by an UNTIL."""
    while True:
        op = ctx.pat(ppos)
        ppos += 1

        #jit.jit_debug("sre_match", op, ppos, ptr)
        #
        # When using the JIT, calls to sre_match() must always have a constant
        # (green) argument for 'ppos'. If not, the following assert fails.
        jit.assert_green(op)

        if op == OPCODE_FAILURE:
            return

        if (op == OPCODE_SUCCESS or
            op == OPCODE_MAX_UNTIL or
            op == OPCODE_MIN_UNTIL):
            # End of the pattern, or of the 'item' of a REPEAT (which is
            # matched by a nested sre_match() call): record where we ended.
            ctx.match_end = ptr
            ctx.match_marks = marks
            return MATCHED_OK

        elif op == OPCODE_ANY:
            # match anything (except a newline)
            # <ANY>
            if ptr >= ctx.end or rsre_char.is_linebreak(ctx.str(ptr)):
                return
            ptr += 1

        elif op == OPCODE_ANY_ALL:
            # match anything
            # <ANY_ALL>
            if ptr >= ctx.end:
                return
            ptr += 1

        elif op == OPCODE_ASSERT:
            # assert subpattern
            # <ASSERT> <0=skip> <1=back> <pattern>
            ptr1 = ptr - ctx.pat(ppos+1)
            if ptr1 < 0 or sre_match(ctx, ppos + 2, ptr1, marks) is None:
                return
            # Keep the marks set inside the assertion; 'ptr' is unchanged.
            marks = ctx.match_marks
            ppos += ctx.pat(ppos)

        elif op == OPCODE_ASSERT_NOT:
            # assert not subpattern
            # <ASSERT_NOT> <0=skip> <1=back> <pattern>
            ptr1 = ptr - ctx.pat(ppos+1)
            if ptr1 >= 0 and sre_match(ctx, ppos + 2, ptr1, marks) is not None:
                return
            ppos += ctx.pat(ppos)

        elif op == OPCODE_AT:
            # match at given position (e.g. at beginning, at boundary, etc.)
            # <AT> <code>
            if not sre_at(ctx, ctx.pat(ppos), ptr):
                return
            ppos += 1

        elif op == OPCODE_BRANCH:
            # alternation
            # <BRANCH> <0=skip> code <JUMP> ... <NULL>
            result = BranchMatchResult(ppos, ptr, marks)
            return result.find_first_result(ctx)

        elif op == OPCODE_CATEGORY:
            # seems to be never produced, but used by some tests from
            # pypy/module/_sre/test
            # <CATEGORY> <category>
            if (ptr == ctx.end or
                not rsre_char.category_dispatch(ctx.pat(ppos), ctx.str(ptr))):
                return
            ptr += 1
            ppos += 1

        elif op == OPCODE_GROUPREF:
            # match backreference
            # <GROUPREF> <groupnum>
            startptr, length = get_group_ref(marks, ctx.pat(ppos))
            if length < 0:
                return     # group was not previously defined
            if not match_repeated(ctx, ptr, startptr, length):
                return     # no match
            ptr += length
            ppos += 1

        elif op == OPCODE_GROUPREF_IGNORE:
            # match backreference, ignoring case
            # <GROUPREF_IGNORE> <groupnum>
            startptr, length = get_group_ref(marks, ctx.pat(ppos))
            if length < 0:
                return     # group was not previously defined
            if not match_repeated_ignore(ctx, ptr, startptr, length):
                return     # no match
            ptr += length
            ppos += 1

        elif op == OPCODE_GROUPREF_EXISTS:
            # conditional match depending on the existence of a group
            # <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ...
            _, length = get_group_ref(marks, ctx.pat(ppos))
            if length >= 0:
                ppos += 2                  # jump to 'codeyes'
            else:
                ppos += ctx.pat(ppos+1)    # jump to 'codeno'

        elif op == OPCODE_IN:
            # match set member (or non_member)
            # <IN> <skip> <set>
            if ptr >= ctx.end or not rsre_char.check_charset(ctx.pattern,
                                                             ppos+1,
                                                             ctx.str(ptr)):
                return
            ppos += ctx.pat(ppos)
            ptr += 1

        elif op == OPCODE_IN_IGNORE:
            # match set member (or non_member), ignoring case
            # <IN_IGNORE> <skip> <set>
            if ptr >= ctx.end or not rsre_char.check_charset(ctx.pattern,
                                                             ppos+1,
                                                             ctx.lowstr(ptr)):
                return
            ppos += ctx.pat(ppos)
            ptr += 1

        elif op == OPCODE_INFO:
            # optimization info block
            # <INFO> <0=skip> <1=flags> <2=min> ...
            # Fail early if fewer than 'min' characters remain.
            if (ctx.end - ptr) < ctx.pat(ppos+2):
                return
            ppos += ctx.pat(ppos)

        elif op == OPCODE_JUMP:
            ppos += ctx.pat(ppos)

        elif op == OPCODE_LITERAL:
            # match literal string
            # <LITERAL> <code>
            if ptr >= ctx.end or ctx.str(ptr) != ctx.pat(ppos):
                return
            ppos += 1
            ptr += 1

        elif op == OPCODE_LITERAL_IGNORE:
            # match literal string, ignoring case
            # <LITERAL_IGNORE> <code>
            if ptr >= ctx.end or ctx.lowstr(ptr) != ctx.pat(ppos):
                return
            ppos += 1
            ptr += 1

        elif op == OPCODE_MARK:
            # set mark
            # <MARK> <gid>
            gid = ctx.pat(ppos)
            marks = Mark(gid, ptr, marks)
            ppos += 1

        elif op == OPCODE_NOT_LITERAL:
            # match if it's not a literal string
            # <NOT_LITERAL> <code>
            if ptr >= ctx.end or ctx.str(ptr) == ctx.pat(ppos):
                return
            ppos += 1
            ptr += 1

        elif op == OPCODE_NOT_LITERAL_IGNORE:
            # match if it's not a literal string, ignoring case
            # <NOT_LITERAL_IGNORE> <code>
            if ptr >= ctx.end or ctx.lowstr(ptr) == ctx.pat(ppos):
                return
            ppos += 1
            ptr += 1

        elif op == OPCODE_REPEAT:
            # general repeat. in this version of the re module, all the work
            # is done here, and not on the later UNTIL operator.
            # <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail
            # FIXME: we probably need to deal with zero-width matches in here..

            # decode the later UNTIL operator to see if it is actually
            # a MAX_UNTIL or MIN_UNTIL
            untilppos = ppos + ctx.pat(ppos)
            tailppos = untilppos + 1
            op = ctx.pat(untilppos)
            if op == OPCODE_MAX_UNTIL:
                # the hard case: we have to match as many repetitions as
                # possible, followed by the 'tail'. we do this by
                # remembering each state for each possible number of
                # 'item' matching.
                result = MaxUntilMatchResult(ppos, tailppos, ptr, marks)
                return result.find_first_result(ctx)

            elif op == OPCODE_MIN_UNTIL:
                # first try to match the 'tail', and if it fails, try
                # to match one more 'item' and try again
                result = MinUntilMatchResult(ppos, tailppos, ptr, marks)
                return result.find_first_result(ctx)

            else:
                raise Error("missing UNTIL after REPEAT")

        elif op == OPCODE_REPEAT_ONE:
            # match repeated sequence (maximizing regexp).
            # this operator only works if the repeated item is
            # exactly one character wide, and we're not already
            # collecting backtracking points. for other cases,
            # use the MAX_REPEAT operator.
            # <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail
            start = ptr
            minptr = start + ctx.pat(ppos+1)
            if minptr > ctx.end:
                return    # cannot match
            ptr = find_repetition_end(ctx, ppos+3, start, ctx.pat(ppos+2))
            # when we arrive here, ptr points to the tail of the target
            # string. check if the rest of the pattern matches,
            # and backtrack if not.
            nextppos = ppos + ctx.pat(ppos)
            result = RepeatOneMatchResult(nextppos, minptr, ptr, marks)
            return result.find_first_result(ctx)

        elif op == OPCODE_MIN_REPEAT_ONE:
            # match repeated sequence (minimizing regexp).
            # this operator only works if the repeated item is
            # exactly one character wide, and we're not already
            # collecting backtracking points. for other cases,
            # use the MIN_REPEAT operator.
            # <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail
            start = ptr
            min = ctx.pat(ppos+1)
            if min > 0:
                minptr = ptr + min
                if minptr > ctx.end:
                    return   # cannot match
                # count using pattern min as the maximum
                ptr = find_repetition_end(ctx, ppos+3, ptr, min)
                if ptr < minptr:
                    return   # did not match minimum number of times

            # Upper limit for the repetition: end of string, or start+max
            # when that is smaller (65535 is treated as "no upper bound").
            maxptr = ctx.end
            max = ctx.pat(ppos+2)
            if max != 65535:
                maxptr1 = start + max
                if maxptr1 <= maxptr:
                    maxptr = maxptr1
            nextppos = ppos + ctx.pat(ppos)
            result = MinRepeatOneMatchResult(nextppos, ppos+3, maxptr,
                                             ptr, marks)
            return result.find_first_result(ctx)

        else:
            raise Error("bad pattern code %d" % op)
  636. def get_group_ref(marks, groupnum):
  637. gid = groupnum * 2
  638. startptr = find_mark(marks, gid)
  639. if startptr < 0:
  640. return 0, -1
  641. endptr = find_mark(marks, gid + 1)
  642. length = endptr - startptr # < 0 if endptr < startptr (or if endptr=-1)
  643. return startptr, length
@specializectx
def match_repeated(ctx, ptr, oldptr, length):
    # True if the 'length' characters starting at 'ptr' are equal to the
    # 'length' characters starting at 'oldptr' (used for backreferences).
    if ptr + length > ctx.end:
        return False
    for i in range(length):
        if ctx.str(ptr + i) != ctx.str(oldptr + i):
            return False
    return True
@specializectx
def match_repeated_ignore(ctx, ptr, oldptr, length):
    # Same as match_repeated(), but compares the characters through
    # ctx.lowstr() on both sides.
    if ptr + length > ctx.end:
        return False
    for i in range(length):
        if ctx.lowstr(ptr + i) != ctx.lowstr(oldptr + i):
            return False
    return True
@specializectx
def find_repetition_end(ctx, ppos, ptr, maxcount):
    # Starting at 'ptr', count how many consecutive characters are accepted
    # by the single-character opcode at 'ppos', up to 'maxcount' of them
    # (65535 is treated as "no limit") and not beyond ctx.end.  Returns the
    # position just after the last accepted character.  Raises Error if the
    # opcode has no entry in unroll_fre_checker.
    end = ctx.end
    ptrp1 = ptr + 1
    # First get rid of the cases where we don't have room for any match.
    if maxcount <= 0 or ptrp1 > end:
        return ptr
    # Check the first character directly. If it doesn't match, we are done.
    # The idea is to be fast for cases like re.search("b+"), where we expect
    # the common case to be a non-match. It's much faster with the JIT to
    # have the non-match inlined here rather than detect it in the fre() call.
    op = ctx.pat(ppos)
    for op1, checkerfn in unroll_char_checker:
        if op1 == op:
            if checkerfn(ctx, ptr, ppos):
                break
    else:
        # The loop ended without 'break': the first character was not
        # accepted (or no checker is registered for 'op').
        return ptr
    # It matches at least once. If maxcount == 1 (relatively common),
    # then we are done.
    if maxcount == 1:
        return ptrp1
    # Else we really need to count how many times it matches.
    if maxcount != 65535:
        # adjust end
        end1 = ptr + maxcount
        if end1 <= end:
            end = end1
    op = ctx.pat(ppos)
    for op1, fre in unroll_fre_checker:
        if op1 == op:
            return fre(ctx, ptrp1, end, ppos)
    raise Error("rsre.find_repetition_end[%d]" % op)
# Single-character checker for OPCODE_ANY: accepts the character at 'ptr'
# unless it is a linebreak.  'ppos' is unused here.
@specializectx
def match_ANY(ctx, ptr, ppos):   # dot wildcard.
    return not rsre_char.is_linebreak(ctx.str(ptr))
# Single-character checker for OPCODE_ANY_ALL.  It does not read 'ctx', and
# unlike the other checkers it is not decorated with @specializectx.
def match_ANY_ALL(ctx, ptr, ppos):
    return True    # match anything (including a newline)
# Single-character checker for OPCODE_IN.  'ppos' is the position of the
# opcode itself; the charset is checked from ppos+2 on.
@specializectx
def match_IN(ctx, ptr, ppos):
    return rsre_char.check_charset(ctx.pattern, ppos+2, ctx.str(ptr))
# Single-character checker for OPCODE_IN_IGNORE: like match_IN(), but the
# character is fetched with ctx.lowstr().
@specializectx
def match_IN_IGNORE(ctx, ptr, ppos):
    return rsre_char.check_charset(ctx.pattern, ppos+2, ctx.lowstr(ptr))
# Single-character checker for OPCODE_LITERAL.  'ppos' is the position of
# the opcode itself; the character code to compare with is at ppos+1.
@specializectx
def match_LITERAL(ctx, ptr, ppos):
    return ctx.str(ptr) == ctx.pat(ppos+1)
# Single-character checker for OPCODE_LITERAL_IGNORE: like match_LITERAL(),
# but the character is fetched with ctx.lowstr().
@specializectx
def match_LITERAL_IGNORE(ctx, ptr, ppos):
    return ctx.lowstr(ptr) == ctx.pat(ppos+1)
# Single-character checker for OPCODE_NOT_LITERAL: accepts any character
# whose code differs from the pattern word at ppos+1.
@specializectx
def match_NOT_LITERAL(ctx, ptr, ppos):
    return ctx.str(ptr) != ctx.pat(ppos+1)
# Single-character checker for OPCODE_NOT_LITERAL_IGNORE: like
# match_NOT_LITERAL(), but the character is fetched with ctx.lowstr().
@specializectx
def match_NOT_LITERAL_IGNORE(ctx, ptr, ppos):
    return ctx.lowstr(ptr) != ctx.pat(ppos+1)
def _make_fre(checkerfn):
    # Build the "find repetition end" function for one single-character
    # checker: fre(ctx, ptr, end, ppos) returns the first position in
    # [ptr, end] at which 'checkerfn' no longer accepts the character (or
    # 'end').  The result is renamed 'fre_<checker name>'.
    if checkerfn == match_ANY_ALL:
        # Everything is accepted, so the repetition runs up to 'end'.
        def fre(ctx, ptr, end, ppos):
            return end
    elif checkerfn == match_IN:
        install_jitdriver_spec('MatchIn',
                               greens=['ppos', 'ctx.pattern'],
                               reds=['ptr', 'end', 'ctx'],
                               debugprint=(1, 0))
        @specializectx
        def fre(ctx, ptr, end, ppos):
            while True:
                ctx.jitdriver_MatchIn.jit_merge_point(ctx=ctx, ptr=ptr,
                                                      end=end, ppos=ppos)
                if ptr < end and checkerfn(ctx, ptr, ppos):
                    ptr += 1
                else:
                    return ptr
    elif checkerfn == match_IN_IGNORE:
        install_jitdriver_spec('MatchInIgnore',
                               greens=['ppos', 'ctx.pattern'],
                               reds=['ptr', 'end', 'ctx'],
                               debugprint=(1, 0))
        @specializectx
        def fre(ctx, ptr, end, ppos):
            while True:
                ctx.jitdriver_MatchInIgnore.jit_merge_point(ctx=ctx, ptr=ptr,
                                                            end=end, ppos=ppos)
                if ptr < end and checkerfn(ctx, ptr, ppos):
                    ptr += 1
                else:
                    return ptr
    else:
        # in the other cases, the fre() function is not JITted at all
        # and is present as a residual call.
        @specializectx
        def fre(ctx, ptr, end, ppos):
            while ptr < end and checkerfn(ctx, ptr, ppos):
                ptr += 1
            return ptr
    fre = func_with_new_name(fre, 'fre_' + checkerfn.__name__)
    return fre
# Table mapping each single-character opcode to its checker function.
unroll_char_checker = [
    (OPCODE_ANY, match_ANY),
    (OPCODE_ANY_ALL, match_ANY_ALL),
    (OPCODE_IN, match_IN),
    (OPCODE_IN_IGNORE, match_IN_IGNORE),
    (OPCODE_LITERAL, match_LITERAL),
    (OPCODE_LITERAL_IGNORE, match_LITERAL_IGNORE),
    (OPCODE_NOT_LITERAL, match_NOT_LITERAL),
    (OPCODE_NOT_LITERAL_IGNORE, match_NOT_LITERAL_IGNORE),
    ]
# Same opcodes, mapped to the repetition-counting function built from each
# checker.  This must be computed while unroll_char_checker is still a
# plain list, i.e. before the two rebinding lines below.
unroll_fre_checker = [(_op, _make_fre(_fn))
                      for (_op, _fn) in unroll_char_checker]

# Both tables are wrapped in unrolling_iterable; they are the ones iterated
# by next_char_ok() and find_repetition_end().
unroll_char_checker = unrolling_iterable(unroll_char_checker)
unroll_fre_checker = unrolling_iterable(unroll_fre_checker)
##### At dispatch

# Position codes found in the operand of OPCODE_AT and dispatched on by
# sre_at().
AT_BEGINNING = 0
AT_BEGINNING_LINE = 1
AT_BEGINNING_STRING = 2
AT_BOUNDARY = 3
AT_NON_BOUNDARY = 4
AT_END = 5
AT_END_LINE = 6
AT_END_STRING = 7
AT_LOC_BOUNDARY = 8
AT_LOC_NON_BOUNDARY = 9
AT_UNI_BOUNDARY = 10
AT_UNI_NON_BOUNDARY = 11
@specializectx
def sre_at(ctx, atcode, ptr):
    # True if string position 'ptr' satisfies the position assertion
    # 'atcode' (one of the AT_* codes).  Unknown codes give False.
    if (atcode == AT_BEGINNING or
        atcode == AT_BEGINNING_STRING):
        return ptr == 0

    elif atcode == AT_BEGINNING_LINE:
        # At position 0, or just after a linebreak.
        prevptr = ptr - 1
        return prevptr < 0 or rsre_char.is_linebreak(ctx.str(prevptr))

    elif atcode == AT_BOUNDARY:
        return at_boundary(ctx, ptr)

    elif atcode == AT_NON_BOUNDARY:
        return at_non_boundary(ctx, ptr)

    elif atcode == AT_END:
        # At the end, or just before a linebreak that is the last character.
        remaining_chars = ctx.end - ptr
        return remaining_chars <= 0 or (
            remaining_chars == 1 and rsre_char.is_linebreak(ctx.str(ptr)))

    elif atcode == AT_END_LINE:
        # At the end, or just before any linebreak.
        return ptr == ctx.end or rsre_char.is_linebreak(ctx.str(ptr))

    elif atcode == AT_END_STRING:
        return ptr == ctx.end

    elif atcode == AT_LOC_BOUNDARY:
        return at_loc_boundary(ctx, ptr)

    elif atcode == AT_LOC_NON_BOUNDARY:
        return at_loc_non_boundary(ctx, ptr)

    elif atcode == AT_UNI_BOUNDARY:
        return at_uni_boundary(ctx, ptr)

    elif atcode == AT_UNI_NON_BOUNDARY:
        return at_uni_non_boundary(ctx, ptr)

    return False
def _make_boundary(word_checker):
    # Build the pair (at_boundary, at_non_boundary) for one definition of
    # "word character", given as the predicate 'word_checker'.
    @specializectx
    def at_boundary(ctx, ptr):
        # True if exactly one of the characters before and at 'ptr' is a
        # word character.  Positions outside [0, ctx.end) count as non-word.
        if ctx.end == 0:
            return False
        prevptr = ptr - 1
        that = prevptr >= 0 and word_checker(ctx.str(prevptr))
        this = ptr < ctx.end and word_checker(ctx.str(ptr))
        return this != that
    @specializectx
    def at_non_boundary(ctx, ptr):
        # True if the characters before and at 'ptr' are both word
        # characters or both not.  Note that this also returns False when
        # ctx.end == 0, so it is not the plain negation of at_boundary().
        if ctx.end == 0:
            return False
        prevptr = ptr - 1
        that = prevptr >= 0 and word_checker(ctx.str(prevptr))
        this = ptr < ctx.end and word_checker(ctx.str(ptr))
        return this == that
    return at_boundary, at_non_boundary
# One pair per word predicate provided by rsre_char; they are the targets of
# the AT_*BOUNDARY cases in sre_at().
at_boundary, at_non_boundary = _make_boundary(rsre_char.is_word)
at_loc_boundary, at_loc_non_boundary = _make_boundary(rsre_char.is_loc_word)
at_uni_boundary, at_uni_non_boundary = _make_boundary(rsre_char.is_uni_word)
  835. # ____________________________________________________________
  836. def _adjust(start, end, length):
  837. if start < 0: start = 0
  838. elif start > length: start = length
  839. if end < 0: end = 0
  840. elif end > length: end = length
  841. return start, end
  842. def match(pattern, string, start=0, end=sys.maxint, flags=0):
  843. start, end = _adjust(start, end, len(string))
  844. ctx = StrMatchContext(pattern, string, start, end, flags)
  845. if match_context(ctx):
  846. return ctx
  847. else:
  848. return None
  849. def search(pattern, string, start=0, end=sys.maxint, flags=0):
  850. start, end = _adjust(start, end, len(string))
  851. ctx = StrMatchContext(pattern, string, start, end, flags)
  852. if search_context(ctx):
  853. return ctx
  854. else:
  855. return None
  856. install_jitdriver('Match',
  857. greens=['ctx.pattern'], reds=['ctx'],
  858. debugprint=(0,))
  859. def match_context(ctx):
  860. ctx.original_pos = ctx.match_start
  861. if ctx.end < ctx.match_start:
  862. return False
  863. ctx.jitdriver_Match.jit_merge_point(ctx=ctx)
  864. return sre_match(ctx, 0, ctx.match_start, None) is not None
  865. def search_context(ctx):
  866. ctx.original_pos = ctx.match_start
  867. if ctx.end < ctx.match_start:
  868. return False
  869. base = 0
  870. charset = False
  871. if ctx.pat(base) == OPCODE_INFO:
  872. flags = ctx.pat(2)
  873. if flags & rsre_char.SRE_INFO_PREFIX:
  874. if ctx.pat(5) > 1:
  875. return fast_search(ctx)
  876. else:
  877. charset = (flags & rsre_char.SRE_INFO_CHARSET)
  878. base += 1 + ctx.pat(1)
  879. if ctx.pat(base) == OPCODE_LITERAL:
  880. return literal_search(ctx, base)
  881. if charset:
  882. return charset_search(ctx, base)
  883. return regular_search(ctx, base)
  884. install_jitdriver('RegularSearch',
  885. greens=['base', 'ctx.pattern'],
  886. reds=['start', 'ctx'],
  887. debugprint=(1, 0))
  888. def regular_search(ctx, base):
  889. start = ctx.match_start
  890. while start <= ctx.end:
  891. ctx.jitdriver_RegularSearch.jit_merge_point(ctx=ctx, start=start,
  892. base=base)
  893. if sre_match(ctx, base, start, None) is not None:
  894. ctx.match_start = start
  895. return True
  896. start += 1
  897. return False
  898. install_jitdriver_spec("LiteralSearch",
  899. greens=['base', 'character', 'ctx.pattern'],
  900. reds=['start', 'ctx'],
  901. debugprint=(2, 0, 1))
  902. @specializectx
  903. def literal_search(ctx, base):
  904. # pattern starts with a literal character. this is used
  905. # for short prefixes, and if fast search is disabled
  906. character = ctx.pat(base + 1)
  907. base += 2
  908. start = ctx.match_start
  909. while start < ctx.end:
  910. ctx.jitdriver_LiteralSearch.jit_merge_point(ctx=ctx, start=start,
  911. base=base, character=character)
  912. if ctx.str(start) == character:
  913. if sre_match(ctx, base, start + 1, None) is not None:
  914. ctx.match_start = start
  915. return True
  916. start += 1
  917. return False
  918. install_jitdriver_spec("CharsetSearch",
  919. greens=['base', 'ctx.pattern'],
  920. reds=['start', 'ctx'],
  921. debugprint=(1, 0))
  922. @specializectx
  923. def charset_search(ctx, base):
  924. # pattern starts with a character from a known set
  925. start = ctx.match_start
  926. while start < ctx.end:
  927. ctx.jitdriver_CharsetSearch.jit_merge_point(ctx=ctx, start=start,
  928. base=base)
  929. if rsre_char.check_charset(ctx.pattern, 5, ctx.str(start)):
  930. if sre_match(ctx, base, start, None) is not None:
  931. ctx.match_start = start
  932. return True
  933. start += 1
  934. return False
  935. install_jitdriver_spec('FastSearch',
  936. greens=['i', 'prefix_len', 'ctx.pattern'],
  937. reds=['string_position', 'ctx'],
  938. debugprint=(2, 0))
  939. @specializectx
  940. def fast_search(ctx):
  941. # skips forward in a string as fast as possible using information from
  942. # an optimization info block
  943. # <INFO> <1=skip> <2=flags> <3=min> <4=...>
  944. # <5=length> <6=skip> <7=prefix data> <overlap data>
  945. string_position = ctx.match_start
  946. if string_position >= ctx.end:
  947. return False
  948. prefix_len = ctx.pat(5)
  949. assert prefix_len >= 0
  950. i = 0
  951. while True:
  952. ctx.jitdriver_FastSearch.jit_merge_point(ctx=ctx,
  953. string_position=string_position, i=i, prefix_len=prefix_len)
  954. char_ord = ctx.str(string_position)
  955. if char_ord != ctx.pat(7 + i):
  956. if i > 0:
  957. overlap_offset = prefix_len + (7 - 1)
  958. i = ctx.pat(overlap_offset + i)
  959. continue
  960. else:
  961. i += 1
  962. if i == prefix_len:
  963. # found a potential match
  964. start = string_position + 1 - prefix_len
  965. assert start >= 0
  966. prefix_skip = ctx.pat(6)
  967. ptr = start + prefix_skip
  968. #flags = ctx.pat(2)
  969. #if flags & rsre_char.SRE_INFO_LITERAL:
  970. # # matched all of pure literal pattern
  971. # ctx.match_start = start
  972. # ctx.match_end = ptr
  973. # ctx.match_marks = None
  974. # return True
  975. pattern_offset = ctx.pat(1) + 1
  976. ppos_start = pattern_offset + 2 * prefix_skip
  977. if sre_match(ctx, ppos_start, ptr, None) is not None:
  978. ctx.match_start = start
  979. return True
  980. overlap_offset = prefix_len + (7 - 1)
  981. i = ctx.pat(overlap_offset + i)
  982. string_position += 1
  983. if string_position >= ctx.end:
  984. return False