/Lib/sre_compile.py

http://unladen-swallow.googlecode.com/ · Python · 530 lines · 428 code · 33 blank · 69 comment · 153 complexity · 437e9e6833641b6bc0ae4cb6bce68b14 MD5

#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys
import sre_parse
from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL

def _identityfunction(x):
    return x

def set(seq):
    s = {}
    for elem in seq:
        s[elem] = 1
    return s

_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)
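
# Illustrative sketch: driving _compile by hand on a trivial parsed pattern
# shows the flat word sequence it appends; the OPCODES mapping comes from
# sre_constants, imported above.
#
#     p = sre_parse.parse("ab")
#     buf = []
#     _compile(buf, p.data, 0)
#     # buf should now be [OPCODES[LITERAL], 97, OPCODES[LITERAL], 98]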

def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])
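
# Illustrative sketch: for a parsed class such as "[a-z]" the charset
# argument is [(RANGE, (97, 122))], so the subprogram emitted above should
# be roughly OPCODES[RANGE], 97, 122, OPCODES[FAILURE]; the trailing
# FAILURE word terminates the set (the IN opcode and its skip word are
# emitted by the caller in _compile).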

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    out = []
    outappend = out.append
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset
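
# Illustrative sketch: _optimize_charset collapses runs of adjacent
# characters.  The parsed class "[abc]" arrives as three LITERAL items and
# should come back as the single item [(RANGE, (97, 99))]; a class with
# more than two runs (e.g. "[ace]") falls through to the CHARSET bitmap
# form instead.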

def _mk_bitmap(bits):
    data = []
    dataappend = data.append
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            dataappend(v)
            m, v = start
    return data
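
# Illustrative sketch, assuming _sre.CODESIZE == 2: a 256-bit charmap with
# only bits 0 and 2 set packs into sixteen 16-bit words, the first holding
# 0b101 == 5 and the rest 0, i.e. _mk_bitmap(bitmap) == [5] + [0]*15.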

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can also be done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
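
# Rough worked example (illustrative, assuming CODESIZE == 2): a set
# covering only U+0430..U+044F lands entirely in 256-character chunk
# number 4, so the deduplicated chunks are the all-zero chunk (index 0)
# and one chunk with bits 0x30..0x4F set (index 1).  The compiled data is
# then one word for the chunk count (2), 128 words packing the 256
# chunk-number bytes (byte 4 is 1, the rest 0), and 2 * 16 words of chunk
# bitmaps.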

def _optimize_unicode(charset, fixup):
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]

def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
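
# Illustrative note: the repeat body in "a*" has fixed width 1 and is not a
# capturing SUBPATTERN, so _simple returns true and _compile emits the
# specialised REPEAT_ONE (or MIN_REPEAT_ONE for "a*?"); the body in "(a)*"
# is a SUBPATTERN, so the general REPEAT ... MAX_UNTIL form is used.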

def _compile_info(code, pattern, flags):
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        table = [-1] + ([0]*len(prefix))
        for i in xrange(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
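
# Illustrative note: the overlap table built above is the classic
# Knuth-Morris-Pratt failure function over the literal prefix; for the
# prefix "aba", for example, the stored entries (first entry dropped)
# should be [0, 0, 1], which lets the prefix scanner avoid re-examining
# characters it has already matched.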

try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))

def isstring(obj):
    for tp in STRING_TYPES:
        if isinstance(obj, tp):
            return 1
    return 0

def _code(p, flags):

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )
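
# Usage sketch (illustrative): the public re module is the normal entry
# point, but the compiler can also be driven directly for inspection, e.g.:
#
#     import sre_compile
#     pat = sre_compile.compile("a|b")
#     pat.match("b")   # same kind of match object as re.compile("a|b").match("b")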