PageRenderTime 63ms CodeModel.GetById 32ms RepoModel.GetById 1ms app.codeStats 0ms

/org.modelsphere.sms/lib/jython-2.2.1/Lib/sre_compile.py

https://github.com/DarioGT/OMS-PluginXML
Python | 455 lines | 384 code | 16 blank | 55 comment | 131 complexity | 0636e7ffda56b782d5edb7a204015a96 MD5 | raw file
  1. #
  2. # Secret Labs' Regular Expression Engine
  3. #
  4. # convert template to internal format
  5. #
  6. # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
  7. #
  8. # See the sre.py file for information on usage and redistribution.
  9. #
  10. """Internal support module for sre"""
  11. import _sre, sys
  12. from sre_constants import *
  13. assert _sre.MAGIC == MAGIC, "SRE module mismatch"
  14. MAXCODE = 65535
def _compile(code, pattern, flags):
    """Append the compiled form of *pattern* to the code word list *code*.

    *pattern* is a sequence of (op, av) pairs as produced by sre_parse;
    *flags* are the SRE_FLAG_* bits.  Recurses for subpatterns, repeats,
    assertions and branches.  Raises `error` for unsupported constructs.
    """
    # internal: compile a (sub)pattern
    emit = code.append
    for op, av in pattern:
        if op in (LITERAL, NOT_LITERAL):
            if flags & SRE_FLAG_IGNORECASE:
                # case-insensitive variant: store the lowercased literal
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                # lowercase every member while the set is being compiled
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = lambda x: x
            # reserve a slot for the skip count, back-patched below
            skip = len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])  # '.' also matches newline
            else:
                emit(OPCODES[ANY])
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                # NOTE(review): everything below this raise is unreachable
                # dead code kept from upstream
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            elif _simple(av) and op == MAX_REPEAT:
                # single-character greedy repeat: use the fast REPEAT_ONE
                emit(OPCODES[REPEAT_ONE])
                skip = len(code); emit(0)
                emit(av[0])  # min count
                emit(av[1])  # max count
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            else:
                # general repeat: body followed by MAX_UNTIL/MIN_UNTIL
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = len(code) - skip
                if op == MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av[0] is the group number (0/None for non-capturing);
            # group n occupies mark slots 2n-2 and 2n-1
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in (SUCCESS, FAILURE):
            emit(OPCODES[op])
        elif op in (ASSERT, ASSERT_NOT):
            emit(OPCODES[op])
            skip = len(code); emit(0)
            # av[0] is the assertion direction: >= 0 means lookahead
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is AT:
            # anchors: remap for multiline, then locale/unicode variants
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            for av in av[1]:
                # each alternative: skip slot, body, JUMP to common tail
                skip = len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tail.append(len(code)); emit(0)
                code[skip] = len(code) - skip
            emit(0) # end of branch
            # back-patch every alternative's JUMP target
            for tail in tail:
                code[tail] = len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)  # group numbers are 1-based in the parse tree
        else:
            raise ValueError, ("unsupported operand type", op)
  138. def _compile_charset(charset, flags, code, fixup=None):
  139. # compile charset subprogram
  140. emit = code.append
  141. if not fixup:
  142. fixup = lambda x: x
  143. for op, av in _optimize_charset(charset, fixup):
  144. emit(OPCODES[op])
  145. if op is NEGATE:
  146. pass
  147. elif op is LITERAL:
  148. emit(fixup(av))
  149. elif op is RANGE:
  150. emit(fixup(av[0]))
  151. emit(fixup(av[1]))
  152. elif op is CHARSET:
  153. code.extend(av)
  154. elif op is BIGCHARSET:
  155. code.extend(av)
  156. elif op is CATEGORY:
  157. if flags & SRE_FLAG_LOCALE:
  158. emit(CHCODES[CH_LOCALE[av]])
  159. elif flags & SRE_FLAG_UNICODE:
  160. emit(CHCODES[CH_UNICODE[av]])
  161. else:
  162. emit(CHCODES[av])
  163. else:
  164. raise error, "internal: unsupported set operator"
  165. emit(OPCODES[FAILURE])
  166. def _optimize_charset(charset, fixup):
  167. # internal: optimize character set
  168. out = []
  169. charmap = [0]*256
  170. try:
  171. for op, av in charset:
  172. if op is NEGATE:
  173. out.append((op, av))
  174. elif op is LITERAL:
  175. charmap[fixup(av)] = 1
  176. elif op is RANGE:
  177. for i in range(fixup(av[0]), fixup(av[1])+1):
  178. charmap[i] = 1
  179. elif op is CATEGORY:
  180. # XXX: could append to charmap tail
  181. return charset # cannot compress
  182. except IndexError:
  183. if sys.maxunicode != 65535:
  184. # XXX: big charsets don't work in UCS-4 builds
  185. return charset
  186. # character set contains unicode characters
  187. return _optimize_unicode(charset, fixup)
  188. # compress character map
  189. i = p = n = 0
  190. runs = []
  191. for c in charmap:
  192. if c:
  193. if n == 0:
  194. p = i
  195. n = n + 1
  196. elif n:
  197. runs.append((p, n))
  198. n = 0
  199. i = i + 1
  200. if n:
  201. runs.append((p, n))
  202. if len(runs) <= 2:
  203. # use literal/range
  204. for p, n in runs:
  205. if n == 1:
  206. out.append((LITERAL, p))
  207. else:
  208. out.append((RANGE, (p, p+n-1)))
  209. if len(out) < len(charset):
  210. return out
  211. else:
  212. # use bitmap
  213. data = _mk_bitmap(charmap)
  214. out.append((CHARSET, data))
  215. return out
  216. return charset
  217. def _mk_bitmap(bits):
  218. data = []
  219. m = 1; v = 0
  220. for c in bits:
  221. if c:
  222. v = v + m
  223. m = m << 1
  224. if m > MAXCODE:
  225. data.append(v)
  226. m = 1; v = 0
  227. return data
  228. # To represent a big charset, first a bitmap of all characters in the
  229. # set is constructed. Then, this bitmap is sliced into chunks of 256
  230. # characters, duplicate chunks are eliminated, and each chunk is
  231. # given a number. In the compiled expression, the charset is
  232. # represented by a 16-bit word sequence, consisting of one word for
  233. # the number of different chunks, a sequence of 256 bytes (128 words)
  234. # of chunk numbers indexed by their original chunk position, and a
  235. # sequence of chunks (16 words each).
  236. # Compression is normally good: in a typical charset, large ranges of
  237. # Unicode will be either completely excluded (e.g. if only cyrillic
  238. # letters are to be matched), or completely included (e.g. if large
  239. # subranges of Kanji match). These ranges will be represented by
  240. # chunks of all one-bits or all zero-bits.
  241. # Matching can be also done efficiently: the more significant byte of
  242. # the Unicode character is an index into the chunk number, and the
  243. # less significant byte is a bit index in the chunk (just like the
  244. # CHARSET matching).
def _optimize_unicode(charset, fixup):
    """Compile *charset* into a BIGCHARSET structure (see comment above).

    Returns [(BIGCHARSET, data)] on success, or the original *charset*
    when it contains categories and cannot be compressed.
    """
    # full 64K membership bitmap, one flag per code point
    charmap = [0]*65536
    negate = 0
    for op, av in charset:
        if op is NEGATE:
            negate = 1
        elif op is LITERAL:
            charmap[fixup(av)] = 1
        elif op is RANGE:
            for i in range(fixup(av[0]), fixup(av[1])+1):
                charmap[i] = 1
        elif op is CATEGORY:
            # XXX: could expand category
            return charset # cannot compress
    if negate:
        # fold the negation into the bitmap itself
        for i in range(65536):
            charmap[i] = not charmap[i]
    # deduplicate the 256 chunks of 256 code points each; comps maps a
    # chunk's contents to the number of the first identical chunk seen
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in range(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first occurrence: assign it the next block number and
            # append its packed bits to the data area
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    assert MAXCODE == 65535
    # pack the 256 byte-sized chunk numbers into 128 16-bit words,
    # honouring the host byte order
    for i in range(128):
        if sys.byteorder == 'big':
            header.append(256*mapping[2*i]+mapping[2*i+1])
        else:
            header.append(mapping[2*i]+256*mapping[2*i+1])
    # header (count + mapping) precedes the chunk data
    data[0:0] = header
    return [(BIGCHARSET, data)]
  282. def _simple(av):
  283. # check if av is a "simple" operator
  284. lo, hi = av[2].getwidth()
  285. if lo == 0 and hi == MAXREPEAT:
  286. raise error, "nothing to repeat"
  287. return lo == hi == 1 and av[2][0][0] != SUBPATTERN
def _compile_info(code, pattern, flags):
    """Emit an optional INFO block (scan hints) for *pattern* onto *code*.

    The block records min/max match width plus either a literal prefix
    (with a KMP-style overlap table) or a first-character set, which the
    matcher uses to skip impossible starting positions.
    """
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                # prefix_skip tracks how many leading literals are NOT
                # inside a group (only those may be skipped on a match)
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefix.append(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                # a group wrapping a single literal still extends the prefix
                op, av = av[1][0]
                if op is LITERAL:
                    prefix.append(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                # peek inside a leading group
                op, av = av[1][0]
                if op is LITERAL:
                    charset.append((op, av))
                elif op is BRANCH:
                    # collect first literals of all alternatives; only
                    # usable if EVERY alternative starts with a literal
                    c = []
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            c.append((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                # same collection for a top-level branch
                c = []
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        c.append((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        # the whole pattern is one skippable literal run
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length (clamped to what fits in one code word)
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)  # 0 here means "no usable upper bound"
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (KMP-style failure function)
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
# the types compile() accepts as uncompiled pattern strings; unicode is
# added only when the interpreter provides it
STRING_TYPES = [type("")]
try:
    STRING_TYPES.append(type(unicode("")))
except NameError:
    # interpreter has no separate unicode string type
    pass
  394. def _code(p, flags):
  395. flags = p.pattern.flags | flags
  396. code = []
  397. # compile info block
  398. _compile_info(code, p, flags)
  399. # compile the pattern
  400. _compile(code, p.data, flags)
  401. code.append(OPCODES[SUCCESS])
  402. return code
  403. def compile(p, flags=0):
  404. # internal: convert pattern list to internal format
  405. if type(p) in STRING_TYPES:
  406. import sre_parse
  407. pattern = p
  408. p = sre_parse.parse(p, flags)
  409. else:
  410. pattern = None
  411. code = _code(p, flags)
  412. # print code
  413. # XXX: <fl> get rid of this limitation!
  414. assert p.pattern.groups <= 100,\
  415. "sorry, but this version only supports 100 named groups"
  416. # map in either direction
  417. groupindex = p.pattern.groupdict
  418. indexgroup = [None] * p.pattern.groups
  419. for k, i in groupindex.items():
  420. indexgroup[i] = k
  421. return _sre.compile(
  422. pattern, flags, code,
  423. p.pattern.groups-1,
  424. groupindex, indexgroup
  425. )