PageRenderTime 50ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/rpython/rlib/parsing/test/test_pcre_regtest.py

https://bitbucket.org/buck_golemon/pypy
Python | 307 lines | 302 code | 0 blank | 5 comment | 0 complexity | 91df9d9404b3d0b120a7c521d5614eac MD5 | raw file
Possible License(s): Apache-2.0
"""This test reads and parses PCRE regression tests and tries them out
on our regular expression library.
We currently only test against testoutput7 (the DFA tests). We used to run
testoutput1 as well, but that exercises PCRE's own matching, which is
inconsistent with our matching on cases like "[ab]{1,3}(ab*|b)" against 'aabbbb'.
"""
  7. pcre_license = """
  8. # The PCRE library is distributed under the BSD license. We have borrowed some
  9. # of the regression tests (the ones that fit under the DFA scope) in order to
  10. # exercise our regex implementation. Those tests are distributed under PCRE's
  11. # BSD license. Here is the text:
  12. # PCRE LICENCE
  13. # ------------
  14. #
  15. # PCRE is a library of functions to support regular expressions whose syntax
  16. # and semantics are as close as possible to those of the Perl 5 language.
  17. #
  18. # Release 7 of PCRE is distributed under the terms of the "BSD" licence, as
  19. # specified below. The documentation for PCRE, supplied in the "doc"
  20. # directory, is distributed under the same terms as the software itself.
  21. #
  22. # The basic library functions are written in C and are freestanding. Also
  23. # included in the distribution is a set of C++ wrapper functions.
  24. #
  25. # THE BASIC LIBRARY FUNCTIONS
  26. # ---------------------------
  27. #
  28. # Written by: Philip Hazel
  29. # Email local part: ph10
  30. # Email domain: cam.ac.uk
  31. #
  32. # University of Cambridge Computing Service,
  33. # Cambridge, England.
  34. #
  35. # Copyright (c) 1997-2008 University of Cambridge
  36. # All rights reserved.
  37. #
  38. # THE C++ WRAPPER FUNCTIONS
  39. # -------------------------
  40. #
  41. # Contributed by: Google Inc.
  42. #
  43. # Copyright (c) 2007-2008, Google Inc.
  44. # All rights reserved.
  45. #
  46. # THE "BSD" LICENCE
  47. # -----------------
  48. #
  49. # Redistribution and use in source and binary forms, with or without
  50. # modification, are permitted provided that the following conditions are met:
  51. #
  52. # * Redistributions of source code must retain the above copyright notice,
  53. # this list of conditions and the following disclaimer.
  54. #
  55. # * Redistributions in binary form must reproduce the above copyright
  56. # notice, this list of conditions and the following disclaimer in the
  57. # documentation and/or other materials provided with the distribution.
  58. #
  59. # * Neither the name of the University of Cambridge nor the name of Google
  60. # Inc. nor the names of their contributors may be used to endorse or
  61. # promote products derived from this software without specific prior
  62. # written permission.
  63. #
  64. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  65. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  66. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  67. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  68. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  69. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  70. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  71. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  72. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  73. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  74. # POSSIBILITY OF SUCH DAMAGE.
  75. #
  76. # End
  77. """
import pickle
import re
import string
from collections import deque

import py

from rpython.rlib.parsing.regexparse import make_runner, unescape
# Directory that contains this module; the PCRE transcript ('testoutput7')
# and the generated suite ('pcre_test_7.py') are looked up relative to it.
this_dir = py.path.local(__file__).join('..')
  83. #py.test.skip("Still in progress")
# Dumpers are objects that can dump/load the suite
  85. class Dumper(object):
  86. def __init__(self, file):
  87. pass
  88. def dump(self, tests):
  89. pass
  90. def load(self):
  91. return []
  92. class PickleDumper(Dumper):
  93. import pickle
  94. def __init__(self, fileobj):
  95. self.file = fileobj
  96. def dump(self, tests):
  97. pickle.dump(suite, self.file)
  98. def load(self):
  99. suite = pickle.load(file)
  100. return suite
  101. class PythonDumper(Dumper):
  102. def __init__(self, fileobj):
  103. self.file = fileobj
  104. def dump(self, tests):
  105. self.file.write('# Auto-generated file of regular expressions from PCRE library\n')
  106. self.file.write(pcre_license)
  107. self.file.write('suite = []\n')
  108. for test in tests:
  109. self.file.write('suite.append(%r)\n' % test)
  110. def load(self):
  111. d = {}
  112. text = self.file.read()
  113. exec text in d
  114. return d['suite']
  115. def generate_output7():
  116. """Create the testoutput7.py file from the PCRE file testoutput7"""
  117. create_pcre_pickle(this_dir.join('testoutput7').open(),
  118. PythonDumper(this_dir.join('pcre_test_7.py').open('w')))
  119. def create_pcre_pickle(file, dumper):
  120. """Create a filtered PCRE test file for the test."""
  121. lines = [line for line in file.readlines()]
  122. # Look for things to skip...
  123. no_escape = r'(^|[^\\])(\\\\)*' # Make sure there's no escaping \
  124. greedy_ops = re.compile(no_escape + r'[*?+}\(]\?') # Look for *? +? }? (?
  125. back_refs = re.compile(no_escape + r'\(.*' + no_escape + r'\\1') # find a \1
  126. caret_in_middle = re.compile(no_escape + r'[^\[\\]\^')
  127. posix_char_classes = re.compile(no_escape + r'\[[^]]*\[:[^]]+:\][^]]*\]') # like [[:digit:]]
  128. bad_backslashes = re.compile(no_escape + r'(\\Q|\\E|\\G|\\P|\\8|\\9|\\A|\\Z|\\F|\\R|\\B|\\b|\\h|\\H|\\v|\\V|\\z|\\N)') # PCRE allows \Q.....\E to quote substrings, we dont.
  129. # Perl allows single-digit hex escapes. Change \x0 -> \x00, for example
  130. expand_perl_hex = re.compile(r'\\x([0-9a-fA-F]{1})(?=[^0-9a-fA-F]|$)')
  131. # suite = [
  132. # [regex, flags, [(test,result),(test,result),...]]
  133. # [regex, flags, [(test,result),(test,result),...]]
  134. # ]
  135. suite = []
  136. while lines:
  137. delim = None
  138. regex = ''
  139. # A line is marked by a start-delimeter and an end-delimeter.
  140. # The delimeter is non-alphanumeric
  141. # If a backslash follows the delimiter, then the backslash should
  142. # be appended to the end. (Otherwise, \ + delim would not be a
  143. # delim anymore!)
  144. while 1:
  145. regex += lines.pop(0)
  146. if not delim:
  147. if not regex.strip(): # Suppress blank lanes before delim
  148. regex = ''
  149. continue
  150. delim = regex.strip()[0]
  151. assert delim in (set(string.printable) - set(string.letters) - set(string.digits))
  152. test_re = re.compile(r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)([^\n\r]*)' % {'delim': delim})
  153. # last two groups are an optional backslash and optional flags
  154. matches = test_re.findall(regex)
  155. if matches:
  156. break
  157. assert len(matches)==1 # check to make sure we matched right
  158. regex = matches[0][0]
  159. regex += matches[0][-2] # Add the backslash, if we gotta
  160. flags = matches[0][-1] # Get the flags for the regex
  161. # Gotta tolerate Perl's short hexes
  162. regex = expand_perl_hex.sub(lambda m: r'\x0'+m.group(1), regex)
  163. tests = []
  164. if greedy_ops.search(regex) or back_refs.search(regex):
  165. # Suppress complex features we can't do
  166. pass
  167. elif flags:
  168. # Suppress any test that requires PCRE flags
  169. pass
  170. elif posix_char_classes.search(regex):
  171. pass
  172. elif caret_in_middle.search(regex):
  173. pass
  174. elif bad_backslashes.search(regex):
  175. pass
  176. else:
  177. # In any other case, we're going to add the test
  178. # All the above test fall through and DONT get appended
  179. suite.append([regex, flags, tests])
  180. # Now find the test and expected result
  181. while lines:
  182. test = lines.pop(0).strip()
  183. if not test:
  184. break # blank line ends the set
  185. if test.endswith('\\'): # Tests that end in \ expect the \ to be chopped off
  186. assert not test.endswith('\\\\\\') # Make sure not three \'s. otherwise this check will get ridiculous
  187. if not test.endswith('\\\\'): # Two \'s means a real \
  188. test = test[:-1]
  189. test = expand_perl_hex.sub(lambda m: r'\x0'+m.group(1), test)
  190. disqualify_test = bad_backslashes.search(test)
  191. try:
  192. test = unescape(test)
  193. except Exception:
  194. disqualify_test = True
  195. print "Warning: could not unescape %r" % test
  196. # Third line in the OUTPUT is the result, either:
  197. # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
  198. # 'No match' for no match
  199. # (other kinds exist, but we ignore them)
  200. while lines:
  201. match = lines.pop(0).rstrip('\r\n')
  202. match = re.sub(r'\\x([0-9a-fA-F]{2})', lambda m: chr(int(m.group(1),16)), match)
  203. if match.startswith('No match') or match.startswith('Error') or match.startswith('Partial'):
  204. match = None
  205. break
  206. elif match.startswith(' 0:'):
  207. # Now we need to eat any further lines like:
  208. # ' 1: ....' a subgroup match
  209. match = match[4:]
  210. while lines[0].strip():
  211. # ' 0+ ...' is also possible here
  212. if lines[0][2] in [':','+']:
  213. lines.pop(0)
  214. else:
  215. break
  216. break
  217. elif not match:
  218. print " *** %r ***" % match
  219. raise Exception("Lost sync in output.")
  220. if not disqualify_test:
  221. tests.append((test,match))
  222. # Last step, if there are regex's that dont have any tests,
  223. # might as well strip them out
  224. suite = [test for test in suite if test[2]]
  225. dumper.dump(suite)
  226. def run_individual_test(regex, tests):
  227. """Run a test from the PCRE suite."""
  228. # Process the regex and make it ready for make_runner
  229. regex_to_use = regex
  230. anchor_left = regex_to_use.startswith('^')
  231. anchor_right = regex_to_use.endswith('$') and not regex_to_use.endswith('\\$')
  232. if anchor_left:
  233. regex_to_use = regex_to_use[1:] # chop the ^ if it's there
  234. if anchor_right:
  235. regex_to_use = regex_to_use[:-1] # chop the $ if it's there
  236. if not regex_to_use:
  237. #print " SKIPPED (Cant do blank regex)"
  238. return
  239. print "%s:" % regex_to_use
  240. runner = make_runner(regex_to_use)
  241. # Now run the test expressions against the Regex
  242. for test, match in tests:
  243. print "/%r/%r/" % (test, match)
  244. # Create possible subsequences that we should test
  245. if anchor_left:
  246. start_range = [0]
  247. else:
  248. start_range = range(0, len(test))
  249. if anchor_right:
  250. subseq_gen = ( (start, len(test)) for start in start_range )
  251. else:
  252. # Go backwards to simulate greediness
  253. subseq_gen = ( (start, end) for start in start_range for end in range(len(test)+1, start-1, -1) )
  254. # Search the possibilities for a match...
  255. for start, end in subseq_gen:
  256. attempt = test[start:end]
  257. if runner.recognize(attempt):
  258. assert attempt==match
  259. break
  260. else:
  261. assert match is None
  262. def test_output7():
  263. suite = PythonDumper(this_dir.join('pcre_test_7.py').open()).load()
  264. while suite:
  265. regex, flags, tests = suite.pop(0)
  266. yield run_individual_test, regex, tests
  267. if __name__=="__main__":
  268. for fcn, regex, tests in test_output7():
  269. fcn(regex,tests)