PageRenderTime 47ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/pypy/rlib/parsing/test/test_pcre_regtest.py

https://bitbucket.org/yrttyr/pypy
Python | 308 lines | 303 code | 0 blank | 5 comment | 0 complexity | db6933a813f65e495a39ca967785611a MD5 | raw file
  1. """This test can read and parse PCRE regression tests to try out
  2. on our regular expression library.
  3. We currently only test against testoutput7 (DFA tests). We were doing
  4. testoutput1, but that was PCRE matching, which was inconsistent with
  5. our matching on strings like "[ab]{1,3}(ab*|b)" against 'aabbbb'.
  6. """
  7. pcre_license = """
  8. # The PCRE library is distributed under the BSD license. We have borrowed some
  9. # of the regression tests (the ones that fit under the DFA scope) in order to
  10. # exercise our regex implementation. Those tests are distributed under PCRE's
  11. # BSD license. Here is the text:
  12. # PCRE LICENCE
  13. # ------------
  14. #
  15. # PCRE is a library of functions to support regular expressions whose syntax
  16. # and semantics are as close as possible to those of the Perl 5 language.
  17. #
  18. # Release 7 of PCRE is distributed under the terms of the "BSD" licence, as
  19. # specified below. The documentation for PCRE, supplied in the "doc"
  20. # directory, is distributed under the same terms as the software itself.
  21. #
  22. # The basic library functions are written in C and are freestanding. Also
  23. # included in the distribution is a set of C++ wrapper functions.
  24. #
  25. # THE BASIC LIBRARY FUNCTIONS
  26. # ---------------------------
  27. #
  28. # Written by: Philip Hazel
  29. # Email local part: ph10
  30. # Email domain: cam.ac.uk
  31. #
  32. # University of Cambridge Computing Service,
  33. # Cambridge, England.
  34. #
  35. # Copyright (c) 1997-2008 University of Cambridge
  36. # All rights reserved.
  37. #
  38. # THE C++ WRAPPER FUNCTIONS
  39. # -------------------------
  40. #
  41. # Contributed by: Google Inc.
  42. #
  43. # Copyright (c) 2007-2008, Google Inc.
  44. # All rights reserved.
  45. #
  46. # THE "BSD" LICENCE
  47. # -----------------
  48. #
  49. # Redistribution and use in source and binary forms, with or without
  50. # modification, are permitted provided that the following conditions are met:
  51. #
  52. # * Redistributions of source code must retain the above copyright notice,
  53. # this list of conditions and the following disclaimer.
  54. #
  55. # * Redistributions in binary form must reproduce the above copyright
  56. # notice, this list of conditions and the following disclaimer in the
  57. # documentation and/or other materials provided with the distribution.
  58. #
  59. # * Neither the name of the University of Cambridge nor the name of Google
  60. # Inc. nor the names of their contributors may be used to endorse or
  61. # promote products derived from this software without specific prior
  62. # written permission.
  63. #
  64. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  65. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  66. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  67. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  68. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  69. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  70. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  71. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  72. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  73. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  74. # POSSIBILITY OF SUCH DAMAGE.
  75. #
  76. # End
  77. """
  78. import py
  79. from pypy.rlib.parsing.regexparse import make_runner, unescape
  80. import string
  81. import re
  82. import autopath
  83. this_dir = py.path.local(autopath.this_dir)
  84. #py.test.skip("Still in progress")
  85. # Dumpers are objects that can dump/load the suite
  86. class Dumper(object):
  87. def __init__(self, file):
  88. pass
  89. def dump(self, tests):
  90. pass
  91. def load(self):
  92. return []
  93. class PickleDumper(Dumper):
  94. import pickle
  95. def __init__(self, fileobj):
  96. self.file = fileobj
  97. def dump(self, tests):
  98. pickle.dump(suite, self.file)
  99. def load(self):
  100. suite = pickle.load(file)
  101. return suite
  102. class PythonDumper(Dumper):
  103. def __init__(self, fileobj):
  104. self.file = fileobj
  105. def dump(self, tests):
  106. self.file.write('# Auto-generated file of regular expressions from PCRE library\n')
  107. self.file.write(pcre_license)
  108. self.file.write('suite = []\n')
  109. for test in tests:
  110. self.file.write('suite.append(%r)\n' % test)
  111. def load(self):
  112. d = {}
  113. text = self.file.read()
  114. exec text in d
  115. return d['suite']
  116. def generate_output7():
  117. """Create the testoutput7.py file from the PCRE file testoutput7"""
  118. create_pcre_pickle(this_dir.join('testoutput7').open(),
  119. PythonDumper(this_dir.join('pcre_test_7.py').open('w')))
  120. def create_pcre_pickle(file, dumper):
  121. """Create a filtered PCRE test file for the test."""
  122. lines = [line for line in file.readlines()]
  123. # Look for things to skip...
  124. no_escape = r'(^|[^\\])(\\\\)*' # Make sure there's no escaping \
  125. greedy_ops = re.compile(no_escape + r'[*?+}\(]\?') # Look for *? +? }? (?
  126. back_refs = re.compile(no_escape + r'\(.*' + no_escape + r'\\1') # find a \1
  127. caret_in_middle = re.compile(no_escape + r'[^\[\\]\^')
  128. posix_char_classes = re.compile(no_escape + r'\[[^]]*\[:[^]]+:\][^]]*\]') # like [[:digit:]]
  129. bad_backslashes = re.compile(no_escape + r'(\\Q|\\E|\\G|\\P|\\8|\\9|\\A|\\Z|\\F|\\R|\\B|\\b|\\h|\\H|\\v|\\V|\\z|\\N)') # PCRE allows \Q.....\E to quote substrings, we dont.
  130. # Perl allows single-digit hex escapes. Change \x0 -> \x00, for example
  131. expand_perl_hex = re.compile(r'\\x([0-9a-fA-F]{1})(?=[^0-9a-fA-F]|$)')
  132. # suite = [
  133. # [regex, flags, [(test,result),(test,result),...]]
  134. # [regex, flags, [(test,result),(test,result),...]]
  135. # ]
  136. suite = []
  137. while lines:
  138. delim = None
  139. regex = ''
  140. # A line is marked by a start-delimeter and an end-delimeter.
  141. # The delimeter is non-alphanumeric
  142. # If a backslash follows the delimiter, then the backslash should
  143. # be appended to the end. (Otherwise, \ + delim would not be a
  144. # delim anymore!)
  145. while 1:
  146. regex += lines.pop(0)
  147. if not delim:
  148. if not regex.strip(): # Suppress blank lanes before delim
  149. regex = ''
  150. continue
  151. delim = regex.strip()[0]
  152. assert delim in (set(string.printable) - set(string.letters) - set(string.digits))
  153. test_re = re.compile(r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)([^\n\r]*)' % {'delim': delim})
  154. # last two groups are an optional backslash and optional flags
  155. matches = test_re.findall(regex)
  156. if matches:
  157. break
  158. assert len(matches)==1 # check to make sure we matched right
  159. regex = matches[0][0]
  160. regex += matches[0][-2] # Add the backslash, if we gotta
  161. flags = matches[0][-1] # Get the flags for the regex
  162. # Gotta tolerate Perl's short hexes
  163. regex = expand_perl_hex.sub(lambda m: r'\x0'+m.group(1), regex)
  164. tests = []
  165. if greedy_ops.search(regex) or back_refs.search(regex):
  166. # Suppress complex features we can't do
  167. pass
  168. elif flags:
  169. # Suppress any test that requires PCRE flags
  170. pass
  171. elif posix_char_classes.search(regex):
  172. pass
  173. elif caret_in_middle.search(regex):
  174. pass
  175. elif bad_backslashes.search(regex):
  176. pass
  177. else:
  178. # In any other case, we're going to add the test
  179. # All the above test fall through and DONT get appended
  180. suite.append([regex, flags, tests])
  181. # Now find the test and expected result
  182. while lines:
  183. test = lines.pop(0).strip()
  184. if not test:
  185. break # blank line ends the set
  186. if test.endswith('\\'): # Tests that end in \ expect the \ to be chopped off
  187. assert not test.endswith('\\\\\\') # Make sure not three \'s. otherwise this check will get ridiculous
  188. if not test.endswith('\\\\'): # Two \'s means a real \
  189. test = test[:-1]
  190. test = expand_perl_hex.sub(lambda m: r'\x0'+m.group(1), test)
  191. disqualify_test = bad_backslashes.search(test)
  192. try:
  193. test = unescape(test)
  194. except Exception:
  195. disqualify_test = True
  196. print "Warning: could not unescape %r" % test
  197. # Third line in the OUTPUT is the result, either:
  198. # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
  199. # 'No match' for no match
  200. # (other kinds exist, but we ignore them)
  201. while lines:
  202. match = lines.pop(0).rstrip('\r\n')
  203. match = re.sub(r'\\x([0-9a-fA-F]{2})', lambda m: chr(int(m.group(1),16)), match)
  204. if match.startswith('No match') or match.startswith('Error') or match.startswith('Partial'):
  205. match = None
  206. break
  207. elif match.startswith(' 0:'):
  208. # Now we need to eat any further lines like:
  209. # ' 1: ....' a subgroup match
  210. match = match[4:]
  211. while lines[0].strip():
  212. # ' 0+ ...' is also possible here
  213. if lines[0][2] in [':','+']:
  214. lines.pop(0)
  215. else:
  216. break
  217. break
  218. elif not match:
  219. print " *** %r ***" % match
  220. raise Exception("Lost sync in output.")
  221. if not disqualify_test:
  222. tests.append((test,match))
  223. # Last step, if there are regex's that dont have any tests,
  224. # might as well strip them out
  225. suite = [test for test in suite if test[2]]
  226. dumper.dump(suite)
  227. def run_individual_test(regex, tests):
  228. """Run a test from the PCRE suite."""
  229. # Process the regex and make it ready for make_runner
  230. regex_to_use = regex
  231. anchor_left = regex_to_use.startswith('^')
  232. anchor_right = regex_to_use.endswith('$') and not regex_to_use.endswith('\\$')
  233. if anchor_left:
  234. regex_to_use = regex_to_use[1:] # chop the ^ if it's there
  235. if anchor_right:
  236. regex_to_use = regex_to_use[:-1] # chop the $ if it's there
  237. if not regex_to_use:
  238. #print " SKIPPED (Cant do blank regex)"
  239. return
  240. print "%s:" % regex_to_use
  241. runner = make_runner(regex_to_use)
  242. # Now run the test expressions against the Regex
  243. for test, match in tests:
  244. print "/%r/%r/" % (test, match)
  245. # Create possible subsequences that we should test
  246. if anchor_left:
  247. start_range = [0]
  248. else:
  249. start_range = range(0, len(test))
  250. if anchor_right:
  251. subseq_gen = ( (start, len(test)) for start in start_range )
  252. else:
  253. # Go backwards to simulate greediness
  254. subseq_gen = ( (start, end) for start in start_range for end in range(len(test)+1, start-1, -1) )
  255. # Search the possibilities for a match...
  256. for start, end in subseq_gen:
  257. attempt = test[start:end]
  258. if runner.recognize(attempt):
  259. assert attempt==match
  260. break
  261. else:
  262. assert match is None
  263. def test_output7():
  264. suite = PythonDumper(this_dir.join('pcre_test_7.py').open()).load()
  265. while suite:
  266. regex, flags, tests = suite.pop(0)
  267. yield run_individual_test, regex, tests
  268. if __name__=="__main__":
  269. for fcn, regex, tests in test_output7():
  270. fcn(regex,tests)