PageRenderTime 26ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/rpython/rlib/rsre/rsre_char.py

https://bitbucket.org/kkris/pypy
Python | 237 lines | 167 code | 37 blank | 33 comment | 44 complexity | abb17d48d8b5ae93d409559333823f31 MD5 | raw file
  1. """
  2. Character categories and charsets.
  3. """
  4. import sys
  5. from rpython.rlib.rlocale import tolower, isalnum
  6. from rpython.rlib.unroll import unrolling_iterable
  7. from rpython.rlib import jit
  8. from rpython.rlib.rarithmetic import int_between
  # Note: the unicode-aware helpers of this module only work once
  # rsre_char.set_unicode_db() has been called to select one of the
  # modules pypy.module.unicodedata.unicodedb_x_y_z.  This lets PyPy run
  # sre against the same unicodedb version that unicodeobject.py uses.
  # As long as it is unset, the RPython program cannot use unicode
  # matching.
  unicodedb = None    # possibly patched by set_unicode_db()


  def set_unicode_db(newunicodedb):
      """Install `newunicodedb` as the module-wide unicode database.

      The object is stored in the module global `unicodedb`, which the
      unicode helpers of this module read.
      """
      global unicodedb
      unicodedb = newunicodedb
  #### Constants

  # Identifying as _sre from Python 2.3 and onwards (at least up to 2.7)
  MAGIC = 20031017

  # In _sre.c CODESIZE is the byte size of the code word type of the C
  # implementation: 2 for normal Python builds, more for wide unicode
  # builds (large enough to hold a 32-bit UCS-4 encoded character).
  # Here we only ever see re bytecodes as Python longs, so the code size
  # should not matter -- but sre_compile compiles some things (e.g.
  # charsets) differently depending on it, so we have to follow along.
  from rpython.rlib.runicode import MAXUNICODE
  CODESIZE = 2 if MAXUNICODE == 65535 else 4

  copyright = "_sre.py 2.4 Copyright 2005 by Nik Haldimann"

  BIG_ENDIAN = (sys.byteorder == "big")

  # XXX can we import those safely from sre_constants?
  SRE_INFO_PREFIX = 1
  SRE_INFO_LITERAL = 2
  SRE_INFO_CHARSET = 4
  SRE_FLAG_LOCALE = 4     # honour system locale
  SRE_FLAG_UNICODE = 32   # use unicode locale
  OPCODE_INFO = 17
  OPCODE_LITERAL = 19
  MAXREPEAT = 65535
  def getlower(char_ord, flags):
      """Return the lowercase counterpart of the code point `char_ord`.

      `flags` selects the conversion: SRE_FLAG_LOCALE is checked first
      and only converts code points below 256 via the locale's tolower();
      otherwise SRE_FLAG_UNICODE uses the configured unicodedb; with
      neither flag set only the ASCII letters 'A'..'Z' are converted.
      """
      if flags & SRE_FLAG_LOCALE:
          # cheating!  Well, CPython does too.
          if char_ord >= 256:
              return char_ord
          return tolower(char_ord)
      if flags & SRE_FLAG_UNICODE:
          assert unicodedb is not None
          return unicodedb.tolower(char_ord)
      if int_between(ord('A'), char_ord, ord('Z') + 1):   # ASCII lower
          return char_ord + (ord('a') - ord('A'))
      return char_ord
  #### Category helpers

  # Lookup table indexed by byte value: true for the characters that
  # str.isalnum() accepts, plus the underscore.
  is_a_word = [(chr(i) == '_' or chr(i).isalnum()) for i in range(256)]

  underline = ord("_")
  linebreak = ord("\n")
  def is_digit(code):
      """Tell whether `code` is one of the ASCII digits '0'..'9'."""
      return int_between(ord('0'), code, ord('9') + 1)
  def is_uni_digit(code):
      """Ask the configured unicodedb whether `code` is a decimal digit.

      set_unicode_db() must have been called beforehand.
      """
      database = unicodedb
      assert database is not None
      return database.isdecimal(code)
  def is_space(code):
      """Tell whether `code` is the ASCII space (32) or lies in 9..13."""
      # Non-short-circuit '|' on purpose: both tests are always evaluated.
      return int_between(9, code, 14) | (code == 32)
  def is_uni_space(code):
      """Ask the configured unicodedb whether `code` is whitespace.

      set_unicode_db() must have been called beforehand.
      """
      database = unicodedb
      assert database is not None
      return database.isspace(code)
  def is_word(code):
      """Tell whether `code` is a word character per the `is_a_word` table.

      Code points of 256 and above are never word characters here.
      """
      assert code >= 0
      if code >= 256:
          return False
      return is_a_word[code]
  def is_uni_word(code):
      """Tell whether `code` is a unicode word character.

      That is: alphanumeric according to the configured unicodedb, or the
      underscore.  set_unicode_db() must have been called beforehand.
      """
      database = unicodedb
      assert database is not None
      return database.isalnum(code) or code == underline
  def is_loc_alnum(code):
      """Locale-aware alphanumeric test, limited to code points below 256."""
      if code >= 256:
          return False
      return isalnum(code)
  def is_loc_word(code):
      """Locale-aware word test: the underscore or a locale alphanumeric."""
      if code == underline:
          return True
      return is_loc_alnum(code)
  def is_linebreak(code):
      """Tell whether `code` is the newline character "\\n"."""
      return linebreak == code
  def is_uni_linebreak(code):
      """Ask the configured unicodedb whether `code` is a line break.

      set_unicode_db() must have been called beforehand.
      """
      database = unicodedb
      assert database is not None
      return database.islinebreak(code)
  #### Category dispatch

  def category_dispatch(category_code, char_code):
      """Evaluate the category test numbered `category_code` on `char_code`.

      `category_code` is the position of a (function, negate) pair in
      category_dispatch_table; the function's answer is inverted when
      `negate` is set.  An unknown category code yields False.
      """
      # The explicit counter is kept next to the plain 'for' loop over the
      # unrolling iterable rather than replaced by enumerate().
      position = 0
      for predicate, inverted in category_dispatch_unroll:
          if category_code == position:
              outcome = predicate(char_code)
              if inverted:
                  return not outcome   # XXX this might lead to a guard
              return outcome
          position += 1
      return False
  # Maps category codes (used as list indices) to (function, negate)
  # tuples.  Every predicate appears twice: plain at the even index and
  # negated at the following odd index.
  category_dispatch_table = [
      (is_digit, False),           # 0
      (is_digit, True),            # 1
      (is_space, False),           # 2
      (is_space, True),            # 3
      (is_word, False),            # 4
      (is_word, True),             # 5
      (is_linebreak, False),       # 6
      (is_linebreak, True),        # 7
      (is_loc_word, False),        # 8
      (is_loc_word, True),         # 9
      (is_uni_digit, False),       # 10
      (is_uni_digit, True),        # 11
      (is_uni_space, False),       # 12
      (is_uni_space, True),        # 13
      (is_uni_word, False),        # 14
      (is_uni_word, True),         # 15
      (is_uni_linebreak, False),   # 16
      (is_uni_linebreak, True),    # 17
  ]
  category_dispatch_unroll = unrolling_iterable(category_dispatch_table)
  ##### Charset evaluation

  @jit.unroll_safe
  def check_charset(pattern, ppos, char_code):
      """Check whether `char_code` matches a set of arbitrary length.

      The set starts at pattern[ppos].  Its items are evaluated one after
      the other by the handlers of set_dispatch_table and their answers
      are or-ed together.  Opcode 0 (FAILURE) ends the set, opcode 26
      (NEGATE) flips the sense of the final answer, and any other opcode
      without a handler makes the whole check return False.
      """
      negated = False
      result = False
      while True:
          opcode = pattern[ppos]
          slot = 0
          for handler in set_dispatch_unroll:
              if handler is not None and opcode == slot:
                  # The handler also reports where the next set item starts.
                  matched, ppos = handler(pattern, ppos, char_code)
                  result |= matched
                  break
              slot += 1
          else:
              # No handler is registered for this opcode.
              if opcode == 0:       # FAILURE
                  break
              if opcode != 26:      # anything but NEGATE
                  return False
              negated ^= True
              ppos += 1
      if negated:
          return not result
      return result
  def set_literal(pat, index, char_code):
      """Evaluate a set item of the form <LITERAL> <code>.

      Returns a pair (match, next_index): whether `char_code` equals the
      literal stored after the opcode, and the position just past this
      two-word item.
      """
      literal = pat[index + 1]
      return literal == char_code, index + 2
  def set_category(pat, index, char_code):
      """Evaluate a set item of the form <CATEGORY> <code>.

      Returns a pair (match, next_index): the answer of category_dispatch()
      for the category code stored after the opcode, and the position
      just past this two-word item.
      """
      category_code = pat[index + 1]
      return category_dispatch(category_code, char_code), index + 2
  def set_charset(pat, index, char_code):
      """Evaluate a set item of the form <CHARSET> <bitmap>.

      The bitmap holds one bit for each of the code points 0..255, packed
      into code words of 16 bits (CODESIZE == 2) or 32 bits (otherwise).
      Code points of 256 and above never match.

      Returns a pair (match, next_index).  `match` is always a real
      boolean -- the masked bitmap word is compared against 0, like
      set_bigcharset() does, instead of being handed out as a raw integer.
      `next_index` is the position just past the bitmap.
      """
      if CODESIZE == 2:
          match = char_code < 256 and \
                  (pat[index + 1 + (char_code >> 4)]
                   & (1 << (char_code & 15))) != 0
          return match, index + 17    # skip bitmap (16 words)
      else:
          match = char_code < 256 and \
                  (pat[index + 1 + (char_code >> 5)]
                   & (1 << (char_code & 31))) != 0
          return match, index + 9     # skip bitmap (8 words)
  def set_range(pat, index, char_code):
      """Evaluate a set item of the form <RANGE> <lower> <upper>.

      Returns a pair (match, next_index): whether `char_code` lies between
      the two bounds (both inclusive), and the position just past this
      three-word item.
      """
      lower = pat[index + 1]
      upper = pat[index + 2]
      return int_between(lower, char_code, upper + 1), index + 3
  def set_bigcharset(pat, index, char_code):
      """Evaluate <BIGCHARSET> <blockcount> <256 blockindices> <blocks>.

      The 256 block indices are one byte each, packed CODESIZE per code
      word; the high byte of `char_code` selects one of them.  Each block
      is a 256-bit bitmap (32 // CODESIZE code words) that is indexed with
      the low byte of `char_code`.  Code points of 65536 and above never
      match.

      Returns a pair (match, next_index), where `next_index` is the
      position just past all the blocks.

      All index arithmetic uses floor division (`//`) so that it stays
      integral whatever division semantics are in effect.
      """
      # XXX this function needs a makeover, it's very bad
      count = pat[index + 1]
      indices_start = index + 2
      blocks_start = indices_start + 256 // CODESIZE   # skip block indices
      words_per_block = 32 // CODESIZE
      if char_code < 65536:
          block_index = char_code >> 8
          # NB: there are CODESIZE block indices per bytecode
          packed = to_byte_array(pat[indices_start + block_index // CODESIZE])
          block = packed[block_index % CODESIZE]
          if CODESIZE == 2:
              shift = 4
          else:
              shift = 5
          block_value = pat[blocks_start + block * words_per_block
                            + ((char_code & 255) >> shift)]
          match = (block_value
                   & (1 << (char_code & ((8 * CODESIZE) - 1)))) != 0
      else:
          match = False
      return match, blocks_start + count * words_per_block   # skip blocks
  def to_byte_array(int_value):
      """Split `int_value` into a list of CODESIZE byte values.

      The bytes are taken from the least significant end first; on
      big-endian hosts the list is then reversed.
      """
      byte_array = []
      for _ in range(CODESIZE):
          byte_array.append(int_value & 0xff)
          int_value >>= 8
      if BIG_ENDIAN:
          byte_array.reverse()
      return byte_array
  # Handlers for the items of a character set, indexed by opcode.  Opcodes
  # without a handler stay None; that includes 0 (FAILURE) and 26 (NEGATE),
  # which check_charset() deals with itself.
  set_dispatch_table = [None] * 28
  set_dispatch_table[9] = set_category
  set_dispatch_table[10] = set_charset
  set_dispatch_table[11] = set_bigcharset
  set_dispatch_table[19] = set_literal
  set_dispatch_table[27] = set_range
  set_dispatch_unroll = unrolling_iterable(set_dispatch_table)