PageRenderTime 55ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/rpython/rlib/rsre/rsre_char.py

https://bitbucket.org/pjenvey/pypy-mq
Python | 265 lines | 179 code | 41 blank | 45 comment | 46 complexity | cd6b69c0af84fee7c5c0288e759db2ef MD5 | raw file
Possible License(s): Apache-2.0, AGPL-3.0, BSD-3-Clause
  1. """
  2. Character categories and charsets.
  3. """
  4. import sys
  5. from rpython.rlib.rlocale import tolower, isalnum
  6. from rpython.rlib.unroll import unrolling_iterable
  7. from rpython.rlib import jit
  8. from rpython.rlib.rarithmetic import int_between
  9. # Note: the unicode parts of this module require you to call
  10. # rsre_char.set_unicode_db() first, to select one of the modules
  11. # pypy.module.unicodedata.unicodedb_x_y_z. This allows PyPy to use sre
  12. # with the same version of the unicodedb as it uses for
  13. # unicodeobject.py. If unset, the RPython program cannot use unicode
  14. # matching.
  15. unicodedb = None # possibly patched by set_unicode_db()
  16. def set_unicode_db(newunicodedb):
  17. global unicodedb
  18. unicodedb = newunicodedb
  19. #### Constants
  20. if sys.maxint > 2**32:
  21. MAXREPEAT = int(2**32 - 1)
  22. MAXGROUPS = int(2**31 - 1)
  23. else:
  24. MAXREPEAT = int(2**31 - 1)
  25. MAXGROUPS = int((2**31 / sys.maxint / 2) - 1)
  26. # In _sre.c this is bytesize of the code word type of the C implementation.
  27. # There it's 2 for normal Python builds and more for wide unicode builds (large
  28. # enough to hold a 32-bit UCS-4 encoded character). Since here in pure Python
  29. # we only see re bytecodes as Python longs, we shouldn't have to care about the
  30. # codesize. But sre_compile will compile some stuff differently depending on the
  31. # codesize (e.g., charsets).
  32. from rpython.rlib.runicode import MAXUNICODE
  33. if MAXUNICODE == 65535:
  34. CODESIZE = 2
  35. else:
  36. CODESIZE = 4
  37. copyright = "_sre.py 2.4 Copyright 2005 by Nik Haldimann"
  38. BIG_ENDIAN = sys.byteorder == "big"
  39. # XXX can we import those safely from sre_constants?
  40. SRE_INFO_PREFIX = 1
  41. SRE_INFO_LITERAL = 2
  42. SRE_INFO_CHARSET = 4
  43. SRE_FLAG_LOCALE = 4 # honour system locale
  44. SRE_FLAG_UNICODE = 32 # use unicode locale
  45. def getlower(char_ord, flags):
  46. if flags & SRE_FLAG_LOCALE:
  47. if char_ord < 256: # cheating! Well, CPython does too.
  48. char_ord = tolower(char_ord)
  49. return char_ord
  50. elif flags & SRE_FLAG_UNICODE:
  51. assert unicodedb is not None
  52. char_ord = unicodedb.tolower(char_ord)
  53. else:
  54. if int_between(ord('A'), char_ord, ord('Z') + 1): # ASCII lower
  55. char_ord += ord('a') - ord('A')
  56. return char_ord
  57. #### Category helpers
  58. is_a_word = [(chr(i).isalnum() or chr(i) == '_') for i in range(256)]
  59. linebreak = ord("\n")
  60. underline = ord("_")
  61. def is_digit(code):
  62. return int_between(48, code, 58)
  63. def is_uni_digit(code):
  64. assert unicodedb is not None
  65. return unicodedb.isdecimal(code)
  66. def is_space(code):
  67. return (code == 32) | int_between(9, code, 14)
  68. def is_uni_space(code):
  69. assert unicodedb is not None
  70. return unicodedb.isspace(code)
  71. def is_word(code):
  72. assert code >= 0
  73. return code < 256 and is_a_word[code]
  74. def is_uni_word(code):
  75. assert unicodedb is not None
  76. return unicodedb.isalnum(code) or code == underline
  77. def is_loc_alnum(code):
  78. return code < 256 and isalnum(code)
  79. def is_loc_word(code):
  80. return code == underline or is_loc_alnum(code)
  81. def is_linebreak(code):
  82. return code == linebreak
  83. def is_uni_linebreak(code):
  84. assert unicodedb is not None
  85. return unicodedb.islinebreak(code)
  86. #### Category dispatch
  87. def category_dispatch(category_code, char_code):
  88. i = 0
  89. for function, negate in category_dispatch_unroll:
  90. if category_code == i:
  91. result = function(char_code)
  92. if negate:
  93. return not result # XXX this might lead to a guard
  94. else:
  95. return result
  96. i = i + 1
  97. else:
  98. return False
  99. # Maps opcodes by indices to (function, negate) tuples.
  100. category_dispatch_table = [
  101. (is_digit, False), (is_digit, True), (is_space, False),
  102. (is_space, True), (is_word, False), (is_word, True),
  103. (is_linebreak, False), (is_linebreak, True), (is_loc_word, False),
  104. (is_loc_word, True), (is_uni_digit, False), (is_uni_digit, True),
  105. (is_uni_space, False), (is_uni_space, True), (is_uni_word, False),
  106. (is_uni_word, True), (is_uni_linebreak, False),
  107. (is_uni_linebreak, True)
  108. ]
  109. category_dispatch_unroll = unrolling_iterable(category_dispatch_table)
  110. ##### Charset evaluation
  111. @jit.unroll_safe
  112. def check_charset(pattern, ppos, char_code):
  113. """Checks whether a character matches set of arbitrary length.
  114. The set starts at pattern[ppos]."""
  115. negated = False
  116. result = False
  117. while True:
  118. opcode = pattern[ppos]
  119. for i, function in set_dispatch_unroll:
  120. if opcode == i:
  121. newresult, ppos = function(pattern, ppos, char_code)
  122. result |= newresult
  123. break
  124. else:
  125. if opcode == 0: # FAILURE
  126. break
  127. elif opcode == 26: # NEGATE
  128. negated ^= True
  129. ppos += 1
  130. else:
  131. return False
  132. if negated:
  133. return not result
  134. return result
  135. def set_literal(pat, index, char_code):
  136. # <LITERAL> <code>
  137. match = pat[index+1] == char_code
  138. return match, index + 2
  139. def set_category(pat, index, char_code):
  140. # <CATEGORY> <code>
  141. match = category_dispatch(pat[index+1], char_code)
  142. return match, index + 2
  143. def set_charset(pat, index, char_code):
  144. # <CHARSET> <bitmap> (16 bits per code word)
  145. if CODESIZE == 2:
  146. match = char_code < 256 and \
  147. (pat[index+1+(char_code >> 4)] & (1 << (char_code & 15)))
  148. return match, index + 17 # skip bitmap
  149. else:
  150. match = char_code < 256 and \
  151. (pat[index+1+(char_code >> 5)] & (1 << (char_code & 31)))
  152. return match, index + 9 # skip bitmap
  153. def set_range(pat, index, char_code):
  154. # <RANGE> <lower> <upper>
  155. match = int_between(pat[index+1], char_code, pat[index+2] + 1)
  156. return match, index + 3
  157. def set_bigcharset(pat, index, char_code):
  158. # <BIGCHARSET> <blockcount> <256 blockindices> <blocks>
  159. count = pat[index+1]
  160. index += 2
  161. if CODESIZE == 2:
  162. # One bytecode is 2 bytes, so contains 2 of the blockindices.
  163. # So the 256 blockindices are packed in 128 bytecodes, but
  164. # we need to unpack it as a byte.
  165. assert char_code < 65536
  166. shift = 4
  167. else:
  168. # One bytecode is 4 bytes, so contains 4 of the blockindices.
  169. # So the 256 blockindices are packed in 64 bytecodes, but
  170. # we need to unpack it as a byte.
  171. if char_code >= 65536:
  172. index += 256 / CODESIZE + count * (32 / CODESIZE)
  173. return False, index
  174. shift = 5
  175. block = pat[index + (char_code >> (shift + 5))]
  176. block_shift = char_code >> 5
  177. if BIG_ENDIAN:
  178. block_shift = ~block_shift
  179. block_shift &= (CODESIZE - 1) * 8
  180. block = (block >> block_shift) & 0xFF
  181. index += 256 / CODESIZE
  182. block_value = pat[index+(block * (32 / CODESIZE)
  183. + ((char_code & 255) >> shift))]
  184. match = (block_value & (1 << (char_code & ((8 * CODESIZE) - 1))))
  185. index += count * (32 / CODESIZE) # skip blocks
  186. return match, index
  187. def set_unicode_general_category(pat, index, char_code):
  188. # Unicode "General category property code" (not used by Python).
  189. # A general category is two letters. 'pat[index+1]' contains both
  190. # the first character, and the second character shifted by 8.
  191. # http://en.wikipedia.org/wiki/Unicode_character_property#General_Category
  192. # Also supports single-character categories, if the second character is 0.
  193. # Negative matches are triggered by bit number 7.
  194. assert unicodedb is not None
  195. cat = unicodedb.category(char_code)
  196. category_code = pat[index + 1]
  197. first_character = category_code & 0x7F
  198. second_character = (category_code >> 8) & 0x7F
  199. negative_match = category_code & 0x80
  200. #
  201. if second_character == 0:
  202. # single-character match
  203. check = ord(cat[0])
  204. expected = first_character
  205. else:
  206. # two-characters match
  207. check = ord(cat[0]) | (ord(cat[1]) << 8)
  208. expected = first_character | (second_character << 8)
  209. #
  210. if negative_match:
  211. result = check != expected
  212. else:
  213. result = check == expected
  214. #
  215. return result, index + 2
  216. set_dispatch_table = {
  217. 9: set_category,
  218. 10: set_charset,
  219. 11: set_bigcharset,
  220. 19: set_literal,
  221. 27: set_range,
  222. 70: set_unicode_general_category,
  223. }
  224. set_dispatch_unroll = unrolling_iterable(sorted(set_dispatch_table.items()))