PageRenderTime 91ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 1ms

/pypy/interpreter/pyparser/parsestring.py

https://bitbucket.org/pypy/pypy/
Python | 234 lines | 222 code | 2 blank | 10 comment | 0 complexity | 892cc5198d3b45ba20ee7aecc07968e2 MD5 | raw file
Possible License(s): AGPL-3.0, BSD-3-Clause, Apache-2.0
  1. from pypy.interpreter.error import OperationError, oefmt
  2. from pypy.interpreter import unicodehelper
  3. from rpython.rlib.rstring import StringBuilder
  def parsestr(space, encoding, s, unicode_literal=False):
      """Parse a string or unicode literal and return a wrapped value.

      `s` is the whole literal: an optional b/B or u/U prefix, an optional
      r/R prefix, then the text enclosed in single or triple quotes.  A b/u
      prefix overrides the `unicode_literal` argument.

      If encoding="iso-8859-1", the source string is also in this encoding.
      If encoding=None, the source string is ascii only.
      In other cases, the source string is in utf-8 encoding.

      When a bytes string is returned, it will be encoded with the
      original encoding.

      Malformed quoting is reported through raise_app_valueerror().

      Yes, it's very inefficient.
      Yes, CPython has very similar code.
      """
      # we use ps as "pointer to s"
      # q is the virtual last char index of the string
      ps = 0
      quote = s[ps]
      rawmode = False

      # string decoration handling
      if quote == 'b' or quote == 'B':
          ps += 1
          quote = s[ps]
          unicode_literal = False
      elif quote == 'u' or quote == 'U':
          ps += 1
          quote = s[ps]
          unicode_literal = True
      if quote == 'r' or quote == 'R':
          ps += 1
          quote = s[ps]
          rawmode = True
      if quote != "'" and quote != '"':
          raise_app_valueerror(space,
                               'Internal error: parser passed unquoted literal')
      ps += 1
      q = len(s) - 1
      # q < ps means the opening quote is also the last character of s, so
      # there is no closing quote; report it here instead of tripping the
      # slice-bound asserts further down.
      if q < ps or s[q] != quote:
          raise_app_valueerror(space, 'Internal error: parser passed unmatched '
                                      'quotes in literal')
      if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
          # triple quotes
          ps += 2
          if s[q - 1] != quote or s[q - 2] != quote:
              raise_app_valueerror(space, 'Internal error: parser passed '
                                          'unmatched triple quotes in literal')
          q -= 2

      if unicode_literal:  # XXX Py_UnicodeFlag is ignored for now
          if encoding is None or encoding == "iso-8859-1":
              # 'unicode_escape' expects latin-1 bytes, string is ready.
              assert 0 <= ps <= q
              substr = s[ps:q]
          else:
              substr = decode_unicode_utf8(space, s, ps, q)
          if rawmode:
              v = unicodehelper.decode_raw_unicode_escape(space, substr)
          else:
              v = unicodehelper.decode_unicode_escape(space, substr)
          return space.wrap(v)

      need_encoding = (encoding is not None and
                       encoding != "utf-8" and encoding != "utf8" and
                       encoding != "iso-8859-1")
      assert 0 <= ps <= q
      substr = s[ps:q]
      # Everything from index q on is closing-quote characters (checked
      # above), so looking for a backslash in substr is enough.
      if rawmode or '\\' not in substr:
          if need_encoding:
              w_u = space.wrap(unicodehelper.decode_utf8(space, substr))
              w_v = unicodehelper.encode(space, w_u, encoding)
              return w_v
          else:
              return space.wrap(substr)

      # Escapes are present: decode them, recoding non-ASCII bytes only when
      # the source encoding requires it.
      enc = None
      if need_encoding:
          enc = encoding
      v = PyString_DecodeEscape(space, substr, 'strict', enc)
      return space.newbytes(v)
  def decode_unicode_utf8(space, s, ps, q):
      """Rewrite the UTF-8 text s[ps:q] into input for 'unicode_escape'.

      ****The Python 2.7 version, producing UTF-32 escapes****

      String is utf8-encoded, but 'unicode_escape' expects latin-1, so
      multibyte sequences must be escaped: each run of bytes with the high
      bit set is decoded with decode_utf8() and every resulting character
      is written as a \\U escape with eight hex digits.  All other bytes
      are copied unchanged.

      Returns the assembled string.
      """
      lis = []  # using a list to assemble the value
      end = q
      # Worst case:
      # "<92><195><164>" may become "\u005c\U000000E4" (16 bytes)
      while ps < end:
          if s[ps] == '\\':
              lis.append(s[ps])
              ps += 1
              if ps >= end:
                  # The range ends right after the backslash.  Stop here
                  # rather than reading (and copying) s[end], which lies
                  # outside the requested range.
                  break
              if ord(s[ps]) & 0x80:
                  # A multibyte sequence will follow, it will be
                  # escaped like \u1234. To avoid confusion with
                  # the backslash we just wrote, we emit "\u005c"
                  # instead.
                  lis.append("u005c")
          if ord(s[ps]) & 0x80:  # XXX inefficient
              w, ps = decode_utf8(space, s, ps, end)
              for c in w:
                  # The equivalent of %08x, which is not supported by RPython.
                  # 7 zeroes are enough for the unicode range, and the
                  # result still fits in 32-bit.
                  hexa = hex(ord(c) + 0x10000000)
                  lis.append('\\U0')
                  lis.append(hexa[3:])  # Skip 0x and the leading 1
          else:
              lis.append(s[ps])
              ps += 1
      return ''.join(lis)
  def PyString_DecodeEscape(space, s, errors, recode_encoding):
      """
      Return `s` with its backslash escapes replaced by the bytes they denote.

      When `recode_encoding` is set, `s` is taken to be UTF-8 encoded: bytes
      with the high bit set that occur outside an escape are handed to
      decode_utf8_recode() to be re-encoded in that encoding.

      `errors` selects the reaction to a malformed \\x escape: 'strict'
      reports 'invalid \\x escape' through raise_app_valueerror(), 'replace'
      emits '?', 'ignore' emits nothing; any other value raises the
      ValueError built with oefmt().
      """
      out = StringBuilder(len(s))
      pos = 0
      limit = len(s)
      while pos < limit:
          cur = s[pos]
          if cur != '\\':
              # note that the C code has a label here.
              # the logic is the same.
              if recode_encoding and ord(cur) & 0x80:
                  chunk, pos = decode_utf8_recode(space, s, pos, limit,
                                                  recode_encoding)
                  # Append bytes to output buffer.
                  out.append(chunk)
              else:
                  out.append(cur)
                  pos += 1
              continue

          # Step over the backslash and look at the escape character.
          pos += 1
          if pos == limit:
              raise_app_valueerror(space, 'Trailing \\ in string')
          esc_start = pos
          ch = s[pos]
          pos += 1
          # XXX This assumes ASCII!
          if ch == '\n':
              # backslash followed by a newline produces no output
              pass
          elif ch == '\\':
              out.append('\\')
          elif ch == "'":
              out.append("'")
          elif ch == '"':
              out.append('"')
          elif ch == 'b':
              out.append("\010")
          elif ch == 'f':
              out.append('\014')  # FF
          elif ch == 't':
              out.append('\t')
          elif ch == 'n':
              out.append('\n')
          elif ch == 'r':
              out.append('\r')
          elif ch == 'v':
              out.append('\013')  # VT
          elif ch == 'a':
              out.append('\007')  # BEL, not classic C
          elif ch in '01234567':
              # Look for up to two more octal digits
              stop = pos
              extra = 0
              while extra < 2 and stop < limit and s[stop] in '01234567':
                  stop += 1
                  extra += 1
              # emulate a strange wrap-around behavior of CPython:
              # \400 is the same as \000 because 0400 == 256
              value = int(s[esc_start:stop], 8) & 0xFF
              out.append(chr(value))
              pos = stop
          elif ch == 'x':
              two_digits = (pos + 2 <= limit and isxdigit(s[pos]) and
                            isxdigit(s[pos + 1]))
              if two_digits:
                  out.append(chr(int(s[pos:pos + 2], 16)))
                  pos += 2
                  continue
              if errors == 'strict':
                  raise_app_valueerror(space, 'invalid \\x escape')
              elif errors == 'replace':
                  out.append('?')
              elif errors == 'ignore':
                  pass
              else:
                  raise oefmt(space.w_ValueError, "decoding error; "
                              "unknown error handling code: %s", errors)
              # Skip a single hex digit that followed the incomplete escape.
              if pos < limit and isxdigit(s[pos]):
                  pos += 1
          else:
              # this was not an escape, so the backslash
              # has to be added, and we start over in
              # non-escape mode.
              # an arbitrary number of unescaped UTF-8 bytes may follow.
              out.append('\\')
              pos -= 1
              assert pos >= 0

      return out.build()
  def isxdigit(ch):
      """Tell whether `ch` falls in one of the ranges 0-9, a-f or A-F."""
      if '0' <= ch <= '9':
          return True
      if 'a' <= ch <= 'f':
          return True
      return 'A' <= ch <= 'F'
  def decode_utf8(space, s, ps, end):
      """Decode the run of high-bit bytes of `s` that starts at index `ps`.

      The run extends until the first byte without the 0x80 bit, or until
      `end`.  Returns a pair: the result of unicodehelper.decode_utf8() on
      that run, and the index just past it.
      """
      assert ps >= 0
      start = ps
      stop = ps
      # while (s < end && *s != '\\') s++; */ /* inefficient for u".."
      while stop < end and ord(s[stop]) & 0x80:
          stop += 1
      decoded = unicodehelper.decode_utf8(space, s[start:stop])
      return decoded, stop
  def decode_utf8_recode(space, s, ps, end, recode_encoding):
      """Decode a run of `s` with decode_utf8() and re-encode it.

      The decoded text is wrapped, encoded to `recode_encoding` with
      unicodehelper.encode() and unwrapped with space.str_w().  Returns a
      pair: that string, and the index where decode_utf8() stopped.
      """
      decoded, next_ps = decode_utf8(space, s, ps, end)
      w_encoded = unicodehelper.encode(space, space.wrap(decoded),
                                       recode_encoding)
      return space.str_w(w_encoded), next_ps
  def raise_app_valueerror(space, msg):
      """Raise an OperationError of type space.w_ValueError wrapping `msg`."""
      w_exc_type = space.w_ValueError
      w_message = space.wrap(msg)
      raise OperationError(w_exc_type, w_message)