PageRenderTime 56ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/rpython/rlib/runicode.py

https://bitbucket.org/pypy/pypy/
Python | 1780 lines | 1653 code | 46 blank | 81 comment | 150 complexity | 9fbc640ccab6a9fd9fe52a1cd1afb765 MD5 | raw file
Possible License(s): AGPL-3.0, BSD-3-Clause, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. import sys
  2. from rpython.rlib.objectmodel import specialize, we_are_translated
  3. from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
  4. from rpython.rlib.rarithmetic import r_uint, intmask
  5. from rpython.rlib.unicodedata import unicodedb
  6. from rpython.rtyper.lltypesystem import lltype, rffi
  7. from rpython.rlib import jit
  8. if rffi.sizeof(lltype.UniChar) == 4:
  9. MAXUNICODE = 0x10ffff
  10. allow_surrogate_by_default = False
  11. else:
  12. MAXUNICODE = 0xffff
  13. allow_surrogate_by_default = True
  14. BYTEORDER = sys.byteorder
  15. # python 2.7 has a preview of py3k behavior, so those functions
  16. # are used either when we're testing wide pypy on narrow cpython
  17. # or in unicodedata in pypy
  18. def unichr_returns_surrogate(c):
  19. if c <= 0xffff or c > 0x10ffff:
  20. return unichr(c)
  21. else:
  22. c -= 0x10000
  23. return (unichr(0xD800 + (c >> 10)) +
  24. unichr(0xDC00 + (c & 0x03FF)))
  25. def ord_accepts_surrogate(u):
  26. if isinstance(u, unicode) and len(u) == 2:
  27. ch1 = ord(u[0])
  28. ch2 = ord(u[1])
  29. if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
  30. return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
  31. if not we_are_translated():
  32. return ord(u)
  33. else:
  34. if len(u) == 1:
  35. return ord(u[0])
  36. raise TypeError
# Pick the implementations of UNICHR/ORD: surrogate-aware wrappers when the
# target range (MAXUNICODE) is larger than what the host Python supports
# (sys.maxunicode), plain builtins otherwise.
if MAXUNICODE > sys.maxunicode:
    # A version of unichr which allows codes outside the BMP
    # even on narrow unicode builds.
    # It will be used when interpreting code on top of a UCS2 CPython,
    # when sizeof(wchar_t) == 4.
    # Note that Python3 uses a similar implementation.
    def UNICHR(c):
        # Only meant to run untranslated; see the attribute set below.
        assert not we_are_translated()
        return unichr_returns_surrogate(c)
    UNICHR._flowspace_rewrite_directly_as_ = unichr
    # ^^^ NB.: for translation, it's essential to use this hack instead
    # of calling unichr() from UNICHR(), because unichr() detects if there
    # is a "try:except ValueError" immediately around it.

    def ORD(u):
        # Only meant to run untranslated; see the attribute set below.
        assert not we_are_translated()
        return ord_accepts_surrogate(u)
    ORD._flowspace_rewrite_directly_as_ = ord

else:
    UNICHR = unichr
    ORD = ord
  57. if MAXUNICODE > 0xFFFF:
  58. def code_to_unichr(code):
  59. if is_narrow_host():
  60. # Host CPython is narrow build, generate surrogates
  61. return unichr_returns_surrogate(code)
  62. else:
  63. return unichr(code)
  64. else:
  65. def code_to_unichr(code):
  66. # generate surrogates for large codes
  67. return unichr_returns_surrogate(code)
  68. def _STORECHAR(result, CH, byteorder):
  69. hi = chr(((CH) >> 8) & 0xff)
  70. lo = chr((CH) & 0xff)
  71. if byteorder == 'little':
  72. result.append(lo)
  73. result.append(hi)
  74. else:
  75. result.append(hi)
  76. result.append(lo)
  77. def is_narrow_host():
  78. return not we_are_translated() and sys.maxunicode == 0xFFFF
  79. def default_unicode_error_decode(errors, encoding, msg, s,
  80. startingpos, endingpos):
  81. if errors == 'replace':
  82. return u'\ufffd', endingpos
  83. if errors == 'ignore':
  84. return u'', endingpos
  85. raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
  86. def default_unicode_error_encode(errors, encoding, msg, u,
  87. startingpos, endingpos):
  88. if errors == 'replace':
  89. return u'?', None, endingpos
  90. if errors == 'ignore':
  91. return u'', None, endingpos
  92. raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
  93. # ____________________________________________________________
  94. # utf-8
  95. utf8_code_length = [
  96. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 00-0F
  97. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  98. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  99. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  100. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  101. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  102. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  103. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 70-7F
  104. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
  105. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  106. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  107. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # B0-BF
  108. 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-C1 + C2-CF
  109. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
  110. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
  111. 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 # F0-F4 - F5-FF
  112. ]
  113. def str_decode_utf_8(s, size, errors, final=False,
  114. errorhandler=None, allow_surrogates=allow_surrogate_by_default):
  115. if errorhandler is None:
  116. errorhandler = default_unicode_error_decode
  117. result = UnicodeBuilder(size)
  118. pos = str_decode_utf_8_impl(s, size, errors, final, errorhandler,
  119. allow_surrogates=allow_surrogates,
  120. result=result)
  121. return result.build(), pos
  122. @specialize.argtype(6)
  123. def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
  124. allow_surrogates, result):
  125. if size == 0:
  126. return 0
  127. pos = 0
  128. while pos < size:
  129. ordch1 = ord(s[pos])
  130. # fast path for ASCII
  131. # XXX maybe use a while loop here
  132. if ordch1 < 0x80:
  133. result.append(unichr(ordch1))
  134. pos += 1
  135. continue
  136. n = utf8_code_length[ordch1]
  137. if pos + n > size:
  138. if not final:
  139. break
  140. charsleft = size - pos - 1 # either 0, 1, 2
  141. # note: when we get the 'unexpected end of data' we don't care
  142. # about the pos anymore and we just ignore the value
  143. if not charsleft:
  144. # there's only the start byte and nothing else
  145. r, pos = errorhandler(errors, 'utf8',
  146. 'unexpected end of data',
  147. s, pos, pos+1)
  148. result.append(r)
  149. break
  150. ordch2 = ord(s[pos+1])
  151. if n == 3:
  152. # 3-bytes seq with only a continuation byte
  153. if (ordch2>>6 != 0x2 or # 0b10
  154. (ordch1 == 0xe0 and ordch2 < 0xa0)):
  155. # or (ordch1 == 0xed and ordch2 > 0x9f)
  156. # second byte invalid, take the first and continue
  157. r, pos = errorhandler(errors, 'utf8',
  158. 'invalid continuation byte',
  159. s, pos, pos+1)
  160. result.append(r)
  161. continue
  162. else:
  163. # second byte valid, but third byte missing
  164. r, pos = errorhandler(errors, 'utf8',
  165. 'unexpected end of data',
  166. s, pos, pos+2)
  167. result.append(r)
  168. break
  169. elif n == 4:
  170. # 4-bytes seq with 1 or 2 continuation bytes
  171. if (ordch2>>6 != 0x2 or # 0b10
  172. (ordch1 == 0xf0 and ordch2 < 0x90) or
  173. (ordch1 == 0xf4 and ordch2 > 0x8f)):
  174. # second byte invalid, take the first and continue
  175. r, pos = errorhandler(errors, 'utf8',
  176. 'invalid continuation byte',
  177. s, pos, pos+1)
  178. result.append(r)
  179. continue
  180. elif charsleft == 2 and ord(s[pos+2])>>6 != 0x2: # 0b10
  181. # third byte invalid, take the first two and continue
  182. r, pos = errorhandler(errors, 'utf8',
  183. 'invalid continuation byte',
  184. s, pos, pos+2)
  185. result.append(r)
  186. continue
  187. else:
  188. # there's only 1 or 2 valid cb, but the others are missing
  189. r, pos = errorhandler(errors, 'utf8',
  190. 'unexpected end of data',
  191. s, pos, pos+charsleft+1)
  192. result.append(r)
  193. break
  194. if n == 0:
  195. r, pos = errorhandler(errors, 'utf8',
  196. 'invalid start byte',
  197. s, pos, pos+1)
  198. result.append(r)
  199. elif n == 1:
  200. assert 0, "ascii should have gone through the fast path"
  201. elif n == 2:
  202. ordch2 = ord(s[pos+1])
  203. if ordch2>>6 != 0x2: # 0b10
  204. r, pos = errorhandler(errors, 'utf8',
  205. 'invalid continuation byte',
  206. s, pos, pos+1)
  207. result.append(r)
  208. continue
  209. # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
  210. result.append(unichr(((ordch1 & 0x1F) << 6) + # 0b00011111
  211. (ordch2 & 0x3F))) # 0b00111111
  212. pos += 2
  213. elif n == 3:
  214. ordch2 = ord(s[pos+1])
  215. ordch3 = ord(s[pos+2])
  216. if (ordch2>>6 != 0x2 or # 0b10
  217. (ordch1 == 0xe0 and ordch2 < 0xa0)
  218. # surrogates shouldn't be valid UTF-8!
  219. or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
  220. ):
  221. r, pos = errorhandler(errors, 'utf8',
  222. 'invalid continuation byte',
  223. s, pos, pos+1)
  224. result.append(r)
  225. continue
  226. elif ordch3>>6 != 0x2: # 0b10
  227. r, pos = errorhandler(errors, 'utf8',
  228. 'invalid continuation byte',
  229. s, pos, pos+2)
  230. result.append(r)
  231. continue
  232. # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
  233. result.append(unichr(((ordch1 & 0x0F) << 12) + # 0b00001111
  234. ((ordch2 & 0x3F) << 6) + # 0b00111111
  235. (ordch3 & 0x3F))) # 0b00111111
  236. pos += 3
  237. elif n == 4:
  238. ordch2 = ord(s[pos+1])
  239. ordch3 = ord(s[pos+2])
  240. ordch4 = ord(s[pos+3])
  241. if (ordch2>>6 != 0x2 or # 0b10
  242. (ordch1 == 0xf0 and ordch2 < 0x90) or
  243. (ordch1 == 0xf4 and ordch2 > 0x8f)):
  244. r, pos = errorhandler(errors, 'utf8',
  245. 'invalid continuation byte',
  246. s, pos, pos+1)
  247. result.append(r)
  248. continue
  249. elif ordch3>>6 != 0x2: # 0b10
  250. r, pos = errorhandler(errors, 'utf8',
  251. 'invalid continuation byte',
  252. s, pos, pos+2)
  253. result.append(r)
  254. continue
  255. elif ordch4>>6 != 0x2: # 0b10
  256. r, pos = errorhandler(errors, 'utf8',
  257. 'invalid continuation byte',
  258. s, pos, pos+3)
  259. result.append(r)
  260. continue
  261. # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
  262. c = (((ordch1 & 0x07) << 18) + # 0b00000111
  263. ((ordch2 & 0x3F) << 12) + # 0b00111111
  264. ((ordch3 & 0x3F) << 6) + # 0b00111111
  265. (ordch4 & 0x3F)) # 0b00111111
  266. if c <= MAXUNICODE:
  267. result.append(UNICHR(c))
  268. else:
  269. # compute and append the two surrogates:
  270. # translate from 10000..10FFFF to 0..FFFF
  271. c -= 0x10000
  272. # high surrogate = top 10 bits added to D800
  273. result.append(unichr(0xD800 + (c >> 10)))
  274. # low surrogate = bottom 10 bits added to DC00
  275. result.append(unichr(0xDC00 + (c & 0x03FF)))
  276. pos += 4
  277. return pos
  278. def _encodeUCS4(result, ch):
  279. # Encode UCS4 Unicode ordinals
  280. result.append((chr((0xf0 | (ch >> 18)))))
  281. result.append((chr((0x80 | ((ch >> 12) & 0x3f)))))
  282. result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
  283. result.append((chr((0x80 | (ch & 0x3f)))))
  284. def unicode_encode_utf_8(s, size, errors, errorhandler=None,
  285. allow_surrogates=allow_surrogate_by_default):
  286. if errorhandler is None:
  287. errorhandler = default_unicode_error_encode
  288. return unicode_encode_utf_8_impl(s, size, errors, errorhandler,
  289. allow_surrogates=allow_surrogates)
def unicode_encode_utf_8_impl(s, size, errors, errorhandler,
                              allow_surrogates=False):
    """Encode the first ``size`` characters of ``s`` as UTF-8.

    ``errorhandler`` is called as
    ``errorhandler(errors, 'utf8', msg, s, start, end)`` and must return
    ``(ru, rs, new_pos)``; it is only consulted for surrogate code units
    when ``allow_surrogates`` is false.  Returns the encoded byte string.
    """
    assert(size >= 0)
    result = StringBuilder(size)
    pos = 0
    while pos < size:
        ch = ord(s[pos])
        pos += 1
        if ch < 0x80:
            # Encode ASCII
            result.append(chr(ch))
        elif ch < 0x0800:
            # Encode Latin-1
            # (two-byte form, used for every ordinal below 0x800)
            result.append(chr((0xc0 | (ch >> 6))))
            result.append(chr((0x80 | (ch & 0x3f))))
        else:
            # Encode UCS2 Unicode ordinals
            if ch < 0x10000:
                # Special case: check for high surrogate
                if 0xD800 <= ch <= 0xDFFF:
                    if pos != size:
                        ch2 = ord(s[pos])
                        # Check for low surrogate and combine the two to
                        # form a UCS4 value
                        if ((allow_surrogates or MAXUNICODE < 65536
                             or is_narrow_host()) and
                            ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF):
                            ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
                            assert ch3 >= 0
                            pos += 1
                            _encodeUCS4(result, ch3)
                            continue
                    if not allow_surrogates:
                        # Unpaired (or not combinable) surrogate: let the
                        # error handler decide what to emit.
                        ru, rs, pos = errorhandler(errors, 'utf8',
                                                   'surrogates not allowed',
                                                   s, pos-1, pos)
                        if rs is not None:
                            # py3k only
                            result.append(rs)
                            continue
                        # The unicode replacement is only accepted when it
                        # is pure ASCII; otherwise the handler is re-invoked
                        # in 'strict' mode.
                        # NOTE(review): pos was just reassigned by the
                        # handler, so pos-1/pos below may not point at the
                        # offending character any more -- confirm intent.
                        for ch in ru:
                            if ord(ch) < 0x80:
                                result.append(chr(ord(ch)))
                            else:
                                errorhandler('strict', 'utf8',
                                             'surrogates not allowed',
                                             s, pos-1, pos)
                        continue
                    # else: Fall through and handles isolated high surrogates
                # Three-byte form.
                result.append((chr((0xe0 | (ch >> 12)))))
                result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
                result.append((chr((0x80 | (ch & 0x3f)))))
            else:
                _encodeUCS4(result, ch)
    return result.build()
  345. # ____________________________________________________________
  346. # utf-16
  347. def str_decode_utf_16(s, size, errors, final=True,
  348. errorhandler=None):
  349. result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
  350. errorhandler, "native")
  351. return result, length
  352. def str_decode_utf_16_be(s, size, errors, final=True,
  353. errorhandler=None):
  354. result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
  355. errorhandler, "big")
  356. return result, length
  357. def str_decode_utf_16_le(s, size, errors, final=True,
  358. errorhandler=None):
  359. result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
  360. errorhandler, "little")
  361. return result, length
def str_decode_utf_16_helper(s, size, errors, final=True,
                             errorhandler=None,
                             byteorder="native"):
    """Decode UTF-16 data from ``s``.

    ``byteorder`` is "native", "little", or anything else (treated as big
    endian).  Returns ``(unicode_string, pos, bo)`` where ``bo`` is -1 for
    little endian, 1 for big endian and 0 when native mode found no BOM.
    ``errorhandler`` is called as
    ``errorhandler(errors, 'utf16', msg, s, start, end)`` and must return
    ``(replacement, new_pos)``; the default decode handler is used when it
    is None.  With ``final`` false, incomplete trailing data stops decoding
    without an error.
    """
    if errorhandler is None:
        errorhandler = default_unicode_error_decode
    bo = 0

    # ihi/ilo: offsets of the high and low byte inside each 2-byte unit.
    if BYTEORDER == 'little':
        ihi = 1
        ilo = 0
    else:
        ihi = 0
        ilo = 1

    #  Check for BOM marks (U+FEFF) in the input and adjust current
    #  byte order setting accordingly. In native mode, the leading BOM
    #  mark is skipped, in all other modes, it is copied to the output
    #  stream as-is (giving a ZWNBSP character).
    pos = 0
    if byteorder == 'native':
        if size >= 2:
            # First unit, read in the host byte order.
            bom = (ord(s[ihi]) << 8) | ord(s[ilo])
            if BYTEORDER == 'little':
                if bom == 0xFEFF:
                    pos += 2
                    bo = -1
                elif bom == 0xFFFE:
                    pos += 2
                    bo = 1
            else:
                if bom == 0xFEFF:
                    pos += 2
                    bo = 1
                elif bom == 0xFFFE:
                    pos += 2
                    bo = -1
    elif byteorder == 'little':
        bo = -1
    else:
        bo = 1
    if size == 0:
        return u'', 0, bo
    if bo == -1:
        # force little endian
        ihi = 1
        ilo = 0

    elif bo == 1:
        # force big endian
        ihi = 0
        ilo = 1

    result = UnicodeBuilder(size // 2)

    #XXX I think the errors are not correctly handled here
    # NOTE(review): the loop bound uses ``size`` while the remaining-data
    # checks use ``len(s)``; presumably callers pass size == len(s).
    while pos < size:
        # remaining bytes at the end? (size should be even)
        if len(s) - pos < 2:
            if not final:
                break
            r, pos = errorhandler(errors, 'utf16', "truncated data",
                                  s, pos, len(s))
            result.append(r)
            if len(s) - pos < 2:
                break
        ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
        pos += 2
        if ch < 0xD800 or ch > 0xDFFF:
            # Not a surrogate: emit directly.
            result.append(unichr(ch))
            continue
        # UTF-16 code pair:
        if len(s) - pos < 2:
            # Not enough data for a second unit; step back onto the
            # surrogate so it is part of the reported/retried range.
            pos -= 2
            if not final:
                break
            errmsg = "unexpected end of data"
            r, pos = errorhandler(errors, 'utf16', errmsg, s, pos, len(s))
            result.append(r)
            if len(s) - pos < 2:
                break
        elif 0xD800 <= ch <= 0xDBFF:
            ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
            pos += 2
            if 0xDC00 <= ch2 <= 0xDFFF:
                if MAXUNICODE < 65536:
                    # Keep the pair as two separate code units.
                    result.append(unichr(ch))
                    result.append(unichr(ch2))
                else:
                    result.append(UNICHR((((ch & 0x3FF)<<10) |
                                           (ch2 & 0x3FF)) + 0x10000))
                continue
            else:
                # High surrogate not followed by a low surrogate; the
                # reported range covers the high surrogate only.
                r, pos = errorhandler(errors, 'utf16',
                                      "illegal UTF-16 surrogate",
                                      s, pos - 4, pos - 2)
                result.append(r)
        else:
            # Low surrogate without a preceding high surrogate.
            r, pos = errorhandler(errors, 'utf16',
                                  "illegal encoding",
                                  s, pos - 2, pos)
            result.append(r)
    return result.build(), pos, bo
  459. def unicode_encode_utf_16_helper(s, size, errors,
  460. errorhandler=None,
  461. byteorder='little'):
  462. if size == 0:
  463. if byteorder == 'native':
  464. result = StringBuilder(2)
  465. _STORECHAR(result, 0xFEFF, BYTEORDER)
  466. return result.build()
  467. return ""
  468. result = StringBuilder(size * 2 + 2)
  469. if byteorder == 'native':
  470. _STORECHAR(result, 0xFEFF, BYTEORDER)
  471. byteorder = BYTEORDER
  472. i = 0
  473. while i < size:
  474. ch = ord(s[i])
  475. i += 1
  476. ch2 = 0
  477. if ch >= 0x10000:
  478. ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
  479. ch = 0xD800 | ((ch-0x10000) >> 10)
  480. _STORECHAR(result, ch, byteorder)
  481. if ch2:
  482. _STORECHAR(result, ch2, byteorder)
  483. return result.build()
  484. def unicode_encode_utf_16(s, size, errors,
  485. errorhandler=None):
  486. return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "native")
  487. def unicode_encode_utf_16_be(s, size, errors,
  488. errorhandler=None):
  489. return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "big")
  490. def unicode_encode_utf_16_le(s, size, errors,
  491. errorhandler=None):
  492. return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "little")
  493. # ____________________________________________________________
  494. # utf-32
  495. def str_decode_utf_32(s, size, errors, final=True,
  496. errorhandler=None):
  497. result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
  498. errorhandler, "native")
  499. return result, length
  500. def str_decode_utf_32_be(s, size, errors, final=True,
  501. errorhandler=None):
  502. result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
  503. errorhandler, "big")
  504. return result, length
  505. def str_decode_utf_32_le(s, size, errors, final=True,
  506. errorhandler=None):
  507. result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
  508. errorhandler, "little")
  509. return result, length
  510. BOM32_DIRECT = intmask(0x0000FEFF)
  511. BOM32_REVERSE = intmask(0xFFFE0000)
  512. def str_decode_utf_32_helper(s, size, errors, final=True,
  513. errorhandler=None,
  514. byteorder="native"):
  515. if errorhandler is None:
  516. errorhandler = default_unicode_error_decode
  517. bo = 0
  518. if BYTEORDER == 'little':
  519. iorder = [0, 1, 2, 3]
  520. else:
  521. iorder = [3, 2, 1, 0]
  522. # Check for BOM marks (U+FEFF) in the input and adjust current
  523. # byte order setting accordingly. In native mode, the leading BOM
  524. # mark is skipped, in all other modes, it is copied to the output
  525. # stream as-is (giving a ZWNBSP character).
  526. pos = 0
  527. if byteorder == 'native':
  528. if size >= 4:
  529. bom = intmask(
  530. (ord(s[iorder[3]]) << 24) | (ord(s[iorder[2]]) << 16) |
  531. (ord(s[iorder[1]]) << 8) | ord(s[iorder[0]]))
  532. if BYTEORDER == 'little':
  533. if bom == BOM32_DIRECT:
  534. pos += 4
  535. bo = -1
  536. elif bom == BOM32_REVERSE:
  537. pos += 4
  538. bo = 1
  539. else:
  540. if bom == BOM32_DIRECT:
  541. pos += 4
  542. bo = 1
  543. elif bom == BOM32_REVERSE:
  544. pos += 4
  545. bo = -1
  546. elif byteorder == 'little':
  547. bo = -1
  548. else:
  549. bo = 1
  550. if size == 0:
  551. return u'', 0, bo
  552. if bo == -1:
  553. # force little endian
  554. iorder = [0, 1, 2, 3]
  555. elif bo == 1:
  556. # force big endian
  557. iorder = [3, 2, 1, 0]
  558. result = UnicodeBuilder(size // 4)
  559. while pos < size:
  560. # remaining bytes at the end? (size should be divisible by 4)
  561. if len(s) - pos < 4:
  562. if not final:
  563. break
  564. r, pos = errorhandler(errors, 'utf32', "truncated data",
  565. s, pos, len(s))
  566. result.append(r)
  567. if len(s) - pos < 4:
  568. break
  569. continue
  570. ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
  571. (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
  572. if ch >= 0x110000:
  573. r, pos = errorhandler(errors, 'utf32', "codepoint not in range(0x110000)",
  574. s, pos, len(s))
  575. result.append(r)
  576. continue
  577. if MAXUNICODE < 65536 and ch >= 0x10000:
  578. ch -= 0x10000L
  579. result.append(unichr(0xD800 + (ch >> 10)))
  580. result.append(unichr(0xDC00 + (ch & 0x03FF)))
  581. else:
  582. result.append(UNICHR(ch))
  583. pos += 4
  584. return result.build(), pos, bo
  585. def _STORECHAR32(result, CH, byteorder):
  586. c0 = chr(((CH) >> 24) & 0xff)
  587. c1 = chr(((CH) >> 16) & 0xff)
  588. c2 = chr(((CH) >> 8) & 0xff)
  589. c3 = chr((CH) & 0xff)
  590. if byteorder == 'little':
  591. result.append(c3)
  592. result.append(c2)
  593. result.append(c1)
  594. result.append(c0)
  595. else:
  596. result.append(c0)
  597. result.append(c1)
  598. result.append(c2)
  599. result.append(c3)
  600. def unicode_encode_utf_32_helper(s, size, errors,
  601. errorhandler=None,
  602. byteorder='little'):
  603. if size == 0:
  604. if byteorder == 'native':
  605. result = StringBuilder(4)
  606. _STORECHAR32(result, 0xFEFF, BYTEORDER)
  607. return result.build()
  608. return ""
  609. result = StringBuilder(size * 4 + 4)
  610. if byteorder == 'native':
  611. _STORECHAR32(result, 0xFEFF, BYTEORDER)
  612. byteorder = BYTEORDER
  613. i = 0
  614. while i < size:
  615. ch = ord(s[i])
  616. i += 1
  617. ch2 = 0
  618. if MAXUNICODE < 65536 and 0xD800 <= ch <= 0xDBFF and i < size:
  619. ch2 = ord(s[i])
  620. if 0xDC00 <= ch2 <= 0xDFFF:
  621. ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
  622. i += 1
  623. _STORECHAR32(result, ch, byteorder)
  624. return result.build()
  625. def unicode_encode_utf_32(s, size, errors,
  626. errorhandler=None):
  627. return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "native")
  628. def unicode_encode_utf_32_be(s, size, errors,
  629. errorhandler=None):
  630. return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "big")
  631. def unicode_encode_utf_32_le(s, size, errors,
  632. errorhandler=None):
  633. return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "little")
  634. # ____________________________________________________________
  635. # utf-7
  636. # Three simple macros defining base-64
  637. def _utf7_IS_BASE64(oc):
  638. "Is c a base-64 character?"
  639. c = chr(oc)
  640. return c.isalnum() or c == '+' or c == '/'
  641. def _utf7_TO_BASE64(n):
  642. "Returns the base-64 character of the bottom 6 bits of n"
  643. return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f]
  644. def _utf7_FROM_BASE64(c):
  645. "given that c is a base-64 character, what is its base-64 value?"
  646. if c >= 'a':
  647. return ord(c) - 71
  648. elif c >= 'A':
  649. return ord(c) - 65
  650. elif c >= '0':
  651. return ord(c) + 4
  652. elif c == '+':
  653. return 62
  654. else: # c == '/'
  655. return 63
  656. def _utf7_DECODE_DIRECT(oc):
  657. return oc <= 127 and oc != ord('+')
  658. # The UTF-7 encoder treats ASCII characters differently according to
  659. # whether they are Set D, Set O, Whitespace, or special (i.e. none of
  660. # the above). See RFC2152. This array identifies these different
  661. # sets:
  662. # 0 : "Set D"
  663. # alphanumeric and '(),-./:?
  664. # 1 : "Set O"
  665. # !"#$%&*;<=>@[]^_`{|}
  666. # 2 : "whitespace"
  667. # ht nl cr sp
  668. # 3 : special (must be base64 encoded)
  669. # everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
  670. utf7_category = [
  671. # nul soh stx etx eot enq ack bel bs ht nl vt np cr so si
  672. 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
  673. # dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us
  674. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  675. # sp ! " # $ % & ' ( ) * + , - . /
  676. 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
  677. # 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
  678. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
  679. # @ A B C D E F G H I J K L M N O
  680. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  681. # P Q R S T U V W X Y Z [ \ ] ^ _
  682. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
  683. # ` a b c d e f g h i j k l m n o
  684. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  685. # p q r s t u v w x y z { | } ~ del
  686. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
  687. ]
  688. # ENCODE_DIRECT: this character should be encoded as itself. The
  689. # answer depends on whether we are encoding set O as itself, and also
  690. # on whether we are encoding whitespace as itself. RFC2152 makes it
  691. # clear that the answers to these questions vary between
  692. # applications, so this code needs to be flexible.
  693. def _utf7_ENCODE_DIRECT(oc, directO, directWS):
  694. return(oc < 128 and oc > 0 and
  695. (utf7_category[oc] == 0 or
  696. (directWS and utf7_category[oc] == 2) or
  697. (directO and utf7_category[oc] == 1)))
  698. def _utf7_ENCODE_CHAR(result, oc, base64bits, base64buffer):
  699. if MAXUNICODE > 65535 and oc >= 0x10000:
  700. # code first surrogate
  701. base64bits += 16
  702. base64buffer = (base64buffer << 16) | 0xd800 | ((oc-0x10000) >> 10)
  703. while base64bits >= 6:
  704. result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
  705. base64bits -= 6
  706. # prepare second surrogate
  707. oc = 0xDC00 | ((oc-0x10000) & 0x3FF)
  708. base64bits += 16
  709. base64buffer = (base64buffer << 16) | oc
  710. while base64bits >= 6:
  711. result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
  712. base64bits -= 6
  713. return base64bits, base64buffer
  714. def str_decode_utf_7(s, size, errors, final=False,
  715. errorhandler=None):
  716. if errorhandler is None:
  717. errorhandler = default_unicode_error_decode
  718. if size == 0:
  719. return u'', 0
  720. inShift = False
  721. base64bits = 0
  722. base64buffer = 0
  723. surrogate = 0
  724. result = UnicodeBuilder(size)
  725. pos = 0
  726. shiftOutStartPos = 0
  727. startinpos = 0
  728. while pos < size:
  729. ch = s[pos]
  730. oc = ord(ch)
  731. if inShift: # in a base-64 section
  732. if _utf7_IS_BASE64(oc): #consume a base-64 character
  733. base64buffer = (base64buffer << 6) | _utf7_FROM_BASE64(ch)
  734. base64bits += 6
  735. pos += 1
  736. if base64bits >= 16:
  737. # enough bits for a UTF-16 value
  738. outCh = base64buffer >> (base64bits - 16)
  739. base64bits -= 16
  740. base64buffer &= (1 << base64bits) - 1 # clear high bits
  741. assert outCh <= 0xffff
  742. if surrogate:
  743. # expecting a second surrogate
  744. if outCh >= 0xDC00 and outCh <= 0xDFFFF:
  745. if MAXUNICODE < 65536:
  746. result.append(unichr(surrogate))
  747. result.append(unichr(outCh))
  748. else:
  749. result.append(
  750. UNICHR((((surrogate & 0x3FF)<<10) |
  751. (outCh & 0x3FF)) + 0x10000))
  752. surrogate = 0
  753. continue
  754. else:
  755. result.append(unichr(surrogate))
  756. surrogate = 0
  757. # Not done with outCh: falls back to next line
  758. if outCh >= 0xD800 and outCh <= 0xDBFF:
  759. # first surrogate
  760. surrogate = outCh
  761. else:
  762. result.append(unichr(outCh))
  763. else:
  764. # now leaving a base-64 section
  765. inShift = False
  766. pos += 1
  767. if surrogate:
  768. result.append(unichr(surrogate))
  769. surrogate = 0
  770. if base64bits > 0: # left-over bits
  771. if base64bits >= 6:
  772. # We've seen at least one base-64 character
  773. msg = "partial character in shift sequence"
  774. res, pos = errorhandler(errors, 'utf7',
  775. msg, s, pos-1, pos)
  776. result.append(res)
  777. continue
  778. else:
  779. # Some bits remain; they should be zero
  780. if base64buffer != 0:
  781. msg = "non-zero padding bits in shift sequence"
  782. res, pos = errorhandler(errors, 'utf7',
  783. msg, s, pos-1, pos)
  784. result.append(res)
  785. continue
  786. if ch == '-':
  787. # '-' is absorbed; other terminating characters are
  788. # preserved
  789. base64bits = 0
  790. base64buffer = 0
  791. surrogate = 0
  792. else:
  793. result.append(unichr(ord(ch)))
  794. elif ch == '+':
  795. startinpos = pos
  796. pos += 1 # consume '+'
  797. if pos < size and s[pos] == '-': # '+-' encodes '+'
  798. pos += 1
  799. result.append(u'+')
  800. else: # begin base64-encoded section
  801. inShift = 1
  802. shiftOutStartPos = result.getlength()
  803. base64bits = 0
  804. base64buffer = 0
  805. elif _utf7_DECODE_DIRECT(oc): # character decodes at itself
  806. result.append(unichr(oc))
  807. pos += 1
  808. else:
  809. startinpos = pos
  810. pos += 1
  811. msg = "unexpected special character"
  812. res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
  813. result.append(res)
  814. # end of string
  815. final_length = result.getlength()
  816. if inShift and final: # in shift sequence, no more to follow
  817. # if we're in an inconsistent state, that's an error
  818. if (surrogate or
  819. base64bits >= 6 or
  820. (base64bits > 0 and base64buffer != 0)):
  821. msg = "unterminated shift sequence"
  822. res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos, pos)
  823. result.append(res)
  824. final_length = result.getlength()
  825. elif inShift:
  826. pos = startinpos
  827. final_length = shiftOutStartPos # back off output
  828. assert final_length >= 0
  829. return result.build()[:final_length], pos
  830. def unicode_encode_utf_7(s, size, errors, errorhandler=None):
  831. if size == 0:
  832. return ''
  833. result = StringBuilder(size)
  834. encodeSetO = encodeWhiteSpace = False
  835. inShift = False
  836. base64bits = 0
  837. base64buffer = 0
  838. pos = 0
  839. while pos < size:
  840. ch = s[pos]
  841. oc = ord(ch)
  842. if not inShift:
  843. if ch == u'+':
  844. result.append('+-')
  845. elif _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
  846. result.append(chr(oc))
  847. else:
  848. result.append('+')
  849. inShift = True
  850. base64bits, base64buffer = _utf7_ENCODE_CHAR(
  851. result, oc, base64bits, base64buffer)
  852. else:
  853. if _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
  854. # shifting out
  855. if base64bits: # output remaining bits
  856. result.append(_utf7_TO_BASE64(base64buffer << (6-base64bits)))
  857. base64buffer = 0
  858. base64bits = 0
  859. inShift = False
  860. ## Characters not in the BASE64 set implicitly unshift the
  861. ## sequence so no '-' is required, except if the character is
  862. ## itself a '-'
  863. if _utf7_IS_BASE64(oc) or ch == u'-':
  864. result.append('-')
  865. result.append(chr(oc))
  866. else:
  867. base64bits, base64buffer = _utf7_ENCODE_CHAR(
  868. result, oc, base64bits, base64buffer)
  869. pos += 1
  870. if base64bits:
  871. result.append(_utf7_TO_BASE64(base64buffer << (6 - base64bits)))
  872. if inShift:
  873. result.append('-')
  874. return result.build()
  875. # ____________________________________________________________
  876. # ascii and latin-1
  877. def str_decode_latin_1(s, size, errors, final=False,
  878. errorhandler=None):
  879. # latin1 is equivalent to the first 256 ordinals in Unicode.
  880. pos = 0
  881. result = UnicodeBuilder(size)
  882. while pos < size:
  883. result.append(unichr(ord(s[pos])))
  884. pos += 1
  885. return result.build(), pos
  886. def str_decode_ascii(s, size, errors, final=False,
  887. errorhandler=None):
  888. if errorhandler is None:
  889. errorhandler = default_unicode_error_decode
  890. # ASCII is equivalent to the first 128 ordinals in Unicode.
  891. result = UnicodeBuilder(size)
  892. pos = 0
  893. while pos < size:
  894. c = s[pos]
  895. if ord(c) < 128:
  896. result.append(unichr(ord(c)))
  897. pos += 1
  898. else:
  899. r, pos = errorhandler(errors, "ascii", "ordinal not in range(128)",
  900. s, pos, pos + 1)
  901. result.append(r)
  902. return result.build(), pos
  903. # An elidable version, for a subset of the cases
  904. @jit.elidable
  905. def fast_str_decode_ascii(s):
  906. result = UnicodeBuilder(len(s))
  907. for c in s:
  908. if ord(c) >= 128:
  909. raise ValueError
  910. result.append(unichr(ord(c)))
  911. return result.build()
  912. def unicode_encode_ucs1_helper(p, size, errors,
  913. errorhandler=None, limit=256):
  914. if errorhandler is None:
  915. errorhandler = default_unicode_error_encode
  916. if limit == 256:
  917. reason = "ordinal not in range(256)"
  918. encoding = "latin-1"
  919. else:
  920. reason = "ordinal not in range(128)"
  921. encoding = "ascii"
  922. if size == 0:
  923. return ''
  924. result = StringBuilder(size)
  925. pos = 0
  926. while pos < size:
  927. ch = p[pos]
  928. if ord(ch) < limit:
  929. result.append(chr(ord(ch)))
  930. pos += 1
  931. else:
  932. # startpos for collecting unencodable chars
  933. collstart = pos
  934. collend = pos+1
  935. while collend < len(p) and ord(p[collend]) >= limit:
  936. collend += 1
  937. ru, rs, pos = errorhandler(errors, encoding, reason, p,
  938. collstart, collend)
  939. if rs is not None:
  940. # py3k only
  941. result.append(rs)
  942. continue
  943. for ch in ru:
  944. if ord(ch) < limit:
  945. result.append(chr(ord(ch)))
  946. else:
  947. errorhandler("strict", encoding, reason, p,
  948. collstart, collend)
  949. return result.build()
  950. def unicode_encode_latin_1(p, size, errors, errorhandler=None):
  951. res = unicode_encode_ucs1_helper(p, size, errors, errorhandler, 256)
  952. return res
  953. def unicode_encode_ascii(p, size, errors, errorhandler=None):
  954. res = unicode_encode_ucs1_helper(p, size, errors, errorhandler, 128)
  955. return res
  956. # ____________________________________________________________
  957. # Charmap
  958. ERROR_CHAR = u'\ufffe'
  959. @specialize.argtype(5)
  960. def str_decode_charmap(s, size, errors, final=False,
  961. errorhandler=None, mapping=None):
  962. "mapping can be a rpython dictionary, or a dict-like object."
  963. # Default to Latin-1
  964. if mapping is None:
  965. return str_decode_latin_1(s, size, errors, final=final,
  966. errorhandler=errorhandler)
  967. if errorhandler is None:
  968. errorhandler = default_unicode_error_decode
  969. if size == 0:
  970. return u'', 0
  971. pos = 0
  972. result = UnicodeBuilder(size)
  973. while pos < size:
  974. ch = s[pos]
  975. c = mapping.get(ch, ERROR_CHAR)
  976. if c == ERROR_CHAR:
  977. r, pos = errorhandler(errors, "charmap",
  978. "character maps to <undefined>",
  979. s, pos, pos + 1)
  980. result.append(r)
  981. continue
  982. result.append(c)
  983. pos += 1
  984. return result.build(), pos
  985. def unicode_encode_charmap(s, size, errors, errorhandler=None,
  986. mapping=None):
  987. if mapping is None:
  988. return unicode_encode_latin_1(s, size, errors,
  989. errorhandler=errorhandler)
  990. if errorhandler is None:
  991. errorhandler = default_unicode_error_encode
  992. if size == 0:
  993. return ''
  994. result = StringBuilder(size)
  995. pos = 0
  996. while pos < size:
  997. ch = s[pos]
  998. c = mapping.get(ch, '')
  999. if len(c) == 0:
  1000. # collect all unencodable chars. Important for narrow builds.
  1001. collend = pos + 1
  1002. while collend < size and mapping.get(s[collend], '') == '':
  1003. collend += 1
  1004. ru, rs, pos = errorhandler(errors, "charmap",
  1005. "character maps to <undefined>",
  1006. s, pos, collend)
  1007. if rs is not None:
  1008. # py3k only
  1009. result.append(rs)
  1010. continue
  1011. for ch2 in ru:
  1012. c2 = mapping.get(ch2, '')
  1013. if len(c2) == 0:
  1014. errorhandler(
  1015. "strict", "charmap",
  1016. "character maps to <undefined>",
  1017. s, pos, pos + 1)
  1018. result.append(c2)
  1019. continue
  1020. result.append(c)
  1021. pos += 1
  1022. return result.build()
  1023. # ____________________________________________________________
  1024. # Unicode escape
  1025. hexdigits = "0123456789ABCDEFabcdef"
  1026. def hexescape(builder, s, pos, digits,
  1027. encoding, errorhandler, message, errors):
  1028. chr = 0
  1029. if pos + digits > len(s):
  1030. endinpos = pos
  1031. while endinpos < len(s) and s[endinpos] in hexdigits:
  1032. endinpos += 1
  1033. res, pos = errorhandler(errors, encoding,
  1034. message, s, pos-2, endinpos)
  1035. builder.append(res)
  1036. else:
  1037. try:
  1038. chr = r_uint(int(s[pos:pos+digits], 16))
  1039. except ValueError:
  1040. endinpos = pos
  1041. while s[endinpos] in hexdigits:
  1042. endinpos += 1
  1043. res, pos = errorhandler(errors, encoding,
  1044. message, s, pos-2, endinpos)
  1045. builder.append(res)
  1046. else:
  1047. # when we get here, chr is a 32-bit unicode character
  1048. if chr <= MAXUNICODE:
  1049. builder.append(UNICHR(chr))
  1050. pos += digits
  1051. elif chr <= 0x10ffff:
  1052. chr -= 0x10000L
  1053. builder.append(unichr(0xD800 + (chr >> 10)))
  1054. builder.append(unichr(0xDC00 + (chr & 0x03FF)))
  1055. pos += digits
  1056. else:
  1057. message = "illegal Unicode character"
  1058. res, pos = errorhandler(errors, encoding,
  1059. message, s, pos-2, pos+digits)
  1060. builder.append(res)
  1061. return pos
  1062. def str_decode_unicode_escape(s, size, errors, final=False,
  1063. errorhandler=None,
  1064. unicodedata_handler=None):
  1065. if errorhandler is None:
  1066. errorhandler = default_unicode_error_decode
  1067. if size == 0:
  1068. return u'', 0
  1069. builder = UnicodeBuilder(size)
  1070. pos = 0
  1071. while pos < size:
  1072. ch = s[pos]
  1073. # Non-escape characters are interpreted as Unicode ordinals
  1074. if ch != '\\':
  1075. builder.append(unichr(ord(ch)))
  1076. pos += 1
  1077. continue
  1078. # - Escapes
  1079. pos += 1
  1080. if pos >= size:
  1081. message = "\\ at end of string"
  1082. res, pos = errorhandler(errors, "unicodeescape",
  1083. message, s, pos-1, size)
  1084. builder.append(res)
  1085. continue
  1086. ch = s[pos]
  1087. pos += 1
  1088. # \x escapes
  1089. if ch == '\n': pass
  1090. elif ch == '\\': builder.append(u'\\')
  1091. elif ch == '\'': builder.append(u'\'')
  1092. elif ch == '\"': builder.append(u'\"')
  1093. elif ch == 'b' : builder.append(u'\b')
  1094. elif ch == 'f' : builder.append(u'\f')
  1095. elif ch == 't' : builder.append(u'\t')
  1096. elif ch == 'n' : builder.append(u'\n')
  1097. elif ch == 'r' : builder.append(u'\r')
  1098. elif ch == 'v' : builder.append(u'\v')
  1099. elif ch == 'a' : builder.append(u'\a')
  1100. elif '0' <= ch <= '7':
  1101. x = ord(ch) - ord('0')
  1102. if pos < size:
  1103. ch = s[pos]
  1104. if '0' <= ch <= '7':
  1105. pos += 1
  1106. x = (x<<3) + ord(ch) - ord('0')
  1107. if pos < size:
  1108. ch = s[pos]
  1109. if '0' <= ch <= '7':
  1110. pos += 1
  1111. x = (x<<3) + ord(ch) - ord('0')
  1112. builder.append(unichr(x))
  1113. # hex escapes
  1114. # \xXX
  1115. elif ch == 'x':
  1116. digits = 2
  1117. message = "truncated \\xXX escape"
  1118. pos = hexescape(builder, s, pos, digits,
  1119. "unicodeescape", errorhandler, message, errors)
  1120. # \uXXXX
  1121. elif ch == 'u':
  1122. digits = 4
  1123. message = "truncated \\uXXXX escape"
  1124. pos = hexescape(builder, s, pos, digits,
  1125. "unicodeescape", errorhandler, message, errors)
  1126. # \UXXXXXXXX
  1127. elif ch == 'U':
  1128. digits = 8
  1129. message = "truncated \\UXXXXXXXX escape"
  1130. pos = hexescape(builder, s, pos, digits,
  1131. "unicodeescape", errorhandler, message, errors)
  1132. # \N{name}
  1133. elif ch == 'N' and unicodedata_handler is not None:
  1134. message = "malformed \\N character escape"
  1135. look = pos
  1136. if look < size and s[look] == '{':
  1137. # look for the closing brace
  1138. while look < size and s[look] != '}':
  1139. look += 1
  1140. if look < size and s[look] == '}':
  1141. # found a name. look it up in the unicode database
  1142. message = "unknown Unicode character name"
  1143. name = s[pos+1:look]
  1144. code = unicodedata_handler.call(name)
  1145. if code < 0:
  1146. res, pos = errorhandler(errors, "unicodeescape",
  1147. message, s, pos-1, look+1)
  1148. builder.append(res)
  1149. continue
  1150. pos = look + 1
  1151. if code <= MAXUNICODE:
  1152. builder.append(UNICHR(code))
  1153. else:
  1154. code -= 0x10000L
  1155. builder.append(unichr(0xD800 + (code >> 10)))
  1156. builder.append(unichr(0xDC00 + (code & 0x03FF)))
  1157. else:
  1158. res, pos = errorhandler(errors, "unicodeescape",
  1159. message, s, pos-1, look+1)
  1160. builder.append(res)
  1161. else:
  1162. res, pos = errorhandler(errors, "unicodeescape",
  1163. message, s, pos-1, look+1)
  1164. builder.append(res)
  1165. else:
  1166. builder.append(u'\\')
  1167. builder.append(unichr(ord(ch)))
  1168. return builder.build(), pos
  1169. def make_unicode_escape_function(pass_printable=False, unicode_output=False,
  1170. quotes=False, prefix=None):
  1171. # Python3 has two similar escape functions: One to implement
  1172. # encode('unicode_escape') and which outputs bytes, and unicode.__repr__
  1173. # which outputs unicode. They cannot share RPython code, so we generate
  1174. # them with the template below.
  1175. # Python2 does not really need this, but it reduces diffs between branches.
  1176. if unicode_output:
  1177. STRING_BUILDER = UnicodeBuilder
  1178. STR = unicode
  1179. CHR = UNICHR
  1180. else:
  1181. STRING_BUILDER = StringBuilder
  1182. STR = str
  1183. CHR = chr
  1184. def unicode_escape(s, size, errors, errorhandler=None):
  1185. # errorhandler is not used: this function cannot cause Unicode errors
  1186. result = STRING_BUILDER(size)
  1187. if quotes:
  1188. if prefix:
  1189. result.append(STR(prefix))
  1190. if s.find(u'\'') != -1 and s.find(u'\"') == -1:
  1191. quote = ord('\"')
  1192. result.append(STR('"'))
  1193. else:
  1194. quote = ord('\'')
  1195. result.append(STR('\''))
  1196. else:
  1197. quote = 0
  1198. if size == 0:
  1199. return STR('')
  1200. pos = 0
  1201. while pos < size:
  1202. ch = s[pos]
  1203. oc = ord(ch)
  1204. # Escape quotes
  1205. if quotes and (oc == quote or ch == '\\'):
  1206. result.append(STR('\\'))
  1207. result.append(CHR(oc))
  1208. pos += 1
  1209. continue
  1210. # The following logic is enabled only if MAXUNICODE == 0xffff, or
  1211. # for testing on top of a host Python where sys.maxunicode == 0xffff
  1212. if ((MAXUNICODE < 65536 or is_narrow_host())
  1213. and 0xD800 <= oc < 0xDC00 and pos + 1 < size):
  1214. # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
  1215. pos += 1
  1216. oc2 = ord(s[pos])
  1217. if 0xDC00 <= oc2 <= 0xDFFF:
  1218. ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
  1219. char_escape_helpeā€¦

Large files are truncated, but you can click here to view the full file