
/pypy/module/_io/interp_textio.py

https://bitbucket.org/pypy/pypy/
import sys

from pypy.interpreter.baseobjspace import W_Root
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import WrappedDefault, interp2app, unwrap_spec
from pypy.interpreter.typedef import (
    GetSetProperty, TypeDef, generic_new_descr, interp_attrproperty,
    interp_attrproperty_w)
from pypy.module._codecs import interp_codecs
from pypy.module._io.interp_iobase import W_IOBase, convert_size, trap_eintr
from rpython.rlib.rarithmetic import intmask, r_uint, r_ulonglong
from rpython.rlib.rbigint import rbigint
from rpython.rlib.rstring import UnicodeBuilder


STATE_ZERO, STATE_OK, STATE_DETACHED = range(3)

SEEN_CR = 1
SEEN_LF = 2
SEEN_CRLF = 4
SEEN_ALL = SEEN_CR | SEEN_LF | SEEN_CRLF

_WINDOWS = sys.platform == 'win32'


class W_IncrementalNewlineDecoder(W_Root):
    seennl = 0
    pendingcr = False
    w_decoder = None

    def __init__(self, space):
        self.w_newlines_dict = {
            SEEN_CR: space.wrap(u"\r"),
            SEEN_LF: space.wrap(u"\n"),
            SEEN_CRLF: space.wrap(u"\r\n"),
            SEEN_CR | SEEN_LF: space.newtuple(
                [space.wrap(u"\r"), space.wrap(u"\n")]),
            SEEN_CR | SEEN_CRLF: space.newtuple(
                [space.wrap(u"\r"), space.wrap(u"\r\n")]),
            SEEN_LF | SEEN_CRLF: space.newtuple(
                [space.wrap(u"\n"), space.wrap(u"\r\n")]),
            SEEN_CR | SEEN_LF | SEEN_CRLF: space.newtuple(
                [space.wrap(u"\r"), space.wrap(u"\n"), space.wrap(u"\r\n")]),
        }

    @unwrap_spec(translate=int)
    def descr_init(self, space, w_decoder, translate, w_errors=None):
        self.w_decoder = w_decoder
        self.translate = translate
        if space.is_none(w_errors):
            self.w_errors = space.wrap("strict")
        else:
            self.w_errors = w_errors
        self.seennl = 0

    def newlines_get_w(self, space):
        return self.w_newlines_dict.get(self.seennl, space.w_None)

    @unwrap_spec(final=int)
    def decode_w(self, space, w_input, final=False):
        if self.w_decoder is None:
            raise oefmt(space.w_ValueError,
                        "IncrementalNewlineDecoder.__init__ not called")

        # decode input (with the eventual \r from a previous pass)
        if not space.is_w(self.w_decoder, space.w_None):
            w_output = space.call_method(self.w_decoder, "decode",
                                         w_input, space.wrap(final))
        else:
            w_output = w_input

        if not space.isinstance_w(w_output, space.w_unicode):
            raise oefmt(space.w_TypeError,
                        "decoder should return a string result")

        output = space.unicode_w(w_output)
        output_len = len(output)
        if self.pendingcr and (final or output_len):
            output = u'\r' + output
            self.pendingcr = False
            output_len += 1

        # retain last \r even when not translating data:
        # then readline() is sure to get \r\n in one pass
        if not final and output_len > 0:
            last = output_len - 1
            assert last >= 0
            if output[last] == u'\r':
                output = output[:last]
                self.pendingcr = True
                output_len -= 1

        if output_len == 0:
            return space.wrap(u"")

        # Record which newlines are read and do newline translation if
        # desired, all in one pass.
        seennl = self.seennl

        # If, up to now, newlines are consistently \n, do a quick check
        # for the \r
        only_lf = False
        if seennl == SEEN_LF or seennl == 0:
            only_lf = (output.find(u'\r') < 0)

        if only_lf:
            # If not already seen, quick scan for a possible "\n" character.
            # (there's nothing else to be done, even when in translation mode)
            if seennl == 0 and output.find(u'\n') >= 0:
                seennl |= SEEN_LF
            # Finished: we have scanned for newlines, and none of them
            # need translating.
        elif not self.translate:
            i = 0
            while i < output_len:
                if seennl == SEEN_ALL:
                    break
                c = output[i]
                i += 1
                if c == u'\n':
                    seennl |= SEEN_LF
                elif c == u'\r':
                    if i < output_len and output[i] == u'\n':
                        seennl |= SEEN_CRLF
                        i += 1
                    else:
                        seennl |= SEEN_CR
        elif output.find(u'\r') >= 0:
            # Translate!
            builder = UnicodeBuilder(output_len)
            i = 0
            while i < output_len:
                c = output[i]
                i += 1
                if c == u'\n':
                    seennl |= SEEN_LF
                elif c == u'\r':
                    if i < output_len and output[i] == u'\n':
                        seennl |= SEEN_CRLF
                        i += 1
                    else:
                        seennl |= SEEN_CR
                    builder.append(u'\n')
                    continue
                builder.append(c)
            output = builder.build()

        self.seennl |= seennl
        return space.wrap(output)

    def reset_w(self, space):
        self.seennl = 0
        self.pendingcr = False
        if self.w_decoder and not space.is_w(self.w_decoder, space.w_None):
            space.call_method(self.w_decoder, "reset")

    def getstate_w(self, space):
        if self.w_decoder and not space.is_w(self.w_decoder, space.w_None):
            w_state = space.call_method(self.w_decoder, "getstate")
            w_buffer, w_flag = space.unpackiterable(w_state, 2)
            flag = space.r_longlong_w(w_flag)
        else:
            w_buffer = space.newbytes("")
            flag = 0
        flag <<= 1
        if self.pendingcr:
            flag |= 1
        return space.newtuple([w_buffer, space.wrap(flag)])

    def setstate_w(self, space, w_state):
        w_buffer, w_flag = space.unpackiterable(w_state, 2)
        flag = space.r_longlong_w(w_flag)
        self.pendingcr = bool(flag & 1)
        flag >>= 1

        if self.w_decoder and not space.is_w(self.w_decoder, space.w_None):
            w_state = space.newtuple([w_buffer, space.wrap(flag)])
            space.call_method(self.w_decoder, "setstate", w_state)


W_IncrementalNewlineDecoder.typedef = TypeDef(
    '_io.IncrementalNewlineDecoder',
    __new__ = generic_new_descr(W_IncrementalNewlineDecoder),
    __init__ = interp2app(W_IncrementalNewlineDecoder.descr_init),
    decode = interp2app(W_IncrementalNewlineDecoder.decode_w),
    reset = interp2app(W_IncrementalNewlineDecoder.reset_w),
    getstate = interp2app(W_IncrementalNewlineDecoder.getstate_w),
    setstate = interp2app(W_IncrementalNewlineDecoder.setstate_w),
    newlines = GetSetProperty(W_IncrementalNewlineDecoder.newlines_get_w),
)
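
# Illustrative sketch (not part of this module): at application level this
# class is exposed as io.IncrementalNewlineDecoder.  Assuming
# CPython-compatible behaviour, the decode()/newlines interaction above
# works roughly like this:
#
#     import io
#     dec = io.IncrementalNewlineDecoder(None, translate=True)
#     dec.decode(u"a\r\nb")   # -> u"a\nb"   (CRLF translated to \n)
#     dec.newlines            # -> '\r\n'    (looked up in w_newlines_dict)
#     dec.decode(u"c\r")      # -> u"c"      (trailing \r kept via pendingcr)
#     dec.decode(u"d")        # -> u"\nd"    (pending \r + "d", translated)
#     dec.newlines            # -> ('\r', '\r\n')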

class W_TextIOBase(W_IOBase):
    w_encoding = None

    def __init__(self, space):
        W_IOBase.__init__(self, space)

    def read_w(self, space, w_size=None):
        self._unsupportedoperation(space, "read")

    def readline_w(self, space, w_limit=None):
        self._unsupportedoperation(space, "readline")

    def write_w(self, space, w_data):
        self._unsupportedoperation(space, "write")

    def detach_w(self, space):
        self._unsupportedoperation(space, "detach")

    def errors_get_w(self, space):
        return space.w_None

    def newlines_get_w(self, space):
        return space.w_None

    def _find_line_ending(self, line, start, end):
        size = end - start
        if self.readtranslate:
            # Newlines are already translated, only search for \n
            pos = line.find(u'\n', start, end)
            if pos >= 0:
                return pos - start + 1, 0
            else:
                return -1, size
        elif self.readuniversal:
            # Universal newline search. Find any of \r, \r\n, \n
            # The decoder ensures that \r\n are not split in two pieces
            i = 0
            while True:
                # Fast path for non-control chars. The loop always ends
                # since the Py_UNICODE storage is NUL-terminated.
                while i < size and line[start + i] > '\r':
                    i += 1
                if i >= size:
                    return -1, size
                ch = line[start + i]
                i += 1
                if ch == '\n':
                    return i, 0
                if ch == '\r':
                    if line[start + i] == '\n':
                        return i + 1, 0
                    else:
                        return i, 0
        else:
            # Non-universal mode.
            pos = line.find(self.readnl, start, end)
            if pos >= 0:
                return pos - start + len(self.readnl), 0
            else:
                pos = line.find(self.readnl[0], start, end)
                if pos >= 0:
                    return -1, pos - start
                return -1, size
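
    # Note on the return convention of _find_line_ending() (illustrative,
    # inferred from the code above and from its caller readline_w() below):
    # it returns a pair (length, consumed).  `length` is the length of the
    # line up to and including the line ending, counted from `start`, or -1
    # if no complete ending was found; `consumed` is then how many characters
    # can safely be set aside before re-scanning once more data arrives.
    # For example, in non-universal mode with readnl == u'\r\n', a buffer
    # ending in a lone u'\r' yields (-1, pos_of_cr - start), so the possible
    # first half of the ending is re-examined with the next chunk.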

W_TextIOBase.typedef = TypeDef(
    '_io._TextIOBase', W_IOBase.typedef,
    __new__ = generic_new_descr(W_TextIOBase),

    read = interp2app(W_TextIOBase.read_w),
    readline = interp2app(W_TextIOBase.readline_w),
    write = interp2app(W_TextIOBase.write_w),
    detach = interp2app(W_TextIOBase.detach_w),
    encoding = interp_attrproperty_w("w_encoding", W_TextIOBase),
    newlines = GetSetProperty(W_TextIOBase.newlines_get_w),
    errors = GetSetProperty(W_TextIOBase.errors_get_w),
)


def _determine_encoding(space, encoding):
    if encoding is not None:
        return space.wrap(encoding)

    try:
        w_locale = space.call_method(space.builtin, '__import__',
                                     space.wrap('locale'))
        w_encoding = space.call_method(w_locale, 'getpreferredencoding')
    except OperationError as e:
        # getpreferredencoding() may also raise ImportError
        if not e.match(space, space.w_ImportError):
            raise
        return space.wrap('ascii')
    else:
        if space.isinstance_w(w_encoding, space.w_str):
            return w_encoding

    raise oefmt(space.w_IOError, "could not determine default encoding")


class PositionCookie(object):
    def __init__(self, bigint):
        self.start_pos = bigint.ulonglongmask()
        bigint = bigint.rshift(r_ulonglong.BITS)
        x = intmask(bigint.uintmask())
        assert x >= 0
        self.dec_flags = x
        bigint = bigint.rshift(r_uint.BITS)
        x = intmask(bigint.uintmask())
        assert x >= 0
        self.bytes_to_feed = x
        bigint = bigint.rshift(r_uint.BITS)
        x = intmask(bigint.uintmask())
        assert x >= 0
        self.chars_to_skip = x
        bigint = bigint.rshift(r_uint.BITS)
        self.need_eof = bigint.tobool()

    def pack(self):
        # The meaning of a tell() cookie is: seek to position, set the
        # decoder flags to dec_flags, read bytes_to_feed bytes, feed them
        # into the decoder with need_eof as the EOF flag, then skip
        # chars_to_skip characters of the decoded result.  For most simple
        # decoders, tell() will often just give a byte offset in the file.
        rb = rbigint.fromrarith_int
        res = rb(self.start_pos)
        bits = r_ulonglong.BITS
        res = res.or_(rb(r_uint(self.dec_flags)).lshift(bits))
        bits += r_uint.BITS
        res = res.or_(rb(r_uint(self.bytes_to_feed)).lshift(bits))
        bits += r_uint.BITS
        res = res.or_(rb(r_uint(self.chars_to_skip)).lshift(bits))
        bits += r_uint.BITS
        return res.or_(rb(r_uint(self.need_eof)).lshift(bits))
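
    # Worked example of the cookie layout (illustrative only; the exact bit
    # widths depend on the platform's r_ulonglong.BITS / r_uint.BITS, here
    # assumed to be 64 and 32):
    #
    #   bits [0, 64)    start_pos
    #   bits [64, 96)   dec_flags
    #   bits [96, 128)  bytes_to_feed
    #   bits [128, 160) chars_to_skip
    #   bit  160        need_eof
    #
    # so a cookie with start_pos=10, bytes_to_feed=3, chars_to_skip=1 and
    # everything else zero packs to 10 | (3 << 96) | (1 << 128), and
    # __init__ above unpacks it field by field in the same order.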

class PositionSnapshot:
    def __init__(self, flags, input):
        self.flags = flags
        self.input = input


def check_decoded(space, w_decoded):
    if not space.isinstance_w(w_decoded, space.w_unicode):
        msg = "decoder should return a string result, not '%T'"
        raise oefmt(space.w_TypeError, msg, w_decoded)


class W_TextIOWrapper(W_TextIOBase):
    def __init__(self, space):
        W_TextIOBase.__init__(self, space)
        self.state = STATE_ZERO
        self.w_encoder = None
        self.w_decoder = None

        self.decoded_chars = None    # buffer for text returned from decoder
        self.decoded_chars_used = 0  # offset into _decoded_chars for read()
        self.pending_bytes = None    # list of bytes objects waiting to be
                                     # written, or NULL
        self.chunk_size = 8192

        self.readuniversal = False
        self.readtranslate = False
        self.readnl = None

        self.encodefunc = None  # Specialized encoding func (see below)
        self.encoding_start_of_stream = False  # Whether or not it's the start
                                               # of the stream
        self.snapshot = None

    @unwrap_spec(encoding="str_or_None", line_buffering=int)
    def descr_init(self, space, w_buffer, encoding=None,
                   w_errors=None, w_newline=None, line_buffering=0):
        self.state = STATE_ZERO
        self.w_buffer = w_buffer
        self.w_encoding = _determine_encoding(space, encoding)

        if space.is_none(w_errors):
            w_errors = space.wrap("strict")
        self.w_errors = w_errors

        if space.is_none(w_newline):
            newline = None
        else:
            newline = space.unicode_w(w_newline)
        if newline and newline not in (u'\n', u'\r\n', u'\r'):
            r = space.str_w(space.repr(w_newline))
            raise oefmt(space.w_ValueError,
                        "illegal newline value: %s", r)

        self.line_buffering = line_buffering

        self.readuniversal = not newline  # null or empty
        self.readtranslate = newline is None
        self.readnl = newline

        self.writetranslate = (newline != u'')
        if not self.readuniversal:
            self.writenl = self.readnl
            if self.writenl == u'\n':
                self.writenl = None
        elif _WINDOWS:
            self.writenl = u"\r\n"
        else:
            self.writenl = None

        # build the decoder object
        if space.is_true(space.call_method(w_buffer, "readable")):
            w_codec = interp_codecs.lookup_codec(space,
                                                 space.str_w(self.w_encoding))
            self.w_decoder = space.call_method(w_codec,
                                               "incrementaldecoder", w_errors)
            if self.readuniversal:
                self.w_decoder = space.call_function(
                    space.gettypeobject(W_IncrementalNewlineDecoder.typedef),
                    self.w_decoder, space.wrap(self.readtranslate))

        # build the encoder object
        if space.is_true(space.call_method(w_buffer, "writable")):
            w_codec = interp_codecs.lookup_codec(space,
                                                 space.str_w(self.w_encoding))
            self.w_encoder = space.call_method(w_codec,
                                               "incrementalencoder", w_errors)

        self.seekable = space.is_true(space.call_method(w_buffer, "seekable"))
        self.telling = self.seekable

        self.encoding_start_of_stream = False
        if self.seekable and self.w_encoder:
            self.encoding_start_of_stream = True
            w_cookie = space.call_method(self.w_buffer, "tell")
            if not space.eq_w(w_cookie, space.wrap(0)):
                self.encoding_start_of_stream = False
                space.call_method(self.w_encoder, "setstate", space.wrap(0))

        self.state = STATE_OK
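
    # Illustrative sketch (not part of this module): at application level the
    # initializer above backs io.TextIOWrapper, e.g.
    #
    #     import io
    #     raw = io.BytesIO(b"caf\xc3\xa9\nbar\n")
    #     txt = io.TextIOWrapper(io.BufferedReader(raw), encoding="utf-8",
    #                            newline=None)   # universal newlines
    #     txt.readline()   # -> u"caf\xe9\n"
    #
    # With newline=None the readuniversal/readtranslate flags are set and the
    # incremental decoder is wrapped in W_IncrementalNewlineDecoder above.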

    def _check_init(self, space):
        if self.state == STATE_ZERO:
            raise oefmt(space.w_ValueError,
                        "I/O operation on uninitialized object")

    def _check_attached(self, space):
        if self.state == STATE_DETACHED:
            raise oefmt(space.w_ValueError,
                        "underlying buffer has been detached")
        self._check_init(space)

    def _check_closed(self, space, message=None):
        self._check_init(space)
        W_TextIOBase._check_closed(self, space, message)

    def descr_repr(self, space):
        self._check_init(space)
        w_name = space.findattr(self, space.wrap("name"))
        if w_name is None:
            w_name_str = space.wrap("")
        else:
            w_name_str = space.mod(space.wrap("name=%r "), w_name)
        w_args = space.newtuple([w_name_str, self.w_encoding])
        return space.mod(
            space.wrap("<_io.TextIOWrapper %sencoding=%r>"), w_args
        )

    def readable_w(self, space):
        self._check_attached(space)
        return space.call_method(self.w_buffer, "readable")

    def writable_w(self, space):
        self._check_attached(space)
        return space.call_method(self.w_buffer, "writable")

    def seekable_w(self, space):
        self._check_attached(space)
        return space.call_method(self.w_buffer, "seekable")

    def isatty_w(self, space):
        self._check_attached(space)
        return space.call_method(self.w_buffer, "isatty")

    def fileno_w(self, space):
        self._check_attached(space)
        return space.call_method(self.w_buffer, "fileno")

    def closed_get_w(self, space):
        self._check_attached(space)
        return space.getattr(self.w_buffer, space.wrap("closed"))

    def newlines_get_w(self, space):
        self._check_attached(space)
        if self.w_decoder is None:
            return space.w_None
        return space.findattr(self.w_decoder, space.wrap("newlines"))

    def name_get_w(self, space):
        self._check_attached(space)
        return space.getattr(self.w_buffer, space.wrap("name"))

    def flush_w(self, space):
        self._check_attached(space)
        self._check_closed(space)
        self.telling = self.seekable
        self._writeflush(space)
        space.call_method(self.w_buffer, "flush")

    @unwrap_spec(w_pos=WrappedDefault(None))
    def truncate_w(self, space, w_pos=None):
        self._check_attached(space)

        space.call_method(self, "flush")
        return space.call_method(self.w_buffer, "truncate", w_pos)

    def close_w(self, space):
        self._check_attached(space)
        if not space.is_true(space.getattr(self.w_buffer,
                                           space.wrap("closed"))):
            try:
                space.call_method(self, "flush")
            finally:
                ret = space.call_method(self.w_buffer, "close")
            return ret

    # _____________________________________________________________
    # read methods

    def _set_decoded_chars(self, chars):
        self.decoded_chars = chars
        self.decoded_chars_used = 0

    def _get_decoded_chars(self, size):
        if self.decoded_chars is None:
            return u""

        available = len(self.decoded_chars) - self.decoded_chars_used
        if size < 0 or size > available:
            size = available
        assert size >= 0

        if self.decoded_chars_used > 0 or size < available:
            start = self.decoded_chars_used
            end = self.decoded_chars_used + size
            assert start >= 0
            assert end >= 0
            chars = self.decoded_chars[start:end]
        else:
            chars = self.decoded_chars

        self.decoded_chars_used += size
        return chars

    def _read_chunk(self, space):
        """Read and decode the next chunk of data from the BufferedReader.
        The return value is True unless EOF was reached.  The decoded string
        is placed in self._decoded_chars (replacing its previous value).
        The entire input chunk is sent to the decoder, though some of it may
        remain buffered in the decoder, yet to be converted."""

        if not self.w_decoder:
            raise oefmt(space.w_IOError, "not readable")

        if self.telling:
            # To prepare for tell(), we need to snapshot a point in the file
            # where the decoder's input buffer is empty.
            w_state = space.call_method(self.w_decoder, "getstate")
            # Given this, we know there was a valid snapshot point
            # len(dec_buffer) bytes ago with decoder state (b'', dec_flags).
            w_dec_buffer, w_dec_flags = space.unpackiterable(w_state, 2)
            dec_buffer = space.bytes_w(w_dec_buffer)
            dec_flags = space.int_w(w_dec_flags)
        else:
            dec_buffer = None
            dec_flags = 0

        # Read a chunk, decode it, and put the result in self._decoded_chars
        w_input = space.call_method(self.w_buffer, "read1",
                                    space.wrap(self.chunk_size))
        if not space.isinstance_w(w_input, space.w_str):
            msg = "underlying read1() should have returned a bytes " \
                  "object, not '%T'"
            raise oefmt(space.w_TypeError, msg, w_input)

        eof = space.len_w(w_input) == 0
        w_decoded = space.call_method(self.w_decoder, "decode",
                                      w_input, space.wrap(eof))
        check_decoded(space, w_decoded)
        self._set_decoded_chars(space.unicode_w(w_decoded))
        if space.len_w(w_decoded) > 0:
            eof = False

        if self.telling:
            # At the snapshot point, len(dec_buffer) bytes before the read,
            # the next input to be decoded is dec_buffer + input_chunk.
            next_input = dec_buffer + space.bytes_w(w_input)
            self.snapshot = PositionSnapshot(dec_flags, next_input)

        return not eof
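
    # (Invariant relied upon by tell_w() below, restated here for clarity:
    # when telling, the snapshot (dec_flags, next_input) means that seeking
    # the raw buffer back by len(next_input) bytes, restoring the decoder to
    # (b"", dec_flags) and re-feeding next_input reproduces the decoder state
    # and the decoded text at the current position.)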

    def next_w(self, space):
        self._check_attached(space)
        self.telling = False
        try:
            return W_TextIOBase.next_w(self, space)
        except OperationError as e:
            if e.match(space, space.w_StopIteration):
                self.telling = self.seekable
            raise

    def read_w(self, space, w_size=None):
        self._check_attached(space)
        self._check_closed(space)
        if not self.w_decoder:
            raise oefmt(space.w_IOError, "not readable")

        size = convert_size(space, w_size)
        self._writeflush(space)
        if size < 0:
            # Read everything
            w_bytes = space.call_method(self.w_buffer, "read")
            w_decoded = space.call_method(self.w_decoder, "decode",
                                          w_bytes, space.w_True)
            check_decoded(space, w_decoded)
            w_result = space.wrap(self._get_decoded_chars(-1))
            w_final = space.add(w_result, w_decoded)
            self.snapshot = None
            return w_final

        remaining = size
        builder = UnicodeBuilder(size)

        # Keep reading chunks until we have n characters to return
        while True:
            data = self._get_decoded_chars(remaining)
            builder.append(data)
            remaining -= len(data)

            if remaining <= 0:  # Done
                break

            try:
                if not self._read_chunk(space):
                    # EOF
                    break
            except OperationError as e:
                if trap_eintr(space, e):
                    continue
                raise

        return space.wrap(builder.build())

    def readline_w(self, space, w_limit=None):
        self._check_attached(space)
        self._check_closed(space)
        self._writeflush(space)

        limit = convert_size(space, w_limit)
        chunked = 0

        line = None
        remaining = None
        chunks = []

        while True:
            # First, get some data if necessary
            has_data = True
            while not self.decoded_chars:
                try:
                    if not self._read_chunk(space):
                        has_data = False
                        break
                except OperationError as e:
                    if trap_eintr(space, e):
                        continue
                    raise
            if not has_data:
                # end of file
                self._set_decoded_chars(None)
                self.snapshot = None
                start = endpos = offset_to_buffer = 0
                break

            if not remaining:
                line = self.decoded_chars
                start = self.decoded_chars_used
                offset_to_buffer = 0
            else:
                assert self.decoded_chars_used == 0
                line = remaining + self.decoded_chars
                start = 0
                offset_to_buffer = len(remaining)
                remaining = None

            line_len = len(line)
            endpos, consumed = self._find_line_ending(line, start, line_len)
            if endpos >= 0:
                endpos += start
                if limit >= 0 and endpos >= start + limit - chunked:
                    endpos = start + limit - chunked
                    assert endpos >= 0
                break
            assert consumed >= 0

            # We can put aside up to `endpos`
            endpos = consumed + start
            if limit >= 0 and endpos >= start + limit - chunked:
                # Didn't find line ending, but reached length limit
                endpos = start + limit - chunked
                assert endpos >= 0
                break

            # No line ending seen yet - put aside current data
            if endpos > start:
                s = line[start:endpos]
                chunks.append(s)
                chunked += len(s)
            # There may be some remaining bytes we'll have to prepend to the
            # next chunk of data
            if endpos < line_len:
                remaining = line[endpos:]
            line = None
            # We have consumed the buffer
            self._set_decoded_chars(None)

        if line:
            # Our line ends in the current buffer
            decoded_chars_used = endpos - offset_to_buffer
            assert decoded_chars_used >= 0
            self.decoded_chars_used = decoded_chars_used
            if start > 0 or endpos < len(line):
                line = line[start:endpos]
        if remaining:
            chunks.append(remaining)
            remaining = None
        if chunks:
            if line:
                chunks.append(line)
            line = u''.join(chunks)

        if line:
            return space.wrap(line)
        else:
            return space.wrap(u'')

    # _____________________________________________________________
    # write methods

    def write_w(self, space, w_text):
        self._check_attached(space)
        self._check_closed(space)

        if not self.w_encoder:
            raise oefmt(space.w_IOError, "not writable")

        if not space.isinstance_w(w_text, space.w_unicode):
            raise oefmt(space.w_TypeError,
                        "unicode argument expected, got '%T'", w_text)

        text = space.unicode_w(w_text)
        textlen = len(text)

        haslf = False
        if (self.writetranslate and self.writenl) or self.line_buffering:
            if text.find(u'\n') >= 0:
                haslf = True
        if haslf and self.writetranslate and self.writenl:
            w_text = space.call_method(w_text, "replace", space.wrap(u'\n'),
                                       space.wrap(self.writenl))
            text = space.unicode_w(w_text)

        needflush = False
        if self.line_buffering and (haslf or text.find(u'\r') >= 0):
            needflush = True

        # XXX What if we were just reading?
        if self.encodefunc:
            w_bytes = self.encodefunc(space, w_text, self.errors)
            self.encoding_start_of_stream = False
        else:
            w_bytes = space.call_method(self.w_encoder, "encode", w_text)

        b = space.bytes_w(w_bytes)
        if not self.pending_bytes:
            self.pending_bytes = []
            self.pending_bytes_count = 0
        self.pending_bytes.append(b)
        self.pending_bytes_count += len(b)

        if self.pending_bytes_count > self.chunk_size or needflush:
            self._writeflush(space)

        if needflush:
            space.call_method(self.w_buffer, "flush")

        self.snapshot = None
        if self.w_decoder:
            space.call_method(self.w_decoder, "reset")

        return space.wrap(textlen)

    def _writeflush(self, space):
        if not self.pending_bytes:
            return

        pending_bytes = ''.join(self.pending_bytes)
        self.pending_bytes = None
        self.pending_bytes_count = 0

        while True:
            try:
                space.call_method(self.w_buffer, "write",
                                  space.newbytes(pending_bytes))
            except OperationError as e:
                if trap_eintr(space, e):
                    continue
                raise
            else:
                break

    def detach_w(self, space):
        self._check_attached(space)
        space.call_method(self, "flush")
        w_buffer = self.w_buffer
        self.w_buffer = None
        self.state = STATE_DETACHED
        return w_buffer

    # _____________________________________________________________
    # seek/tell

    def _decoder_setstate(self, space, cookie):
        # When seeking to the start of the stream, we call decoder.reset()
        # rather than decoder.getstate().
        # This is for a few decoders such as utf-16 for which the state value
        # at start is not (b"", 0) but e.g. (b"", 2) (meaning, in the case of
        # utf-16, that we are expecting a BOM).
        if cookie.start_pos == 0 and cookie.dec_flags == 0:
            space.call_method(self.w_decoder, "reset")
        else:
            space.call_method(self.w_decoder, "setstate",
                              space.newtuple([space.newbytes(""),
                                              space.wrap(cookie.dec_flags)]))

    def _encoder_setstate(self, space, cookie):
        if cookie.start_pos == 0 and cookie.dec_flags == 0:
            space.call_method(self.w_encoder, "reset")
            self.encoding_start_of_stream = True
        else:
            space.call_method(self.w_encoder, "setstate", space.wrap(0))
            self.encoding_start_of_stream = False

    @unwrap_spec(whence=int)
    def seek_w(self, space, w_pos, whence=0):
        self._check_attached(space)

        if not self.seekable:
            raise oefmt(space.w_IOError, "underlying stream is not seekable")

        if whence == 1:
            # seek relative to current position
            if not space.is_true(space.eq(w_pos, space.wrap(0))):
                raise oefmt(space.w_IOError,
                            "can't do nonzero cur-relative seeks")
            # Seeking to the current position should attempt to sync the
            # underlying buffer with the current position.
            w_pos = space.call_method(self, "tell")

        elif whence == 2:
            # seek relative to end of file
            if not space.is_true(space.eq(w_pos, space.wrap(0))):
                raise oefmt(space.w_IOError,
                            "can't do nonzero end-relative seeks")
            space.call_method(self, "flush")
            self._set_decoded_chars(None)
            self.snapshot = None
            if self.w_decoder:
                space.call_method(self.w_decoder, "reset")
            return space.call_method(self.w_buffer, "seek",
                                     w_pos, space.wrap(whence))

        elif whence != 0:
            raise oefmt(space.w_ValueError,
                        "invalid whence (%d, should be 0, 1 or 2)",
                        whence)

        if space.is_true(space.lt(w_pos, space.wrap(0))):
            r = space.str_w(space.repr(w_pos))
            raise oefmt(space.w_ValueError,
                        "negative seek position %s", r)

        space.call_method(self, "flush")

        # The strategy of seek() is to go back to the safe start point and
        # replay the effect of read(chars_to_skip) from there.
        cookie = PositionCookie(space.bigint_w(w_pos))

        # Seek back to the safe start point
        space.call_method(self.w_buffer, "seek", space.wrap(cookie.start_pos))

        self._set_decoded_chars(None)
        self.snapshot = None

        # Restore the decoder to its state from the safe start point.
        if self.w_decoder:
            self._decoder_setstate(space, cookie)

        if cookie.chars_to_skip:
            # Just like _read_chunk, feed the decoder and save a snapshot.
            w_chunk = space.call_method(self.w_buffer, "read",
                                        space.wrap(cookie.bytes_to_feed))
            if not space.isinstance_w(w_chunk, space.w_str):
                msg = "underlying read() should have returned " \
                      "a bytes object, not '%T'"
                raise oefmt(space.w_TypeError, msg, w_chunk)

            self.snapshot = PositionSnapshot(cookie.dec_flags,
                                             space.bytes_w(w_chunk))

            w_decoded = space.call_method(self.w_decoder, "decode",
                                          w_chunk, space.wrap(cookie.need_eof))
            check_decoded(space, w_decoded)
            self._set_decoded_chars(space.unicode_w(w_decoded))

            # Skip chars_to_skip of the decoded characters
            if len(self.decoded_chars) < cookie.chars_to_skip:
                raise oefmt(space.w_IOError,
                            "can't restore logical file position")
            self.decoded_chars_used = cookie.chars_to_skip
        else:
            self.snapshot = PositionSnapshot(cookie.dec_flags, "")

        # Finally, reset the encoder (merely useful for proper BOM handling)
        if self.w_encoder:
            self._encoder_setstate(space, cookie)

        return w_pos

    def tell_w(self, space):
        self._check_closed(space)

        if not self.seekable:
            raise oefmt(space.w_IOError, "underlying stream is not seekable")

        if not self.telling:
            raise oefmt(space.w_IOError,
                        "telling position disabled by next() call")

        self._writeflush(space)
        space.call_method(self, "flush")

        w_pos = space.call_method(self.w_buffer, "tell")

        if self.w_decoder is None or self.snapshot is None:
            assert not self.decoded_chars
            return w_pos

        cookie = PositionCookie(space.bigint_w(w_pos))

        # Skip backward to the snapshot point (see _read_chunk)
        cookie.dec_flags = self.snapshot.flags
        input = self.snapshot.input
        cookie.start_pos -= len(input)

        # How many decoded characters have been used up since the snapshot?
        if not self.decoded_chars_used:
            # We haven't moved from the snapshot point.
            return space.newlong_from_rbigint(cookie.pack())

        chars_to_skip = self.decoded_chars_used

        # Starting from the snapshot position, we will walk the decoder
        # forward until it gives us enough decoded characters.
        w_saved_state = space.call_method(self.w_decoder, "getstate")

        try:
            # Note our initial start point
            self._decoder_setstate(space, cookie)

            # Feed the decoder one byte at a time.  As we go, note the nearest
            # "safe start point" before the current location (a point where
            # the decoder has nothing buffered, so seek() can safely start
            # from there and advance to this location).
            chars_decoded = 0
            i = 0
            while i < len(input):
                w_decoded = space.call_method(self.w_decoder, "decode",
                                              space.newbytes(input[i]))
                check_decoded(space, w_decoded)
                chars_decoded += len(space.unicode_w(w_decoded))

                cookie.bytes_to_feed += 1

                w_state = space.call_method(self.w_decoder, "getstate")
                w_dec_buffer, w_flags = space.unpackiterable(w_state, 2)
                dec_buffer_len = len(space.str_w(w_dec_buffer))

                if dec_buffer_len == 0 and chars_decoded <= chars_to_skip:
                    # Decoder buffer is empty, so this is a safe start point.
                    cookie.start_pos += cookie.bytes_to_feed
                    chars_to_skip -= chars_decoded
                    assert chars_to_skip >= 0
                    cookie.dec_flags = space.int_w(w_flags)
                    cookie.bytes_to_feed = 0
                    chars_decoded = 0
                if chars_decoded >= chars_to_skip:
                    break
                i += 1
            else:
                # We didn't get enough decoded data; signal EOF to get more.
                w_decoded = space.call_method(self.w_decoder, "decode",
                                              space.wrap(""),
                                              space.wrap(1))  # final=1
                check_decoded(space, w_decoded)
                chars_decoded += len(space.unicode_w(w_decoded))
                cookie.need_eof = 1

                if chars_decoded < chars_to_skip:
                    raise oefmt(space.w_IOError,
                                "can't reconstruct logical file position")
        finally:
            space.call_method(self.w_decoder, "setstate", w_saved_state)

        # The returned cookie corresponds to the last safe start point.
        cookie.chars_to_skip = chars_to_skip
        return space.newlong_from_rbigint(cookie.pack())
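
    # Illustrative sketch (not part of this module): the opaque integer
    # returned by tell() above is only meaningful to seek() on the same
    # stream, e.g. with a wrapper `txt` as in the earlier sketch:
    #
    #     pos = txt.tell()   # cookie packing start_pos/dec_flags/... fields
    #     txt.read(5)
    #     txt.seek(pos)      # replays read(chars_to_skip) from the safe
    #                        # start point encoded in the cookie
    #
    # For simple stateless encodings the cookie usually degenerates to a
    # plain byte offset, as noted in PositionCookie.pack().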

    def chunk_size_get_w(self, space):
        self._check_attached(space)
        return space.wrap(self.chunk_size)

    def chunk_size_set_w(self, space, w_size):
        self._check_attached(space)
        size = space.int_w(w_size)
        if size <= 0:
            raise oefmt(space.w_ValueError,
                        "a strictly positive integer is required")
        self.chunk_size = size


W_TextIOWrapper.typedef = TypeDef(
    '_io.TextIOWrapper', W_TextIOBase.typedef,
    __new__ = generic_new_descr(W_TextIOWrapper),
    __init__ = interp2app(W_TextIOWrapper.descr_init),
    __repr__ = interp2app(W_TextIOWrapper.descr_repr),

    next = interp2app(W_TextIOWrapper.next_w),
    read = interp2app(W_TextIOWrapper.read_w),
    readline = interp2app(W_TextIOWrapper.readline_w),
    write = interp2app(W_TextIOWrapper.write_w),
    seek = interp2app(W_TextIOWrapper.seek_w),
    tell = interp2app(W_TextIOWrapper.tell_w),
    detach = interp2app(W_TextIOWrapper.detach_w),
    flush = interp2app(W_TextIOWrapper.flush_w),
    truncate = interp2app(W_TextIOWrapper.truncate_w),
    close = interp2app(W_TextIOWrapper.close_w),

    line_buffering = interp_attrproperty("line_buffering", W_TextIOWrapper),
    readable = interp2app(W_TextIOWrapper.readable_w),
    writable = interp2app(W_TextIOWrapper.writable_w),
    seekable = interp2app(W_TextIOWrapper.seekable_w),
    isatty = interp2app(W_TextIOWrapper.isatty_w),
    fileno = interp2app(W_TextIOWrapper.fileno_w),
    name = GetSetProperty(W_TextIOWrapper.name_get_w),
    buffer = interp_attrproperty_w("w_buffer", cls=W_TextIOWrapper),
    closed = GetSetProperty(W_TextIOWrapper.closed_get_w),
    errors = interp_attrproperty_w("w_errors", cls=W_TextIOWrapper),
    newlines = GetSetProperty(W_TextIOWrapper.newlines_get_w),
    _CHUNK_SIZE = GetSetProperty(
        W_TextIOWrapper.chunk_size_get_w, W_TextIOWrapper.chunk_size_set_w
    ),
)