PageRenderTime 1093ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/documentor/libraries/docutils-0.9.1-py3.2/docutils/io.py

https://github.com/tictactatic/Superdesk
Python | 497 lines | 445 code | 20 blank | 32 comment | 5 complexity | 6491853e1d5e764dfb4cd5a477bd3dce MD5 | raw file
Possible License(s): BSD-3-Clause, GPL-3.0, GPL-2.0
  1. # $Id: io.py 7440 2012-06-13 14:14:12Z milde $
  2. # Author: David Goodger <goodger@python.org>
  3. # Copyright: This module has been placed in the public domain.
  4. """
  5. I/O classes provide a uniform API for low-level input and output. Subclasses
  6. will exist for a variety of input/output mechanisms.
  7. """
  8. __docformat__ = 'reStructuredText'
  9. import sys
  10. import os
  11. import re
  12. import codecs
  13. from docutils import TransformSpec
  14. from docutils._compat import b
  15. from docutils.error_reporting import locale_encoding, ErrorString, ErrorOutput
  16. class InputError(IOError): pass
  17. class OutputError(IOError): pass
  18. def check_encoding(stream, encoding):
  19. """Test, whether the encoding of `stream` matches `encoding`.
  20. Returns
  21. :None: if `encoding` or `stream.encoding` are not a valid encoding
  22. argument (e.g. ``None``) or `stream.encoding is missing.
  23. :True: if the encoding argument resolves to the same value as `encoding`,
  24. :False: if the encodings differ.
  25. """
  26. try:
  27. return codecs.lookup(stream.encoding) == codecs.lookup(encoding)
  28. except (LookupError, AttributeError, TypeError):
  29. return None
  30. class Input(TransformSpec):
  31. """
  32. Abstract base class for input wrappers.
  33. """
  34. component_type = 'input'
  35. default_source_path = None
  36. def __init__(self, source=None, source_path=None, encoding=None,
  37. error_handler='strict'):
  38. self.encoding = encoding
  39. """Text encoding for the input source."""
  40. self.error_handler = error_handler
  41. """Text decoding error handler."""
  42. self.source = source
  43. """The source of input data."""
  44. self.source_path = source_path
  45. """A text reference to the source."""
  46. if not source_path:
  47. self.source_path = self.default_source_path
  48. self.successful_encoding = None
  49. """The encoding that successfully decoded the source data."""
  50. def __repr__(self):
  51. return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
  52. self.source_path)
  53. def read(self):
  54. raise NotImplementedError
  55. def decode(self, data):
  56. """
  57. Decode a string, `data`, heuristically.
  58. Raise UnicodeError if unsuccessful.
  59. The client application should call ``locale.setlocale`` at the
  60. beginning of processing::
  61. locale.setlocale(locale.LC_ALL, '')
  62. """
  63. if self.encoding and self.encoding.lower() == 'unicode':
  64. assert isinstance(data, str), (
  65. 'input encoding is "unicode" '
  66. 'but input is not a unicode object')
  67. if isinstance(data, str):
  68. # Accept unicode even if self.encoding != 'unicode'.
  69. return data
  70. if self.encoding:
  71. # We believe the user/application when the encoding is
  72. # explicitly given.
  73. encodings = [self.encoding]
  74. else:
  75. data_encoding = self.determine_encoding_from_data(data)
  76. if data_encoding:
  77. # If the data declares its encoding (explicitly or via a BOM),
  78. # we believe it.
  79. encodings = [data_encoding]
  80. else:
  81. # Apply heuristics only if no encoding is explicitly given and
  82. # no BOM found. Start with UTF-8, because that only matches
  83. # data that *IS* UTF-8:
  84. encodings = ['utf-8', 'latin-1']
  85. if locale_encoding:
  86. encodings.insert(1, locale_encoding)
  87. for enc in encodings:
  88. try:
  89. decoded = str(data, enc, self.error_handler)
  90. self.successful_encoding = enc
  91. # Return decoded, removing BOMs.
  92. return decoded.replace('\ufeff', '')
  93. except (UnicodeError, LookupError) as err:
  94. error = err # in Python 3, the <exception instance> is
  95. # local to the except clause
  96. raise UnicodeError(
  97. 'Unable to decode input data. Tried the following encodings: '
  98. '%s.\n(%s)' % (', '.join([repr(enc) for enc in encodings]),
  99. ErrorString(error)))
  100. coding_slug = re.compile(b("coding[:=]\s*([-\w.]+)"))
  101. """Encoding declaration pattern."""
  102. byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5
  103. (codecs.BOM_UTF16_BE, 'utf-16-be'),
  104. (codecs.BOM_UTF16_LE, 'utf-16-le'),)
  105. """Sequence of (start_bytes, encoding) tuples for encoding detection.
  106. The first bytes of input data are checked against the start_bytes strings.
  107. A match indicates the given encoding."""
  108. def determine_encoding_from_data(self, data):
  109. """
  110. Try to determine the encoding of `data` by looking *in* `data`.
  111. Check for a byte order mark (BOM) or an encoding declaration.
  112. """
  113. # check for a byte order mark:
  114. for start_bytes, encoding in self.byte_order_marks:
  115. if data.startswith(start_bytes):
  116. return encoding
  117. # check for an encoding declaration pattern in first 2 lines of file:
  118. for line in data.splitlines()[:2]:
  119. match = self.coding_slug.search(line)
  120. if match:
  121. return match.group(1).decode('ascii')
  122. return None
  123. class Output(TransformSpec):
  124. """
  125. Abstract base class for output wrappers.
  126. """
  127. component_type = 'output'
  128. default_destination_path = None
  129. def __init__(self, destination=None, destination_path=None,
  130. encoding=None, error_handler='strict'):
  131. self.encoding = encoding
  132. """Text encoding for the output destination."""
  133. self.error_handler = error_handler or 'strict'
  134. """Text encoding error handler."""
  135. self.destination = destination
  136. """The destination for output data."""
  137. self.destination_path = destination_path
  138. """A text reference to the destination."""
  139. if not destination_path:
  140. self.destination_path = self.default_destination_path
  141. def __repr__(self):
  142. return ('%s: destination=%r, destination_path=%r'
  143. % (self.__class__, self.destination, self.destination_path))
  144. def write(self, data):
  145. """`data` is a Unicode string, to be encoded by `self.encode`."""
  146. raise NotImplementedError
  147. def encode(self, data):
  148. if self.encoding and self.encoding.lower() == 'unicode':
  149. assert isinstance(data, str), (
  150. 'the encoding given is "unicode" but the output is not '
  151. 'a Unicode string')
  152. return data
  153. if not isinstance(data, str):
  154. # Non-unicode (e.g. binary) output.
  155. return data
  156. else:
  157. return data.encode(self.encoding, self.error_handler)
  158. class FileInput(Input):
  159. """
  160. Input for single, simple file-like objects.
  161. """
  162. def __init__(self, source=None, source_path=None,
  163. encoding=None, error_handler='strict',
  164. autoclose=True, handle_io_errors=True, mode='rU'):
  165. """
  166. :Parameters:
  167. - `source`: either a file-like object (which is read directly), or
  168. `None` (which implies `sys.stdin` if no `source_path` given).
  169. - `source_path`: a path to a file, which is opened and then read.
  170. - `encoding`: the expected text encoding of the input file.
  171. - `error_handler`: the encoding error handler to use.
  172. - `autoclose`: close automatically after read (except when
  173. `sys.stdin` is the source).
  174. - `handle_io_errors`: summarize I/O errors here, and exit?
  175. - `mode`: how the file is to be opened (see standard function
  176. `open`). The default 'rU' provides universal newline support
  177. for text files.
  178. """
  179. Input.__init__(self, source, source_path, encoding, error_handler)
  180. self.autoclose = autoclose
  181. self.handle_io_errors = handle_io_errors
  182. self._stderr = ErrorOutput()
  183. if source is None:
  184. if source_path:
  185. # Specify encoding in Python 3
  186. if sys.version_info >= (3,0):
  187. kwargs = {'encoding': self.encoding,
  188. 'errors': self.error_handler}
  189. else:
  190. kwargs = {}
  191. try:
  192. self.source = open(source_path, mode, **kwargs)
  193. except IOError as error:
  194. if handle_io_errors:
  195. print(ErrorString(error), file=self._stderr)
  196. print((
  197. 'Unable to open source file for reading ("%s").'
  198. 'Exiting.' % source_path), file=self._stderr)
  199. sys.exit(1)
  200. raise InputError(error.errno, error.strerror, source_path)
  201. else:
  202. self.source = sys.stdin
  203. elif (sys.version_info >= (3,0) and
  204. check_encoding(self.source, self.encoding) is False):
  205. # TODO: re-open, warn or raise error?
  206. raise UnicodeError('Encoding clash: encoding given is "%s" '
  207. 'but source is opened with encoding "%s".' %
  208. (self.encoding, self.source.encoding))
  209. if not source_path:
  210. try:
  211. self.source_path = self.source.name
  212. except AttributeError:
  213. pass
  214. def read(self):
  215. """
  216. Read and decode a single file and return the data (Unicode string).
  217. """
  218. try: # In Python < 2.5, try...except has to be nested in try...finally.
  219. try:
  220. if self.source is sys.stdin and sys.version_info >= (3,0):
  221. # read as binary data to circumvent auto-decoding
  222. data = self.source.buffer.read()
  223. # normalize newlines
  224. data = b('\n').join(data.splitlines()) + b('\n')
  225. else:
  226. data = self.source.read()
  227. except (UnicodeError, LookupError) as err: # (in Py3k read() decodes)
  228. if not self.encoding and self.source_path:
  229. # re-read in binary mode and decode with heuristics
  230. b_source = open(self.source_path, 'rb')
  231. data = b_source.read()
  232. b_source.close()
  233. # normalize newlines
  234. data = b('\n').join(data.splitlines()) + b('\n')
  235. else:
  236. raise
  237. finally:
  238. if self.autoclose:
  239. self.close()
  240. return self.decode(data)
  241. def readlines(self):
  242. """
  243. Return lines of a single file as list of Unicode strings.
  244. """
  245. return self.read().splitlines(True)
  246. def close(self):
  247. if self.source is not sys.stdin:
  248. self.source.close()
  249. class FileOutput(Output):
  250. """
  251. Output for single, simple file-like objects.
  252. """
  253. mode = 'w'
  254. """The mode argument for `open()`."""
  255. # 'wb' for binary (e.g. OpenOffice) files.
  256. # (Do not use binary mode ('wb') for text files, as this prevents the
  257. # conversion of newlines to the system specific default.)
  258. def __init__(self, destination=None, destination_path=None,
  259. encoding=None, error_handler='strict', autoclose=True,
  260. handle_io_errors=True, mode=None):
  261. """
  262. :Parameters:
  263. - `destination`: either a file-like object (which is written
  264. directly) or `None` (which implies `sys.stdout` if no
  265. `destination_path` given).
  266. - `destination_path`: a path to a file, which is opened and then
  267. written.
  268. - `encoding`: the text encoding of the output file.
  269. - `error_handler`: the encoding error handler to use.
  270. - `autoclose`: close automatically after write (except when
  271. `sys.stdout` or `sys.stderr` is the destination).
  272. - `handle_io_errors`: summarize I/O errors here, and exit?
  273. - `mode`: how the file is to be opened (see standard function
  274. `open`). The default is 'w', providing universal newline
  275. support for text files.
  276. """
  277. Output.__init__(self, destination, destination_path,
  278. encoding, error_handler)
  279. self.opened = True
  280. self.autoclose = autoclose
  281. self.handle_io_errors = handle_io_errors
  282. if mode is not None:
  283. self.mode = mode
  284. self._stderr = ErrorOutput()
  285. if destination is None:
  286. if destination_path:
  287. self.opened = False
  288. else:
  289. self.destination = sys.stdout
  290. elif (# destination is file-type object -> check mode:
  291. mode and hasattr(self.destination, 'mode')
  292. and mode != self.destination.mode):
  293. print(('Destination mode "%s" '
  294. 'differs from specified mode "%s"' %
  295. (self.destination.mode, mode)), file=self._stderr)
  296. if not destination_path:
  297. try:
  298. self.destination_path = self.destination.name
  299. except AttributeError:
  300. pass
  301. # Special cases under Python 3: different encoding or binary output
  302. if sys.version_info >= (3,0):
  303. if ('b' in self.mode
  304. and self.destination in (sys.stdout, sys.stderr)
  305. ):
  306. self.destination = self.destination.buffer
  307. if check_encoding(self.destination, self.encoding) is False:
  308. if self.destination in (sys.stdout, sys.stderr):
  309. self.destination = self.destination.buffer
  310. else: # TODO: try the `write to .buffer` scheme instead?
  311. raise ValueError('Encoding of %s (%s) differs \n'
  312. ' from specified encoding (%s)' %
  313. (self.destination_path or 'destination',
  314. destination.encoding, encoding))
  315. def open(self):
  316. # Specify encoding in Python 3.
  317. if sys.version_info >= (3,0):
  318. kwargs = {'encoding': self.encoding,
  319. 'errors': self.error_handler}
  320. else:
  321. kwargs = {}
  322. try:
  323. self.destination = open(self.destination_path, self.mode, **kwargs)
  324. except IOError as error:
  325. if self.handle_io_errors:
  326. print(ErrorString(error), file=self._stderr)
  327. print(('Unable to open destination file'
  328. " for writing ('%s'). Exiting." % self.destination_path), file=self._stderr)
  329. sys.exit(1)
  330. raise OutputError(error.errno, error.strerror,
  331. self.destination_path)
  332. self.opened = True
  333. def write(self, data):
  334. """Encode `data`, write it to a single file, and return it.
  335. With Python 3 or binary output mode, `data` is returned unchanged,
  336. except when specified encoding and output encoding differ.
  337. """
  338. if not self.opened:
  339. self.open()
  340. try: # In Python < 2.5, try...except has to be nested in try...finally.
  341. try:
  342. if 'b' not in self.mode and (sys.version_info < (3,0) or
  343. check_encoding(self.destination, self.encoding) is False):
  344. data = self.encode(data)
  345. if sys.version_info >= (3,0) and os.linesep != '\n':
  346. # writing as binary data -> fix endings
  347. data = data.replace('\n', os.linesep)
  348. self.destination.write(data)
  349. except (UnicodeError, LookupError) as err:
  350. raise UnicodeError(
  351. 'Unable to encode output data. output-encoding is: '
  352. '%s.\n(%s)' % (self.encoding, ErrorString(err)))
  353. finally:
  354. if self.autoclose:
  355. self.close()
  356. return data
  357. def close(self):
  358. if self.destination not in (sys.stdout, sys.stderr):
  359. self.destination.close()
  360. self.opened = False
  361. class BinaryFileOutput(FileOutput):
  362. """
  363. A version of docutils.io.FileOutput which writes to a binary file.
  364. """
  365. # Used by core.publish_cmdline_to_binary() which in turn is used by
  366. # rst2odt (OpenOffice writer)
  367. mode = 'wb'
  368. class StringInput(Input):
  369. """
  370. Direct string input.
  371. """
  372. default_source_path = '<string>'
  373. def read(self):
  374. """Decode and return the source string."""
  375. return self.decode(self.source)
  376. class StringOutput(Output):
  377. """
  378. Direct string output.
  379. """
  380. default_destination_path = '<string>'
  381. def write(self, data):
  382. """Encode `data`, store it in `self.destination`, and return it."""
  383. self.destination = self.encode(data)
  384. return self.destination
  385. class NullInput(Input):
  386. """
  387. Degenerate input: read nothing.
  388. """
  389. default_source_path = 'null input'
  390. def read(self):
  391. """Return a null string."""
  392. return ''
  393. class NullOutput(Output):
  394. """
  395. Degenerate output: write nothing.
  396. """
  397. default_destination_path = 'null output'
  398. def write(self, data):
  399. """Do nothing ([don't even] send data to the bit bucket)."""
  400. pass
  401. class DocTreeInput(Input):
  402. """
  403. Adapter for document tree input.
  404. The document tree must be passed in the ``source`` parameter.
  405. """
  406. default_source_path = 'doctree input'
  407. def read(self):
  408. """Return the document tree."""
  409. return self.source