PageRenderTime 44ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk/corpus/reader/xmldocs.py

https://github.com/BrucePHill/nltk
Python | 385 lines | 319 code | 10 blank | 56 comment | 20 complexity | d6a10684ca5e1add31006154daa69f38 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: XML Corpus Reader
  2. #
  3. # Copyright (C) 2001-2013 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. Corpus reader for corpora whose documents are xml files.
  9. (note -- not named 'xml' to avoid conflicting w/ standard xml package)
  10. """
  11. from __future__ import print_function, unicode_literals
  12. import codecs
  13. # Use the c version of ElementTree, which is faster, if possible:
  14. try: from xml.etree import cElementTree as ElementTree
  15. except ImportError: from xml.etree import ElementTree
  16. from nltk import compat
  17. from nltk.data import SeekableUnicodeStreamReader
  18. from nltk.tokenize import WordPunctTokenizer
  19. from nltk.internals import ElementWrapper
  20. from nltk.corpus.reader.api import CorpusReader
  21. from nltk.corpus.reader.util import *
  22. class XMLCorpusReader(CorpusReader):
  23. """
  24. Corpus reader for corpora whose documents are xml files.
  25. Note that the ``XMLCorpusReader`` constructor does not take an
  26. ``encoding`` argument, because the unicode encoding is specified by
  27. the XML files themselves. See the XML specs for more info.
  28. """
  29. def __init__(self, root, fileids, wrap_etree=False):
  30. self._wrap_etree = wrap_etree
  31. CorpusReader.__init__(self, root, fileids)
  32. def xml(self, fileid=None):
  33. # Make sure we have exactly one file -- no concatenating XML.
  34. if fileid is None and len(self._fileids) == 1:
  35. fileid = self._fileids[0]
  36. if not isinstance(fileid, compat.string_types):
  37. raise TypeError('Expected a single file identifier string')
  38. # Read the XML in using ElementTree.
  39. elt = ElementTree.parse(self.abspath(fileid).open()).getroot()
  40. # If requested, wrap it.
  41. if self._wrap_etree:
  42. elt = ElementWrapper(elt)
  43. # Return the ElementTree element.
  44. return elt
  45. def words(self, fileid=None):
  46. """
  47. Returns all of the words and punctuation symbols in the specified file
  48. that were in text nodes -- ie, tags are ignored. Like the xml() method,
  49. fileid can only specify one file.
  50. :return: the given file's text nodes as a list of words and punctuation symbols
  51. :rtype: list(str)
  52. """
  53. elt = self.xml(fileid)
  54. encoding = self.encoding(fileid)
  55. word_tokenizer=WordPunctTokenizer()
  56. iterator = elt.getiterator()
  57. out = []
  58. for node in iterator:
  59. text = node.text
  60. if text is not None:
  61. if isinstance(text, bytes):
  62. text = text.decode(encoding)
  63. toks = word_tokenizer.tokenize(text)
  64. out.extend(toks)
  65. return out
  66. def raw(self, fileids=None):
  67. if fileids is None: fileids = self._fileids
  68. elif isinstance(fileids, compat.string_types): fileids = [fileids]
  69. return concat([self.open(f).read() for f in fileids])
  70. class XMLCorpusView(StreamBackedCorpusView):
  71. """
  72. A corpus view that selects out specified elements from an XML
  73. file, and provides a flat list-like interface for accessing them.
  74. (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
  75. but may be used by subclasses of ``XMLCorpusReader``.)
  76. Every XML corpus view has a "tag specification", indicating what
  77. XML elements should be included in the view; and each (non-nested)
  78. element that matches this specification corresponds to one item in
  79. the view. Tag specifications are regular expressions over tag
  80. paths, where a tag path is a list of element tag names, separated
  81. by '/', indicating the ancestry of the element. Some examples:
  82. - ``'foo'``: A top-level element whose tag is ``foo``.
  83. - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
  84. is a top-level element whose tag is ``foo``.
  85. - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
  86. in the xml tree.
  87. - ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``,
  88. appearing anywhere in the xml tree.
  89. The view items are generated from the selected XML elements via
  90. the method ``handle_elt()``. By default, this method returns the
  91. element as-is (i.e., as an ElementTree object); but it can be
  92. overridden, either via subclassing or via the ``elt_handler``
  93. constructor parameter.
  94. """
  95. #: If true, then display debugging output to stdout when reading
  96. #: blocks.
  97. _DEBUG = False
  98. #: The number of characters read at a time by this corpus reader.
  99. _BLOCK_SIZE = 1024
  100. def __init__(self, fileid, tagspec, elt_handler=None):
  101. """
  102. Create a new corpus view based on a specified XML file.
  103. Note that the ``XMLCorpusView`` constructor does not take an
  104. ``encoding`` argument, because the unicode encoding is
  105. specified by the XML files themselves.
  106. :type tagspec: str
  107. :param tagspec: A tag specification, indicating what XML
  108. elements should be included in the view. Each non-nested
  109. element that matches this specification corresponds to one
  110. item in the view.
  111. :param elt_handler: A function used to transform each element
  112. to a value for the view. If no handler is specified, then
  113. ``self.handle_elt()`` is called, which returns the element
  114. as an ElementTree object. The signature of elt_handler is::
  115. elt_handler(elt, tagspec) -> value
  116. """
  117. if elt_handler: self.handle_elt = elt_handler
  118. self._tagspec = re.compile(tagspec+r'\Z')
  119. """The tag specification for this corpus view."""
  120. self._tag_context = {0: ()}
  121. """A dictionary mapping from file positions (as returned by
  122. ``stream.seek()`` to XML contexts. An XML context is a
  123. tuple of XML tag names, indicating which tags have not yet
  124. been closed."""
  125. encoding = self._detect_encoding(fileid)
  126. StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
  127. def _detect_encoding(self, fileid):
  128. if isinstance(fileid, PathPointer):
  129. s = fileid.open().readline()
  130. else:
  131. with open(fileid, 'rb') as fp:
  132. s = fp.readline()
  133. if s.startswith(codecs.BOM_UTF16_BE):
  134. return 'utf-16-be'
  135. if s.startswith(codecs.BOM_UTF16_LE):
  136. return 'utf-16-le'
  137. if s.startswith(codecs.BOM_UTF32_BE):
  138. return 'utf-32-be'
  139. if s.startswith(codecs.BOM_UTF32_LE):
  140. return 'utf-32-le'
  141. if s.startswith(codecs.BOM_UTF8):
  142. return 'utf-8'
  143. m = re.match(br'\s*<?xml\b.*\bencoding="([^"]+)"', s)
  144. if m: return m.group(1)
  145. m = re.match(br"\s*<?xml\b.*\bencoding='([^']+)'", s)
  146. if m: return m.group(1)
  147. # No encoding found -- what should the default be?
  148. return 'utf-8'
  149. def handle_elt(self, elt, context):
  150. """
  151. Convert an element into an appropriate value for inclusion in
  152. the view. Unless overridden by a subclass or by the
  153. ``elt_handler`` constructor argument, this method simply
  154. returns ``elt``.
  155. :return: The view value corresponding to ``elt``.
  156. :type elt: ElementTree
  157. :param elt: The element that should be converted.
  158. :type context: str
  159. :param context: A string composed of element tags separated by
  160. forward slashes, indicating the XML context of the given
  161. element. For example, the string ``'foo/bar/baz'``
  162. indicates that the element is a ``baz`` element whose
  163. parent is a ``bar`` element and whose grandparent is a
  164. top-level ``foo`` element.
  165. """
  166. return elt
  167. #: A regular expression that matches XML fragments that do not
  168. #: contain any un-closed tags.
  169. _VALID_XML_RE = re.compile(r"""
  170. [^<]*
  171. (
  172. ((<!--.*?-->) | # comment
  173. (<![CDATA[.*?]]) | # raw character data
  174. (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) | # doctype decl
  175. (<[^>]*>)) # tag or PI
  176. [^<]*)*
  177. \Z""",
  178. re.DOTALL|re.VERBOSE)
  179. #: A regular expression used to extract the tag name from a start tag,
  180. #: end tag, or empty-elt tag string.
  181. _XML_TAG_NAME = re.compile('<\s*/?\s*([^\s>]+)')
  182. #: A regular expression used to find all start-tags, end-tags, and
  183. #: emtpy-elt tags in an XML file. This regexp is more lenient than
  184. #: the XML spec -- e.g., it allows spaces in some places where the
  185. #: spec does not.
  186. _XML_PIECE = re.compile(r"""
  187. # Include these so we can skip them:
  188. (?P<COMMENT> <!--.*?--> )|
  189. (?P<CDATA> <![CDATA[.*?]]> )|
  190. (?P<PI> <\?.*?\?> )|
  191. (?P<DOCTYPE> <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
  192. # These are the ones we actually care about:
  193. (?P<EMPTY_ELT_TAG> <\s*[^>/\?!\s][^>]*/\s*> )|
  194. (?P<START_TAG> <\s*[^>/\?!\s][^>]*> )|
  195. (?P<END_TAG> <\s*/[^>/\?!\s][^>]*> )""",
  196. re.DOTALL|re.VERBOSE)
  197. def _read_xml_fragment(self, stream):
  198. """
  199. Read a string from the given stream that does not contain any
  200. un-closed tags. In particular, this function first reads a
  201. block from the stream of size ``self._BLOCK_SIZE``. It then
  202. checks if that block contains an un-closed tag. If it does,
  203. then this function either backtracks to the last '<', or reads
  204. another block.
  205. """
  206. fragment = ''
  207. while True:
  208. if isinstance(stream, SeekableUnicodeStreamReader):
  209. startpos = stream.tell()
  210. # Read a block and add it to the fragment.
  211. xml_block = stream.read(self._BLOCK_SIZE)
  212. fragment += xml_block
  213. # Do we have a well-formed xml fragment?
  214. if self._VALID_XML_RE.match(fragment):
  215. return fragment
  216. # Do we have a fragment that will never be well-formed?
  217. if re.search('[<>]', fragment).group(0) == '>':
  218. pos = stream.tell() - (
  219. len(fragment)-re.search('[<>]', fragment).end())
  220. raise ValueError('Unexpected ">" near char %s' % pos)
  221. # End of file?
  222. if not xml_block:
  223. raise ValueError('Unexpected end of file: tag not closed')
  224. # If not, then we must be in the middle of a <..tag..>.
  225. # If appropriate, backtrack to the most recent '<'
  226. # character.
  227. last_open_bracket = fragment.rfind('<')
  228. if last_open_bracket > 0:
  229. if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
  230. if isinstance(stream, SeekableUnicodeStreamReader):
  231. stream.seek(startpos)
  232. stream.char_seek_forward(last_open_bracket)
  233. else:
  234. stream.seek(-(len(fragment)-last_open_bracket), 1)
  235. return fragment[:last_open_bracket]
  236. # Otherwise, read another block. (i.e., return to the
  237. # top of the loop.)
  238. def read_block(self, stream, tagspec=None, elt_handler=None):
  239. """
  240. Read from ``stream`` until we find at least one element that
  241. matches ``tagspec``, and return the result of applying
  242. ``elt_handler`` to each element found.
  243. """
  244. if tagspec is None: tagspec = self._tagspec
  245. if elt_handler is None: elt_handler = self.handle_elt
  246. # Use a stack of strings to keep track of our context:
  247. context = list(self._tag_context.get(stream.tell()))
  248. assert context is not None # check this -- could it ever happen?
  249. elts = []
  250. elt_start = None # where does the elt start
  251. elt_depth = None # what context depth
  252. elt_text = ''
  253. while elts==[] or elt_start is not None:
  254. if isinstance(stream, SeekableUnicodeStreamReader):
  255. startpos = stream.tell()
  256. xml_fragment = self._read_xml_fragment(stream)
  257. # End of file.
  258. if not xml_fragment:
  259. if elt_start is None: break
  260. else: raise ValueError('Unexpected end of file')
  261. # Process each <tag> in the xml fragment.
  262. for piece in self._XML_PIECE.finditer(xml_fragment):
  263. if self._DEBUG:
  264. print('%25s %s' % ('/'.join(context)[-20:], piece.group()))
  265. if piece.group('START_TAG'):
  266. name = self._XML_TAG_NAME.match(piece.group()).group(1)
  267. # Keep context up-to-date.
  268. context.append(name)
  269. # Is this one of the elts we're looking for?
  270. if elt_start is None:
  271. if re.match(tagspec, '/'.join(context)):
  272. elt_start = piece.start()
  273. elt_depth = len(context)
  274. elif piece.group('END_TAG'):
  275. name = self._XML_TAG_NAME.match(piece.group()).group(1)
  276. # sanity checks:
  277. if not context:
  278. raise ValueError('Unmatched tag </%s>' % name)
  279. if name != context[-1]:
  280. raise ValueError('Unmatched tag <%s>...</%s>' %
  281. (context[-1], name))
  282. # Is this the end of an element?
  283. if elt_start is not None and elt_depth == len(context):
  284. elt_text += xml_fragment[elt_start:piece.end()]
  285. elts.append( (elt_text, '/'.join(context)) )
  286. elt_start = elt_depth = None
  287. elt_text = ''
  288. # Keep context up-to-date
  289. context.pop()
  290. elif piece.group('EMPTY_ELT_TAG'):
  291. name = self._XML_TAG_NAME.match(piece.group()).group(1)
  292. if elt_start is None:
  293. if re.match(tagspec, '/'.join(context)+'/'+name):
  294. elts.append((piece.group(),
  295. '/'.join(context)+'/'+name))
  296. if elt_start is not None:
  297. # If we haven't found any elements yet, then keep
  298. # looping until we do.
  299. if elts == []:
  300. elt_text += xml_fragment[elt_start:]
  301. elt_start = 0
  302. # If we've found at least one element, then try
  303. # backtracking to the start of the element that we're
  304. # inside of.
  305. else:
  306. # take back the last start-tag, and return what
  307. # we've gotten so far (elts is non-empty).
  308. if self._DEBUG:
  309. print(' '*36+'(backtrack)')
  310. if isinstance(stream, SeekableUnicodeStreamReader):
  311. stream.seek(startpos)
  312. stream.char_seek_forward(elt_start)
  313. else:
  314. stream.seek(-(len(xml_fragment)-elt_start), 1)
  315. context = context[:elt_depth-1]
  316. elt_start = elt_depth = None
  317. elt_text = ''
  318. # Update the _tag_context dict.
  319. pos = stream.tell()
  320. if pos in self._tag_context:
  321. assert tuple(context) == self._tag_context[pos]
  322. else:
  323. self._tag_context[pos] = tuple(context)
  324. return [elt_handler(ElementTree.fromstring(
  325. elt.encode('ascii', 'xmlcharrefreplace')),
  326. context)
  327. for (elt, context) in elts]