PageRenderTime 46ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk/corpus/reader/xmldocs.py

https://github.com/haewoon/nltk
Python | 380 lines | 317 code | 10 blank | 53 comment | 20 complexity | f48ac8a5def7a2740d7c5652619f1d89 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: XML Corpus Reader
  2. #
  3. # Copyright (C) 2001-2012 NLTK Project
  4. # Author: Steven Bird <sb@csse.unimelb.edu.au>
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. Corpus reader for corpora whose documents are xml files.
  9. (note -- not named 'xml' to avoid conflicting w/ standard xml package)
  10. """
  11. import codecs
  12. # Use the c version of ElementTree, which is faster, if possible:
  13. try: from xml.etree import cElementTree as ElementTree
  14. except ImportError: from xml.etree import ElementTree
  15. from nltk.data import SeekableUnicodeStreamReader
  16. from nltk.tokenize import WordPunctTokenizer
  17. from nltk.internals import ElementWrapper
  18. from nltk.corpus.reader.api import CorpusReader
  19. from nltk.corpus.reader.util import *
  20. class XMLCorpusReader(CorpusReader):
  21. """
  22. Corpus reader for corpora whose documents are xml files.
  23. Note that the ``XMLCorpusReader`` constructor does not take an
  24. ``encoding`` argument, because the unicode encoding is specified by
  25. the XML files themselves. See the XML specs for more info.
  26. """
  27. def __init__(self, root, fileids, wrap_etree=False):
  28. self._wrap_etree = wrap_etree
  29. CorpusReader.__init__(self, root, fileids)
  30. def xml(self, fileid=None):
  31. # Make sure we have exactly one file -- no concatenating XML.
  32. if fileid is None and len(self._fileids) == 1:
  33. fileid = self._fileids[0]
  34. if not isinstance(fileid, basestring):
  35. raise TypeError('Expected a single file identifier string')
  36. # Read the XML in using ElementTree.
  37. elt = ElementTree.parse(self.abspath(fileid).open()).getroot()
  38. # If requested, wrap it.
  39. if self._wrap_etree:
  40. elt = ElementWrapper(elt)
  41. # Return the ElementTree element.
  42. return elt
  43. def words(self, fileid=None):
  44. """
  45. Returns all of the words and punctuation symbols in the specified file
  46. that were in text nodes -- ie, tags are ignored. Like the xml() method,
  47. fileid can only specify one file.
  48. :return: the given file's text nodes as a list of words and punctuation symbols
  49. :rtype: list(str)
  50. """
  51. elt = self.xml(fileid)
  52. word_tokenizer=WordPunctTokenizer()
  53. iterator = elt.getiterator()
  54. out = []
  55. for node in iterator:
  56. text = node.text
  57. if text is not None:
  58. toks = word_tokenizer.tokenize(text)
  59. out.extend(toks)
  60. return out
  61. def raw(self, fileids=None):
  62. if fileids is None: fileids = self._fileids
  63. elif isinstance(fileids, basestring): fileids = [fileids]
  64. return concat([self.open(f).read() for f in fileids])
  65. class XMLCorpusView(StreamBackedCorpusView):
  66. """
  67. A corpus view that selects out specified elements from an XML
  68. file, and provides a flat list-like interface for accessing them.
  69. (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
  70. but may be used by subclasses of ``XMLCorpusReader``.)
  71. Every XML corpus view has a "tag specification", indicating what
  72. XML elements should be included in the view; and each (non-nested)
  73. element that matches this specification corresponds to one item in
  74. the view. Tag specifications are regular expressions over tag
  75. paths, where a tag path is a list of element tag names, separated
  76. by '/', indicating the ancestry of the element. Some examples:
  77. - ``'foo'``: A top-level element whose tag is ``foo``.
  78. - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
  79. is a top-level element whose tag is ``foo``.
  80. - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
  81. in the xml tree.
  82. - ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``,
  83. appearing anywhere in the xml tree.
  84. The view items are generated from the selected XML elements via
  85. the method ``handle_elt()``. By default, this method returns the
  86. element as-is (i.e., as an ElementTree object); but it can be
  87. overridden, either via subclassing or via the ``elt_handler``
  88. constructor parameter.
  89. """
  90. #: If true, then display debugging output to stdout when reading
  91. #: blocks.
  92. _DEBUG = False
  93. #: The number of characters read at a time by this corpus reader.
  94. _BLOCK_SIZE = 1024
  95. def __init__(self, fileid, tagspec, elt_handler=None):
  96. """
  97. Create a new corpus view based on a specified XML file.
  98. Note that the ``XMLCorpusView`` constructor does not take an
  99. ``encoding`` argument, because the unicode encoding is
  100. specified by the XML files themselves.
  101. :type tagspec: str
  102. :param tagspec: A tag specification, indicating what XML
  103. elements should be included in the view. Each non-nested
  104. element that matches this specification corresponds to one
  105. item in the view.
  106. :param elt_handler: A function used to transform each element
  107. to a value for the view. If no handler is specified, then
  108. ``self.handle_elt()`` is called, which returns the element
  109. as an ElementTree object. The signature of elt_handler is::
  110. elt_handler(elt, tagspec) -> value
  111. """
  112. if elt_handler: self.handle_elt = elt_handler
  113. self._tagspec = re.compile(tagspec+r'\Z')
  114. """The tag specification for this corpus view."""
  115. self._tag_context = {0: ()}
  116. """A dictionary mapping from file positions (as returned by
  117. ``stream.seek()`` to XML contexts. An XML context is a
  118. tuple of XML tag names, indicating which tags have not yet
  119. been closed."""
  120. encoding = self._detect_encoding(fileid)
  121. StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
  122. def _detect_encoding(self, fileid):
  123. if isinstance(fileid, PathPointer):
  124. s = fileid.open().readline()
  125. else:
  126. s = open(fileid, 'rb').readline()
  127. if s.startswith(codecs.BOM_UTF16_BE):
  128. return 'utf-16-be'
  129. if s.startswith(codecs.BOM_UTF16_LE):
  130. return 'utf-16-le'
  131. if s.startswith(codecs.BOM_UTF32_BE):
  132. return 'utf-32-be'
  133. if s.startswith(codecs.BOM_UTF32_LE):
  134. return 'utf-32-le'
  135. if s.startswith(codecs.BOM_UTF8):
  136. return 'utf-8'
  137. m = re.match(r'\s*<?xml\b.*\bencoding="([^"]+)"', s)
  138. if m: return m.group(1)
  139. m = re.match(r"\s*<?xml\b.*\bencoding='([^']+)'", s)
  140. if m: return m.group(1)
  141. # No encoding found -- what should the default be?
  142. return 'utf-8'
  143. def handle_elt(self, elt, context):
  144. """
  145. Convert an element into an appropriate value for inclusion in
  146. the view. Unless overridden by a subclass or by the
  147. ``elt_handler`` constructor argument, this method simply
  148. returns ``elt``.
  149. :return: The view value corresponding to ``elt``.
  150. :type elt: ElementTree
  151. :param elt: The element that should be converted.
  152. :type context: str
  153. :param context: A string composed of element tags separated by
  154. forward slashes, indicating the XML context of the given
  155. element. For example, the string ``'foo/bar/baz'``
  156. indicates that the element is a ``baz`` element whose
  157. parent is a ``bar`` element and whose grandparent is a
  158. top-level ``foo`` element.
  159. """
  160. return elt
  161. #: A regular expression that matches XML fragments that do not
  162. #: contain any un-closed tags.
  163. _VALID_XML_RE = re.compile(r"""
  164. [^<]*
  165. (
  166. ((<!--.*?-->) | # comment
  167. (<![CDATA[.*?]]) | # raw character data
  168. (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) | # doctype decl
  169. (<[^>]*>)) # tag or PI
  170. [^<]*)*
  171. \Z""",
  172. re.DOTALL|re.VERBOSE)
  173. #: A regular expression used to extract the tag name from a start tag,
  174. #: end tag, or empty-elt tag string.
  175. _XML_TAG_NAME = re.compile('<\s*/?\s*([^\s>]+)')
  176. #: A regular expression used to find all start-tags, end-tags, and
  177. #: emtpy-elt tags in an XML file. This regexp is more lenient than
  178. #: the XML spec -- e.g., it allows spaces in some places where the
  179. #: spec does not.
  180. _XML_PIECE = re.compile(r"""
  181. # Include these so we can skip them:
  182. (?P<COMMENT> <!--.*?--> )|
  183. (?P<CDATA> <![CDATA[.*?]]> )|
  184. (?P<PI> <\?.*?\?> )|
  185. (?P<DOCTYPE> <!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*> )|
  186. # These are the ones we actually care about:
  187. (?P<EMPTY_ELT_TAG> <\s*[^>/\?!\s][^>]*/\s*> )|
  188. (?P<START_TAG> <\s*[^>/\?!\s][^>]*> )|
  189. (?P<END_TAG> <\s*/[^>/\?!\s][^>]*> )""",
  190. re.DOTALL|re.VERBOSE)
  191. def _read_xml_fragment(self, stream):
  192. """
  193. Read a string from the given stream that does not contain any
  194. un-closed tags. In particular, this function first reads a
  195. block from the stream of size ``self._BLOCK_SIZE``. It then
  196. checks if that block contains an un-closed tag. If it does,
  197. then this function either backtracks to the last '<', or reads
  198. another block.
  199. """
  200. fragment = ''
  201. while True:
  202. if isinstance(stream, SeekableUnicodeStreamReader):
  203. startpos = stream.tell()
  204. # Read a block and add it to the fragment.
  205. xml_block = stream.read(self._BLOCK_SIZE)
  206. fragment += xml_block
  207. # Do we have a well-formed xml fragment?
  208. if self._VALID_XML_RE.match(fragment):
  209. return fragment
  210. # Do we have a fragment that will never be well-formed?
  211. if re.search('[<>]', fragment).group(0) == '>':
  212. pos = stream.tell() - (
  213. len(fragment)-re.search('[<>]', fragment).end())
  214. raise ValueError('Unexpected ">" near char %s' % pos)
  215. # End of file?
  216. if not xml_block:
  217. raise ValueError('Unexpected end of file: tag not closed')
  218. # If not, then we must be in the middle of a <..tag..>.
  219. # If appropriate, backtrack to the most recent '<'
  220. # character.
  221. last_open_bracket = fragment.rfind('<')
  222. if last_open_bracket > 0:
  223. if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
  224. if isinstance(stream, SeekableUnicodeStreamReader):
  225. stream.seek(startpos)
  226. stream.char_seek_forward(last_open_bracket)
  227. else:
  228. stream.seek(-(len(fragment)-last_open_bracket), 1)
  229. return fragment[:last_open_bracket]
  230. # Otherwise, read another block. (i.e., return to the
  231. # top of the loop.)
  232. def read_block(self, stream, tagspec=None, elt_handler=None):
  233. """
  234. Read from ``stream`` until we find at least one element that
  235. matches ``tagspec``, and return the result of applying
  236. ``elt_handler`` to each element found.
  237. """
  238. if tagspec is None: tagspec = self._tagspec
  239. if elt_handler is None: elt_handler = self.handle_elt
  240. # Use a stack of strings to keep track of our context:
  241. context = list(self._tag_context.get(stream.tell()))
  242. assert context is not None # check this -- could it ever happen?
  243. elts = []
  244. elt_start = None # where does the elt start
  245. elt_depth = None # what context depth
  246. elt_text = ''
  247. while elts==[] or elt_start is not None:
  248. if isinstance(stream, SeekableUnicodeStreamReader):
  249. startpos = stream.tell()
  250. xml_fragment = self._read_xml_fragment(stream)
  251. # End of file.
  252. if not xml_fragment:
  253. if elt_start is None: break
  254. else: raise ValueError('Unexpected end of file')
  255. # Process each <tag> in the xml fragment.
  256. for piece in self._XML_PIECE.finditer(xml_fragment):
  257. if self._DEBUG:
  258. print '%25s %s' % ('/'.join(context)[-20:], piece.group())
  259. if piece.group('START_TAG'):
  260. name = self._XML_TAG_NAME.match(piece.group()).group(1)
  261. # Keep context up-to-date.
  262. context.append(name)
  263. # Is this one of the elts we're looking for?
  264. if elt_start is None:
  265. if re.match(tagspec, '/'.join(context)):
  266. elt_start = piece.start()
  267. elt_depth = len(context)
  268. elif piece.group('END_TAG'):
  269. name = self._XML_TAG_NAME.match(piece.group()).group(1)
  270. # sanity checks:
  271. if not context:
  272. raise ValueError('Unmatched tag </%s>' % name)
  273. if name != context[-1]:
  274. raise ValueError('Unmatched tag <%s>...</%s>' %
  275. (context[-1], name))
  276. # Is this the end of an element?
  277. if elt_start is not None and elt_depth == len(context):
  278. elt_text += xml_fragment[elt_start:piece.end()]
  279. elts.append( (elt_text, '/'.join(context)) )
  280. elt_start = elt_depth = None
  281. elt_text = ''
  282. # Keep context up-to-date
  283. context.pop()
  284. elif piece.group('EMPTY_ELT_TAG'):
  285. name = self._XML_TAG_NAME.match(piece.group()).group(1)
  286. if elt_start is None:
  287. if re.match(tagspec, '/'.join(context)+'/'+name):
  288. elts.append((piece.group(),
  289. '/'.join(context)+'/'+name))
  290. if elt_start is not None:
  291. # If we haven't found any elements yet, then keep
  292. # looping until we do.
  293. if elts == []:
  294. elt_text += xml_fragment[elt_start:]
  295. elt_start = 0
  296. # If we've found at least one element, then try
  297. # backtracking to the start of the element that we're
  298. # inside of.
  299. else:
  300. # take back the last start-tag, and return what
  301. # we've gotten so far (elts is non-empty).
  302. if self._DEBUG:
  303. print ' '*36+'(backtrack)'
  304. if isinstance(stream, SeekableUnicodeStreamReader):
  305. stream.seek(startpos)
  306. stream.char_seek_forward(elt_start)
  307. else:
  308. stream.seek(-(len(xml_fragment)-elt_start), 1)
  309. context = context[:elt_depth-1]
  310. elt_start = elt_depth = None
  311. elt_text = ''
  312. # Update the _tag_context dict.
  313. pos = stream.tell()
  314. if pos in self._tag_context:
  315. assert tuple(context) == self._tag_context[pos]
  316. else:
  317. self._tag_context[pos] = tuple(context)
  318. return [elt_handler(ElementTree.fromstring(
  319. elt.encode('ascii', 'xmlcharrefreplace')),
  320. context)
  321. for (elt, context) in elts]