
/nltk_contrib/nltk_contrib/coref/muc.py

http://nltk.googlecode.com/
Python | 591 lines | 397 code | 23 blank | 171 comment | 35 complexity
Possible License(s): Apache-2.0, AGPL-1.0
# Natural Language Toolkit (NLTK) MUC Corpus Reader
#
# Copyright (C) 2001-2011 NLTK Project
# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
#         Steven Bird <sb@csse.unimelb.edu.au> (original IEER Corpus Reader)
#         Edward Loper <edloper@gradient.cis.upenn.edu> (original IEER Corpus Reader)
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

# Adapted from nltk.corpus.reader.ieer.IEERCorpusReader
import re
import codecs

from itertools import chain

from nltk import Tree
from nltk.util import LazyMap, LazyConcatenation
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import concat, StreamBackedCorpusView
muc6_titles = {
    '891102-0189.ne.v1.3.sgm': '',
    '891102-0189.co.v2.0.sgm': '',
    '891101-0050.ne.v1.3.sgm': '',
}
muc6_documents = sorted(muc6_titles)

muc7_titles = {
    'dryrun01.muc7': '',
    'dryrun02.muc7': '',
    'dryrun03.muc7': '',
}
muc7_documents = sorted(muc7_titles)

_MUC_CHUNK_TYPES = [
    'DATE',
    'IDENT',
    'LOCATION',
    'MONEY',
    'ORGANIZATION',
    'PERCENT',
    'PERSON',
    'TIME'
]
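
# In the MUC SGML, named entity and coreference annotations look roughly
# like the following (attributes such as MIN may or may not be present):
#     <ENAMEX TYPE="PERSON">...</ENAMEX>
#     <NUMEX TYPE="MONEY">...</NUMEX>
#     <TIMEX TYPE="DATE">...</TIMEX>
#     <COREF ID="2" TYPE="IDENT" REF="1" MIN="...">...</COREF>
# The regular expressions below match the MUC6/MUC7 document preambles and
# the opening and closing annotation tags.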

_MUC6_DOC_RE = re.compile(
    r'\s*<DOC>\s*'
    r"""
    (\s*(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>|
         <CODER>\s*.+?\s*</CODER>|
         <DD>\s*.+?\s*</DD>|
         <AN>\s*.+?\s*</AN>|
         <HL>\s*(?P<headline>.+?)\s*</HL>|
         <SO>\s*.+?\s*</SO>|
         <CO>\s*.+?\s*</CO>|
         <IN>\s*.+?\s*</IN>|
         <GV>\s*.+?\s*</GV>|
         <DATELINE>\s*(?P<dateline>.+?)\s*</DATELINE>)\s*)*
    """
    r'<TXT>\s*(?P<text>(<p>\s*(<s>\s*.+?\s*</s>)+\s*</p>)+)\s*</TXT>\s*'
    r'</DOC>\s*', re.DOTALL | re.I | re.VERBOSE)

_MUC6_PARA_RE = re.compile('(<p>\s*(?P<para>.+?)\s*</p>?)+', re.DOTALL | re.I)
_MUC6_SENT_RE = re.compile('(<s>\s*(?P<sent>.+?)\s*</s>)+', re.DOTALL | re.I)

_MUC7_DOC_RE = re.compile(
    r'\s*<DOC>\s*'
    r"""
    (\s*(<DOCID>\s*(?P<docid>.+?)\s*</DOCID>|
         <STORYID\s+[^>]*?>\s*.+?\s*</STORYID>|
         <SLUG\s+[^>]*?>\s*.+?\s*</SLUG>|
         <DATE>\s*(?P<date>.+?)\s*</DATE>|
         <NWORDS>\s*.+?\s*</NWORDS>|
         <PREAMBLE>\s*.+?\s*</PREAMBLE>)\s*)*
    """
    r'<TEXT>\s*(?P<text>.+?)\s*</TEXT>\s*'
    r'(<TRAILER>\s*(?P<trailer>.+?)\s*</TRAILER>\s*)?'
    r'</DOC>\s*', re.DOTALL | re.I | re.VERBOSE)

_MUC7_PARA_RE = re.compile(r'\s*<p>\s*.+?\s*(<p>\s*.+?\s*?)*\s*', re.DOTALL | re.I)
_MUC7_PARA_SPLIT_RE = re.compile(r'\s*<p>\s*', re.DOTALL | re.I)

_MUC_NE_B_RE = re.compile('<(ENAMEX|NUMEX|TIMEX)\s+[^>]*?TYPE="(?P<type>\w+)"', re.DOTALL | re.I)
_MUC_NE_E_RE = re.compile('</(ENAMEX|NUMEX|TIMEX)>', re.DOTALL | re.I)

_MUC_CO_B_RE = re.compile('<COREF\s+[^>]*?ID="(?P<id>\w+)"(\s+TYPE="(?P<type>\w+)")?(\s+REF="(?P<ref>\w+)")?', re.DOTALL | re.I)
_MUC_CO_E_RE = re.compile('</COREF>', re.DOTALL | re.I)

_WORD_TOKENIZER = TreebankWordTokenizer()
_SENT_TOKENIZER = PunktSentenceTokenizer()


class MUCDocument:
    # def __init__(self, text, docno=None, dateline=None, headline=''):
    def __init__(self, **text):
        self.text = None
        if isinstance(text, basestring):
            self.text = text
        elif isinstance(text, dict):
            for key, val in text.items():
                setattr(self, key, val)
        else:
            raise TypeError('expected a string or a dict of document fields')
        assert self.text

    def __repr__(self):
        # Use getattr() because MUC7 documents carry docid/date attributes
        # rather than docno/headline.
        if getattr(self, 'headline', None):
            headline = ' '.join(self.headline.leaves())
        else:
            headline = ' '.join([w for w in self.text.leaves()
                                 if w[:1] != '<'][:11]) + '...'
        docno = getattr(self, 'docno', None)
        if docno is not None:
            return '<MUCDocument %s: %r>' % (docno, headline)
        else:
            return '<MUCDocument: %r>' % headline


class MUCCorpusReader(CorpusReader):
    """
    A corpus reader for MUC SGML files. Each file begins with a preamble
    of SGML-tagged metadata, followed by the document text. The text of the
    document is contained in <TXT> tags for MUC6 and in <TEXT> tags for
    MUC7. Paragraphs are contained in <p> tags in both corpus formats.
    Sentences are contained in <s> tags in MUC6 only; for MUC7 corpus files,
    L{sents()}, L{chunked_sents()}, and L{iob_sents()} return sentences
    produced by tokenizing the text with C{PunktSentenceTokenizer}.

    Additionally, named entities and coreference mentions may be marked
    within the document text and document metadata. The MUC6 corpus provides
    named entity and coreference annotations in two separate sets of files,
    while the MUC7 corpus contains coreference annotations only. Only one
    kind of annotation is returned, depending on which kind of file is being
    read.

    Named entities are tagged as ENAMEX (name expressions), NUMEX
    (number expressions), or TIMEX (time expressions), all of which include
    TYPE attributes.

    Coreference mentions are tagged as COREF and include ID, TYPE, REF, and
    MIN attributes. ID gives each coreference mention a unique numeric
    identifier. REF indicates the ID of the intended referent of the
    mention and is not required for first mentions. MIN contains the
    minimum coreferential string of the mention.
    """

    def raw(self, fileids=None):
        """
        @return: A list of corpus file contents.
        @rtype: C{list} of C{str}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def docs(self, fileids=None):
        """
        @return: A list of corpus document strings.
        @rtype: C{list} of C{StreamBackedCorpusView}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        """
        return concat([StreamBackedCorpusView(fileid,
                                              self._read_block,
                                              encoding=enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def parsed_docs(self, fileids=None):
        """
        @return: A list of parsed corpus documents.
        @rtype: C{list} of C{StreamBackedCorpusView}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        """
        return concat([StreamBackedCorpusView(fileid,
                                              self._read_parsed_block,
                                              encoding=enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def paras(self, fileids=None, **kwargs):
        """
        @return: A list of paragraphs.
        @rtype: C{list} of C{list} of C{list} of C{str}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        """
        def __para(para):
            return [sent.leaves() for sent in list(para)]
        return LazyMap(__para, self._paras(fileids))

    def sents(self, fileids=None):
        """
        @return: A list of sentences.
        @rtype: C{list} of C{list} of C{str}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        """
        return LazyConcatenation(self.paras(fileids))

    def chunked_sents(self, fileids=None, **kwargs):
        """
        @return: A list of chunked sentences, where each chunk is a word/tag
            tuple or a list of word/tag tuples.
        @rtype: C{list} of C{list} of C{tuple}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        @kwparam depth: Depth of chunk parsing for nested chunks.
        @type depth: C{int}
        """
        def __chunked_sent(sent):
            chunks = []
            # Map each sentence subtree into a tuple.
            for token in map(tree2tuple, sent):
                # If the token's content is a list of chunk pieces, append
                # it as a list of word/tag pairs.
                if isinstance(token[0], list):
                    chunks.append([(word, None) for word in token[0]])
                # If the token's content is a string, append it as a
                # word/tag tuple.
                elif isinstance(token[0], basestring):
                    chunks.append((token[0], None))
                # Anything else is unexpected.
                else:
                    raise ValueError('unexpected token: %r' % (token,))
            return chunks
        depth = kwargs.get('depth', 0)
        sents = self._chunked_sents(self._sents(fileids), depth)
        return LazyMap(__chunked_sent, sents)

    def iob_sents(self, fileids=None, **kwargs):
        """
        @return: A list of sentences as word/iob/other tag tuples.
        @rtype: C{list} of C{list} of C{tuple}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        @kwparam depth: Depth of chunk parsing for nested chunks.
        @type depth: C{int}
        """
        def __iob_sent(sent):
            chunks = []
            # Map each sentence subtree into a tuple.
            for token in map(tree2tuple, sent):
                # If the token has a chunk type, parse the token contents.
                if token[1] is not None:
                    for index, word in enumerate(token[0]):
                        # The first word in a chunk B-egins the chunk.
                        if index == 0:
                            chunks.append((word, 'B-%s' % token[1:2]) + token[2:])
                        # All other words in a chunk are I-n the chunk.
                        else:
                            chunks.append((word, 'I-%s' % token[1:2]) + token[2:])
                # If the token doesn't have a chunk type, it's O-ut.
                else:
                    chunks.append((token[0], 'O'))
            return chunks
        depth = kwargs.get('depth', 0)
        sents = self._chunked_sents(self._sents(fileids), depth)
        return LazyMap(__iob_sent, sents)

    def words(self, fileids=None):
        """
        @return: A list of words.
        @rtype: C{list} of C{str}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        """
        # Concatenate the list of lists given by sents().
        return LazyConcatenation(self.sents(fileids))

    def iob_words(self, fileids=None, **kwargs):
        """
        @return: A list of word/iob/other tag tuples.
        @rtype: C{list} of C{tuple}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        @kwparam depth: Depth of chunk parsing for nested chunks.
        @type depth: C{int}
        """
        # Concatenate the list of lists given by iob_sents().
        return LazyConcatenation(self.iob_sents(fileids, **kwargs))

    def chunks(self, fileids=None, **kwargs):
        """
        @return: A list of chunked sents where chunks are multi-word strings.
        @rtype: C{list} of C{list} of C{str}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        @kwparam depth: Depth of chunk parsing for nested chunks.
        @type depth: C{int}
        @kwparam concat: Concatenate sentence lists into one list; works like
            itertools.chain()
        @type concat: C{bool}
        """
        def __chunks(sent):
            chunks = []
            for token in sent:
                # If the token is a list of chunk pieces, append the pieces'
                # contents as a single string.
                if isinstance(token, list):
                    # TODO: Better if able to reverse Treebank-style
                    # tokenization. The join leaves some weird whitespace.
                    chunks.append(' '.join([word[0] for word in token]))
                # If the token is a tuple, append the token's contents.
                elif isinstance(token, tuple):
                    chunks.append(token[0])
                # Anything else is unexpected.
                else:
                    raise ValueError('unexpected token: %r' % (token,))
            return chunks
        sents = self.chunked_sents(fileids, **kwargs)
        # Concatenate the lists.
        if kwargs.get('concat'):
            return LazyConcatenation(LazyMap(__chunks, sents))
        # Or not.
        else:
            return LazyMap(__chunks, sents)

    def mentions(self, fileids=None, **kwargs):
        """
        @return: A list of mentions as tuples of
            ([words...], id, referent, type)
        @rtype: C{list} of C{list} of C{tuple}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        @kwparam depth: Depth of chunk parsing for nested chunks.
        @type depth: C{int}
        @kwparam concat: Concatenate sentence lists into one list; works like
            itertools.chain(). Defaults to False.
        @type concat: C{bool}
        @kwparam nonmentions: Return nonmentions as well as mentions. Defaults
            to False.
        @type nonmentions: C{bool}
        """
        def __mentions(sent):
            mentions = []
            # Map each sentence subtree into a tuple.
            for token in map(tree2tuple, sent):
                # If the token type is COREF, append the token contents plus
                # everything but the token type.
                if token[1] == 'COREF':
                    mentions.append(token[:1] + token[2:])
                # If including nonmentions, append the token contents only.
                elif kwargs.get('nonmentions'):
                    mentions.append(token[:1])
            return mentions
        # TODO: Is depth doing what it's expected to?
        depth = kwargs.get('depth', 0)
        sents = self._chunked_sents(self._sents(fileids), depth)
        # Concatenate the lists.
        if kwargs.get('concat'):
            return LazyConcatenation(LazyMap(__mentions, sents))
        # Or not.
        else:
            return LazyMap(__mentions, sents)

    def _paras(self, fileids=None):
        """
        @return: A list of paragraphs.
        @rtype: C{list} of C{Tree}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        """
        def __para(doc):
            return list(doc.text)
        return LazyConcatenation(LazyMap(__para, self.parsed_docs(fileids)))

    def _sents(self, fileids=None):
        """
        @return: A list of sentence trees.
        @rtype: C{list} of C{list} of C{Tree}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression
        """
        def __sents(para):
            return list(para)
        # Flatten this because it's a list of lists of trees, one per doc. It
        # doesn't matter which doc a list is from, so chain them together.
        return LazyConcatenation(LazyMap(__sents, self._paras(fileids)))

    def _chunked_sents(self, sents, depth=0):
        """
        @return: A list of sentence chunk trees which are flatter than the
            original trees.
        @rtype: C{list} of C{list} of C{Tree}
        @param sents: A list of sentence trees.
        @type sents: C{list} of C{list} of C{Tree}
        @param depth: How deep to read nested chunks off of the trees. If
            depth is None, all possible chunk subtrees are returned;
            otherwise, chunks are returned starting at the highest level 0,
            then the next highest 1, etc.
        @type depth: C{int}
        """
        def __chunked_sent(sent):
            for chunk in sent:
                # If the chunk is a Tree, return its immediate subtrees.
                if isinstance(chunk, Tree):
                    return list(chunk)
                # If the chunk is not a tree, return it as-is.
                else:
                    return chunk
        # If depth is None, return all possible subtrees.
        if depth is None:
            return LazyMap(lambda sent: sent.subtrees(), sents)
        # If depth is too small, no need to recurse and read further.
        if not depth - 1 >= 0:
            return sents
        # Otherwise, apply __chunked_sent() and recurse.
        return self._chunked_sents(
            LazyConcatenation(LazyMap(__chunked_sent, sents)), depth - 1)
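
    # Note: the depth keyword accepted by chunked_sents(), iob_sents(),
    # chunks(), and mentions() is handled by _chunked_sents() above;
    # depth=0 returns the sentence trees unchanged and depth=None returns
    # every chunk subtree (see the TODO in mentions() regarding
    # intermediate depths).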

    def _read_parsed_block(self, stream):
        # TODO: LazyMap but StreamBackedCorpusView doesn't support
        # AbstractLazySequence currently.
        return map(self._parse, self._read_block(stream))

    def _parse(self, doc):
        """
        @return: A parsed MUC document.
        @rtype: C{MUCDocument}
        @param doc: The string contents of a MUC document.
        @type doc: C{str}
        """
        tree = mucstr2tree(doc, top_node='DOC')
        if isinstance(tree, dict):
            return MUCDocument(**tree)
        else:
            return MUCDocument(tree)

    def _read_block(self, stream):
        return ['\n'.join(stream.readlines())]


def mucstr2tree(s, chunk_types=_MUC_CHUNK_TYPES, top_node='S'):
    """
    Convert MUC document contents into a tree.
    @return: The parsed parts of a MUC document, keyed by name.
    @rtype: C{dict}
    @param s: Contents of a MUC document.
    @type s: C{str}
    @param chunk_types: Chunk types to extract from the MUC document.
    @type chunk_types: C{list} of C{str}
    @param top_node: Label to assign to the root of the tree.
    @type top_node: C{str}
    """
    tree = None
    match = _MUC6_DOC_RE.match(s)
    # If the MUC document is valid, read the document element groups off its
    # contents and return a dictionary of each part.
    if match:
        tree = {
            'text': _muc_read_text(match.group('text'), top_node),
            'docno': match.group('docno'),
            # Capture named entities/mentions in the front-matter too.
            'dateline': _muc_read_text(match.group('dateline'), top_node),
            'headline': _muc_read_text(match.group('headline'), top_node),
        }
    else:
        match = _MUC7_DOC_RE.match(s)
        if match:
            tree = {
                'text': _muc_read_text(match.group('text'), top_node),
                'docid': match.group('docid'),
                # Capture named entities/mentions in the front-matter too.
                'date': _muc_read_text(match.group('date'), top_node),
            }
    assert tree
    return tree
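
# For reference, the dict returned by mucstr2tree() has keys 'text', 'docno',
# 'dateline', and 'headline' for MUC6 files, or 'text', 'docid', and 'date'
# for MUC7 files; MUCCorpusReader._parse() passes it to MUCDocument(**tree),
# which exposes the parts as attributes.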


def tree2tuple(tree):
    """
    Convert a tree or string into a flat tuple of leaves and a label.
    @return: A tuple of tree leaves and their parent's label.
    @rtype: C{tuple}
    @param tree: A tree.
    @type tree: C{Tree}
    """
    # If the tree is a Tree, create a tuple out of the leaves and the label.
    if isinstance(tree, Tree):
        # Get the leaves.
        s = (tree.leaves(),)
        # Get the label.
        if isinstance(tree.node, basestring):
            node = (tree.node,)
        elif isinstance(tree.node, tuple):
            node = tree.node
        else:
            raise ValueError('unexpected tree node: %r' % (tree.node,))
        # Merge the leaves and the label.
        return s + node
    # If the tree is a string, just convert it to a tuple.
    elif isinstance(tree, basestring):
        return (tree, None)
    # Anything else is unexpected.
    else:
        raise ValueError('expected a Tree or string, got %r' % (tree,))
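
# For example (using made-up leaves), tree2tuple(Tree('PERSON', ['John',
# 'Smith'])) returns (['John', 'Smith'], 'PERSON'); a COREF subtree yields
# (leaves, 'COREF', id, ref, type); and a plain string w yields (w, None).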


def _muc_read_text(s, top_node):
    # The tokenizer sometimes splits within coref tags.
    def __fix_tokenization(sents):
        for index in range(len(sents)):
            next = 1
            while sents[index].count('<COREF') != sents[index].count('</COREF>'):
                sents[index] += ' '
                sents[index] += sents[index + next]
                sents[index + next] = ''
                next += 1
        sents = filter(None, sents)
        return sents
    if s:
        tree = Tree(top_node, [])
        if _MUC6_PARA_RE.match(s):
            for para in _MUC6_PARA_RE.findall(s):
                if para and para[0] and para[0].strip():
                    tree.append(Tree('P', []))
                    for sent in _MUC6_SENT_RE.findall(para[0]):
                        words = _MUC6_SENT_RE.match(sent[0]).group('sent').strip()
                        # There are empty sentences <s></s> in the MUC6 corpus.
                        if words:
                            tree[-1].append(_muc_read_words(words, 'S'))
        elif _MUC7_PARA_RE.match(s):
            for para in _MUC7_PARA_SPLIT_RE.split(s):
                if para and para.strip():
                    tree.append(Tree('P', []))
                    for sent in __fix_tokenization(_SENT_TOKENIZER.tokenize(para)):
                        tree[-1].append(_muc_read_words(sent, 'S'))
        return tree


def _muc_read_words(s, top_node):
    if not s:
        return []
    stack = [Tree(top_node, [])]
    for word in re.findall('<[^>]+>|[^\s<]+', s):
        ne_match = _MUC_NE_B_RE.match(word)
        co_match = _MUC_CO_B_RE.match(word)
        # An opening named entity tag starts a new chunk of that type.
        if ne_match:
            chunk = Tree(ne_match.group('type'), [])
            stack[-1].append(chunk)
            stack.append(chunk)
        # An opening coref tag starts a chunk labeled with its attributes.
        elif co_match:
            chunk = Tree(('COREF', co_match.group('id'),
                          co_match.group('ref'), co_match.group('type')), [])
            stack[-1].append(chunk)
            stack.append(chunk)
        # A closing tag ends the current chunk.
        elif _MUC_NE_E_RE.match(word) or _MUC_CO_E_RE.match(word):
            stack.pop()
        # Anything else is plain text; tokenize it into the current chunk.
        else:
            stack[-1].extend(_WORD_TOKENIZER.tokenize(word))
    if len(stack) != 1:
        print stack
    assert len(stack) == 1
    return stack[0]
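
# A hypothetical illustration of _muc_read_words(): given the made-up input
# '<ENAMEX TYPE="PERSON">John Smith</ENAMEX> resigned .', it returns roughly
# Tree('S', [Tree('PERSON', ['John', 'Smith']), 'resigned', '.']), i.e.
# annotated spans become subtrees labeled with their chunk type.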


def demo(**kwargs):
    import nltk
    from nltk_contrib.coref import NLTK_COREF_DATA
    from nltk_contrib.coref.muc import muc6_documents, muc7_documents
    from nltk_contrib.coref.muc import MUCCorpusReader

    nltk.data.path.insert(0, NLTK_COREF_DATA)

    muc6 = LazyCorpusLoader('muc6/', MUCCorpusReader, muc6_documents)
    for sent in muc6.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc6.mentions(depth=None):
        for mention in sent:
            print mention
        if sent: print
    print

    muc7 = LazyCorpusLoader('muc7/', MUCCorpusReader, muc7_documents)
    for sent in muc7.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc7.mentions(depth=None):
        for mention in sent:
            print mention
        if sent: print
    print


if __name__ == '__main__':
    demo()