
/nltk/chunk/util.py

https://github.com/haewoon/nltk
# Natural Language Toolkit: Chunk format conversions
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au> (minor additions)
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

import re
import string

from nltk.tree import Tree
from nltk.tag.util import str2tuple

##//////////////////////////////////////////////////////
##  EVALUATION
##//////////////////////////////////////////////////////

from nltk.metrics import accuracy as _accuracy
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

    # print 'GOLD:', gold_tags[:50]
    # print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)
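
# A minimal usage sketch for accuracy() (illustrative, not part of the
# original module).  It assumes nltk.RegexpParser as the chunker and the
# conll2000 corpus as the gold standard; any ChunkParserI implementation and
# any iterable of gold chunk trees could be substituted.
#
#     >>> import nltk
#     >>> chunker = nltk.RegexpParser(r"NP: {<DT>?<JJ>*<NN.*>+}")
#     >>> gold = nltk.corpus.conll2000.chunked_sents('test.txt')[:10]
#     >>> accuracy(chunker, gold)    # returns a float in [0, 1]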

# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
#   -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
#
class ChunkScore(object):
    """
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()
        >>> for correct in correct_sentences:
        ...     guess = chunkparser.parse(correct.leaves())
        ...     chunkscore.score(correct, guess)
        >>> print 'F Measure:', chunkscore.f_measure()
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number of actual examples of true
          positives to record.  This affects the ``correct`` member
          function: ``correct`` will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure).

        - max_fp_examples: The maximum number of actual examples of false
          positives to record.  This affects the ``incorrect`` member
          function and the ``guessed`` member function: ``incorrect``
          will not return more than this number of examples, and
          ``guessed`` will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure).

        - max_fn_examples: The maximum number of actual examples of false
          negatives to record.  This affects the ``missed`` member
          function and the ``correct`` member function: ``missed``
          will not return more than this number of examples, and
          ``correct`` will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure).

        - chunk_node: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _tp: list(Token)
    :ivar _tp: List of true positives
    :type _fp: list(Token)
    :ivar _fp: List of false positives
    :type _fn: list(Token)
    :ivar _fn: List of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    """

    def __init__(self, **kwargs):
        self._correct = set()
        self._guessed = set()
        self._tp = set()
        self._fp = set()
        self._fn = set()
        self._max_tp = kwargs.get('max_tp_examples', 100)
        self._max_fp = kwargs.get('max_fp_examples', 100)
        self._max_fn = kwargs.get('max_fn_examples', 100)
        self._chunk_node = kwargs.get('chunk_node', '.*')
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        self._count = 0
        self._tags_correct = 0.0
        self._tags_total = 0.0

        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        if (self._measuresNeedUpdate):
            self._tp = self._guessed & self._correct
            self._fn = self._correct - self._guessed
            self._fp = self._guessed - self._correct
            self._tp_num = len(self._tp)
            self._fp_num = len(self._fp)
            self._fn_num = len(self._fn)
            self._measuresNeedUpdate = False

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_node)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_node)
        self._count += 1
        self._measuresNeedUpdate = True

        # Keep track of per-tag accuracy (if possible)
        try:
            correct_tags = tree2conlltags(correct)
            guessed_tags = tree2conlltags(guessed)
        except ValueError:
            # This exception case is for nested chunk structures,
            # where tree2conlltags will fail with a ValueError: "Tree
            # is too deeply nested to be printed in CoNLL format."
            correct_tags = guessed_tags = ()
        self._tags_total += len(correct_tags)
        self._tags_correct += sum(1 for (t, g) in zip(guessed_tags,
                                                      correct_tags)
                                  if t == g)

    def accuracy(self):
        """
        Return the overall tag-based accuracy for all texts that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.

        :rtype: float
        """
        if self._tags_total == 0: return 1
        return self._tags_correct / self._tags_total

    def precision(self):
        """
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0: return 0
        else: return float(self._tp_num) / div

    def recall(self):
        """
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0: return 0
        else: return float(self._tp_num) / div

    def f_measure(self, alpha=0.5):
        """
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        """
        self._updateMeasures()
        p = self.precision()
        r = self.recall()
        if p == 0 or r == 0:    # what if alpha is 0 or 1?
            return 0
        return 1/(alpha/p + (1-alpha)/r)
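
    # Worked example (illustrative, not part of the original module): with
    # the default alpha=0.5 the formula 1/(alpha/p + (1-alpha)/r) reduces to
    # the familiar harmonic mean 2*p*r/(p+r).  For instance, p=0.8 and r=0.6
    # give 1/(0.5/0.8 + 0.5/0.6) = 1/(0.625 + 0.8333...) ~= 0.686.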

    def missed(self):
        """
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        chunks = list(self._fn)
        return [c[1] for c in chunks]  # discard position information

    def incorrect(self):
        """
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        chunks = list(self._fp)
        return [c[1] for c in chunks]  # discard position information

    def correct(self):
        """
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        chunks = list(self._correct)
        return [c[1] for c in chunks]  # discard position information

    def guessed(self):
        """
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        chunks = list(self._guessed)
        return [c[1] for c in chunks]  # discard position information

    def __len__(self):
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        """
        return '<ChunkScoring of ' + `len(self)` + ' chunks>'

    def __str__(self):
        """
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        """
        return ("ChunkParse score:\n" +
                ("    IOB Accuracy: %5.1f%%\n" % (self.accuracy()*100)) +
                ("    Precision:    %5.1f%%\n" % (self.precision()*100)) +
                ("    Recall:       %5.1f%%\n" % (self.recall()*100)) +
                ("    F-Measure:    %5.1f%%" % (self.f_measure()*100)))

# Extract chunks, and assign each a unique id: the absolute position of
# the first word of the chunk.
def _chunksets(t, count, chunk_node):
    pos = 0
    chunks = []
    for child in t:
        if isinstance(child, Tree):
            if re.match(chunk_node, child.node):
                chunks.append(((count, pos), child.freeze()))
            pos += len(child.leaves())
        else:
            pos += 1
    return set(chunks)

def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_node: The label to use for chunk nodes
    :type chunk_node: str
    :param top_node: The label to use for the root of the tree
    :type top_node: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(top_node, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            chunk = Tree(chunk_node, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                stack[-1].append(str2tuple(text, sep))
    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]
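
# Illustrative usage (a sketch based on the parsing logic above, not part of
# the original module): bracketed groups become chunk subtrees, everything
# else stays a plain (word, tag) leaf.
#
#     >>> tagstr2tree("[ the/DT cat/NN ] sat/VBD", chunk_node='NP')
#     Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])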

### CONLL

_LINE_RE = re.compile('(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), top_node="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param top_node: The node label to use for the root.
    :type top_node: str
    :rtype: Tree
    """

    stack = [Tree(top_node, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip(): continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError, 'Error on line %d' % lineno
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].node
        if state in 'BO' or mismatch_I:
            if len(stack) == 2: stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]
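
# Illustrative input (not part of the original module): one token per line,
# "word tag IOB-chunk-tag", as in the CoNLL 2000 shared task.
#
#     >>> conllstr2tree("the DT B-NP\ncat NN I-NP\nsat VBD O")
#     Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])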

def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """
    tags = []
    for child in t:
        try:
            category = child.node
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError, "Tree is too deeply nested to be printed in CoNLL format"
                tags.append((contents[0], contents[1], prefix + category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags
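
# Illustrative example (not part of the original module): words inside a
# chunk get B-/I- prefixes on the chunk label, words outside any chunk get O.
#
#     >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])
#     >>> tree2conlltags(t)
#     [('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'), ('sat', 'VBD', 'O')]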

def conlltags2tree(sentence, chunk_types=('NP', 'PP', 'VP'),
                   top_node='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(top_node, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word, postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree) == 0 or not isinstance(tree[-1], Tree) or
                tree[-1].node != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word, postag)]))
            else:
                tree[-1].append((word, postag))
        elif chunktag == 'O':
            tree.append((word, postag))
        else:
            raise ValueError("Bad conll tag %r" % chunktag)
    return tree
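
# Illustrative round trip (not part of the original module): conlltags2tree
# is the inverse of tree2conlltags for the flat chunk structures used here.
#
#     >>> tags = [('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'), ('sat', 'VBD', 'O')]
#     >>> conlltags2tree(tags)
#     Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])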

def tree2conllstr(t):
    """
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    lines = [string.join(token) for token in tree2conlltags(t)]
    return '\n'.join(lines)
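
# Illustrative output (not part of the original module): each (word, tag,
# IOB-tag) triple from tree2conlltags() becomes one space-separated line.
#
#     >>> print tree2conllstr(Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')])]))
#     the DT B-NP
#     cat NN I-NP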

### IEER

_IEER_DOC_RE = re.compile(r'<DOC>\s*'
                          r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?'
                          r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?'
                          r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?'
                          r'<BODY>\s*'
                          r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?'
                          r'<TEXT>(?P<text>.*?)</TEXT>\s*'
                          r'</BODY>\s*</DOC>\s*', re.DOTALL)

_IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"')

def _ieer_read_text(s, top_node):
    stack = [Tree(top_node, [])]
    # s will be None if there is no headline in the text
    # return the empty list in place of a Tree
    if s is None:
        return []
    for piece_m in re.finditer('<[^>]+>|[^\s<]+', s):
        piece = piece_m.group()
        try:
            if piece.startswith('<b_'):
                m = _IEER_TYPE_RE.match(piece)
                if m is None: print 'XXXX', piece
                chunk = Tree(m.group('type'), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith('<e_'):
                stack.pop()
            # elif piece.startswith('<'):
            #     print "ERROR:", piece
            #     raise ValueError # Unexpected HTML
            else:
                stack[-1].append(piece)
        except (IndexError, ValueError):
            raise ValueError('Bad IEER string (error at character %d)' %
                             piece_m.start())
    if len(stack) != 1:
        raise ValueError('Bad IEER string')
    return stack[0]

def ieerstr2tree(s, chunk_types=['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
                 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'], top_node="S"):
    """
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :rtype: Tree
    """

    # Try looking for a single document.  If that doesn't work, then just
    # treat everything as if it was within the <TEXT>...</TEXT>.
    m = _IEER_DOC_RE.match(s)
    if m:
        return {
            'text': _ieer_read_text(m.group('text'), top_node),
            'docno': m.group('docno'),
            'doctype': m.group('doctype'),
            'date_time': m.group('date_time'),
            #'headline': m.group('headline')
            # we want to capture NEs in the headline too!
            'headline': _ieer_read_text(m.group('headline'), top_node),
            }
    else:
        return _ieer_read_text(s, top_node)
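
# Illustrative usage (a sketch, not part of the original module): untagged
# words stay plain strings, and <b_TYPE ...>...<e_TYPE> spans become chunk
# subtrees labeled with the entity type extracted from the opening tag.
#
#     >>> ieerstr2tree('<b_PERSON type="PERSON">Alice<e_PERSON> visited '
#     ...              '<b_LOCATION type="LOCATION">Paris<e_LOCATION>')
#     Tree('S', [Tree('PERSON', ['Alice']), 'visited', Tree('LOCATION', ['Paris'])])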

def demo():
    s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    import nltk
    t = nltk.chunk.tagstr2tree(s, chunk_node='NP')
    print t.pprint()
    print

    s = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""
    conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP'))
    print conll_tree.pprint()

    # Demonstrate CoNLL output
    print "CoNLL output:"
    print nltk.chunk.tree2conllstr(conll_tree)
    print

if __name__ == '__main__':
    demo()