PageRenderTime 56ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 1ms

/nltk_contrib/nltk_contrib/misc/marshalbrill.py

http://nltk.googlecode.com/
Python | 1231 lines | 1169 code | 16 blank | 46 comment | 23 complexity | 0fe5cdfc02e79609fbd8c7c8a4dc2dd2 MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0

Large files are truncated, but you can click here to view the full file

  1. # Natural Language Toolkit: Brill Tagger
  2. #
  3. # Copyright (C) 2001-2005 NLTK Project
  4. # Authors: Christopher Maloof <cjmaloof@gradient.cis.upenn.edu>
  5. # Edward Loper <edloper@gradient.cis.upenn.edu>
  6. # Steven Bird <sb@ldc.upenn.edu>
  7. # URL: <http://www.nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. Brill's transformational rule-based tagger.
  11. """
  12. from nltk.tag import TagI
  13. import bisect # for binary search through a subset of indices
  14. import os # for finding WSJ files
  15. import random # for shuffling WSJ files
  16. import sys # for getting command-line arguments
  17. import re # for performing regular expression matching
  18. ######################################################################
  19. ## The Brill Tagger
  20. ######################################################################
  21. class Brill(TagI):
  22. """
  23. Brill's transformational rule-based tagger. Brill taggers use an
  24. X{initial tagger} (such as L{tag.Default}) to assign an intial
  25. tag sequence to a text; and then apply an ordered list of
  26. transformational rules to correct the tags of individual tokens.
  27. These transformation rules are specified by the L{BrillRuleI}
  28. interface.
  29. Brill taggers can be created directly, from an initial tagger and
  30. a list of transformational rules; but more often, Brill taggers
  31. are created by learning rules from a training corpus, using either
  32. L{BrillTrainer} or L{FastBrillTrainer}.
  33. """
  34. # TODO: move into __init__() when all marshalling classes will be moved into
  35. # standard tree
  36. _classname = "BrillTagger"
  37. def __init__(self, initial_tagger, rules):
  38. """
  39. @param initial_tagger: The initial tagger
  40. @type initial_tagger: L{TagI}
  41. @param rules: An ordered list of transformation rules that
  42. should be used to correct the initial tagging.
  43. @type rules: C{list} of L{BrillRuleI}
  44. """
  45. self._initial_tagger = initial_tagger
  46. self._rules = rules
  47. def rules(self):
  48. return self._rules[:]
  49. def tag (self, tokens):
  50. # Inherit documentation from TagI
  51. # Run the initial tagger.
  52. tagged_tokens = list(self._initial_tagger.tag(tokens))
  53. # Create a dictionary that maps each tag to a list of the
  54. # indices of tokens that have that tag.
  55. tag_to_positions = {}
  56. for i, (token, tag) in enumerate(tagged_tokens):
  57. if tag not in tag_to_positions:
  58. tag_to_positions[tag] = set([i])
  59. else:
  60. tag_to_positions[tag].add(i)
  61. # Apply each rule, in order. Only try to apply rules at
  62. # positions that have the desired original tag.
  63. for rule in self._rules:
  64. # Find the positions where it might apply
  65. positions = tag_to_positions.get(rule.original_tag(), [])
  66. # Apply the rule at those positions.
  67. changed = rule.apply_at(tagged_tokens, positions)
  68. # Update tag_to_positions with the positions of tags that
  69. # were modified.
  70. for i in changed:
  71. tag_to_positions[rule.original_tag()].remove(i)
  72. if rule.replacement_tag() not in tag_to_positions:
  73. tag_to_positions[rule.replacement_tag()] = set([i])
  74. else:
  75. tag_to_positions[rule.replacement_tag()].add(i)
  76. for t in tagged_tokens:
  77. yield t
  78. # marshal() and unmarshal() methods by Tiago Tresoldi <tresoldi@users.sf.net>
  79. def marshal (self, filename):
  80. """
  81. Marshals (saves to a plain text file) the tagger model.
  82. @param filename: Name of the file to which save the model (will
  83. be overwritten if it already exists).
  84. @type filename: C{string}
  85. """
  86. handler = file(filename, "w")
  87. for rule in self.rules():
  88. handler.write("%s\n" % rule)
  89. handler.close()
  90. def unmarshal (self, filename):
  91. """
  92. Unmarshals (loads from a plain text file) the tagger model. This
  93. operation will override any previously stored rules.
  94. @param filename: Name of the file from which the model will
  95. be read.
  96. @type filename: C{string}
  97. """
  98. rule_a = re.compile(r"^(.+) -> (.+) if the (.+) of words i([+-]\d+)...i([+-]\d+) is '(.+)'$", re.UNICODE)
  99. rule_b = re.compile(r"^(.+) -> (.+) if the (.+) of the (.+) word is '(.+)'$", re.UNICODE)
  100. # erase any previous rules
  101. self._rules = []
  102. # load from file
  103. handler = file(filename, "r")
  104. lines = handler.readlines()
  105. handler.close()
  106. # remove '\n's, even though $ would catch them
  107. lines = [line[:-1] for line in lines]
  108. # remove empty lines
  109. lines = [line for line in lines if len(line)>0]
  110. # parse rules
  111. for rule in lines:
  112. match = re.match(rule_b, rule)
  113. if match:
  114. groups = list( match.groups() )
  115. if groups[3] == "preceding":
  116. groups.pop(3)
  117. groups.insert(3, "-1")
  118. groups.insert(4, "-1")
  119. else:
  120. groups.pop(3)
  121. groups.insert(3, "1")
  122. groups.insert(4, "1")
  123. else:
  124. match = re.match(rule_a, rule)
  125. groups = list( match.groups() )
  126. conditions = (int(groups[3]), int(groups[4]), groups[5])
  127. if groups[2] == "tag":
  128. r = ProximateTagsRule(groups[0], groups[1], conditions)
  129. else:
  130. r = ProximateWordsRule(groups[0], groups[1], conditions)
  131. self._rules.append(r)
  132. ######################################################################
  133. ## Brill Rules
  134. ######################################################################
  135. class BrillRuleI(object):
  136. """
  137. An interface for tag transformations on a tagged corpus, as
  138. performed by brill taggers. Each transformation finds all tokens
  139. in the corpus that are tagged with a specific X{original tag} and
  140. satisfy a specific X{condition}, and replaces their tags with a
  141. X{replacement tag}. For any given transformation, the original
  142. tag, replacement tag, and condition are fixed. Conditions may
  143. depend on the token under consideration, as well as any other
  144. tokens in the corpus.
  145. Brill rules must be comparable and hashable.
  146. """
  147. def apply_to(self, tokens):
  148. """
  149. Apply this rule everywhere it applies in the corpus. I.e.,
  150. for each token in the corpus that is tagged with this rule's
  151. original tag, and that satisfies this rule's condition, set
  152. its tag to be this rule's replacement tag.
  153. @param tokens: The tagged corpus
  154. @type tokens: C{list} of C{tuple}
  155. @return: The indices of tokens whose tags were changed by this
  156. rule.
  157. @rtype: C{list} of C{int}
  158. """
  159. return self.apply_at(tokens, range(len(tokens)))
  160. def apply_at(self, tokens, positions):
  161. """
  162. Apply this rule at every position in C{positions} where it
  163. applies to the corpus. I.e., for each position M{p} in
  164. C{positions}, if C{tokens[M{p}]} is tagged with this rule's
  165. original tag, and satisfies this rule's condition, then set
  166. its tag to be this rule's replacement tag.
  167. @param tokens: The tagged corpus
  168. @type tokens: list of Token
  169. @type positions: C{list} of C{int}
  170. @param positions: The positions where the transformation is to
  171. be tried.
  172. @return: The indices of tokens whose tags were changed by this
  173. rule.
  174. @rtype: C{int}
  175. """
  176. assert False, "BrillRuleI is an abstract interface"
  177. def applies(self, tokens, index):
  178. """
  179. @return: True if the rule would change the tag of
  180. C{tokens[index]}, False otherwise
  181. @rtype: Boolean
  182. @param tokens: A tagged corpus
  183. @type tokens: list of Token
  184. @param index: The index to check
  185. @type index: int
  186. """
  187. assert False, "BrillRuleI is an abstract interface"
  188. def original_tag(self):
  189. """
  190. @return: The tag which this C{BrillRuleI} may cause to be
  191. replaced.
  192. @rtype: any
  193. """
  194. assert False, "BrillRuleI is an abstract interface"
  195. def replacement_tag(self):
  196. """
  197. @return: the tag with which this C{BrillRuleI} may replace
  198. another tag.
  199. @rtype: any
  200. """
  201. assert False, "BrillRuleI is an abstract interface"
  202. # Rules must be comparable and hashable for the algorithm to work
  203. def __eq__(self):
  204. assert False, "Brill rules must be comparable"
  205. def __hash__(self):
  206. assert False, "Brill rules must be hashable"
  207. class ProximateTokensRule(BrillRuleI):
  208. """
  209. An abstract base class for brill rules whose condition checks for
  210. the presence of tokens with given properties at given ranges of
  211. positions, relative to the token.
  212. Each subclass of proximate tokens brill rule defines a method
  213. M{extract_property}, which extracts a specific property from the
  214. the token, such as its text or tag. Each instance is
  215. parameterized by a set of tuples, specifying ranges of positions
  216. and property values to check for in those ranges:
  217. - (M{start}, M{end}, M{value})
  218. The brill rule is then applicable to the M{n}th token iff:
  219. - The M{n}th token is tagged with the rule's original tag; and
  220. - For each (M{start}, M{end}, M{value}) triple:
  221. - The property value of at least one token between
  222. M{n+start} and M{n+end} (inclusive) is M{value}.
  223. For example, a proximate token brill template with M{start=end=-1}
  224. generates rules that check just the property of the preceding
  225. token. Note that multiple properties may be included in a single
  226. rule; the rule applies if they all hold.
  227. """
  228. def __init__(self, original_tag, replacement_tag, *conditions):
  229. """
  230. Construct a new brill rule that changes a token's tag from
  231. C{original_tag} to C{replacement_tag} if all of the properties
  232. specified in C{conditions} hold.
  233. @type conditions: C{tuple} of C{(int, int, *)}
  234. @param conditions: A list of 3-tuples C{(start, end, value)},
  235. each of which specifies that the property of at least one
  236. token between M{n}+C{start} and M{n}+C{end} (inclusive) is
  237. C{value}.
  238. @raise ValueError: If C{start}>C{end} for any condition.
  239. """
  240. assert self.__class__ != ProximateTokensRule, \
  241. "ProximateTokensRule is an abstract base class"
  242. self._original = original_tag
  243. self._replacement = replacement_tag
  244. self._conditions = conditions
  245. for (s,e,v) in conditions:
  246. if s>e:
  247. raise ValueError('Condition %s has an invalid range' %
  248. ((s,e,v),))
  249. def extract_property(token): # [staticmethod]
  250. """
  251. Returns some property characterizing this token, such as its
  252. base lexical item or its tag.
  253. Each implentation of this method should correspond to an
  254. implementation of the method with the same name in a subclass
  255. of L{ProximateTokensTemplate}.
  256. @param token: The token
  257. @type token: Token
  258. @return: The property
  259. @rtype: any
  260. """
  261. assert False, "ProximateTokensRule is an abstract interface"
  262. extract_property = staticmethod(extract_property)
  263. def apply_at(self, tokens, positions):
  264. # Inherit docs from BrillRuleI
  265. # Find all locations where the rule is applicable
  266. change = []
  267. for i in positions:
  268. if self.applies(tokens, i):
  269. change.append(i)
  270. # Make the changes. Note: this must be done in a separate
  271. # step from finding applicable locations, since we don't want
  272. # the rule to interact with itself.
  273. for i in change:
  274. (token, tag) = tokens[i]
  275. tokens[i] = (token, self._replacement)
  276. return change
  277. def applies(self, tokens, index):
  278. # Inherit docs from BrillRuleI
  279. # Does the given token have this rule's "original tag"?
  280. if tokens[index][1] != self._original:
  281. return False
  282. # Check to make sure that every condition holds.
  283. for (start, end, val) in self._conditions:
  284. # Find the (absolute) start and end indices.
  285. s = max(0, index+start)
  286. e = min(index+end+1, len(tokens))
  287. # Look for *any* token that satisfies the condition.
  288. for i in range(s, e):
  289. if self.extract_property(tokens[i]) == val:
  290. break
  291. else:
  292. # No token satisfied the condition; return false.
  293. return False
  294. # Every condition checked out, so the rule is applicable.
  295. return True
  296. def original_tag(self):
  297. # Inherit docs from BrillRuleI
  298. return self._original
  299. def replacement_tag(self):
  300. # Inherit docs from BrillRuleI
  301. return self._replacement
  302. def __eq__(self, other):
  303. return (other != None and
  304. other.__class__ == self.__class__ and
  305. self._original == other._original and
  306. self._replacement == other._replacement and
  307. self._conditions == other._conditions)
  308. def __hash__(self):
  309. # Needs to include extract_property in order to distinguish subclasses
  310. # A nicer way would be welcome.
  311. return hash( (self._original, self._replacement, self._conditions,
  312. self.extract_property.func_code) )
  313. def __repr__(self):
  314. conditions = ' and '.join(['%s in %d...%d' % (v,s,e)
  315. for (s,e,v) in self._conditions])
  316. return '<%s: %s->%s if %s>' % (self.__class__.__name__,
  317. self._original, self._replacement,
  318. conditions)
  319. def __str__(self):
  320. replacement = '%s -> %s' % (self._original,
  321. self._replacement)
  322. if len(self._conditions) == 0:
  323. conditions = ''
  324. else:
  325. conditions = ' if '+ ', and '.join([self._condition_to_str(c)
  326. for c in self._conditions])
  327. return replacement+conditions
  328. def _condition_to_str(self, condition):
  329. """
  330. Return a string representation of the given condition.
  331. This helper method is used by L{__str__}.
  332. """
  333. (start, end, value) = condition
  334. return ('the %s of %s is %r' %
  335. (self.PROPERTY_NAME, self._range_to_str(start, end), value))
  336. def _range_to_str(self, start, end):
  337. """
  338. Return a string representation for the given range. This
  339. helper method is used by L{__str__}.
  340. """
  341. if start == end == 0:
  342. return 'this word'
  343. if start == end == -1:
  344. return 'the preceding word'
  345. elif start == end == 1:
  346. return 'the following word'
  347. elif start == end and start < 0:
  348. return 'word i-%d' % -start
  349. elif start == end and start > 0:
  350. return 'word i+%d' % start
  351. else:
  352. if start >= 0: start = '+%d' % start
  353. if end >= 0: end = '+%d' % end
  354. return 'words i%s...i%s' % (start, end)
  355. class ProximateTagsRule(ProximateTokensRule):
  356. """
  357. A rule which examines the tags of nearby tokens.
  358. @see: superclass L{ProximateTokensRule} for details.
  359. @see: L{ProximateTagsTemplate}, which generates these rules.
  360. """
  361. PROPERTY_NAME = 'tag' # for printing.
  362. def extract_property(token): # [staticmethod]
  363. """@return: The given token's tag."""
  364. return token[1]
  365. extract_property = staticmethod(extract_property)
  366. class ProximateWordsRule(ProximateTokensRule):
  367. """
  368. A rule which examines the base types of nearby tokens.
  369. @see: L{ProximateTokensRule} for details.
  370. @see: L{ProximateWordsTemplate}, which generates these rules.
  371. """
  372. PROPERTY_NAME = 'text' # for printing.
  373. def extract_property(token): # [staticmethod]
  374. """@return: The given token's text."""
  375. return token[0]
  376. extract_property = staticmethod(extract_property)
  377. ######################################################################
  378. ## Brill Templates
  379. ######################################################################
  380. class BrillTemplateI(object):
  381. """
  382. An interface for generating lists of transformational rules that
  383. apply at given corpus positions. C{BrillTemplateI} is used by
  384. C{Brill} training algorithms to generate candidate rules.
  385. """
  386. def __init__(self):
  387. raise AssertionError, "BrillTemplateI is an abstract interface"
  388. def applicable_rules(self, tokens, i, correctTag):
  389. """
  390. Return a list of the transformational rules that would correct
  391. the C{i}th subtoken's tag in the given token. In particular,
  392. return a list of zero or more rules that would change
  393. C{tagged_tokens[i][1]} to C{correctTag}, if applied
  394. to C{token}.
  395. If the C{i}th subtoken already has the correct tag (i.e., if
  396. C{tagged_tokens[i][1]} == C{correctTag}), then
  397. C{applicable_rules} should return the empty list.
  398. @param tokens: The tagged tokens being tagged.
  399. @type tokens: C{list} of C{tuple}
  400. @param i: The index of the token whose tag should be corrected.
  401. @type i: C{int}
  402. @param correctTag: The correct tag for the C{i}th token.
  403. @type correctTag: (any)
  404. @rtype: C{list} of L{BrillRuleI}
  405. """
  406. raise AssertionError, "BrillTemplateI is an abstract interface"
  407. def get_neighborhood(self, token, index):
  408. """
  409. Returns the set of indices C{i} such that
  410. C{applicable_rules(token, index, ...)} depends on the value of
  411. the C{i}th subtoken of C{token}.
  412. This method is used by the \"fast\" Brill tagger trainer.
  413. @param token: The tokens being tagged.
  414. @type token: C{list} of C{tuple}
  415. @param index: The index whose neighborhood should be returned.
  416. @type index: C{int}
  417. @rtype: C{Set}
  418. """
  419. raise AssertionError, "BrillTemplateI is an abstract interface"
  420. class ProximateTokensTemplate(BrillTemplateI):
  421. """
  422. An brill templates that generates a list of
  423. L{ProximateTokensRule}s that apply at a given corpus
  424. position. In particular, each C{ProximateTokensTemplate} is
  425. parameterized by a proximate token brill rule class and a list of
  426. boundaries, and generates all rules that:
  427. - use the given brill rule class
  428. - use the given list of boundaries as the C{start} and C{end}
  429. points for their conditions
  430. - are applicable to the given token.
  431. """
  432. def __init__(self, rule_class, *boundaries):
  433. """
  434. Construct a template for generating proximate token brill
  435. rules.
  436. @type rule_class: C{class}
  437. @param rule_class: The proximate token brill rule class that
  438. should be used to generate new rules. This class must be a
  439. subclass of L{ProximateTokensRule}.
  440. @type boundaries: C{tuple} of C{(int, int)}
  441. @param boundaries: A list of tuples C{(start, end)}, each of
  442. which specifies a range for which a condition should be
  443. created by each rule.
  444. @raise ValueError: If C{start}>C{end} for any boundary.
  445. """
  446. self._rule_class = rule_class
  447. self._boundaries = boundaries
  448. for (s,e) in boundaries:
  449. if s>e:
  450. raise ValueError('Boundary %s has an invalid range' %
  451. ((s,e),))
  452. def applicable_rules(self, tokens, index, correct_tag):
  453. if tokens[index][1] == correct_tag:
  454. return []
  455. # For each of this template's boundaries, Find the conditions
  456. # that are applicable for the given token.
  457. applicable_conditions = \
  458. [self._applicable_conditions(tokens, index, start, end)
  459. for (start, end) in self._boundaries]
  460. # Find all combinations of these applicable conditions. E.g.,
  461. # if applicable_conditions=[[A,B], [C,D]], then this will
  462. # generate [[A,C], [A,D], [B,C], [B,D]].
  463. condition_combos = [[]]
  464. for conditions in applicable_conditions:
  465. condition_combos = [old_conditions+[new_condition]
  466. for old_conditions in condition_combos
  467. for new_condition in conditions]
  468. # Translate the condition sets into rules.
  469. return [self._rule_class(tokens[index][1], correct_tag, *conds)
  470. for conds in condition_combos]
  471. def _applicable_conditions(self, tokens, index, start, end):
  472. """
  473. @return: A set of all conditions for proximate token rules
  474. that are applicable to C{tokens[index]}, given boundaries of
  475. C{(start, end)}. I.e., return a list of all tuples C{(start,
  476. end, M{value})}, such the property value of at least one token
  477. between M{index+start} and M{index+end} (inclusive) is
  478. M{value}.
  479. """
  480. conditions = set()
  481. s = max(0, index+start)
  482. e = min(index+end+1, len(tokens))
  483. for i in range(s, e):
  484. value = self._rule_class.extract_property(tokens[i])
  485. conditions.add( (start, end, value) )
  486. return conditions
  487. def get_neighborhood(self, tokens, index):
  488. # inherit docs from BrillTemplateI
  489. neighborhood = set([index])
  490. for (start, end) in self._boundaries:
  491. s = max(0, index+start)
  492. e = min(index+end+1, len(tokens))
  493. for i in range(s, e):
  494. neighborhood.add(i)
  495. return neighborhood
  496. class SymmetricProximateTokensTemplate(BrillTemplateI):
  497. """
  498. Simulates two L{ProximateTokensTemplate}s which are symmetric
  499. across the location of the token. For rules of the form \"If the
  500. M{n}th token is tagged C{A}, and any tag preceding B{or} following
  501. the M{n}th token by a distance between M{x} and M{y} is C{B}, and
  502. ... , then change the tag of the nth token from C{A} to C{C}.\"
  503. One C{ProximateTokensTemplate} is formed by passing in the
  504. same arguments given to this class's constructor: tuples
  505. representing intervals in which a tag may be found. The other
  506. C{ProximateTokensTemplate} is constructed with the negative
  507. of all the arguments in reversed order. For example, a
  508. C{SymmetricProximateTokensTemplate} using the pair (-2,-1) and the
  509. constructor C{ProximateTagsTemplate} generates the same rules as a
  510. C{ProximateTagsTemplate} using (-2,-1) plus a second
  511. C{ProximateTagsTemplate} using (1,2).
  512. This is useful because we typically don't want templates to
  513. specify only \"following\" or only \"preceding\"; we'd like our
  514. rules to be able to look in either direction.
  515. """
  516. def __init__(self, rule_class, *boundaries):
  517. """
  518. Construct a template for generating proximate token brill
  519. rules.
  520. @type rule_class: C{class}
  521. @param rule_class: The proximate token brill rule class that
  522. should be used to generate new rules. This class must be a
  523. subclass of L{ProximateTokensRule}.
  524. @type boundaries: C{tuple} of C{(int, int)}
  525. @param boundaries: A list of tuples C{(start, end)}, each of
  526. which specifies a range for which a condition should be
  527. created by each rule.
  528. @raise ValueError: If C{start}>C{end} for any boundary.
  529. """
  530. self._ptt1 = ProximateTokensTemplate(rule_class, *boundaries)
  531. reversed = [(-e,-s) for (s,e) in boundaries]
  532. self._ptt2 = ProximateTokensTemplate(rule_class, *reversed)
  533. # Generates lists of a subtype of ProximateTokensRule.
  534. def applicable_rules(self, tokens, index, correctTag):
  535. """
  536. See L{BrillTemplateI} for full specifications.
  537. @rtype: list of ProximateTokensRule
  538. """
  539. return (self._ptt1.applicable_rules(tokens, index, correctTag) +
  540. self._ptt2.applicable_rules(tokens, index, correctTag))
  541. def get_neighborhood(self, tokens, index):
  542. # inherit docs from BrillTemplateI
  543. n1 = self._ptt1.get_neighborhood(tokens, index)
  544. n2 = self._ptt2.get_neighborhood(tokens, index)
  545. return n1.union(n2)
  546. ######################################################################
  547. ## Brill Tagger Trainer
  548. ######################################################################
  549. class BrillTrainer(object):
  550. """
  551. A trainer for brill taggers.
  552. """
  553. def __init__(self, initial_tagger, templates, trace=0):
  554. self._initial_tagger = initial_tagger
  555. self._templates = templates
  556. self._trace = trace
  557. #////////////////////////////////////////////////////////////
  558. # Training
  559. #////////////////////////////////////////////////////////////
  560. def train(self, train_tokens, max_rules=200, min_score=2):
  561. """
  562. Trains the Brill tagger on the corpus C{train_token},
  563. producing at most C{max_rules} transformations, each of which
  564. reduces the net number of errors in the corpus by at least
  565. C{min_score}.
  566. @type train_tokens: C{list} of L{tuple}
  567. @param train_tokens: The corpus of tagged tokens
  568. @type max_rules: C{int}
  569. @param max_rules: The maximum number of transformations to be created
  570. @type min_score: C{int}
  571. @param min_score: The minimum acceptable net error reduction
  572. that each transformation must produce in the corpus.
  573. """
  574. if self._trace > 0: print ("Training Brill tagger on %d tokens..." %
  575. len(train_tokens))
  576. # Create a new copy of the training token, and run the initial
  577. # tagger on this. We will progressively update this test
  578. # token to look more like the training token.
  579. test_tokens = list(self._initial_tagger.tag(t[0] for t in train_tokens))
  580. if self._trace > 2: self._trace_header()
  581. # Look for useful rules.
  582. rules = []
  583. try:
  584. while len(rules) < max_rules:
  585. old_tags = [t[1] for t in test_tokens]
  586. (rule, score, fixscore) = self._best_rule(test_tokens,
  587. train_tokens)
  588. if rule is None or score < min_score:
  589. if self._trace > 1:
  590. print 'Insufficient improvement; stopping'
  591. break
  592. else:
  593. # Add the rule to our list of rules.
  594. rules.append(rule)
  595. # Use the rules to update the test token.
  596. k = rule.apply_to(test_tokens)
  597. # Display trace output.
  598. if self._trace > 1:
  599. self._trace_rule(rule, score, fixscore, len(k))
  600. # The user can also cancel training manually:
  601. except KeyboardInterrupt: pass
  602. # Create and return a tagger from the rules we found.
  603. return Brill(self._initial_tagger, rules)
  604. #////////////////////////////////////////////////////////////
  605. # Finding the best rule
  606. #////////////////////////////////////////////////////////////
  607. # Finds the rule that makes the biggest net improvement in the corpus.
  608. # Returns a (rule, score) pair.
  609. def _best_rule(self, test_tokens, train_tokens):
  610. # Create a dictionary mapping from each tag to a list of the
  611. # indices that have that tag in both test_tokens and
  612. # train_tokens (i.e., where it is correctly tagged).
  613. correct_indices = {}
  614. for i in range(len(test_tokens)):
  615. if test_tokens[i][1] == train_tokens[i][1]:
  616. tag = test_tokens[i][1]
  617. correct_indices.setdefault(tag, []).append(i)
  618. # Find all the rules that correct at least one token's tag,
  619. # and the number of tags that each rule corrects (in
  620. # descending order of number of tags corrected).
  621. rules = self._find_rules(test_tokens, train_tokens)
  622. # Keep track of the current best rule, and its score.
  623. best_rule, best_score, best_fixscore = None, 0, 0
  624. # Consider each rule, in descending order of fixscore (the
  625. # number of tags that the rule corrects, not including the
  626. # number that it breaks).
  627. for (rule, fixscore) in rules:
  628. # The actual score must be <= fixscore; so if best_score
  629. # is bigger than fixscore, then we already have the best
  630. # rule.
  631. if best_score >= fixscore:
  632. return best_rule, best_score, best_fixscore
  633. # Calculate the actual score, by decrementing fixscore
  634. # once for each tag that the rule changes to an incorrect
  635. # value.
  636. score = fixscore
  637. if correct_indices.has_key(rule.original_tag()):
  638. for i in correct_indices[rule.original_tag()]:
  639. if rule.applies(test_tokens, i):
  640. score -= 1
  641. # If the score goes below best_score, then we know
  642. # that this isn't the best rule; so move on:
  643. if score <= best_score: break
  644. #print '%5d %5d %s' % (fixscore, score, rule)
  645. # If the actual score is better than the best score, then
  646. # update best_score and best_rule.
  647. if score > best_score:
  648. best_rule, best_score, best_fixscore = rule, score, fixscore
  649. # Return the best rule, and its score.
  650. return best_rule, best_score, best_fixscore
  651. def _find_rules(self, test_tokens, train_tokens):
  652. """
  653. Find all rules that correct at least one token's tag in
  654. C{test_tokens}.
  655. @return: A list of tuples C{(rule, fixscore)}, where C{rule}
  656. is a brill rule and C{fixscore} is the number of tokens
  657. whose tag the rule corrects. Note that C{fixscore} does
  658. I{not} include the number of tokens whose tags are changed
  659. to incorrect values.
  660. """
  661. # Create a list of all indices that are incorrectly tagged.
  662. error_indices = [i for i in range(len(test_tokens))
  663. if (test_tokens[i][1] !=
  664. train_tokens[i][1])]
  665. # Create a dictionary mapping from rules to their positive-only
  666. # scores.
  667. rule_score_dict = {}
  668. for i in range(len(test_tokens)):
  669. rules = self._find_rules_at(test_tokens, train_tokens, i)
  670. for rule in rules:
  671. rule_score_dict[rule] = rule_score_dict.get(rule,0) + 1
  672. # Convert the dictionary into a list of (rule, score) tuples,
  673. # sorted in descending order of score.
  674. rule_score_items = rule_score_dict.items()
  675. temp = [(-score, rule) for (rule, score) in rule_score_items]
  676. temp.sort()
  677. return [(rule, -negscore) for (negscore, rule) in temp]
  678. def _find_rules_at(self, test_tokens, train_tokens, i):
  679. """
  680. @rtype: C{Set}
  681. @return: the set of all rules (based on the templates) that
  682. correct token C{i}'s tag in C{test_tokens}.
  683. """
  684. applicable_rules = set()
  685. if test_tokens[i][1] != train_tokens[i][1]:
  686. correct_tag = train_tokens[i][1]
  687. for template in self._templates:
  688. new_rules = template.applicable_rules(test_tokens, i,
  689. correct_tag)
  690. applicable_rules.update(new_rules)
  691. return applicable_rules
  692. #////////////////////////////////////////////////////////////
  693. # Tracing
  694. #////////////////////////////////////////////////////////////
  695. def _trace_header(self):
  696. print """
  697. B |
  698. S F r O | Score = Fixed - Broken
  699. c i o t | R Fixed = num tags changed incorrect -> correct
  700. o x k h | u Broken = num tags changed correct -> incorrect
  701. r e e e | l Other = num tags changed incorrect -> incorrect
  702. e d n r | e
  703. ------------------+-------------------------------------------------------
  704. """.rstrip()
  705. def _trace_rule(self, rule, score, fixscore, numchanges):
  706. if self._trace > 2:
  707. print ('%4d%4d%4d%4d ' % (score, fixscore, fixscore-score,
  708. numchanges-fixscore*2+score)), '|',
  709. print rule
  710. ######################################################################
  711. ## Fast Brill Tagger Trainer
  712. ######################################################################
class FastBrillTrainer(object):
    """
    A faster trainer for brill taggers.

    Instead of rescoring every candidate rule after each rule application,
    this trainer maintains incremental indices that map each candidate rule
    to the corpus positions where it applies (and vice versa), so that after
    a rule is applied only the neighborhoods of the changed positions need
    to be re-examined.
    """
    def __init__(self, initial_tagger, templates, trace=0):
        # initial_tagger: tagger used to produce the starting tag sequence
        # templates: rule templates used to generate candidate rules
        # trace: verbosity level for diagnostic output (higher = chattier)
        self._initial_tagger = initial_tagger
        self._templates = templates
        self._trace = trace

    #////////////////////////////////////////////////////////////
    # Training
    #////////////////////////////////////////////////////////////

    def train(self, train_tokens, max_rules=200, min_score=2):
        """
        Learn an ordered list of transformation rules from the correctly
        tagged corpus C{train_tokens}, and return a L{Brill} tagger built
        from the initial tagger plus those rules.

        @param train_tokens: the correctly tagged training corpus, as a
            list of C{(word, tag)} tuples
        @param max_rules: stop after selecting this many rules
        @param min_score: stop when no remaining candidate rule's net
            score reaches this threshold
        """
        # If TESTING is true, extra computation is done to determine whether
        # each "best" rule actually reduces net error by the score it received.
        TESTING = False

        # Basic idea: Keep track of the rules that apply at each position.
        # And keep track of the positions to which each rule applies.

        # The set of somewhere-useful rules that apply at each position
        rulesByPosition = []
        for i in range(len(train_tokens)):
            rulesByPosition.append(set())

        # Mapping somewhere-useful rules to the positions where they apply.
        # Then maps each position to the score change the rule generates there.
        # (always -1, 0, or 1)
        positionsByRule = {}

        # Map scores to sets of rules known to achieve *at most* that score.
        rulesByScore = {0:{}}
        # Conversely, map somewhere-useful rules to their minimal scores.
        ruleScores = {}

        tagIndices = {}  # Lists of indices, mapped to by their tags

        # Maps rules to the first index in the corpus where it may not be known
        # whether the rule applies.  (Rules can't be chosen for inclusion
        # unless this value = len(corpus).  But most rules are bad, and
        # we won't need to check the whole corpus to know that.)
        # Some indices past this may actually have been checked; it just isn't
        # guaranteed.
        firstUnknownIndex = {}

        # Make entries in the rule-mapping dictionaries.
        # Should be called before _updateRuleApplies.
        def _initRule (rule):
            positionsByRule[rule] = {}
            rulesByScore[0][rule] = None
            ruleScores[rule] = 0
            firstUnknownIndex[rule] = 0

        # Takes a somewhere-useful rule which applies at index i;
        # Updates all rule data to reflect that the rule so applies.
        def _updateRuleApplies (rule, i):
            # If the rule is already known to apply here, ignore.
            # (This only happens if the position's tag hasn't changed.)
            if positionsByRule[rule].has_key(i):
                return
            if rule.replacement_tag() == train_tokens[i][1]:
                positionsByRule[rule][i] = 1       # fixes an error here
            elif rule.original_tag() == train_tokens[i][1]:
                positionsByRule[rule][i] = -1      # breaks a correct tag here
            else: # was wrong, remains wrong
                positionsByRule[rule][i] = 0
            # Update rules in the other dictionaries
            del rulesByScore[ruleScores[rule]][rule]
            ruleScores[rule] += positionsByRule[rule][i]
            if not rulesByScore.has_key(ruleScores[rule]):
                rulesByScore[ruleScores[rule]] = {}
            rulesByScore[ruleScores[rule]][rule] = None
            rulesByPosition[i].add(rule)

        # Takes a rule which no longer applies at index i;
        # Updates all rule data to reflect that the rule doesn't apply.
        def _updateRuleNotApplies (rule, i):
            del rulesByScore[ruleScores[rule]][rule]
            ruleScores[rule] -= positionsByRule[rule][i]
            if not rulesByScore.has_key(ruleScores[rule]):
                rulesByScore[ruleScores[rule]] = {}
            rulesByScore[ruleScores[rule]][rule] = None
            del positionsByRule[rule][i]
            rulesByPosition[i].remove(rule)
            # Optional addition: if the rule now applies nowhere, delete
            # all its dictionary entries.

        # Tag the corpus with the initial tagger; training compares these
        # tags against the gold tags in train_tokens.
        tagged_tokens = list(self._initial_tagger.tag(t[0] for t in train_tokens))

        # First sort the corpus by tag, and also note where the errors are.
        errorIndices = []  # only used in initialization
        for i in range(len(tagged_tokens)):
            tag = tagged_tokens[i][1]
            if tag != train_tokens[i][1]:
                errorIndices.append(i)
            if not tagIndices.has_key(tag):
                tagIndices[tag] = []
            tagIndices[tag].append(i)

        print "Finding useful rules..."
        # Collect all rules that fix any errors, with their positive scores.
        for i in errorIndices:
            for template in self._templates:
                # Find the templated rules that could fix the error.
                for rule in template.applicable_rules(tagged_tokens, i,
                                                      train_tokens[i][1]):
                    if not positionsByRule.has_key(rule):
                        _initRule(rule)
                    _updateRuleApplies(rule, i)
        print "Done initializing %i useful rules." %len(positionsByRule)

        if TESTING:
            after = -1 # bug-check only

        # Each iteration through the loop tries a new maxScore.
        maxScore = max(rulesByScore.keys())
        rules = []
        while len(rules) < max_rules and maxScore >= min_score:

            # Find the next best rule.  This is done by repeatedly taking a rule with
            # the highest score and stepping through the corpus to see where it
            # applies.  When it makes an error (decreasing its score) it's bumped
            # down, and we try a new rule with the highest score.
            # When we find a rule which has the highest score AND which has been
            # tested against the entire corpus, we can conclude that it's the next
            # best rule.

            bestRule = None
            bestRules = rulesByScore[maxScore].keys()

            for rule in bestRules:
                # Find the first relevant index at or following the first
                # unknown index.  (Only check indices with the right tag.)
                ti = bisect.bisect_left(tagIndices[rule.original_tag()],
                                        firstUnknownIndex[rule])
                for nextIndex in tagIndices[rule.original_tag()][ti:]:
                    if rule.applies(tagged_tokens, nextIndex):
                        _updateRuleApplies(rule, nextIndex)
                        if ruleScores[rule] < maxScore:
                            firstUnknownIndex[rule] = nextIndex+1
                            break # the _update demoted the rule

                # If we checked all remaining indices and found no more errors:
                if ruleScores[rule] == maxScore:
                    firstUnknownIndex[rule] = len(tagged_tokens) # i.e., we checked them all
                    print "%i) %s (score: %i)" %(len(rules)+1, rule, maxScore)
                    bestRule = rule
                    break

            if bestRule == None: # all rules dropped below maxScore
                del rulesByScore[maxScore]
                maxScore = max(rulesByScore.keys())
                continue # with next-best rules

            # bug-check only
            if TESTING:
                before = len(_errorPositions(tagged_tokens, train_tokens))
                print "There are %i errors before applying this rule." %before
                assert after == -1 or before == after, \
                        "after=%i but before=%i" %(after,before)

            print "Applying best rule at %i locations..." \
                    %len(positionsByRule[bestRule].keys())

            # If we reach this point, we've found a new best rule.
            # Apply the rule at the relevant sites.
            # (apply_at is a little inefficient here, since we know the rule applies
            # and don't actually need to test it again.)
            rules.append(bestRule)
            bestRule.apply_at(tagged_tokens, positionsByRule[bestRule].keys())

            # Update the tag index accordingly.
            for i in positionsByRule[bestRule].keys(): # where it applied
                # Update positions of tags
                # First, find and delete the index for i from the old tag.
                oldIndex = bisect.bisect_left(tagIndices[bestRule.original_tag()], i)
                del tagIndices[bestRule.original_tag()][oldIndex]

                # Then, insert i into the index list of the new tag.
                if not tagIndices.has_key(bestRule.replacement_tag()):
                    tagIndices[bestRule.replacement_tag()] = []
                newIndex = bisect.bisect_left(tagIndices[bestRule.replacement_tag()], i)
                tagIndices[bestRule.replacement_tag()].insert(newIndex, i)

            # This part is tricky.
            # We need to know which sites might now require new rules -- that
            # is, which sites are close enough to the changed site so that
            # a template might now generate different rules for it.
            # Only the templates can know this.
            #
            # If a template now generates a different set of rules, we have
            # to update our indices to reflect that.
            print "Updating neighborhoods of changed sites.\n"

            # First, collect all the indices that might get new rules.
            neighbors = set()
            for i in positionsByRule[bestRule].keys(): # sites changed
                for template in self._templates:
                    neighbors.update(template.get_neighborhood(tagged_tokens, i))

            # Then collect the new set of rules for each such index.
            # c/d/e count obsolete applications, new applications, and
            # previously-unseen rules, respectively (diagnostics only).
            c = d = e = 0
            for i in neighbors:
                siteRules = set()
                for template in self._templates:
                    # Get a set of the rules that the template now generates
                    siteRules.update(set(template.applicable_rules(
                            tagged_tokens, i, train_tokens[i][1])))

                # Update rules no longer generated here by any template
                for obsolete in rulesByPosition[i] - siteRules:
                    c += 1
                    _updateRuleNotApplies(obsolete, i)

                # Update rules only now generated by this template
                for newRule in siteRules - rulesByPosition[i]:
                    d += 1
                    if not positionsByRule.has_key(newRule):
                        e += 1
                        _initRule(newRule) # make a new rule w/score=0
                    _updateRuleApplies(newRule, i) # increment score, etc.

            if TESTING:
                after = before - maxScore
            print "%i obsolete rule applications, %i new ones, " %(c,d)+ \
                    "using %i previously-unseen rules." %e

            maxScore = max(rulesByScore.keys()) # may have gone up

        # NOTE(review): the two statements below appear to be vestigial --
        # the trace message announces training *after* it has finished, and
        # rules_by_position is never used.  Candidates for removal; kept to
        # preserve behavior.
        if self._trace > 0: print ("Training Brill tagger on %d tokens..." %
                                   len(train_tokens))

        # Maintain a list of the rules that apply at each position.
        rules_by_position = [{} for tok in train_tokens]

        # Create and return a tagger from the rules we found.
        return Brill(self._initial_tagger, rules)
  915. ######################################################################
  916. ## Testing
  917. ######################################################################
  918. def _errorPositions (train_tokens, tokens):
  919. return [i for i in range(len(tokens))
  920. if tokens[i][1] !=
  921. train_tokens[i][1] ]
  922. # returns a list of errors in string format
  923. def errorList (train_tokens, tokens, radius=2):
  924. """
  925. Returns a list of human-readable strings indicating the errors in the
  926. given tagging of the corpus.
  927. @param train_tokens: The correct tagging of the corpus
  928. @type train_tokens: C{list} of C{tuple}
  929. @param tokens: The tagged corpus
  930. @type tokens: C{list} of C{tuple}
  931. @param radius: How many tokens on either side of a wrongly-tagged token
  932. to include in the error string. For example, if C{radius}=2, each error
  933. string will show the incorrect token plus two tokens on either side.
  934. @type radius: int
  935. """
  936. errors = []
  937. indices = _errorPositions(train_tokens, tokens)
  938. tokenLen = len(tokens)
  939. for i in indices:
  940. ei = tokens[i][1].rjust(3) + " -> " \
  941. + train_tokens[i][1].rjust(3) + ": "
  942. for j in range( max(i-radius, 0), min(i+radius+1, tokenLen) ):
  943. if tokens[j][0] == tokens[j][1]:
  944. s = tokens[j][0] # don't print punctuation tags
  945. else:
  946. s = tokens[j][0] + "/" + tokens[j][1]
  947. if j == i:
  948. ei += "**"+s+"** "
  949. else:
  950. ei += s + " "
  951. errors.append(ei)
  952. return errors
  953. #####################################################################################
  954. # Demonstration
  955. #####################################################################################
  956. def demo(num_sents=100, max_rules=200, min_score=2, error_output = "errors.out",
  957. rule_output="rules.out", randomize=False, train=.8, trace=3):
  958. """
  959. Brill Tagger Demonstration
  960. @param num_sents: how many sentences of training and testing data to use
  961. @type num_sents: L{int}
  962. @param max_rules: maximum number of rule instances to create
  963. @type max_rules: L{int}
  964. @param min_score: the minimum score for a rule in order for it to be considered
  965. @type min_score: L{int}
  966. @param error_output: the file where errors will be saved
  967. @type error_output: L{string}
  968. @param rule_output: the file where rules will be saved
  969. @type rule_output: L{string}
  970. @param randomize: whether the training data should be a random subset of the corpus
  971. @type randomize: L{boolean}
  972. @param train: the fraction of the the corpus to be used for training (1=all)
  973. @type train: L{float}
  974. @param trace: the level of diagnostic tracing output to produce (0-3)
  975. @type trace: L{int}
  976. """
  977. from nltk.corpus import treebank
  978. from nltk import tag
  979. from nltk.tag import brill
  980. NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])
  981. # train is the proportion of data used in training; the rest is reserved
  982. # for testing.
  983. print "Loading tagged data..."
  984. sents = []
  985. for item in treebank.items:
  986. sents.extend(treebank.tagged(item))
  987. if randomize:
  988. random.seed(len(sents))
  989. random.shuffle(sents)
  990. tagged_data = [t for s in sents[:num_sents] for t in s]
  991. cutoff = int(len(tagged_data)*train)
  992. training_data = tagged_data[:cutoff]
  993. gold_data = tagged_data[cutoff:]
  994. testing_data = [t[0] for t in gold_data]
  995. # Unigram tagger
  996. print "Training unigram tagger:",
  997. u = tag.Unigram(backoff=NN_CD_tagger)
  998. # NB training and testing are required to use a list-of-lists structure,
  999. # so we wrap the flattened corpus data with the extra list structure.
  1000. u.train([training_data])
  1001. print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))
  1002. # Brill tagger
  1003. templates = [
  1004. brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
  1005. brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
  1006. brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
  1007. brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
  1008. brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
  1009. brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
  1010. brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
  1011. brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
  1012. brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
  1013. brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1)),
  1014. ]
  1015. #trainer = brill.FastBrillTrainer(u, templates, trace)
  1016. trainer = brill.BrillTrainer(u, templates, trace)
  1017. b = trainer.train(training_data, max_rules, min_score)
  1018. print
  1019. print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))
  1020. print("\nRules: ")
  1021. printRules = file(rule_output, 'w')
  1022. for rule in b.rules():
  1023. print(str(rule))
  1024. printRules.write(str(rule)+"\n\n")
  1025. testing_data = list(b.tag(testing_data))
  1026. el = errorList(gold_data, testing_data)
  1027. errorFile =

Large files files are truncated, but you can click here to view the full file