PageRenderTime 52ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 1ms

/nltk/tag/sequential.py

https://github.com/BrucePHill/nltk
Python | 672 lines | 549 code | 19 blank | 104 comment | 15 complexity | b67a423d2dcd926bf7b2cd86d8948504 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: Sequential Backoff Taggers
  2. #
  3. # Copyright (C) 2001-2013 NLTK Project
  4. # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
  5. # Steven Bird <stevenbird1@gmail.com> (minor additions)
  6. # Tiago Tresoldi <tresoldi@users.sf.net> (original affix tagger)
  7. # URL: <http://www.nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. Classes for tagging sentences sequentially, left to right. The
  11. abstract base class SequentialBackoffTagger serves as the base
  12. class for all the taggers in this module. Tagging of individual words
  13. is performed by the method ``choose_tag()``, which is defined by
  14. subclasses of SequentialBackoffTagger. If a tagger is unable to
  15. determine a tag for the specified token, then its backoff tagger is
  16. consulted instead. Any SequentialBackoffTagger may serve as a
  17. backoff tagger for any other SequentialBackoffTagger.
  18. """
  19. from __future__ import print_function, unicode_literals
  20. import re
  21. import yaml
  22. from nltk.probability import FreqDist, ConditionalFreqDist
  23. from nltk.classify.naivebayes import NaiveBayesClassifier
  24. from nltk.compat import python_2_unicode_compatible
  25. from nltk.tag.api import TaggerI, FeaturesetTaggerI
  26. ######################################################################
  27. #{ Abstract Base Classes
  28. ######################################################################
  29. class SequentialBackoffTagger(TaggerI):
  30. """
  31. An abstract base class for taggers that tags words sequentially,
  32. left to right. Tagging of individual words is performed by the
  33. ``choose_tag()`` method, which should be defined by subclasses. If
  34. a tagger is unable to determine a tag for the specified token,
  35. then its backoff tagger is consulted.
  36. :ivar _taggers: A list of all the taggers that should be tried to
  37. tag a token (i.e., self and its backoff taggers).
  38. """
  39. def __init__(self, backoff=None):
  40. if backoff is None:
  41. self._taggers = [self]
  42. else:
  43. self._taggers = [self] + backoff._taggers
  44. @property
  45. def backoff(self):
  46. """The backoff tagger for this tagger."""
  47. return (self._taggers[1] if len(self._taggers) > 1 else None)
  48. def tag(self, tokens):
  49. # docs inherited from TaggerI
  50. tags = []
  51. for i in range(len(tokens)):
  52. tags.append(self.tag_one(tokens, i, tags))
  53. return list(zip(tokens, tags))
  54. def tag_one(self, tokens, index, history):
  55. """
  56. Determine an appropriate tag for the specified token, and
  57. return that tag. If this tagger is unable to determine a tag
  58. for the specified token, then its backoff tagger is consulted.
  59. :rtype: str
  60. :type tokens: list
  61. :param tokens: The list of words that are being tagged.
  62. :type index: int
  63. :param index: The index of the word whose tag should be
  64. returned.
  65. :type history: list(str)
  66. :param history: A list of the tags for all words before *index*.
  67. """
  68. tag = None
  69. for tagger in self._taggers:
  70. tag = tagger.choose_tag(tokens, index, history)
  71. if tag is not None: break
  72. return tag
  73. def choose_tag(self, tokens, index, history):
  74. """
  75. Decide which tag should be used for the specified token, and
  76. return that tag. If this tagger is unable to determine a tag
  77. for the specified token, return None -- do not consult
  78. the backoff tagger. This method should be overridden by
  79. subclasses of SequentialBackoffTagger.
  80. :rtype: str
  81. :type tokens: list
  82. :param tokens: The list of words that are being tagged.
  83. :type index: int
  84. :param index: The index of the word whose tag should be
  85. returned.
  86. :type history: list(str)
  87. :param history: A list of the tags for all words before *index*.
  88. """
  89. raise NotImplementedError()
  90. @python_2_unicode_compatible
  91. class ContextTagger(SequentialBackoffTagger):
  92. """
  93. An abstract base class for sequential backoff taggers that choose
  94. a tag for a token based on the value of its "context". Different
  95. subclasses are used to define different contexts.
  96. A ContextTagger chooses the tag for a token by calculating the
  97. token's context, and looking up the corresponding tag in a table.
  98. This table can be constructed manually; or it can be automatically
  99. constructed based on a training corpus, using the ``_train()``
  100. factory method.
  101. :ivar _context_to_tag: Dictionary mapping contexts to tags.
  102. """
  103. def __init__(self, context_to_tag, backoff=None):
  104. """
  105. :param context_to_tag: A dictionary mapping contexts to tags.
  106. :param backoff: The backoff tagger that should be used for this tagger.
  107. """
  108. SequentialBackoffTagger.__init__(self, backoff)
  109. self._context_to_tag = (context_to_tag if context_to_tag else {})
  110. def context(self, tokens, index, history):
  111. """
  112. :return: the context that should be used to look up the tag
  113. for the specified token; or None if the specified token
  114. should not be handled by this tagger.
  115. :rtype: (hashable)
  116. """
  117. raise NotImplementedError()
  118. def choose_tag(self, tokens, index, history):
  119. context = self.context(tokens, index, history)
  120. return self._context_to_tag.get(context)
  121. def size(self):
  122. """
  123. :return: The number of entries in the table used by this
  124. tagger to map from contexts to tags.
  125. """
  126. return len(self._context_to_tag)
  127. def __repr__(self):
  128. return '<%s: size=%d>' % (self.__class__.__name__, self.size())
  129. def _train(self, tagged_corpus, cutoff=0, verbose=False):
  130. """
  131. Initialize this ContextTagger's ``_context_to_tag`` table
  132. based on the given training data. In particular, for each
  133. context ``c`` in the training data, set
  134. ``_context_to_tag[c]`` to the most frequent tag for that
  135. context. However, exclude any contexts that are already
  136. tagged perfectly by the backoff tagger(s).
  137. The old value of ``self._context_to_tag`` (if any) is discarded.
  138. :param tagged_corpus: A tagged corpus. Each item should be
  139. a list of (word, tag tuples.
  140. :param cutoff: If the most likely tag for a context occurs
  141. fewer than cutoff times, then exclude it from the
  142. context-to-tag table for the new tagger.
  143. """
  144. token_count = hit_count = 0
  145. # A context is considered 'useful' if it's not already tagged
  146. # perfectly by the backoff tagger.
  147. useful_contexts = set()
  148. # Count how many times each tag occurs in each context.
  149. fd = ConditionalFreqDist()
  150. for sentence in tagged_corpus:
  151. tokens, tags = zip(*sentence)
  152. for index, (token, tag) in enumerate(sentence):
  153. # Record the event.
  154. token_count += 1
  155. context = self.context(tokens, index, tags[:index])
  156. if context is None: continue
  157. fd[context].inc(tag)
  158. # If the backoff got it wrong, this context is useful:
  159. if (self.backoff is None or
  160. tag != self.backoff.tag_one(tokens, index, tags[:index])):
  161. useful_contexts.add(context)
  162. # Build the context_to_tag table -- for each context, figure
  163. # out what the most likely tag is. Only include contexts that
  164. # we've seen at least `cutoff` times.
  165. for context in useful_contexts:
  166. best_tag = fd[context].max()
  167. hits = fd[context][best_tag]
  168. if hits > cutoff:
  169. self._context_to_tag[context] = best_tag
  170. hit_count += hits
  171. # Display some stats, if requested.
  172. if verbose:
  173. size = len(self._context_to_tag)
  174. backoff = 100 - (hit_count * 100.0)/ token_count
  175. pruning = 100 - (size * 100.0) / len(fd.conditions())
  176. print("[Trained Unigram tagger:", end=' ')
  177. print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
  178. size, backoff, pruning))
  179. ######################################################################
  180. #{ Tagger Classes
  181. ######################################################################
  182. @python_2_unicode_compatible
  183. class DefaultTagger(SequentialBackoffTagger, yaml.YAMLObject):
  184. """
  185. A tagger that assigns the same tag to every token.
  186. >>> from nltk.tag.sequential import DefaultTagger
  187. >>> default_tagger = DefaultTagger('NN')
  188. >>> list(default_tagger.tag('This is a test'.split()))
  189. [('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')]
  190. This tagger is recommended as a backoff tagger, in cases where
  191. a more powerful tagger is unable to assign a tag to the word
  192. (e.g. because the word was not seen during training).
  193. :param tag: The tag to assign to each token
  194. :type tag: str
  195. """
  196. yaml_tag = '!nltk.DefaultTagger'
  197. def __init__(self, tag):
  198. self._tag = tag
  199. SequentialBackoffTagger.__init__(self, None)
  200. def choose_tag(self, tokens, index, history):
  201. return self._tag # ignore token and history
  202. def __repr__(self):
  203. return '<DefaultTagger: tag=%s>' % self._tag
  204. class NgramTagger(ContextTagger, yaml.YAMLObject):
  205. """
  206. A tagger that chooses a token's tag based on its word string and
  207. on the preceding n word's tags. In particular, a tuple
  208. (tags[i-n:i-1], words[i]) is looked up in a table, and the
  209. corresponding tag is returned. N-gram taggers are typically
  210. trained on a tagged corpus.
  211. Train a new NgramTagger using the given training data or
  212. the supplied model. In particular, construct a new tagger
  213. whose table maps from each context (tag[i-n:i-1], word[i])
  214. to the most frequent tag for that context. But exclude any
  215. contexts that are already tagged perfectly by the backoff
  216. tagger.
  217. :param train: A tagged corpus consisting of a list of tagged
  218. sentences, where each sentence is a list of (word, tag) tuples.
  219. :param backoff: A backoff tagger, to be used by the new
  220. tagger if it encounters an unknown context.
  221. :param cutoff: If the most likely tag for a context occurs
  222. fewer than *cutoff* times, then exclude it from the
  223. context-to-tag table for the new tagger.
  224. """
  225. yaml_tag = '!nltk.NgramTagger'
  226. def __init__(self, n, train=None, model=None,
  227. backoff=None, cutoff=0, verbose=False):
  228. self._n = n
  229. self._check_params(train, model)
  230. ContextTagger.__init__(self, model, backoff)
  231. if train:
  232. self._train(train, cutoff, verbose)
  233. def context(self, tokens, index, history):
  234. tag_context = tuple(history[max(0,index-self._n+1):index])
  235. return (tag_context, tokens[index])
  236. class UnigramTagger(NgramTagger):
  237. """
  238. Unigram Tagger
  239. The UnigramTagger finds the most likely tag for each word in a training
  240. corpus, and then uses that information to assign tags to new tokens.
  241. >>> from nltk.corpus import brown
  242. >>> from nltk.tag.sequential import UnigramTagger
  243. >>> test_sent = brown.sents(categories='news')[0]
  244. >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
  245. >>> for tok, tag in unigram_tagger.tag(test_sent):
  246. ... print("(%s, %s), " % (tok, tag))
  247. (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
  248. (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
  249. (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
  250. (primary, NN), (election, NN), (produced, VBD), (``, ``),
  251. (no, AT), (evidence, NN), ('', ''), (that, CS), (any, DTI),
  252. (irregularities, NNS), (took, VBD), (place, NN), (., .),
  253. :param train: The corpus of training data, a list of tagged sentences
  254. :type train: list(list(tuple(str, str)))
  255. :param model: The tagger model
  256. :type model: dict
  257. :param backoff: Another tagger which this tagger will consult when it is
  258. unable to tag a word
  259. :type backoff: TaggerI
  260. :param cutoff: The number of instances of training data the tagger must see
  261. in order not to use the backoff tagger
  262. :type cutoff: int
  263. """
  264. yaml_tag = '!nltk.UnigramTagger'
  265. def __init__(self, train=None, model=None,
  266. backoff=None, cutoff=0, verbose=False):
  267. NgramTagger.__init__(self, 1, train, model,
  268. backoff, cutoff, verbose)
  269. def context(self, tokens, index, history):
  270. return tokens[index]
  271. class BigramTagger(NgramTagger):
  272. """
  273. A tagger that chooses a token's tag based its word string and on
  274. the preceding words' tag. In particular, a tuple consisting
  275. of the previous tag and the word is looked up in a table, and
  276. the corresponding tag is returned.
  277. :param train: The corpus of training data, a list of tagged sentences
  278. :type train: list(list(tuple(str, str)))
  279. :param model: The tagger model
  280. :type model: dict
  281. :param backoff: Another tagger which this tagger will consult when it is
  282. unable to tag a word
  283. :type backoff: TaggerI
  284. :param cutoff: The number of instances of training data the tagger must see
  285. in order not to use the backoff tagger
  286. :type cutoff: int
  287. """
  288. yaml_tag = '!nltk.BigramTagger'
  289. def __init__(self, train=None, model=None,
  290. backoff=None, cutoff=0, verbose=False):
  291. NgramTagger.__init__(self, 2, train, model,
  292. backoff, cutoff, verbose)
  293. class TrigramTagger(NgramTagger):
  294. """
  295. A tagger that chooses a token's tag based its word string and on
  296. the preceding two words' tags. In particular, a tuple consisting
  297. of the previous two tags and the word is looked up in a table, and
  298. the corresponding tag is returned.
  299. :param train: The corpus of training data, a list of tagged sentences
  300. :type train: list(list(tuple(str, str)))
  301. :param model: The tagger model
  302. :type model: dict
  303. :param backoff: Another tagger which this tagger will consult when it is
  304. unable to tag a word
  305. :type backoff: TaggerI
  306. :param cutoff: The number of instances of training data the tagger must see
  307. in order not to use the backoff tagger
  308. :type cutoff: int
  309. """
  310. yaml_tag = '!nltk.TrigramTagger'
  311. def __init__(self, train=None, model=None,
  312. backoff=None, cutoff=0, verbose=False):
  313. NgramTagger.__init__(self, 3, train, model,
  314. backoff, cutoff, verbose)
  315. class AffixTagger(ContextTagger, yaml.YAMLObject):
  316. """
  317. A tagger that chooses a token's tag based on a leading or trailing
  318. substring of its word string. (It is important to note that these
  319. substrings are not necessarily "true" morphological affixes). In
  320. particular, a fixed-length substring of the word is looked up in a
  321. table, and the corresponding tag is returned. Affix taggers are
  322. typically constructed by training them on a tagged corpus.
  323. Construct a new affix tagger.
  324. :param affix_length: The length of the affixes that should be
  325. considered during training and tagging. Use negative
  326. numbers for suffixes.
  327. :param min_stem_length: Any words whose length is less than
  328. min_stem_length+abs(affix_length) will be assigned a
  329. tag of None by this tagger.
  330. """
  331. yaml_tag = '!nltk.AffixTagger'
  332. def __init__(self, train=None, model=None, affix_length=-3,
  333. min_stem_length=2, backoff=None, cutoff=0, verbose=False):
  334. self._check_params(train, model)
  335. ContextTagger.__init__(self, model, backoff)
  336. self._affix_length = affix_length
  337. self._min_word_length = min_stem_length + abs(affix_length)
  338. if train:
  339. self._train(train, cutoff, verbose)
  340. def context(self, tokens, index, history):
  341. token = tokens[index]
  342. if len(token) < self._min_word_length:
  343. return None
  344. elif self._affix_length > 0:
  345. return token[:self._affix_length]
  346. else:
  347. return token[self._affix_length:]
  348. @python_2_unicode_compatible
  349. class RegexpTagger(SequentialBackoffTagger, yaml.YAMLObject):
  350. """
  351. Regular Expression Tagger
  352. The RegexpTagger assigns tags to tokens by comparing their
  353. word strings to a series of regular expressions. The following tagger
  354. uses word suffixes to make guesses about the correct Brown Corpus part
  355. of speech tag:
  356. >>> from nltk.corpus import brown
  357. >>> from nltk.tag.sequential import RegexpTagger
  358. >>> test_sent = brown.sents(categories='news')[0]
  359. >>> regexp_tagger = RegexpTagger(
  360. ... [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
  361. ... (r'(The|the|A|a|An|an)$', 'AT'), # articles
  362. ... (r'.*able$', 'JJ'), # adjectives
  363. ... (r'.*ness$', 'NN'), # nouns formed from adjectives
  364. ... (r'.*ly$', 'RB'), # adverbs
  365. ... (r'.*s$', 'NNS'), # plural nouns
  366. ... (r'.*ing$', 'VBG'), # gerunds
  367. ... (r'.*ed$', 'VBD'), # past tense verbs
  368. ... (r'.*', 'NN') # nouns (default)
  369. ... ])
  370. >>> regexp_tagger
  371. <Regexp Tagger: size=9>
  372. >>> regexp_tagger.tag(test_sent)
  373. [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
  374. ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
  375. ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
  376. ('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'),
  377. ('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'),
  378. ('place', 'NN'), ('.', 'NN')]
  379. :type regexps: list(tuple(str, str))
  380. :param regexps: A list of ``(regexp, tag)`` pairs, each of
  381. which indicates that a word matching ``regexp`` should
  382. be tagged with ``tag``. The pairs will be evalutated in
  383. order. If none of the regexps match a word, then the
  384. optional backoff tagger is invoked, else it is
  385. assigned the tag None.
  386. """
  387. yaml_tag = '!nltk.RegexpTagger'
  388. def __init__(self, regexps, backoff=None):
  389. """
  390. """
  391. SequentialBackoffTagger.__init__(self, backoff)
  392. labels = ['g'+str(i) for i in range(len(regexps))]
  393. tags = [tag for regex, tag in regexps]
  394. self._map = dict(zip(labels, tags))
  395. regexps_labels = [(regex, label) for ((regex,tag),label) in zip(regexps,labels)]
  396. self._regexs = re.compile('|'.join('(?P<%s>%s)' % (label, regex) for regex,label in regexps_labels))
  397. self._size=len(regexps)
  398. def choose_tag(self, tokens, index, history):
  399. m = self._regexs.match(tokens[index])
  400. if m:
  401. return self._map[m.lastgroup]
  402. return None
  403. def __repr__(self):
  404. return '<Regexp Tagger: size=%d>' % self._size
  405. @python_2_unicode_compatible
  406. class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
  407. """
  408. A sequential tagger that uses a classifier to choose the tag for
  409. each token in a sentence. The featureset input for the classifier
  410. is generated by a feature detector function::
  411. feature_detector(tokens, index, history) -> featureset
  412. Where tokens is the list of unlabeled tokens in the sentence;
  413. index is the index of the token for which feature detection
  414. should be performed; and history is list of the tags for all
  415. tokens before index.
  416. Construct a new classifier-based sequential tagger.
  417. :param feature_detector: A function used to generate the
  418. featureset input for the classifier::
  419. feature_detector(tokens, index, history) -> featureset
  420. :param train: A tagged corpus consisting of a list of tagged
  421. sentences, where each sentence is a list of (word, tag) tuples.
  422. :param backoff: A backoff tagger, to be used by the new tagger
  423. if it encounters an unknown context.
  424. :param classifier_builder: A function used to train a new
  425. classifier based on the data in *train*. It should take
  426. one argument, a list of labeled featuresets (i.e.,
  427. (featureset, label) tuples).
  428. :param classifier: The classifier that should be used by the
  429. tagger. This is only useful if you want to manually
  430. construct the classifier; normally, you would use *train*
  431. instead.
  432. :param backoff: A backoff tagger, used if this tagger is
  433. unable to determine a tag for a given token.
  434. :param cutoff_prob: If specified, then this tagger will fall
  435. back on its backoff tagger if the probability of the most
  436. likely tag is less than *cutoff_prob*.
  437. """
  438. def __init__(self, feature_detector=None, train=None,
  439. classifier_builder=NaiveBayesClassifier.train,
  440. classifier=None, backoff=None,
  441. cutoff_prob=None, verbose=False):
  442. self._check_params(train, classifier)
  443. SequentialBackoffTagger.__init__(self, backoff)
  444. if (train and classifier) or (not train and not classifier):
  445. raise ValueError('Must specify either training data or '
  446. 'trained classifier.')
  447. if feature_detector is not None:
  448. self._feature_detector = feature_detector
  449. # The feature detector function, used to generate a featureset
  450. # or each token: feature_detector(tokens, index, history) -> featureset
  451. self._cutoff_prob = cutoff_prob
  452. """Cutoff probability for tagging -- if the probability of the
  453. most likely tag is less than this, then use backoff."""
  454. self._classifier = classifier
  455. """The classifier used to choose a tag for each token."""
  456. if train:
  457. self._train(train, classifier_builder, verbose)
  458. def choose_tag(self, tokens, index, history):
  459. # Use our feature detector to get the featureset.
  460. featureset = self.feature_detector(tokens, index, history)
  461. # Use the classifier to pick a tag. If a cutoff probability
  462. # was specified, then check that the tag's probability is
  463. # higher than that cutoff first; otherwise, return None.
  464. if self._cutoff_prob is None:
  465. return self._classifier.classify(featureset)
  466. pdist = self._classifier.prob_classify(featureset)
  467. tag = pdist.max()
  468. return (tag if pdist.prob(tag) >= self._cutoff_prob else None)
  469. def _train(self, tagged_corpus, classifier_builder, verbose):
  470. """
  471. Build a new classifier, based on the given training data
  472. *tagged_corpus*.
  473. """
  474. classifier_corpus = []
  475. if verbose:
  476. print('Constructing training corpus for classifier.')
  477. for sentence in tagged_corpus:
  478. history = []
  479. untagged_sentence, tags = zip(*sentence)
  480. for index in range(len(sentence)):
  481. featureset = self.feature_detector(untagged_sentence,
  482. index, history)
  483. classifier_corpus.append( (featureset, tags[index]) )
  484. history.append(tags[index])
  485. if verbose:
  486. print('Training classifier (%d instances)' % len(classifier_corpus))
  487. self._classifier = classifier_builder(classifier_corpus)
  488. def __repr__(self):
  489. return '<ClassifierBasedTagger: %r>' % self._classifier
  490. def feature_detector(self, tokens, index, history):
  491. """
  492. Return the feature detector that this tagger uses to generate
  493. featuresets for its classifier. The feature detector is a
  494. function with the signature::
  495. feature_detector(tokens, index, history) -> featureset
  496. See ``classifier()``
  497. """
  498. return self._feature_detector(tokens, index, history)
  499. def classifier(self):
  500. """
  501. Return the classifier that this tagger uses to choose a tag
  502. for each word in a sentence. The input for this classifier is
  503. generated using this tagger's feature detector.
  504. See ``feature_detector()``
  505. """
  506. return self._classifier
  507. class ClassifierBasedPOSTagger(ClassifierBasedTagger):
  508. """
  509. A classifier based part of speech tagger.
  510. """
  511. def feature_detector(self, tokens, index, history):
  512. word = tokens[index]
  513. if index == 0:
  514. prevword = prevprevword = None
  515. prevtag = prevprevtag = None
  516. elif index == 1:
  517. prevword = tokens[index-1].lower()
  518. prevprevword = None
  519. prevtag = history[index-1]
  520. prevprevtag = None
  521. else:
  522. prevword = tokens[index-1].lower()
  523. prevprevword = tokens[index-2].lower()
  524. prevtag = history[index-1]
  525. prevprevtag = history[index-2]
  526. if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
  527. shape = 'number'
  528. elif re.match('\W+$', word):
  529. shape = 'punct'
  530. elif re.match('[A-Z][a-z]+$', word):
  531. shape = 'upcase'
  532. elif re.match('[a-z]+$', word):
  533. shape = 'downcase'
  534. elif re.match('\w+$', word):
  535. shape = 'mixedcase'
  536. else:
  537. shape = 'other'
  538. features = {
  539. 'prevtag': prevtag,
  540. 'prevprevtag': prevprevtag,
  541. 'word': word,
  542. 'word.lower': word.lower(),
  543. 'suffix3': word.lower()[-3:],
  544. 'suffix2': word.lower()[-2:],
  545. 'suffix1': word.lower()[-1:],
  546. 'prevprevword': prevprevword,
  547. 'prevword': prevword,
  548. 'prevtag+word': '%s+%s' % (prevtag, word.lower()),
  549. 'prevprevtag+word': '%s+%s' % (prevprevtag, word.lower()),
  550. 'prevword+word': '%s+%s' % (prevword, word.lower()),
  551. 'shape': shape,
  552. }
  553. return features
  554. if __name__ == "__main__":
  555. import doctest
  556. doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)